From e487683990972bf9aa4e688434c46ead76748bca Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:27:16 -0700 Subject: [PATCH 0001/1662] x86, mce: fix typo in comment in asm/mce.h Fix comment to match the actual declaration. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 5cdd8d100ec9..b50b9e9042c4 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -9,7 +9,7 @@ */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ -#define MCG_CTL_P (1ULL<<8) /* MCG_CAP register available */ +#define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ #define MCG_CMCI_P (1ULL<<10) /* CMCI supported */ #define MCG_EXT_CNT_MASK 0xff0000 /* Number of Extended registers */ From a95436e44a76a32dcbe7c8df59701ddde53017c1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 20 Jun 2009 23:28:22 -0700 Subject: [PATCH 0002/1662] x86, mce: use atomic_inc_return() instead of add by 1 Use atomic_inc_return() instead of atomic_add_return() by 1. Signed-off-by: Borislav Petkov Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968bc..7da8fec9ca88 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -242,7 +242,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) /* * Make sure only one CPU runs in machine check panic */ - if (atomic_add_return(1, &mce_paniced) > 1) + if (atomic_inc_return(&mce_paniced) > 1) wait_for_panic(); barrier(); @@ -705,7 +705,7 @@ static int mce_start(int *no_way_out) * global_nwo should be updated before mce_callin */ smp_wmb(); - order = atomic_add_return(1, &mce_callin); + order = atomic_inc_return(&mce_callin); /* * Wait for everyone. From e74e396204bfcb67570ba4517b08f5918e69afea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 30 Mar 2009 19:07:44 +0900 Subject: [PATCH 0003/1662] percpu: use dynamic percpu allocator as the default percpu allocator This patch makes most !CONFIG_HAVE_SETUP_PER_CPU_AREA archs use dynamic percpu allocator. The first chunk is allocated using embedding helper and 8k is reserved for modules. This ensures that the new allocator behaves almost identically to the original allocator as long as static percpu variables are concerned, so it shouldn't introduce much breakage. s390 and alpha use custom SHIFT_PERCPU_PTR() to work around addressing range limit the addressing model imposes. Unfortunately, this breaks if the address is specified using a variable, so for now, the two archs aren't converted. The following architectures are affected by this change. * sh * arm * cris * mips * sparc(32) * blackfin * avr32 * parisc (broken, under investigation) * m32r * powerpc(32) As this change makes the dynamic allocator the default one, CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is replaced with its invert - CONFIG_HAVE_LEGACY_PER_CPU_AREA, which is added to yet-to-be converted archs. These archs implement their own setup_per_cpu_areas() and the conversion is not trivial. * powerpc(64) * sparc(64) * ia64 * alpha * s390 Boot and batch alloc/free tests on x86_32 with debug code (x86_32 doesn't use default first chunk initialization). Compile tested on sparc(32), powerpc(32), arm and alpha. Kyle McMartin reported that this change breaks parisc. The problem is still under investigation and he is okay with pushing this patch forward and fixing parisc later. [ Impact: use dynamic allocator for most archs w/o custom percpu setup ] Signed-off-by: Tejun Heo Acked-by: Rusty Russell Acked-by: David S. Miller Acked-by: Benjamin Herrenschmidt Acked-by: Martin Schwidefsky Reviewed-by: Christoph Lameter Cc: Paul Mundt Cc: Russell King Cc: Mikael Starvik Cc: Ralf Baechle Cc: Bryan Wu Cc: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Hirokazu Takata Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Heiko Carstens Cc: Ingo Molnar --- arch/alpha/Kconfig | 3 +++ arch/ia64/Kconfig | 3 +++ arch/powerpc/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sparc/Kconfig | 3 +++ arch/x86/Kconfig | 3 --- include/linux/percpu.h | 12 +++++++++--- init/main.c | 24 ------------------------ kernel/module.c | 6 +++--- mm/Makefile | 2 +- mm/allocpercpu.c | 28 ++++++++++++++++++++++++++++ mm/percpu.c | 40 +++++++++++++++++++++++++++++++++++++++- 12 files changed, 95 insertions(+), 35 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9fb8aae5c391..05d86407188c 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,9 @@ config AUTO_IRQ_AFFINITY depends on SMP default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d4..328d2f8b8c3f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -89,6 +89,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index bf6cedfa05db..a774c2acbe69 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,6 +46,9 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool PPC64 + config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a14dba0e4d67..f4a3cc62d28f 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,6 +75,9 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y + mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 3f8b6a92eabd..7a8698b913fe 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,9 @@ config AUDIT_ARCH bool default y +config HAVE_LEGACY_PER_CPU_AREA + def_bool y if SPARC64 + config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d1430ef6b4f9..a48a90076d83 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,9 +149,6 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 26fd9d12f050..e5000343dd61 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -34,7 +34,7 @@ #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA /* minimum unit size, also is the maximum supported allocation size */ #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(64 << 10) @@ -80,7 +80,7 @@ extern ssize_t __init pcpu_embed_first_chunk( extern void *__alloc_reserved_percpu(size_t size, size_t align); -#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ struct percpu_data { void *ptrs[1]; @@ -99,11 +99,15 @@ struct percpu_data { (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +extern void __init setup_per_cpu_areas(void); +#endif + #else /* CONFIG_SMP */ #define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) @@ -124,6 +128,8 @@ static inline void free_percpu(void *p) kfree(p); } +static inline void __init setup_per_cpu_areas(void) { } + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ diff --git a/init/main.c b/init/main.c index 09131ec090c1..602d724afa5c 100644 --- a/init/main.c +++ b/init/main.c @@ -357,7 +357,6 @@ static void __init smp_init(void) #define smp_init() do { } while (0) #endif -static inline void setup_per_cpu_areas(void) { } static inline void setup_nr_cpu_ids(void) { } static inline void smp_prepare_cpus(unsigned int maxcpus) { } @@ -378,29 +377,6 @@ static void __init setup_nr_cpu_ids(void) nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; } -#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; - -EXPORT_SYMBOL(__per_cpu_offset); - -static void __init setup_per_cpu_areas(void) -{ - unsigned long size, i; - char *ptr; - unsigned long nr_possible_cpus = num_possible_cpus(); - - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); - ptr = alloc_bootmem_pages(size * nr_possible_cpus); - - for_each_possible_cpu(i) { - __per_cpu_offset[i] = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - ptr += size; - } -} -#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 38928fcaff2b..f5934954fa99 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -364,7 +364,7 @@ EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) @@ -389,7 +389,7 @@ static void percpu_modfree(void *freeme) free_percpu(freeme); } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */ /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -535,7 +535,7 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, diff --git a/mm/Makefile b/mm/Makefile index 5e0bd6426693..c77c6487552f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a47359..df34ceae0c67 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include #include +#include +#include #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..b14984566f5a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -43,7 +43,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -1275,3 +1275,41 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, reserved_size, dyn_size, pcpue_unit_size, pcpue_ptr, NULL); } + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t unit_size; + unsigned long delta; + unsigned int cpu; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, -1); + if (unit_size < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + cpu * unit_size; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ From 405d967dc70002991f8fc35c20e0d3cbc7614f63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:38 +0900 Subject: [PATCH 0004/1662] linker script: throw away .discard section x86 throws away .discard section but no other archs do. Also, .discard is not thrown away while linking modules. Make every arch and module linking throw it away. This will be used to define dummy variables for percpu declarations and definitions. This patch is based on Ivan Kokshaysky's alpha percpu patch. [ Impact: always throw away everything in .discard ] Signed-off-by: Tejun Heo Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Russell King Cc: Haavard Skinnemoen Cc: Bryan Wu Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Yoshinori Sato Cc: Tony Luck Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Kyle McMartin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Jeff Dike Cc: Chris Zankel Cc: Rusty Russell Cc: Ingo Molnar --- Makefile | 2 +- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/vmlinux.lds.S | 1 + arch/avr32/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 1 + arch/cris/kernel/vmlinux.lds.S | 1 + arch/frv/kernel/vmlinux.lds.S | 2 ++ arch/h8300/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m32r/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/m68knommu/kernel/vmlinux.lds.S | 1 + arch/microblaze/kernel/vmlinux.lds.S | 2 ++ arch/mips/kernel/vmlinux.lds.S | 1 + arch/mn10300/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/vmlinux.lds.S | 1 + arch/um/kernel/dyn.lds.S | 2 ++ arch/um/kernel/uml.lds.S | 2 ++ arch/xtensa/kernel/vmlinux.lds.S | 1 + include/asm-generic/vmlinux.lds.h | 8 ++++++++ scripts/module-common.lds | 8 ++++++++ 26 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 scripts/module-common.lds diff --git a/Makefile b/Makefile index 46e1c9d03d51..12245be05122 100644 --- a/Makefile +++ b/Makefile @@ -327,7 +327,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ MODFLAGS = -DMODULE CFLAGS_MODULE = $(MODFLAGS) AFLAGS_MODULE = $(MODFLAGS) -LDFLAGS_MODULE = +LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds CFLAGS_KERNEL = AFLAGS_KERNEL = CFLAGS_GCOV = -fprofile-arcs -ftest-coverage diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index b9d6568e5f7f..75fe1d6877e9 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -139,6 +139,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } .mdebug 0 : { diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 6c0779792546..e256c57b8981 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -82,6 +82,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) *(.ARM.exidx.exit.text) *(.ARM.extab.exit.text) #ifndef CONFIG_MMU diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 7910d41eb886..b8324608ec0c 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -131,6 +131,7 @@ SECTIONS /DISCARD/ : { EXIT_DATA *(.exitcall.exit) + *(.discard) } DWARF_DEBUG diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6ac307ca0d80..6e8eabd8f0a6 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -280,5 +280,6 @@ SECTIONS /DISCARD/ : { *(.exitcall.exit) + *(.discard) } } diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 0d2adfc794d4..a3175ebb38cc 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -145,6 +145,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 22d9787406ed..64b5a5e4d35e 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -177,6 +177,8 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .comment 0 : { *(.comment) } + + /DISCARD/ : { *(.discard) } } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 43a87b9085b6..03d6c0df33db 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -154,6 +154,7 @@ SECTIONS } /DISCARD/ : { *(.exitcall.exit) + *(.discard) } .romfs : { diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 4a95e86b9ac2..13d958975874 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) } diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 4179adf6c624..480a49944cfd 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -125,6 +125,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } /* Stabs debugging sections. */ diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 01d212bb05a6..905a797ada93 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -87,6 +87,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } /* Stabs debugging sections. */ diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index c192f773db96..47d04be322aa 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -82,6 +82,7 @@ __init_begin = .; EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } .crap : { diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index b7fe505e358d..68111a61a77f 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -188,6 +188,7 @@ SECTIONS { EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } .bss : { diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index d34d38dcd12c..a207543c5927 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -162,4 +162,6 @@ SECTIONS { } . = ALIGN(4096); _end = .; + + /DISCARD/ : { *(.discard) } } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 58738c8d754f..45901609b741 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -179,6 +179,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) /* ABI crap starts here */ *(.MIPS.options) diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 24de6b90f401..5d9f2f96ad92 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -146,6 +146,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) } STABS_DEBUG diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index fd2cc4fd2b65..ccf58341845a 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -240,6 +240,7 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8ef8a14abc95..7fca9355fd3d 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -40,6 +40,7 @@ SECTIONS /* Sections to be discarded. */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) EXIT_DATA } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index a53db23ee092..98867dfea469 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -161,6 +161,7 @@ SECTIONS /DISCARD/ : { EXIT_DATA *(.exitcall.exit) + *(.discard) } /* Debugging sections. */ diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index f53c76acaede..766976d27b21 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -171,6 +171,7 @@ SECTIONS */ /DISCARD/ : { *(.exitcall.exit) + *(.discard) } STABS_DEBUG diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index fcbbd000ec08..d63cf914667d 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -175,6 +175,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } STABS_DEBUG diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 9975e1ab44fb..2916d6eadffd 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -156,4 +156,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + /DISCARD/ : { *(.discard) } } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 11b835248b86..1f8a622cabe1 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -100,4 +100,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + /DISCARD/ : { *(.discard) } } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 41c159cd872f..b1e24638acd7 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -287,6 +287,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) } .xt.lit : { *(.xt.lit) } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 55413e568f07..a19120c4e109 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -628,6 +628,14 @@ #define INITRAMFS #endif +#define DISCARDS \ + /DISCARD/ : { \ + EXIT_TEXT \ + EXIT_DATA \ + *(.exitcall.exit) \ + *(.discard) \ + } + /** * PERCPU_VADDR - define output section for percpu area * @vaddr: explicit base address (optional) diff --git a/scripts/module-common.lds b/scripts/module-common.lds new file mode 100644 index 000000000000..47a1f9ae0ede --- /dev/null +++ b/scripts/module-common.lds @@ -0,0 +1,8 @@ +/* + * Common module linker script, always used when linking a module. + * Archs are free to supply their own linker scripts. ld will + * combine them automatically. + */ +SECTIONS { + /DISCARD/ : { *(.discard) } +} From fe87f94f341a4b4097285b46f003059b26eb59bf Mon Sep 17 00:00:00 2001 From: Jesper Nilsson Date: Wed, 24 Jun 2009 15:13:41 +0900 Subject: [PATCH 0005/1662] CRIS: Change DEFINE_PER_CPU of current_pgd to be non volatile. The DEFINE_PER_CPU of current_pgd was on CRIS defined using volatile, which is not needed. Remove volatile. Tested on an ARTPEC-3 (CRISv32) board. tj: extern DEFINE_PER_CPU() replaced with DECLARE_PER_CPU() [ Impact: code cleanup ] Signed-off-by: Jesper Nilsson Signed-off-by: Tejun Heo --- arch/cris/include/asm/mmu_context.h | 3 ++- arch/cris/mm/fault.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/cris/include/asm/mmu_context.h b/arch/cris/include/asm/mmu_context.h index 72ba08dcfd18..1d45fd6365b7 100644 --- a/arch/cris/include/asm/mmu_context.h +++ b/arch/cris/include/asm/mmu_context.h @@ -17,7 +17,8 @@ extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, * registers like cr3 on the i386 */ -extern volatile DEFINE_PER_CPU(pgd_t *,current_pgd); /* defined in arch/cris/mm/fault.c */ +/* defined in arch/cris/mm/fault.c */ +DECLARE_PER_CPU(pgd_t *, current_pgd); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index f925115e3250..4a7cdd9ea1ee 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c @@ -29,7 +29,7 @@ extern void die_if_kernel(const char *, struct pt_regs *, long); /* current active page directory */ -volatile DEFINE_PER_CPU(pgd_t *,current_pgd); +DEFINE_PER_CPU(pgd_t *, current_pgd); unsigned long cris_signal_return_page; /* From 204fba4aa303ea4a7bb726a539bf4a5b9e3203d0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:45 +0900 Subject: [PATCH 0006/1662] percpu: cleanup percpu array definitions Currently, the following three different ways to define percpu arrays are in use. 1. DEFINE_PER_CPU(elem_type[array_len], array_name); 2. DEFINE_PER_CPU(elem_type, array_name[array_len]); 3. DEFINE_PER_CPU(elem_type, array_name)[array_len]; Unify to #1 which correctly separates the roles of the two parameters and thus allows more flexibility in the way percpu variables are defined. [ Impact: cleanup ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Tony Luck Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Jeremy Fitzhardinge Cc: linux-mm@kvack.org Cc: Christoph Lameter Cc: David S. Miller --- arch/ia64/kernel/smp.c | 2 +- arch/ia64/sn/kernel/setup.c | 2 +- arch/powerpc/mm/stab.c | 2 +- arch/powerpc/platforms/ps3/smp.c | 2 +- arch/x86/kernel/cpu/cpu_debug.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- drivers/xen/events.c | 4 ++-- mm/quicklist.c | 2 +- mm/slub.c | 4 ++-- net/ipv4/syncookies.c | 2 +- net/ipv6/syncookies.c | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index f0c521b0ba4c..94cf78ba28fa 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,7 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short, shadow_flush_counts[NR_CPUS]) ____cacheline_aligned; +static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index e456f062f241..ece1bf994499 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second); DEFINE_PER_CPU(struct sn_hub_info_s, __sn_hub_info); EXPORT_PER_CPU_SYMBOL(__sn_hub_info); -DEFINE_PER_CPU(short, __sn_cnodeid_to_nasid[MAX_COMPACT_NODES]); +DEFINE_PER_CPU(short [MAX_COMPACT_NODES], __sn_cnodeid_to_nasid); EXPORT_PER_CPU_SYMBOL(__sn_cnodeid_to_nasid); DEFINE_PER_CPU(struct nodepda_s *, __sn_nodepda); diff --git a/arch/powerpc/mm/stab.c b/arch/powerpc/mm/stab.c index 98cd1dc2ae75..6e9b69c99856 100644 --- a/arch/powerpc/mm/stab.c +++ b/arch/powerpc/mm/stab.c @@ -31,7 +31,7 @@ struct stab_entry { #define NR_STAB_CACHE_ENTRIES 8 static DEFINE_PER_CPU(long, stab_cache_ptr); -static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]); +static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache); /* * Create a segment table entry for the given esid/vsid pair. diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c index f6e04bcc70ef..51ffde40af2b 100644 --- a/arch/powerpc/platforms/ps3/smp.c +++ b/arch/powerpc/platforms/ps3/smp.c @@ -37,7 +37,7 @@ */ #define MSG_COUNT 4 -static DEFINE_PER_CPU(unsigned int, ps3_ipi_virqs[MSG_COUNT]); +static DEFINE_PER_CPU(unsigned int [MSG_COUNT], ps3_ipi_virqs); static void do_message_pass(int target, int msg) { diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c index 6b2a52dd0403..dca325c03999 100644 --- a/arch/x86/kernel/cpu/cpu_debug.c +++ b/arch/x86/kernel/cpu/cpu_debug.c @@ -30,8 +30,8 @@ #include #include -static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]); -static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]); +static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr); +static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr); static DEFINE_PER_CPU(int, cpu_priv_count); static DEFINE_MUTEX(cpu_debug_lock); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index ddae21620bda..bd2a2fa84628 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -69,7 +69,7 @@ struct threshold_bank { struct threshold_block *blocks; cpumask_var_t cpus; }; -static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); +static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); #ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 76dfef23f789..4946288d6832 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 891d2e90753a..ab581fa62681 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -47,10 +47,10 @@ static DEFINE_SPINLOCK(irq_mapping_update_lock); /* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; /* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; /* Interrupt types. */ enum xen_irq_type { diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4ff..6eedf7e473d1 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include #include -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 diff --git a/mm/slub.c b/mm/slub.c index ce62b770e2fc..23bb79acc4b9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2086,8 +2086,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index cd2b97f1b6e1..84d90f2799bb 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,7 +37,7 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8c2513982b61..23d0d6db0461 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,7 +74,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32, cookie_scratch)[16 + 5 + SHA_WORKSPACE_WORDS]; +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) From b9bf3121af348d9255f1c917830fe8c2df52efcb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:47 +0900 Subject: [PATCH 0007/1662] percpu: use DEFINE_PER_CPU_SHARED_ALIGNED() There are a few places where ___cacheline_aligned* is used with DEFINE_PER_CPU(). Use DEFINE_PER_CPU_SHARED_ALIGNED() instead. DEFINE_PER_CPU_SHARED_ALIGNED() applies alignment only on SMPs. While all other converted places used _in_smp variant or only get compiled for SMP, net/rds used unconditional ____cacheline_aligned. I don't see any reason these data structures should be aligned on UP and thus converted together. Signed-off-by: Tejun Heo Cc: Mike Frysinger Cc: Tony Luck Cc: Andy Grover --- arch/blackfin/mm/sram-alloc.c | 6 +++--- arch/ia64/kernel/smp.c | 3 ++- kernel/sched.c | 4 ++-- net/rds/ib_stats.c | 2 +- net/rds/iw_stats.c | 2 +- net/rds/page.c | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/blackfin/mm/sram-alloc.c b/arch/blackfin/mm/sram-alloc.c index 0bc3c4ef0aad..99e4dbb1dfd1 100644 --- a/arch/blackfin/mm/sram-alloc.c +++ b/arch/blackfin/mm/sram-alloc.c @@ -42,9 +42,9 @@ #include #include "blackfin_sram.h" -static DEFINE_PER_CPU(spinlock_t, l1sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_data_sram_lock) ____cacheline_aligned_in_smp; -static DEFINE_PER_CPU(spinlock_t, l1_inst_sram_lock) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_data_sram_lock); +static DEFINE_PER_CPU_SHARED_ALIGNED(spinlock_t, l1_inst_sram_lock); static spinlock_t l2_sram_lock ____cacheline_aligned_in_smp; /* the data structure for L1 scratchpad and DATA SRAM */ diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 94cf78ba28fa..93ebfea43c6c 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -58,7 +58,8 @@ static struct local_tlb_flush_counts { unsigned int count; } __attribute__((__aligned__(32))) local_tlb_flush_counts[NR_CPUS]; -static DEFINE_PER_CPU(unsigned short [NR_CPUS], shadow_flush_counts) ____cacheline_aligned; +static DEFINE_PER_CPU_SHARED_ALIGNED(unsigned short [NR_CPUS], + shadow_flush_counts); #define IPI_CALL_FUNC 0 #define IPI_CPU_STOP 1 diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..34fd81d21784 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -318,12 +318,12 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); #endif /* CONFIG_RT_GROUP_SCHED */ #else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index 02e3e3d50d4a..301ae51ae409 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "ib.h" -DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static char *rds_ib_stat_names[] = { "ib_connect_raced", diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c index ccc7e8f0bf0e..fafea3cc92d7 100644 --- a/net/rds/iw_stats.c +++ b/net/rds/iw_stats.c @@ -37,7 +37,7 @@ #include "rds.h" #include "iw.h" -DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats); static char *rds_iw_stat_names[] = { "iw_connect_raced", diff --git a/net/rds/page.c b/net/rds/page.c index c460743a89ad..de7bb84bcd78 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -39,7 +39,7 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned; +DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /* * returns 0 on success or -errno on failure. From 245b2e70eabd797932adb263a65da0bab3711753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:48 +0900 Subject: [PATCH 0008/1662] percpu: clean up percpu variable definitions Percpu variable definition is about to be updated such that all percpu symbols including the static ones must be unique. Update percpu variable definitions accordingly. * as,cfq: rename ioc_count uniquely * cpufreq: rename cpu_dbs_info uniquely * xen: move nesting_count out of xen_evtchn_do_upcall() and rename it * mm: move ratelimits out of balance_dirty_pages_ratelimited_nr() and rename it * ipv4,6: rename cookie_scratch uniquely * x86 perf_counter: rename prev_left to pmc_prev_left, irq_entry to pmc_irq_entry and nmi_entry to pmc_nmi_entry * perf_counter: rename disable_count to perf_disable_count * ftrace: rename test_event_disable to ftrace_test_event_disable * kmemleak: rename test_pointer to kmemleak_test_pointer * mce: rename next_interval to mce_next_interval [ Impact: percpu usage cleanups, no duplicate static percpu var names ] Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ivan Kokshaysky Cc: Jens Axboe Cc: Dave Jones Cc: Jeremy Fitzhardinge Cc: linux-mm Cc: David S. Miller Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Li Zefan Cc: Catalin Marinas Cc: Andi Kleen --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- block/as-iosched.c | 10 +++++----- block/cfq-iosched.c | 10 +++++----- drivers/cpufreq/cpufreq_conservative.c | 12 ++++++------ drivers/cpufreq/cpufreq_ondemand.c | 15 ++++++++------- drivers/xen/events.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- kernel/trace/trace_events.c | 6 +++--- mm/kmemleak-test.c | 6 +++--- mm/page-writeback.c | 5 +++-- net/ipv4/syncookies.c | 5 +++-- net/ipv6/syncookies.c | 5 +++-- 13 files changed, 58 insertions(+), 53 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 284d1de968bc..cba8cd3e957b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1091,7 +1091,7 @@ void mce_log_therm_throt_event(__u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ +static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); static void mcheck_timer(unsigned long data) @@ -1110,7 +1110,7 @@ static void mcheck_timer(unsigned long data) * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(next_interval); + n = &__get_cpu_var(mce_next_interval); if (mce_notify_irq()) *n = max(*n/2, HZ/100); else @@ -1311,7 +1311,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c) static void mce_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(next_interval); + int *n = &__get_cpu_var(mce_next_interval); if (mce_ignore_ce) return; @@ -1914,7 +1914,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: t->expires = round_jiffies(jiffies + - __get_cpu_var(next_interval)); + __get_cpu_var(mce_next_interval)); add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); break; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4946288d6832..5fdf63aaaba1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -862,7 +862,7 @@ amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) x86_pmu_disable_counter(hwc, idx); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], prev_left); +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -901,7 +901,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (left > x86_pmu.max_period) left = x86_pmu.max_period; - per_cpu(prev_left[idx], smp_processor_id()) = left; + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; /* * The hw counter starts counting from this counter offset, @@ -1089,7 +1089,7 @@ void perf_counter_print_debug(void) rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); - prev_left = per_cpu(prev_left[idx], cpu); + prev_left = per_cpu(pmc_prev_left[idx], cpu); pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); @@ -1561,8 +1561,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) entry->ip[entry->nr++] = ip; } -static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); static void @@ -1709,9 +1709,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_callchain_entry *entry; if (in_nmi()) - entry = &__get_cpu_var(nmi_entry); + entry = &__get_cpu_var(pmc_nmi_entry); else - entry = &__get_cpu_var(irq_entry); + entry = &__get_cpu_var(pmc_irq_entry); entry->nr = 0; diff --git a/block/as-iosched.c b/block/as-iosched.c index 7a12cf6ee1d3..ce8ba57c6557 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -146,7 +146,7 @@ enum arq_state { #define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) #define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -161,7 +161,7 @@ static void as_antic_stop(struct as_data *ad); static void free_as_io_context(struct as_io_context *aic) { kfree(aic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(as_ioc_count); if (ioc_gone) { /* * AS scheduler is exiting, grab exit lock and check @@ -169,7 +169,7 @@ static void free_as_io_context(struct as_io_context *aic) * complete ioc_gone and set it back to NULL. */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -211,7 +211,7 @@ static struct as_io_context *alloc_as_io_context(void) ret->seek_total = 0; ret->seek_samples = 0; ret->seek_mean = 0; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(as_ioc_count); } return ret; @@ -1507,7 +1507,7 @@ static void __exit as_exit(void) ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ smp_wmb(); - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(as_ioc_count)) wait_for_completion(&all_gone); synchronize_rcu(); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 833ec18eaa63..0f1cc7d3855e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -48,7 +48,7 @@ static int cfq_slice_idle = HZ / 125; static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; -static DEFINE_PER_CPU(unsigned long, ioc_count); +static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); static struct completion *ioc_gone; static DEFINE_SPINLOCK(ioc_gone_lock); @@ -1422,7 +1422,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) cic = container_of(head, struct cfq_io_context, rcu_head); kmem_cache_free(cfq_ioc_pool, cic); - elv_ioc_count_dec(ioc_count); + elv_ioc_count_dec(cfq_ioc_count); if (ioc_gone) { /* @@ -1431,7 +1431,7 @@ static void cfq_cic_free_rcu(struct rcu_head *head) * complete ioc_gone and set it back to NULL */ spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(ioc_count)) { + if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { complete(ioc_gone); ioc_gone = NULL; } @@ -1557,7 +1557,7 @@ cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) INIT_HLIST_NODE(&cic->cic_list); cic->dtor = cfq_free_io_context; cic->exit = cfq_exit_io_context; - elv_ioc_count_inc(ioc_count); + elv_ioc_count_inc(cfq_ioc_count); } return cic; @@ -2658,7 +2658,7 @@ static void __exit cfq_exit(void) * this also protects us from entering cfq_slab_kill() with * pending RCU callbacks */ - if (elv_ioc_count_read(ioc_count)) + if (elv_ioc_count_read(cfq_ioc_count)) wait_for_completion(&all_gone); cfq_slab_kill(); } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 7fc58af748b4..a7ef465c83b9 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -65,7 +65,7 @@ struct cpu_dbs_info_s { int cpu; unsigned int enable:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, cs_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -138,7 +138,7 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cpu_dbs_info, + struct cpu_dbs_info_s *this_dbs_info = &per_cpu(cs_cpu_dbs_info, freq->cpu); struct cpufreq_policy *policy; @@ -298,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -388,7 +388,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cputime64_t cur_wall_time, cur_idle_time; unsigned int idle_time, wall_time; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -528,7 +528,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -548,7 +548,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(cs_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 1911d1729353..36f292a7bd01 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -73,7 +73,7 @@ struct cpu_dbs_info_s { unsigned int enable:1, sample_type:1; }; -static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info); +static DEFINE_PER_CPU(struct cpu_dbs_info_s, od_cpu_dbs_info); static unsigned int dbs_enable; /* number of CPUs using this policy */ @@ -151,7 +151,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int freq_hi, freq_lo; unsigned int index = 0; unsigned int jiffies_total, jiffies_hi, jiffies_lo; - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, + policy->cpu); if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -196,7 +197,7 @@ static void ondemand_powersave_bias_init(void) { int i; for_each_online_cpu(i) { - struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i); + struct cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, i); dbs_info->freq_table = cpufreq_frequency_get_table(i); dbs_info->freq_lo = 0; } @@ -297,7 +298,7 @@ static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy, /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { struct cpu_dbs_info_s *dbs_info; - dbs_info = &per_cpu(cpu_dbs_info, j); + dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) @@ -391,7 +392,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) unsigned int load, load_freq; int freq_avg; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -548,7 +549,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int j; int rc; - this_dbs_info = &per_cpu(cpu_dbs_info, cpu); + this_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); switch (event) { case CPUFREQ_GOV_START: @@ -570,7 +571,7 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, for_each_cpu(j, policy->cpus) { struct cpu_dbs_info_s *j_dbs_info; - j_dbs_info = &per_cpu(cpu_dbs_info, j); + j_dbs_info = &per_cpu(od_cpu_dbs_info, j); j_dbs_info->cur_policy = policy; j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ab581fa62681..7d2987e9b1bb 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -602,6 +602,8 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + /* * Search the CPUs pending events bitmasks. For each one found, map * the event number to an irq, and feed it into do_IRQ() for @@ -617,7 +619,6 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); struct shared_info *s = HYPERVISOR_shared_info; struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - static DEFINE_PER_CPU(unsigned, nesting_count); unsigned count; exit_idle(); @@ -628,7 +629,7 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) vcpu_info->evtchn_upcall_pending = 0; - if (__get_cpu_var(nesting_count)++) + if (__get_cpu_var(xed_nesting_count)++) goto out; #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ @@ -653,8 +654,8 @@ void xen_evtchn_do_upcall(struct pt_regs *regs) BUG_ON(!irqs_disabled()); - count = __get_cpu_var(nesting_count); - __get_cpu_var(nesting_count) = 0; + count = __get_cpu_var(xed_nesting_count); + __get_cpu_var(xed_nesting_count) = 0; } while(count != 1); out: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1a933a221ea4..1fd7a2e75754 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -98,16 +98,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count); void __perf_disable(void) { - __get_cpu_var(disable_count)++; + __get_cpu_var(perf_disable_count)++; } bool __perf_enable(void) { - return !--__get_cpu_var(disable_count); + return !--__get_cpu_var(perf_disable_count); } void perf_disable(void) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index aa08be69a1b6..54b1de5074b6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1318,7 +1318,7 @@ static __init void event_trace_self_tests(void) #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); static void function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1334,7 +1334,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) pc = preempt_count(); resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); if (disabled != 1) goto out; @@ -1352,7 +1352,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) trace_nowake_buffer_unlock_commit(event, flags, pc); out: - atomic_dec(&per_cpu(test_event_disable, cpu)); + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); ftrace_preempt_enable(resched); } diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f523..177a5169bbde 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935b..2c075dcf03d4 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -607,6 +607,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +626,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,7 +638,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { *p = 0; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 84d90f2799bb..a6e0e077ac33 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -37,12 +37,13 @@ __initcall(init_syncookies); #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv4_cookie_scratch); memcpy(tmp + 4, syncookie_secret[c], sizeof(syncookie_secret[c])); tmp[0] = (__force u32)saddr; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 23d0d6db0461..6b6ae913b5d4 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -74,12 +74,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, return child; } -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], + ipv6_cookie_scratch); static u32 cookie_hash(struct in6_addr *saddr, struct in6_addr *daddr, __be16 sport, __be16 dport, u32 count, int c) { - __u32 *tmp = __get_cpu_var(cookie_scratch); + __u32 *tmp = __get_cpu_var(ipv6_cookie_scratch); /* * we have 320 bits of information to hash, copy in the remaining From 7c756e6e19e71f0327760d8955f7077118ebb2b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:50 +0900 Subject: [PATCH 0009/1662] percpu: implement optional weak percpu definitions Some archs (alpha and s390) need to use weak definitions for percpu variables in modules so that the compiler generates external references for them. This patch implements weak percpu definitions which arch can enable by defining ARCH_NEEDS_WEAK_PER_CPU in arch percpu header file. This weak definition adds the following two restrictions on percpu variable definitions. 1. percpu symbols must be unique whether static or not 2. percpu variables can't be defined inside a function To ensure that these restrictions are observed in generic code, config option DEBUG_FORCE_WEAK_PER_CPU enables weak percpu definitions for all cases. This patch is inspired by Ivan Kokshaysky's alpha percpu patch. [ Impact: stricter rules for percpu variables, one more debug config option ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Howells Cc: Ivan Kokshaysky --- include/linux/percpu-defs.h | 63 ++++++++++++++++++++++++++++++++----- lib/Kconfig.debug | 15 +++++++++ 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 8f921d74f49f..cf32838ad0fa 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -10,21 +10,68 @@ /* * Base implementations of per-CPU variable declarations and definitions, where * the section in which the variable is to be placed is provided by the - * 'section' argument. This may be used to affect the parameters governing the + * 'sec' argument. This may be used to affect the parameters governing the * variable's storage. * * NOTE! The sections for the DECLARE and for the DEFINE must match, lest * linkage errors occur due the compiler generating the wrong code to access * that section. */ -#define DECLARE_PER_CPU_SECTION(type, name, section) \ - extern \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name +#define __PCPU_ATTRS(sec) \ + __attribute__((section(PER_CPU_BASE_SECTION sec))) \ + PER_CPU_ATTRIBUTES -#define DEFINE_PER_CPU_SECTION(type, name, section) \ - __attribute__((__section__(PER_CPU_BASE_SECTION section))) \ - PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name +#define __PCPU_DUMMY_ATTRS \ + __attribute__((section(".discard"), unused)) + +/* + * s390 and alpha modules require percpu variables to be defined as + * weak to force the compiler to generate GOT based external + * references for them. This is necessary because percpu sections + * will be located outside of the usually addressable area. + * + * This definition puts the following two extra restrictions when + * defining percpu variables. + * + * 1. The symbol must be globally unique, even the static ones. + * 2. Static percpu variables cannot be defined inside a function. + * + * Archs which need weak percpu definitions should define + * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary. + * + * To ensure that the generic code observes the above two + * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set weak + * definition is used for all cases. + */ +#if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU) +/* + * __pcpu_scope_* dummy variable is used to enforce scope. It + * receives the static modifier when it's used in front of + * DEFINE_PER_CPU() and will trigger build failure if + * DECLARE_PER_CPU() is used for the same variable. + * + * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness + * such that hidden weak symbol collision, which will cause unrelated + * variables to share the same address, can be detected during build. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ + __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \ + __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name +#else +/* + * Normal declaration and definition macros. + */ +#define DECLARE_PER_CPU_SECTION(type, name, sec) \ + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name + +#define DEFINE_PER_CPU_SECTION(type, name, sec) \ + __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name +#endif /* * Variant on the per-CPU variable declaration/definition theme used for diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 23067ab1a73c..77e0d8b1b7c5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -777,6 +777,21 @@ config DEBUG_BLOCK_EXT_DEVT Say N if you are unsure. +config DEBUG_FORCE_WEAK_PER_CPU + bool "Force weak per-cpu definitions" + depends on DEBUG_KERNEL + help + s390 and alpha require percpu variables in modules to be + defined weak to work around addressing range issue which + puts the following two restrictions on percpu variable + definitions. + + 1. percpu symbols must be unique whether static or not + 2. percpu variables can't be defined inside a function + + To ensure that generic code follows the above rules, this + option forces all percpu variables to be defined as weak. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL From 6088464cf1ae9fb3d2ccc0ec5feb3f5b971098d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:52 +0900 Subject: [PATCH 0010/1662] alpha: kill unnecessary __used attribute in PER_CPU_ATTRIBUTES With the previous percpu variable definition change, all percpu variables are global and there's no need to specify __used, which only triggers on recent compilers anyway. Kill it. [ Impact: remove unnecessary percpu attribute ] Signed-off-by: Tejun Heo Cc: Ivan Kokshaysky Cc: Richard Henderson --- arch/alpha/include/asm/percpu.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h index 06c5c7a4afd3..7f0a9c4f2fd0 100644 --- a/arch/alpha/include/asm/percpu.h +++ b/arch/alpha/include/asm/percpu.h @@ -30,7 +30,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #ifndef MODULE #define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset)) -#define PER_CPU_ATTRIBUTES #else /* * To calculate addresses of locally defined variables, GCC uses 32-bit @@ -49,8 +48,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; : "=&r"(__ptr), "=&r"(tmp_gp)); \ (typeof(&per_cpu_var(var)))(__ptr + (offset)); }) -#define PER_CPU_ATTRIBUTES __used - #endif /* MODULE */ /* @@ -71,8 +68,6 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; #define __get_cpu_var(var) per_cpu_var(var) #define __raw_get_cpu_var(var) per_cpu_var(var) -#define PER_CPU_ATTRIBUTES - #endif /* SMP */ #ifdef CONFIG_SMP From 9b7dbc7dc0365a943af2d73b1376a6f0aac5dc0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:52 +0900 Subject: [PATCH 0011/1662] alpha: switch to dynamic percpu allocator Alpha implements custom SHIFT_PERCPU_PTR for modules because percpu area can be located far away from the 4G area where the module text is located. The custom SHIFT_PERCPU_PTR forces GOT usage using ldq instruction with literal relocation; however, the relocation can't be used with dynamically allocated percpu variables. Fortunately, similar result can be achieved by using weak percpu variable definitions. This patch makes alpha use weak definitions and switch to dynamic percpu allocator. asm/tlbflush.h was getting linux/sched.h via asm/percpu.h which no longer needs it. Include linux/sched.h directly in asm/tlbflush.h. Compile tested. Generation of litereal relocation verified. This patch is based on Ivan Kokshaysky's alpha percpu patch. [ Impact: use dynamic percpu allocator ] Signed-off-by: Tejun Heo Acked-by: Ivan Kokshaysky Cc: Richard Henderson --- arch/alpha/Kconfig | 3 - arch/alpha/include/asm/percpu.h | 95 +++---------------------------- arch/alpha/include/asm/tlbflush.h | 1 + 3 files changed, 9 insertions(+), 90 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 05d86407188c..9fb8aae5c391 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,9 +70,6 @@ config AUTO_IRQ_AFFINITY depends on SMP default y -config HAVE_LEGACY_PER_CPU_AREA - def_bool y - source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/alpha/include/asm/percpu.h b/arch/alpha/include/asm/percpu.h index 7f0a9c4f2fd0..2c12378e3aa9 100644 --- a/arch/alpha/include/asm/percpu.h +++ b/arch/alpha/include/asm/percpu.h @@ -1,97 +1,18 @@ #ifndef __ALPHA_PERCPU_H #define __ALPHA_PERCPU_H -#include -#include -#include - /* - * Determine the real variable name from the name visible in the - * kernel sources. - */ -#define per_cpu_var(var) per_cpu__##var - -#ifdef CONFIG_SMP - -/* - * per_cpu_offset() is the offset that has to be added to a - * percpu variable to get to the instance for a certain processor. - */ -extern unsigned long __per_cpu_offset[NR_CPUS]; - -#define per_cpu_offset(x) (__per_cpu_offset[x]) - -#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id()) -#ifdef CONFIG_DEBUG_PREEMPT -#define my_cpu_offset per_cpu_offset(smp_processor_id()) -#else -#define my_cpu_offset __my_cpu_offset -#endif - -#ifndef MODULE -#define SHIFT_PERCPU_PTR(var, offset) RELOC_HIDE(&per_cpu_var(var), (offset)) -#else -/* - * To calculate addresses of locally defined variables, GCC uses 32-bit - * displacement from the GP. Which doesn't work for per cpu variables in - * modules, as an offset to the kernel per cpu area is way above 4G. + * To calculate addresses of locally defined variables, GCC uses + * 32-bit displacement from the GP. Which doesn't work for per cpu + * variables in modules, as an offset to the kernel per cpu area is + * way above 4G. * - * This forces allocation of a GOT entry for per cpu variable using - * ldq instruction with a 'literal' relocation. + * Always use weak definitions for percpu variables in modules. */ -#define SHIFT_PERCPU_PTR(var, offset) ({ \ - extern int simple_identifier_##var(void); \ - unsigned long __ptr, tmp_gp; \ - asm ( "br %1, 1f \n\ - 1: ldgp %1, 0(%1) \n\ - ldq %0, per_cpu__" #var"(%1)\t!literal" \ - : "=&r"(__ptr), "=&r"(tmp_gp)); \ - (typeof(&per_cpu_var(var)))(__ptr + (offset)); }) - -#endif /* MODULE */ - -/* - * A percpu variable may point to a discarded regions. The following are - * established ways to produce a usable pointer from the percpu variable - * offset. - */ -#define per_cpu(var, cpu) \ - (*SHIFT_PERCPU_PTR(var, per_cpu_offset(cpu))) -#define __get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(var, my_cpu_offset)) -#define __raw_get_cpu_var(var) \ - (*SHIFT_PERCPU_PTR(var, __my_cpu_offset)) - -#else /* ! SMP */ - -#define per_cpu(var, cpu) (*((void)(cpu), &per_cpu_var(var))) -#define __get_cpu_var(var) per_cpu_var(var) -#define __raw_get_cpu_var(var) per_cpu_var(var) - -#endif /* SMP */ - -#ifdef CONFIG_SMP -#define PER_CPU_BASE_SECTION ".data.percpu" -#else -#define PER_CPU_BASE_SECTION ".data" +#if defined(MODULE) && defined(CONFIG_SMP) +#define ARCH_NEEDS_WEAK_PER_CPU #endif -#ifdef CONFIG_SMP - -#ifdef MODULE -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#else -#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned" -#endif -#define PER_CPU_FIRST_SECTION ".first" - -#else - -#define PER_CPU_SHARED_ALIGNED_SECTION "" -#define PER_CPU_FIRST_SECTION "" - -#endif - -#define PER_CPU_ATTRIBUTES +#include #endif /* __ALPHA_PERCPU_H */ diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h index 9d87aaa08c0d..e89e0c2e15b1 100644 --- a/arch/alpha/include/asm/tlbflush.h +++ b/arch/alpha/include/asm/tlbflush.h @@ -2,6 +2,7 @@ #define _ALPHA_TLBFLUSH_H #include +#include #include #include From 9a0ef2923abd2cc2c6f78d3663ac7af34c0220e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:53 +0900 Subject: [PATCH 0012/1662] s390: switch to dynamic percpu allocator 64bit s390 shares the same problem with alpha regarding percpu symbol addressing from modules. It needs assembly magic to force GOTENT reference when building module as the percpu address will be outside the usual 4G range from the module text. This can be solved by using weak percpu variable definitions. This patch makes s390 use weak definitions and switch to dynamic percpu allocator. Please note that weak attribute is not added if !SMP as percpu variables behave exactly the same as normal variables on UP. Compile tested. Generation of GOTENT reference verified. This patch is based on Ivan Kokshaysky's alpha percpu patch. [ Impact: use dynamic percpu allocator ] Signed-off-by: Tejun Heo Cc: Martin Schwidefsky Cc: Heiko Carstens --- arch/s390/Kconfig | 3 --- arch/s390/include/asm/percpu.h | 34 +++++++++------------------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f4a3cc62d28f..a14dba0e4d67 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -75,9 +75,6 @@ config VIRT_CPU_ACCOUNTING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y -config HAVE_LEGACY_PER_CPU_AREA - def_bool y - mainmenu "Linux Kernel Configuration" config S390 diff --git a/arch/s390/include/asm/percpu.h b/arch/s390/include/asm/percpu.h index 408d60b4f75b..f7ad8719d02d 100644 --- a/arch/s390/include/asm/percpu.h +++ b/arch/s390/include/asm/percpu.h @@ -1,37 +1,21 @@ #ifndef __ARCH_S390_PERCPU__ #define __ARCH_S390_PERCPU__ -#include -#include - /* * s390 uses its own implementation for per cpu data, the offset of * the cpu local data area is cached in the cpu's lowcore memory. - * For 64 bit module code s390 forces the use of a GOT slot for the - * address of the per cpu variable. This is needed because the module - * may be more than 4G above the per cpu area. */ -#if defined(__s390x__) && defined(MODULE) - -#define SHIFT_PERCPU_PTR(ptr,offset) (({ \ - extern int simple_identifier_##var(void); \ - unsigned long *__ptr; \ - asm ( "larl %0, %1@GOTENT" \ - : "=a" (__ptr) : "X" (ptr) ); \ - (typeof(ptr))((*__ptr) + (offset)); })) - -#else - -#define SHIFT_PERCPU_PTR(ptr, offset) (({ \ - extern int simple_identifier_##var(void); \ - unsigned long __ptr; \ - asm ( "" : "=a" (__ptr) : "0" (ptr) ); \ - (typeof(ptr)) (__ptr + (offset)); })) - -#endif - #define __my_cpu_offset S390_lowcore.percpu_offset +/* + * For 64 bit module code, the module may be more than 4G above the + * per cpu area, use weak definitions to force the compiler to + * generate external references. + */ +#if defined(CONFIG_SMP) && defined(__s390x__) && defined(MODULE) +#define ARCH_NEEDS_WEAK_PER_CPU +#endif + #include #endif /* __ARCH_S390_PERCPU__ */ From bf4bb2b1f285ec56e7f3cbf0190761b42131871c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 16:57:03 +0900 Subject: [PATCH 0013/1662] sparc64: fix build breakage introduced by percpu-convert-most patchset Commit e74e396204bfcb67570ba4517b08f5918e69afea incorrectly added HAVE_LEGACY_PER_CPU_AREA to sparc64 although it already has been converted to dynamic percpu allocator. Drop both HAVE_{LEGACY|DYNAMIC}_PER_CPU_AREA. Signed-off-by: Tejun Heo Acked-by: David Miller --- arch/sparc/Kconfig | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 7a8698b913fe..4f6ed0f113f0 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,15 +92,9 @@ config AUDIT_ARCH bool default y -config HAVE_LEGACY_PER_CPU_AREA - def_bool y if SPARC64 - config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 -config HAVE_DYNAMIC_PER_CPU_AREA - def_bool y if SPARC64 - config GENERIC_HARDIRQS_NO__DO_IRQ bool def_bool y if SPARC64 From 1a8dd307cc0a2119be4e578c517795464e6dabba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 29 Jun 2009 17:45:39 +0900 Subject: [PATCH 0014/1662] percpu: use __weak only in the definition of weak percpu variables __weak is necessary only for definition and might even not work in declaration. Drop it from declaration. This change was suggested by Ivan Kokshaysky. Signed-off-by: Tejun Heo Acked-by: Ivan Kokshaysky --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index cf32838ad0fa..9b7a53cc16eb 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -56,7 +56,7 @@ */ #define DECLARE_PER_CPU_SECTION(type, name, sec) \ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ - extern __PCPU_ATTRS(sec) __weak __typeof__(type) per_cpu__##name + extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name #define DEFINE_PER_CPU_SECTION(type, name, sec) \ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \ From 370f048214b4e9aa2102fa3c454ae1374da287c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Jul 2009 00:09:33 -0500 Subject: [PATCH 0015/1662] xfs: add more statics & drop some unused functions A lot more functions could be made static, but they need forward declarations; this does some easy ones, and also found a few unused functions in the process. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 15 ------------- fs/xfs/linux-2.6/xfs_sync.h | 1 - fs/xfs/xfs_ag.h | 3 --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_bmap.h | 11 ---------- fs/xfs/xfs_bmap_btree.c | 20 ++++++++--------- fs/xfs/xfs_bmap_btree.h | 1 - fs/xfs/xfs_btree.c | 42 +----------------------------------- fs/xfs/xfs_btree.h | 15 ------------- fs/xfs/xfs_inode.c | 8 +++---- fs/xfs/xfs_inode.h | 5 ----- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_itable.h | 5 ----- fs/xfs/xfs_log_priv.h | 2 -- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 3 --- fs/xfs/xfs_mru_cache.c | 29 ------------------------- fs/xfs/xfs_mru_cache.h | 1 - fs/xfs/xfs_rw.h | 6 ------ fs/xfs/xfs_vnodeops.c | 2 +- 23 files changed, 22 insertions(+), 159 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 36fb8a657c55..de5937872327 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -687,7 +687,7 @@ xfs_barrier_test( return error; } -void +STATIC void xfs_mountfs_check_barriers(xfs_mount_t *mp) { int error; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca43..fbf3e0288b34 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -740,21 +740,6 @@ __xfs_inode_clear_reclaim_tag( XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); } -void -xfs_inode_clear_reclaim_tag( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); - - read_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); -} - STATIC int xfs_reclaim_inode_now( struct xfs_inode *ip, diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c7..23e7e7e6e136 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,7 +48,6 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); -void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f24b50b68d03..af3cfeb18b0d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -91,9 +91,6 @@ typedef struct xfs_agf { #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) -extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); - /* * Size of the unlinked inode hash table in the agi. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 2cf944eb796d..6316004922d7 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist( /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +STATIC int /* error */ xfs_read_agf( struct xfs_mount *mp, /* mount point structure */ struct xfs_trans *tp, /* transaction pointer */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b9983c1d..975972482e8f 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3713,7 +3713,7 @@ xfs_bmap_local_to_extents( * entry (null if none). Else, *lastxp will be set to the index * of the found entry; *gotp will contain the entry. */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_multi_extents( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_fileoff_t bno, /* block number searched for */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 1b8ff9256bd0..56f62d2edc35 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -392,17 +392,6 @@ xfs_bmap_count_blocks( int whichfork, int *count); -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * -xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, - xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 5c1ade06578e..eb7b702d0690 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -202,16 +202,6 @@ xfs_bmbt_get_state( ext_flag); } -/* Endian flipping versions of the bmbt extraction functions */ -void -xfs_bmbt_disk_get_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), - get_unaligned_be64(&r->l1), s); -} - /* * Extract the blockcount field from an on disk bmap extent record. */ @@ -816,6 +806,16 @@ xfs_bmbt_trace_key( *l1 = 0; } +/* Endian flipping versions of the bmbt extraction functions */ +STATIC void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); +} + STATIC void xfs_bmbt_trace_record( struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e8df007615e..5549d495947f 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df99574829..cde5a2668293 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -645,46 +645,6 @@ xfs_btree_read_bufl( return 0; } -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for agno/agbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { - return error; - } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { - switch (refval) { - case XFS_ALLOC_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - break; - case XFS_INO_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); - break; - } - } - *bpp = bp; - return 0; -} - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. @@ -2951,7 +2911,7 @@ xfs_btree_insert( * inode we have to copy the single block it was pointing to into the * inode. */ -int +STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 4f852b735b96..7fa07062bdda 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -378,20 +378,6 @@ xfs_btree_read_bufl( struct xfs_buf **bpp, /* buffer for fsbno */ int refval);/* ref count value for buffer */ -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - struct xfs_buf **bpp, /* buffer for agno/agbno */ - int refval);/* ref count value for buffer */ - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. @@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); -int xfs_btree_kill_iroot(struct xfs_btree_cur *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65fed0a..2dcb3d781ae5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -641,7 +641,7 @@ xfs_iformat_btree( return 0; } -void +STATIC void xfs_dinode_from_disk( xfs_icdinode_t *to, xfs_dinode_t *from) @@ -1237,7 +1237,7 @@ xfs_isize_check( * In that case the pages will still be in memory, but the inode size * will never have been updated. */ -xfs_fsize_t +STATIC xfs_fsize_t xfs_file_last_byte( xfs_inode_t *ip) { @@ -3827,7 +3827,7 @@ xfs_iext_inline_to_direct( /* * Resize an extent indirection array to new_size bytes. */ -void +STATIC void xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ @@ -3852,7 +3852,7 @@ xfs_iext_realloc_indirect( /* * Switch from indirection array to linear (direct) extent allocations. */ -void +STATIC void xfs_iext_indirect_to_direct( xfs_ifork_t *ifp) /* inode fork pointer */ { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 77016702938b..a0b2da66df3c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -526,7 +526,6 @@ void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); -xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); @@ -594,8 +593,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, xfs_daddr_t, uint); -void xfs_dinode_from_disk(struct xfs_icdinode *, - struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -614,8 +611,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_realloc_indirect(xfs_ifork_t *, int); -void xfs_iext_indirect_to_direct(xfs_ifork_t *); void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_inline_to_direct(xfs_ifork_t *, int); void xfs_iext_destroy(xfs_ifork_t *); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index aeb2d2221c7d..c471122d2234 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -39,7 +39,7 @@ #include "xfs_error.h" #include "xfs_btree.h" -int +STATIC int xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino) diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1fb04e7deb61..20792bf45946 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -99,11 +99,6 @@ xfs_bulkstat_one( void *dibuff, int *stat); -int -xfs_internal_inum( - xfs_mount_t *mp, - xfs_ino_t ino); - typedef int (*inumbers_fmt_pf)( void __user *ubuffer, /* buffer to write to */ const xfs_inogrp_t *buffer, /* buffer to read from */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index bcad5f4c1fd1..679c7c4926a2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log, extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern void xlog_recover_process_iunlinks(xlog_t *log); - extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 47da2fb45377..1099395d7d6c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink( * freeing of the inode and its removal from the list must be * atomic. */ -void +STATIC void xlog_recover_process_iunlinks( xlog_t *log) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5c6f092659c1..8b6c9e807efb 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) * * The m_sb_lock must be held when this routine is called. */ -int +STATIC int xfs_mod_incore_sb_unlocked( xfs_mount_t *mp, xfs_sb_field_t field, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5122382afde..a6c023bc0fb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -414,13 +414,10 @@ typedef struct xfs_mod_sb { extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); -extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index afee7eb24323..4b0613d99faa 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -563,35 +563,6 @@ xfs_mru_cache_lookup( return elem ? elem->value : NULL; } -/* - * To look up an element using its key, but leave its location in the internal - * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this - * function returns NULL. - * - * See the comments above the declaration of the xfs_mru_cache_lookup() function - * for important locking information pertaining to this call. - */ -void * -xfs_mru_cache_peek( - xfs_mru_cache_t *mru, - unsigned long key) -{ - xfs_mru_cache_elem_t *elem; - - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - return NULL; - - spin_lock(&mru->lock); - elem = radix_tree_lookup(&mru->store, key); - if (!elem) - spin_unlock(&mru->lock); - else - __release(mru_lock); /* help sparse not be stupid */ - - return elem ? elem->value : NULL; -} - /* * To release the internal data structure spinlock after having performed an * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index dd58ea1bbebe..5d439f34b0c9 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f76c003ec55d..ae65f0df87da 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -78,10 +78,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, xfs_buf_t *bp, xfs_daddr_t blkno); -/* - * Prototypes for functions in xfs_vnodeops.c. - */ -extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - int flags); - #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5ed5dab..1dd706887156 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -716,7 +716,7 @@ xfs_fsync( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. */ -int +STATIC int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, From ab8b9baac3f48fb3fc3d47790950c0eec134e678 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Jul 2009 21:35:43 -0500 Subject: [PATCH 0016/1662] un-static xfs_read_agf CONFIG_XFS_DEBUG builds still need xfs_read_agf to be non-static, oops. Signed-off-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ag.h | 3 +++ fs/xfs/xfs_alloc.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index af3cfeb18b0d..f24b50b68d03 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -91,6 +91,9 @@ typedef struct xfs_agf { #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + /* * Size of the unlinked inode hash table in the agi. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 6316004922d7..2cf944eb796d 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist( /* * Read in the allocation group header (free/alloc section). */ -STATIC int /* error */ +int /* error */ xfs_read_agf( struct xfs_mount *mp, /* mount point structure */ struct xfs_trans *tp, /* transaction pointer */ From 79ba6ac825fac187894e236c9df1ba5fcbf53fd3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: [PATCH 0017/1662] x86: make pcpu_chunk_addr_search() matching stricter The @addr passed into pcpu_chunk_addr_search() is unit0 based address and thus should be matched inside unit0 area. Currently, when it uses chunk size when determining whether the address falls in the first chunk. Addresses in unitN where N>0 shouldn't be passed in anyway, so this doesn't cause any malfunction but fix it for consistency. [ Impact: mostly cleanup ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index b14984566f5a..19dd83b5cbdc 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -290,7 +290,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) void *first_start = pcpu_first_chunk->vm->addr; /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + if (addr >= first_start && addr < first_start + pcpu_unit_size) { /* is it in the reserved area? */ if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; From 788e5abc5441e9046dd91c995c6f1f75bbd144bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:58 +0900 Subject: [PATCH 0018/1662] percpu: drop @unit_size from embed first chunk allocator The only extra feature @unit_size provides is making dead space at the end of the first chunk which doesn't have any valid usecase. Drop the parameter. This will increase consistency with generalized 4k allocator. James Bottomley spotted missing conversion for the default setup_per_cpu_areas() which caused build breakage on all arcsh which use it. [ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 2 +- include/linux/percpu.h | 2 +- mm/percpu.c | 18 ++++++------------ 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 29a3eef7cf4a..14728206fb52 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -342,7 +342,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) return -EINVAL; return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, -1); + reserve - PERCPU_FIRST_CHUNK_RESERVE); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e5000343dd61..83bff053bd1c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -69,7 +69,7 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size); + ssize_t dyn_size); /* * Use this to get to a cpu's version of the per-cpu object diff --git a/mm/percpu.c b/mm/percpu.c index 19dd83b5cbdc..fc6babe6e554 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1207,7 +1207,6 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. @@ -1219,9 +1218,9 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * page size. * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned to the bootmem allocator. @@ -1231,7 +1230,7 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) * percpu access on success, -errno on failure. */ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) + ssize_t dyn_size) { size_t chunk_size; unsigned int cpu; @@ -1242,12 +1241,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, if (dyn_size != 0) dyn_size = pcpue_size - static_size - reserved_size; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - + pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, @@ -1304,7 +1298,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, -1); + PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); From d4b95f80399471e4bce5e992700ff7f06ef91f6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: [PATCH 0019/1662] x86,percpu: generalize 4k first chunk allocator Generalize and move x86 setup_pcpu_4k() into pcpu_4k_first_chunk(). setup_pcpu_4k() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free memory, pcpu_4k_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, s/pcpu_populate_pte_fn_t/pcpu_fc_populate_pte_fn_t/ for consistency. [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 78 ++++++++----------------------- include/linux/percpu.h | 12 ++++- mm/percpu.c | 85 +++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 62 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 14728206fb52..ab896b31e80b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -123,6 +123,19 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } +/* + * Helpers for first chunk memory allocation + */ +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +{ + return pcpu_alloc_bootmem(cpu, size, size); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + /* * Large page remap allocator * @@ -346,22 +359,11 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k page allocator + * 4k allocator * - * This is the basic allocator. Static percpu area is allocated - * page-by-page and most of initialization is done by the generic - * setup function. + * Boring fallback 4k allocator. This allocator puts more pressure on + * PTE TLBs but other than that behaves nicely on both UMA and NUMA. */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; - return NULL; -} - static void __init pcpu4k_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -369,51 +371,9 @@ static void __init pcpu4k_populate_pte(unsigned long addr) static ssize_t __init setup_pcpu_4k(size_t static_size) { - size_t pages_size; - unsigned int cpu; - int i, j; - ssize_t ret; - - pcpu4k_nr_static_pages = PFN_UP(static_size); - - /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() - * sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); - - /* allocate and copy */ - j = 0; - for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { - void *ptr; - - ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE); - if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); - goto enomem; - } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); - pcpu4k_pages[j++] = virt_to_page(ptr); - } - - /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); - - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, -1, - -1, NULL, pcpu4k_populate_pte); - goto out_free_ar; - -enomem: - while (--j >= 0) - free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE); - ret = -ENOMEM; -out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); - return ret; + return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpu4k_populate_pte); } /* for explicit first chunk allocator selection */ diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 83bff053bd1c..41b5bfab4195 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,18 +59,26 @@ extern void *pcpu_base_addr; typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); -typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); +typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + pcpu_fc_populate_pte_fn_t populate_pte_fn); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +extern ssize_t __init pcpu_4k_first_chunk( + size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index fc6babe6e554..27b0f40a3ea8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1037,7 +1037,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, ssize_t dyn_size, ssize_t unit_size, void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1270,6 +1270,89 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, pcpue_unit_size, pcpue_ptr, NULL); } +/* + * 4k page first chunk setup helper. + */ +static struct page **pcpu4k_pages __initdata; +static int pcpu4k_nr_static_pages __initdata; + +static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) +{ + if (pageno < pcpu4k_nr_static_pages) + return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + return NULL; +} + +/** + * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up embedded first percpu chunk and + * can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + size_t pages_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + pcpu4k_nr_static_pages = PFN_UP(static_size); + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + sizeof(pcpu4k_pages[0])); + pcpu4k_pages = alloc_bootmem(pages_size); + + /* allocate and copy */ + j = 0; + for_each_possible_cpu(cpu) + for (i = 0; i < pcpu4k_nr_static_pages; i++) { + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate " + "4k page for cpu%u\n", cpu); + goto enomem; + } + + memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); + pcpu4k_pages[j++] = virt_to_page(ptr); + } + + /* we're ready, commit */ + pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", + pcpu4k_nr_static_pages, static_size); + + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, + reserved_size, -1, + -1, NULL, populate_pte_fn); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + ret = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pcpu4k_pages), pages_size); + return ret; +} + /* * Generic percpu area setup. * From 8f05a6a65d944f2fed4eb384fb58aa8c8e5a9bab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: [PATCH 0020/1662] percpu: make 4k first chunk allocator map memory At first, percpu first chunk was always setup page-by-page by the generic code. To add other allocators, different parts of the generic initialization was made optional. Now we have three allocators - embed, remap and 4k. embed and remap fully handle allocation and mapping of the first chunk while 4k still depends on generic code for those. This makes the generic alloc/map paths specifci to 4k and makes the code unnecessary complicated with optional generic behaviors. This patch makes the 4k allocator to allocate and map memory directly instead of depending on the generic code. The only outside visible change is that now dynamic area in the first chunk is allocated up-front instead of on-demand. This doesn't make any meaningful difference as the area is minimal (usually less than a page, just enough to fill the alignment) on 4k allocator. Plus, dynamic area in the first chunk usually gets fully used anyway. This will allow simplification of pcpu_setpu_first_chunk() and removal of chunk->page array. [ Impact: no outside visible change other than up-front allocation of dyn area ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- mm/percpu.c | 71 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 17 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 27b0f40a3ea8..f3fe7bc7378f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -632,6 +632,13 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, pcpu_unmap(chunk, unmap_start, unmap_end, flush); } +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + /** * pcpu_map - map pages into a pcpu_chunk * @chunk: chunk of interest @@ -651,11 +658,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) WARN_ON(chunk->immutable); for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + pcpu_chunk_pagep(chunk, cpu, page_start), + page_end - page_start); if (err < 0) return err; } @@ -1274,12 +1279,12 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * 4k page first chunk setup helper. */ static struct page **pcpu4k_pages __initdata; -static int pcpu4k_nr_static_pages __initdata; +static int pcpu4k_unit_pages __initdata; static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) { - if (pageno < pcpu4k_nr_static_pages) - return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno]; + if (pageno < pcpu4k_unit_pages) + return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno]; return NULL; } @@ -1306,22 +1311,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { + static struct vm_struct vm; size_t pages_size; unsigned int cpu; int i, j; ssize_t ret; - pcpu4k_nr_static_pages = PFN_UP(static_size); + pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, + PCPU_MIN_UNIT_SIZE)); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus() * + pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * sizeof(pcpu4k_pages[0])); pcpu4k_pages = alloc_bootmem(pages_size); - /* allocate and copy */ + /* allocate pages */ j = 0; for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_nr_static_pages; i++) { + for (i = 0; i < pcpu4k_unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE); @@ -1330,18 +1337,48 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, "4k page for cpu%u\n", cpu); goto enomem; } - - memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE); pcpu4k_pages[j++] = virt_to_page(ptr); } + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; + vm_area_register_early(&vm, PAGE_SIZE); + + for_each_possible_cpu(cpu) { + unsigned long unit_addr = (unsigned long)vm.addr + + (cpu * pcpu4k_unit_pages << PAGE_SHIFT); + + for (i = 0; i < pcpu4k_unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + ret = __pcpu_map_pages(unit_addr, + &pcpu4k_pages[cpu * pcpu4k_unit_pages], + pcpu4k_unit_pages); + if (ret < 0) + panic("failed to map percpu area, err=%zd\n", ret); + + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, static_size); + } + /* we're ready, commit */ - pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", - pcpu4k_nr_static_pages, static_size); + pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", + pcpu4k_unit_pages, static_size); ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, reserved_size, -1, - -1, NULL, populate_pte_fn); + pcpu4k_unit_pages << PAGE_SHIFT, vm.addr, + NULL); goto out_free_ar; enomem: From 8c4bfc6e8801616ab2e01c38140b2159b388d2ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: [PATCH 0021/1662] x86,percpu: generalize lpage first chunk allocator Generalize and move x86 setup_pcpu_lpage() into pcpu_lpage_first_chunk(). setup_pcpu_lpage() now is a simple wrapper around the generalized version. Other than taking size parameters and using arch supplied callbacks to allocate/free/map memory, pcpu_lpage_first_chunk() is identical to the original implementation. This simplifies arch code and will help converting more archs to dynamic percpu allocator. While at it, factor out pcpu_calc_fc_sizes() which is common to pcpu_embed_first_chunk() and pcpu_lpage_first_chunk(). [ Impact: code reorganization and generalization ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/x86/include/asm/percpu.h | 9 -- arch/x86/kernel/setup_percpu.c | 169 ++------------------------ arch/x86/mm/pageattr.c | 1 + include/linux/percpu.h | 27 +++++ mm/percpu.c | 209 ++++++++++++++++++++++++++++++++- 5 files changed, 244 insertions(+), 171 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 103f1ddb0d85..a18c038a3079 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -156,15 +156,6 @@ do { \ /* We can use this directly for local CPU (faster). */ DECLARE_PER_CPU(unsigned long, this_cpu_off); -#ifdef CONFIG_NEED_MULTIPLE_NODES -void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ab896b31e80b..4f2e0ac9130b 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,44 +137,21 @@ static void __init pcpu_fc_free(void *ptr, size_t size) } /* - * Large page remap allocator - * - * This allocator uses PMD page as unit. A PMD page is allocated for - * each cpu and each is remapped into vmalloc area using PMD mapping. - * As PMD page is quite large, only part of it is used for the first - * chunk. Unused part is returned to the bootmem allocator. - * - * So, the PMD pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more PMD TLB entry pressure but still is much - * better than only using 4k mappings while still being NUMA friendly. + * Large page remapping allocator */ #ifdef CONFIG_NEED_MULTIPLE_NODES -struct pcpul_ent { - unsigned int cpu; - void *ptr; -}; - -static size_t pcpul_size; -static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; - -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +static void __init pcpul_map(void *ptr, size_t size, void *addr) { - size_t off = (size_t)pageno << PAGE_SHIFT; + pmd_t *pmd, pmd_v; - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); + pmd = populate_extra_pmd((unsigned long)addr); + pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); + set_pmd(pmd, pmd_v); } static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { - size_t map_size, dyn_size; - unsigned int cpu; - int i, j; - ssize_t ret; + size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; @@ -198,134 +175,10 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - if (pcpul_size > PMD_SIZE) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } - dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE; - - /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); - pcpul_map = alloc_bootmem(map_size); - - for_each_possible_cpu(cpu) { - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE, - PMD_SIZE); - if (!pcpul_map[cpu].ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The 2MB up-rounding bootmem is needed to make - * sure the partial 2MB page is still fully RAM - it's - * not well-specified to have a PAT-incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size), - PMD_SIZE - pcpul_size); - - memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size); - } - - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * PMD_SIZE; - vm_area_register_early(&pcpul_vm, PMD_SIZE); - - for_each_possible_cpu(cpu) { - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr + - cpu * PMD_SIZE); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)), - PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); - } - - /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); - - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpul_vm.addr, NULL); - - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - return ret; - -enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu PMD - * mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used PMD - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); - unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; - int left = 0, right = num_possible_cpus() - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < pmd_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > pmd_addr) - right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * PMD_SIZE + offset; - } - } - - return NULL; + return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PMD_SIZE, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1b734d7a8966..c106f7852424 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 41b5bfab4195..9f6bfd7d4b92 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, @@ -79,6 +80,32 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#ifdef CONFIG_NEED_MULTIPLE_NODES +extern ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn); + +extern void *pcpu_lpage_remapped(void *kaddr); +#else +static inline ssize_t __init pcpu_lpage_first_chunk( + size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + return -EINVAL; +} + +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} +#endif + /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index f3fe7bc7378f..17db527ee2e2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1190,6 +1190,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, return pcpu_unit_size; } +static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /* * Embedding first chunk setup helper. */ @@ -1241,10 +1254,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); chunk_size = pcpue_unit_size * num_possible_cpus(); @@ -1390,6 +1400,197 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, return ret; } +/* + * Large page remapping first chunk setup helper + */ +#ifdef CONFIG_NEED_MULTIPLE_NODES +struct pcpul_ent { + unsigned int cpu; + void *ptr; +}; + +static size_t pcpul_size; +static size_t pcpul_unit_size; +static struct pcpul_ent *pcpul_map; +static struct vm_struct pcpul_vm; + +static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) +{ + size_t off = (size_t)pageno << PAGE_SHIFT; + + if (off >= pcpul_size) + return NULL; + + return virt_to_page(pcpul_map[cpu].ptr + off); +} + +/** + * pcpu_lpage_first_chunk - remap the first percpu chunk using large page + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @lpage_size: the size of a large page + * @alloc_fn: function to allocate percpu lpage, always called with lpage_size + * @free_fn: function to free percpu memory, @size <= lpage_size + * @map_fn: function to map percpu lpage, always called with lpage_size + * + * This allocator uses large page as unit. A large page is allocated + * for each cpu and each is remapped into vmalloc area using large + * page mapping. As large page can be quite large, only part of it is + * used for the first chunk. Unused part is returned to the bootmem + * allocator. + * + * So, the large pages are mapped twice - once to the physical mapping + * and to the vmalloc area for the first percpu chunk. The double + * mapping does add one more large TLB entry pressure but still is + * much better than only using 4k mappings while still being NUMA + * friendly. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access on success, -errno on failure. + */ +ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, + ssize_t dyn_size, size_t lpage_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) +{ + size_t size_sum; + size_t map_size; + unsigned int cpu; + int i, j; + ssize_t ret; + + /* + * Currently supports only single page. Supporting multiple + * pages won't be too difficult if it ever becomes necessary. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + + pcpul_unit_size = lpage_size; + pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + if (pcpul_size > pcpul_unit_size) { + pr_warning("PERCPU: static data is larger than large page, " + "can't use large page\n"); + return -EINVAL; + } + + /* allocate pointer array and alloc large pages */ + map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + pcpul_map = alloc_bootmem(map_size); + + for_each_possible_cpu(cpu) { + void *ptr; + + ptr = alloc_fn(cpu, lpage_size); + if (!ptr) { + pr_warning("PERCPU: failed to allocate large page " + "for cpu%u\n", cpu); + goto enomem; + } + + /* + * Only use pcpul_size bytes and give back the rest. + * + * Ingo: The lpage_size up-rounding bootmem is needed + * to make sure the partial lpage is still fully RAM - + * it's not well-specified to have a incompatible area + * (unmapped RAM, device memory, etc.) in that hole. + */ + free_fn(ptr + pcpul_size, lpage_size - pcpul_size); + + pcpul_map[cpu].cpu = cpu; + pcpul_map[cpu].ptr = ptr; + + memcpy(ptr, __per_cpu_load, static_size); + } + + /* allocate address and map */ + pcpul_vm.flags = VM_ALLOC; + pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; + vm_area_register_early(&pcpul_vm, pcpul_unit_size); + + for_each_possible_cpu(cpu) + map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, + pcpul_vm.addr + cpu * pcpul_unit_size); + + /* we're ready, commit */ + pr_info("PERCPU: Remapped at %p with large pages, static data " + "%zu bytes\n", pcpul_vm.addr, static_size); + + ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, + reserved_size, dyn_size, pcpul_unit_size, + pcpul_vm.addr, NULL); + + /* sort pcpul_map array for pcpu_lpage_remapped() */ + for (i = 0; i < num_possible_cpus() - 1; i++) + for (j = i + 1; j < num_possible_cpus(); j++) + if (pcpul_map[i].ptr > pcpul_map[j].ptr) { + struct pcpul_ent tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + return ret; + +enomem: + for_each_possible_cpu(cpu) + if (pcpul_map[cpu].ptr) + free_fn(pcpul_map[cpu].ptr, pcpul_size); + free_bootmem(__pa(pcpul_map), map_size); + return -ENOMEM; +} + +/** + * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area + * @kaddr: the kernel address in question + * + * Determine whether @kaddr falls in the pcpul recycled area. This is + * used by pageattr to detect VM aliases and break up the pcpu large + * page mapping such that the same physical page is not mapped under + * different attributes. + * + * The recycled area is always at the tail of a partially used large + * page. + * + * RETURNS: + * Address of corresponding remapped pcpu address if match is found; + * otherwise, NULL. + */ +void *pcpu_lpage_remapped(void *kaddr) +{ + unsigned long unit_mask = pcpul_unit_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); + unsigned long offset = (unsigned long)kaddr & unit_mask; + int left = 0, right = num_possible_cpus() - 1; + int pos; + + /* pcpul in use at all? */ + if (!pcpul_map) + return NULL; + + /* okay, perform binary search */ + while (left <= right) { + pos = (left + right) / 2; + + if (pcpul_map[pos].ptr < lpage_addr) + left = pos + 1; + else if (pcpul_map[pos].ptr > lpage_addr) + right = pos - 1; + else { + /* it shouldn't be in the area for the first chunk */ + WARN_ON(offset < pcpul_size); + + return pcpul_vm.addr + + pcpul_map[pos].cpu * pcpul_unit_size + offset; + } + } + + return NULL; +} +#endif + /* * Generic percpu area setup. * From 38a6be525460f52ac6f2de1c3f73c5615a8853cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: [PATCH 0022/1662] percpu: simplify pcpu_setup_first_chunk() Now that all first chunk allocator helpers allocate and map the first chunk themselves, there's no need to have optional default alloc/map in pcpu_setup_first_chunk(). Drop @populate_pte_fn and only leave @dyn_size optional and make all other params mandatory. This makes it much easier to follow what pcpu_setup_first_chunk() is doing and what actual differences tweaking each parameter results in. [ Impact: drop unused code path ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 5 +- mm/percpu.c | 104 +++++++++++-------------------------- 3 files changed, 33 insertions(+), 78 deletions(-) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index fa44eaf8d897..ccad7b20ae75 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1528,7 +1528,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + PCPU_CHUNK_SIZE, vm.addr); free_bootmem(__pa(pcpur_ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9f6bfd7d4b92..ec64357e1762 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -66,9 +66,8 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + ssize_t dyn_size, size_t unit_size, + void *base_addr); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 17db527ee2e2..21d938a10662 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -983,24 +983,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. + * setup path. * * @get_page_fn() should return pointer to percpu page given cpu * number and page number. It should at least return enough pages to * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. + * have been initialized with valid data. It can also return pages + * after the static area. NULL return indicates end of pages for the + * cpu. Note that @get_page_fn() must return the same number of pages + * for all cpus. * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves @@ -1015,17 +1013,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * non-negative value makes percpu leave alone the area beyond * @static_size + @reserved_size + @dyn_size. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. + * @unit_size specifies unit size and must be aligned to PAGE_SIZE and + * equal to or larger than @static_size + @reserved_size + if + * non-negative, @dyn_size. * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. - * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1040,9 +1033,8 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_fc_populate_pte_fn_t populate_pte_fn) + ssize_t dyn_size, size_t unit_size, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; @@ -1050,27 +1042,18 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; - int nr_pages; - int err, i; + int i, nr_pages; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + BUG_ON(!base_addr); + BUG_ON(unit_size < size_sum); + BUG_ON(unit_size & ~PAGE_MASK); + BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) @@ -1079,6 +1062,10 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; + first_vm.flags = VM_ALLOC; + first_vm.size = pcpu_chunk_size; + first_vm.addr = base_addr; + /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -1101,6 +1088,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; + schunk->immutable = true; if (reserved_size) { schunk->free_size = reserved_size; @@ -1124,31 +1112,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - /* assign pages */ nr_pages = -1; for_each_possible_cpu(cpu) { @@ -1168,19 +1138,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, BUG_ON(nr_pages != i); } - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1282,7 +1239,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(pcpue_get_page, static_size, reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); + pcpue_unit_size, pcpue_ptr); } /* @@ -1387,8 +1344,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr, - NULL); + pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: @@ -1521,7 +1477,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr, NULL); + pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) From c8a51be4cabb7009db5f865169389242d49c4c60 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:10:59 +0900 Subject: [PATCH 0023/1662] percpu: reorder a few functions in mm/percpu.c (de)populate functions are about to be reimplemented to drop pcpu_chunk->page array. Move a few functions so that the rewrite patch doesn't have code movement making it more difficult to read. [ Impact: code movement ] Signed-off-by: Tejun Heo Cc: Ingo Molnar --- mm/percpu.c | 90 ++++++++++++++++++++++++++--------------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 21d938a10662..639fce4d2caf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -181,12 +181,6 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) return cpu * pcpu_unit_pages + page_idx; } -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} - static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { @@ -194,6 +188,12 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); } +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, int page_idx) { @@ -583,6 +583,45 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, pcpu_chunk_addr(chunk, last, page_end)); } +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + /* map must not be done on immutable chunk */ + WARN_ON(chunk->immutable); + + for_each_possible_cpu(cpu) { + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + pcpu_chunk_pagep(chunk, cpu, page_start), + page_end - page_start); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + /** * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk * @chunk: chunk to depopulate @@ -632,45 +671,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, pcpu_unmap(chunk, unmap_start, unmap_end, flush); } -static int __pcpu_map_pages(unsigned long addr, struct page **pages, - int nr_pages) -{ - return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, - PAGE_KERNEL, pages); -} - -/** - * pcpu_map - map pages into a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to map - * @page_end: page index of the last page to map + 1 - * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. - */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - for_each_possible_cpu(cpu) { - err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), - pcpu_chunk_pagep(chunk, cpu, page_start), - page_end - page_start); - if (err < 0) - return err; - } - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - return 0; -} - /** * pcpu_populate_chunk - populate and map an area of a pcpu_chunk * @chunk: chunk of interest From ce3141a277ff6cc37e51008b8888dc2cb7456ef1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: [PATCH 0024/1662] percpu: drop pcpu_chunk->page[] percpu core doesn't need to tack all the allocated pages. It needs to know whether certain pages are populated and a way to reverse map address to page when freeing. This patch drops pcpu_chunk->page[] and use populated bitmap and vmalloc_to_page() lookup instead. Using vmalloc_to_page() exclusively is also possible but complicates first chunk handling, inflates cache footprint and prevents non-standard memory allocation for percpu memory. pcpu_chunk->page[] was used to track each page's allocation and allowed asymmetric population which happens during failure path; however, with single bitmap for all units, this is no longer possible. Bite the bullet and rewrite (de)populate functions so that things are done in clearly separated steps such that asymmetric population doesn't happen. This makes the (de)population process much more modular and will also ease implementing non-standard memory usage in the future (e.g. large pages). This makes @get_page_fn parameter to pcpu_setup_first_chunk() unnecessary. The parameter is dropped and all first chunk helpers are updated accordingly. Please note that despite the volume most changes to first chunk helpers are symbol renames for variables which don't need to be referenced outside of the helper anymore. This change reduces memory usage and cache footprint of pcpu_chunk. Now only #unit_pages bits are necessary per chunk. [ Impact: reduced memory usage and cache footprint for bookkeeping ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 42 +-- include/linux/percpu.h | 3 +- mm/percpu.c | 618 +++++++++++++++++++++++-------------- 3 files changed, 407 insertions(+), 256 deletions(-) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ccad7b20ae75..f2f22ee97a7a 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1415,19 +1415,6 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } -static size_t pcpur_size __initdata; -static void **pcpur_ptrs __initdata; - -static struct page * __init pcpur_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpur_size) - return NULL; - - return virt_to_page(pcpur_ptrs[cpu] + off); -} - #define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) static void __init pcpu_map_range(unsigned long start, unsigned long end, @@ -1491,25 +1478,26 @@ void __init setup_per_cpu_areas(void) size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; unsigned long delta, cpu; - size_t pcpu_unit_size; + size_t size_sum, pcpu_unit_size; size_t ptrs_size; + void **ptrs; - pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = pcpur_size - static_size - PERCPU_MODULE_RESERVE; + size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE); + dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; - ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0])); - pcpur_ptrs = alloc_bootmem(ptrs_size); + ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(ptrs[0])); + ptrs = alloc_bootmem(ptrs_size); for_each_possible_cpu(cpu) { - pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); + ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, + PCPU_CHUNK_SIZE); - free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size), - PCPU_CHUNK_SIZE - pcpur_size); + free_bootmem(__pa(ptrs[cpu] + size_sum), + PCPU_CHUNK_SIZE - size_sum); - memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, static_size); } /* allocate address and map */ @@ -1523,14 +1511,14 @@ void __init setup_per_cpu_areas(void) start += cpu * PCPU_CHUNK_SIZE; end = start + PCPU_CHUNK_SIZE; - pcpu_map_range(start, end, virt_to_page(pcpur_ptrs[cpu])); + pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(pcpur_get_page, static_size, + pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, PCPU_CHUNK_SIZE, vm.addr); - free_bootmem(__pa(pcpur_ptrs), ptrs_size); + free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ec64357e1762..63c8b7a23e66 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -58,13 +58,12 @@ extern void *pcpu_base_addr; -typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, +extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr); diff --git a/mm/percpu.c b/mm/percpu.c index 639fce4d2caf..21756814d99f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,8 +94,7 @@ struct pcpu_chunk { int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; @@ -129,9 +128,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -188,16 +187,13 @@ static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); } -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; -} + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) -{ - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -212,6 +208,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -545,42 +569,197 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush_tlb: whether to flush tlb or not + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush_tlb) +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) +{ + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = num_possible_cpus() * pcpu_unit_pages * + sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } + + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + + *bitmapp = bitmap; + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). + */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) { unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); +} - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} - /* ditto as flush_cache_vunmap() */ - if (flush_tlb) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; + + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } + + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -591,35 +770,76 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), - pcpu_chunk_pagep(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], page_end - page_start); if (err < 0) - return err; + goto err; } + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); + } + + return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + /* flush at once, please read comments in pcpu_unmap() */ flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), pcpu_chunk_addr(chunk, last, page_end)); - return 0; } /** @@ -636,39 +856,45 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) * CONTEXT: * pcpu_alloc_mutex. */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) { int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; + struct page **pages; + unsigned long *populated; + int rs, re; - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - if (!*pagep) - continue; - - __free_page(*pagep); - - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; - - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; - } + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; } - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -685,50 +911,61 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } - - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; - - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; + + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; + } + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, size); - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) @@ -752,7 +989,6 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); if (!chunk->vm) { @@ -933,7 +1169,7 @@ static void pcpu_reclaim(struct work_struct *work) mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } } @@ -981,7 +1217,6 @@ EXPORT_SYMBOL_GPL(free_percpu); /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none * @dyn_size: free size for dynamic allocation in bytes, -1 for auto @@ -992,14 +1227,6 @@ EXPORT_SYMBOL_GPL(free_percpu); * perpcu area. This function is to be called from arch percpu area * setup path. * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. It can also return pages - * after the static area. NULL return indicates end of pages for the - * cpu. Note that @get_page_fn() must return the same number of pages - * for all cpus. - * * @reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved @@ -1031,8 +1258,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, +size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, void *base_addr) { @@ -1041,8 +1267,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu; - int i, nr_pages; + int i; /* santiy checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || @@ -1056,8 +1281,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1087,8 +1312,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); if (reserved_size) { schunk->free_size = reserved_size; @@ -1106,38 +1331,19 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); dchunk->vm = &first_vm; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -1160,23 +1366,6 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; - -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpue_size) - return NULL; - - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); -} - /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1207,18 +1396,19 @@ static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size) { - size_t chunk_size; + size_t size_sum, unit_size, chunk_size; + void *base; unsigned int cpu; /* determine parameters and allocate */ - pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + chunk_size = unit_size * num_possible_cpus(); - pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) { + base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS)); + if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); return -ENOMEM; @@ -1226,33 +1416,18 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, /* return the leftover and copy */ for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + void *ptr = base + cpu * unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); + free_bootmem(__pa(ptr + size_sum), unit_size - size_sum); memcpy(ptr, __per_cpu_load, static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + size_sum >> PAGE_SHIFT, base, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr); -} - -/* - * 4k page first chunk setup helper. - */ -static struct page **pcpu4k_pages __initdata; -static int pcpu4k_unit_pages __initdata; - -static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno) -{ - if (pageno < pcpu4k_unit_pages) - return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno]; - return NULL; + return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, base); } /** @@ -1279,23 +1454,25 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + int unit_pages; size_t pages_size; + struct page **pages; unsigned int cpu; int i, j; ssize_t ret; - pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, - PCPU_MIN_UNIT_SIZE)); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, + PCPU_MIN_UNIT_SIZE)); /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() * - sizeof(pcpu4k_pages[0])); - pcpu4k_pages = alloc_bootmem(pages_size); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; for_each_possible_cpu(cpu) - for (i = 0; i < pcpu4k_unit_pages; i++) { + for (i = 0; i < unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE); @@ -1304,25 +1481,24 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, "4k page for cpu%u\n", cpu); goto enomem; } - pcpu4k_pages[j++] = virt_to_page(ptr); + pages[j++] = virt_to_page(ptr); } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT; + vm.size = num_possible_cpus() * unit_pages << PAGE_SHIFT; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { unsigned long unit_addr = (unsigned long)vm.addr + - (cpu * pcpu4k_unit_pages << PAGE_SHIFT); + (cpu * unit_pages << PAGE_SHIFT); - for (i = 0; i < pcpu4k_unit_pages; i++) + for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, - &pcpu4k_pages[cpu * pcpu4k_unit_pages], - pcpu4k_unit_pages); + ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1340,19 +1516,18 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, /* we're ready, commit */ pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", - pcpu4k_unit_pages, static_size); + unit_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, - reserved_size, -1, - pcpu4k_unit_pages << PAGE_SHIFT, vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, + unit_pages << PAGE_SHIFT, vm.addr); goto out_free_ar; enomem: while (--j >= 0) - free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE); + free_fn(page_address(pages[j]), PAGE_SIZE); ret = -ENOMEM; out_free_ar: - free_bootmem(__pa(pcpu4k_pages), pages_size); + free_bootmem(__pa(pages), pages_size); return ret; } @@ -1370,16 +1545,6 @@ static size_t pcpul_unit_size; static struct pcpul_ent *pcpul_map; static struct vm_struct pcpul_vm; -static struct page * __init pcpul_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; - - if (off >= pcpul_size) - return NULL; - - return virt_to_page(pcpul_map[cpu].ptr + off); -} - /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes @@ -1475,9 +1640,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", pcpul_vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpul_get_page, static_size, - reserved_size, dyn_size, pcpul_unit_size, - pcpul_vm.addr); + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + pcpul_unit_size, pcpul_vm.addr); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) From 2f39e637ea240efb74cf807d31c93a71a0b89174 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: [PATCH 0025/1662] percpu: allow non-linear / sparse cpu -> unit mapping Currently cpu and unit are always identity mapped. To allow more efficient large page support on NUMA and lazy allocation for possible but offline cpus, cpu -> unit mapping needs to be non-linear and/or sparse. This can be easily implemented by adding a cpu -> unit mapping array and using it whenever looking up the matching unit for a cpu. The only unusal conversion is in pcpu_chunk_addr_search(). The passed in address is unit0 based and unit0 might not be in use so it needs to be converted to address of an in-use unit. This is easily done by adding the unit offset for the current processor. [ Impact: allows non-linear/sparse cpu -> unit mapping, no visible change yet ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 2 +- include/linux/percpu.h | 3 +- mm/percpu.c | 129 +++++++++++++++++++++++++++---------- 3 files changed, 97 insertions(+), 37 deletions(-) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index f2f22ee97a7a..6970333b48b8 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1516,7 +1516,7 @@ void __init setup_per_cpu_areas(void) pcpu_unit_size = pcpu_setup_first_chunk(static_size, PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr); + PCPU_CHUNK_SIZE, vm.addr, NULL); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 63c8b7a23e66..1e0e8878dc2a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,6 +57,7 @@ #endif extern void *pcpu_base_addr; +extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); @@ -66,7 +67,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr); + void *base_addr, const int *unit_map); extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, diff --git a/mm/percpu.c b/mm/percpu.c index 21756814d99f..2196fae24f00 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -99,14 +102,22 @@ struct pcpu_chunk { static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; +static int pcpu_nr_units __read_mostly; static int pcpu_chunk_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* cpu -> unit map */ +const int *pcpu_unit_map __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -177,7 +188,7 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, @@ -321,6 +332,14 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -593,8 +612,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, { static struct page **pages; static unsigned long *bitmap; - size_t pages_size = num_possible_cpus() * pcpu_unit_pages * - sizeof(pages[0]); + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -692,10 +710,9 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) @@ -756,10 +773,9 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } static int __pcpu_map_pages(unsigned long addr, struct page **pages, @@ -835,11 +851,9 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, static void pcpu_post_map_flush(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); } /** @@ -953,8 +967,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) bitmap_copy(chunk->populated, populated, pcpu_unit_pages); clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; err_unmap: @@ -1088,6 +1101,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) mutex_unlock(&pcpu_alloc_mutex); + /* return address relative to unit0 */ return __addr_to_pcpu_ptr(chunk->vm->addr + off); fail_unlock: @@ -1222,6 +1236,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address + * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area @@ -1260,16 +1275,17 @@ EXPORT_SYMBOL_GPL(free_percpu); */ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, ssize_t dyn_size, size_t unit_size, - void *base_addr) + void *base_addr, const int *unit_map) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t size_sum = static_size + reserved_size + (dyn_size >= 0 ? dyn_size : 0); struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned int cpu, tcpu; int i; - /* santiy checks */ + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); @@ -1278,9 +1294,52 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, BUG_ON(unit_size & ~PAGE_MASK); BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + /* determine number of units and verify and initialize pcpu_unit_map */ + if (unit_map) { + int first_unit = INT_MAX, last_unit = INT_MIN; + + for_each_possible_cpu(cpu) { + int unit = unit_map[cpu]; + + BUG_ON(unit < 0); + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + /* the mapping should be one-to-one */ + BUG_ON(unit_map[tcpu] == unit); + } + + if (unit < first_unit) { + pcpu_first_unit_cpu = cpu; + first_unit = unit; + } + if (unit > last_unit) { + pcpu_last_unit_cpu = cpu; + last_unit = unit; + } + } + pcpu_nr_units = last_unit + 1; + pcpu_unit_map = unit_map; + } else { + int *identity_map; + + /* #units == #cpus, identity mapped */ + identity_map = alloc_bootmem(num_possible_cpus() * + sizeof(identity_map[0])); + + for_each_possible_cpu(cpu) + identity_map[cpu] = cpu; + + pcpu_first_unit_cpu = 0; + pcpu_last_unit_cpu = pcpu_nr_units - 1; + pcpu_nr_units = num_possible_cpus(); + pcpu_unit_map = identity_map; + } + + /* determine basic parameters */ pcpu_unit_pages = unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); @@ -1349,7 +1408,7 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); + pcpu_base_addr = schunk->vm->addr; return pcpu_unit_size; } @@ -1427,7 +1486,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, size_sum >> PAGE_SHIFT, base, static_size); return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base); + unit_size, base, NULL); } /** @@ -1519,7 +1578,7 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, unit_pages, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, - unit_pages << PAGE_SHIFT, vm.addr); + unit_pages << PAGE_SHIFT, vm.addr, NULL); goto out_free_ar; enomem: @@ -1641,7 +1700,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, "%zu bytes\n", pcpul_vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr); + pcpul_unit_size, pcpul_vm.addr, NULL); /* sort pcpul_map array for pcpu_lpage_remapped() */ for (i = 0; i < num_possible_cpus() - 1; i++) From a530b7958612bafe2027e21359083dba84f0b3b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 4 Jul 2009 08:11:00 +0900 Subject: [PATCH 0026/1662] percpu: teach large page allocator about NUMA Large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into separate large page just to return most of the allocated space back wasting large amount of vmalloc space and increasing cache footprint. This patch teachs NUMA details to large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() will find fitting cpu -> unit mapping in which cpus in LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted. This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk. For example, on 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code only uses 4MB - one 2MB page for each node. [ Impact: much better space efficiency on NUMA machines ] Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Jan Beulich Cc: Andi Kleen Cc: David Miller --- arch/x86/kernel/setup_percpu.c | 78 ++++++-- include/linux/percpu.h | 24 ++- mm/percpu.c | 356 ++++++++++++++++++++++++++------- 3 files changed, 361 insertions(+), 97 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4f2e0ac9130b..7501bb14bd51 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -149,25 +149,26 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr) set_pmd(pmd, pmd_v); } +static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) +{ + if (early_cpu_to_node(from) == early_cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; + size_t unit_map_size, unit_size; + int *unit_map; + int nr_units; + ssize_t ret; - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = num_possible_cpus() * PMD_SIZE; - - /* on non-NUMA, embedding is better */ - if (!pcpu_need_numa()) - return -EINVAL; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - return -EINVAL; - } - } + /* on non-NUMA, embedding is better */ + if (!chosen && !pcpu_need_numa()) + return -EINVAL; /* need PSE */ if (!cpu_has_pse) { @@ -175,10 +176,46 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -EINVAL; } - return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PMD_SIZE, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + /* allocate and build unit_map */ + unit_map_size = num_possible_cpus() * sizeof(int); + unit_map = alloc_bootmem_nopanic(unit_map_size); + if (!unit_map) { + pr_warning("PERCPU: failed to allocate unit_map\n"); + return -ENOMEM; + } + + ret = pcpu_lpage_build_unit_map(static_size, + PERCPU_FIRST_CHUNK_RESERVE, + &dyn_size, &unit_size, PMD_SIZE, + unit_map, pcpu_lpage_cpu_distance); + if (ret < 0) { + pr_warning("PERCPU: failed to build unit_map\n"); + goto out_free; + } + nr_units = ret; + + /* do the parameters look okay? */ + if (!chosen) { + size_t vm_size = VMALLOC_END - VMALLOC_START; + size_t tot_size = nr_units * unit_size; + + /* don't consume more than 20% of vmalloc area */ + if (tot_size > vm_size / 5) { + pr_info("PERCPU: too large chunk size %zuMB for " + "large page remap\n", tot_size >> 20); + ret = -EINVAL; + goto out_free; + } + } + + ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, unit_size, PMD_SIZE, + unit_map, nr_units, + pcpu_fc_alloc, pcpu_fc_free, pcpul_map); +out_free: + if (ret < 0) + free_bootmem(__pa(unit_map), unit_map_size); + return ret; } #else static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) @@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void) /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; + per_cpu_offset(cpu) = + delta + pcpu_unit_map[cpu] * pcpu_unit_size; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1e0e8878dc2a..8ce91af4aa19 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -62,6 +62,7 @@ extern const int *pcpu_unit_map; typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); +typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( @@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk( pcpu_fc_populate_pte_fn_t populate_pte_fn); #ifdef CONFIG_NEED_MULTIPLE_NODES +extern int __init pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + extern ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); extern void *pcpu_lpage_remapped(void *kaddr); #else +static inline int pcpu_lpage_build_unit_map( + size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + return -EINVAL; +} + static inline ssize_t __init pcpu_lpage_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) diff --git a/mm/percpu.c b/mm/percpu.c index 2196fae24f00..b3d0bcff8c7c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -1594,75 +1595,259 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, * Large page remapping first chunk setup helper */ #ifdef CONFIG_NEED_MULTIPLE_NODES + +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, + ssize_t *dyn_sizep, size_t *unit_sizep, + size_t lpage_size, int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + struct pcpul_ent { - unsigned int cpu; void *ptr; + void *map_addr; }; static size_t pcpul_size; -static size_t pcpul_unit_size; +static size_t pcpul_lpage_size; +static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static struct vm_struct pcpul_vm; + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + if (unit_map[cpu] == unit) { + if (cpup) + *cpup = cpu; + return true; + } + + return false; +} + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes + * @unit_size: unit size in bytes * @lpage_size: the size of a large page + * @unit_map: cpu -> unit mapping + * @nr_units: the number of units * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * - * This allocator uses large page as unit. A large page is allocated - * for each cpu and each is remapped into vmalloc area using large - * page mapping. As large page can be quite large, only part of it is - * used for the first chunk. Unused part is returned to the bootmem - * allocator. + * This allocator uses large page to build and map the first chunk. + * Unlike other helpers, the caller should always specify @dyn_size + * and @unit_size. These parameters along with @unit_map and + * @nr_units can be determined using pcpu_lpage_build_unit_map(). + * This two stage initialization is to allow arch code to evaluate the + * parameters before committing to it. * - * So, the large pages are mapped twice - once to the physical mapping - * and to the vmalloc area for the first percpu chunk. The double - * mapping does add one more large TLB entry pressure but still is - * much better than only using 4k mappings while still being NUMA - * friendly. + * Large pages are allocated as directed by @unit_map and other + * parameters and mapped to vmalloc space. Unused holes are returned + * to the page allocator. Note that these holes end up being actively + * mapped twice - once to the physical mapping and to the vmalloc area + * for the first percpu chunk. Depending on architecture, this might + * cause problem when changing page attributes of the returned area. + * These double mapped areas can be detected using + * pcpu_lpage_remapped(). * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t lpage_size, + size_t dyn_size, size_t unit_size, + size_t lpage_size, const int *unit_map, + int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { - size_t size_sum; + static struct vm_struct vm; + size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; - int i, j; ssize_t ret; + int i, j, unit; - /* - * Currently supports only single page. Supporting multiple - * pages won't be too difficult if it ever becomes necessary. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, + unit_size, lpage_size, unit_map, nr_units); - pcpul_unit_size = lpage_size; - pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - if (pcpul_size > pcpul_unit_size) { - pr_warning("PERCPU: static data is larger than large page, " - "can't use large page\n"); - return -EINVAL; - } + BUG_ON(chunk_size % lpage_size); + + pcpul_size = static_size + reserved_size + dyn_size; + pcpul_lpage_size = lpage_size; + pcpul_nr_lpages = chunk_size / lpage_size; /* allocate pointer array and alloc large pages */ - map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); + map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); pcpul_map = alloc_bootmem(map_size); - for_each_possible_cpu(cpu) { + /* allocate all pages */ + for (i = 0; i < pcpul_nr_lpages; i++) { + size_t offset = i * lpage_size; + int first_unit = offset / unit_size; + int last_unit = (offset + lpage_size - 1) / unit_size; void *ptr; + /* find out which cpu is mapped to this unit */ + for (unit = first_unit; unit <= last_unit; unit++) + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + goto found; + continue; + found: ptr = alloc_fn(cpu, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " @@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, goto enomem; } - /* - * Only use pcpul_size bytes and give back the rest. - * - * Ingo: The lpage_size up-rounding bootmem is needed - * to make sure the partial lpage is still fully RAM - - * it's not well-specified to have a incompatible area - * (unmapped RAM, device memory, etc.) in that hole. - */ - free_fn(ptr + pcpul_size, lpage_size - pcpul_size); - - pcpul_map[cpu].cpu = cpu; - pcpul_map[cpu].ptr = ptr; - - memcpy(ptr, __per_cpu_load, static_size); + pcpul_map[i].ptr = ptr; } - /* allocate address and map */ - pcpul_vm.flags = VM_ALLOC; - pcpul_vm.size = num_possible_cpus() * pcpul_unit_size; - vm_area_register_early(&pcpul_vm, pcpul_unit_size); + /* return unused holes */ + for (unit = 0; unit < nr_units; unit++) { + size_t start = unit * unit_size; + size_t end = start + unit_size; + size_t off, next; + + /* don't free used part of occupied unit */ + if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + start += pcpul_size; + + /* unit can span more than one page, punch the holes */ + for (off = start; off < end; off = next) { + void *ptr = pcpul_map[off / lpage_size].ptr; + next = min(roundup(off + 1, lpage_size), end); + if (ptr) + free_fn(ptr + off % lpage_size, next - off); + } + } + + /* allocate address, map and copy */ + vm.flags = VM_ALLOC; + vm.size = chunk_size; + vm_area_register_early(&vm, unit_size); + + for (i = 0; i < pcpul_nr_lpages; i++) { + if (!pcpul_map[i].ptr) + continue; + pcpul_map[i].map_addr = vm.addr + i * lpage_size; + map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); + } for_each_possible_cpu(cpu) - map_fn(pcpul_map[cpu].ptr, pcpul_unit_size, - pcpul_vm.addr + cpu * pcpul_unit_size); + memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, + static_size); /* we're ready, commit */ pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", pcpul_vm.addr, static_size); + "%zu bytes\n", vm.addr, static_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - pcpul_unit_size, pcpul_vm.addr, NULL); + unit_size, vm.addr, unit_map); - /* sort pcpul_map array for pcpu_lpage_remapped() */ - for (i = 0; i < num_possible_cpus() - 1; i++) - for (j = i + 1; j < num_possible_cpus(); j++) - if (pcpul_map[i].ptr > pcpul_map[j].ptr) { - struct pcpul_ent tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } + /* + * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped + * lpages are pushed to the end and trimmed. + */ + for (i = 0; i < pcpul_nr_lpages - 1; i++) + for (j = i + 1; j < pcpul_nr_lpages; j++) { + struct pcpul_ent tmp; + + if (!pcpul_map[j].ptr) + continue; + if (pcpul_map[i].ptr && + pcpul_map[i].ptr < pcpul_map[j].ptr) + continue; + + tmp = pcpul_map[i]; + pcpul_map[i] = pcpul_map[j]; + pcpul_map[j] = tmp; + } + + while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) + pcpul_nr_lpages--; return ret; enomem: - for_each_possible_cpu(cpu) - if (pcpul_map[cpu].ptr) - free_fn(pcpul_map[cpu].ptr, pcpul_size); + for (i = 0; i < pcpul_nr_lpages; i++) + if (pcpul_map[i].ptr) + free_fn(pcpul_map[i].ptr, lpage_size); free_bootmem(__pa(pcpul_map), map_size); return -ENOMEM; } @@ -1739,10 +1950,10 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, */ void *pcpu_lpage_remapped(void *kaddr) { - unsigned long unit_mask = pcpul_unit_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask); - unsigned long offset = (unsigned long)kaddr & unit_mask; - int left = 0, right = num_possible_cpus() - 1; + unsigned long lpage_mask = pcpul_lpage_size - 1; + void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); + unsigned long offset = (unsigned long)kaddr & lpage_mask; + int left = 0, right = pcpul_nr_lpages - 1; int pos; /* pcpul in use at all? */ @@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr) left = pos + 1; else if (pcpul_map[pos].ptr > lpage_addr) right = pos - 1; - else { - /* it shouldn't be in the area for the first chunk */ - WARN_ON(offset < pcpul_size); - - return pcpul_vm.addr + - pcpul_map[pos].cpu * pcpul_unit_size + offset; - } + else + return pcpul_map[pos].map_addr + offset; } return NULL; From b56063453881a6d94cf5718c6769de6e35e67753 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 5 Jul 2009 12:23:35 -0500 Subject: [PATCH 0027/1662] xfs: remove XFS_INO64_OFFSET Commit a19d9f887d81106d52cacbc9930207b487e07e0e removed the ino64 option but left the XFS_INO64_OFFSET define it used in place - just remove it. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inum.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index 7a28191cb0de..b8e4ee4e89a4 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -72,7 +72,6 @@ struct xfs_mount; #if XFS_BIG_INUMS #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) #else #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) #endif From 1dcdd0911b5553f0282ce8525773955b59a56919 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Thu, 9 Jul 2009 11:27:40 +0900 Subject: [PATCH 0028/1662] microblaze: include EXIT_TEXT to _stext Microblaze wants to throw out EXIT_TEXT during runtime too. This hasn't caused trouble till now because the linker script didn't discard EXIT_TEXT and it ended up in its default output section. As discard definition is about to be unified, include EXIT_TEXT into _stext explicitly and while at it replace explicit exitcall definition to EXIT_CALL. Signed-off-by: Michal Simek Signed-off-by: Tejun Heo --- arch/microblaze/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index a207543c5927..81bebdcb18fe 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -23,8 +23,8 @@ SECTIONS { _stext = . ; *(.text .text.*) *(.fixup) - - *(.exitcall.exit) + EXIT_TEXT + EXIT_CALL SCHED_TEXT LOCK_TEXT KPROBES_TEXT From 023bf6f1b8bf58dc4da7f0dc1cf4787b0d5297c1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2009 11:27:40 +0900 Subject: [PATCH 0029/1662] linker script: unify usage of discard definition Discarded sections in different archs share some commonality but have considerable differences. This led to linker script for each arch implementing its own /DISCARD/ definition, which makes maintaining tedious and adding new entries error-prone. This patch makes all linker scripts to move discard definitions to the end of the linker script and use the common DISCARDS macro. As ld uses the first matching section definition, archs can include default discarded sections by including them earlier in the linker script. ia64 is notable because it first throws away some ia64 specific subsections and then include the rest of the sections into the final image, so those sections must be discarded before the inclusion. defconfig compile tested for x86, x86-64, powerpc, powerpc64, ia64, alpha, sparc, sparc64 and s390. Michal Simek tested microblaze. Signed-off-by: Tejun Heo Acked-by: Paul Mundt Acked-by: Mike Frysinger Tested-by: Michal Simek Cc: linux-arch@vger.kernel.org Cc: Michal Simek Cc: microblaze-uclinux@itee.uq.edu.au Cc: Sam Ravnborg Cc: Tony Luck --- arch/alpha/kernel/vmlinux.lds.S | 10 ++-------- arch/avr32/kernel/vmlinux.lds.S | 10 +++------- arch/blackfin/kernel/vmlinux.lds.S | 6 +----- arch/cris/kernel/vmlinux.lds.S | 10 ++-------- arch/frv/kernel/vmlinux.lds.S | 2 +- arch/h8300/kernel/vmlinux.lds.S | 6 ++---- arch/ia64/kernel/vmlinux.lds.S | 17 ++++++++--------- arch/m32r/kernel/vmlinux.lds.S | 11 +++-------- arch/m68k/kernel/vmlinux-std.lds | 11 +++-------- arch/m68k/kernel/vmlinux-sun3.lds | 10 ++-------- arch/m68knommu/kernel/vmlinux.lds.S | 8 +------- arch/microblaze/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 22 ++++++++++------------ arch/mn10300/kernel/vmlinux.lds.S | 9 +++------ arch/parisc/kernel/vmlinux.lds.S | 9 ++++----- arch/powerpc/kernel/vmlinux.lds.S | 10 +++------- arch/s390/kernel/vmlinux.lds.S | 10 +++------- arch/sh/kernel/vmlinux.lds.S | 11 ++++------- arch/sparc/kernel/vmlinux.lds.S | 9 ++------- arch/um/include/asm/common.lds.S | 5 ----- arch/um/kernel/dyn.lds.S | 2 +- arch/um/kernel/uml.lds.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 11 ++++------- arch/xtensa/kernel/vmlinux.lds.S | 14 ++++---------- include/asm-generic/vmlinux.lds.h | 18 ++++++++++++------ 25 files changed, 80 insertions(+), 155 deletions(-) diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 75fe1d6877e9..6dc03c35caa0 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -134,14 +134,6 @@ SECTIONS __bss_stop = .; _end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .mdebug 0 : { *(.mdebug) } @@ -151,4 +143,6 @@ SECTIONS STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index b8324608ec0c..c4b56654349a 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -124,15 +124,11 @@ SECTIONS _end = .; } + DWARF_DEBUG + /* When something in the kernel is NOT compiled as a module, the module * cleanup code and data are put into these segments. Both can then be * thrown away, as cleanup code is never called unless it's a module. */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - - DWARF_DEBUG + DISCARDS } diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 6e8eabd8f0a6..d7ffe299b979 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -277,9 +277,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : - { - *(.exitcall.exit) - *(.discard) - } + DISCARDS } diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index a3175ebb38cc..6c81836b9229 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -140,13 +140,7 @@ SECTIONS _end = .; __end = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - dram_end = dram_start + (CONFIG_ETRAX_DRAM_SIZE - __CONFIG_ETRAX_VMEM_SIZE)*1024*1024; + + DISCARDS } diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 64b5a5e4d35e..7dbf41f68b52 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -178,7 +178,7 @@ SECTIONS .comment 0 : { *(.comment) } - /DISCARD/ : { *(.discard) } + DISCARDS } __kernel_image_size_no_bss = __bss_start - __kernel_image_start; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 03d6c0df33db..662b02ecb86e 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -152,10 +152,6 @@ SECTIONS __end = . ; __ramstart = .; } - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } .romfs : { *(.romfs*) @@ -166,4 +162,6 @@ SECTIONS COMMAND_START = . - 0x200 ; __ramend = . ; } + + DISCARDS } diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index 13d958975874..eb4214d1c5af 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -24,15 +24,14 @@ PHDRS { } SECTIONS { - /* Sections to be discarded */ + /* unwind exit sections must be discarded before the rest of the + sections get included. */ /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) *(.IA_64.unwind.exit.text) *(.IA_64.unwind_info.exit.text) - } + *(.comment) + *(.note) + } v = PAGE_OFFSET; /* this symbol is here to make debugging easier... */ phys_start = _start - LOAD_OFFSET; @@ -317,7 +316,7 @@ SECTIONS .debug_funcnames 0 : { *(.debug_funcnames) } .debug_typenames 0 : { *(.debug_typenames) } .debug_varnames 0 : { *(.debug_varnames) } - /* These must appear regardless of . */ - /DISCARD/ : { *(.comment) } - /DISCARD/ : { *(.note) } + + /* Default discards */ + DISCARDS } diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 480a49944cfd..de5e21cca6a5 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -120,14 +120,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -136,4 +128,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index 905a797ada93..47eac19e8f61 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -82,14 +82,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Stabs debugging sections. */ .stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } @@ -98,4 +90,7 @@ SECTIONS .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) } + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 47d04be322aa..03efaf04d7d7 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -77,14 +77,6 @@ __init_begin = .; _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .crap : { /* Stabs debugging sections. */ *(.stab) @@ -97,4 +89,6 @@ __init_begin = .; *(.note) } + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 68111a61a77f..2736a5e309c0 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -184,13 +184,6 @@ SECTIONS { __init_end = .; } > INIT - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .bss : { . = ALIGN(4); _sbss = . ; @@ -201,5 +194,6 @@ SECTIONS { _end = . ; } > BSS + DISCARDS } diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 81bebdcb18fe..ec5fa91a48d8 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -163,5 +163,5 @@ SECTIONS { . = ALIGN(4096); _end = .; - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 45901609b741..1474c18fb777 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -176,18 +176,6 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - - /* ABI crap starts here */ - *(.MIPS.options) - *(.options) - *(.pdr) - *(.reginfo) - } - /* These mark the ABI of the kernel for debuggers. */ .mdebug.abi32 : { KEEP(*(.mdebug.abi32)) @@ -213,4 +201,14 @@ SECTIONS *(.gptab.bss) *(.gptab.sbss) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { + /* ABI crap starts here */ + *(.MIPS.options) + *(.options) + *(.pdr) + *(.reginfo) + } } diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 5609d4962a55..8fcd0f1e21de 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -115,13 +115,10 @@ SECTIONS . = ALIGN(PAGE_SIZE); pg0 = .; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_CALL - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index ccf58341845a..aea1784edbd1 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -237,10 +237,12 @@ SECTIONS /* freed after init ends here */ _end = . ; + STABS_DEBUG + .note 0 : { *(.note) } + /* Sections to be discarded */ + DISCARDS /DISCARD/ : { - *(.exitcall.exit) - *(.discard) #ifdef CONFIG_64BIT /* temporary hack until binutils is fixed to not emit these * for static binaries @@ -253,7 +255,4 @@ SECTIONS *(.gnu.hash) #endif } - - STABS_DEBUG - .note 0 : { *(.note) } } diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7fca9355fd3d..244e3658983c 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -37,13 +37,6 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { - /* Sections to be discarded. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - EXIT_DATA - } - . = KERNELBASE; /* @@ -299,4 +292,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); _end = . ; PROVIDE32 (end = .); + + /* Sections to be discarded. */ + DISCARDS } diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 98867dfea469..82415c75b996 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -157,14 +157,10 @@ SECTIONS _end = . ; - /* Sections to be discarded */ - /DISCARD/ : { - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - /* Debugging sections. */ STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS } diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 766976d27b21..0ce254bca92f 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -163,17 +163,14 @@ SECTIONS _end = . ; } + STABS_DEBUG + DWARF_DEBUG + /* * When something in the kernel is NOT compiled as a module, the * module cleanup code and data are put into these segments. Both * can then be thrown away, as cleanup code is never called unless * it's a module. */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } - - STABS_DEBUG - DWARF_DEBUG + DISCARDS } diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d63cf914667d..866390feb683 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -171,13 +171,8 @@ SECTIONS } _end = . ; - /DISCARD/ : { - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + DISCARDS } diff --git a/arch/um/include/asm/common.lds.S b/arch/um/include/asm/common.lds.S index cb0248616d49..37ecc5577a9a 100644 --- a/arch/um/include/asm/common.lds.S +++ b/arch/um/include/asm/common.lds.S @@ -123,8 +123,3 @@ __initramfs_end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - } - diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index 2916d6eadffd..715a188c0472 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -157,5 +157,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 1f8a622cabe1..2ebd39765db8 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -101,5 +101,5 @@ SECTIONS DWARF_DEBUG - /DISCARD/ : { *(.discard) } + DISCARDS } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 367e87882041..b600c843710b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -387,15 +387,12 @@ SECTIONS _end = .; } - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } - STABS_DEBUG DWARF_DEBUG + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.eh_frame) } } diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index b1e24638acd7..921b6ff3b645 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -280,16 +280,6 @@ SECTIONS *(.ResetVector.text) } - /* Sections to be discarded */ - /DISCARD/ : - { - *(.exit.literal) - EXIT_TEXT - EXIT_DATA - *(.exitcall.exit) - *(.discard) - } - .xt.lit : { *(.xt.lit) } .xt.prop : { *(.xt.prop) } @@ -322,4 +312,8 @@ SECTIONS *(.xt.lit) *(.gnu.linkonce.p*) } + + /* Sections to be discarded */ + DISCARDS + /DISCARD/ : { *(.exit.literal) } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c5c18ac878ab..ab8ea9b7741e 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -35,13 +35,10 @@ * __bss_stop = .; * _end = .; * - * /DISCARD/ : { - * EXIT_TEXT - * EXIT_DATA - * EXIT_CALL - * } * STABS_DEBUG * DWARF_DEBUG + * + * DISCARDS // must be the last * } * * [__init_begin, __init_end] is the init section that may be freed after init @@ -629,11 +626,20 @@ #define INIT_RAM_FS #endif +/* + * Default discarded sections. + * + * Some archs want to discard exit text/data at runtime rather than + * link time due to cross-section references such as alt instructions, + * bug table, eh_frame, etc. DISCARDS must be the last of output + * section definitions so that such archs put those in earlier section + * definitions. + */ #define DISCARDS \ /DISCARD/ : { \ EXIT_TEXT \ EXIT_DATA \ - *(.exitcall.exit) \ + EXIT_CALL \ *(.discard) \ } From c31d96338a6041520ba5f1b6a4a5012ef00686b3 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:37 +0200 Subject: [PATCH 0030/1662] x86: mce: Make CONFIG_X86_ANCIENT_MCE dependent on CONFIG_X86_MCE Add a missing depency for ANCIENT_MCE. It didn't matter in practice because the ANCIENT code wasn't compiled without X86_MCE, but it's better to express that clearly in Kconfig. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 356d2ec8e2fb..5962b872a7ad 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -823,7 +823,7 @@ config X86_MCE_AMD config X86_ANCIENT_MCE def_bool n - depends on X86_32 + depends on X86_32 && X86_MCE prompt "Support for old Pentium 5 / WinChip machine checks" ---help--- Include support for machine check handling on old Pentium 5 or WinChip From bab9bc6583fe6c1660d6ed36dd14bbb4edfaf393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:38 +0200 Subject: [PATCH 0031/1662] x86: mce: Update X86_MCE description in x86/Kconfig - Clarify that this config controls thermal throttling reporting too - Clarify the types of errors reported by machine checks - Drop references to ancient CPUs. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5962b872a7ad..134a8c0d80dd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -774,20 +774,12 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS increased on these systems. config X86_MCE - bool "Machine Check Exception" + bool "Machine Check / overheating reporting" ---help--- - Machine Check Exception support allows the processor to notify the - kernel if it detects a problem (e.g. overheating, component failure). + Machine Check support allows the processor to notify the + kernel if it detects a problem (e.g. overheating, data corruption). The action the kernel takes depends on the severity of the problem, - ranging from a warning message on the console, to halting the machine. - Your processor must be a Pentium or newer to support this - check the - flags in /proc/cpuinfo for mce. Note that some older Pentium systems - have a design flaw which leads to false MCE events - hence MCE is - disabled on all P5 processors, unless explicitly enabled with "mce" - as a boot argument. Similarly, if MCE is built in and creates a - problem on some new non-standard machine, you can boot with "nomce" - to disable it. MCE support simply ignores non-MCE processors like - the 386 and 486, so nearly everyone can say Y here. + ranging from warning messages to halting the machine. config X86_OLD_MCE depends on X86_32 && X86_MCE From 5bb38adcb54cf7192b154368ad62982caa11ca0b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:39 +0200 Subject: [PATCH 0032/1662] x86: mce: Remove old i386 machine check code As announced in feature-remove-schedule.txt remove CONFIG_X86_OLD_MCE This patch only removes code. The ancient machine check code for very old systems that are not supported by CONFIG_X86_NEW_MCE is still kept. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- Documentation/feature-removal-schedule.txt | 10 -- arch/x86/Kconfig | 35 +---- arch/x86/include/asm/mce.h | 11 -- arch/x86/kernel/cpu/mcheck/Makefile | 2 - arch/x86/kernel/cpu/mcheck/k7.c | 116 --------------- arch/x86/kernel/cpu/mcheck/mce.c | 47 ------ arch/x86/kernel/cpu/mcheck/non-fatal.c | 94 ------------ arch/x86/kernel/cpu/mcheck/p4.c | 163 --------------------- arch/x86/kernel/cpu/mcheck/p6.c | 127 ---------------- 9 files changed, 2 insertions(+), 603 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mcheck/k7.c delete mode 100644 arch/x86/kernel/cpu/mcheck/non-fatal.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p4.c delete mode 100644 arch/x86/kernel/cpu/mcheck/p6.c diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 7129846a2785..edb2f0b07616 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -444,13 +444,3 @@ What: CONFIG_RFKILL_INPUT When: 2.6.33 Why: Should be implemented in userspace, policy daemon. Who: Johannes Berg - ----------------------------- - -What: CONFIG_X86_OLD_MCE -When: 2.6.32 -Why: Remove the old legacy 32bit machine check code. This has been - superseded by the newer machine check code from the 64bit port, - but the old version has been kept around for easier testing. Note this - doesn't impact the old P5 and WinChip machine check handlers. -Who: Andi Kleen diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 134a8c0d80dd..d986769a7d90 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,21 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_OLD_MCE - depends on X86_32 && X86_MCE - bool "Use legacy machine check code (will go away)" - default n - select X86_ANCIENT_MCE - ---help--- - Use the old i386 machine check code. This is merely intended for - testing in a transition period. Try this if you run into any machine - check related software problems, but report the problem to - linux-kernel. When in doubt say no. - config X86_NEW_MCE depends on X86_MCE bool - default y if (!X86_OLD_MCE && X86_32) || X86_64 + default y config X86_MCE_INTEL def_bool y @@ -835,29 +824,9 @@ config X86_MCE_INJECT If you don't know what a machine check is and you don't do kernel QA it is safe to say n. -config X86_MCE_NONFATAL - tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4" - depends on X86_OLD_MCE - ---help--- - Enabling this feature starts a timer that triggers every 5 seconds which - will look at the machine check registers to see if anything happened. - Non-fatal problems automatically get corrected (but still logged). - Disable this if you don't want to see these messages. - Seeing the messages this option prints out may be indicative of dying - or out-of-spec (ie, overclocked) hardware. - This option only does something on certain CPUs. - (AMD Athlon/Duron and Intel Pentium 4) - -config X86_MCE_P4THERMAL - bool "check for P4 thermal throttling interrupt." - depends on X86_OLD_MCE && X86_MCE && (X86_UP_APIC || SMP) - ---help--- - Enabling this feature will cause a message to be printed when the P4 - enters thermal throttling. - config X86_THERMAL_VECTOR def_bool y - depends on X86_MCE_P4THERMAL || X86_MCE_INTEL + depends on X86_MCE_INTEL config VM86 bool "Enable VM86 support" if EMBEDDED diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b50b9e9042c4..6b8a974e1270 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -115,13 +115,6 @@ void mcheck_init(struct cpuinfo_x86 *c); static inline void mcheck_init(struct cpuinfo_x86 *c) {} #endif -#ifdef CONFIG_X86_OLD_MCE -extern int nr_mce_banks; -void amd_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); -#endif - #ifdef CONFIG_X86_ANCIENT_MCE void intel_p5_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); @@ -208,11 +201,7 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); void intel_init_thermal(struct cpuinfo_x86 *c); -#ifdef CONFIG_X86_NEW_MCE void mce_log_therm_throt_event(__u64 status); -#else -static inline void mce_log_therm_throt_event(__u64 status) {} -#endif #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 188a1ca5ad2b..022a036ce21b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,11 +1,9 @@ obj-y = mce.o obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o -obj-$(CONFIG_X86_OLD_MCE) += k7.o p4.o p6.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o -obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c deleted file mode 100644 index b945d5dbc609..000000000000 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Athlon specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For AMD Athlon/Duron: */ -static void k7_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 1; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - - /* Clear it: */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - - -/* AMD K7 machine check is Intel like: */ -void amd_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - machine_check_vector = k7_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Clear status for MC index 0 separately, we don't touch CTL, - * as some K7 Athlons cause spurious MCEs when its enabled: - */ - if (boot_cpu_data.x86 == 6) { - wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0); - i = 1; - } else - i = 0; - - for (; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7da8fec9ca88..5ff6362ecb18 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -58,8 +58,6 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = int mce_disabled __read_mostly; -#ifdef CONFIG_X86_NEW_MCE - #define MISC_MCELOG_MINOR 227 #define SPINUNIT 100 /* 100ns */ @@ -1993,51 +1991,6 @@ static __init int mce_init_device(void) device_initcall(mce_init_device); -#else /* CONFIG_X86_OLD_MCE: */ - -int nr_mce_banks; -EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ - -/* This has to be run for each processor */ -void mcheck_init(struct cpuinfo_x86 *c) -{ - if (mce_disabled) - return; - - switch (c->x86_vendor) { - case X86_VENDOR_AMD: - amd_mcheck_init(c); - break; - - case X86_VENDOR_INTEL: - if (c->x86 == 5) - intel_p5_mcheck_init(c); - if (c->x86 == 6) - intel_p6_mcheck_init(c); - if (c->x86 == 15) - intel_p4_mcheck_init(c); - break; - - case X86_VENDOR_CENTAUR: - if (c->x86 == 5) - winchip_mcheck_init(c); - break; - - default: - break; - } - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); -} - -static int __init mcheck_enable(char *str) -{ - mce_p5_enabled = 1; - return 1; -} -__setup("mce", mcheck_enable); - -#endif /* CONFIG_X86_OLD_MCE */ - /* * Old style boot options parsing. Only for compatibility. */ diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c deleted file mode 100644 index f5f2d6f71fb6..000000000000 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Non Fatal Machine Check Exception Reporting - * - * (C) Copyright 2002 Dave Jones. - * - * This file contains routines to check for non-fatal MCEs every 15s - * - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -static int firstbank; - -#define MCE_RATE (15*HZ) /* timer rate is 15s */ - -static void mce_checkregs(void *info) -{ - u32 low, high; - int i; - - for (i = firstbank; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - - if (!(high & (1<<31))) - continue; - - printk(KERN_INFO "MCE: The hardware reports a non fatal, " - "correctable incident occurred on CPU %d.\n", - smp_processor_id()); - - printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low); - - /* - * Scrub the error so we don't pick it up in MCE_RATE - * seconds time: - */ - wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); - - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } -} - -static void mce_work_fn(struct work_struct *work); -static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); - -static void mce_work_fn(struct work_struct *work) -{ - on_each_cpu(mce_checkregs, NULL, 1); - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); -} - -static int __init init_nonfatal_mce_checker(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - - /* Some Athlons misbehave when we frob bank 0 */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 6) - firstbank = 1; - else - firstbank = 0; - - /* - * Check for non-fatal errors every MCE_RATE s - */ - schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); - printk(KERN_INFO "Machine check exception polling timer started.\n"); - - return 0; -} -module_init(init_nonfatal_mce_checker); - -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c deleted file mode 100644 index 4482aea9aa2e..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ -#include -#include -#include -#include - -#include -#include -#include - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs; - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - rdmsr(MSR_IA32_MCG_EAX, r->eax, h); - rdmsr(MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr(MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr(MSR_IA32_MCG_EDX, r->edx, h); - rdmsr(MSR_IA32_MCG_ESI, r->esi, h); - rdmsr(MSR_IA32_MCG_EDI, r->edi, h); - rdmsr(MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr(MSR_IA32_MCG_ESP, r->esp, h); - rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr(MSR_IA32_MCG_EIP, r->eip, h); -} - -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (mce_num_extended_msrs > 0) { - struct intel_mce_extended_msrs dbg; - - intel_get_extended_msrs(&dbg); - - printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" - "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" - "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags, - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = addr[0] = '\0'; - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error. - */ - for (i = 0; i < nr_mce_banks; i++) { - u32 msr; - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high&(1<<31)) { - /* Clear it */ - wrmsr(msr, 0UL, 0UL); - /* Serialize */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -void intel_p4_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - machine_check_vector = intel_machine_check; - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - for (i = 0; i < nr_mce_banks; i++) { - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - } - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); - - /* Check for P4/Xeon extended MCE MSRs */ - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<9)) {/* MCG_EXT_P */ - mce_num_extended_msrs = (l >> 16) & 0xff; - printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c deleted file mode 100644 index 01e4f8178183..000000000000 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Machine Check Handler For PII/PIII */ -static void intel_machine_check(struct pt_regs *regs, long error_code) -{ - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int recover = 1; - int i; - - rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover = 0; - - printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i = 0; i < nr_mce_banks; i++) { - rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); - if (high & (1<<31)) { - char misc[20]; - char addr[24]; - - misc[0] = '\0'; - addr[0] = '\0'; - - if (high & (1<<29)) - recover |= 1; - if (high & (1<<25)) - recover |= 2; - high &= ~(1<<31); - - if (high & (1<<27)) { - rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); - snprintf(misc, 20, "[%08x%08x]", ahigh, alow); - } - if (high & (1<<26)) { - rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - snprintf(addr, 24, " at %08x%08x", ahigh, alow); - } - - printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", - smp_processor_id(), i, high, low, misc, addr); - } - } - - if (recover & 2) - panic("CPU context corrupt"); - if (recover & 1) - panic("Unable to continue"); - - printk(KERN_EMERG "Attempting to continue.\n"); - /* - * Do not clear the MSR_IA32_MCi_STATUS if the error is not - * recoverable/continuable.This will allow BIOS to look at the MSRs - * for errors if the OS could not log the error: - */ - for (i = 0; i < nr_mce_banks; i++) { - unsigned int msr; - - msr = MSR_IA32_MC0_STATUS+i*4; - rdmsr(msr, low, high); - if (high & (1<<31)) { - /* Clear it: */ - wrmsr(msr, 0UL, 0UL); - /* Serialize: */ - wmb(); - add_taint(TAINT_MACHINE_CHECK); - } - } - mcgstl &= ~(1<<2); - wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth); -} - -/* Set up machine check reporting for processors with Intel style MCE: */ -void intel_p6_mcheck_init(struct cpuinfo_x86 *c) -{ - u32 l, h; - int i; - - /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return; - - /* Ok machine check is available */ - machine_check_vector = intel_machine_check; - /* Make sure the vector pointer is visible before we enable MCEs: */ - wmb(); - - printk(KERN_INFO "Intel machine check architecture supported.\n"); - rdmsr(MSR_IA32_MCG_CAP, l, h); - if (l & (1<<8)) /* Control register present ? */ - wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - nr_mce_banks = l & 0xff; - - /* - * Following the example in IA-32 SDM Vol 3: - * - MC0_CTL should not be written - * - Status registers on all banks should be cleared on reset - */ - for (i = 1; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); - - for (i = 0; i < nr_mce_banks; i++) - wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); - - set_in_cr4(X86_CR4_MCE); - printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", - smp_processor_id()); -} From c1ebf835617035b1f08f734247dcb981e17aac6b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:41 +0200 Subject: [PATCH 0033/1662] x86: mce: Rename CONFIG_X86_NEW_MCE to CONFIG_X86_MCE Drop the CONFIG_X86_NEW_MCE symbol and change all references to it to check for CONFIG_X86_MCE directly. No code changes Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 11 +++-------- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/cpu/mcheck/Makefile | 3 +-- arch/x86/kernel/irq.c | 4 ++-- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/signal.c | 2 +- 7 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d986769a7d90..06880ca677f4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -781,15 +781,10 @@ config X86_MCE The action the kernel takes depends on the severity of the problem, ranging from warning messages to halting the machine. -config X86_NEW_MCE - depends on X86_MCE - bool - default y - config X86_MCE_INTEL def_bool y prompt "Intel MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for intel specific MCE features such as the thermal monitor. @@ -797,7 +792,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_NEW_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -817,7 +812,7 @@ config X86_MCE_THRESHOLD default y config X86_MCE_INJECT - depends on X86_NEW_MCE + depends on X86_MCE tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index ff8cbfa07851..5e3f2044f0d3 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -61,7 +61,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR) #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) #endif diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index b3025b43b63a..f4227289caf7 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu) static inline int mce_in_progress(void) { -#if defined(CONFIG_X86_NEW_MCE) +#if defined(CONFIG_X86_MCE) return atomic_read(&mce_entry) > 0; #endif return 0; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 022a036ce21b..4ac6d48fe11b 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -1,6 +1,5 @@ -obj-y = mce.o +obj-y = mce.o mce-severity.o -obj-$(CONFIG_X86_NEW_MCE) += mce-severity.o obj-$(CONFIG_X86_ANCIENT_MCE) += winchip.o p5.o obj-$(CONFIG_X86_MCE_INTEL) += mce_intel.o obj-$(CONFIG_X86_MCE_AMD) += mce_amd.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b0cdde6932f5..74656d1d4e30 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) seq_printf(p, " Threshold APIC interrupts\n"); # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE seq_printf(p, "%*s: ", prec, "MCE"); for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(mce_exception_count, j)); @@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_threshold_count; # endif #endif -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE sum += per_cpu(mce_exception_count, cpu); sum += per_cpu(mce_poll_count, cpu); #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 696f0e475c2d..8a194ad357ed 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -190,7 +190,7 @@ static void __init apic_intr_init(void) #ifdef CONFIG_X86_THRESHOLD alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); #endif -#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC) alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94e..cc26ad4c3070 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -856,7 +856,7 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_NEW_MCE +#ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_process(); From 9eda8cb3ac235217e4ffa01cb9cedee1c1550599 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:42 +0200 Subject: [PATCH 0034/1662] x86: mce: Move code in mce.c Now that the X86_OLD_MCE ifdefs are gone move some code that used to be outside the big ifdef to a more natural place near its user. No code change. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 5ff6362ecb18..e16271f01ac4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -45,17 +45,6 @@ #include "mce-internal.h" -/* Handle unconfigured int18 (should never happen) */ -static void unexpected_machine_check(struct pt_regs *regs, long error_code) -{ - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", - smp_processor_id()); -} - -/* Call the installed machine check handler for this CPU setup. */ -void (*machine_check_vector)(struct pt_regs *, long error_code) = - unexpected_machine_check; - int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -1322,6 +1311,17 @@ static void mce_init_timer(void) add_timer(t); } +/* Handle unconfigured int18 (should never happen) */ +static void unexpected_machine_check(struct pt_regs *regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void (*machine_check_vector)(struct pt_regs *, long error_code) = + unexpected_machine_check; + /* * Called for each booted CPU to set up machine checks. * Must be called with preempt off: From cebe182033f156b430952370fb0f9dbe6e89b081 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:43 +0200 Subject: [PATCH 0035/1662] x86: mce: Move per bank data in a single datastructure This addresses one of the leftover review comments. Move the per bank data into a single structure. This avoids several separate variables and also separate allocation of sysfs objects. I didn't move the CMCI ownership information so far because that would have needed some non trivial changes in the algorithms. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 14 +++ arch/x86/kernel/cpu/mcheck/mce.c | 109 +++++++++++----------- 2 files changed, 67 insertions(+), 56 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 54dcb8ff12e5..6bd51e7ba87b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,3 +1,4 @@ +#include #include enum severity_level { @@ -10,6 +11,19 @@ enum severity_level { MCE_PANIC_SEVERITY, }; +#define ATTR_LEN 16 + +/* One object for each MCE bank, shared by all CPUs */ +struct mce_bank { + u64 ctl; /* subevents to enable */ + unsigned char init; /* initialise bank? */ + struct sysdev_attribute attr; /* sysdev attribute */ + char attrname[ATTR_LEN]; /* attribute name */ +}; + int mce_severity(struct mce *a, int tolerant, char **msg); extern int mce_ser; + +extern struct mce_bank *mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e16271f01ac4..a04806e01a82 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -64,7 +64,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count); */ static int tolerant __read_mostly = 1; static int banks __read_mostly; -static u64 *bank __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; @@ -74,13 +73,13 @@ int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +struct mce_bank *mce_banks __read_mostly; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -static unsigned long dont_init_banks; - static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; @@ -91,11 +90,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; -static inline int skip_bank_init(int i) -{ - return i < BITS_PER_LONG && test_bit(i, &dont_init_banks); -} - static DEFINE_PER_CPU(struct work_struct, mce_work); /* Do initial initialization of a struct mce */ @@ -482,7 +476,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); for (i = 0; i < banks; i++) { - if (!bank[i] || !test_bit(i, *b)) + if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; m.misc = 0; @@ -903,7 +897,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) order = mce_start(&no_way_out); for (i = 0; i < banks; i++) { __clear_bit(i, toclear); - if (!bank[i]) + if (!mce_banks[i].ctl) continue; m.misc = 0; @@ -1146,6 +1140,21 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); +static int mce_banks_init(void) +{ + int i; + + mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + if (!mce_banks) + return -ENOMEM; + for (i = 0; i < banks; i++) { + struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; + b->init = 1; + } + return 0; +} + /* * Initialize Machine Checks for a CPU. */ @@ -1169,11 +1178,10 @@ static int mce_cap_init(void) /* Don't support asymmetric configurations today */ WARN_ON(banks != 0 && b != banks); banks = b; - if (!bank) { - bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); - if (!bank) - return -ENOMEM; - memset(bank, 0xff, banks * sizeof(u64)); + if (!mce_banks) { + int err = mce_banks_init(); + if (err) + return err; } /* Use accurate RIP reporting if available. */ @@ -1205,9 +1213,10 @@ static void mce_init(void) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); for (i = 0; i < banks; i++) { - if (skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); + wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); } } @@ -1223,7 +1232,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * trips off incorrectly with the IOMMU & 3ware * & Cerberus: */ - clear_bit(10, (unsigned long *)&bank[4]); + clear_bit(10, (unsigned long *)&mce_banks[4].ctl); } if (c->x86 <= 17 && mce_bootlog < 0) { /* @@ -1237,7 +1246,7 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * by default. */ if (c->x86 == 6 && banks > 0) - bank[0] = 0; + mce_banks[0].ctl = 0; } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1250,8 +1259,8 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A) - __set_bit(0, &dont_init_banks); + if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + mce_banks[0].init = 0; /* * All newer Intel systems support MCE broadcasting. Enable @@ -1578,7 +1587,8 @@ static int mce_disable(void) int i; for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } return 0; @@ -1654,14 +1664,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static struct sysdev_attribute *bank_attrs; +static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +{ + return container_of(attr, struct mce_bank, attr); +} static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, char *buf) { - u64 b = bank[attr - bank_attrs]; - - return sprintf(buf, "%llx\n", b); + return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, @@ -1672,7 +1683,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, if (strict_strtoull(buf, 0, &new) < 0) return -EINVAL; - bank[attr - bank_attrs] = new; + attr_to_bank(attr)->ctl = new; mce_restart(); return size; @@ -1816,7 +1827,7 @@ static __cpuinit int mce_create_device(unsigned int cpu) } for (j = 0; j < banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), - &bank_attrs[j]); + &mce_banks[j].attr); if (err) goto error2; } @@ -1825,10 +1836,10 @@ static __cpuinit int mce_create_device(unsigned int cpu) return 0; error2: while (--j >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1846,7 +1857,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); + sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); cpumask_clear_cpu(cpu, mce_dev_initialized); @@ -1863,7 +1874,8 @@ static void mce_disable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) + struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); } } @@ -1879,8 +1891,9 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); for (i = 0; i < banks; i++) { - if (!skip_bank_init(i)) - wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); + struct mce_bank *b = &mce_banks[i]; + if (b->init) + wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); } } @@ -1928,35 +1941,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; -static __init int mce_init_banks(void) +static __init void mce_init_banks(void) { int i; - bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks, - GFP_KERNEL); - if (!bank_attrs) - return -ENOMEM; - for (i = 0; i < banks; i++) { - struct sysdev_attribute *a = &bank_attrs[i]; + struct mce_bank *b = &mce_banks[i]; + struct sysdev_attribute *a = &b->attr; - a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); - if (!a->attr.name) - goto nomem; + a->attr.name = b->attrname; + snprintf(b->attrname, ATTR_LEN, "bank%d", i); a->attr.mode = 0644; a->show = show_bank; a->store = set_bank; } - return 0; - -nomem: - while (--i >= 0) - kfree(bank_attrs[i].attr.name); - kfree(bank_attrs); - bank_attrs = NULL; - - return -ENOMEM; } static __init int mce_init_device(void) @@ -1969,9 +1968,7 @@ static __init int mce_init_device(void) zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); - err = mce_init_banks(); - if (err) - return err; + mce_init_banks(); err = sysdev_class_register(&mce_sysclass); if (err) From a2d32bcbc008aa0f9c301a7c6f3494cb23e6af54 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:44 +0200 Subject: [PATCH 0036/1662] x86: mce: macros to compute banks MSRs Instead of open coded calculations for bank MSRs hide the indexing of higher banks MCE register MSRs in new macros. No semantic changes. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 7 ++++++ arch/x86/kernel/cpu/mcheck/mce.c | 34 +++++++++++++------------- arch/x86/kernel/cpu/mcheck/mce_intel.c | 10 ++++---- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 1692fb5050e3..3d1ce094586a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -81,8 +81,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) +#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) +#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) +#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 +#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) + #define CMCI_EN (1ULL << 30) #define CMCI_THRESHOLD_MASK 0xffffULL diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a04806e01a82..07139a0578e3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -267,11 +267,11 @@ static int msr_to_offset(u32 msr) unsigned bank = __get_cpu_var(injectm.bank); if (msr == rip_msr) return offsetof(struct mce, ip); - if (msr == MSR_IA32_MC0_STATUS + bank*4) + if (msr == MSR_IA32_MCx_STATUS(bank)) return offsetof(struct mce, status); - if (msr == MSR_IA32_MC0_ADDR + bank*4) + if (msr == MSR_IA32_MCx_ADDR(bank)) return offsetof(struct mce, addr); - if (msr == MSR_IA32_MC0_MISC + bank*4) + if (msr == MSR_IA32_MCx_MISC(bank)) return offsetof(struct mce, misc); if (msr == MSR_IA32_MCG_STATUS) return offsetof(struct mce, mcgstatus); @@ -485,7 +485,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.tsc = 0; barrier(); - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (!(m.status & MCI_STATUS_VAL)) continue; @@ -500,9 +500,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) continue; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -518,7 +518,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) /* * Clear state for this bank. */ - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } /* @@ -539,7 +539,7 @@ static int mce_no_way_out(struct mce *m, char **msg) int i; for (i = 0; i < banks; i++) { - m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) return 1; } @@ -823,7 +823,7 @@ static void mce_clear_state(unsigned long *toclear) for (i = 0; i < banks; i++) { if (test_bit(i, toclear)) - mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -904,7 +904,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) m.addr = 0; m.bank = i; - m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); + m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); if ((m.status & MCI_STATUS_VAL) == 0) continue; @@ -945,9 +945,9 @@ void do_machine_check(struct pt_regs *regs, long error_code) kill_it = 1; if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); + m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); + m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); /* * Action optional error. Queue address for later processing. @@ -1216,8 +1216,8 @@ static void mce_init(void) struct mce_bank *b = &mce_banks[i]; if (!b->init) continue; - wrmsrl(MSR_IA32_MC0_CTL+4*i, b->ctl); - wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); + wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } @@ -1589,7 +1589,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } return 0; } @@ -1876,7 +1876,7 @@ static void mce_disable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); + wrmsrl(MSR_IA32_MCx_CTL(i), 0); } } @@ -1893,7 +1893,7 @@ static void mce_reenable_cpu(void *h) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(MSR_IA32_MC0_CTL + i*4, b->ctl); + wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index e1acec0f7a32..889f665fe93d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -90,7 +90,7 @@ static void cmci_discover(int banks, int boot) if (test_bit(i, owned)) continue; - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ if (val & CMCI_EN) { @@ -101,8 +101,8 @@ static void cmci_discover(int banks, int boot) } val |= CMCI_EN | CMCI_THRESHOLD; - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ if (val & CMCI_EN) { @@ -152,9 +152,9 @@ void cmci_clear(void) if (!test_bit(i, __get_cpu_var(mce_banks_owned))) continue; /* Disable CMCI */ - rdmsrl(MSR_IA32_MC0_CTL2 + i, val); + rdmsrl(MSR_IA32_MCx_CTL2(i), val); val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MC0_CTL2 + i, val); + wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } spin_unlock_irqrestore(&cmci_discover_lock, flags); From 3ccdccfadbd2548abe38682b587f4ba27eac2fc9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 9 Jul 2009 00:31:45 +0200 Subject: [PATCH 0037/1662] x86: mce: Lower maximum number of banks to architecture limit The Intel x86 architecture right now only supports 32 machine check banks, more would bump into other MSRs. So lower the max define to 32. This only affects a few bitmaps, most data structures are dynamically sized anyways. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6b8a974e1270..ad7535372918 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -130,10 +130,11 @@ void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); /* - * To support more than 128 would need to escape the predefined - * Linux defined extended banks first. + * Maximum banks number. + * This is the limit of the current register layout on + * Intel CPUs. */ -#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1) +#define MAX_NR_BANKS 32 #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; From 872fb6dd6b07986417964e089074e7acfd025f4c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Mon, 13 Jul 2009 13:09:43 -0700 Subject: [PATCH 0038/1662] ia64: Fix setup_per_cpu_areas() compilation error Fix ia64 build setup_per_cpu_areas() redifinition issue in UP configuration. When compiling ia64 kernel in UP configuration, the following compilation errors are reported: arch/ia64/kernel/setup.c:860: error: redefinition of 'setup_per_cpu_areas' include/linux/percpu.h:185: error: previous definition of 'setup_per_cpu_areas' was here The patch fixes the issue in arch/ia64/kernel/setup.c Signed-off-by: Fenghua Yu Signed-off-by: Tejun Heo --- arch/ia64/kernel/setup.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 1b23ec126b63..1de86c96801d 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -855,11 +855,17 @@ identify_cpu (struct cpuinfo_ia64 *c) c->unimpl_pa_mask = ~((1L<<63) | ((1L << phys_addr_size) - 1)); } +/* + * In UP configuration, setup_per_cpu_areas() is defined in + * include/linux/percpu.h + */ +#ifdef CONFIG_SMP void __init setup_per_cpu_areas (void) { /* start_kernel() requires this... */ } +#endif /* * Do the following calculations: From a76761b621bcd8336065c4fe3a74f046858bc34c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Jul 2009 23:35:14 +0900 Subject: [PATCH 0039/1662] percpu: add dummy pcpu_lpage_remapped() for !CONFIG_SMP !CONFIG_SMP was missing pcpu_lpage_remapped() definition causing build failure. Add dummy implementation. This was discovered by linux-next testing. Signed-off-by: Tejun Heo Cc: Randy Dunlap Cc: Kamalesh Babulal Cc: Stephen Rothwell --- include/linux/percpu.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 8ce91af4aa19..e134c8229631 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -184,6 +184,11 @@ static inline void free_percpu(void *p) static inline void __init setup_per_cpu_areas(void) { } +static inline void *pcpu_lpage_remapped(void *kaddr) +{ + return NULL; +} + #endif /* CONFIG_SMP */ #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \ From 3162534069597e34dd0ac9eb711be8dc23835ae7 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:30:59 -0700 Subject: [PATCH 0040/1662] x86, intel_txt: Intel TXT boot support This patch adds kernel configuration and boot support for Intel Trusted Execution Technology (Intel TXT). Intel's technology for safer computing, Intel Trusted Execution Technology (Intel TXT), defines platform-level enhancements that provide the building blocks for creating trusted platforms. Intel TXT was formerly known by the code name LaGrande Technology (LT). Intel TXT in Brief: o Provides dynamic root of trust for measurement (DRTM) o Data protection in case of improper shutdown o Measurement and verification of launched environment Intel TXT is part of the vPro(TM) brand and is also available some non-vPro systems. It is currently available on desktop systems based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, PM45, and GS45 Express chipsets. For more information, see http://www.intel.com/technology/security/. This site also has a link to the Intel TXT MLE Developers Manual, which has been updated for the new released platforms. A much more complete description of how these patches support TXT, how to configure a system for it, etc. is in the Documentation/intel_txt.txt file in this patch. This patch provides the TXT support routines for complete functionality, documentation for TXT support and for the changes to the boot_params structure, and boot detection of a TXT launch. Attempts to shutdown (reboot, Sx) the system will result in platform resets; subsequent patches will support these shutdown modes properly. Documentation/intel_txt.txt | 210 +++++++++++++++++++++ Documentation/x86/zero-page.txt | 1 arch/x86/include/asm/bootparam.h | 3 arch/x86/include/asm/fixmap.h | 3 arch/x86/include/asm/tboot.h | 197 ++++++++++++++++++++ arch/x86/kernel/Makefile | 1 arch/x86/kernel/setup.c | 4 arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++++++++++ security/Kconfig | 30 +++ 9 files changed, 827 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: Gang Wei Signed-off-by: H. Peter Anvin --- Documentation/intel_txt.txt | 210 +++++++++++++++++ Documentation/x86/zero-page.txt | 1 + arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/fixmap.h | 3 + arch/x86/include/asm/tboot.h | 197 ++++++++++++++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 4 + arch/x86/kernel/tboot.c | 379 +++++++++++++++++++++++++++++++ security/Kconfig | 30 +++ 9 files changed, 827 insertions(+), 1 deletion(-) create mode 100644 Documentation/intel_txt.txt create mode 100644 arch/x86/include/asm/tboot.h create mode 100644 arch/x86/kernel/tboot.c diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt new file mode 100644 index 000000000000..f40a1f030019 --- /dev/null +++ b/Documentation/intel_txt.txt @@ -0,0 +1,210 @@ +Intel(R) TXT Overview: +===================== + +Intel's technology for safer computing, Intel(R) Trusted Execution +Technology (Intel(R) TXT), defines platform-level enhancements that +provide the building blocks for creating trusted platforms. + +Intel TXT was formerly known by the code name LaGrande Technology (LT). + +Intel TXT in Brief: +o Provides dynamic root of trust for measurement (DRTM) +o Data protection in case of improper shutdown +o Measurement and verification of launched environment + +Intel TXT is part of the vPro(TM) brand and is also available some +non-vPro systems. It is currently available on desktop systems +based on the Q35, X38, Q45, and Q43 Express chipsets (e.g. Dell +Optiplex 755, HP dc7800, etc.) and mobile systems based on the GM45, +PM45, and GS45 Express chipsets. + +For more information, see http://www.intel.com/technology/security/. +This site also has a link to the Intel TXT MLE Developers Manual, +which has been updated for the new released platforms. + +Intel TXT has been presented at various events over the past few +years, some of which are: + LinuxTAG 2008: + http://www.linuxtag.org/2008/en/conf/events/vp-donnerstag/ + details.html?talkid=110 + TRUST2008: + http://www.trust2008.eu/downloads/Keynote-Speakers/ + 3_David-Grawrock_The-Front-Door-of-Trusted-Computing.pdf + IDF 2008, Shanghai: + http://inteldeveloperforum.com.edgesuite.net/shanghai_2008/ + aep/PROS003/index.html + IDFs 2006, 2007 (I'm not sure if/where they are online) + +Trusted Boot Project Overview: +============================= + +Trusted Boot (tboot) is an open source, pre- kernel/VMM module that +uses Intel TXT to perform a measured and verified launch of an OS +kernel/VMM. + +It is hosted on SourceForge at http://sourceforge.net/projects/tboot. +The mercurial source repo is available at http://www.bughost.org/ +repos.hg/tboot.hg. + +Tboot currently supports launching Xen (open source VMM/hypervisor +w/ TXT support since v3.2), and now Linux kernels. + + +Value Proposition for Linux or "Why should you care?" +===================================================== + +While there are many products and technologies that attempt to +measure or protect the integrity of a running kernel, they all +assume the kernel is "good" to begin with. The Integrity +Measurement Architecture (IMA) and Linux Integrity Module interface +are examples of such solutions. + +To get trust in the initial kernel without using Intel TXT, a +static root of trust must be used. This bases trust in BIOS +starting at system reset and requires measurement of all code +executed between system reset through the completion of the kernel +boot as well as data objects used by that code. In the case of a +Linux kernel, this means all of BIOS, any option ROMs, the +bootloader and the boot config. In practice, this is a lot of +code/data, much of which is subject to change from boot to boot +(e.g. changing NICs may change option ROMs). Without reference +hashes, these measurement changes are difficult to assess or +confirm as benign. This process also does not provide DMA +protection, memory configuration/alias checks and locks, crash +protection, or policy support. + +By using the hardware-based root of trust that Intel TXT provides, +many of these issues can be mitigated. Specifically: many +pre-launch components can be removed from the trust chain, DMA +protection is provided to all launched components, a large number +of platform configuration checks are performed and values locked, +protection is provided for any data in the event of an improper +shutdown, and there is support for policy-based execution/verification. +This provides a more stable measurement and a higher assurance of +system configuration and initial state than would be otherwise +possible. Since the tboot project is open source, source code for +almost all parts of the trust chain is available (excepting SMM and +Intel-provided firmware). + +How Does it Work? +================= + +o Tboot is an executable that is launched by the bootloader as + the "kernel" (the binary the bootloader executes). +o It performs all of the work necessary to determine if the + platform supports Intel TXT and, if so, executes the GETSEC[SENTER] + processor instruction that initiates the dynamic root of trust. + - If tboot determines that the system does not support Intel TXT + or is not configured correctly (e.g. the SINIT AC Module was + incorrect), it will directly launch the kernel with no changes + to any state. + - Tboot will output various information about its progress to the + terminal, serial port, and/or an in-memory log; the output + locations can be configured with a command line switch. +o The GETSEC[SENTER] instruction will return control to tboot and + tboot then verifies certain aspects of the environment (e.g. TPM NV + lock, e820 table does not have invalid entries, etc.). +o It will wake the APs from the special sleep state the GETSEC[SENTER] + instruction had put them in and place them into a wait-for-SIPI + state. + - Because the processors will not respond to an INIT or SIPI when + in the TXT environment, it is necessary to create a small VT-x + guest for the APs. When they run in this guest, they will + simply wait for the INIT-SIPI-SIPI sequence, which will cause + VMEXITs, and then disable VT and jump to the SIPI vector. This + approach seemed like a better choice than having to insert + special code into the kernel's MP wakeup sequence. +o Tboot then applies an (optional) user-defined launch policy to + verify the kernel and initrd. + - This policy is rooted in TPM NV and is described in the tboot + project. The tboot project also contains code for tools to + create and provision the policy. + - Policies are completely under user control and if not present + then any kernel will be launched. + - Policy action is flexible and can include halting on failures + or simply logging them and continuing. +o Tboot adjusts the e820 table provided by the bootloader to reserve + its own location in memory as well as to reserve certain other + TXT-related regions. +o As part of it's launch, tboot DMA protects all of RAM (using the + VT-d PMRs). Thus, the kernel must be booted with 'intel_iommu=on' + in order to remove this blanket protection and use VT-d's + page-level protection. +o Tboot will populate a shared page with some data about itself and + pass this to the Linux kernel as it transfers control. + - The location of the shared page is passed via the boot_params + struct as a physical address. +o The kernel will look for the tboot shared page address and, if it + exists, map it. +o As one of the checks/protections provided by TXT, it makes a copy + of the VT-d DMARs in a DMA-protected region of memory and verifies + them for correctness. The VT-d code will detect if the kernel was + launched with tboot and use this copy instead of the one in the + ACPI table. +o At this point, tboot and TXT are out of the picture until a + shutdown (S) +o In order to put a system into any of the sleep states after a TXT + launch, TXT must first be exited. This is to prevent attacks that + attempt to crash the system to gain control on reboot and steal + data left in memory. + - The kernel will perform all of its sleep preparation and + populate the shared page with the ACPI data needed to put the + platform in the desired sleep state. + - Then the kernel jumps into tboot via the vector specified in the + shared page. + - Tboot will clean up the environment and disable TXT, then use the + kernel-provided ACPI information to actually place the platform + into the desired sleep state. + - In the case of S3, tboot will also register itself as the resume + vector. This is necessary because it must re-establish the + measured environment upon resume. Once the TXT environment + has been restored, it will restore the TPM PCRs and then + transfer control back to the kernel's S3 resume vector. + In order to preserve system integrity across S3, the kernel + provides tboot with a set of memory ranges (kernel + code/data/bss, S3 resume code, and AP trampoline) that tboot + will calculate a MAC (message authentication code) over and then + seal with the TPM. On resume and once the measured environment + has been re-established, tboot will re-calculate the MAC and + verify it against the sealed value. Tboot's policy determines + what happens if the verification fails. + +That's pretty much it for TXT support. + + +Configuring the System: +====================== + +This code works with 32bit, 32bit PAE, and 64bit (x86_64) kernels. + +In BIOS, the user must enable: TPM, TXT, VT-x, VT-d. Not all BIOSes +allow these to be individually enabled/disabled and the screens in +which to find them are BIOS-specific. + +grub.conf needs to be modified as follows: + title Linux 2.6.29-tip w/ tboot + root (hd0,0) + kernel /tboot.gz logging=serial,vga,memory + module /vmlinuz-2.6.29-tip intel_iommu=on ro + root=LABEL=/ rhgb console=ttyS0,115200 3 + module /initrd-2.6.29-tip.img + module /Q35_SINIT_17.BIN + +The kernel option for enabling Intel TXT support is found under the +Security top-level menu and is called "Enable Intel(R) Trusted +Execution Technology (TXT)". It is marked as EXPERIMENTAL and +depends on the generic x86 support (to allow maximum flexibility in +kernel build options), since the tboot code will detect whether the +platform actually supports Intel TXT and thus whether any of the +kernel code is executed. + +The Q35_SINIT_17.BIN file is what Intel TXT refers to as an +Authenticated Code Module. It is specific to the chipset in the +system and can also be found on the Trusted Boot site. It is an +(unencrypted) module signed by Intel that is used as part of the +DRTM process to verify and configure the system. It is signed +because it operates at a higher privilege level in the system than +any other macrocode and its correct operation is critical to the +establishment of the DRTM. The process for determining the correct +SINIT ACM for a system is documented in the SINIT-guide.txt file +that is on the tboot SourceForge site under the SINIT ACM downloads. diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 4f913857b8a2..feb37e177010 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -12,6 +12,7 @@ Offset Proto Name Meaning 000/040 ALL screen_info Text mode or frame buffer information (struct screen_info) 040/014 ALL apm_bios_info APM BIOS information (struct apm_bios_info) +058/008 ALL tboot_addr Physical address of tboot shared page 060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information (struct ist_info) 080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 1724e8de317c..6ca20218dd72 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -85,7 +85,8 @@ struct efi_info { struct boot_params { struct screen_info screen_info; /* 0x000 */ struct apm_bios_info apm_bios_info; /* 0x040 */ - __u8 _pad2[12]; /* 0x054 */ + __u8 _pad2[4]; /* 0x054 */ + __u64 tboot_addr; /* 0x058 */ struct ist_info ist_info; /* 0x060 */ __u8 _pad3[16]; /* 0x070 */ __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 7b2d71df39a6..14f9890eb495 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -131,6 +131,9 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_WP_TEST, +#endif +#ifdef CONFIG_INTEL_TXT + FIX_TBOOT_BASE, #endif __end_of_fixed_addresses }; diff --git a/arch/x86/include/asm/tboot.h b/arch/x86/include/asm/tboot.h new file mode 100644 index 000000000000..b13929d4e5f4 --- /dev/null +++ b/arch/x86/include/asm/tboot.h @@ -0,0 +1,197 @@ +/* + * tboot.h: shared data structure with tboot and kernel and functions + * used by kernel for runtime support of Intel(R) Trusted + * Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef _ASM_TBOOT_H +#define _ASM_TBOOT_H + +#include + +/* these must have the values from 0-5 in this order */ +enum { + TB_SHUTDOWN_REBOOT = 0, + TB_SHUTDOWN_S5, + TB_SHUTDOWN_S4, + TB_SHUTDOWN_S3, + TB_SHUTDOWN_HALT, + TB_SHUTDOWN_WFS +}; + +#ifdef CONFIG_INTEL_TXT + +/* used to communicate between tboot and the launched kernel */ + +#define TB_KEY_SIZE 64 /* 512 bits */ + +#define MAX_TB_MAC_REGIONS 32 + +struct tboot_mac_region { + u64 start; /* must be 64 byte -aligned */ + u32 size; /* must be 64 byte -granular */ +} __packed; + +/* GAS - Generic Address Structure (ACPI 2.0+) */ +struct tboot_acpi_generic_address { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_width; + u64 address; +} __packed; + +/* + * combines Sx info from FADT and FACS tables per ACPI 2.0+ spec + * (http://www.acpi.info/) + */ +struct tboot_acpi_sleep_info { + struct tboot_acpi_generic_address pm1a_cnt_blk; + struct tboot_acpi_generic_address pm1b_cnt_blk; + struct tboot_acpi_generic_address pm1a_evt_blk; + struct tboot_acpi_generic_address pm1b_evt_blk; + u16 pm1a_cnt_val; + u16 pm1b_cnt_val; + u64 wakeup_vector; + u32 vector_width; + u64 kernel_s3_resume_vector; +} __packed; + +/* + * shared memory page used for communication between tboot and kernel + */ +struct tboot { + /* + * version 3+ fields: + */ + + /* TBOOT_UUID */ + u8 uuid[16]; + + /* version number: 5 is current */ + u32 version; + + /* physical addr of tb_log_t log */ + u32 log_addr; + + /* + * physical addr of entry point for tboot shutdown and + * type of shutdown (TB_SHUTDOWN_*) being requested + */ + u32 shutdown_entry; + u32 shutdown_type; + + /* kernel-specified ACPI info for Sx shutdown */ + struct tboot_acpi_sleep_info acpi_sinfo; + + /* tboot location in memory (physical) */ + u32 tboot_base; + u32 tboot_size; + + /* memory regions (phys addrs) for tboot to MAC on S3 */ + u8 num_mac_regions; + struct tboot_mac_region mac_regions[MAX_TB_MAC_REGIONS]; + + + /* + * version 4+ fields: + */ + + /* symmetric key for use by kernel; will be encrypted on S3 */ + u8 s3_key[TB_KEY_SIZE]; + + + /* + * version 5+ fields: + */ + + /* used to 4byte-align num_in_wfs */ + u8 reserved_align[3]; + + /* number of processors in wait-for-SIPI */ + u32 num_in_wfs; +} __packed; + +/* + * UUID for tboot data struct to facilitate matching + * defined as {663C8DFF-E8B3-4b82-AABF-19EA4D057A08} by tboot, which is + * represented as {} in the char array used here + */ +#define TBOOT_UUID {0xff, 0x8d, 0x3c, 0x66, 0xb3, 0xe8, 0x82, 0x4b, 0xbf,\ + 0xaa, 0x19, 0xea, 0x4d, 0x5, 0x7a, 0x8} + +extern struct tboot *tboot; + +static inline int tboot_enabled(void) +{ + return tboot != NULL; +} + +extern void tboot_probe(void); +extern void tboot_create_trampoline(void); +extern void tboot_shutdown(u32 shutdown_type); +extern void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control); +extern int tboot_wait_for_aps(int num_aps); +extern struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl); +extern int tboot_force_iommu(void); + +#else /* CONFIG_INTEL_TXT */ + +static inline int tboot_enabled(void) +{ + return 0; +} + +static inline void tboot_probe(void) +{ +} + +static inline void tboot_create_trampoline(void) +{ +} + +static inline void tboot_shutdown(u32 shutdown_type) +{ +} + +static inline void tboot_sleep(u8 sleep_state, u32 pm1a_control, + u32 pm1b_control) +{ +} + +static inline int tboot_wait_for_aps(int num_aps) +{ + return 0; +} + +static inline struct acpi_table_header *tboot_get_dmar_table( + struct acpi_table_header *dmar_tbl) +{ + return dmar_tbl; +} + +static inline int tboot_force_iommu(void) +{ + return 0; +} + +#endif /* !CONFIG_INTEL_TXT */ + +#endif /* _ASM_TBOOT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7b..832cb838cb48 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o +obj-$(CONFIG_INTEL_TXT) += tboot.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de2cab132844..80d6e9e32483 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -145,6 +145,8 @@ struct boot_params __initdata boot_params; struct boot_params boot_params; #endif +#include + /* * Machine setup.. */ @@ -964,6 +966,8 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_done(swapper_pg_dir); paravirt_post_allocator_init(); + tboot_probe(); + #ifdef CONFIG_X86_64 map_vsyscall(); #endif diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c new file mode 100644 index 000000000000..263591afd29e --- /dev/null +++ b/arch/x86/kernel/tboot.c @@ -0,0 +1,379 @@ +/* + * tboot.c: main implementation of helper functions used by kernel for + * runtime support of Intel(R) Trusted Execution Technology + * + * Copyright (c) 2006-2009, Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acpi/realmode/wakeup.h" + +/* Global pointer to shared data; NULL means no measured launch. */ +struct tboot *tboot __read_mostly; + +/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ +#define AP_WAIT_TIMEOUT 1 + +#undef pr_fmt +#define pr_fmt(fmt) "tboot: " fmt + +static u8 tboot_uuid[16] __initdata = TBOOT_UUID; + +void __init tboot_probe(void) +{ + /* Look for valid page-aligned address for shared page. */ + if (!boot_params.tboot_addr) + return; + /* + * also verify that it is mapped as we expect it before calling + * set_fixmap(), to reduce chance of garbage value causing crash + */ + if (!e820_any_mapped(boot_params.tboot_addr, + boot_params.tboot_addr, E820_RESERVED)) { + pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n"); + return; + } + + /* only a natively booted kernel should be using TXT */ + if (paravirt_enabled()) { + pr_warning("non-0 tboot_addr but pv_ops is enabled\n"); + return; + } + + /* Map and check for tboot UUID. */ + set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); + tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); + if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { + pr_warning("tboot at 0x%llx is invalid\n", + boot_params.tboot_addr); + tboot = NULL; + return; + } + if (tboot->version < 5) { + pr_warning("tboot version is invalid: %u\n", tboot->version); + tboot = NULL; + return; + } + + pr_info("found shared page at phys addr 0x%llx:\n", + boot_params.tboot_addr); + pr_debug("version: %d\n", tboot->version); + pr_debug("log_addr: 0x%08x\n", tboot->log_addr); + pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry); + pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base); + pr_debug("tboot_size: 0x%x\n", tboot->tboot_size); +} + +static pgd_t *tboot_pg_dir; +static struct mm_struct tboot_mm = { + .mm_rb = RB_ROOT, + .pgd = swapper_pg_dir, + .mm_users = ATOMIC_INIT(2), + .mm_count = ATOMIC_INIT(1), + .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .cpu_vm_mask = CPU_MASK_ALL, +}; + +static inline void switch_to_tboot_pt(void) +{ + write_cr3(virt_to_phys(tboot_pg_dir)); +} + +static int map_tboot_page(unsigned long vaddr, unsigned long pfn, + pgprot_t prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(&tboot_mm, vaddr); + pud = pud_alloc(&tboot_mm, pgd, vaddr); + if (!pud) + return -1; + pmd = pmd_alloc(&tboot_mm, pud, vaddr); + if (!pmd) + return -1; + pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + if (!pte) + return -1; + set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); + pte_unmap(pte); + return 0; +} + +static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn, + unsigned long nr) +{ + /* Reuse the original kernel mapping */ + tboot_pg_dir = pgd_alloc(&tboot_mm); + if (!tboot_pg_dir) + return -1; + + for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) { + if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC)) + return -1; + } + + return 0; +} + +void tboot_create_trampoline(void) +{ + u32 map_base, map_size; + + if (!tboot_enabled()) + return; + + /* Create identity map for tboot shutdown code. */ + map_base = PFN_DOWN(tboot->tboot_base); + map_size = PFN_UP(tboot->tboot_size); + if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); +} + +static void set_mac_regions(void) +{ + tboot->num_mac_regions = 3; + /* S3 resume code */ + tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); + tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; + /* AP trampoline code */ + tboot->mac_regions[1].start = + PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); + tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; + /* kernel code + data + bss */ + tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - + PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); +} + +void tboot_shutdown(u32 shutdown_type) +{ + void (*shutdown)(void); + + if (!tboot_enabled()) + return; + + /* + * if we're being called before the 1:1 mapping is set up then just + * return and let the normal shutdown happen; this should only be + * due to very early panic() + */ + if (!tboot_pg_dir) + return; + + /* if this is S3 then set regions to MAC */ + if (shutdown_type == TB_SHUTDOWN_S3) + set_mac_regions(); + + tboot->shutdown_type = shutdown_type; + + switch_to_tboot_pt(); + + shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry; + shutdown(); + + /* should not reach here */ + while (1) + halt(); +} + +static void tboot_copy_fadt(const struct acpi_table_fadt *fadt) +{ +#define TB_COPY_GAS(tbg, g) \ + tbg.space_id = g.space_id; \ + tbg.bit_width = g.bit_width; \ + tbg.bit_offset = g.bit_offset; \ + tbg.access_width = g.access_width; \ + tbg.address = g.address; + + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block); + TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block); + + /* + * We need phys addr of waking vector, but can't use virt_to_phys() on + * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys + * addr. + */ + tboot->acpi_sinfo.wakeup_vector = fadt->facs + + offsetof(struct acpi_table_facs, firmware_waking_vector); +} + +void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) +{ + static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { + /* S0,1,2: */ -1, -1, -1, + /* S3: */ TB_SHUTDOWN_S3, + /* S4: */ TB_SHUTDOWN_S4, + /* S5: */ TB_SHUTDOWN_S5 }; + + if (!tboot_enabled()) + return; + + tboot_copy_fadt(&acpi_gbl_FADT); + tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; + tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; + /* we always use the 32b wakeup vector */ + tboot->acpi_sinfo.vector_width = 32; + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + if (sleep_state >= ACPI_S_STATE_COUNT || + acpi_shutdown_map[sleep_state] == -1) { + pr_warning("unsupported sleep state 0x%x\n", sleep_state); + return; + } + + tboot_shutdown(acpi_shutdown_map[sleep_state]); +} + +int tboot_wait_for_aps(int num_aps) +{ + unsigned long timeout; + + if (!tboot_enabled()) + return 0; + + timeout = jiffies + AP_WAIT_TIMEOUT*HZ; + while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps && + time_before(jiffies, timeout)) + cpu_relax(); + + return time_before(jiffies, timeout) ? 0 : 1; +} + +/* + * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE) + */ + +#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000 +#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000 + +/* # pages for each config regs space - used by fixmap */ +#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \ + TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT) + +/* offsets from pub/priv config space */ +#define TXTCR_HEAP_BASE 0x0300 +#define TXTCR_HEAP_SIZE 0x0308 + +#define SHA1_SIZE 20 + +struct sha1_hash { + u8 hash[SHA1_SIZE]; +}; + +struct sinit_mle_data { + u32 version; /* currently 6 */ + struct sha1_hash bios_acm_id; + u32 edx_senter_flags; + u64 mseg_valid; + struct sha1_hash sinit_hash; + struct sha1_hash mle_hash; + struct sha1_hash stm_hash; + struct sha1_hash lcp_policy_hash; + u32 lcp_policy_control; + u32 rlp_wakeup_addr; + u32 reserved; + u32 num_mdrs; + u32 mdrs_off; + u32 num_vtd_dmars; + u32 vtd_dmars_off; +} __packed; + +struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl) +{ + void *heap_base, *heap_ptr, *config; + + if (!tboot_enabled()) + return dmar_tbl; + + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + + /* map config space in order to get heap addr */ + config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES * + PAGE_SIZE); + if (!config) + return NULL; + + /* now map TXT heap */ + heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE), + *(u64 *)(config + TXTCR_HEAP_SIZE)); + iounmap(config); + if (!heap_base) + return NULL; + + /* walk heap to SinitMleData */ + /* skip BiosData */ + heap_ptr = heap_base + *(u64 *)heap_base; + /* skip OsMleData */ + heap_ptr += *(u64 *)heap_ptr; + /* skip OsSinitData */ + heap_ptr += *(u64 *)heap_ptr; + /* now points to SinitMleDataSize; set to SinitMleData */ + heap_ptr += sizeof(u64); + /* get addr of DMAR table */ + dmar_tbl = (struct acpi_table_header *)(heap_ptr + + ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off - + sizeof(u64)); + + /* don't unmap heap because dmar.c needs access to this */ + + return dmar_tbl; +} + +int tboot_force_iommu(void) +{ + if (!tboot_enabled()) + return 0; + + if (no_iommu || swiotlb || dmar_disabled) + pr_warning("Forcing Intel-IOMMU to enabled\n"); + + dmar_disabled = 0; +#ifdef CONFIG_SWIOTLB + swiotlb = 0; +#endif + no_iommu = 0; + + return 1; +} diff --git a/security/Kconfig b/security/Kconfig index d23c839038f0..edc7cbdc012a 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,36 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config INTEL_TXT + bool "Enable Intel(R) Trusted Execution Technology (Intel(R) TXT)" + depends on EXPERIMENTAL && X86 && DMAR && ACPI + help + This option enables support for booting the kernel with the + Trusted Boot (tboot) module. This will utilize + Intel(R) Trusted Execution Technology to perform a measured launch + of the kernel. If the system does not support Intel(R) TXT, this + will have no effect. + + Intel TXT will provide higher assurance of sysem configuration and + initial state as well as data reset protection. This is used to + create a robust initial kernel measurement and verification, which + helps to ensure that kernel security mechanisms are functioning + correctly. This level of protection requires a root of trust outside + of the kernel itself. + + Intel TXT also helps solve real end user concerns about having + confidence that their hardware is running the VMM or kernel that + it was conigured with, especially since they may be responsible for + providing such assurances to VMs and services running on it. + + See for more information + about Intel(R) TXT. + See for more information about tboot. + See Documentation/intel_txt.txt for a description of how to enable + Intel TXT support in a kernel boot. + + If you are unsure as to whether this is required, answer N. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig From 840c2baf2d4cdf35ecc3b7fcbba7740f97de30a4 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:02 -0700 Subject: [PATCH 0041/1662] x86, intel_txt: Intel TXT reboot/halt shutdown support Support for graceful handling of kernel reboots after an Intel(R) TXT launch. Without this patch, attempting to reboot or halt the system will cause the TXT hardware to lock memory upon system restart because the secrets-in-memory flag that was set on launch was never cleared. This will in turn cause BIOS to execute a TXT Authenticated Code Module (ACM) that will scrub all of memory and then unlock it. Depending on the amount of memory in the system and its type, this may take some time. This patch creates a 1:1 address mapping to the tboot module and then calls back into tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag. When it has completed these steps, the tboot module will reboot or halt the system. arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/reboot.c | 8 ++++++++ init/main.c | 3 +++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index d2d1ce8170f0..9de01c5d9794 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -24,6 +24,8 @@ # include #endif +#include + /* * Power off function, if any */ @@ -460,6 +462,8 @@ static void native_machine_emergency_restart(void) if (reboot_emergency) emergency_vmx_disable_all(); + tboot_shutdown(TB_SHUTDOWN_REBOOT); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -586,6 +590,8 @@ static void native_machine_halt(void) /* stop other cpus and apics */ machine_shutdown(); + tboot_shutdown(TB_SHUTDOWN_HALT); + /* stop this cpu */ stop_this_cpu(NULL); } @@ -597,6 +603,8 @@ static void native_machine_power_off(void) machine_shutdown(); pm_power_off(); } + /* a fallback in case there is no PM info available */ + tboot_shutdown(TB_SHUTDOWN_HALT); } struct machine_ops machine_ops = { diff --git a/init/main.c b/init/main.c index 2c5ade79eb81..56ada27c4f47 100644 --- a/init/main.c +++ b/init/main.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -715,6 +716,8 @@ asmlinkage void __init start_kernel(void) ftrace_init(); + tboot_create_trampoline(); + /* Do the rest non-__init'ed, we're now alive */ rest_init(); } From 86886e55b273f565935491816c7c96b82469d4f8 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:07 -0700 Subject: [PATCH 0042/1662] x86, intel_txt: Intel TXT Sx shutdown support Support for graceful handling of sleep states (S3/S4/S5) after an Intel(R) TXT launch. Without this patch, attempting to place the system in one of the ACPI sleep states (S3/S4/S5) will cause the TXT hardware to treat this as an attack and will cause a system reset, with memory locked. Not only may the subsequent memory scrub take some time, but the platform will be unable to enter the requested power state. This patch calls back into the tboot so that it may properly and securely clean up system state and clear the secrets-in-memory flag, after which it will place the system into the requested sleep state using ACPI information passed by the kernel. arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 2 ++ drivers/acpi/acpica/hwsleep.c | 3 +++ kernel/cpu.c | 7 ++++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee64..61cc40887c48 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -1317,6 +1318,7 @@ void play_dead_common(void) void native_play_dead(void) { play_dead_common(); + tboot_shutdown(TB_SHUTDOWN_WFS); wbinvd_halt(); } diff --git a/drivers/acpi/acpica/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index db307a356f08..8c01dd3724e0 100644 --- a/drivers/acpi/acpica/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -45,6 +45,7 @@ #include #include "accommon.h" #include "actables.h" +#include #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -342,6 +343,8 @@ acpi_status asmlinkage acpi_enter_sleep_state(u8 sleep_state) ACPI_FLUSH_CPU_CACHE(); + tboot_sleep(sleep_state, pm1a_control, pm1b_control); + /* Write #2: Write both SLP_TYP + SLP_EN */ status = acpi_hw_write_pm1_control(pm1a_control, pm1b_control); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4ac..ff071e022a85 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -14,6 +14,7 @@ #include #include #include +#include #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -376,7 +377,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error, num_cpus = 0; error = stop_machine_create(); if (error) @@ -391,6 +392,7 @@ int disable_nonboot_cpus(void) for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; + num_cpus++; error = _cpu_down(cpu, 1); if (!error) { cpumask_set_cpu(cpu, frozen_cpus); @@ -401,6 +403,9 @@ int disable_nonboot_cpus(void) break; } } + /* ensure all CPUs have gone into wait-for-SIPI */ + error |= tboot_wait_for_aps(num_cpus); + if (!error) { BUG_ON(num_online_cpus() > 1); /* Make sure the CPUs won't be enabled by someone else */ From a59b50e995465911ba580df0bd10cf64aa81fc43 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 30 Jun 2009 19:31:10 -0700 Subject: [PATCH 0043/1662] intel_txt: Force IOMMU on for Intel TXT launch The tboot module will DMA protect all of memory in order to ensure the that kernel will be able to initialize without compromise (from DMA). Consequently, the kernel must enable Intel Virtualization Technology for Directed I/O (VT-d or Intel IOMMU) in order to replace this broad protection with the appropriate page-granular protection. Otherwise DMA devices will be unable to read or write from memory and the kernel will eventually panic. Because runtime IOMMU support is configurable by command line options, this patch will force it to be enabled regardless of the options specified, and will log a message if it was required to force it on. dmar.c | 7 +++++++ intel-iommu.c | 17 +++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Cc: David Woodhouse Signed-off-by: H. Peter Anvin --- drivers/pci/dmar.c | 7 +++++++ drivers/pci/intel-iommu.c | 19 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 7b287cb38b7a..0cbc5fd26c3c 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -33,6 +33,7 @@ #include #include #include +#include #undef PREFIX #define PREFIX "DMAR:" @@ -413,6 +414,12 @@ parse_dmar_table(void) */ dmar_table_detect(); + /* + * ACPI tables may not be DMA protected by tboot, so use DMAR copy + * SINIT saved in SinitMleData in TXT heap (which is DMA protected) + */ + dmar_tbl = tboot_get_dmar_table(dmar_tbl); + dmar = (struct acpi_table_dmar *)dmar_tbl; if (!dmar) return -ENODEV; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index ebc9b8dca881..2dc72a6d7412 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include "pci.h" @@ -3183,12 +3184,22 @@ static int __init init_iommu_sysfs(void) int __init intel_iommu_init(void) { int ret = 0; + int force_on = 0; - if (dmar_table_init()) - return -ENODEV; + /* VT-d is required for a TXT/tboot launch, so enforce that */ + force_on = tboot_force_iommu(); - if (dmar_dev_scope_init()) + if (dmar_table_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR table\n"); return -ENODEV; + } + + if (dmar_dev_scope_init()) { + if (force_on) + panic("tboot: Failed to initialize DMAR device scope\n"); + return -ENODEV; + } /* * Check the need for DMA-remapping initialization now. @@ -3204,6 +3215,8 @@ int __init intel_iommu_init(void) ret = init_dmars(); if (ret) { + if (force_on) + panic("tboot: Failed to initialize DMARs\n"); printk(KERN_ERR "IOMMU: dmar init failed\n"); put_iova_domain(&reserved_iova_list); iommu_exit_mempool(); From 5c04c78afba4805846519f29f0b55ac8759e0d48 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 26 Jul 2009 21:52:01 -0500 Subject: [PATCH 0044/1662] xfs: reduce bmv_count in xfs_vn_fiemap commit 6321e3ed2acf3ee9643cdd403e1c88605d7944ba caused the full bmv_count's worth of getbmapx structures to get allocated; telling it to do MAXEXTNUM was a bit insane, resulting in ENOMEM every time. Chop it down to something reasonable, the number of slots in the caller's input buffer. If this is too large the caller may get ENOMEM but the reason should not be a mystery, and they can try again with something smaller. We add 1 to the value because in the normal getbmap world, bmv_count includes the header and xfs_getbmap does: nex = bmv->bmv_count - 1; if (nex <= 0) return XFS_ERROR(EINVAL); Signed-off-by: Eric Sandeen Reviewed-by: Olaf Weber Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 58973bb46038..8070b34cc287 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -680,8 +680,8 @@ xfs_vn_fiemap( else bm.bmv_length = BTOBB(length); - /* our formatter will tell xfs_getbmap when to stop. */ - bm.bmv_count = MAXEXTNUM; + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = fieinfo->fi_extents_max + 1; bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; From 94699b04eddd4b247d871930431d6fa1a46c175e Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:52:54 +0200 Subject: [PATCH 0045/1662] x86, mce: don't log boot MCEs on Pentium M (model == 13) CPUs On my legacy Pentium M laptop (Acer Extensa 2900) I get bogus MCE on a cold boot with CONFIG_X86_NEW_MCE enabled, i.e. (after decoding it with mcelog): MCE 0 HARDWARE ERROR. This is *NOT* a software problem! Please contact your hardware vendor CPU 0 BANK 1 MCG status: MCi status: Error overflow Uncorrected error Error enabled Processor context corrupt MCA: Data CACHE Level-1 UNKNOWN Error STATUS f200000000000195 MCGSTATUS 0 [ The other STATUS values observed: f2000000000001b5 (... UNKNOWN error) and f200000000000115 (... READ Error). To verify that this is not a CONFIG_X86_NEW_MCE bug I also modified the CONFIG_X86_OLD_MCE code (which doesn't log any MCEs) to dump content of STATUS MSR before it is cleared during initialization. ] Since the bogus MCE results in a kernel taint (which in turn disables lockdep support) don't log boot MCEs on Pentium M (model == 13) CPUs by default ("mce=bootlog" boot parameter can be be used to get the old behavior). Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 07139a0578e3..7bd19c7f5315 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1269,6 +1269,10 @@ static void mce_cpu_quirks(struct cpuinfo_x86 *c) if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && monarch_timeout < 0) monarch_timeout = USEC_PER_SEC; + + /* There are also broken BIOSes on some Pentium M systems. */ + if (c->x86 == 6 && c->x86_model == 13 && mce_bootlog < 0) + mce_bootlog = 0; } if (monarch_timeout < 0) monarch_timeout = 0; From e3346fc48204d780f92527d06df8bf6f28d603ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:55:09 +0200 Subject: [PATCH 0046/1662] x86, mce: fix "mce" boot option handling for CONFIG_X86_NEW_MCE "mce argument mce ignored. Please use /sys" message shouldn't be printed when using "mce" boot option. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7bd19c7f5315..75919440a188 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1549,8 +1549,10 @@ static struct miscdevice mce_log_device = { */ static int __init mcheck_enable(char *str) { - if (*str == 0) + if (*str == 0) { enable_p5_mce(); + return 1; + } if (*str == '=') str++; if (!strcmp(str, "off")) From 419d6162c0c0103fa2f44f6691dff9cac14c650d Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:00 +0200 Subject: [PATCH 0047/1662] x86, mce: add missing __cpuinit tags mce_cap_init() and mce_cpu_quirks() can be tagged with __cpuinit. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 75919440a188..1ce6db1f8789 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1158,7 +1158,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int mce_cap_init(void) +static int __cpuinit mce_cap_init(void) { unsigned b; u64 cap; @@ -1222,7 +1222,7 @@ static void mce_init(void) } /* Add per CPU specific workarounds here */ -static void mce_cpu_quirks(struct cpuinfo_x86 *c) +static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { From d0c87d1f61704ed589fc0788bedd753632340e98 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 23:56:37 +0200 Subject: [PATCH 0048/1662] x86, mce: remove never executed code fseverities_coverage is never NULL in err_out code path. Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-severity.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index ff0807f97056..51f7c725dab5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (fseverities_coverage) - debugfs_remove(fseverities_coverage); if (dmce) debugfs_remove(dmce); return -ENOMEM; From f3a0867b12e0cf1512c0bd0665f2339fc75ed2a8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 29 Jul 2009 00:04:59 +0200 Subject: [PATCH 0049/1662] x86, mce: fix reporting of Thermal Monitoring mechanism enabled Early Pentium M models use different method for enabling TM2 (per paragraph 13.5.2.3 of the "Intel 64 and IA-32 Architectures Software Developer's Manual Volume 3A: System Programming Guide, Part 1"). Tested on the affected Pentium M variant (model == 13). Signed-off-by: Bartlomiej Zolnierkiewicz Cc: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/msr-index.h | 4 ++++ arch/x86/kernel/cpu/mcheck/therm_throt.c | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d1ce094586a..cbec06deb68b 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -222,6 +222,10 @@ #define THERM_STATUS_PROCHOT (1 << 0) +#define MSR_THERM2_CTL 0x0000019d + +#define MSR_THERM2_CTL_TM_SELECT (1ULL << 16) + #define MSR_IA32_MISC_ENABLE 0x000001a0 /* MISC_ENABLE bits: architectural */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index bff8dd191dd5..15f2bc07bb60 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -253,9 +253,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } - if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2)) - tm2 = 1; - /* Check whether a vector already exists */ if (h & APIC_VECTOR_MASK) { printk(KERN_DEBUG @@ -264,6 +261,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c) return; } + /* early Pentium M models use different method for enabling TM2 */ + if (cpu_has(c, X86_FEATURE_TM2)) { + if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) { + rdmsr(MSR_THERM2_CTL, l, h); + if (l & MSR_THERM2_CTL_TM_SELECT) + tm2 = 1; + } else if (l & MSR_IA32_MISC_ENABLE_TM2) + tm2 = 1; + } + /* We'll mask the thermal vector in the lapic till we're ready: */ h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; apic_write(APIC_LVTTHMR, h); From ede58517ca93277547a0d054728c352618212d85 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 27 Jul 2009 18:15:25 +0200 Subject: [PATCH 0050/1662] fs/xfs: Correct redundant test bp was tested for NULL a few lines before, followed by a return, and there is no intervening modification of its value. A simplified version of the semantic match that finds this problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; expression E; position p1,p2; @@ if (x == NULL || ...) { ... when forall return ...; } ... when != \(x=E\|x--\|x++\|--x\|++x\|x-=E\|x+=E\|x|=E\|x&=E\|&x\) ( *x == NULL | *x != NULL ) // Signed-off-by: Julia Lawall Acked-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_trans_buf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee2f8c8b0a6..218829e6a152 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -307,7 +307,7 @@ xfs_trans_read_buf( return (flags & XFS_BUF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { + if (XFS_BUF_GETERROR(bp) != 0) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); error = XFS_BUF_GETERROR(bp); @@ -315,7 +315,7 @@ xfs_trans_read_buf( return error; } #ifdef DEBUG - if (xfs_do_error && (bp != NULL)) { + if (xfs_do_error) { if (xfs_error_target == target) { if (((xfs_req_num++) % xfs_error_mod) == 0) { xfs_buf_relse(bp); From a81655ae5a9da16a08ba14637dc6c10217e57492 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 31 Jul 2009 00:02:17 -0500 Subject: [PATCH 0051/1662] xfs: bump up nr_to_write in xfs_vm_writepage VM calculation for nr_to_write seems off. Bump it way up, this gets simple streaming writes zippy again. To be reviewed again after Jens' writeback changes. Signed-off-by: Christoph Hellwig Signed-off-by: Eric Sandeen Cc: Chris Mason Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_aops.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc05b2b..aecf2519db76 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1268,6 +1268,14 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * VM calculation for nr_to_write seems off. Bump it way + * up, this gets simple streaming writes zippy again. + * To be reviewed again after Jens' writeback changes. + */ + wbc->nr_to_write *= 4; + /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. From 2a4ceb6d3e6a566cb4a9dc8f974177f031d27cd7 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 27 Jul 2009 10:27:29 +0100 Subject: [PATCH 0052/1662] agp: Switch mask_memory() method to take address argument again, not page In commit 07613ba2 ("agp: switch AGP to use page array instead of unsigned long array") we switched the mask_memory() method to take a 'struct page *' instead of an address. This is painful, because in some cases it has to be an IOMMU-mapped virtual bus address (in fact, shouldn't it _always_ be a dma_addr_t returned from pci_map_xxx(), and we just happen to get lucky most of the time?) Signed-off-by: David Woodhouse --- drivers/char/agp/agp.h | 4 ++-- drivers/char/agp/amd-k7-agp.c | 4 +++- drivers/char/agp/amd64-agp.c | 3 ++- drivers/char/agp/ati-agp.c | 3 ++- drivers/char/agp/backend.c | 4 ++-- drivers/char/agp/generic.c | 7 ++++--- drivers/char/agp/hp-agp.c | 4 +--- drivers/char/agp/i460-agp.c | 13 +++---------- drivers/char/agp/intel-agp.c | 14 +++++++------- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/parisc-agp.c | 12 ++---------- drivers/char/agp/sgi-agp.c | 8 ++++---- drivers/char/agp/sworks-agp.c | 4 +++- 13 files changed, 36 insertions(+), 46 deletions(-) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 178e2e9e9f09..ce110a3bf298 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -107,7 +107,7 @@ struct agp_bridge_driver { void (*agp_enable)(struct agp_bridge_data *, u32); void (*cleanup)(void); void (*tlb_flush)(struct agp_memory *); - unsigned long (*mask_memory)(struct agp_bridge_data *, struct page *, int); + unsigned long (*mask_memory)(struct agp_bridge_data *, dma_addr_t, int); void (*cache_flush)(void); int (*create_gatt_table)(struct agp_bridge_data *); int (*free_gatt_table)(struct agp_bridge_data *); @@ -291,7 +291,7 @@ int agp_3_5_enable(struct agp_bridge_data *bridge); void global_cache_flush(void); void get_agp_version(struct agp_bridge_data *bridge); unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type); + dma_addr_t phys, int type); int agp_generic_type_to_mask_type(struct agp_bridge_data *bridge, int type); struct agp_bridge_data *agp_generic_find_bridge(struct pci_dev *pdev); diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index ba9bde71eaaf..542a87895ae9 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -325,7 +325,9 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - mem->pages[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); + phys_to_gart(page_to_phys(mem->pages[i])), + mem->type), + cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. */ } amd_irongate_tlbflush(mem); diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 3bf5dda90f4a..e85a5b3e952e 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,8 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], mask_type); + phys_to_gart(page_to_phys(mem->pages[i])), + mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); pte = (tmp & 0x000000ff00000000ULL) >> 28; diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 33656e144cc5..59ebd60c1b60 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,8 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], mem->type), + phys_to_gart(page_to_phys(mem->pages[i])), + mem->type), cur_gatt+GET_GATT_OFF(addr)); } readl(GET_GATT(agp_bridge->gart_bus_addr)); /* PCI posting */ diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index cfa5a649dfe7..3bd7e503de41 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -150,8 +150,8 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) } bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); - bridge->scratch_page = - bridge->driver->mask_memory(bridge, page, 0); + bridge->scratch_page = bridge->driver->mask_memory(bridge, + phys_to_gart(page_to_phys(page)), 0); } size_value = bridge->driver->fetch_size(); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 1e8b461b91f1..a3bcc7ef42f9 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -1132,7 +1132,9 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { - writel(bridge->driver->mask_memory(bridge, mem->pages[i], mask_type), + writel(bridge->driver->mask_memory(bridge, + phys_to_gart(page_to_phys(mem->pages[i])), + mask_type), bridge->gatt_table+j); } readl(bridge->gatt_table+j-1); /* PCI Posting. */ @@ -1347,9 +1349,8 @@ void global_cache_flush(void) EXPORT_SYMBOL(global_cache_flush); unsigned long agp_generic_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type) + dma_addr_t addr, int type) { - unsigned long addr = phys_to_gart(page_to_phys(page)); /* memory type is ignored in the generic routine */ if (bridge->driver->masks) return addr | bridge->driver->masks[0].mask; diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 8f3d4c184914..64dbf4b1cf2f 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -394,10 +394,8 @@ hp_zx1_remove_memory (struct agp_memory *mem, off_t pg_start, int type) } static unsigned long -hp_zx1_mask_memory (struct agp_bridge_data *bridge, - struct page *page, int type) +hp_zx1_mask_memory (struct agp_bridge_data *bridge, dma_addr_t addr, int type) { - unsigned long addr = phys_to_gart(page_to_phys(page)); return HP_ZX1_PDIR_VALID_BIT | addr; } diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 60cc35bb5db7..54191f860539 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -61,7 +61,7 @@ #define WR_FLUSH_GATT(index) RD_GATT(index) static unsigned long i460_mask_memory (struct agp_bridge_data *bridge, - unsigned long addr, int type); + dma_addr_t addr, int type); static struct { void *gatt; /* ioremap'd GATT area */ @@ -546,20 +546,13 @@ static void i460_destroy_page (struct page *page, int flags) #endif /* I460_LARGE_IO_PAGES */ static unsigned long i460_mask_memory (struct agp_bridge_data *bridge, - unsigned long addr, int type) + dma_addr_t addr, int type) { /* Make sure the returned address is a valid GATT entry */ return bridge->driver->masks[0].mask | (((addr & ~((1 << I460_IO_PAGE_SHIFT) - 1)) & 0xfffff000) >> 12); } -static unsigned long i460_page_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type) -{ - unsigned long addr = phys_to_gart(page_to_phys(page)); - return i460_mask_memory(bridge, addr, type); -} - const struct agp_bridge_driver intel_i460_driver = { .owner = THIS_MODULE, .aperture_sizes = i460_sizes, @@ -569,7 +562,7 @@ const struct agp_bridge_driver intel_i460_driver = { .fetch_size = i460_fetch_size, .cleanup = i460_cleanup, .tlb_flush = i460_tlb_flush, - .mask_memory = i460_page_mask_memory, + .mask_memory = i460_mask_memory, .masks = i460_masks, .agp_enable = agp_generic_enable, .cache_flush = global_cache_flush, diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 8c9d50db5c3a..21983456d672 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -343,7 +343,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], + phys_to_gart(page_to_phys(mem->pages[i])), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } @@ -461,9 +461,8 @@ static void intel_i810_free_by_type(struct agp_memory *curr) } static unsigned long intel_i810_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type) + dma_addr_t addr, int type) { - unsigned long addr = phys_to_gart(page_to_phys(page)); /* Type checking must be done elsewhere */ return addr | bridge->driver->masks[type].mask; } @@ -851,7 +850,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], mask_type), + phys_to_gart(page_to_phys(mem->pages[i])), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -1081,7 +1080,9 @@ static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], mask_type), intel_private.gtt+j); + phys_to_gart(page_to_phys(mem->pages[i])), + mask_type), + intel_private.gtt+j); } readl(intel_private.gtt+j-1); @@ -1196,9 +1197,8 @@ static int intel_i915_create_gatt_table(struct agp_bridge_data *bridge) * this conditional. */ static unsigned long intel_i965_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type) + dma_addr_t addr, int type) { - dma_addr_t addr = phys_to_gart(page_to_phys(page)); /* Shift high bits down */ addr |= (addr >> 28) & 0xf0; diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index 263d71dd441c..cedacee30ec3 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - mem->pages[i], mask_type), + phys_to_gart(page_to_phys(mem->pages[i])), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index f4bb43fb8016..1c129211302d 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -32,7 +32,7 @@ #define AGP8X_MODE (1 << AGP8X_MODE_BIT) static unsigned long -parisc_agp_mask_memory(struct agp_bridge_data *bridge, unsigned long addr, +parisc_agp_mask_memory(struct agp_bridge_data *bridge, dma_addr_t addr, int type); static struct _parisc_agp_info { @@ -189,20 +189,12 @@ parisc_agp_remove_memory(struct agp_memory *mem, off_t pg_start, int type) } static unsigned long -parisc_agp_mask_memory(struct agp_bridge_data *bridge, unsigned long addr, +parisc_agp_mask_memory(struct agp_bridge_data *bridge, dma_addr_t addr, int type) { return SBA_PDIR_VALID_BIT | addr; } -static unsigned long -parisc_agp_page_mask_memory(struct agp_bridge_data *bridge, struct page *page, - int type) -{ - unsigned long addr = phys_to_gart(page_to_phys(page)); - return SBA_PDIR_VALID_BIT | addr; -} - static void parisc_agp_enable(struct agp_bridge_data *bridge, u32 mode) { diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index d3ea2e4226b5..0d47fa847404 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -70,10 +70,9 @@ static void sgi_tioca_tlbflush(struct agp_memory *mem) * entry. */ static unsigned long -sgi_tioca_mask_memory(struct agp_bridge_data *bridge, - struct page *page, int type) +sgi_tioca_mask_memory(struct agp_bridge_data *bridge, dma_addr_t addr, + int type) { - unsigned long addr = phys_to_gart(page_to_phys(page)); return tioca_physpage_to_gart(addr); } @@ -190,7 +189,8 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = - bridge->driver->mask_memory(bridge, mem->pages[i], + bridge->driver->mask_memory(bridge, + phys_to_gart(page_to_phys(mem->pages[i])), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index b964a2199329..07259952fc32 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -349,7 +349,9 @@ static int serverworks_insert_memory(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); - writel(agp_bridge->driver->mask_memory(agp_bridge, mem->pages[i], mem->type), cur_gatt+GET_GATT_OFF(addr)); + writel(agp_bridge->driver->mask_memory(agp_bridge, + phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); return 0; From ff663cf8705bea101d5f73cf471855c85242575e Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Thu, 23 Jul 2009 17:25:49 +0100 Subject: [PATCH 0053/1662] agp: Add generic support for graphics dma remapping New driver hooks for support graphics memory dma remapping are introduced in this patch. It makes generic code can tell if current device needs dma remapping, then call driver provided interfaces for mapping and unmapping. Change has also been made to handle scratch_page in remapping case. Signed-off-by: Zhenyu Wang Signed-off-by: David Woodhouse --- drivers/char/agp/agp.h | 6 ++++++ drivers/char/agp/backend.c | 20 ++++++++++++++++++++ drivers/char/agp/generic.c | 9 +++++++++ include/linux/agp_backend.h | 6 +++++- 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index ce110a3bf298..17e6d0d3ba36 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -121,6 +121,11 @@ struct agp_bridge_driver { void (*agp_destroy_pages)(struct agp_memory *); int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); void (*chipset_flush)(struct agp_bridge_data *); + + int (*agp_map_page)(void *addr, dma_addr_t *ret); + void (*agp_unmap_page)(void *addr, dma_addr_t dma); + int (*agp_map_memory)(struct agp_memory *mem); + void (*agp_unmap_memory)(struct agp_memory *mem); }; struct agp_bridge_data { @@ -135,6 +140,7 @@ struct agp_bridge_data { u32 *gatt_table_real; unsigned long scratch_page; unsigned long scratch_page_real; + dma_addr_t scratch_page_dma; unsigned long gart_bus_addr; unsigned long gatt_bus_addr; u32 mode; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 3bd7e503de41..19ac3663acdc 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -152,6 +152,15 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); bridge->scratch_page = bridge->driver->mask_memory(bridge, phys_to_gart(page_to_phys(page)), 0); + + if (bridge->driver->agp_map_page && + bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)), + &bridge->scratch_page_dma)) { + dev_err(&bridge->dev->dev, + "unable to dma-map scratch page\n"); + rc = -ENOMEM; + goto err_out_nounmap; + } } size_value = bridge->driver->fetch_size(); @@ -191,6 +200,13 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) return 0; err_out: + if (bridge->driver->needs_scratch_page && + bridge->driver->agp_unmap_page) { + void *va = gart_to_virt(bridge->scratch_page_real); + + bridge->driver->agp_unmap_page(va, bridge->scratch_page_dma); + } +err_out_nounmap: if (bridge->driver->needs_scratch_page) { void *va = gart_to_virt(bridge->scratch_page_real); @@ -221,6 +237,10 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge) bridge->driver->needs_scratch_page) { void *va = gart_to_virt(bridge->scratch_page_real); + if (bridge->driver->agp_unmap_page) + bridge->driver->agp_unmap_page(va, + bridge->scratch_page_dma); + bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_UNMAP); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_FREE); } diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index a3bcc7ef42f9..28f0208c66a6 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -437,6 +437,12 @@ int agp_bind_memory(struct agp_memory *curr, off_t pg_start) curr->bridge->driver->cache_flush(); curr->is_flushed = true; } + + if (curr->bridge->driver->agp_map_memory) { + ret_val = curr->bridge->driver->agp_map_memory(curr); + if (ret_val) + return ret_val; + } ret_val = curr->bridge->driver->insert_memory(curr, pg_start, curr->type); if (ret_val != 0) @@ -478,6 +484,9 @@ int agp_unbind_memory(struct agp_memory *curr) if (ret_val != 0) return ret_val; + if (curr->bridge->driver->agp_unmap_memory) + curr->bridge->driver->agp_unmap_memory(curr); + curr->is_bound = false; curr->pg_start = 0; spin_lock(&curr->bridge->mapped_lock); diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 76fa794fdac0..8a294d65b9b1 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -79,9 +79,13 @@ struct agp_memory { u32 physical; bool is_bound; bool is_flushed; - bool vmalloc_flag; + bool vmalloc_flag; + bool sg_vmalloc_flag; /* list of agp_memory mapped to the aperture */ struct list_head mapped_list; + /* DMA-mapped addresses */ + struct scatterlist *sg_list; + int num_sg; }; #define AGP_NORMAL_MEMORY 0 From 176616814d700f19914d8509d9f65dec51a6ebf7 Mon Sep 17 00:00:00 2001 From: Zhenyu Wang Date: Mon, 27 Jul 2009 12:59:57 +0100 Subject: [PATCH 0054/1662] intel_agp: Use PCI DMA API correctly on chipsets new enough to have IOMMU When graphics dma remapping engine is active, we must fill gart table with dma address from dmar engine, as now graphics device access to graphics memory must go through dma remapping table to get real physical address. Add this support to all drivers which use intel_i915_insert_entries() Signed-off-by: Zhenyu Wang Signed-off-by: David Woodhouse --- drivers/char/agp/intel-agp.c | 174 ++++++++++++++++++++++++++++++++--- 1 file changed, 162 insertions(+), 12 deletions(-) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 21983456d672..20fe82b99fdb 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -10,6 +10,16 @@ #include #include "agp.h" +/* + * If we have Intel graphics, we're not going to have anything other than + * an Intel IOMMU. So make the correct use of the PCI DMA API contingent + * on the Intel IOMMU support (CONFIG_DMAR). + * Only newer chipsets need to bother with this, of course. + */ +#ifdef CONFIG_DMAR +#define USE_PCI_DMA_API 1 +#endif + #define PCI_DEVICE_ID_INTEL_E7221_HB 0x2588 #define PCI_DEVICE_ID_INTEL_E7221_IG 0x258a #define PCI_DEVICE_ID_INTEL_82946GZ_HB 0x2970 @@ -170,6 +180,131 @@ static struct _intel_private { int resource_valid; } intel_private; +#ifdef USE_PCI_DMA_API +static int intel_agp_map_page(void *addr, dma_addr_t *ret) +{ + *ret = pci_map_single(intel_private.pcidev, addr, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + if (pci_dma_mapping_error(intel_private.pcidev, *ret)) + return -EINVAL; + return 0; +} + +static void intel_agp_unmap_page(void *addr, dma_addr_t dma) +{ + pci_unmap_single(intel_private.pcidev, dma, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); +} + +static int intel_agp_map_memory(struct agp_memory *mem) +{ + struct scatterlist *sg; + int i; + + DBG("try mapping %lu pages\n", (unsigned long)mem->page_count); + + if ((mem->page_count * sizeof(*mem->sg_list)) < 2*PAGE_SIZE) + mem->sg_list = kcalloc(mem->page_count, sizeof(*mem->sg_list), + GFP_KERNEL); + + if (mem->sg_list == NULL) { + mem->sg_list = vmalloc(mem->page_count * sizeof(*mem->sg_list)); + mem->sg_vmalloc_flag = 1; + } + + if (!mem->sg_list) { + mem->sg_vmalloc_flag = 0; + return -ENOMEM; + } + sg_init_table(mem->sg_list, mem->page_count); + + sg = mem->sg_list; + for (i = 0 ; i < mem->page_count; i++, sg = sg_next(sg)) + sg_set_page(sg, mem->pages[i], PAGE_SIZE, 0); + + mem->num_sg = pci_map_sg(intel_private.pcidev, mem->sg_list, + mem->page_count, PCI_DMA_BIDIRECTIONAL); + if (!mem->num_sg) { + if (mem->sg_vmalloc_flag) + vfree(mem->sg_list); + else + kfree(mem->sg_list); + mem->sg_list = NULL; + mem->sg_vmalloc_flag = 0; + return -ENOMEM; + } + return 0; +} + +static void intel_agp_unmap_memory(struct agp_memory *mem) +{ + DBG("try unmapping %lu pages\n", (unsigned long)mem->page_count); + + pci_unmap_sg(intel_private.pcidev, mem->sg_list, + mem->page_count, PCI_DMA_BIDIRECTIONAL); + if (mem->sg_vmalloc_flag) + vfree(mem->sg_list); + else + kfree(mem->sg_list); + mem->sg_vmalloc_flag = 0; + mem->sg_list = NULL; + mem->num_sg = 0; +} + +static void intel_agp_insert_sg_entries(struct agp_memory *mem, + off_t pg_start, int mask_type) +{ + struct scatterlist *sg; + int i, j; + + j = pg_start; + + WARN_ON(!mem->num_sg); + + if (mem->num_sg == mem->page_count) { + for_each_sg(mem->sg_list, sg, mem->page_count, i) { + writel(agp_bridge->driver->mask_memory(agp_bridge, + sg_dma_address(sg), mask_type), + intel_private.gtt+j); + j++; + } + } else { + /* sg may merge pages, but we have to seperate + * per-page addr for GTT */ + unsigned int len, m; + + for_each_sg(mem->sg_list, sg, mem->num_sg, i) { + len = sg_dma_len(sg) / PAGE_SIZE; + for (m = 0; m < len; m++) { + writel(agp_bridge->driver->mask_memory(agp_bridge, + sg_dma_address(sg) + m * PAGE_SIZE, + mask_type), + intel_private.gtt+j); + j++; + } + } + } + readl(intel_private.gtt+j-1); +} + +#else + +static void intel_agp_insert_sg_entries(struct agp_memory *mem, + off_t pg_start, int mask_type) +{ + int i, j; + + for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { + writel(agp_bridge->driver->mask_memory(agp_bridge, + phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + intel_private.gtt+j); + } + + readl(intel_private.gtt+j-1); +} + +#endif + static int intel_i810_fetch_size(void) { u32 smram_miscc; @@ -1003,9 +1138,13 @@ static int intel_i915_configure(void) writel(agp_bridge->gatt_bus_addr|I810_PGETBL_ENABLED, intel_private.registers+I810_PGETBL_CTL); readl(intel_private.registers+I810_PGETBL_CTL); /* PCI Posting. */ +#ifndef USE_PCI_DMA_API + agp_bridge->scratch_page_dma = agp_bridge->scratch_page; +#endif + if (agp_bridge->driver->needs_scratch_page) { for (i = intel_private.gtt_entries; i < current_size->num_entries; i++) { - writel(agp_bridge->scratch_page, intel_private.gtt+i); + writel(agp_bridge->scratch_page_dma, intel_private.gtt+i); } readl(intel_private.gtt+i-1); /* PCI Posting. */ } @@ -1038,7 +1177,7 @@ static void intel_i915_chipset_flush(struct agp_bridge_data *bridge) static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start, int type) { - int i, j, num_entries; + int num_entries; void *temp; int ret = -EINVAL; int mask_type; @@ -1062,7 +1201,7 @@ static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start, if ((pg_start + mem->page_count) > num_entries) goto out_err; - /* The i915 can't check the GTT for entries since its read only, + /* The i915 can't check the GTT for entries since it's read only; * depend on the caller to make the correct offset decisions. */ @@ -1078,14 +1217,7 @@ static int intel_i915_insert_entries(struct agp_memory *mem, off_t pg_start, if (!mem->is_flushed) global_cache_flush(); - for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { - writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), - intel_private.gtt+j); - } - - readl(intel_private.gtt+j-1); + intel_agp_insert_sg_entries(mem, pg_start, mask_type); agp_bridge->driver->tlb_flush(mem); out: @@ -1110,7 +1242,7 @@ static int intel_i915_remove_entries(struct agp_memory *mem, off_t pg_start, } for (i = pg_start; i < (mem->page_count + pg_start); i++) - writel(agp_bridge->scratch_page, intel_private.gtt+i); + writel(agp_bridge->scratch_page_dma, intel_private.gtt+i); readl(intel_private.gtt+i-1); @@ -2003,6 +2135,12 @@ static const struct agp_bridge_driver intel_915_driver = { .agp_destroy_pages = agp_generic_destroy_pages, .agp_type_to_mask_type = intel_i830_type_to_mask_type, .chipset_flush = intel_i915_chipset_flush, +#ifdef USE_PCI_DMA_API + .agp_map_page = intel_agp_map_page, + .agp_unmap_page = intel_agp_unmap_page, + .agp_map_memory = intel_agp_map_memory, + .agp_unmap_memory = intel_agp_unmap_memory, +#endif }; static const struct agp_bridge_driver intel_i965_driver = { @@ -2031,6 +2169,12 @@ static const struct agp_bridge_driver intel_i965_driver = { .agp_destroy_pages = agp_generic_destroy_pages, .agp_type_to_mask_type = intel_i830_type_to_mask_type, .chipset_flush = intel_i915_chipset_flush, +#ifdef USE_PCI_DMA_API + .agp_map_page = intel_agp_map_page, + .agp_unmap_page = intel_agp_unmap_page, + .agp_map_memory = intel_agp_map_memory, + .agp_unmap_memory = intel_agp_unmap_memory, +#endif }; static const struct agp_bridge_driver intel_7505_driver = { @@ -2085,6 +2229,12 @@ static const struct agp_bridge_driver intel_g33_driver = { .agp_destroy_pages = agp_generic_destroy_pages, .agp_type_to_mask_type = intel_i830_type_to_mask_type, .chipset_flush = intel_i915_chipset_flush, +#ifdef USE_PCI_DMA_API + .agp_map_page = intel_agp_map_page, + .agp_unmap_page = intel_agp_unmap_page, + .agp_map_memory = intel_agp_map_memory, + .agp_unmap_memory = intel_agp_unmap_memory, +#endif }; static int find_gmch(u16 device) From 56ec4c1e72865c6d99f643b6574e6e074c3e8823 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 27 Jul 2009 16:44:32 +0100 Subject: [PATCH 0055/1662] agp: tidy up handling of scratch pages w.r.t. DMA API Signed-off-by: David Woodhouse --- drivers/char/agp/backend.c | 23 +++++++++++++---------- drivers/char/agp/intel-agp.c | 8 ++------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 19ac3663acdc..3c3a487f7b9d 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -150,17 +150,20 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) } bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); - bridge->scratch_page = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(page)), 0); - - if (bridge->driver->agp_map_page && - bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)), - &bridge->scratch_page_dma)) { - dev_err(&bridge->dev->dev, - "unable to dma-map scratch page\n"); - rc = -ENOMEM; - goto err_out_nounmap; + if (bridge->driver->agp_map_page) { + if (bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)), + &bridge->scratch_page_dma)) { + dev_err(&bridge->dev->dev, + "unable to dma-map scratch page\n"); + rc = -ENOMEM; + goto err_out_nounmap; + } + } else { + bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); } + + bridge->scratch_page = bridge->driver->mask_memory(bridge, + bridge->scratch_page_dma, 0); } size_value = bridge->driver->fetch_size(); diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 20fe82b99fdb..b8f2c75b98d1 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -1138,13 +1138,9 @@ static int intel_i915_configure(void) writel(agp_bridge->gatt_bus_addr|I810_PGETBL_ENABLED, intel_private.registers+I810_PGETBL_CTL); readl(intel_private.registers+I810_PGETBL_CTL); /* PCI Posting. */ -#ifndef USE_PCI_DMA_API - agp_bridge->scratch_page_dma = agp_bridge->scratch_page; -#endif - if (agp_bridge->driver->needs_scratch_page) { for (i = intel_private.gtt_entries; i < current_size->num_entries; i++) { - writel(agp_bridge->scratch_page_dma, intel_private.gtt+i); + writel(agp_bridge->scratch_page, intel_private.gtt+i); } readl(intel_private.gtt+i-1); /* PCI Posting. */ } @@ -1242,7 +1238,7 @@ static int intel_i915_remove_entries(struct agp_memory *mem, off_t pg_start, } for (i = pg_start; i < (mem->page_count + pg_start); i++) - writel(agp_bridge->scratch_page_dma, intel_private.gtt+i); + writel(agp_bridge->scratch_page, intel_private.gtt+i); readl(intel_private.gtt+i-1); From c2980d8c2961113f24863f70d8ad016f55224c81 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 08:39:26 +0100 Subject: [PATCH 0056/1662] agp: Switch agp_{un,}map_page() to take struct page * argument Signed-off-by: David Woodhouse --- drivers/char/agp/agp.h | 6 +++--- drivers/char/agp/backend.c | 17 ++++++++--------- drivers/char/agp/intel-agp.c | 12 ++++++------ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 17e6d0d3ba36..4c6e5079d870 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -122,8 +122,8 @@ struct agp_bridge_driver { int (*agp_type_to_mask_type) (struct agp_bridge_data *, int); void (*chipset_flush)(struct agp_bridge_data *); - int (*agp_map_page)(void *addr, dma_addr_t *ret); - void (*agp_unmap_page)(void *addr, dma_addr_t dma); + int (*agp_map_page)(struct page *page, dma_addr_t *ret); + void (*agp_unmap_page)(struct page *page, dma_addr_t dma); int (*agp_map_memory)(struct agp_memory *mem); void (*agp_unmap_memory)(struct agp_memory *mem); }; @@ -139,7 +139,7 @@ struct agp_bridge_data { u32 __iomem *gatt_table; u32 *gatt_table_real; unsigned long scratch_page; - unsigned long scratch_page_real; + struct page *scratch_page_page; dma_addr_t scratch_page_dma; unsigned long gart_bus_addr; unsigned long gatt_bus_addr; diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 3c3a487f7b9d..343f102090a0 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -149,9 +149,9 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->scratch_page_real = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_page = page; if (bridge->driver->agp_map_page) { - if (bridge->driver->agp_map_page(phys_to_virt(page_to_phys(page)), + if (bridge->driver->agp_map_page(page, &bridge->scratch_page_dma)) { dev_err(&bridge->dev->dev, "unable to dma-map scratch page\n"); @@ -205,13 +205,12 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) err_out: if (bridge->driver->needs_scratch_page && bridge->driver->agp_unmap_page) { - void *va = gart_to_virt(bridge->scratch_page_real); - - bridge->driver->agp_unmap_page(va, bridge->scratch_page_dma); + bridge->driver->agp_unmap_page(bridge->scratch_page_page, + bridge->scratch_page_dma); } err_out_nounmap: if (bridge->driver->needs_scratch_page) { - void *va = gart_to_virt(bridge->scratch_page_real); + void *va = page_address(bridge->scratch_page_page); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_UNMAP); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_FREE); @@ -238,11 +237,11 @@ static void agp_backend_cleanup(struct agp_bridge_data *bridge) if (bridge->driver->agp_destroy_page && bridge->driver->needs_scratch_page) { - void *va = gart_to_virt(bridge->scratch_page_real); + void *va = page_address(bridge->scratch_page_page); if (bridge->driver->agp_unmap_page) - bridge->driver->agp_unmap_page(va, - bridge->scratch_page_dma); + bridge->driver->agp_unmap_page(bridge->scratch_page_page, + bridge->scratch_page_dma); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_UNMAP); bridge->driver->agp_destroy_page(va, AGP_PAGE_DESTROY_FREE); diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index b8f2c75b98d1..148d7e38fddf 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -181,19 +181,19 @@ static struct _intel_private { } intel_private; #ifdef USE_PCI_DMA_API -static int intel_agp_map_page(void *addr, dma_addr_t *ret) +static int intel_agp_map_page(struct page *page, dma_addr_t *ret) { - *ret = pci_map_single(intel_private.pcidev, addr, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + *ret = pci_map_page(intel_private.pcidev, page, 0, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); if (pci_dma_mapping_error(intel_private.pcidev, *ret)) return -EINVAL; return 0; } -static void intel_agp_unmap_page(void *addr, dma_addr_t dma) +static void intel_agp_unmap_page(struct page *page, dma_addr_t dma) { - pci_unmap_single(intel_private.pcidev, dma, - PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); + pci_unmap_page(intel_private.pcidev, dma, + PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); } static int intel_agp_map_memory(struct agp_memory *mem) From 91b8e3056bf9107b688eb076c9b804171364db71 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 08:49:12 +0100 Subject: [PATCH 0057/1662] intel-agp: Move repeated sglist free into separate function Signed-off-by: David Woodhouse --- drivers/char/agp/intel-agp.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index 148d7e38fddf..b9d9886ff3c3 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -196,6 +196,18 @@ static void intel_agp_unmap_page(struct page *page, dma_addr_t dma) PAGE_SIZE, PCI_DMA_BIDIRECTIONAL); } +static void intel_agp_free_sglist(struct agp_memory *mem) +{ + + if (mem->sg_vmalloc_flag) + vfree(mem->sg_list); + else + kfree(mem->sg_list); + mem->sg_vmalloc_flag = 0; + mem->sg_list = NULL; + mem->num_sg = 0; +} + static int intel_agp_map_memory(struct agp_memory *mem) { struct scatterlist *sg; @@ -224,13 +236,8 @@ static int intel_agp_map_memory(struct agp_memory *mem) mem->num_sg = pci_map_sg(intel_private.pcidev, mem->sg_list, mem->page_count, PCI_DMA_BIDIRECTIONAL); - if (!mem->num_sg) { - if (mem->sg_vmalloc_flag) - vfree(mem->sg_list); - else - kfree(mem->sg_list); - mem->sg_list = NULL; - mem->sg_vmalloc_flag = 0; + if (unlikely(!mem->num_sg)) { + intel_agp_free_sglist(mem); return -ENOMEM; } return 0; @@ -242,13 +249,7 @@ static void intel_agp_unmap_memory(struct agp_memory *mem) pci_unmap_sg(intel_private.pcidev, mem->sg_list, mem->page_count, PCI_DMA_BIDIRECTIONAL); - if (mem->sg_vmalloc_flag) - vfree(mem->sg_list); - else - kfree(mem->sg_list); - mem->sg_vmalloc_flag = 0; - mem->sg_list = NULL; - mem->num_sg = 0; + intel_agp_free_sglist(mem); } static void intel_agp_insert_sg_entries(struct agp_memory *mem, From f692775d7e0a22477143cd884e45c955448ac7d2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 09:28:45 +0100 Subject: [PATCH 0058/1662] intel-agp: fix sglist allocation to avoid vmalloc() Signed-off-by: David Woodhouse --- drivers/char/agp/intel-agp.c | 29 ++++++++++------------------- include/linux/agp_backend.h | 1 - 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index b9d9886ff3c3..d8c80d8be5e2 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -198,39 +198,30 @@ static void intel_agp_unmap_page(struct page *page, dma_addr_t dma) static void intel_agp_free_sglist(struct agp_memory *mem) { + struct sg_table st; + + st.sgl = mem->sg_list; + st.orig_nents = st.nents = mem->page_count; + + sg_free_table(&st); - if (mem->sg_vmalloc_flag) - vfree(mem->sg_list); - else - kfree(mem->sg_list); - mem->sg_vmalloc_flag = 0; mem->sg_list = NULL; mem->num_sg = 0; } static int intel_agp_map_memory(struct agp_memory *mem) { + struct sg_table st; struct scatterlist *sg; int i; DBG("try mapping %lu pages\n", (unsigned long)mem->page_count); - if ((mem->page_count * sizeof(*mem->sg_list)) < 2*PAGE_SIZE) - mem->sg_list = kcalloc(mem->page_count, sizeof(*mem->sg_list), - GFP_KERNEL); - - if (mem->sg_list == NULL) { - mem->sg_list = vmalloc(mem->page_count * sizeof(*mem->sg_list)); - mem->sg_vmalloc_flag = 1; - } - - if (!mem->sg_list) { - mem->sg_vmalloc_flag = 0; + if (sg_alloc_table(&st, mem->page_count, GFP_KERNEL)) return -ENOMEM; - } - sg_init_table(mem->sg_list, mem->page_count); - sg = mem->sg_list; + mem->sg_list = sg = st.sgl; + for (i = 0 ; i < mem->page_count; i++, sg = sg_next(sg)) sg_set_page(sg, mem->pages[i], PAGE_SIZE, 0); diff --git a/include/linux/agp_backend.h b/include/linux/agp_backend.h index 8a294d65b9b1..880130f7311f 100644 --- a/include/linux/agp_backend.h +++ b/include/linux/agp_backend.h @@ -80,7 +80,6 @@ struct agp_memory { bool is_bound; bool is_flushed; bool vmalloc_flag; - bool sg_vmalloc_flag; /* list of agp_memory mapped to the aperture */ struct list_head mapped_list; /* DMA-mapped addresses */ From 6a12235c7d2d75c7d94b9afcaaecd422ff845ce0 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 29 Jul 2009 10:25:58 +0100 Subject: [PATCH 0059/1662] agp: kill phys_to_gart() and gart_to_phys() There seems to be no reason for these -- they're a 1:1 mapping on all platforms. Signed-off-by: David Woodhouse --- arch/alpha/include/asm/agp.h | 4 ---- arch/ia64/include/asm/agp.h | 4 ---- arch/parisc/include/asm/agp.h | 4 ---- arch/powerpc/include/asm/agp.h | 4 ---- arch/sparc/include/asm/agp.h | 4 ---- arch/x86/include/asm/agp.h | 4 ---- drivers/char/agp/agp.h | 3 --- drivers/char/agp/ali-agp.c | 4 ++-- drivers/char/agp/amd-k7-agp.c | 8 ++++---- drivers/char/agp/amd64-agp.c | 6 +++--- drivers/char/agp/ati-agp.c | 6 +++--- drivers/char/agp/backend.c | 2 +- drivers/char/agp/efficeon-agp.c | 4 ++-- drivers/char/agp/generic.c | 6 +++--- drivers/char/agp/hp-agp.c | 4 ++-- drivers/char/agp/i460-agp.c | 4 ++-- drivers/char/agp/intel-agp.c | 7 +++---- drivers/char/agp/nvidia-agp.c | 2 +- drivers/char/agp/sgi-agp.c | 2 +- drivers/char/agp/sworks-agp.c | 8 ++++---- drivers/char/agp/uninorth-agp.c | 2 +- 21 files changed, 32 insertions(+), 60 deletions(-) diff --git a/arch/alpha/include/asm/agp.h b/arch/alpha/include/asm/agp.h index 26c179135293..a94d48b8677f 100644 --- a/arch/alpha/include/asm/agp.h +++ b/arch/alpha/include/asm/agp.h @@ -9,10 +9,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/ia64/include/asm/agp.h b/arch/ia64/include/asm/agp.h index c11fdd8ab4d7..01d09c401c5c 100644 --- a/arch/ia64/include/asm/agp.h +++ b/arch/ia64/include/asm/agp.h @@ -17,10 +17,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/parisc/include/asm/agp.h b/arch/parisc/include/asm/agp.h index 9651660da639..d226ffa8fc12 100644 --- a/arch/parisc/include/asm/agp.h +++ b/arch/parisc/include/asm/agp.h @@ -11,10 +11,6 @@ #define unmap_page_from_agp(page) /* nothing */ #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/powerpc/include/asm/agp.h b/arch/powerpc/include/asm/agp.h index 86455c4c31ee..416e12c2d505 100644 --- a/arch/powerpc/include/asm/agp.h +++ b/arch/powerpc/include/asm/agp.h @@ -8,10 +8,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/sparc/include/asm/agp.h b/arch/sparc/include/asm/agp.h index c2456870b05c..70f52c1661bc 100644 --- a/arch/sparc/include/asm/agp.h +++ b/arch/sparc/include/asm/agp.h @@ -7,10 +7,6 @@ #define unmap_page_from_agp(page) #define flush_agp_cache() mb() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h index 9825cd64c9b6..eec2a70d4376 100644 --- a/arch/x86/include/asm/agp.h +++ b/arch/x86/include/asm/agp.h @@ -22,10 +22,6 @@ */ #define flush_agp_cache() wbinvd() -/* Convert a physical address to an address suitable for the GART. */ -#define phys_to_gart(x) (x) -#define gart_to_phys(x) (x) - /* GATT allocation. Returns/accepts GATT kernel virtual address. */ #define alloc_gatt_pages(order) \ ((char *)__get_free_pages(GFP_KERNEL, (order))) diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index 4c6e5079d870..d6f36c004d9b 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -318,9 +318,6 @@ void agp3_generic_cleanup(void); #define AGP_GENERIC_SIZES_ENTRIES 11 extern const struct aper_size_info_16 agp3_generic_sizes[]; -#define virt_to_gart(x) (phys_to_gart(virt_to_phys(x))) -#define gart_to_virt(x) (phys_to_virt(gart_to_phys(x))) - extern int agp_off; extern int agp_try_unsupported_boot; diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c index 201ef3ffd484..d2ce68f27e4b 100644 --- a/drivers/char/agp/ali-agp.c +++ b/drivers/char/agp/ali-agp.c @@ -152,7 +152,7 @@ static struct page *m1541_alloc_page(struct agp_bridge_data *bridge) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN )); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN )); return page; } @@ -180,7 +180,7 @@ static void m1541_destroy_page(struct page *page, int flags) pci_read_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge->dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - phys_to_gart(page_to_phys(page))) | ALI_CACHE_FLUSH_EN)); + page_to_phys(page)) | ALI_CACHE_FLUSH_EN)); } agp_generic_destroy_page(page, flags); } diff --git a/drivers/char/agp/amd-k7-agp.c b/drivers/char/agp/amd-k7-agp.c index 542a87895ae9..73dbf40c874d 100644 --- a/drivers/char/agp/amd-k7-agp.c +++ b/drivers/char/agp/amd-k7-agp.c @@ -44,7 +44,7 @@ static int amd_create_page_map(struct amd_page_map *page_map) #ifndef CONFIG_X86 SetPageReserved(virt_to_page(page_map->real)); global_cache_flush(); - page_map->remapped = ioremap_nocache(virt_to_gart(page_map->real), + page_map->remapped = ioremap_nocache(virt_to_phys(page_map->real), PAGE_SIZE); if (page_map->remapped == NULL) { ClearPageReserved(virt_to_page(page_map->real)); @@ -160,7 +160,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -173,7 +173,7 @@ static int amd_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(amd_irongate_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(amd_irongate_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } @@ -325,7 +325,7 @@ static int amd_insert_memory(struct agp_memory *mem, off_t pg_start, int type) addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_generic_mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); readl(cur_gatt+GET_GATT_OFF(addr)); /* PCI Posting. */ diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index e85a5b3e952e..2fb2e6cc322a 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -79,7 +79,7 @@ static int amd64_insert_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { tmp = agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type); BUG_ON(tmp & 0xffffff0000000ffcULL); @@ -178,7 +178,7 @@ static const struct aper_size_info_32 amd_8151_sizes[7] = static int amd_8151_configure(void) { - unsigned long gatt_bus = virt_to_gart(agp_bridge->gatt_table_real); + unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); int i; /* Configure AGP regs in each x86-64 host bridge. */ @@ -558,7 +558,7 @@ static void __devexit agp_amd64_remove(struct pci_dev *pdev) { struct agp_bridge_data *bridge = pci_get_drvdata(pdev); - release_mem_region(virt_to_gart(bridge->gatt_table_real), + release_mem_region(virt_to_phys(bridge->gatt_table_real), amd64_aperture_sizes[bridge->aperture_size_idx].size); agp_remove_bridge(bridge); agp_put_bridge(bridge); diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c index 59ebd60c1b60..3b2ecbe86ebe 100644 --- a/drivers/char/agp/ati-agp.c +++ b/drivers/char/agp/ati-agp.c @@ -302,7 +302,7 @@ static int ati_insert_memory(struct agp_memory * mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } @@ -360,7 +360,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *) page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Write out the size register */ current_size = A_SIZE_LVL2(agp_bridge->current_size); @@ -390,7 +390,7 @@ static int ati_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++, addr += 0x00400000) { - writel(virt_to_gart(ati_generic_private.gatt_pages[i]->real) | 1, + writel(virt_to_phys(ati_generic_private.gatt_pages[i]->real) | 1, page_dir.remapped+GET_PAGE_DIR_OFF(addr)); readl(page_dir.remapped+GET_PAGE_DIR_OFF(addr)); /* PCI Posting. */ } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index 343f102090a0..ad87753f6de4 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -159,7 +159,7 @@ static int agp_backend_initialize(struct agp_bridge_data *bridge) goto err_out_nounmap; } } else { - bridge->scratch_page_dma = phys_to_gart(page_to_phys(page)); + bridge->scratch_page_dma = page_to_phys(page); } bridge->scratch_page = bridge->driver->mask_memory(bridge, diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 35d50f2861b6..793f39ea9618 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c @@ -67,7 +67,7 @@ static const struct gatt_mask efficeon_generic_masks[] = /* This function does the same thing as mask_memory() for this chipset... */ static inline unsigned long efficeon_mask_memory(struct page *page) { - unsigned long addr = phys_to_gart(page_to_phys(page)); + unsigned long addr = page_to_phys(page); return addr | 0x00000001; } @@ -226,7 +226,7 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) efficeon_private.l1_table[index] = page; - value = virt_to_gart((unsigned long *)page) | pati | present | index; + value = virt_to_phys((unsigned long *)page) | pati | present | index; pci_write_config_dword(agp_bridge->dev, EFFICEON_ATTPAGE, value); diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 28f0208c66a6..c50543966eb2 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -988,7 +988,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) set_memory_uc((unsigned long)table, 1 << page_order); bridge->gatt_table = (void *)table; #else - bridge->gatt_table = ioremap_nocache(virt_to_gart(table), + bridge->gatt_table = ioremap_nocache(virt_to_phys(table), (PAGE_SIZE * (1 << page_order))); bridge->driver->cache_flush(); #endif @@ -1001,7 +1001,7 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge) return -ENOMEM; } - bridge->gatt_bus_addr = virt_to_gart(bridge->gatt_table_real); + bridge->gatt_bus_addr = virt_to_phys(bridge->gatt_table_real); /* AK: bogus, should encode addresses > 4GB */ for (i = 0; i < num_entries; i++) { @@ -1142,7 +1142,7 @@ int agp_generic_insert_memory(struct agp_memory * mem, off_t pg_start, int type) for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mask_type), bridge->gatt_table+j); } diff --git a/drivers/char/agp/hp-agp.c b/drivers/char/agp/hp-agp.c index 64dbf4b1cf2f..501e293e5ad0 100644 --- a/drivers/char/agp/hp-agp.c +++ b/drivers/char/agp/hp-agp.c @@ -107,7 +107,7 @@ static int __init hp_zx1_ioc_shared(void) hp->gart_size = HP_ZX1_GART_SIZE; hp->gatt_entries = hp->gart_size / hp->io_page_size; - hp->io_pdir = gart_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); + hp->io_pdir = phys_to_virt(readq(hp->ioc_regs+HP_ZX1_PDIR_BASE)); hp->gatt = &hp->io_pdir[HP_ZX1_IOVA_TO_PDIR(hp->gart_base)]; if (hp->gatt[0] != HP_ZX1_SBA_IOMMU_COOKIE) { @@ -246,7 +246,7 @@ hp_zx1_configure (void) agp_bridge->mode = readl(hp->lba_regs+hp->lba_cap_offset+PCI_AGP_STATUS); if (hp->io_pdir_owner) { - writel(virt_to_gart(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); + writel(virt_to_phys(hp->io_pdir), hp->ioc_regs+HP_ZX1_PDIR_BASE); readl(hp->ioc_regs+HP_ZX1_PDIR_BASE); writel(hp->io_tlb_ps, hp->ioc_regs+HP_ZX1_TCNFG); readl(hp->ioc_regs+HP_ZX1_TCNFG); diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c index 54191f860539..e763d3312ce7 100644 --- a/drivers/char/agp/i460-agp.c +++ b/drivers/char/agp/i460-agp.c @@ -325,7 +325,7 @@ static int i460_insert_memory_small_io_page (struct agp_memory *mem, io_page_size = 1UL << I460_IO_PAGE_SHIFT; for (i = 0, j = io_pg_start; i < mem->page_count; i++) { - paddr = phys_to_gart(page_to_phys(mem->pages[i])); + paddr = page_to_phys(mem->pages[i]); for (k = 0; k < I460_IOPAGES_PER_KPAGE; k++, j++, paddr += io_page_size) WR_GATT(j, i460_mask_memory(agp_bridge, paddr, mem->type)); } @@ -382,7 +382,7 @@ static int i460_alloc_large_page (struct lp_desc *lp) return -ENOMEM; } - lp->paddr = phys_to_gart(page_to_phys(lp->page)); + lp->paddr = page_to_phys(lp->page); lp->refcount = 0; atomic_add(I460_KPAGES_PER_IOPAGE, &agp_bridge->current_memory_agp); return 0; diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index d8c80d8be5e2..aa8889e8afc8 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -288,7 +288,7 @@ static void intel_agp_insert_sg_entries(struct agp_memory *mem, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.gtt+j); } @@ -470,8 +470,7 @@ static int intel_i810_insert_entries(struct agp_memory *mem, off_t pg_start, global_cache_flush(); for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), - mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); @@ -977,7 +976,7 @@ static int intel_i830_insert_entries(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), intel_private.registers+I810_PTE_BASE+(j*4)); } readl(intel_private.registers+I810_PTE_BASE+((j-1)*4)); diff --git a/drivers/char/agp/nvidia-agp.c b/drivers/char/agp/nvidia-agp.c index cedacee30ec3..7e36d2b4f9d4 100644 --- a/drivers/char/agp/nvidia-agp.c +++ b/drivers/char/agp/nvidia-agp.c @@ -225,7 +225,7 @@ static int nvidia_insert_memory(struct agp_memory *mem, off_t pg_start, int type } for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mask_type), + page_to_phys(mem->pages[i]), mask_type), agp_bridge->gatt_table+nvidia_private.pg_offset+j); } diff --git a/drivers/char/agp/sgi-agp.c b/drivers/char/agp/sgi-agp.c index 0d47fa847404..0d426ae39c85 100644 --- a/drivers/char/agp/sgi-agp.c +++ b/drivers/char/agp/sgi-agp.c @@ -190,7 +190,7 @@ static int sgi_tioca_insert_memory(struct agp_memory *mem, off_t pg_start, for (i = 0, j = pg_start; i < mem->page_count; i++, j++) { table[j] = bridge->driver->mask_memory(bridge, - phys_to_gart(page_to_phys(mem->pages[i])), + page_to_phys(mem->pages[i]), mem->type); } diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c index 07259952fc32..13acaaf64edb 100644 --- a/drivers/char/agp/sworks-agp.c +++ b/drivers/char/agp/sworks-agp.c @@ -155,7 +155,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Create a fake scratch directory */ for (i = 0; i < 1024; i++) { writel(agp_bridge->scratch_page, serverworks_private.scratch_dir.remapped+i); - writel(virt_to_gart(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.scratch_dir.real) | 1, page_dir.remapped+i); } retval = serverworks_create_gatt_pages(value->num_entries / 1024); @@ -167,7 +167,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) agp_bridge->gatt_table_real = (u32 *)page_dir.real; agp_bridge->gatt_table = (u32 __iomem *)page_dir.remapped; - agp_bridge->gatt_bus_addr = virt_to_gart(page_dir.real); + agp_bridge->gatt_bus_addr = virt_to_phys(page_dir.real); /* Get the address for the gart region. * This is a bus address even on the alpha, b/c its @@ -179,7 +179,7 @@ static int serverworks_create_gatt_table(struct agp_bridge_data *bridge) /* Calculate the agp offset */ for (i = 0; i < value->num_entries / 1024; i++) - writel(virt_to_gart(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); + writel(virt_to_phys(serverworks_private.gatt_pages[i]->real)|1, page_dir.remapped+i); return 0; } @@ -350,7 +350,7 @@ static int serverworks_insert_memory(struct agp_memory *mem, addr = (j * PAGE_SIZE) + agp_bridge->gart_bus_addr; cur_gatt = SVRWRKS_GET_GATT(addr); writel(agp_bridge->driver->mask_memory(agp_bridge, - phys_to_gart(page_to_phys(mem->pages[i])), mem->type), + page_to_phys(mem->pages[i]), mem->type), cur_gatt+GET_GATT_OFF(addr)); } serverworks_tlbflush(mem); diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad41..2e993112ab88 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -431,7 +431,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) bridge->gatt_table_real = (u32 *) table; bridge->gatt_table = (u32 *)table; - bridge->gatt_bus_addr = virt_to_gart(table); + bridge->gatt_bus_addr = virt_to_phys(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; From ba3139f2577eee24479db73b8dfc7d78eaf4c486 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 5 Aug 2009 08:12:40 +0100 Subject: [PATCH 0060/1662] intel-agp: Set dma mask for i915 If DMAR is configured in but absent, we really do want to make sure that the dma mask is set appropriately. Otherwise we get mapping failures on highmem. Spotted by Zhenyu Wang. Signed-off-by: David Woodhouse --- drivers/char/agp/intel-agp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c index aa8889e8afc8..9bc3a0b82b96 100644 --- a/drivers/char/agp/intel-agp.c +++ b/drivers/char/agp/intel-agp.c @@ -1140,6 +1140,12 @@ static int intel_i915_configure(void) intel_i9xx_setup_flush(); +#ifdef USE_PCI_DMA_API + if (pci_set_dma_mask(intel_private.pcidev, DMA_BIT_MASK(36))) + dev_err(&intel_private.pcidev->dev, + "set gfx device dma mask 36bit failed!\n"); +#endif + return 0; } From 5e8d6b8bf94f1ffcb7e3c31b73284c20f297f191 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 6 Aug 2009 20:20:43 +1000 Subject: [PATCH 0061/1662] agp: fix uninorth build Signed-off-by: Dave Airlie --- drivers/char/agp/uninorth-agp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index 2e993112ab88..4317a5588daf 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -135,7 +135,7 @@ static int uninorth_configure(void) if (is_u3) { pci_write_config_dword(agp_bridge->dev, UNI_N_CFG_GART_DUMMY_PAGE, - agp_bridge->scratch_page_real >> 12); + page_to_phys(agp_bridge->scratch_page_page) >> 12); } return 0; From a907905219dc83f501274d5d8c6d2aa2161ff8c3 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Tue, 23 Jun 2009 04:12:24 -0700 Subject: [PATCH 0062/1662] IDE: Save a call to PageHighMem() PageHighMem() isn't cheap so avoid calling it twice on the same page. Signed-off-by: Jean Delvare Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-taskfile.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 75b85a8cd2d4..90e5c0deb0e0 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -236,6 +236,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, while (len) { unsigned nr_bytes = min(len, cursg->length - cmd->cursg_ofs); + int page_is_high; if (nr_bytes > PAGE_SIZE) nr_bytes = PAGE_SIZE; @@ -247,7 +248,8 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, page = nth_page(page, (offset >> PAGE_SHIFT)); offset %= PAGE_SIZE; - if (PageHighMem(page)) + page_is_high = PageHighMem(page); + if (page_is_high) local_irq_save(flags); buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -268,7 +270,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, kunmap_atomic(buf, KM_BIO_SRC_IRQ); - if (PageHighMem(page)) + if (page_is_high) local_irq_restore(flags); len -= nr_bytes; From 7fa350b4754cd69c8352ef3f5d23082fbdcab0bd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 23 Jun 2009 04:16:04 -0700 Subject: [PATCH 0063/1662] ide: Fix annoying warning in ide_pio_bytes(). GCC can't see that flags is only set and used when PageHighmem() is true. Inspired by a patch from Jean Delvare. Signed-off-by: David S. Miller --- drivers/ide/ide-taskfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 90e5c0deb0e0..3a5224cbd0f2 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -225,8 +225,8 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, ide_hwif_t *hwif = drive->hwif; struct scatterlist *sg = hwif->sg_table; struct scatterlist *cursg = cmd->cursg; + unsigned long uninitialized_var(flags); struct page *page; - unsigned long flags; unsigned int offset; u8 *buf; From 37bbe084d1152cb580d2cc88b4eda2004506a141 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 17 Jul 2009 23:55:15 +0000 Subject: [PATCH 0064/1662] ide-tape: fix debug call This error only occurs when IDETAPE_DEBUG_LOG is enabled. Signed-off-by: Mark de Wever Signed-off-by: David S. Miller --- drivers/ide/ide-tape.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index bc5fb12b913c..1d74f159bcc4 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -583,7 +583,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n" + debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq)); BUG_ON(!(blk_special_request(rq) || blk_sense_request(rq))); From e972d7027c0fb7055f5f2fe02d662c9528063bef Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 17 Jul 2009 23:55:16 +0000 Subject: [PATCH 0065/1662] ide-tape: convert to ide_debug_log macro Remove tape->debug_mask and use drive->debug_mask instead. There should be no functional change resulting from this patch. Signed-off-by: Borislav Petkov Signed-off-by: David S. Miller --- drivers/ide/ide-tape.c | 102 ++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 1d74f159bcc4..3468ece98496 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -47,28 +47,13 @@ #include #include -enum { - /* output errors only */ - DBG_ERR = (1 << 0), - /* output all sense key/asc */ - DBG_SENSE = (1 << 1), - /* info regarding all chrdev-related procedures */ - DBG_CHRDEV = (1 << 2), - /* all remaining procedures */ - DBG_PROCS = (1 << 3), -}; - /* define to see debug info */ -#define IDETAPE_DEBUG_LOG 0 +#undef IDETAPE_DEBUG_LOG -#if IDETAPE_DEBUG_LOG -#define debug_log(lvl, fmt, args...) \ -{ \ - if (tape->debug_mask & lvl) \ - printk(KERN_INFO "ide-tape: " fmt, ## args); \ -} +#ifdef IDETAPE_DEBUG_LOG +#define ide_debug_log(lvl, fmt, args...) __ide_debug_log(lvl, fmt, ## args) #else -#define debug_log(lvl, fmt, args...) do {} while (0) +#define ide_debug_log(lvl, fmt, args...) do {} while (0) #endif /**************************** Tunable parameters *****************************/ @@ -230,8 +215,6 @@ typedef struct ide_tape_obj { char drv_write_prot; /* the tape is write protected (hardware or opened as read-only) */ char write_prot; - - u32 debug_mask; } idetape_tape_t; static DEFINE_MUTEX(idetape_ref_mutex); @@ -290,8 +273,9 @@ static void idetape_analyze_error(ide_drive_t *drive) tape->asc = sense[12]; tape->ascq = sense[13]; - debug_log(DBG_ERR, "pc = %x, sense key = %x, asc = %x, ascq = %x\n", - pc->c[0], tape->sense_key, tape->asc, tape->ascq); + ide_debug_log(IDE_DBG_FUNC, + "cmd: 0x%x, sense key = %x, asc = %x, ascq = %x", + rq->cmd[0], tape->sense_key, tape->asc, tape->ascq); /* correct remaining bytes to transfer */ if (pc->flags & PC_FLAG_DMA_ERROR) @@ -344,7 +328,8 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) int uptodate = pc->error ? 0 : 1; int err = uptodate ? 0 : IDE_DRV_ERROR_GENERAL; - debug_log(DBG_PROCS, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, dsc: %d, err: %d", rq->cmd[0], + dsc, err); if (dsc) ide_tape_handle_dsc(drive); @@ -390,10 +375,12 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) static void idetape_postpone_request(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; + struct request *rq = drive->hwif->rq; - debug_log(DBG_PROCS, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, dsc_poll_freq: %lu", + rq->cmd[0], tape->dsc_poll_freq); - tape->postponed_rq = drive->hwif->rq; + tape->postponed_rq = rq; ide_stall_queue(drive, tape->dsc_poll_freq); } @@ -488,7 +475,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, ide_complete_rq(drive, -EIO, blk_rq_bytes(rq)); return ide_stopped; } - debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); + ide_debug_log(IDE_DBG_SENSE, "retry #%d, cmd: 0x%02x", pc->retries, + pc->c[0]); pc->retries++; @@ -583,8 +571,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n", - (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq)); + ide_debug_log(IDE_DBG_RQ, "cmd: 0x%x, sector: %llu, nr_sectors: %u", + rq->cmd[0], (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq)); BUG_ON(!(blk_special_request(rq) || blk_sense_request(rq))); @@ -745,7 +734,7 @@ static int ide_tape_read_position(ide_drive_t *drive) struct ide_atapi_pc pc; u8 buf[20]; - debug_log(DBG_PROCS, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); /* prep cmd */ ide_init_pc(&pc); @@ -756,9 +745,9 @@ static int ide_tape_read_position(ide_drive_t *drive) return -1; if (!pc.error) { - debug_log(DBG_SENSE, "BOP - %s\n", + ide_debug_log(IDE_DBG_FUNC, "BOP - %s", (buf[0] & 0x80) ? "Yes" : "No"); - debug_log(DBG_SENSE, "EOP - %s\n", + ide_debug_log(IDE_DBG_FUNC, "EOP - %s", (buf[0] & 0x40) ? "Yes" : "No"); if (buf[0] & 0x4) { @@ -768,8 +757,8 @@ static int ide_tape_read_position(ide_drive_t *drive) &drive->atapi_flags); return -1; } else { - debug_log(DBG_SENSE, "Block Location - %u\n", - be32_to_cpup((__be32 *)&buf[4])); + ide_debug_log(IDE_DBG_FUNC, "Block Location: %u", + be32_to_cpup((__be32 *)&buf[4])); tape->partition = buf[1]; tape->first_frame = be32_to_cpup((__be32 *)&buf[4]); @@ -866,7 +855,8 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) struct request *rq; int ret; - debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, size: %d", cmd, size); + BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); BUG_ON(size < 0 || size % tape->blk_size); @@ -1029,7 +1019,7 @@ static int idetape_rewind_tape(ide_drive_t *drive) struct ide_atapi_pc pc; int ret; - debug_log(DBG_SENSE, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); idetape_create_rewind_cmd(drive, &pc); ret = ide_queue_pc_tail(drive, disk, &pc, NULL, 0); @@ -1055,7 +1045,7 @@ static int idetape_blkdev_ioctl(ide_drive_t *drive, unsigned int cmd, int nr_stages; } config; - debug_log(DBG_PROCS, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%04x", cmd); switch (cmd) { case 0x0340: @@ -1085,6 +1075,9 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, int retval, count = 0; int sprev = !!(tape->caps[4] & 0x20); + + ide_debug_log(IDE_DBG_FUNC, "mt_op: %d, mt_count: %d", mt_op, mt_count); + if (mt_count == 0) return 0; if (MTBSF == mt_op || MTBSFM == mt_op) { @@ -1148,7 +1141,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, ssize_t ret = 0; int rc; - debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); + ide_debug_log(IDE_DBG_FUNC, "count %Zd", count); if (tape->chrdev_dir != IDETAPE_DIR_READ) { if (test_bit(ilog2(IDE_AFLAG_DETECT_BS), &drive->atapi_flags)) @@ -1187,8 +1180,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, } if (!done && test_bit(ilog2(IDE_AFLAG_FILEMARK), &drive->atapi_flags)) { - debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name); - idetape_space_over_filemarks(drive, MTFSF, 1); return 0; } @@ -1209,7 +1200,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->write_prot) return -EACCES; - debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); + ide_debug_log(IDE_DBG_FUNC, "count %Zd", count); /* Initialize write operation */ rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); @@ -1273,8 +1264,8 @@ static int idetape_mtioctop(ide_drive_t *drive, short mt_op, int mt_count) struct ide_atapi_pc pc; int i, retval; - debug_log(DBG_ERR, "Handling MTIOCTOP ioctl: mt_op=%d, mt_count=%d\n", - mt_op, mt_count); + ide_debug_log(IDE_DBG_FUNC, "MTIOCTOP ioctl: mt_op: %d, mt_count: %d", + mt_op, mt_count); switch (mt_op) { case MTFSF: @@ -1393,7 +1384,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file, int block_offset = 0, position = tape->first_frame; void __user *argp = (void __user *)arg; - debug_log(DBG_CHRDEV, "Enter %s, cmd=%u\n", __func__, cmd); + ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x", cmd); if (tape->chrdev_dir == IDETAPE_DIR_WRITE) { ide_tape_flush_merge_buffer(drive); @@ -1461,6 +1452,9 @@ static void ide_tape_get_bsize_from_bdesc(ide_drive_t *drive) (buf[4 + 6] << 8) + buf[4 + 7]; tape->drv_write_prot = (buf[2] & 0x80) >> 7; + + ide_debug_log(IDE_DBG_FUNC, "blk_size: %d, write_prot: %d", + tape->blk_size, tape->drv_write_prot); } static int idetape_chrdev_open(struct inode *inode, struct file *filp) @@ -1480,7 +1474,10 @@ static int idetape_chrdev_open(struct inode *inode, struct file *filp) return -ENXIO; } - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); + drive = tape->drive; + filp->private_data = tape; + + ide_debug_log(IDE_DBG_FUNC, "enter"); /* * We really want to do nonseekable_open(inode, filp); here, but some @@ -1489,9 +1486,6 @@ static int idetape_chrdev_open(struct inode *inode, struct file *filp) */ filp->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); - drive = tape->drive; - - filp->private_data = tape; if (test_and_set_bit(ilog2(IDE_AFLAG_BUSY), &drive->atapi_flags)) { retval = -EBUSY; @@ -1570,7 +1564,7 @@ static int idetape_chrdev_release(struct inode *inode, struct file *filp) lock_kernel(); tape = drive->driver_data; - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); + ide_debug_log(IDE_DBG_FUNC, "enter"); if (tape->chrdev_dir == IDETAPE_DIR_WRITE) idetape_write_release(drive, minor); @@ -1707,7 +1701,6 @@ static int divf_buffer_size(ide_drive_t *drive) { return 1024; } ide_devset_rw_flag(dsc_overlap, IDE_DFLAG_DSC_OVERLAP); -ide_tape_devset_rw_field(debug_mask, debug_mask); ide_tape_devset_rw_field(tdsc, best_dsc_rw_freq); ide_tape_devset_r_field(avg_speed, avg_speed); @@ -1719,7 +1712,6 @@ static const struct ide_proc_devset idetape_settings[] = { __IDE_PROC_DEVSET(avg_speed, 0, 0xffff, NULL, NULL), __IDE_PROC_DEVSET(buffer, 0, 0xffff, NULL, divf_buffer), __IDE_PROC_DEVSET(buffer_size, 0, 0xffff, NULL, divf_buffer_size), - __IDE_PROC_DEVSET(debug_mask, 0, 0xffff, NULL, NULL), __IDE_PROC_DEVSET(dsc_overlap, 0, 1, NULL, NULL), __IDE_PROC_DEVSET(speed, 0, 0xffff, NULL, NULL), __IDE_PROC_DEVSET(tdsc, IDETAPE_DSC_RW_MIN, IDETAPE_DSC_RW_MAX, @@ -1746,7 +1738,9 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) int buffer_size; u16 *ctl = (u16 *)&tape->caps[12]; - drive->pc_callback = ide_tape_callback; + ide_debug_log(IDE_DBG_FUNC, "minor: %d", minor); + + drive->pc_callback = ide_tape_callback; drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP; @@ -1932,7 +1926,9 @@ static int ide_tape_probe(ide_drive_t *drive) struct gendisk *g; int minor; - if (!strstr("ide-tape", drive->driver_req)) + ide_debug_log(IDE_DBG_FUNC, "enter"); + + if (!strstr(DRV_NAME, drive->driver_req)) goto failed; if (drive->media != ide_tape) From 6f3848ac2399faac0be3f26648bf1d7a644a8242 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 21 Jul 2009 23:08:23 -0700 Subject: [PATCH 0066/1662] ide-tape: fix handling of postponed rqs ide-tape used to hit [ 58.614854] ide-tape: ht0: BUG: Two DSC requests queued! due to the fact that another rq was being issued while the driver was waiting for DSC to get set for the device executing ATAPI commands which set the DSC to 1 to indicate completion. Here's a sample output of that case: issue REZERO_UNIT [ 143.088505] ide-tape: ide_tape_issue_pc: retry #0, cmd: 0x01 [ 143.095122] ide: Enter ide_pc_intr - interrupt handler [ 143.096118] ide: Packet command completed, 0 bytes transferred [ 143.106319] ide-tape: ide_tape_callback: cmd: 0x1, dsc: 1, err: 0 [ 143.112601] ide-tape: idetape_postpone_request: cmd: 0x1, dsc_poll_freq: 2000 we stall the ide-tape queue here waiting for DSC [ 143.119936] ide-tape: ide_tape_read_position: enter [ 145.119019] ide-tape: idetape_do_request: sector: 4294967295, nr_sectors: 0 and issue the new READ_POSITION rq and hit the check. [ 145.126247] ide-tape: ht0: BUG: Two DSC requests queued! [ 145.131748] ide-tape: ide_tape_read_position: BOP - No [ 145.137059] ide-tape: ide_tape_read_position: EOP - No Also, ->postponed_rq used to point to that postponed request. To make things worse, in certain circumstances the rq it was pointing to got replaced unterneath it by swiftly reusing the same rq from the mempool of the block layer practically confusing stuff even more. However, we don't need to keep a pointer to that rq but simply wait for DSC to be set first before issuing the follow-up request in the drive's queue. In order to do that, we make idetape_do_request() first check the DSC and if not set, we stall the drive queue giving the other device on that IDE channel a chance. Signed-off-by: Borislav Petkov Signed-off-by: David S. Miller --- drivers/ide/ide-tape.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 3468ece98496..7b2032bc357b 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -155,7 +155,8 @@ typedef struct ide_tape_obj { * other device. Note that at most we will have only one DSC (usually * data transfer) request in the device request queue. */ - struct request *postponed_rq; + bool postponed_rq; + /* The time in which we started polling for DSC */ unsigned long dsc_polling_start; /* Timer used to poll for dsc */ @@ -372,15 +373,14 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) * Postpone the current request so that ide.c will be able to service requests * from another device on the same port while we are polling for DSC. */ -static void idetape_postpone_request(ide_drive_t *drive) +static void ide_tape_stall_queue(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; - struct request *rq = drive->hwif->rq; ide_debug_log(IDE_DBG_FUNC, "cmd: 0x%x, dsc_poll_freq: %lu", - rq->cmd[0], tape->dsc_poll_freq); + drive->hwif->rq->cmd[0], tape->dsc_poll_freq); - tape->postponed_rq = rq; + tape->postponed_rq = true; ide_stall_queue(drive, tape->dsc_poll_freq); } @@ -394,7 +394,7 @@ static void ide_tape_handle_dsc(ide_drive_t *drive) tape->dsc_poll_freq = IDETAPE_DSC_MA_FAST; tape->dsc_timeout = jiffies + IDETAPE_DSC_MA_TIMEOUT; /* Allow ide.c to handle other requests */ - idetape_postpone_request(drive); + ide_tape_stall_queue(drive); } /* @@ -567,7 +567,6 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, ide_hwif_t *hwif = drive->hwif; idetape_tape_t *tape = drive->driver_data; struct ide_atapi_pc *pc = NULL; - struct request *postponed_rq = tape->postponed_rq; struct ide_cmd cmd; u8 stat; @@ -583,18 +582,6 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, goto out; } - if (postponed_rq != NULL) - if (rq != postponed_rq) { - printk(KERN_ERR "ide-tape: ide-tape.c bug - " - "Two DSC requests were queued\n"); - drive->failed_pc = NULL; - rq->errors = 0; - ide_complete_rq(drive, 0, blk_rq_bytes(rq)); - return ide_stopped; - } - - tape->postponed_rq = NULL; - /* * If the tape is still busy, postpone our request and service * the other device meanwhile. @@ -612,7 +599,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, if (!(drive->atapi_flags & IDE_AFLAG_IGNORE_DSC) && !(stat & ATA_DSC)) { - if (postponed_rq == NULL) { + if (!tape->postponed_rq) { tape->dsc_polling_start = jiffies; tape->dsc_poll_freq = tape->best_dsc_rw_freq; tape->dsc_timeout = jiffies + IDETAPE_DSC_RW_TIMEOUT; @@ -629,10 +616,12 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, tape->dsc_polling_start + IDETAPE_DSC_MA_THRESHOLD)) tape->dsc_poll_freq = IDETAPE_DSC_MA_SLOW; - idetape_postpone_request(drive); + ide_tape_stall_queue(drive); return ide_stopped; - } else + } else { drive->atapi_flags &= ~IDE_AFLAG_IGNORE_DSC; + tape->postponed_rq = false; + } if (rq->cmd[13] & REQ_IDETAPE_READ) { pc = &tape->queued_pc; From 72db37b2c9c5b71e49068f5fac6433a6c36498a5 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 2 Aug 2009 13:19:05 -0700 Subject: [PATCH 0067/1662] drivers/ide/ide-cd.c: Use DIV_ROUND_CLOSEST The kernel.h macro DIV_ROUND_CLOSEST performs the computation (x + d/2)/d but is perhaps more readable. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @haskernel@ @@ #include @depends on haskernel@ expression x,__divisor; @@ - (((x) + ((__divisor) / 2)) / (__divisor)) + DIV_ROUND_CLOSEST(x,__divisor) // Signed-off-by: Julia Lawall Acked-by: Borislav Petkov Signed-off-by: David S. Miller --- drivers/ide/ide-cd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 6a9a769bffc1..ad0ab0c0a493 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1146,8 +1146,8 @@ void ide_cdrom_update_speed(ide_drive_t *drive, u8 *buf) ide_debug_log(IDE_DBG_PROBE, "curspeed: %u, maxspeed: %u", curspeed, maxspeed); - cd->current_speed = (curspeed + (176/2)) / 176; - cd->max_speed = (maxspeed + (176/2)) / 176; + cd->current_speed = DIV_ROUND_CLOSEST(curspeed, 176); + cd->max_speed = DIV_ROUND_CLOSEST(maxspeed, 176); } #define IDE_CD_CAPABILITIES \ From 2d5abcedeb41f4af9582c60cef70749c3ab90a3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 2 Aug 2009 20:17:34 -0700 Subject: [PATCH 0068/1662] ide: ide-taskfile.c fix style problems Fix trivial style problems: WARNING: Use #include instead of WARNING: space prohibited between function name and open parenthesis '(' WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable ERROR: do not use C99 // comments X 2 ERROR: trailing statements should be on next line ERROR: trailing whitespace ERROR: switch and case should be at the same indent WARNING: line over 80 characters total: 5 errors, 4 warnings Also removed dead code Also used pr_err() to avoid line breaks Signed-off-by: Jaswinder Singh Rajput Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-taskfile.c | 109 +++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 3a5224cbd0f2..50336d51eebc 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -19,8 +19,8 @@ #include #include #include +#include -#include #include void ide_tf_readback(ide_drive_t *drive, struct ide_cmd *cmd) @@ -53,7 +53,7 @@ void ide_tf_dump(const char *s, struct ide_cmd *cmd) #endif } -int taskfile_lib_get_identify (ide_drive_t *drive, u8 *buf) +int taskfile_lib_get_identify(ide_drive_t *drive, u8 *buf) { struct ide_cmd cmd; @@ -86,7 +86,7 @@ ide_startstop_t do_rw_taskfile(ide_drive_t *drive, struct ide_cmd *orig_cmd) if (orig_cmd->protocol == ATA_PROT_PIO && (orig_cmd->tf_flags & IDE_TFLAG_MULTI_PIO) && drive->mult_count == 0) { - printk(KERN_ERR "%s: multimode not set!\n", drive->name); + pr_err("%s: multimode not set!\n", drive->name); return ide_stopped; } @@ -214,7 +214,7 @@ static u8 wait_drive_not_busy(ide_drive_t *drive) } if (stat & ATA_BUSY) - printk(KERN_ERR "%s: drive still BUSY!\n", drive->name); + pr_err("%s: drive still BUSY!\n", drive->name); return stat; } @@ -400,8 +400,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, if (ide_wait_stat(&startstop, drive, ATA_DRQ, drive->bad_wstat, WAIT_DRQ)) { - printk(KERN_ERR "%s: no DRQ after issuing %sWRITE%s\n", - drive->name, + pr_err("%s: no DRQ after issuing %sWRITE%s\n", drive->name, (cmd->tf_flags & IDE_TFLAG_MULTI_PIO) ? "MULT" : "", (drive->dev_flags & IDE_DFLAG_LBA48) ? "_EXT" : ""); return startstop; @@ -451,7 +450,6 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, blk_put_request(rq); return error; } - EXPORT_SYMBOL(ide_raw_taskfile); int ide_no_data_taskfile(ide_drive_t *drive, struct ide_cmd *cmd) @@ -477,10 +475,9 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) u16 nsect = 0; char __user *buf = (char __user *)arg; -// printk("IDE Taskfile ...\n"); - req_task = kzalloc(tasksize, GFP_KERNEL); - if (req_task == NULL) return -ENOMEM; + if (req_task == NULL) + return -ENOMEM; if (copy_from_user(req_task, buf, tasksize)) { kfree(req_task); return -EFAULT; @@ -488,7 +485,7 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) taskout = req_task->out_size; taskin = req_task->in_size; - + if (taskin > 65536 || taskout > 65536) { err = -EINVAL; goto abort; @@ -578,51 +575,49 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) cmd.protocol = ATA_PROT_DMA; switch (req_task->data_phase) { - case TASKFILE_MULTI_OUT: - if (!drive->mult_count) { - /* (hs): give up if multcount is not set */ - printk(KERN_ERR "%s: %s Multimode Write " \ - "multcount is not set\n", - drive->name, __func__); - err = -EPERM; - goto abort; - } - cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; - /* fall through */ - case TASKFILE_OUT: - cmd.protocol = ATA_PROT_PIO; - /* fall through */ - case TASKFILE_OUT_DMAQ: - case TASKFILE_OUT_DMA: - cmd.tf_flags |= IDE_TFLAG_WRITE; - nsect = taskout / SECTOR_SIZE; - data_buf = outbuf; - break; - case TASKFILE_MULTI_IN: - if (!drive->mult_count) { - /* (hs): give up if multcount is not set */ - printk(KERN_ERR "%s: %s Multimode Read failure " \ - "multcount is not set\n", - drive->name, __func__); - err = -EPERM; - goto abort; - } - cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; - /* fall through */ - case TASKFILE_IN: - cmd.protocol = ATA_PROT_PIO; - /* fall through */ - case TASKFILE_IN_DMAQ: - case TASKFILE_IN_DMA: - nsect = taskin / SECTOR_SIZE; - data_buf = inbuf; - break; - case TASKFILE_NO_DATA: - cmd.protocol = ATA_PROT_NODATA; - break; - default: - err = -EFAULT; + case TASKFILE_MULTI_OUT: + if (!drive->mult_count) { + /* (hs): give up if multcount is not set */ + pr_err("%s: %s Multimode Write multcount is not set\n", + drive->name, __func__); + err = -EPERM; goto abort; + } + cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; + /* fall through */ + case TASKFILE_OUT: + cmd.protocol = ATA_PROT_PIO; + /* fall through */ + case TASKFILE_OUT_DMAQ: + case TASKFILE_OUT_DMA: + cmd.tf_flags |= IDE_TFLAG_WRITE; + nsect = taskout / SECTOR_SIZE; + data_buf = outbuf; + break; + case TASKFILE_MULTI_IN: + if (!drive->mult_count) { + /* (hs): give up if multcount is not set */ + pr_err("%s: %s Multimode Read multcount is not set\n", + drive->name, __func__); + err = -EPERM; + goto abort; + } + cmd.tf_flags |= IDE_TFLAG_MULTI_PIO; + /* fall through */ + case TASKFILE_IN: + cmd.protocol = ATA_PROT_PIO; + /* fall through */ + case TASKFILE_IN_DMAQ: + case TASKFILE_IN_DMA: + nsect = taskin / SECTOR_SIZE; + data_buf = inbuf; + break; + case TASKFILE_NO_DATA: + cmd.protocol = ATA_PROT_NODATA; + break; + default: + err = -EFAULT; + goto abort; } if (req_task->req_cmd == IDE_DRIVE_TASK_NO_DATA) @@ -631,7 +626,7 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) nsect = (cmd.hob.nsect << 8) | cmd.tf.nsect; if (!nsect) { - printk(KERN_ERR "%s: in/out command without data\n", + pr_err("%s: in/out command without data\n", drive->name); err = -EFAULT; goto abort; @@ -673,8 +668,6 @@ int ide_taskfile_ioctl(ide_drive_t *drive, unsigned long arg) kfree(outbuf); kfree(inbuf); -// printk("IDE Taskfile ioctl ended. rc = %i\n", err); - return err; } #endif From fa56d4cb4022c8b313c3b99236e1b87effc3655b Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 23 Jun 2009 11:29:11 +0000 Subject: [PATCH 0069/1662] ide: allow ide_dev_read_id() to be called from the IRQ context * Un-static __ide_wait_stat(). * Allow ide_dev_read_id() helper to be called from the IRQ context by adding irq_ctx flag and using mdelay()/__ide_wait_stat() when needed. * Switch ide_driveid_update() to set irq_ctx flag. This change is needed for the consecutive patch which fixes races in handling of user-space SET XFER commands but for improved bisectability and clarity it is better to do it in a separate patch. Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-iops.c | 6 +++--- drivers/ide/ide-probe.c | 31 +++++++++++++++++++++---------- include/linux/ide.h | 3 ++- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 2892b242bbe1..b99873845d21 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -102,8 +102,8 @@ EXPORT_SYMBOL(ide_fixstring); * setting a timer to wake up at half second intervals thereafter, * until timeout is achieved, before timing out. */ -static int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, - unsigned long timeout, u8 *rstat) +int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, + unsigned long timeout, u8 *rstat) { ide_hwif_t *hwif = drive->hwif; const struct ide_tp_ops *tp_ops = hwif->tp_ops; @@ -316,7 +316,7 @@ int ide_driveid_update(ide_drive_t *drive) return 0; SELECT_MASK(drive, 1); - rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id); + rc = ide_dev_read_id(drive, ATA_CMD_ID_ATA, id, 1); SELECT_MASK(drive, 0); if (rc) diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 1bb106f6221a..8de442cbee94 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -238,6 +238,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) * @drive: drive to identify * @cmd: command to use * @id: buffer for IDENTIFY data + * @irq_ctx: flag set when called from the IRQ context * * Sends an ATA(PI) IDENTIFY request to a drive and waits for a response. * @@ -246,7 +247,7 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) * 2 device aborted the command (refused to identify itself) */ -int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) +int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id, int irq_ctx) { ide_hwif_t *hwif = drive->hwif; struct ide_io_ports *io_ports = &hwif->io_ports; @@ -263,7 +264,10 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) tp_ops->write_devctl(hwif, ATA_NIEN | ATA_DEVCTL_OBS); /* take a deep breath */ - msleep(50); + if (irq_ctx) + mdelay(50); + else + msleep(50); if (io_ports->ctl_addr && (hwif->host_flags & IDE_HFLAG_BROKEN_ALTSTATUS) == 0) { @@ -295,12 +299,19 @@ int ide_dev_read_id(ide_drive_t *drive, u8 cmd, u16 *id) timeout = ((cmd == ATA_CMD_ID_ATA) ? WAIT_WORSTCASE : WAIT_PIDENTIFY) / 2; - if (ide_busy_sleep(drive, timeout, use_altstatus)) - return 1; - /* wait for IRQ and ATA_DRQ */ - msleep(50); - s = tp_ops->read_status(hwif); + if (irq_ctx) { + rc = __ide_wait_stat(drive, ATA_DRQ, BAD_R_STAT, timeout, &s); + if (rc) + return 1; + } else { + rc = ide_busy_sleep(drive, timeout, use_altstatus); + if (rc) + return 1; + + msleep(50); + s = tp_ops->read_status(hwif); + } if (OK_STAT(s, ATA_DRQ, BAD_R_STAT)) { /* drive returned ID */ @@ -406,10 +417,10 @@ static int do_probe (ide_drive_t *drive, u8 cmd) if (OK_STAT(stat, ATA_DRDY, ATA_BUSY) || present || cmd == ATA_CMD_ID_ATAPI) { - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); if (rc) /* failed: try again */ - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); stat = tp_ops->read_status(hwif); @@ -424,7 +435,7 @@ static int do_probe (ide_drive_t *drive, u8 cmd) msleep(50); tp_ops->exec_command(hwif, ATA_CMD_DEV_RESET); (void)ide_busy_sleep(drive, WAIT_WORSTCASE, 0); - rc = ide_dev_read_id(drive, cmd, id); + rc = ide_dev_read_id(drive, cmd, id, 0); } /* ensure drive IRQ is clear */ diff --git a/include/linux/ide.h b/include/linux/ide.h index edc93a6d931d..cb6cd0459a5e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1081,6 +1081,7 @@ extern void ide_fixstring(u8 *, const int, const int); int ide_busy_sleep(ide_drive_t *, unsigned long, int); +int __ide_wait_stat(ide_drive_t *, u8, u8, unsigned long, u8 *); int ide_wait_stat(ide_startstop_t *, ide_drive_t *, u8, u8, unsigned long); ide_startstop_t ide_do_park_unpark(ide_drive_t *, struct request *); @@ -1169,7 +1170,7 @@ int ide_no_data_taskfile(ide_drive_t *, struct ide_cmd *); int ide_taskfile_ioctl(ide_drive_t *, unsigned long); -int ide_dev_read_id(ide_drive_t *, u8, u16 *); +int ide_dev_read_id(ide_drive_t *, u8, u16 *, int); extern int ide_driveid_update(ide_drive_t *); extern int ide_config_drive_speed(ide_drive_t *, u8); From 665d66e8fad60a5a162c4615f27f916ad1a6d567 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 23 Jun 2009 11:35:51 +0000 Subject: [PATCH 0070/1662] ide: fix races in handling of user-space SET XFER commands * Make cmd->tf_flags field 'u16' and add IDE_TFLAG_SET_XFER taskfile flag. * Update ide_finish_cmd() to set xfer / re-read id if the new flag is set. * Convert set_xfer_rate() (write handler for /proc/ide/hd?/current_speed) and ide_cmd_ioctl() (HDIO_DRIVE_CMD ioctl handler) to use the new flag. * Remove no longer needed disable_irq_nosync() + enable_irq() from ide_config_drive_speed(). Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: David S. Miller --- drivers/ide/ide-ioctls.c | 8 ++------ drivers/ide/ide-iops.c | 10 ---------- drivers/ide/ide-proc.c | 10 ++-------- drivers/ide/ide-taskfile.c | 9 ++++++++- include/linux/ide.h | 3 ++- 5 files changed, 14 insertions(+), 26 deletions(-) diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index e246d3d3fbcc..d3440b5010a5 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -167,6 +167,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) err = -EINVAL; goto abort; } + + cmd.tf_flags |= IDE_TFLAG_SET_XFER; } err = ide_raw_taskfile(drive, &cmd, buf, args[3]); @@ -174,12 +176,6 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg) args[0] = tf->status; args[1] = tf->error; args[2] = tf->nsect; - - if (!err && xfer_rate) { - /* active-retuning-calls future */ - ide_set_xfer_rate(drive, xfer_rate); - ide_driveid_update(drive); - } abort: if (copy_to_user((void __user *)arg, &args, 4)) err = -EFAULT; diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index b99873845d21..b14fa9a87c49 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -363,14 +363,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) * this point (lost interrupt). */ - /* - * FIXME: we race against the running IRQ here if - * this is called from non IRQ context. If we use - * disable_irq() we hang on the error path. Work - * is needed. - */ - disable_irq_nosync(hwif->irq); - udelay(1); tp_ops->dev_select(drive); SELECT_MASK(drive, 1); @@ -394,8 +386,6 @@ int ide_config_drive_speed(ide_drive_t *drive, u8 speed) SELECT_MASK(drive, 0); - enable_irq(hwif->irq); - if (error) { (void) ide_dump_status(drive, "set_drive_speed_status", stat); return error; diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 3242698832a4..021de41655e6 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -195,7 +195,6 @@ ide_devset_get(xfer_rate, current_speed); static int set_xfer_rate (ide_drive_t *drive, int arg) { struct ide_cmd cmd; - int err; if (arg < XFER_PIO_0 || arg > XFER_UDMA_6) return -EINVAL; @@ -206,14 +205,9 @@ static int set_xfer_rate (ide_drive_t *drive, int arg) cmd.tf.nsect = (u8)arg; cmd.valid.out.tf = IDE_VALID_FEATURE | IDE_VALID_NSECT; cmd.valid.in.tf = IDE_VALID_NSECT; + cmd.tf_flags = IDE_TFLAG_SET_XFER; - err = ide_no_data_taskfile(drive, &cmd); - - if (!err) { - ide_set_xfer_rate(drive, (u8) arg); - ide_driveid_update(drive); - } - return err; + return ide_no_data_taskfile(drive, &cmd); } ide_devset_rw(current_speed, xfer_rate); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 50336d51eebc..cc8633cbe133 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -324,10 +324,17 @@ static void ide_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd) void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat) { struct request *rq = drive->hwif->rq; - u8 err = ide_read_error(drive); + u8 err = ide_read_error(drive), nsect = cmd->tf.nsect; + u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER); ide_complete_cmd(drive, cmd, stat, err); rq->errors = err; + + if (err == 0 && set_xfer) { + ide_set_xfer_rate(drive, nsect); + ide_driveid_update(drive); + } + ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq)); } diff --git a/include/linux/ide.h b/include/linux/ide.h index cb6cd0459a5e..803c1ae31237 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -258,6 +258,7 @@ enum { IDE_TFLAG_DYN = (1 << 5), IDE_TFLAG_FS = (1 << 6), IDE_TFLAG_MULTI_PIO = (1 << 7), + IDE_TFLAG_SET_XFER = (1 << 8), }; enum { @@ -294,7 +295,7 @@ struct ide_cmd { } out, in; } valid; - u8 tf_flags; + u16 tf_flags; u8 ftf_flags; /* for TASKFILE ioctl */ int protocol; From 5b7e88edc6193f36941bccbfd5ed9ed5fe27d2e1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:40 +0800 Subject: [PATCH 0071/1662] x86, mce: Support specifying context for software mce injection The cpu context is specified via the new mce.inject_flags fields. This allows more realistic machine check testing in different situations. "RANDOM" context is implemented via NMI broadcasting to add randomization to testing. AK: Fix NMI broadcasting check. Fix 32-bit building. Some race fixes. Move to module. Various changes ChangeLog: v3: - Re-based on latest x86-tip.git/mce4 - Fix 32-bit building v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 11 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 166 +++++++++++++++++++----- 2 files changed, 140 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index ad7535372918..8945be9ad2b1 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -38,6 +38,13 @@ #define MCM_ADDR_MEM 3 /* memory address */ #define MCM_ADDR_GENERIC 7 /* generic */ +#define MCJ_CTX_MASK 3 +#define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) +#define MCJ_CTX_RANDOM 0 /* inject context: random */ +#define MCJ_CTX_PROCESS 1 /* inject context: process */ +#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ + /* Fields are zero when not available */ struct mce { __u64 status; @@ -48,8 +55,8 @@ struct mce { __u64 tsc; /* cpu time stamp counter */ __u64 time; /* wall time_t when error was detected */ __u8 cpuvendor; /* cpu vendor as encoded in system.h */ - __u8 pad1; - __u16 pad2; + __u8 inject_flags; /* software inject flags */ + __u16 pad; __u32 cpuid; /* CPUID 1 EAX */ __u8 cs; /* code segment */ __u8 bank; /* machine check bank */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a3a235a53f09..ad5d92790ebc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -18,7 +18,12 @@ #include #include #include +#include +#include +#include +#include #include +#include /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -39,44 +44,141 @@ static void inject_mce(struct mce *m) i->finished = 1; } -struct delayed_mce { - struct timer_list timer; - struct mce m; -}; - -/* Inject mce on current CPU */ -static void raise_mce(unsigned long data) +static void raise_corrected(struct mce *m) { - struct delayed_mce *dm = (struct delayed_mce *)data; - struct mce *m = &dm->m; - int cpu = m->extcpu; + unsigned long flags; + mce_banks_t b; - inject_mce(m); - if (m->status & MCI_STATUS_UC) { - struct pt_regs regs; + memset(&b, 0xff, sizeof(mce_banks_t)); + local_irq_save(flags); + machine_check_poll(0, &b); + local_irq_restore(flags); + m->finished = 0; +} + +static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +{ + struct pt_regs regs; + unsigned long flags; + + if (!pregs) { memset(®s, 0, sizeof(struct pt_regs)); regs.ip = m->ip; regs.cs = m->cs; - printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); - do_machine_check(®s, 0); - printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); - } else { - mce_banks_t b; - memset(&b, 0xff, sizeof(mce_banks_t)); - printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - machine_check_poll(0, &b); - mce_notify_irq(); - printk(KERN_INFO "Finished machine check poll on CPU %d\n", - cpu); + pregs = ®s; } - kfree(dm); + /* in mcheck exeception handler, irq will be disabled */ + local_irq_save(flags); + do_machine_check(pregs, 0); + local_irq_restore(flags); + m->finished = 0; +} + +static cpumask_t mce_inject_cpumask; + +static int mce_raise_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + return NOTIFY_DONE; + cpu_clear(cpu, mce_inject_cpumask); + if (m->status & MCI_STATUS_UC) + raise_uncorrected(m, args->regs); + else if (m->status) + raise_corrected(m); + return NOTIFY_STOP; +} + +static struct notifier_block mce_raise_nb = { + .notifier_call = mce_raise_notify, + .priority = 1000, +}; + +/* Inject mce on current CPU */ +static int raise_local(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + int ret = 0; + int cpu = m->extcpu; + + if (m->status & MCI_STATUS_UC) { + printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); + switch (context) { + case MCJ_CTX_IRQ: + /* + * Could do more to fake interrupts like + * calling irq_enter, but the necessary + * machinery isn't exported currently. + */ + /*FALL THROUGH*/ + case MCJ_CTX_PROCESS: + raise_uncorrected(m, NULL); + break; + default: + printk(KERN_INFO "Invalid MCE context\n"); + ret = -EINVAL; + } + printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); + } else if (m->status) { + printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); + raise_corrected(m); + mce_notify_irq(); + printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); + } else + m->finished = 0; + + return ret; +} + +static void raise_mce(struct mce *m) +{ + int context = MCJ_CTX(m->inject_flags); + + inject_mce(m); + + if (context == MCJ_CTX_RANDOM) + return; + +#ifdef CONFIG_X86_LOCAL_APIC + if (m->inject_flags & MCJ_NMI_BROADCAST) { + unsigned long start; + int cpu; + get_online_cpus(); + mce_inject_cpumask = cpu_online_map; + cpu_clear(get_cpu(), mce_inject_cpumask); + for_each_online_cpu(cpu) { + struct mce *mcpu = &per_cpu(injectm, cpu); + if (!mcpu->finished || + MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) + cpu_clear(cpu, mce_inject_cpumask); + } + if (!cpus_empty(mce_inject_cpumask)) + apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + start = jiffies; + while (!cpus_empty(mce_inject_cpumask)) { + if (!time_before(jiffies, start + 2*HZ)) { + printk(KERN_ERR + "Timeout waiting for mce inject NMI %lx\n", + *cpus_addr(mce_inject_cpumask)); + break; + } + cpu_relax(); + } + raise_local(m); + put_cpu(); + put_online_cpus(); + } else +#endif + raise_local(m); } /* Error injection interface */ static ssize_t mce_write(struct file *filp, const char __user *ubuf, size_t usize, loff_t *off) { - struct delayed_mce *dm; struct mce m; if (!capable(CAP_SYS_ADMIN)) @@ -96,19 +198,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf, if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu)) return -EINVAL; - dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL); - if (!dm) - return -ENOMEM; - /* * Need to give user space some time to set everything up, * so do it a jiffie or two later everywhere. - * Should we use a hrtimer here for better synchronization? */ - memcpy(&dm->m, &m, sizeof(struct mce)); - setup_timer(&dm->timer, raise_mce, (unsigned long)dm); - dm->timer.expires = jiffies + 2; - add_timer_on(&dm->timer, m.extcpu); + schedule_timeout(2); + raise_mce(&m); return usize; } @@ -116,6 +211,7 @@ static int inject_init(void) { printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; + register_die_notifier(&mce_raise_nb); return 0; } From 0dcc66851f1091af421416c28a9458836885f522 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:41 +0800 Subject: [PATCH 0072/1662] x86, mce: Support specifying raise mode for software MCE injection Raise mode include raising as exception or raising as poll, it is specified via the mce.inject_flags field. This can be used to specify raise mode of UCNA, which is UC error but raised not as exception. And this can be used to test the filter code of poll handler or exception handler too. For example, enforce a poll raise mode for a fatal MCE. ChangeLog: v2: - Re-base on latest x86-tip.git/mce3 Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce-inject.c | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8945be9ad2b1..b608a64c5814 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -44,6 +44,7 @@ #define MCJ_CTX_PROCESS 1 /* inject context: process */ #define MCJ_CTX_IRQ 2 /* inject context: IRQ */ #define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 8 /* raise as exception */ /* Fields are zero when not available */ struct mce { diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index ad5d92790ebc..7029f0e2acad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -44,7 +44,7 @@ static void inject_mce(struct mce *m) i->finished = 1; } -static void raise_corrected(struct mce *m) +static void raise_poll(struct mce *m) { unsigned long flags; mce_banks_t b; @@ -56,7 +56,7 @@ static void raise_corrected(struct mce *m) m->finished = 0; } -static void raise_uncorrected(struct mce *m, struct pt_regs *pregs) +static void raise_exception(struct mce *m, struct pt_regs *pregs) { struct pt_regs regs; unsigned long flags; @@ -85,10 +85,10 @@ static int mce_raise_notify(struct notifier_block *self, if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) return NOTIFY_DONE; cpu_clear(cpu, mce_inject_cpumask); - if (m->status & MCI_STATUS_UC) - raise_uncorrected(m, args->regs); + if (m->inject_flags & MCJ_EXCEPTION) + raise_exception(m, args->regs); else if (m->status) - raise_corrected(m); + raise_poll(m); return NOTIFY_STOP; } @@ -104,7 +104,7 @@ static int raise_local(struct mce *m) int ret = 0; int cpu = m->extcpu; - if (m->status & MCI_STATUS_UC) { + if (m->inject_flags & MCJ_EXCEPTION) { printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu); switch (context) { case MCJ_CTX_IRQ: @@ -115,7 +115,7 @@ static int raise_local(struct mce *m) */ /*FALL THROUGH*/ case MCJ_CTX_PROCESS: - raise_uncorrected(m, NULL); + raise_exception(m, NULL); break; default: printk(KERN_INFO "Invalid MCE context\n"); @@ -124,7 +124,7 @@ static int raise_local(struct mce *m) printk(KERN_INFO "MCE exception done on CPU %d\n", cpu); } else if (m->status) { printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu); - raise_corrected(m); + raise_poll(m); mce_notify_irq(); printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu); } else From 5be9ed251f58881dfc3dd6742a81ff9ad1a7bb04 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:42 +0800 Subject: [PATCH 0073/1662] x86, mce: Move debugfs mce dir creating to mce.c Because more debugfs files under mce dir will be create in mce.c. ChangeLog: v5: - Rebased on x86-tip.git/mce Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce-internal.h | 1 + arch/x86/kernel/cpu/mcheck/mce-severity.c | 4 +--- arch/x86/kernel/cpu/mcheck/mce.c | 13 +++++++++++++ 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 6bd51e7ba87b..32996f9fab67 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -22,6 +22,7 @@ struct mce_bank { }; int mce_severity(struct mce *a, int tolerant, char **msg); +struct dentry *mce_get_debugfs_dir(void); extern int mce_ser; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 51f7c725dab5..bc35a073d151 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -197,7 +197,7 @@ static int __init severities_debugfs_init(void) { struct dentry *dmce = NULL, *fseverities_coverage = NULL; - dmce = debugfs_create_dir("mce", NULL); + dmce = mce_get_debugfs_dir(); if (dmce == NULL) goto err_out; fseverities_coverage = debugfs_create_file("severities-coverage", @@ -209,8 +209,6 @@ static int __init severities_debugfs_init(void) return 0; err_out: - if (dmce) - debugfs_remove(dmce); return -ENOMEM; } late_initcall(severities_debugfs_init); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 1ce6db1f8789..9c7419e459d6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2003,3 +2004,15 @@ static int __init mcheck_disable(char *str) return 1; } __setup("nomce", mcheck_disable); + +#ifdef CONFIG_DEBUG_FS +struct dentry *mce_get_debugfs_dir(void) +{ + static struct dentry *dmce; + + if (!dmce) + dmce = debugfs_create_dir("mce", NULL); + + return dmce; +} +#endif From bf783f9f7d33576815bc89f9f1856a7309ea2f17 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 31 Jul 2009 09:41:43 +0800 Subject: [PATCH 0074/1662] x86, mce: Fake panic support for MCE testing If "fake panic" mode is turned on, just log panic message instead of go real panic. This is used for testing only, so that the test suite can check for the correct panic message and do regression testing for MCE would go panic. This patch is based on x86-tip.git/mce. ChangeLog: v5: - Rebased on x86-tip.git/mce v4: - Move config file from sysfs to debugfs Signed-off-by: Huang Ying Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce.c | 75 +++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 9c7419e459d6..54bd1b2fb4c0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,6 +204,9 @@ static void print_mce_tail(void) static atomic_t mce_paniced; +static int fake_panic; +static atomic_t mce_fake_paniced; + /* Panic in progress. Enable interrupts and wait for final IPI */ static void wait_for_panic(void) { @@ -221,15 +224,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp) { int i; - /* - * Make sure only one CPU runs in machine check panic - */ - if (atomic_inc_return(&mce_paniced) > 1) - wait_for_panic(); - barrier(); + if (!fake_panic) { + /* + * Make sure only one CPU runs in machine check panic + */ + if (atomic_inc_return(&mce_paniced) > 1) + wait_for_panic(); + barrier(); - bust_spinlocks(1); - console_verbose(); + bust_spinlocks(1); + console_verbose(); + } else { + /* Don't log too much for fake panic */ + if (atomic_inc_return(&mce_fake_paniced) > 1) + return; + } print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -256,9 +265,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp) print_mce_tail(); if (exp) printk(KERN_EMERG "Machine check: %s\n", exp); - if (panic_timeout == 0) - panic_timeout = mce_panic_timeout; - panic(msg); + if (!fake_panic) { + if (panic_timeout == 0) + panic_timeout = mce_panic_timeout; + panic(msg); + } else + printk(KERN_EMERG "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -2015,4 +2027,45 @@ struct dentry *mce_get_debugfs_dir(void) return dmce; } + +static void mce_reset(void) +{ + cpu_missing = 0; + atomic_set(&mce_fake_paniced, 0); + atomic_set(&mce_executing, 0); + atomic_set(&mce_callin, 0); + atomic_set(&global_nwo, 0); +} + +static int fake_panic_get(void *data, u64 *val) +{ + *val = fake_panic; + return 0; +} + +static int fake_panic_set(void *data, u64 val) +{ + mce_reset(); + fake_panic = val; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, + fake_panic_set, "%llu\n"); + +static int __init mce_debugfs_init(void) +{ + struct dentry *dmce, *ffake_panic; + + dmce = mce_get_debugfs_dir(); + if (!dmce) + return -ENOMEM; + ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, + &fake_panic_fops); + if (!ffake_panic) + return -ENOMEM; + + return 0; +} +late_initcall(mce_debugfs_init); #endif From efe1f30e8f05a3836dfcf6302774707b3b47193f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:53 -0400 Subject: [PATCH 0075/1662] xfs: avoid memory allocation under m_peraglock in growfs code Allocate the memory for the larger m_perag array before taking the per-AG lock as the per-AG lock can be taken under the i_lock which can be taken from reclaim context. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451bb4848..2d0b3e1da9e6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,17 +167,25 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + void *new_perag, *old_perag; + xfs_filestream_flush(mp); + + new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, + KM_MAYFAIL); + if (!new_perag) + return XFS_ERROR(ENOMEM); + down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); + memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); + old_perag = mp->m_perag; + mp->m_perag = new_perag; + mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); + + kmem_free(old_perag); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; From 1ec2b8a777d6ac5464a836b3c5520b1a5190194c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:54 -0400 Subject: [PATCH 0076/1662] xfs: switch to NOFS allocation under i_lock in xfs_getbmap xfs_getbmap allocates memory with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 975972482e8f..8971fb09d387 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6009,7 +6009,7 @@ xfs_getbmap( */ error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) goto out_unlock_ilock; From 0043dd88af762d3a9154317d351400b5b941ec8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:55 -0400 Subject: [PATCH 0077/1662] xfs: switch to NOFS allocation under i_lock in xfs_da_state_alloc xfs_da_state_alloc is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_da_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57a5075..bd0bb6dfcdfa 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ xfs_da_state_t * xfs_da_state_alloc(void) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); } /* From eeb6f67ba1176a141b6802b2ec8505835482f631 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:56 -0400 Subject: [PATCH 0078/1662] xfs: switch to NOFS allocation under i_lock in xfs_da_buf_make i_lock is taken in the reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_da_btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index bd0bb6dfcdfa..2847bbc1c534 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) int off; if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; #ifdef XFS_DABUF_DEBUG dabuf->ra = ra; From ea78cd5e6b6ebe123641382d5040252d19829332 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:57 -0400 Subject: [PATCH 0079/1662] xfs: switch to NOFS allocation under i_lock in xfs_dir_cilookup_result xfs_dir_cilookup_result is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dir2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec6d951..bb1d58eb3982 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -256,7 +256,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return EEXIST; - args->value = kmem_alloc(len, KM_MAYFAIL); + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) return ENOMEM; From 837273b8a150a3761467573c5eb477791cf19268 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:58 -0400 Subject: [PATCH 0080/1662] xfs: switch to NOFS allocation under i_lock in xfs_buf_associate_memory xfs_buf_associate_memory is used for setting up the spare buffer for the log wrap case in xlog_sync which can happen under i_lock when called from xfs_fsync. The i_lock mutex is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. There are a couple more uses of xfs_buf_associate_memory in the log recovery code that are also affected by this, but I'd rather keep the code simple than passing on a gfp_mask argument. Longer term we should just stop requiring the memoery allocation in xlog_sync by some smaller rework of the buffer layer. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e28800a9f2b5..da3b666f1ace 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -770,7 +770,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); if (rval) return rval; From f56ebd66df411446030e2cc820554c6ce4ba0836 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:59 -0400 Subject: [PATCH 0081/1662] xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_set xfs_attr_rmtval_set is always called with i_lock held, and i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_attr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb906ff..bfb583791fe2 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2141,8 +2141,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); From 5a25f1abb26f5ab7c6c72fe0b3807c137ee484bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:00 -0400 Subject: [PATCH 0082/1662] xfs: switch to NOFS allocation under i_lock in xfs_readlink_bmap xfs_readlink_bmap is called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_vnodeops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 1dd706887156..ceecafd1f9c1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,7 +538,9 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | + XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", From 7509ee34a365c041718d0b676e5ee255b58b5dda Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:01 -0400 Subject: [PATCH 0083/1662] xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_get xfs_attr_rmtval_get is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_attr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index bfb583791fe2..4ece1906bd41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); + blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK, + &bp); if (error) return(error); From 583e0e41ee29478e2147aeffeb772657f8db767a Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 20 Jul 2009 10:52:15 -0500 Subject: [PATCH 0084/1662] use XFS_CORRUPTION_ERROR in xfs_btree_check_sblock In Red Hat Bug 512552 - Can't write to XFS mount during raid5 resync a user ran into corruption while resyncing a raid, and we failed a consistency test, but didn't get much more info; it'd be nice to call XFS_CORRUPTION_ERROR here so we can see the buffer contents. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/xfs_btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index cde5a2668293..52b5f14d0c32 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -120,8 +120,8 @@ xfs_btree_check_sblock( XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); } return 0; From 16715dbe64ccac265010ab8b60848d212d002521 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:18 -0300 Subject: [PATCH 0085/1662] xfs: check for dinode realtime flag corruption Ramon tested XFS with a modified version of fsfuzzer and hit a NULL pointer dereference in __xfs_get_blocks due to the RT device target pointer being NULL. To fix this reject inode with the realtime bit set on a a filesystem without an RT subvolume during inode read. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Reported-by: Ramon de Carvalho Valle Tested-by: Ramon de Carvalho Valle Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2dcb3d781ae5..c1dc7ef5a1d8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -343,6 +343,16 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + switch (ip->i_d.di_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: From 79dd43bb85d64ba14a781f940c858d7bbe8c9a6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:44 -0300 Subject: [PATCH 0086/1662] xfs: fix spin_is_locked assert on uni-processor builds Without SMP or preemption spin_is_locked always returns false, so we can't do an assert with it. Instead use assert_spin_locked, which does the right thing on all builds. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reported-by: Johannes Engel Tested-by: Johannes Engel Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04ede0b..9dbdff3ea484 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3180,7 +3180,7 @@ xlog_state_sync(xlog_t *log, STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - ASSERT(spin_is_locked(&log->l_icloglock)); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); From 81e2d7b30d718824434725a4a24d5864a71b1d30 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 12 Aug 2009 05:45:34 -0700 Subject: [PATCH 0087/1662] x86, intel_txt: tboot.c needs arch/x86/kernel/tboot.c needs . In most configurations that ends up getting implicitly included, but not in all, causing build failures in some configurations. Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin Cc: Joseph Cihula Cc: Shane Wang --- arch/x86/kernel/tboot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 263591afd29e..1ab801208945 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include From 971f3918a5a8febbbab355079972fb31ee7c0f33 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: [PATCH 0088/1662] percpu: fix pcpu_reclaim() locking pcpu_reclaim() calls pcpu_depopulate_chunk() which makes use of pages array and bitmap returned by pcpu_get_pages_and_bitmap() and thus should be called under pcpu_alloc_mutex. pcpu_reclaim() released the mutex before calling depopulate leading to double free and other strange problems caused by the unexpected concurrent usages of pages array and bitmap. Fix it. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- mm/percpu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index 3f9f182f9b44..42ab0024a6ed 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1181,12 +1181,13 @@ static void pcpu_reclaim(struct work_struct *work) } spin_unlock_irq(&pcpu_lock); - mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } + + mutex_unlock(&pcpu_alloc_mutex); } /** From 004018e2c06b9c650e88dddd973ae36799ed72b9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: [PATCH 0089/1662] percpu: improve boot messages Improve percpu boot messages such that they're uniform and contain more information. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- mm/percpu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 42ab0024a6ed..cbddcbdab681 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1488,8 +1488,9 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - size_sum >> PAGE_SHIFT, base, static_size); + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, + unit_size); return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); @@ -1579,8 +1580,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n", - unit_pages, static_size); + pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", + unit_pages, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); @@ -1898,8 +1899,8 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, static_size); /* we're ready, commit */ - pr_info("PERCPU: Remapped at %p with large pages, static data " - "%zu bytes\n", vm.addr, static_size); + pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", + vm.addr, static_size, reserved_size, dyn_size, unit_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, vm.addr, unit_map); From 00ae4064b1445524752575dd84df227c0687c99d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: [PATCH 0090/1662] percpu: rename 4k first chunk allocator to page Page size isn't always 4k depending on arch and configuration. Rename 4k first chunk allocator to page. Signed-off-by: Tejun Heo Cc: David Howells --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup_percpu.c | 23 ++++++++++++----------- include/linux/percpu.h | 2 +- mm/percpu.c | 25 ++++++++++++++----------- 4 files changed, 28 insertions(+), 24 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7936b801fe6a..12e9eb77ee0d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,7 +1920,7 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "4k". + Allowed values are one of "lpage", "embed" and "page". See comments in arch/x86/kernel/setup_percpu.c for details on each allocator. This parameter is primarily for debugging and performance comparison. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index a26ff61e2fb0..1e17711c29d6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -249,21 +249,22 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) } /* - * 4k allocator + * Page allocator * - * Boring fallback 4k allocator. This allocator puts more pressure on - * PTE TLBs but other than that behaves nicely on both UMA and NUMA. + * Boring fallback 4k page allocator. This allocator puts more + * pressure on PTE TLBs but other than that behaves nicely on both UMA + * and NUMA. */ -static void __init pcpu4k_populate_pte(unsigned long addr) +static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_4k(size_t static_size) +static ssize_t __init setup_pcpu_page(size_t static_size) { - return pcpu_4k_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpu4k_populate_pte); + return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); } /* for explicit first chunk allocator selection */ @@ -307,7 +308,7 @@ void __init setup_per_cpu_areas(void) */ ret = -EINVAL; if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "4k")) { + if (strcmp(pcpu_chosen_alloc, "page")) { if (!strcmp(pcpu_chosen_alloc, "lpage")) ret = setup_pcpu_lpage(static_size, true); else if (!strcmp(pcpu_chosen_alloc, "embed")) @@ -317,7 +318,7 @@ void __init setup_per_cpu_areas(void) "specified\n", pcpu_chosen_alloc); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " - "falling back to 4k\n", + "falling back to page size\n", pcpu_chosen_alloc, ret); } } else { @@ -326,7 +327,7 @@ void __init setup_per_cpu_areas(void) ret = setup_pcpu_embed(static_size, false); } if (ret < 0) - ret = setup_pcpu_4k(static_size); + ret = setup_pcpu_page(static_size); if (ret < 0) panic("cannot allocate static percpu area (%zu bytes, err=%zd)", static_size, ret); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e134c8229631..7989f61b03f3 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -74,7 +74,7 @@ extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); -extern ssize_t __init pcpu_4k_first_chunk( +extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, diff --git a/mm/percpu.c b/mm/percpu.c index cbddcbdab681..6feac7934904 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1497,15 +1497,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /** - * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * - * This is a helper to ease setting up embedded first percpu chunk and - * can be called where pcpu_setup_first_chunk() is expected. + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. @@ -1514,12 +1514,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; @@ -1527,6 +1528,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, int i, j; ssize_t ret; + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, PCPU_MIN_UNIT_SIZE)); @@ -1542,8 +1545,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate " - "4k page for cpu%u\n", cpu); + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); goto enomem; } pages[j++] = virt_to_page(ptr); @@ -1580,8 +1583,8 @@ ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d 4k pages/cpu @%p s%zu r%zu\n", - unit_pages, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size); ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, unit_pages << PAGE_SHIFT, vm.addr, NULL); From 08fc45806103e59a37418e84719b878f9bb32540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:49 +0900 Subject: [PATCH 0091/1662] percpu: build first chunk allocators selectively There's no need to build unused first chunk allocators in. Define CONFIG_NEED_PER_CPU_*_FIRST_CHUNK and let archs enable them selectively. Signed-off-by: Tejun Heo --- arch/x86/Kconfig | 10 ++++++++++ include/linux/percpu.h | 27 +++++---------------------- mm/percpu.c | 19 +++++++++++-------- 3 files changed, 26 insertions(+), 30 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e06b2eeff9f2..f7ac27215512 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -150,6 +150,16 @@ config ARCH_HAS_CACHE_LINE_SIZE config HAVE_SETUP_PER_CPU_AREA def_bool y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + def_bool y + +config NEED_PER_CPU_LPAGE_FIRST_CHUNK + def_bool y + depends on NEED_MULTIPLE_NODES + config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 7989f61b03f3..e26788e0da4a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -70,17 +70,21 @@ extern size_t __init pcpu_setup_first_chunk( ssize_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( size_t static_size, size_t reserved_size, ssize_t dyn_size); +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( size_t static_size, size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); +#endif -#ifdef CONFIG_NEED_MULTIPLE_NODES +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( size_t static_size, size_t reserved_size, ssize_t *dyn_sizep, size_t *unit_sizep, @@ -98,27 +102,6 @@ extern ssize_t __init pcpu_lpage_first_chunk( extern void *pcpu_lpage_remapped(void *kaddr); #else -static inline int pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - return -EINVAL; -} - -static inline ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - return -EINVAL; -} - static inline void *pcpu_lpage_remapped(void *kaddr) { return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index 6feac7934904..7971997de310 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,8 +1414,9 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } -static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep) +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) { size_t size_sum; @@ -1427,6 +1428,8 @@ static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, return size_sum; } +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @static_size: the size of static percpu area in bytes @@ -1495,7 +1498,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, unit_size, base, NULL); } +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @static_size: the size of static percpu area in bytes @@ -1598,12 +1604,9 @@ ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, free_bootmem(__pa(pages), pages_size); return ret; } +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -/* - * Large page remapping first chunk setup helper - */ -#ifdef CONFIG_NEED_MULTIPLE_NODES - +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping * @static_size: the size of static percpu area in bytes @@ -1982,7 +1985,7 @@ void *pcpu_lpage_remapped(void *kaddr) return NULL; } -#endif +#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ /* * Generic percpu area setup. From f58dc01ba2ca9fe3ab2ba4ca43d9c8a735cf62d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: [PATCH 0092/1662] percpu: generalize first chunk allocator selection Now that all first chunk allocators are in mm/percpu.c, it makes sense to make generalize percpu_alloc kernel parameter. Define PCPU_FC_* and set pcpu_chosen_fc using early_param() in mm/percpu.c. Arch code can use the set value to determine which first chunk allocator to use. Signed-off-by: Tejun Heo --- Documentation/kernel-parameters.txt | 11 +++++----- arch/x86/kernel/setup_percpu.c | 24 ++++++---------------- include/linux/percpu.h | 12 +++++++++++ mm/percpu.c | 32 +++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 12e9eb77ee0d..dee9ce2e6cfa 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1919,11 +1919,12 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= [X86] Select which percpu first chunk allocator to use. - Allowed values are one of "lpage", "embed" and "page". - See comments in arch/x86/kernel/setup_percpu.c for - details on each allocator. This parameter is primarily - for debugging and performance comparison. + percpu_alloc= Select which percpu first chunk allocator to use. + Currently supported values are "embed", "page" and + "lpage". Archs may support subset or none of the + selections. See comments in mm/percpu.c for details + on each allocator. This parameter is primarily for + debugging and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1e17711c29d6..b961d99e6416 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -267,16 +267,6 @@ static ssize_t __init setup_pcpu_page(size_t static_size) pcpup_populate_pte); } -/* for explicit first chunk allocator selection */ -static char pcpu_chosen_alloc[16] __initdata; - -static int __init percpu_alloc_setup(char *str) -{ - strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1); - return 0; -} -early_param("percpu_alloc", percpu_alloc_setup); - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -307,19 +297,17 @@ void __init setup_per_cpu_areas(void) * each allocator for details. */ ret = -EINVAL; - if (strlen(pcpu_chosen_alloc)) { - if (strcmp(pcpu_chosen_alloc, "page")) { - if (!strcmp(pcpu_chosen_alloc, "lpage")) + if (pcpu_chosen_fc != PCPU_FC_AUTO) { + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + if (pcpu_chosen_fc == PCPU_FC_LPAGE) ret = setup_pcpu_lpage(static_size, true); - else if (!strcmp(pcpu_chosen_alloc, "embed")) - ret = setup_pcpu_embed(static_size, true); else - pr_warning("PERCPU: unknown allocator %s " - "specified\n", pcpu_chosen_alloc); + ret = setup_pcpu_embed(static_size, true); + if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " "falling back to page size\n", - pcpu_chosen_alloc, ret); + pcpu_fc_names[pcpu_chosen_fc], ret); } } else { ret = setup_pcpu_lpage(static_size, false); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e26788e0da4a..9be05cbe5ee0 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,18 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +enum pcpu_fc { + PCPU_FC_AUTO, + PCPU_FC_EMBED, + PCPU_FC_PAGE, + PCPU_FC_LPAGE, + + PCPU_FC_NR, +}; +extern const char *pcpu_fc_names[PCPU_FC_NR]; + +extern enum pcpu_fc pcpu_chosen_fc; + typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); diff --git a/mm/percpu.c b/mm/percpu.c index 7971997de310..7fb40bb1555a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1414,6 +1414,38 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, return pcpu_unit_size; } +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", + [PCPU_FC_LPAGE] = "lpage", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK + else if (!strcmp(str, "lpage")) + pcpu_chosen_fc = PCPU_FC_LPAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); + static inline size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size, ssize_t *dyn_sizep) From 9a7737691e90d3cce0e5248f91826c50e5aa3fcf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: [PATCH 0093/1662] percpu: drop @static_size from first chunk allocators First chunk allocators assume percpu areas have been linked using one of PERCPU_*() macros and depend on __per_cpu_load symbol defined by those macros, so there isn't much point in passing in static area size explicitly when it can be easily calculated from __per_cpu_start and __per_cpu_end. Drop @static_size from all percpu first chunk allocators and helpers. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 34 +++++++++++++++------------------- include/linux/percpu.h | 18 ++++++++---------- mm/percpu.c | 29 +++++++++++++---------------- 3 files changed, 36 insertions(+), 45 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index b961d99e6416..8aad486c688f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,7 +157,7 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; @@ -184,8 +184,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return -ENOMEM; } - ret = pcpu_lpage_build_unit_map(static_size, - PERCPU_FIRST_CHUNK_RESERVE, + ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, &dyn_size, &unit_size, PMD_SIZE, unit_map, pcpu_lpage_cpu_distance); if (ret < 0) { @@ -208,9 +207,8 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) } } - ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, - dyn_size, unit_size, PMD_SIZE, - unit_map, nr_units, + ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + unit_size, PMD_SIZE, unit_map, nr_units, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: if (ret < 0) @@ -218,7 +216,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) return ret; } #else -static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -232,7 +230,7 @@ static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) +static ssize_t __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -244,7 +242,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen) if (!chosen && (!cpu_has_pse || pcpu_need_numa())) return -EINVAL; - return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, reserve - PERCPU_FIRST_CHUNK_RESERVE); } @@ -260,9 +258,9 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(size_t static_size) +static ssize_t __init setup_pcpu_page(void) { - return pcpu_page_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE, + return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); } @@ -282,7 +280,6 @@ static inline void setup_percpu_segment(int cpu) void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; unsigned int cpu; unsigned long delta; size_t pcpu_unit_size; @@ -300,9 +297,9 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(static_size, true); + ret = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(static_size, true); + ret = setup_pcpu_embed(true); if (ret < 0) pr_warning("PERCPU: %s allocator failed (%zd), " @@ -310,15 +307,14 @@ void __init setup_per_cpu_areas(void) pcpu_fc_names[pcpu_chosen_fc], ret); } } else { - ret = setup_pcpu_lpage(static_size, false); + ret = setup_pcpu_lpage(false); if (ret < 0) - ret = setup_pcpu_embed(static_size, false); + ret = setup_pcpu_embed(false); } if (ret < 0) - ret = setup_pcpu_page(static_size); + ret = setup_pcpu_page(); if (ret < 0) - panic("cannot allocate static percpu area (%zu bytes, err=%zd)", - static_size, ret); + panic("cannot initialize percpu area (err=%zd)", ret); pcpu_unit_size = ret; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 9be05cbe5ee0..be2fc8fb9b6f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -84,13 +84,12 @@ extern size_t __init pcpu_setup_first_chunk( #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( - size_t static_size, size_t reserved_size, - ssize_t dyn_size); + size_t reserved_size, ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern ssize_t __init pcpu_page_first_chunk( - size_t static_size, size_t reserved_size, + size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); @@ -98,16 +97,15 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern int __init pcpu_lpage_build_unit_map( - size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); extern ssize_t __init pcpu_lpage_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 7fb40bb1555a..e2ac58a39bb2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1464,7 +1464,6 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @@ -1489,9 +1488,9 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size) +ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t size_sum, unit_size, chunk_size; void *base; unsigned int cpu; @@ -1536,7 +1535,6 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: funtion to free percpu page, always called with PAGE_SIZE @@ -1552,12 +1550,13 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, +ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1641,7 +1640,6 @@ ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_sizep: in/out parameter for dynamic size, -1 for auto * @unit_sizep: out parameter for unit size @@ -1661,13 +1659,14 @@ ssize_t __init pcpu_page_first_chunk(size_t static_size, size_t reserved_size, * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and * returns the number of units to be allocated. -errno on failure. */ -int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size, - ssize_t *dyn_sizep, size_t *unit_sizep, - size_t lpage_size, int *unit_map, +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; int group_cnt_max = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ @@ -1819,7 +1818,6 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes @@ -1850,15 +1848,15 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - size_t lpage_size, const int *unit_map, - int nr_units, +ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; + const size_t static_size = __per_cpu_end - __per_cpu_start; size_t chunk_size = unit_size * nr_units; size_t map_size; unsigned int cpu; @@ -2037,7 +2035,6 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - size_t static_size = __per_cpu_end - __per_cpu_start; ssize_t unit_size; unsigned long delta; unsigned int cpu; @@ -2046,7 +2043,7 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE, + unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE); if (unit_size < 0) panic("Failed to initialized percpu areas."); From 1d9d32572163b30be81dbe1409dfa7ea9763d0e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: [PATCH 0094/1662] percpu: make @dyn_size mandatory for pcpu_setup_first_chunk() Now that all actual first chunk allocation and copying happen in the first chunk allocators and helpers, there's no reason for pcpu_setup_first_chunk() to try to determine @dyn_size automatically. The only left user is page first chunk allocator. Make it determine dyn_size like other allocators and make @dyn_size mandatory for pcpu_setup_first_chunk(). Signed-off-by: Tejun Heo --- include/linux/percpu.h | 2 +- mm/percpu.c | 39 +++++++++++++++++++-------------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index be2fc8fb9b6f..0cfdd14d096a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -79,7 +79,7 @@ typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t unit_size, + size_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index e2ac58a39bb2..287f59cc5fb9 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1235,7 +1235,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @dyn_size: free size for dynamic allocation in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE * @base_addr: mapped address * @unit_map: cpu -> unit map, NULL for sequential mapping @@ -1252,10 +1252,9 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @static_size + + * @reserved_size + @dyn_size and @unit_size is unused. * * @unit_size specifies unit size and must be aligned to PAGE_SIZE and * equal to or larger than @static_size + @reserved_size + if @@ -1276,13 +1275,12 @@ EXPORT_SYMBOL_GPL(free_percpu); * percpu access. */ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, size_t unit_size, + size_t dyn_size, size_t unit_size, void *base_addr, const int *unit_map) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + size_t size_sum = static_size + reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu, tcpu; int i; @@ -1345,9 +1343,6 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; - first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; first_vm.addr = base_addr; @@ -1557,6 +1552,8 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, { static struct vm_struct vm; const size_t static_size = __per_cpu_end - __per_cpu_start; + ssize_t dyn_size = -1; + size_t size_sum, unit_size; char psize_str[16]; int unit_pages; size_t pages_size; @@ -1567,8 +1564,9 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size, - PCPU_MIN_UNIT_SIZE)); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + unit_pages = unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); @@ -1591,12 +1589,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT; + vm.size = nr_cpu_ids * unit_size; vm_area_register_early(&vm, PAGE_SIZE); for_each_possible_cpu(cpu) { - unsigned long unit_addr = (unsigned long)vm.addr + - (cpu * unit_pages << PAGE_SHIFT); + unsigned long unit_addr = + (unsigned long)vm.addr + cpu * unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); @@ -1620,11 +1618,12 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, static_size, reserved_size, + dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, -1, - unit_pages << PAGE_SHIFT, vm.addr, NULL); + ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, + unit_size, vm.addr, NULL); goto out_free_ar; enomem: From 3cbc85652767c38b252c8de55f9fd180b29e4c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:50 +0900 Subject: [PATCH 0095/1662] percpu: add @align to pcpu_fc_alloc_fn_t pcpu_fc_alloc_fn_t is about to see more interesting usage, add @align parameter. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 ++-- include/linux/percpu.h | 3 ++- mm/percpu.c | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8aad486c688f..660cde133141 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -126,9 +126,9 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return pcpu_alloc_bootmem(cpu, size, size); + return pcpu_alloc_bootmem(cpu, size, align); } static void __init pcpu_fc_free(void *ptr, size_t size) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 0cfdd14d096a..d385dbcf190b 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -71,7 +71,8 @@ extern const char *pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, + size_t align); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); diff --git a/mm/percpu.c b/mm/percpu.c index 287f59cc5fb9..3316e3aac7ee 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1578,7 +1578,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warning("PERCPU: failed to allocate %s page " "for cpu%u\n", psize_str, cpu); @@ -1888,7 +1888,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, goto found; continue; found: - ptr = alloc_fn(cpu, lpage_size); + ptr = alloc_fn(cpu, lpage_size, lpage_size); if (!ptr) { pr_warning("PERCPU: failed to allocate large page " "for cpu%u\n", cpu); From 033e48fb82958053113178264ddb9d5038d5e38b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: [PATCH 0096/1662] percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward Unit map handling will be generalized and extended and used for embedding sparse first chunk and other purposes. Relocate two unit_map related functions upward in preparation. This patch just moves the code without any actual change. Signed-off-by: Tejun Heo --- include/linux/percpu.h | 14 +- mm/percpu.c | 339 +++++++++++++++++++++-------------------- 2 files changed, 180 insertions(+), 173 deletions(-) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d385dbcf190b..570fb18de2ba 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -78,6 +78,14 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK +extern int __init pcpu_lpage_build_unit_map( + size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); +#endif + extern size_t __init pcpu_setup_first_chunk( size_t static_size, size_t reserved_size, size_t dyn_size, size_t unit_size, @@ -97,12 +105,6 @@ extern ssize_t __init pcpu_page_first_chunk( #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn); - extern ssize_t __init pcpu_lpage_first_chunk( size_t reserved_size, size_t dyn_size, size_t unit_size, size_t lpage_size, diff --git a/mm/percpu.c b/mm/percpu.c index 3316e3aac7ee..2b9c4b2a2fc0 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1231,6 +1231,178 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + +#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK +/** + * pcpu_lpage_build_unit_map - build unit_map for large page remapping + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_sizep: in/out parameter for dynamic size, -1 for auto + * @unit_sizep: out parameter for unit size + * @unit_map: unit_map to be filled + * @cpu_distance_fn: callback to determine distance between cpus + * + * This function builds cpu -> unit map and determine other parameters + * considering needed percpu size, large page size and distances + * between CPUs in NUMA. + * + * CPUs which are of LOCAL_DISTANCE both ways are grouped together and + * may share units in the same large page. The returned configuration + * is guaranteed to have CPUs on different nodes on different large + * pages and >=75% usage of allocated virtual address space. + * + * RETURNS: + * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and + * returns the number of units to be allocated. -errno on failure. + */ +int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, + size_t *unit_sizep, size_t lpage_size, + int *unit_map, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int group_cnt_max = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs; + unsigned int cpu, tcpu; + int group, unit; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of lpage_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, lpage_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group_cnt[group]; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + *unit_sizep = alloc_size / best_upa; + + /* assign units to cpus accordingly */ + unit = 0; + for (group = 0; group_cnt[group]; group++) { + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + unit_map[cpu] = unit++; + unit = roundup(unit, best_upa); + } + + return unit; /* unit contains aligned number of units */ +} + +static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, + unsigned int *cpup); + +static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, + size_t reserved_size, size_t dyn_size, + size_t unit_size, size_t lpage_size, + const int *unit_map, int nr_units) +{ + int width = 1, v = nr_units; + char empty_str[] = "--------"; + int upl, lpl; /* units per lpage, lpage per line */ + unsigned int cpu; + int lpage, unit; + + while (v /= 10) + width++; + empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + + upl = max_t(int, lpage_size / unit_size, 1); + lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + + printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, + static_size, reserved_size, dyn_size, unit_size, lpage_size); + + for (lpage = 0, unit = 0; unit < nr_units; unit++) { + if (!(unit % upl)) { + if (!(lpage++ % lpl)) { + printk("\n"); + printk("%spcpu-lpage: ", lvl); + } else + printk("| "); + } + if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + printk("%0*d ", width, cpu); + else + printk("%s ", empty_str); + } + printk("\n"); +} +#endif + /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @static_size: the size of static percpu area in bytes @@ -1441,20 +1613,6 @@ static int __init percpu_alloc_setup(char *str) } early_param("percpu_alloc", percpu_alloc_setup); -static inline size_t pcpu_calc_fc_sizes(size_t static_size, - size_t reserved_size, - ssize_t *dyn_sizep) -{ - size_t size_sum; - - size_sum = PFN_ALIGN(static_size + reserved_size + - (*dyn_sizep >= 0 ? *dyn_sizep : 0)); - if (*dyn_sizep != 0) - *dyn_sizep = size_sum - static_size - reserved_size; - - return size_sum; -} - #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** @@ -1637,122 +1795,6 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -/** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus - * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. - * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. - * - * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. - */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) -{ - static int group_map[NR_CPUS] __initdata; - static int group_cnt[NR_CPUS] __initdata; - const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; - size_t size_sum, min_unit_size, alloc_size; - int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; - unsigned int cpu, tcpu; - int group, unit; - - /* - * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest - * which can accomodate 4k aligned segments which are equal to - * or larger than min_unit_size. - */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); - min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - - alloc_size = roundup(min_unit_size, lpage_size); - upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - upa--; - max_upa = upa; - - /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - goto next_group; - } - } - group_map[cpu] = group; - group_cnt[group]++; - group_cnt_max = max(group_cnt_max, group_cnt[group]); - } - - /* - * Expand unit size until address space usage goes over 75% - * and then as much as possible without using more address - * space. - */ - last_allocs = INT_MAX; - for (upa = max_upa; upa; upa--) { - int allocs = 0, wasted = 0; - - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) - continue; - - for (group = 0; group_cnt[group]; group++) { - int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); - allocs += this_allocs; - wasted += this_allocs * upa - group_cnt[group]; - } - - /* - * Don't accept if wastage is over 25%. The - * greater-than comparison ensures upa==1 always - * passes the following check. - */ - if (wasted > num_possible_cpus() / 3) - continue; - - /* and then don't consume more memory */ - if (allocs > last_allocs) - break; - last_allocs = allocs; - best_upa = upa; - } - *unit_sizep = alloc_size / best_upa; - - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); - } - - return unit; /* unit contains aligned number of units */ -} - struct pcpul_ent { void *ptr; void *map_addr; @@ -1778,43 +1820,6 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, return false; } -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) -{ - int width = 1, v = nr_units; - char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; - - while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; - - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); - - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); - - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { - printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); - } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); - } - printk("\n"); -} - /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page * @reserved_size: the size of reserved percpu area in bytes From fd1e8a1fe2b54df6c185b4fa65f181f50b9c4d4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: [PATCH 0097/1662] percpu: introduce pcpu_alloc_info and pcpu_group_info Till now, non-linear cpu->unit map was expressed using an integer array which maps each cpu to a unit and used only by lpage allocator. Although how many units have been placed in a single contiguos area (group) is known while building unit_map, the information is lost when the result is recorded into the unit_map array. For lpage allocator, as all allocations are done by lpages and whether two adjacent lpages are in the same group or not is irrelevant, this didn't cause any problem. Non-linear cpu->unit mapping will be used for sparse embedding and this grouping information is necessary for that. This patch introduces pcpu_alloc_info which contains all the information necessary for initializing percpu allocator. pcpu_alloc_info contains array of pcpu_group_info which describes how units are grouped and mapped to cpus. pcpu_group_info also has base_offset field to specify its offset from the chunk's base address. pcpu_build_alloc_info() initializes this field as if all groups are allocated back-to-back as is currently done but this will be used to sparsely place groups. pcpu_alloc_info is a rather complex data structure which contains a flexible array which in turn points to nested cpu_map arrays. * pcpu_alloc_alloc_info() and pcpu_free_alloc_info() are provided to help dealing with pcpu_alloc_info. * pcpu_lpage_build_unit_map() is updated to build pcpu_alloc_info, generalized and renamed to pcpu_build_alloc_info(). @cpu_distance_fn may be NULL indicating that all cpus are of LOCAL_DISTANCE. * pcpul_lpage_dump_cfg() is updated to process pcpu_alloc_info, generalized and renamed to pcpu_dump_alloc_info(). It now also prints which group each alloc unit belongs to. * pcpu_setup_first_chunk() now takes pcpu_alloc_info instead of the separate parameters. All first chunk allocators are updated to use pcpu_build_alloc_info() to build alloc_info and call pcpu_setup_first_chunk() with it. This has the side effect of packing units for sparse possible cpus. ie. if cpus 0, 2 and 4 are possible, they'll be assigned unit 0, 1 and 2 instead of 0, 2 and 4. * x86 setup_pcpu_lpage() is updated to deal with alloc_info. * sparc64 setup_per_cpu_areas() is updated to build alloc_info. Although the changes made by this patch are pretty pervasive, it doesn't cause any behavior difference other than packing of sparse cpus. It mostly changes how information is passed among initialization functions and makes room for more flexibility. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: David Miller --- arch/sparc/kernel/smp_64.c | 26 +- arch/x86/kernel/setup_percpu.c | 38 +-- include/linux/percpu.h | 44 ++- mm/percpu.c | 533 ++++++++++++++++++++------------- 4 files changed, 393 insertions(+), 248 deletions(-) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 9856d866b77b..a42a4a744d14 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1475,17 +1475,29 @@ static void __init pcpu_map_range(unsigned long start, unsigned long end, void __init setup_per_cpu_areas(void) { - size_t dyn_size, static_size = __per_cpu_end - __per_cpu_start; static struct vm_struct vm; + struct pcpu_alloc_info *ai; unsigned long delta, cpu; size_t size_sum, pcpu_unit_size; size_t ptrs_size; void **ptrs; - size_sum = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE + - PERCPU_DYNAMIC_RESERVE); - dyn_size = size_sum - static_size - PERCPU_MODULE_RESERVE; + ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); + ai->static_size = __per_cpu_end - __per_cpu_start; + ai->reserved_size = PERCPU_MODULE_RESERVE; + + size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size + + PERCPU_DYNAMIC_RESERVE); + + ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; + ai->unit_size = PCPU_CHUNK_SIZE; + ai->atom_size = PCPU_CHUNK_SIZE; + ai->alloc_size = PCPU_CHUNK_SIZE; + ai->groups[0].nr_units = nr_cpu_ids; + + for_each_possible_cpu(cpu) + ai->groups[0].cpu_map[cpu] = cpu; ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); ptrs = alloc_bootmem(ptrs_size); @@ -1497,7 +1509,7 @@ void __init setup_per_cpu_areas(void) free_bootmem(__pa(ptrs[cpu] + size_sum), PCPU_CHUNK_SIZE - size_sum); - memcpy(ptrs[cpu], __per_cpu_load, static_size); + memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); } /* allocate address and map */ @@ -1514,9 +1526,7 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(static_size, - PERCPU_MODULE_RESERVE, dyn_size, - PCPU_CHUNK_SIZE, vm.addr, NULL); + pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); free_bootmem(__pa(ptrs), ptrs_size); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 660cde133141..db5f9c49fec5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -161,9 +161,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - size_t unit_map_size, unit_size; - int *unit_map; - int nr_units; + struct pcpu_alloc_info *ai; ssize_t ret; /* on non-NUMA, embedding is better */ @@ -177,26 +175,22 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } /* allocate and build unit_map */ - unit_map_size = nr_cpu_ids * sizeof(int); - unit_map = alloc_bootmem_nopanic(unit_map_size); - if (!unit_map) { - pr_warning("PERCPU: failed to allocate unit_map\n"); - return -ENOMEM; + ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, + PMD_SIZE, pcpu_lpage_cpu_distance); + if (IS_ERR(ai)) { + pr_warning("PERCPU: failed to build unit_map (%ld)\n", + PTR_ERR(ai)); + return PTR_ERR(ai); } - ret = pcpu_lpage_build_unit_map(PERCPU_FIRST_CHUNK_RESERVE, - &dyn_size, &unit_size, PMD_SIZE, - unit_map, pcpu_lpage_cpu_distance); - if (ret < 0) { - pr_warning("PERCPU: failed to build unit_map\n"); - goto out_free; - } - nr_units = ret; - /* do the parameters look okay? */ if (!chosen) { size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = nr_units * unit_size; + size_t tot_size = 0; + int group; + + for (group = 0; group < ai->nr_groups; group++) + tot_size += ai->unit_size * ai->groups[group].nr_units; /* don't consume more than 20% of vmalloc area */ if (tot_size > vm_size / 5) { @@ -207,12 +201,10 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) } } - ret = pcpu_lpage_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - unit_size, PMD_SIZE, unit_map, nr_units, - pcpu_fc_alloc, pcpu_fc_free, pcpul_map); + ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, + pcpul_map); out_free: - if (ret < 0) - free_bootmem(__pa(unit_map), unit_map_size); + pcpu_free_alloc_info(ai); return ret; } #else diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 570fb18de2ba..77b86be8ce4f 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -59,6 +59,25 @@ extern void *pcpu_base_addr; extern const int *pcpu_unit_map; +struct pcpu_group_info { + int nr_units; /* aligned # of units */ + unsigned long base_offset; /* base address offset */ + unsigned int *cpu_map; /* unit->cpu map, empty + * entries contain NR_CPUS */ +}; + +struct pcpu_alloc_info { + size_t static_size; + size_t reserved_size; + size_t dyn_size; + size_t unit_size; + size_t atom_size; + size_t alloc_size; + size_t __ai_size; /* internal, don't use */ + int nr_groups; /* 0 if grouping unnecessary */ + struct pcpu_group_info groups[]; +}; + enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, @@ -78,18 +97,17 @@ typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_build_unit_map( - size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -#endif +extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units); +extern void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai); -extern size_t __init pcpu_setup_first_chunk( - size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map); +extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn); + +extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK extern ssize_t __init pcpu_embed_first_chunk( @@ -106,9 +124,7 @@ extern ssize_t __init pcpu_page_first_chunk( #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK extern ssize_t __init pcpu_lpage_first_chunk( - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, + const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 2b9c4b2a2fc0..99f7fa682722 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -58,6 +58,7 @@ #include #include +#include #include #include #include @@ -1245,53 +1246,108 @@ static inline size_t pcpu_calc_fc_sizes(size_t static_size, return size_sum; } -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK /** - * pcpu_lpage_build_unit_map - build unit_map for large page remapping - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_sizep: in/out parameter for dynamic size, -1 for auto - * @unit_sizep: out parameter for unit size - * @unit_map: unit_map to be filled - * @cpu_distance_fn: callback to determine distance between cpus + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units * - * This function builds cpu -> unit map and determine other parameters - * considering needed percpu size, large page size and distances - * between CPUs in NUMA. - * - * CPUs which are of LOCAL_DISTANCE both ways are grouped together and - * may share units in the same large page. The returned configuration - * is guaranteed to have CPUs on different nodes on different large - * pages and >=75% usage of allocated virtual address space. + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. * * RETURNS: - * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and - * returns the number of units to be allocated. -errno on failure. + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. */ -int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, - size_t *unit_sizep, size_t lpage_size, - int *unit_map, - pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; - int group_cnt_max = 0; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ - int last_allocs; + int last_allocs, group, unit; unsigned int cpu, tcpu; - int group, unit; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; /* * Determine min_unit_size, alloc_size and max_upa such that - * alloc_size is multiple of lpage_size and is the smallest + * alloc_size is multiple of atom_size and is the smallest * which can accomodate 4k aligned segments which are equal to * or larger than min_unit_size. */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep); + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - alloc_size = roundup(min_unit_size, lpage_size); + alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) upa--; @@ -1304,10 +1360,11 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; - if (group_map[tcpu] == group && + if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; + nr_groups = max(nr_groups, group + 1); goto next_group; } } @@ -1328,7 +1385,7 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) continue; - for (group = 0; group_cnt[group]; group++) { + for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; @@ -1348,75 +1405,122 @@ int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep, last_allocs = allocs; best_upa = upa; } - *unit_sizep = alloc_size / best_upa; + upa = best_upa; - /* assign units to cpus accordingly */ - unit = 0; - for (group = 0; group_cnt[group]; group++) { - for_each_possible_cpu(cpu) - if (group_map[cpu] == group) - unit_map[cpu] = unit++; - unit = roundup(unit, best_upa); + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); } - return unit; /* unit contains aligned number of units */ + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; } -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, - unsigned int *cpup); - -static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, - size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units) +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) { - int width = 1, v = nr_units; + int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; - int upl, lpl; /* units per lpage, lpage per line */ - unsigned int cpu; - int lpage, unit; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + v = ai->nr_groups; while (v /= 10) - width++; - empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0'; + group_width++; - upl = max_t(int, lpage_size / unit_size, 1); - lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1)); + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; - printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl, - static_size, reserved_size, dyn_size, unit_size, lpage_size); + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); - for (lpage = 0, unit = 0; unit < nr_units; unit++) { - if (!(unit % upl)) { - if (!(lpage++ % lpl)) { + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { printk("\n"); - printk("%spcpu-lpage: ", lvl); - } else - printk("| "); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); } - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) - printk("%0*d ", width, cpu); - else - printk("%s ", empty_str); } printk("\n"); } -#endif /** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes, 0 for none - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE + * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address - * @unit_map: cpu -> unit map, NULL for sequential mapping * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area * setup path. * - * @reserved_size, if non-zero, specifies the amount of bytes to + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1424,13 +1528,26 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size determines the number of bytes available for dynamic - * allocation in the first chunk. The area between @static_size + - * @reserved_size + @dyn_size and @unit_size is unused. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size specifies unit size and must be aligned to PAGE_SIZE and - * equal to or larger than @static_size + @reserved_size + if - * non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. + * + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. + * + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. @@ -1446,70 +1563,63 @@ static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size, * The determined pcpu_unit_size which can be used to initialize * percpu access. */ -size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, - size_t dyn_size, size_t unit_size, - void *base_addr, const int *unit_map) +size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + dyn_size; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; - unsigned int cpu, tcpu; - int i; + unsigned int cpu; + int *unit_map; + int group, unit, i; /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); + BUG_ON(ai->nr_groups <= 0); + BUG_ON(!ai->static_size); BUG_ON(!base_addr); - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); + BUG_ON(ai->unit_size < size_sum); + BUG_ON(ai->unit_size & ~PAGE_MASK); + BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + pcpu_dump_alloc_info(KERN_DEBUG, ai); /* determine number of units and verify and initialize pcpu_unit_map */ - if (unit_map) { - int first_unit = INT_MAX, last_unit = INT_MIN; + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); - for_each_possible_cpu(cpu) { - int unit = unit_map[cpu]; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = NR_CPUS; + pcpu_first_unit_cpu = NR_CPUS; - BUG_ON(unit < 0); - for_each_possible_cpu(tcpu) { - if (tcpu == cpu) - break; - /* the mapping should be one-to-one */ - BUG_ON(unit_map[tcpu] == unit); - } + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; - if (unit < first_unit) { + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; + + BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); + BUG_ON(unit_map[cpu] != NR_CPUS); + + unit_map[cpu] = unit + i; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; - first_unit = unit; - } - if (unit > last_unit) { - pcpu_last_unit_cpu = cpu; - last_unit = unit; - } } - pcpu_nr_units = last_unit + 1; - pcpu_unit_map = unit_map; - } else { - int *identity_map; - - /* #units == #cpus, identity mapped */ - identity_map = alloc_bootmem(nr_cpu_ids * - sizeof(identity_map[0])); - - for_each_possible_cpu(cpu) - identity_map[cpu] = cpu; - - pcpu_first_unit_cpu = 0; - pcpu_last_unit_cpu = pcpu_nr_units - 1; - pcpu_nr_units = nr_cpu_ids; - pcpu_unit_map = identity_map; } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + BUG_ON(unit_map[cpu] == NR_CPUS); + + pcpu_unit_map = unit_map; /* determine basic parameters */ - pcpu_unit_pages = unit_size >> PAGE_SHIFT; + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + @@ -1543,17 +1653,17 @@ size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size, schunk->immutable = true; bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; @@ -1643,44 +1753,47 @@ early_param("percpu_alloc", percpu_alloc_setup); */ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t size_sum, unit_size, chunk_size; + struct pcpu_alloc_info *ai; + size_t size_sum, chunk_size; void *base; - unsigned int cpu; + int unit; + ssize_t ret; - /* determine parameters and allocate */ - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - chunk_size = unit_size * nr_cpu_ids; + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + chunk_size = ai->unit_size * num_possible_cpus(); base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - return -ENOMEM; + ret = -ENOMEM; + goto out_free_ai; } /* return the leftover and copy */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) { - void *ptr = base + cpu * unit_size; + for (unit = 0; unit < num_possible_cpus(); unit++) { + void *ptr = base + unit * ai->unit_size; - if (cpu_possible(cpu)) { - free_bootmem(__pa(ptr + size_sum), - unit_size - size_sum); - memcpy(ptr, __per_cpu_load, static_size); - } else - free_bootmem(__pa(ptr), unit_size); + free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); + memcpy(ptr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size, - unit_size); + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); - return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, base, NULL); + ret = pcpu_setup_first_chunk(ai, base); +out_free_ai: + pcpu_free_alloc_info(ai); + return ret; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1709,31 +1822,34 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - ssize_t dyn_size = -1; - size_t size_sum, unit_size; + struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; - unsigned int cpu; - int i, j; + int unit, i, j; ssize_t ret; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); - size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); - unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); - unit_pages = unit_size >> PAGE_SHIFT; + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ - pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0])); + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); pages = alloc_bootmem(pages_size); /* allocate pages */ j = 0; - for_each_possible_cpu(cpu) + for (unit = 0; unit < num_possible_cpus(); unit++) for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); @@ -1747,18 +1863,18 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * unit_size; + vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); - for_each_possible_cpu(cpu) { + for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = - (unsigned long)vm.addr + cpu * unit_size; + (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages], + ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (ret < 0) panic("failed to map percpu area, err=%zd\n", ret); @@ -1772,16 +1888,15 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, */ /* copy static data */ - memcpy((void *)unit_addr, __per_cpu_load, static_size); + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, static_size, reserved_size, - dyn_size); + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, NULL); + ret = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -1790,6 +1905,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, ret = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); return ret; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1805,38 +1921,50 @@ static size_t pcpul_lpage_size; static int pcpul_nr_lpages; static struct pcpul_ent *pcpul_map; -static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, +static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, unsigned int *cpup) { - unsigned int cpu; + int group, cunit; - for_each_possible_cpu(cpu) - if (unit_map[cpu] == unit) { + for (group = 0, cunit = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + if (unit < cunit + gi->nr_units) { if (cpup) - *cpup = cpu; + *cpup = gi->cpu_map[unit - cunit]; return true; } + cunit += gi->nr_units; + } return false; } +static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) +{ + int group, unit, i; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + for (i = 0; i < gi->nr_units; i++) + if (gi->cpu_map[i] == cpu) + return unit + i; + } + BUG(); +} + /** * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes - * @unit_size: unit size in bytes - * @lpage_size: the size of a large page - * @unit_map: cpu -> unit mapping - * @nr_units: the number of units + * @ai: pcpu_alloc_info * @alloc_fn: function to allocate percpu lpage, always called with lpage_size * @free_fn: function to free percpu memory, @size <= lpage_size * @map_fn: function to map percpu lpage, always called with lpage_size * * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should always specify @dyn_size - * and @unit_size. These parameters along with @unit_map and - * @nr_units can be determined using pcpu_lpage_build_unit_map(). - * This two stage initialization is to allow arch code to evaluate the + * Unlike other helpers, the caller should provide fully initialized + * @ai. This can be done using pcpu_build_alloc_info(). This two + * stage initialization is to allow arch code to evaluate the * parameters before committing to it. * * Large pages are allocated as directed by @unit_map and other @@ -1852,27 +1980,26 @@ static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map, * The determined pcpu_unit_size which can be used to initialize * percpu access on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, - size_t unit_size, size_t lpage_size, - const int *unit_map, int nr_units, +ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; - const size_t static_size = __per_cpu_end - __per_cpu_start; - size_t chunk_size = unit_size * nr_units; - size_t map_size; + const size_t lpage_size = ai->atom_size; + size_t chunk_size, map_size; unsigned int cpu; ssize_t ret; - int i, j, unit; + int i, j, unit, nr_units; - pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, - unit_size, lpage_size, unit_map, nr_units); + nr_units = 0; + for (i = 0; i < ai->nr_groups; i++) + nr_units += ai->groups[i].nr_units; + chunk_size = ai->unit_size * nr_units; BUG_ON(chunk_size % lpage_size); - pcpul_size = static_size + reserved_size + dyn_size; + pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; pcpul_lpage_size = lpage_size; pcpul_nr_lpages = chunk_size / lpage_size; @@ -1883,13 +2010,13 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate all pages */ for (i = 0; i < pcpul_nr_lpages; i++) { size_t offset = i * lpage_size; - int first_unit = offset / unit_size; - int last_unit = (offset + lpage_size - 1) / unit_size; + int first_unit = offset / ai->unit_size; + int last_unit = (offset + lpage_size - 1) / ai->unit_size; void *ptr; /* find out which cpu is mapped to this unit */ for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) + if (pcpul_unit_to_cpu(unit, ai, &cpu)) goto found; continue; found: @@ -1905,12 +2032,12 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* return unused holes */ for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * unit_size; - size_t end = start + unit_size; + size_t start = unit * ai->unit_size; + size_t end = start + ai->unit_size; size_t off, next; /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, unit_map, NULL)) + if (pcpul_unit_to_cpu(unit, ai, NULL)) start += pcpul_size; /* unit can span more than one page, punch the holes */ @@ -1925,7 +2052,7 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, /* allocate address, map and copy */ vm.flags = VM_ALLOC; vm.size = chunk_size; - vm_area_register_early(&vm, unit_size); + vm_area_register_early(&vm, ai->unit_size); for (i = 0; i < pcpul_nr_lpages; i++) { if (!pcpul_map[i].ptr) @@ -1935,15 +2062,15 @@ ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size, } for_each_possible_cpu(cpu) - memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load, - static_size); + memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, + __per_cpu_load, ai->static_size); /* we're ready, commit */ pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, static_size, reserved_size, dyn_size, unit_size); + vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size); - ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size, - unit_size, vm.addr, unit_map); + ret = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped From fb435d5233f8b6f9b93c11d6304d8e98fed03234 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: [PATCH 0098/1662] percpu: add pcpu_unit_offsets[] Currently units are mapped sequentially into address space. This patch adds pcpu_unit_offsets[] which allows units to be mapped to arbitrary offsets from the chunk base address. This is necessary to allow sparse embedding which might would need to allocate address ranges and memory areas which aren't aligned to unit size but allocation atom size (page or large page size). This also simplifies things a bit by removing the need to calculate offset from unit number. With this change, there's no need for the arch code to know pcpu_unit_size. Update pcpu_setup_first_chunk() and first chunk allocators to return regular 0 or -errno return code instead of unit size or -errno. Signed-off-by: Tejun Heo Cc: David S. Miller --- arch/sparc/kernel/smp_64.c | 12 +++-- arch/x86/kernel/setup_percpu.c | 51 ++++++++---------- include/linux/percpu.h | 16 +++--- mm/percpu.c | 95 +++++++++++++++++----------------- 4 files changed, 84 insertions(+), 90 deletions(-) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index a42a4a744d14..b03fd362c629 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1478,9 +1478,10 @@ void __init setup_per_cpu_areas(void) static struct vm_struct vm; struct pcpu_alloc_info *ai; unsigned long delta, cpu; - size_t size_sum, pcpu_unit_size; + size_t size_sum; size_t ptrs_size; void **ptrs; + int rc; ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); @@ -1526,14 +1527,15 @@ void __init setup_per_cpu_areas(void) pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); } - pcpu_unit_size = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); + if (rc) + panic("failed to setup percpu first chunk (%d)", rc); free_bootmem(__pa(ptrs), ptrs_size); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; - for_each_possible_cpu(cpu) { - __per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; - } + for_each_possible_cpu(cpu) + __per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; /* Setup %g5 for the boot cpu. */ __local_per_cpu_offset = __per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index db5f9c49fec5..9becc5d4b518 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -157,12 +157,12 @@ static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; struct pcpu_alloc_info *ai; - ssize_t ret; + int rc; /* on non-NUMA, embedding is better */ if (!chosen && !pcpu_need_numa()) @@ -196,19 +196,18 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) if (tot_size > vm_size / 5) { pr_info("PERCPU: too large chunk size %zuMB for " "large page remap\n", tot_size >> 20); - ret = -EINVAL; + rc = -EINVAL; goto out_free; } } - ret = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, - pcpul_map); + rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); out_free: pcpu_free_alloc_info(ai); - return ret; + return rc; } #else -static ssize_t __init setup_pcpu_lpage(bool chosen) +static int __init setup_pcpu_lpage(bool chosen) { return -EINVAL; } @@ -222,7 +221,7 @@ static ssize_t __init setup_pcpu_lpage(bool chosen) * mapping so that it can use PMD mapping without additional TLB * pressure. */ -static ssize_t __init setup_pcpu_embed(bool chosen) +static int __init setup_pcpu_embed(bool chosen) { size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -250,7 +249,7 @@ static void __init pcpup_populate_pte(unsigned long addr) populate_extra_pte(addr); } -static ssize_t __init setup_pcpu_page(void) +static int __init setup_pcpu_page(void) { return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_fc_alloc, pcpu_fc_free, @@ -274,8 +273,7 @@ void __init setup_per_cpu_areas(void) { unsigned int cpu; unsigned long delta; - size_t pcpu_unit_size; - ssize_t ret; + int rc; pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); @@ -285,36 +283,33 @@ void __init setup_per_cpu_areas(void) * of large page mappings. Please read comments on top of * each allocator for details. */ - ret = -EINVAL; + rc = -EINVAL; if (pcpu_chosen_fc != PCPU_FC_AUTO) { if (pcpu_chosen_fc != PCPU_FC_PAGE) { if (pcpu_chosen_fc == PCPU_FC_LPAGE) - ret = setup_pcpu_lpage(true); + rc = setup_pcpu_lpage(true); else - ret = setup_pcpu_embed(true); + rc = setup_pcpu_embed(true); - if (ret < 0) - pr_warning("PERCPU: %s allocator failed (%zd), " + if (rc < 0) + pr_warning("PERCPU: %s allocator failed (%d), " "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], ret); + pcpu_fc_names[pcpu_chosen_fc], rc); } } else { - ret = setup_pcpu_lpage(false); - if (ret < 0) - ret = setup_pcpu_embed(false); + rc = setup_pcpu_lpage(false); + if (rc < 0) + rc = setup_pcpu_embed(false); } - if (ret < 0) - ret = setup_pcpu_page(); - if (ret < 0) - panic("cannot initialize percpu area (err=%zd)", ret); - - pcpu_unit_size = ret; + if (rc < 0) + rc = setup_pcpu_page(); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); /* alrighty, percpu areas up and running */ delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { - per_cpu_offset(cpu) = - delta + pcpu_unit_map[cpu] * pcpu_unit_size; + per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu]; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 77b86be8ce4f..a7ec840f596c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -57,7 +57,7 @@ #endif extern void *pcpu_base_addr; -extern const int *pcpu_unit_map; +extern const unsigned long *pcpu_unit_offsets; struct pcpu_group_info { int nr_units; /* aligned # of units */ @@ -106,25 +106,23 @@ extern struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn); -extern size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr); +extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern ssize_t __init pcpu_embed_first_chunk( - size_t reserved_size, ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, + ssize_t dyn_size); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -extern ssize_t __init pcpu_page_first_chunk( - size_t reserved_size, +extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif #ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern ssize_t __init pcpu_lpage_first_chunk( - const struct pcpu_alloc_info *ai, +extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_map_fn_t map_fn); diff --git a/mm/percpu.c b/mm/percpu.c index 99f7fa682722..653b02c40200 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -117,8 +117,8 @@ static unsigned int pcpu_last_unit_cpu __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* cpu -> unit map */ -const int *pcpu_unit_map __read_mostly; +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ /* * The first chunk which always exists. Note that unlike other @@ -196,8 +196,8 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, @@ -341,7 +341,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ - addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size; + addr += pcpu_unit_offsets[smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -1560,17 +1560,17 @@ static void pcpu_dump_alloc_info(const char *lvl, * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; @@ -1587,8 +1587,9 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and verify and initialize pcpu_unit_map */ + /* determine number of units and initialize unit_map and base */ unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = NR_CPUS; @@ -1606,6 +1607,8 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] != NR_CPUS); unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; } @@ -1617,6 +1620,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, BUG_ON(unit_map[cpu] == NR_CPUS); pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -1688,7 +1692,7 @@ size_t __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = schunk->vm->addr; - return pcpu_unit_size; + return 0; } const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { @@ -1748,16 +1752,15 @@ early_param("percpu_alloc", percpu_alloc_setup); * size, the leftover is returned to the bootmem allocator. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) { struct pcpu_alloc_info *ai; size_t size_sum, chunk_size; void *base; int unit; - ssize_t ret; + int rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); if (IS_ERR(ai)) @@ -1773,7 +1776,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) if (!base) { pr_warning("PERCPU: failed to allocate %zu bytes for " "embedding\n", chunk_size); - ret = -ENOMEM; + rc = -ENOMEM; goto out_free_ai; } @@ -1790,10 +1793,10 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, base); + rc = pcpu_setup_first_chunk(ai, base); out_free_ai: pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || !CONFIG_HAVE_SETUP_PER_CPU_AREA */ @@ -1813,13 +1816,12 @@ ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) * page-by-page into vmalloc area. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -1827,8 +1829,7 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j; - ssize_t ret; + int unit, i, j, rc; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -1874,10 +1875,10 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ - ret = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], - unit_pages); - if (ret < 0) - panic("failed to map percpu area, err=%zd\n", ret); + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local @@ -1896,17 +1897,17 @@ ssize_t __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); - ret = -ENOMEM; + rc = -ENOMEM; out_free_ar: free_bootmem(__pa(pages), pages_size); pcpu_free_alloc_info(ai); - return ret; + return rc; } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ @@ -1977,20 +1978,18 @@ static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) * pcpu_lpage_remapped(). * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) +int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_map_fn_t map_fn) { static struct vm_struct vm; const size_t lpage_size = ai->atom_size; size_t chunk_size, map_size; unsigned int cpu; - ssize_t ret; - int i, j, unit, nr_units; + int i, j, unit, nr_units, rc; nr_units = 0; for (i = 0; i < ai->nr_groups; i++) @@ -2070,7 +2069,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - ret = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_setup_first_chunk(ai, vm.addr); /* * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped @@ -2094,7 +2093,7 @@ ssize_t __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) pcpul_nr_lpages--; - return ret; + return rc; enomem: for (i = 0; i < pcpul_nr_lpages; i++) @@ -2166,21 +2165,21 @@ EXPORT_SYMBOL(__per_cpu_offset); void __init setup_per_cpu_areas(void) { - ssize_t unit_size; unsigned long delta; unsigned int cpu; + int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ - unit_size = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); - if (unit_size < 0) + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE); + if (rc < 0) panic("Failed to initialized percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) - __per_cpu_offset[cpu] = delta + cpu * unit_size; + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ From bba174f5e03a40a4ab1c63a2272ea5530b98a067 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:51 +0900 Subject: [PATCH 0099/1662] percpu: add chunk->base_addr The only thing percpu allocator wants to know about a vmalloc area is the base address. Instead of requiring chunk->vm, add chunk->base_addr which contains the necessary value. This simplifies the code a bit and makes the dummy first_vm unnecessary. This change will ease allowing a chunk to be mapped by multiple vms. Signed-off-by: Tejun Heo --- mm/percpu.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 653b02c40200..548624309f83 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,10 +94,11 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ + void *base_addr; /* base address of this chunk */ int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct vm_struct *vm; /* mapped vmalloc region */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -196,7 +197,7 @@ static int pcpu_page_idx(unsigned int cpu, int page_idx) static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + pcpu_unit_offsets[cpu] + + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } @@ -324,7 +325,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - void *first_start = pcpu_first_chunk->vm->addr; + void *first_start = pcpu_first_chunk->base_addr; /* is it in the first chunk? */ if (addr >= first_start && addr < first_start + pcpu_unit_size) { @@ -1014,6 +1015,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; + chunk->base_addr = chunk->vm->addr; return chunk; } @@ -1103,8 +1105,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) mutex_unlock(&pcpu_alloc_mutex); - /* return address relative to unit0 */ - return __addr_to_pcpu_ptr(chunk->vm->addr + off); + /* return address relative to base address */ + return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: spin_unlock_irq(&pcpu_lock); @@ -1213,7 +1215,7 @@ void free_percpu(void *ptr) spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; + off = addr - chunk->base_addr; pcpu_free_area(chunk, off); @@ -1565,7 +1567,6 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { - static struct vm_struct first_vm; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; @@ -1629,10 +1630,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - first_vm.addr = base_addr; - /* * Allocate chunk slots. The additional last slot is for * empty chunks. @@ -1651,7 +1648,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); - schunk->vm = &first_vm; + schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->immutable = true; @@ -1675,7 +1672,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); - dchunk->vm = &first_vm; + dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); dchunk->immutable = true; @@ -1691,7 +1688,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = schunk->vm->addr; + pcpu_base_addr = base_addr; return 0; } From cf88c79006bd6a09ad725ba0b34c0e23db20b19e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: [PATCH 0100/1662] vmalloc: separate out insert_vmalloc_vm() Separate out insert_vmalloc_vm() from __get_vm_area_node(). insert_vmalloc_vm() initializes vm_struct from vmap_area and inserts it into vmlist. insert_vmalloc_vm() only initializes fields which can be determined from @vm, @flags and @caller The rest should be initialized by the caller. For __get_vm_area_node(), all other fields just need to be cleared and this is done by using kzalloc instead of kmalloc. This will be used to implement pcpu_get_vm_areas(). Signed-off-by: Tejun Heo Cc: Nick Piggin --- mm/vmalloc.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f8189a4b3e13..2eb461c3a46e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1122,13 +1122,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + struct vm_struct *tmp, **p; + + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->private = vm; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) + break; + } + vm->next = *p; + *p = vm; + write_unlock(&vmlist_lock); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - struct vm_struct *tmp, **p; unsigned long align = 1; BUG_ON(in_interrupt()); @@ -1147,7 +1168,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1162,25 +1183,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - area->flags = flags; - area->addr = (void *)va->va_start; - area->size = size; - area->pages = NULL; - area->nr_pages = 0; - area->phys_addr = 0; - area->caller = caller; - va->private = area; - va->flags |= VM_VM_AREA; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= area->addr) - break; - } - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - + insert_vmalloc_vm(area, va, flags, caller); return area; } From ca23e405e06d5fffb005df004c72781f76062f51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: [PATCH 0101/1662] vmalloc: implement pcpu_get_vm_areas() To directly use spread NUMA memories for percpu units, percpu allocator will be updated to allow sparsely mapping units in a chunk. As the distances between units can be very large, this makes allocating single vmap area for each chunk undesirable. This patch implements pcpu_get_vm_areas() and pcpu_free_vm_areas() which allocates and frees sparse congruent vmap areas. pcpu_get_vm_areas() take @offsets and @sizes array which define distances and sizes of vmap areas. It scans down from the top of vmalloc area looking for the top-most address which can accomodate all the areas. The top-down scan is to avoid interacting with regular vmallocs which can push up these congruent areas up little by little ending up wasting address space and page table. To speed up top-down scan, the highest possible address hint is maintained. Although the scan is linear from the hint, given the usual large holes between memory addresses between NUMA nodes, the scanning is highly likely to finish after finding the first hole for the last unit which is scanned first. Signed-off-by: Tejun Heo Cc: Nick Piggin --- include/linux/vmalloc.h | 6 + mm/vmalloc.c | 293 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index a43ebec3a7b9..227c2a585e4f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock; extern struct vm_struct *vmlist; extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask); + +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); + #endif /* _LINUX_VMALLOC_H */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2eb461c3a46e..204b8243d8ab 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -265,6 +265,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -1038,6 +1048,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1821,6 +1834,286 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) From 6563297ceafab6bbcc931b52e2a9e660fbb21fb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: [PATCH 0102/1662] percpu: use group information to allocate vmap areas sparsely ai->groups[] contains which units need to be put consecutively and at what offset from the chunk base address. Compile this information into pcpu_group_offsets[] and pcpu_group_sizes[] in pcpu_setup_first_chunk() and use them to allocate sparse vm areas using pcpu_get_vm_areas(). This will be used to allow directly using sparse NUMA memories as percpu areas. Signed-off-by: Tejun Heo Cc: Nick Piggin --- mm/percpu.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 548624309f83..cc9c4c64606d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -98,7 +98,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct vm_struct *vm; /* mapped vmalloc region */ + struct vm_struct **vms; /* mapped vmalloc regions */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -106,7 +106,7 @@ struct pcpu_chunk { static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; static int pcpu_nr_units __read_mostly; -static int pcpu_chunk_size __read_mostly; +static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; @@ -121,6 +121,11 @@ EXPORT_SYMBOL_GPL(pcpu_base_addr); static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ +/* group information, used for vm allocation */ +static int pcpu_nr_groups __read_mostly; +static const unsigned long *pcpu_group_offsets __read_mostly; +static const size_t *pcpu_group_sizes __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -988,8 +993,8 @@ static void free_pcpu_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - if (chunk->vm) - free_vm_area(chunk->vm); + if (chunk->vms) + pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -1006,8 +1011,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); - if (!chunk->vm) { + chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, + GFP_KERNEL); + if (!chunk->vms) { free_pcpu_chunk(chunk); return NULL; } @@ -1015,7 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; - chunk->base_addr = chunk->vm->addr; + chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; return chunk; } @@ -1571,6 +1578,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *group_offsets; + size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; @@ -1588,7 +1597,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_dump_alloc_info(KERN_DEBUG, ai); - /* determine number of units and initialize unit_map and base */ + /* process group information and build config tables accordingly */ + group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); + group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); @@ -1599,6 +1610,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) @@ -1620,13 +1634,16 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, for_each_possible_cpu(cpu) BUG_ON(unit_map[cpu] == NR_CPUS); + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size; + pcpu_atom_size = ai->atom_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); From c8826dd538602d730ed2c18c6753f1bbfa6c4933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: [PATCH 0103/1662] percpu: update embedding first chunk allocator to handle sparse units Now that percpu core can handle very sparse units, given that vmalloc space is large enough, embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and to use alloc/free callbacks to allocate node specific areas for each group and use them for the first chunk. This brings the benefits of embedding allocator to NUMA configurations - no extra TLB pressure with the flexibility of unified dynamic allocator and no need to restructure arch code to build memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large page for dynamic chunks is also easily possible with falling back to reuglar pages if large allocation fails. Embedding allocator users are converted to specify NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them. Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 4 +- include/linux/percpu.h | 7 +- mm/percpu.c | 113 +++++++++++++++++++++++++-------- 3 files changed, 93 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9becc5d4b518..67f6314de9f1 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -234,7 +234,9 @@ static int __init setup_pcpu_embed(bool chosen) return -EINVAL; return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE); + reserve - PERCPU_FIRST_CHUNK_RESERVE, + PAGE_SIZE, NULL, pcpu_fc_alloc, + pcpu_fc_free); } /* diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a7ec840f596c..25359932740e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,8 +110,11 @@ extern int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr); #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK -extern int __init pcpu_embed_first_chunk(size_t reserved_size, - ssize_t dyn_size); +extern int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK diff --git a/mm/percpu.c b/mm/percpu.c index cc9c4c64606d..c2826d05505c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup); * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than * specified to fill page alignment. When @dyn_size is auto, @@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup); * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. */ -int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { + void *base = (void *)ULONG_MAX; + void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, chunk_size; - void *base; - int unit; - int rc; + size_t size_sum, areas_size; + int group, i, rc; - ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL); + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); - BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; - chunk_size = ai->unit_size * num_possible_cpus(); + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); - base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, - __pa(MAX_DMA_ADDRESS)); - if (!base) { - pr_warning("PERCPU: failed to allocate %zu bytes for " - "embedding\n", chunk_size); + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { rc = -ENOMEM; - goto out_free_ai; + goto out_free; } - /* return the leftover and copy */ - for (unit = 0; unit < num_possible_cpus(); unit++) { - void *ptr = base + unit * ai->unit_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; - free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum); - memcpy(ptr, __per_cpu_load, ai->static_size); + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; + + base = min(ptr, base); + + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } } - /* we're ready, commit */ + /* base address is now known, determine group base offsets */ + for (group = 0; group < ai->nr_groups; group++) + ai->groups[group].base_offset = areas[group] - base; + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); -out_free_ai: + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); return rc; } #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || @@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -2188,7 +2244,8 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE); + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialized percpu areas."); From 4518e6a0c038b98be4c480e6f4481e8676bd15dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:52 +0900 Subject: [PATCH 0104/1662] x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA Embedding percpu first chunk allocator can now handle very sparse unit mapping. Use embedding allocator instead of lpage for 64bit NUMA. This removes extra TLB pressure and the need to do complex and fragile dancing when changing page attributes. For 32bit, using very sparse unit mapping isn't a good idea because the vmalloc space is very constrained. 32bit NUMA machines aren't exactly the focus of optimization and it isn't very clear whether lpage performs better than page. Use page first chunk allocator for 32bit NUMAs. As this leaves setup_pcpu_*() functions pretty much empty, fold them into setup_per_cpu_areas(). Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Andi Kleen --- arch/x86/Kconfig | 4 - arch/x86/kernel/setup_percpu.c | 153 ++++++--------------------------- 2 files changed, 27 insertions(+), 130 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f7ac27215512..869d7d301448 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -156,10 +156,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config NEED_PER_CPU_LPAGE_FIRST_CHUNK - def_bool y - depends on NEED_MULTIPLE_NODES - config HAVE_CPUMASK_OF_CPU_MAP def_bool X86_64_SMP diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 67f6314de9f1..d559af913e1f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif +#ifdef CONFIG_X86_32 /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void) #endif return false; } +#endif /** * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu @@ -136,128 +138,23 @@ static void __init pcpu_fc_free(void *ptr, size_t size) free_bootmem(__pa(ptr), size); } -/* - * Large page remapping allocator - */ +static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) +{ #ifdef CONFIG_NEED_MULTIPLE_NODES -static void __init pcpul_map(void *ptr, size_t size, void *addr) -{ - pmd_t *pmd, pmd_v; - - pmd = populate_extra_pmd((unsigned long)addr); - pmd_v = pfn_pmd(page_to_pfn(virt_to_page(ptr)), PAGE_KERNEL_LARGE); - set_pmd(pmd, pmd_v); -} - -static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to) -{ if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; -} - -static int __init setup_pcpu_lpage(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE; - struct pcpu_alloc_info *ai; - int rc; - - /* on non-NUMA, embedding is better */ - if (!chosen && !pcpu_need_numa()) - return -EINVAL; - - /* need PSE */ - if (!cpu_has_pse) { - pr_warning("PERCPU: lpage allocator requires PSE\n"); - return -EINVAL; - } - - /* allocate and build unit_map */ - ai = pcpu_build_alloc_info(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, - PMD_SIZE, pcpu_lpage_cpu_distance); - if (IS_ERR(ai)) { - pr_warning("PERCPU: failed to build unit_map (%ld)\n", - PTR_ERR(ai)); - return PTR_ERR(ai); - } - - /* do the parameters look okay? */ - if (!chosen) { - size_t vm_size = VMALLOC_END - VMALLOC_START; - size_t tot_size = 0; - int group; - - for (group = 0; group < ai->nr_groups; group++) - tot_size += ai->unit_size * ai->groups[group].nr_units; - - /* don't consume more than 20% of vmalloc area */ - if (tot_size > vm_size / 5) { - pr_info("PERCPU: too large chunk size %zuMB for " - "large page remap\n", tot_size >> 20); - rc = -EINVAL; - goto out_free; - } - } - - rc = pcpu_lpage_first_chunk(ai, pcpu_fc_alloc, pcpu_fc_free, pcpul_map); -out_free: - pcpu_free_alloc_info(ai); - return rc; -} #else -static int __init setup_pcpu_lpage(bool chosen) -{ - return -EINVAL; -} + return LOCAL_DISTANCE; #endif - -/* - * Embedding allocator - * - * The first chunk is sized to just contain the static area plus - * module and dynamic reserves and embedded into linear physical - * mapping so that it can use PMD mapping without additional TLB - * pressure. - */ -static int __init setup_pcpu_embed(bool chosen) -{ - size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; - - /* - * If large page isn't supported, there's no benefit in doing - * this. Also, embedding allocation doesn't play well with - * NUMA. - */ - if (!chosen && (!cpu_has_pse || pcpu_need_numa())) - return -EINVAL; - - return pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - reserve - PERCPU_FIRST_CHUNK_RESERVE, - PAGE_SIZE, NULL, pcpu_fc_alloc, - pcpu_fc_free); } -/* - * Page allocator - * - * Boring fallback 4k page allocator. This allocator puts more - * pressure on PTE TLBs but other than that behaves nicely on both UMA - * and NUMA. - */ static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); } -static int __init setup_pcpu_page(void) -{ - return pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_fc_alloc, pcpu_fc_free, - pcpup_populate_pte); -} - static inline void setup_percpu_segment(int cpu) { #ifdef CONFIG_X86_32 @@ -281,30 +178,34 @@ void __init setup_per_cpu_areas(void) NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); /* - * Allocate percpu area. If PSE is supported, try to make use - * of large page mappings. Please read comments on top of - * each allocator for details. + * Allocate percpu area. Embedding allocator is our favorite; + * however, on NUMA configurations, it can result in very + * sparse unit mapping and vmalloc area isn't spacious enough + * on 32bit. Use page in that case. */ +#ifdef CONFIG_X86_32 + if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif rc = -EINVAL; - if (pcpu_chosen_fc != PCPU_FC_AUTO) { - if (pcpu_chosen_fc != PCPU_FC_PAGE) { - if (pcpu_chosen_fc == PCPU_FC_LPAGE) - rc = setup_pcpu_lpage(true); - else - rc = setup_pcpu_embed(true); + if (pcpu_chosen_fc != PCPU_FC_PAGE) { + const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE; + const size_t dyn_size = PERCPU_MODULE_RESERVE + + PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; - if (rc < 0) - pr_warning("PERCPU: %s allocator failed (%d), " - "falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); - } - } else { - rc = setup_pcpu_lpage(false); + rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + dyn_size, atom_size, + pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - rc = setup_pcpu_embed(false); + pr_warning("PERCPU: %s allocator failed (%d), " + "falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) - rc = setup_pcpu_page(); + rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_fc_alloc, pcpu_fc_free, + pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); From e933a73f48e3b2d40cfa56d81e2646f194b5a66a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: [PATCH 0105/1662] percpu: kill lpage first chunk allocator With x86 converted to embedding allocator, lpage doesn't have any user left. Kill it along with cpa handling code. Signed-off-by: Tejun Heo Cc: Jan Beulich --- Documentation/kernel-parameters.txt | 10 +- arch/x86/mm/pageattr.c | 20 +-- include/linux/percpu.h | 16 -- mm/percpu.c | 241 ---------------------------- 4 files changed, 6 insertions(+), 281 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index dee9ce2e6cfa..e710093e3d32 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1920,11 +1920,11 @@ and is between 256 and 4096 characters. It is defined in the file See arch/parisc/kernel/pdc_chassis.c percpu_alloc= Select which percpu first chunk allocator to use. - Currently supported values are "embed", "page" and - "lpage". Archs may support subset or none of the - selections. See comments in mm/percpu.c for details - on each allocator. This parameter is primarily for - debugging and performance comparison. + Currently supported values are "embed" and "page". + Archs may support subset or none of the selections. + See comments in mm/percpu.c for details on each + allocator. This parameter is primarily for debugging + and performance comparison. pf. [PARIDE] See Documentation/blockdev/paride.txt. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dce282f65700..f53cfc7f963d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -687,7 +687,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT); - unsigned long vaddr, remapped; + unsigned long vaddr; int ret; if (cpa->pfn >= max_pfn_mapped) @@ -745,24 +745,6 @@ static int cpa_process_alias(struct cpa_data *cpa) } #endif - /* - * If the PMD page was partially used for per-cpu remapping, - * the recycled area needs to be split and modified. Because - * the area is always proper subset of a PMD page - * cpa->numpages is guaranteed to be 1 for these areas, so - * there's no need to loop over and check for further remaps. - */ - remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr); - if (remapped) { - WARN_ON(cpa->numpages > 1); - alias_cpa = *cpa; - alias_cpa.vaddr = &remapped; - alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); - ret = __change_page_attr_set_clr(&alias_cpa, 0); - if (ret) - return ret; - } - return 0; } diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 25359932740e..878836ca999c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -82,7 +82,6 @@ enum pcpu_fc { PCPU_FC_AUTO, PCPU_FC_EMBED, PCPU_FC_PAGE, - PCPU_FC_LPAGE, PCPU_FC_NR, }; @@ -95,7 +94,6 @@ typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); -typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units); @@ -124,20 +122,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -extern int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn); - -extern void *pcpu_lpage_remapped(void *kaddr); -#else -static inline void *pcpu_lpage_remapped(void *kaddr) -{ - return NULL; -} -#endif - /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's diff --git a/mm/percpu.c b/mm/percpu.c index c2826d05505c..77933928107d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1713,7 +1713,6 @@ const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", - [PCPU_FC_LPAGE] = "lpage", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; @@ -1729,10 +1728,6 @@ static int __init percpu_alloc_setup(char *str) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; -#endif -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK - else if (!strcmp(str, "lpage")) - pcpu_chosen_fc = PCPU_FC_LPAGE; #endif else pr_warning("PERCPU: unknown allocator %s specified\n", str); @@ -1970,242 +1965,6 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ -#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK -struct pcpul_ent { - void *ptr; - void *map_addr; -}; - -static size_t pcpul_size; -static size_t pcpul_lpage_size; -static int pcpul_nr_lpages; -static struct pcpul_ent *pcpul_map; - -static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai, - unsigned int *cpup) -{ - int group, cunit; - - for (group = 0, cunit = 0; group < ai->nr_groups; group++) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - if (unit < cunit + gi->nr_units) { - if (cpup) - *cpup = gi->cpu_map[unit - cunit]; - return true; - } - cunit += gi->nr_units; - } - - return false; -} - -static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai) -{ - int group, unit, i; - - for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { - const struct pcpu_group_info *gi = &ai->groups[group]; - - for (i = 0; i < gi->nr_units; i++) - if (gi->cpu_map[i] == cpu) - return unit + i; - } - BUG(); -} - -/** - * pcpu_lpage_first_chunk - remap the first percpu chunk using large page - * @ai: pcpu_alloc_info - * @alloc_fn: function to allocate percpu lpage, always called with lpage_size - * @free_fn: function to free percpu memory, @size <= lpage_size - * @map_fn: function to map percpu lpage, always called with lpage_size - * - * This allocator uses large page to build and map the first chunk. - * Unlike other helpers, the caller should provide fully initialized - * @ai. This can be done using pcpu_build_alloc_info(). This two - * stage initialization is to allow arch code to evaluate the - * parameters before committing to it. - * - * Large pages are allocated as directed by @unit_map and other - * parameters and mapped to vmalloc space. Unused holes are returned - * to the page allocator. Note that these holes end up being actively - * mapped twice - once to the physical mapping and to the vmalloc area - * for the first percpu chunk. Depending on architecture, this might - * cause problem when changing page attributes of the returned area. - * These double mapped areas can be detected using - * pcpu_lpage_remapped(). - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_map_fn_t map_fn) -{ - static struct vm_struct vm; - const size_t lpage_size = ai->atom_size; - size_t chunk_size, map_size; - unsigned int cpu; - int i, j, unit, nr_units, rc; - - nr_units = 0; - for (i = 0; i < ai->nr_groups; i++) - nr_units += ai->groups[i].nr_units; - - chunk_size = ai->unit_size * nr_units; - BUG_ON(chunk_size % lpage_size); - - pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size; - pcpul_lpage_size = lpage_size; - pcpul_nr_lpages = chunk_size / lpage_size; - - /* allocate pointer array and alloc large pages */ - map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); - pcpul_map = alloc_bootmem(map_size); - - /* allocate all pages */ - for (i = 0; i < pcpul_nr_lpages; i++) { - size_t offset = i * lpage_size; - int first_unit = offset / ai->unit_size; - int last_unit = (offset + lpage_size - 1) / ai->unit_size; - void *ptr; - - /* find out which cpu is mapped to this unit */ - for (unit = first_unit; unit <= last_unit; unit++) - if (pcpul_unit_to_cpu(unit, ai, &cpu)) - goto found; - continue; - found: - ptr = alloc_fn(cpu, lpage_size, lpage_size); - if (!ptr) { - pr_warning("PERCPU: failed to allocate large page " - "for cpu%u\n", cpu); - goto enomem; - } - - pcpul_map[i].ptr = ptr; - } - - /* return unused holes */ - for (unit = 0; unit < nr_units; unit++) { - size_t start = unit * ai->unit_size; - size_t end = start + ai->unit_size; - size_t off, next; - - /* don't free used part of occupied unit */ - if (pcpul_unit_to_cpu(unit, ai, NULL)) - start += pcpul_size; - - /* unit can span more than one page, punch the holes */ - for (off = start; off < end; off = next) { - void *ptr = pcpul_map[off / lpage_size].ptr; - next = min(roundup(off + 1, lpage_size), end); - if (ptr) - free_fn(ptr + off % lpage_size, next - off); - } - } - - /* allocate address, map and copy */ - vm.flags = VM_ALLOC; - vm.size = chunk_size; - vm_area_register_early(&vm, ai->unit_size); - - for (i = 0; i < pcpul_nr_lpages; i++) { - if (!pcpul_map[i].ptr) - continue; - pcpul_map[i].map_addr = vm.addr + i * lpage_size; - map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr); - } - - for_each_possible_cpu(cpu) - memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size, - __per_cpu_load, ai->static_size); - - /* we're ready, commit */ - pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n", - vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size, - ai->unit_size); - - rc = pcpu_setup_first_chunk(ai, vm.addr); - - /* - * Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped - * lpages are pushed to the end and trimmed. - */ - for (i = 0; i < pcpul_nr_lpages - 1; i++) - for (j = i + 1; j < pcpul_nr_lpages; j++) { - struct pcpul_ent tmp; - - if (!pcpul_map[j].ptr) - continue; - if (pcpul_map[i].ptr && - pcpul_map[i].ptr < pcpul_map[j].ptr) - continue; - - tmp = pcpul_map[i]; - pcpul_map[i] = pcpul_map[j]; - pcpul_map[j] = tmp; - } - - while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr) - pcpul_nr_lpages--; - - return rc; - -enomem: - for (i = 0; i < pcpul_nr_lpages; i++) - if (pcpul_map[i].ptr) - free_fn(pcpul_map[i].ptr, lpage_size); - free_bootmem(__pa(pcpul_map), map_size); - return -ENOMEM; -} - -/** - * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area - * @kaddr: the kernel address in question - * - * Determine whether @kaddr falls in the pcpul recycled area. This is - * used by pageattr to detect VM aliases and break up the pcpu large - * page mapping such that the same physical page is not mapped under - * different attributes. - * - * The recycled area is always at the tail of a partially used large - * page. - * - * RETURNS: - * Address of corresponding remapped pcpu address if match is found; - * otherwise, NULL. - */ -void *pcpu_lpage_remapped(void *kaddr) -{ - unsigned long lpage_mask = pcpul_lpage_size - 1; - void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask); - unsigned long offset = (unsigned long)kaddr & lpage_mask; - int left = 0, right = pcpul_nr_lpages - 1; - int pos; - - /* pcpul in use at all? */ - if (!pcpul_map) - return NULL; - - /* okay, perform binary search */ - while (left <= right) { - pos = (left + right) / 2; - - if (pcpul_map[pos].ptr < lpage_addr) - left = pos + 1; - else if (pcpul_map[pos].ptr > lpage_addr) - right = pos - 1; - else - return pcpul_map[pos].map_addr + offset; - } - - return NULL; -} -#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */ - /* * Generic percpu area setup. * From bcb2107fdbecef3de55d597d23453747af81ba88 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: [PATCH 0106/1662] sparc64: use embedding percpu first chunk allocator sparc64 currently allocates a large page for each cpu and partially remap them into vmalloc area much like what lpage first chunk allocator did. As a 4M page is used for each cpu, this results in very large unit size and also adds TLB pressure due to the double mapping of pages in the first chunk. This patch converts sparc64 to use the embedding percpu first chunk allocator which now knows how to handle NUMA configurations. This simplifies the code a lot, doesn't incur any extra TLB pressure and results in better utilization of address space. Signed-off-by: Tejun Heo Acked-by: David S. Miller --- arch/sparc/Kconfig | 3 + arch/sparc/kernel/smp_64.c | 128 ++++++------------------------------- 2 files changed, 21 insertions(+), 110 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 4f6ed0f113f0..fbd1233b392d 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -95,6 +95,9 @@ config AUDIT_ARCH config HAVE_SETUP_PER_CPU_AREA def_bool y if SPARC64 +config NEED_PER_CPU_EMBED_FIRST_CHUNK + def_bool y if SPARC64 + config GENERIC_HARDIRQS_NO__DO_IRQ bool def_bool y if SPARC64 diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index b03fd362c629..ff68373ce6d6 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1389,8 +1389,8 @@ void smp_send_stop(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, + size_t align) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NEED_MULTIPLE_NODES @@ -1415,123 +1415,31 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, #endif } -#define PCPU_CHUNK_SIZE (4UL * 1024UL * 1024UL) - -static void __init pcpu_map_range(unsigned long start, unsigned long end, - struct page *page) +static void __init pcpu_free_bootmem(void *ptr, size_t size) { - unsigned long pfn = page_to_pfn(page); - unsigned long pte_base; + free_bootmem(__pa(ptr), size); +} - BUG_ON((pfn< end) - this_end = end; - - while (start < this_end) { - unsigned long paddr = pfn << PAGE_SHIFT; - - pte_val(*pte) = (paddr | pte_base); - - start += PAGE_SIZE; - pte++; - pfn++; - } - } +static int pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; } void __init setup_per_cpu_areas(void) { - static struct vm_struct vm; - struct pcpu_alloc_info *ai; - unsigned long delta, cpu; - size_t size_sum; - size_t ptrs_size; - void **ptrs; + unsigned long delta; + unsigned int cpu; int rc; - ai = pcpu_alloc_alloc_info(1, nr_cpu_ids); - - ai->static_size = __per_cpu_end - __per_cpu_start; - ai->reserved_size = PERCPU_MODULE_RESERVE; - - size_sum = PFN_ALIGN(ai->static_size + ai->reserved_size + - PERCPU_DYNAMIC_RESERVE); - - ai->dyn_size = size_sum - ai->static_size - ai->reserved_size; - ai->unit_size = PCPU_CHUNK_SIZE; - ai->atom_size = PCPU_CHUNK_SIZE; - ai->alloc_size = PCPU_CHUNK_SIZE; - ai->groups[0].nr_units = nr_cpu_ids; - - for_each_possible_cpu(cpu) - ai->groups[0].cpu_map[cpu] = cpu; - - ptrs_size = PFN_ALIGN(nr_cpu_ids * sizeof(ptrs[0])); - ptrs = alloc_bootmem(ptrs_size); - - for_each_possible_cpu(cpu) { - ptrs[cpu] = pcpu_alloc_bootmem(cpu, PCPU_CHUNK_SIZE, - PCPU_CHUNK_SIZE); - - free_bootmem(__pa(ptrs[cpu] + size_sum), - PCPU_CHUNK_SIZE - size_sum); - - memcpy(ptrs[cpu], __per_cpu_load, ai->static_size); - } - - /* allocate address and map */ - vm.flags = VM_ALLOC; - vm.size = nr_cpu_ids * PCPU_CHUNK_SIZE; - vm_area_register_early(&vm, PCPU_CHUNK_SIZE); - - for_each_possible_cpu(cpu) { - unsigned long start = (unsigned long) vm.addr; - unsigned long end; - - start += cpu * PCPU_CHUNK_SIZE; - end = start + PCPU_CHUNK_SIZE; - pcpu_map_range(start, end, virt_to_page(ptrs[cpu])); - } - - rc = pcpu_setup_first_chunk(ai, vm.addr); + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, 4 << 20, + pcpu_cpu_distance, pcpu_alloc_bootmem, + pcpu_free_bootmem); if (rc) - panic("failed to setup percpu first chunk (%d)", rc); - - free_bootmem(__pa(ptrs), ptrs_size); + panic("failed to initialize first chunk (%d)", rc); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) From c2a7e818019f20a5cf7fb26a6eb59e212e6c0cd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 14 Aug 2009 15:00:53 +0900 Subject: [PATCH 0107/1662] powerpc64: convert to dynamic percpu allocator Now that percpu allows arbitrary embedding of the first chunk, powerpc64 can easily be converted to dynamic percpu allocator. Convert it. powerpc supports several large page sizes. Cap atom_size at 1M. There isn't much to gain by going above that anyway. Signed-off-by: Tejun Heo Cc: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 4 +-- arch/powerpc/kernel/setup_64.c | 57 +++++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 61bbffa2fe60..2c42e1526d03 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -46,10 +46,10 @@ config GENERIC_HARDIRQS_NO__DO_IRQ bool default y -config HAVE_LEGACY_PER_CPU_AREA +config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 -config HAVE_SETUP_PER_CPU_AREA +config NEED_PER_CPU_EMBED_FIRST_CHUNK def_bool PPC64 config IRQ_PER_CPU diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1f6816003ebe..aa6e4500635f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include @@ -569,25 +570,53 @@ void cpu_die(void) } #ifdef CONFIG_SMP +#define PCPU_DYN_SIZE () + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +{ + return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +static int pcpu_cpu_distance(unsigned int from, unsigned int to) +{ + if (cpu_to_node(from) == cpu_to_node(to)) + return LOCAL_DISTANCE; + else + return REMOTE_DISTANCE; +} + void __init setup_per_cpu_areas(void) { - int i; - unsigned long size; - char *ptr; + const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; + size_t atom_size; + unsigned long delta; + unsigned int cpu; + int rc; - /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif + /* + * Linear mapping is one of 4K, 1M and 16M. For 4K, no need + * to group units. For larger mappings, use 1M atom which + * should be large enough to contain a number of units. + */ + if (mmu_linear_psize == MMU_PAGE_4K) + atom_size = PAGE_SIZE; + else + atom_size = 1 << 20; - for_each_possible_cpu(i) { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_fc_alloc, pcpu_fc_free); + if (rc < 0) + panic("cannot initialize percpu area (err=%d)", rc); - paca[i].data_offset = ptr - __per_cpu_start; - memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - } + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + paca[cpu].data_offset = delta + pcpu_unit_offsets[cpu]; } #endif From 58c41d28259c246dbc11358d85d332dc20ccd57b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 14 Aug 2009 12:14:19 -0700 Subject: [PATCH 0108/1662] x86, intel_txt: Factor out the code for S3 setup S3 sleep requires special setup in tboot. However, the data structures needed to do such setup are only available if CONFIG_ACPI_SLEEP is enabled. Abstract them out as much as possible, so we can have a single tboot_setup_sleep() which either is a proper implementation or a stub which simply calls BUG(). Signed-off-by: H. Peter Anvin Acked-by: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 59 ++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 1ab801208945..a183beffe39e 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -164,25 +165,51 @@ void tboot_create_trampoline(void) map_base = PFN_DOWN(tboot->tboot_base); map_size = PFN_UP(tboot->tboot_size); if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size)) - panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", map_base, map_size); + panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n", + map_base, map_size); } -static void set_mac_regions(void) +#ifdef CONFIG_ACPI_SLEEP + +static void add_mac_region(phys_addr_t start, unsigned long size) { - tboot->num_mac_regions = 3; - /* S3 resume code */ - tboot->mac_regions[0].start = PFN_PHYS(PFN_DOWN(acpi_wakeup_address)); - tboot->mac_regions[0].size = PFN_UP(WAKEUP_SIZE) << PAGE_SHIFT; - /* AP trampoline code */ - tboot->mac_regions[1].start = - PFN_PHYS(PFN_DOWN(virt_to_phys(trampoline_base))); - tboot->mac_regions[1].size = PFN_UP(TRAMPOLINE_SIZE) << PAGE_SHIFT; - /* kernel code + data + bss */ - tboot->mac_regions[2].start = PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); - tboot->mac_regions[2].size = PFN_PHYS(PFN_UP(virt_to_phys(&_end))) - - PFN_PHYS(PFN_DOWN(virt_to_phys(&_text))); + struct tboot_mac_region *mr; + phys_addr_t end = start + size; + + if (start && size) { + mr = &tboot->mac_regions[tboot->num_mac_regions++]; + mr->start = round_down(start, PAGE_SIZE); + mr->size = round_up(end, PAGE_SIZE) - mr->start; + } } +static int tboot_setup_sleep(void) +{ + tboot->num_mac_regions = 0; + + /* S3 resume code */ + add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + /* AP trampoline code */ + add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); + /* kernel code + data + bss */ + add_mac_region(virt_to_phys(_text), _end - _text); + + tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; + + return 0; +} + +#else /* no CONFIG_ACPI_SLEEP */ + +static int tboot_setup_sleep(void) +{ + /* S3 shutdown requested, but S3 not supported by the kernel... */ + BUG(); + return -1; +} + +#endif + void tboot_shutdown(u32 shutdown_type) { void (*shutdown)(void); @@ -200,7 +227,8 @@ void tboot_shutdown(u32 shutdown_type) /* if this is S3 then set regions to MAC */ if (shutdown_type == TB_SHUTDOWN_S3) - set_mac_regions(); + if (tboot_setup_sleep()) + return; tboot->shutdown_type = shutdown_type; @@ -253,7 +281,6 @@ void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control; /* we always use the 32b wakeup vector */ tboot->acpi_sinfo.vector_width = 32; - tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { From 3c556e4198926b284ff5ff6756111a64e1e98cb0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 12 Aug 2009 12:00:40 -0300 Subject: [PATCH 0109/1662] x86, intel_txt: Fix typos in Kconfig help Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: H. Peter Anvin --- security/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/Kconfig b/security/Kconfig index edc7cbdc012a..6631774672c1 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -123,7 +123,7 @@ config INTEL_TXT of the kernel. If the system does not support Intel(R) TXT, this will have no effect. - Intel TXT will provide higher assurance of sysem configuration and + Intel TXT will provide higher assurance of system configuration and initial state as well as data reset protection. This is used to create a robust initial kernel measurement and verification, which helps to ensure that kernel security mechanisms are functioning @@ -132,7 +132,7 @@ config INTEL_TXT Intel TXT also helps solve real end user concerns about having confidence that their hardware is running the VMM or kernel that - it was conigured with, especially since they may be responsible for + it was configured with, especially since they may be responsible for providing such assurances to VMs and services running on it. See for more information From 468b5ef8a8f67a780dd5f51410f8c92761b36f06 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Mon, 6 Jul 2009 12:26:16 +0000 Subject: [PATCH 0110/1662] IDE: palm_bk3710: convert clock usage after clkdev conversion DaVinci core code has converted to the new clkdev API so clock name strings are not needed. Instead, just the a 'struct device' pointer is needed. Signed-off-by: Kevin Hilman Acked-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/ide/palm_bk3710.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/palm_bk3710.c b/drivers/ide/palm_bk3710.c index 3c1dc0152153..f8eddf05ecb8 100644 --- a/drivers/ide/palm_bk3710.c +++ b/drivers/ide/palm_bk3710.c @@ -318,7 +318,7 @@ static int __init palm_bk3710_probe(struct platform_device *pdev) int i, rc; struct ide_hw hw, *hws[] = { &hw }; - clk = clk_get(&pdev->dev, "IDECLK"); + clk = clk_get(&pdev->dev, NULL); if (IS_ERR(clk)) return -ENODEV; From 76fbebfbb593bd66780db0a808afe1d21c7ff6d6 Mon Sep 17 00:00:00 2001 From: Sergey Matyukevich Date: Sat, 15 Aug 2009 00:02:41 +0000 Subject: [PATCH 0111/1662] at91_ide: remove headers specific for at91sam9263 This driver requires only static memory controller definitions and macroses contained in generic header at91sam9_smc.h. Those extra headers are misleading since this driver also works fine for at91sam9260 SoC: tests were performed on afeb9260 board. Signed-off-by: Sergey Matyukevich Acked-by: Stanislaw Gruszka Signed-off-by: David S. Miller --- drivers/ide/at91_ide.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/ide/at91_ide.c b/drivers/ide/at91_ide.c index dbfeda42b940..248219a89a68 100644 --- a/drivers/ide/at91_ide.c +++ b/drivers/ide/at91_ide.c @@ -29,9 +29,7 @@ #include #include -#include #include -#include #define DRV_NAME "at91_ide" From 62a3207b8cf3de35368cdc3822b30b82d59eea95 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 17 Aug 2009 11:16:16 -0700 Subject: [PATCH 0112/1662] x86, intel_txt: Handle ACPI_SLEEP without X86_TRAMPOLINE On 32 bits, we can have CONFIG_ACPI_SLEEP set without implying CONFIG_X86_TRAMPOLINE. In that case, we simply do not need to mark the trampoline as a MAC region. Signed-off-by: H. Peter Anvin Cc: Shane Wang Cc: Joseph Cihula --- arch/x86/kernel/tboot.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a183beffe39e..c2e760ca7b01 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -189,8 +189,12 @@ static int tboot_setup_sleep(void) /* S3 resume code */ add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + +#ifdef CONFIG_X86_TRAMPOLINE /* AP trampoline code */ add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); +#endif + /* kernel code + data + bss */ add_mac_region(virt_to_phys(_text), _end - _text); From a022fe09700365c51d1f55884bca9754eb96a802 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 16 Aug 2009 20:36:34 -0400 Subject: [PATCH 0113/1662] xfs: fix locking in xfs_iget_cache_hit The locking in xfs_iget_cache_hit currently has numerous problems: - we clear the reclaim tag without i_flags_lock which protects modifications to it - we call inode_init_always which can sleep with pag_ici_lock held (this is oss.sgi.com BZ #819) - we acquire and drop i_flags_lock a lot and thus provide no consistency between the various flags we set/clear under it This patch fixes all that with a major revamp of the locking in the function. The new version acquires i_flags_lock early and only drops it once we need to call into inode_init_always or before calling xfs_ilock. This patch fixes a bug seen in the wild where we race modifying the reclaim tag. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_sync.c | 13 +++- fs/xfs/linux-2.6/xfs_sync.h | 1 + fs/xfs/xfs_iget.c | 115 ++++++++++++++++++------------------ 3 files changed, 71 insertions(+), 58 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index fbf3e0288b34..320be6aea492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -708,6 +708,16 @@ xfs_reclaim_inode( return 0; } +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); +} + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 23e7e7e6e136..27920eb7a820 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 76c540f719e4..91adfab2f45f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -134,80 +134,82 @@ xfs_iget_cache_hit( int flags, int lock_flags) __releases(pag->pag_ici_lock) { + struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + int error; + + spin_lock(&ip->i_flags_lock); /* - * If INEW is set this inode is being set up - * If IRECLAIM is set this inode is being torn down - * Pause and try again. + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. */ - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; goto out_error; } - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - - /* - * If lookup is racing with unlink, then we should return an - * error immediately so we don't remove it from the reclaim - * list and potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * We need to re-initialise the VFS inode as it has been - * 'freed' by the VFS. Do this here so we can deal with - * errors cleanly, then tag it so it can be set up correctly - * later. + * We need to set XFS_INEW atomically with clearing the + * reclaimable tag so that we do have an indicator of the + * inode still being initialized. */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - error = ENOMEM; + ip->i_flags |= XFS_INEW; + ip->i_flags &= ~XFS_IRECLAIMABLE; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~XFS_INEW; + ip->i_flags |= XFS_IRECLAIMABLE; + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } + inode->i_state = I_LOCK|I_NEW; + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + error = EAGAIN; goto out_error; } - /* - * We must set the XFS_INEW flag before clearing the - * XFS_IRECLAIMABLE flag so that if a racing lookup does - * not find the XFS_IRECLAIMABLE above but has the igrab() - * below succeed we can safely check XFS_INEW to detect - * that this inode is still being initialised. - */ - xfs_iflags_set(ip, XFS_INEW); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); - - /* clear the radix tree reclaim flag as well. */ - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - } else if (!igrab(VFS_I(ip))) { - /* If the VFS inode is being torn down, pause and try again. */ - XFS_STATS_INC(xs_ig_frecycle); - goto out_error; - } else if (xfs_iflags_test(ip, XFS_INEW)) { - /* - * We are racing with another cache hit that is - * currently recycling this inode out of the XFS_IRECLAIMABLE - * state. Wait for the initialisation to complete before - * continuing. - */ - wait_on_inode(VFS_I(ip)); + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); } - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - iput(VFS_I(ip)); - goto out_error; - } - - /* We've got a live one. */ - read_unlock(&pag->pag_ici_lock); - if (lock_flags != 0) xfs_ilock(ip, lock_flags); @@ -217,6 +219,7 @@ xfs_iget_cache_hit( return 0; out_error: + spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); return error; } From 66dc3304f3875ea85c630a57a88ecf79032890c4 Mon Sep 17 00:00:00 2001 From: Gerhard Pircher Date: Fri, 19 Jun 2009 11:40:57 +0000 Subject: [PATCH 0114/1662] powerpc/amigaone: Convert amigaone_init() to a machine_device_initcall() This allows to remove the ppc_md.init() hook in the setup code. Signed-off-by: Gerhard Pircher Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/amigaone/setup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 443035366c12..9290a7a442d0 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -110,13 +110,16 @@ void __init amigaone_init_IRQ(void) irq_set_default_host(i8259_get_host()); } -void __init amigaone_init(void) +static int __init request_isa_regions(void) { request_region(0x00, 0x20, "dma1"); request_region(0x40, 0x20, "timer"); request_region(0x80, 0x10, "dma page reg"); request_region(0xc0, 0x20, "dma2"); + + return 0; } +machine_device_initcall(amigaone, request_isa_regions); void amigaone_restart(char *cmd) { @@ -161,7 +164,6 @@ define_machine(amigaone) { .name = "AmigaOne", .probe = amigaone_probe, .setup_arch = amigaone_setup_arch, - .init = amigaone_init, .show_cpuinfo = amigaone_show_cpuinfo, .init_IRQ = amigaone_init_IRQ, .restart = amigaone_restart, From 11a6b292c1bc9cb39970e44edd6958250f23d3a8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 5 Jul 2009 16:08:52 +0000 Subject: [PATCH 0115/1662] powerpc/mpic: Fix MPIC_BROKEN_REGREAD on non broken MPICs The workaround enabled by CONFIG_MPIC_BROKEN_REGREAD does not work on non-broken MPICs. The symptom is no interrupts being received. The fix is twofold. Firstly the code was broken for multiple isus, we need to index into the shadow array with the src_no, not the idx. Secondly, we always do the read, but only use the VECPRI_MASK and VECPRI_ACTIVITY bits from the hardware, the rest of "val" comes from the shadow. Signed-off-by: Michael Ellerman Signed-off-by: Olof Johansson Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/mpic.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 3981ae4cb58e..30c44e6b0413 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -230,14 +230,16 @@ static inline u32 _mpic_irq_read(struct mpic *mpic, unsigned int src_no, unsigne { unsigned int isu = src_no >> mpic->isu_shift; unsigned int idx = src_no & mpic->isu_mask; + unsigned int val; + val = _mpic_read(mpic->reg_type, &mpic->isus[isu], + reg + (idx * MPIC_INFO(IRQ_STRIDE))); #ifdef CONFIG_MPIC_BROKEN_REGREAD if (reg == 0) - return mpic->isu_reg0_shadow[idx]; - else + val = (val & (MPIC_VECPRI_MASK | MPIC_VECPRI_ACTIVITY)) | + mpic->isu_reg0_shadow[src_no]; #endif - return _mpic_read(mpic->reg_type, &mpic->isus[isu], - reg + (idx * MPIC_INFO(IRQ_STRIDE))); + return val; } static inline void _mpic_irq_write(struct mpic *mpic, unsigned int src_no, @@ -251,7 +253,8 @@ static inline void _mpic_irq_write(struct mpic *mpic, unsigned int src_no, #ifdef CONFIG_MPIC_BROKEN_REGREAD if (reg == 0) - mpic->isu_reg0_shadow[idx] = value; + mpic->isu_reg0_shadow[src_no] = + value & ~(MPIC_VECPRI_MASK | MPIC_VECPRI_ACTIVITY); #endif } From 0d2d3e38f72e400f602dade3f0ddffe0b3b9d4df Mon Sep 17 00:00:00 2001 From: Geoff Thorpe Date: Tue, 7 Jul 2009 15:23:56 +0000 Subject: [PATCH 0116/1662] powerpc: expose the multi-bit ops that underlie single-bit ops. The bitops.h functions that operate on a single bit in a bitfield are implemented by operating on the corresponding word location. In all cases the inner logic is valid if the mask being applied has more than one bit set, so this patch exposes those inner operations. Indeed, set_bits() was already available, but it duplicated code from set_bit() (rather than making the latter a wrapper) - it was also missing the PPC405_ERR77() workaround and the "volatile" address qualifier present in other APIs. This corrects that, and exposes the other multi-bit equivalents. One advantage of these multi-bit forms is that they allow word-sized variables to essentially be their own spinlocks, eg. very useful for state machines where an atomic "flags" variable can obviate the need for any additional locking. Signed-off-by: Geoff Thorpe Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/bitops.h | 196 ++++++++++-------------------- 1 file changed, 62 insertions(+), 134 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 897eade3afbe..56f2f2ea5631 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -56,174 +56,102 @@ #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) #define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7) +/* Macro for generating the ***_bits() functions */ +#define DEFINE_BITOP(fn, op, prefix, postfix) \ +static __inline__ void fn(unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX "%0,0,%3\n" \ + stringify_in_c(op) "%0,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%0,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "+m" (*p) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ +} + +DEFINE_BITOP(set_bits, or, "", "") +DEFINE_BITOP(clear_bits, andc, "", "") +DEFINE_BITOP(clear_bits_unlock, andc, LWSYNC_ON_SMP, "") +DEFINE_BITOP(change_bits, xor, "", "") + static __inline__ void set_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # set_bit\n" - "or %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # clear_bit\n" - "andc %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void clear_bit_unlock(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # clear_bit_unlock\n" - "andc %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc", "memory"); + clear_bits_unlock(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } static __inline__ void change_bit(int nr, volatile unsigned long *addr) { - unsigned long old; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # change_bit\n" - "xor %0,%0,%2\n" - PPC405_ERR77(0,%3) - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*p) - : "r" (mask), "r" (p) - : "cc" ); + change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)); } +/* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output + * operands. */ +#define DEFINE_TESTOP(fn, op, prefix, postfix) \ +static __inline__ unsigned long fn( \ + unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old, t; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX "%0,0,%3\n" \ + stringify_in_c(op) "%1,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%1,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "=&r" (t) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + return (old & mask); \ +} + +DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP) +DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP) + static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit\n" - "or %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_set_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_set_bit_lock(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # test_and_set_bit_lock\n" - "or %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_set_bits_lock(BITOP_MASK(nr), + addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_clear_bit\n" - "andc %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; + return test_and_clear_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { - unsigned long old, t; - unsigned long mask = BITOP_MASK(nr); - unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); - - __asm__ __volatile__( - LWSYNC_ON_SMP -"1:" PPC_LLARX "%0,0,%3 # test_and_change_bit\n" - "xor %1,%0,%2 \n" - PPC405_ERR77(0,%3) - PPC_STLCX "%1,0,%3 \n" - "bne- 1b" - ISYNC_ON_SMP - : "=&r" (old), "=&r" (t) - : "r" (mask), "r" (p) - : "cc", "memory"); - - return (old & mask) != 0; -} - -static __inline__ void set_bits(unsigned long mask, unsigned long *addr) -{ - unsigned long old; - - __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%3 # set_bits\n" - "or %0,%0,%2\n" - PPC_STLCX "%0,0,%3\n" - "bne- 1b" - : "=&r" (old), "+m" (*addr) - : "r" (mask), "r" (addr) - : "cc"); + return test_and_change_bits(BITOP_MASK(nr), addr + BITOP_WORD(nr)) != 0; } #include From 30d0b3682887a81f0335b42f20116fd40d743371 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:51 +0000 Subject: [PATCH 0117/1662] powerpc: Move 64bit VDSO to improve context switch performance On 64bit applications the VDSO is the only thing in segment 0. Since the VDSO is position independent we can remove the hint and let get_unmapped_area pick an area. This will mean the vdso will be near other mmaps and will share an SLB entry: 10000000-10001000 r-xp 00000000 08:06 5778459 /root/context_switch_64 10010000-10011000 r--p 00000000 08:06 5778459 /root/context_switch_64 10011000-10012000 rw-p 00001000 08:06 5778459 /root/context_switch_64 fffa92ae000-fffa92b0000 rw-p 00000000 00:00 0 fffa92b0000-fffa9453000 r-xp 00000000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9453000-fffa9462000 ---p 001a3000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9462000-fffa9466000 r--p 001a2000 08:06 4334051 /lib64/power6/libc-2.9.so fffa9466000-fffa947c000 rw-p 001a6000 08:06 4334051 /lib64/power6/libc-2.9.so fffa947c000-fffa9480000 rw-p 00000000 00:00 0 fffa9480000-fffa94a8000 r-xp 00000000 08:06 4333852 /lib64/ld-2.9.so fffa94b3000-fffa94b4000 rw-p 00000000 00:00 0 fffa94b4000-fffa94b7000 r-xp 00000000 00:00 0 [vdso] <----- here I am fffa94b7000-fffa94b8000 r--p 00027000 08:06 4333852 /lib64/ld-2.9.so fffa94b8000-fffa94bb000 rw-p 00028000 08:06 4333852 /lib64/ld-2.9.so fffa94bb000-fffa94bc000 rw-p 00000000 00:00 0 fffe4c10000-fffe4c25000 rw-p 00000000 00:00 0 [stack] On a microbenchmark that bounces a token between two 64bit processes over pipes and calls gettimeofday each iteration (to access the VDSO), our context switch rate goes from 268k to 277k ctx switches/sec (tested on a 4GHz POWER6). Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/vdso.h | 3 +-- arch/powerpc/kernel/vdso.c | 7 ++++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h index 26fc449bd989..dc0419b66f17 100644 --- a/arch/powerpc/include/asm/vdso.h +++ b/arch/powerpc/include/asm/vdso.h @@ -7,9 +7,8 @@ #define VDSO32_LBASE 0x100000 #define VDSO64_LBASE 0x100000 -/* Default map addresses */ +/* Default map addresses for 32bit vDSO */ #define VDSO32_MBASE VDSO32_LBASE -#define VDSO64_MBASE VDSO64_LBASE #define VDSO_VERSION_STRING LINUX_2.6.15 diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index ad06d5c75b15..a0abce251d0a 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -203,7 +203,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } else { vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; - vdso_base = VDSO64_MBASE; + /* + * On 64bit we don't have a preferred map address. This + * allows get_unmapped_area to find an area near other mmaps + * and most likely share a SLB entry. + */ + vdso_base = 0; } #else vdso_pagelist = vdso32_pagelist; From 5eb9bac0406f2beb84b21aac6feb89d33d9f3f5c Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:52 +0000 Subject: [PATCH 0118/1662] powerpc: Rearrange SLB preload code With the new top down layout it is likely that the pc and stack will be in the same segment, because the pc is most likely in a library allocated via a top down mmap. Right now we bail out early if these segments match. Rearrange the SLB preload code to sanity check all SLB preload addresses are not in the kernel, then check all addresses for conflicts. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/slb.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5b7038f248b6..227056c21eee 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -218,23 +218,18 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) else unmapped_base = TASK_UNMAPPED_BASE_USER64; - if (is_kernel_addr(pc)) + if (is_kernel_addr(pc) || is_kernel_addr(stack) || + is_kernel_addr(unmapped_base)) return; + slb_allocate(pc); - if (esids_match(pc,stack)) - return; + if (!esids_match(pc, stack)) + slb_allocate(stack); - if (is_kernel_addr(stack)) - return; - slb_allocate(stack); - - if (esids_match(pc,unmapped_base) || esids_match(stack,unmapped_base)) - return; - - if (is_kernel_addr(unmapped_base)) - return; - slb_allocate(unmapped_base); + if (!esids_match(pc, unmapped_base) && + !esids_match(stack, unmapped_base)) + slb_allocate(unmapped_base); } static inline void patch_slb_encoding(unsigned int *insn_addr, From de4376c2846bb5a8fc6fe8dbd0e4ff30905493e6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Jul 2009 20:53:53 +0000 Subject: [PATCH 0119/1662] powerpc: Preload application text segment instead of TASK_UNMAPPED_BASE TASK_UNMAPPED_BASE is not used with the new top down mmap layout. We can reuse this preload slot by loading in the segment at 0x10000000, where almost all PowerPC binaries are linked at. On a microbenchmark that bounces a token between two 64bit processes over pipes and calls gettimeofday each iteration (to access the VDSO), both the 32bit and 64bit context switch rate improves (tested on a 4GHz POWER6): 32bit: 273k/sec -> 283k/sec 64bit: 277k/sec -> 284k/sec Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/slb.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 227056c21eee..6bc8b4aeba5c 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -184,7 +184,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) unsigned long slbie_data = 0; unsigned long pc = KSTK_EIP(tsk); unsigned long stack = KSTK_ESP(tsk); - unsigned long unmapped_base; + unsigned long exec_base; if (!cpu_has_feature(CPU_FTR_NO_SLBIE_B) && offset <= SLB_CACHE_ENTRIES) { @@ -212,14 +212,13 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) /* * preload some userspace segments into the SLB. + * Almost all 32 and 64bit PowerPC executables are linked at + * 0x10000000 so it makes sense to preload this segment. */ - if (test_tsk_thread_flag(tsk, TIF_32BIT)) - unmapped_base = TASK_UNMAPPED_BASE_USER32; - else - unmapped_base = TASK_UNMAPPED_BASE_USER64; + exec_base = 0x10000000; if (is_kernel_addr(pc) || is_kernel_addr(stack) || - is_kernel_addr(unmapped_base)) + is_kernel_addr(exec_base)) return; slb_allocate(pc); @@ -227,9 +226,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) if (!esids_match(pc, stack)) slb_allocate(stack); - if (!esids_match(pc, unmapped_base) && - !esids_match(stack, unmapped_base)) - slb_allocate(unmapped_base); + if (!esids_match(pc, exec_base) && + !esids_match(stack, exec_base)) + slb_allocate(exec_base); } static inline void patch_slb_encoding(unsigned int *insn_addr, From 8aa34ab8b2dc96ca6c4feecfb87ed13f0d40ef98 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:52 +0000 Subject: [PATCH 0120/1662] powerpc: Rename exception.h to exception-64s.h The file include/asm/exception.h contains definitions that are specific to exception handling on 64-bit server type processors. This renames the file to exception-64s.h to reflect that fact and avoid confusion. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/{exception.h => exception-64s.h} | 0 arch/powerpc/kernel/exceptions-64s.S | 2 ++ arch/powerpc/kernel/head_64.S | 1 - arch/powerpc/platforms/iseries/exception.h | 2 +- 4 files changed, 3 insertions(+), 2 deletions(-) rename arch/powerpc/include/asm/{exception.h => exception-64s.h} (100%) diff --git a/arch/powerpc/include/asm/exception.h b/arch/powerpc/include/asm/exception-64s.h similarity index 100% rename from arch/powerpc/include/asm/exception.h rename to arch/powerpc/include/asm/exception-64s.h diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index eb898112e577..72644cf22cac 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -12,6 +12,8 @@ * */ +#include + /* * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 012505ebd9f9..9196ef36d433 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -36,7 +36,6 @@ #include #include #include -#include #include /* The physical memory is layed out such that the secondary processor diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h index ced45a8fa1aa..e26eb86ac73d 100644 --- a/arch/powerpc/platforms/iseries/exception.h +++ b/arch/powerpc/platforms/iseries/exception.h @@ -24,7 +24,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include +#include #define EXCEPTION_PROLOG_ISERIES_1 \ mfmsr r10; \ From ee43eb788b3a06425fffb912677e2e1c8b00dd3b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:54 +0000 Subject: [PATCH 0121/1662] powerpc: Use names rather than numbers for SPRGs (v2) The kernel uses SPRG registers for various purposes, typically in low level assembly code as scratch registers or to hold per-cpu global infos such as the PACA or the current thread_info pointer. We want to be able to easily shuffle the usage of those registers as some implementations have specific constraints realted to some of them, for example, some have userspace readable aliases, etc.. and the current choice isn't always the best. This patch should not change any code generation, and replaces the usage of SPRN_SPRGn everywhere in the kernel with a named replacement and adds documentation next to the definition of the names as to what those are used for on each processor family. The only parts that still use the original numbers are bits of KVM or suspend/resume code that just blindly needs to save/restore all the SPRGs. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 18 +-- arch/powerpc/include/asm/reg.h | 113 +++++++++++++++++++ arch/powerpc/kernel/cpu_setup_6xx.S | 2 +- arch/powerpc/kernel/entry_32.S | 20 ++-- arch/powerpc/kernel/entry_64.S | 4 +- arch/powerpc/kernel/exceptions-64s.S | 44 +++----- arch/powerpc/kernel/fpu.S | 2 +- arch/powerpc/kernel/head_32.S | 40 +++---- arch/powerpc/kernel/head_40x.S | 124 ++++++++++----------- arch/powerpc/kernel/head_44x.S | 56 +++++----- arch/powerpc/kernel/head_64.S | 14 +-- arch/powerpc/kernel/head_8xx.S | 13 ++- arch/powerpc/kernel/head_booke.h | 50 ++++----- arch/powerpc/kernel/head_fsl_booke.S | 60 +++++----- arch/powerpc/kernel/setup_64.c | 4 +- arch/powerpc/kernel/vector.S | 2 +- arch/powerpc/kvm/booke_interrupts.S | 18 +-- arch/powerpc/mm/hash_low_32.S | 4 +- arch/powerpc/platforms/iseries/exception.S | 28 ++--- arch/powerpc/platforms/iseries/exception.h | 4 +- 20 files changed, 356 insertions(+), 264 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index d3d4534e3c74..773e380b5fe8 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -57,12 +57,12 @@ addi reg,reg,(label)-_stext; /* virt addr of handler ... */ #define EXCEPTION_PROLOG_1(area) \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + mfspr r13,SPRN_SPRG_PACA; /* get paca address into r13 */ \ std r9,area+EX_R9(r13); /* save r9 - r12 */ \ std r10,area+EX_R10(r13); \ std r11,area+EX_R11(r13); \ std r12,area+EX_R12(r13); \ - mfspr r9,SPRN_SPRG1; \ + mfspr r9,SPRN_SPRG_SCRATCH0; \ std r9,area+EX_R13(r13); \ mfcr r9 @@ -144,7 +144,7 @@ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) #define HSTD_EXCEPTION_PSERIES(n, label) \ @@ -152,13 +152,13 @@ label##_pSeries: \ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r20; /* save r20 */ \ + mtspr SPRN_SPRG_SCRATCH0,r20; /* save r20 */ \ mfspr r20,SPRN_HSRR0; /* copy HSRR0 to SRR0 */ \ mtspr SPRN_SRR0,r20; \ mfspr r20,SPRN_HSRR1; /* copy HSRR0 to SRR0 */ \ mtspr SPRN_SRR1,r20; \ - mfspr r20,SPRN_SPRG1; /* restore r20 */ \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mfspr r20,SPRN_SPRG_SCRATCH0; /* restore r20 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common) @@ -167,15 +167,15 @@ label##_pSeries: \ .globl label##_pSeries; \ label##_pSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ - mfspr r13,SPRN_SPRG3; /* get paca address into r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ + mfspr r13,SPRN_SPRG_PACA; /* get paca address into r13 */ \ std r9,PACA_EXGEN+EX_R9(r13); /* save r9, r10 */ \ std r10,PACA_EXGEN+EX_R10(r13); \ lbz r10,PACASOFTIRQEN(r13); \ mfcr r9; \ cmpwi r10,0; \ beq masked_interrupt; \ - mfspr r10,SPRN_SPRG1; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ std r10,PACA_EXGEN+EX_R13(r13); \ std r11,PACA_EXGEN+EX_R11(r13); \ std r12,PACA_EXGEN+EX_R12(r13); \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 1170267736d3..a8179cc99ac4 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -645,6 +645,119 @@ #define MMCR0_PMC2_LOADMISSTIME 0x5 #endif +/* + * SPRG usage: + * + * All 64-bit: + * - SPRG3 stores PACA pointer + * + * 64-bit server: + * - SPRG0 unused (reserved for HV on Power4) + * - SPRG1 scratch for exception vectors + * - SPRG2 scratch for exception vectors + * + * All 32-bit: + * - SPRG3 current thread_info pointer + * (virtual on BookE, physical on others) + * + * 32-bit classic: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 indicator that we are in RTAS + * - SPRG4 (603 only) pseudo TLB LRU data + * + * 32-bit 40x: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 scratch for exception vectors + * - SPRG4 scratch for exception vectors (not 403) + * - SPRG5 scratch for exception vectors (not 403) + * - SPRG6 scratch for exception vectors (not 403) + * - SPRG7 scratch for exception vectors (not 403) + * + * 32-bit 440 and FSL BookE: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors (*) + * - SPRG2 scratch for crit interrupts handler + * - SPRG4 scratch for exception vectors + * - SPRG5 scratch for exception vectors + * - SPRG6 scratch for machine check handler + * - SPRG7 scratch for exception vectors + * - SPRG9 scratch for debug vectors (e500 only) + * + * Additionally, BookE separates "read" and "write" + * of those registers. That allows to use the userspace + * readable variant for reads, which can avoid a fault + * with KVM type virtualization. + * + * (*) Under KVM, the host SPRG1 is used to point to + * the current VCPU data structure + * + * 32-bit 8xx: + * - SPRG0 scratch for exception vectors + * - SPRG1 scratch for exception vectors + * - SPRG2 apparently unused but initialized + * + */ +#ifdef CONFIG_PPC64 +#define SPRN_SPRG_PACA SPRN_SPRG3 +#else +#define SPRN_SPRG_THREAD SPRN_SPRG3 +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG2 +#endif + +#ifdef CONFIG_PPC_BOOK3S_32 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_RTAS SPRN_SPRG2 +#define SPRN_SPRG_603_LRU SPRN_SPRG4 +#endif + +#ifdef CONFIG_40x +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH2 SPRN_SPRG2 +#define SPRN_SPRG_SCRATCH3 SPRN_SPRG4 +#define SPRN_SPRG_SCRATCH4 SPRN_SPRG5 +#define SPRN_SPRG_SCRATCH5 SPRN_SPRG6 +#define SPRN_SPRG_SCRATCH6 SPRN_SPRG7 +#endif + +#ifdef CONFIG_BOOKE +#define SPRN_SPRG_RSCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_WSCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_RSCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_WSCRATCH1 SPRN_SPRG1 +#define SPRN_SPRG_RSCRATCH_CRIT SPRN_SPRG2 +#define SPRN_SPRG_WSCRATCH_CRIT SPRN_SPRG2 +#define SPRN_SPRG_RSCRATCH2 SPRN_SPRG4R +#define SPRN_SPRG_WSCRATCH2 SPRN_SPRG4W +#define SPRN_SPRG_RSCRATCH3 SPRN_SPRG5R +#define SPRN_SPRG_WSCRATCH3 SPRN_SPRG5W +#define SPRN_SPRG_RSCRATCH_MC SPRN_SPRG6R +#define SPRN_SPRG_WSCRATCH_MC SPRN_SPRG6W +#define SPRN_SPRG_RSCRATCH4 SPRN_SPRG7R +#define SPRN_SPRG_WSCRATCH4 SPRN_SPRG7W +#ifdef CONFIG_E200 +#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG6R +#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG6W +#else +#define SPRN_SPRG_RSCRATCH_DBG SPRN_SPRG9 +#define SPRN_SPRG_WSCRATCH_DBG SPRN_SPRG9 +#endif +#define SPRN_SPRG_RVCPU SPRN_SPRG1 +#define SPRN_SPRG_WVCPU SPRN_SPRG1 +#endif + +#ifdef CONFIG_8xx +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 +#define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 +#endif + /* * An mtfsf instruction with the L bit set. On CPUs that support this a * full 64bits of FPSCR is restored and on other CPUs the L bit is ignored. diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 1e9949e68856..55cba4a8a959 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -21,7 +21,7 @@ _GLOBAL(__setup_cpu_603) mflr r4 BEGIN_MMU_FTR_SECTION li r10,0 - mtspr SPRN_SPRG4,r10 /* init SW LRU tracking */ + mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) BEGIN_FTR_SECTION bl __init_fpu_registers diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3cadba60a4b6..1175a8539e6c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -88,7 +88,7 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,_SRR1(r11) - mfspr r8,SPRN_SPRG3 + mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,SAVED_KSP_LIMIT(r11) rlwimi r0,r1,0,0,(31-THREAD_SHIFT) @@ -108,7 +108,7 @@ crit_transfer_to_handler: mfspr r0,SPRN_SRR1 stw r0,crit_srr1@l(0) - mfspr r8,SPRN_SPRG3 + mfspr r8,SPRN_SPRG_THREAD lwz r0,KSP_LIMIT(r8) stw r0,saved_ksp_limit@l(0) rlwimi r0,r1,0,0,(31-THREAD_SHIFT) @@ -138,7 +138,7 @@ transfer_to_handler: mfspr r2,SPRN_XER stw r12,_CTR(r11) stw r2,_XER(r11) - mfspr r12,SPRN_SPRG3 + mfspr r12,SPRN_SPRG_THREAD addi r2,r12,-THREAD tovirt(r2,r2) /* set r2 to current */ beq 2f /* if from user, fix up THREAD.regs */ @@ -680,7 +680,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) tophys(r0,r4) CLR_TOP32(r0) - mtspr SPRN_SPRG3,r0 /* Update current THREAD phys addr */ + mtspr SPRN_SPRG_THREAD,r0 /* Update current THREAD phys addr */ lwz r1,KSP(r4) /* Load new stack pointer */ /* save the old current 'last' for return value */ @@ -1057,7 +1057,7 @@ exc_exit_restart_end: #ifdef CONFIG_40x .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lis r10,saved_ksp_limit@ha; lwz r10,saved_ksp_limit@l(r10); tovirt(r9,r9); @@ -1074,7 +1074,7 @@ ret_from_crit_exc: #ifdef CONFIG_BOOKE .globl ret_from_crit_exc ret_from_crit_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); @@ -1083,7 +1083,7 @@ ret_from_crit_exc: .globl ret_from_debug_exc ret_from_debug_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) lwz r9,THREAD_INFO-THREAD(r9) @@ -1097,7 +1097,7 @@ ret_from_debug_exc: .globl ret_from_mcheck_exc ret_from_mcheck_exc: - mfspr r9,SPRN_SPRG3 + mfspr r9,SPRN_SPRG_THREAD lwz r10,SAVED_KSP_LIMIT(r1) stw r10,KSP_LIMIT(r9) RESTORE_xSRR(SRR0,SRR1); @@ -1255,7 +1255,7 @@ _GLOBAL(enter_rtas) MTMSRD(r0) /* don't get trashed */ li r9,MSR_KERNEL & ~(MSR_IR|MSR_DR) mtlr r6 - mtspr SPRN_SPRG2,r7 + mtspr SPRN_SPRG_RTAS,r7 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI @@ -1265,7 +1265,7 @@ _GLOBAL(enter_rtas) FIX_SRR1(r9,r0) addi r1,r1,INT_FRAME_SIZE li r0,0 - mtspr SPRN_SPRG2,r0 + mtspr SPRN_SPRG_RTAS,r0 mtspr SPRN_SRR0,r8 mtspr SPRN_SRR1,r9 RFI /* return to caller */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 43e073477c34..dbf0e3115611 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -762,7 +762,7 @@ _GLOBAL(enter_rtas) _STATIC(rtas_return_loc) /* relocation is off at this point */ - mfspr r4,SPRN_SPRG3 /* Get PACA */ + mfspr r4,SPRN_SPRG_PACA /* Get PACA */ clrldi r4,r4,2 /* convert to realmode address */ bcl 20,31,$+4 @@ -793,7 +793,7 @@ _STATIC(rtas_restore_regs) REST_8GPRS(14, r1) /* Restore the non-volatiles */ REST_10GPRS(22, r1) /* ditto */ - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA ld r4,_CCR(r1) mtcr r4 diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 72644cf22cac..4e9640cc0563 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -24,18 +24,6 @@ * 0x8000 - : Early init and support code */ - -/* - * SPRG Usage - * - * Register Definition - * - * SPRG0 reserved for hypervisor - * SPRG1 temp - used to save gpr - * SPRG2 temp - used to save gpr - * SPRG3 virt addr of paca - */ - /* * This is the start of the interrupt handlers for pSeries * This code runs with relocation off. @@ -53,16 +41,16 @@ __start_interrupts: . = 0x200 _machine_check_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) . = 0x300 .globl data_access_pSeries data_access_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 + mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 + mtspr SPRN_SPRG_SCRATCH1,r12 mfspr r13,SPRN_DAR mfspr r12,SPRN_DSISR srdi r13,r13,60 @@ -71,7 +59,7 @@ BEGIN_FTR_SECTION cmpwi r13,0x2c beq do_stab_bolted_pSeries mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 END_FTR_SECTION_IFCLR(CPU_FTR_SLB) EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common) @@ -79,8 +67,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) .globl data_access_slb_pSeries data_access_slb_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ @@ -93,7 +81,7 @@ data_access_slb_pSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ #ifndef CONFIG_RELOCATABLE @@ -117,8 +105,8 @@ data_access_slb_pSeries: .globl instruction_access_slb_pSeries instruction_access_slb_pSeries: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ std r9,PACA_EXSLB+EX_R9(r13) /* save r9 - r12 */ @@ -131,7 +119,7 @@ instruction_access_slb_pSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) mfspr r12,SPRN_SRR1 /* and SRR1 */ #ifndef CONFIG_RELOCATABLE @@ -161,7 +149,7 @@ BEGIN_FTR_SECTION beq- 1f END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) mr r9,r13 - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA mfspr r11,SPRN_SRR0 ld r12,PACAKBASE(r13) ld r10,PACAKMSR(r13) @@ -230,14 +218,14 @@ masked_interrupt: rotldi r10,r10,16 mtspr SPRN_SRR1,r10 ld r10,PACA_EXGEN+EX_R10(r13) - mfspr r13,SPRN_SPRG1 + mfspr r13,SPRN_SPRG_SCRATCH0 rfid b . .align 7 do_stab_bolted_pSeries: mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) #ifdef CONFIG_PPC_PSERIES @@ -248,14 +236,14 @@ do_stab_bolted_pSeries: .align 7 system_reset_fwnmi: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common) .globl machine_check_fwnmi .align 7 machine_check_fwnmi: HMT_MEDIUM - mtspr SPRN_SPRG1,r13 /* save r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) #endif /* CONFIG_PPC_PSERIES */ @@ -270,7 +258,7 @@ slb_miss_user_pseries: std r10,PACA_EXGEN+EX_R10(r13) std r11,PACA_EXGEN+EX_R11(r13) std r12,PACA_EXGEN+EX_R12(r13) - mfspr r10,SPRG1 + mfspr r10,SPRG_SCRATCH0 ld r11,PACA_EXSLB+EX_R9(r13) ld r12,PACA_EXSLB+EX_R3(r13) std r10,PACA_EXGEN+EX_R13(r13) diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 2436df33c6f4..fc8f5b14019c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -91,7 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif /* CONFIG_SMP */ /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index fc2132942754..829c3fe7c5a2 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -244,8 +244,8 @@ __secondary_hold_acknowledge: * task's thread_struct. */ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -255,7 +255,7 @@ __secondary_hold_acknowledge: andi. r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -267,9 +267,9 @@ __secondary_hold_acknowledge: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -355,11 +355,11 @@ i##n: \ * -- paulus. */ . = 0x200 - mtspr SPRN_SPRG0,r10 - mtspr SPRN_SPRG1,r11 + mtspr SPRN_SPRG_SCRATCH0,r10 + mtspr SPRN_SPRG_SCRATCH1,r11 mfcr r10 #ifdef CONFIG_PPC_CHRP - mfspr r11,SPRN_SPRG2 + mfspr r11,SPRN_SPRG_RTAS cmpwi 0,r11,0 bne 7f #endif /* CONFIG_PPC_CHRP */ @@ -367,7 +367,7 @@ i##n: \ 7: EXCEPTION_PROLOG_2 addi r3,r1,STACK_FRAME_OVERHEAD #ifdef CONFIG_PPC_CHRP - mfspr r4,SPRN_SPRG2 + mfspr r4,SPRN_SPRG_RTAS cmpwi cr1,r4,0 bne cr1,1f #endif @@ -485,7 +485,7 @@ InstructionTLBMiss: mfspr r3,SPRN_IMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) bge- 112f @@ -559,7 +559,7 @@ DataLoadTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_USER|_PAGE_PRESENT /* low addresses tested as user */ lwz r2,PGDIR(r2) bge- 112f @@ -598,12 +598,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtcrf 0x80,r2 BEGIN_MMU_FTR_SECTION li r0,1 - mfspr r1,SPRN_SPRG4 + mfspr r1,SPRN_SPRG_603_LRU rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ slw r0,r0,r2 xor r1,r0,r1 srw r0,r1,r2 - mtspr SPRN_SPRG4,r1 + mtspr SPRN_SPRG_603_LRU,r1 mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 @@ -643,7 +643,7 @@ DataStoreTLBMiss: mfspr r3,SPRN_DMISS lis r1,PAGE_OFFSET@h /* check if kernel address */ cmplw 0,r1,r3 - mfspr r2,SPRN_SPRG3 + mfspr r2,SPRN_SPRG_THREAD li r1,_PAGE_RW|_PAGE_USER|_PAGE_PRESENT /* access flags */ lwz r2,PGDIR(r2) bge- 112f @@ -678,12 +678,12 @@ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) mtcrf 0x80,r2 BEGIN_MMU_FTR_SECTION li r0,1 - mfspr r1,SPRN_SPRG4 + mfspr r1,SPRN_SPRG_603_LRU rlwinm r2,r3,20,27,31 /* Get Address bits 15:19 */ slw r0,r0,r2 xor r1,r0,r1 srw r0,r1,r2 - mtspr SPRN_SPRG4,r1 + mtspr SPRN_SPRG_603_LRU,r1 mfspr r2,SPRN_SRR1 rlwimi r2,r0,31-14,14,14 mtspr SPRN_SRR1,r2 @@ -864,9 +864,9 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -947,9 +947,9 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ CLR_TOP32(r4) - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 - mtspr SPRN_SPRG2,r3 /* 0 => not in RTAS */ + mtspr SPRN_SPRG_RTAS,r3 /* 0 => not in RTAS */ /* stack */ lis r1,init_thread_union@ha diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 0c96911d4299..a90625f9b485 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -103,21 +103,21 @@ _ENTRY(saved_ksp_limit) /* * Exception vector entry code. This code runs with address translation - * turned off (i.e. using physical addresses). We assume SPRG3 has the - * physical address of the current task thread_struct. + * turned off (i.e. using physical addresses). We assume SPRG_THREAD has + * the physical address of the current task thread_struct. * Note that we have to have decremented r1 before we write to any fields * of the exception frame, since a critical interrupt could occur at any * time, and it will write to the area immediately below the current r1. */ #define NORMAL_EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG1,r11; \ - mtspr SPRN_SPRG2,r1; \ + mtspr SPRN_SPRG_SCRATCH0,r10; /* save two registers to work with */\ + mtspr SPRN_SPRG_SCRATCH1,r11; \ + mtspr SPRN_SPRG_SCRATCH2,r1; \ mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. r11,r11,MSR_PR; \ beq 1f; \ - mfspr r1,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ addi r1,r1,THREAD_SIZE; \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ @@ -125,13 +125,13 @@ _ENTRY(saved_ksp_limit) stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ - mfspr r10,SPRN_SPRG2; \ + mfspr r10,SPRN_SPRG_SCRATCH2; \ mfspr r12,SPRN_SRR0; \ stw r10,GPR1(r11); \ mfspr r9,SPRN_SRR1; \ @@ -160,7 +160,7 @@ _ENTRY(saved_ksp_limit) lwz r11,critirq_ctx@l(r11); \ beq 1f; \ /* COMING FROM USER MODE */ \ - mfspr r11,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ 1: addi r11,r11,THREAD_SIZE-INT_FRAME_SIZE; /* Alloc an excpt frm */\ tophys(r11,r11); \ @@ -265,8 +265,8 @@ label: * and exit. Otherwise, we call heavywight functions to do the work. */ START_EXCEPTION(0x0300, DataStorage) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -275,12 +275,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif /* First, check if it was a zone fault (which means a user @@ -308,7 +308,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -355,15 +355,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ @@ -380,15 +380,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* @@ -466,8 +466,8 @@ label: * load TLB entries from the page table if they exist. */ START_EXCEPTION(0x1100, DTLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -476,12 +476,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_DEAR /* Get faulting address */ @@ -500,7 +500,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -550,15 +550,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b DataAccess /* 0x1200 - Instruction TLB Miss Exception @@ -566,8 +566,8 @@ label: * registers and bailout to a different point. */ START_EXCEPTION(0x1200, ITLBMiss) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 + mtspr SPRN_SPRG_SCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_SCRATCH1, r11 #ifdef CONFIG_403GCX stw r12, 0(r0) stw r9, 4(r0) @@ -576,12 +576,12 @@ label: stw r11, 8(r0) stw r12, 12(r0) #else - mtspr SPRN_SPRG4, r12 - mtspr SPRN_SPRG5, r9 + mtspr SPRN_SPRG_SCRATCH3, r12 + mtspr SPRN_SPRG_SCRATCH4, r9 mfcr r11 mfspr r12, SPRN_PID - mtspr SPRN_SPRG7, r11 - mtspr SPRN_SPRG6, r12 + mtspr SPRN_SPRG_SCRATCH6, r11 + mtspr SPRN_SPRG_SCRATCH5, r12 #endif mfspr r10, SPRN_SRR0 /* Get faulting address */ @@ -600,7 +600,7 @@ label: /* Get the PGD for the current thread. */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: tophys(r11, r11) @@ -650,15 +650,15 @@ label: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 b InstructionAccess EXCEPTION(0x1300, Trap_13, unknown_exception, EXC_XFER_EE) @@ -803,15 +803,15 @@ finish_tlb_load: lwz r9, 4(r0) lwz r12, 0(r0) #else - mfspr r12, SPRN_SPRG6 - mfspr r11, SPRN_SPRG7 + mfspr r12, SPRN_SPRG_SCRATCH5 + mfspr r11, SPRN_SPRG_SCRATCH6 mtspr SPRN_PID, r12 mtcr r11 - mfspr r9, SPRN_SPRG5 - mfspr r12, SPRN_SPRG4 + mfspr r9, SPRN_SPRG_SCRATCH4 + mfspr r12, SPRN_SPRG_SCRATCH3 #endif - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r11, SPRN_SPRG_SCRATCH1 + mfspr r10, SPRN_SPRG_SCRATCH0 PPC405_ERR77_SYNC rfi /* Should sync shadow TLBs */ b . /* prevent prefetch past rfi */ @@ -835,7 +835,7 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@ha diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 18d8a1677c4d..656cfb2d6666 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -239,7 +239,7 @@ skpinv: addi r4,r4,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -350,12 +350,12 @@ interrupt_base: /* Data TLB Error Interrupt */ START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -374,7 +374,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -446,12 +446,12 @@ tlb_44x_patch_hwater_D: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b DataStorage /* Instruction TLB Error Interrupt */ @@ -461,12 +461,12 @@ tlb_44x_patch_hwater_D: * to a different point. */ START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -485,7 +485,7 @@ tlb_44x_patch_hwater_D: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) /* Load PID into MMUCR TID */ @@ -542,12 +542,12 @@ tlb_44x_patch_hwater_I: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage /* Debug Interrupt */ @@ -593,12 +593,12 @@ finish_tlb_load: /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ /* diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 9196ef36d433..0552f01041ab 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -195,7 +195,7 @@ _GLOBAL(generic_secondary_smp_init) mr r3,r24 /* not found, copy phys to r3 */ b .kexec_wait /* next kernel might do better */ -2: mtspr SPRN_SPRG3,r13 /* Save vaddr of paca in SPRG3 */ +2: mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG */ /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW @@ -484,7 +484,7 @@ _GLOBAL(pmac_secondary_start) LOAD_REG_ADDR(r4,paca) /* Get base vaddr of paca array */ mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */ add r13,r13,r4 /* for this processor. */ - mtspr SPRN_SPRG3,r13 /* Save vaddr of paca in SPRG3 */ + mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG*/ /* Create a temp kernel stack for use before relocation is on. */ ld r1,PACAEMERGSP(r13) @@ -502,10 +502,10 @@ _GLOBAL(pmac_secondary_start) * 1. Processor number * 2. Segment table pointer (virtual address) * On entry the following are set: - * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries - * r24 = cpu# (in Linux terms) - * r13 = paca virtual address - * SPRG3 = paca virtual address + * r1 = stack pointer. vaddr for iSeries, raddr (temp stack) for pSeries + * r24 = cpu# (in Linux terms) + * r13 = paca virtual address + * SPRG_PACA = paca virtual address */ .globl __secondary_start __secondary_start: @@ -641,7 +641,7 @@ _INIT_STATIC(start_here_multiplatform) /* Restore parameters passed from prom_init/kexec */ mr r3,r31 - bl .early_setup /* also sets r13 and SPRG3 */ + bl .early_setup /* also sets r13 and SPRG_PACA */ LOAD_REG_ADDR(r3, .start_here_common) ld r4,PACAKMSR(r13) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 52ff8c53b93c..6ded19d01891 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -110,8 +110,8 @@ turn_on_mmu: * task's thread_struct. */ #define EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; \ - mtspr SPRN_SPRG1,r11; \ + mtspr SPRN_SPRG_SCRATCH0,r10; \ + mtspr SPRN_SPRG_SCRATCH1,r11; \ mfcr r10; \ EXCEPTION_PROLOG_1; \ EXCEPTION_PROLOG_2 @@ -121,7 +121,7 @@ turn_on_mmu: andi. r11,r11,MSR_PR; \ tophys(r11,r1); /* use tophys(r1) if kernel */ \ beq 1f; \ - mfspr r11,SPRN_SPRG3; \ + mfspr r11,SPRN_SPRG_THREAD; \ lwz r11,THREAD_INFO-THREAD(r11); \ addi r11,r11,THREAD_SIZE; \ tophys(r11,r11); \ @@ -133,9 +133,9 @@ turn_on_mmu: stw r10,_CCR(r11); /* save registers */ \ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_SCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_SCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ @@ -603,8 +603,9 @@ start_here: /* ptr to phys current thread */ tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 li r3,0 + /* XXX What is that for ? SPRG2 appears otherwise unused on 8xx */ mtspr SPRN_SPRG2,r3 /* 0 => r1 has kernel sp */ /* stack */ diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h index 5f9febc8d143..50504ae39cb7 100644 --- a/arch/powerpc/kernel/head_booke.h +++ b/arch/powerpc/kernel/head_booke.h @@ -20,14 +20,14 @@ #endif #define NORMAL_EXCEPTION_PROLOG \ - mtspr SPRN_SPRG0,r10; /* save two registers to work with */\ - mtspr SPRN_SPRG1,r11; \ - mtspr SPRN_SPRG4W,r1; \ + mtspr SPRN_SPRG_WSCRATCH0,r10;/* save two registers to work with */\ + mtspr SPRN_SPRG_WSCRATCH1,r11; \ + mtspr SPRN_SPRG_WSCRATCH2,r1; \ mfcr r10; /* save CR in r10 for now */\ mfspr r11,SPRN_SRR1; /* check whether user or kernel */\ andi. r11,r11,MSR_PR; \ beq 1f; \ - mfspr r1,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r1,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r1,THREAD_INFO-THREAD(r1); /* this thread's kernel stack */\ ALLOC_STACK_FRAME(r1, THREAD_SIZE); \ 1: subi r1,r1,INT_FRAME_SIZE; /* Allocate an exception frame */\ @@ -35,13 +35,13 @@ stw r10,_CCR(r11); /* save various registers */\ stw r12,GPR12(r11); \ stw r9,GPR9(r11); \ - mfspr r10,SPRN_SPRG0; \ + mfspr r10,SPRN_SPRG_RSCRATCH0; \ stw r10,GPR10(r11); \ - mfspr r12,SPRN_SPRG1; \ + mfspr r12,SPRN_SPRG_RSCRATCH1; \ stw r12,GPR11(r11); \ mflr r10; \ stw r10,_LINK(r11); \ - mfspr r10,SPRN_SPRG4R; \ + mfspr r10,SPRN_SPRG_RSCRATCH2; \ mfspr r12,SPRN_SRR0; \ stw r10,GPR1(r11); \ mfspr r9,SPRN_SRR1; \ @@ -69,21 +69,11 @@ * providing configurations that micro-optimize space usage. */ -/* CRIT_SPRG only used in critical exception handling */ -#define CRIT_SPRG SPRN_SPRG2 -/* MCHECK_SPRG only used in machine check exception handling */ -#define MCHECK_SPRG SPRN_SPRG6W - -#define MCHECK_STACK_BASE mcheckirq_ctx +#define MC_STACK_BASE mcheckirq_ctx #define CRIT_STACK_BASE critirq_ctx /* only on e500mc/e200 */ -#define DEBUG_STACK_BASE dbgirq_ctx -#ifdef CONFIG_E200 -#define DEBUG_SPRG SPRN_SPRG6W -#else -#define DEBUG_SPRG SPRN_SPRG9 -#endif +#define DBG_STACK_BASE dbgirq_ctx #define EXC_LVL_FRAME_OVERHEAD (THREAD_SIZE - INT_FRAME_SIZE - EXC_LVL_SIZE) @@ -110,7 +100,7 @@ * critical/machine check exception stack at low physical addresses. */ #define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \ - mtspr exc_level##_SPRG,r8; \ + mtspr SPRN_SPRG_WSCRATCH_##exc_level,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \ stw r9,GPR9(r8); /* save various registers */\ mfcr r9; /* save CR in r9 for now */\ @@ -119,7 +109,7 @@ stw r9,_CCR(r8); /* save CR on stack */\ mfspr r10,exc_level_srr1; /* check whether user or kernel */\ andi. r10,r10,MSR_PR; \ - mfspr r11,SPRN_SPRG3; /* if from user, start at top of */\ + mfspr r11,SPRN_SPRG_THREAD; /* if from user, start at top of */\ lwz r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\ addi r11,r11,EXC_LVL_FRAME_OVERHEAD; /* allocate stack frame */\ beq 1f; \ @@ -140,7 +130,7 @@ lwz r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r11); \ stw r9,TI_TASK-EXC_LVL_FRAME_OVERHEAD(r8); \ mr r11,r8; \ -2: mfspr r8,exc_level##_SPRG; \ +2: mfspr r8,SPRN_SPRG_RSCRATCH_##exc_level; \ stw r12,GPR12(r11); /* save various registers */\ mflr r10; \ stw r10,_LINK(r11); \ @@ -161,9 +151,9 @@ #define CRITICAL_EXCEPTION_PROLOG \ EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1) #define DEBUG_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(DEBUG, SPRN_DSRR0, SPRN_DSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1) #define MCHECK_EXCEPTION_PROLOG \ - EXC_LEVEL_EXCEPTION_PROLOG(MCHECK, SPRN_MCSRR0, SPRN_MCSRR1) + EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1) /* * Exception vectors. @@ -282,13 +272,13 @@ mtspr SPRN_DSRR1,r9; \ lwz r9,GPR9(r11); \ lwz r12,GPR12(r11); \ - mtspr DEBUG_SPRG,r8; \ - BOOKE_LOAD_EXC_LEVEL_STACK(DEBUG); /* r8 points to the debug stack */ \ + mtspr SPRN_SPRG_WSCRATCH_DBG,r8; \ + BOOKE_LOAD_EXC_LEVEL_STACK(DBG); /* r8 points to the debug stack */ \ lwz r10,GPR10(r8); \ lwz r11,GPR11(r8); \ - mfspr r8,DEBUG_SPRG; \ + mfspr r8,SPRN_SPRG_RSCRATCH_DBG; \ \ - PPC_RFDI; \ + PPC_RFDI; \ b .; \ \ /* continue normal handling for a debug exception... */ \ @@ -335,11 +325,11 @@ mtspr SPRN_CSRR1,r9; \ lwz r9,GPR9(r11); \ lwz r12,GPR12(r11); \ - mtspr CRIT_SPRG,r8; \ + mtspr SPRN_SPRG_WSCRATCH_CRIT,r8; \ BOOKE_LOAD_EXC_LEVEL_STACK(CRIT); /* r8 points to the debug stack */ \ lwz r10,GPR10(r8); \ lwz r11,GPR11(r8); \ - mfspr r8,CRIT_SPRG; \ + mfspr r8,SPRN_SPRG_RSCRATCH_CRIT; \ \ rfci; \ b .; \ diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 5bdcc06d294c..eca80482ae72 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -361,7 +361,7 @@ skpinv: addi r6,r6,1 /* Increment */ /* ptr to current thread */ addi r4,r2,THREAD /* init task's THREAD */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* stack */ lis r1,init_thread_union@h @@ -532,12 +532,12 @@ interrupt_base: /* Data TLB Error Interrupt */ START_EXCEPTION(DataTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_DEAR /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -557,7 +557,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: @@ -598,12 +598,12 @@ interrupt_base: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b DataStorage /* Instruction TLB Error Interrupt */ @@ -613,12 +613,12 @@ interrupt_base: * to a different point. */ START_EXCEPTION(InstructionTLBError) - mtspr SPRN_SPRG0, r10 /* Save some working registers */ - mtspr SPRN_SPRG1, r11 - mtspr SPRN_SPRG4W, r12 - mtspr SPRN_SPRG5W, r13 + mtspr SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */ + mtspr SPRN_SPRG_WSCRATCH1, r11 + mtspr SPRN_SPRG_WSCRATCH2, r12 + mtspr SPRN_SPRG_WSCRATCH3, r13 mfcr r11 - mtspr SPRN_SPRG7W, r11 + mtspr SPRN_SPRG_WSCRATCH4, r11 mfspr r10, SPRN_SRR0 /* Get faulting address */ /* If we are faulting a kernel address, we have to use the @@ -638,7 +638,7 @@ interrupt_base: /* Get the PGD for the current thread */ 3: - mfspr r11,SPRN_SPRG3 + mfspr r11,SPRN_SPRG_THREAD lwz r11,PGDIR(r11) 4: @@ -666,12 +666,12 @@ interrupt_base: /* The bailout. Restore registers to pre-exception conditions * and call the heavyweights to help us out. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 b InstructionStorage #ifdef CONFIG_SPE @@ -790,12 +790,12 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) tlbwe /* Done...restore registers and get out of here. */ - mfspr r11, SPRN_SPRG7R + mfspr r11, SPRN_SPRG_RSCRATCH4 mtcr r11 - mfspr r13, SPRN_SPRG5R - mfspr r12, SPRN_SPRG4R - mfspr r11, SPRN_SPRG1 - mfspr r10, SPRN_SPRG0 + mfspr r13, SPRN_SPRG_RSCRATCH3 + mfspr r12, SPRN_SPRG_RSCRATCH2 + mfspr r11, SPRN_SPRG_RSCRATCH1 + mfspr r10, SPRN_SPRG_RSCRATCH0 rfi /* Force context change */ #ifdef CONFIG_SPE @@ -839,7 +839,7 @@ load_up_spe: #endif /* !CONFIG_SMP */ /* enable use of SPE after return */ oris r9,r9,MSR_SPE@h - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ li r4,1 li r10,THREAD_ACC stw r4,THREAD_USED_SPE(r5) @@ -1118,7 +1118,7 @@ __secondary_start: /* ptr to current thread */ addi r4,r2,THREAD /* address of our thread_struct */ - mtspr SPRN_SPRG3,r4 + mtspr SPRN_SPRG_THREAD,r4 /* Setup the defaults for TLB entries */ li r4,(MAS4_TSIZED(BOOK3E_PAGESZ_4K))@l diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1f6816003ebe..91b89b8d63d8 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -142,11 +142,11 @@ early_param("smt-enabled", early_smt_enabled); #define check_smt_enabled() #endif /* CONFIG_SMP */ -/* Put the paca pointer into r13 and SPRG3 */ +/* Put the paca pointer into r13 and SPRG_PACA */ void __init setup_paca(int cpu) { local_paca = &paca[cpu]; - mtspr(SPRN_SPRG3, local_paca); + mtspr(SPRN_SPRG_PACA, local_paca); } /* diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index ea4d64644d02..67b6916f0e94 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -65,7 +65,7 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ oris r9,r9,MSR_VEC@h #else ld r4,PACACURRENT(r13) diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index d0c6f841bbd1..380a78cf484d 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -56,8 +56,8 @@ .macro KVM_HANDLER ivor_nr _GLOBAL(kvmppc_handler_\ivor_nr) /* Get pointer to vcpu and record exit number. */ - mtspr SPRN_SPRG0, r4 - mfspr r4, SPRN_SPRG1 + mtspr SPRN_SPRG_WSCRATCH0, r4 + mfspr r4, SPRN_SPRG_RVCPU stw r5, VCPU_GPR(r5)(r4) stw r6, VCPU_GPR(r6)(r4) mfctr r5 @@ -95,7 +95,7 @@ _GLOBAL(kvmppc_handler_len) /* Registers: - * SPRG0: guest r4 + * SPRG_SCRATCH0: guest r4 * r4: vcpu pointer * r5: KVM exit number */ @@ -181,7 +181,7 @@ _GLOBAL(kvmppc_resume_host) stw r3, VCPU_LR(r4) mfxer r3 stw r3, VCPU_XER(r4) - mfspr r3, SPRN_SPRG0 + mfspr r3, SPRN_SPRG_RSCRATCH0 stw r3, VCPU_GPR(r4)(r4) mfspr r3, SPRN_SRR0 stw r3, VCPU_PC(r4) @@ -374,7 +374,7 @@ lightweight_exit: mtspr SPRN_IVPR, r8 /* Save vcpu pointer for the exception handlers. */ - mtspr SPRN_SPRG1, r4 + mtspr SPRN_SPRG_WVCPU, r4 /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ @@ -384,13 +384,13 @@ lightweight_exit: /* Host interrupt handlers may have clobbered these guest-readable * SPRGs, so we need to reload them here with the guest's values. */ lwz r3, VCPU_SPRG4(r4) - mtspr SPRN_SPRG4, r3 + mtspr SPRN_SPRG4W, r3 lwz r3, VCPU_SPRG5(r4) - mtspr SPRN_SPRG5, r3 + mtspr SPRN_SPRG5W, r3 lwz r3, VCPU_SPRG6(r4) - mtspr SPRN_SPRG6, r3 + mtspr SPRN_SPRG6W, r3 lwz r3, VCPU_SPRG7(r4) - mtspr SPRN_SPRG7, r3 + mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING /* save enter time */ diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 14af8cedab70..b13d58932bf6 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -40,7 +40,7 @@ mmu_hash_lock: * The address is in r4, and r3 contains an access flag: * _PAGE_RW (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. - * SPRG3 contains the physical address of the current task's thread. + * SPRG_THREAD contains the physical address of the current task's thread. * * Returns to the caller if the access is illegal or there is no * mapping for the address. Otherwise it places an appropriate PTE @@ -68,7 +68,7 @@ _GLOBAL(hash_page) /* Get PTE (linux-style) and check access */ lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 - mfspr r8,SPRN_SPRG3 /* current task's THREAD (phys) */ + mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S index 2f581521eb9b..2b8075979237 100644 --- a/arch/powerpc/platforms/iseries/exception.S +++ b/arch/powerpc/platforms/iseries/exception.S @@ -47,7 +47,7 @@ system_reset_iSeries: LOAD_REG_ADDR(r13, paca) mulli r0,r23,PACA_SIZE add r13,r13,r0 - mtspr SPRN_SPRG3,r13 /* Save it away for the future */ + mtspr SPRN_SPRG_PACA,r13 /* Save it away for the future */ mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ @@ -116,7 +116,7 @@ iSeries_secondary_smp_loop: #endif /* CONFIG_SMP */ li r0,-1 /* r0=-1 indicates a Hypervisor call */ sc /* Invoke the hypervisor via a system call */ - mfspr r13,SPRN_SPRG3 /* Put r13 back ???? */ + mfspr r13,SPRN_SPRG_PACA /* Put r13 back ???? */ b 2b /* If SMP not configured, secondaries * loop forever */ @@ -126,9 +126,9 @@ iSeries_secondary_smp_loop: .globl data_access_iSeries data_access_iSeries: - mtspr SPRN_SPRG1,r13 + mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG2,r12 + mtspr SPRN_SPRG_SCRATCH1,r12 mfspr r13,SPRN_DAR mfspr r12,SPRN_DSISR srdi r13,r13,60 @@ -137,7 +137,7 @@ BEGIN_FTR_SECTION cmpwi r13,0x2c beq .do_stab_bolted_iSeries mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 END_FTR_SECTION_IFCLR(CPU_FTR_SLB) EXCEPTION_PROLOG_1(PACA_EXGEN) EXCEPTION_PROLOG_ISERIES_1 @@ -145,15 +145,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_SLB) .do_stab_bolted_iSeries: mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG2 + mfspr r12,SPRN_SPRG_SCRATCH1 EXCEPTION_PROLOG_1(PACA_EXSLB) EXCEPTION_PROLOG_ISERIES_1 b .do_stab_bolted .globl data_access_slb_iSeries data_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR std r9,PACA_EXSLB+EX_R9(r13) @@ -165,7 +165,7 @@ data_access_slb_iSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) ld r12,PACALPPACAPTR(r13) ld r12,LPPACASRR1(r12) @@ -175,8 +175,8 @@ data_access_slb_iSeries: .globl instruction_access_slb_iSeries instruction_access_slb_iSeries: - mtspr SPRN_SPRG1,r13 /* save r13 */ - mfspr r13,SPRN_SPRG3 /* get paca address into r13 */ + mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ + mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) ld r3,PACALPPACAPTR(r13) ld r3,LPPACASRR0(r3) /* get SRR0 value */ @@ -189,7 +189,7 @@ instruction_access_slb_iSeries: std r10,PACA_EXSLB+EX_R10(r13) std r11,PACA_EXSLB+EX_R11(r13) std r12,PACA_EXSLB+EX_R12(r13) - mfspr r10,SPRN_SPRG1 + mfspr r10,SPRN_SPRG_SCRATCH0 std r10,PACA_EXSLB+EX_R13(r13) ld r12,PACALPPACAPTR(r13) ld r12,LPPACASRR1(r12) @@ -200,7 +200,7 @@ slb_miss_user_iseries: std r10,PACA_EXGEN+EX_R10(r13) std r11,PACA_EXGEN+EX_R11(r13) std r12,PACA_EXGEN+EX_R12(r13) - mfspr r10,SPRG1 + mfspr r10,SPRG_SCRATCH0 ld r11,PACA_EXSLB+EX_R9(r13) ld r12,PACA_EXSLB+EX_R3(r13) std r10,PACA_EXGEN+EX_R13(r13) @@ -221,7 +221,7 @@ slb_miss_user_iseries: .globl system_call_iSeries system_call_iSeries: mr r9,r13 - mfspr r13,SPRN_SPRG3 + mfspr r13,SPRN_SPRG_PACA EXCEPTION_PROLOG_ISERIES_1 b system_call_common diff --git a/arch/powerpc/platforms/iseries/exception.h b/arch/powerpc/platforms/iseries/exception.h index e26eb86ac73d..bae3fba5ad8e 100644 --- a/arch/powerpc/platforms/iseries/exception.h +++ b/arch/powerpc/platforms/iseries/exception.h @@ -38,7 +38,7 @@ .globl label##_iSeries; \ label##_iSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_1(area); \ EXCEPTION_PROLOG_ISERIES_1; \ b label##_common @@ -47,7 +47,7 @@ label##_iSeries: \ .globl label##_iSeries; \ label##_iSeries: \ HMT_MEDIUM; \ - mtspr SPRN_SPRG1,r13; /* save r13 */ \ + mtspr SPRN_SPRG_SCRATCH0,r13; /* save r13 */ \ EXCEPTION_PROLOG_1(PACA_EXGEN); \ lbz r10,PACASOFTIRQEN(r13); \ cmpwi 0,r10,0; \ From c5a8c0c99f67ae8a784faafbaaea1529825796e2 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 16 Jul 2009 19:36:57 +0000 Subject: [PATCH 0122/1662] powerpc: Remove use of a second scratch SPRG in STAB code The STAB code used on Power3 and RS/64 uses a second scratch SPRG to save a GPR in order to decide whether to go to do_stab_bolted_* or to handle a normal data access exception. This prevents our scheme of freeing SPRG3 which is user visible for user uses since we cannot use SPRG0 which, on RS/64, seems to be read-only for supervisor mode (like POWER4). This reworks the STAB exception entry to use the PACA as temporary storage instead. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64s.h | 7 ++-- arch/powerpc/include/asm/reg.h | 3 +- arch/powerpc/kernel/exceptions-64s.S | 38 ++++++++++++++-------- arch/powerpc/platforms/iseries/exception.S | 37 +++++++++++++-------- 4 files changed, 55 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 773e380b5fe8..a98653b26231 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -66,8 +66,7 @@ std r9,area+EX_R13(r13); \ mfcr r9 -#define EXCEPTION_PROLOG_PSERIES(area, label) \ - EXCEPTION_PROLOG_1(area); \ +#define EXCEPTION_PROLOG_PSERIES_1(label) \ ld r12,PACAKBASE(r13); /* get high part of &label */ \ ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \ mfspr r11,SPRN_SRR0; /* save SRR0 */ \ @@ -78,6 +77,10 @@ rfid; \ b . /* prevent speculative execution */ +#define EXCEPTION_PROLOG_PSERIES(area, label) \ + EXCEPTION_PROLOG_1(area); \ + EXCEPTION_PROLOG_PSERIES_1(label); + /* * The common exception prolog is used for all except a few exceptions * such as a segment miss on a kernel address. We have to be prepared diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index a8179cc99ac4..d17af2b3d4ce 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -654,7 +654,7 @@ * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) * - SPRG1 scratch for exception vectors - * - SPRG2 scratch for exception vectors + * - SPRG2 unused * * All 32-bit: * - SPRG3 current thread_info pointer @@ -707,7 +707,6 @@ #ifdef CONFIG_PPC_BOOK3S_64 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 -#define SPRN_SPRG_SCRATCH1 SPRN_SPRG2 #endif #ifdef CONFIG_PPC_BOOK3S_32 diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 4e9640cc0563..50f2ad36ed09 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -50,18 +50,28 @@ data_access_pSeries: HMT_MEDIUM mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG_SCRATCH1,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c + mfspr r13,SPRN_SPRG_PACA + std r9,PACA_EXSLB+EX_R9(r13) + std r10,PACA_EXSLB+EX_R10(r13) + mfspr r10,SPRN_DAR + mfspr r9,SPRN_DSISR + srdi r10,r10,60 + rlwimi r10,r9,16,0x20 + mfcr r9 + cmpwi r10,0x2c beq do_stab_bolted_pSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + ld r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + ld r11,PACA_EXSLB+EX_R9(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r12,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R13(r13) + EXCEPTION_PROLOG_PSERIES_1(data_access_common) +FTR_SECTION_ELSE EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB) . = 0x380 .globl data_access_slb_pSeries @@ -224,9 +234,11 @@ masked_interrupt: .align 7 do_stab_bolted_pSeries: - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 - EXCEPTION_PROLOG_PSERIES(PACA_EXSLB, .do_stab_bolted) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXSLB+EX_R13(r13) + EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted) #ifdef CONFIG_PPC_PSERIES /* diff --git a/arch/powerpc/platforms/iseries/exception.S b/arch/powerpc/platforms/iseries/exception.S index 2b8075979237..5369653dcf6a 100644 --- a/arch/powerpc/platforms/iseries/exception.S +++ b/arch/powerpc/platforms/iseries/exception.S @@ -128,25 +128,36 @@ iSeries_secondary_smp_loop: data_access_iSeries: mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION - mtspr SPRN_SPRG_SCRATCH1,r12 - mfspr r13,SPRN_DAR - mfspr r12,SPRN_DSISR - srdi r13,r13,60 - rlwimi r13,r12,16,0x20 - mfcr r12 - cmpwi r13,0x2c + mfspr r13,SPRN_SPRG_PACA + std r9,PACA_EXSLB+EX_R9(r13) + std r10,PACA_EXSLB+EX_R10(r13) + mfspr r10,SPRN_DAR + mfspr r9,SPRN_DSISR + srdi r10,r10,60 + rlwimi r10,r9,16,0x20 + mfcr r9 + cmpwi r10,0x2c beq .do_stab_bolted_iSeries - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 -END_FTR_SECTION_IFCLR(CPU_FTR_SLB) + ld r10,PACA_EXSLB+EX_R10(r13) + std r11,PACA_EXGEN+EX_R11(r13) + ld r11,PACA_EXSLB+EX_R9(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfspr r12,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXGEN+EX_R10(r13) + std r11,PACA_EXGEN+EX_R9(r13) + std r12,PACA_EXGEN+EX_R13(r13) + EXCEPTION_PROLOG_ISERIES_1 +FTR_SECTION_ELSE EXCEPTION_PROLOG_1(PACA_EXGEN) EXCEPTION_PROLOG_ISERIES_1 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB) b data_access_common .do_stab_bolted_iSeries: - mtcrf 0x80,r12 - mfspr r12,SPRN_SPRG_SCRATCH1 - EXCEPTION_PROLOG_1(PACA_EXSLB) + std r11,PACA_EXSLB+EX_R11(r13) + std r12,PACA_EXSLB+EX_R12(r13) + mfspr r10,SPRN_SPRG_SCRATCH0 + std r10,PACA_EXSLB+EX_R13(r13) EXCEPTION_PROLOG_ISERIES_1 b .do_stab_bolted From 2e2ddb24d36106e029f6eeb3df611178a36fb295 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Jul 2009 00:17:17 +0000 Subject: [PATCH 0123/1662] powerpc/cell: Replace strncpy by strlcpy Replace strncpy() and explicit null-termination by strlcpy() Signed-off-by: Roel Kluin Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/cell/celleb_setup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/cell/celleb_setup.c b/arch/powerpc/platforms/cell/celleb_setup.c index 07c234f6b2b6..e53845579770 100644 --- a/arch/powerpc/platforms/cell/celleb_setup.c +++ b/arch/powerpc/platforms/cell/celleb_setup.c @@ -80,8 +80,7 @@ static void celleb_show_cpuinfo(struct seq_file *m) static int __init celleb_machine_type_hack(char *ptr) { - strncpy(celleb_machine_type, ptr, sizeof(celleb_machine_type)); - celleb_machine_type[sizeof(celleb_machine_type)-1] = 0; + strlcpy(celleb_machine_type, ptr, sizeof(celleb_machine_type)); return 0; } From 066c4b87e927985a083481c92b4aebade8fe4ab3 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 21 Jul 2009 15:25:53 +0000 Subject: [PATCH 0124/1662] powerpc/mm: Fix definitions of FORCE_MAX_ZONEORDER in Kconfig The current definitions set ranges and defaults for 32 and 64-bit only using "PPC_STD_MMU" which means hash based MMU. This uselessly restrict the usefulness for the upcoming 64-bit BookE port, but more than that, it's broken on 32-bit since the only 32-bit platform supporting multiple page sizes currently is 44x which does -not- have PPC_STD_MMU_32 set. This fixes it by using PPC64 and PPC32 instead. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d00131ca0835..52349ef1b3a7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -492,16 +492,16 @@ endchoice config FORCE_MAX_ZONEORDER int "Maximum zone order" - range 9 64 if PPC_STD_MMU_64 && PPC_64K_PAGES - default "9" if PPC_STD_MMU_64 && PPC_64K_PAGES - range 13 64 if PPC_STD_MMU_64 && !PPC_64K_PAGES - default "13" if PPC_STD_MMU_64 && !PPC_64K_PAGES - range 9 64 if PPC_STD_MMU_32 && PPC_16K_PAGES - default "9" if PPC_STD_MMU_32 && PPC_16K_PAGES - range 7 64 if PPC_STD_MMU_32 && PPC_64K_PAGES - default "7" if PPC_STD_MMU_32 && PPC_64K_PAGES - range 5 64 if PPC_STD_MMU_32 && PPC_256K_PAGES - default "5" if PPC_STD_MMU_32 && PPC_256K_PAGES + range 9 64 if PPC64 && PPC_64K_PAGES + default "9" if PPC64 && PPC_64K_PAGES + range 13 64 if PPC64 && !PPC_64K_PAGES + default "13" if PPC64 && !PPC_64K_PAGES + range 9 64 if PPC32 && PPC_16K_PAGES + default "9" if PPC32 && PPC_16K_PAGES + range 7 64 if PPC32 && PPC_64K_PAGES + default "7" if PPC32 && PPC_64K_PAGES + range 5 64 if PPC32 && PPC_256K_PAGES + default "5" if PPC32 && PPC_256K_PAGES range 11 64 default "11" help From 527b3639616b21c257518ee7c26fbf05232db0c0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:56:58 +0000 Subject: [PATCH 0125/1662] powerpc/pmac: Fix PowerSurge SMP IPI allocation The code for setting up the IPIs for SMP PowerSurge marchines bitrot, it needs to properly map the HW interrupt number Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powermac/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 6d4da7b46b41..937a38e73178 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -408,7 +408,7 @@ static void __init smp_psurge_setup_cpu(int cpu_nr) /* reset the entry point so if we get another intr we won't * try to startup again */ out_be32(psurge_start, 0x100); - if (setup_irq(30, &psurge_irqaction)) + if (setup_irq(irq_create_mapping(NULL, 30), &psurge_irqaction)) printk(KERN_ERR "Couldn't get primary IPI interrupt"); } From 063517bea114d4cb57bf582353d0a99b82775a63 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 14 Jul 2009 20:52:56 +0000 Subject: [PATCH 0126/1662] powerpc: Change PACA from SPRG3 to SPRG1 This change the SPRG used to store the PACA on ppc64 from SPRG3 to SPRG1. SPRG3 is user readable on most processors and we want to use it for other things. We change the scratch SPRG used by exception vectors from SRPG1 to SPRG2. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index d17af2b3d4ce..2cedbb427618 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -649,12 +649,12 @@ * SPRG usage: * * All 64-bit: - * - SPRG3 stores PACA pointer + * - SPRG1 stores PACA pointer * * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) - * - SPRG1 scratch for exception vectors - * - SPRG2 unused + * - SPRG2 scratch for exception vectors + * - SPRG3 unused (user visible) * * All 32-bit: * - SPRG3 current thread_info pointer @@ -700,13 +700,13 @@ * */ #ifdef CONFIG_PPC64 -#define SPRN_SPRG_PACA SPRN_SPRG3 +#define SPRN_SPRG_PACA SPRN_SPRG1 #else #define SPRN_SPRG_THREAD SPRN_SPRG3 #endif #ifdef CONFIG_PPC_BOOK3S_64 -#define SPRN_SPRG_SCRATCH0 SPRN_SPRG1 +#define SPRN_SPRG_SCRATCH0 SPRN_SPRG2 #endif #ifdef CONFIG_PPC_BOOK3S_32 From f7d4f68d971b8234491b4a0be58aa6f659c1c194 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Thu, 23 Jul 2009 08:57:18 +0000 Subject: [PATCH 0127/1662] powerpc: Makefile simplification through use of cc-ifversion Signed-off-by: Frans Pop Acked-by: Sam Ravnborg Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index bc35f4e2b81c..952a3963e9e8 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -77,7 +77,7 @@ CPP = $(CC) -E $(KBUILD_CFLAGS) CHECKFLAGS += -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__ ifeq ($(CONFIG_PPC64),y) -GCC_BROKEN_VEC := $(shell if [ $(call cc-version) -lt 0400 ] ; then echo "y"; fi) +GCC_BROKEN_VEC := $(call cc-ifversion, -lt, 0400, y) ifeq ($(CONFIG_POWER4_ONLY),y) ifeq ($(CONFIG_ALTIVEC),y) From c79b29735d28d819380b584d6707b4110ee759f3 Mon Sep 17 00:00:00 2001 From: Lucian Adrian Grijincu Date: Thu, 23 Jul 2009 00:13:37 +0000 Subject: [PATCH 0128/1662] powerpc: Update boot wrapper script with the new location of dtc dtc was moved in 9fffb55f66127b52c937ede5196ebfa0c0d50bce from arch/powerpc/boot/ to scripts/dtc/ This patch updates the wrapper script to point to the new location of dtc. Signed-off-by: Lucian Adrian Grijincu Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/wrapper | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 4db487d1d2a8..ac9e9a58b2b0 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -46,6 +46,7 @@ CROSS= # directory for object and other files used by this script object=arch/powerpc/boot objbin=$object +dtc=scripts/dtc/dtc # directory for working files tmpdir=. @@ -124,7 +125,7 @@ if [ -n "$dts" ]; then if [ -z "$dtb" ]; then dtb="$platform.dtb" fi - $object/dtc -O dtb -o "$dtb" -b 0 "$dts" + $dtc -O dtb -o "$dtb" -b 0 "$dts" fi if [ -z "$kernel" ]; then From dd90bbd5fb763ab8924135a30956030c7a7b94fc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Jul 2009 11:54:32 +1000 Subject: [PATCH 0129/1662] powerpc: Add compat_sys_truncate The truncate syscall has a signed long parameter, so when using a 32- bit userspace with a 64-bit kernel the argument is zero-extended instead of sign-extended. Adding the compat_sys_truncate function fixes the issue. This was noticed during an LSB truncate test failure. The test was checking for the correct error number set when truncate is called with a length of -1. The test can be found at: http://bzr.linuxfoundation.org/lsb/devel/runtime-test?cmd=inventory;rev=stewb%40linux-foundation.org-20090626205411-sfb23cc0tjj7jzgm;path=modules/vsx-pcts/tset/POSIX.os/files/truncate/ BenH: Added compat_sys_ftruncate() as well, same issue. Signed-off-by: Chase Douglas Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/systbl.h | 4 ++-- arch/powerpc/kernel/sys_ppc32.c | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 370600ca2765..ed24bd92fe49 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -95,8 +95,8 @@ SYSCALL(reboot) SYSX(sys_ni_syscall,compat_sys_old_readdir,sys_old_readdir) SYSCALL_SPU(mmap) SYSCALL_SPU(munmap) -SYSCALL_SPU(truncate) -SYSCALL_SPU(ftruncate) +COMPAT_SYS_SPU(truncate) +COMPAT_SYS_SPU(ftruncate) SYSCALL_SPU(fchmod) SYSCALL_SPU(fchown) COMPAT_SYS_SPU(getpriority) diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index bb1cfcfdbbbb..1cc5e9e5da96 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -343,6 +343,18 @@ off_t ppc32_lseek(unsigned int fd, u32 offset, unsigned int origin) return sys_lseek(fd, (int)offset, origin); } +long compat_sys_truncate(const char __user * path, u32 length) +{ + /* sign extend length */ + return sys_truncate(path, (int)length); +} + +long compat_sys_ftruncate(int fd, u32 length) +{ + /* sign extend length */ + return sys_ftruncate(fd, (int)length); +} + /* Note: it is necessary to treat bufsiz as an unsigned int, * with the corresponding cast to a signed int to insure that the * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) From 7d60b02cc7e6d67b498eed9ecb58010f61422325 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:04 +0000 Subject: [PATCH 0130/1662] powerpc/mm: Fix misplaced #endif in pgtable-ppc64-64k.h A misplaced #endif causes more definitions than intended to be protected by #ifndef __ASSEMBLY__. This breaks upcoming 64-bit BookE support patch when using 64k pages. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc64-64k.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64-64k.h b/arch/powerpc/include/asm/pgtable-ppc64-64k.h index 6cc085b945a5..90533ddcd703 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64-64k.h +++ b/arch/powerpc/include/asm/pgtable-ppc64-64k.h @@ -10,10 +10,10 @@ #define PGD_INDEX_SIZE 4 #ifndef __ASSEMBLY__ - #define PTE_TABLE_SIZE (sizeof(real_pte_t) << PTE_INDEX_SIZE) #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE) #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE) +#endif /* __ASSEMBLY__ */ #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) @@ -32,8 +32,6 @@ #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#endif /* __ASSEMBLY__ */ - /* Bits to mask out from a PMD to get to the PTE page */ #define PMD_MASKED_BITS 0x1ff /* Bits to mask out from a PGD/PUD to get to the PMD page */ From 6c1719942e19936044c4673b18afa26e45a02320 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:07 +0000 Subject: [PATCH 0131/1662] powerpc/of: Remove useless register save/restore when calling OF back enter_prom() used to save and restore registers such as CTR, XER etc.. which are volatile, or SRR0,1... which we don't care about. This removes a bunch of useless code and while at it turns an mtmsrd into an MTMSRD macro which will be useful to Book3E. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_64.S | 38 ++++++---------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index dbf0e3115611..1cb0f3d1714b 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -823,30 +823,17 @@ _GLOBAL(enter_prom) * of all registers that it saves. We therefore save those registers * PROM might touch to the stack. (r0, r3-r13 are caller saved) */ - SAVE_8GPRS(2, r1) + SAVE_GPR(2, r1) SAVE_GPR(13, r1) SAVE_8GPRS(14, r1) SAVE_10GPRS(22, r1) - mfcr r4 - std r4,_CCR(r1) - mfctr r5 - std r5,_CTR(r1) - mfspr r6,SPRN_XER - std r6,_XER(r1) - mfdar r7 - std r7,_DAR(r1) - mfdsisr r8 - std r8,_DSISR(r1) - mfsrr0 r9 - std r9,_SRR0(r1) - mfsrr1 r10 - std r10,_SRR1(r1) + mfcr r10 mfmsr r11 + std r10,_CCR(r1) std r11,_MSR(r1) /* Get the PROM entrypoint */ - ld r0,GPR4(r1) - mtlr r0 + mtlr r4 /* Switch MSR to 32 bits mode */ @@ -860,8 +847,7 @@ _GLOBAL(enter_prom) mtmsrd r11 isync - /* Restore arguments & enter PROM here... */ - ld r3,GPR3(r1) + /* Enter PROM here... */ blrl /* Just make sure that r1 top 32 bits didn't get @@ -871,7 +857,7 @@ _GLOBAL(enter_prom) /* Restore the MSR (back to 64 bits) */ ld r0,_MSR(r1) - mtmsrd r0 + MTMSRD(r0) isync /* Restore other registers */ @@ -881,18 +867,6 @@ _GLOBAL(enter_prom) REST_10GPRS(22, r1) ld r4,_CCR(r1) mtcr r4 - ld r5,_CTR(r1) - mtctr r5 - ld r6,_XER(r1) - mtspr SPRN_XER,r6 - ld r7,_DAR(r1) - mtdar r7 - ld r8,_DSISR(r1) - mtdsisr r8 - ld r9,_SRR0(r1) - mtsrr0 r9 - ld r10,_SRR1(r1) - mtsrr1 r10 addi r1,r1,PROM_FRAME_SIZE ld r0,16(r1) From fcce810986b3f32a8322faf240f8cc5560a4c463 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:10 +0000 Subject: [PATCH 0132/1662] powerpc/mm: Add HW threads support to no_hash TLB management The current "no hash" MMU context management code is written with the assumption that one CPU == one TLB. This is not the case on implementations that support HW multithreading, where several linux CPUs can share the same TLB. This adds some basic support for this to our context management and our TLB flushing code. It also cleans up the optional debugging output a bit Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/cputhreads.h | 16 +++++ arch/powerpc/mm/mmu_context_nohash.c | 93 ++++++++++++++++++--------- arch/powerpc/mm/tlb_nohash.c | 10 ++- 3 files changed, 86 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/cputhreads.h b/arch/powerpc/include/asm/cputhreads.h index fb11b0c459b8..a8e18447c62b 100644 --- a/arch/powerpc/include/asm/cputhreads.h +++ b/arch/powerpc/include/asm/cputhreads.h @@ -5,6 +5,15 @@ /* * Mapping of threads to cores + * + * Note: This implementation is limited to a power of 2 number of + * threads per core and the same number for each core in the system + * (though it would work if some processors had less threads as long + * as the CPU numbers are still allocated, just not brought offline). + * + * However, the API allows for a different implementation in the future + * if needed, as long as you only use the functions and not the variables + * directly. */ #ifdef CONFIG_SMP @@ -67,5 +76,12 @@ static inline int cpu_first_thread_in_core(int cpu) return cpu & ~(threads_per_core - 1); } +static inline int cpu_last_thread_in_core(int cpu) +{ + return cpu | (threads_per_core - 1); +} + + + #endif /* _ASM_POWERPC_CPUTHREADS_H */ diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index b1a727def15b..834436d6d6b8 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -25,10 +25,20 @@ * also clear mm->cpu_vm_mask bits when processes are migrated */ -#undef DEBUG -#define DEBUG_STEAL_ONLY -#undef DEBUG_MAP_CONSISTENCY -/*#define DEBUG_CLAMP_LAST_CONTEXT 15 */ +#define DEBUG_MAP_CONSISTENCY +#define DEBUG_CLAMP_LAST_CONTEXT 31 +//#define DEBUG_HARDER + +/* We don't use DEBUG because it tends to be compiled in always nowadays + * and this would generate way too much output + */ +#ifdef DEBUG_HARDER +#define pr_hard(args...) printk(KERN_DEBUG args) +#define pr_hardcont(args...) printk(KERN_CONT args) +#else +#define pr_hard(args...) do { } while(0) +#define pr_hardcont(args...) do { } while(0) +#endif #include #include @@ -71,7 +81,7 @@ static DEFINE_SPINLOCK(context_lock); static unsigned int steal_context_smp(unsigned int id) { struct mm_struct *mm; - unsigned int cpu, max; + unsigned int cpu, max, i; max = last_context - first_context; @@ -89,15 +99,22 @@ static unsigned int steal_context_smp(unsigned int id) id = first_context; continue; } - pr_devel("[%d] steal context %d from mm @%p\n", - smp_processor_id(), id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Mark this mm has having no context anymore */ mm->context.id = MMU_NO_CONTEXT; - /* Mark it stale on all CPUs that used this mm */ - for_each_cpu(cpu, mm_cpumask(mm)) - __set_bit(id, stale_map[cpu]); + /* Mark it stale on all CPUs that used this mm. For threaded + * implementations, we set it on all threads on each core + * represented in the mask. A future implementation will use + * a core map instead but this will do for now. + */ + for_each_cpu(cpu, mm_cpumask(mm)) { + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) + __set_bit(id, stale_map[i]); + cpu = i - 1; + } return id; } @@ -126,7 +143,7 @@ static unsigned int steal_context_up(unsigned int id) /* Pick up the victim mm */ mm = context_mm[id]; - pr_devel("[%d] steal context %d from mm @%p\n", cpu, id, mm); + pr_hardcont(" | steal %d from 0x%p", id, mm); /* Flush the TLB for that context */ local_flush_tlb_mm(mm); @@ -179,19 +196,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* No lockless fast path .. yet */ spin_lock(&context_lock); -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] activating context for mm @%p, active=%d, id=%d\n", - cpu, next, next->context.active, next->context.id); -#endif + pr_hard("[%d] activating context for mm @%p, active=%d, id=%d", + cpu, next, next->context.active, next->context.id); #ifdef CONFIG_SMP /* Mark us active and the previous one not anymore */ next->context.active++; if (prev) { -#ifndef DEBUG_STEAL_ONLY - pr_devel(" old context %p active was: %d\n", - prev, prev->context.active); -#endif + pr_hardcont(" (old=0x%p a=%d)", prev, prev->context.active); WARN_ON(prev->context.active < 1); prev->context.active--; } @@ -201,8 +213,14 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) /* If we already have a valid assigned context, skip all that */ id = next->context.id; - if (likely(id != MMU_NO_CONTEXT)) + if (likely(id != MMU_NO_CONTEXT)) { +#ifdef DEBUG_MAP_CONSISTENCY + if (context_mm[id] != next) + pr_err("MMU: mm 0x%p has id %d but context_mm[%d] says 0x%p\n", + next, id, id, context_mm[id]); +#endif goto ctxt_ok; + } /* We really don't have a context, let's try to acquire one */ id = next_context; @@ -235,11 +253,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) next_context = id + 1; context_mm[id] = next; next->context.id = id; - -#ifndef DEBUG_STEAL_ONLY - pr_devel("[%d] picked up new id %d, nrf is now %d\n", - cpu, id, nr_free_contexts); -#endif + pr_hardcont(" | new id=%d,nrf=%d", id, nr_free_contexts); context_check_map(); ctxt_ok: @@ -248,15 +262,20 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) * local TLB for it and unmark it before we use it */ if (test_bit(id, stale_map[cpu])) { - pr_devel("[%d] flushing stale context %d for mm @%p !\n", - cpu, id, next); + pr_hardcont(" | stale flush %d [%d..%d]", + id, cpu_first_thread_in_core(cpu), + cpu_last_thread_in_core(cpu)); + local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - __clear_bit(id, stale_map[cpu]); + for (cpu = cpu_first_thread_in_core(cpu); + cpu <= cpu_last_thread_in_core(cpu); cpu++) + __clear_bit(id, stale_map[cpu]); } /* Flick the MMU and release lock */ + pr_hardcont(" -> %d\n", id); set_context(id, next->pgd); spin_unlock(&context_lock); } @@ -266,6 +285,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) */ int init_new_context(struct task_struct *t, struct mm_struct *mm) { + pr_hard("initing context for mm @%p\n", mm); + mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; @@ -305,7 +326,9 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned int)(long)hcpu; - +#ifdef CONFIG_HOTPLUG_CPU + struct task_struct *p; +#endif /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ @@ -324,8 +347,16 @@ static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self, pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); kfree(stale_map[cpu]); stale_map[cpu] = NULL; - break; -#endif + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + read_lock(&tasklist_lock); + for_each_process(p) { + if (p->mm) + cpu_mask_clear_cpu(cpu, mm_cpumask(p->mm)); + } + read_unlock(&tasklist_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ } return NOTIFY_OK; } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ad2eb4d34dd4..d908e75cc3b5 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -87,6 +87,12 @@ EXPORT_SYMBOL(local_flush_tlb_page); static DEFINE_SPINLOCK(tlbivax_lock); +static int mm_is_core_local(struct mm_struct *mm) +{ + return cpumask_subset(mm_cpumask(mm), + topology_thread_cpumask(smp_processor_id())); +} + struct tlb_flush_param { unsigned long addr; unsigned int pid; @@ -131,7 +137,7 @@ void flush_tlb_mm(struct mm_struct *mm) pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; - if (!cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { struct tlb_flush_param p = { .pid = pid }; /* Ignores smp_processor_id() even if set. */ smp_call_function_many(mm_cpumask(mm), @@ -153,7 +159,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; cpu_mask = mm_cpumask(vma->vm_mm); - if (!cpumask_equal(cpu_mask, cpumask_of(smp_processor_id()))) { + if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); From 29c09e8fbaf65698c51aeffe34acc284a454a38f Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:11 +0000 Subject: [PATCH 0133/1662] powerpc/mm: Add opcode definitions for tlbivax and tlbsrx. This adds the opcode definitions to ppc-opcode.h for the two instructions tlbivax and tlbsrx. as defined by Book3E 2.06 Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/ppc-opcode.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index b74f16d45cb4..ef9aa84cac5a 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -48,6 +48,8 @@ #define PPC_INST_TLBIE 0x7c000264 #define PPC_INST_TLBILX 0x7c000024 #define PPC_INST_WAIT 0x7c00007c +#define PPC_INST_TLBIVAX 0x7c000624 +#define PPC_INST_TLBSRX_DOT 0x7c0006a5 /* macros to insert fields into opcodes */ #define __PPC_RA(a) (((a) & 0x1f) << 16) @@ -76,6 +78,10 @@ __PPC_WC(w)) #define PPC_TLBIE(lp,a) stringify_in_c(.long PPC_INST_TLBIE | \ __PPC_RB(a) | __PPC_RS(lp)) +#define PPC_TLBSRX_DOT(a,b) stringify_in_c(.long PPC_INST_TLBSRX_DOT | \ + __PPC_RA(a) | __PPC_RB(b)) +#define PPC_TLBIVAX(a,b) stringify_in_c(.long PPC_INST_TLBIVAX | \ + __PPC_RA(a) | __PPC_RB(b)) /* * Define what the VSX XX1 form instructions will look like, then add From 1fe1a21005c14ad772caeb9005580f473c4b6c57 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:12 +0000 Subject: [PATCH 0134/1662] powerpc/mm: Add more bit definitions for Book3E MMU registers This adds various additional bit definitions for various MMU related SPRs used on Book3E. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 158 +++++++++++++++++++------- 1 file changed, 114 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 7e74cff81d86..42a39b4aacec 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -38,58 +38,128 @@ #define BOOK3E_PAGESZ_1TB 30 #define BOOK3E_PAGESZ_2TB 31 -#define MAS0_TLBSEL(x) ((x << 28) & 0x30000000) -#define MAS0_ESEL(x) ((x << 16) & 0x0FFF0000) -#define MAS0_NV(x) ((x) & 0x00000FFF) +/* MAS registers bit definitions */ -#define MAS1_VALID 0x80000000 -#define MAS1_IPROT 0x40000000 -#define MAS1_TID(x) ((x << 16) & 0x3FFF0000) -#define MAS1_IND 0x00002000 -#define MAS1_TS 0x00001000 -#define MAS1_TSIZE(x) ((x << 7) & 0x00000F80) +#define MAS0_TLBSEL(x) ((x << 28) & 0x30000000) +#define MAS0_ESEL(x) ((x << 16) & 0x0FFF0000) +#define MAS0_NV(x) ((x) & 0x00000FFF) +#define MAS0_HES 0x00004000 +#define MAS0_WQ_ALLWAYS 0x00000000 +#define MAS0_WQ_COND 0x00001000 +#define MAS0_WQ_CLR_RSRV 0x00002000 -#define MAS2_EPN 0xFFFFF000 -#define MAS2_X0 0x00000040 -#define MAS2_X1 0x00000020 -#define MAS2_W 0x00000010 -#define MAS2_I 0x00000008 -#define MAS2_M 0x00000004 -#define MAS2_G 0x00000002 -#define MAS2_E 0x00000001 +#define MAS1_VALID 0x80000000 +#define MAS1_IPROT 0x40000000 +#define MAS1_TID(x) ((x << 16) & 0x3FFF0000) +#define MAS1_IND 0x00002000 +#define MAS1_TS 0x00001000 +#define MAS1_TSIZE_MASK 0x00000f80 +#define MAS1_TSIZE_SHIFT 7 +#define MAS1_TSIZE(x) ((x << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) + +#define MAS2_EPN 0xFFFFF000 +#define MAS2_X0 0x00000040 +#define MAS2_X1 0x00000020 +#define MAS2_W 0x00000010 +#define MAS2_I 0x00000008 +#define MAS2_M 0x00000004 +#define MAS2_G 0x00000002 +#define MAS2_E 0x00000001 #define MAS2_EPN_MASK(size) (~0 << (size + 10)) #define MAS2_VAL(addr, size, flags) ((addr) & MAS2_EPN_MASK(size) | (flags)) -#define MAS3_RPN 0xFFFFF000 -#define MAS3_U0 0x00000200 -#define MAS3_U1 0x00000100 -#define MAS3_U2 0x00000080 -#define MAS3_U3 0x00000040 -#define MAS3_UX 0x00000020 -#define MAS3_SX 0x00000010 -#define MAS3_UW 0x00000008 -#define MAS3_SW 0x00000004 -#define MAS3_UR 0x00000002 -#define MAS3_SR 0x00000001 +#define MAS3_RPN 0xFFFFF000 +#define MAS3_U0 0x00000200 +#define MAS3_U1 0x00000100 +#define MAS3_U2 0x00000080 +#define MAS3_U3 0x00000040 +#define MAS3_UX 0x00000020 +#define MAS3_SX 0x00000010 +#define MAS3_UW 0x00000008 +#define MAS3_SW 0x00000004 +#define MAS3_UR 0x00000002 +#define MAS3_SR 0x00000001 +#define MAS3_SPSIZE 0x0000003e +#define MAS3_SPSIZE_SHIFT 1 -#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) -#define MAS4_INDD 0x00008000 -#define MAS4_TSIZED(x) MAS1_TSIZE(x) -#define MAS4_X0D 0x00000040 -#define MAS4_X1D 0x00000020 -#define MAS4_WD 0x00000010 -#define MAS4_ID 0x00000008 -#define MAS4_MD 0x00000004 -#define MAS4_GD 0x00000002 -#define MAS4_ED 0x00000001 +#define MAS4_TLBSELD(x) MAS0_TLBSEL(x) +#define MAS4_INDD 0x00008000 /* Default IND */ +#define MAS4_TSIZED(x) MAS1_TSIZE(x) +#define MAS4_X0D 0x00000040 +#define MAS4_X1D 0x00000020 +#define MAS4_WD 0x00000010 +#define MAS4_ID 0x00000008 +#define MAS4_MD 0x00000004 +#define MAS4_GD 0x00000002 +#define MAS4_ED 0x00000001 +#define MAS4_WIMGED_MASK 0x0000001f /* Default WIMGE */ +#define MAS4_WIMGED_SHIFT 0 +#define MAS4_VLED MAS4_X1D /* Default VLE */ +#define MAS4_ACMD 0x000000c0 /* Default ACM */ +#define MAS4_ACMD_SHIFT 6 +#define MAS4_TSIZED_MASK 0x00000f80 /* Default TSIZE */ +#define MAS4_TSIZED_SHIFT 7 -#define MAS6_SPID0 0x3FFF0000 -#define MAS6_SPID1 0x00007FFE -#define MAS6_ISIZE(x) MAS1_TSIZE(x) -#define MAS6_SAS 0x00000001 -#define MAS6_SPID MAS6_SPID0 +#define MAS6_SPID0 0x3FFF0000 +#define MAS6_SPID1 0x00007FFE +#define MAS6_ISIZE(x) MAS1_TSIZE(x) +#define MAS6_SAS 0x00000001 +#define MAS6_SPID MAS6_SPID0 +#define MAS6_SIND 0x00000002 /* Indirect page */ +#define MAS6_SIND_SHIFT 1 +#define MAS6_SPID_MASK 0x3fff0000 +#define MAS6_SPID_SHIFT 16 +#define MAS6_ISIZE_MASK 0x00000f80 +#define MAS6_ISIZE_SHIFT 7 -#define MAS7_RPN 0xFFFFFFFF +#define MAS7_RPN 0xFFFFFFFF + +/* TLBnCFG encoding */ +#define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ +#define TLBnCFG_HES 0x00002000 /* HW select supported */ +#define TLBnCFG_IPROT 0x00008000 /* IPROT supported */ +#define TLBnCFG_GTWE 0x00010000 /* Guest can write */ +#define TLBnCFG_IND 0x00020000 /* IND entries supported */ +#define TLBnCFG_PT 0x00040000 /* Can load from page table */ +#define TLBnCFG_ASSOC 0xff000000 /* Associativity */ + +/* TLBnPS encoding */ +#define TLBnPS_4K 0x00000004 +#define TLBnPS_8K 0x00000008 +#define TLBnPS_16K 0x00000010 +#define TLBnPS_32K 0x00000020 +#define TLBnPS_64K 0x00000040 +#define TLBnPS_128K 0x00000080 +#define TLBnPS_256K 0x00000100 +#define TLBnPS_512K 0x00000200 +#define TLBnPS_1M 0x00000400 +#define TLBnPS_2M 0x00000800 +#define TLBnPS_4M 0x00001000 +#define TLBnPS_8M 0x00002000 +#define TLBnPS_16M 0x00004000 +#define TLBnPS_32M 0x00008000 +#define TLBnPS_64M 0x00010000 +#define TLBnPS_128M 0x00020000 +#define TLBnPS_256M 0x00040000 +#define TLBnPS_512M 0x00080000 +#define TLBnPS_1G 0x00100000 +#define TLBnPS_2G 0x00200000 +#define TLBnPS_4G 0x00400000 +#define TLBnPS_8G 0x00800000 +#define TLBnPS_16G 0x01000000 +#define TLBnPS_32G 0x02000000 +#define TLBnPS_64G 0x04000000 +#define TLBnPS_128G 0x08000000 +#define TLBnPS_256G 0x10000000 + +/* tlbilx action encoding */ +#define TLBILX_T_ALL 0 +#define TLBILX_T_TID 1 +#define TLBILX_T_FULLMATCH 3 +#define TLBILX_T_CLASS0 4 +#define TLBILX_T_CLASS1 5 +#define TLBILX_T_CLASS2 6 +#define TLBILX_T_CLASS3 7 #ifndef __ASSEMBLY__ From a245067e204f69c69abf92d94fc45ec65bf1f07e Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:16 +0000 Subject: [PATCH 0135/1662] powerpc/mm: Add support for early ioremap on non-hash 64-bit processors This adds some code to do early ioremap's using page tables instead of bolting entries in the hash table. This will be used by the upcoming 64-bits BookE port. The patch also changes the test for early vs. late ioremap to use slab_is_available() instead of our old hackish mem_init_done. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable_64.c | 59 +++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index bfa7db6b2fd5..93ed1a3c8729 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include #include @@ -55,19 +57,36 @@ unsigned long ioremap_bot = IOREMAP_BASE; + +#ifdef CONFIG_PPC_MMU_NOHASH +static void *early_alloc_pgtable(unsigned long size) +{ + void *pt; + + if (init_bootmem_done) + pt = __alloc_bootmem(size, size, __pa(MAX_DMA_ADDRESS)); + else + pt = __va(lmb_alloc_base(size, size, + __pa(MAX_DMA_ADDRESS))); + memset(pt, 0, size); + + return pt; +} +#endif /* CONFIG_PPC_MMU_NOHASH */ + /* - * map_io_page currently only called by __ioremap - * map_io_page adds an entry to the ioremap page table + * map_kernel_page currently only called by __ioremap + * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_io_page(unsigned long ea, unsigned long pa, int flags) +static int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep; - if (mem_init_done) { + if (slab_is_available()) { pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -81,6 +100,35 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); } else { +#ifdef CONFIG_PPC_MMU_NOHASH + /* Warning ! This will blow up if bootmem is not initialized + * which our ppc64 code is keen to do that, we'll need to + * fix it and/or be more careful + */ + pgdp = pgd_offset_k(ea); +#ifdef PUD_TABLE_SIZE + if (pgd_none(*pgdp)) { + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); + BUG_ON(pudp == NULL); + pgd_populate(&init_mm, pgdp, pudp); + } +#endif /* PUD_TABLE_SIZE */ + pudp = pud_offset(pgdp, ea); + if (pud_none(*pudp)) { + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); + BUG_ON(pmdp == NULL); + pud_populate(&init_mm, pudp, pmdp); + } + pmdp = pmd_offset(pudp, ea); + if (!pmd_present(*pmdp)) { + ptep = early_alloc_pgtable(PAGE_SIZE); + BUG_ON(ptep == NULL); + pmd_populate_kernel(&init_mm, pmdp, ptep); + } + ptep = pte_offset_kernel(pmdp, ea); + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, + __pgprot(flags))); +#else /* CONFIG_PPC_MMU_NOHASH */ /* * If the mm subsystem is not fully up, we cannot create a * linux page table entry for this mapping. Simply bolt an @@ -93,6 +141,7 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) "memory at %016lx !\n", pa); return -ENOMEM; } +#endif /* !CONFIG_PPC_MMU_NOHASH */ } return 0; } @@ -124,7 +173,7 @@ void __iomem * __ioremap_at(phys_addr_t pa, void *ea, unsigned long size, WARN_ON(size & ~PAGE_MASK); for (i = 0; i < size; i += PAGE_SIZE) - if (map_io_page((unsigned long)ea+i, pa+i, flags)) + if (map_kernel_page((unsigned long)ea+i, pa+i, flags)) return NULL; return (void __iomem *)ea; From 44c58ccc8dc25f78a4f641901f17092c93dd0458 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:20 +0000 Subject: [PATCH 0136/1662] powerpc: Modify some ppc_asm.h macros to accomodate 64-bits Book3E The way I intend to use tophys/tovirt on 64-bit BookE is different from the "trick" that we currently play for 32-bit BookE so change the condition of definition of these macros to make it so. Also, make sure we only use rfid and mtmsrd instead of rfi and mtmsr for 64-bit server processors, not all 64-bit processors. Signed-off-by: Benjamin Herrenschmidt Acked-by: Kumar Gala --- arch/powerpc/include/asm/ppc_asm.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index f9729529c20d..dfae6e916dfb 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -375,8 +375,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #define PPC440EP_ERR42 #endif - -#if defined(CONFIG_BOOKE) +/* + * toreal/fromreal/tophys/tovirt macros. 32-bit BookE makes them + * keep the address intact to be compatible with code shared with + * 32-bit classic. + * + * On the other hand, I find it useful to have them behave as expected + * by their name (ie always do the addition) on 64-bit BookE + */ +#if defined(CONFIG_BOOKE) && !defined(CONFIG_PPC64) #define toreal(rd) #define fromreal(rd) @@ -426,10 +433,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .previous #endif -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 #define RFI rfid #define MTMSRD(r) mtmsrd r - #else #define FIX_SRR1(ra, rb) #ifndef CONFIG_40x From d4e167da4cb60910f6ac305aee03714937f70b71 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:24 +0000 Subject: [PATCH 0137/1662] powerpc/mm: Make low level TLB flush ops on BookE take additional args We need to pass down whether the page is direct or indirect and we'll need to pass the page size to _tlbil_va and _tlbivax_bcast We also add a new low level _tlbil_pid_noind() which does a TLB flush by PID but avoids flushing indirect entries if possible This implements those new prototypes but defines them with inlines or macros so that no additional arguments are actually passed on current processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/tlbflush.h | 11 ++++++-- arch/powerpc/mm/mmu_decl.h | 16 ++++++++--- arch/powerpc/mm/tlb_nohash.c | 42 +++++++++++++++++++++-------- arch/powerpc/mm/tlb_nohash_low.S | 6 ++--- 4 files changed, 56 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index abbe3419d1dd..d50a380b2b6f 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -6,7 +6,7 @@ * * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page - * - local_flush_tlb_mm(mm) flushes the specified mm context on + * - local_flush_tlb_mm(mm, full) flushes the specified mm context on * the local processor * - local_flush_tlb_page(vma, vmaddr) flushes one page on the local processor * - flush_tlb_page_nohash(vma, vmaddr) flushes one page if SW loaded TLB @@ -29,7 +29,8 @@ * specific tlbie's */ -#include +struct vm_area_struct; +struct mm_struct; #define MMU_NO_CONTEXT ((unsigned int)-1) @@ -40,12 +41,18 @@ extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void local_flush_tlb_mm(struct mm_struct *mm); extern void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind); + #ifdef CONFIG_SMP extern void flush_tlb_mm(struct mm_struct *mm); extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); +extern void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind); #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma,addr) local_flush_tlb_page(vma,addr) +#define __flush_tlb_page(mm,addr,p,i) __local_flush_tlb_page(mm,addr,p,i) #endif #define flush_tlb_page_nohash(vma,addr) flush_tlb_page(vma,addr) diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index d1f9c62dc177..3871dceee2dd 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -36,21 +36,30 @@ static inline void _tlbil_pid(unsigned int pid) { asm volatile ("sync; tlbia; isync" : : : "memory"); } +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) + #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#define _tlbil_pid_noind(pid) _tlbil_pid(pid) #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* * On 8xx, we directly inline tlbie, on others, it's extern */ #ifdef CONFIG_8xx -static inline void _tlbil_va(unsigned long address, unsigned int pid) +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } #else /* CONFIG_8xx */ -extern void _tlbil_va(unsigned long address, unsigned int pid); +extern void __tlbil_va(unsigned long address, unsigned int pid); +static inline void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) +{ + __tlbil_va(address, pid); +} #endif /* CONIFG_8xx */ /* @@ -58,7 +67,8 @@ extern void _tlbil_va(unsigned long address, unsigned int pid); * implementation. When that becomes the case, this will be * an extern. */ -static inline void _tlbivax_bcast(unsigned long address, unsigned int pid) +static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind) { BUG(); } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d908e75cc3b5..761e8882416f 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -67,18 +67,24 @@ void local_flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(local_flush_tlb_mm); -void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (pid != MMU_NO_CONTEXT) - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); preempt_enable(); } -EXPORT_SYMBOL(local_flush_tlb_page); +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} +EXPORT_SYMBOL(local_flush_tlb_page); /* * And here are the SMP non-local implementations @@ -96,6 +102,8 @@ static int mm_is_core_local(struct mm_struct *mm) struct tlb_flush_param { unsigned long addr; unsigned int pid; + unsigned int tsize; + unsigned int ind; }; static void do_flush_tlb_mm_ipi(void *param) @@ -109,7 +117,7 @@ static void do_flush_tlb_page_ipi(void *param) { struct tlb_flush_param *p = param; - _tlbil_va(p->addr, p->pid); + _tlbil_va(p->addr, p->pid, p->tsize, p->ind); } @@ -149,37 +157,49 @@ void flush_tlb_mm(struct mm_struct *mm) } EXPORT_SYMBOL(flush_tlb_mm); -void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, + int tsize, int ind) { struct cpumask *cpu_mask; unsigned int pid; preempt_disable(); - pid = vma ? vma->vm_mm->context.id : 0; + pid = mm ? mm->context.id : 0; if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; - cpu_mask = mm_cpumask(vma->vm_mm); + cpu_mask = mm_cpumask(mm); if (!mm_is_core_local(mm)) { /* If broadcast tlbivax is supported, use it */ if (mmu_has_feature(MMU_FTR_USE_TLBIVAX_BCAST)) { int lock = mmu_has_feature(MMU_FTR_LOCK_BCAST_INVAL); if (lock) spin_lock(&tlbivax_lock); - _tlbivax_bcast(vmaddr, pid); + _tlbivax_bcast(vmaddr, pid, tsize, ind); if (lock) spin_unlock(&tlbivax_lock); goto bail; } else { - struct tlb_flush_param p = { .pid = pid, .addr = vmaddr }; + struct tlb_flush_param p = { + .pid = pid, + .addr = vmaddr, + .tsize = tsize, + .ind = ind, + }; /* Ignores smp_processor_id() even if set in cpu_mask */ smp_call_function_many(cpu_mask, do_flush_tlb_page_ipi, &p, 1); } } - _tlbil_va(vmaddr, pid); + _tlbil_va(vmaddr, pid, tsize, ind); bail: preempt_enable(); } + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) +{ + __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, + 0 /* tsize unused for now */, 0); +} EXPORT_SYMBOL(flush_tlb_page); #endif /* CONFIG_SMP */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 3037911279b1..c7d89a0adba2 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -39,7 +39,7 @@ /* * 40x implementation needs only tlbil_va */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) /* We run the search with interrupts disabled because we have to change * the PID and I don't want to preempt when that happens. */ @@ -71,7 +71,7 @@ _GLOBAL(_tlbil_va) * 440 implementation uses tlbsx/we for tlbil_va and a full sweep * of the TLB for everything else. */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfspr r5,SPRN_MMUCR rlwimi r5,r4,0,24,31 /* Set TID */ @@ -170,7 +170,7 @@ ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBILX) * Flush MMU TLB for a particular address, but only on the local processor * (no broadcast) */ -_GLOBAL(_tlbil_va) +_GLOBAL(__tlbil_va) mfmsr r10 wrteei 0 slwi r4,r4,16 From 6f0ef0f505af1ce6e9756087a9d4cc3778bae8c6 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:26 +0000 Subject: [PATCH 0138/1662] powerpc/mm: Call mmu_context_init() from ppc64 Our 64-bit hash context handling has no init function, but 64-bit Book3E will use the common mmu_context_nohash.c code which does, so define an empty inline mmu_context_init() for 64-bit server and call it from our 64-bit setup_arch() Signed-off-by: Benjamin Herrenschmidt Acked-by: Kumar Gala --- arch/powerpc/include/asm/mmu_context.h | 7 ++++++- arch/powerpc/kernel/setup_64.c | 4 ++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b7063669f972..8dffed317013 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -14,7 +14,6 @@ /* * Most if the context management is out of line */ -extern void mmu_context_init(void); extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm); extern void destroy_context(struct mm_struct *mm); @@ -23,6 +22,12 @@ extern void switch_stab(struct task_struct *tsk, struct mm_struct *mm); extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern void set_context(unsigned long id, pgd_t *pgd); +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void mmu_context_init(void) { } +#else +extern void mmu_context_init(void); +#endif + /* * switch_mm is the entry point called from the architecture independent * code in kernel/sched.c diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 91b89b8d63d8..325dc5b2e626 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -534,6 +534,10 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); + + /* Initialize the MMU context management stuff */ + mmu_context_init(); + ppc64_boot_msg(0x15, "Setup Done"); } From 747bea91b764aefd59091ebff80f182282f1d23c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:27 +0000 Subject: [PATCH 0139/1662] powerpc: Clean ifdef usage in copy_thread() Currently, a single ifdef covers SLB related bits and more generic ppc64 related bits, split this in two separate ifdef's since 64-bit BookE will need one but not the other. Signed-off-by: Benjamin Herrenschmidt Acked-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/process.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 892a9f2e6d76..678ff132e8b0 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -664,6 +664,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, sp_vsid |= SLB_VSID_KERNEL | llp; p->thread.ksp_vsid = sp_vsid; } +#endif /* CONFIG_PPC_STD_MMU_64 */ /* * The PPC64 ABI makes use of a TOC to contain function @@ -671,6 +672,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, * to the TOC entry. The first entry is a pointer to the actual * function. */ +#ifdef CONFIG_PPC64 kregs->nip = *((unsigned long *)ret_from_fork); #else kregs->nip = (unsigned long)ret_from_fork; From cf54dc7cd4f9aab55cd3e1794b0b74c3c88cd1a0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:28 +0000 Subject: [PATCH 0140/1662] powerpc: Move definitions of secondary CPU spinloop to header file Those definitions are currently declared extern in the .c file where they are used, move them to a header file instead. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/smp.h | 9 +++++++++ arch/powerpc/kernel/prom_init.c | 4 ---- arch/powerpc/kernel/setup_64.c | 3 --- arch/powerpc/platforms/85xx/smp.c | 1 - arch/powerpc/platforms/86xx/mpc86xx_smp.c | 1 - arch/powerpc/platforms/cell/smp.c | 2 -- arch/powerpc/platforms/pseries/smp.c | 2 -- 7 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index c25f73d1d842..e782f43ee669 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -148,6 +148,15 @@ extern struct smp_ops_t *smp_ops; extern void arch_send_call_function_single_ipi(int cpu); extern void arch_send_call_function_ipi(cpumask_t mask); +/* Definitions relative to the secondary CPU spin loop + * and entry point. Not all of them exist on both 32 and + * 64-bit but defining them all here doesn't harm + */ +extern void generic_secondary_smp_init(void); +extern unsigned long __secondary_hold_spinloop; +extern unsigned long __secondary_hold_acknowledge; +extern char __secondary_hold; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a538824616fd..d942404779c1 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1259,10 +1259,6 @@ static void __init prom_initialize_tce_table(void) * * -- Cort */ -extern char __secondary_hold; -extern unsigned long __secondary_hold_spinloop; -extern unsigned long __secondary_hold_acknowledge; - /* * We want to reference the copy of __secondary_hold_* in the * 0 - 0x100 address range diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 325dc5b2e626..a6b6c4c9ae41 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -230,9 +230,6 @@ void early_setup_secondary(void) #endif /* CONFIG_SMP */ #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC) -extern unsigned long __secondary_hold_spinloop; -extern void generic_secondary_smp_init(void); - void smp_release_cpus(void) { unsigned long *ptr; diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 62c592ede641..9f526ba31c1e 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -25,7 +25,6 @@ #include -extern volatile unsigned long __secondary_hold_acknowledge; extern void __early_start(void); #define BOOT_ENTRY_ADDR_UPPER 0 diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c index d84bbb508ee7..eacea0e3fcc8 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c @@ -27,7 +27,6 @@ #include "mpc86xx.h" extern void __secondary_start_mpc86xx(void); -extern unsigned long __secondary_hold_acknowledge; #define MCM_PORT_CONFIG_OFFSET 0x10 diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index bc97fada48c6..f774530075b7 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -58,8 +58,6 @@ */ static cpumask_t of_spin_map; -extern void generic_secondary_smp_init(unsigned long); - /** * smp_startup_cpu() - start the given cpu * diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 1f8f6cfb94f7..440000cc7130 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -56,8 +56,6 @@ */ static cpumask_t of_spin_map; -extern void generic_secondary_smp_init(unsigned long); - /** * smp_startup_cpu() - start the given cpu * From c7cc58a1ad8dfe3c199d3b6ce50412b86dd3edaf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:28 +0000 Subject: [PATCH 0141/1662] powerpc/mm: Rework & cleanup page table freeing code path That patch used to just add a hook to page table flushing but pulling that string brought out a whole bunch of issues, so it now does that and more: - We now make the RCU batching of page freeing SMP only, as I believe it was intended initially. We make a few more things compile to nothing on !CONFIG_SMP - Some macros are turned into functions, though that forced me to out of line a few stuffs due to unsolvable include depenencies, however it's probably better that way anyway, it's not -that- critical code path. - 32-bit didn't call pte_free_finish() on tlb_flush() which means that it wouldn't push out the batch to RCU for delayed freeing when a bunch of page tables have been freed, they would just stay in there until the batch gets full. 64-bit BookE will use that hook to maintain the virtually linear page tables or the indirect entries in the TLB when using the HW loader. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgalloc.h | 39 +++++++++++++++++++++--------- arch/powerpc/include/asm/tlb.h | 38 +++-------------------------- arch/powerpc/mm/pgtable.c | 10 ++++++++ arch/powerpc/mm/tlb_hash32.c | 3 +++ arch/powerpc/mm/tlb_hash64.c | 15 ++++++++++++ arch/powerpc/mm/tlb_nohash.c | 8 ++++++ 6 files changed, 67 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 1730e5e298d6..34b080671f00 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -4,6 +4,15 @@ #include +#ifdef CONFIG_PPC_BOOK3E +extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); +#else /* CONFIG_PPC_BOOK3E */ +static inline void tlb_flush_pgtable(struct mmu_gather *tlb, + unsigned long address) +{ +} +#endif /* !CONFIG_PPC_BOOK3E */ + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { free_page((unsigned long)pte); @@ -35,19 +44,27 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, #include #endif -extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); - #ifdef CONFIG_SMP -#define __pte_free_tlb(tlb,ptepage,address) \ -do { \ - pgtable_page_dtor(ptepage); \ - pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ - PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ -} while (0) -#else -#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, (pte)) -#endif +extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); +extern void pte_free_finish(void); +#else /* CONFIG_SMP */ +static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +{ + pgtable_free(pgf); +} +static inline void pte_free_finish(void) { } +#endif /* !CONFIG_SMP */ +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, + unsigned long address) +{ + pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage), + PTE_NONCACHE_NUM, + PTE_TABLE_SIZE-1); + tlb_flush_pgtable(tlb, address); + pgtable_page_dtor(ptepage); + pgtable_free_tlb(tlb, pgf); +} #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_PGALLOC_H */ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e20ff7541f36..e2b428b0f7ba 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -25,57 +25,25 @@ #include -struct mmu_gather; - #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) -#if !defined(CONFIG_PPC_STD_MMU) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -#elif defined(__powerpc64__) - -extern void pte_free_finish(void); - -static inline void tlb_flush(struct mmu_gather *tlb) -{ - struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); - - /* If there's a TLB batch pending, then we must flush it because the - * pages are going to be freed and we really don't want to have a CPU - * access a freed page because it has a stale TLB - */ - if (tlbbatch->index) - __flush_tlb_pending(tlbbatch); - - pte_free_finish(); -} - -#else - extern void tlb_flush(struct mmu_gather *tlb); -#endif - /* Get the generic bits... */ #include -#if !defined(CONFIG_PPC_STD_MMU) || defined(__powerpc64__) - -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) - -#else extern void flush_hash_entry(struct mm_struct *mm, pte_t *ptep, unsigned long address); static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) + unsigned long address) { +#ifdef CONFIG_PPC_STD_MMU_32 if (pte_val(*ptep) & _PAGE_HASHPTE) flush_hash_entry(tlb->mm, ptep, address); +#endif } -#endif #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_TLB_H */ diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 627767d6169b..a65979a5f75b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,14 @@ #include #include +#ifdef CONFIG_SMP + +/* + * Handle batching of page table freeing on SMP. Page tables are + * queued up and send to be freed later by RCU in order to avoid + * freeing a page table page that is being walked without locks + */ + static DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); static unsigned long pte_freelist_forced_free; @@ -116,6 +124,8 @@ void pte_free_finish(void) *batchp = NULL; } +#endif /* CONFIG_SMP */ + /* * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() */ diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c index 65190587a365..8aaa8b7eb324 100644 --- a/arch/powerpc/mm/tlb_hash32.c +++ b/arch/powerpc/mm/tlb_hash32.c @@ -71,6 +71,9 @@ void tlb_flush(struct mmu_gather *tlb) */ _tlbia(); } + + /* Push out batch of freed page tables */ + pte_free_finish(); } /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 937eb90677d9..8e35a6066938 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -154,6 +154,21 @@ void __flush_tlb_pending(struct ppc64_tlb_batch *batch) batch->index = 0; } +void tlb_flush(struct mmu_gather *tlb) +{ + struct ppc64_tlb_batch *tlbbatch = &__get_cpu_var(ppc64_tlb_batch); + + /* If there's a TLB batch pending, then we must flush it because the + * pages are going to be freed and we really don't want to have a CPU + * access a freed page because it has a stale TLB + */ + if (tlbbatch->index) + __flush_tlb_pending(tlbbatch); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} + /** * __flush_hash_table_range - Flush all HPTEs for a given address range * from the hash table (and the TLB). But keeps diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 761e8882416f..6b43fc49f103 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -233,3 +233,11 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_mm(vma->vm_mm); } EXPORT_SYMBOL(flush_tlb_range); + +void tlb_flush(struct mmu_gather *tlb) +{ + flush_tlb_mm(tlb->mm); + + /* Push out batch of freed page tables */ + pte_free_finish(); +} From 0257c99cdfaca53a881339e1cbca638c61569b05 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:34 +0000 Subject: [PATCH 0142/1662] powerpc: Add SPR definitions for new 64-bit BookE This adds various SPRs defined on 64-bit BookE, along with changes to the definition of the base MSR values to add the values needed for 64-bit Book3E. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg.h | 10 ++----- arch/powerpc/include/asm/reg_booke.h | 42 ++++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 2cedbb427618..c8715331e1b0 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -98,19 +98,15 @@ #define MSR_RI __MASK(MSR_RI_LG) /* Recoverable Exception */ #define MSR_LE __MASK(MSR_LE_LG) /* Little Endian */ -#ifdef CONFIG_PPC64 +#if defined(CONFIG_PPC_BOOK3S_64) +/* Server variant */ #define MSR_ MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_ISF |MSR_HV #define MSR_KERNEL MSR_ | MSR_SF - #define MSR_USER32 MSR_ | MSR_PR | MSR_EE #define MSR_USER64 MSR_USER32 | MSR_SF - -#else /* 32-bit */ +#elif defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_8xx) /* Default MSR for kernel mode. */ -#ifndef MSR_KERNEL /* reg_booke.h also defines this */ #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_IR|MSR_DR) -#endif - #define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) #endif diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 6bcf364cbb2f..2c9c706e6448 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -18,18 +18,26 @@ #define MSR_IS MSR_IR /* Instruction Space */ #define MSR_DS MSR_DR /* Data Space */ #define MSR_PMM (1<<2) /* Performance monitor mark bit */ +#define MSR_CM (1<<31) /* Computation Mode (0=32-bit, 1=64-bit) */ -/* Default MSR for kernel mode. */ -#if defined (CONFIG_40x) +#if defined(CONFIG_PPC_BOOK3E_64) +#define MSR_ MSR_ME | MSR_CE +#define MSR_KERNEL MSR_ | MSR_CM +#define MSR_USER32 MSR_ | MSR_PR | MSR_EE +#define MSR_USER64 MSR_USER32 | MSR_CM +#elif defined (CONFIG_40x) #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_IR|MSR_DR|MSR_CE) -#elif defined(CONFIG_BOOKE) +#define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) +#else #define MSR_KERNEL (MSR_ME|MSR_RI|MSR_CE) +#define MSR_USER (MSR_KERNEL|MSR_PR|MSR_EE) #endif /* Special Purpose Registers (SPRNs)*/ #define SPRN_DECAR 0x036 /* Decrementer Auto Reload Register */ #define SPRN_IVPR 0x03F /* Interrupt Vector Prefix Register */ #define SPRN_USPRG0 0x100 /* User Special Purpose Register General 0 */ +#define SPRN_SPRG3R 0x103 /* Special Purpose Register General 3 Read */ #define SPRN_SPRG4R 0x104 /* Special Purpose Register General 4 Read */ #define SPRN_SPRG5R 0x105 /* Special Purpose Register General 5 Read */ #define SPRN_SPRG6R 0x106 /* Special Purpose Register General 6 Read */ @@ -38,11 +46,18 @@ #define SPRN_SPRG5W 0x115 /* Special Purpose Register General 5 Write */ #define SPRN_SPRG6W 0x116 /* Special Purpose Register General 6 Write */ #define SPRN_SPRG7W 0x117 /* Special Purpose Register General 7 Write */ +#define SPRN_EPCR 0x133 /* Embedded Processor Control Register */ #define SPRN_DBCR2 0x136 /* Debug Control Register 2 */ #define SPRN_IAC3 0x13A /* Instruction Address Compare 3 */ #define SPRN_IAC4 0x13B /* Instruction Address Compare 4 */ #define SPRN_DVC1 0x13E /* Data Value Compare Register 1 */ #define SPRN_DVC2 0x13F /* Data Value Compare Register 2 */ +#define SPRN_MAS8 0x155 /* MMU Assist Register 8 */ +#define SPRN_TLB0PS 0x158 /* TLB 0 Page Size Register */ +#define SPRN_MAS5_MAS6 0x15c /* MMU Assist Register 5 || 6 */ +#define SPRN_MAS8_MAS1 0x15d /* MMU Assist Register 8 || 1 */ +#define SPRN_MAS7_MAS3 0x174 /* MMU Assist Register 7 || 3 */ +#define SPRN_MAS0_MAS1 0x175 /* MMU Assist Register 0 || 1 */ #define SPRN_IVOR0 0x190 /* Interrupt Vector Offset Register 0 */ #define SPRN_IVOR1 0x191 /* Interrupt Vector Offset Register 1 */ #define SPRN_IVOR2 0x192 /* Interrupt Vector Offset Register 2 */ @@ -425,6 +440,27 @@ #define SGR_NORMAL 0 /* Speculative fetching allowed. */ #define SGR_GUARDED 1 /* Speculative fetching disallowed. */ +/* Bit definitions for EPCR */ +#define SPRN_EPCR_EXTGS 0x80000000 /* External Input interrupt + * directed to Guest state */ +#define SPRN_EPCR_DTLBGS 0x40000000 /* Data TLB Error interrupt + * directed to guest state */ +#define SPRN_EPCR_ITLBGS 0x20000000 /* Instr. TLB error interrupt + * directed to guest state */ +#define SPRN_EPCR_DSIGS 0x10000000 /* Data Storage interrupt + * directed to guest state */ +#define SPRN_EPCR_ISIGS 0x08000000 /* Instr. Storage interrupt + * directed to guest state */ +#define SPRN_EPCR_DUVD 0x04000000 /* Disable Hypervisor Debug */ +#define SPRN_EPCR_ICM 0x02000000 /* Interrupt computation mode + * (copied to MSR:CM on intr) */ +#define SPRN_EPCR_GICM 0x01000000 /* Guest Interrupt Comp. mode */ +#define SPRN_EPCR_DGTMI 0x00800000 /* Disable TLB Guest Management + * instructions */ +#define SPRN_EPCR_DMIUH 0x00400000 /* Disable MAS Interrupt updates + * for hypervisor */ + + /* * The IBM-403 is an even more odd special case, as it is much * older than the IBM-405 series. We put these down here incase someone From 57e2a99f74b0d3720c97a6aadb57ae6aad3c61ea Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 28 Jul 2009 11:59:34 +1000 Subject: [PATCH 0143/1662] powerpc: Add memory management headers for new 64-bit BookE This adds the PTE and pgtable format definitions, along with changes to the kernel memory map and other definitions related to implementing support for 64-bit Book3E. This also shields some asm-offset bits that are currently only relevant on 32-bit We also move the definition of the "linux" page size constants to the common mmu.h file and add a few sizes that are relevant to embedded processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 27 +++++++++ arch/powerpc/include/asm/mmu-hash64.h | 20 ------- arch/powerpc/include/asm/mmu.h | 37 +++++++++++++ arch/powerpc/include/asm/page.h | 4 ++ arch/powerpc/include/asm/page_64.h | 10 ++++ arch/powerpc/include/asm/pgtable-ppc64.h | 61 +++++++++++++++------ arch/powerpc/include/asm/pte-book3e.h | 70 ++++++++++++++++++++++++ arch/powerpc/include/asm/pte-common.h | 3 + arch/powerpc/kernel/asm-offsets.c | 5 +- arch/powerpc/mm/hugetlbpage.c | 8 ++- 10 files changed, 205 insertions(+), 40 deletions(-) create mode 100644 arch/powerpc/include/asm/pte-book3e.h diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 42a39b4aacec..6ddbe48d07fa 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -170,6 +170,33 @@ typedef struct { unsigned int active; unsigned long vdso_base; } mm_context_t; + +/* Page size definitions, common between 32 and 64-bit + * + * shift : is the "PAGE_SHIFT" value for that page size + * penc : is the pte encoding mask + * + */ +struct mmu_psize_def +{ + unsigned int shift; /* number of bits */ + unsigned int enc; /* PTE encoding */ +}; +extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; + +/* The page sizes use the same names as 64-bit hash but are + * constants + */ +#if defined(CONFIG_PPC_4K_PAGES) +#define mmu_virtual_psize MMU_PAGE_4K +#elif defined(CONFIG_PPC_64K_PAGES) +#define mmu_virtual_psize MMU_PAGE_64K +#else +#error Unsupported page size +#endif + +extern int mmu_linear_psize; + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_MMU_BOOK3E_H_ */ diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 98c104a09961..b537903b9fca 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -138,26 +138,6 @@ struct mmu_psize_def #endif /* __ASSEMBLY__ */ -/* - * The kernel use the constants below to index in the page sizes array. - * The use of fixed constants for this purpose is better for performances - * of the low level hash refill handlers. - * - * A non supported page size has a "shift" field set to 0 - * - * Any new page size being implemented can get a new entry in here. Whether - * the kernel will use it or not is a different matter though. The actual page - * size used by hugetlbfs is not defined here and may be made variable - */ - -#define MMU_PAGE_4K 0 /* 4K */ -#define MMU_PAGE_64K 1 /* 64K */ -#define MMU_PAGE_64K_AP 2 /* 64K Admixed (in a 4K segment) */ -#define MMU_PAGE_1M 3 /* 1M */ -#define MMU_PAGE_16M 4 /* 16M */ -#define MMU_PAGE_16G 5 /* 16G */ -#define MMU_PAGE_COUNT 6 - /* * Segment sizes. * These are the values used by hardware in the B field of diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index fb57ded592f9..2fcfefc60894 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -17,6 +17,7 @@ #define MMU_FTR_TYPE_40x ASM_CONST(0x00000004) #define MMU_FTR_TYPE_44x ASM_CONST(0x00000008) #define MMU_FTR_TYPE_FSL_E ASM_CONST(0x00000010) +#define MMU_FTR_TYPE_3E ASM_CONST(0x00000020) /* * This is individual features @@ -73,6 +74,41 @@ extern void early_init_mmu_secondary(void); #endif /* !__ASSEMBLY__ */ +/* The kernel use the constants below to index in the page sizes array. + * The use of fixed constants for this purpose is better for performances + * of the low level hash refill handlers. + * + * A non supported page size has a "shift" field set to 0 + * + * Any new page size being implemented can get a new entry in here. Whether + * the kernel will use it or not is a different matter though. The actual page + * size used by hugetlbfs is not defined here and may be made variable + * + * Note: This array ended up being a false good idea as it's growing to the + * point where I wonder if we should replace it with something different, + * to think about, feedback welcome. --BenH. + */ + +/* There are #define as they have to be used in assembly + * + * WARNING: If you change this list, make sure to update the array of + * names currently in arch/powerpc/mm/hugetlbpage.c or bad things will + * happen + */ +#define MMU_PAGE_4K 0 +#define MMU_PAGE_16K 1 +#define MMU_PAGE_64K 2 +#define MMU_PAGE_64K_AP 3 /* "Admixed pages" (hash64 only) */ +#define MMU_PAGE_256K 4 +#define MMU_PAGE_1M 5 +#define MMU_PAGE_8M 6 +#define MMU_PAGE_16M 7 +#define MMU_PAGE_256M 8 +#define MMU_PAGE_1G 9 +#define MMU_PAGE_16G 10 +#define MMU_PAGE_64G 11 +#define MMU_PAGE_COUNT 12 + #if defined(CONFIG_PPC_STD_MMU_64) /* 64-bit classic hash table MMU */ @@ -94,5 +130,6 @@ extern void early_init_mmu_secondary(void); # include #endif + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MMU_H_ */ diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 4940662ee87e..ff24254990e1 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -139,7 +139,11 @@ extern phys_addr_t kernstart_addr; * Don't compare things with KERNELBASE or PAGE_OFFSET to test for * "kernelness", use is_kernel_addr() - it should do what you want. */ +#ifdef CONFIG_PPC_BOOK3E_64 +#define is_kernel_addr(x) ((x) >= 0x8000000000000000ul) +#else #define is_kernel_addr(x) ((x) >= PAGE_OFFSET) +#endif #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 5817a3b747e5..3f17b83f55a1 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -135,12 +135,22 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, #endif /* __ASSEMBLY__ */ #else #define slice_init() +#ifdef CONFIG_PPC_STD_MMU_64 #define get_slice_psize(mm, addr) ((mm)->context.user_psize) #define slice_set_user_psize(mm, psize) \ do { \ (mm)->context.user_psize = (psize); \ (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ } while (0) +#else /* CONFIG_PPC_STD_MMU_64 */ +#ifdef CONFIG_PPC_64K_PAGES +#define get_slice_psize(mm, addr) MMU_PAGE_64K +#else /* CONFIG_PPC_64K_PAGES */ +#define get_slice_psize(mm, addr) MMU_PAGE_4K +#endif /* !CONFIG_PPC_64K_PAGES */ +#define slice_set_user_psize(mm, psize) do { BUG(); } while(0) +#endif /* !CONFIG_PPC_STD_MMU_64 */ + #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) #define slice_mm_new_context(mm) 1 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 8cd083c61503..7254c5a3187c 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -5,11 +5,6 @@ * the ppc64 hashed page table. */ -#ifndef __ASSEMBLY__ -#include -#include -#endif /* __ASSEMBLY__ */ - #ifdef CONFIG_PPC_64K_PAGES #include #else @@ -38,26 +33,46 @@ #endif /* - * Define the address range of the vmalloc VM area. + * Define the address range of the kernel non-linear virtual area */ -#define VMALLOC_START ASM_CONST(0xD000000000000000) -#define VMALLOC_SIZE (PGTABLE_RANGE >> 1) -#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) + +#ifdef CONFIG_PPC_BOOK3E +#define KERN_VIRT_START ASM_CONST(0x8000000000000000) +#else +#define KERN_VIRT_START ASM_CONST(0xD000000000000000) +#endif +#define KERN_VIRT_SIZE PGTABLE_RANGE /* - * Define the address ranges for MMIO and IO space : + * The vmalloc space starts at the beginning of that region, and + * occupies half of it on hash CPUs and a quarter of it on Book3E + */ +#define VMALLOC_START KERN_VIRT_START +#ifdef CONFIG_PPC_BOOK3E +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 2) +#else +#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) +#endif +#define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) + +/* + * The second half of the kernel virtual space is used for IO mappings, + * it's itself carved into the PIO region (ISA and PHB IO space) and + * the ioremap space * - * ISA_IO_BASE = VMALLOC_END, 64K reserved area + * ISA_IO_BASE = KERN_IO_START, 64K reserved area * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE */ +#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) #define FULL_IO_SIZE 0x80000000ul -#define ISA_IO_BASE (VMALLOC_END) -#define ISA_IO_END (VMALLOC_END + 0x10000ul) +#define ISA_IO_BASE (KERN_IO_START) +#define ISA_IO_END (KERN_IO_START + 0x10000ul) #define PHB_IO_BASE (ISA_IO_END) -#define PHB_IO_END (VMALLOC_END + FULL_IO_SIZE) +#define PHB_IO_END (KERN_IO_START + FULL_IO_SIZE) #define IOREMAP_BASE (PHB_IO_END) -#define IOREMAP_END (VMALLOC_START + PGTABLE_RANGE) +#define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) + /* * Region IDs @@ -72,19 +87,28 @@ #define USER_REGION_ID (0UL) /* - * Defines the address of the vmemap area, in its own region + * Defines the address of the vmemap area, in its own region on + * hash table CPUs and after the vmalloc space on Book3E */ +#ifdef CONFIG_PPC_BOOK3E +#define VMEMMAP_BASE VMALLOC_END +#define VMEMMAP_END KERN_IO_START +#else #define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) +#endif #define vmemmap ((struct page *)VMEMMAP_BASE) /* * Include the PTE bits definitions */ +#ifdef CONFIG_PPC_BOOK3S #include +#else +#include +#endif #include - #ifdef CONFIG_PPC_MM_SLICES #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN @@ -92,6 +116,9 @@ #ifndef __ASSEMBLY__ +#include +#include + /* * This is the default implementation of various PTE accessors, it's * used in all cases except Book3S with 64K pages where we have a diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h new file mode 100644 index 000000000000..1d27c77d7704 --- /dev/null +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -0,0 +1,70 @@ +#ifndef _ASM_POWERPC_PTE_BOOK3E_H +#define _ASM_POWERPC_PTE_BOOK3E_H +#ifdef __KERNEL__ + +/* PTE bit definitions for processors compliant to the Book3E + * architecture 2.06 or later. The position of the PTE bits + * matches the HW definition of the optional Embedded Page Table + * category. + */ + +/* Architected bits */ +#define _PAGE_PRESENT 0x000001 /* software: pte contains a translation */ +#define _PAGE_FILE 0x000002 /* (!present only) software: pte holds file offset */ +#define _PAGE_SW1 0x000002 +#define _PAGE_BAP_SR 0x000004 +#define _PAGE_BAP_UR 0x000008 +#define _PAGE_BAP_SW 0x000010 +#define _PAGE_BAP_UW 0x000020 +#define _PAGE_BAP_SX 0x000040 +#define _PAGE_BAP_UX 0x000080 +#define _PAGE_PSIZE_MSK 0x000f00 +#define _PAGE_PSIZE_4K 0x000200 +#define _PAGE_PSIZE_64K 0x000600 +#define _PAGE_PSIZE_1M 0x000a00 +#define _PAGE_PSIZE_16M 0x000e00 +#define _PAGE_DIRTY 0x001000 /* C: page changed */ +#define _PAGE_SW0 0x002000 +#define _PAGE_U3 0x004000 +#define _PAGE_U2 0x008000 +#define _PAGE_U1 0x010000 +#define _PAGE_U0 0x020000 +#define _PAGE_ACCESSED 0x040000 +#define _PAGE_LENDIAN 0x080000 +#define _PAGE_GUARDED 0x100000 +#define _PAGE_COHERENT 0x200000 /* M: enforce memory coherence */ +#define _PAGE_NO_CACHE 0x400000 /* I: cache inhibit */ +#define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ + +/* "Higher level" linux bit combinations */ +#define _PAGE_EXEC _PAGE_BAP_SX /* Can be executed from potentially */ +#define _PAGE_HWEXEC _PAGE_BAP_UX /* .. and was cache cleaned */ +#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) +#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ + +#define _PAGE_HASHPTE 0 +#define _PAGE_BUSY 0 + +#define _PAGE_SPECIAL _PAGE_SW0 + +/* Flags to be preserved on PTE modifications */ +#define _PAGE_HPTEFLAGS _PAGE_BUSY + +/* Base page size */ +#ifdef CONFIG_PPC_64K_PAGES +#define _PAGE_PSIZE _PAGE_PSIZE_64K +#define PTE_RPN_SHIFT (28) +#else +#define _PAGE_PSIZE _PAGE_PSIZE_4K +#define PTE_RPN_SHIFT (24) +#endif + +/* On 32-bit, we never clear the top part of the PTE */ +#ifdef CONFIG_PPC32 +#define _PTE_NONE_MASK 0xffffffff00000000ULL +#endif + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_PTE_FSL_BOOKE_H */ diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index a7e210b6b48c..8bb6464ba619 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -34,6 +34,9 @@ #ifndef _PAGE_4K_PFN #define _PAGE_4K_PFN 0 #endif +#ifndef _PAGE_SAO +#define _PAGE_SAO 0 +#endif #ifndef _PAGE_PSIZE #define _PAGE_PSIZE 0 #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 561b64652311..0a9f30b54952 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -52,9 +52,11 @@ #include #endif +#ifdef CONFIG_PPC32 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) #include "head_booke.h" #endif +#endif #if defined(CONFIG_FSL_BOOKE) #include "../mm/mmu_decl.h" @@ -260,6 +262,7 @@ int main(void) DEFINE(_SRR1, STACK_FRAME_OVERHEAD+sizeof(struct pt_regs)+8); #endif /* CONFIG_PPC64 */ +#if defined(CONFIG_PPC32) #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) DEFINE(EXC_LVL_SIZE, STACK_EXC_LVL_FRAME_SIZE); DEFINE(MAS0, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, mas0)); @@ -278,7 +281,7 @@ int main(void) DEFINE(_DSRR1, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, dsrr1)); DEFINE(SAVED_KSP_LIMIT, STACK_INT_FRAME_SIZE+offsetof(struct exception_regs, saved_ksp_limit)); #endif - +#endif DEFINE(CLONE_VM, CLONE_VM); DEFINE(CLONE_UNTRACED, CLONE_UNTRACED); diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index c46ef2ffa3d9..90df6ffe3a43 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,8 +57,10 @@ unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ #define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - "unused_4K", "hugepte_cache_64K", "unused_64K_AP", - "hugepte_cache_1M", "hugepte_cache_16M", "hugepte_cache_16G" + [MMU_PAGE_64K] = "hugepte_cache_64K", + [MMU_PAGE_1M] = "hugepte_cache_1M", + [MMU_PAGE_16M] = "hugepte_cache_16M", + [MMU_PAGE_16G] = "hugepte_cache_16G", }; /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() @@ -700,6 +702,8 @@ static void __init set_huge_psize(int psize) if (mmu_huge_psizes[psize] || mmu_psize_defs[psize].shift == PAGE_SHIFT) return; + if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) + return; hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); switch (mmu_psize_defs[psize].shift) { From 13363ab9b9d040ebeace3a1a3a5ddcb13bf0d644 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:39 +0000 Subject: [PATCH 0144/1662] powerpc: Add definitions used by exception handling on 64-bit Book3E This adds various definitions and macros used by the exception and TLB miss handling on 64-bit BookE It also adds the definitions of the SPRGs used for various exception types Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64e.h | 201 +++++++++++++++++++++++ arch/powerpc/include/asm/reg.h | 19 +++ 2 files changed, 220 insertions(+) create mode 100644 arch/powerpc/include/asm/exception-64e.h diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h new file mode 100644 index 000000000000..94cb3d79d125 --- /dev/null +++ b/arch/powerpc/include/asm/exception-64e.h @@ -0,0 +1,201 @@ +/* + * Definitions for use by exception code on Book3-E + * + * Copyright (C) 2008 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_EXCEPTION_64E_H +#define _ASM_POWERPC_EXCEPTION_64E_H + +/* + * SPRGs usage an other considerations... + * + * Since TLB miss and other standard exceptions can be interrupted by + * critical exceptions which can themselves be interrupted by machine + * checks, and since the two later can themselves cause a TLB miss when + * hitting the linear mapping for the kernel stacks, we need to be a bit + * creative on how we use SPRGs. + * + * The base idea is that we have one SRPG reserved for critical and one + * for machine check interrupts. Those are used to save a GPR that can + * then be used to get the PACA, and store as much context as we need + * to save in there. That includes saving the SPRGs used by the TLB miss + * handler for linear mapping misses and the associated SRR0/1 due to + * the above re-entrancy issue. + * + * So here's the current usage pattern. It's done regardless of which + * SPRGs are user-readable though, thus we might have to change some of + * this later. In order to do that more easily, we use special constants + * for naming them + * + * WARNING: Some of these SPRGs are user readable. We need to do something + * about it as some point by making sure they can't be used to leak kernel + * critical data + */ + + +/* We are out of SPRGs so we save some things in the PACA. The normal + * exception frame is smaller than the CRIT or MC one though + */ +#define EX_R1 (0 * 8) +#define EX_CR (1 * 8) +#define EX_R10 (2 * 8) +#define EX_R11 (3 * 8) +#define EX_R14 (4 * 8) +#define EX_R15 (5 * 8) + +/* The TLB miss exception uses different slots */ + +#define EX_TLB_R10 ( 0 * 8) +#define EX_TLB_R11 ( 1 * 8) +#define EX_TLB_R12 ( 2 * 8) +#define EX_TLB_R13 ( 3 * 8) +#define EX_TLB_R14 ( 4 * 8) +#define EX_TLB_R15 ( 5 * 8) +#define EX_TLB_R16 ( 6 * 8) +#define EX_TLB_CR ( 7 * 8) +#define EX_TLB_DEAR ( 8 * 8) /* Level 0 and 2 only */ +#define EX_TLB_ESR ( 9 * 8) /* Level 0 and 2 only */ +#define EX_TLB_SRR0 (10 * 8) +#define EX_TLB_SRR1 (11 * 8) +#define EX_TLB_MMUCR0 (12 * 8) /* Level 0 */ +#define EX_TLB_MAS1 (12 * 8) /* Level 0 */ +#define EX_TLB_MAS2 (13 * 8) /* Level 0 */ +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +#define EX_TLB_R8 (14 * 8) +#define EX_TLB_R9 (15 * 8) +#define EX_TLB_LR (16 * 8) +#define EX_TLB_SIZE (17 * 8) +#else +#define EX_TLB_SIZE (14 * 8) +#endif + +#define START_EXCEPTION(label) \ + .globl exc_##label##_book3e; \ +exc_##label##_book3e: + +/* TLB miss exception prolog + * + * This prolog handles re-entrancy (up to 3 levels supported in the PACA + * though we currently don't test for overflow). It provides you with a + * re-entrancy safe working space of r10...r16 and CR with r12 being used + * as the exception area pointer in the PACA for that level of re-entrancy + * and r13 containing the PACA pointer. + * + * SRR0 and SRR1 are saved, but DEAR and ESR are not, since they don't apply + * as-is for instruction exceptions. It's up to the actual exception code + * to save them as well if required. + */ +#define TLB_MISS_PROLOG \ + mtspr SPRN_SPRG_TLB_SCRATCH,r12; \ + mfspr r12,SPRN_SPRG_TLB_EXFRAME; \ + std r10,EX_TLB_R10(r12); \ + mfcr r10; \ + std r11,EX_TLB_R11(r12); \ + mfspr r11,SPRN_SPRG_TLB_SCRATCH; \ + std r13,EX_TLB_R13(r12); \ + mfspr r13,SPRN_SPRG_PACA; \ + std r14,EX_TLB_R14(r12); \ + addi r14,r12,EX_TLB_SIZE; \ + std r15,EX_TLB_R15(r12); \ + mfspr r15,SPRN_SRR1; \ + std r16,EX_TLB_R16(r12); \ + mfspr r16,SPRN_SRR0; \ + std r10,EX_TLB_CR(r12); \ + std r11,EX_TLB_R12(r12); \ + mtspr SPRN_SPRG_TLB_EXFRAME,r14; \ + std r15,EX_TLB_SRR1(r12); \ + std r16,EX_TLB_SRR0(r12); \ + TLB_MISS_PROLOG_STATS + +/* And these are the matching epilogs that restores things + * + * There are 3 epilogs: + * + * - SUCCESS : Unwinds one level + * - ERROR : restore from level 0 and reset + * - ERROR_SPECIAL : restore from current level and reset + * + * Normal errors use ERROR, that is, they restore the initial fault context + * and trigger a fault. However, there is a special case for linear mapping + * errors. Those should basically never happen, but if they do happen, we + * want the error to point out the context that did that linear mapping + * fault, not the initial level 0 (basically, we got a bogus PGF or something + * like that). For userland errors on the linear mapping, there is no + * difference since those are always level 0 anyway + */ + +#define TLB_MISS_RESTORE(freg) \ + ld r14,EX_TLB_CR(r12); \ + ld r10,EX_TLB_R10(r12); \ + ld r15,EX_TLB_SRR0(r12); \ + ld r16,EX_TLB_SRR1(r12); \ + mtspr SPRN_SPRG_TLB_EXFRAME,freg; \ + ld r11,EX_TLB_R11(r12); \ + mtcr r14; \ + ld r13,EX_TLB_R13(r12); \ + ld r14,EX_TLB_R14(r12); \ + mtspr SPRN_SRR0,r15; \ + ld r15,EX_TLB_R15(r12); \ + mtspr SPRN_SRR1,r16; \ + TLB_MISS_RESTORE_STATS \ + ld r16,EX_TLB_R16(r12); \ + ld r12,EX_TLB_R12(r12); \ + +#define TLB_MISS_EPILOG_SUCCESS \ + TLB_MISS_RESTORE(r12) + +#define TLB_MISS_EPILOG_ERROR \ + addi r12,r13,PACA_EXTLB; \ + TLB_MISS_RESTORE(r12) + +#define TLB_MISS_EPILOG_ERROR_SPECIAL \ + addi r11,r13,PACA_EXTLB; \ + TLB_MISS_RESTORE(r11) + +#ifdef CONFIG_BOOK3E_MMU_TLB_STATS +#define TLB_MISS_PROLOG_STATS \ + mflr r10; \ + std r8,EX_TLB_R8(r12); \ + std r9,EX_TLB_R9(r12); \ + std r10,EX_TLB_LR(r12); +#define TLB_MISS_RESTORE_STATS \ + ld r16,EX_TLB_LR(r12); \ + ld r9,EX_TLB_R9(r12); \ + ld r8,EX_TLB_R8(r12); \ + mtlr r16; +#define TLB_MISS_STATS_D(name) \ + addi r9,r13,MMSTAT_DSTATS+name; \ + bl .tlb_stat_inc; +#define TLB_MISS_STATS_I(name) \ + addi r9,r13,MMSTAT_ISTATS+name; \ + bl .tlb_stat_inc; +#define TLB_MISS_STATS_X(name) \ + ld r8,PACA_EXTLB+EX_TLB_ESR(r13); \ + cmpdi cr2,r8,-1; \ + beq cr2,61f; \ + addi r9,r13,MMSTAT_DSTATS+name; \ + b 62f; \ +61: addi r9,r13,MMSTAT_ISTATS+name; \ +62: bl .tlb_stat_inc; +#define TLB_MISS_STATS_SAVE_INFO \ + std r14,EX_TLB_ESR(r12); /* save ESR */ \ + + +#else +#define TLB_MISS_PROLOG_STATS +#define TLB_MISS_RESTORE_STATS +#define TLB_MISS_STATS_D(name) +#define TLB_MISS_STATS_I(name) +#define TLB_MISS_STATS_X(name) +#define TLB_MISS_STATS_Y(name) +#define TLB_MISS_STATS_SAVE_INFO +#endif + + +#endif /* _ASM_POWERPC_EXCEPTION_64E_H */ + diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c8715331e1b0..6315edc205d8 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -652,6 +652,16 @@ * - SPRG2 scratch for exception vectors * - SPRG3 unused (user visible) * + * 64-bit embedded + * - SPRG0 generic exception scratch + * - SPRG2 TLB exception stack + * - SPRG3 unused (user visible) + * - SPRG4 unused (user visible) + * - SPRG6 TLB miss scratch (user visible, sorry !) + * - SPRG7 critical exception scratch + * - SPRG8 machine check exception scratch + * - SPRG9 debug exception scratch + * * All 32-bit: * - SPRG3 current thread_info pointer * (virtual on BookE, physical on others) @@ -705,6 +715,15 @@ #define SPRN_SPRG_SCRATCH0 SPRN_SPRG2 #endif +#ifdef CONFIG_PPC_BOOK3E_64 +#define SPRN_SPRG_MC_SCRATCH SPRN_SPRG8 +#define SPRN_SPRG_CRIT_SCRATCH SPRN_SPRG7 +#define SPRN_SPRG_DBG_SCRATCH SPRN_SPRG9 +#define SPRN_SPRG_TLB_EXFRAME SPRN_SPRG2 +#define SPRN_SPRG_TLB_SCRATCH SPRN_SPRG6 +#define SPRN_SPRG_GEN_SCRATCH SPRN_SPRG0 +#endif + #ifdef CONFIG_PPC_BOOK3S_32 #define SPRN_SPRG_SCRATCH0 SPRN_SPRG0 #define SPRN_SPRG_SCRATCH1 SPRN_SPRG1 From dce6670aaa7efece0558010b48d5ef9d421154be Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:42 +0000 Subject: [PATCH 0145/1662] powerpc: Add PACA fields specific to 64-bit Book3E processors This adds various fields in the PACA that are for use specifically by Book3E processors, such as exception save areas, current pgd pointer, special exceptions kernel stacks etc... Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/paca.h | 23 ++++++++++++++++++++--- arch/powerpc/kernel/asm-offsets.c | 14 ++++++++++++++ arch/powerpc/kernel/paca.c | 3 +++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index c8a3cbfe02ff..b634456ea893 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -14,9 +14,11 @@ #define _ASM_POWERPC_PACA_H #ifdef __KERNEL__ -#include -#include -#include +#include +#include +#include +#include +#include register struct paca_struct *local_paca asm("r13"); @@ -91,6 +93,21 @@ struct paca_struct { u16 slb_cache[SLB_CACHE_ENTRIES]; #endif /* CONFIG_PPC_STD_MMU_64 */ +#ifdef CONFIG_PPC_BOOK3E + pgd_t *pgd; /* Current PGD */ + pgd_t *kernel_pgd; /* Kernel PGD */ + u64 exgen[8] __attribute__((aligned(0x80))); + u64 extlb[EX_TLB_SIZE*3] __attribute__((aligned(0x80))); + u64 exmc[8]; /* used for machine checks */ + u64 excrit[8]; /* used for crit interrupts */ + u64 exdbg[8]; /* used for debug interrupts */ + + /* Kernel stack pointers for use by special exceptions */ + void *mc_kstack; + void *crit_kstack; + void *dbg_kstack; +#endif /* CONFIG_PPC_BOOK3E */ + mm_context_t context; /* diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0a9f30b54952..b9e010d0fc91 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -140,6 +140,20 @@ int main(void) context.high_slices_psize)); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ + +#ifdef CONFIG_PPC_BOOK3E + DEFINE(PACAPGD, offsetof(struct paca_struct, pgd)); + DEFINE(PACA_KERNELPGD, offsetof(struct paca_struct, kernel_pgd)); + DEFINE(PACA_EXGEN, offsetof(struct paca_struct, exgen)); + DEFINE(PACA_EXTLB, offsetof(struct paca_struct, extlb)); + DEFINE(PACA_EXMC, offsetof(struct paca_struct, exmc)); + DEFINE(PACA_EXCRIT, offsetof(struct paca_struct, excrit)); + DEFINE(PACA_EXDBG, offsetof(struct paca_struct, exdbg)); + DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack)); + DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack)); + DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack)); +#endif /* CONFIG_PPC_BOOK3E */ + #ifdef CONFIG_PPC_STD_MMU_64 DEFINE(PACASTABREAL, offsetof(struct paca_struct, stab_real)); DEFINE(PACASTABVIRT, offsetof(struct paca_struct, stab_addr)); diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index e9962c7f8a09..d16b1ea55d44 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -13,6 +13,7 @@ #include #include #include +#include /* This symbol is provided by the linker - let it fill in the paca * field correctly */ @@ -87,6 +88,8 @@ void __init initialise_pacas(void) #ifdef CONFIG_PPC_BOOK3S new_paca->lppaca_ptr = &lppaca[cpu]; +#else + new_paca->kernel_pgd = swapper_pg_dir; #endif new_paca->lock_token = 0x8000; new_paca->paca_index = cpu; From a8f7758c1c52a13e031266483efd5525157e43e9 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:45 +0000 Subject: [PATCH 0146/1662] powerpc/mm: Move around mmu_gathers definition on 64-bit The definition for the global structure mmu_gathers, used by generic code, is currently defined in multiple places not including anything used by 64-bit Book3E. This changes it by moving to one place common to all processors. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/init_32.c | 2 -- arch/powerpc/mm/pgtable.c | 2 ++ arch/powerpc/mm/tlb_hash64.c | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 3de6a0d93824..3ef5084b90ca 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -54,8 +54,6 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - phys_addr_t total_memory; phys_addr_t total_lowmem; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a65979a5f75b..cafb2a269542 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -30,6 +30,8 @@ #include #include +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); + #ifdef CONFIG_SMP /* diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 8e35a6066938..2b2f35f6985e 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -33,11 +33,6 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch); -/* This is declared as we are using the more or less generic - * arch/powerpc/include/asm/tlb.h file -- tgall - */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* * A linux PTE was changed and the corresponding hash table entry * neesd to be flushed. This function will either perform the flush From 25d21ad6e799cccd097b9df2a2fefe19a7e1dfcf Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:47 +0000 Subject: [PATCH 0147/1662] powerpc: Add TLB management code for 64-bit Book3E This adds the TLB miss handler assembly, the low level TLB flush routines along with the necessary hook for dealing with our virtual page tables or indirect TLB entries that need to be flushes when PTE pages are freed. There is currently no support for hugetlbfs Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-40x.h | 3 + arch/powerpc/include/asm/mmu-44x.h | 6 + arch/powerpc/include/asm/mmu-8xx.h | 3 + arch/powerpc/include/asm/mmu-hash32.h | 6 + arch/powerpc/include/asm/mmu_context.h | 8 + arch/powerpc/kernel/setup_64.c | 4 + arch/powerpc/mm/mmu_decl.h | 14 +- arch/powerpc/mm/tlb_low_64e.S | 734 +++++++++++++++++++++++++ arch/powerpc/mm/tlb_nohash.c | 203 ++++++- arch/powerpc/mm/tlb_nohash_low.S | 79 +++ 10 files changed, 1055 insertions(+), 5 deletions(-) create mode 100644 arch/powerpc/mm/tlb_low_64e.S diff --git a/arch/powerpc/include/asm/mmu-40x.h b/arch/powerpc/include/asm/mmu-40x.h index 776f415a36aa..34916865eaef 100644 --- a/arch/powerpc/include/asm/mmu-40x.h +++ b/arch/powerpc/include/asm/mmu-40x.h @@ -61,4 +61,7 @@ typedef struct { #endif /* !__ASSEMBLY__ */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_256M + #endif /* _ASM_POWERPC_MMU_40X_H_ */ diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h index 3c86576bfefa..0372669383a8 100644 --- a/arch/powerpc/include/asm/mmu-44x.h +++ b/arch/powerpc/include/asm/mmu-44x.h @@ -79,16 +79,22 @@ typedef struct { #if (PAGE_SHIFT == 12) #define PPC44x_TLBE_SIZE PPC44x_TLB_4K +#define mmu_virtual_psize MMU_PAGE_4K #elif (PAGE_SHIFT == 14) #define PPC44x_TLBE_SIZE PPC44x_TLB_16K +#define mmu_virtual_psize MMU_PAGE_16K #elif (PAGE_SHIFT == 16) #define PPC44x_TLBE_SIZE PPC44x_TLB_64K +#define mmu_virtual_psize MMU_PAGE_64K #elif (PAGE_SHIFT == 18) #define PPC44x_TLBE_SIZE PPC44x_TLB_256K +#define mmu_virtual_psize MMU_PAGE_256K #else #error "Unsupported PAGE_SIZE" #endif +#define mmu_linear_psize MMU_PAGE_256M + #define PPC44x_PGD_OFF_SHIFT (32 - PGDIR_SHIFT + PGD_T_LOG2) #define PPC44x_PGD_OFF_MASK_BIT (PGDIR_SHIFT - PGD_T_LOG2) #define PPC44x_PTE_ADD_SHIFT (32 - PGDIR_SHIFT + PTE_SHIFT + PTE_T_LOG2) diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h index 07865a357848..3d11d3ce79ec 100644 --- a/arch/powerpc/include/asm/mmu-8xx.h +++ b/arch/powerpc/include/asm/mmu-8xx.h @@ -143,4 +143,7 @@ typedef struct { } mm_context_t; #endif /* !__ASSEMBLY__ */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_8M + #endif /* _ASM_POWERPC_MMU_8XX_H_ */ diff --git a/arch/powerpc/include/asm/mmu-hash32.h b/arch/powerpc/include/asm/mmu-hash32.h index 16b1a1e77e64..382fc689f204 100644 --- a/arch/powerpc/include/asm/mmu-hash32.h +++ b/arch/powerpc/include/asm/mmu-hash32.h @@ -80,4 +80,10 @@ typedef struct { #endif /* !__ASSEMBLY__ */ +/* We happily ignore the smaller BATs on 601, we don't actually use + * those definitions on hash32 at the moment anyway + */ +#define mmu_virtual_psize MMU_PAGE_4K +#define mmu_linear_psize MMU_PAGE_256M + #endif /* _ASM_POWERPC_MMU_HASH32_H_ */ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 8dffed317013..b34e94d94435 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -43,6 +43,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, tsk->thread.pgdir = next->pgd; #endif /* CONFIG_PPC32 */ + /* 64-bit Book3E keeps track of current PGD in the PACA */ +#ifdef CONFIG_PPC_BOOK3E_64 + get_paca()->pgd = next->pgd; +#endif /* Nothing else to do if we aren't actually switching */ if (prev == next) return; @@ -89,6 +93,10 @@ static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { + /* 64-bit Book3E keeps track of current PGD in the PACA */ +#ifdef CONFIG_PPC_BOOK3E_64 + get_paca()->pgd = NULL; +#endif } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a6b6c4c9ae41..65aced7b833a 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -62,6 +62,7 @@ #include #include #include +#include #include "setup.h" @@ -147,6 +148,9 @@ void __init setup_paca(int cpu) { local_paca = &paca[cpu]; mtspr(SPRN_SPRG_PACA, local_paca); +#ifdef CONFIG_PPC_BOOK3E + mtspr(SPRN_SPRG_TLB_EXFRAME, local_paca->extlb); +#endif } /* diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 3871dceee2dd..5961c6b739dd 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -41,7 +41,11 @@ static inline void _tlbil_pid(unsigned int pid) #else /* CONFIG_40x || CONFIG_8xx */ extern void _tlbil_all(void); extern void _tlbil_pid(unsigned int pid); +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbil_pid_noind(unsigned int pid); +#else #define _tlbil_pid_noind(pid) _tlbil_pid(pid) +#endif #endif /* !(CONFIG_40x || CONFIG_8xx) */ /* @@ -53,7 +57,10 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, { asm volatile ("tlbie %0; sync" : : "r" (address) : "memory"); } -#else /* CONFIG_8xx */ +#elif defined(CONFIG_PPC_BOOK3E) +extern void _tlbil_va(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else extern void __tlbil_va(unsigned long address, unsigned int pid); static inline void _tlbil_va(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) @@ -67,11 +74,16 @@ static inline void _tlbil_va(unsigned long address, unsigned int pid, * implementation. When that becomes the case, this will be * an extern. */ +#ifdef CONFIG_PPC_BOOK3E +extern void _tlbivax_bcast(unsigned long address, unsigned int pid, + unsigned int tsize, unsigned int ind); +#else static inline void _tlbivax_bcast(unsigned long address, unsigned int pid, unsigned int tsize, unsigned int ind) { BUG(); } +#endif #else /* CONFIG_PPC_MMU_NOHASH */ diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S new file mode 100644 index 000000000000..10d524ded7b2 --- /dev/null +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -0,0 +1,734 @@ +/* + * Low leve TLB miss handlers for Book3E + * + * Copyright (C) 2008-2009 + * Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PPC_64K_PAGES +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE+1) +#else +#define VPTE_PMD_SHIFT (PTE_INDEX_SIZE) +#endif +#define VPTE_PUD_SHIFT (VPTE_PMD_SHIFT + PMD_INDEX_SIZE) +#define VPTE_PGD_SHIFT (VPTE_PUD_SHIFT + PUD_INDEX_SIZE) +#define VPTE_INDEX_SIZE (VPTE_PGD_SHIFT + PGD_INDEX_SIZE) + + +/********************************************************************** + * * + * TLB miss handling for Book3E with TLB reservation and HES support * + * * + **********************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* The page tables are mapped virtually linear. At this point, though, + * we don't know whether we are trying to fault in a first level + * virtual address or a virtual page table address. We can get that + * from bit 0x1 of the region ID which we have set for a page table + */ + andi. r10,r15,0x1 + bne- virt_page_table_tlb_miss + + std r14,EX_TLB_ESR(r12); /* save ESR */ + std r16,EX_TLB_DEAR(r12); /* save DEAR */ + + /* We need _PAGE_PRESENT and _PAGE_ACCESSED set */ + li r11,_PAGE_PRESENT + oris r11,r11,_PAGE_ACCESSED@h + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r15,0 /* Check for user region */ + + /* We pre-test some combination of permissions to avoid double + * faults: + * + * We move the ESR:ST bit into the position of _PAGE_BAP_SW in the PTE + * ESR_ST is 0x00800000 + * _PAGE_BAP_SW is 0x00000010 + * So the shift is >> 19. This tests for supervisor writeability. + * If the page happens to be supervisor writeable and not user + * writeable, we will take a new fault later, but that should be + * a rare enough case. + * + * We also move ESR_ST in _PAGE_DIRTY position + * _PAGE_DIRTY is 0x00001000 so the shift is >> 11 + * + * MAS1 is preset for all we need except for TID that needs to + * be cleared for kernel translations + */ + rlwimi r11,r14,32-19,27,27 + rlwimi r11,r14,32-16,19,19 + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by writing a crazy value in ESR in our exception frame + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r15,r16,60 /* get region */ + cmpldi cr0,r15,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + oris r11,r11,_PAGE_ACCESSED@h + + cmpldi cr0,r15,0 /* Check for user region */ + std r14,EX_TLB_ESR(r12) /* write crazy -1 to frame */ + beq normal_tlb_miss + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r15,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + beq+ normal_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of the first-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = faulting address + * r15 = region ID + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = PTE permission mask + * r10 = crap (free to use) + */ +normal_tlb_miss: + /* So we first construct the page table address. We do that by + * shifting the bottom of the address (not the region ID) by + * PAGE_SHIFT-3, clearing the bottom 3 bits (get a PTE ptr) and + * or'ing the fourth high bit. + * + * NOTE: For 64K pages, we do things slightly differently in + * order to handle the weird page table format used by linux + */ + ori r10,r15,0x1 +#ifdef CONFIG_PPC_64K_PAGES + /* For the top bits, 16 bytes per PTE */ + rldicl r14,r16,64-(PAGE_SHIFT-4),PAGE_SHIFT-4+4 + /* Now create the bottom bits as 0 in position 0x8000 and + * the rest calculated for 8 bytes per PTE + */ + rldicl r15,r16,64-(PAGE_SHIFT-3),64-15 + /* Insert the bottom bits in */ + rlwimi r14,r15,0,16,31 +#else + rldicl r14,r16,64-(PAGE_SHIFT-3),PAGE_SHIFT-3+4 +#endif + sldi r15,r10,60 + clrrdi r14,r14,3 + or r10,r15,r14 + + /* Set the TLB reservation and seach for existing entry. Then load + * the entry. + */ + PPC_TLBSRX_DOT(0,r16) + ld r14,0(r10) + beq normal_tlb_miss_done + +finish_normal_tlb_miss: + /* Check if required permissions are met */ + andc. r15,r11,r14 + bne- normal_tlb_miss_access_fault + + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE need change if !base page size, not + * yet implemented for now + * MAS 2 : Defaults not useful, need to be redone + * MAS 3+7 : Needs to be done + * + * TODO: mix up code below for better scheduling + */ + clrrdi r11,r16,12 /* Clear low crap in EA */ + rlwimi r11,r14,32-19,27,31 /* Insert WIMGE */ + mtspr SPRN_MAS2,r11 + + /* Check page size, if not standard, update MAS1 */ + rldicl r11,r14,64-8,64-8 +#ifdef CONFIG_PPC_64K_PAGES + cmpldi cr0,r11,BOOK3E_PAGESZ_64K +#else + cmpldi cr0,r11,BOOK3E_PAGESZ_4K +#endif + beq- 1f + mfspr r11,SPRN_MAS1 + rlwimi r11,r14,31,21,24 + rlwinm r11,r11,0,21,19 + mtspr SPRN_MAS1,r11 +1: + /* Move RPN in position */ + rldicr r11,r14,64-(PTE_RPN_SHIFT-PAGE_SHIFT),63-PAGE_SHIFT + clrldi r15,r11,12 /* Clear crap at the top */ + rlwimi r15,r14,32-8,22,25 /* Move in U bits */ + rlwimi r15,r14,32-2,26,31 /* Move in BAP bits */ + + /* Mask out SW and UW if !DIRTY (XXX optimize this !) */ + andi. r11,r14,_PAGE_DIRTY + bne 1f + li r11,MAS3_SW|MAS3_UW + andc r15,r15,r11 +1: mtspr SPRN_MAS7_MAS3,r15 + + tlbwe + +normal_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +normal_tlb_miss_access_fault: + /* We need to check if it was an instruction miss */ + andi. r10,r11,_PAGE_HWEXEC + bne 1f + ld r14,EX_TLB_DEAR(r12) + ld r15,EX_TLB_ESR(r12) + mtspr SPRN_DEAR,r14 + mtspr SPRN_ESR,r15 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = region (top 4 bits of address) + * r14 = crap (free to use) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * Note that this should only ever be called as a second level handler + * with the current scheme when using SW load. + * That means we can always get the original fault DEAR at + * EX_TLB_DEAR-EX_TLB_SIZE(r12) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will restart the whole fault at level + * 0 so we don't care too much about clobbers + * + * XXX That code was written back when we couldn't clobber r14. We can now, + * so we could probably optimize things a bit + */ +virt_page_table_tlb_miss: + /* Are we hitting a kernel page table ? */ + andi. r10,r15,0x8 + + /* The cool thing now is that r10 contains 0 for user and 8 for kernel, + * and we happen to have the swapper_pg_dir at offset 8 from the user + * pgdir in the PACA :-). + */ + add r11,r10,r13 + + /* If kernel, we need to clear MAS1 TID */ + beq 1f + /* XXX replace the RMW cycles with immediate loads + writes */ + mfspr r10,SPRN_MAS1 + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 +1: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + */ + PPC_TLBSRX_DOT(0,r16) + beq virt_page_table_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-(VPTE_INDEX_SIZE+3),VPTE_INDEX_SIZE+3+4 + bne- virt_page_table_tlb_miss_fault + + /* Get the PGD pointer */ + ld r15,PACAPGD(r11) + cmpldi cr0,r15,0 + beq- virt_page_table_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-VPTE_PGD_SHIFT,64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-VPTE_PUD_SHIFT,64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-VPTE_PMD_SHIFT,64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq virt_page_table_tlb_miss_fault + + /* Ok, we're all right, we can now create a kernel translation for + * a 4K or 64K page from r16 -> r15. + */ + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + * + * So we only do MAS 2 and 3 for now... + */ + clrldi r11,r15,4 /* remove region ID from RPN */ + ori r10,r11,1 /* Or-in SR */ + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +virt_page_table_tlb_miss_done: + + /* We have overriden MAS2:EPN but currently our primary TLB miss + * handler will always restore it so that should not be an issue, + * if we ever optimize the primary handler to not write MAS2 on + * some cases, we'll have to restore MAS2:EPN here based on the + * original fault's DEAR. If we do that we have to modify the + * ITLB miss handler to also store SRR0 in the exception frame + * as DEAR. + * + * However, one nasty thing we did is we cleared the reservation + * (well, potentially we did). We do a trick here thus if we + * are not a level 0 exception (we interrupted the TLB miss) we + * offset the return address by -4 in order to replay the tlbsrx + * instruction there + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- 1f + ld r11,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) + addi r10,r11,-4 + std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) +1: + /* Return to caller, normal case */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); + TLB_MISS_EPILOG_SUCCESS + rfi + +virt_page_table_tlb_miss_fault: + /* If we fault here, things are a little bit tricky. We need to call + * either data or instruction store fault, and we need to retreive + * the original fault address and ESR (for data). + * + * The thing is, we know that in normal circumstances, this is + * always called as a second level tlb miss for SW load or as a first + * level TLB miss for HW load, so we should be able to peek at the + * relevant informations in the first exception frame in the PACA. + * + * However, we do need to double check that, because we may just hit + * a stray kernel pointer or a userland attack trying to hit those + * areas. If that is the case, we do a data fault. (We can't get here + * from an instruction tlb miss anyway). + * + * Note also that when going to a fault, we must unwind the previous + * level as well. Since we are doing that, we don't need to clear or + * restore the TLB reservation neither. + */ + subf r10,r13,r12 + cmpldi cr0,r10,PACA_EXTLB+EX_TLB_SIZE + bne- virt_page_table_tlb_miss_whacko_fault + + /* We dig the original DEAR and ESR from slot 0 */ + ld r15,EX_TLB_DEAR+PACA_EXTLB(r13) + ld r16,EX_TLB_ESR+PACA_EXTLB(r13) + + /* We check for the "special" ESR value for instruction faults */ + cmpdi cr0,r16,-1 + beq 1f + mtspr SPRN_DEAR,r15 + mtspr SPRN_ESR,r16 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +virt_page_table_tlb_miss_whacko_fault: + /* The linear fault will restart everything so ESR and DEAR will + * not have been clobbered, let's just fault with what we have + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_FAULT); + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + + +/************************************************************** + * * + * TLB miss handling for Book3E with hw page table support * + * * + **************************************************************/ + + +/* Data TLB miss */ + START_EXCEPTION(data_tlb_miss_htw) + TLB_MISS_PROLOG + + /* Now we handle the fault proper. We only save DEAR in normal + * fault case since that's the only interesting values here. + * We could probably also optimize by not saving SRR0/1 in the + * linear mapping case but I'll leave that for later + */ + mfspr r14,SPRN_ESR + mfspr r16,SPRN_DEAR /* get faulting address */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault with whatever DEAR and ESR + * are here + */ + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e + +/* Instruction TLB miss */ + START_EXCEPTION(instruction_tlb_miss_htw) + TLB_MISS_PROLOG + + /* If we take a recursive fault, the second level handler may need + * to know whether we are handling a data or instruction fault in + * order to get to the right store fault handler. We provide that + * info by keeping a crazy value for ESR in r14 + */ + li r14,-1 /* store to exception frame is done later */ + + /* Now we handle the fault proper. We only save DEAR in the non + * linear mapping case since we know the linear mapping case will + * not re-enter. We could indeed optimize and also not save SRR0/1 + * in the linear mapping case but I'll leave that for later + * + * Faulting address is SRR0 which is already in r16 + */ + srdi r11,r16,60 /* get region */ + cmpldi cr0,r11,0xc /* linear mapping ? */ + TLB_MISS_STATS_SAVE_INFO + beq tlb_load_linear /* yes -> go to linear map load */ + + /* We do the user/kernel test for the PID here along with the RW test + */ + cmpldi cr0,r11,0 /* Check for user region */ + ld r15,PACAPGD(r13) /* Load user pgdir */ + beq htw_tlb_miss + + /* XXX replace the RMW cycles with immediate loads + writes */ +1: mfspr r10,SPRN_MAS1 + cmpldi cr0,r11,8 /* Check for vmalloc region */ + rlwinm r10,r10,0,16,1 /* Clear TID */ + mtspr SPRN_MAS1,r10 + ld r15,PACA_KERNELPGD(r13) /* Load kernel pgdir */ + beq+ htw_tlb_miss + + /* We got a crappy address, just fault */ + TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + + +/* + * This is the guts of the second-level TLB miss handler for direct + * misses. We are entered with: + * + * r16 = virtual page table faulting address + * r15 = PGD pointer + * r14 = ESR + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * It can be re-entered by the linear mapping miss handler. However, to + * avoid too much complication, it will save/restore things for us + */ +htw_tlb_miss: + /* Search if we already have a TLB entry for that virtual address, and + * if we do, bail out. + * + * MAS1:IND should be already set based on MAS4 + */ + PPC_TLBSRX_DOT(0,r16) + beq htw_tlb_miss_done + + /* Now, we need to walk the page tables. First check if we are in + * range. + */ + rldicl. r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4 + bne- htw_tlb_miss_fault + + /* Get the PGD pointer */ + cmpldi cr0,r15,0 + beq- htw_tlb_miss_fault + + /* Get to PGD entry */ + rldicl r11,r16,64-(PGDIR_SHIFT-3),64-PGD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + +#ifndef CONFIG_PPC_64K_PAGES + /* Get to PUD entry */ + rldicl r11,r16,64-(PUD_SHIFT-3),64-PUD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault +#endif /* CONFIG_PPC_64K_PAGES */ + + /* Get to PMD entry */ + rldicl r11,r16,64-(PMD_SHIFT-3),64-PMD_INDEX_SIZE-3 + clrrdi r10,r11,3 + ldx r15,r10,r15 + cmpldi cr0,r15,0 + beq htw_tlb_miss_fault + + /* Ok, we're all right, we can now create an indirect entry for + * a 1M or 256M page. + * + * The last trick is now that because we use "half" pages for + * the HTW (1M IND is 2K and 256M IND is 32K) we need to account + * for an added LSB bit to the RPN. For 64K pages, there is no + * problem as we already use 32K arrays (half PTE pages), but for + * 4K page we need to extract a bit from the virtual address and + * insert it into the "PA52" bit of the RPN. + */ +#ifndef CONFIG_PPC_64K_PAGES + rlwimi r15,r16,32-9,20,20 +#endif + /* Now we build the MAS: + * + * MAS 0 : Fully setup with defaults in MAS4 and TLBnCFG + * MAS 1 : Almost fully setup + * - PID already updated by caller if necessary + * - TSIZE for now is base ind page size always + * MAS 2 : Use defaults + * MAS 3+7 : Needs to be done + */ +#ifdef CONFIG_PPC_64K_PAGES + ori r10,r15,(BOOK3E_PAGESZ_64K << MAS3_SPSIZE_SHIFT) +#else + ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) +#endif + mtspr SPRN_MAS7_MAS3,r10 + + tlbwe + +htw_tlb_miss_done: + /* We don't bother with restoring DEAR or ESR since we know we are + * level 0 and just going back to userland. They are only needed + * if you are going to take an access fault + */ + TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK) + TLB_MISS_EPILOG_SUCCESS + rfi + +htw_tlb_miss_fault: + /* We need to check if it was an instruction miss. We know this + * though because r14 would contain -1 + */ + cmpdi cr0,r14,-1 + beq 1f + mtspr SPRN_DEAR,r16 + mtspr SPRN_ESR,r14 + TLB_MISS_STATS_D(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_data_storage_book3e +1: TLB_MISS_STATS_I(MMSTAT_TLB_MISS_PT_FAULT) + TLB_MISS_EPILOG_ERROR + b exc_instruction_storage_book3e + +/* + * This is the guts of "any" level TLB miss handler for kernel linear + * mapping misses. We are entered with: + * + * + * r16 = faulting address + * r15 = crap (free to use) + * r14 = ESR (data) or -1 (instruction) + * r13 = PACA + * r12 = TLB exception frame in PACA + * r11 = crap (free to use) + * r10 = crap (free to use) + * + * In addition we know that we will not re-enter, so in theory, we could + * use a simpler epilog not restoring SRR0/1 etc.. but we'll do that later. + * + * We also need to be careful about MAS registers here & TLB reservation, + * as we know we'll have clobbered them if we interrupt the main TLB miss + * handlers in which case we probably want to do a full restart at level + * 0 rather than saving / restoring the MAS. + * + * Note: If we care about performance of that core, we can easily shuffle + * a few things around + */ +tlb_load_linear: + /* For now, we assume the linear mapping is contiguous and stops at + * linear_map_top. We also assume the size is a multiple of 1G, thus + * we only use 1G pages for now. That might have to be changed in a + * final implementation, especially when dealing with hypervisors + */ + ld r11,PACATOC(r13) + ld r11,linear_map_top@got(r11) + ld r10,0(r11) + cmpld cr0,r10,r16 + bge tlb_load_linear_fault + + /* MAS1 need whole new setup. */ + li r15,(BOOK3E_PAGESZ_1GB< - * IBM Corp. + * Copyright 2008,2009 Ben Herrenschmidt + * IBM Corp. * * Derived from arch/ppc/mm/init.c: * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) @@ -34,12 +34,70 @@ #include #include #include +#include #include #include +#include #include "mmu_decl.h" +#ifdef CONFIG_PPC_BOOK3E +struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = { + [MMU_PAGE_4K] = { + .shift = 12, + .enc = BOOK3E_PAGESZ_4K, + }, + [MMU_PAGE_16K] = { + .shift = 14, + .enc = BOOK3E_PAGESZ_16K, + }, + [MMU_PAGE_64K] = { + .shift = 16, + .enc = BOOK3E_PAGESZ_64K, + }, + [MMU_PAGE_1M] = { + .shift = 20, + .enc = BOOK3E_PAGESZ_1M, + }, + [MMU_PAGE_16M] = { + .shift = 24, + .enc = BOOK3E_PAGESZ_16M, + }, + [MMU_PAGE_256M] = { + .shift = 28, + .enc = BOOK3E_PAGESZ_256M, + }, + [MMU_PAGE_1G] = { + .shift = 30, + .enc = BOOK3E_PAGESZ_1GB, + }, +}; +static inline int mmu_get_tsize(int psize) +{ + return mmu_psize_defs[psize].enc; +} +#else +static inline int mmu_get_tsize(int psize) +{ + /* This isn't used on !Book3E for now */ + return 0; +} +#endif + +/* The variables below are currently only used on 64-bit Book3E + * though this will probably be made common with other nohash + * implementations at some point + */ +#ifdef CONFIG_PPC64 + +int mmu_linear_psize; /* Page size used for the linear mapping */ +int mmu_pte_psize; /* Page size used for PTE pages */ +int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ +unsigned long linear_map_top; /* Top of linear mapping */ + +#endif /* CONFIG_PPC64 */ + /* * Base TLB flushing operations: * @@ -82,7 +140,7 @@ void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __local_flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(local_flush_tlb_page); @@ -198,7 +256,7 @@ void __flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { __flush_tlb_page(vma ? vma->vm_mm : NULL, vmaddr, - 0 /* tsize unused for now */, 0); + mmu_get_tsize(mmu_virtual_psize), 0); } EXPORT_SYMBOL(flush_tlb_page); @@ -241,3 +299,140 @@ void tlb_flush(struct mmu_gather *tlb) /* Push out batch of freed page tables */ pte_free_finish(); } + +/* + * Below are functions specific to the 64-bit variant of Book3E though that + * may change in the future + */ + +#ifdef CONFIG_PPC64 + +/* + * Handling of virtual linear page tables or indirect TLB entries + * flushing when PTE pages are freed + */ +void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address) +{ + int tsize = mmu_psize_defs[mmu_pte_psize].enc; + + if (book3e_htw_enabled) { + unsigned long start = address & PMD_MASK; + unsigned long end = address + PMD_SIZE; + unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift; + + /* This isn't the most optimal, ideally we would factor out the + * while preempt & CPU mask mucking around, or even the IPI but + * it will do for now + */ + while (start < end) { + __flush_tlb_page(tlb->mm, start, tsize, 1); + start += size; + } + } else { + unsigned long rmask = 0xf000000000000000ul; + unsigned long rid = (address & rmask) | 0x1000000000000000ul; + unsigned long vpte = address & ~rmask; + +#ifdef CONFIG_PPC_64K_PAGES + vpte = (vpte >> (PAGE_SHIFT - 4)) & ~0xfffful; +#else + vpte = (vpte >> (PAGE_SHIFT - 3)) & ~0xffful; +#endif + vpte |= rid; + __flush_tlb_page(tlb->mm, vpte, tsize, 0); + } +} + +/* + * Early initialization of the MMU TLB code + */ +static void __early_init_mmu(int boot_cpu) +{ + extern unsigned int interrupt_base_book3e; + extern unsigned int exc_data_tlb_miss_htw_book3e; + extern unsigned int exc_instruction_tlb_miss_htw_book3e; + + unsigned int *ibase = &interrupt_base_book3e; + unsigned int mas4; + + /* XXX This will have to be decided at runtime, but right + * now our boot and TLB miss code hard wires it + */ + mmu_linear_psize = MMU_PAGE_1G; + + + /* Check if HW tablewalk is present, and if yes, enable it by: + * + * - patching the TLB miss handlers to branch to the + * one dedicates to it + * + * - setting the global book3e_htw_enabled + * + * - Set MAS4:INDD and default page size + */ + + /* XXX This code only checks for TLB 0 capabilities and doesn't + * check what page size combos are supported by the HW. It + * also doesn't handle the case where a separate array holds + * the IND entries from the array loaded by the PT. + */ + if (boot_cpu) { + unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG); + + /* Check if HW loader is supported */ + if ((tlb0cfg & TLBnCFG_IND) && + (tlb0cfg & TLBnCFG_PT)) { + patch_branch(ibase + (0x1c0 / 4), + (unsigned long)&exc_data_tlb_miss_htw_book3e, 0); + patch_branch(ibase + (0x1e0 / 4), + (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0); + book3e_htw_enabled = 1; + } + pr_info("MMU: Book3E Page Tables %s\n", + book3e_htw_enabled ? "Enabled" : "Disabled"); + } + + /* Set MAS4 based on page table setting */ + + mas4 = 0x4 << MAS4_WIMGED_SHIFT; + if (book3e_htw_enabled) { + mas4 |= mas4 | MAS4_INDD; +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_256M; +#else + mas4 |= BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT; + mmu_pte_psize = MMU_PAGE_1M; +#endif + } else { +#ifdef CONFIG_PPC_64K_PAGES + mas4 |= BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT; +#else + mas4 |= BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT; +#endif + mmu_pte_psize = mmu_virtual_psize; + } + mtspr(SPRN_MAS4, mas4); + + /* Set the global containing the top of the linear mapping + * for use by the TLB miss code + */ + linear_map_top = lmb_end_of_DRAM(); + + /* A sync won't hurt us after mucking around with + * the MMU configuration + */ + mb(); +} + +void __init early_init_mmu(void) +{ + __early_init_mmu(1); +} + +void __cpuinit early_init_mmu_secondary(void) +{ + __early_init_mmu(0); +} + +#endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index c7d89a0adba2..7bcd9fbf6cc6 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -191,6 +191,85 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_TLBILX) isync 1: wrtee r10 blr +#elif defined(CONFIG_PPC_BOOK3E) +/* + * New Book3E (>= 2.06) implementation + * + * Note: We may be able to get away without the interrupt masking stuff + * if we save/restore MAS6 on exceptions that might modify it + */ +_GLOBAL(_tlbil_pid) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_pid_noind) + slwi r4,r3,MAS6_SPID_SHIFT + mfmsr r10 + ori r4,r4,MAS6_SIND + wrteei 0 + mtspr SPRN_MAS6,r4 + PPC_TLBILX_PID(0,0) + wrtee r10 + msync + isync + blr + +_GLOBAL(_tlbil_all) + PPC_TLBILX_ALL(0,0) + msync + isync + blr + +_GLOBAL(_tlbil_va) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBILX_VA(0,r3) + msync + isync + wrtee r10 + blr + +_GLOBAL(_tlbivax_bcast) + mfmsr r10 + wrteei 0 + cmpwi cr0,r6,0 + slwi r4,r4,MAS6_SPID_SHIFT + rlwimi r4,r5,MAS6_ISIZE_SHIFT,MAS6_ISIZE_MASK + beq 1f + rlwimi r4,r6,MAS6_SIND_SHIFT,MAS6_SIND +1: mtspr SPRN_MAS6,r4 /* assume AS=0 for now */ + PPC_TLBIVAX(0,r3) + eieio + tlbsync + sync + wrtee r10 + blr + +_GLOBAL(set_context) +#ifdef CONFIG_BDI_SWITCH + /* Context switch the PTE pointer for the Abatron BDI2000. + * The PGDIR is the second parameter. + */ + lis r5, abatron_pteptrs@h + ori r5, r5, abatron_pteptrs@l + stw r4, 0x4(r5) +#endif + mtspr SPRN_PID,r3 + isync /* Force context change */ + blr #else #error Unsupported processor type ! #endif From 32a74949b7337726e76d69f51c48715431126c6c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:58 +0000 Subject: [PATCH 0148/1662] powerpc/mm: Add support for SPARSEMEM_VMEMMAP on 64-bit Book3E The base TLB support didn't include support for SPARSEMEM_VMEMMAP, though we did carve out some virtual space for it, the necessary support code wasn't there. This implements it by using 16M pages for now, though the page size could easily be changed at runtime if necessary. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-book3e.h | 1 + arch/powerpc/include/asm/pgtable-ppc64.h | 3 +- arch/powerpc/mm/init_64.c | 55 +++++++++++++++++++++--- arch/powerpc/mm/mmu_decl.h | 7 ++- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/mm/tlb_nohash.c | 11 ++++- 6 files changed, 68 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index 6ddbe48d07fa..d74580469361 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -196,6 +196,7 @@ extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; #endif extern int mmu_linear_psize; +extern int mmu_vmemmap_psize; #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 7254c5a3187c..200ec2dfa034 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -46,6 +46,7 @@ /* * The vmalloc space starts at the beginning of that region, and * occupies half of it on hash CPUs and a quarter of it on Book3E + * (we keep a quarter for the virtual memmap) */ #define VMALLOC_START KERN_VIRT_START #ifdef CONFIG_PPC_BOOK3E @@ -83,7 +84,7 @@ #define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) -#define VMEMMAP_REGION_ID (0xfUL) +#define VMEMMAP_REGION_ID (0xfUL) /* Server only */ #define USER_REGION_ID (0UL) /* diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 68a821add28d..31582329cd67 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -205,6 +205,47 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size) return 0; } +/* On hash-based CPUs, the vmemmap is bolted in the hash table. + * + * On Book3E CPUs, the vmemmap is currently mapped in the top half of + * the vmalloc space using normal page tables, though the size of + * pages encoded in the PTEs can be different + */ + +#ifdef CONFIG_PPC_BOOK3E +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + /* Create a PTE encoding without page size */ + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | + _PAGE_KERNEL_RW; + + /* PTEs only contain page size encodings up to 32M */ + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); + + /* Encode the size in the PTE */ + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; + + /* For each PTE for that area, map things. Note that we don't + * increment phys because all PTEs are of the large size and + * thus must have the low bits clear + */ + for (i = 0; i < page_size; i += PAGE_SIZE) + BUG_ON(map_kernel_page(start + i, phys, flags)); +} +#else /* CONFIG_PPC_BOOK3E */ +static void __meminit vmemmap_create_mapping(unsigned long start, + unsigned long page_size, + unsigned long phys) +{ + int mapped = htab_bolt_mapping(start, start + page_size, phys, + PAGE_KERNEL, mmu_vmemmap_psize, + mmu_kernel_ssize); + BUG_ON(mapped < 0); +} +#endif /* CONFIG_PPC_BOOK3E */ + int __meminit vmemmap_populate(struct page *start_page, unsigned long nr_pages, int node) { @@ -215,8 +256,11 @@ int __meminit vmemmap_populate(struct page *start_page, /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); + pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", + start_page, nr_pages, node); + pr_debug(" -> map %lx..%lx\n", start, end); + for (; start < end; start += page_size) { - int mapped; void *p; if (vmemmap_populated(start, page_size)) @@ -226,13 +270,10 @@ int __meminit vmemmap_populate(struct page *start_page, if (!p) return -ENOMEM; - pr_debug("vmemmap %08lx allocated at %p, physical %08lx.\n", - start, p, __pa(p)); + pr_debug(" * %016lx..%016lx allocated at %p\n", + start, start + page_size, p); - mapped = htab_bolt_mapping(start, start + page_size, __pa(p), - pgprot_val(PAGE_KERNEL), - mmu_vmemmap_psize, mmu_kernel_ssize); - BUG_ON(mapped < 0); + vmemmap_create_mapping(start, page_size, __pa(p)); } return 0; diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 5961c6b739dd..d2e5321d5ea6 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -121,7 +121,12 @@ extern unsigned int rtas_data, rtas_size; struct hash_pte; extern struct hash_pte *Hash, *Hash_end; extern unsigned long Hash_size, Hash_mask; -#endif + +#endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC64 +extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags); +#endif /* CONFIG_PPC64 */ extern unsigned long ioremap_bot; extern unsigned long __max_low_memory; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 93ed1a3c8729..853d5565eed5 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -79,7 +79,7 @@ static void *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -static int map_kernel_page(unsigned long ea, unsigned long pa, int flags) +int map_kernel_page(unsigned long ea, unsigned long pa, int flags) { pgd_t *pgdp; pud_t *pudp; diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index d16100c9416a..2fbc680c2c71 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -93,6 +93,7 @@ static inline int mmu_get_tsize(int psize) int mmu_linear_psize; /* Page size used for the linear mapping */ int mmu_pte_psize; /* Page size used for PTE pages */ +int mmu_vmemmap_psize; /* Page size used for the virtual mem map */ int book3e_htw_enabled; /* Is HW tablewalk enabled ? */ unsigned long linear_map_top; /* Top of linear mapping */ @@ -356,10 +357,18 @@ static void __early_init_mmu(int boot_cpu) unsigned int mas4; /* XXX This will have to be decided at runtime, but right - * now our boot and TLB miss code hard wires it + * now our boot and TLB miss code hard wires it. Ideally + * we should find out a suitable page size and patch the + * TLB miss code (either that or use the PACA to store + * the value we want) */ mmu_linear_psize = MMU_PAGE_1G; + /* XXX This should be decided at runtime based on supported + * page sizes in the TLB, but for now let's assume 16M is + * always there and a good fit (which it probably is) + */ + mmu_vmemmap_psize = MMU_PAGE_16M; /* Check if HW tablewalk is present, and if yes, enable it by: * From 2d27cfd3286966c04d4192a9db5a6c7ea60eebf1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 23 Jul 2009 23:15:59 +0000 Subject: [PATCH 0149/1662] powerpc: Remaining 64-bit Book3E support This contains all the bits that didn't fit in previous patches :-) This includes the actual exception handlers assembly, the changes to the kernel entry, other misc bits and wiring it all up in Kconfig. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hw_irq.h | 5 + arch/powerpc/include/asm/smp.h | 1 + arch/powerpc/kernel/Makefile | 10 +- arch/powerpc/kernel/cputable.c | 27 +- arch/powerpc/kernel/entry_64.S | 60 +- arch/powerpc/kernel/exceptions-64e.S | 784 +++++++++++++++++++++++++ arch/powerpc/kernel/head_64.S | 68 ++- arch/powerpc/kernel/setup_64.c | 19 + arch/powerpc/mm/Makefile | 1 + arch/powerpc/platforms/Kconfig.cputype | 38 +- arch/powerpc/xmon/xmon.c | 2 +- 12 files changed, 993 insertions(+), 24 deletions(-) create mode 100644 arch/powerpc/kernel/exceptions-64e.S diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 52349ef1b3a7..4c0747e8ed74 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -472,7 +472,7 @@ config PPC_16K_PAGES bool "16k page size" if 44x config PPC_64K_PAGES - bool "64k page size" if 44x || PPC_STD_MMU_64 + bool "64k page size" if 44x || PPC_STD_MMU_64 || PPC_BOOK3E_64 select PPC_HAS_HASH_64K if PPC_STD_MMU_64 config PPC_256K_PAGES diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 8b505eaaa38a..e73d554538dd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -49,8 +49,13 @@ extern void iseries_handle_interrupts(void); #define raw_irqs_disabled() (local_get_flags() == 0) #define raw_irqs_disabled_flags(flags) ((flags) == 0) +#ifdef CONFIG_PPC_BOOK3E +#define __hard_irq_enable() __asm__ __volatile__("wrteei 1": : :"memory"); +#define __hard_irq_disable() __asm__ __volatile__("wrteei 0": : :"memory"); +#else #define __hard_irq_enable() __mtmsrd(mfmsr() | MSR_EE, 1) #define __hard_irq_disable() __mtmsrd(mfmsr() & ~MSR_EE, 1) +#endif #define hard_irq_disable() \ do { \ diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index e782f43ee669..c0d3b8af9319 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -153,6 +153,7 @@ extern void arch_send_call_function_ipi(cpumask_t mask); * 64-bit but defining them all here doesn't harm */ extern void generic_secondary_smp_init(void); +extern void generic_secondary_thread_init(void); extern unsigned long __secondary_hold_spinloop; extern unsigned long __secondary_hold_acknowledge; extern char __secondary_hold; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b73396b93905..035946f9d5fb 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -33,10 +33,10 @@ obj-y := cputable.o ptrace.o syscalls.o \ obj-y += vdso32/ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ - paca.o cpu_setup_ppc970.o \ - cpu_setup_pa6t.o \ - firmware.o nvram_64.o + paca.o nvram_64.o firmware.o +obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj64-$(CONFIG_RELOCATABLE) += reloc_64.o +obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o obj-$(CONFIG_PPC64) += vdso64/ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o @@ -63,8 +63,8 @@ obj-$(CONFIG_MODULES) += module.o module_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_44x) += cpu_setup_44x.o obj-$(CONFIG_FSL_BOOKE) += cpu_setup_fsl_booke.o dbell.o -extra-$(CONFIG_PPC_STD_MMU) := head_32.o -extra-$(CONFIG_PPC64) := head_64.o +extra-y := head_$(CONFIG_WORD_SIZE).o +extra-$(CONFIG_PPC_BOOK3E_32) := head_new_booke.o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 4a24a2fc4574..f34ea37079b5 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -93,7 +93,7 @@ extern void __restore_cpu_power7(void); PPC_FEATURE_BOOKE) static struct cpu_spec __initdata cpu_specs[] = { -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 { /* Power3 */ .pvr_mask = 0xffff0000, .pvr_value = 0x00400000, @@ -508,7 +508,30 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_generic, .platform = "power4", } -#endif /* CONFIG_PPC64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_PPC_BOOK3E_64 + { /* This is a default entry to get going, to be replaced by + * a real one at some stage + */ +#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \ + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \ + CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "Book3E", + .cpu_features = CPU_FTRS_BASE_BOOK3E, + .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX | + MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 0, + .machine_check = machine_check_generic, + .platform = "power6", + }, +#endif + #ifdef CONFIG_PPC32 #if CLASSIC_PPC { /* 601 */ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 1cb0f3d1714b..66bcda34a6bb 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -120,9 +120,15 @@ BEGIN_FW_FTR_SECTION 2: END_FW_FTR_SECTION_IFSET(FW_FEATURE_ISERIES) #endif /* CONFIG_PPC_ISERIES */ + + /* Hard enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else mfmsr r11 ori r11,r11,MSR_EE mtmsrd r11,1 +#endif /* CONFIG_PPC_BOOK3E */ #ifdef SHOW_SYSCALLS bl .do_show_syscall @@ -168,15 +174,25 @@ syscall_exit: #endif clrrdi r12,r1,THREAD_SHIFT - /* disable interrupts so current_thread_info()->flags can't change, - and so that we don't get interrupted after loading SRR0/1. */ ld r8,_MSR(r1) +#ifdef CONFIG_PPC_BOOK3S + /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI beq- unrecov_restore +#endif + + /* Disable interrupts so current_thread_info()->flags can't change, + * and so that we don't get interrupted after loading SRR0/1. + */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else mfmsr r10 rldicl r10,r10,48,1 rotldi r10,r10,16 mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ + ld r9,TI_FLAGS(r12) li r11,-_LAST_ERRNO andi. r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) @@ -194,9 +210,13 @@ syscall_error_cont: * userspace and we take an exception after restoring r13, * we end up corrupting the userspace r13 value. */ +#ifdef CONFIG_PPC_BOOK3S + /* No MSR:RI on BookE */ li r12,MSR_RI andc r11,r10,r12 mtmsrd r11,1 /* clear MSR.RI */ +#endif /* CONFIG_PPC_BOOK3S */ + beq- 1f ACCOUNT_CPU_USER_EXIT(r11, r12) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ @@ -206,7 +226,7 @@ syscall_error_cont: mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - rfid + RFI b . /* prevent speculative execution */ syscall_error: @@ -276,9 +296,13 @@ syscall_exit_work: beq .ret_from_except_lite /* Re-enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else mfmsr r10 ori r10,r10,MSR_EE mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ bl .save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD @@ -380,7 +404,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) and. r0,r0,r22 beq+ 1f andc r22,r22,r0 - mtmsrd r22 + MTMSRD(r22) isync 1: std r20,_NIP(r1) mfcr r23 @@ -399,6 +423,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ +#ifdef CONFIG_PPC_BOOK3S BEGIN_FTR_SECTION BEGIN_FTR_SECTION_NESTED(95) clrrdi r6,r8,28 /* get its ESID */ @@ -445,8 +470,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_1T_SEGMENT) slbie r6 /* Workaround POWER5 < DD2.1 issue */ slbmte r7,r0 isync - 2: +#endif /* !CONFIG_PPC_BOOK3S */ + clrrdi r7,r8,THREAD_SHIFT /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE because we don't need to leave the 288-byte ABI gap at the @@ -490,10 +516,14 @@ _GLOBAL(ret_from_except_lite) * can't change between when we test it and when we return * from the interrupt. */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 0 +#else mfmsr r10 /* Get current interrupt state */ rldicl r9,r10,48,1 /* clear MSR_EE */ rotldi r9,r9,16 mtmsrd r9,1 /* Update machine state */ +#endif /* CONFIG_PPC_BOOK3E */ #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ @@ -540,6 +570,9 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ stb r4,PACAHARDIRQEN(r13) +#ifdef CONFIG_PPC_BOOK3E + b .exception_return_book3e +#else ld r4,_CTR(r1) ld r0,_LINK(r1) mtctr r4 @@ -588,6 +621,8 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) rfid b . /* prevent speculative execution */ +#endif /* CONFIG_PPC_BOOK3E */ + iseries_check_pending_irqs: #ifdef CONFIG_PPC_ISERIES ld r5,SOFTE(r1) @@ -638,6 +673,11 @@ do_work: li r0,1 stb r0,PACASOFTIRQEN(r13) stb r0,PACAHARDIRQEN(r13) +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 + bl .preempt_schedule + wrteei 0 +#else ori r10,r10,MSR_EE mtmsrd r10,1 /* reenable interrupts */ bl .preempt_schedule @@ -646,6 +686,7 @@ do_work: rldicl r10,r10,48,1 /* disable interrupts again */ rotldi r10,r10,16 mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ ld r4,TI_FLAGS(r9) andi. r0,r4,_TIF_NEED_RESCHED bne 1b @@ -654,8 +695,12 @@ do_work: user_work: #endif /* Enable interrupts */ +#ifdef CONFIG_PPC_BOOK3E + wrteei 1 +#else ori r10,r10,MSR_EE mtmsrd r10,1 +#endif /* CONFIG_PPC_BOOK3E */ andi. r0,r4,_TIF_NEED_RESCHED beq 1f @@ -837,6 +882,10 @@ _GLOBAL(enter_prom) /* Switch MSR to 32 bits mode */ +#ifdef CONFIG_PPC_BOOK3E + rlwinm r11,r11,0,1,31 + mtmsr r11 +#else /* CONFIG_PPC_BOOK3E */ mfmsr r11 li r12,1 rldicr r12,r12,MSR_SF_LG,(63-MSR_SF_LG) @@ -845,6 +894,7 @@ _GLOBAL(enter_prom) rldicr r12,r12,MSR_ISF_LG,(63-MSR_ISF_LG) andc r11,r11,r12 mtmsrd r11 +#endif /* CONFIG_PPC_BOOK3E */ isync /* Enter PROM here... */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S new file mode 100644 index 000000000000..695d4847d228 --- /dev/null +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -0,0 +1,784 @@ +/* + * Boot code and exception vectors for Book3E processors + * + * Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* XXX This will ultimately add space for a special exception save + * structure used to save things like SRR0/SRR1, SPRGs, MAS, etc... + * when taking special interrupts. For now we don't support that, + * special interrupts from within a non-standard level will probably + * blow you up + */ +#define SPECIAL_EXC_FRAME_SIZE INT_FRAME_SIZE + +/* Exception prolog code for all exceptions */ +#define EXCEPTION_PROLOG(n, type, addition) \ + mtspr SPRN_SPRG_##type##_SCRATCH,r13; /* get spare registers */ \ + mfspr r13,SPRN_SPRG_PACA; /* get PACA */ \ + std r10,PACA_EX##type+EX_R10(r13); \ + std r11,PACA_EX##type+EX_R11(r13); \ + mfcr r10; /* save CR */ \ + addition; /* additional code for that exc. */ \ + std r1,PACA_EX##type+EX_R1(r13); /* save old r1 in the PACA */ \ + stw r10,PACA_EX##type+EX_CR(r13); /* save old CR in the PACA */ \ + mfspr r11,SPRN_##type##_SRR1;/* what are we coming from */ \ + type##_SET_KSTACK; /* get special stack if necessary */\ + andi. r10,r11,MSR_PR; /* save stack pointer */ \ + beq 1f; /* branch around if supervisor */ \ + ld r1,PACAKSAVE(r13); /* get kernel stack coming from usr */\ +1: cmpdi cr1,r1,0; /* check if SP makes sense */ \ + bge- cr1,exc_##n##_bad_stack;/* bad stack (TODO: out of line) */ \ + mfspr r10,SPRN_##type##_SRR0; /* read SRR0 before touching stack */ + +/* Exception type-specific macros */ +#define GEN_SET_KSTACK \ + subi r1,r1,INT_FRAME_SIZE; /* alloc frame on kernel stack */ +#define SPRN_GEN_SRR0 SPRN_SRR0 +#define SPRN_GEN_SRR1 SPRN_SRR1 + +#define CRIT_SET_KSTACK \ + ld r1,PACA_CRIT_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_CRIT_SRR0 SPRN_CSRR0 +#define SPRN_CRIT_SRR1 SPRN_CSRR1 + +#define DBG_SET_KSTACK \ + ld r1,PACA_DBG_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_DBG_SRR0 SPRN_DSRR0 +#define SPRN_DBG_SRR1 SPRN_DSRR1 + +#define MC_SET_KSTACK \ + ld r1,PACA_MC_STACK(r13); \ + subi r1,r1,SPECIAL_EXC_FRAME_SIZE; +#define SPRN_MC_SRR0 SPRN_MCSRR0 +#define SPRN_MC_SRR1 SPRN_MCSRR1 + +#define NORMAL_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, GEN, addition##_GEN) + +#define CRIT_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, CRIT, addition##_CRIT) + +#define DBG_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, DBG, addition##_DBG) + +#define MC_EXCEPTION_PROLOG(n, addition) \ + EXCEPTION_PROLOG(n, MC, addition##_MC) + + +/* Variants of the "addition" argument for the prolog + */ +#define PROLOG_ADDITION_NONE_GEN +#define PROLOG_ADDITION_NONE_CRIT +#define PROLOG_ADDITION_NONE_DBG +#define PROLOG_ADDITION_NONE_MC + +#define PROLOG_ADDITION_MASKABLE_GEN \ + lbz r11,PACASOFTIRQEN(r13); /* are irqs soft-disabled ? */ \ + cmpwi cr0,r11,0; /* yes -> go out of line */ \ + beq masked_interrupt_book3e; + +#define PROLOG_ADDITION_2REGS_GEN \ + std r14,PACA_EXGEN+EX_R14(r13); \ + std r15,PACA_EXGEN+EX_R15(r13) + +#define PROLOG_ADDITION_1REG_GEN \ + std r14,PACA_EXGEN+EX_R14(r13); + +#define PROLOG_ADDITION_2REGS_CRIT \ + std r14,PACA_EXCRIT+EX_R14(r13); \ + std r15,PACA_EXCRIT+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_DBG \ + std r14,PACA_EXDBG+EX_R14(r13); \ + std r15,PACA_EXDBG+EX_R15(r13) + +#define PROLOG_ADDITION_2REGS_MC \ + std r14,PACA_EXMC+EX_R14(r13); \ + std r15,PACA_EXMC+EX_R15(r13) + +/* Core exception code for all exceptions except TLB misses. + * XXX: Needs to make SPRN_SPRG_GEN depend on exception type + */ +#define EXCEPTION_COMMON(n, excf, ints) \ + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + std r10,_NIP(r1); /* save SRR0 to stackframe */ \ + std r11,_MSR(r1); /* save SRR1 to stackframe */ \ + ACCOUNT_CPU_USER_ENTRY(r10,r11);/* accounting (uses cr0+eq) */ \ + ld r3,excf+EX_R10(r13); /* get back r10 */ \ + ld r4,excf+EX_R11(r13); /* get back r11 */ \ + mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + ld r2,PACATOC(r13); /* get kernel TOC into r2 */ \ + mflr r6; /* save LR in stackframe */ \ + mfctr r7; /* save CTR in stackframe */ \ + mfspr r8,SPRN_XER; /* save XER in stackframe */ \ + ld r9,excf+EX_R1(r13); /* load orig r1 back from PACA */ \ + lwz r10,excf+EX_CR(r13); /* load orig CR back from PACA */ \ + lbz r11,PACASOFTIRQEN(r13); /* get current IRQ softe */ \ + ld r12,exception_marker@toc(r2); \ + li r0,0; \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + std r6,_LINK(r1); \ + std r7,_CTR(r1); \ + std r8,_XER(r1); \ + li r3,(n)+1; /* indicate partial regs in trap */ \ + std r9,0(r1); /* store stack frame back link */ \ + std r10,_CCR(r1); /* store orig CR in stackframe */ \ + std r9,GPR1(r1); /* store stack frame back link */ \ + std r11,SOFTE(r1); /* and save it to stackframe */ \ + std r12,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + std r3,_TRAP(r1); /* set trap number */ \ + std r0,RESULT(r1); /* clear regs->result */ \ + ints; + +/* Variants for the "ints" argument */ +#define INTS_KEEP +#define INTS_DISABLE_SOFT \ + stb r0,PACASOFTIRQEN(r13); /* mark interrupts soft-disabled */ \ + TRACE_DISABLE_INTS; +#define INTS_DISABLE_HARD \ + stb r0,PACAHARDIRQEN(r13); /* and hard disabled */ +#define INTS_DISABLE_ALL \ + INTS_DISABLE_SOFT \ + INTS_DISABLE_HARD + +/* This is called by exceptions that used INTS_KEEP (that is did not clear + * neither soft nor hard IRQ indicators in the PACA. This will restore MSR:EE + * to it's previous value + * + * XXX In the long run, we may want to open-code it in order to separate the + * load from the wrtee, thus limiting the latency caused by the dependency + * but at this point, I'll favor code clarity until we have a near to final + * implementation + */ +#define INTS_RESTORE_HARD \ + ld r11,_MSR(r1); \ + wrtee r11; + +/* XXX FIXME: Restore r14/r15 when necessary */ +#define BAD_STACK_TRAMPOLINE(n) \ +exc_##n##_bad_stack: \ + li r1,(n); /* get exception number */ \ + sth r1,PACA_TRAP_SAVE(r13); /* store trap */ \ + b bad_stack_book3e; /* bad stack error */ + +#define EXCEPTION_STUB(loc, label) \ + . = interrupt_base_book3e + loc; \ + nop; /* To make debug interrupts happy */ \ + b exc_##label##_book3e; + +#define ACK_NONE(r) +#define ACK_DEC(r) \ + lis r,TSR_DIS@h; \ + mtspr SPRN_TSR,r +#define ACK_FIT(r) \ + lis r,TSR_FIS@h; \ + mtspr SPRN_TSR,r + +#define MASKABLE_EXCEPTION(trapnum, label, hdlr, ack) \ + START_EXCEPTION(label); \ + NORMAL_EXCEPTION_PROLOG(trapnum, PROLOG_ADDITION_MASKABLE) \ + EXCEPTION_COMMON(trapnum, PACA_EXGEN, INTS_DISABLE_ALL) \ + ack(r8); \ + addi r3,r1,STACK_FRAME_OVERHEAD; \ + bl hdlr; \ + b .ret_from_except_lite; + +/* This value is used to mark exception frames on the stack. */ + .section ".toc","aw" +exception_marker: + .tc ID_EXC_MARKER[TC],STACK_FRAME_REGS_MARKER + + +/* + * And here we have the exception vectors ! + */ + + .text + .balign 0x1000 + .globl interrupt_base_book3e +interrupt_base_book3e: /* fake trap */ + /* Note: If real debug exceptions are supported by the HW, the vector + * below will have to be patched up to point to an appropriate handler + */ + EXCEPTION_STUB(0x000, machine_check) /* 0x0200 */ + EXCEPTION_STUB(0x020, critical_input) /* 0x0580 */ + EXCEPTION_STUB(0x040, debug_crit) /* 0x0d00 */ + EXCEPTION_STUB(0x060, data_storage) /* 0x0300 */ + EXCEPTION_STUB(0x080, instruction_storage) /* 0x0400 */ + EXCEPTION_STUB(0x0a0, external_input) /* 0x0500 */ + EXCEPTION_STUB(0x0c0, alignment) /* 0x0600 */ + EXCEPTION_STUB(0x0e0, program) /* 0x0700 */ + EXCEPTION_STUB(0x100, fp_unavailable) /* 0x0800 */ + EXCEPTION_STUB(0x120, system_call) /* 0x0c00 */ + EXCEPTION_STUB(0x140, ap_unavailable) /* 0x0f20 */ + EXCEPTION_STUB(0x160, decrementer) /* 0x0900 */ + EXCEPTION_STUB(0x180, fixed_interval) /* 0x0980 */ + EXCEPTION_STUB(0x1a0, watchdog) /* 0x09f0 */ + EXCEPTION_STUB(0x1c0, data_tlb_miss) + EXCEPTION_STUB(0x1e0, instruction_tlb_miss) + +#if 0 + EXCEPTION_STUB(0x280, processor_doorbell) + EXCEPTION_STUB(0x220, processor_doorbell_crit) +#endif + .globl interrupt_end_book3e +interrupt_end_book3e: + +/* Critical Input Interrupt */ + START_EXCEPTION(critical_input); + CRIT_EXCEPTION_PROLOG(0x100, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x100, PACA_EXCRIT, INTS_DISABLE_ALL) +// bl special_reg_save_crit +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .critical_exception +// b ret_from_crit_except + b . + +/* Machine Check Interrupt */ + START_EXCEPTION(machine_check); + CRIT_EXCEPTION_PROLOG(0x200, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x200, PACA_EXMC, INTS_DISABLE_ALL) +// bl special_reg_save_mc +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .machine_check_exception +// b ret_from_mc_except + b . + +/* Data Storage Interrupt */ + START_EXCEPTION(data_storage) + NORMAL_EXCEPTION_PROLOG(0x300, PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x300, PACA_EXGEN, INTS_KEEP) + b storage_fault_common + +/* Instruction Storage Interrupt */ + START_EXCEPTION(instruction_storage); + NORMAL_EXCEPTION_PROLOG(0x400, PROLOG_ADDITION_2REGS) + li r15,0 + mr r14,r10 + EXCEPTION_COMMON(0x400, PACA_EXGEN, INTS_KEEP) + b storage_fault_common + +/* External Input Interrupt */ + MASKABLE_EXCEPTION(0x500, external_input, .do_IRQ, ACK_NONE) + +/* Alignment */ + START_EXCEPTION(alignment); + NORMAL_EXCEPTION_PROLOG(0x600, PROLOG_ADDITION_2REGS) + mfspr r14,SPRN_DEAR + mfspr r15,SPRN_ESR + EXCEPTION_COMMON(0x600, PACA_EXGEN, INTS_KEEP) + b alignment_more /* no room, go out of line */ + +/* Program Interrupt */ + START_EXCEPTION(program); + NORMAL_EXCEPTION_PROLOG(0x700, PROLOG_ADDITION_1REG) + mfspr r14,SPRN_ESR + EXCEPTION_COMMON(0x700, PACA_EXGEN, INTS_DISABLE_SOFT) + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + bl .save_nvgprs + INTS_RESTORE_HARD + bl .program_check_exception + b .ret_from_except + +/* Floating Point Unavailable Interrupt */ + START_EXCEPTION(fp_unavailable); + NORMAL_EXCEPTION_PROLOG(0x800, PROLOG_ADDITION_NONE) + /* we can probably do a shorter exception entry for that one... */ + EXCEPTION_COMMON(0x800, PACA_EXGEN, INTS_KEEP) + bne 1f /* if from user, just load it up */ + bl .save_nvgprs + addi r3,r1,STACK_FRAME_OVERHEAD + INTS_RESTORE_HARD + bl .kernel_fp_unavailable_exception + BUG_OPCODE +1: ld r12,_MSR(r1) + bl .load_up_fpu + b fast_exception_return + +/* Decrementer Interrupt */ + MASKABLE_EXCEPTION(0x900, decrementer, .timer_interrupt, ACK_DEC) + +/* Fixed Interval Timer Interrupt */ + MASKABLE_EXCEPTION(0x980, fixed_interval, .unknown_exception, ACK_FIT) + +/* Watchdog Timer Interrupt */ + START_EXCEPTION(watchdog); + CRIT_EXCEPTION_PROLOG(0x9f0, PROLOG_ADDITION_NONE) +// EXCEPTION_COMMON(0x9f0, PACA_EXCRIT, INTS_DISABLE_ALL) +// bl special_reg_save_crit +// addi r3,r1,STACK_FRAME_OVERHEAD +// bl .unknown_exception +// b ret_from_crit_except + b . + +/* System Call Interrupt */ + START_EXCEPTION(system_call) + mr r9,r13 /* keep a copy of userland r13 */ + mfspr r11,SPRN_SRR0 /* get return address */ + mfspr r12,SPRN_SRR1 /* get previous MSR */ + mfspr r13,SPRN_SPRG_PACA /* get our PACA */ + b system_call_common + +/* Auxillary Processor Unavailable Interrupt */ + START_EXCEPTION(ap_unavailable); + NORMAL_EXCEPTION_PROLOG(0xf20, PROLOG_ADDITION_NONE) + EXCEPTION_COMMON(0xf20, PACA_EXGEN, INTS_KEEP) + addi r3,r1,STACK_FRAME_OVERHEAD + bl .save_nvgprs + INTS_RESTORE_HARD + bl .unknown_exception + b .ret_from_except + +/* Debug exception as a critical interrupt*/ + START_EXCEPTION(debug_crit); + CRIT_EXCEPTION_PROLOG(0xd00, PROLOG_ADDITION_2REGS) + + /* + * If there is a single step or branch-taken exception in an + * exception entry sequence, it was probably meant to apply to + * the code where the exception occurred (since exception entry + * doesn't turn off DE automatically). We simulate the effect + * of turning off DE on entry to an exception handler by turning + * off DE in the CSRR1 value and clearing the debug status. + */ + + mfspr r14,SPRN_DBSR /* check single-step/branch taken */ + andis. r15,r14,DBSR_IC@h + beq+ 1f + + LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) + LOAD_REG_IMMEDIATE(r15,interrupt_end_book3e) + cmpld cr0,r10,r14 + cmpld cr1,r10,r15 + blt+ cr0,1f + bge+ cr1,1f + + /* here it looks like we got an inappropriate debug exception. */ + lis r14,DBSR_IC@h /* clear the IC event */ + rlwinm r11,r11,0,~MSR_DE /* clear DE in the CSRR1 value */ + mtspr SPRN_DBSR,r14 + mtspr SPRN_CSRR1,r11 + lwz r10,PACA_EXCRIT+EX_CR(r13) /* restore registers */ + ld r1,PACA_EXCRIT+EX_R1(r13) + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + mtcr r10 + ld r10,PACA_EXCRIT+EX_R10(r13) /* restore registers */ + ld r11,PACA_EXCRIT+EX_R11(r13) + mfspr r13,SPRN_SPRG_CRIT_SCRATCH + rfci + + /* Normal debug exception */ + /* XXX We only handle coming from userspace for now since we can't + * quite save properly an interrupted kernel state yet + */ +1: andi. r14,r11,MSR_PR; /* check for userspace again */ + beq kernel_dbg_exc; /* if from kernel mode */ + + /* Now we mash up things to make it look like we are coming on a + * normal exception + */ + mfspr r15,SPRN_SPRG_CRIT_SCRATCH + mtspr SPRN_SPRG_GEN_SCRATCH,r15 + mfspr r14,SPRN_DBSR + EXCEPTION_COMMON(0xd00, PACA_EXCRIT, INTS_DISABLE_ALL) + std r14,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + ld r14,PACA_EXCRIT+EX_R14(r13) + ld r15,PACA_EXCRIT+EX_R15(r13) + bl .save_nvgprs + bl .DebugException + b .ret_from_except + +kernel_dbg_exc: + b . /* NYI */ + + +/* + * An interrupt came in while soft-disabled; clear EE in SRR1, + * clear paca->hard_enabled and return. + */ +masked_interrupt_book3e: + mtcr r10 + stb r11,PACAHARDIRQEN(r13) + mfspr r10,SPRN_SRR1 + rldicl r11,r10,48,1 /* clear MSR_EE */ + rotldi r10,r11,16 + mtspr SPRN_SRR1,r10 + ld r10,PACA_EXGEN+EX_R10(r13); /* restore registers */ + ld r11,PACA_EXGEN+EX_R11(r13); + mfspr r13,SPRN_SPRG_GEN_SCRATCH; + rfi + b . + +/* + * This is called from 0x300 and 0x400 handlers after the prologs with + * r14 and r15 containing the fault address and error code, with the + * original values stashed away in the PACA + */ +storage_fault_common: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + mr r4,r14 + mr r5,r15 + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + INTS_RESTORE_HARD + bl .do_page_fault + cmpdi r3,0 + bne- 1f + b .ret_from_except_lite +1: bl .save_nvgprs + mr r5,r3 + addi r3,r1,STACK_FRAME_OVERHEAD + ld r4,_DAR(r1) + bl .bad_page_fault + b .ret_from_except + +/* + * Alignment exception doesn't fit entirely in the 0x100 bytes so it + * continues here. + */ +alignment_more: + std r14,_DAR(r1) + std r15,_DSISR(r1) + addi r3,r1,STACK_FRAME_OVERHEAD + ld r14,PACA_EXGEN+EX_R14(r13) + ld r15,PACA_EXGEN+EX_R15(r13) + bl .save_nvgprs + INTS_RESTORE_HARD + bl .alignment_exception + b .ret_from_except + +/* + * We branch here from entry_64.S for the last stage of the exception + * return code path. MSR:EE is expected to be off at that point + */ +_GLOBAL(exception_return_book3e) + b 1f + +/* This is the return from load_up_fpu fast path which could do with + * less GPR restores in fact, but for now we have a single return path + */ + .globl fast_exception_return +fast_exception_return: + wrteei 0 +1: mr r0,r13 + ld r10,_MSR(r1) + REST_4GPRS(2, r1) + andi. r6,r10,MSR_PR + REST_2GPRS(6, r1) + beq 1f + ACCOUNT_CPU_USER_EXIT(r10, r11) + ld r0,GPR13(r1) + +1: stdcx. r0,0,r1 /* to clear the reservation */ + + ld r8,_CCR(r1) + ld r9,_LINK(r1) + ld r10,_CTR(r1) + ld r11,_XER(r1) + mtcr r8 + mtlr r9 + mtctr r10 + mtxer r11 + REST_2GPRS(8, r1) + ld r10,GPR10(r1) + ld r11,GPR11(r1) + ld r12,GPR12(r1) + mtspr SPRN_SPRG_GEN_SCRATCH,r0 + + std r10,PACA_EXGEN+EX_R10(r13); + std r11,PACA_EXGEN+EX_R11(r13); + ld r10,_NIP(r1) + ld r11,_MSR(r1) + ld r0,GPR0(r1) + ld r1,GPR1(r1) + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + ld r10,PACA_EXGEN+EX_R10(r13) + ld r11,PACA_EXGEN+EX_R11(r13) + mfspr r13,SPRN_SPRG_GEN_SCRATCH + rfi + +/* + * Trampolines used when spotting a bad kernel stack pointer in + * the exception entry code. + * + * TODO: move some bits like SRR0 read to trampoline, pass PACA + * index around, etc... to handle crit & mcheck + */ +BAD_STACK_TRAMPOLINE(0x000) +BAD_STACK_TRAMPOLINE(0x100) +BAD_STACK_TRAMPOLINE(0x200) +BAD_STACK_TRAMPOLINE(0x300) +BAD_STACK_TRAMPOLINE(0x400) +BAD_STACK_TRAMPOLINE(0x500) +BAD_STACK_TRAMPOLINE(0x600) +BAD_STACK_TRAMPOLINE(0x700) +BAD_STACK_TRAMPOLINE(0x800) +BAD_STACK_TRAMPOLINE(0x900) +BAD_STACK_TRAMPOLINE(0x980) +BAD_STACK_TRAMPOLINE(0x9f0) +BAD_STACK_TRAMPOLINE(0xa00) +BAD_STACK_TRAMPOLINE(0xb00) +BAD_STACK_TRAMPOLINE(0xc00) +BAD_STACK_TRAMPOLINE(0xd00) +BAD_STACK_TRAMPOLINE(0xe00) +BAD_STACK_TRAMPOLINE(0xf00) +BAD_STACK_TRAMPOLINE(0xf20) + + .globl bad_stack_book3e +bad_stack_book3e: + /* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ + mfspr r10,SPRN_SRR0; /* read SRR0 before touching stack */ + ld r1,PACAEMERGSP(r13) + subi r1,r1,64+INT_FRAME_SIZE + std r10,_NIP(r1) + std r11,_MSR(r1) + ld r10,PACA_EXGEN+EX_R1(r13) /* FIXME for crit & mcheck */ + lwz r11,PACA_EXGEN+EX_CR(r13) /* FIXME for crit & mcheck */ + std r10,GPR1(r1) + std r11,_CCR(r1) + mfspr r10,SPRN_DEAR + mfspr r11,SPRN_ESR + std r10,_DAR(r1) + std r11,_DSISR(r1) + std r0,GPR0(r1); /* save r0 in stackframe */ \ + std r2,GPR2(r1); /* save r2 in stackframe */ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe */ \ + std r9,GPR9(r1); /* save r9 in stackframe */ \ + ld r3,PACA_EXGEN+EX_R10(r13);/* get back r10 */ \ + ld r4,PACA_EXGEN+EX_R11(r13);/* get back r11 */ \ + mfspr r5,SPRN_SPRG_GEN_SCRATCH;/* get back r13 XXX can be wrong */ \ + std r3,GPR10(r1); /* save r10 to stackframe */ \ + std r4,GPR11(r1); /* save r11 to stackframe */ \ + std r12,GPR12(r1); /* save r12 in stackframe */ \ + std r5,GPR13(r1); /* save it to stackframe */ \ + mflr r10 + mfctr r11 + mfxer r12 + std r10,_LINK(r1) + std r11,_CTR(r1) + std r12,_XER(r1) + SAVE_10GPRS(14,r1) + SAVE_8GPRS(24,r1) + lhz r12,PACA_TRAP_SAVE(r13) + std r12,_TRAP(r1) + addi r11,r1,INT_FRAME_SIZE + std r11,0(r1) + li r12,0 + std r12,0(r11) + ld r2,PACATOC(r13) +1: addi r3,r1,STACK_FRAME_OVERHEAD + bl .kernel_bad_stack + b 1b + +/* + * Setup the initial TLB for a core. This current implementation + * assume that whatever we are running off will not conflict with + * the new mapping at PAGE_OFFSET. + * We also make various assumptions about the processor we run on, + * this might have to be made more flexible based on the content + * of MMUCFG and friends. + */ +_GLOBAL(initial_tlb_book3e) + + /* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the + * kernel linear mapping. We also set MAS8 once for all here though + * that will have to be made dependent on whether we are running under + * a hypervisor I suppose. + */ + li r3,MAS0_HES | MAS0_WQ_ALLWAYS + mtspr SPRN_MAS0,r3 + lis r3,(MAS1_VALID | MAS1_IPROT)@h + ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT + mtspr SPRN_MAS1,r3 + LOAD_REG_IMMEDIATE(r3, PAGE_OFFSET | MAS2_M) + mtspr SPRN_MAS2,r3 + li r3,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS7_MAS3,r3 + li r3,0 + mtspr SPRN_MAS8,r3 + + /* Write the TLB entry */ + tlbwe + + /* Now we branch the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r3,1f) + mtctr r3 + bctr + +1: /* We are now running at PAGE_OFFSET, clean the TLB of everything + * else (XXX we should scan for bolted crap from the firmware too) + */ + PPC_TLBILX(0,0,0) + sync + isync + + /* We translate LR and return */ + mflr r3 + tovirt(r3,r3) + mtlr r3 + blr + +/* + * Main entry (boot CPU, thread 0) + * + * We enter here from head_64.S, possibly after the prom_init trampoline + * with r3 and r4 already saved to r31 and 30 respectively and in 64 bits + * mode. Anything else is as it was left by the bootloader + * + * Initial requirements of this port: + * + * - Kernel loaded at 0 physical + * - A good lump of memory mapped 0:0 by UTLB entry 0 + * - MSR:IS & MSR:DS set to 0 + * + * Note that some of the above requirements will be relaxed in the future + * as the kernel becomes smarter at dealing with different initial conditions + * but for now you have to be careful + */ +_GLOBAL(start_initialization_book3e) + mflr r28 + + /* First, we need to setup some initial TLBs to map the kernel + * text, data and bss at PAGE_OFFSET. We don't have a real mode + * and always use AS 0, so we just set it up to match our link + * address and never use 0 based addresses. + */ + bl .initial_tlb_book3e + + /* Init global core bits */ + bl .init_core_book3e + + /* Init per-thread bits */ + bl .init_thread_book3e + + /* Return to common init code */ + tovirt(r28,r28) + mtlr r28 + blr + + +/* + * Secondary core/processor entry + * + * This is entered for thread 0 of a secondary core, all other threads + * are expected to be stopped. It's similar to start_initialization_book3e + * except that it's generally entered from the holding loop in head_64.S + * after CPUs have been gathered by Open Firmware. + * + * We assume we are in 32 bits mode running with whatever TLB entry was + * set for us by the firmware or POR engine. + */ +_GLOBAL(book3e_secondary_core_init_tlb_set) + li r4,1 + b .generic_secondary_smp_init + +_GLOBAL(book3e_secondary_core_init) + mflr r28 + + /* Do we need to setup initial TLB entry ? */ + cmplwi r4,0 + bne 2f + + /* Setup TLB for this core */ + bl .initial_tlb_book3e + + /* We can return from the above running at a different + * address, so recalculate r2 (TOC) + */ + bl .relative_toc + + /* Init global core bits */ +2: bl .init_core_book3e + + /* Init per-thread bits */ +3: bl .init_thread_book3e + + /* Return to common init code at proper virtual address. + * + * Due to various previous assumptions, we know we entered this + * function at either the final PAGE_OFFSET mapping or using a + * 1:1 mapping at 0, so we don't bother doing a complicated check + * here, we just ensure the return address has the right top bits. + * + * Note that if we ever want to be smarter about where we can be + * started from, we have to be careful that by the time we reach + * the code below we may already be running at a different location + * than the one we were called from since initial_tlb_book3e can + * have moved us already. + */ + cmpdi cr0,r28,0 + blt 1f + lis r3,PAGE_OFFSET@highest + sldi r3,r3,32 + or r28,r28,r3 +1: mtlr r28 + blr + +_GLOBAL(book3e_secondary_thread_init) + mflr r28 + b 3b + +_STATIC(init_core_book3e) + /* Establish the interrupt vector base */ + LOAD_REG_IMMEDIATE(r3, interrupt_base_book3e) + mtspr SPRN_IVPR,r3 + sync + blr + +_STATIC(init_thread_book3e) + lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h + mtspr SPRN_EPCR,r3 + + /* Make sure interrupts are off */ + wrteei 0 + + /* disable watchdog and FIT and enable DEC interrupts */ + lis r3,TCR_DIE@h + mtspr SPRN_TCR,r3 + + blr + + + diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0552f01041ab..c38afdb45d7b 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -121,10 +121,11 @@ __run_at_load: */ .globl __secondary_hold __secondary_hold: +#ifndef CONFIG_PPC_BOOK3E mfmsr r24 ori r24,r24,MSR_RI mtmsrd r24 /* RI on */ - +#endif /* Grab our physical cpu number */ mr r24,r3 @@ -143,6 +144,7 @@ __secondary_hold: ld r4,0(r4) /* deref function descriptor */ mtctr r4 mr r3,r24 + li r4,0 bctr #else BUG_OPCODE @@ -163,21 +165,49 @@ exception_marker: #include "exceptions-64s.S" #endif +_GLOBAL(generic_secondary_thread_init) + mr r24,r3 + + /* turn on 64-bit mode */ + bl .enable_64b_mode + + /* get a valid TOC pointer, wherever we're mapped at */ + bl .relative_toc + +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + bl .book3e_secondary_thread_init +#endif + b generic_secondary_common_init /* * On pSeries and most other platforms, secondary processors spin * in the following code. * At entry, r3 = this processor's number (physical cpu id) + * + * On Book3E, r4 = 1 to indicate that the initial TLB entry for + * this core already exists (setup via some other mechanism such + * as SCOM before entry). */ _GLOBAL(generic_secondary_smp_init) mr r24,r3 - + mr r25,r4 + /* turn on 64-bit mode */ bl .enable_64b_mode - /* get the TOC pointer (real address) */ + /* get a valid TOC pointer, wherever we're mapped at */ bl .relative_toc +#ifdef CONFIG_PPC_BOOK3E + /* Book3E initialization */ + mr r3,r24 + mr r4,r25 + bl .book3e_secondary_core_init +#endif + +generic_secondary_common_init: /* Set up a paca value for this processor. Since we have the * physical cpu id in r24, we need to search the pacas to find * which logical id maps to our physical one. @@ -196,6 +226,11 @@ _GLOBAL(generic_secondary_smp_init) b .kexec_wait /* next kernel might do better */ 2: mtspr SPRN_SPRG_PACA,r13 /* Save vaddr of paca in an SPRG */ +#ifdef CONFIG_PPC_BOOK3E + addi r12,r13,PACA_EXTLB /* and TLB exc frame in another */ + mtspr SPRN_SPRG_TLB_EXFRAME,r12 +#endif + /* From now on, r24 is expected to be logical cpuid */ mr r24,r5 3: HMT_LOW @@ -231,6 +266,7 @@ _GLOBAL(generic_secondary_smp_init) * Turn the MMU off. * Assumes we're mapped EA == RA if the MMU is on. */ +#ifdef CONFIG_PPC_BOOK3S _STATIC(__mmu_off) mfmsr r3 andi. r0,r3,MSR_IR|MSR_DR @@ -242,6 +278,7 @@ _STATIC(__mmu_off) sync rfid b . /* prevent speculative execution */ +#endif /* @@ -279,6 +316,10 @@ _GLOBAL(__start_initialization_multiplatform) mr r31,r3 mr r30,r4 +#ifdef CONFIG_PPC_BOOK3E + bl .start_initialization_book3e + b .__after_prom_start +#else /* Setup some critical 970 SPRs before switching MMU off */ mfspr r0,SPRN_PVR srwi r0,r0,16 @@ -296,6 +337,7 @@ _GLOBAL(__start_initialization_multiplatform) /* Switch off MMU if not already off */ bl .__mmu_off b .__after_prom_start +#endif /* CONFIG_PPC_BOOK3E */ _INIT_STATIC(__boot_from_prom) #ifdef CONFIG_PPC_OF_BOOT_TRAMPOLINE @@ -358,10 +400,16 @@ _STATIC(__after_prom_start) * Note: This process overwrites the OF exception vectors. */ li r3,0 /* target addr */ +#ifdef CONFIG_PPC_BOOK3E + tovirt(r3,r3) /* on booke, we already run at PAGE_OFFSET */ +#endif mr. r4,r26 /* In some cases the loader may */ beq 9f /* have already put us at zero */ li r6,0x100 /* Start offset, the first 0x100 */ /* bytes were copied earlier. */ +#ifdef CONFIG_PPC_BOOK3E + tovirt(r6,r6) /* on booke, we already run at PAGE_OFFSET */ +#endif #ifdef CONFIG_CRASH_DUMP /* @@ -507,6 +555,9 @@ _GLOBAL(pmac_secondary_start) * r13 = paca virtual address * SPRG_PACA = paca virtual address */ + .section ".text"; + .align 2 ; + .globl __secondary_start __secondary_start: /* Set thread priority to MEDIUM */ @@ -543,7 +594,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISERIES) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . /* prevent speculative execution */ /* @@ -564,11 +615,16 @@ _GLOBAL(start_secondary_prolog) */ _GLOBAL(enable_64b_mode) mfmsr r11 /* grab the current MSR */ +#ifdef CONFIG_PPC_BOOK3E + oris r11,r11,0x8000 /* CM bit set, we'll set ICM later */ + mtmsr r11 +#else /* CONFIG_PPC_BOOK3E */ li r12,(MSR_SF | MSR_ISF)@highest sldi r12,r12,48 or r11,r11,r12 mtmsrd r11 isync +#endif blr /* @@ -612,9 +668,11 @@ _INIT_STATIC(start_here_multiplatform) bdnz 3b 4: +#ifndef CONFIG_PPC_BOOK3E mfmsr r6 ori r6,r6,MSR_RI mtmsrd r6 /* RI on */ +#endif #ifdef CONFIG_RELOCATABLE /* Save the physical address we're running at in kernstart_addr */ @@ -647,7 +705,7 @@ _INIT_STATIC(start_here_multiplatform) ld r4,PACAKMSR(r13) mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI b . /* prevent speculative execution */ /* This is where all platforms converge execution */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 65aced7b833a..87df51720641 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -454,6 +454,24 @@ static void __init irqstack_early_init(void) #define irqstack_early_init() #endif +#ifdef CONFIG_PPC_BOOK3E +static void __init exc_lvl_early_init(void) +{ + unsigned int i; + + for_each_possible_cpu(i) { + critirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + dbgirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + mcheckirq_ctx[i] = (struct thread_info *) + __va(lmb_alloc(THREAD_SIZE, THREAD_SIZE)); + } +} +#else +#define exc_lvl_early_init() +#endif + /* * Stack space used when we detect a bad kernel stack pointer, and * early in SMP boots before relocation is enabled. @@ -513,6 +531,7 @@ void __init setup_arch(char **cmdline_p) init_mm.brk = klimit; irqstack_early_init(); + exc_lvl_early_init(); emergency_stack_init(); #ifdef CONFIG_PPC_STD_MMU_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 3e68363405b7..6fb8fc8d2fea 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -13,6 +13,7 @@ obj-y := fault.o mem.o pgtable.o gup.o \ pgtable_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ tlb_nohash_low.o +obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(CONFIG_WORD_SIZE)e.o obj-$(CONFIG_PPC64) += mmap_64.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_STD_MMU_64) += hash_utils_64.o \ diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 61187bec7506..9efc8bda01b4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -57,15 +57,35 @@ config E200 endchoice -config PPC_BOOK3S_64 - def_bool y +choice + prompt "Processor Type" depends on PPC64 + help + There are two families of 64 bit PowerPC chips supported. + The most common ones are the desktop and server CPUs + (POWER3, RS64, POWER4, POWER5, POWER5+, POWER6, ...) + + The other are the "embedded" processors compliant with the + "Book 3E" variant of the architecture + +config PPC_BOOK3S_64 + bool "Server processors" select PPC_FPU +config PPC_BOOK3E_64 + bool "Embedded processors" + select PPC_FPU # Make it a choice ? + +endchoice + config PPC_BOOK3S def_bool y depends on PPC_BOOK3S_32 || PPC_BOOK3S_64 +config PPC_BOOK3E + def_bool y + depends on PPC_BOOK3E_64 + config POWER4_ONLY bool "Optimize for POWER4" depends on PPC64 && PPC_BOOK3S @@ -125,7 +145,7 @@ config 4xx config BOOKE bool - depends on E200 || E500 || 44x + depends on E200 || E500 || 44x || PPC_BOOK3E default y config FSL_BOOKE @@ -223,9 +243,17 @@ config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU +config PPC_MMU_NOHASH_32 + def_bool y + depends on PPC_MMU_NOHASH && PPC32 + +config PPC_MMU_NOHASH_64 + def_bool y + depends on PPC_MMU_NOHASH && PPC64 + config PPC_BOOK3E_MMU def_bool y - depends on FSL_BOOKE + depends on FSL_BOOKE || PPC_BOOK3E config PPC_MM_SLICES bool @@ -257,7 +285,7 @@ config PPC_PERF_CTRS This enables the powerpc-specific perf_counter back-end. config SMP - depends on PPC_STD_MMU || FSL_BOOKE + depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE bool "Symmetric multi-processing support" ---help--- This enables support for systems with more than one CPU. If you have diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e1f33a81e5e1..0e09a45ac79a 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2570,7 +2570,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC64 +#ifdef CONFIG_PPC_BOOK3S_64 static void dump_slb(void) { int i; From af984b816530b4725b92e01ecfba7c5e3eab910d Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 6 Aug 2009 13:50:58 +1000 Subject: [PATCH 0150/1662] powerpc/mm: Fix encoding of page table cache numbers The mask used to encode the page table cache number in the batch when freeing page tables was too small for the new possible values of MMU page sizes. This increases it along with a comment explaining the constraints. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgalloc.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 34b080671f00..f2e812de7c3c 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -28,7 +28,12 @@ typedef struct pgtable_free { unsigned long val; } pgtable_free_t; -#define PGF_CACHENUM_MASK 0x7 +/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored + * and small enough to fit in the low bits of any naturally aligned page + * table cache entry. Arbitrarily set to 0x1f, that should give us some + * room to grow + */ +#define PGF_CACHENUM_MASK 0x1f static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, unsigned long mask) From 67050b5c3e9992d98554bd224d5a7898cc4881ff Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 4 Aug 2009 22:33:32 -0500 Subject: [PATCH 0151/1662] powerpc/mm: Fix switch_mmu_context to iterate of the proper list of cpus Introduced a temporary variable into our iterating over the list cpus that are threads on the same core. For some reason Ben forgot how for loops work. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/mmu_context_nohash.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 834436d6d6b8..c2f93dc470e6 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -190,7 +190,7 @@ static void context_check_map(void) { } void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - unsigned int id, cpu = smp_processor_id(); + unsigned int i, id, cpu = smp_processor_id(); unsigned long *map; /* No lockless fast path .. yet */ @@ -269,9 +269,10 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) local_flush_tlb_mm(next); /* XXX This clear should ultimately be part of local_flush_tlb_mm */ - for (cpu = cpu_first_thread_in_core(cpu); - cpu <= cpu_last_thread_in_core(cpu); cpu++) - __clear_bit(id, stale_map[cpu]); + for (i = cpu_first_thread_in_core(cpu); + i <= cpu_last_thread_in_core(cpu); i++) { + __clear_bit(id, stale_map[i]); + } } /* Flick the MMU and release lock */ From 20d70345f181be6bdd5b0a76a408d0693683bf3d Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 29 Jul 2009 07:04:46 +0000 Subject: [PATCH 0152/1662] powerpc: Add AMCC 460EX/460GT Rev. B support to cputable.c Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/cputable.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index f34ea37079b5..9f38ecb17859 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -1653,7 +1653,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "ppc440", }, { /* 460EX */ - .pvr_mask = 0xffff0002, + .pvr_mask = 0xffff0006, .pvr_value = 0x13020002, .cpu_name = "460EX", .cpu_features = CPU_FTRS_440x6, @@ -1665,8 +1665,21 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_440A, .platform = "ppc440", }, + { /* 460EX Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020004, + .cpu_name = "460EX Rev. B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460ex, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, { /* 460GT */ - .pvr_mask = 0xffff0002, + .pvr_mask = 0xffff0006, .pvr_value = 0x13020000, .cpu_name = "460GT", .cpu_features = CPU_FTRS_440x6, @@ -1678,6 +1691,19 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_440A, .platform = "ppc440", }, + { /* 460GT Rev B */ + .pvr_mask = 0xffff0007, + .pvr_value = 0x13020005, + .cpu_name = "460GT Rev. B", + .cpu_features = CPU_FTRS_440x6, + .cpu_user_features = COMMON_USER_BOOKE | PPC_FEATURE_HAS_FPU, + .mmu_features = MMU_FTR_TYPE_44x, + .icache_bsize = 32, + .dcache_bsize = 32, + .cpu_setup = __setup_cpu_460gt, + .machine_check = machine_check_440A, + .platform = "ppc440", + }, { /* 460SX */ .pvr_mask = 0xffffff00, .pvr_value = 0x13541800, From 88eeb72ec4c1ed7defff0f154c5b56798e038e2a Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 29 Jul 2009 07:05:01 +0000 Subject: [PATCH 0153/1662] powerpc/44x: Add NAND support to Canyonlands dts Also some whitespace cleanup in the USB device nodes. Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/canyonlands.dts | 49 ++++++++++++++++++++------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/boot/dts/canyonlands.dts b/arch/powerpc/boot/dts/canyonlands.dts index 5fd1ad09bdf2..c920170b7dfe 100644 --- a/arch/powerpc/boot/dts/canyonlands.dts +++ b/arch/powerpc/boot/dts/canyonlands.dts @@ -1,7 +1,7 @@ /* * Device Tree Source for AMCC Canyonlands (460EX) * - * Copyright 2008 DENX Software Engineering, Stefan Roese + * Copyright 2008-2009 DENX Software Engineering, Stefan Roese * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without @@ -149,19 +149,19 @@ /*RXDE*/ 0x5 0x4>; }; - USB0: ehci@bffd0400 { - compatible = "ibm,usb-ehci-460ex", "usb-ehci"; - interrupt-parent = <&UIC2>; - interrupts = <0x1d 4>; - reg = <4 0xbffd0400 0x90 4 0xbffd0490 0x70>; - }; + USB0: ehci@bffd0400 { + compatible = "ibm,usb-ehci-460ex", "usb-ehci"; + interrupt-parent = <&UIC2>; + interrupts = <0x1d 4>; + reg = <4 0xbffd0400 0x90 4 0xbffd0490 0x70>; + }; - USB1: usb@bffd0000 { - compatible = "ohci-le"; - reg = <4 0xbffd0000 0x60>; - interrupt-parent = <&UIC2>; - interrupts = <0x1e 4>; - }; + USB1: usb@bffd0000 { + compatible = "ohci-le"; + reg = <4 0xbffd0000 0x60>; + interrupt-parent = <&UIC2>; + interrupts = <0x1e 4>; + }; POB0: opb { compatible = "ibm,opb-460ex", "ibm,opb"; @@ -215,6 +215,29 @@ reg = <0x03fa0000 0x00060000>; }; }; + + ndfc@3,0 { + compatible = "ibm,ndfc"; + reg = <0x00000003 0x00000000 0x00002000>; + ccr = <0x00001000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + label = "u-boot"; + reg = <0x00000000 0x00100000>; + }; + partition@100000 { + label = "user"; + reg = <0x00000000 0x03f00000>; + }; + }; + }; }; UART0: serial@ef600300 { From 13ae564f1db967dd4ea244f21f3dad6a28fa351c Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 29 Jul 2009 01:40:56 +0000 Subject: [PATCH 0154/1662] powerpc/40x: Update Kilauea dts to support NAND, RTC and HWMON This patch adds support for the following devices to the Kilauea dts: - PPC4xx NAND controller (NDFC) - I2C RTC (Dallas DS1338) - I2C HWMON (Dallas DS1775) Additionally the partitioning of the NOR FLASH is changed. The dtb partition has been missing. Fixed in this patch. Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/kilauea.dts | 44 ++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/dts/kilauea.dts b/arch/powerpc/boot/dts/kilauea.dts index 5e6b08ff6f67..c46561456ede 100644 --- a/arch/powerpc/boot/dts/kilauea.dts +++ b/arch/powerpc/boot/dts/kilauea.dts @@ -1,7 +1,7 @@ /* * Device Tree Source for AMCC Kilauea (405EX) * - * Copyright 2007 DENX Software Engineering, Stefan Roese + * Copyright 2007-2009 DENX Software Engineering, Stefan Roese * * This file is licensed under the terms of the GNU General Public * License version 2. This program is licensed "as is" without @@ -150,7 +150,11 @@ #size-cells = <1>; partition@0 { label = "kernel"; - reg = <0x00000000 0x00200000>; + reg = <0x00000000 0x001e0000>; + }; + partition@1e0000 { + label = "dtb"; + reg = <0x001e0000 0x00020000>; }; partition@200000 { label = "root"; @@ -169,6 +173,29 @@ reg = <0x03fa0000 0x00060000>; }; }; + + ndfc@1,0 { + compatible = "ibm,ndfc"; + reg = <0x00000001 0x00000000 0x00002000>; + ccr = <0x00001000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + label = "u-boot"; + reg = <0x00000000 0x00100000>; + }; + partition@100000 { + label = "user"; + reg = <0x00000000 0x03f00000>; + }; + }; + }; }; UART0: serial@ef600200 { @@ -198,6 +225,18 @@ reg = <0xef600400 0x00000014>; interrupt-parent = <&UIC0>; interrupts = <0x2 0x4>; + #address-cells = <1>; + #size-cells = <0>; + + rtc@68 { + compatible = "dallas,ds1338"; + reg = <0x68>; + }; + + dtt@48 { + compatible = "dallas,ds1775"; + reg = <0x48>; + }; }; IIC1: i2c@ef600500 { @@ -207,7 +246,6 @@ interrupts = <0x7 0x4>; }; - RGMII0: emac-rgmii@ef600b00 { compatible = "ibm,rgmii-405ex", "ibm,rgmii"; reg = <0xef600b00 0x00000104>; From 0e8e844246fcf9c38906b62e5d05891dbbda5754 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 29 Jul 2009 07:05:11 +0000 Subject: [PATCH 0155/1662] powerpc/44x: Update Canyonlands defconfig to support NOR, NAND and RTC This patch adds support for the following devices to the Canyonlands defconfig file: - NOR FLASH - PPC4xx NAND controller (NDFC) - I2C RTC (M41T80) Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- .../powerpc/configs/44x/canyonlands_defconfig | 350 +++++++++++++++--- 1 file changed, 297 insertions(+), 53 deletions(-) diff --git a/arch/powerpc/configs/44x/canyonlands_defconfig b/arch/powerpc/configs/44x/canyonlands_defconfig index 5e85412eb9fa..b312b166be66 100644 --- a/arch/powerpc/configs/44x/canyonlands_defconfig +++ b/arch/powerpc/configs/44x/canyonlands_defconfig @@ -1,14 +1,14 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc3 -# Mon Feb 2 13:13:04 2009 +# Linux kernel version: 2.6.31-rc4 +# Wed Jul 29 17:27:20 2009 # # CONFIG_PPC64 is not set # # Processor support # -# CONFIG_6xx is not set +# CONFIG_PPC_BOOK3S_32 is not set # CONFIG_PPC_85xx is not set # CONFIG_PPC_8xx is not set # CONFIG_40x is not set @@ -31,15 +31,16 @@ CONFIG_GENERIC_TIME=y CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y # CONFIG_HAVE_SETUP_PER_CPU_AREA is not set CONFIG_IRQ_PER_CPU=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_ARCH_HAS_ILOG2_U32=y CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_ARCH_NO_VIRT_TO_BUS is not set CONFIG_PPC=y @@ -53,11 +54,14 @@ CONFIG_PPC_UDBG_16550=y # CONFIG_GENERIC_TBSYNC is not set CONFIG_AUDIT_ARCH=y CONFIG_GENERIC_BUG=y +CONFIG_DTC=y # CONFIG_DEFAULT_UIMAGE is not set CONFIG_PPC_DCR_NATIVE=y # CONFIG_PPC_DCR_MMIO is not set CONFIG_PPC_DCR=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_CONSTRUCTORS=y # # General setup @@ -71,6 +75,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set @@ -93,8 +98,12 @@ CONFIG_SYSFS_DEPRECATED_V2=y # CONFIG_NAMESPACES is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y CONFIG_EMBEDDED=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y @@ -104,23 +113,30 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y -CONFIG_COMPAT_BRK=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y +CONFIG_HAVE_PERF_COUNTERS=y + +# +# Performance Counters +# +# CONFIG_PERF_COUNTERS is not set CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_STRIP_ASM_SYMS is not set +CONFIG_COMPAT_BRK=y # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y @@ -128,6 +144,12 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -139,8 +161,7 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y -CONFIG_LBD=y -# CONFIG_BLK_DEV_IO_TRACE is not set +CONFIG_LBDAF=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -176,6 +197,7 @@ CONFIG_PPC4xx_PCI_EXPRESS=y # CONFIG_ARCHES is not set CONFIG_CANYONLANDS=y # CONFIG_GLACIER is not set +# CONFIG_REDWOOD is not set # CONFIG_YOSEMITE is not set # CONFIG_XILINX_VIRTEX440_GENERIC_BOARD is not set CONFIG_PPC44x_SIMPLE=y @@ -218,6 +240,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_MATH_EMULATION is not set # CONFIG_IOMMU_HELPER is not set +# CONFIG_SWIOTLB is not set CONFIG_PPC_NEED_DMA_SYNC_OPS=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_ARCH_HAS_WALK_MEMORY=y @@ -237,10 +260,14 @@ CONFIG_PHYS_ADDR_T_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 +CONFIG_STDBINUTILS=y CONFIG_PPC_4K_PAGES=y # CONFIG_PPC_16K_PAGES is not set # CONFIG_PPC_64K_PAGES is not set +# CONFIG_PPC_256K_PAGES is not set CONFIG_FORCE_MAX_ZONEORDER=11 CONFIG_PROC_DEVICETREE=y CONFIG_CMDLINE_BOOL=y @@ -265,6 +292,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y # CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set +# CONFIG_PCI_IOV is not set # CONFIG_PCCARD is not set # CONFIG_HOTPLUG_PCI is not set # CONFIG_HAS_RAPIDIO is not set @@ -282,14 +310,12 @@ CONFIG_PAGE_OFFSET=0xc0000000 CONFIG_KERNEL_START=0xc0000000 CONFIG_PHYSICAL_START=0x00000000 CONFIG_TASK_SIZE=0xc0000000 -CONFIG_CONSISTENT_START=0xff100000 CONFIG_CONSISTENT_SIZE=0x00200000 CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -339,6 +365,8 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set +# CONFIG_IEEE802154 is not set # CONFIG_NET_SCHED is not set # CONFIG_DCB is not set @@ -351,7 +379,6 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set # CONFIG_WIRELESS is not set # CONFIG_WIMAX is not set # CONFIG_RFKILL is not set @@ -375,7 +402,101 @@ CONFIG_EXTRA_FIRMWARE="" # CONFIG_SYS_HYPERVISOR is not set CONFIG_CONNECTOR=y CONFIG_PROC_EVENTS=y -# CONFIG_MTD is not set +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +# CONFIG_MTD_CONCAT is not set +CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_TESTS is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y +# CONFIG_MTD_AR7_PARTS is not set + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLKDEVS=y +CONFIG_MTD_BLOCK=y +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set +# CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=y +# CONFIG_MTD_JEDECPROBE is not set +CONFIG_MTD_GEN_PROBE=y +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +# CONFIG_MTD_CFI_INTELEXT is not set +CONFIG_MTD_CFI_AMDSTD=y +# CONFIG_MTD_CFI_STAA is not set +CONFIG_MTD_CFI_UTIL=y +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +# CONFIG_MTD_ABSENT is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_COMPLEX_MAPPINGS is not set +# CONFIG_MTD_PHYSMAP is not set +CONFIG_MTD_PHYSMAP_OF=y +# CONFIG_MTD_INTEL_VR_NOR is not set +# CONFIG_MTD_PLATRAM is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLOCK2MTD is not set + +# +# Disk-On-Chip Device Drivers +# +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set +CONFIG_MTD_NAND=y +# CONFIG_MTD_NAND_VERIFY_WRITE is not set +CONFIG_MTD_NAND_ECC_SMC=y +# CONFIG_MTD_NAND_MUSEUM_IDS is not set +CONFIG_MTD_NAND_IDS=y +CONFIG_MTD_NAND_NDFC=y +# CONFIG_MTD_NAND_DISKONCHIP is not set +# CONFIG_MTD_NAND_CAFE is not set +# CONFIG_MTD_NAND_NANDSIM is not set +# CONFIG_MTD_NAND_PLATFORM is not set +# CONFIG_MTD_ALAUDA is not set +# CONFIG_MTD_NAND_FSL_ELBC is not set +# CONFIG_MTD_ONENAND is not set + +# +# LPDDR flash memory drivers +# +# CONFIG_MTD_LPDDR is not set + +# +# UBI - Unsorted block images +# +# CONFIG_MTD_UBI is not set CONFIG_OF_DEVICE=y CONFIG_OF_I2C=y # CONFIG_PARPORT is not set @@ -418,7 +539,11 @@ CONFIG_HAVE_IDE=y # # -# Enable only one of the two stacks, unless you know what you are doing +# You can enable one or both FireWire driver stacks. +# + +# +# See the help texts for more information. # # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set @@ -439,6 +564,8 @@ CONFIG_NET_ETHERNET=y # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set CONFIG_IBM_NEW_EMAC=y @@ -457,6 +584,7 @@ CONFIG_IBM_NEW_EMAC_EMAC4=y # CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set # CONFIG_NET_PCI is not set # CONFIG_B44 is not set +# CONFIG_KS8842 is not set # CONFIG_ATL2 is not set # CONFIG_NETDEV_1000 is not set # CONFIG_NETDEV_10000 is not set @@ -467,7 +595,6 @@ CONFIG_IBM_NEW_EMAC_EMAC4=y # # CONFIG_WLAN_PRE80211 is not set # CONFIG_WLAN_80211 is not set -# CONFIG_IWLWIFI_LEDS is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -542,7 +669,6 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_IPMI_HANDLER is not set # CONFIG_HW_RANDOM is not set # CONFIG_NVRAM is not set -# CONFIG_GEN_RTC is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set # CONFIG_RAW_DRIVER is not set @@ -608,14 +734,17 @@ CONFIG_I2C_IBM_IIC=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set -# CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_ALGO is not set # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set + +# +# PPS support +# +# CONFIG_PPS is not set CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set @@ -640,6 +769,7 @@ CONFIG_SENSORS_AD7414=y # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set # CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_IT87 is not set @@ -654,11 +784,14 @@ CONFIG_SENSORS_AD7414=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -666,6 +799,7 @@ CONFIG_SENSORS_AD7414=y # CONFIG_SENSORS_SMSC47B397 is not set # CONFIG_SENSORS_ADS7828 is not set # CONFIG_SENSORS_THMC50 is not set +# CONFIG_SENSORS_TMP401 is not set # CONFIG_SENSORS_VIA686A is not set # CONFIG_SENSORS_VT1211 is not set # CONFIG_SENSORS_VT8231 is not set @@ -700,24 +834,9 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_MFD_WM8400 is not set # CONFIG_MFD_WM8350_I2C is not set # CONFIG_MFD_PCF50633 is not set +# CONFIG_AB3100_CORE is not set # CONFIG_REGULATOR is not set - -# -# Multimedia devices -# - -# -# Multimedia core support -# -# CONFIG_VIDEO_DEV is not set -# CONFIG_DVB_CORE is not set -# CONFIG_VIDEO_MEDIA is not set - -# -# Multimedia drivers -# -# CONFIG_DAB is not set -# CONFIG_USB_DABUSB is not set +# CONFIG_MEDIA_SUPPORT is not set # # Graphics support @@ -759,6 +878,7 @@ CONFIG_USB_MON=y # USB Host Controller Drivers # # CONFIG_USB_C67X00_HCD is not set +# CONFIG_USB_XHCI_HCD is not set CONFIG_USB_EHCI_HCD=m # CONFIG_USB_EHCI_ROOT_HUB_TT is not set # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -767,9 +887,9 @@ CONFIG_USB_EHCI_HCD_PPC_OF=y # CONFIG_USB_ISP116X_HCD is not set # CONFIG_USB_ISP1760_HCD is not set CONFIG_USB_OHCI_HCD=y -CONFIG_USB_OHCI_HCD_PPC_OF=y CONFIG_USB_OHCI_HCD_PPC_OF_BE=y CONFIG_USB_OHCI_HCD_PPC_OF_LE=y +CONFIG_USB_OHCI_HCD_PPC_OF=y CONFIG_USB_OHCI_HCD_PCI=y CONFIG_USB_OHCI_BIG_ENDIAN_DESC=y CONFIG_USB_OHCI_BIG_ENDIAN_MMIO=y @@ -789,11 +909,11 @@ CONFIG_USB_OHCI_LITTLE_ENDIAN=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_LIBUSUAL=y @@ -821,7 +941,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -837,6 +956,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -844,9 +964,70 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set # CONFIG_EDAC is not set -# CONFIG_RTC_CLASS is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +# CONFIG_RTC_DRV_DS1307 is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +CONFIG_RTC_DRV_M41T80=y +# CONFIG_RTC_DRV_M41T80_WDT is not set +# CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8581 is not set +# CONFIG_RTC_DRV_RX8025 is not set + +# +# SPI RTC drivers +# + +# +# Platform RTC drivers +# +# CONFIG_RTC_DRV_CMOS is not set +# CONFIG_RTC_DRV_DS1286 is not set +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T35 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_BQ4802 is not set +# CONFIG_RTC_DRV_V3020 is not set + +# +# on-CPU RTC drivers +# +# CONFIG_RTC_DRV_GENERIC is not set # CONFIG_DMADEVICES is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set + +# +# TI VLYNQ +# # CONFIG_STAGING is not set # @@ -860,11 +1041,12 @@ CONFIG_EXT2_FS=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_FS_POSIX_ACL is not set -CONFIG_FILE_LOCKING=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set # CONFIG_BTRFS_FS is not set +CONFIG_FILE_LOCKING=y +CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y @@ -873,6 +1055,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -906,6 +1093,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set +# CONFIG_JFFS2_FS is not set CONFIG_CRAMFS=y # CONFIG_SQUASHFS is not set # CONFIG_VXFS_FS is not set @@ -916,6 +1104,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -927,7 +1116,6 @@ CONFIG_LOCKD=y CONFIG_LOCKD_V4=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y -# CONFIG_SUNRPC_REGISTER_V4 is not set # CONFIG_RPCSEC_GSS_KRB5 is not set # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -941,8 +1129,48 @@ CONFIG_SUNRPC=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y -# CONFIG_NLS is not set +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +# CONFIG_NLS_CODEPAGE_437 is not set +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +# CONFIG_NLS_ASCII is not set +# CONFIG_NLS_ISO8859_1 is not set +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_UTF8 is not set # CONFIG_DLM is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines @@ -957,11 +1185,13 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y CONFIG_HAVE_LMB=y +CONFIG_NLATTR=y +CONFIG_GENERIC_ATOMIC64=y # # Kernel hacking @@ -979,6 +1209,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -989,6 +1222,9 @@ CONFIG_SCHED_DEBUG=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -1000,7 +1236,6 @@ CONFIG_SCHED_DEBUG=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_CPU_STALL_DETECTOR is not set # CONFIG_BACKTRACE_SELF_TEST is not set @@ -1008,27 +1243,36 @@ CONFIG_SCHED_DEBUG=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y - -# -# Tracers -# +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set -# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_ENABLE_DEFAULT_TRACERS is not set # CONFIG_BOOT_TRACER is not set -# CONFIG_TRACE_BRANCH_PROFILING is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_PROFILE_ALL_BRANCHES is not set # CONFIG_STACK_TRACER is not set -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DYNAMIC_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_PPC_DISABLE_WERROR is not set +CONFIG_PPC_WERROR=y CONFIG_PRINT_STACK_DEPTH=64 # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PPC_EMULATED_STATS is not set # CONFIG_CODE_PATCHING_SELFTEST is not set # CONFIG_FTR_FIXUP_SELFTEST is not set # CONFIG_MSI_BITMAP_SELFTEST is not set From 6b045a818f1d0b9b68c1d47e745d521dd115991a Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Wed, 29 Jul 2009 01:41:06 +0000 Subject: [PATCH 0156/1662] powerpc/40x: Update kilauea defconfig to support NAND, RTC and HWMON This patch adds support for the following devices to the Kilauea defconfig file: - PPC4xx NAND controller (NDFC) - I2C RTC (Dallas DS1338) - I2C HWMON (Dallas DS1775) Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/configs/40x/kilauea_defconfig | 298 ++++++++++++++++++--- 1 file changed, 256 insertions(+), 42 deletions(-) diff --git a/arch/powerpc/configs/40x/kilauea_defconfig b/arch/powerpc/configs/40x/kilauea_defconfig index 865725effe93..9a05ec0ec312 100644 --- a/arch/powerpc/configs/40x/kilauea_defconfig +++ b/arch/powerpc/configs/40x/kilauea_defconfig @@ -1,14 +1,14 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.30-rc7 -# Wed Jun 3 10:18:16 2009 +# Linux kernel version: 2.6.31-rc4 +# Wed Jul 29 13:28:37 2009 # # CONFIG_PPC64 is not set # # Processor support # -# CONFIG_6xx is not set +# CONFIG_PPC_BOOK3S_32 is not set # CONFIG_PPC_85xx is not set # CONFIG_PPC_8xx is not set CONFIG_40x=y @@ -32,11 +32,11 @@ CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_IRQ_PER_CPU=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_ARCH_HAS_ILOG2_U32=y CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_ARCH_NO_VIRT_TO_BUS is not set CONFIG_PPC=y @@ -57,6 +57,7 @@ CONFIG_PPC_DCR_NATIVE=y CONFIG_PPC_DCR=y CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_CONSTRUCTORS=y # # General setup @@ -108,7 +109,6 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y -# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -121,9 +121,16 @@ CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y +CONFIG_HAVE_PERF_COUNTERS=y + +# +# Performance Counters +# +# CONFIG_PERF_COUNTERS is not set CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_COMPAT_BRK=y # CONFIG_SLAB is not set CONFIG_SLUB=y @@ -137,6 +144,11 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set # CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y @@ -149,7 +161,7 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y -CONFIG_LBD=y +CONFIG_LBDAF=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -220,6 +232,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_MATH_EMULATION is not set # CONFIG_IOMMU_HELPER is not set +# CONFIG_SWIOTLB is not set CONFIG_PPC_NEED_DMA_SYNC_OPS=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_ARCH_HAS_WALK_MEMORY=y @@ -239,9 +252,9 @@ CONFIG_MIGRATION=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -CONFIG_UNEVICTABLE_LRU=y CONFIG_HAVE_MLOCK=y CONFIG_HAVE_MLOCKED_PAGE_BIT=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 CONFIG_PPC_4K_PAGES=y # CONFIG_PPC_16K_PAGES is not set # CONFIG_PPC_64K_PAGES is not set @@ -344,6 +357,7 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set # CONFIG_PHONET is not set +# CONFIG_IEEE802154 is not set # CONFIG_NET_SCHED is not set # CONFIG_DCB is not set @@ -393,9 +407,8 @@ CONFIG_MTD_OF_PARTS=y # User Modules And Translation Layers # CONFIG_MTD_CHAR=y -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -# CONFIG_MTD_BLOCK_RO is not set +CONFIG_MTD_BLKDEVS=y +CONFIG_MTD_BLOCK=y # CONFIG_FTL is not set # CONFIG_NFTL is not set # CONFIG_INFTL is not set @@ -452,7 +465,17 @@ CONFIG_MTD_PHYSMAP_OF=y # CONFIG_MTD_DOC2000 is not set # CONFIG_MTD_DOC2001 is not set # CONFIG_MTD_DOC2001PLUS is not set -# CONFIG_MTD_NAND is not set +CONFIG_MTD_NAND=y +# CONFIG_MTD_NAND_VERIFY_WRITE is not set +CONFIG_MTD_NAND_ECC_SMC=y +# CONFIG_MTD_NAND_MUSEUM_IDS is not set +CONFIG_MTD_NAND_IDS=y +CONFIG_MTD_NAND_NDFC=y +# CONFIG_MTD_NAND_DISKONCHIP is not set +# CONFIG_MTD_NAND_CAFE is not set +# CONFIG_MTD_NAND_NANDSIM is not set +# CONFIG_MTD_NAND_PLATFORM is not set +# CONFIG_MTD_NAND_FSL_ELBC is not set # CONFIG_MTD_ONENAND is not set # @@ -465,6 +488,7 @@ CONFIG_MTD_PHYSMAP_OF=y # # CONFIG_MTD_UBI is not set CONFIG_OF_DEVICE=y +CONFIG_OF_I2C=y # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_FD is not set @@ -504,14 +528,17 @@ CONFIG_HAVE_IDE=y # # -# Enable only one of the two stacks, unless you know what you are doing +# You can enable one or both FireWire driver stacks. +# + +# +# See the help texts for more information. # # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set # CONFIG_I2O is not set # CONFIG_MACINTOSH_DRIVERS is not set CONFIG_NETDEVICES=y -CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_DUMMY is not set # CONFIG_BONDING is not set # CONFIG_MACVLAN is not set @@ -546,6 +573,7 @@ CONFIG_IBM_NEW_EMAC_EMAC4=y # CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set # CONFIG_NET_PCI is not set # CONFIG_B44 is not set +# CONFIG_KS8842 is not set # CONFIG_ATL2 is not set # CONFIG_NETDEV_1000 is not set # CONFIG_NETDEV_10000 is not set @@ -621,20 +649,150 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_IPMI_HANDLER is not set # CONFIG_HW_RANDOM is not set # CONFIG_NVRAM is not set -# CONFIG_GEN_RTC is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set # CONFIG_RAW_DRIVER is not set # CONFIG_TCG_TPM is not set CONFIG_DEVPORT=y -# CONFIG_I2C is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_ISCH is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_IBM_IIC=y +# CONFIG_I2C_MPC is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set + +# +# Graphics adapter I2C/DDC channel drivers +# +# CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set + +# +# PPS support +# +# CONFIG_PPS is not set CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set # CONFIG_POWER_SUPPLY is not set -# CONFIG_HWMON is not set +CONFIG_HWMON=y +# CONFIG_HWMON_VID is not set +# CONFIG_SENSORS_AD7414 is not set +# CONFIG_SENSORS_AD7418 is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1029 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ADT7462 is not set +# CONFIG_SENSORS_ADT7470 is not set +# CONFIG_SENSORS_ADT7473 is not set +# CONFIG_SENSORS_ADT7475 is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_DS1621 is not set +# CONFIG_SENSORS_I5K_AMB is not set +# CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_F71882FG is not set +# CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM63 is not set +CONFIG_SENSORS_LM75=y +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set +# CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_MAX6650 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_DME1737 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +# CONFIG_SENSORS_SMSC47B397 is not set +# CONFIG_SENSORS_ADS7828 is not set +# CONFIG_SENSORS_THMC50 is not set +# CONFIG_SENSORS_TMP401 is not set +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT1211 is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83793 is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83L786NG is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set +# CONFIG_HWMON_DEBUG_CHIP is not set CONFIG_THERMAL=y +# CONFIG_THERMAL_HWMON is not set # CONFIG_WATCHDOG is not set CONFIG_SSB_POSSIBLE=y @@ -649,24 +807,15 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set # CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_AB3100_CORE is not set # CONFIG_REGULATOR is not set - -# -# Multimedia devices -# - -# -# Multimedia core support -# -# CONFIG_VIDEO_DEV is not set -# CONFIG_DVB_CORE is not set -# CONFIG_VIDEO_MEDIA is not set - -# -# Multimedia drivers -# -# CONFIG_DAB is not set +# CONFIG_MEDIA_SUPPORT is not set # # Graphics support @@ -691,10 +840,69 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set # CONFIG_EDAC is not set -# CONFIG_RTC_CLASS is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_DS1307=y +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8581 is not set +# CONFIG_RTC_DRV_RX8025 is not set + +# +# SPI RTC drivers +# + +# +# Platform RTC drivers +# +# CONFIG_RTC_DRV_CMOS is not set +# CONFIG_RTC_DRV_DS1286 is not set +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T35 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_BQ4802 is not set +# CONFIG_RTC_DRV_V3020 is not set + +# +# on-CPU RTC drivers +# +# CONFIG_RTC_DRV_GENERIC is not set # CONFIG_DMADEVICES is not set # CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set + +# +# TI VLYNQ +# # CONFIG_STAGING is not set # @@ -708,11 +916,12 @@ CONFIG_EXT2_FS=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_FS_POSIX_ACL is not set -CONFIG_FILE_LOCKING=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set # CONFIG_BTRFS_FS is not set +CONFIG_FILE_LOCKING=y +CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y @@ -818,6 +1027,7 @@ CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y CONFIG_HAVE_LMB=y CONFIG_NLATTR=y +CONFIG_GENERIC_ATOMIC64=y # # Kernel hacking @@ -848,6 +1058,9 @@ CONFIG_SCHED_DEBUG=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -859,7 +1072,6 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_CPU_STALL_DETECTOR is not set # CONFIG_BACKTRACE_SELF_TEST is not set @@ -873,16 +1085,15 @@ CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_TRACING_SUPPORT=y - -# -# Tracers -# +CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set -# CONFIG_CONTEXT_SWITCH_TRACER is not set -# CONFIG_EVENT_TRACER is not set +# CONFIG_ENABLE_DEFAULT_TRACERS is not set # CONFIG_BOOT_TRACER is not set -# CONFIG_TRACE_BRANCH_PROFILING is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_PROFILE_ALL_BRANCHES is not set # CONFIG_STACK_TRACER is not set # CONFIG_KMEMTRACE is not set # CONFIG_WORKQUEUE_TRACER is not set @@ -891,6 +1102,9 @@ CONFIG_TRACING_SUPPORT=y # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_PPC_DISABLE_WERROR is not set +CONFIG_PPC_WERROR=y CONFIG_PRINT_STACK_DEPTH=64 # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_DEBUG_STACK_USAGE is not set From 189339d47d30ad171aa1c95ffc90a6177b3ce4a8 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Thu, 30 Jul 2009 17:56:38 -0500 Subject: [PATCH 0157/1662] powerpc/85xx: Move mpc8536ds.dts to address-cells/size-cells = <2> Change the top-level #address-cells and #size-cells to <2> so the mpc8536ds.dts is easier to deal with both a true 32-bit physical or 36-bit physical address space. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mpc8536ds.dts | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/boot/dts/mpc8536ds.dts b/arch/powerpc/boot/dts/mpc8536ds.dts index e781ad2f1f8a..22caf69e6efd 100644 --- a/arch/powerpc/boot/dts/mpc8536ds.dts +++ b/arch/powerpc/boot/dts/mpc8536ds.dts @@ -14,8 +14,8 @@ / { model = "fsl,mpc8536ds"; compatible = "fsl,mpc8536ds"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; aliases { ethernet0 = &enet0; @@ -42,7 +42,7 @@ memory { device_type = "memory"; - reg = <00000000 00000000>; // Filled by U-Boot + reg = <0 0 0 0>; // Filled by U-Boot }; soc@ffe00000 { @@ -50,7 +50,7 @@ #size-cells = <1>; device_type = "soc"; compatible = "simple-bus"; - ranges = <0x0 0xffe00000 0x100000>; + ranges = <0x0 0 0xffe00000 0x100000>; bus-frequency = <0>; // Filled out by uboot. ecm-law@0 { @@ -347,13 +347,13 @@ interrupt-parent = <&mpic>; interrupts = <24 0x2>; bus-range = <0 0xff>; - ranges = <0x02000000 0 0x80000000 0x80000000 0 0x10000000 - 0x01000000 0 0x00000000 0xffc00000 0 0x00010000>; + ranges = <0x02000000 0 0x80000000 0 0x80000000 0 0x10000000 + 0x01000000 0 0x00000000 0 0xffc00000 0 0x00010000>; clock-frequency = <66666666>; #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; - reg = <0xffe08000 0x1000>; + reg = <0 0xffe08000 0 0x1000>; }; pci1: pcie@ffe09000 { @@ -362,10 +362,10 @@ #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; - reg = <0xffe09000 0x1000>; + reg = <0 0xffe09000 0 0x1000>; bus-range = <0 0xff>; - ranges = <0x02000000 0 0x98000000 0x98000000 0 0x08000000 - 0x01000000 0 0x00000000 0xffc20000 0 0x00010000>; + ranges = <0x02000000 0 0x98000000 0 0x98000000 0 0x08000000 + 0x01000000 0 0x00000000 0 0xffc20000 0 0x00010000>; clock-frequency = <33333333>; interrupt-parent = <&mpic>; interrupts = <25 0x2>; @@ -398,10 +398,10 @@ #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; - reg = <0xffe0a000 0x1000>; + reg = <0 0xffe0a000 0 0x1000>; bus-range = <0 0xff>; - ranges = <0x02000000 0 0x90000000 0x90000000 0 0x08000000 - 0x01000000 0 0x00000000 0xffc10000 0 0x00010000>; + ranges = <0x02000000 0 0x90000000 0 0x90000000 0 0x08000000 + 0x01000000 0 0x00000000 0 0xffc10000 0 0x00010000>; clock-frequency = <33333333>; interrupt-parent = <&mpic>; interrupts = <26 0x2>; @@ -434,10 +434,10 @@ #interrupt-cells = <1>; #size-cells = <2>; #address-cells = <3>; - reg = <0xffe0b000 0x1000>; + reg = <0 0xffe0b000 0 0x1000>; bus-range = <0 0xff>; - ranges = <0x02000000 0 0xa0000000 0xa0000000 0 0x20000000 - 0x01000000 0 0x00000000 0xffc30000 0 0x00010000>; + ranges = <0x02000000 0 0xa0000000 0 0xa0000000 0 0x20000000 + 0x01000000 0 0x00000000 0 0xffc30000 0 0x00010000>; clock-frequency = <33333333>; interrupt-parent = <&mpic>; interrupts = <27 0x2>; From b6c316a1f6447d6dd0fd8d443b66643e328e04fa Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Thu, 30 Jul 2009 17:56:54 -0500 Subject: [PATCH 0158/1662] powerpc/85xx: Added 36-bit physical device tree for mpc8536ds board Added a device tree that should be similiar to mpc8536ds.dtb except the physical addresses for all IO are above the 4G boundary. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mpc8536ds_36b.dts | 467 ++++++++++++++++++++++++ 1 file changed, 467 insertions(+) create mode 100644 arch/powerpc/boot/dts/mpc8536ds_36b.dts diff --git a/arch/powerpc/boot/dts/mpc8536ds_36b.dts b/arch/powerpc/boot/dts/mpc8536ds_36b.dts new file mode 100644 index 000000000000..113ed8b7c898 --- /dev/null +++ b/arch/powerpc/boot/dts/mpc8536ds_36b.dts @@ -0,0 +1,467 @@ +/* + * MPC8536 DS Device Tree Source + * + * Copyright 2008-2009 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +/dts-v1/; + +/ { + model = "fsl,mpc8536ds"; + compatible = "fsl,mpc8536ds"; + #address-cells = <2>; + #size-cells = <2>; + + aliases { + ethernet0 = &enet0; + ethernet1 = &enet1; + serial0 = &serial0; + serial1 = &serial1; + pci0 = &pci0; + pci1 = &pci1; + pci2 = &pci2; + pci3 = &pci3; + }; + + cpus { + #cpus = <1>; + #address-cells = <1>; + #size-cells = <0>; + + PowerPC,8536@0 { + device_type = "cpu"; + reg = <0>; + next-level-cache = <&L2>; + }; + }; + + memory { + device_type = "memory"; + reg = <0 0 0 0>; // Filled by U-Boot + }; + + soc@fffe00000 { + #address-cells = <1>; + #size-cells = <1>; + device_type = "soc"; + compatible = "simple-bus"; + ranges = <0x0 0xf 0xffe00000 0x100000>; + bus-frequency = <0>; // Filled out by uboot. + + ecm-law@0 { + compatible = "fsl,ecm-law"; + reg = <0x0 0x1000>; + fsl,num-laws = <12>; + }; + + ecm@1000 { + compatible = "fsl,mpc8536-ecm", "fsl,ecm"; + reg = <0x1000 0x1000>; + interrupts = <17 2>; + interrupt-parent = <&mpic>; + }; + + memory-controller@2000 { + compatible = "fsl,mpc8536-memory-controller"; + reg = <0x2000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <18 0x2>; + }; + + L2: l2-cache-controller@20000 { + compatible = "fsl,mpc8536-l2-cache-controller"; + reg = <0x20000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <16 0x2>; + }; + + i2c@3000 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <0>; + compatible = "fsl-i2c"; + reg = <0x3000 0x100>; + interrupts = <43 0x2>; + interrupt-parent = <&mpic>; + dfsrr; + }; + + i2c@3100 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + compatible = "fsl-i2c"; + reg = <0x3100 0x100>; + interrupts = <43 0x2>; + interrupt-parent = <&mpic>; + dfsrr; + rtc@68 { + compatible = "dallas,ds3232"; + reg = <0x68>; + interrupts = <0 0x1>; + interrupt-parent = <&mpic>; + }; + }; + + dma@21300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,mpc8536-dma", "fsl,eloplus-dma"; + reg = <0x21300 4>; + ranges = <0 0x21100 0x200>; + cell-index = <0>; + dma-channel@0 { + compatible = "fsl,mpc8536-dma-channel", + "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + cell-index = <0>; + interrupt-parent = <&mpic>; + interrupts = <20 2>; + }; + dma-channel@80 { + compatible = "fsl,mpc8536-dma-channel", + "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&mpic>; + interrupts = <21 2>; + }; + dma-channel@100 { + compatible = "fsl,mpc8536-dma-channel", + "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&mpic>; + interrupts = <22 2>; + }; + dma-channel@180 { + compatible = "fsl,mpc8536-dma-channel", + "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + cell-index = <3>; + interrupt-parent = <&mpic>; + interrupts = <23 2>; + }; + }; + + usb@22000 { + compatible = "fsl,mpc8536-usb2-mph", "fsl-usb2-mph"; + reg = <0x22000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + interrupt-parent = <&mpic>; + interrupts = <28 0x2>; + phy_type = "ulpi"; + }; + + usb@23000 { + compatible = "fsl,mpc8536-usb2-mph", "fsl-usb2-mph"; + reg = <0x23000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + interrupt-parent = <&mpic>; + interrupts = <46 0x2>; + phy_type = "ulpi"; + }; + + enet0: ethernet@24000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <0>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x24000 0x1000>; + ranges = <0x0 0x24000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <29 2 30 2 34 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi0>; + phy-handle = <&phy1>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-mdio"; + reg = <0x520 0x20>; + + phy0: ethernet-phy@0 { + interrupt-parent = <&mpic>; + interrupts = <10 0x1>; + reg = <0>; + device_type = "ethernet-phy"; + }; + phy1: ethernet-phy@1 { + interrupt-parent = <&mpic>; + interrupts = <10 0x1>; + reg = <1>; + device_type = "ethernet-phy"; + }; + tbi0: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + enet1: ethernet@26000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <1>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x26000 0x1000>; + ranges = <0x0 0x26000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <31 2 32 2 33 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi1>; + phy-handle = <&phy0>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-tbi"; + reg = <0x520 0x20>; + + tbi1: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + usb@2b000 { + compatible = "fsl,mpc8536-usb2-dr", "fsl-usb2-dr"; + reg = <0x2b000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + interrupt-parent = <&mpic>; + interrupts = <60 0x2>; + dr_mode = "peripheral"; + phy_type = "ulpi"; + }; + + serial0: serial@4500 { + cell-index = <0>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4500 0x100>; + clock-frequency = <0>; + interrupts = <42 0x2>; + interrupt-parent = <&mpic>; + }; + + serial1: serial@4600 { + cell-index = <1>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4600 0x100>; + clock-frequency = <0>; + interrupts = <42 0x2>; + interrupt-parent = <&mpic>; + }; + + crypto@30000 { + compatible = "fsl,sec3.0", "fsl,sec2.4", "fsl,sec2.2", + "fsl,sec2.1", "fsl,sec2.0"; + reg = <0x30000 0x10000>; + interrupts = <45 2 58 2>; + interrupt-parent = <&mpic>; + fsl,num-channels = <4>; + fsl,channel-fifo-len = <24>; + fsl,exec-units-mask = <0x9fe>; + fsl,descriptor-types-mask = <0x3ab0ebf>; + }; + + sata@18000 { + compatible = "fsl,mpc8536-sata", "fsl,pq-sata"; + reg = <0x18000 0x1000>; + cell-index = <1>; + interrupts = <74 0x2>; + interrupt-parent = <&mpic>; + }; + + sata@19000 { + compatible = "fsl,mpc8536-sata", "fsl,pq-sata"; + reg = <0x19000 0x1000>; + cell-index = <2>; + interrupts = <41 0x2>; + interrupt-parent = <&mpic>; + }; + + global-utilities@e0000 { //global utilities block + compatible = "fsl,mpc8548-guts"; + reg = <0xe0000 0x1000>; + fsl,has-rstcr; + }; + + mpic: pic@40000 { + clock-frequency = <0>; + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <2>; + reg = <0x40000 0x40000>; + compatible = "chrp,open-pic"; + device_type = "open-pic"; + big-endian; + }; + + msi@41600 { + compatible = "fsl,mpc8536-msi", "fsl,mpic-msi"; + reg = <0x41600 0x80>; + msi-available-ranges = <0 0x100>; + interrupts = < + 0xe0 0 + 0xe1 0 + 0xe2 0 + 0xe3 0 + 0xe4 0 + 0xe5 0 + 0xe6 0 + 0xe7 0>; + interrupt-parent = <&mpic>; + }; + }; + + pci0: pci@fffe08000 { + compatible = "fsl,mpc8540-pci"; + device_type = "pci"; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + + /* IDSEL 0x11 J17 Slot 1 */ + 0x8800 0 0 1 &mpic 1 1 + 0x8800 0 0 2 &mpic 2 1 + 0x8800 0 0 3 &mpic 3 1 + 0x8800 0 0 4 &mpic 4 1>; + + interrupt-parent = <&mpic>; + interrupts = <24 0x2>; + bus-range = <0 0xff>; + ranges = <0x02000000 0 0xf0000000 0xc 0x00000000 0 0x10000000 + 0x01000000 0 0x00000000 0xf 0xffc00000 0 0x00010000>; + clock-frequency = <66666666>; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xf 0xffe08000 0 0x1000>; + }; + + pci1: pcie@fffe09000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xf 0xffe09000 0 0x1000>; + bus-range = <0 0xff>; + ranges = <0x02000000 0 0xf8000000 0xc 0x18000000 0 0x08000000 + 0x01000000 0 0x00000000 0xf 0xffc20000 0 0x00010000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <25 0x2>; + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = < + /* IDSEL 0x0 */ + 0000 0 0 1 &mpic 4 1 + 0000 0 0 2 &mpic 5 1 + 0000 0 0 3 &mpic 6 1 + 0000 0 0 4 &mpic 7 1 + >; + pcie@0 { + reg = <0 0 0 0 0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x02000000 0 0xf8000000 + 0x02000000 0 0xf8000000 + 0 0x08000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci2: pcie@fffe0a000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xf 0xffe0a000 0 0x1000>; + bus-range = <0 0xff>; + ranges = <0x02000000 0 0xf8000000 0xc 0x10000000 0 0x08000000 + 0x01000000 0 0x00000000 0xf 0xffc10000 0 0x00010000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <26 0x2>; + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = < + /* IDSEL 0x0 */ + 0000 0 0 1 &mpic 0 1 + 0000 0 0 2 &mpic 1 1 + 0000 0 0 3 &mpic 2 1 + 0000 0 0 4 &mpic 3 1 + >; + pcie@0 { + reg = <0 0 0 0 0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x02000000 0 0xf8000000 + 0x02000000 0 0xf8000000 + 0 0x08000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00010000>; + }; + }; + + pci3: pcie@fffe0b000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xf 0xffe0b000 0 0x1000>; + bus-range = <0 0xff>; + ranges = <0x02000000 0 0xe0000000 0xc 0x20000000 0 0x20000000 + 0x01000000 0 0x00000000 0xf 0xffc30000 0 0x00010000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <27 0x2>; + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = < + /* IDSEL 0x0 */ + 0000 0 0 1 &mpic 8 1 + 0000 0 0 2 &mpic 9 1 + 0000 0 0 3 &mpic 10 1 + 0000 0 0 4 &mpic 11 1 + >; + + pcie@0 { + reg = <0 0 0 0 0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x02000000 0 0xe0000000 + 0x02000000 0 0xe0000000 + 0 0x20000000 + + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00100000>; + }; + }; +}; From 8dcd038a13b8e322c49fe0d3e31a0deaba4fd5fd Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 16:00:37 -0700 Subject: [PATCH 0159/1662] powerpc/fsl-booke: read buffer overflow cam[tlbcam_index] is checked before tlbcam_index < ARRAY_SIZE(cam) Signed-off-by: Roel Kluin Signed-off-by: Andrew Morton Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/fsl_booke_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c index bb3d65998e6b..dc93e95b256e 100644 --- a/arch/powerpc/mm/fsl_booke_mmu.c +++ b/arch/powerpc/mm/fsl_booke_mmu.c @@ -161,7 +161,7 @@ unsigned long __init mmu_mapin_ram(void) unsigned long virt = PAGE_OFFSET; phys_addr_t phys = memstart_addr; - while (cam[tlbcam_index] && tlbcam_index < ARRAY_SIZE(cam)) { + while (tlbcam_index < ARRAY_SIZE(cam) && cam[tlbcam_index]) { settlbcam(tlbcam_index, virt, phys, cam[tlbcam_index], PAGE_KERNEL_X, 0); virt += cam[tlbcam_index]; phys += cam[tlbcam_index]; From 6c75933c00049bee59562a18843a4f133ec2bfe4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 7 Aug 2009 09:00:34 +0200 Subject: [PATCH 0160/1662] powerpc/fsl_rio: Add kmalloc NULL tests Check that the result of kmalloc/kzalloc is not NULL before dereferencing it. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ expression *x; identifier f; constant char *C; @@ x = \(kmalloc\|kcalloc\|kzalloc\)(...); ... when != x == NULL when != x != NULL when != (x || ...) ( kfree(x) | f(...,C,...,x,...) | *f(...,x,...) | *x->f ) // Signed-off-by: Julia Lawall Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/fsl_rio.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index cbb3bed75d3c..757a83fe5e59 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -1057,6 +1057,10 @@ int fsl_rio_setup(struct of_device *dev) law_start, law_size); ops = kmalloc(sizeof(struct rio_ops), GFP_KERNEL); + if (!ops) { + rc = -ENOMEM; + goto err_ops; + } ops->lcread = fsl_local_config_read; ops->lcwrite = fsl_local_config_write; ops->cread = fsl_rio_config_read; @@ -1064,6 +1068,10 @@ int fsl_rio_setup(struct of_device *dev) ops->dsend = fsl_rio_doorbell_send; port = kzalloc(sizeof(struct rio_mport), GFP_KERNEL); + if (!port) { + rc = -ENOMEM; + goto err_port; + } port->id = 0; port->index = 0; @@ -1071,7 +1079,7 @@ int fsl_rio_setup(struct of_device *dev) if (!priv) { printk(KERN_ERR "Can't alloc memory for 'priv'\n"); rc = -ENOMEM; - goto err; + goto err_priv; } INIT_LIST_HEAD(&port->dbells); @@ -1169,11 +1177,13 @@ int fsl_rio_setup(struct of_device *dev) return 0; err: - if (priv) - iounmap(priv->regs_win); - kfree(ops); + iounmap(priv->regs_win); kfree(priv); +err_priv: kfree(port); +err_port: + kfree(ops); +err_ops: return rc; } From 8640d3bf71fa0f25adf86527fe69a32372427d4b Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 5 Aug 2009 21:41:12 +0200 Subject: [PATCH 0161/1662] powerpc/ipic: unmask all interrupt sources in case the interrupt controller was used in an earlier life then it is possible it is that some of its sources were used and are still unmask. If the (unmasked) device is active and is creating interrupts (or one interrupts was pending since the interrupts were disabled) then the boot process "ends" very soon. Once external interrupts are enabled, we land in -> do_IRQ -> call ppc_md.get_irq() -> ipic_read() gets the source number -> irq_linear_revmap(source) -> revmap[source] == NO_IRQ -> irq_find_mapping(source) returns NO_IRQ because no source is registered -> source is NO_IRQ, ppc_spurious_interrupts gets incremented, no further action. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/ipic.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 69e2630c9062..ebb7e5844012 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -781,6 +781,9 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) primary_ipic = ipic; irq_set_default_host(primary_ipic->irqhost); + ipic_write(ipic->regs, IPIC_SIMSR_H, 0); + ipic_write(ipic->regs, IPIC_SIMSR_L, 0); + printk ("IPIC (%d IRQ sources) at %p\n", NR_IPIC_INTS, primary_ipic->regs); From 9239c89bc9c51c412b89eb8040077eaaee361326 Mon Sep 17 00:00:00 2001 From: Heiko Schocher Date: Mon, 3 Aug 2009 09:34:50 +0200 Subject: [PATCH 0162/1662] powerpc/82xx: mgcoge - updates for 2.6.32 - add I2C support - add FCC1 and FCC2 support - fix bogus gpio numbering in plattform code Signed-off-by: Heiko Schocher Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mgcoge.dts | 53 +++++++++++++++++++++ arch/powerpc/platforms/82xx/mgcoge.c | 69 ++++++++++++++++++++++++---- 2 files changed, 113 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/boot/dts/mgcoge.dts b/arch/powerpc/boot/dts/mgcoge.dts index 633255a97557..0ce96644176d 100644 --- a/arch/powerpc/boot/dts/mgcoge.dts +++ b/arch/powerpc/boot/dts/mgcoge.dts @@ -162,6 +162,59 @@ fixed-link = <0 0 10 0 0>; }; + i2c@11860 { + compatible = "fsl,mpc8272-i2c", + "fsl,cpm2-i2c"; + reg = <0x11860 0x20 0x8afc 0x2>; + interrupts = <1 8>; + interrupt-parent = <&PIC>; + fsl,cpm-command = <0x29600000>; + #address-cells = <1>; + #size-cells = <0>; + }; + + mdio@10d40 { + compatible = "fsl,cpm2-mdio-bitbang"; + reg = <0x10d00 0x14>; + #address-cells = <1>; + #size-cells = <0>; + fsl,mdio-pin = <12>; + fsl,mdc-pin = <13>; + + phy0: ethernet-phy@0 { + reg = <0x0>; + }; + + phy1: ethernet-phy@1 { + reg = <0x1>; + }; + }; + + /* FCC1 management to switch */ + ethernet@11300 { + device_type = "network"; + compatible = "fsl,cpm2-fcc-enet"; + reg = <0x11300 0x20 0x8400 0x100 0x11390 0x1>; + local-mac-address = [ 00 01 02 03 04 07 ]; + interrupts = <32 8>; + interrupt-parent = <&PIC>; + phy-handle = <&phy0>; + linux,network-index = <1>; + fsl,cpm-command = <0x12000300>; + }; + + /* FCC2 to redundant core unit over backplane */ + ethernet@11320 { + device_type = "network"; + compatible = "fsl,cpm2-fcc-enet"; + reg = <0x11320 0x20 0x8500 0x100 0x113b0 0x1>; + local-mac-address = [ 00 01 02 03 04 08 ]; + interrupts = <33 8>; + interrupt-parent = <&PIC>; + phy-handle = <&phy1>; + linux,network-index = <2>; + fsl,cpm-command = <0x16200300>; + }; }; PIC: interrupt-controller@10c00 { diff --git a/arch/powerpc/platforms/82xx/mgcoge.c b/arch/powerpc/platforms/82xx/mgcoge.c index c2af169c1d1d..7a5de9eb3c73 100644 --- a/arch/powerpc/platforms/82xx/mgcoge.c +++ b/arch/powerpc/platforms/82xx/mgcoge.c @@ -50,16 +50,63 @@ struct cpm_pin { static __initdata struct cpm_pin mgcoge_pins[] = { /* SMC2 */ - {1, 8, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {1, 9, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {0, 8, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {0, 9, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, /* SCC4 */ - {3, 25, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {3, 24, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {3, 9, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {3, 8, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {4, 22, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, - {4, 21, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {2, 25, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 24, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 9, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 8, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {3, 22, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {3, 21, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + + /* FCC1 */ + {0, 14, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {0, 15, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {0, 16, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {0, 17, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {0, 18, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {0, 19, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {0, 20, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {0, 21, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {0, 26, CPM_PIN_INPUT | CPM_PIN_SECONDARY}, + {0, 27, CPM_PIN_INPUT | CPM_PIN_SECONDARY}, + {0, 28, CPM_PIN_OUTPUT | CPM_PIN_SECONDARY}, + {0, 29, CPM_PIN_OUTPUT | CPM_PIN_SECONDARY}, + {0, 30, CPM_PIN_INPUT | CPM_PIN_SECONDARY}, + {0, 31, CPM_PIN_INPUT | CPM_PIN_SECONDARY}, + + {2, 22, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 23, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + + /* FCC2 */ + {1, 18, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 19, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 20, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 21, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 22, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {1, 23, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {1, 24, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {1, 25, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {1, 26, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 27, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 28, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 29, CPM_PIN_OUTPUT | CPM_PIN_SECONDARY}, + {1, 30, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {1, 31, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + + {2, 18, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 19, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + + /* MDC */ + {0, 13, CPM_PIN_OUTPUT | CPM_PIN_GPIO}, + +#if defined(CONFIG_I2C_CPM) + /* I2C */ + {3, 14, CPM_PIN_INPUT | CPM_PIN_SECONDARY | CPM_PIN_OPENDRAIN}, + {3, 15, CPM_PIN_INPUT | CPM_PIN_SECONDARY | CPM_PIN_OPENDRAIN}, +#endif }; static void __init init_ioports(void) @@ -68,12 +115,16 @@ static void __init init_ioports(void) for (i = 0; i < ARRAY_SIZE(mgcoge_pins); i++) { const struct cpm_pin *pin = &mgcoge_pins[i]; - cpm2_set_pin(pin->port - 1, pin->pin, pin->flags); + cpm2_set_pin(pin->port, pin->pin, pin->flags); } cpm2_smc_clk_setup(CPM_CLK_SMC2, CPM_BRG8); cpm2_clk_setup(CPM_CLK_SCC4, CPM_CLK7, CPM_CLK_RX); cpm2_clk_setup(CPM_CLK_SCC4, CPM_CLK8, CPM_CLK_TX); + cpm2_clk_setup(CPM_CLK_FCC1, CPM_CLK10, CPM_CLK_RX); + cpm2_clk_setup(CPM_CLK_FCC1, CPM_CLK9, CPM_CLK_TX); + cpm2_clk_setup(CPM_CLK_FCC2, CPM_CLK13, CPM_CLK_RX); + cpm2_clk_setup(CPM_CLK_FCC2, CPM_CLK14, CPM_CLK_TX); } static void __init mgcoge_setup_arch(void) From 7a626b66bbd17e775cf2210e560b29383110290e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 2 Aug 2009 10:44:53 +0200 Subject: [PATCH 0163/1662] powerpc/ipic: introduce missing kfree Error handling code following a kzalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/ipic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index ebb7e5844012..cb7689c4bfbd 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -735,8 +735,10 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) ipic->irqhost = irq_alloc_host(node, IRQ_HOST_MAP_LINEAR, NR_IPIC_INTS, &ipic_host_ops, 0); - if (ipic->irqhost == NULL) + if (ipic->irqhost == NULL) { + kfree(ipic); return NULL; + } ipic->regs = ioremap(res.start, res.end - res.start + 1); From 3475dd8a68a7c705bee88b143422ba02cb9a796b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 1 Aug 2009 10:52:51 +0200 Subject: [PATCH 0164/1662] powerpc/qe: introduce missing kfree Error handling code following a kzalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Acked-by: Timur Tabi Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/qe_lib/qe_ic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index 074905c3ee5a..3faa42e03a85 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -339,8 +339,10 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, qe_ic->irqhost = irq_alloc_host(node, IRQ_HOST_MAP_LINEAR, NR_QE_IC_INTS, &qe_ic_host_ops, 0); - if (qe_ic->irqhost == NULL) + if (qe_ic->irqhost == NULL) { + kfree(qe_ic); return; + } qe_ic->regs = ioremap(res.start, res.end - res.start + 1); @@ -352,6 +354,7 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, if (qe_ic->virq_low == NO_IRQ) { printk(KERN_ERR "Failed to map QE_IC low IRQ\n"); + kfree(qe_ic); return; } From fda4bd9bac78efd2f9d566c52956d297bc03e8d9 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 25 Jul 2009 01:42:17 +0400 Subject: [PATCH 0165/1662] powerpc/83xx: Add support for MPC8377E-WLAN boards MPC8377E-WLAN are basically RDB boards except: - RAM extended to 512 MB; - NAND flash removed, NOR flash extended to 64 MB; - Vitesse VSC7385 5-port switch removed, RTL8211B PHY added; - Power management MCU removed; - PCI slot removed, another mini-PCI slot added (IRQ routing changed); - USB3300 PHY's ID pin grounded, thus USB port is host-only. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mpc8377_wlan.dts | 464 ++++++++++++++++++++++ arch/powerpc/platforms/83xx/Kconfig | 4 +- arch/powerpc/platforms/83xx/mpc837x_rdb.c | 5 +- 3 files changed, 469 insertions(+), 4 deletions(-) create mode 100644 arch/powerpc/boot/dts/mpc8377_wlan.dts diff --git a/arch/powerpc/boot/dts/mpc8377_wlan.dts b/arch/powerpc/boot/dts/mpc8377_wlan.dts new file mode 100644 index 000000000000..3febc4e91b10 --- /dev/null +++ b/arch/powerpc/boot/dts/mpc8377_wlan.dts @@ -0,0 +1,464 @@ +/* + * MPC8377E WLAN Device Tree Source + * + * Copyright 2007-2009 Freescale Semiconductor Inc. + * Copyright 2009 MontaVista Software, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +/dts-v1/; + +/ { + compatible = "fsl,mpc8377wlan"; + #address-cells = <1>; + #size-cells = <1>; + + aliases { + ethernet0 = &enet0; + ethernet1 = &enet1; + serial0 = &serial0; + serial1 = &serial1; + pci0 = &pci0; + pci1 = &pci1; + pci2 = &pci2; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + PowerPC,8377@0 { + device_type = "cpu"; + reg = <0x0>; + d-cache-line-size = <32>; + i-cache-line-size = <32>; + d-cache-size = <32768>; + i-cache-size = <32768>; + timebase-frequency = <0>; + bus-frequency = <0>; + clock-frequency = <0>; + }; + }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x20000000>; // 512MB at 0 + }; + + localbus@e0005000 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "fsl,mpc8377-elbc", "fsl,elbc", "simple-bus"; + reg = <0xe0005000 0x1000>; + interrupts = <77 0x8>; + interrupt-parent = <&ipic>; + ranges = <0x0 0x0 0xfc000000 0x04000000>; + + flash@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x4000000>; + bank-width = <2>; + device-width = <1>; + + partition@0 { + reg = <0 0x8000>; + label = "u-boot"; + read-only; + }; + + partition@a0000 { + reg = <0xa0000 0x300000>; + label = "kernel"; + }; + + partition@3a0000 { + reg = <0x3a0000 0x3c60000>; + label = "rootfs"; + }; + }; + }; + + immr@e0000000 { + #address-cells = <1>; + #size-cells = <1>; + device_type = "soc"; + compatible = "simple-bus"; + ranges = <0x0 0xe0000000 0x00100000>; + reg = <0xe0000000 0x00000200>; + bus-frequency = <0>; + + wdt@200 { + device_type = "watchdog"; + compatible = "mpc83xx_wdt"; + reg = <0x200 0x100>; + }; + + gpio1: gpio-controller@c00 { + #gpio-cells = <2>; + compatible = "fsl,mpc8377-gpio", "fsl,mpc8349-gpio"; + reg = <0xc00 0x100>; + interrupts = <74 0x8>; + interrupt-parent = <&ipic>; + gpio-controller; + }; + + gpio2: gpio-controller@d00 { + #gpio-cells = <2>; + compatible = "fsl,mpc8377-gpio", "fsl,mpc8349-gpio"; + reg = <0xd00 0x100>; + interrupts = <75 0x8>; + interrupt-parent = <&ipic>; + gpio-controller; + }; + + sleep-nexus { + #address-cells = <1>; + #size-cells = <1>; + compatible = "simple-bus"; + sleep = <&pmc 0x0c000000>; + ranges; + + i2c@3000 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <0>; + compatible = "fsl-i2c"; + reg = <0x3000 0x100>; + interrupts = <14 0x8>; + interrupt-parent = <&ipic>; + dfsrr; + + at24@50 { + compatible = "at24,24c256"; + reg = <0x50>; + }; + + rtc@68 { + compatible = "dallas,ds1339"; + reg = <0x68>; + }; + }; + + sdhci@2e000 { + compatible = "fsl,mpc8377-esdhc", "fsl,esdhc"; + reg = <0x2e000 0x1000>; + interrupts = <42 0x8>; + interrupt-parent = <&ipic>; + clock-frequency = <133333333>; + }; + }; + + i2c@3100 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + compatible = "fsl-i2c"; + reg = <0x3100 0x100>; + interrupts = <15 0x8>; + interrupt-parent = <&ipic>; + dfsrr; + }; + + spi@7000 { + cell-index = <0>; + compatible = "fsl,spi"; + reg = <0x7000 0x1000>; + interrupts = <16 0x8>; + interrupt-parent = <&ipic>; + mode = "cpu"; + }; + + dma@82a8 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,mpc8377-dma", "fsl,elo-dma"; + reg = <0x82a8 4>; + ranges = <0 0x8100 0x1a8>; + interrupt-parent = <&ipic>; + interrupts = <71 8>; + cell-index = <0>; + dma-channel@0 { + compatible = "fsl,mpc8377-dma-channel", "fsl,elo-dma-channel"; + reg = <0 0x80>; + cell-index = <0>; + interrupt-parent = <&ipic>; + interrupts = <71 8>; + }; + dma-channel@80 { + compatible = "fsl,mpc8377-dma-channel", "fsl,elo-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&ipic>; + interrupts = <71 8>; + }; + dma-channel@100 { + compatible = "fsl,mpc8377-dma-channel", "fsl,elo-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&ipic>; + interrupts = <71 8>; + }; + dma-channel@180 { + compatible = "fsl,mpc8377-dma-channel", "fsl,elo-dma-channel"; + reg = <0x180 0x28>; + cell-index = <3>; + interrupt-parent = <&ipic>; + interrupts = <71 8>; + }; + }; + + usb@23000 { + compatible = "fsl-usb2-dr"; + reg = <0x23000 0x1000>; + #address-cells = <1>; + #size-cells = <0>; + interrupt-parent = <&ipic>; + interrupts = <38 0x8>; + phy_type = "ulpi"; + sleep = <&pmc 0x00c00000>; + }; + + enet0: ethernet@24000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <0>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x24000 0x1000>; + ranges = <0x0 0x24000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <32 0x8 33 0x8 34 0x8>; + phy-connection-type = "mii"; + interrupt-parent = <&ipic>; + tbi-handle = <&tbi0>; + phy-handle = <&phy2>; + sleep = <&pmc 0xc0000000>; + fsl,magic-packet; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-mdio"; + reg = <0x520 0x20>; + + phy2: ethernet-phy@2 { + interrupt-parent = <&ipic>; + interrupts = <17 0x8>; + reg = <0x2>; + device_type = "ethernet-phy"; + }; + + phy3: ethernet-phy@3 { + interrupt-parent = <&ipic>; + interrupts = <18 0x8>; + reg = <0x3>; + device_type = "ethernet-phy"; + }; + + tbi0: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + enet1: ethernet@25000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <1>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x25000 0x1000>; + ranges = <0x0 0x25000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <35 0x8 36 0x8 37 0x8>; + phy-connection-type = "mii"; + interrupt-parent = <&ipic>; + phy-handle = <&phy3>; + tbi-handle = <&tbi1>; + sleep = <&pmc 0x30000000>; + fsl,magic-packet; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-tbi"; + reg = <0x520 0x20>; + + tbi1: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + serial0: serial@4500 { + cell-index = <0>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4500 0x100>; + clock-frequency = <0>; + interrupts = <9 0x8>; + interrupt-parent = <&ipic>; + }; + + serial1: serial@4600 { + cell-index = <1>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4600 0x100>; + clock-frequency = <0>; + interrupts = <10 0x8>; + interrupt-parent = <&ipic>; + }; + + crypto@30000 { + compatible = "fsl,sec3.0", "fsl,sec2.4", "fsl,sec2.2", + "fsl,sec2.1", "fsl,sec2.0"; + reg = <0x30000 0x10000>; + interrupts = <11 0x8>; + interrupt-parent = <&ipic>; + fsl,num-channels = <4>; + fsl,channel-fifo-len = <24>; + fsl,exec-units-mask = <0x9fe>; + fsl,descriptor-types-mask = <0x3ab0ebf>; + sleep = <&pmc 0x03000000>; + }; + + sata@18000 { + compatible = "fsl,mpc8377-sata", "fsl,pq-sata"; + reg = <0x18000 0x1000>; + interrupts = <44 0x8>; + interrupt-parent = <&ipic>; + sleep = <&pmc 0x000000c0>; + }; + + sata@19000 { + compatible = "fsl,mpc8377-sata", "fsl,pq-sata"; + reg = <0x19000 0x1000>; + interrupts = <45 0x8>; + interrupt-parent = <&ipic>; + sleep = <&pmc 0x00000030>; + }; + + /* IPIC + * interrupts cell = + * sense values match linux IORESOURCE_IRQ_* defines: + * sense == 8: Level, low assertion + * sense == 2: Edge, high-to-low change + */ + ipic: interrupt-controller@700 { + compatible = "fsl,ipic"; + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <2>; + reg = <0x700 0x100>; + }; + + pmc: power@b00 { + compatible = "fsl,mpc8377-pmc", "fsl,mpc8349-pmc"; + reg = <0xb00 0x100 0xa00 0x100>; + interrupts = <80 0x8>; + interrupt-parent = <&ipic>; + }; + }; + + pci0: pci@e0008500 { + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = < + /* IRQ5 = 21 = 0x15, IRQ6 = 0x16, IRQ7 = 23 = 0x17 */ + + /* IDSEL AD14 IRQ6 inta */ + 0x7000 0x0 0x0 0x1 &ipic 22 0x8 + + /* IDSEL AD15 IRQ5 inta */ + 0x7800 0x0 0x0 0x1 &ipic 21 0x8>; + interrupt-parent = <&ipic>; + interrupts = <66 0x8>; + bus-range = <0 0>; + ranges = <0x02000000 0x0 0x90000000 0x90000000 0x0 0x10000000 + 0x42000000 0x0 0x80000000 0x80000000 0x0 0x10000000 + 0x01000000 0x0 0x00000000 0xe0300000 0x0 0x00100000>; + sleep = <&pmc 0x00010000>; + clock-frequency = <66666666>; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xe0008500 0x100 /* internal registers */ + 0xe0008300 0x8>; /* config space access registers */ + compatible = "fsl,mpc8349-pci"; + device_type = "pci"; + }; + + pci1: pcie@e0009000 { + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + device_type = "pci"; + compatible = "fsl,mpc8377-pcie", "fsl,mpc8314-pcie"; + reg = <0xe0009000 0x00001000>; + ranges = <0x02000000 0 0xa8000000 0xa8000000 0 0x10000000 + 0x01000000 0 0x00000000 0xb8000000 0 0x00800000>; + bus-range = <0 255>; + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = <0 0 0 1 &ipic 1 8 + 0 0 0 2 &ipic 1 8 + 0 0 0 3 &ipic 1 8 + 0 0 0 4 &ipic 1 8>; + sleep = <&pmc 0x00300000>; + clock-frequency = <0>; + + pcie@0 { + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + reg = <0 0 0 0 0>; + ranges = <0x02000000 0 0xa8000000 + 0x02000000 0 0xa8000000 + 0 0x10000000 + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00800000>; + }; + }; + + pci2: pcie@e000a000 { + #address-cells = <3>; + #size-cells = <2>; + #interrupt-cells = <1>; + device_type = "pci"; + compatible = "fsl,mpc8377-pcie", "fsl,mpc8314-pcie"; + reg = <0xe000a000 0x00001000>; + ranges = <0x02000000 0 0xc8000000 0xc8000000 0 0x10000000 + 0x01000000 0 0x00000000 0xd8000000 0 0x00800000>; + bus-range = <0 255>; + interrupt-map-mask = <0xf800 0 0 7>; + interrupt-map = <0 0 0 1 &ipic 2 8 + 0 0 0 2 &ipic 2 8 + 0 0 0 3 &ipic 2 8 + 0 0 0 4 &ipic 2 8>; + sleep = <&pmc 0x000c0000>; + clock-frequency = <0>; + + pcie@0 { + #address-cells = <3>; + #size-cells = <2>; + device_type = "pci"; + reg = <0 0 0 0 0>; + ranges = <0x02000000 0 0xc8000000 + 0x02000000 0 0xc8000000 + 0 0x10000000 + 0x01000000 0 0x00000000 + 0x01000000 0 0x00000000 + 0 0x00800000>; + }; + }; +}; diff --git a/arch/powerpc/platforms/83xx/Kconfig b/arch/powerpc/platforms/83xx/Kconfig index 083ebee9a16d..f49a2548c5ff 100644 --- a/arch/powerpc/platforms/83xx/Kconfig +++ b/arch/powerpc/platforms/83xx/Kconfig @@ -75,11 +75,11 @@ config MPC837x_MDS This option enables support for the MPC837x MDS Processor Board. config MPC837x_RDB - bool "Freescale MPC837x RDB" + bool "Freescale MPC837x RDB/WLAN" select DEFAULT_UIMAGE select PPC_MPC837x help - This option enables support for the MPC837x RDB Board. + This option enables support for the MPC837x RDB and WLAN Boards. config SBC834x bool "Wind River SBC834x" diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c index 76f3b32a155e..91d19ab3a794 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c @@ -86,11 +86,12 @@ static int __init mpc837x_rdb_probe(void) return of_flat_dt_is_compatible(root, "fsl,mpc8377rdb") || of_flat_dt_is_compatible(root, "fsl,mpc8378rdb") || - of_flat_dt_is_compatible(root, "fsl,mpc8379rdb"); + of_flat_dt_is_compatible(root, "fsl,mpc8379rdb") || + of_flat_dt_is_compatible(root, "fsl,mpc8377wlan"); } define_machine(mpc837x_rdb) { - .name = "MPC837x RDB", + .name = "MPC837x RDB/WLAN", .probe = mpc837x_rdb_probe, .setup_arch = mpc837x_rdb_setup_arch, .init_IRQ = mpc837x_rdb_init_IRQ, From c69328d4b93e6885c897155fbacac69a12c5faef Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 9 Jul 2009 22:36:44 +0400 Subject: [PATCH 0166/1662] powerpc/85xx: Add support for I2C EEPROMs on MPC8548CDS boards This patch simply adds four eeprom nodes to MPC8548CDS' device tree. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mpc8548cds.dts | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8548cds.dts b/arch/powerpc/boot/dts/mpc8548cds.dts index 475be1433fe1..4173af387c63 100644 --- a/arch/powerpc/boot/dts/mpc8548cds.dts +++ b/arch/powerpc/boot/dts/mpc8548cds.dts @@ -100,6 +100,21 @@ interrupts = <43 2>; interrupt-parent = <&mpic>; dfsrr; + + eeprom@50 { + compatible = "atmel,24c64"; + reg = <0x50>; + }; + + eeprom@56 { + compatible = "atmel,24c64"; + reg = <0x56>; + }; + + eeprom@57 { + compatible = "atmel,24c64"; + reg = <0x57>; + }; }; i2c@3100 { @@ -111,6 +126,11 @@ interrupts = <43 2>; interrupt-parent = <&mpic>; dfsrr; + + eeprom@50 { + compatible = "atmel,24c64"; + reg = <0x50>; + }; }; dma@21300 { From 89f3729642cf33bfbc742b85e134936b562f9731 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Sat, 25 Jul 2009 01:42:30 +0400 Subject: [PATCH 0167/1662] powerpc/83xx: Add eSDHC support for MPC837xE-RDB/WLAN boards Actually, the support is already there, but it requires newer U-Boots (to fill-in clock-frequency, and setup pin multiplexing). Though, it appears that on RDB boards USBB pins aren't multiplexed between USB and eSDHC (unlike MDS boards, where USB and eSDHC share pctl and pwrfault pins). So, for RDB boards we can safely setup pinmux and manually fill-in clock-frequency, thus making eSDHC work even with older u-boots. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/mpc8377_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8378_rdb.dts | 2 +- arch/powerpc/boot/dts/mpc8379_rdb.dts | 2 +- arch/powerpc/platforms/83xx/mpc837x_rdb.c | 23 +++++++++++++++++++++++ arch/powerpc/platforms/83xx/mpc83xx.h | 4 ++++ 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/dts/mpc8377_rdb.dts b/arch/powerpc/boot/dts/mpc8377_rdb.dts index 4f06dbc0d27e..28e022ac4179 100644 --- a/arch/powerpc/boot/dts/mpc8377_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8377_rdb.dts @@ -174,7 +174,7 @@ interrupts = <42 0x8>; interrupt-parent = <&ipic>; /* Filled in by U-Boot */ - clock-frequency = <0>; + clock-frequency = <111111111>; }; }; diff --git a/arch/powerpc/boot/dts/mpc8378_rdb.dts b/arch/powerpc/boot/dts/mpc8378_rdb.dts index aabf3437cadf..a11ead8214b4 100644 --- a/arch/powerpc/boot/dts/mpc8378_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8378_rdb.dts @@ -174,7 +174,7 @@ interrupts = <42 0x8>; interrupt-parent = <&ipic>; /* Filled in by U-Boot */ - clock-frequency = <0>; + clock-frequency = <111111111>; }; }; diff --git a/arch/powerpc/boot/dts/mpc8379_rdb.dts b/arch/powerpc/boot/dts/mpc8379_rdb.dts index 9b1da864d890..e35dfba587c8 100644 --- a/arch/powerpc/boot/dts/mpc8379_rdb.dts +++ b/arch/powerpc/boot/dts/mpc8379_rdb.dts @@ -172,7 +172,7 @@ interrupts = <42 0x8>; interrupt-parent = <&ipic>; /* Filled in by U-Boot */ - clock-frequency = <0>; + clock-frequency = <111111111>; }; }; diff --git a/arch/powerpc/platforms/83xx/mpc837x_rdb.c b/arch/powerpc/platforms/83xx/mpc837x_rdb.c index 91d19ab3a794..a1908d261240 100644 --- a/arch/powerpc/platforms/83xx/mpc837x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc837x_rdb.c @@ -17,10 +17,32 @@ #include #include #include +#include #include #include "mpc83xx.h" +static void mpc837x_rdb_sd_cfg(void) +{ + void __iomem *im; + + im = ioremap(get_immrbase(), 0x1000); + if (!im) { + WARN_ON(1); + return; + } + + /* + * On RDB boards (in contrast to MDS) USBB pins are used for SD only, + * so we can safely mux them away from the USB block. + */ + clrsetbits_be32(im + MPC83XX_SICRL_OFFS, MPC837X_SICRL_USBB_MASK, + MPC837X_SICRL_SD); + clrsetbits_be32(im + MPC83XX_SICRH_OFFS, MPC837X_SICRH_SPI_MASK, + MPC837X_SICRH_SD); + iounmap(im); +} + /* ************************************************************************ * * Setup the architecture @@ -42,6 +64,7 @@ static void __init mpc837x_rdb_setup_arch(void) mpc83xx_add_bridge(np); #endif mpc837x_usb_cfg(); + mpc837x_rdb_sd_cfg(); } static struct of_device_id mpc837x_ids[] = { diff --git a/arch/powerpc/platforms/83xx/mpc83xx.h b/arch/powerpc/platforms/83xx/mpc83xx.h index d1dc5b0b4fbf..0fea8811d45b 100644 --- a/arch/powerpc/platforms/83xx/mpc83xx.h +++ b/arch/powerpc/platforms/83xx/mpc83xx.h @@ -30,6 +30,8 @@ #define MPC8315_SICRL_USB_ULPI 0x00000054 #define MPC837X_SICRL_USB_MASK 0xf0000000 #define MPC837X_SICRL_USB_ULPI 0x50000000 +#define MPC837X_SICRL_USBB_MASK 0x30000000 +#define MPC837X_SICRL_SD 0x20000000 /* system i/o configuration register high */ #define MPC83XX_SICRH_OFFS 0x118 @@ -38,6 +40,8 @@ #define MPC831X_SICRH_USB_ULPI 0x000000a0 #define MPC8315_SICRH_USB_MASK 0x0000ff00 #define MPC8315_SICRH_USB_ULPI 0x00000000 +#define MPC837X_SICRH_SPI_MASK 0x00000003 +#define MPC837X_SICRH_SD 0x00000001 /* USB Control Register */ #define FSL_USB2_CONTROL_OFFS 0x500 From 2eaa50e9670ecab9712317723cb7836a4da1c0dc Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Tue, 30 Jun 2009 15:32:26 +0100 Subject: [PATCH 0168/1662] powerpc/86xx: Correct reading of information presented in cpuinfo /proc/cpuinfo should be showing the boards revision and the revision of the FPGA fitted. The functions currently used to access this information as incorrect. Additionally the VME geographical address of the PPC9A and it's status as system contoller are available in the board registers. Show these in cpuinfo. Signed-off-by: Martyn Welch Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/86xx/gef_ppc9a.c | 37 ++++++++++++++++++++----- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/86xx/gef_ppc9a.c b/arch/powerpc/platforms/86xx/gef_ppc9a.c index 2efa052975e6..287f7bd17dd9 100644 --- a/arch/powerpc/platforms/86xx/gef_ppc9a.c +++ b/arch/powerpc/platforms/86xx/gef_ppc9a.c @@ -102,8 +102,8 @@ static unsigned int gef_ppc9a_get_pcb_rev(void) { unsigned int reg; - reg = ioread32(ppc9a_regs); - return (reg >> 8) & 0xff; + reg = ioread32be(ppc9a_regs); + return (reg >> 16) & 0xff; } /* Return the board (software) revision */ @@ -111,8 +111,8 @@ static unsigned int gef_ppc9a_get_board_rev(void) { unsigned int reg; - reg = ioread32(ppc9a_regs); - return (reg >> 16) & 0xff; + reg = ioread32be(ppc9a_regs); + return (reg >> 8) & 0xff; } /* Return the FPGA revision */ @@ -120,8 +120,26 @@ static unsigned int gef_ppc9a_get_fpga_rev(void) { unsigned int reg; - reg = ioread32(ppc9a_regs); - return (reg >> 24) & 0xf; + reg = ioread32be(ppc9a_regs); + return reg & 0xf; +} + +/* Return VME Geographical Address */ +static unsigned int gef_ppc9a_get_vme_geo_addr(void) +{ + unsigned int reg; + + reg = ioread32be(ppc9a_regs + 0x4); + return reg & 0x1f; +} + +/* Return VME System Controller Status */ +static unsigned int gef_ppc9a_get_vme_is_syscon(void) +{ + unsigned int reg; + + reg = ioread32be(ppc9a_regs + 0x4); + return (reg >> 9) & 0x1; } static void gef_ppc9a_show_cpuinfo(struct seq_file *m) @@ -131,10 +149,15 @@ static void gef_ppc9a_show_cpuinfo(struct seq_file *m) seq_printf(m, "Vendor\t\t: GE Fanuc Intelligent Platforms\n"); seq_printf(m, "Revision\t: %u%c\n", gef_ppc9a_get_pcb_rev(), - ('A' + gef_ppc9a_get_board_rev() - 1)); + ('A' + gef_ppc9a_get_board_rev())); seq_printf(m, "FPGA Revision\t: %u\n", gef_ppc9a_get_fpga_rev()); seq_printf(m, "SVR\t\t: 0x%x\n", svid); + + seq_printf(m, "VME geo. addr\t: %u\n", gef_ppc9a_get_vme_geo_addr()); + + seq_printf(m, "VME syscon\t: %s\n", + gef_ppc9a_get_vme_is_syscon() ? "yes" : "no"); } static void __init gef_ppc9a_nec_fixup(struct pci_dev *pdev) From 8798b9df26bbc7f3cee686483a0bac68b666bf8f Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Tue, 30 Jun 2009 15:32:38 +0100 Subject: [PATCH 0169/1662] powerpc/86xx: Enable XMC site on GE Fanuc SBC310 This patch enables the XMC (PCIe daughter card) site on the SBC310. STG enter the description for the patch above. Signed-off-by: Martyn Welch Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/gef_sbc310.dts | 37 ++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/powerpc/boot/dts/gef_sbc310.dts b/arch/powerpc/boot/dts/gef_sbc310.dts index 0f4c9ec2c3a6..7810ea995541 100644 --- a/arch/powerpc/boot/dts/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/gef_sbc310.dts @@ -376,4 +376,41 @@ 0x0 0x00400000>; }; }; + + pci1: pcie@fef09000 { + compatible = "fsl,mpc8641-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0xfef09000 0x1000>; + bus-range = <0x0 0xff>; + ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000 + 0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <0x19 0x2>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2 + >; + + pcie@0 { + reg = <0 0 0 0 0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x02000000 0x0 0xc0000000 + 0x02000000 0x0 0xc0000000 + 0x0 0x20000000 + + 0x01000000 0x0 0x00000000 + 0x01000000 0x0 0x00000000 + 0x0 0x00400000>; + }; + }; + }; From 433abcdf321322495d83c8b571bdf7134622c734 Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Wed, 29 Jul 2009 22:13:45 +0000 Subject: [PATCH 0170/1662] powerpc/86xx: Update GE Fanuc sbc310 DTS Update GE Fanuc DTS to match the alterations suggested during the merge of the ppc9a DTS in commit 740d36ae6344f38c4da64c2ede765d7d2dd1f132 Signed-off-by: Martyn Welch Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/dts/gef_sbc310.dts | 29 +++++++++++++--------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/boot/dts/gef_sbc310.dts b/arch/powerpc/boot/dts/gef_sbc310.dts index 7810ea995541..2107d3c7cfe1 100644 --- a/arch/powerpc/boot/dts/gef_sbc310.dts +++ b/arch/powerpc/boot/dts/gef_sbc310.dts @@ -83,34 +83,34 @@ /* flash@0,0 is a mirror of part of the memory in flash@1,0 flash@0,0 { - compatible = "cfi-flash"; - reg = <0 0 0x01000000>; + compatible = "gef,sbc310-firmware-mirror", "cfi-flash"; + reg = <0x0 0x0 0x01000000>; bank-width = <2>; device-width = <2>; #address-cells = <1>; #size-cells = <1>; partition@0 { label = "firmware"; - reg = <0x00000000 0x01000000>; + reg = <0x0 0x01000000>; read-only; }; }; */ flash@1,0 { - compatible = "cfi-flash"; - reg = <1 0 0x8000000>; + compatible = "gef,sbc310-paged-flash", "cfi-flash"; + reg = <0x1 0x0 0x8000000>; bank-width = <2>; device-width = <2>; #address-cells = <1>; #size-cells = <1>; partition@0 { label = "user"; - reg = <0x00000000 0x07800000>; + reg = <0x0 0x7800000>; }; partition@7800000 { label = "firmware"; - reg = <0x07800000 0x00800000>; + reg = <0x7800000 0x800000>; read-only; }; }; @@ -121,18 +121,16 @@ }; wdt@4,2000 { - #interrupt-cells = <2>; - device_type = "watchdog"; - compatible = "gef,fpga-wdt"; + compatible = "gef,sbc310-fpga-wdt", "gef,fpga-wdt-1.00", + "gef,fpga-wdt"; reg = <0x4 0x2000 0x8>; interrupts = <0x1a 0x4>; interrupt-parent = <&gef_pic>; }; /* wdt@4,2010 { - #interrupt-cells = <2>; - device_type = "watchdog"; - compatible = "gef,fpga-wdt"; + compatible = "gef,sbc310-fpga-wdt", "gef,fpga-wdt-1.00", + "gef,fpga-wdt"; reg = <0x4 0x2010 0x8>; interrupts = <0x1b 0x4>; interrupt-parent = <&gef_pic>; @@ -141,7 +139,7 @@ gef_pic: pic@4,4000 { #interrupt-cells = <1>; interrupt-controller; - compatible = "gef,fpga-pic"; + compatible = "gef,sbc310-fpga-pic", "gef,fpga-pic"; reg = <0x4 0x4000 0x20>; interrupts = <0x8 0x9>; @@ -161,7 +159,7 @@ #size-cells = <1>; #interrupt-cells = <2>; device_type = "soc"; - compatible = "simple-bus"; + compatible = "fsl,mpc8641-soc", "simple-bus"; ranges = <0x0 0xfef00000 0x00100000>; bus-frequency = <33333333>; @@ -412,5 +410,4 @@ 0x0 0x00400000>; }; }; - }; From 797a747a82e23530ee45d2927bf84f3571c1acb2 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 15:21:40 +0000 Subject: [PATCH 0171/1662] powerpc/mm: Fix assert_pte_locked to work properly on uniprocessor Since the pte_lockptr is a spinlock it gets optimized away on uniprocessor builds so using spin_is_locked is not correct. We can use assert_spin_locked instead and get the proper behavior between UP and SMP builds. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cafb2a269542..b6b32487e740 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -254,7 +254,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, addr); BUG_ON(!pmd_present(*pmd)); - BUG_ON(!spin_is_locked(pte_lockptr(mm, pmd))); + assert_spin_locked(pte_lockptr(mm, pmd)); } #endif /* CONFIG_DEBUG_VM */ From ae142e0c52b38e44d28b12f77c6e7faa96f7a069 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 12 Jun 2009 04:31:52 +0000 Subject: [PATCH 0172/1662] powerpc/sputrace: Use the generic event tracer I wrote sputrace before generic tracing infrastrucure was available. Now that we have the generic event tracer we can convert it over and remove a lot of code: 8 files changed, 45 insertions(+), 285 deletions(-) To use it make sure CONFIG_EVENT_TRACING is enabled and then enable the spufs trace channel by echo 1 > /sys/kernel/debug/tracing/events/spufs/spufs_context/enable and then read the trace records using e.g. cat /sys/kernel/debug/tracing/trace Signed-off-by: Christoph Hellwig Acked-by: Jeremy Kerr Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/cell/Kconfig | 7 - arch/powerpc/platforms/cell/spufs/Makefile | 3 +- arch/powerpc/platforms/cell/spufs/context.c | 1 + arch/powerpc/platforms/cell/spufs/file.c | 1 + arch/powerpc/platforms/cell/spufs/sched.c | 2 + arch/powerpc/platforms/cell/spufs/spufs.h | 5 - arch/powerpc/platforms/cell/spufs/sputrace.c | 272 ------------------- arch/powerpc/platforms/cell/spufs/sputrace.h | 39 +++ 8 files changed, 45 insertions(+), 285 deletions(-) delete mode 100644 arch/powerpc/platforms/cell/spufs/sputrace.c create mode 100644 arch/powerpc/platforms/cell/spufs/sputrace.h diff --git a/arch/powerpc/platforms/cell/Kconfig b/arch/powerpc/platforms/cell/Kconfig index 50f17bdd3c16..48cd7d2e1b75 100644 --- a/arch/powerpc/platforms/cell/Kconfig +++ b/arch/powerpc/platforms/cell/Kconfig @@ -80,13 +80,6 @@ config SPU_FS_64K_LS uses 4K pages. This can improve performances of applications using multiple SPEs by lowering the TLB pressure on them. -config SPU_TRACE - tristate "SPU event tracing support" - depends on SPU_FS && MARKERS - help - This option allows reading a trace of spu-related events through - the sputrace file in procfs. - config SPU_BASE bool default n diff --git a/arch/powerpc/platforms/cell/spufs/Makefile b/arch/powerpc/platforms/cell/spufs/Makefile index 99610a6361f2..b93f877ba504 100644 --- a/arch/powerpc/platforms/cell/spufs/Makefile +++ b/arch/powerpc/platforms/cell/spufs/Makefile @@ -4,7 +4,8 @@ spufs-y += inode.o file.o context.o syscalls.o coredump.o spufs-y += sched.o backing_ops.o hw_ops.o run.o gang.o spufs-y += switch.o fault.o lscsa_alloc.o -obj-$(CONFIG_SPU_TRACE) += sputrace.o +# magic for the trace events +CFLAGS_sched.o := -I$(src) # Rules to build switch.o with the help of SPU tool chain SPU_CROSS := spu- diff --git a/arch/powerpc/platforms/cell/spufs/context.c b/arch/powerpc/platforms/cell/spufs/context.c index db5398c0339f..0c87bcd2452a 100644 --- a/arch/powerpc/platforms/cell/spufs/context.c +++ b/arch/powerpc/platforms/cell/spufs/context.c @@ -28,6 +28,7 @@ #include #include #include "spufs.h" +#include "sputrace.h" atomic_t nr_spu_contexts = ATOMIC_INIT(0); diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index d6a519e6e1c1..ab8aef9bb8ea 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -38,6 +38,7 @@ #include #include "spufs.h" +#include "sputrace.h" #define SPUFS_MMAP_4K (PAGE_SIZE == 0x1000) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index f085369301b1..bb5b77c66d05 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -47,6 +47,8 @@ #include #include #include "spufs.h" +#define CREATE_TRACE_POINTS +#include "sputrace.h" struct spu_prio_array { DECLARE_BITMAP(bitmap, MAX_PRIO); diff --git a/arch/powerpc/platforms/cell/spufs/spufs.h b/arch/powerpc/platforms/cell/spufs/spufs.h index ae31573bea4a..c448bac65518 100644 --- a/arch/powerpc/platforms/cell/spufs/spufs.h +++ b/arch/powerpc/platforms/cell/spufs/spufs.h @@ -373,9 +373,4 @@ extern void spu_free_lscsa(struct spu_state *csa); extern void spuctx_switch_state(struct spu_context *ctx, enum spu_utilization_state new_state); -#define spu_context_trace(name, ctx, spu) \ - trace_mark(name, "ctx %p spu %p", ctx, spu); -#define spu_context_nospu_trace(name, ctx) \ - trace_mark(name, "ctx %p", ctx); - #endif diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.c b/arch/powerpc/platforms/cell/spufs/sputrace.c deleted file mode 100644 index d0b1f3f4d9c8..000000000000 --- a/arch/powerpc/platforms/cell/spufs/sputrace.c +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (C) 2007 IBM Deutschland Entwicklung GmbH - * Released under GPL v2. - * - * Partially based on net/ipv4/tcp_probe.c. - * - * Simple tracing facility for spu contexts. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include "spufs.h" - -struct spu_probe { - const char *name; - const char *format; - marker_probe_func *probe_func; -}; - -struct sputrace { - ktime_t tstamp; - int owner_tid; /* owner */ - int curr_tid; - const char *name; - int number; -}; - -static int bufsize __read_mostly = 16384; -MODULE_PARM_DESC(bufsize, "Log buffer size (number of records)"); -module_param(bufsize, int, 0); - - -static DEFINE_SPINLOCK(sputrace_lock); -static DECLARE_WAIT_QUEUE_HEAD(sputrace_wait); -static ktime_t sputrace_start; -static unsigned long sputrace_head, sputrace_tail; -static struct sputrace *sputrace_log; -static int sputrace_logging; - -static int sputrace_used(void) -{ - return (sputrace_head - sputrace_tail) % bufsize; -} - -static inline int sputrace_avail(void) -{ - return bufsize - sputrace_used(); -} - -static int sputrace_sprint(char *tbuf, int n) -{ - const struct sputrace *t = sputrace_log + sputrace_tail % bufsize; - struct timespec tv = - ktime_to_timespec(ktime_sub(t->tstamp, sputrace_start)); - - return snprintf(tbuf, n, - "[%lu.%09lu] %d: %s (ctxthread = %d, spu = %d)\n", - (unsigned long) tv.tv_sec, - (unsigned long) tv.tv_nsec, - t->curr_tid, - t->name, - t->owner_tid, - t->number); -} - -static ssize_t sputrace_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) -{ - int error = 0, cnt = 0; - - if (!buf || len < 0) - return -EINVAL; - - while (cnt < len) { - char tbuf[128]; - int width; - - /* If we have data ready to return, don't block waiting - * for more */ - if (cnt > 0 && sputrace_used() == 0) - break; - - error = wait_event_interruptible(sputrace_wait, - sputrace_used() > 0); - if (error) - break; - - spin_lock(&sputrace_lock); - if (sputrace_head == sputrace_tail) { - spin_unlock(&sputrace_lock); - continue; - } - - width = sputrace_sprint(tbuf, sizeof(tbuf)); - if (width < len) - sputrace_tail = (sputrace_tail + 1) % bufsize; - spin_unlock(&sputrace_lock); - - if (width >= len) - break; - - error = copy_to_user(buf + cnt, tbuf, width); - if (error) - break; - cnt += width; - } - - return cnt == 0 ? error : cnt; -} - -static int sputrace_open(struct inode *inode, struct file *file) -{ - int rc; - - spin_lock(&sputrace_lock); - if (sputrace_logging) { - rc = -EBUSY; - goto out; - } - - sputrace_logging = 1; - sputrace_head = sputrace_tail = 0; - sputrace_start = ktime_get(); - rc = 0; - -out: - spin_unlock(&sputrace_lock); - return rc; -} - -static int sputrace_release(struct inode *inode, struct file *file) -{ - spin_lock(&sputrace_lock); - sputrace_logging = 0; - spin_unlock(&sputrace_lock); - return 0; -} - -static const struct file_operations sputrace_fops = { - .owner = THIS_MODULE, - .open = sputrace_open, - .read = sputrace_read, - .release = sputrace_release, -}; - -static void sputrace_log_item(const char *name, struct spu_context *ctx, - struct spu *spu) -{ - spin_lock(&sputrace_lock); - - if (!sputrace_logging) { - spin_unlock(&sputrace_lock); - return; - } - - if (sputrace_avail() > 1) { - struct sputrace *t = sputrace_log + sputrace_head; - - t->tstamp = ktime_get(); - t->owner_tid = ctx->tid; - t->name = name; - t->curr_tid = current->pid; - t->number = spu ? spu->number : -1; - - sputrace_head = (sputrace_head + 1) % bufsize; - } else { - printk(KERN_WARNING - "sputrace: lost samples due to full buffer.\n"); - } - spin_unlock(&sputrace_lock); - - wake_up(&sputrace_wait); -} - -static void spu_context_event(void *probe_private, void *call_data, - const char *format, va_list *args) -{ - struct spu_probe *p = probe_private; - struct spu_context *ctx; - struct spu *spu; - - ctx = va_arg(*args, struct spu_context *); - spu = va_arg(*args, struct spu *); - - sputrace_log_item(p->name, ctx, spu); -} - -static void spu_context_nospu_event(void *probe_private, void *call_data, - const char *format, va_list *args) -{ - struct spu_probe *p = probe_private; - struct spu_context *ctx; - - ctx = va_arg(*args, struct spu_context *); - - sputrace_log_item(p->name, ctx, NULL); -} - -struct spu_probe spu_probes[] = { - { "spu_bind_context__enter", "ctx %p spu %p", spu_context_event }, - { "spu_unbind_context__enter", "ctx %p spu %p", spu_context_event }, - { "spu_get_idle__enter", "ctx %p", spu_context_nospu_event }, - { "spu_get_idle__found", "ctx %p spu %p", spu_context_event }, - { "spu_get_idle__not_found", "ctx %p", spu_context_nospu_event }, - { "spu_find_victim__enter", "ctx %p", spu_context_nospu_event }, - { "spusched_tick__preempt", "ctx %p spu %p", spu_context_event }, - { "spusched_tick__newslice", "ctx %p", spu_context_nospu_event }, - { "spu_yield__enter", "ctx %p", spu_context_nospu_event }, - { "spu_deactivate__enter", "ctx %p", spu_context_nospu_event }, - { "__spu_deactivate__unload", "ctx %p spu %p", spu_context_event }, - { "spufs_ps_fault__enter", "ctx %p", spu_context_nospu_event }, - { "spufs_ps_fault__sleep", "ctx %p", spu_context_nospu_event }, - { "spufs_ps_fault__wake", "ctx %p spu %p", spu_context_event }, - { "spufs_ps_fault__insert", "ctx %p spu %p", spu_context_event }, - { "spu_acquire_saved__enter", "ctx %p", spu_context_nospu_event }, - { "destroy_spu_context__enter", "ctx %p", spu_context_nospu_event }, - { "spufs_stop_callback__enter", "ctx %p spu %p", spu_context_event }, -}; - -static int __init sputrace_init(void) -{ - struct proc_dir_entry *entry; - int i, error = -ENOMEM; - - sputrace_log = kcalloc(bufsize, sizeof(struct sputrace), GFP_KERNEL); - if (!sputrace_log) - goto out; - - entry = proc_create("sputrace", S_IRUSR, NULL, &sputrace_fops); - if (!entry) - goto out_free_log; - - for (i = 0; i < ARRAY_SIZE(spu_probes); i++) { - struct spu_probe *p = &spu_probes[i]; - - error = marker_probe_register(p->name, p->format, - p->probe_func, p); - if (error) - printk(KERN_INFO "Unable to register probe %s\n", - p->name); - } - - return 0; - -out_free_log: - kfree(sputrace_log); -out: - return -ENOMEM; -} - -static void __exit sputrace_exit(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(spu_probes); i++) - marker_probe_unregister(spu_probes[i].name, - spu_probes[i].probe_func, &spu_probes[i]); - - remove_proc_entry("sputrace", NULL); - kfree(sputrace_log); - marker_synchronize_unregister(); -} - -module_init(sputrace_init); -module_exit(sputrace_exit); - -MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/cell/spufs/sputrace.h b/arch/powerpc/platforms/cell/spufs/sputrace.h new file mode 100644 index 000000000000..db2656aa4103 --- /dev/null +++ b/arch/powerpc/platforms/cell/spufs/sputrace.h @@ -0,0 +1,39 @@ +#if !defined(_TRACE_SPUFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SPUFS_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM spufs + +TRACE_EVENT(spufs_context, + TP_PROTO(struct spu_context *ctx, struct spu *spu, const char *name), + TP_ARGS(ctx, spu, name), + + TP_STRUCT__entry( + __field(const char *, name) + __field(int, owner_tid) + __field(int, number) + ), + + TP_fast_assign( + __entry->name = name; + __entry->owner_tid = ctx->tid; + __entry->number = spu ? spu->number : -1; + ), + + TP_printk("%s (ctxthread = %d, spu = %d)", + __entry->name, __entry->owner_tid, __entry->number) +); + +#define spu_context_trace(name, ctx, spu) \ + trace_spufs_context(ctx, spu, __stringify(name)) +#define spu_context_nospu_trace(name, ctx) \ + trace_spufs_context(ctx, NULL, __stringify(name)) + +#endif /* _TRACE_SPUFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE sputrace +#include From 690a2d074ead1867ca5a1976284ca1c89ebc04c6 Mon Sep 17 00:00:00 2001 From: Martyn Welch Date: Thu, 2 Jul 2009 06:12:18 +0000 Subject: [PATCH 0173/1662] powerpc/nvram: Allow byte length reads from mmio NVRAM driver Add a byte length read and write interface compatible with the nvram_generic driver interface to the mmio driver. Signed-off-by: Martyn Welch Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/sysdev/mmio_nvram.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/powerpc/sysdev/mmio_nvram.c b/arch/powerpc/sysdev/mmio_nvram.c index 7b49633a4bd0..207324209065 100644 --- a/arch/powerpc/sysdev/mmio_nvram.c +++ b/arch/powerpc/sysdev/mmio_nvram.c @@ -53,6 +53,23 @@ static ssize_t mmio_nvram_read(char *buf, size_t count, loff_t *index) return count; } +static unsigned char mmio_nvram_read_val(int addr) +{ + unsigned long flags; + unsigned char val; + + if (addr >= mmio_nvram_len) + return 0xff; + + spin_lock_irqsave(&mmio_nvram_lock, flags); + + val = ioread8(mmio_nvram_start + addr); + + spin_unlock_irqrestore(&mmio_nvram_lock, flags); + + return val; +} + static ssize_t mmio_nvram_write(char *buf, size_t count, loff_t *index) { unsigned long flags; @@ -72,6 +89,19 @@ static ssize_t mmio_nvram_write(char *buf, size_t count, loff_t *index) return count; } +void mmio_nvram_write_val(int addr, unsigned char val) +{ + unsigned long flags; + + if (addr < mmio_nvram_len) { + spin_lock_irqsave(&mmio_nvram_lock, flags); + + iowrite8(val, mmio_nvram_start + addr); + + spin_unlock_irqrestore(&mmio_nvram_lock, flags); + } +} + static ssize_t mmio_nvram_get_size(void) { return mmio_nvram_len; @@ -114,6 +144,8 @@ int __init mmio_nvram_init(void) printk(KERN_INFO "mmio NVRAM, %luk at 0x%lx mapped to %p\n", mmio_nvram_len >> 10, nvram_addr, mmio_nvram_start); + ppc_md.nvram_read_val = mmio_nvram_read_val; + ppc_md.nvram_write_val = mmio_nvram_write_val; ppc_md.nvram_read = mmio_nvram_read; ppc_md.nvram_write = mmio_nvram_write; ppc_md.nvram_size = mmio_nvram_get_size; From 5b36f1deefa63ef71cd3c3933781318ac61c5469 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 12 Jul 2009 10:03:42 +0000 Subject: [PATCH 0174/1662] hvc_console: Drop unnecessary NULL test The result of container_of should not be NULL. In particular, in this case the argument to the enclosing function has passed though INIT_WORK, which dereferences it, implying that its container cannot be NULL. A simplified version of the semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @@ identifier fn,work,x,fld; type T; expression E1,E2; statement S; @@ static fn(struct work_struct *work) { ... when != work = E1 x = container_of(work,T,fld) ... when != x = E2 - if (x == NULL) S ... } // Signed-off-by: Julia Lawall Signed-off-by: Benjamin Herrenschmidt --- drivers/char/hvc_console.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index d97779ef72cb..25ce15bb1c08 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -516,8 +516,6 @@ static void hvc_set_winsz(struct work_struct *work) struct winsize ws; hp = container_of(work, struct hvc_struct, tty_resize); - if (!hp) - return; spin_lock_irqsave(&hp->lock, hvc_flags); if (!hp->tty) { From bbdc16f58ed84523e8991f103dceb586e9053e04 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 15 Jul 2009 15:25:22 +0000 Subject: [PATCH 0175/1662] kmemleak: Allow kmemleak to be built on powerpc Very lightly tested, doesn't crash the kernel. Signed-off-by: Michael Ellerman Acked-by: Catalin Marinas Signed-off-by: Benjamin Herrenschmidt --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 12327b2bb785..d5ca9a5a31c6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -338,7 +338,7 @@ config SLUB_STATS config DEBUG_KMEMLEAK bool "Kernel memory leak detector" - depends on DEBUG_KERNEL && EXPERIMENTAL && (X86 || ARM) && \ + depends on DEBUG_KERNEL && EXPERIMENTAL && (X86 || ARM || PPC) && \ !MEMORY_HOTPLUG select DEBUG_FS if SYSFS select STACKTRACE if STACKTRACE_SUPPORT From a888ad451a96881a7e40f40f717d05f1f3b26ad4 Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Tue, 21 Jul 2009 17:02:31 +0000 Subject: [PATCH 0176/1662] powerpc: ARRAY_SIZE changes in pasemi and powermac code These changes were a direct result of using a semantic patch More information can be found at http://www.emn.fr/x-info/coccinelle/ Signed-off-by: Stoyan Gaydarov Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pasemi/idle.c | 2 +- arch/powerpc/platforms/powermac/feature.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 43911d8b0206..75b296bc51af 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -90,7 +90,7 @@ machine_late_initcall(pasemi, pasemi_idle_init); static int __init idle_param(char *p) { int i; - for (i = 0; i < sizeof(modes)/sizeof(struct sleep_mode); i++) { + for (i = 0; i < ARRAY_SIZE(modes); i++) { if (!strcmp(modes[i].name, p)) { current_mode = i; break; diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index e6c0040ee797..095de3227d83 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -2419,13 +2419,13 @@ static int __init probe_motherboard(void) dt = of_find_node_by_name(NULL, "device-tree"); if (dt != NULL) model = of_get_property(dt, "model", NULL); - for(i=0; model && i<(sizeof(pmac_mb_defs)/sizeof(struct pmac_mb_def)); i++) { + for(i=0; model && i Date: Mon, 27 Jul 2009 22:02:39 +0000 Subject: [PATCH 0177/1662] powerpc/prom_init: Evaluate mem kernel parameter for early allocation Evaluate mem kernel parameter for early memory allocations. If mem is set no allocation in the region above the given boundary is allowed. The current code doesn't take care about this and allocate memory above the given mem boundary. Signed-off-by: Benjamin Krill Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/prom_init.c | 103 +++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d942404779c1..864334b337a3 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -190,6 +190,8 @@ static int __initdata of_platform; static char __initdata prom_cmd_line[COMMAND_LINE_SIZE]; +static unsigned long __initdata prom_memory_limit; + static unsigned long __initdata alloc_top; static unsigned long __initdata alloc_top_high; static unsigned long __initdata alloc_bottom; @@ -484,6 +486,67 @@ static int __init prom_setprop(phandle node, const char *nodename, return call_prom("interpret", 1, 1, (u32)(unsigned long) cmd); } +/* We can't use the standard versions because of RELOC headaches. */ +#define isxdigit(c) (('0' <= (c) && (c) <= '9') \ + || ('a' <= (c) && (c) <= 'f') \ + || ('A' <= (c) && (c) <= 'F')) + +#define isdigit(c) ('0' <= (c) && (c) <= '9') +#define islower(c) ('a' <= (c) && (c) <= 'z') +#define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c)) + +unsigned long prom_strtoul(const char *cp, const char **endp) +{ + unsigned long result = 0, base = 10, value; + + if (*cp == '0') { + base = 8; + cp++; + if (toupper(*cp) == 'X') { + cp++; + base = 16; + } + } + + while (isxdigit(*cp) && + (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) { + result = result * base + value; + cp++; + } + + if (endp) + *endp = cp; + + return result; +} + +unsigned long prom_memparse(const char *ptr, const char **retptr) +{ + unsigned long ret = prom_strtoul(ptr, retptr); + int shift = 0; + + /* + * We can't use a switch here because GCC *may* generate a + * jump table which won't work, because we're not running at + * the address we're linked at. + */ + if ('G' == **retptr || 'g' == **retptr) + shift = 30; + + if ('M' == **retptr || 'm' == **retptr) + shift = 20; + + if ('K' == **retptr || 'k' == **retptr) + shift = 10; + + if (shift) { + ret <<= shift; + (*retptr)++; + } + + return ret; +} + /* * Early parsing of the command line passed to the kernel, used for * "mem=x" and the options that affect the iommu @@ -491,9 +554,8 @@ static int __init prom_setprop(phandle node, const char *nodename, static void __init early_cmdline_parse(void) { struct prom_t *_prom = &RELOC(prom); -#ifdef CONFIG_PPC64 const char *opt; -#endif + char *p; int l = 0; @@ -521,6 +583,15 @@ static void __init early_cmdline_parse(void) RELOC(prom_iommu_force_on) = 1; } #endif + opt = strstr(RELOC(prom_cmd_line), RELOC("mem=")); + if (opt) { + opt += 4; + RELOC(prom_memory_limit) = prom_memparse(opt, (const char **)&opt); +#ifdef CONFIG_PPC64 + /* Align to 16 MB == size of ppc64 large page */ + RELOC(prom_memory_limit) = ALIGN(RELOC(prom_memory_limit), 0x1000000); +#endif + } } #ifdef CONFIG_PPC_PSERIES @@ -1026,6 +1097,29 @@ static void __init prom_init_mem(void) RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end)); } + /* + * If prom_memory_limit is set we reduce the upper limits *except* for + * alloc_top_high. This must be the real top of RAM so we can put + * TCE's up there. + */ + + RELOC(alloc_top_high) = RELOC(ram_top); + + if (RELOC(prom_memory_limit)) { + if (RELOC(prom_memory_limit) <= RELOC(alloc_bottom)) { + prom_printf("Ignoring mem=%x <= alloc_bottom.\n", + RELOC(prom_memory_limit)); + RELOC(prom_memory_limit) = 0; + } else if (RELOC(prom_memory_limit) >= RELOC(ram_top)) { + prom_printf("Ignoring mem=%x >= ram_top.\n", + RELOC(prom_memory_limit)); + RELOC(prom_memory_limit) = 0; + } else { + RELOC(ram_top) = RELOC(prom_memory_limit); + RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(prom_memory_limit)); + } + } + /* * Setup our top alloc point, that is top of RMO or top of * segment 0 when running non-LPAR. @@ -1041,6 +1135,7 @@ static void __init prom_init_mem(void) RELOC(alloc_top_high) = RELOC(ram_top); prom_printf("memory layout at init:\n"); + prom_printf(" memory_limit : %x (16 MB aligned)\n", RELOC(prom_memory_limit)); prom_printf(" alloc_bottom : %x\n", RELOC(alloc_bottom)); prom_printf(" alloc_top : %x\n", RELOC(alloc_top)); prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high)); @@ -2395,6 +2490,10 @@ unsigned long __init prom_init(unsigned long r3, unsigned long r4, /* * Fill in some infos for use by the kernel later on */ + if (RELOC(prom_memory_limit)) + prom_setprop(_prom->chosen, "/chosen", "linux,memory-limit", + &RELOC(prom_memory_limit), + sizeof(prom_memory_limit)); #ifdef CONFIG_PPC64 if (RELOC(prom_iommu_off)) prom_setprop(_prom->chosen, "/chosen", "linux,iommu-off", From 02a606c2c47a2e4714225d0c3513ca03679f01d2 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Thu, 30 Jul 2009 06:02:18 +0000 Subject: [PATCH 0178/1662] powerpc: Missing tests for NULL after ioremap() Missing tests after ioremap() Signed-off-by: Roel Kluin Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powermac/feature.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index 095de3227d83..fbc9bbd74dbd 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -2589,9 +2589,16 @@ static void __init probe_uninorth(void) if (address == 0) return; uninorth_base = ioremap(address, 0x40000); + if (uninorth_base == NULL) + return; uninorth_rev = in_be32(UN_REG(UNI_N_VERSION)); - if (uninorth_maj == 3 || uninorth_maj == 4) + if (uninorth_maj == 3 || uninorth_maj == 4) { u3_ht_base = ioremap(address + U3_HT_CONFIG_BASE, 0x1000); + if (u3_ht_base == NULL) { + iounmap(uninorth_base); + return; + } + } printk(KERN_INFO "Found %s memory controller & host bridge" " @ 0x%08x revision: 0x%02x\n", uninorth_maj == 3 ? "U3" : From 9413c8836a16e9d034928a7f9d3ad81bebd71ce9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 Jul 2009 02:06:42 +0000 Subject: [PATCH 0179/1662] powerpc/cell: Move CBE_IOPTE_* to As doesn't contain any other hardware specific definitions but only interfaces. Reported-by: Arnd Bergmann Signed-off-by: Geert Uytterhoeven Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/cell-regs.h | 11 +++++++++++ arch/powerpc/include/asm/iommu.h | 10 ---------- arch/powerpc/platforms/ps3/mm.c | 2 +- arch/powerpc/platforms/ps3/system-bus.c | 2 +- drivers/block/ps3vram.c | 2 +- drivers/video/ps3fb.c | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/cell-regs.h b/arch/powerpc/include/asm/cell-regs.h index fd6fd00434ef..fdf64fd25950 100644 --- a/arch/powerpc/include/asm/cell-regs.h +++ b/arch/powerpc/include/asm/cell-regs.h @@ -303,6 +303,17 @@ struct cbe_mic_tm_regs { extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np); extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu); + +/* Cell page table entries */ +#define CBE_IOPTE_PP_W 0x8000000000000000ul /* protection: write */ +#define CBE_IOPTE_PP_R 0x4000000000000000ul /* protection: read */ +#define CBE_IOPTE_M 0x2000000000000000ul /* coherency required */ +#define CBE_IOPTE_SO_R 0x1000000000000000ul /* ordering: writes */ +#define CBE_IOPTE_SO_RW 0x1800000000000000ul /* ordering: r & w */ +#define CBE_IOPTE_RPN_Mask 0x07fffffffffff000ul /* RPN */ +#define CBE_IOPTE_H 0x0000000000000800ul /* cache hint */ +#define CBE_IOPTE_IOID_Mask 0x00000000000007fful /* ioid */ + /* some utility functions to deal with SMT */ extern u32 cbe_get_hw_thread_id(int cpu); extern u32 cbe_cpu_to_node(int cpu); diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 7ead7c16fb7c..7464c0daddd1 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -35,16 +35,6 @@ #define IOMMU_PAGE_MASK (~((1 << IOMMU_PAGE_SHIFT) - 1)) #define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE) -/* Cell page table entries */ -#define CBE_IOPTE_PP_W 0x8000000000000000ul /* protection: write */ -#define CBE_IOPTE_PP_R 0x4000000000000000ul /* protection: read */ -#define CBE_IOPTE_M 0x2000000000000000ul /* coherency required */ -#define CBE_IOPTE_SO_R 0x1000000000000000ul /* ordering: writes */ -#define CBE_IOPTE_SO_RW 0x1800000000000000ul /* ordering: r & w */ -#define CBE_IOPTE_RPN_Mask 0x07fffffffffff000ul /* RPN */ -#define CBE_IOPTE_H 0x0000000000000800ul /* cache hint */ -#define CBE_IOPTE_IOID_Mask 0x00000000000007fful /* ioid */ - /* Boot time flags */ extern int iommu_is_off; extern int iommu_force_on; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 846eb8b57fd1..189a25b80735 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -23,8 +23,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 3f763c5284ac..676f989ed4e4 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include "platform.h" diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 095f97e60665..c8753a9ed290 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -13,8 +13,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/drivers/video/ps3fb.c b/drivers/video/ps3fb.c index c0af638fe702..9c0144ee7ae5 100644 --- a/drivers/video/ps3fb.c +++ b/drivers/video/ps3fb.c @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include #include From 14ea58ad797e4e9b7be755aca0fd3925d0713ede Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 1 Aug 2009 22:48:27 +0000 Subject: [PATCH 0180/1662] powerpc: Use DIV_ROUND_CLOSEST in time init code The kernel.h macro DIV_ROUND_CLOSEST performs the computation (x + d/2)/d but is perhaps more readable. The semantic patch that makes this change is as follows: (http://www.emn.fr/x-info/coccinelle/) // @haskernel@ @@ #include @depends on haskernel@ expression x,__divisor; @@ - (((x) + ((__divisor) / 2)) / (__divisor)) + DIV_ROUND_CLOSEST(x,__divisor) // Signed-off-by: Julia Lawall Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index eae4511ceeac..edb1edb36469 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -479,7 +479,8 @@ static int __init iSeries_tb_recal(void) unsigned long tb_ticks = tb - iSeries_recal_tb; unsigned long titan_usec = (titan - iSeries_recal_titan) >> 12; unsigned long new_tb_ticks_per_sec = (tb_ticks * USEC_PER_SEC)/titan_usec; - unsigned long new_tb_ticks_per_jiffy = (new_tb_ticks_per_sec+(HZ/2))/HZ; + unsigned long new_tb_ticks_per_jiffy = + DIV_ROUND_CLOSEST(new_tb_ticks_per_sec, HZ); long tick_diff = new_tb_ticks_per_jiffy - tb_ticks_per_jiffy; char sign = '+'; /* make sure tb_ticks_per_sec and tb_ticks_per_jiffy are consistent */ From fb2881a7134576a6e95a63e3d2f34ea5629db4a1 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Mon, 3 Aug 2009 02:41:42 +0000 Subject: [PATCH 0181/1662] powerpc/macio: Don't the address of an array element before boundchecking Check whether index is within bounds before grabbing the element. Signed-off-by: Roel Kluin Signed-off-by: Benjamin Herrenschmidt --- drivers/macintosh/macio_asic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index a0f68386c12f..588a5b0bc4b5 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -294,10 +294,11 @@ static void macio_setup_interrupts(struct macio_dev *dev) int i = 0, j = 0; for (;;) { - struct resource *res = &dev->interrupt[j]; + struct resource *res; if (j >= MACIO_DEV_COUNT_IRQS) break; + res = &dev->interrupt[j]; irq = irq_of_parse_and_map(np, i++); if (irq == NO_IRQ) break; @@ -321,9 +322,10 @@ static void macio_setup_resources(struct macio_dev *dev, int index; for (index = 0; of_address_to_resource(np, index, &r) == 0; index++) { - struct resource *res = &dev->resource[index]; + struct resource *res; if (index >= MACIO_DEV_COUNT_RESOURCES) break; + res = &dev->resource[index]; *res = r; res->name = dev_name(&dev->ofdev.dev); From 52f072cb084bbb460d3a4ae09f0b6efc3e7e8a8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 4 Aug 2009 11:51:03 +0000 Subject: [PATCH 0182/1662] agp/uninorth: Allow larger aperture sizes on pre-U3 bridges. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the radeon KMS test functionality, I verified that the AGP bridge of the Intrepid2 chipset in my PowerBook supports aperture sizes up to 256M. So allow aperture sizes up to 256M on pre-U3 bridges as well, and bump the default size to 256M. It's possible that older revisions only support smaller sizes, but it'll be easy to verify that with the raden KMS test functionality. Also, there's only a problem on an actual attempt to access the aperture beyond the maximum size supported by the hardware, and non-KMS X still defaults to using only 32M. Also use ARRAY_SIZE for the aperture size arrays. Signed-off-by: Michel Dänzer Signed-off-by: Benjamin Herrenschmidt --- drivers/char/agp/uninorth-agp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index f192c3b9ad41..37ff3bd56d60 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -27,6 +27,8 @@ static int uninorth_rev; static int is_u3; +#define DEFAULT_APERTURE_SIZE 256 +#define DEFAULT_APERTURE_STRING "256" static char *aperture = NULL; static int uninorth_fetch_size(void) @@ -55,7 +57,7 @@ static int uninorth_fetch_size(void) if (!size) { for (i = 0; i < agp_bridge->driver->num_aperture_sizes; i++) - if (values[i].size == 32) + if (values[i].size == DEFAULT_APERTURE_SIZE) break; } @@ -474,13 +476,11 @@ void null_cache_flush(void) /* Setup function */ -static const struct aper_size_info_32 uninorth_sizes[7] = +static const struct aper_size_info_32 uninorth_sizes[] = { -#if 0 /* Not sure uninorth supports that high aperture sizes */ {256, 65536, 6, 64}, {128, 32768, 5, 32}, {64, 16384, 4, 16}, -#endif {32, 8192, 3, 8}, {16, 4096, 2, 4}, {8, 2048, 1, 2}, @@ -491,7 +491,7 @@ static const struct aper_size_info_32 uninorth_sizes[7] = * Not sure that u3 supports that high aperture sizes but it * would strange if it did not :) */ -static const struct aper_size_info_32 u3_sizes[8] = +static const struct aper_size_info_32 u3_sizes[] = { {512, 131072, 7, 128}, {256, 65536, 6, 64}, @@ -507,7 +507,7 @@ const struct agp_bridge_driver uninorth_agp_driver = { .owner = THIS_MODULE, .aperture_sizes = (void *)uninorth_sizes, .size_type = U32_APER_SIZE, - .num_aperture_sizes = 4, + .num_aperture_sizes = ARRAY_SIZE(uninorth_sizes), .configure = uninorth_configure, .fetch_size = uninorth_fetch_size, .cleanup = uninorth_cleanup, @@ -534,7 +534,7 @@ const struct agp_bridge_driver u3_agp_driver = { .owner = THIS_MODULE, .aperture_sizes = (void *)u3_sizes, .size_type = U32_APER_SIZE, - .num_aperture_sizes = 8, + .num_aperture_sizes = ARRAY_SIZE(u3_sizes), .configure = uninorth_configure, .fetch_size = uninorth_fetch_size, .cleanup = uninorth_cleanup, @@ -717,7 +717,7 @@ module_param(aperture, charp, 0); MODULE_PARM_DESC(aperture, "Aperture size, must be power of two between 4MB and an\n" "\t\tupper limit specific to the UniNorth revision.\n" - "\t\tDefault: 32M"); + "\t\tDefault: " DEFAULT_APERTURE_STRING "M"); MODULE_AUTHOR("Ben Herrenschmidt & Paul Mackerras"); MODULE_LICENSE("GPL"); From e8a5f900148d058bce2d7bdce3d6bcbcb40267ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michel=20D=C3=A4nzer?= Date: Tue, 4 Aug 2009 11:51:04 +0000 Subject: [PATCH 0183/1662] agp/uninorth: Simplify cache flushing. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Map the GART table uncached, so we don't always need to flush the CPU caches explicitly after updates. Signed-off-by: Michel Dänzer Signed-off-by: Benjamin Herrenschmidt --- drivers/char/agp/uninorth-agp.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/drivers/char/agp/uninorth-agp.c b/drivers/char/agp/uninorth-agp.c index 37ff3bd56d60..bba29abe917f 100644 --- a/drivers/char/agp/uninorth-agp.c +++ b/drivers/char/agp/uninorth-agp.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -181,8 +182,6 @@ static int uninorth_insert_memory(struct agp_memory *mem, off_t pg_start, } (void)in_le32((volatile u32*)&agp_bridge->gatt_table[pg_start]); mb(); - flush_dcache_range((unsigned long)&agp_bridge->gatt_table[pg_start], - (unsigned long)&agp_bridge->gatt_table[pg_start + mem->page_count]); uninorth_tlbflush(mem); return 0; @@ -226,7 +225,6 @@ static int u3_insert_memory(struct agp_memory *mem, off_t pg_start, int type) (unsigned long)__va(page_to_phys(mem->pages[i]))+0x1000); } mb(); - flush_dcache_range((unsigned long)gp, (unsigned long) &gp[i]); uninorth_tlbflush(mem); return 0; @@ -245,7 +243,6 @@ int u3_remove_memory(struct agp_memory *mem, off_t pg_start, int type) for (i = 0; i < mem->page_count; ++i) gp[i] = 0; mb(); - flush_dcache_range((unsigned long)gp, (unsigned long) &gp[i]); uninorth_tlbflush(mem); return 0; @@ -398,6 +395,7 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) int i; void *temp; struct page *page; + struct page **pages; /* We can't handle 2 level gatt's */ if (bridge->driver->size_type == LVL2_APER_SIZE) @@ -426,21 +424,39 @@ static int uninorth_create_gatt_table(struct agp_bridge_data *bridge) if (table == NULL) return -ENOMEM; + pages = kmalloc((1 << page_order) * sizeof(struct page*), GFP_KERNEL); + if (pages == NULL) + goto enomem; + table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); - for (page = virt_to_page(table); page <= virt_to_page(table_end); page++) + for (page = virt_to_page(table), i = 0; page <= virt_to_page(table_end); + page++, i++) { SetPageReserved(page); + pages[i] = page; + } bridge->gatt_table_real = (u32 *) table; - bridge->gatt_table = (u32 *)table; + /* Need to clear out any dirty data still sitting in caches */ + flush_dcache_range((unsigned long)table, + (unsigned long)(table_end + PAGE_SIZE)); + bridge->gatt_table = vmap(pages, (1 << page_order), 0, PAGE_KERNEL_NCG); + + if (bridge->gatt_table == NULL) + goto enomem; + bridge->gatt_bus_addr = virt_to_gart(table); for (i = 0; i < num_entries; i++) bridge->gatt_table[i] = 0; - flush_dcache_range((unsigned long)table, (unsigned long)table_end); - return 0; + +enomem: + kfree(pages); + if (table) + free_pages((unsigned long)table, page_order); + return -ENOMEM; } static int uninorth_free_gatt_table(struct agp_bridge_data *bridge) @@ -458,6 +474,7 @@ static int uninorth_free_gatt_table(struct agp_bridge_data *bridge) * from the table. */ + vunmap(bridge->gatt_table); table = (char *) bridge->gatt_table_real; table_end = table + ((PAGE_SIZE * (1 << page_order)) - 1); From 6826a57d1abc8ac9f59b24f1a008554c6560a995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Aug 2009 12:24:45 +0000 Subject: [PATCH 0184/1662] powerpc: Switch to asm-generic/hardirq.h hardirq.h on powerpc defines a __last_jiffy_stamp field, but it's not actually used anywhere. Signed-off-by: Christoph Hellwig Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/hardirq.h | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h index 288e14d53b7f..fb3c05a0cbbf 100644 --- a/arch/powerpc/include/asm/hardirq.h +++ b/arch/powerpc/include/asm/hardirq.h @@ -1,29 +1 @@ -#ifndef _ASM_POWERPC_HARDIRQ_H -#define _ASM_POWERPC_HARDIRQ_H -#ifdef __KERNEL__ - -#include -#include - -/* The __last_jiffy_stamp field is needed to ensure that no decrementer - * interrupt is lost on SMP machines. Since on most CPUs it is in the same - * cache line as local_irq_count, it is cheap to access and is also used on UP - * for uniformity. - */ -typedef struct { - unsigned int __softirq_pending; /* set_bit is used on this */ - unsigned int __last_jiffy_stamp; -} ____cacheline_aligned irq_cpustat_t; - -#include /* Standard mappings for irq_cpustat_t above */ - -#define last_jiffy_stamp(cpu) __IRQ_STAT((cpu), __last_jiffy_stamp) - -static inline void ack_bad_irq(int irq) -{ - printk(KERN_CRIT "illegal vector %d received!\n", irq); - BUG(); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_HARDIRQ_H */ +#include From 728656506447b3b349d082a7fb99445f9cb0caaa Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 13:00:37 +0000 Subject: [PATCH 0185/1662] powerpc/hvsi: Avoid calculating possibly-invalid address Check whether index is within bounds prior to calculating a possibly-invalid address. Signed-off-by: Roel Kluin Cc: Bernd Petrovitsch Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- drivers/char/hvsi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c index 2989056a9e39..793b236c9266 100644 --- a/drivers/char/hvsi.c +++ b/drivers/char/hvsi.c @@ -1230,11 +1230,12 @@ static struct tty_driver *hvsi_console_device(struct console *console, static int __init hvsi_console_setup(struct console *console, char *options) { - struct hvsi_struct *hp = &hvsi_ports[console->index]; + struct hvsi_struct *hp; int ret; if (console->index < 0 || console->index >= hvsi_count) return -1; + hp = &hvsi_ports[console->index]; /* give the FSP a chance to change the baud rate when we re-open */ hvsi_close_protocol(hp); From a15098c90df1ac2b1bfe1d33dd1c47063213aa9a Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 9 Aug 2009 19:02:51 +0000 Subject: [PATCH 0186/1662] powerpc: Enable GCOV Make it possible to enable GCOV code coverage measurement on powerpc. Lightly tested on 64-bit, seems to work as expected. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/Makefile | 7 +++++++ arch/powerpc/kernel/vdso32/Makefile | 1 + arch/powerpc/kernel/vdso64/Makefile | 2 ++ arch/powerpc/xmon/Makefile | 2 ++ kernel/gcov/Kconfig | 2 +- 5 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 035946f9d5fb..720e8c3b8e94 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -115,6 +115,13 @@ ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),) obj-y += ppc_save_regs.o endif +# Disable GCOV in odd or sensitive code +GCOV_PROFILE_prom_init.o := n +GCOV_PROFILE_ftrace.o := n +GCOV_PROFILE_machine_kexec_64.o := n +GCOV_PROFILE_machine_kexec_32.o := n +GCOV_PROFILE_kprobes.o := n + extra-$(CONFIG_PPC_FPU) += fpu.o extra-$(CONFIG_ALTIVEC) += vector.o extra-$(CONFIG_PPC64) += entry_64.o diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile index c3d57bd01a88..b54b81688132 100644 --- a/arch/powerpc/kernel/vdso32/Makefile +++ b/arch/powerpc/kernel/vdso32/Makefile @@ -12,6 +12,7 @@ endif targets := $(obj-vdso32) vdso32.so vdso32.so.dbg obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) +GCOV_PROFILE := n EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile index fa7f1b8f3e50..dd0c8e936775 100644 --- a/arch/powerpc/kernel/vdso64/Makefile +++ b/arch/powerpc/kernel/vdso64/Makefile @@ -7,6 +7,8 @@ obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o targets := $(obj-vdso64) vdso64.so vdso64.so.dbg obj-vdso64 := $(addprefix $(obj)/, $(obj-vdso64)) +GCOV_PROFILE := n + EXTRA_CFLAGS := -shared -fno-common -fno-builtin EXTRA_CFLAGS += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ $(call ld-option, -Wl$(comma)--hash-style=sysv) diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 85ab97ab840a..faa81b6a6612 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -2,6 +2,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +GCOV_PROFILE := n + ifdef CONFIG_PPC64 EXTRA_CFLAGS += -mno-minimal-toc endif diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcfaa3d3..654efd09f6a9 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 + depends on S390 || X86 || (PPC && EXPERIMENTAL) default n ---help--- This options activates profiling for the entire kernel. From 903444e4297264ec0959e886695911659edb425c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 9 Aug 2009 19:06:24 +0000 Subject: [PATCH 0187/1662] powerpc/vmlinux.lds: Move _edata down Currently _edata does not include several data sections, this causes the kernel's report of memory usage at boot to not match reality, and also prevents kmemleak from working - because it scan between _sdata and _edata for pointers to allocated memory. This mirrors a similar change made recently to the x86 linker script. Signed-off-by: Michael Ellerman Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/vmlinux.lds.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8ef8a14abc95..3bb09975e342 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -245,10 +245,6 @@ SECTIONS } #endif - . = ALIGN(PAGE_SIZE); - _edata = .; - PROVIDE32 (edata = .); - /* The initial task and kernel stack */ #ifdef CONFIG_PPC32 . = ALIGN(8192); @@ -282,6 +278,10 @@ SECTIONS __nosave_end = .; } + . = ALIGN(PAGE_SIZE); + _edata = .; + PROVIDE32 (edata = .); + /* * And finally the bss */ From 3c15a68880023722fc794018768df556f438ae98 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Aug 2009 17:14:55 +0000 Subject: [PATCH 0188/1662] powerpc: use consistent types in mktree gcc v4.4 currently produces this build warning: arch/powerpc/boot/mktree.c: In function 'main': arch/powerpc/boot/mktree.c:104: warning: dereferencing type-punned pointer will break strict-aliasing rules tmpbuf is only used as an array of unsigned ints, so declare it that way. Signed-off-by: Stephen Rothwell Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/boot/mktree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/boot/mktree.c b/arch/powerpc/boot/mktree.c index c2baae0a3d89..e2ae24340fc8 100644 --- a/arch/powerpc/boot/mktree.c +++ b/arch/powerpc/boot/mktree.c @@ -36,7 +36,7 @@ typedef struct boot_block { } boot_block_t; #define IMGBLK 512 -char tmpbuf[IMGBLK]; +unsigned int tmpbuf[IMGBLK / sizeof(unsigned int)]; int main(int argc, char *argv[]) { @@ -95,13 +95,13 @@ int main(int argc, char *argv[]) /* Assume zImage is an ELF file, and skip the 64K header. */ - if (read(in_fd, tmpbuf, IMGBLK) != IMGBLK) { + if (read(in_fd, tmpbuf, sizeof(tmpbuf)) != sizeof(tmpbuf)) { fprintf(stderr, "%s is too small to be an ELF image\n", argv[1]); exit(4); } - if ((*(unsigned int *)tmpbuf) != htonl(0x7f454c46)) { + if (tmpbuf[0] != htonl(0x7f454c46)) { fprintf(stderr, "%s is not an ELF image\n", argv[1]); exit(4); } @@ -121,11 +121,11 @@ int main(int argc, char *argv[]) } while (nblks-- > 0) { - if (read(in_fd, tmpbuf, IMGBLK) < 0) { + if (read(in_fd, tmpbuf, sizeof(tmpbuf)) < 0) { perror("zImage read"); exit(5); } - cp = (unsigned int *)tmpbuf; + cp = tmpbuf; for (i = 0; i < sizeof(tmpbuf) / sizeof(unsigned int); i++) cksum += *cp++; if (write(out_fd, tmpbuf, sizeof(tmpbuf)) != sizeof(tmpbuf)) { From 8126dec32738421afa362114337331337b4be17f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 20 Aug 2009 20:23:11 +0800 Subject: [PATCH 0189/1662] x86: Fix system crash when loading with "reservetop" parameter The system will die if the kernel is booted with "reservetop" parameter, in present code, parse "reservetop" parameter after early_ioremap_init(), and some function still use early_ioremap() after it. The problem is, "reservetop" parameter can modify 'FIXADDR_TOP', then the virtual address got by early_ioremap() is base on old 'FIXADDR_TOP', but the page mapping is base on new 'FIXADDR_TOP', it will occur page fault, and the IDT is not prepare yet, so, the system is dead. So, put parse_early_param() in the front of early_ioremap_init() in this patch. Signed-off-by: Xiao Guangrong Cc: yinghai@kernel.org Cc: Andrew Morton LKML-Reference: <4A8D402F.4080805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef2..02643cc3bf26 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -711,6 +711,11 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + + parse_early_param(); + /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -793,11 +798,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - - parse_early_param(); - #ifdef CONFIG_X86_64 check_efer(); #endif From 3e0e1e9c5a327d4dba8490d83ef55c0564e6e8a7 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 21 Aug 2009 04:34:45 -0400 Subject: [PATCH 0190/1662] x86: Fix an incorrect argument of reserve_bootmem() This line looks suspicious, because if this is true, then the 'flags' parameter of function reserve_bootmem_generic() will be unused when !CONFIG_NUMA. I don't think this is what we want. Signed-off-by: WANG Cong Cc: Yinghai Lu Cc: akpm@linux-foundation.org LKML-Reference: <20090821083709.5098.52505.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6176fe8f29e0..ea56b8cbb6a6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -796,7 +796,7 @@ int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, return ret; #else - reserve_bootmem(phys, len, BOOTMEM_DEFAULT); + reserve_bootmem(phys, len, flags); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { From 269c861baa2fe7c114c3bc7831292758d29eb336 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:35 -0700 Subject: [PATCH 0191/1662] generic-ipi: Allow cpus not yet online to call smp_call_function with irqs disabled Because of deadlock possiblities smp_call_function() is not allowed to be called with interrupts disabled. Add an exception for the cpu not yet online, as no one else can send smp call function interrupt to this cpu that is not yet online and as such deadlock condition is not possible. Signed-off-by: Suresh Siddha Acked-by: Nick Piggin Signed-off-by: H. Peter Anvin --- kernel/smp.c | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index ad63d8501207..2accdf6712e5 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -176,6 +176,11 @@ void generic_smp_call_function_interrupt(void) struct call_function_data *data; int cpu = get_cpu(); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + /* * Ensure entry is visible on call_function_queue after we have * entered the IPI. See comment in smp_call_function_many. @@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void) unsigned int data_flags; LIST_HEAD(list); + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + spin_lock(&q->lock); list_replace_init(&q->list, &list); spin_unlock(&q->lock); @@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, */ this_cpu = get_cpu(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); if (cpu == this_cpu) { local_irq_save(flags); @@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, { csd_lock(data); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); generic_exec_single(cpu, data, wait); } @@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask, unsigned long flags; int cpu, next_cpu, this_cpu = smp_processor_id(); - /* Can deadlock when called with interrupts disabled */ - WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); From d0af9eed5aa91b6b7b5049cae69e5ea956fd85c3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 19 Aug 2009 18:05:36 -0700 Subject: [PATCH 0192/1662] x86, pat/mtrr: Rendezvous all the cpus for MTRR/PAT init SDM Vol 3a section titled "MTRR considerations in MP systems" specifies the need for synchronizing the logical cpu's while initializing/updating MTRR. Currently Linux kernel does the synchronization of all cpu's only when a single MTRR register is programmed/updated. During an AP online (during boot/cpu-online/resume) where we initialize all the MTRR/PAT registers, we don't follow this synchronization algorithm. This can lead to scenarios where during a dynamic cpu online, that logical cpu is initializing MTRR/PAT with cache disabled (cr0.cd=1) etc while other logical HT sibling continue to run (also with cache disabled because of cr0.cd=1 on its sibling). Starting from Westmere, VMX transitions with cr0.cd=1 don't work properly (because of some VMX performance optimizations) and the above scenario (with one logical cpu doing VMX activity and another logical cpu coming online) can result in system crash. Fix the MTRR initialization by doing rendezvous of all the cpus. During boot and resume, we delay the MTRR/PAT init for APs till all the logical cpu's come online and the rendezvous process at the end of AP's bringup, will initialize the MTRR/PAT for all AP's. For dynamic single cpu online, we synchronize all the logical cpus and do the MTRR/PAT init on the AP that is coming online. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mtrr.h | 7 +++++ arch/x86/kernel/cpu/mtrr/main.c | 46 ++++++++++++++++++++++++++------- arch/x86/kernel/smpboot.c | 14 ++++++++++ arch/x86/power/cpu.c | 2 +- kernel/cpu.c | 14 ++++++++++ 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index a51ada8467de..d5366ec5cb8f 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -121,8 +121,12 @@ extern int mtrr_del_page(int reg, unsigned long base, unsigned long size); extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); +extern void set_mtrr_aps_delayed_init(void); +extern void mtrr_aps_init(void); +extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,9 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) +#define set_mtrr_aps_delayed_init() do {} while (0) +#define mtrr_aps_init() do {} while (0) +#define mtrr_bp_restore() do {} while (0) # endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7af0f88a4163..7339be0aa580 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; +u32 mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -163,7 +164,10 @@ static void ipi_handler(void *info) if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else { + } else if (mtrr_aps_delayed_init) { + /* + * Initialize the MTRRs inaddition to the synchronisation. + */ mtrr_if->set_all(); } @@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); + else if (!mtrr_aps_delayed_init) + mtrr_if->set_all(); /* Wait for the others */ while (atomic_read(&data.count)) @@ -721,9 +727,7 @@ void __init mtrr_bp_init(void) void mtrr_ap_init(void) { - unsigned long flags; - - if (!mtrr_if || !use_intel()) + if (!use_intel() || mtrr_aps_delayed_init) return; /* * Ideally we should hold mtrr_mutex here to avoid mtrr entries @@ -738,11 +742,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - local_irq_save(flags); - - mtrr_if->set_all(); - - local_irq_restore(flags); + set_mtrr(~0U, 0, 0, 0); } /** @@ -753,6 +753,34 @@ void mtrr_save_state(void) smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } +void set_mtrr_aps_delayed_init(void) +{ + if (!use_intel()) + return; + + mtrr_aps_delayed_init = 1; +} + +/* + * MTRR initialization for all AP's + */ +void mtrr_aps_init(void) +{ + if (!use_intel()) + return; + + set_mtrr(~0U, 0, 0, 0); + mtrr_aps_delayed_init = 0; +} + +void mtrr_bp_restore(void) +{ + if (!use_intel()) + return; + + mtrr_if->set_all(); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2fecda69ee64..d720b7e0cf3d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1116,9 +1116,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (is_uv_system()) uv_system_init(); + + set_mtrr_aps_delayed_init(); out: preempt_enable(); } + +void arch_enable_nonboot_cpus_begin(void) +{ + set_mtrr_aps_delayed_init(); +} + +void arch_enable_nonboot_cpus_end(void) +{ + mtrr_aps_init(); +} + /* * Early setup to make printk work. */ @@ -1140,6 +1153,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif check_nmi_watchdog(); + mtrr_aps_init(); } static int __initdata setup_possible_cpus = -1; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b3d20b9cac63..417c9f5b4afa 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -242,7 +242,7 @@ static void __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); - mtrr_ap_init(); + mtrr_bp_restore(); #ifdef CONFIG_X86_OLD_MCE mcheck_init(&boot_cpu_data); diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4ac..f5f9485b8c0f 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -413,6 +413,14 @@ int disable_nonboot_cpus(void) return error; } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} + void __ref enable_nonboot_cpus(void) { int cpu, error; @@ -424,6 +432,9 @@ void __ref enable_nonboot_cpus(void) goto out; printk("Enabling non-boot CPUs ...\n"); + + arch_enable_nonboot_cpus_begin(); + for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { @@ -432,6 +443,9 @@ void __ref enable_nonboot_cpus(void) } printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); } + + arch_enable_nonboot_cpus_end(); + cpumask_clear(frozen_cpus); out: cpu_maps_update_done(); From 5400743db5a06a4e6e298725a2044c40edcb27b9 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 21 Aug 2009 17:00:02 -0700 Subject: [PATCH 0193/1662] x86, mtrr: make mtrr_aps_delayed_init static bool mtr_aps_delayed_init was declared u32 and made global, but it only ever takes boolean values and is only ever used in arch/x86/kernel/cpu/mtrr/main.c. Declare it "static bool" and remove external references. Signed-off-by: H. Peter Anvin Cc: Suresh Siddha --- arch/x86/include/asm/mtrr.h | 1 - arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index d5366ec5cb8f..4365ffdb461f 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -126,7 +126,6 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); -extern u32 mtrr_aps_delayed_init; # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7339be0aa580..84e83de54575 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -58,7 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; -u32 mtrr_aps_delayed_init; +static bool mtrr_aps_delayed_init; static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; @@ -758,7 +758,7 @@ void set_mtrr_aps_delayed_init(void) if (!use_intel()) return; - mtrr_aps_delayed_init = 1; + mtrr_aps_delayed_init = true; } /* @@ -770,7 +770,7 @@ void mtrr_aps_init(void) return; set_mtrr(~0U, 0, 0, 0); - mtrr_aps_delayed_init = 0; + mtrr_aps_delayed_init = false; } void mtrr_bp_restore(void) From 8934210cfe925f0d3e3089c69e9e88021324801b Mon Sep 17 00:00:00 2001 From: Heiko Schocher Date: Fri, 7 Aug 2009 08:41:15 +0200 Subject: [PATCH 0194/1662] powerpc/82xx: mgcoge - updated defconfig - add I2C support - add FCC1 and FCC2 support Signed-off-by: Heiko Schocher Signed-off-by: Kumar Gala --- arch/powerpc/configs/mgcoge_defconfig | 86 +++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/configs/mgcoge_defconfig b/arch/powerpc/configs/mgcoge_defconfig index e9491c1c3f31..30b68bfacebf 100644 --- a/arch/powerpc/configs/mgcoge_defconfig +++ b/arch/powerpc/configs/mgcoge_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.31-rc4 -# Wed Jul 29 23:31:51 2009 +# Linux kernel version: 2.6.31-rc5 +# Fri Aug 7 08:19:15 2009 # # CONFIG_PPC64 is not set @@ -158,6 +158,7 @@ CONFIG_BASE_SMALL=0 # CONFIG_MODULES is not set CONFIG_BLOCK=y CONFIG_LBDAF=y +CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set # @@ -506,6 +507,7 @@ CONFIG_MTD_PHYSMAP_OF=y # CONFIG_MTD_UBI is not set CONFIG_OF_DEVICE=y CONFIG_OF_GPIO=y +CONFIG_OF_I2C=y CONFIG_OF_MDIO=y # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y @@ -582,7 +584,8 @@ CONFIG_PHYLIB=y # CONFIG_STE10XP is not set # CONFIG_LSI_ET1011C_PHY is not set CONFIG_FIXED_PHY=y -# CONFIG_MDIO_BITBANG is not set +CONFIG_MDIO_BITBANG=y +# CONFIG_MDIO_GPIO is not set CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_MACE is not set @@ -608,8 +611,8 @@ CONFIG_MII=y # CONFIG_ATL2 is not set CONFIG_FS_ENET=y CONFIG_FS_ENET_HAS_SCC=y -# CONFIG_FS_ENET_HAS_FCC is not set -# CONFIG_FS_ENET_MDIO_FCC is not set +CONFIG_FS_ENET_HAS_FCC=y +CONFIG_FS_ENET_MDIO_FCC=y # CONFIG_NETDEV_1000 is not set # CONFIG_NETDEV_10000 is not set # CONFIG_TR is not set @@ -680,7 +683,68 @@ CONFIG_HW_RANDOM=y # CONFIG_APPLICOM is not set # CONFIG_RAW_DRIVER is not set CONFIG_DEVPORT=y -# CONFIG_I2C is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_ISCH is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_VIAPRO is not set + +# +# Mac SMBus host controller drivers +# +# CONFIG_I2C_POWERMAC is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CPM=y +# CONFIG_I2C_DESIGNWARE is not set +# CONFIG_I2C_GPIO is not set +# CONFIG_I2C_MPC is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set + +# +# Graphics adapter I2C/DDC channel drivers +# +# CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_PCF8575 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set # @@ -699,6 +763,9 @@ CONFIG_GPIOLIB=y # # I2C GPIO expanders: # +# CONFIG_GPIO_MAX732X is not set +# CONFIG_GPIO_PCA953X is not set +# CONFIG_GPIO_PCF857X is not set # # PCI GPIO expanders: @@ -727,7 +794,14 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_TPS65010 is not set +# CONFIG_TWL4030_CORE is not set # CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_AB3100_CORE is not set # CONFIG_REGULATOR is not set # CONFIG_MEDIA_SUPPORT is not set From fc4bdb35fba1c8f464fd85b94a5059e752fc85d4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Fri, 14 Aug 2009 09:38:34 -0500 Subject: [PATCH 0195/1662] powerpc/booke: Move MMUCSR definition into mmu-book3e.h The MMUCSR is now defined as part of the Book-3E architecture so we can move it into mmu-book3e.h and add some of the additional bits defined by the architecture specs. Signed-off-by: Kumar Gala --- arch/powerpc/include/asm/mmu-book3e.h | 12 ++++++++++++ arch/powerpc/include/asm/reg_booke.h | 6 ------ arch/powerpc/mm/tlb_nohash_low.S | 2 -- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index d74580469361..74695816205c 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -114,6 +114,18 @@ #define MAS7_RPN 0xFFFFFFFF +/* Bit definitions for MMUCSR0 */ +#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ +#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ +#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ +#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ +#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ + MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) +#define MMUCSR0_TLB0PS 0x00000780 /* TLB0 Page Size */ +#define MMUCSR0_TLB1PS 0x00007800 /* TLB1 Page Size */ +#define MMUCSR0_TLB2PS 0x00078000 /* TLB2 Page Size */ +#define MMUCSR0_TLB3PS 0x00780000 /* TLB3 Page Size */ + /* TLBnCFG encoding */ #define TLBnCFG_N_ENTRY 0x00000fff /* number of entries */ #define TLBnCFG_HES 0x00002000 /* HW select supported */ diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 2c9c706e6448..9bb81d99b765 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -430,12 +430,6 @@ #define L2CSR0_L2LOA 0x00000080 /* L2 Cache Lock Overflow Allocate */ #define L2CSR0_L2LO 0x00000020 /* L2 Cache Lock Overflow */ -/* Bit definitions for MMUCSR0 */ -#define MMUCSR0_TLB1FI 0x00000002 /* TLB1 Flash invalidate */ -#define MMUCSR0_TLB0FI 0x00000004 /* TLB0 Flash invalidate */ -#define MMUCSR0_TLB2FI 0x00000040 /* TLB2 Flash invalidate */ -#define MMUCSR0_TLB3FI 0x00000020 /* TLB3 Flash invalidate */ - /* Bit definitions for SGR. */ #define SGR_NORMAL 0 /* Speculative fetching allowed. */ #define SGR_GUARDED 1 /* Speculative fetching disallowed. */ diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S index 7bcd9fbf6cc6..bbdc5b577b85 100644 --- a/arch/powerpc/mm/tlb_nohash_low.S +++ b/arch/powerpc/mm/tlb_nohash_low.S @@ -124,8 +124,6 @@ _GLOBAL(_tlbil_pid) * to have the larger code path before the _SECTION_ELSE */ -#define MMUCSR0_TLBFI (MMUCSR0_TLB0FI | MMUCSR0_TLB1FI | \ - MMUCSR0_TLB2FI | MMUCSR0_TLB3FI) /* * Flush MMU TLB on the local processor */ From fb8e3e1fe1df963b6c1ab8610682737ccae96ef0 Mon Sep 17 00:00:00 2001 From: Poonam Aggrwal Date: Fri, 7 Aug 2009 21:05:16 +0530 Subject: [PATCH 0196/1662] powerpc/85xx: Add support for P2020RDB board Add support for the P2020RDB reference board from Freescale. Overview of P2020RDB platform - DDR DDR2 1G - NOR Flash 16MByte - NAND Flash 32MByte - 3 Ethernet interfaces 1) etSEC1 - RGMII - connected to a 5 port Vitesse Switch(VSC7385) - Switch is memory mapped through eLBC interface(CS#2) - IRQ1 2) etSEC2 - SGMII - connected to VSC8221 - IRQ2 3) etSEC3 - RGMII - connected to VSC8641 - IRQ3 - 2 1X PCIe interfaces - SD/MMC ,USB - SPI EEPROM - Serial I2C EEPROM Signed-off-by: Poonam Aggrwal Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/p2020rdb.dts | 586 ++++++++++++++++++++++ arch/powerpc/configs/mpc85xx_defconfig | 1 + arch/powerpc/platforms/85xx/Kconfig | 9 + arch/powerpc/platforms/85xx/Makefile | 3 +- arch/powerpc/platforms/85xx/mpc85xx_rdb.c | 141 ++++++ 5 files changed, 739 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/boot/dts/p2020rdb.dts create mode 100644 arch/powerpc/platforms/85xx/mpc85xx_rdb.c diff --git a/arch/powerpc/boot/dts/p2020rdb.dts b/arch/powerpc/boot/dts/p2020rdb.dts new file mode 100644 index 000000000000..da4cb0d8d215 --- /dev/null +++ b/arch/powerpc/boot/dts/p2020rdb.dts @@ -0,0 +1,586 @@ +/* + * P2020 RDB Device Tree Source + * + * Copyright 2009 Freescale Semiconductor Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +/dts-v1/; +/ { + model = "fsl,P2020"; + compatible = "fsl,P2020RDB"; + #address-cells = <2>; + #size-cells = <2>; + + aliases { + ethernet0 = &enet0; + ethernet1 = &enet1; + ethernet2 = &enet2; + serial0 = &serial0; + serial1 = &serial1; + pci0 = &pci0; + pci1 = &pci1; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + PowerPC,P2020@0 { + device_type = "cpu"; + reg = <0x0>; + next-level-cache = <&L2>; + }; + + PowerPC,P2020@1 { + device_type = "cpu"; + reg = <0x1>; + next-level-cache = <&L2>; + }; + }; + + memory { + device_type = "memory"; + }; + + localbus@ffe05000 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "fsl,p2020-elbc", "fsl,elbc", "simple-bus"; + reg = <0 0xffe05000 0 0x1000>; + interrupts = <19 2>; + interrupt-parent = <&mpic>; + + /* NOR and NAND Flashes */ + ranges = <0x0 0x0 0x0 0xef000000 0x01000000 + 0x1 0x0 0x0 0xffa00000 0x00040000 + 0x2 0x0 0x0 0xffb00000 0x00020000>; + + nor@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "cfi-flash"; + reg = <0x0 0x0 0x1000000>; + bank-width = <2>; + device-width = <1>; + + partition@0 { + /* This location must not be altered */ + /* 256KB for Vitesse 7385 Switch firmware */ + reg = <0x0 0x00040000>; + label = "NOR (RO) Vitesse-7385 Firmware"; + read-only; + }; + + partition@40000 { + /* 256KB for DTB Image */ + reg = <0x00040000 0x00040000>; + label = "NOR (RO) DTB Image"; + read-only; + }; + + partition@80000 { + /* 3.5 MB for Linux Kernel Image */ + reg = <0x00080000 0x00380000>; + label = "NOR (RO) Linux Kernel Image"; + read-only; + }; + + partition@400000 { + /* 11MB for JFFS2 based Root file System */ + reg = <0x00400000 0x00b00000>; + label = "NOR (RW) JFFS2 Root File System"; + }; + + partition@f00000 { + /* This location must not be altered */ + /* 512KB for u-boot Bootloader Image */ + /* 512KB for u-boot Environment Variables */ + reg = <0x00f00000 0x00100000>; + label = "NOR (RO) U-Boot Image"; + read-only; + }; + }; + + nand@1,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,p2020-fcm-nand", + "fsl,elbc-fcm-nand"; + reg = <0x1 0x0 0x40000>; + + partition@0 { + /* This location must not be altered */ + /* 1MB for u-boot Bootloader Image */ + reg = <0x0 0x00100000>; + label = "NAND (RO) U-Boot Image"; + read-only; + }; + + partition@100000 { + /* 1MB for DTB Image */ + reg = <0x00100000 0x00100000>; + label = "NAND (RO) DTB Image"; + read-only; + }; + + partition@200000 { + /* 4MB for Linux Kernel Image */ + reg = <0x00200000 0x00400000>; + label = "NAND (RO) Linux Kernel Image"; + read-only; + }; + + partition@600000 { + /* 4MB for Compressed Root file System Image */ + reg = <0x00600000 0x00400000>; + label = "NAND (RO) Compressed RFS Image"; + read-only; + }; + + partition@a00000 { + /* 7MB for JFFS2 based Root file System */ + reg = <0x00a00000 0x00700000>; + label = "NAND (RW) JFFS2 Root File System"; + }; + + partition@1100000 { + /* 15MB for JFFS2 based Root file System */ + reg = <0x01100000 0x00f00000>; + label = "NAND (RW) Writable User area"; + }; + }; + + L2switch@2,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "vitesse-7385"; + reg = <0x2 0x0 0x20000>; + }; + + }; + + soc@ffe00000 { + #address-cells = <1>; + #size-cells = <1>; + device_type = "soc"; + compatible = "fsl,p2020-immr", "simple-bus"; + ranges = <0x0 0x0 0xffe00000 0x100000>; + bus-frequency = <0>; // Filled out by uboot. + + ecm-law@0 { + compatible = "fsl,ecm-law"; + reg = <0x0 0x1000>; + fsl,num-laws = <12>; + }; + + ecm@1000 { + compatible = "fsl,p2020-ecm", "fsl,ecm"; + reg = <0x1000 0x1000>; + interrupts = <17 2>; + interrupt-parent = <&mpic>; + }; + + memory-controller@2000 { + compatible = "fsl,p2020-memory-controller"; + reg = <0x2000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <18 2>; + }; + + i2c@3000 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <0>; + compatible = "fsl-i2c"; + reg = <0x3000 0x100>; + interrupts = <43 2>; + interrupt-parent = <&mpic>; + dfsrr; + rtc@68 { + compatible = "dallas,ds1339"; + reg = <0x68>; + }; + }; + + i2c@3100 { + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + compatible = "fsl-i2c"; + reg = <0x3100 0x100>; + interrupts = <43 2>; + interrupt-parent = <&mpic>; + dfsrr; + }; + + serial0: serial@4500 { + cell-index = <0>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4500 0x100>; + clock-frequency = <0>; + interrupts = <42 2>; + interrupt-parent = <&mpic>; + }; + + serial1: serial@4600 { + cell-index = <1>; + device_type = "serial"; + compatible = "ns16550"; + reg = <0x4600 0x100>; + clock-frequency = <0>; + interrupts = <42 2>; + interrupt-parent = <&mpic>; + }; + + spi@7000 { + cell-index = <0>; + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,espi"; + reg = <0x7000 0x1000>; + interrupts = <59 0x2>; + interrupt-parent = <&mpic>; + mode = "cpu"; + + fsl_m25p80@0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,espi-flash"; + reg = <0>; + linux,modalias = "fsl_m25p80"; + modal = "s25sl128b"; + spi-max-frequency = <50000000>; + mode = <0>; + + partition@0 { + /* 512KB for u-boot Bootloader Image */ + reg = <0x0 0x00080000>; + label = "SPI (RO) U-Boot Image"; + read-only; + }; + + partition@80000 { + /* 512KB for DTB Image */ + reg = <0x00080000 0x00080000>; + label = "SPI (RO) DTB Image"; + read-only; + }; + + partition@100000 { + /* 4MB for Linux Kernel Image */ + reg = <0x00100000 0x00400000>; + label = "SPI (RO) Linux Kernel Image"; + read-only; + }; + + partition@500000 { + /* 4MB for Compressed RFS Image */ + reg = <0x00500000 0x00400000>; + label = "SPI (RO) Compressed RFS Image"; + read-only; + }; + + partition@900000 { + /* 7MB for JFFS2 based RFS */ + reg = <0x00900000 0x00700000>; + label = "SPI (RW) JFFS2 RFS"; + }; + }; + }; + + dma@c300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,eloplus-dma"; + reg = <0xc300 0x4>; + ranges = <0x0 0xc100 0x200>; + cell-index = <1>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + cell-index = <0>; + interrupt-parent = <&mpic>; + interrupts = <76 2>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&mpic>; + interrupts = <77 2>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&mpic>; + interrupts = <78 2>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + cell-index = <3>; + interrupt-parent = <&mpic>; + interrupts = <79 2>; + }; + }; + + gpio: gpio-controller@f000 { + #gpio-cells = <2>; + compatible = "fsl,mpc8572-gpio"; + reg = <0xf000 0x100>; + interrupts = <47 0x2>; + interrupt-parent = <&mpic>; + gpio-controller; + }; + + L2: l2-cache-controller@20000 { + compatible = "fsl,p2020-l2-cache-controller"; + reg = <0x20000 0x1000>; + cache-line-size = <32>; // 32 bytes + cache-size = <0x80000>; // L2,512K + interrupt-parent = <&mpic>; + interrupts = <16 2>; + }; + + dma@21300 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "fsl,eloplus-dma"; + reg = <0x21300 0x4>; + ranges = <0x0 0x21100 0x200>; + cell-index = <0>; + dma-channel@0 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x0 0x80>; + cell-index = <0>; + interrupt-parent = <&mpic>; + interrupts = <20 2>; + }; + dma-channel@80 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x80 0x80>; + cell-index = <1>; + interrupt-parent = <&mpic>; + interrupts = <21 2>; + }; + dma-channel@100 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x100 0x80>; + cell-index = <2>; + interrupt-parent = <&mpic>; + interrupts = <22 2>; + }; + dma-channel@180 { + compatible = "fsl,eloplus-dma-channel"; + reg = <0x180 0x80>; + cell-index = <3>; + interrupt-parent = <&mpic>; + interrupts = <23 2>; + }; + }; + + usb@22000 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl-usb2-dr"; + reg = <0x22000 0x1000>; + interrupt-parent = <&mpic>; + interrupts = <28 0x2>; + phy_type = "ulpi"; + }; + + enet0: ethernet@24000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <0>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x24000 0x1000>; + ranges = <0x0 0x24000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <29 2 30 2 34 2>; + interrupt-parent = <&mpic>; + fixed-link = <1 1 1000 0 0>; + phy-connection-type = "rgmii-id"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-mdio"; + reg = <0x520 0x20>; + + phy0: ethernet-phy@0 { + interrupt-parent = <&mpic>; + interrupts = <3 1>; + reg = <0x0>; + }; + phy1: ethernet-phy@1 { + interrupt-parent = <&mpic>; + interrupts = <3 1>; + reg = <0x1>; + }; + }; + }; + + enet1: ethernet@25000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <1>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x25000 0x1000>; + ranges = <0x0 0x25000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <35 2 36 2 40 2>; + interrupt-parent = <&mpic>; + tbi-handle = <&tbi0>; + phy-handle = <&phy0>; + phy-connection-type = "sgmii"; + + mdio@520 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "fsl,gianfar-tbi"; + reg = <0x520 0x20>; + + tbi0: tbi-phy@11 { + reg = <0x11>; + device_type = "tbi-phy"; + }; + }; + }; + + enet2: ethernet@26000 { + #address-cells = <1>; + #size-cells = <1>; + cell-index = <2>; + device_type = "network"; + model = "eTSEC"; + compatible = "gianfar"; + reg = <0x26000 0x1000>; + ranges = <0x0 0x26000 0x1000>; + local-mac-address = [ 00 00 00 00 00 00 ]; + interrupts = <31 2 32 2 33 2>; + interrupt-parent = <&mpic>; + phy-handle = <&phy1>; + phy-connection-type = "rgmii-id"; + }; + + sdhci@2e000 { + compatible = "fsl,p2020-esdhc", "fsl,esdhc"; + reg = <0x2e000 0x1000>; + interrupts = <72 0x2>; + interrupt-parent = <&mpic>; + /* Filled in by U-Boot */ + clock-frequency = <0>; + }; + + crypto@30000 { + compatible = "fsl,sec3.1", "fsl,sec3.0", "fsl,sec2.4", + "fsl,sec2.2", "fsl,sec2.1", "fsl,sec2.0"; + reg = <0x30000 0x10000>; + interrupts = <45 2 58 2>; + interrupt-parent = <&mpic>; + fsl,num-channels = <4>; + fsl,channel-fifo-len = <24>; + fsl,exec-units-mask = <0xbfe>; + fsl,descriptor-types-mask = <0x3ab0ebf>; + }; + + mpic: pic@40000 { + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <2>; + reg = <0x40000 0x40000>; + compatible = "chrp,open-pic"; + device_type = "open-pic"; + }; + + msi@41600 { + compatible = "fsl,p2020-msi", "fsl,mpic-msi"; + reg = <0x41600 0x80>; + msi-available-ranges = <0 0x100>; + interrupts = < + 0xe0 0 + 0xe1 0 + 0xe2 0 + 0xe3 0 + 0xe4 0 + 0xe5 0 + 0xe6 0 + 0xe7 0>; + interrupt-parent = <&mpic>; + }; + + global-utilities@e0000 { //global utilities block + compatible = "fsl,p2020-guts"; + reg = <0xe0000 0x1000>; + fsl,has-rstcr; + }; + }; + + pci0: pcie@ffe09000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0 0xffe09000 0 0x1000>; + bus-range = <0 255>; + ranges = <0x2000000 0x0 0xa0000000 0 0xa0000000 0x0 0x20000000 + 0x1000000 0x0 0x00000000 0 0xffc30000 0x0 0x10000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <25 2>; + pcie@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x2000000 0x0 0xa0000000 + 0x2000000 0x0 0xa0000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x100000>; + }; + }; + + pci1: pcie@ffe0a000 { + compatible = "fsl,mpc8548-pcie"; + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + reg = <0 0xffe0a000 0 0x1000>; + bus-range = <0 255>; + ranges = <0x2000000 0x0 0xc0000000 0 0xc0000000 0x0 0x20000000 + 0x1000000 0x0 0x00000000 0 0xffc20000 0x0 0x10000>; + clock-frequency = <33333333>; + interrupt-parent = <&mpic>; + interrupts = <26 2>; + pcie@0 { + reg = <0x0 0x0 0x0 0x0 0x0>; + #size-cells = <2>; + #address-cells = <3>; + device_type = "pci"; + ranges = <0x2000000 0x0 0xc0000000 + 0x2000000 0x0 0xc0000000 + 0x0 0x20000000 + + 0x1000000 0x0 0x0 + 0x1000000 0x0 0x0 + 0x0 0x100000>; + }; + }; +}; diff --git a/arch/powerpc/configs/mpc85xx_defconfig b/arch/powerpc/configs/mpc85xx_defconfig index ada595898af1..ee6acc6557f8 100644 --- a/arch/powerpc/configs/mpc85xx_defconfig +++ b/arch/powerpc/configs/mpc85xx_defconfig @@ -203,6 +203,7 @@ CONFIG_MPC85xx_CDS=y CONFIG_MPC85xx_MDS=y CONFIG_MPC8536_DS=y CONFIG_MPC85xx_DS=y +CONFIG_MPC85xx_RDB=y CONFIG_SOCRATES=y CONFIG_KSI8560=y # CONFIG_XES_MPC85xx is not set diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index a9b416688975..d3a975e8fd3e 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -55,6 +55,15 @@ config MPC85xx_DS help This option enables support for the MPC85xx DS (MPC8544 DS) board +config MPC85xx_RDB + bool "Freescale MPC85xx RDB" + select PPC_I8259 + select DEFAULT_UIMAGE + select FSL_ULI1575 + select SWIOTLB + help + This option enables support for the MPC85xx RDB (P2020 RDB) board + config SOCRATES bool "Socrates" select DEFAULT_UIMAGE diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile index 835733f2b12c..9098aea0cf32 100644 --- a/arch/powerpc/platforms/85xx/Makefile +++ b/arch/powerpc/platforms/85xx/Makefile @@ -9,10 +9,11 @@ obj-$(CONFIG_MPC85xx_CDS) += mpc85xx_cds.o obj-$(CONFIG_MPC8536_DS) += mpc8536_ds.o obj-$(CONFIG_MPC85xx_DS) += mpc85xx_ds.o obj-$(CONFIG_MPC85xx_MDS) += mpc85xx_mds.o +obj-$(CONFIG_MPC85xx_RDB) += mpc85xx_rdb.o obj-$(CONFIG_STX_GP3) += stx_gp3.o obj-$(CONFIG_TQM85xx) += tqm85xx.o obj-$(CONFIG_SBC8560) += sbc8560.o obj-$(CONFIG_SBC8548) += sbc8548.o obj-$(CONFIG_SOCRATES) += socrates.o socrates_fpga_pic.o obj-$(CONFIG_KSI8560) += ksi8560.o -obj-$(CONFIG_XES_MPC85xx) += xes_mpc85xx.o \ No newline at end of file +obj-$(CONFIG_XES_MPC85xx) += xes_mpc85xx.o diff --git a/arch/powerpc/platforms/85xx/mpc85xx_rdb.c b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c new file mode 100644 index 000000000000..c8468de4acf6 --- /dev/null +++ b/arch/powerpc/platforms/85xx/mpc85xx_rdb.c @@ -0,0 +1,141 @@ +/* + * MPC85xx RDB Board Setup + * + * Copyright 2009 Freescale Semiconductor Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#undef DEBUG + +#ifdef DEBUG +#define DBG(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) +#else +#define DBG(fmt, args...) +#endif + + +void __init mpc85xx_rdb_pic_init(void) +{ + struct mpic *mpic; + struct resource r; + struct device_node *np; + + np = of_find_node_by_type(NULL, "open-pic"); + if (np == NULL) { + printk(KERN_ERR "Could not find open-pic node\n"); + return; + } + + if (of_address_to_resource(np, 0, &r)) { + printk(KERN_ERR "Failed to map mpic register space\n"); + of_node_put(np); + return; + } + + mpic = mpic_alloc(np, r.start, + MPIC_PRIMARY | MPIC_WANTS_RESET | + MPIC_BIG_ENDIAN | MPIC_BROKEN_FRR_NIRQS | + MPIC_SINGLE_DEST_CPU, + 0, 256, " OpenPIC "); + + BUG_ON(mpic == NULL); + of_node_put(np); + + mpic_init(mpic); + +} + +/* + * Setup the architecture + */ +#ifdef CONFIG_SMP +extern void __init mpc85xx_smp_init(void); +#endif +static void __init mpc85xx_rdb_setup_arch(void) +{ +#ifdef CONFIG_PCI + struct device_node *np; +#endif + + if (ppc_md.progress) + ppc_md.progress("mpc85xx_rdb_setup_arch()", 0); + +#ifdef CONFIG_PCI + for_each_node_by_type(np, "pci") { + if (of_device_is_compatible(np, "fsl,mpc8548-pcie")) + fsl_add_bridge(np, 0); + } + +#endif + +#ifdef CONFIG_SMP + mpc85xx_smp_init(); +#endif + + printk(KERN_INFO "MPC85xx RDB board from Freescale Semiconductor\n"); +} + +static struct of_device_id __initdata mpc85xxrdb_ids[] = { + { .type = "soc", }, + { .compatible = "soc", }, + { .compatible = "simple-bus", }, + { .compatible = "gianfar", }, + {}, +}; + +static int __init mpc85xxrdb_publish_devices(void) +{ + return of_platform_bus_probe(NULL, mpc85xxrdb_ids, NULL); +} +machine_device_initcall(p2020_rdb, mpc85xxrdb_publish_devices); + +/* + * Called very early, device-tree isn't unflattened + */ +static int __init p2020_rdb_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + + if (of_flat_dt_is_compatible(root, "fsl,P2020RDB")) + return 1; + return 0; +} + +define_machine(p2020_rdb) { + .name = "P2020 RDB", + .probe = p2020_rdb_probe, + .setup_arch = mpc85xx_rdb_setup_arch, + .init_IRQ = mpc85xx_rdb_pic_init, +#ifdef CONFIG_PCI + .pcibios_fixup_bus = fsl_pcibios_fixup_bus, +#endif + .get_irq = mpic_get_irq, + .restart = fsl_rstcr_restart, + .calibrate_decr = generic_calibrate_decr, + .progress = udbg_progress, +}; From 4de124446b4d68edd6cb747495234be915d56a29 Mon Sep 17 00:00:00 2001 From: Liang Li Date: Wed, 12 Aug 2009 09:34:28 -0400 Subject: [PATCH 0197/1662] powerpc/83xx: Remove second USB node from SBC834x DTS Since only one of the SoC USB devices is brought out to a physical connector on the board, remove the 2nd (USB-DR) node from the DTS. Having it present and USB enabled will cause a hang at boot. Signed-off-by: Liang Li Signed-off-by: Yang Shi Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/sbc8349.dts | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/powerpc/boot/dts/sbc8349.dts b/arch/powerpc/boot/dts/sbc8349.dts index 2d9fa68f641c..d93e5beab731 100644 --- a/arch/powerpc/boot/dts/sbc8349.dts +++ b/arch/powerpc/boot/dts/sbc8349.dts @@ -146,18 +146,6 @@ phy_type = "ulpi"; port0; }; - /* phy type (ULPI, UTMI, UTMI_WIDE, SERIAL) */ - usb@23000 { - device_type = "usb"; - compatible = "fsl-usb2-dr"; - reg = <0x23000 0x1000>; - #address-cells = <1>; - #size-cells = <0>; - interrupt-parent = <&ipic>; - interrupts = <38 0x8>; - dr_mode = "otg"; - phy_type = "ulpi"; - }; enet0: ethernet@24000 { #address-cells = <1>; From 31ff09b7a60bbc80b5df941ec1a12502861b4da9 Mon Sep 17 00:00:00 2001 From: Liang Li Date: Wed, 12 Aug 2009 09:34:29 -0400 Subject: [PATCH 0198/1662] powerpc/83xx: Add localbus node and MTD partitions for SBC834x There is 8MB flash, 8kB EEPROM and 128MB SDRAM on the sbc834x local bus, so add a localbus node in DTS with MTD partitions. The recent U-boot commit fe613cdd4eb moves u-boot to the beginning of flash, hence the legacy label on the partition at the end of flash. Signed-off-by: Liang Li Signed-off-by: Yang Shi Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/sbc8349.dts | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/arch/powerpc/boot/dts/sbc8349.dts b/arch/powerpc/boot/dts/sbc8349.dts index d93e5beab731..b83036a8d0e5 100644 --- a/arch/powerpc/boot/dts/sbc8349.dts +++ b/arch/powerpc/boot/dts/sbc8349.dts @@ -265,6 +265,46 @@ }; }; + localbus@e0005000 { + #address-cells = <2>; + #size-cells = <1>; + compatible = "fsl,mpc8349-localbus", "simple-bus"; + reg = <0xe0005000 0x1000>; + interrupts = <77 0x8>; + interrupt-parent = <&ipic>; + ranges = <0x0 0x0 0xff800000 0x00800000 /* 8MB Flash */ + 0x1 0x0 0xf8000000 0x00002000 /* 8KB EEPROM */ + 0x2 0x0 0x10000000 0x04000000 /* 64MB SDRAM */ + 0x3 0x0 0x10000000 0x04000000>; /* 64MB SDRAM */ + + flash@0,0 { + #address-cells = <1>; + #size-cells = <1>; + compatible = "intel,28F640J3A", "cfi-flash"; + reg = <0x0 0x0 0x800000>; + bank-width = <2>; + device-width = <1>; + + partition@0 { + label = "u-boot"; + reg = <0x00000000 0x00040000>; + read-only; + }; + + partition@40000 { + label = "user"; + reg = <0x00040000 0x006c0000>; + }; + + partition@700000 { + label = "legacy u-boot"; + reg = <0x00700000 0x00100000>; + read-only; + }; + + }; + }; + pci0: pci@e0008500 { interrupt-map-mask = <0xf800 0x0 0x0 0x7>; interrupt-map = < From 58dfc497b9357ec00c879b775c02f4eab19bcecb Mon Sep 17 00:00:00 2001 From: Liang Li Date: Wed, 12 Aug 2009 09:34:30 -0400 Subject: [PATCH 0199/1662] powerpc/83xx: Fix incorrect PCI interrupt map in SBC834x DTS Allows interrupts to occur on the sbc834x. Currently PCI devices get assigned an incorrect IRQ and so the interrupt count never increases. This was tested with the 82546GB based dual port E1000 PCI-X NIC which uses two distinct IRQ lines on the one card. root@localhost:/root> cat /proc/interrupts | grep eth 17: 78 IPIC Level eth1 48: 27121 IPIC Level eth0 Signed-off-by: Liang Li Signed-off-by: Yang Shi Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/sbc8349.dts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/boot/dts/sbc8349.dts b/arch/powerpc/boot/dts/sbc8349.dts index b83036a8d0e5..0dc90f9bd814 100644 --- a/arch/powerpc/boot/dts/sbc8349.dts +++ b/arch/powerpc/boot/dts/sbc8349.dts @@ -310,10 +310,10 @@ interrupt-map = < /* IDSEL 0x11 */ - 0x8800 0x0 0x0 0x1 &ipic 20 0x8 - 0x8800 0x0 0x0 0x2 &ipic 21 0x8 - 0x8800 0x0 0x0 0x3 &ipic 22 0x8 - 0x8800 0x0 0x0 0x4 &ipic 23 0x8>; + 0x8800 0x0 0x0 0x1 &ipic 48 0x8 + 0x8800 0x0 0x0 0x2 &ipic 17 0x8 + 0x8800 0x0 0x0 0x3 &ipic 18 0x8 + 0x8800 0x0 0x0 0x4 &ipic 19 0x8>; interrupt-parent = <&ipic>; interrupts = <0x42 0x8>; From 944ac03804c4c9c0879347098ea458fd57f38687 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 12 Aug 2009 09:34:31 -0400 Subject: [PATCH 0200/1662] powerpc/83xx: sbc8349 - update defconfig, enable MTD, USB storage With flash partition entries in the DTS file, MTD might as well be enabled in the defconfig. In a similar vein, enable USB and enough related options (SCSI/ext2/ext3) so that a user can read and write to a generic USB flash drive as well. Also, this board only has the two default SOC UARTs, so adjust the UART config accordingly. Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/configs/83xx/sbc834x_defconfig | 320 +++++++++++++++++++- 1 file changed, 308 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/configs/83xx/sbc834x_defconfig b/arch/powerpc/configs/83xx/sbc834x_defconfig index a592b5efdc4d..3a68f861b1bd 100644 --- a/arch/powerpc/configs/83xx/sbc834x_defconfig +++ b/arch/powerpc/configs/83xx/sbc834x_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.31-rc4 -# Wed Jul 29 23:32:13 2009 +# Linux kernel version: 2.6.31-rc5 +# Tue Aug 11 19:57:51 2009 # # CONFIG_PPC64 is not set @@ -420,7 +420,90 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # CONFIG_FW_LOADER is not set # CONFIG_SYS_HYPERVISOR is not set # CONFIG_CONNECTOR is not set -# CONFIG_MTD is not set +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +CONFIG_MTD_CONCAT=y +CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_TESTS is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y +# CONFIG_MTD_AR7_PARTS is not set + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLKDEVS=y +CONFIG_MTD_BLOCK=y +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set +# CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=y +# CONFIG_MTD_JEDECPROBE is not set +CONFIG_MTD_GEN_PROBE=y +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +CONFIG_MTD_CFI_INTELEXT=y +# CONFIG_MTD_CFI_AMDSTD is not set +# CONFIG_MTD_CFI_STAA is not set +CONFIG_MTD_CFI_UTIL=y +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +# CONFIG_MTD_ABSENT is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_COMPLEX_MAPPINGS is not set +# CONFIG_MTD_PHYSMAP is not set +CONFIG_MTD_PHYSMAP_OF=y +# CONFIG_MTD_INTEL_VR_NOR is not set +# CONFIG_MTD_PLATRAM is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLOCK2MTD is not set + +# +# Disk-On-Chip Device Drivers +# +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set +# CONFIG_MTD_NAND is not set +# CONFIG_MTD_ONENAND is not set + +# +# LPDDR flash memory drivers +# +# CONFIG_MTD_LPDDR is not set + +# +# UBI - Unsorted block images +# +# CONFIG_MTD_UBI is not set CONFIG_OF_DEVICE=y CONFIG_OF_I2C=y CONFIG_OF_MDIO=y @@ -436,6 +519,7 @@ CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_CRYPTOLOOP is not set # CONFIG_BLK_DEV_NBD is not set # CONFIG_BLK_DEV_SX8 is not set +# CONFIG_BLK_DEV_UB is not set CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=32768 @@ -468,9 +552,38 @@ CONFIG_HAVE_IDE=y # SCSI device support # # CONFIG_RAID_ATTRS is not set -# CONFIG_SCSI is not set -# CONFIG_SCSI_DMA is not set +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +# CONFIG_SCSI_TGT is not set # CONFIG_SCSI_NETLINK is not set +# CONFIG_SCSI_PROC_FS is not set + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +# CONFIG_CHR_DEV_ST is not set +# CONFIG_CHR_DEV_OSST is not set +# CONFIG_BLK_DEV_SR is not set +# CONFIG_CHR_DEV_SG is not set +# CONFIG_CHR_DEV_SCH is not set +# CONFIG_SCSI_MULTI_LUN is not set +# CONFIG_SCSI_CONSTANTS is not set +# CONFIG_SCSI_LOGGING is not set +# CONFIG_SCSI_SCAN_ASYNC is not set +CONFIG_SCSI_WAIT_SCAN=m + +# +# SCSI Transports +# +# CONFIG_SCSI_SPI_ATTRS is not set +# CONFIG_SCSI_FC_ATTRS is not set +# CONFIG_SCSI_ISCSI_ATTRS is not set +# CONFIG_SCSI_SAS_LIBSAS is not set +# CONFIG_SCSI_SRP_ATTRS is not set +# CONFIG_SCSI_LOWLEVEL is not set +# CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set # CONFIG_ATA is not set # CONFIG_MD is not set # CONFIG_FUSION is not set @@ -578,11 +691,21 @@ CONFIG_GIANFAR=y # # Enable WiMAX (Networking options) to see the WiMAX drivers # + +# +# USB Network Adapters +# +# CONFIG_USB_CATC is not set +# CONFIG_USB_KAWETH is not set +# CONFIG_USB_PEGASUS is not set +# CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET is not set # CONFIG_WAN is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_PPP is not set # CONFIG_SLIP is not set +# CONFIG_NET_FC is not set # CONFIG_NETCONSOLE is not set # CONFIG_NETPOLL is not set # CONFIG_NET_POLL_CONTROLLER is not set @@ -633,9 +756,9 @@ CONFIG_DEVKMEM=y # CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_NR_UARTS=4 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +# CONFIG_SERIAL_8250_PCI is not set +CONFIG_SERIAL_8250_NR_UARTS=2 +CONFIG_SERIAL_8250_RUNTIME_UARTS=2 # CONFIG_SERIAL_8250_EXTENDED is not set # @@ -700,6 +823,7 @@ CONFIG_I2C_MPC=y # # CONFIG_I2C_PARPORT_LIGHT is not set # CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set # # Graphics adapter I2C/DDC channel drivers @@ -814,6 +938,11 @@ CONFIG_WATCHDOG=y # # CONFIG_PCIPCWATCHDOG is not set # CONFIG_WDTPCI is not set + +# +# USB-based Watchdog Cards +# +# CONFIG_USBPCWATCHDOG is not set CONFIG_SSB_POSSIBLE=y # @@ -856,12 +985,134 @@ CONFIG_HID_SUPPORT=y CONFIG_HID=y # CONFIG_HID_DEBUG is not set # CONFIG_HIDRAW is not set + +# +# USB Input Devices +# +# CONFIG_USB_HID is not set # CONFIG_HID_PID is not set +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set + # # Special HID drivers # -# CONFIG_USB_SUPPORT is not set +CONFIG_USB_SUPPORT=y +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y +CONFIG_USB_ARCH_HAS_EHCI=y +CONFIG_USB=y +# CONFIG_USB_DEBUG is not set +# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set + +# +# Miscellaneous USB options +# +CONFIG_USB_DEVICEFS=y +CONFIG_USB_DEVICE_CLASS=y +# CONFIG_USB_DYNAMIC_MINORS is not set +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_MON=y +# CONFIG_USB_WUSB is not set +# CONFIG_USB_WUSB_CBAF is not set + +# +# USB Host Controller Drivers +# +# CONFIG_USB_C67X00_HCD is not set +# CONFIG_USB_XHCI_HCD is not set +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_ROOT_HUB_TT=y +# CONFIG_USB_EHCI_TT_NEWSCHED is not set +CONFIG_USB_EHCI_FSL=y +CONFIG_USB_EHCI_HCD_PPC_OF=y +# CONFIG_USB_OXU210HP_HCD is not set +# CONFIG_USB_ISP116X_HCD is not set +# CONFIG_USB_ISP1760_HCD is not set +# CONFIG_USB_OHCI_HCD is not set +# CONFIG_USB_UHCI_HCD is not set +# CONFIG_USB_SL811_HCD is not set +# CONFIG_USB_R8A66597_HCD is not set +# CONFIG_USB_WHCI_HCD is not set +# CONFIG_USB_HWA_HCD is not set + +# +# USB Device Class drivers +# +# CONFIG_USB_ACM is not set +# CONFIG_USB_PRINTER is not set +# CONFIG_USB_WDM is not set +# CONFIG_USB_TMC is not set + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=y +# CONFIG_USB_STORAGE_DEBUG is not set +# CONFIG_USB_STORAGE_DATAFAB is not set +# CONFIG_USB_STORAGE_FREECOM is not set +# CONFIG_USB_STORAGE_ISD200 is not set +# CONFIG_USB_STORAGE_USBAT is not set +# CONFIG_USB_STORAGE_SDDR09 is not set +# CONFIG_USB_STORAGE_SDDR55 is not set +# CONFIG_USB_STORAGE_JUMPSHOT is not set +# CONFIG_USB_STORAGE_ALAUDA is not set +# CONFIG_USB_STORAGE_ONETOUCH is not set +# CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set +# CONFIG_USB_LIBUSUAL is not set + +# +# USB Imaging devices +# +# CONFIG_USB_MDC800 is not set +# CONFIG_USB_MICROTEK is not set + +# +# USB port drivers +# +# CONFIG_USB_SERIAL is not set + +# +# USB Miscellaneous drivers +# +# CONFIG_USB_EMI62 is not set +# CONFIG_USB_EMI26 is not set +# CONFIG_USB_ADUTUX is not set +# CONFIG_USB_SEVSEG is not set +# CONFIG_USB_RIO500 is not set +# CONFIG_USB_LEGOTOWER is not set +# CONFIG_USB_LCD is not set +# CONFIG_USB_BERRY_CHARGE is not set +# CONFIG_USB_LED is not set +# CONFIG_USB_CYPRESS_CY7C63 is not set +# CONFIG_USB_CYTHERM is not set +# CONFIG_USB_IDMOUSE is not set +# CONFIG_USB_FTDI_ELAN is not set +# CONFIG_USB_APPLEDISPLAY is not set +# CONFIG_USB_SISUSBVGA is not set +# CONFIG_USB_LD is not set +# CONFIG_USB_TRANCEVIBRATOR is not set +# CONFIG_USB_IOWARRIOR is not set +# CONFIG_USB_TEST is not set +# CONFIG_USB_ISIGHTFW is not set +# CONFIG_USB_VST is not set +# CONFIG_USB_GADGET is not set + +# +# OTG and related infrastructure +# +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -882,9 +1133,14 @@ CONFIG_HID=y # # File systems # -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +# CONFIG_EXT2_FS_XIP is not set +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +# CONFIG_EXT3_FS_XATTR is not set # CONFIG_EXT4_FS is not set +CONFIG_JBD=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_FS_POSIX_ACL is not set @@ -940,6 +1196,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set +# CONFIG_JFFS2_FS is not set # CONFIG_CRAMFS is not set # CONFIG_SQUASHFS is not set # CONFIG_VXFS_FS is not set @@ -977,7 +1234,46 @@ CONFIG_RPCSEC_GSS_KRB5=y # # CONFIG_PARTITION_ADVANCED is not set CONFIG_MSDOS_PARTITION=y -# CONFIG_NLS is not set +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +# CONFIG_NLS_CODEPAGE_437 is not set +# CONFIG_NLS_CODEPAGE_737 is not set +# CONFIG_NLS_CODEPAGE_775 is not set +# CONFIG_NLS_CODEPAGE_850 is not set +# CONFIG_NLS_CODEPAGE_852 is not set +# CONFIG_NLS_CODEPAGE_855 is not set +# CONFIG_NLS_CODEPAGE_857 is not set +# CONFIG_NLS_CODEPAGE_860 is not set +# CONFIG_NLS_CODEPAGE_861 is not set +# CONFIG_NLS_CODEPAGE_862 is not set +# CONFIG_NLS_CODEPAGE_863 is not set +# CONFIG_NLS_CODEPAGE_864 is not set +# CONFIG_NLS_CODEPAGE_865 is not set +# CONFIG_NLS_CODEPAGE_866 is not set +# CONFIG_NLS_CODEPAGE_869 is not set +# CONFIG_NLS_CODEPAGE_936 is not set +# CONFIG_NLS_CODEPAGE_950 is not set +# CONFIG_NLS_CODEPAGE_932 is not set +# CONFIG_NLS_CODEPAGE_949 is not set +# CONFIG_NLS_CODEPAGE_874 is not set +# CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set +# CONFIG_NLS_CODEPAGE_1251 is not set +# CONFIG_NLS_ASCII is not set +# CONFIG_NLS_ISO8859_1 is not set +# CONFIG_NLS_ISO8859_2 is not set +# CONFIG_NLS_ISO8859_3 is not set +# CONFIG_NLS_ISO8859_4 is not set +# CONFIG_NLS_ISO8859_5 is not set +# CONFIG_NLS_ISO8859_6 is not set +# CONFIG_NLS_ISO8859_7 is not set +# CONFIG_NLS_ISO8859_9 is not set +# CONFIG_NLS_ISO8859_13 is not set +# CONFIG_NLS_ISO8859_14 is not set +# CONFIG_NLS_ISO8859_15 is not set +# CONFIG_NLS_KOI8_R is not set +# CONFIG_NLS_KOI8_U is not set +# CONFIG_NLS_UTF8 is not set # CONFIG_DLM is not set # CONFIG_BINARY_PRINTF is not set From 7792da8567a70be3280c8eef916334e6923815e3 Mon Sep 17 00:00:00 2001 From: Liang Li Date: Fri, 14 Aug 2009 10:36:14 -0400 Subject: [PATCH 0201/1662] powerpc/85xx: sbc8560 - Fix warm reboot with board specific reset function The existing fsl_rstcr_restart function is not applicable to the mpc8560. The Global Utilities Block on this earlier CPU doesn't have the control/reset register at 0xe00b0. This implements a board specific reset function that uses the RCR(Reset Control Register) of the sbc8560's EPLD to do a reset. Signed-off-by: Liang Li Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/platforms/85xx/sbc8560.c | 39 ++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/85xx/sbc8560.c b/arch/powerpc/platforms/85xx/sbc8560.c index cc27807a8b64..a5ad1c7794bf 100644 --- a/arch/powerpc/platforms/85xx/sbc8560.c +++ b/arch/powerpc/platforms/85xx/sbc8560.c @@ -267,6 +267,43 @@ arch_initcall(sbc8560_rtc_init); #endif /* M48T59 */ +static __u8 __iomem *brstcr; + +static int __init sbc8560_bdrstcr_init(void) +{ + struct device_node *np; + struct resource res; + + np = of_find_compatible_node(NULL, NULL, "wrs,sbc8560-brstcr"); + if (np == NULL) { + printk(KERN_WARNING "sbc8560: No board specific RSTCR in DTB.\n"); + return -ENODEV; + } + + of_address_to_resource(np, 0, &res); + + printk(KERN_INFO "sbc8560: Found BRSTCR at i/o 0x%x\n", res.start); + + brstcr = ioremap(res.start, res.end - res.start); + if(!brstcr) + printk(KERN_WARNING "sbc8560: ioremap of brstcr failed.\n"); + + of_node_put(np); + + return 0; +} + +arch_initcall(sbc8560_bdrstcr_init); + +void sbc8560_rstcr_restart(char * cmd) +{ + local_irq_disable(); + if(brstcr) + clrbits8(brstcr, 0x80); + + while(1); +} + define_machine(sbc8560) { .name = "SBC8560", .probe = sbc8560_probe, @@ -274,7 +311,7 @@ define_machine(sbc8560) { .init_IRQ = sbc8560_pic_init, .show_cpuinfo = sbc8560_show_cpuinfo, .get_irq = mpic_get_irq, - .restart = fsl_rstcr_restart, + .restart = sbc8560_rstcr_restart, .calibrate_decr = generic_calibrate_decr, .progress = udbg_progress, }; From 46c4c229ecf470202a1f4fd2402283cb038864bf Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 14 Aug 2009 12:13:53 -0400 Subject: [PATCH 0202/1662] powerpc/85xx: issue fsl_soc reboot warning only when applicable Some CPU, like the MPC8560 don't have a RSTCR in the Global Utilities Block. These boards will implement their own reboot call, and not use this code, so we should only warn about the absence of the GUTS RSTCR when the default reboot code is used. Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/fsl_soc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c index 95dbc643c4fc..adca4affcf1f 100644 --- a/arch/powerpc/sysdev/fsl_soc.c +++ b/arch/powerpc/sysdev/fsl_soc.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -383,8 +384,9 @@ static int __init setup_rstcr(void) if (!rstcr) printk (KERN_EMERG "Error: reset control register " "not mapped!\n"); - } else - printk (KERN_INFO "rstcr compatible register does not exist!\n"); + } else if (ppc_md.restart == fsl_rstcr_restart) + printk(KERN_ERR "No RSTCR register, warm reboot won't work\n"); + if (np) of_node_put(np); return 0; From c9c419773d2a84af8ff2f91c859d3d16592c947b Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Fri, 14 Aug 2009 12:13:54 -0400 Subject: [PATCH 0203/1662] powerpc/85xx: sbc8560 - remove "has-rstcr" from global utilities block The earlier mpc8560 CPUs don't have the RSTCR at 0xe00b0 in the GUTS. The generic reboot code uses this tag to determine if it should be using the RSTCR for reboot, so remove it from the board definition. Signed-off-by: Paul Gortmaker Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/sbc8560.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/boot/dts/sbc8560.dts b/arch/powerpc/boot/dts/sbc8560.dts index 239d57a55cf4..9e13ed8a1193 100644 --- a/arch/powerpc/boot/dts/sbc8560.dts +++ b/arch/powerpc/boot/dts/sbc8560.dts @@ -303,7 +303,6 @@ global-utilities@e0000 { compatible = "fsl,mpc8560-guts"; reg = <0xe0000 0x1000>; - fsl,has-rstcr; }; }; From 3fafdd7cc810a8fcd28b83fbde2d0e25830dee41 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 25 Aug 2009 15:03:07 +0100 Subject: [PATCH 0204/1662] uwb: handle radio controller events with out-of-range IDs correctly If a radio controller event has an ID that's just out of range don't read beyond the end of uwbd's event arrays. Signed-off-by: Roel Kluin Signed-off-by: David Vrabel --- drivers/uwb/uwbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index 57bd6bfef37e..5a777d8624da 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -187,12 +187,12 @@ int uwbd_event_handle_urc(struct uwb_event *evt) event = le16_to_cpu(evt->notif.rceb->wEvent); context = evt->notif.rceb->bEventContext; - if (type > ARRAY_SIZE(uwbd_urc_evt_type_handlers)) + if (type >= ARRAY_SIZE(uwbd_urc_evt_type_handlers)) goto out; type_table = &uwbd_urc_evt_type_handlers[type]; if (type_table->uwbd_events == NULL) goto out; - if (event > type_table->size) + if (event >= type_table->size) goto out; handler = type_table->uwbd_events[event].handler; if (handler == NULL) From a9e75a389254801ca160b72c6e221e5bb7e35df9 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 25 Aug 2009 15:07:12 +0100 Subject: [PATCH 0205/1662] uwb: stop uwbd thread if rc->start() fails This fixes an oops when uwbd thread continues running after a failed radio controller start. Signed-off-by: David Vrabel --- drivers/uwb/lc-rc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/uwb/lc-rc.c b/drivers/uwb/lc-rc.c index 9cf21e6bb624..9611ef3b787a 100644 --- a/drivers/uwb/lc-rc.c +++ b/drivers/uwb/lc-rc.c @@ -288,8 +288,8 @@ int uwb_rc_add(struct uwb_rc *rc, struct device *parent_dev, void *priv) error_dev_add: error_rc_setup: rc->stop(rc); - uwbd_stop(rc); error_rc_start: + uwbd_stop(rc); return result; } EXPORT_SYMBOL_GPL(uwb_rc_add); From ec80fb2d89a0a93db46bdc968b6a849be5340531 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 19 Aug 2009 03:38:18 +0400 Subject: [PATCH 0206/1662] powerpc/85xx: Add eSDHC support for MPC8536DS boards This patch simply adds sdhci node to the device tree. We specify clock-frequency manually, so that eSDHC will work without upgrading U-Boot. Though, that'll only work for default setup (1500 MHz) on new board revisions. For non-default setups, it's recommended to upgrade U-Boot, since it will fixup clock-frequency automatically. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/mpc8536ds.dts | 8 ++++++++ arch/powerpc/boot/dts/mpc8536ds_36b.dts | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8536ds.dts b/arch/powerpc/boot/dts/mpc8536ds.dts index 22caf69e6efd..815cebb2e3e5 100644 --- a/arch/powerpc/boot/dts/mpc8536ds.dts +++ b/arch/powerpc/boot/dts/mpc8536ds.dts @@ -250,6 +250,14 @@ phy_type = "ulpi"; }; + sdhci@2e000 { + compatible = "fsl,mpc8536-esdhc", "fsl,esdhc"; + reg = <0x2e000 0x1000>; + interrupts = <72 0x2>; + interrupt-parent = <&mpic>; + clock-frequency = <250000000>; + }; + serial0: serial@4500 { cell-index = <0>; device_type = "serial"; diff --git a/arch/powerpc/boot/dts/mpc8536ds_36b.dts b/arch/powerpc/boot/dts/mpc8536ds_36b.dts index 113ed8b7c898..d95b26021e62 100644 --- a/arch/powerpc/boot/dts/mpc8536ds_36b.dts +++ b/arch/powerpc/boot/dts/mpc8536ds_36b.dts @@ -250,6 +250,14 @@ phy_type = "ulpi"; }; + sdhci@2e000 { + compatible = "fsl,mpc8536-esdhc", "fsl,esdhc"; + reg = <0x2e000 0x1000>; + interrupts = <72 0x2>; + interrupt-parent = <&mpic>; + clock-frequency = <250000000>; + }; + serial0: serial@4500 { cell-index = <0>; device_type = "serial"; From a70e88bc78d220a506394058d48cedd1c491a256 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 19 Aug 2009 03:28:10 +0400 Subject: [PATCH 0207/1662] powerpc/82xx: Fix BCSR bits for MPC8272ADS boards mpc8272_ads.c is using BCSR bits definitions from pq2ads.h, but according to User's Guide the bits are wrong for MPC8272ADS boards (I guess definitions from pq2ads should only be used for PQ2FADS boards). So, let's introduce our own definitions for MPC8272ADS, and don't include pq2ads.h. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/platforms/82xx/mpc8272_ads.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/82xx/mpc8272_ads.c b/arch/powerpc/platforms/82xx/mpc8272_ads.c index 8054c685d323..67e2184f4243 100644 --- a/arch/powerpc/platforms/82xx/mpc8272_ads.c +++ b/arch/powerpc/platforms/82xx/mpc8272_ads.c @@ -29,7 +29,6 @@ #include #include -#include "pq2ads.h" #include "pq2.h" static void __init mpc8272_ads_pic_init(void) @@ -144,6 +143,13 @@ static void __init mpc8272_ads_setup_arch(void) return; } +#define BCSR1_FETHIEN 0x08000000 +#define BCSR1_FETH_RST 0x04000000 +#define BCSR1_RS232_EN1 0x02000000 +#define BCSR1_RS232_EN2 0x01000000 +#define BCSR3_FETHIEN2 0x10000000 +#define BCSR3_FETH2_RST 0x08000000 + clrbits32(&bcsr[1], BCSR1_RS232_EN1 | BCSR1_RS232_EN2 | BCSR1_FETHIEN); setbits32(&bcsr[1], BCSR1_FETH_RST); From 818fcac554397a04987d49e2bd2dfc2d394b265c Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 19 Aug 2009 03:28:17 +0400 Subject: [PATCH 0208/1662] powerpc/82xx: Add CPM USB Gadget support for MPC8272ADS boards - Add usb node; - Configure pins and clocks; - Enable USB function in BCSR. The support was successfully tested using serial and ethernet gadget drivers. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/mpc8272ads.dts | 8 ++++++++ arch/powerpc/platforms/82xx/mpc8272_ads.c | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8272ads.dts b/arch/powerpc/boot/dts/mpc8272ads.dts index 60f332778e41..e802ebd88cb1 100644 --- a/arch/powerpc/boot/dts/mpc8272ads.dts +++ b/arch/powerpc/boot/dts/mpc8272ads.dts @@ -173,6 +173,14 @@ fsl,cpm-command = <0xce00000>; }; + usb@11b60 { + compatible = "fsl,mpc8272-cpm-usb"; + reg = <0x11b60 0x40 0x8b00 0x100>; + interrupts = <11 8>; + interrupt-parent = <&PIC>; + mode = "peripheral"; + }; + mdio@10d40 { device_type = "mdio"; compatible = "fsl,mpc8272ads-mdio-bitbang", diff --git a/arch/powerpc/platforms/82xx/mpc8272_ads.c b/arch/powerpc/platforms/82xx/mpc8272_ads.c index 67e2184f4243..30394b409b3f 100644 --- a/arch/powerpc/platforms/82xx/mpc8272_ads.c +++ b/arch/powerpc/platforms/82xx/mpc8272_ads.c @@ -99,6 +99,15 @@ static struct cpm_pin mpc8272_ads_pins[] = { /* I2C */ {3, 14, CPM_PIN_INPUT | CPM_PIN_SECONDARY | CPM_PIN_OPENDRAIN}, {3, 15, CPM_PIN_INPUT | CPM_PIN_SECONDARY | CPM_PIN_OPENDRAIN}, + + /* USB */ + {2, 10, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 11, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {2, 20, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {2, 24, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, + {3, 23, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {3, 24, CPM_PIN_OUTPUT | CPM_PIN_PRIMARY}, + {3, 25, CPM_PIN_INPUT | CPM_PIN_PRIMARY}, }; static void __init init_ioports(void) @@ -112,6 +121,8 @@ static void __init init_ioports(void) cpm2_clk_setup(CPM_CLK_SCC1, CPM_BRG1, CPM_CLK_RX); cpm2_clk_setup(CPM_CLK_SCC1, CPM_BRG1, CPM_CLK_TX); + cpm2_clk_setup(CPM_CLK_SCC3, CPM_CLK8, CPM_CLK_RX); + cpm2_clk_setup(CPM_CLK_SCC3, CPM_CLK8, CPM_CLK_TX); cpm2_clk_setup(CPM_CLK_SCC4, CPM_BRG4, CPM_CLK_RX); cpm2_clk_setup(CPM_CLK_SCC4, CPM_BRG4, CPM_CLK_TX); cpm2_clk_setup(CPM_CLK_FCC1, CPM_CLK11, CPM_CLK_RX); @@ -147,6 +158,7 @@ static void __init mpc8272_ads_setup_arch(void) #define BCSR1_FETH_RST 0x04000000 #define BCSR1_RS232_EN1 0x02000000 #define BCSR1_RS232_EN2 0x01000000 +#define BCSR3_USB_nEN 0x80000000 #define BCSR3_FETHIEN2 0x10000000 #define BCSR3_FETH2_RST 0x08000000 @@ -156,6 +168,8 @@ static void __init mpc8272_ads_setup_arch(void) clrbits32(&bcsr[3], BCSR3_FETHIEN2); setbits32(&bcsr[3], BCSR3_FETH2_RST); + clrbits32(&bcsr[3], BCSR3_USB_nEN); + iounmap(bcsr); init_ioports(); From 9b9d401b8d11796f4c4bcbcabecfec9f5d85ea25 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Wed, 19 Aug 2009 03:28:21 +0400 Subject: [PATCH 0209/1662] powerpc/85xx: Add QE USB support for MPC8569E-MDS boards - Add gpio-controller node for BCSR17, it is used to control USB speed and VBUS; - Add timer node for QE GTM, needed for USB host; - Add usb node itself; - Add some probing code for BCSR GPIOs. NOTE: QE USB doesn't work on prototype boards, but should work on pilot boards if specs and schematics are correct, though we don't have the pilot boards to actually test it. Signed-off-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/boot/dts/mpc8569mds.dts | 45 +++++++++++++++++++++++ arch/powerpc/platforms/85xx/mpc85xx_mds.c | 4 ++ 2 files changed, 49 insertions(+) diff --git a/arch/powerpc/boot/dts/mpc8569mds.dts b/arch/powerpc/boot/dts/mpc8569mds.dts index 9e4ce99e1613..06332d61830a 100644 --- a/arch/powerpc/boot/dts/mpc8569mds.dts +++ b/arch/powerpc/boot/dts/mpc8569mds.dts @@ -99,8 +99,18 @@ }; bcsr@1,0 { + #address-cells = <1>; + #size-cells = <1>; compatible = "fsl,mpc8569mds-bcsr"; reg = <1 0 0x8000>; + ranges = <0 1 0 0x8000>; + + bcsr17: gpio-controller@11 { + #gpio-cells = <2>; + compatible = "fsl,mpc8569mds-bcsr-gpio"; + reg = <0x11 0x1>; + gpio-controller; + }; }; nand@3,0 { @@ -315,6 +325,14 @@ gpio-controller; }; + qe_pio_f: gpio-controller@a0 { + #gpio-cells = <2>; + compatible = "fsl,mpc8569-qe-pario-bank", + "fsl,mpc8323-qe-pario-bank"; + reg = <0xa0 0x18>; + gpio-controller; + }; + pio1: ucc_pin@01 { pio-map = < /* port pin dir open_drain assignment has_irq */ @@ -419,6 +437,16 @@ interrupt-parent = <&mpic>; }; + timer@440 { + compatible = "fsl,mpc8569-qe-gtm", + "fsl,qe-gtm", "fsl,gtm"; + reg = <0x440 0x40>; + interrupts = <12 13 14 15>; + interrupt-parent = <&qeic>; + /* Filled in by U-Boot */ + clock-frequency = <0>; + }; + spi@4c0 { #address-cells = <1>; #size-cells = <0>; @@ -446,6 +474,23 @@ mode = "cpu"; }; + usb@6c0 { + compatible = "fsl,mpc8569-qe-usb", + "fsl,mpc8323-qe-usb"; + reg = <0x6c0 0x40 0x8b00 0x100>; + interrupts = <11>; + interrupt-parent = <&qeic>; + fsl,fullspeed-clock = "clk5"; + fsl,lowspeed-clock = "brg10"; + gpios = <&qe_pio_f 3 0 /* USBOE */ + &qe_pio_f 4 0 /* USBTP */ + &qe_pio_f 5 0 /* USBTN */ + &qe_pio_f 6 0 /* USBRP */ + &qe_pio_f 8 0 /* USBRN */ + &bcsr17 6 0 /* SPEED */ + &bcsr17 5 1>; /* POWER */ + }; + enet0: ucc@2000 { device_type = "network"; compatible = "ucc_geth"; diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index bfb32834ab0c..20a61d0af33b 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -304,6 +305,9 @@ static struct of_device_id mpc85xx_ids[] = { static int __init mpc85xx_publish_devices(void) { + if (machine_is(mpc8569_mds)) + simple_gpiochip_init("fsl,mpc8569mds-bcsr-gpio"); + /* Publish the QE devices */ of_platform_bus_probe(NULL, mpc85xx_ids, NULL); From 1dcd8ffc81e80a170625883f63f6a5db3cd0428d Mon Sep 17 00:00:00 2001 From: Michael Barkowski Date: Tue, 18 Aug 2009 17:20:44 -0400 Subject: [PATCH 0210/1662] powerpc/qe_lib: Set gpio data before changing the direction to output This avoids having a short glitch if the desired initial value is not the same as what was previously in the data register. Signed-off-by: Michael Barkowski Acked-by: Anton Vorontsov Signed-off-by: Kumar Gala --- arch/powerpc/sysdev/qe_lib/gpio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/sysdev/qe_lib/gpio.c b/arch/powerpc/sysdev/qe_lib/gpio.c index 3485288dce31..8e7a7767dd5c 100644 --- a/arch/powerpc/sysdev/qe_lib/gpio.c +++ b/arch/powerpc/sysdev/qe_lib/gpio.c @@ -105,14 +105,14 @@ static int qe_gpio_dir_out(struct gpio_chip *gc, unsigned int gpio, int val) struct qe_gpio_chip *qe_gc = to_qe_gpio_chip(mm_gc); unsigned long flags; + qe_gpio_set(gc, gpio, val); + spin_lock_irqsave(&qe_gc->lock, flags); __par_io_config_pin(mm_gc->regs, gpio, QE_PIO_DIR_OUT, 0, 0, 0); spin_unlock_irqrestore(&qe_gc->lock, flags); - qe_gpio_set(gc, gpio, val); - return 0; } From 0396c215f301e92677d1e9a064b405e31501dc1d Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Tue, 25 Aug 2009 16:41:06 +0100 Subject: [PATCH 0211/1662] uwb: avoid radio controller reset loops If a radio controller reset attempt occurs while a probe() or remove() is in progress it fails and is retried endlessly, potentially preventing the probe() or remove() from completing. If a reset fails, sleep for a bit before retrying the reset. This allows the probe()/remove() to complete. Signed-off-by: David Vrabel --- drivers/uwb/hwa-rc.c | 3 +-- drivers/uwb/reset.c | 21 +++++++++++---------- drivers/uwb/umc-bus.c | 2 +- drivers/uwb/whc-rc.c | 3 +-- include/linux/uwb.h | 2 +- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index 9052bcb4f528..e7eeb63fab23 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -887,8 +887,7 @@ static int hwarc_post_reset(struct usb_interface *iface) struct hwarc *hwarc = usb_get_intfdata(iface); struct uwb_rc *uwb_rc = hwarc->uwb_rc; - uwb_rc_post_reset(uwb_rc); - return 0; + return uwb_rc_post_reset(uwb_rc); } /** USB device ID's that we handle */ diff --git a/drivers/uwb/reset.c b/drivers/uwb/reset.c index 70f8050221ff..7f0512e43d9d 100644 --- a/drivers/uwb/reset.c +++ b/drivers/uwb/reset.c @@ -30,6 +30,7 @@ */ #include #include +#include #include "uwb-internal.h" @@ -323,13 +324,15 @@ int uwbd_msg_handle_reset(struct uwb_event *evt) dev_info(&rc->uwb_dev.dev, "resetting radio controller\n"); ret = rc->reset(rc); - if (ret) { + if (ret < 0) { dev_err(&rc->uwb_dev.dev, "failed to reset hardware: %d\n", ret); goto error; } return 0; error: - /* Nothing can be done except try the reset again. */ + /* Nothing can be done except try the reset again. Wait a bit + to avoid reset loops during probe() or remove(). */ + msleep(1000); uwb_rc_reset_all(rc); return ret; } @@ -368,22 +371,20 @@ void uwb_rc_pre_reset(struct uwb_rc *rc) } EXPORT_SYMBOL_GPL(uwb_rc_pre_reset); -void uwb_rc_post_reset(struct uwb_rc *rc) +int uwb_rc_post_reset(struct uwb_rc *rc) { int ret; ret = rc->start(rc); if (ret) - goto error; + goto out; ret = uwb_rc_mac_addr_set(rc, &rc->uwb_dev.mac_addr); if (ret) - goto error; + goto out; ret = uwb_rc_dev_addr_set(rc, &rc->uwb_dev.dev_addr); if (ret) - goto error; - return; -error: - /* Nothing can be done except try the reset again. */ - uwb_rc_reset_all(rc); + goto out; +out: + return ret; } EXPORT_SYMBOL_GPL(uwb_rc_post_reset); diff --git a/drivers/uwb/umc-bus.c b/drivers/uwb/umc-bus.c index 5ad36164c13b..cdd6c8efc9f8 100644 --- a/drivers/uwb/umc-bus.c +++ b/drivers/uwb/umc-bus.c @@ -66,7 +66,7 @@ int umc_controller_reset(struct umc_dev *umc) return -EAGAIN; ret = device_for_each_child(parent, parent, umc_bus_pre_reset_helper); if (ret >= 0) - device_for_each_child(parent, parent, umc_bus_post_reset_helper); + ret = device_for_each_child(parent, parent, umc_bus_post_reset_helper); up(&parent->sem); return ret; diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c index 19a1dd129212..1d9a6f54658e 100644 --- a/drivers/uwb/whc-rc.c +++ b/drivers/uwb/whc-rc.c @@ -443,8 +443,7 @@ static int whcrc_post_reset(struct umc_dev *umc) struct whcrc *whcrc = umc_get_drvdata(umc); struct uwb_rc *uwb_rc = whcrc->uwb_rc; - uwb_rc_post_reset(uwb_rc); - return 0; + return uwb_rc_post_reset(uwb_rc); } /* PCI device ID's that we handle [so it gets loaded] */ diff --git a/include/linux/uwb.h b/include/linux/uwb.h index c02128991ff7..7fc9746f22cd 100644 --- a/include/linux/uwb.h +++ b/include/linux/uwb.h @@ -597,7 +597,7 @@ void uwb_rc_neh_grok(struct uwb_rc *, void *, size_t); void uwb_rc_neh_error(struct uwb_rc *, int); void uwb_rc_reset_all(struct uwb_rc *rc); void uwb_rc_pre_reset(struct uwb_rc *rc); -void uwb_rc_post_reset(struct uwb_rc *rc); +int uwb_rc_post_reset(struct uwb_rc *rc); /** * uwb_rsv_is_owner - is the owner of this reservation the RC? From 680b6cfd3cee30a7d997d49430fb73af84523853 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 26 Aug 2009 16:20:36 +0900 Subject: [PATCH 0212/1662] x86, mce: CE in last bank prevents panic by unknown MCE If MCE handler is called but none of mces_seen have machine check event which might signal the MCE (i.e. event higher than MCE_KEEP_SEVERITY), panic with "Machine check from unknown source" will be taken since the MCE is assumed to be signaled from external agent or so. Usually mces_seen never point MCE_KEEP_SEVERITY event such as CE. But it can happen because initial value of mces_seen is accidentally modified by mce_no_way_out() - in case if mce_no_way_out() run through all banks and the last bank has the CE, mces_seen points the CE and the "panic by unknown" will not be taken. This patch fixes this undesired behavior, and clarifies the logic. Signed-off-by: Hidetoshi Seto Cc: H. Peter Anvin Cc: Andi Kleen Cc: Jin Dongming LKML-Reference: <4A94E244.3020301@jp.fujitsu.com> Signed-off-by: Ingo Molnar Reported-by: Jin Dongming --- arch/x86/kernel/cpu/mcheck/mce.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 54bd1b2fb4c0..325559d1aa58 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -612,7 +612,7 @@ static int mce_timed_out(u64 *t) * This way we prevent any potential data corruption in a unrecoverable case * and also makes sure always all CPU's errors are examined. * - * Also this detects the case of an machine check event coming from outer + * Also this detects the case of a machine check event coming from outer * space (not detected by any CPUs) In this case some external agent wants * us to shut down, so panic too. * @@ -665,7 +665,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (!m && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -889,11 +889,11 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_setup(&m); m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - no_way_out = mce_no_way_out(&m, &msg); - final = &__get_cpu_var(mces_seen); *final = m; + no_way_out = mce_no_way_out(&m, &msg); + barrier(); /* From 5fc517466dd3d0fc6d2a5180ca6792e60344d8be Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:32 -0700 Subject: [PATCH 0213/1662] x86, pat: Keep identity maps consistent with mmaps even when pat_disabled Make reserve_memtype internally take care of pat disabled case and fallback to default return values. Remove the specific pat_disabled checks in track_* routines. Change kernel_map_sync_memtype to sync identity map even when pat_disabled. This change ensures that, even for pat_disabled case, we take care of keeping identity map in sync. Before this patch, in pat disabled case, ioremap() keeps the identity maps in sync and other APIs like pci and /dev/mem mmap don't, which is not a very consistent behavior. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index e6718bb28065..d5af2792d2fd 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -339,6 +339,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_type) { if (req_type == -1) *new_type = _PAGE_CACHE_WB; + else if (req_type == _PAGE_CACHE_WC) + *new_type = _PAGE_CACHE_UC_MINUS; else *new_type = req_type & _PAGE_CACHE_MASK; } @@ -577,7 +579,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (!pat_enabled || base >= __pa(high_memory)) + if (base >= __pa(high_memory)) return 0; id_sz = (__pa(high_memory) < base + size) ? @@ -677,9 +679,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -715,9 +714,6 @@ int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return 0; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of @@ -743,9 +739,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - if (!pat_enabled) - return; - /* * For now, only handle remap_pfn_range() vmas where * is_linear_pfn_mapping() == TRUE. Handling of From 279e669b3fc0068cc3509e8e53036999e1e86588 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:33 -0700 Subject: [PATCH 0214/1662] x86, pat: ioremap to follow same PAT restrictions as other PAT users ioremap has this hard-coded check for new type and requested type. That check differs from other PAT users like /dev/mem mmap, remap_pfn_range in only one condition where requested type is UC_MINUS and new type is WC. Under that condition, ioremap fails. But other PAT interfaces succeed with a WC mapping. Change to make ioremap be in sync with other PAT APIs and use the same macro as others. Also changes the error print to KERN_ERR instead of pr_debug. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/ioremap.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8a450930834f..aeaea8c5b2f4 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -228,24 +228,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { - pr_debug("Warning: reserve_memtype returned %d\n", retval); + printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval); return NULL; } if (prot_val != new_prot_val) { - /* - * Do not fallback to certain memory types with certain - * requested type: - * - request is uc-, return cannot be write-back - * - request is uc-, return cannot be write-combine - * - request is write-combine, return cannot be write-back - */ - if ((prot_val == _PAGE_CACHE_UC_MINUS && - (new_prot_val == _PAGE_CACHE_WB || - new_prot_val == _PAGE_CACHE_WC)) || - (prot_val == _PAGE_CACHE_WC && - new_prot_val == _PAGE_CACHE_WB)) { - pr_debug( + if (!is_new_memtype_allowed(prot_val, new_prot_val)) { + printk(KERN_ERR "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n", (unsigned long long)phys_addr, (unsigned long long)(phys_addr + size), From 9fd126bc742f74a95d2ba610247712ff05da02fe Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:34 -0700 Subject: [PATCH 0215/1662] x86, pat: New i/f for driver to request memtype for IO regions Add new routines to request memtype for IO regions. This will currently be a backend for io_mapping_* routines. But, it can also be made available to drivers directly in future, in case it is needed. reserve interface reserves the memory, makes sure we have a compatible memory type available and keeps the identity map in sync when needed. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pat.h | 5 ++++ arch/x86/mm/pat.c | 49 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 7af14e512f97..e2c1668dde7a 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -19,4 +19,9 @@ extern int free_memtype(u64 start, u64 end); extern int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flag); +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type); + +void io_free_memtype(resource_size_t start, resource_size_t end); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d5af2792d2fd..82d097ce3091 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -498,6 +498,55 @@ int free_memtype(u64 start, u64 end) } +/** + * io_reserve_memtype - Request a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + * @type: A pointer to memtype, with requested type. On success, requested + * or any other compatible type that was available for the region is returned + * + * On success, returns 0 + * On failure, returns non-zero + */ +int io_reserve_memtype(resource_size_t start, resource_size_t end, + unsigned long *type) +{ + unsigned long req_type = *type; + unsigned long new_type; + int ret; + + WARN_ON_ONCE(iomem_map_sanity_check(start, end - start)); + + ret = reserve_memtype(start, end, req_type, &new_type); + if (ret) + goto out_err; + + if (!is_new_memtype_allowed(req_type, new_type)) + goto out_free; + + if (kernel_map_sync_memtype(start, end - start, new_type) < 0) + goto out_free; + + *type = new_type; + return 0; + +out_free: + free_memtype(start, end); + ret = -EBUSY; +out_err: + return ret; +} + +/** + * io_free_memtype - Release a memory type mapping for a region of memory + * @start: start (physical address) of the region + * @end: end (physical address) of the region + */ +void io_free_memtype(resource_size_t start, resource_size_t end) +{ + free_memtype(start, end); +} + pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { From 9e36fda0b359d2a6ae039c3d7e71a04502a77898 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:35 -0700 Subject: [PATCH 0216/1662] x86, pat: Add PAT reserve free to io_mapping* APIs io_mapping_* interfaces were added, mainly for graphics drivers. Make this interface go through the PAT reserve/free, instead of hardcoding WC mapping. This makes sure that there are no aliases due to unconditional WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/iomap.h | 9 ++++++--- arch/x86/mm/iomap_32.c | 27 +++++++++++++++++++++++++-- include/linux/io-mapping.h | 17 ++++++++++++----- 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 0e9fe1d9d971..f35eb45d6576 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -26,13 +26,16 @@ #include #include -int -is_io_mapping_possible(resource_size_t base, unsigned long size); - void * iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); +int +iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); + +void +iomap_free(resource_size_t base, unsigned long size); + #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index fe6f84ca121e..84e236ce76ba 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -21,7 +21,7 @@ #include #include -int is_io_mapping_possible(resource_size_t base, unsigned long size) +static int is_io_mapping_possible(resource_size_t base, unsigned long size) { #if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT) /* There is no way to map greater than 1 << 32 address without PAE */ @@ -30,7 +30,30 @@ int is_io_mapping_possible(resource_size_t base, unsigned long size) #endif return 1; } -EXPORT_SYMBOL_GPL(is_io_mapping_possible); + +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) +{ + unsigned long flag = _PAGE_CACHE_WC; + int ret; + + if (!is_io_mapping_possible(base, size)) + return -EINVAL; + + ret = io_reserve_memtype(base, base + size, &flag); + if (ret) + return ret; + + *prot = __pgprot(__PAGE_KERNEL | flag); + return 0; +} +EXPORT_SYMBOL_GPL(iomap_create_wc); + +void +iomap_free(resource_size_t base, unsigned long size) +{ + io_free_memtype(base, base + size); +} +EXPORT_SYMBOL_GPL(iomap_free); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) { diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 0adb0f91568c..97eb928b4924 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -49,23 +49,30 @@ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, unsigned long size) { struct io_mapping *iomap; - - if (!is_io_mapping_possible(base, size)) - return NULL; + pgprot_t prot; iomap = kmalloc(sizeof(*iomap), GFP_KERNEL); if (!iomap) - return NULL; + goto out_err; + + if (iomap_create_wc(base, size, &prot)) + goto out_free; iomap->base = base; iomap->size = size; - iomap->prot = pgprot_writecombine(__pgprot(__PAGE_KERNEL)); + iomap->prot = prot; return iomap; + +out_free: + kfree(iomap); +out_err: + return NULL; } static inline void io_mapping_free(struct io_mapping *mapping) { + iomap_free(mapping->base, mapping->size); kfree(mapping); } From 335ef896d4c6639849d79367f0fef9abc06d121b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:36 -0700 Subject: [PATCH 0217/1662] x86, pat: Add rbtree to do quick lookup in memtype tracking PAT memtype tracking uses a linear link list to keep track of IO (non-RAM) regions and their memtypes. The code used a last_accessed pointer as a cache to speedup the lookup. As per discussions with H. Peter Anvin a while back, having a rbtree here will avoid bad performances in pathological cases where we may end up with huge linked list. This may not add any noticable performance speedup in normal case as the number of entires in PAT memtype list tend to be ~20-30 range. The patch removes the "cached_entry" logic as with rbtree we have more generic way of speeding up the lookup. With this patch, we use rbtree to do the quick lookup. We still use linked list as the memtype range tracked can be of different sizes and can overlap in different ways. We also keep track of usage counts with linked list. Example: Multiple ioremaps with different sizes uncached-minus @ 0xfffff00000-0xfffff04000 uncached-minus @ 0xfffff02000-0xfffff03000 And one userlevel mmap and the thread forks a new process uncached-minus @ 0xbf453000-0xbf454000 uncached-minus @ 0xbf453000-0xbf454000 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 108 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 87 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 82d097ce3091..c90f2420f56c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -148,11 +149,10 @@ static char *cattr_name(unsigned long flags) * areas). All the aliases have the same cache attributes of course. * Zero attributes are represented as holes. * - * Currently the data structure is a list because the number of mappings - * are expected to be relatively small. If this should be a problem - * it could be changed to a rbtree or similar. + * The data structure is a list that is also organized as an rbtree + * sorted on the start address of memtype range. * - * memtype_lock protects the whole list. + * memtype_lock protects both the linear list and rbtree. */ struct memtype { @@ -160,11 +160,53 @@ struct memtype { u64 end; unsigned long type; struct list_head nd; + struct rb_node rb; }; +static struct rb_root memtype_rbroot = RB_ROOT; static LIST_HEAD(memtype_list); static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static struct memtype *memtype_rb_search(struct rb_root *root, u64 start) +{ + struct rb_node *node = root->rb_node; + struct memtype *last_lower = NULL; + + while (node) { + struct memtype *data = container_of(node, struct memtype, rb); + + if (data->start < start) { + last_lower = data; + node = node->rb_right; + } else if (data->start > start) { + node = node->rb_left; + } else + return data; + } + + /* Will return NULL if there is no entry with its start <= start */ + return last_lower; +} + +static void memtype_rb_insert(struct rb_root *root, struct memtype *data) +{ + struct rb_node **new = &(root->rb_node); + struct rb_node *parent = NULL; + + while (*new) { + struct memtype *this = container_of(*new, struct memtype, rb); + + parent = *new; + if (data->start <= this->start) + new = &((*new)->rb_left); + else if (data->start > this->start) + new = &((*new)->rb_right); + } + + rb_link_node(&data->rb, parent, new); + rb_insert_color(&data->rb, root); +} + /* * Does intersection of PAT memory type and MTRR memory type and returns * the resulting memory type as PAT understands it. @@ -218,9 +260,6 @@ chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) return -EBUSY; } -static struct memtype *cached_entry; -static u64 cached_start; - static int pat_pagerange_is_ram(unsigned long start, unsigned long end) { int ram_page = 0, not_rampage = 0; @@ -382,17 +421,19 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); - if (cached_entry && start >= cached_start) - entry = cached_entry; - else + entry = memtype_rb_search(&memtype_rbroot, new->start); + if (likely(entry != NULL)) { + /* To work correctly with list_for_each_entry_continue */ + entry = list_entry(entry->nd.prev, struct memtype, nd); + } else { entry = list_entry(&memtype_list, struct memtype, nd); + } /* Search for existing mapping that overlaps the current range */ where = NULL; list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; - cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -400,8 +441,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; - cached_entry = list_entry(where, - struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -409,8 +448,6 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - cached_entry = list_entry(entry->nd.prev, - struct memtype, nd); /* * Move to right position in the linked @@ -438,13 +475,13 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } - cached_start = start; - if (where) list_add(&new->nd, where); else list_add_tail(&new->nd, &memtype_list); + memtype_rb_insert(&memtype_rbroot, new); + spin_unlock(&memtype_lock); dprintk("reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", @@ -456,7 +493,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, int free_memtype(u64 start, u64 end) { - struct memtype *entry; + struct memtype *entry, *saved_entry; int err = -EINVAL; int is_range_ram; @@ -474,17 +511,46 @@ int free_memtype(u64 start, u64 end) return -EINVAL; spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, start); + if (unlikely(entry == NULL)) + goto unlock_ret; + + /* + * Saved entry points to an entry with start same or less than what + * we searched for. Now go through the list in both directions to look + * for the entry that matches with both start and end, with list stored + * in sorted start address + */ + saved_entry = entry; list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { - if (cached_entry == entry || cached_start == start) - cached_entry = NULL; - + rb_erase(&entry->rb, &memtype_rbroot); list_del(&entry->nd); kfree(entry); err = 0; break; + } else if (entry->start > start) { + break; } } + + if (!err) + goto unlock_ret; + + entry = saved_entry; + list_for_each_entry_reverse(entry, &memtype_list, nd) { + if (entry->start == start && entry->end == end) { + rb_erase(&entry->rb, &memtype_rbroot); + list_del(&entry->nd); + kfree(entry); + err = 0; + break; + } else if (entry->start < start) { + break; + } + } +unlock_ret: spin_unlock(&memtype_lock); if (err) { From 46cf98cdaef5471926010b5bddf84c44ec177fdd Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:37 -0700 Subject: [PATCH 0218/1662] x86, pat: Generalize the use of page flag PG_uncached Only IA64 was using PG_uncached as of now. We now intend to use this bit in x86 as well, to keep track of memory type of those addresses that have page struct for them. So, generalize the use of that bit across ia64 and x86. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/ia64/Kconfig | 4 ++++ arch/x86/Kconfig | 4 ++++ include/linux/page-flags.h | 4 ++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 170042b420d4..e6246119932a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -112,6 +112,10 @@ config IA64_UNCACHED_ALLOCATOR bool select GENERIC_ALLOCATOR +config ARCH_USES_PG_UNCACHED + def_bool y + depends on IA64_UNCACHED_ALLOCATOR + config AUDIT_ARCH bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c07f72205909..8e1595382196 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1414,6 +1414,10 @@ config X86_PAT If unsure, say Y. +config ARCH_USES_PG_UNCACHED + def_bool y + depends on X86_PAT + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e2e5ce543595..2b87acfc5f87 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,7 +99,7 @@ enum pageflags { #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif __NR_PAGEFLAGS, @@ -257,7 +257,7 @@ PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) TESTCLEARFLAG_FALSE(Mlocked) #endif -#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR +#ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached) #else PAGEFLAG_FALSE(Uncached) From f58417409603d62f2eb23db4d2cf6853d84a1698 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:38 -0700 Subject: [PATCH 0219/1662] x86, pat: Use page flags to track memtypes of RAM pages Change reserve_ram_pages_type and free_ram_pages_type to use 2 page flags to track UC_MINUS, WC, WB and default types. Previous RAM tracking just tracked WB or NonWB, which was not complete and did not allow tracking of RAM fully and there was no way to get the actual type reserved by looking at the page flags. We use the memtype_lock spinlock for atomicity in dealing with memtype tracking in struct page. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cacheflush.h | 54 +++++++++++++++++- arch/x86/mm/pat.c | 95 +++++++++++++++++-------------- 2 files changed, 104 insertions(+), 45 deletions(-) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index e55dfc1ad453..b54f6afe7ec4 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -43,8 +43,58 @@ static inline void copy_from_user_page(struct vm_area_struct *vma, memcpy(dst, src, len); } -#define PG_non_WB PG_arch_1 -PAGEFLAG(NonWB, non_WB) +#define PG_WC PG_arch_1 +PAGEFLAG(WC, WC) + +#ifdef CONFIG_X86_PAT +/* + * X86 PAT uses page flags WC and Uncached together to keep track of + * memory type of pages that have backing page struct. X86 PAT supports 3 + * different memory types, _PAGE_CACHE_WB, _PAGE_CACHE_WC and + * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not + * been changed from its default (value of -1 used to denote this). + * Note we do not support _PAGE_CACHE_UC here. + * + * Caller must hold memtype_lock for atomicity. + */ +static inline unsigned long get_page_memtype(struct page *pg) +{ + if (!PageUncached(pg) && !PageWC(pg)) + return -1; + else if (!PageUncached(pg) && PageWC(pg)) + return _PAGE_CACHE_WC; + else if (PageUncached(pg) && !PageWC(pg)) + return _PAGE_CACHE_UC_MINUS; + else + return _PAGE_CACHE_WB; +} + +static inline void set_page_memtype(struct page *pg, unsigned long memtype) +{ + switch (memtype) { + case _PAGE_CACHE_WC: + ClearPageUncached(pg); + SetPageWC(pg); + break; + case _PAGE_CACHE_UC_MINUS: + SetPageUncached(pg); + ClearPageWC(pg); + break; + case _PAGE_CACHE_WB: + SetPageUncached(pg); + SetPageWC(pg); + break; + default: + case -1: + ClearPageUncached(pg); + ClearPageWC(pg); + break; + } +} +#else +static inline unsigned long get_page_memtype(struct page *pg) { return -1; } +static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } +#endif /* * The set_memory_* API can be used to change various attributes of a virtual diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c90f2420f56c..1a9d0f07593f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -288,63 +288,61 @@ static int pat_pagerange_is_ram(unsigned long start, unsigned long end) } /* - * For RAM pages, mark the pages as non WB memory type using - * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or - * set_memory_wc() on a RAM page at a time before marking it as WB again. - * This is ok, because only one driver will be owning the page and - * doing set_memory_*() calls. + * For RAM pages, we use page flags to mark the pages with appropriate type. + * Here we do two pass: + * - Find the memtype of all the pages in the range, look for any conflicts + * - In case of no conflicts, set the new memtype for pages in the range * - * For now, we use PageNonWB to track that the RAM page is being mapped - * as non WB. In future, we will have to use one more flag - * (or some other mechanism in page_struct) to distinguish between - * UC and WC mapping. + * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; + + if (req_type == _PAGE_CACHE_UC) { + /* We do not support strong UC */ + WARN_ON_ONCE(1); + req_type = _PAGE_CACHE_UC_MINUS; + } + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + unsigned long type; + + page = pfn_to_page(pfn); + type = get_page_memtype(page); + if (type != -1) { + printk(KERN_INFO "reserve_ram_pages_type failed " + "0x%Lx-0x%Lx, track 0x%lx, req 0x%lx\n", + start, end, type, req_type); + if (new_type) + *new_type = type; + + return -EBUSY; + } + } + + if (new_type) + *new_type = req_type; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || PageNonWB(page)) - goto out; - - SetPageNonWB(page); + set_page_memtype(page, req_type); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - ClearPageNonWB(page); - } - - return -EINVAL; } static int free_ram_pages_type(u64 start, u64 end) { struct page *page; - u64 pfn, end_pfn; + u64 pfn; for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { page = pfn_to_page(pfn); - if (page_mapped(page) || !PageNonWB(page)) - goto out; - - ClearPageNonWB(page); + set_page_memtype(page, -1); } return 0; - -out: - end_pfn = pfn; - for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { - page = pfn_to_page(pfn); - SetPageNonWB(page); - } - return -EINVAL; } /* @@ -405,11 +403,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, *new_type = actual_type; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return reserve_ram_pages_type(start, end, req_type, - new_type); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = reserve_ram_pages_type(start, end, req_type, new_type); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) @@ -505,10 +508,16 @@ int free_memtype(u64 start, u64 end) return 0; is_range_ram = pat_pagerange_is_ram(start, end); - if (is_range_ram == 1) - return free_ram_pages_type(start, end); - else if (is_range_ram < 0) + if (is_range_ram == 1) { + + spin_lock(&memtype_lock); + err = free_ram_pages_type(start, end); + spin_unlock(&memtype_lock); + + return err; + } else if (is_range_ram < 0) { return -EINVAL; + } spin_lock(&memtype_lock); From 637b86e75f4c255a4446bc0b67ce9d914b9d2d42 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:39 -0700 Subject: [PATCH 0220/1662] x86, pat: Add lookup_memtype to get the current memtype of a paddr Add a new routine lookup_memtype() to get the current memtype based on the PAT reserves and frees. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1a9d0f07593f..71aa6f7246c6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -573,6 +573,51 @@ int free_memtype(u64 start, u64 end) } +/** + * lookup_memtype - Looksup the memory type for a physical address + * @paddr: physical address of which memory type needs to be looked up + * + * Only to be called when PAT is enabled + * + * Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or + * _PAGE_CACHE_UC + */ +static unsigned long lookup_memtype(u64 paddr) +{ + int rettype = _PAGE_CACHE_WB; + struct memtype *entry; + + if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + return rettype; + + if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { + struct page *page; + spin_lock(&memtype_lock); + page = pfn_to_page(paddr >> PAGE_SHIFT); + rettype = get_page_memtype(page); + spin_unlock(&memtype_lock); + /* + * -1 from get_page_memtype() implies RAM page is in its + * default state and not reserved, and hence of type WB + */ + if (rettype == -1) + rettype = _PAGE_CACHE_WB; + + return rettype; + } + + spin_lock(&memtype_lock); + + entry = memtype_rb_search(&memtype_rbroot, paddr); + if (entry != NULL) + rettype = entry->type; + else + rettype = _PAGE_CACHE_UC_MINUS; + + spin_unlock(&memtype_lock); + return rettype; +} + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region From 1087637616dd5e96d834164ea462aed6159d039b Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:40 -0700 Subject: [PATCH 0221/1662] x86, pat: Lookup the protection from memtype list on vm_insert_pfn() Lookup the reserved memtype during vm_insert_pfn and use that memtype for the new mapping. This takes care or handling of vm_insert_pfn() interface in track_pfn_vma*/untrack_pfn_vma. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 71aa6f7246c6..b629f75f73de 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -848,11 +848,6 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) unsigned long vma_size = vma->vm_end - vma->vm_start; pgprot_t pgprot; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* * reserve the whole chunk covered by vma. We need the @@ -880,20 +875,24 @@ int track_pfn_vma_copy(struct vm_area_struct *vma) int track_pfn_vma_new(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long size) { + unsigned long flags; resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* reserve the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; return reserve_pfn_range(paddr, vma_size, prot, 0); } + if (!pat_enabled) + return 0; + + /* for vm_insert_pfn and friends, we set prot based on lookup */ + flags = lookup_memtype(pfn << PAGE_SHIFT); + *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | + flags); + return 0; } @@ -908,11 +907,6 @@ void untrack_pfn_vma(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long vma_size = vma->vm_end - vma->vm_start; - /* - * For now, only handle remap_pfn_range() vmas where - * is_linear_pfn_mapping() == TRUE. Handling of - * vm_insert_pfn() is TBD. - */ if (is_linear_pfn_mapping(vma)) { /* free the whole chunk starting from vm_pgoff */ paddr = (resource_size_t)vma->vm_pgoff << PAGE_SHIFT; From d886c73cd4cf02a71e1650cbcb6176799d78aac1 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Fri, 10 Jul 2009 09:57:41 -0700 Subject: [PATCH 0222/1662] x86, pat: Sanity check remap_pfn_range for RAM region Add sanity check for remap_pfn_range of RAM regions using lookup_memtype(). Previously, we did not have anyway to get the type of RAM memory regions as they were tracked using a single bit in page_struct (WB, nonWB). Now we can get the actual type from page struct (WB, WC, UC_MINUS) and make sure the requester gets that type. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index b629f75f73de..a6cace0694a2 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -783,11 +783,29 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot, is_ram = pat_pagerange_is_ram(paddr, paddr + size); /* - * reserve_pfn_range() doesn't support RAM pages. Maintain the current - * behavior with RAM pages by returning success. + * reserve_pfn_range() for RAM pages. We do not refcount to keep + * track of number of mappings of RAM pages. We can assert that + * the type requested matches the type of first page in the range. */ - if (is_ram != 0) + if (is_ram) { + if (!pat_enabled) + return 0; + + flags = lookup_memtype(paddr); + if (want_flags != flags) { + printk(KERN_WARNING + "%s:%d map pfn RAM range req %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + (unsigned long long)paddr, + (unsigned long long)(paddr + size), + cattr_name(flags)); + *vma_prot = __pgprot((pgprot_val(*vma_prot) & + (~_PAGE_CACHE_MASK)) | + flags); + } return 0; + } ret = reserve_memtype(paddr, paddr + size, want_flags, &flags); if (ret) From ea3cc330ac0cd521ff07c7cd432a1848c19a7e92 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 18 Aug 2009 19:00:34 +0000 Subject: [PATCH 0223/1662] powerpc/mm: Cleanup handling of execute permission This is an attempt at cleaning up a bit the way we handle execute permission on powerpc. _PAGE_HWEXEC is gone, _PAGE_EXEC is now only defined by CPUs that can do something with it, and the myriad of #ifdef's in the I$/D$ coherency code is reduced to 2 cases that hopefully should cover everything. The logic on BookE is a little bit different than what it was though not by much. Since now, _PAGE_EXEC will be set by the generic code for executable pages, we need to filter out if they are unclean and recover it. However, I don't expect the code to be more bloated than it already was in that area due to that change. I could boast that this brings proper enforcing of per-page execute permissions to all BookE and 40x but in fact, we've had that now for some time as a side effect of my previous rework in that area (and I didn't even know it :-) We would only enable execute permission if the page was cache clean and we would only cache clean it if we took and exec fault. Since we now enforce that the later only work if VM_EXEC is part of the VMA flags, we de-fact already enforce per-page execute permissions... Unless I missed something Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pgtable-ppc32.h | 7 +- arch/powerpc/include/asm/pgtable-ppc64.h | 3 +- arch/powerpc/include/asm/pte-40x.h | 2 +- arch/powerpc/include/asm/pte-44x.h | 2 +- arch/powerpc/include/asm/pte-8xx.h | 1 - arch/powerpc/include/asm/pte-book3e.h | 13 +- arch/powerpc/include/asm/pte-common.h | 22 +-- arch/powerpc/include/asm/pte-fsl-booke.h | 2 +- arch/powerpc/include/asm/pte-hash32.h | 1 - arch/powerpc/kernel/head_44x.S | 2 +- arch/powerpc/kernel/head_fsl_booke.S | 4 +- arch/powerpc/mm/40x_mmu.c | 4 +- arch/powerpc/mm/pgtable.c | 167 ++++++++++++++++------- arch/powerpc/mm/pgtable_32.c | 2 +- arch/powerpc/mm/tlb_low_64e.S | 4 +- 15 files changed, 149 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc32.h b/arch/powerpc/include/asm/pgtable-ppc32.h index c9ff9d75990e..f2c52e253956 100644 --- a/arch/powerpc/include/asm/pgtable-ppc32.h +++ b/arch/powerpc/include/asm/pgtable-ppc32.h @@ -186,7 +186,7 @@ static inline unsigned long pte_update(pte_t *p, #endif /* !PTE_ATOMIC_UPDATES */ #ifdef CONFIG_44x - if ((old & _PAGE_USER) && (old & _PAGE_HWEXEC)) + if ((old & _PAGE_USER) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; #endif return old; @@ -217,7 +217,7 @@ static inline unsigned long long pte_update(pte_t *p, #endif /* !PTE_ATOMIC_UPDATES */ #ifdef CONFIG_44x - if ((old & _PAGE_USER) && (old & _PAGE_HWEXEC)) + if ((old & _PAGE_USER) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; #endif return old; @@ -267,8 +267,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | - _PAGE_HWEXEC | _PAGE_EXEC); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); pte_update(ptep, 0, bits); } diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 200ec2dfa034..806abe7a3fa5 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -313,8 +313,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) { unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | - _PAGE_EXEC | _PAGE_HWEXEC); + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); #ifdef PTE_ATOMIC_UPDATES unsigned long old, tmp; diff --git a/arch/powerpc/include/asm/pte-40x.h b/arch/powerpc/include/asm/pte-40x.h index 07630faae029..6c3e1f4378d4 100644 --- a/arch/powerpc/include/asm/pte-40x.h +++ b/arch/powerpc/include/asm/pte-40x.h @@ -46,7 +46,7 @@ #define _PAGE_RW 0x040 /* software: Writes permitted */ #define _PAGE_DIRTY 0x080 /* software: dirty page */ #define _PAGE_HWWRITE 0x100 /* hardware: Dirty & RW, set in exception */ -#define _PAGE_HWEXEC 0x200 /* hardware: EX permission */ +#define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ #define _PMD_PRESENT 0x400 /* PMD points to page of PTEs */ diff --git a/arch/powerpc/include/asm/pte-44x.h b/arch/powerpc/include/asm/pte-44x.h index 37e98bcf83e0..4192b9bad901 100644 --- a/arch/powerpc/include/asm/pte-44x.h +++ b/arch/powerpc/include/asm/pte-44x.h @@ -78,7 +78,7 @@ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ #define _PAGE_RW 0x00000002 /* S: Write permission */ #define _PAGE_FILE 0x00000004 /* S: nonlinear file mapping */ -#define _PAGE_HWEXEC 0x00000004 /* H: Execute permission */ +#define _PAGE_EXEC 0x00000004 /* H: Execute permission */ #define _PAGE_ACCESSED 0x00000008 /* S: Page referenced */ #define _PAGE_DIRTY 0x00000010 /* S: Page dirty */ #define _PAGE_SPECIAL 0x00000020 /* S: Special page */ diff --git a/arch/powerpc/include/asm/pte-8xx.h b/arch/powerpc/include/asm/pte-8xx.h index 8c6e31251034..94e979718dcf 100644 --- a/arch/powerpc/include/asm/pte-8xx.h +++ b/arch/powerpc/include/asm/pte-8xx.h @@ -36,7 +36,6 @@ /* These five software bits must be masked out when the entry is loaded * into the TLB. */ -#define _PAGE_EXEC 0x0008 /* software: i-cache coherency required */ #define _PAGE_GUARDED 0x0010 /* software: guarded access */ #define _PAGE_DIRTY 0x0020 /* software: page changed */ #define _PAGE_RW 0x0040 /* software: user write access allowed */ diff --git a/arch/powerpc/include/asm/pte-book3e.h b/arch/powerpc/include/asm/pte-book3e.h index 1d27c77d7704..9800565aebb8 100644 --- a/arch/powerpc/include/asm/pte-book3e.h +++ b/arch/powerpc/include/asm/pte-book3e.h @@ -37,12 +37,13 @@ #define _PAGE_WRITETHRU 0x800000 /* W: cache write-through */ /* "Higher level" linux bit combinations */ -#define _PAGE_EXEC _PAGE_BAP_SX /* Can be executed from potentially */ -#define _PAGE_HWEXEC _PAGE_BAP_UX /* .. and was cache cleaned */ -#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ -#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) -#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) -#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ +#define _PAGE_EXEC _PAGE_BAP_UX /* .. and was cache cleaned */ +#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) +#define _PAGE_KERNEL_RO (_PAGE_BAP_SR) +#define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) +#define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) +#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ #define _PAGE_HASHPTE 0 #define _PAGE_BUSY 0 diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 8bb6464ba619..c3b65076a263 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -13,9 +13,6 @@ #ifndef _PAGE_HWWRITE #define _PAGE_HWWRITE 0 #endif -#ifndef _PAGE_HWEXEC -#define _PAGE_HWEXEC 0 -#endif #ifndef _PAGE_EXEC #define _PAGE_EXEC 0 #endif @@ -48,10 +45,16 @@ #define PMD_PAGE_SIZE(pmd) bad_call_to_PMD_PAGE_SIZE() #endif #ifndef _PAGE_KERNEL_RO -#define _PAGE_KERNEL_RO 0 +#define _PAGE_KERNEL_RO 0 +#endif +#ifndef _PAGE_KERNEL_ROX +#define _PAGE_KERNEL_ROX (_PAGE_EXEC) #endif #ifndef _PAGE_KERNEL_RW -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE) +#endif +#ifndef _PAGE_KERNEL_RWX +#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_HWWRITE | _PAGE_EXEC) #endif #ifndef _PAGE_HPTEFLAGS #define _PAGE_HPTEFLAGS _PAGE_HASHPTE @@ -96,8 +99,7 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ _PAGE_WRITETHRU | _PAGE_ENDIAN | _PAGE_4K_PFN | \ _PAGE_USER | _PAGE_ACCESSED | \ - _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | \ - _PAGE_EXEC | _PAGE_HWEXEC) + _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) /* * We define 2 sets of base prot bits, one for basic pages (ie, @@ -154,11 +156,9 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); _PAGE_NO_CACHE) #define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ _PAGE_NO_CACHE | _PAGE_GUARDED) -#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW | _PAGE_EXEC | \ - _PAGE_HWEXEC) +#define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO | _PAGE_EXEC | \ - _PAGE_HWEXEC) +#define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) /* Protection used for kernel text. We want the debuggers to be able to * set breakpoints anywhere, so don't write protect the kernel text diff --git a/arch/powerpc/include/asm/pte-fsl-booke.h b/arch/powerpc/include/asm/pte-fsl-booke.h index 10820f58acf5..ce8a9e94ce7f 100644 --- a/arch/powerpc/include/asm/pte-fsl-booke.h +++ b/arch/powerpc/include/asm/pte-fsl-booke.h @@ -23,7 +23,7 @@ #define _PAGE_FILE 0x00002 /* S: when !present: nonlinear file mapping */ #define _PAGE_RW 0x00004 /* S: Write permission (SW) */ #define _PAGE_DIRTY 0x00008 /* S: Page dirty */ -#define _PAGE_HWEXEC 0x00010 /* H: SX permission */ +#define _PAGE_EXEC 0x00010 /* H: SX permission */ #define _PAGE_ACCESSED 0x00020 /* S: Page referenced */ #define _PAGE_ENDIAN 0x00040 /* H: E bit */ diff --git a/arch/powerpc/include/asm/pte-hash32.h b/arch/powerpc/include/asm/pte-hash32.h index 16e571c7f9ef..4aad4132d0a8 100644 --- a/arch/powerpc/include/asm/pte-hash32.h +++ b/arch/powerpc/include/asm/pte-hash32.h @@ -26,7 +26,6 @@ #define _PAGE_WRITETHRU 0x040 /* W: cache write-through */ #define _PAGE_DIRTY 0x080 /* C: page changed */ #define _PAGE_ACCESSED 0x100 /* R: page referenced */ -#define _PAGE_EXEC 0x200 /* software: i-cache coherency required */ #define _PAGE_RW 0x400 /* software: user write access allowed */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 656cfb2d6666..711368b993f2 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -497,7 +497,7 @@ tlb_44x_patch_hwater_D: mtspr SPRN_MMUCR,r12 /* Make up the required permissions */ - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC /* Compute pgdir/pmd offset */ rlwinm r12, r10, PPC44x_PGD_OFF_SHIFT, PPC44x_PGD_OFF_MASK_BIT, 29 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index eca80482ae72..2c5af5256479 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -643,7 +643,7 @@ interrupt_base: 4: /* Make up the required permissions */ - li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_HWEXEC + li r13,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC FIND_PTE andc. r13,r13,r11 /* Check permission */ @@ -742,7 +742,7 @@ finish_tlb_load: #endif mtspr SPRN_MAS2, r12 - li r10, (_PAGE_HWEXEC | _PAGE_PRESENT) + li r10, (_PAGE_EXEC | _PAGE_PRESENT) rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ diff --git a/arch/powerpc/mm/40x_mmu.c b/arch/powerpc/mm/40x_mmu.c index 29954dc28942..f5e7b9ce63dd 100644 --- a/arch/powerpc/mm/40x_mmu.c +++ b/arch/powerpc/mm/40x_mmu.c @@ -105,7 +105,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_16M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_16M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp++) = val; @@ -120,7 +120,7 @@ unsigned long __init mmu_mapin_ram(void) while (s >= LARGE_PAGE_SIZE_4M) { pmd_t *pmdp; - unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; + unsigned long val = p | _PMD_SIZE_4M | _PAGE_EXEC | _PAGE_HWWRITE; pmdp = pmd_offset(pud_offset(pgd_offset_k(v), v), v); pmd_val(*pmdp) = val; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index b6b32487e740..83f1551ec2c9 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -128,28 +128,6 @@ void pte_free_finish(void) #endif /* CONFIG_SMP */ -/* - * Handle i/d cache flushing, called from set_pte_at() or ptep_set_access_flags() - */ -static pte_t do_dcache_icache_coherency(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - struct page *page; - - if (unlikely(!pfn_valid(pfn))) - return pte; - page = pfn_to_page(pfn); - - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags)) { - pr_devel("do_dcache_icache_coherency... flushing\n"); - flush_dcache_icache_page(page); - set_bit(PG_arch_1, &page->flags); - } - else - pr_devel("do_dcache_icache_coherency... already clean\n"); - return __pte(pte_val(pte) | _PAGE_HWEXEC); -} - static inline int is_exec_fault(void) { return current->thread.regs && TRAP(current->thread.regs) == 0x400; @@ -157,49 +135,139 @@ static inline int is_exec_fault(void) /* We only try to do i/d cache coherency on stuff that looks like * reasonably "normal" PTEs. We currently require a PTE to be present - * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE + * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that + * on userspace PTEs */ static inline int pte_looks_normal(pte_t pte) { return (pte_val(pte) & - (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE)) == - (_PAGE_PRESENT); + (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) == + (_PAGE_PRESENT | _PAGE_USER); } -#if defined(CONFIG_PPC_STD_MMU) +struct page * maybe_pte_to_page(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + struct page *page; + + if (unlikely(!pfn_valid(pfn))) + return NULL; + page = pfn_to_page(pfn); + if (PageReserved(page)) + return NULL; + return page; +} + +#if defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 + /* Server-style MMU handles coherency when hashing if HW exec permission - * is supposed per page (currently 64-bit only). Else, we always flush - * valid PTEs in set_pte. + * is supposed per page (currently 64-bit only). If not, then, we always + * flush the cache for valid PTEs in set_pte. Embedded CPU without HW exec + * support falls into the same category. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_pte_filter(pte_t pte) { - return set_pte && pte_looks_normal(pte) && - !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE)); + pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); + if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { + struct page *pg = maybe_pte_to_page(pte); + if (!pg) + return pte; + if (!test_bit(PG_arch_1, &pg->flags)) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + } + } + return pte; } -#elif _PAGE_HWEXEC == 0 -/* Embedded type MMU without HW exec support (8xx only so far), we flush - * the cache for any present PTE + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) +{ + return pte; +} + +#else /* defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0 */ + +/* Embedded type MMU with HW exec support. This is a bit more complicated + * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so + * instead we "filter out" the exec permission for non clean pages. */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) +static pte_t set_pte_filter(pte_t pte) { - return set_pte && pte_looks_normal(pte); + struct page *pg; + + /* No exec permission in the first place, move on */ + if (!(pte_val(pte) & _PAGE_EXEC) || !pte_looks_normal(pte)) + return pte; + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + return pte; + + /* If the page clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + return pte; + + /* If it's an exec fault, we flush the cache and make it clean */ + if (is_exec_fault()) { + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + return pte; + } + + /* Else, we filter out _PAGE_EXEC */ + return __pte(pte_val(pte) & ~_PAGE_EXEC); } -#else -/* Other embedded CPUs with HW exec support per-page, we flush on exec - * fault if HWEXEC is not set - */ -static inline int pte_need_exec_flush(pte_t pte, int set_pte) + +static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, + int dirty) { - return pte_looks_normal(pte) && is_exec_fault() && - !(pte_val(pte) & _PAGE_HWEXEC); + struct page *pg; + + /* So here, we only care about exec faults, as we use them + * to recover lost _PAGE_EXEC and perform I$/D$ coherency + * if necessary. Also if _PAGE_EXEC is already set, same deal, + * we just bail out + */ + if (dirty || (pte_val(pte) & _PAGE_EXEC) || !is_exec_fault()) + return pte; + +#ifdef CONFIG_DEBUG_VM + /* So this is an exec fault, _PAGE_EXEC is not set. If it was + * an error we would have bailed out earlier in do_page_fault() + * but let's make sure of it + */ + if (WARN_ON(!(vma->vm_flags & VM_EXEC))) + return pte; +#endif /* CONFIG_DEBUG_VM */ + + /* If you set _PAGE_EXEC on weird pages you're on your own */ + pg = maybe_pte_to_page(pte); + if (unlikely(!pg)) + goto bail; + + /* If the page is already clean, we move on */ + if (test_bit(PG_arch_1, &pg->flags)) + goto bail; + + /* Clean the page and set PG_arch_1 */ + flush_dcache_icache_page(pg); + set_bit(PG_arch_1, &pg->flags); + + bail: + return __pte(pte_val(pte) | _PAGE_EXEC); } -#endif + +#endif /* !(defined(CONFIG_PPC_STD_MMU) || _PAGE_EXEC == 0) */ /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte) { #ifdef CONFIG_DEBUG_VM WARN_ON(pte_present(*ptep)); @@ -208,9 +276,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte * this context might not have been activated yet when this * is called. */ - pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_need_exec_flush(pte, 1)) - pte = do_dcache_icache_coherency(pte); + pte = set_pte_filter(pte); /* Perform the setting of the PTE */ __set_pte_at(mm, addr, ptep, pte, 0); @@ -227,8 +293,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed; - if (!dirty && pte_need_exec_flush(entry, 0)) - entry = do_dcache_icache_coherency(entry); + entry = set_access_flags_filter(entry, vma, dirty); changed = !pte_same(*(ptep), entry); if (changed) { if (!(vma->vm_flags & VM_HUGETLB)) diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 5422169626ba..cb96cb2e17cc 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -142,7 +142,7 @@ ioremap_flags(phys_addr_t addr, unsigned long size, unsigned long flags) flags |= _PAGE_DIRTY | _PAGE_HWWRITE; /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - flags &= ~(_PAGE_USER | _PAGE_EXEC | _PAGE_HWEXEC); + flags &= ~(_PAGE_USER | _PAGE_EXEC); return __ioremap_caller(addr, size, flags, __builtin_return_address(0)); } diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index 10d524ded7b2..cd92f62f9cf5 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -133,7 +133,7 @@ /* We do the user/kernel test for the PID here along with the RW test */ - li r11,_PAGE_PRESENT|_PAGE_HWEXEC /* Base perm */ + li r11,_PAGE_PRESENT|_PAGE_EXEC /* Base perm */ oris r11,r11,_PAGE_ACCESSED@h cmpldi cr0,r15,0 /* Check for user region */ @@ -256,7 +256,7 @@ normal_tlb_miss_done: normal_tlb_miss_access_fault: /* We need to check if it was an instruction miss */ - andi. r10,r11,_PAGE_HWEXEC + andi. r10,r11,_PAGE_EXEC bne 1f ld r14,EX_TLB_DEAR(r12) ld r15,EX_TLB_ESR(r12) From 14d757520a08d09745c3b18bb34addd9bef56e2d Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 19 Aug 2009 04:27:53 +0000 Subject: [PATCH 0224/1662] powerpc: Fix __flush_icache_range on 44x The ptrace POKETEXT interface allows a process to modify the text pages of a child process being ptraced, usually to insert breakpoints via trap instructions. The kernel eventually calls copy_to_user_page, which in turn calls __flush_icache_range to invalidate the icache lines for the child process. However, this function does not work on 44x due to the icache being virtually indexed. This was noticed by a breakpoint being triggered after it had been cleared by ltrace on a 440EPx board. The convenient solution is to do a flash invalidate of the icache in the __flush_icache_range function. Signed-off-by: Josh Boyer Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/misc_32.S | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 15f28e0de78d..da9c0c4c10f3 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -342,10 +342,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) addi r3,r3,L1_CACHE_BYTES bdnz 1b sync /* wait for dcbst's to get to ram */ +#ifndef CONFIG_44x mtctr r4 2: icbi 0,r6 addi r6,r6,L1_CACHE_BYTES bdnz 2b +#else + /* Flash invalidate on 44x because we are passed kmapped addresses and + this doesn't work for userspace pages due to the virtually tagged + icache. Sigh. */ + iccci 0, r0 +#endif sync /* additional sync needed on g4 */ isync blr From 6fdc31a2b86cf1f98e3eed896578ad9659eeb0f8 Mon Sep 17 00:00:00 2001 From: Bastian Blank Date: Wed, 12 Aug 2009 23:30:45 +0000 Subject: [PATCH 0225/1662] powerpc: Remove SMP warning from PowerMac cpufreq On Thu, Aug 13, 2009 at 04:14:58PM +1000, Benjamin Herrenschmidt wrote: > On Tue, 2009-08-11 at 11:39 +0200, Bastian Blank wrote: > > This patch just disables this driver on SMP kernels, as it is obviously > > not supported. > Why not remove the #error instead ? :-) I don't think it's still > meaningful, especially since we use the timebase for delays nowadays > which doesn't depend on the CPU frequency... Your call. Take this one: The build of a PowerMac 32bit kernel currently fails with error: #warning "WARNING, CPUFREQ not recommended on SMP kernels" Thie patch removes the not longer applicable SMP warning from the PowerMac cpufreq code. Signed-off-by: Bastian Blank Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/powermac/cpufreq_32.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/powerpc/platforms/powermac/cpufreq_32.c b/arch/powerpc/platforms/powermac/cpufreq_32.c index 65c585b8b00d..08d94e4cedd3 100644 --- a/arch/powerpc/platforms/powermac/cpufreq_32.c +++ b/arch/powerpc/platforms/powermac/cpufreq_32.c @@ -44,14 +44,6 @@ */ #undef DEBUG_FREQ -/* - * There is a problem with the core cpufreq code on SMP kernels, - * it won't recalculate the Bogomips properly - */ -#ifdef CONFIG_SMP -#warning "WARNING, CPUFREQ not recommended on SMP kernels" -#endif - extern void low_choose_7447a_dfs(int dfs); extern void low_choose_750fx_pll(int pll); extern void low_sleep_handler(void); From 6776426320e151051a16bc7bf86f12d310c9e8ca Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 23 Jun 2009 23:26:37 +0000 Subject: [PATCH 0226/1662] powerpc/pseries: Reduce the polling interval in __cpu_up() Time time taken for a single cpu online operation on a pseries machine is as follows: Dedicated LPAR (POWER6): ~220ms. Shared LPAR (POWER5) : ~240ms. Of this time, approximately 200ms is taken up by __cpu_up(). This is because we poll every 200ms to check if the new cpu has notified it's presence through the cpu_callin_map. We repeat this operation until the new cpu sets the value in cpu_callin_map or 5 seconds elapse, whichever comes earlier. However, using completion_structs instead of polling loops, the time taken by the new processor to indicate it's presence has found to be less than 1ms on pseries. This method however may not work on all powerpc platforms due to the time-base synchronization code. Keeping this in mind, we could reduce msleep polling interval from 200ms to 1ms while retaining the 5 second timeout. With this, the time taken for a cpu online operation changes as follows: Dedicated LPAR (POWER6): 20-25ms. Shared LPAR (POWER5) : 60-80ms. In both these cases, it was found that the code polls through the loop only once indicating that 1ms is a reasonable value, atleast on pseries. The code needs testing on other powerpc platforms. Signed-off-by: Gautham R Shenoy Acked-by: Joel Schopp Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/smp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 0b47de07302d..96f107cc0160 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -412,9 +412,8 @@ int __cpuinit __cpu_up(unsigned int cpu) * CPUs can take much longer to come up in the * hotplug case. Wait five seconds. */ - for (c = 25; c && !cpu_callin_map[cpu]; c--) { - msleep(200); - } + for (c = 5000; c && !cpu_callin_map[cpu]; c--) + msleep(1); #endif if (!cpu_callin_map[cpu]) { From 762afb7317b1987fa0851135fe4f2947f68c3c2a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:22 +0000 Subject: [PATCH 0227/1662] powerpc: Remove addr_needs_map in struct dma_mapping_ops This patch adds max_direct_dma_addr to struct dev_archdata to remove addr_needs_map in struct dma_mapping_ops. It also converts dma_capable() to use max_direct_dma_addr. max_direct_dma_addr is initialized in pci_dma_dev_setup_swiotlb(), called via ppc_md.pci_dma_dev_setup hook. For further information: http://marc.info/?t=124719060200001&r=1&w=2 Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 3 ++ arch/powerpc/include/asm/dma-mapping.h | 8 ++--- arch/powerpc/include/asm/swiotlb.h | 5 ++- arch/powerpc/kernel/dma-swiotlb.c | 36 +++++++++------------- arch/powerpc/platforms/85xx/mpc8536_ds.c | 1 + arch/powerpc/platforms/85xx/mpc85xx_ds.c | 1 + arch/powerpc/platforms/85xx/mpc85xx_mds.c | 1 + arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 1 + 8 files changed, 28 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 7d2277cef09a..0086f8d46f1c 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -16,6 +16,9 @@ struct dev_archdata { /* DMA operations on that device */ struct dma_mapping_ops *dma_ops; void *dma_data; +#ifdef CONFIG_SWIOTLB + dma_addr_t max_direct_dma_addr; +#endif }; static inline void dev_archdata_set_node(struct dev_archdata *ad, diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 0c34371ec49c..1765c379138a 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -87,8 +87,6 @@ struct dma_mapping_ops { dma_addr_t dma_address, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs); - int (*addr_needs_map)(struct device *dev, dma_addr_t addr, - size_t size); #ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS void (*sync_single_range_for_cpu)(struct device *hwdev, dma_addr_t dma_handle, unsigned long offset, @@ -426,10 +424,12 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) { - struct dma_mapping_ops *ops = get_dma_ops(dev); +#ifdef CONFIG_SWIOTLB + struct dev_archdata *sd = &dev->archdata; - if (ops->addr_needs_map && ops->addr_needs_map(dev, addr, size)) + if (sd->max_direct_dma_addr && addr + size > sd->max_direct_dma_addr) return 0; +#endif if (!dev->dma_mask) return 0; diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 30891d6e2bc1..31e0e43c880d 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -16,12 +16,11 @@ extern struct dma_mapping_ops swiotlb_dma_ops; extern struct dma_mapping_ops swiotlb_pci_dma_ops; -int swiotlb_arch_address_needs_mapping(struct device *, dma_addr_t, - size_t size); - static inline void dma_mark_clean(void *addr, size_t size) {} extern unsigned int ppc_swiotlb_enable; int __init swiotlb_setup_bus_notifier(void); +extern void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev); + #endif /* __ASM_SWIOTLB_H */ diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index e8a57de85bcf..c9f6a302e879 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -24,26 +24,6 @@ int swiotlb __read_mostly; unsigned int ppc_swiotlb_enable; -/* - * Determine if an address is reachable by a pci device, or if we must bounce. - */ -static int -swiotlb_pci_addr_needs_map(struct device *hwdev, dma_addr_t addr, size_t size) -{ - dma_addr_t max; - struct pci_controller *hose; - struct pci_dev *pdev = to_pci_dev(hwdev); - - hose = pci_bus_to_host(pdev->bus); - max = hose->dma_window_base_cur + hose->dma_window_size; - - /* check that we're within mapped pci window space */ - if ((addr + size > max) | (addr < hose->dma_window_base_cur)) - return 1; - - return 0; -} - /* * At the moment, all platforms that use this code only require * swiotlb to be used if we're operating on HIGHMEM. Since @@ -73,22 +53,36 @@ struct dma_mapping_ops swiotlb_pci_dma_ops = { .dma_supported = swiotlb_dma_supported, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, - .addr_needs_map = swiotlb_pci_addr_needs_map, .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device }; +void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) +{ + struct pci_controller *hose; + struct dev_archdata *sd; + + hose = pci_bus_to_host(pdev->bus); + sd = &pdev->dev.archdata; + sd->max_direct_dma_addr = + hose->dma_window_base_cur + hose->dma_window_size; +} + static int ppc_swiotlb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; + struct dev_archdata *sd; /* We are only intereted in device addition */ if (action != BUS_NOTIFY_ADD_DEVICE) return 0; + sd = &dev->archdata; + sd->max_direct_dma_addr = 0; + /* May need to bounce if the device can't address all of DRAM */ if (dma_get_mask(dev) < lmb_end_of_DRAM()) set_dma_ops(dev, &swiotlb_dma_ops); diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index 055ff417bae9..bf052c056106 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -97,6 +97,7 @@ static void __init mpc8536_ds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index 849c0ac0025f..c6f92ccd963b 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -193,6 +193,7 @@ static void __init mpc85xx_ds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 20a61d0af33b..25998b661f20 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -256,6 +256,7 @@ static void __init mpc85xx_mds_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif } diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 66327024a6a6..803230156875 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -106,6 +106,7 @@ mpc86xx_hpcn_setup_arch(void) if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; set_pci_dma_ops(&swiotlb_pci_dma_ops); + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif } From 3702977fa7d1a1a95caa387121fa7c9f4cae35f3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:23 +0000 Subject: [PATCH 0228/1662] powerpc: Remove swiotlb_pci_dma_ops Now swiotlb_pci_dma_ops is identical to swiotlb_dma_ops; we can use swiotlb_dma_ops with any devices. This removes swiotlb_pci_dma_ops. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/swiotlb.h | 1 - arch/powerpc/kernel/dma-swiotlb.c | 14 -------------- arch/powerpc/platforms/85xx/mpc8536_ds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_ds.c | 2 +- arch/powerpc/platforms/85xx/mpc85xx_mds.c | 2 +- arch/powerpc/platforms/86xx/mpc86xx_hpcn.c | 2 +- 6 files changed, 4 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 31e0e43c880d..21ce0a3b4941 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -14,7 +14,6 @@ #include extern struct dma_mapping_ops swiotlb_dma_ops; -extern struct dma_mapping_ops swiotlb_pci_dma_ops; static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index c9f6a302e879..ca141e108ae3 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -45,20 +45,6 @@ struct dma_mapping_ops swiotlb_dma_ops = { .sync_sg_for_device = swiotlb_sync_sg_for_device }; -struct dma_mapping_ops swiotlb_pci_dma_ops = { - .alloc_coherent = dma_direct_alloc_coherent, - .free_coherent = dma_direct_free_coherent, - .map_sg = swiotlb_map_sg_attrs, - .unmap_sg = swiotlb_unmap_sg_attrs, - .dma_supported = swiotlb_dma_supported, - .map_page = swiotlb_map_page, - .unmap_page = swiotlb_unmap_page, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, - .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device -}; - void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) { struct pci_controller *hose; diff --git a/arch/powerpc/platforms/85xx/mpc8536_ds.c b/arch/powerpc/platforms/85xx/mpc8536_ds.c index bf052c056106..004b7d36cdb7 100644 --- a/arch/powerpc/platforms/85xx/mpc8536_ds.c +++ b/arch/powerpc/platforms/85xx/mpc8536_ds.c @@ -96,7 +96,7 @@ static void __init mpc8536_ds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_ds.c b/arch/powerpc/platforms/85xx/mpc85xx_ds.c index c6f92ccd963b..544011a562fb 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_ds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_ds.c @@ -192,7 +192,7 @@ static void __init mpc85xx_ds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/85xx/mpc85xx_mds.c b/arch/powerpc/platforms/85xx/mpc85xx_mds.c index 25998b661f20..3909d57b86e3 100644 --- a/arch/powerpc/platforms/85xx/mpc85xx_mds.c +++ b/arch/powerpc/platforms/85xx/mpc85xx_mds.c @@ -255,7 +255,7 @@ static void __init mpc85xx_mds_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif diff --git a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c index 803230156875..2aa69a69bcc8 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_hpcn.c @@ -105,7 +105,7 @@ mpc86xx_hpcn_setup_arch(void) #ifdef CONFIG_SWIOTLB if (lmb_end_of_DRAM() > max) { ppc_swiotlb_enable = 1; - set_pci_dma_ops(&swiotlb_pci_dma_ops); + set_pci_dma_ops(&swiotlb_dma_ops); ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_swiotlb; } #endif From f726f30e32305a34a203ff975e60885aa7556c6a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:24 +0000 Subject: [PATCH 0229/1662] dma: Add set_dma_mask hook to struct dma_map_ops POWERPC needs this hook. SPARC could use it too. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- include/linux/dma-mapping.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index c0f6c3cd788c..91b761846061 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -58,6 +58,7 @@ struct dma_map_ops { enum dma_data_direction dir); int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); int (*dma_supported)(struct device *dev, u64 mask); + int (*set_dma_mask)(struct device *dev, u64 mask); int is_phys; }; From 45223c549273bbb2c6e1bc6e3629174e8765ad01 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:25 +0000 Subject: [PATCH 0230/1662] powerpc: use dma_map_ops struct This converts uses dma_map_ops struct (in include/linux/dma-mapping.h) instead of POWERPC homegrown dma_mapping_ops. Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/device.h | 4 +- arch/powerpc/include/asm/dma-mapping.h | 84 +++++++------------------ arch/powerpc/include/asm/pci.h | 4 +- arch/powerpc/include/asm/swiotlb.h | 2 +- arch/powerpc/kernel/dma-iommu.c | 2 +- arch/powerpc/kernel/dma-swiotlb.c | 2 +- arch/powerpc/kernel/dma.c | 2 +- arch/powerpc/kernel/ibmebus.c | 2 +- arch/powerpc/kernel/pci-common.c | 6 +- arch/powerpc/kernel/vio.c | 2 +- arch/powerpc/platforms/cell/iommu.c | 2 +- arch/powerpc/platforms/ps3/system-bus.c | 4 +- 12 files changed, 37 insertions(+), 79 deletions(-) diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h index 0086f8d46f1c..67fcd7f89d99 100644 --- a/arch/powerpc/include/asm/device.h +++ b/arch/powerpc/include/asm/device.h @@ -6,7 +6,7 @@ #ifndef _ASM_POWERPC_DEVICE_H #define _ASM_POWERPC_DEVICE_H -struct dma_mapping_ops; +struct dma_map_ops; struct device_node; struct dev_archdata { @@ -14,7 +14,7 @@ struct dev_archdata { struct device_node *of_node; /* DMA operations on that device */ - struct dma_mapping_ops *dma_ops; + struct dma_map_ops *dma_ops; void *dma_data; #ifdef CONFIG_SWIOTLB dma_addr_t max_direct_dma_addr; diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 1765c379138a..8ca2b5183c56 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -63,57 +63,15 @@ static inline unsigned long device_to_mask(struct device *dev) return 0xfffffffful; } -/* - * DMA operations are abstracted for G5 vs. i/pSeries, PCI vs. VIO - */ -struct dma_mapping_ops { - void * (*alloc_coherent)(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag); - void (*free_coherent)(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle); - int (*map_sg)(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*unmap_sg)(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs); - int (*dma_supported)(struct device *dev, u64 mask); - int (*set_dma_mask)(struct device *dev, u64 dma_mask); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs); - void (*unmap_page)(struct device *dev, - dma_addr_t dma_address, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs); -#ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS - void (*sync_single_range_for_cpu)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, - enum dma_data_direction direction); - void (*sync_single_range_for_device)(struct device *hwdev, - dma_addr_t dma_handle, unsigned long offset, - size_t size, - enum dma_data_direction direction); - void (*sync_sg_for_cpu)(struct device *hwdev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction); - void (*sync_sg_for_device)(struct device *hwdev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction); -#endif -}; - /* * Available generic sets of operations */ #ifdef CONFIG_PPC64 -extern struct dma_mapping_ops dma_iommu_ops; +extern struct dma_map_ops dma_iommu_ops; #endif -extern struct dma_mapping_ops dma_direct_ops; +extern struct dma_map_ops dma_direct_ops; -static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) +static inline struct dma_map_ops *get_dma_ops(struct device *dev) { /* We don't handle the NULL dev case for ISA for now. We could * do it via an out of line call but it is not needed for now. The @@ -126,14 +84,14 @@ static inline struct dma_mapping_ops *get_dma_ops(struct device *dev) return dev->archdata.dma_ops; } -static inline void set_dma_ops(struct device *dev, struct dma_mapping_ops *ops) +static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) { dev->archdata.dma_ops = ops; } static inline int dma_supported(struct device *dev, u64 mask) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); if (unlikely(dma_ops == NULL)) return 0; @@ -147,7 +105,7 @@ static inline int dma_supported(struct device *dev, u64 mask) static inline int dma_set_mask(struct device *dev, u64 dma_mask) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); if (unlikely(dma_ops == NULL)) return -EIO; @@ -161,7 +119,7 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask) /* * map_/unmap_single actually call through to map/unmap_page now that all the - * dma_mapping_ops have been converted over. We just have to get the page and + * dma_map_ops have been converted over. We just have to get the page and * offset to pass through to map_page */ static inline dma_addr_t dma_map_single_attrs(struct device *dev, @@ -170,7 +128,7 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -185,7 +143,7 @@ static inline void dma_unmap_single_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -198,7 +156,7 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -211,7 +169,7 @@ static inline void dma_unmap_page_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -222,7 +180,7 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); return dma_ops->map_sg(dev, sg, nents, direction, attrs); @@ -234,7 +192,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); dma_ops->unmap_sg(dev, sg, nhwentries, direction, attrs); @@ -243,7 +201,7 @@ static inline void dma_unmap_sg_attrs(struct device *dev, static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); return dma_ops->alloc_coherent(dev, size, dma_handle, flag); @@ -252,7 +210,7 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size, static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); @@ -304,7 +262,7 @@ static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -317,7 +275,7 @@ static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -330,7 +288,7 @@ static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -342,7 +300,7 @@ static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, int nents, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -354,7 +312,7 @@ static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); @@ -367,7 +325,7 @@ static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, unsigned long offset, size_t size, enum dma_data_direction direction) { - struct dma_mapping_ops *dma_ops = get_dma_ops(dev); + struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index d9483c504d2d..7ae46d7e270d 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -61,8 +61,8 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) } #ifdef CONFIG_PCI -extern void set_pci_dma_ops(struct dma_mapping_ops *dma_ops); -extern struct dma_mapping_ops *get_pci_dma_ops(void); +extern void set_pci_dma_ops(struct dma_map_ops *dma_ops); +extern struct dma_map_ops *get_pci_dma_ops(void); #else /* CONFIG_PCI */ #define set_pci_dma_ops(d) #define get_pci_dma_ops() NULL diff --git a/arch/powerpc/include/asm/swiotlb.h b/arch/powerpc/include/asm/swiotlb.h index 21ce0a3b4941..8979d4cd3d70 100644 --- a/arch/powerpc/include/asm/swiotlb.h +++ b/arch/powerpc/include/asm/swiotlb.h @@ -13,7 +13,7 @@ #include -extern struct dma_mapping_ops swiotlb_dma_ops; +extern struct dma_map_ops swiotlb_dma_ops; static inline void dma_mark_clean(void *addr, size_t size) {} diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index 2983adac8cc3..87ddb3fb948c 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -89,7 +89,7 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask) return 1; } -struct dma_mapping_ops dma_iommu_ops = { +struct dma_map_ops dma_iommu_ops = { .alloc_coherent = dma_iommu_alloc_coherent, .free_coherent = dma_iommu_free_coherent, .map_sg = dma_iommu_map_sg, diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index ca141e108ae3..d1143a68d82a 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -31,7 +31,7 @@ unsigned int ppc_swiotlb_enable; * map_page, and unmap_page on highmem, use normal dma_ops * for everything else. */ -struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_map_ops swiotlb_dma_ops = { .alloc_coherent = dma_direct_alloc_coherent, .free_coherent = dma_direct_free_coherent, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index ccf129d47d84..c61f70e145ad 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -140,7 +140,7 @@ static inline void dma_direct_sync_single_range(struct device *dev, } #endif -struct dma_mapping_ops dma_direct_ops = { +struct dma_map_ops dma_direct_ops = { .alloc_coherent = dma_direct_alloc_coherent, .free_coherent = dma_direct_free_coherent, .map_sg = dma_direct_map_sg, diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c index 6e3f62493659..a4c8b38b0ba1 100644 --- a/arch/powerpc/kernel/ibmebus.c +++ b/arch/powerpc/kernel/ibmebus.c @@ -127,7 +127,7 @@ static int ibmebus_dma_supported(struct device *dev, u64 mask) return 1; } -static struct dma_mapping_ops ibmebus_dma_ops = { +static struct dma_map_ops ibmebus_dma_ops = { .alloc_coherent = ibmebus_alloc_coherent, .free_coherent = ibmebus_free_coherent, .map_sg = ibmebus_map_sg, diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 5a56e97c5ac0..7585f1fc26db 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -50,14 +50,14 @@ resource_size_t isa_mem_base; unsigned int ppc_pci_flags = 0; -static struct dma_mapping_ops *pci_dma_ops = &dma_direct_ops; +static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; -void set_pci_dma_ops(struct dma_mapping_ops *dma_ops) +void set_pci_dma_ops(struct dma_map_ops *dma_ops) { pci_dma_ops = dma_ops; } -struct dma_mapping_ops *get_pci_dma_ops(void) +struct dma_map_ops *get_pci_dma_ops(void) { return pci_dma_ops; } diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index 819e59f6f7c7..bc7b41edbdfc 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -601,7 +601,7 @@ static void vio_dma_iommu_unmap_sg(struct device *dev, vio_cmo_dealloc(viodev, alloc_size); } -struct dma_mapping_ops vio_dma_mapping_ops = { +struct dma_map_ops vio_dma_mapping_ops = { .alloc_coherent = vio_dma_iommu_alloc_coherent, .free_coherent = vio_dma_iommu_free_coherent, .map_sg = vio_dma_iommu_map_sg, diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 5b34fc211f35..416db17eb18f 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -642,7 +642,7 @@ static int dma_fixed_dma_supported(struct device *dev, u64 mask) static int dma_set_mask_and_switch(struct device *dev, u64 dma_mask); -struct dma_mapping_ops dma_iommu_fixed_ops = { +struct dma_map_ops dma_iommu_fixed_ops = { .alloc_coherent = dma_fixed_alloc_coherent, .free_coherent = dma_fixed_free_coherent, .map_sg = dma_fixed_map_sg, diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c index 676f989ed4e4..e34b305a7a52 100644 --- a/arch/powerpc/platforms/ps3/system-bus.c +++ b/arch/powerpc/platforms/ps3/system-bus.c @@ -694,7 +694,7 @@ static int ps3_dma_supported(struct device *_dev, u64 mask) return mask >= DMA_BIT_MASK(32); } -static struct dma_mapping_ops ps3_sb_dma_ops = { +static struct dma_map_ops ps3_sb_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_sb_map_sg, @@ -704,7 +704,7 @@ static struct dma_mapping_ops ps3_sb_dma_ops = { .unmap_page = ps3_unmap_page, }; -static struct dma_mapping_ops ps3_ioc0_dma_ops = { +static struct dma_map_ops ps3_ioc0_dma_ops = { .alloc_coherent = ps3_alloc_coherent, .free_coherent = ps3_free_coherent, .map_sg = ps3_ioc0_map_sg, From 46bab4e4b45ec522ecd5fa4a0e2b4a6e6d1f153a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:26 +0000 Subject: [PATCH 0231/1662] powerpc: Use asm-generic/dma-mapping-common.h Signed-off-by: FUJITA Tomonori Acked-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 6 +- arch/powerpc/include/asm/dma-mapping.h | 242 +------------------------ 2 files changed, 7 insertions(+), 241 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4c0747e8ed74..6078253c6d76 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -120,7 +120,7 @@ config PPC select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK select HAVE_LMB - select HAVE_DMA_ATTRS if PPC64 + select HAVE_DMA_ATTRS select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 @@ -307,10 +307,6 @@ config SWIOTLB platforms where the size of a physical address is larger than the bus address. Not all platforms support this. -config PPC_NEED_DMA_SYNC_OPS - def_bool y - depends on (NOT_COHERENT_CACHE || SWIOTLB) - config HOTPLUG_CPU bool "Support for enabling/disabling CPUs" depends on SMP && HOTPLUG && EXPERIMENTAL && (PPC_PSERIES || PPC_PMAC) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 8ca2b5183c56..91217e4a0bfc 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,11 @@ static inline void set_dma_ops(struct device *dev, struct dma_map_ops *ops) dev->archdata.dma_ops = ops; } +/* this will be removed soon */ +#define flush_write_buffers() + +#include + static inline int dma_supported(struct device *dev, u64 mask) { struct dma_map_ops *dma_ops = get_dma_ops(dev); @@ -117,87 +123,6 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask) return 0; } -/* - * map_/unmap_single actually call through to map/unmap_page now that all the - * dma_map_ops have been converted over. We just have to get the page and - * offset to pass through to map_page - */ -static inline dma_addr_t dma_map_single_attrs(struct device *dev, - void *cpu_addr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - return dma_ops->map_page(dev, virt_to_page(cpu_addr), - (unsigned long)cpu_addr % PAGE_SIZE, size, - direction, attrs); -} - -static inline void dma_unmap_single_attrs(struct device *dev, - dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - dma_ops->unmap_page(dev, dma_addr, size, direction, attrs); -} - -static inline dma_addr_t dma_map_page_attrs(struct device *dev, - struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - return dma_ops->map_page(dev, page, offset, size, direction, attrs); -} - -static inline void dma_unmap_page_attrs(struct device *dev, - dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - dma_ops->unmap_page(dev, dma_address, size, direction, attrs); -} - -static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - return dma_ops->map_sg(dev, sg, nents, direction, attrs); -} - -static inline void dma_unmap_sg_attrs(struct device *dev, - struct scatterlist *sg, - int nhwentries, - enum dma_data_direction direction, - struct dma_attrs *attrs) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - dma_ops->unmap_sg(dev, sg, nhwentries, direction, attrs); -} - static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { @@ -216,161 +141,6 @@ static inline void dma_free_coherent(struct device *dev, size_t size, dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); } -static inline dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, - size_t size, - enum dma_data_direction direction) -{ - return dma_map_single_attrs(dev, cpu_addr, size, direction, NULL); -} - -static inline void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, - enum dma_data_direction direction) -{ - dma_unmap_single_attrs(dev, dma_addr, size, direction, NULL); -} - -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - return dma_map_page_attrs(dev, page, offset, size, direction, NULL); -} - -static inline void dma_unmap_page(struct device *dev, dma_addr_t dma_address, - size_t size, - enum dma_data_direction direction) -{ - dma_unmap_page_attrs(dev, dma_address, size, direction, NULL); -} - -static inline int dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, enum dma_data_direction direction) -{ - return dma_map_sg_attrs(dev, sg, nents, direction, NULL); -} - -static inline void dma_unmap_sg(struct device *dev, struct scatterlist *sg, - int nhwentries, - enum dma_data_direction direction) -{ - dma_unmap_sg_attrs(dev, sg, nhwentries, direction, NULL); -} - -#ifdef CONFIG_PPC_NEED_DMA_SYNC_OPS -static inline void dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_cpu) - dma_ops->sync_single_range_for_cpu(dev, dma_handle, 0, - size, direction); -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_device) - dma_ops->sync_single_range_for_device(dev, dma_handle, - 0, size, direction); -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_sg_for_cpu) - dma_ops->sync_sg_for_cpu(dev, sgl, nents, direction); -} - -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_sg_for_device) - dma_ops->sync_sg_for_device(dev, sgl, nents, direction); -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_cpu) - dma_ops->sync_single_range_for_cpu(dev, dma_handle, - offset, size, direction); -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - BUG_ON(!dma_ops); - - if (dma_ops->sync_single_range_for_device) - dma_ops->sync_single_range_for_device(dev, dma_handle, offset, - size, direction); -} -#else /* CONFIG_PPC_NEED_DMA_SYNC_OPS */ -static inline void dma_sync_single_for_cpu(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_for_device(struct device *dev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_sg_for_cpu(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_sg_for_device(struct device *dev, - struct scatterlist *sgl, int nents, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_range_for_cpu(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} - -static inline void dma_sync_single_range_for_device(struct device *dev, - dma_addr_t dma_handle, unsigned long offset, size_t size, - enum dma_data_direction direction) -{ -} -#endif - static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { #ifdef CONFIG_PPC64 From 4a9a6bfe707cfe5bcb0a20eabe240293a095cd10 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:27 +0000 Subject: [PATCH 0232/1662] powerpc: Handle SWIOTLB mapping error properly Signed-off-by: FUJITA Tomonori Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/dma-mapping.h | 5 +++++ arch/powerpc/kernel/dma-swiotlb.c | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 91217e4a0bfc..4bd41b4051e3 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -143,6 +143,11 @@ static inline void dma_free_coherent(struct device *dev, size_t size, static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (dma_ops->mapping_error) + return dma_ops->mapping_error(dev, dma_addr); + #ifdef CONFIG_PPC64 return (dma_addr == DMA_ERROR_CODE); #else diff --git a/arch/powerpc/kernel/dma-swiotlb.c b/arch/powerpc/kernel/dma-swiotlb.c index d1143a68d82a..e96cbbd9b449 100644 --- a/arch/powerpc/kernel/dma-swiotlb.c +++ b/arch/powerpc/kernel/dma-swiotlb.c @@ -42,7 +42,8 @@ struct dma_map_ops swiotlb_dma_ops = { .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, - .sync_sg_for_device = swiotlb_sync_sg_for_device + .sync_sg_for_device = swiotlb_sync_sg_for_device, + .mapping_error = swiotlb_dma_mapping_error, }; void pci_dma_dev_setup_swiotlb(struct pci_dev *pdev) From 80d3e8abb73dad3983fef2597b52cab8fbcd876b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 4 Aug 2009 19:08:28 +0000 Subject: [PATCH 0233/1662] powerpc: Add CONFIG_DMA_API_DEBUG support Signed-off-by: FUJITA Tomonori Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/dma-mapping.h | 11 ++++++++++- arch/powerpc/kernel/dma.c | 11 +++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6078253c6d76..9e03991dc878 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -121,6 +121,7 @@ config PPC select HAVE_ARCH_TRACEHOOK select HAVE_LMB select HAVE_DMA_ATTRS + select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h index 4bd41b4051e3..cb2ca41dd526 100644 --- a/arch/powerpc/include/asm/dma-mapping.h +++ b/arch/powerpc/include/asm/dma-mapping.h @@ -127,9 +127,15 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { struct dma_map_ops *dma_ops = get_dma_ops(dev); + void *cpu_addr; BUG_ON(!dma_ops); - return dma_ops->alloc_coherent(dev, size, dma_handle, flag); + + cpu_addr = dma_ops->alloc_coherent(dev, size, dma_handle, flag); + + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + + return cpu_addr; } static inline void dma_free_coherent(struct device *dev, size_t size, @@ -138,6 +144,9 @@ static inline void dma_free_coherent(struct device *dev, size_t size, struct dma_map_ops *dma_ops = get_dma_ops(dev); BUG_ON(!dma_ops); + + debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); + dma_ops->free_coherent(dev, size, cpu_addr, dma_handle); } diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index c61f70e145ad..21b784d7e7d0 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -156,3 +157,13 @@ struct dma_map_ops dma_direct_ops = { #endif }; EXPORT_SYMBOL(dma_direct_ops); + +#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) + +static int __init dma_init(void) +{ + dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); + + return 0; +} +fs_initcall(dma_init); From 948c28fe3001f2c9d852dff2a0b2c69fe7cec91b Mon Sep 17 00:00:00 2001 From: Peter Huewe Date: Thu, 20 Aug 2009 11:14:06 +0000 Subject: [PATCH 0234/1662] hvc_console: Add __init and __exit to hvc_vio Trivial patch which adds the __init/__exit macros to the module_init/ module_exit functions of char/hvc_vio.c Please have a look at the small patch and either pull it through your tree, or please ack' it so Jiri can pull it through the trivial tree. linux version 2.6.31-rc6 - linus git tree, Do 20. Aug 22:26:06 CEST 2009 Signed-off-by: Peter Huewe Signed-off-by: Benjamin Herrenschmidt --- drivers/char/hvc_vio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/hvc_vio.c b/drivers/char/hvc_vio.c index c72b994652ac..10be343d6ae7 100644 --- a/drivers/char/hvc_vio.c +++ b/drivers/char/hvc_vio.c @@ -120,7 +120,7 @@ static struct vio_driver hvc_vio_driver = { } }; -static int hvc_vio_init(void) +static int __init hvc_vio_init(void) { int rc; @@ -134,7 +134,7 @@ static int hvc_vio_init(void) } module_init(hvc_vio_init); /* after drivers/char/hvc_console.c */ -static void hvc_vio_exit(void) +static void __exit hvc_vio_exit(void) { vio_unregister_driver(&hvc_vio_driver); } From e3e1d15855206c85f4c9ed82746e81acfe13e5aa Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Mon, 24 Aug 2009 06:15:36 +0000 Subject: [PATCH 0235/1662] powerpc: Name xpn & x fields in HW Hash PTE format Previously, the 36-bit code was using these bits, but they had never been named in the pte format definition. This patch just gives those fields their proper names and adds a comment that they are only present on some processors. There is no functional code change. Signed-off-by: Becky Bruce Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu-hash32.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/mmu-hash32.h b/arch/powerpc/include/asm/mmu-hash32.h index 382fc689f204..16f513e5cbd7 100644 --- a/arch/powerpc/include/asm/mmu-hash32.h +++ b/arch/powerpc/include/asm/mmu-hash32.h @@ -55,21 +55,25 @@ struct ppc_bat { #ifndef __ASSEMBLY__ -/* Hardware Page Table Entry */ +/* + * Hardware Page Table Entry + * Note that the xpn and x bitfields are used only by processors that + * support extended addressing; otherwise, those bits are reserved. + */ struct hash_pte { unsigned long v:1; /* Entry is valid */ unsigned long vsid:24; /* Virtual segment identifier */ unsigned long h:1; /* Hash algorithm indicator */ unsigned long api:6; /* Abbreviated page index */ unsigned long rpn:20; /* Real (physical) page number */ - unsigned long :3; /* Unused */ + unsigned long xpn:3; /* Real page number bits 0-2, optional */ unsigned long r:1; /* Referenced */ unsigned long c:1; /* Changed */ unsigned long w:1; /* Write-thru cache mode */ unsigned long i:1; /* Cache inhibited */ unsigned long m:1; /* Memory coherence */ unsigned long g:1; /* Guarded */ - unsigned long :1; /* Unused */ + unsigned long x:1; /* Real page number bit 3, optional */ unsigned long pp:2; /* Page protection */ }; From 23e55f92d4fd733365dd572ea6e9e211387123c2 Mon Sep 17 00:00:00 2001 From: Michael Wolf Date: Thu, 20 Aug 2009 13:21:45 +0000 Subject: [PATCH 0236/1662] powerpc: Adjust base and index registers in Altivec macros On POWER6 systems RA needs to be the base and RB the index. If they are reversed you take a misdirect hit. Signed-off-by: Mike Wolf ---- Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/ppc_asm.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index dfae6e916dfb..498fe09263d3 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -98,13 +98,13 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ #define REST_16FPRS(n, base) REST_8FPRS(n, base); REST_8FPRS(n+8, base) #define REST_32FPRS(n, base) REST_16FPRS(n, base); REST_16FPRS(n+16, base) -#define SAVE_VR(n,b,base) li b,THREAD_VR0+(16*(n)); stvx n,b,base +#define SAVE_VR(n,b,base) li b,THREAD_VR0+(16*(n)); stvx n,base,b #define SAVE_2VRS(n,b,base) SAVE_VR(n,b,base); SAVE_VR(n+1,b,base) #define SAVE_4VRS(n,b,base) SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base) #define SAVE_8VRS(n,b,base) SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base) #define SAVE_16VRS(n,b,base) SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base) #define SAVE_32VRS(n,b,base) SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base) -#define REST_VR(n,b,base) li b,THREAD_VR0+(16*(n)); lvx n,b,base +#define REST_VR(n,b,base) li b,THREAD_VR0+(16*(n)); lvx n,base,b #define REST_2VRS(n,b,base) REST_VR(n,b,base); REST_VR(n+1,b,base) #define REST_4VRS(n,b,base) REST_2VRS(n,b,base); REST_2VRS(n+2,b,base) #define REST_8VRS(n,b,base) REST_4VRS(n,b,base); REST_4VRS(n+4,b,base) @@ -112,26 +112,26 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ #define REST_32VRS(n,b,base) REST_16VRS(n,b,base); REST_16VRS(n+16,b,base) /* Save the lower 32 VSRs in the thread VSR region */ -#define SAVE_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); STXVD2X(n,b,base) +#define SAVE_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); STXVD2X(n,base,b) #define SAVE_2VSRS(n,b,base) SAVE_VSR(n,b,base); SAVE_VSR(n+1,b,base) #define SAVE_4VSRS(n,b,base) SAVE_2VSRS(n,b,base); SAVE_2VSRS(n+2,b,base) #define SAVE_8VSRS(n,b,base) SAVE_4VSRS(n,b,base); SAVE_4VSRS(n+4,b,base) #define SAVE_16VSRS(n,b,base) SAVE_8VSRS(n,b,base); SAVE_8VSRS(n+8,b,base) #define SAVE_32VSRS(n,b,base) SAVE_16VSRS(n,b,base); SAVE_16VSRS(n+16,b,base) -#define REST_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); LXVD2X(n,b,base) +#define REST_VSR(n,b,base) li b,THREAD_VSR0+(16*(n)); LXVD2X(n,base,b) #define REST_2VSRS(n,b,base) REST_VSR(n,b,base); REST_VSR(n+1,b,base) #define REST_4VSRS(n,b,base) REST_2VSRS(n,b,base); REST_2VSRS(n+2,b,base) #define REST_8VSRS(n,b,base) REST_4VSRS(n,b,base); REST_4VSRS(n+4,b,base) #define REST_16VSRS(n,b,base) REST_8VSRS(n,b,base); REST_8VSRS(n+8,b,base) #define REST_32VSRS(n,b,base) REST_16VSRS(n,b,base); REST_16VSRS(n+16,b,base) /* Save the upper 32 VSRs (32-63) in the thread VSX region (0-31) */ -#define SAVE_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); STXVD2X(n+32,b,base) +#define SAVE_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); STXVD2X(n+32,base,b) #define SAVE_2VSRSU(n,b,base) SAVE_VSRU(n,b,base); SAVE_VSRU(n+1,b,base) #define SAVE_4VSRSU(n,b,base) SAVE_2VSRSU(n,b,base); SAVE_2VSRSU(n+2,b,base) #define SAVE_8VSRSU(n,b,base) SAVE_4VSRSU(n,b,base); SAVE_4VSRSU(n+4,b,base) #define SAVE_16VSRSU(n,b,base) SAVE_8VSRSU(n,b,base); SAVE_8VSRSU(n+8,b,base) #define SAVE_32VSRSU(n,b,base) SAVE_16VSRSU(n,b,base); SAVE_16VSRSU(n+16,b,base) -#define REST_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); LXVD2X(n+32,b,base) +#define REST_VSRU(n,b,base) li b,THREAD_VR0+(16*(n)); LXVD2X(n+32,base,b) #define REST_2VSRSU(n,b,base) REST_VSRU(n,b,base); REST_VSRU(n+1,b,base) #define REST_4VSRSU(n,b,base) REST_2VSRSU(n,b,base); REST_2VSRSU(n+2,b,base) #define REST_8VSRSU(n,b,base) REST_4VSRSU(n,b,base); REST_4VSRSU(n+4,b,base) From df5d6ecf8157245ef733db87597adb2c6e2510da Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 24 Aug 2009 15:52:48 +0000 Subject: [PATCH 0237/1662] powerpc/mm: Add MMU features for TLB reservation & Paired MAS registers Support for TLB reservation (or TLB Write Conditional) and Paired MAS registers are optional for a processor implementation so we handle them via MMU feature sections. We currently only used paired MAS registers to access the full RPN + perm bits that are kept in MAS7||MAS3. We assume that if an implementation has hardware page table at this time it also implements in TLB reservations. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/mmu.h | 9 ++++++++ arch/powerpc/mm/tlb_low_64e.S | 38 +++++++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 2fcfefc60894..7ffbb65ff7a9 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -58,6 +58,15 @@ */ #define MMU_FTR_TLBIE_206 ASM_CONST(0x00400000) +/* Enable use of TLB reservation. Processor should support tlbsrx. + * instruction and MAS0[WQ]. + */ +#define MMU_FTR_USE_TLBRSRV ASM_CONST(0x00800000) + +/* Use paired MAS registers (MAS7||MAS3, etc.) + */ +#define MMU_FTR_USE_PAIRED_MAS ASM_CONST(0x01000000) + #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S index cd92f62f9cf5..ef1cccf71173 100644 --- a/arch/powerpc/mm/tlb_low_64e.S +++ b/arch/powerpc/mm/tlb_low_64e.S @@ -189,12 +189,16 @@ normal_tlb_miss: clrrdi r14,r14,3 or r10,r15,r14 +BEGIN_MMU_FTR_SECTION /* Set the TLB reservation and seach for existing entry. Then load * the entry. */ PPC_TLBSRX_DOT(0,r16) ld r14,0(r10) beq normal_tlb_miss_done +MMU_FTR_SECTION_ELSE + ld r14,0(r10) +ALT_MMU_FTR_SECTION_END_IFSET(MMU_FTR_USE_TLBRSRV) finish_normal_tlb_miss: /* Check if required permissions are met */ @@ -241,7 +245,14 @@ finish_normal_tlb_miss: bne 1f li r11,MAS3_SW|MAS3_UW andc r15,r15,r11 -1: mtspr SPRN_MAS7_MAS3,r15 +1: +BEGIN_MMU_FTR_SECTION + srdi r16,r15,32 + mtspr SPRN_MAS3,r15 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE + mtspr SPRN_MAS7_MAS3,r15 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -311,11 +322,13 @@ virt_page_table_tlb_miss: rlwinm r10,r10,0,16,1 /* Clear TID */ mtspr SPRN_MAS1,r10 1: +BEGIN_MMU_FTR_SECTION /* Search if we already have a TLB entry for that virtual address, and * if we do, bail out. */ PPC_TLBSRX_DOT(0,r16) beq virt_page_table_tlb_miss_done +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Now, we need to walk the page tables. First check if we are in * range. @@ -367,10 +380,18 @@ virt_page_table_tlb_miss: */ clrldi r11,r15,4 /* remove region ID from RPN */ ori r10,r11,1 /* Or-in SR */ + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe +BEGIN_MMU_FTR_SECTION virt_page_table_tlb_miss_done: /* We have overriden MAS2:EPN but currently our primary TLB miss @@ -394,6 +415,7 @@ virt_page_table_tlb_miss_done: addi r10,r11,-4 std r10,PACA_EXTLB+EX_TLB_SIZE+EX_TLB_SRR0(r13) 1: +END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_TLBRSRV) /* Return to caller, normal case */ TLB_MISS_STATS_X(MMSTAT_TLB_MISS_PT_OK); TLB_MISS_EPILOG_SUCCESS @@ -618,7 +640,14 @@ htw_tlb_miss: #else ori r10,r15,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT) #endif + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe @@ -700,7 +729,14 @@ tlb_load_linear: clrrdi r10,r16,30 /* 1G page index */ clrldi r10,r10,4 /* clear region bits */ ori r10,r10,MAS3_SR|MAS3_SW|MAS3_SX + +BEGIN_MMU_FTR_SECTION + srdi r16,r10,32 + mtspr SPRN_MAS3,r10 + mtspr SPRN_MAS7,r16 +MMU_FTR_SECTION_ELSE mtspr SPRN_MAS7_MAS3,r10 +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS) tlbwe From f45c4486f70d0a6502e7499a8664cdc0bba84cd2 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:30 +0000 Subject: [PATCH 0238/1662] powerpc/book3e-64: Move the default cpu table entry Move the default cpu entry table for CONFIG_PPC_BOOK3E_64 to the very end since we will probably want to support both 32-bit and 64-bit kernels for some processors that are higher up in the list. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/cputable.c | 49 +++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 9f38ecb17859..0b9c9135922e 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -89,8 +89,12 @@ extern void __restore_cpu_power7(void); #define COMMON_USER_PA6T (COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\ PPC_FEATURE_TRUE_LE | \ PPC_FEATURE_HAS_ALTIVEC_COMP) +#ifdef CONFIG_PPC_BOOK3E_64 +#define COMMON_USER_BOOKE (COMMON_USER_PPC64 | PPC_FEATURE_BOOKE) +#else #define COMMON_USER_BOOKE (PPC_FEATURE_32 | PPC_FEATURE_HAS_MMU | \ PPC_FEATURE_BOOKE) +#endif static struct cpu_spec __initdata cpu_specs[] = { #ifdef CONFIG_PPC_BOOK3S_64 @@ -509,28 +513,6 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "power4", } #endif /* CONFIG_PPC_BOOK3S_64 */ -#ifdef CONFIG_PPC_BOOK3E_64 - { /* This is a default entry to get going, to be replaced by - * a real one at some stage - */ -#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \ - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \ - CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) - .pvr_mask = 0x00000000, - .pvr_value = 0x00000000, - .cpu_name = "Book3E", - .cpu_features = CPU_FTRS_BASE_BOOK3E, - .cpu_user_features = COMMON_USER_PPC64, - .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX | - MMU_FTR_USE_TLBIVAX_BCAST | - MMU_FTR_LOCK_BCAST_INVAL, - .icache_bsize = 64, - .dcache_bsize = 64, - .num_pmcs = 0, - .machine_check = machine_check_generic, - .platform = "power6", - }, -#endif #ifdef CONFIG_PPC32 #if CLASSIC_PPC @@ -1846,6 +1828,29 @@ static struct cpu_spec __initdata cpu_specs[] = { } #endif /* CONFIG_E500 */ #endif /* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC_BOOK3E_64 + { /* This is a default entry to get going, to be replaced by + * a real one at some stage + */ +#define CPU_FTRS_BASE_BOOK3E (CPU_FTR_USE_TB | \ + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_SMT | \ + CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE) + .pvr_mask = 0x00000000, + .pvr_value = 0x00000000, + .cpu_name = "Book3E", + .cpu_features = CPU_FTRS_BASE_BOOK3E, + .cpu_user_features = COMMON_USER_PPC64, + .mmu_features = MMU_FTR_TYPE_3E | MMU_FTR_USE_TLBILX | + MMU_FTR_USE_TLBIVAX_BCAST | + MMU_FTR_LOCK_BCAST_INVAL, + .icache_bsize = 64, + .dcache_bsize = 64, + .num_pmcs = 0, + .machine_check = machine_check_generic, + .platform = "power6", + }, +#endif }; static struct cpu_spec the_cpu_spec; From 6c188829d2c20a1d02aedb13db34b3ca2a8f0dc4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:31 +0000 Subject: [PATCH 0239/1662] powerpc/book3e-64: Wait til generic_calibrate_decr to enable decrementer Match what we do on 32-bit Book-E processors and enable the decrementer in generic_calibrate_decr. We need to make sure we disable the decrementer early in boot since we currently use lazy (soft) interrupt on 64-bit Book-E and possible get a decrementer exception before we are ready for it. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64e.S | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 695d4847d228..3611b0e7d46d 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -774,9 +774,11 @@ _STATIC(init_thread_book3e) /* Make sure interrupts are off */ wrteei 0 - /* disable watchdog and FIT and enable DEC interrupts */ - lis r3,TCR_DIE@h + /* disable all timers and clear out status */ + li r3,0 mtspr SPRN_TCR,r3 + mfspr r3,SPRN_TSR + mtspr SPRN_TSR,r3 blr From 4b98d9e713a03bd79ced8800e24a56359f9effbf Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:32 +0000 Subject: [PATCH 0240/1662] powerpc/book3e-64: Add helper function to setup IVORs Not all 64-bit Book-3E parts will have fixed IVORs so add a function that cpusetup code can call to setup the base IVORs (0..15) to match the fixed offsets. We need to 'or' part of interrupt_base_book3e into the IVORs since on parts that have them the IVPR doesn't extend as far down. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/exception-64e.h | 4 ++++ arch/powerpc/kernel/exceptions-64e.S | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index 94cb3d79d125..6d53f311d942 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -196,6 +196,10 @@ exc_##label##_book3e: #define TLB_MISS_STATS_SAVE_INFO #endif +#define SET_IVOR(vector_number, vector_offset) \ + li r3,vector_offset@l; \ + ori r3,r3,interrupt_base_book3e@l; \ + mtspr SPRN_IVOR##vector_number,r3; #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 3611b0e7d46d..662236c72244 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -782,5 +782,24 @@ _STATIC(init_thread_book3e) blr +_GLOBAL(__setup_base_ivors) + SET_IVOR(0, 0x020) /* Critical Input */ + SET_IVOR(1, 0x000) /* Machine Check */ + SET_IVOR(2, 0x060) /* Data Storage */ + SET_IVOR(3, 0x080) /* Instruction Storage */ + SET_IVOR(4, 0x0a0) /* External Input */ + SET_IVOR(5, 0x0c0) /* Alignment */ + SET_IVOR(6, 0x0e0) /* Program */ + SET_IVOR(7, 0x100) /* FP Unavailable */ + SET_IVOR(8, 0x120) /* System Call */ + SET_IVOR(9, 0x140) /* Auxiliary Processor Unavailable */ + SET_IVOR(10, 0x160) /* Decrementer */ + SET_IVOR(11, 0x180) /* Fixed Interval Timer */ + SET_IVOR(12, 0x1a0) /* Watchdog Timer */ + SET_IVOR(13, 0x1c0) /* Data TLB Error */ + SET_IVOR(14, 0x1e0) /* Instruction TLB Error */ + SET_IVOR(15, 0x040) /* Debug */ + sync + blr From bb1af71ecbfdbecbe9f7e43f703da5840b76c2e4 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 18 Aug 2009 19:08:33 +0000 Subject: [PATCH 0241/1662] powerpc/book3e-64: Add support to initial_tlb_book3e for non-HES TLB We now search through TLBnCFG looking for the first array that has IPROT support (we assume that there is only one). If that TLB has hardware entry select (HES) support we use the existing code and with the proper TLB select (the HES code still needs to clean up bolted entries from firmware). The non-HES code is pretty similiar to the 32-bit FSL Book-E code but does make some new assumtions (like that we have tlbilx) and simplifies things down a bit. Signed-off-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/reg_booke.h | 2 + arch/powerpc/kernel/exceptions-64e.S | 204 ++++++++++++++++++++++++++- 2 files changed, 202 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h index 9bb81d99b765..3bf783505528 100644 --- a/arch/powerpc/include/asm/reg_booke.h +++ b/arch/powerpc/include/asm/reg_booke.h @@ -108,6 +108,8 @@ #define SPRN_PID2 0x27A /* Process ID Register 2 */ #define SPRN_TLB0CFG 0x2B0 /* TLB 0 Config Register */ #define SPRN_TLB1CFG 0x2B1 /* TLB 1 Config Register */ +#define SPRN_TLB2CFG 0x2B2 /* TLB 2 Config Register */ +#define SPRN_TLB3CFG 0x2B3 /* TLB 3 Config Register */ #define SPRN_EPR 0x2BE /* External Proxy Register */ #define SPRN_CCR1 0x378 /* Core Configuration Register 1 */ #define SPRN_ZPR 0x3B0 /* Zone Protection Register (40x) */ diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 662236c72244..9048f96237f6 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -616,18 +616,214 @@ bad_stack_book3e: * Setup the initial TLB for a core. This current implementation * assume that whatever we are running off will not conflict with * the new mapping at PAGE_OFFSET. - * We also make various assumptions about the processor we run on, - * this might have to be made more flexible based on the content - * of MMUCFG and friends. */ _GLOBAL(initial_tlb_book3e) + /* Look for the first TLB with IPROT set */ + mfspr r4,SPRN_TLB0CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(0)@h + bne found_iprot + + mfspr r4,SPRN_TLB1CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(1)@h + bne found_iprot + + mfspr r4,SPRN_TLB2CFG + andi. r3,r4,TLBnCFG_IPROT + lis r3,MAS0_TLBSEL(2)@h + bne found_iprot + + lis r3,MAS0_TLBSEL(3)@h + mfspr r4,SPRN_TLB3CFG + /* fall through */ + +found_iprot: + andi. r5,r4,TLBnCFG_HES + bne have_hes + + mflr r8 /* save LR */ +/* 1. Find the index of the entry we're executing in + * + * r3 = MAS0_TLBSEL (for the iprot array) + * r4 = SPRN_TLBnCFG + */ + bl invstr /* Find our address */ +invstr: mflr r6 /* Make it accessible */ + mfmsr r7 + rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ + mfspr r7,SPRN_PID + slwi r7,r7,16 + or r7,r7,r5 + mtspr SPRN_MAS6,r7 + tlbsx 0,r6 /* search MSR[IS], SPID=PID */ + + mfspr r3,SPRN_MAS0 + rlwinm r5,r3,16,20,31 /* Extract MAS0(Entry) */ + + mfspr r7,SPRN_MAS1 /* Insure IPROT set */ + oris r7,r7,MAS1_IPROT@h + mtspr SPRN_MAS1,r7 + tlbwe + +/* 2. Invalidate all entries except the entry we're executing in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r4 = SPRN_TLBnCFG + * r5 = ESEL of entry we are running in + */ + andi. r4,r4,TLBnCFG_N_ENTRY /* Extract # entries */ + li r6,0 /* Set Entry counter to 0 */ +1: mr r7,r3 /* Set MAS0(TLBSEL) */ + rlwimi r7,r6,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r6) */ + mtspr SPRN_MAS0,r7 + tlbre + mfspr r7,SPRN_MAS1 + rlwinm r7,r7,0,2,31 /* Clear MAS1 Valid and IPROT */ + cmpw r5,r6 + beq skpinv /* Dont update the current execution TLB */ + mtspr SPRN_MAS1,r7 + tlbwe + isync +skpinv: addi r6,r6,1 /* Increment */ + cmpw r6,r4 /* Are we done? */ + bne 1b /* If not, repeat */ + + /* Invalidate all TLBs */ + PPC_TLBILX_ALL(0,0) + sync + isync + +/* 3. Setup a temp mapping and jump to it + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we are running in + * r5 = ESEL of entry we are running in + */ + andi. r7,r5,0x1 /* Find an entry not used and is non-zero */ + addi r7,r7,0x1 + mr r4,r3 /* Set MAS0(TLBSEL) = 1 */ + mtspr SPRN_MAS0,r4 + tlbre + + rlwimi r4,r7,16,4,15 /* Setup MAS0 = TLBSEL | ESEL(r7) */ + mtspr SPRN_MAS0,r4 + + mfspr r7,SPRN_MAS1 + xori r6,r7,MAS1_TS /* Setup TMP mapping in the other Address space */ + mtspr SPRN_MAS1,r6 + + tlbwe + + mfmsr r6 + xori r6,r6,MSR_IS + mtspr SPRN_SRR1,r6 + bl 1f /* Find our address */ +1: mflr r6 + addi r6,r6,(2f - 1b) + mtspr SPRN_SRR0,r6 + rfi +2: + +/* 4. Clear out PIDs & Search info + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + li r6,0 + mtspr SPRN_MAS6,r6 + mtspr SPRN_PID,r6 + +/* 5. Invalidate mapping we started in + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + mtspr SPRN_MAS0,r3 + tlbre + mfspr r6,SPRN_MAS1 + rlwinm r6,r6,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r6 + tlbwe + + /* Invalidate TLB1 */ + PPC_TLBILX_ALL(0,0) + sync + isync + +/* The mapping only needs to be cache-coherent on SMP */ +#ifdef CONFIG_SMP +#define M_IF_SMP MAS2_M +#else +#define M_IF_SMP 0 +#endif + +/* 6. Setup KERNELBASE mapping in TLB[0] + * + * r3 = MAS0 w/TLBSEL & ESEL for the entry we started in + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + * r5 = MAS3 + */ + rlwinm r3,r3,0,16,3 /* clear ESEL */ + mtspr SPRN_MAS0,r3 + lis r6,(MAS1_VALID|MAS1_IPROT)@h + ori r6,r6,(MAS1_TSIZE(BOOK3E_PAGESZ_1GB))@l + mtspr SPRN_MAS1,r6 + + LOAD_REG_IMMEDIATE(r6, PAGE_OFFSET | M_IF_SMP) + mtspr SPRN_MAS2,r6 + + rlwinm r5,r5,0,0,25 + ori r5,r5,MAS3_SR | MAS3_SW | MAS3_SX + mtspr SPRN_MAS3,r5 + li r5,-1 + rlwinm r5,r5,0,0,25 + + tlbwe + +/* 7. Jump to KERNELBASE mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping + */ + /* Now we branch the new virtual address mapped by this entry */ + LOAD_REG_IMMEDIATE(r6,2f) + lis r7,MSR_KERNEL@h + ori r7,r7,MSR_KERNEL@l + mtspr SPRN_SRR0,r6 + mtspr SPRN_SRR1,r7 + rfi /* start execution out of TLB1[0] entry */ +2: + +/* 8. Clear out the temp mapping + * + * r4 = MAS0 w/TLBSEL & ESEL for the entry we are running in + */ + mtspr SPRN_MAS0,r4 + tlbre + mfspr r5,SPRN_MAS1 + rlwinm r5,r5,0,2,0 /* clear IPROT */ + mtspr SPRN_MAS1,r5 + tlbwe + + /* Invalidate TLB1 */ + PPC_TLBILX_ALL(0,0) + sync + isync + + /* We translate LR and return */ + tovirt(r8,r8) + mtlr r8 + blr + +have_hes: /* Setup MAS 0,1,2,3 and 7 for tlbwe of a 1G entry that maps the * kernel linear mapping. We also set MAS8 once for all here though * that will have to be made dependent on whether we are running under * a hypervisor I suppose. */ - li r3,MAS0_HES | MAS0_WQ_ALLWAYS + ori r3,r3,MAS0_HES | MAS0_WQ_ALLWAYS mtspr SPRN_MAS0,r3 lis r3,(MAS1_VALID | MAS1_IPROT)@h ori r3,r3,BOOK3E_PAGESZ_1GB << MAS1_TSIZE_SHIFT From ae14e13a4c8bb091dfd5606fd76c9cd272090ab7 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 25 Aug 2009 20:07:02 +0000 Subject: [PATCH 0242/1662] powerpc/pci: Remove dead checks for CONFIG_PPC_OF PPC_OF is always selected for arch/powerpc. This patch removes the stale #defines Signed-off-by: Grant Likely Acked-by: Stephen Rothwell Acked-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/pci-common.c | 8 -------- arch/powerpc/kernel/pci_32.c | 9 --------- 2 files changed, 17 deletions(-) diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 7585f1fc26db..158a78ae6341 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -176,8 +176,6 @@ int pci_domain_nr(struct pci_bus *bus) } EXPORT_SYMBOL(pci_domain_nr); -#ifdef CONFIG_PPC_OF - /* This routine is meant to be used early during boot, when the * PCI bus numbers have not yet been assigned, and you need to * issue PCI config cycles to an OF device. @@ -210,17 +208,11 @@ static ssize_t pci_show_devspec(struct device *dev, return sprintf(buf, "%s", np->full_name); } static DEVICE_ATTR(devspec, S_IRUGO, pci_show_devspec, NULL); -#endif /* CONFIG_PPC_OF */ /* Add sysfs properties */ int pcibios_add_platform_entries(struct pci_dev *pdev) { -#ifdef CONFIG_PPC_OF return device_create_file(&pdev->dev, &dev_attr_devspec); -#else - return 0; -#endif /* CONFIG_PPC_OF */ - } char __devinit *pcibios_setup(char *str) diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 3ae1c666ff92..1e807fe7ad2c 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -34,9 +34,7 @@ int pcibios_assign_bus_offset = 1; void pcibios_make_OF_bus_map(void); static void fixup_cpc710_pci64(struct pci_dev* dev); -#ifdef CONFIG_PPC_OF static u8* pci_to_OF_bus_map; -#endif /* By default, we don't re-assign bus numbers. We do this only on * some pmacs @@ -83,7 +81,6 @@ fixup_cpc710_pci64(struct pci_dev* dev) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CPC710_PCI64, fixup_cpc710_pci64); -#ifdef CONFIG_PPC_OF /* * Functions below are used on OpenFirmware machines. */ @@ -357,12 +354,6 @@ pci_create_OF_bus_map(void) } } -#else /* CONFIG_PPC_OF */ -void pcibios_make_OF_bus_map(void) -{ -} -#endif /* CONFIG_PPC_OF */ - static void __devinit pcibios_scan_phb(struct pci_controller *hose) { struct pci_bus *bus; From fbe65447197789a3ccccc27755956f6a4c445089 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 25 Aug 2009 20:07:11 +0000 Subject: [PATCH 0243/1662] powerpc/pci: move pci_64.c device tree scanning code into pci-common.c The PCI device tree scanning code in pci_64.c is some useful functionality. It allows PCI devices to be described in the device tree instead of being probed for, which in turn allows pci devices to use all of the device tree facilities to describe complex PCI bus architectures like GPIO and IRQ routing (perhaps not a common situation for desktop or server systems, but useful for embedded systems with on-board PCI devices). This patch moves the device tree scanning into pci-common.c so it is available for 32-bit powerpc machines too. Signed-off-by: Grant Likely Acked-by: Kumar Gala Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/pci-bridge.h | 5 - arch/powerpc/include/asm/pci.h | 5 + arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/pci-common.c | 1 - arch/powerpc/kernel/pci_64.c | 289 --------------------- arch/powerpc/kernel/pci_of_scan.c | 358 ++++++++++++++++++++++++++ 6 files changed, 364 insertions(+), 296 deletions(-) create mode 100644 arch/powerpc/kernel/pci_of_scan.c diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 4c61fa0b8d75..3faf575f6b06 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -284,11 +284,6 @@ static inline int isa_vaddr_is_ioport(void __iomem *address) extern int pcibios_unmap_io_space(struct pci_bus *bus); extern int pcibios_map_io_space(struct pci_bus *bus); -/* Return values for ppc_md.pci_probe_mode function */ -#define PCI_PROBE_NONE -1 /* Don't look at this bus at all */ -#define PCI_PROBE_NORMAL 0 /* Do normal PCI probing */ -#define PCI_PROBE_DEVTREE 1 /* Instantiate from device tree */ - #ifdef CONFIG_NUMA #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE)) #else diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index 7ae46d7e270d..b856a837b4a3 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -22,6 +22,11 @@ #include +/* Return values for ppc_md.pci_probe_mode function */ +#define PCI_PROBE_NONE -1 /* Don't look at this bus at all */ +#define PCI_PROBE_NORMAL 0 /* Do normal PCI probing */ +#define PCI_PROBE_DEVTREE 1 /* Instantiate from device tree */ + #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 7c83edbc2155..569f79ccd310 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -88,7 +88,7 @@ obj-$(CONFIG_SWIOTLB) += dma-swiotlb.o pci64-$(CONFIG_PPC64) += pci_dn.o isa-bridge.o obj-$(CONFIG_PCI) += pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \ - pci-common.o + pci-common.o pci_of_scan.o obj-$(CONFIG_PCI_MSI) += msi.o obj-$(CONFIG_KEXEC) += machine_kexec.o crash.o \ machine_kexec_$(CONFIG_WORD_SIZE).o diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 158a78ae6341..725ea9144e38 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1617,4 +1617,3 @@ void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) (unsigned long)hose->io_base_virt - _IO_BASE); } - diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 9e8902fa14c7..4d5b4ced7e45 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -43,295 +43,6 @@ unsigned long pci_probe_only = 1; unsigned long pci_io_base = ISA_IO_BASE; EXPORT_SYMBOL(pci_io_base); -static u32 get_int_prop(struct device_node *np, const char *name, u32 def) -{ - const u32 *prop; - int len; - - prop = of_get_property(np, name, &len); - if (prop && len >= 4) - return *prop; - return def; -} - -static unsigned int pci_parse_of_flags(u32 addr0, int bridge) -{ - unsigned int flags = 0; - - if (addr0 & 0x02000000) { - flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; - flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; - flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; - if (addr0 & 0x40000000) - flags |= IORESOURCE_PREFETCH - | PCI_BASE_ADDRESS_MEM_PREFETCH; - /* Note: We don't know whether the ROM has been left enabled - * by the firmware or not. We mark it as disabled (ie, we do - * not set the IORESOURCE_ROM_ENABLE flag) for now rather than - * do a config space read, it will be force-enabled if needed - */ - if (!bridge && (addr0 & 0xff) == 0x30) - flags |= IORESOURCE_READONLY; - } else if (addr0 & 0x01000000) - flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; - if (flags) - flags |= IORESOURCE_SIZEALIGN; - return flags; -} - - -static void pci_parse_of_addrs(struct device_node *node, struct pci_dev *dev) -{ - u64 base, size; - unsigned int flags; - struct resource *res; - const u32 *addrs; - u32 i; - int proplen; - - addrs = of_get_property(node, "assigned-addresses", &proplen); - if (!addrs) - return; - pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); - for (; proplen >= 20; proplen -= 20, addrs += 5) { - flags = pci_parse_of_flags(addrs[0], 0); - if (!flags) - continue; - base = of_read_number(&addrs[1], 2); - size = of_read_number(&addrs[3], 2); - if (!size) - continue; - i = addrs[0] & 0xff; - pr_debug(" base: %llx, size: %llx, i: %x\n", - (unsigned long long)base, - (unsigned long long)size, i); - - if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { - res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; - } else if (i == dev->rom_base_reg) { - res = &dev->resource[PCI_ROM_RESOURCE]; - flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; - } else { - printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); - continue; - } - res->start = base; - res->end = base + size - 1; - res->flags = flags; - res->name = pci_name(dev); - } -} - -struct pci_dev *of_create_pci_dev(struct device_node *node, - struct pci_bus *bus, int devfn) -{ - struct pci_dev *dev; - const char *type; - - dev = alloc_pci_dev(); - if (!dev) - return NULL; - type = of_get_property(node, "device_type", NULL); - if (type == NULL) - type = ""; - - pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); - - dev->bus = bus; - dev->sysdata = node; - dev->dev.parent = bus->bridge; - dev->dev.bus = &pci_bus_type; - dev->devfn = devfn; - dev->multifunction = 0; /* maybe a lie? */ - - dev->vendor = get_int_prop(node, "vendor-id", 0xffff); - dev->device = get_int_prop(node, "device-id", 0xffff); - dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); - dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); - - dev->cfg_size = pci_cfg_space_size(dev); - - dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus), - dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); - dev->class = get_int_prop(node, "class-code", 0); - dev->revision = get_int_prop(node, "revision-id", 0); - - pr_debug(" class: 0x%x\n", dev->class); - pr_debug(" revision: 0x%x\n", dev->revision); - - dev->current_state = 4; /* unknown power state */ - dev->error_state = pci_channel_io_normal; - dev->dma_mask = 0xffffffff; - - if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { - /* a PCI-PCI bridge */ - dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; - dev->rom_base_reg = PCI_ROM_ADDRESS1; - } else if (!strcmp(type, "cardbus")) { - dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; - } else { - dev->hdr_type = PCI_HEADER_TYPE_NORMAL; - dev->rom_base_reg = PCI_ROM_ADDRESS; - /* Maybe do a default OF mapping here */ - dev->irq = NO_IRQ; - } - - pci_parse_of_addrs(node, dev); - - pr_debug(" adding to system ...\n"); - - pci_device_add(dev, bus); - - return dev; -} -EXPORT_SYMBOL(of_create_pci_dev); - -static void __devinit __of_scan_bus(struct device_node *node, - struct pci_bus *bus, int rescan_existing) -{ - struct device_node *child; - const u32 *reg; - int reglen, devfn; - struct pci_dev *dev; - - pr_debug("of_scan_bus(%s) bus no %d... \n", - node->full_name, bus->number); - - /* Scan direct children */ - for_each_child_of_node(node, child) { - pr_debug(" * %s\n", child->full_name); - reg = of_get_property(child, "reg", ®len); - if (reg == NULL || reglen < 20) - continue; - devfn = (reg[0] >> 8) & 0xff; - - /* create a new pci_dev for this device */ - dev = of_create_pci_dev(child, bus, devfn); - if (!dev) - continue; - pr_debug(" dev header type: %x\n", dev->hdr_type); - } - - /* Apply all fixups necessary. We don't fixup the bus "self" - * for an existing bridge that is being rescanned - */ - if (!rescan_existing) - pcibios_setup_bus_self(bus); - pcibios_setup_bus_devices(bus); - - /* Now scan child busses */ - list_for_each_entry(dev, &bus->devices, bus_list) { - if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || - dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { - struct device_node *child = pci_device_to_OF_node(dev); - if (dev) - of_scan_pci_bridge(child, dev); - } - } -} - -void __devinit of_scan_bus(struct device_node *node, - struct pci_bus *bus) -{ - __of_scan_bus(node, bus, 0); -} -EXPORT_SYMBOL_GPL(of_scan_bus); - -void __devinit of_rescan_bus(struct device_node *node, - struct pci_bus *bus) -{ - __of_scan_bus(node, bus, 1); -} -EXPORT_SYMBOL_GPL(of_rescan_bus); - -void __devinit of_scan_pci_bridge(struct device_node *node, - struct pci_dev *dev) -{ - struct pci_bus *bus; - const u32 *busrange, *ranges; - int len, i, mode; - struct resource *res; - unsigned int flags; - u64 size; - - pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); - - /* parse bus-range property */ - busrange = of_get_property(node, "bus-range", &len); - if (busrange == NULL || len != 8) { - printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", - node->full_name); - return; - } - ranges = of_get_property(node, "ranges", &len); - if (ranges == NULL) { - printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", - node->full_name); - return; - } - - bus = pci_add_new_bus(dev->bus, dev, busrange[0]); - if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); - return; - } - - bus->primary = dev->bus->number; - bus->subordinate = busrange[1]; - bus->bridge_ctl = 0; - bus->sysdata = node; - - /* parse ranges property */ - /* PCI #address-cells == 3 and #size-cells == 2 always */ - res = &dev->resource[PCI_BRIDGE_RESOURCES]; - for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { - res->flags = 0; - bus->resource[i] = res; - ++res; - } - i = 1; - for (; len >= 32; len -= 32, ranges += 8) { - flags = pci_parse_of_flags(ranges[0], 1); - size = of_read_number(&ranges[6], 2); - if (flags == 0 || size == 0) - continue; - if (flags & IORESOURCE_IO) { - res = bus->resource[0]; - if (res->flags) { - printk(KERN_ERR "PCI: ignoring extra I/O range" - " for bridge %s\n", node->full_name); - continue; - } - } else { - if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { - printk(KERN_ERR "PCI: too many memory ranges" - " for bridge %s\n", node->full_name); - continue; - } - res = bus->resource[i]; - ++i; - } - res->start = of_read_number(&ranges[1], 2); - res->end = res->start + size - 1; - res->flags = flags; - } - sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), - bus->number); - pr_debug(" bus name: %s\n", bus->name); - - mode = PCI_PROBE_NORMAL; - if (ppc_md.pci_probe_mode) - mode = ppc_md.pci_probe_mode(bus); - pr_debug(" probe mode: %d\n", mode); - - if (mode == PCI_PROBE_DEVTREE) - of_scan_bus(node, bus); - else if (mode == PCI_PROBE_NORMAL) - pci_scan_child_bus(bus); -} -EXPORT_SYMBOL(of_scan_pci_bridge); - void __devinit scan_phb(struct pci_controller *hose) { struct pci_bus *bus; diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c new file mode 100644 index 000000000000..72c31bcb7aa4 --- /dev/null +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -0,0 +1,358 @@ +/* + * Helper routines to scan the device tree for PCI devices and busses + * + * Migrated out of PowerPC architecture pci_64.c file by Grant Likely + * so that these routines are available for + * 32 bit also. + * + * Copyright (C) 2003 Anton Blanchard , IBM + * Rework, based on alpha PCI code. + * Copyright (c) 2009 Secret Lab Technologies Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include +#include +#include + +/** + * get_int_prop - Decode a u32 from a device tree property + */ +static u32 get_int_prop(struct device_node *np, const char *name, u32 def) +{ + const u32 *prop; + int len; + + prop = of_get_property(np, name, &len); + if (prop && len >= 4) + return *prop; + return def; +} + +/** + * pci_parse_of_flags - Parse the flags cell of a device tree PCI address + * @addr0: value of 1st cell of a device tree PCI address. + * @bridge: Set this flag if the address is from a bridge 'ranges' property + */ +unsigned int pci_parse_of_flags(u32 addr0, int bridge) +{ + unsigned int flags = 0; + + if (addr0 & 0x02000000) { + flags = IORESOURCE_MEM | PCI_BASE_ADDRESS_SPACE_MEMORY; + flags |= (addr0 >> 22) & PCI_BASE_ADDRESS_MEM_TYPE_64; + flags |= (addr0 >> 28) & PCI_BASE_ADDRESS_MEM_TYPE_1M; + if (addr0 & 0x40000000) + flags |= IORESOURCE_PREFETCH + | PCI_BASE_ADDRESS_MEM_PREFETCH; + /* Note: We don't know whether the ROM has been left enabled + * by the firmware or not. We mark it as disabled (ie, we do + * not set the IORESOURCE_ROM_ENABLE flag) for now rather than + * do a config space read, it will be force-enabled if needed + */ + if (!bridge && (addr0 & 0xff) == 0x30) + flags |= IORESOURCE_READONLY; + } else if (addr0 & 0x01000000) + flags = IORESOURCE_IO | PCI_BASE_ADDRESS_SPACE_IO; + if (flags) + flags |= IORESOURCE_SIZEALIGN; + return flags; +} + +/** + * of_pci_parse_addrs - Parse PCI addresses assigned in the device tree node + * @node: device tree node for the PCI device + * @dev: pci_dev structure for the device + * + * This function parses the 'assigned-addresses' property of a PCI devices' + * device tree node and writes them into the associated pci_dev structure. + */ +static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) +{ + u64 base, size; + unsigned int flags; + struct resource *res; + const u32 *addrs; + u32 i; + int proplen; + + addrs = of_get_property(node, "assigned-addresses", &proplen); + if (!addrs) + return; + pr_debug(" parse addresses (%d bytes) @ %p\n", proplen, addrs); + for (; proplen >= 20; proplen -= 20, addrs += 5) { + flags = pci_parse_of_flags(addrs[0], 0); + if (!flags) + continue; + base = of_read_number(&addrs[1], 2); + size = of_read_number(&addrs[3], 2); + if (!size) + continue; + i = addrs[0] & 0xff; + pr_debug(" base: %llx, size: %llx, i: %x\n", + (unsigned long long)base, + (unsigned long long)size, i); + + if (PCI_BASE_ADDRESS_0 <= i && i <= PCI_BASE_ADDRESS_5) { + res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; + } else if (i == dev->rom_base_reg) { + res = &dev->resource[PCI_ROM_RESOURCE]; + flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + } else { + printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); + continue; + } + res->start = base; + res->end = base + size - 1; + res->flags = flags; + res->name = pci_name(dev); + } +} + +/** + * of_create_pci_dev - Given a device tree node on a pci bus, create a pci_dev + * @node: device tree node pointer + * @bus: bus the device is sitting on + * @devfn: PCI function number, extracted from device tree by caller. + */ +struct pci_dev *of_create_pci_dev(struct device_node *node, + struct pci_bus *bus, int devfn) +{ + struct pci_dev *dev; + const char *type; + + dev = alloc_pci_dev(); + if (!dev) + return NULL; + type = of_get_property(node, "device_type", NULL); + if (type == NULL) + type = ""; + + pr_debug(" create device, devfn: %x, type: %s\n", devfn, type); + + dev->bus = bus; + dev->sysdata = node; + dev->dev.parent = bus->bridge; + dev->dev.bus = &pci_bus_type; + dev->devfn = devfn; + dev->multifunction = 0; /* maybe a lie? */ + + dev->vendor = get_int_prop(node, "vendor-id", 0xffff); + dev->device = get_int_prop(node, "device-id", 0xffff); + dev->subsystem_vendor = get_int_prop(node, "subsystem-vendor-id", 0); + dev->subsystem_device = get_int_prop(node, "subsystem-id", 0); + + dev->cfg_size = pci_cfg_space_size(dev); + + dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(bus), + dev->bus->number, PCI_SLOT(devfn), PCI_FUNC(devfn)); + dev->class = get_int_prop(node, "class-code", 0); + dev->revision = get_int_prop(node, "revision-id", 0); + + pr_debug(" class: 0x%x\n", dev->class); + pr_debug(" revision: 0x%x\n", dev->revision); + + dev->current_state = 4; /* unknown power state */ + dev->error_state = pci_channel_io_normal; + dev->dma_mask = 0xffffffff; + + if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { + /* a PCI-PCI bridge */ + dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; + dev->rom_base_reg = PCI_ROM_ADDRESS1; + } else if (!strcmp(type, "cardbus")) { + dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; + } else { + dev->hdr_type = PCI_HEADER_TYPE_NORMAL; + dev->rom_base_reg = PCI_ROM_ADDRESS; + /* Maybe do a default OF mapping here */ + dev->irq = NO_IRQ; + } + + of_pci_parse_addrs(node, dev); + + pr_debug(" adding to system ...\n"); + + pci_device_add(dev, bus); + + return dev; +} +EXPORT_SYMBOL(of_create_pci_dev); + +/** + * of_scan_pci_bridge - Set up a PCI bridge and scan for child nodes + * @node: device tree node of bridge + * @dev: pci_dev structure for the bridge + * + * of_scan_bus() calls this routine for each PCI bridge that it finds, and + * this routine in turn call of_scan_bus() recusively to scan for more child + * devices. + */ +void __devinit of_scan_pci_bridge(struct device_node *node, + struct pci_dev *dev) +{ + struct pci_bus *bus; + const u32 *busrange, *ranges; + int len, i, mode; + struct resource *res; + unsigned int flags; + u64 size; + + pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + + /* parse bus-range property */ + busrange = of_get_property(node, "bus-range", &len); + if (busrange == NULL || len != 8) { + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", + node->full_name); + return; + } + ranges = of_get_property(node, "ranges", &len); + if (ranges == NULL) { + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", + node->full_name); + return; + } + + bus = pci_add_new_bus(dev->bus, dev, busrange[0]); + if (!bus) { + printk(KERN_ERR "Failed to create pci bus for %s\n", + node->full_name); + return; + } + + bus->primary = dev->bus->number; + bus->subordinate = busrange[1]; + bus->bridge_ctl = 0; + bus->sysdata = node; + + /* parse ranges property */ + /* PCI #address-cells == 3 and #size-cells == 2 always */ + res = &dev->resource[PCI_BRIDGE_RESOURCES]; + for (i = 0; i < PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES; ++i) { + res->flags = 0; + bus->resource[i] = res; + ++res; + } + i = 1; + for (; len >= 32; len -= 32, ranges += 8) { + flags = pci_parse_of_flags(ranges[0], 1); + size = of_read_number(&ranges[6], 2); + if (flags == 0 || size == 0) + continue; + if (flags & IORESOURCE_IO) { + res = bus->resource[0]; + if (res->flags) { + printk(KERN_ERR "PCI: ignoring extra I/O range" + " for bridge %s\n", node->full_name); + continue; + } + } else { + if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { + printk(KERN_ERR "PCI: too many memory ranges" + " for bridge %s\n", node->full_name); + continue; + } + res = bus->resource[i]; + ++i; + } + res->start = of_read_number(&ranges[1], 2); + res->end = res->start + size - 1; + res->flags = flags; + } + sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), + bus->number); + pr_debug(" bus name: %s\n", bus->name); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + pr_debug(" probe mode: %d\n", mode); + + if (mode == PCI_PROBE_DEVTREE) + of_scan_bus(node, bus); + else if (mode == PCI_PROBE_NORMAL) + pci_scan_child_bus(bus); +} +EXPORT_SYMBOL(of_scan_pci_bridge); + +/** + * __of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * @rescan_existing: Flag indicating bus has already been set up + */ +static void __devinit __of_scan_bus(struct device_node *node, + struct pci_bus *bus, int rescan_existing) +{ + struct device_node *child; + const u32 *reg; + int reglen, devfn; + struct pci_dev *dev; + + pr_debug("of_scan_bus(%s) bus no %d... \n", + node->full_name, bus->number); + + /* Scan direct children */ + for_each_child_of_node(node, child) { + pr_debug(" * %s\n", child->full_name); + reg = of_get_property(child, "reg", ®len); + if (reg == NULL || reglen < 20) + continue; + devfn = (reg[0] >> 8) & 0xff; + + /* create a new pci_dev for this device */ + dev = of_create_pci_dev(child, bus, devfn); + if (!dev) + continue; + pr_debug(" dev header type: %x\n", dev->hdr_type); + } + + /* Apply all fixups necessary. We don't fixup the bus "self" + * for an existing bridge that is being rescanned + */ + if (!rescan_existing) + pcibios_setup_bus_self(bus); + pcibios_setup_bus_devices(bus); + + /* Now scan child busses */ + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) { + struct device_node *child = pci_device_to_OF_node(dev); + if (dev) + of_scan_pci_bridge(child, dev); + } + } +} + +/** + * of_scan_bus - given a PCI bus node, setup bus and scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + */ +void __devinit of_scan_bus(struct device_node *node, + struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 0); +} +EXPORT_SYMBOL_GPL(of_scan_bus); + +/** + * of_rescan_bus - given a PCI bus node, scan for child devices + * @node: device tree node for the PCI bus + * @bus: pci_bus structure for the PCI bus + * + * Same as of_scan_bus, but for a pci_bus structure that has already been + * setup. + */ +void __devinit of_rescan_bus(struct device_node *node, + struct pci_bus *bus) +{ + __of_scan_bus(node, bus, 1); +} +EXPORT_SYMBOL_GPL(of_rescan_bus); + From 89c2dd62a389c5fed07c4b13c906c43214fc7491 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Tue, 25 Aug 2009 16:20:45 +0000 Subject: [PATCH 0244/1662] powerpc/pci: Pull ppc32 PCI features into common Some of the PCI features we have in ppc32 we will need on ppc64 platforms in the future. These include support for: * ppc_md.pci_exclude_device * indirect config cycles * early config cycles We also simplified the logic in fake_pci_bus() to assume it will always get a valid pci_controller. Since all current callers seem to pass it one. Signed-off-by: Kumar Gala Acked-by: Grant Likely Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/machdep.h | 6 +-- arch/powerpc/include/asm/pci-bridge.h | 35 ++++++------- arch/powerpc/kernel/pci-common.c | 71 +++++++++++++++++++++++++++ arch/powerpc/kernel/pci_32.c | 71 --------------------------- 4 files changed, 90 insertions(+), 93 deletions(-) diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index 11d1fc3a8962..9efa2be78331 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -209,14 +209,14 @@ struct machdep_calls { /* * optional PCI "hooks" */ - /* Called in indirect_* to avoid touching devices */ - int (*pci_exclude_device)(struct pci_controller *, unsigned char, unsigned char); - /* Called at then very end of pcibios_init() */ void (*pcibios_after_init)(void); #endif /* CONFIG_PPC32 */ + /* Called in indirect_* to avoid touching devices */ + int (*pci_exclude_device)(struct pci_controller *, unsigned char, unsigned char); + /* Called after PPC generic resource fixup to perform machine specific fixups */ void (*pcibios_fixup_resources)(struct pci_dev *); diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 3faf575f6b06..76e1f313a58e 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -77,9 +77,7 @@ struct pci_controller { int first_busno; int last_busno; -#ifndef CONFIG_PPC64 int self_busno; -#endif void __iomem *io_base_virt; #ifdef CONFIG_PPC64 @@ -104,7 +102,6 @@ struct pci_controller { unsigned int __iomem *cfg_addr; void __iomem *cfg_data; -#ifndef CONFIG_PPC64 /* * Used for variants of PCI indirect handling and possible quirks: * SET_CFG_TYPE - used on 4xx or any PHB that does explicit type0/1 @@ -128,7 +125,6 @@ struct pci_controller { #define PPC_INDIRECT_TYPE_BIG_ENDIAN 0x00000010 #define PPC_INDIRECT_TYPE_BROKEN_MRM 0x00000020 u32 indirect_type; -#endif /* !CONFIG_PPC64 */ /* Currently, we limit ourselves to 1 IO range and 3 mem * ranges since the common pci_bus structure can't handle more */ @@ -146,21 +142,6 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ }; -#ifndef CONFIG_PPC64 - -static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) -{ - return bus->sysdata; -} - -static inline int isa_vaddr_is_ioport(void __iomem *address) -{ - /* No specific ISA handling on ppc32 at this stage, it - * all goes through PCI - */ - return 0; -} - /* These are used for config access before all the PCI probing has been done. */ extern int early_read_config_byte(struct pci_controller *hose, int bus, @@ -182,6 +163,22 @@ extern int early_find_capability(struct pci_controller *hose, int bus, extern void setup_indirect_pci(struct pci_controller* hose, resource_size_t cfg_addr, resource_size_t cfg_data, u32 flags); + +#ifndef CONFIG_PPC64 + +static inline struct pci_controller *pci_bus_to_host(const struct pci_bus *bus) +{ + return bus->sysdata; +} + +static inline int isa_vaddr_is_ioport(void __iomem *address) +{ + /* No specific ISA handling on ppc32 at this stage, it + * all goes through PCI + */ + return 0; +} + #else /* CONFIG_PPC64 */ /* diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 725ea9144e38..8f84a9a8428e 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -1617,3 +1617,74 @@ void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) (unsigned long)hose->io_base_virt - _IO_BASE); } + +/* + * Null PCI config access functions, for the case when we can't + * find a hose. + */ +#define NULL_PCI_OP(rw, size, type) \ +static int \ +null_##rw##_config_##size(struct pci_dev *dev, int offset, type val) \ +{ \ + return PCIBIOS_DEVICE_NOT_FOUND; \ +} + +static int +null_read_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 *val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static int +null_write_config(struct pci_bus *bus, unsigned int devfn, int offset, + int len, u32 val) +{ + return PCIBIOS_DEVICE_NOT_FOUND; +} + +static struct pci_ops null_pci_ops = +{ + .read = null_read_config, + .write = null_write_config, +}; + +/* + * These functions are used early on before PCI scanning is done + * and all of the pci_dev and pci_bus structures have been created. + */ +static struct pci_bus * +fake_pci_bus(struct pci_controller *hose, int busnr) +{ + static struct pci_bus bus; + + if (hose == 0) { + printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); + } + bus.number = busnr; + bus.sysdata = hose; + bus.ops = hose? hose->ops: &null_pci_ops; + return &bus; +} + +#define EARLY_PCI_OP(rw, size, type) \ +int early_##rw##_config_##size(struct pci_controller *hose, int bus, \ + int devfn, int offset, type value) \ +{ \ + return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus), \ + devfn, offset, value); \ +} + +EARLY_PCI_OP(read, byte, u8 *) +EARLY_PCI_OP(read, word, u16 *) +EARLY_PCI_OP(read, dword, u32 *) +EARLY_PCI_OP(write, byte, u8) +EARLY_PCI_OP(write, word, u16) +EARLY_PCI_OP(write, dword, u32) + +extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); +int early_find_capability(struct pci_controller *hose, int bus, int devfn, + int cap) +{ + return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); +} diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 1e807fe7ad2c..8cf15d961c38 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -469,75 +469,4 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn) return result; } -/* - * Null PCI config access functions, for the case when we can't - * find a hose. - */ -#define NULL_PCI_OP(rw, size, type) \ -static int \ -null_##rw##_config_##size(struct pci_dev *dev, int offset, type val) \ -{ \ - return PCIBIOS_DEVICE_NOT_FOUND; \ -} -static int -null_read_config(struct pci_bus *bus, unsigned int devfn, int offset, - int len, u32 *val) -{ - return PCIBIOS_DEVICE_NOT_FOUND; -} - -static int -null_write_config(struct pci_bus *bus, unsigned int devfn, int offset, - int len, u32 val) -{ - return PCIBIOS_DEVICE_NOT_FOUND; -} - -static struct pci_ops null_pci_ops = -{ - .read = null_read_config, - .write = null_write_config, -}; - -/* - * These functions are used early on before PCI scanning is done - * and all of the pci_dev and pci_bus structures have been created. - */ -static struct pci_bus * -fake_pci_bus(struct pci_controller *hose, int busnr) -{ - static struct pci_bus bus; - - if (hose == 0) { - hose = pci_bus_to_hose(busnr); - if (hose == 0) - printk(KERN_ERR "Can't find hose for PCI bus %d!\n", busnr); - } - bus.number = busnr; - bus.sysdata = hose; - bus.ops = hose? hose->ops: &null_pci_ops; - return &bus; -} - -#define EARLY_PCI_OP(rw, size, type) \ -int early_##rw##_config_##size(struct pci_controller *hose, int bus, \ - int devfn, int offset, type value) \ -{ \ - return pci_bus_##rw##_config_##size(fake_pci_bus(hose, bus), \ - devfn, offset, value); \ -} - -EARLY_PCI_OP(read, byte, u8 *) -EARLY_PCI_OP(read, word, u16 *) -EARLY_PCI_OP(read, dword, u32 *) -EARLY_PCI_OP(write, byte, u8) -EARLY_PCI_OP(write, word, u16) -EARLY_PCI_OP(write, dword, u32) - -extern int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); -int early_find_capability(struct pci_controller *hose, int bus, int devfn, - int cap) -{ - return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); -} From e5a6a1c9094839581242c678b11c93c294108696 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 13 Aug 2009 09:37:04 +0000 Subject: [PATCH 0245/1662] powerpc: derive COMMAND_LINE_SIZE from asm-generic The default COMMAND_LINE_SIZE in asm-generic is 512, so the net effect of this change is nil, aside from the cleanup factor. See also commit 2b74b8569. Signed-off-by: Paul Gortmaker Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 817fac0a0714..dae19342f0b9 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -1,6 +1,6 @@ #ifndef _ASM_POWERPC_SETUP_H #define _ASM_POWERPC_SETUP_H -#define COMMAND_LINE_SIZE 512 +#include #endif /* _ASM_POWERPC_SETUP_H */ From 77c0a700c1c292edafa11c1e52821ce4636f81b0 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 28 Aug 2009 14:25:04 +1000 Subject: [PATCH 0246/1662] powerpc: Properly start decrementer on BookE secondary CPUs This moves the code to start the decrementer on 40x and BookE into a separate function which is now called from time_init() and secondary_time_init(), before the respective clock sources are registered. We also remove the 85xx specific code for doing it from the platform code. Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 30 ++++++++++++++++++++++-------- arch/powerpc/platforms/85xx/smp.c | 12 ------------ 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index edb1edb36469..a180b4f9a4f6 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -727,6 +727,18 @@ static int __init get_freq(char *name, int cells, unsigned long *val) return found; } +/* should become __cpuinit when secondary_cpu_time_init also is */ +void start_cpu_decrementer(void) +{ +#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + /* Clear any pending timer interrupts */ + mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); + + /* Enable decrementer interrupt */ + mtspr(SPRN_TCR, TCR_DIE); +#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ +} + void __init generic_calibrate_decr(void) { ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */ @@ -746,14 +758,6 @@ void __init generic_calibrate_decr(void) printk(KERN_ERR "WARNING: Estimating processor frequency " "(not found)\n"); } - -#if defined(CONFIG_BOOKE) || defined(CONFIG_40x) - /* Clear any pending timer interrupts */ - mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif } int update_persistent_clock(struct timespec now) @@ -914,6 +918,11 @@ static void __init init_decrementer_clockevent(void) void secondary_cpu_time_init(void) { + /* Start the decrementer on CPUs that have manual control + * such as BookE + */ + start_cpu_decrementer(); + /* FIME: Should make unrelatred change to move snapshot_timebase * call here ! */ register_decrementer_clockevent(smp_processor_id()); @@ -1017,6 +1026,11 @@ void __init time_init(void) write_sequnlock_irqrestore(&xtime_lock, flags); + /* Start the decrementer on CPUs that have manual control + * such as BookE + */ + start_cpu_decrementer(); + /* Register the clocksource, if we're not running on iSeries */ if (!firmware_has_feature(FW_FEATURE_ISERIES)) clocksource_init(); diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 9f526ba31c1e..94f901da4918 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -78,22 +78,10 @@ smp_85xx_kick_cpu(int nr) pr_debug("waited %d msecs for CPU #%d.\n", n, nr); } -static void __init -smp_85xx_basic_setup(int cpu_nr) -{ - /* Clear any pending timer interrupts */ - mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -} - static void __init smp_85xx_setup_cpu(int cpu_nr) { mpic_setup_this_cpu(); - - smp_85xx_basic_setup(cpu_nr); } struct smp_ops_t smp_85xx_ops = { From 47d25003cbd9e9030a95f7ccc4e70fec6aa7b844 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 28 Aug 2009 14:11:57 +0100 Subject: [PATCH 0247/1662] x86: Fix earlyprintk=dbgp for machines without NX Since parse_early_param() may (e.g. for earlyprintk=dbgp) involve calls to page table manipulation functions (here set_fixmap_nocache()), NX hardware support must be determined before calling that function (so that __supported_pte_mask gets properly set up). But the call after parse_early_param() can also not go away, as that will honor eventual command line specified disabling of the NX functionality. ( This will then just result in whatever mappings got established during parse_early_param() having the NX bit set despite it being disabled on the command line, but I think that's tolerable). Signed-off-by: Jan Beulich Cc: Yinghai Lu LKML-Reference: <4A97F3BD02000078000121B9@vpn.id2.novell.com> [ merged to x86/pat to resolve a conflict. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02643cc3bf26..eb1f1e6e52b0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -714,6 +714,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + parse_early_param(); /* VMI may relocate the fixmap; do this before touching ioremap area */ From f661be6c8af3ae357e159c3ac6d6aea4aaf7581c Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Thu, 13 Aug 2009 04:30:44 +0000 Subject: [PATCH 0248/1662] powerpc/44x: Update Arches dts This patch adds some nodes to the AMCC Arches dts: - L2 cache support - NOR FLASH mapping with default partitioning - I2C HWMON device (AD7414) Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer --- arch/powerpc/boot/dts/arches.dts | 50 ++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/arch/powerpc/boot/dts/arches.dts b/arch/powerpc/boot/dts/arches.dts index d9113b1e8c1d..414ef8b7e575 100644 --- a/arch/powerpc/boot/dts/arches.dts +++ b/arch/powerpc/boot/dts/arches.dts @@ -124,6 +124,16 @@ dcr-reg = <0x00c 0x002>; }; + L2C0: l2c { + compatible = "ibm,l2-cache-460gt", "ibm,l2-cache"; + dcr-reg = <0x020 0x008 /* Internal SRAM DCR's */ + 0x030 0x008>; /* L2 cache DCR's */ + cache-line-size = <32>; /* 32 bytes */ + cache-size = <262144>; /* L2, 256K */ + interrupt-parent = <&UIC1>; + interrupts = <11 1>; + }; + plb { compatible = "ibm,plb-460gt", "ibm,plb4"; #address-cells = <2>; @@ -168,6 +178,38 @@ /* ranges property is supplied by U-Boot */ interrupts = <0x6 0x4>; interrupt-parent = <&UIC1>; + + nor_flash@0,0 { + compatible = "amd,s29gl256n", "cfi-flash"; + bank-width = <2>; + reg = <0x00000000 0x00000000 0x02000000>; + #address-cells = <1>; + #size-cells = <1>; + partition@0 { + label = "kernel"; + reg = <0x00000000 0x001e0000>; + }; + partition@1e0000 { + label = "dtb"; + reg = <0x001e0000 0x00020000>; + }; + partition@200000 { + label = "root"; + reg = <0x00200000 0x00200000>; + }; + partition@400000 { + label = "user"; + reg = <0x00400000 0x01b60000>; + }; + partition@1f60000 { + label = "env"; + reg = <0x01f60000 0x00040000>; + }; + partition@1fa0000 { + label = "u-boot"; + reg = <0x01fa0000 0x00060000>; + }; + }; }; UART0: serial@ef600300 { @@ -186,6 +228,14 @@ reg = <0xef600700 0x00000014>; interrupt-parent = <&UIC0>; interrupts = <0x2 0x4>; + #address-cells = <1>; + #size-cells = <0>; + sttm@4a { + compatible = "ad,ad7414"; + reg = <0x4a>; + interrupt-parent = <&UIC1>; + interrupts = <0x0 0x8>; + }; }; IIC1: i2c@ef600800 { From 241749261377f2e10d513505a75c071c0f4b3b51 Mon Sep 17 00:00:00 2001 From: Stefan Roese Date: Thu, 13 Aug 2009 04:30:56 +0000 Subject: [PATCH 0249/1662] powerpc/44x: Update Arches defconfig This patch adds NOR MTD support and I2C HWMON support for the AD7414 to the AMCC Arches defconfig. Signed-off-by: Stefan Roese Signed-off-by: Josh Boyer --- arch/powerpc/configs/44x/arches_defconfig | 382 +++++++++++++++++++--- 1 file changed, 332 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/configs/44x/arches_defconfig b/arch/powerpc/configs/44x/arches_defconfig index f7fd32c09424..6f976b51cdd0 100644 --- a/arch/powerpc/configs/44x/arches_defconfig +++ b/arch/powerpc/configs/44x/arches_defconfig @@ -1,14 +1,14 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc2 -# Tue Jan 20 08:22:31 2009 +# Linux kernel version: 2.6.31-rc5 +# Thu Aug 13 14:14:07 2009 # # CONFIG_PPC64 is not set # # Processor support # -# CONFIG_6xx is not set +# CONFIG_PPC_BOOK3S_32 is not set # CONFIG_PPC_85xx is not set # CONFIG_PPC_8xx is not set # CONFIG_40x is not set @@ -31,15 +31,16 @@ CONFIG_GENERIC_TIME=y CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y # CONFIG_HAVE_SETUP_PER_CPU_AREA is not set CONFIG_IRQ_PER_CPU=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_ARCH_HAS_ILOG2_U32=y CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_CALIBRATE_DELAY=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_ARCH_NO_VIRT_TO_BUS is not set CONFIG_PPC=y @@ -53,11 +54,14 @@ CONFIG_PPC_UDBG_16550=y # CONFIG_GENERIC_TBSYNC is not set CONFIG_AUDIT_ARCH=y CONFIG_GENERIC_BUG=y +CONFIG_DTC=y # CONFIG_DEFAULT_UIMAGE is not set CONFIG_PPC_DCR_NATIVE=y # CONFIG_PPC_DCR_MMIO is not set CONFIG_PPC_DCR=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_CONSTRUCTORS=y # # General setup @@ -71,9 +75,19 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set # CONFIG_IKCONFIG is not set CONFIG_LOG_BUF_SHIFT=14 # CONFIG_GROUP_SCHED is not set @@ -84,8 +98,12 @@ CONFIG_SYSFS_DEPRECATED_V2=y # CONFIG_NAMESPACES is not set CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y CONFIG_EMBEDDED=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y @@ -95,23 +113,30 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y -CONFIG_COMPAT_BRK=y CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_AIO=y +CONFIG_HAVE_PERF_COUNTERS=y + +# +# Performance Counters +# +# CONFIG_PERF_COUNTERS is not set CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_STRIP_ASM_SYMS is not set +CONFIG_COMPAT_BRK=y # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y @@ -119,6 +144,12 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -130,8 +161,7 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y -CONFIG_LBD=y -# CONFIG_BLK_DEV_IO_TRACE is not set +CONFIG_LBDAF=y # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -147,11 +177,6 @@ CONFIG_DEFAULT_AS=y # CONFIG_DEFAULT_CFQ is not set # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="anticipatory" -CONFIG_CLASSIC_RCU=y -# CONFIG_TREE_RCU is not set -# CONFIG_PREEMPT_RCU is not set -# CONFIG_TREE_RCU_TRACE is not set -# CONFIG_PREEMPT_RCU_TRACE is not set # CONFIG_FREEZER is not set CONFIG_PPC4xx_PCI_EXPRESS=y @@ -172,6 +197,7 @@ CONFIG_PPC4xx_PCI_EXPRESS=y CONFIG_ARCHES=y # CONFIG_CANYONLANDS is not set # CONFIG_GLACIER is not set +# CONFIG_REDWOOD is not set # CONFIG_YOSEMITE is not set # CONFIG_XILINX_VIRTEX440_GENERIC_BOARD is not set CONFIG_PPC44x_SIMPLE=y @@ -214,6 +240,7 @@ CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_MISC is not set # CONFIG_MATH_EMULATION is not set # CONFIG_IOMMU_HELPER is not set +# CONFIG_SWIOTLB is not set CONFIG_PPC_NEED_DMA_SYNC_OPS=y CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y CONFIG_ARCH_HAS_WALK_MEMORY=y @@ -233,10 +260,14 @@ CONFIG_PHYS_ADDR_T_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 +CONFIG_STDBINUTILS=y CONFIG_PPC_4K_PAGES=y # CONFIG_PPC_16K_PAGES is not set # CONFIG_PPC_64K_PAGES is not set +# CONFIG_PPC_256K_PAGES is not set CONFIG_FORCE_MAX_ZONEORDER=11 CONFIG_PROC_DEVICETREE=y CONFIG_CMDLINE_BOOL=y @@ -261,6 +292,7 @@ CONFIG_ARCH_SUPPORTS_MSI=y # CONFIG_PCI_LEGACY is not set # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set +# CONFIG_PCI_IOV is not set # CONFIG_PCCARD is not set # CONFIG_HOTPLUG_PCI is not set # CONFIG_HAS_RAPIDIO is not set @@ -278,14 +310,12 @@ CONFIG_PAGE_OFFSET=0xc0000000 CONFIG_KERNEL_START=0xc0000000 CONFIG_PHYSICAL_START=0x00000000 CONFIG_TASK_SIZE=0xc0000000 -CONFIG_CONSISTENT_START=0xff100000 CONFIG_CONSISTENT_SIZE=0x00200000 CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set CONFIG_UNIX=y @@ -335,6 +365,8 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set +# CONFIG_IEEE802154 is not set # CONFIG_NET_SCHED is not set # CONFIG_DCB is not set @@ -347,7 +379,6 @@ CONFIG_DEFAULT_TCP_CONG="cubic" # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set # CONFIG_WIRELESS is not set # CONFIG_WIMAX is not set # CONFIG_RFKILL is not set @@ -371,8 +402,92 @@ CONFIG_EXTRA_FIRMWARE="" # CONFIG_SYS_HYPERVISOR is not set CONFIG_CONNECTOR=y CONFIG_PROC_EVENTS=y -# CONFIG_MTD is not set +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +# CONFIG_MTD_CONCAT is not set +CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_TESTS is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y +# CONFIG_MTD_AR7_PARTS is not set + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLKDEVS=y +CONFIG_MTD_BLOCK=y +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set +# CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=y +# CONFIG_MTD_JEDECPROBE is not set +CONFIG_MTD_GEN_PROBE=y +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +# CONFIG_MTD_CFI_INTELEXT is not set +CONFIG_MTD_CFI_AMDSTD=y +# CONFIG_MTD_CFI_STAA is not set +CONFIG_MTD_CFI_UTIL=y +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +# CONFIG_MTD_ABSENT is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_COMPLEX_MAPPINGS is not set +# CONFIG_MTD_PHYSMAP is not set +CONFIG_MTD_PHYSMAP_OF=y +# CONFIG_MTD_INTEL_VR_NOR is not set +# CONFIG_MTD_PLATRAM is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLOCK2MTD is not set + +# +# Disk-On-Chip Device Drivers +# +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set +# CONFIG_MTD_NAND is not set +# CONFIG_MTD_ONENAND is not set + +# +# LPDDR flash memory drivers +# +# CONFIG_MTD_LPDDR is not set + +# +# UBI - Unsorted block images +# +# CONFIG_MTD_UBI is not set CONFIG_OF_DEVICE=y +CONFIG_OF_I2C=y # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y # CONFIG_BLK_DEV_FD is not set @@ -412,7 +527,11 @@ CONFIG_HAVE_IDE=y # # -# Enable only one of the two stacks, unless you know what you are doing +# You can enable one or both FireWire driver stacks. +# + +# +# See the help texts for more information. # # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set @@ -433,6 +552,8 @@ CONFIG_NET_ETHERNET=y # CONFIG_SUNGEM is not set # CONFIG_CASSINI is not set # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set # CONFIG_NET_TULIP is not set # CONFIG_HP100 is not set CONFIG_IBM_NEW_EMAC=y @@ -451,6 +572,7 @@ CONFIG_IBM_NEW_EMAC_EMAC4=y # CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set # CONFIG_NET_PCI is not set # CONFIG_B44 is not set +# CONFIG_KS8842 is not set # CONFIG_ATL2 is not set # CONFIG_NETDEV_1000 is not set # CONFIG_NETDEV_10000 is not set @@ -461,7 +583,6 @@ CONFIG_IBM_NEW_EMAC_EMAC4=y # # CONFIG_WLAN_PRE80211 is not set # CONFIG_WLAN_80211 is not set -# CONFIG_IWLWIFI_LEDS is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -533,13 +654,143 @@ CONFIG_LEGACY_PTY_COUNT=256 # CONFIG_RAW_DRIVER is not set # CONFIG_TCG_TPM is not set CONFIG_DEVPORT=y -# CONFIG_I2C is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_ISCH is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_IBM_IIC=y +# CONFIG_I2C_MPC is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set + +# +# Graphics adapter I2C/DDC channel drivers +# +# CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set + +# +# PPS support +# +# CONFIG_PPS is not set CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set # CONFIG_POWER_SUPPLY is not set -# CONFIG_HWMON is not set +CONFIG_HWMON=y +# CONFIG_HWMON_VID is not set +CONFIG_SENSORS_AD7414=y +# CONFIG_SENSORS_AD7418 is not set +# CONFIG_SENSORS_ADM1021 is not set +# CONFIG_SENSORS_ADM1025 is not set +# CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1029 is not set +# CONFIG_SENSORS_ADM1031 is not set +# CONFIG_SENSORS_ADM9240 is not set +# CONFIG_SENSORS_ADT7462 is not set +# CONFIG_SENSORS_ADT7470 is not set +# CONFIG_SENSORS_ADT7473 is not set +# CONFIG_SENSORS_ADT7475 is not set +# CONFIG_SENSORS_ATXP1 is not set +# CONFIG_SENSORS_DS1621 is not set +# CONFIG_SENSORS_I5K_AMB is not set +# CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_F71882FG is not set +# CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set +# CONFIG_SENSORS_GL518SM is not set +# CONFIG_SENSORS_GL520SM is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_LM63 is not set +# CONFIG_SENSORS_LM75 is not set +# CONFIG_SENSORS_LM77 is not set +# CONFIG_SENSORS_LM78 is not set +# CONFIG_SENSORS_LM80 is not set +# CONFIG_SENSORS_LM83 is not set +# CONFIG_SENSORS_LM85 is not set +# CONFIG_SENSORS_LM87 is not set +# CONFIG_SENSORS_LM90 is not set +# CONFIG_SENSORS_LM92 is not set +# CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set +# CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set +# CONFIG_SENSORS_MAX1619 is not set +# CONFIG_SENSORS_MAX6650 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_DME1737 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47M192 is not set +# CONFIG_SENSORS_SMSC47B397 is not set +# CONFIG_SENSORS_ADS7828 is not set +# CONFIG_SENSORS_THMC50 is not set +# CONFIG_SENSORS_TMP401 is not set +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT1211 is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83781D is not set +# CONFIG_SENSORS_W83791D is not set +# CONFIG_SENSORS_W83792D is not set +# CONFIG_SENSORS_W83793 is not set +# CONFIG_SENSORS_W83L785TS is not set +# CONFIG_SENSORS_W83L786NG is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set +# CONFIG_HWMON_DEBUG_CHIP is not set # CONFIG_THERMAL is not set # CONFIG_THERMAL_HWMON is not set # CONFIG_WATCHDOG is not set @@ -556,24 +807,15 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set # CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_AB3100_CORE is not set # CONFIG_REGULATOR is not set - -# -# Multimedia devices -# - -# -# Multimedia core support -# -# CONFIG_VIDEO_DEV is not set -# CONFIG_DVB_CORE is not set -# CONFIG_VIDEO_MEDIA is not set - -# -# Multimedia drivers -# -CONFIG_DAB=y +# CONFIG_MEDIA_SUPPORT is not set # # Graphics support @@ -600,7 +842,12 @@ CONFIG_VIDEO_OUTPUT_CONTROL=m # CONFIG_EDAC is not set # CONFIG_RTC_CLASS is not set # CONFIG_DMADEVICES is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set + +# +# TI VLYNQ +# # CONFIG_STAGING is not set # @@ -614,11 +861,12 @@ CONFIG_EXT2_FS=y # CONFIG_REISERFS_FS is not set # CONFIG_JFS_FS is not set # CONFIG_FS_POSIX_ACL is not set -CONFIG_FILE_LOCKING=y # CONFIG_XFS_FS is not set # CONFIG_GFS2_FS is not set # CONFIG_OCFS2_FS is not set # CONFIG_BTRFS_FS is not set +CONFIG_FILE_LOCKING=y +CONFIG_FSNOTIFY=y CONFIG_DNOTIFY=y CONFIG_INOTIFY=y CONFIG_INOTIFY_USER=y @@ -627,6 +875,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -660,6 +913,17 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set # CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=y +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +# CONFIG_JFFS2_SUMMARY is not set +# CONFIG_JFFS2_FS_XATTR is not set +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +# CONFIG_JFFS2_LZO is not set +CONFIG_JFFS2_RTIME=y +# CONFIG_JFFS2_RUBIN is not set CONFIG_CRAMFS=y # CONFIG_SQUASHFS is not set # CONFIG_VXFS_FS is not set @@ -670,6 +934,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -681,7 +946,6 @@ CONFIG_LOCKD=y CONFIG_LOCKD_V4=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y -# CONFIG_SUNRPC_REGISTER_V4 is not set # CONFIG_RPCSEC_GSS_KRB5 is not set # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -697,6 +961,7 @@ CONFIG_SUNRPC=y CONFIG_MSDOS_PARTITION=y # CONFIG_NLS is not set # CONFIG_DLM is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines @@ -711,11 +976,14 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_DECOMPRESS_GZIP=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y CONFIG_HAVE_LMB=y +CONFIG_NLATTR=y +CONFIG_GENERIC_ATOMIC64=y # # Kernel hacking @@ -733,6 +1001,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -743,6 +1014,9 @@ CONFIG_SCHED_DEBUG=y # CONFIG_RT_MUTEX_TESTER is not set # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -754,7 +1028,6 @@ CONFIG_SCHED_DEBUG=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_RCU_CPU_STALL_DETECTOR is not set # CONFIG_BACKTRACE_SELF_TEST is not set @@ -762,27 +1035,36 @@ CONFIG_SCHED_DEBUG=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y - -# -# Tracers -# +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y # CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set -# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_ENABLE_DEFAULT_TRACERS is not set # CONFIG_BOOT_TRACER is not set -# CONFIG_TRACE_BRANCH_PROFILING is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_PROFILE_ALL_BRANCHES is not set # CONFIG_STACK_TRACER is not set -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DYNAMIC_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_PPC_DISABLE_WERROR is not set +CONFIG_PPC_WERROR=y CONFIG_PRINT_STACK_DEPTH=64 # CONFIG_DEBUG_STACKOVERFLOW is not set # CONFIG_DEBUG_STACK_USAGE is not set -# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PPC_EMULATED_STATS is not set # CONFIG_CODE_PATCHING_SELFTEST is not set # CONFIG_FTR_FIXUP_SELFTEST is not set # CONFIG_MSI_BITMAP_SELFTEST is not set From c9f75093a497bdc7389df983e914da17fad20d95 Mon Sep 17 00:00:00 2001 From: "fkan@amcc.com" Date: Wed, 12 Aug 2009 14:38:47 +0000 Subject: [PATCH 0250/1662] powerpc/44x: Add Eiger AMCC (AppliedMicro) PPC460SX evaluation board support. This patch adds support for the AMCC (AppliedMicro) PPC460SX Eiger evaluation board. Signed-off-by: Tai Tri Nguyen Acked-by: Feng Kan Acked-by: Tirumala Marri Signed-off-by: Josh Boyer --- arch/powerpc/boot/dts/eiger.dts | 421 +++++++ arch/powerpc/configs/44x/eiger_defconfig | 1252 ++++++++++++++++++++ arch/powerpc/platforms/44x/Kconfig | 12 + arch/powerpc/platforms/44x/ppc44x_simple.c | 1 + 4 files changed, 1686 insertions(+) create mode 100644 arch/powerpc/boot/dts/eiger.dts create mode 100644 arch/powerpc/configs/44x/eiger_defconfig diff --git a/arch/powerpc/boot/dts/eiger.dts b/arch/powerpc/boot/dts/eiger.dts new file mode 100644 index 000000000000..c4a934f2e886 --- /dev/null +++ b/arch/powerpc/boot/dts/eiger.dts @@ -0,0 +1,421 @@ +/* + * Device Tree Source for AMCC (AppliedMicro) Eiger(460SX) + * + * Copyright 2009 AMCC (AppliedMicro) + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without + * any warranty of any kind, whether express or implied. + */ + +/dts-v1/; + +/ { + #address-cells = <2>; + #size-cells = <1>; + model = "amcc,eiger"; + compatible = "amcc,eiger"; + dcr-parent = <&{/cpus/cpu@0}>; + + aliases { + ethernet0 = &EMAC0; + ethernet1 = &EMAC1; + ethernet2 = &EMAC2; + ethernet3 = &EMAC3; + serial0 = &UART0; + serial1 = &UART1; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + model = "PowerPC,460SX"; + reg = <0x00000000>; + clock-frequency = <0>; /* Filled in by U-Boot */ + timebase-frequency = <0>; /* Filled in by U-Boot */ + i-cache-line-size = <32>; + d-cache-line-size = <32>; + i-cache-size = <32768>; + d-cache-size = <32768>; + dcr-controller; + dcr-access-method = "native"; + }; + }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x00000000 0x00000000>; /* Filled in by U-Boot */ + }; + + UIC0: interrupt-controller0 { + compatible = "ibm,uic-460sx","ibm,uic"; + interrupt-controller; + cell-index = <0>; + dcr-reg = <0x0c0 0x009>; + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + }; + + UIC1: interrupt-controller1 { + compatible = "ibm,uic-460sx","ibm,uic"; + interrupt-controller; + cell-index = <1>; + dcr-reg = <0x0d0 0x009>; + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + interrupts = <0x1e 0x4 0x1f 0x4>; /* cascade */ + interrupt-parent = <&UIC0>; + }; + + UIC2: interrupt-controller2 { + compatible = "ibm,uic-460sx","ibm,uic"; + interrupt-controller; + cell-index = <2>; + dcr-reg = <0x0e0 0x009>; + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + interrupts = <0xa 0x4 0xb 0x4>; /* cascade */ + interrupt-parent = <&UIC0>; + }; + + UIC3: interrupt-controller3 { + compatible = "ibm,uic-460sx","ibm,uic"; + interrupt-controller; + cell-index = <3>; + dcr-reg = <0x0f0 0x009>; + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + interrupts = <0x10 0x4 0x11 0x4>; /* cascade */ + interrupt-parent = <&UIC0>; + }; + + SDR0: sdr { + compatible = "ibm,sdr-460sx"; + dcr-reg = <0x00e 0x002>; + }; + + CPR0: cpr { + compatible = "ibm,cpr-460sx"; + dcr-reg = <0x00c 0x002>; + }; + + plb { + compatible = "ibm,plb-460sx", "ibm,plb4"; + #address-cells = <2>; + #size-cells = <1>; + ranges; + clock-frequency = <0>; /* Filled in by U-Boot */ + + SDRAM0: sdram { + compatible = "ibm,sdram-460sx", "ibm,sdram-405gp"; + dcr-reg = <0x010 0x002>; + }; + + MAL0: mcmal { + compatible = "ibm,mcmal-460sx", "ibm,mcmal2"; + dcr-reg = <0x180 0x62>; + num-tx-chans = <4>; + num-rx-chans = <32>; + #address-cells = <1>; + #size-cells = <1>; + interrupt-parent = <&UIC1>; + interrupts = < /*TXEOB*/ 0x6 0x4 + /*RXEOB*/ 0x7 0x4 + /*SERR*/ 0x1 0x4 + /*TXDE*/ 0x2 0x4 + /*RXDE*/ 0x3 0x4 + /*COAL TX0*/ 0x18 0x2 + /*COAL TX1*/ 0x19 0x2 + /*COAL TX2*/ 0x1a 0x2 + /*COAL TX3*/ 0x1b 0x2 + /*COAL RX0*/ 0x1c 0x2 + /*COAL RX1*/ 0x1d 0x2 + /*COAL RX2*/ 0x1e 0x2 + /*COAL RX3*/ 0x1f 0x2>; + }; + + POB0: opb { + compatible = "ibm,opb-460sx", "ibm,opb"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0xb0000000 0x00000004 0xb0000000 0x50000000>; + clock-frequency = <0>; /* Filled in by U-Boot */ + + EBC0: ebc { + compatible = "ibm,ebc-460sx", "ibm,ebc"; + dcr-reg = <0x012 0x002>; + #address-cells = <2>; + #size-cells = <1>; + clock-frequency = <0>; /* Filled in by U-Boot */ + /* ranges property is supplied by U-Boot */ + interrupts = <0x6 0x4>; + interrupt-parent = <&UIC1>; + + nor_flash@0,0 { + compatible = "amd,s29gl512n", "cfi-flash"; + bank-width = <2>; + /* reg property is supplied in by U-Boot */ + #address-cells = <1>; + #size-cells = <1>; + partition@0 { + label = "kernel"; + reg = <0x00000000 0x001e0000>; + }; + partition@1e0000 { + label = "dtb"; + reg = <0x001e0000 0x00020000>; + }; + partition@200000 { + label = "ramdisk"; + reg = <0x00200000 0x01400000>; + }; + partition@1600000 { + label = "jffs2"; + reg = <0x01600000 0x00400000>; + }; + partition@1a00000 { + label = "user"; + reg = <0x01a00000 0x02560000>; + }; + partition@3f60000 { + label = "env"; + reg = <0x03f60000 0x00040000>; + }; + partition@3fa0000 { + label = "u-boot"; + reg = <0x03fa0000 0x00060000>; + }; + }; + + ndfc@1,0 { + compatible = "ibm,ndfc"; + /* reg property is supplied by U-boot */ + ccr = <0x00003000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + partition@0 { + label = "uboot"; + reg = <0x00000000 0x00200000>; + }; + partition@200000 { + label = "uboot-environment"; + reg = <0x00200000 0x00100000>; + }; + partition@300000 { + label = "linux"; + reg = <0x00300000 0x00300000>; + }; + partition@600000 { + label = "root-file-system"; + reg = <0x00600000 0x01900000>; + }; + partition@1f00000 { + label = "device-tree"; + reg = <0x01f00000 0x00020000>; + }; + partition@1f20000 { + label = "data"; + reg = <0x01f20000 0x060E0000>; + }; + }; + }; + }; + + UART0: serial@ef600200 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xef600200 0x00000008>; + virtual-reg = <0xef600200>; + clock-frequency = <0>; /* Filled in by U-Boot */ + current-speed = <0>; /* Filled in by U-Boot */ + interrupt-parent = <&UIC0>; + interrupts = <0x0 0x4>; + }; + + UART1: serial@ef600300 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xef600300 0x00000008>; + virtual-reg = <0xef600300>; + clock-frequency = <0>; /* Filled in by U-Boot */ + current-speed = <0>; /* Filled in by U-Boot */ + interrupt-parent = <&UIC0>; + interrupts = <0x1 0x4>; + }; + + IIC0: i2c@ef600400 { + compatible = "ibm,iic-460sx", "ibm,iic"; + reg = <0xef600400 0x00000014>; + interrupt-parent = <&UIC0>; + interrupts = <0x2 0x4>; + #address-cells = <1>; + #size-cells = <0>; + index = <0>; + }; + + IIC1: i2c@ef600500 { + compatible = "ibm,iic-460sx", "ibm,iic"; + reg = <0xef600500 0x00000014>; + interrupt-parent = <&UIC0>; + interrupts = <0x3 0x4>; + #address-cells = <1>; + #size-cells = <0>; + index = <1>; + }; + + RGMII0: emac-rgmii@ef600900 { + compatible = "ibm,rgmii-460sx", "ibm,rgmii"; + reg = <0xef600900 0x00000008>; + has-mdio; + }; + + RGMII1: emac-rgmii@ef600920 { + compatible = "ibm,rgmii-460sx", "ibm,rgmii"; + reg = <0xef600920 0x00000008>; + has-mdio; + }; + + TAH0: emac-tah@ef600e50 { + compatible = "ibm,tah-460sx", "ibm,tah"; + reg = <0xef600e50 0x00000030>; + }; + + TAH1: emac-tah@ef600f50 { + compatible = "ibm,tah-460sx", "ibm,tah"; + reg = <0xef600f50 0x00000030>; + }; + + EMAC0: ethernet@ef600a00 { + device_type = "network"; + compatible = "ibm,emac-460sx", "ibm,emac4"; + interrupt-parent = <&EMAC0>; + interrupts = <0x0 0x1>; + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + interrupt-map = ; + reg = <0xef600a00 0x00000070>; + local-mac-address = [000000000000]; /* Filled in by U-Boot */ + mal-device = <&MAL0>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <0>; + max-frame-size = <9000>; + rx-fifo-size = <4096>; + tx-fifo-size = <2048>; + phy-mode = "rgmii"; + phy-map = <0x00000000>; + rgmii-device = <&RGMII0>; + rgmii-channel = <0>; + tah-device = <&TAH0>; + tah-channel = <0>; + has-inverted-stacr-oc; + has-new-stacr-staopc; + }; + + EMAC1: ethernet@ef600b00 { + device_type = "network"; + compatible = "ibm,emac-460sx", "ibm,emac4"; + interrupt-parent = <&EMAC1>; + interrupts = <0x0 0x1>; + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + interrupt-map = ; + reg = <0xef600b00 0x00000070>; + local-mac-address = [000000000000]; /* Filled in by U-Boot */ + mal-device = <&MAL0>; + mal-tx-channel = <1>; + mal-rx-channel = <8>; + cell-index = <1>; + max-frame-size = <9000>; + rx-fifo-size = <4096>; + tx-fifo-size = <2048>; + phy-mode = "rgmii"; + phy-map = <0x00000000>; + rgmii-device = <&RGMII0>; + rgmii-channel = <1>; + tah-device = <&TAH1>; + tah-channel = <1>; + has-inverted-stacr-oc; + has-new-stacr-staopc; + mdio-device = <&EMAC0>; + }; + + EMAC2: ethernet@ef600c00 { + device_type = "network"; + compatible = "ibm,emac-460sx", "ibm,emac4"; + interrupt-parent = <&EMAC2>; + interrupts = <0x0 0x1>; + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + interrupt-map = ; + reg = <0xef600c00 0x00000070>; + local-mac-address = [000000000000]; /* Filled in by U-Boot */ + mal-device = <&MAL0>; + mal-tx-channel = <2>; + mal-rx-channel = <16>; + cell-index = <2>; + max-frame-size = <9000>; + rx-fifo-size = <4096>; + tx-fifo-size = <2048>; + phy-mode = "rgmii"; + phy-map = <0x00000000>; + rgmii-device = <&RGMII1>; + rgmii-channel = <0>; + has-inverted-stacr-oc; + has-new-stacr-staopc; + mdio-device = <&EMAC0>; + }; + + EMAC3: ethernet@ef600d00 { + device_type = "network"; + compatible = "ibm,emac-460sx", "ibm,emac4"; + interrupt-parent = <&EMAC3>; + interrupts = <0x0 0x1>; + #interrupt-cells = <1>; + #address-cells = <0>; + #size-cells = <0>; + interrupt-map = ; + reg = <0xef600d00 0x00000070>; + local-mac-address = [000000000000]; /* Filled in by U-Boot */ + mal-device = <&MAL0>; + mal-tx-channel = <3>; + mal-rx-channel = <24>; + cell-index = <3>; + max-frame-size = <9000>; + rx-fifo-size = <4096>; + tx-fifo-size = <2048>; + phy-mode = "rgmii"; + phy-map = <0x00000000>; + rgmii-device = <&RGMII1>; + rgmii-channel = <1>; + has-inverted-stacr-oc; + has-new-stacr-staopc; + mdio-device = <&EMAC0>; + }; + }; + + }; + chosen { + linux,stdout-path = "/plb/opb/serial@ef600200"; + }; + +}; diff --git a/arch/powerpc/configs/44x/eiger_defconfig b/arch/powerpc/configs/44x/eiger_defconfig new file mode 100644 index 000000000000..007f3bd939e7 --- /dev/null +++ b/arch/powerpc/configs/44x/eiger_defconfig @@ -0,0 +1,1252 @@ +# +# Automatically generated make config: don't edit +# Linux kernel version: 2.6.31-rc6 +# Wed Aug 19 13:06:50 2009 +# +# CONFIG_PPC64 is not set + +# +# Processor support +# +# CONFIG_PPC_BOOK3S_32 is not set +# CONFIG_PPC_85xx is not set +# CONFIG_PPC_8xx is not set +# CONFIG_40x is not set +CONFIG_44x=y +# CONFIG_E200 is not set +CONFIG_PPC_FPU=y +CONFIG_4xx=y +CONFIG_BOOKE=y +CONFIG_PTE_64BIT=y +CONFIG_PHYS_64BIT=y +CONFIG_PPC_MMU_NOHASH=y +CONFIG_PPC_MMU_NOHASH_32=y +# CONFIG_PPC_MM_SLICES is not set +CONFIG_NOT_COHERENT_CACHE=y +CONFIG_PPC32=y +CONFIG_WORD_SIZE=32 +CONFIG_ARCH_PHYS_ADDR_T_64BIT=y +CONFIG_MMU=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y +# CONFIG_HAVE_SETUP_PER_CPU_AREA is not set +CONFIG_IRQ_PER_CPU=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_HAVE_LATENCYTOP_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_ARCH_HAS_ILOG2_U32=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_GENERIC_FIND_NEXT_BIT=y +# CONFIG_ARCH_NO_VIRT_TO_BUS is not set +CONFIG_PPC=y +CONFIG_EARLY_PRINTK=y +CONFIG_GENERIC_NVRAM=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_PPC_OF=y +CONFIG_OF=y +CONFIG_PPC_UDBG_16550=y +# CONFIG_GENERIC_TBSYNC is not set +CONFIG_AUDIT_ARCH=y +CONFIG_GENERIC_BUG=y +CONFIG_DTC=y +# CONFIG_DEFAULT_UIMAGE is not set +CONFIG_PPC_DCR_NATIVE=y +# CONFIG_PPC_DCR_MMIO is not set +CONFIG_PPC_DCR=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" +CONFIG_CONSTRUCTORS=y + +# +# General setup +# +CONFIG_EXPERIMENTAL=y +CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +# CONFIG_BSD_PROCESS_ACCT is not set +# CONFIG_TASKSTATS is not set +# CONFIG_AUDIT is not set + +# +# RCU Subsystem +# +CONFIG_CLASSIC_RCU=y +# CONFIG_TREE_RCU is not set +# CONFIG_PREEMPT_RCU is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set +# CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_GROUP_SCHED is not set +# CONFIG_CGROUPS is not set +CONFIG_SYSFS_DEPRECATED=y +CONFIG_SYSFS_DEPRECATED_V2=y +# CONFIG_RELAY is not set +# CONFIG_NAMESPACES is not set +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_EMBEDDED=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_ALL is not set +# CONFIG_KALLSYMS_EXTRA_PASS is not set +CONFIG_HOTPLUG=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_HAVE_PERF_COUNTERS=y + +# +# Performance Counters +# +# CONFIG_PERF_COUNTERS is not set +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_PCI_QUIRKS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_STRIP_ASM_SYMS is not set +CONFIG_COMPAT_BRK=y +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +# CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set +CONFIG_HAVE_OPROFILE=y +# CONFIG_KPROBES is not set +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_ARCH_TRACEHOOK=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +# CONFIG_SLOW_WORK is not set +# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set +CONFIG_SLABINFO=y +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +# CONFIG_MODVERSIONS is not set +# CONFIG_MODULE_SRCVERSION_ALL is not set +CONFIG_BLOCK=y +CONFIG_LBDAF=y +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_BLK_DEV_INTEGRITY is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" +# CONFIG_FREEZER is not set +CONFIG_PPC4xx_PCI_EXPRESS=y + +# +# Platform support +# +# CONFIG_PPC_CELL is not set +# CONFIG_PPC_CELL_NATIVE is not set +# CONFIG_PQ2ADS is not set +# CONFIG_BAMBOO is not set +# CONFIG_EBONY is not set +# CONFIG_SAM440EP is not set +# CONFIG_SEQUOIA is not set +# CONFIG_TAISHAN is not set +# CONFIG_KATMAI is not set +# CONFIG_RAINIER is not set +# CONFIG_WARP is not set +# CONFIG_ARCHES is not set +# CONFIG_CANYONLANDS is not set +# CONFIG_GLACIER is not set +# CONFIG_REDWOOD is not set +CONFIG_EIGER=y +# CONFIG_YOSEMITE is not set +# CONFIG_XILINX_VIRTEX440_GENERIC_BOARD is not set +CONFIG_PPC44x_SIMPLE=y +# CONFIG_PPC4xx_GPIO is not set +CONFIG_460SX=y +# CONFIG_IPIC is not set +# CONFIG_MPIC is not set +# CONFIG_MPIC_WEIRD is not set +# CONFIG_PPC_I8259 is not set +# CONFIG_PPC_RTAS is not set +# CONFIG_MMIO_NVRAM is not set +# CONFIG_PPC_MPC106 is not set +# CONFIG_PPC_970_NAP is not set +# CONFIG_PPC_INDIRECT_IO is not set +# CONFIG_GENERIC_IOMAP is not set +# CONFIG_CPU_FREQ is not set +# CONFIG_FSL_ULI1575 is not set +# CONFIG_SIMPLE_GPIO is not set + +# +# Kernel options +# +# CONFIG_HIGHMEM is not set +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_300 is not set +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_SCHED_HRTICK=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_BINFMT_ELF=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_HAVE_AOUT is not set +# CONFIG_BINFMT_MISC is not set +# CONFIG_MATH_EMULATION is not set +# CONFIG_IOMMU_HELPER is not set +# CONFIG_SWIOTLB is not set +CONFIG_PPC_NEED_DMA_SYNC_OPS=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_HAS_WALK_MEMORY=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_ARCH_FLATMEM_ENABLE=y +CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_FLATMEM_MANUAL=y +# CONFIG_DISCONTIGMEM_MANUAL is not set +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_FLATMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y +CONFIG_PAGEFLAGS_EXTENDED=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MIGRATION=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 +CONFIG_STDBINUTILS=y +CONFIG_PPC_4K_PAGES=y +# CONFIG_PPC_16K_PAGES is not set +# CONFIG_PPC_64K_PAGES is not set +# CONFIG_PPC_256K_PAGES is not set +CONFIG_FORCE_MAX_ZONEORDER=11 +CONFIG_PROC_DEVICETREE=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="" +CONFIG_EXTRA_TARGETS="" +CONFIG_SECCOMP=y +CONFIG_ISA_DMA_API=y + +# +# Bus options +# +CONFIG_ZONE_DMA=y +CONFIG_PPC_INDIRECT_PCI=y +CONFIG_4xx_SOC=y +CONFIG_PPC_PCI_CHOICE=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCI_SYSCALL=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCIEAER=y +# CONFIG_PCIE_ECRC is not set +# CONFIG_PCIEAER_INJECT is not set +# CONFIG_PCIEASPM is not set +CONFIG_ARCH_SUPPORTS_MSI=y +# CONFIG_PCI_MSI is not set +CONFIG_PCI_LEGACY=y +# CONFIG_PCI_DEBUG is not set +# CONFIG_PCI_STUB is not set +# CONFIG_PCI_IOV is not set +# CONFIG_PCCARD is not set +# CONFIG_HOTPLUG_PCI is not set +# CONFIG_HAS_RAPIDIO is not set + +# +# Advanced setup +# +# CONFIG_ADVANCED_OPTIONS is not set + +# +# Default settings for advanced configuration options are used +# +CONFIG_LOWMEM_SIZE=0x30000000 +CONFIG_PAGE_OFFSET=0xc0000000 +CONFIG_KERNEL_START=0xc0000000 +CONFIG_PHYSICAL_START=0x00000000 +CONFIG_TASK_SIZE=0xc0000000 +CONFIG_CONSISTENT_SIZE=0x00200000 +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +# CONFIG_PACKET_MMAP is not set +CONFIG_UNIX=y +# CONFIG_NET_KEY is not set +CONFIG_INET=y +# CONFIG_IP_MULTICAST is not set +# CONFIG_IP_ADVANCED_ROUTER is not set +CONFIG_IP_FIB_HASH=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +# CONFIG_IP_PNP_RARP is not set +# CONFIG_NET_IPIP is not set +# CONFIG_NET_IPGRE is not set +# CONFIG_ARPD is not set +# CONFIG_SYN_COOKIES is not set +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set +# CONFIG_INET_XFRM_TUNNEL is not set +# CONFIG_INET_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +CONFIG_INET_DIAG=y +CONFIG_INET_TCP_DIAG=y +# CONFIG_TCP_CONG_ADVANCED is not set +CONFIG_TCP_CONG_CUBIC=y +CONFIG_DEFAULT_TCP_CONG="cubic" +# CONFIG_TCP_MD5SIG is not set +# CONFIG_IPV6 is not set +# CONFIG_NETWORK_SECMARK is not set +# CONFIG_NETFILTER is not set +# CONFIG_IP_DCCP is not set +# CONFIG_IP_SCTP is not set +# CONFIG_TIPC is not set +# CONFIG_ATM is not set +# CONFIG_BRIDGE is not set +# CONFIG_NET_DSA is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_DECNET is not set +# CONFIG_LLC2 is not set +# CONFIG_IPX is not set +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +# CONFIG_ECONET is not set +# CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set +# CONFIG_IEEE802154 is not set +# CONFIG_NET_SCHED is not set +# CONFIG_DCB is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_HAMRADIO is not set +# CONFIG_CAN is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set +# CONFIG_AF_RXRPC is not set +CONFIG_WIRELESS=y +# CONFIG_CFG80211 is not set +CONFIG_WIRELESS_OLD_REGULATORY=y +# CONFIG_WIRELESS_EXT is not set +# CONFIG_LIB80211 is not set + +# +# CFG80211 needs to be enabled for MAC80211 +# +CONFIG_MAC80211_DEFAULT_PS_VALUE=0 +# CONFIG_WIMAX is not set +# CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y +CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_SYS_HYPERVISOR is not set +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +CONFIG_MTD_CONCAT=y +CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_TESTS is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_OF_PARTS=y +# CONFIG_MTD_AR7_PARTS is not set + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=y +CONFIG_MTD_BLKDEVS=y +CONFIG_MTD_BLOCK=y +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set +# CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=y +# CONFIG_MTD_JEDECPROBE is not set +CONFIG_MTD_GEN_PROBE=y +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +# CONFIG_MTD_CFI_INTELEXT is not set +CONFIG_MTD_CFI_AMDSTD=y +# CONFIG_MTD_CFI_STAA is not set +CONFIG_MTD_CFI_UTIL=y +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +# CONFIG_MTD_ABSENT is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_COMPLEX_MAPPINGS is not set +# CONFIG_MTD_PHYSMAP is not set +CONFIG_MTD_PHYSMAP_OF=y +# CONFIG_MTD_INTEL_VR_NOR is not set +# CONFIG_MTD_PLATRAM is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLOCK2MTD is not set + +# +# Disk-On-Chip Device Drivers +# +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set +CONFIG_MTD_NAND=y +# CONFIG_MTD_NAND_VERIFY_WRITE is not set +CONFIG_MTD_NAND_ECC_SMC=y +# CONFIG_MTD_NAND_MUSEUM_IDS is not set +CONFIG_MTD_NAND_IDS=y +CONFIG_MTD_NAND_NDFC=y +# CONFIG_MTD_NAND_DISKONCHIP is not set +# CONFIG_MTD_NAND_CAFE is not set +# CONFIG_MTD_NAND_NANDSIM is not set +# CONFIG_MTD_NAND_PLATFORM is not set +# CONFIG_MTD_NAND_FSL_ELBC is not set +# CONFIG_MTD_ONENAND is not set + +# +# LPDDR flash memory drivers +# +# CONFIG_MTD_LPDDR is not set + +# +# UBI - Unsorted block images +# +# CONFIG_MTD_UBI is not set +CONFIG_OF_DEVICE=y +CONFIG_OF_I2C=y +# CONFIG_PARPORT is not set +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_FD is not set +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +# CONFIG_BLK_DEV_COW_COMMON is not set +# CONFIG_BLK_DEV_LOOP is not set +# CONFIG_BLK_DEV_NBD is not set +# CONFIG_BLK_DEV_SX8 is not set +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=35000 +# CONFIG_BLK_DEV_XIP is not set +# CONFIG_CDROM_PKTCDVD is not set +# CONFIG_ATA_OVER_ETH is not set +# CONFIG_XILINX_SYSACE is not set +# CONFIG_BLK_DEV_HD is not set +# CONFIG_MISC_DEVICES is not set +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +# CONFIG_RAID_ATTRS is not set +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +# CONFIG_SCSI_TGT is not set +# CONFIG_SCSI_NETLINK is not set +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +# CONFIG_CHR_DEV_ST is not set +# CONFIG_CHR_DEV_OSST is not set +# CONFIG_BLK_DEV_SR is not set +CONFIG_CHR_DEV_SG=y +# CONFIG_CHR_DEV_SCH is not set +# CONFIG_SCSI_MULTI_LUN is not set +# CONFIG_SCSI_CONSTANTS is not set +# CONFIG_SCSI_LOGGING is not set +# CONFIG_SCSI_SCAN_ASYNC is not set +CONFIG_SCSI_WAIT_SCAN=m + +# +# SCSI Transports +# +# CONFIG_SCSI_SPI_ATTRS is not set +# CONFIG_SCSI_FC_ATTRS is not set +# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_SAS_ATTRS=y +# CONFIG_SCSI_SAS_LIBSAS is not set +# CONFIG_SCSI_SRP_ATTRS is not set +CONFIG_SCSI_LOWLEVEL=y +# CONFIG_ISCSI_TCP is not set +# CONFIG_SCSI_BNX2_ISCSI is not set +# CONFIG_BLK_DEV_3W_XXXX_RAID is not set +# CONFIG_SCSI_3W_9XXX is not set +# CONFIG_SCSI_ACARD is not set +# CONFIG_SCSI_AACRAID is not set +# CONFIG_SCSI_AIC7XXX is not set +# CONFIG_SCSI_AIC7XXX_OLD is not set +# CONFIG_SCSI_AIC79XX is not set +# CONFIG_SCSI_AIC94XX is not set +# CONFIG_SCSI_MVSAS is not set +# CONFIG_SCSI_DPT_I2O is not set +# CONFIG_SCSI_ADVANSYS is not set +# CONFIG_SCSI_ARCMSR is not set +# CONFIG_MEGARAID_NEWGEN is not set +# CONFIG_MEGARAID_LEGACY is not set +# CONFIG_MEGARAID_SAS is not set +# CONFIG_SCSI_MPT2SAS is not set +# CONFIG_SCSI_HPTIOP is not set +# CONFIG_SCSI_BUSLOGIC is not set +# CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set +# CONFIG_FCOE is not set +# CONFIG_SCSI_DMX3191D is not set +# CONFIG_SCSI_EATA is not set +# CONFIG_SCSI_FUTURE_DOMAIN is not set +# CONFIG_SCSI_GDTH is not set +# CONFIG_SCSI_IPS is not set +# CONFIG_SCSI_INITIO is not set +# CONFIG_SCSI_INIA100 is not set +# CONFIG_SCSI_STEX is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set +# CONFIG_SCSI_QLOGIC_1280 is not set +# CONFIG_SCSI_QLA_FC is not set +# CONFIG_SCSI_QLA_ISCSI is not set +# CONFIG_SCSI_LPFC is not set +# CONFIG_SCSI_DC395x is not set +# CONFIG_SCSI_DC390T is not set +# CONFIG_SCSI_NSP32 is not set +# CONFIG_SCSI_DEBUG is not set +# CONFIG_SCSI_SRP is not set +# CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set +# CONFIG_ATA is not set +# CONFIG_MD is not set +CONFIG_FUSION=y +# CONFIG_FUSION_SPI is not set +# CONFIG_FUSION_FC is not set +CONFIG_FUSION_SAS=y +CONFIG_FUSION_MAX_SGE=128 +# CONFIG_FUSION_CTL is not set +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# + +# +# You can enable one or both FireWire driver stacks. +# + +# +# See the help texts for more information. +# +# CONFIG_FIREWIRE is not set +# CONFIG_IEEE1394 is not set +CONFIG_I2O=y +CONFIG_I2O_LCT_NOTIFY_ON_CHANGES=y +CONFIG_I2O_EXT_ADAPTEC=y +# CONFIG_I2O_CONFIG is not set +# CONFIG_I2O_BUS is not set +# CONFIG_I2O_BLOCK is not set +# CONFIG_I2O_SCSI is not set +# CONFIG_I2O_PROC is not set +# CONFIG_MACINTOSH_DRIVERS is not set +CONFIG_NETDEVICES=y +# CONFIG_DUMMY is not set +# CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set +# CONFIG_EQUALIZER is not set +# CONFIG_TUN is not set +# CONFIG_VETH is not set +# CONFIG_ARCNET is not set +# CONFIG_PHYLIB is not set +CONFIG_NET_ETHERNET=y +# CONFIG_MII is not set +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_CASSINI is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set +# CONFIG_NET_TULIP is not set +# CONFIG_HP100 is not set +CONFIG_IBM_NEW_EMAC=y +CONFIG_IBM_NEW_EMAC_RXB=256 +CONFIG_IBM_NEW_EMAC_TXB=256 +CONFIG_IBM_NEW_EMAC_POLL_WEIGHT=32 +CONFIG_IBM_NEW_EMAC_RX_COPY_THRESHOLD=256 +CONFIG_IBM_NEW_EMAC_RX_SKB_HEADROOM=0 +# CONFIG_IBM_NEW_EMAC_DEBUG is not set +CONFIG_IBM_NEW_EMAC_ZMII=y +CONFIG_IBM_NEW_EMAC_RGMII=y +CONFIG_IBM_NEW_EMAC_TAH=y +CONFIG_IBM_NEW_EMAC_EMAC4=y +# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set +# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set +# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set +# CONFIG_NET_PCI is not set +# CONFIG_B44 is not set +# CONFIG_KS8842 is not set +# CONFIG_ATL2 is not set +CONFIG_NETDEV_1000=y +# CONFIG_ACENIC is not set +# CONFIG_DL2K is not set +# CONFIG_E1000 is not set +CONFIG_E1000E=y +# CONFIG_IP1000 is not set +# CONFIG_IGB is not set +# CONFIG_IGBVF is not set +# CONFIG_NS83820 is not set +# CONFIG_HAMACHI is not set +# CONFIG_YELLOWFIN is not set +# CONFIG_R8169 is not set +# CONFIG_SIS190 is not set +# CONFIG_SKGE is not set +# CONFIG_SKY2 is not set +# CONFIG_VIA_VELOCITY is not set +# CONFIG_TIGON3 is not set +# CONFIG_BNX2 is not set +# CONFIG_CNIC is not set +# CONFIG_MV643XX_ETH is not set +# CONFIG_XILINX_LL_TEMAC is not set +# CONFIG_QLA3XXX is not set +# CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set +# CONFIG_JME is not set +# CONFIG_NETDEV_10000 is not set +# CONFIG_TR is not set + +# +# Wireless LAN +# +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set + +# +# Enable WiMAX (Networking options) to see the WiMAX drivers +# +# CONFIG_WAN is not set +# CONFIG_FDDI is not set +# CONFIG_HIPPI is not set +# CONFIG_PPP is not set +# CONFIG_SLIP is not set +# CONFIG_NET_FC is not set +# CONFIG_NETCONSOLE is not set +# CONFIG_NETPOLL is not set +# CONFIG_NET_POLL_CONTROLLER is not set +# CONFIG_ISDN is not set +# CONFIG_PHONE is not set + +# +# Input device support +# +# CONFIG_INPUT is not set + +# +# Hardware I/O ports +# +# CONFIG_SERIO is not set +# CONFIG_GAMEPORT is not set + +# +# Character devices +# +# CONFIG_VT is not set +CONFIG_DEVKMEM=y +# CONFIG_SERIAL_NONSTANDARD is not set +# CONFIG_NOZOMI is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +# CONFIG_SERIAL_8250_PCI is not set +CONFIG_SERIAL_8250_NR_UARTS=2 +CONFIG_SERIAL_8250_RUNTIME_UARTS=2 +CONFIG_SERIAL_8250_EXTENDED=y +# CONFIG_SERIAL_8250_MANY_PORTS is not set +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +# CONFIG_SERIAL_8250_RSA is not set + +# +# Non-8250 serial port support +# +# CONFIG_SERIAL_UARTLITE is not set +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_JSM is not set +CONFIG_SERIAL_OF_PLATFORM=y +# CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL is not set +CONFIG_UNIX98_PTYS=y +# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_HVC_UDBG is not set +# CONFIG_IPMI_HANDLER is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_NVRAM is not set +# CONFIG_GEN_RTC is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set +# CONFIG_RAW_DRIVER is not set +# CONFIG_TCG_TPM is not set +CONFIG_DEVPORT=y +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +# CONFIG_I2C_ALI1535 is not set +# CONFIG_I2C_ALI1563 is not set +# CONFIG_I2C_ALI15X3 is not set +# CONFIG_I2C_AMD756 is not set +# CONFIG_I2C_AMD8111 is not set +# CONFIG_I2C_I801 is not set +# CONFIG_I2C_ISCH is not set +# CONFIG_I2C_PIIX4 is not set +# CONFIG_I2C_NFORCE2 is not set +# CONFIG_I2C_SIS5595 is not set +# CONFIG_I2C_SIS630 is not set +# CONFIG_I2C_SIS96X is not set +# CONFIG_I2C_VIA is not set +# CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_IBM_IIC=y +# CONFIG_I2C_MPC is not set +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set + +# +# Graphics adapter I2C/DDC channel drivers +# +# CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_TSL2550 is not set +CONFIG_I2C_DEBUG_CORE=y +CONFIG_I2C_DEBUG_ALGO=y +CONFIG_I2C_DEBUG_BUS=y +CONFIG_I2C_DEBUG_CHIP=y +# CONFIG_SPI is not set + +# +# PPS support +# +# CONFIG_PPS is not set +CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y +# CONFIG_GPIOLIB is not set +# CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set +# CONFIG_HWMON is not set +# CONFIG_THERMAL is not set +# CONFIG_THERMAL_HWMON is not set +# CONFIG_WATCHDOG is not set +CONFIG_SSB_POSSIBLE=y + +# +# Sonics Silicon Backplane +# +# CONFIG_SSB is not set + +# +# Multifunction device drivers +# +# CONFIG_MFD_CORE is not set +# CONFIG_MFD_SM501 is not set +# CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set +# CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_AB3100_CORE is not set +# CONFIG_REGULATOR is not set +# CONFIG_MEDIA_SUPPORT is not set + +# +# Graphics support +# +# CONFIG_AGP is not set +# CONFIG_DRM is not set +# CONFIG_VGASTATE is not set +CONFIG_VIDEO_OUTPUT_CONTROL=m +# CONFIG_FB is not set +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set + +# +# Display device support +# +# CONFIG_DISPLAY_SUPPORT is not set +# CONFIG_SOUND is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_UWB is not set +# CONFIG_MMC is not set +# CONFIG_MEMSTICK is not set +# CONFIG_NEW_LEDS is not set +# CONFIG_ACCESSIBILITY is not set +# CONFIG_INFINIBAND is not set +# CONFIG_EDAC is not set +# CONFIG_RTC_CLASS is not set +CONFIG_DMADEVICES=y + +# +# DMA Devices +# +# CONFIG_AUXDISPLAY is not set +# CONFIG_UIO is not set + +# +# TI VLYNQ +# +# CONFIG_STAGING is not set + +# +# File systems +# +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +# CONFIG_EXT2_FS_XIP is not set +# CONFIG_EXT3_FS is not set +# CONFIG_EXT4_FS is not set +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +# CONFIG_FS_POSIX_ACL is not set +# CONFIG_XFS_FS is not set +# CONFIG_GFS2_FS is not set +# CONFIG_OCFS2_FS is not set +# CONFIG_BTRFS_FS is not set +CONFIG_FILE_LOCKING=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set +# CONFIG_FUSE_FS is not set + +# +# Caches +# +# CONFIG_FSCACHE is not set + +# +# CD-ROM/DVD Filesystems +# +# CONFIG_ISO9660_FS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_MSDOS_FS is not set +# CONFIG_VFAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +# CONFIG_TMPFS_POSIX_ACL is not set +# CONFIG_HUGETLB_PAGE is not set +# CONFIG_CONFIGFS_FS is not set +CONFIG_MISC_FILESYSTEMS=y +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_HFSPLUS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +# CONFIG_JFFS2_FS is not set +CONFIG_CRAMFS=y +# CONFIG_SQUASHFS is not set +# CONFIG_VXFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_ROMFS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set +# CONFIG_NFS_V4 is not set +CONFIG_ROOT_NFS=y +# CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +# CONFIG_RPCSEC_GSS_KRB5 is not set +# CONFIG_RPCSEC_GSS_SPKM3 is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_NLS is not set +# CONFIG_DLM is not set +# CONFIG_BINARY_PRINTF is not set + +# +# Library routines +# +CONFIG_BITREVERSE=y +CONFIG_GENERIC_FIND_LAST_BIT=y +# CONFIG_CRC_CCITT is not set +# CONFIG_CRC16 is not set +# CONFIG_CRC_T10DIF is not set +# CONFIG_CRC_ITU_T is not set +CONFIG_CRC32=y +# CONFIG_CRC7 is not set +# CONFIG_LIBCRC32C is not set +CONFIG_ZLIB_INFLATE=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y +CONFIG_HAS_DMA=y +CONFIG_HAVE_LMB=y +CONFIG_NLATTR=y +CONFIG_GENERIC_ATOMIC64=y + +# +# Kernel hacking +# +# CONFIG_PRINTK_TIME is not set +CONFIG_ENABLE_WARN_DEPRECATED=y +CONFIG_ENABLE_MUST_CHECK=y +CONFIG_FRAME_WARN=1024 +CONFIG_MAGIC_SYSRQ=y +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set +CONFIG_DETECT_SOFTLOCKUP=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +CONFIG_SCHED_DEBUG=y +# CONFIG_SCHEDSTATS is not set +# CONFIG_TIMER_STATS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_RT_MUTEX_TESTER is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_SPINLOCK_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_DEBUG_KOBJECT is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_WRITECOUNT is not set +# CONFIG_DEBUG_MEMORY_INIT is not set +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_RCU_TORTURE_TEST is not set +# CONFIG_RCU_CPU_STALL_DETECTOR is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_FAULT_INJECTION is not set +# CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_ENABLE_DEFAULT_TRACERS is not set +# CONFIG_BOOT_TRACER is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +# CONFIG_PROFILE_ALL_BRANCHES is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_PPC_DISABLE_WERROR is not set +CONFIG_PPC_WERROR=y +CONFIG_PRINT_STACK_DEPTH=64 +# CONFIG_DEBUG_STACKOVERFLOW is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_PPC_EMULATED_STATS is not set +# CONFIG_CODE_PATCHING_SELFTEST is not set +# CONFIG_FTR_FIXUP_SELFTEST is not set +# CONFIG_MSI_BITMAP_SELFTEST is not set +# CONFIG_XMON is not set +# CONFIG_IRQSTACKS is not set +# CONFIG_VIRQ_DEBUG is not set +# CONFIG_BDI_SWITCH is not set +# CONFIG_PPC_EARLY_DEBUG is not set + +# +# Security options +# +# CONFIG_KEYS is not set +# CONFIG_SECURITY is not set +# CONFIG_SECURITYFS is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +# CONFIG_CRYPTO_FIPS is not set +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_GF128MUL=y +# CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y +CONFIG_CRYPTO_CRYPTD=y +CONFIG_CRYPTO_AUTHENC=y +# CONFIG_CRYPTO_TEST is not set + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=y +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_SEQIV=y + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=y +CONFIG_CRYPTO_PCBC=y +CONFIG_CRYPTO_XTS=y + +# +# Hash modes +# +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=y + +# +# Digest +# +# CONFIG_CRYPTO_CRC32C is not set +CONFIG_CRYPTO_MD4=y +CONFIG_CRYPTO_MD5=y +# CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_RMD128 is not set +# CONFIG_CRYPTO_RMD160 is not set +# CONFIG_CRYPTO_RMD256 is not set +# CONFIG_CRYPTO_RMD320 is not set +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +# CONFIG_CRYPTO_TGR192 is not set +# CONFIG_CRYPTO_WP512 is not set + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +# CONFIG_CRYPTO_ANUBIS is not set +CONFIG_CRYPTO_ARC4=y +CONFIG_CRYPTO_BLOWFISH=y +# CONFIG_CRYPTO_CAMELLIA is not set +# CONFIG_CRYPTO_CAST5 is not set +# CONFIG_CRYPTO_CAST6 is not set +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_FCRYPT is not set +# CONFIG_CRYPTO_KHAZAD is not set +# CONFIG_CRYPTO_SALSA20 is not set +# CONFIG_CRYPTO_SEED is not set +# CONFIG_CRYPTO_SERPENT is not set +# CONFIG_CRYPTO_TEA is not set +# CONFIG_CRYPTO_TWOFISH is not set + +# +# Compression +# +# CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set +# CONFIG_CRYPTO_LZO is not set + +# +# Random Number Generation +# +# CONFIG_CRYPTO_ANSI_CPRNG is not set +CONFIG_CRYPTO_HW=y +# CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_CRYPTO_DEV_PPC4XX is not set +# CONFIG_PPC_CLOCK is not set +# CONFIG_VIRTUALIZATION is not set diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 90e3192611a4..7486bffd3ebb 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -129,6 +129,18 @@ config REDWOOD help This option enables support for the AMCC PPC460SX Redwood board. +config EIGER + bool "Eiger" + depends on 44x + default n + select PPC44x_SIMPLE + select 460SX + select PCI + select PPC4xx_PCI_EXPRESS + select IBM_NEW_EMAC_RGMII + help + This option enables support for the AMCC PPC460SX evaluation board. + config YOSEMITE bool "Yosemite" depends on 44x diff --git a/arch/powerpc/platforms/44x/ppc44x_simple.c b/arch/powerpc/platforms/44x/ppc44x_simple.c index 5bcd441885e8..e8c23ccaa1fc 100644 --- a/arch/powerpc/platforms/44x/ppc44x_simple.c +++ b/arch/powerpc/platforms/44x/ppc44x_simple.c @@ -55,6 +55,7 @@ static char *board[] __initdata = { "amcc,canyonlands", "amcc,glacier", "ibm,ebony", + "amcc,eiger", "amcc,katmai", "amcc,rainier", "amcc,redwood", From 0cdf50a7c65df894fb5fd0ef181fe18b8fec6137 Mon Sep 17 00:00:00 2001 From: Solomon Peachy Date: Thu, 20 Aug 2009 10:19:47 +0000 Subject: [PATCH 0251/1662] powerpc/40x: Add support for the ESTeem 195E (PPC405EP) SBC This patch adds support for the ESTeem 195E Hotfoot SBC. There are several variants of the SBC deployed, single/dual ethernet+serial, and also 4MB/8MB flash variations. In the interest of having a single kernel image boot on all boards, the cuboot shim detects the differences and mangles the DTS tree appropriately. With the exception of the CF interface that was never populated on production boards, this code/DTS supports all boardpop options. Signed-off-by: Solomon Peachy Signed-off-by: Josh Boyer --- arch/powerpc/boot/Makefile | 4 +- arch/powerpc/boot/cuboot-hotfoot.c | 142 ++++++++++ arch/powerpc/boot/dts/hotfoot.dts | 294 +++++++++++++++++++++ arch/powerpc/boot/ppcboot-hotfoot.h | 133 ++++++++++ arch/powerpc/platforms/40x/Kconfig | 10 + arch/powerpc/platforms/40x/ppc40x_simple.c | 3 +- 6 files changed, 584 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/boot/cuboot-hotfoot.c create mode 100644 arch/powerpc/boot/dts/hotfoot.dts create mode 100644 arch/powerpc/boot/ppcboot-hotfoot.h diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 9ae7b7e2ba71..5a109a93753a 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -39,6 +39,7 @@ DTS_FLAGS ?= -p 1024 $(obj)/4xx.o: BOOTCFLAGS += -mcpu=405 $(obj)/ebony.o: BOOTCFLAGS += -mcpu=405 +$(obj)/cuboot-hotfoot.o: BOOTCFLAGS += -mcpu=405 $(obj)/cuboot-taishan.o: BOOTCFLAGS += -mcpu=405 $(obj)/cuboot-katmai.o: BOOTCFLAGS += -mcpu=405 $(obj)/cuboot-acadia.o: BOOTCFLAGS += -mcpu=405 @@ -67,7 +68,7 @@ src-wlib := string.S crt0.S crtsavres.S stdio.c main.c \ cpm-serial.c stdlib.c mpc52xx-psc.c planetcore.c uartlite.c \ fsl-soc.c mpc8xx.c pq2.c src-plat := of.c cuboot-52xx.c cuboot-824x.c cuboot-83xx.c cuboot-85xx.c holly.c \ - cuboot-ebony.c treeboot-ebony.c prpmc2800.c \ + cuboot-ebony.c cuboot-hotfoot.c treeboot-ebony.c prpmc2800.c \ ps3-head.S ps3-hvcall.S ps3.c treeboot-bamboo.c cuboot-8xx.c \ cuboot-pq2.c cuboot-sequoia.c treeboot-walnut.c \ cuboot-bamboo.c cuboot-mpc7448hpc2.c cuboot-taishan.c \ @@ -190,6 +191,7 @@ image-$(CONFIG_DEFAULT_UIMAGE) += uImage # Board ports in arch/powerpc/platform/40x/Kconfig image-$(CONFIG_EP405) += dtbImage.ep405 +image-$(CONFIG_HOTFOOT) += cuImage.hotfoot image-$(CONFIG_WALNUT) += treeImage.walnut image-$(CONFIG_ACADIA) += cuImage.acadia diff --git a/arch/powerpc/boot/cuboot-hotfoot.c b/arch/powerpc/boot/cuboot-hotfoot.c new file mode 100644 index 000000000000..8f697b958e45 --- /dev/null +++ b/arch/powerpc/boot/cuboot-hotfoot.c @@ -0,0 +1,142 @@ +/* + * Old U-boot compatibility for Esteem 195E Hotfoot CPU Board + * + * Author: Solomon Peachy + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "ops.h" +#include "stdio.h" +#include "reg.h" +#include "dcr.h" +#include "4xx.h" +#include "cuboot.h" + +#define TARGET_4xx +#define TARGET_HOTFOOT + +#include "ppcboot-hotfoot.h" + +static bd_t bd; + +#define NUM_REGS 3 + +static void hotfoot_fixups(void) +{ + u32 uart = mfdcr(DCRN_CPC0_UCR) & 0x7f; + + dt_fixup_memory(bd.bi_memstart, bd.bi_memsize); + + dt_fixup_cpu_clocks(bd.bi_procfreq, bd.bi_procfreq, 0); + dt_fixup_clock("/plb", bd.bi_plb_busfreq); + dt_fixup_clock("/plb/opb", bd.bi_opbfreq); + dt_fixup_clock("/plb/ebc", bd.bi_pci_busfreq); + dt_fixup_clock("/plb/opb/serial@ef600300", bd.bi_procfreq / uart); + dt_fixup_clock("/plb/opb/serial@ef600400", bd.bi_procfreq / uart); + + dt_fixup_mac_address_by_alias("ethernet0", bd.bi_enetaddr); + dt_fixup_mac_address_by_alias("ethernet1", bd.bi_enet1addr); + + /* Is this a single eth/serial board? */ + if ((bd.bi_enet1addr[0] == 0) && + (bd.bi_enet1addr[1] == 0) && + (bd.bi_enet1addr[2] == 0) && + (bd.bi_enet1addr[3] == 0) && + (bd.bi_enet1addr[4] == 0) && + (bd.bi_enet1addr[5] == 0)) { + void *devp; + + printf("Trimming devtree for single serial/eth board\n"); + + devp = finddevice("/plb/opb/serial@ef600300"); + if (!devp) + fatal("Can't find node for /plb/opb/serial@ef600300"); + del_node(devp); + + devp = finddevice("/plb/opb/ethernet@ef600900"); + if (!devp) + fatal("Can't find node for /plb/opb/ethernet@ef600900"); + del_node(devp); + } + + ibm4xx_quiesce_eth((u32 *)0xef600800, (u32 *)0xef600900); + + /* Fix up flash size in fdt for 4M boards. */ + if (bd.bi_flashsize < 0x800000) { + u32 regs[NUM_REGS]; + void *devp = finddevice("/plb/ebc/nor_flash@0"); + if (!devp) + fatal("Can't find FDT node for nor_flash!??"); + + printf("Fixing devtree for 4M Flash\n"); + + /* First fix up the base addresse */ + getprop(devp, "reg", regs, sizeof(regs)); + regs[0] = 0; + regs[1] = 0xffc00000; + regs[2] = 0x00400000; + setprop(devp, "reg", regs, sizeof(regs)); + + /* Then the offsets */ + devp = finddevice("/plb/ebc/nor_flash@0/partition@0"); + if (!devp) + fatal("Can't find FDT node for partition@0"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + devp = finddevice("/plb/ebc/nor_flash@0/partition@1"); + if (!devp) + fatal("Can't find FDT node for partition@1"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + devp = finddevice("/plb/ebc/nor_flash@0/partition@2"); + if (!devp) + fatal("Can't find FDT node for partition@2"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + devp = finddevice("/plb/ebc/nor_flash@0/partition@3"); + if (!devp) + fatal("Can't find FDT node for partition@3"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + devp = finddevice("/plb/ebc/nor_flash@0/partition@4"); + if (!devp) + fatal("Can't find FDT node for partition@4"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + devp = finddevice("/plb/ebc/nor_flash@0/partition@6"); + if (!devp) + fatal("Can't find FDT node for partition@6"); + getprop(devp, "reg", regs, 2*sizeof(u32)); + regs[0] -= 0x400000; + setprop(devp, "reg", regs, 2*sizeof(u32)); + + /* Delete the FeatFS node */ + devp = finddevice("/plb/ebc/nor_flash@0/partition@5"); + if (!devp) + fatal("Can't find FDT node for partition@5"); + del_node(devp); + } +} + +void platform_init(unsigned long r3, unsigned long r4, unsigned long r5, + unsigned long r6, unsigned long r7) +{ + CUBOOT_INIT(); + platform_ops.fixups = hotfoot_fixups; + platform_ops.exit = ibm40x_dbcr_reset; + fdt_init(_dtb_start); + serial_console_init(); +} diff --git a/arch/powerpc/boot/dts/hotfoot.dts b/arch/powerpc/boot/dts/hotfoot.dts new file mode 100644 index 000000000000..cad9c3840afc --- /dev/null +++ b/arch/powerpc/boot/dts/hotfoot.dts @@ -0,0 +1,294 @@ +/* + * Device Tree Source for ESTeem 195E Hotfoot + * + * Copyright 2009 AbsoluteValue Systems + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without + * any warranty of any kind, whether express or implied. + */ + +/dts-v1/; + +/ { + #address-cells = <1>; + #size-cells = <1>; + model = "est,hotfoot"; + compatible = "est,hotfoot"; + dcr-parent = <&{/cpus/cpu@0}>; + + aliases { + ethernet0 = &EMAC0; + ethernet1 = &EMAC1; + serial0 = &UART0; + serial1 = &UART1; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + model = "PowerPC,405EP"; + reg = <0x00000000>; + clock-frequency = <0>; /* Filled in by zImage */ + timebase-frequency = <0>; /* Filled in by zImage */ + i-cache-line-size = <0x20>; + d-cache-line-size = <0x20>; + i-cache-size = <0x4000>; + d-cache-size = <0x4000>; + dcr-controller; + dcr-access-method = "native"; + }; + }; + + memory { + device_type = "memory"; + reg = <0x00000000 0x00000000>; /* Filled in by zImage */ + }; + + UIC0: interrupt-controller { + compatible = "ibm,uic"; + interrupt-controller; + cell-index = <0>; + dcr-reg = <0x0c0 0x009>; + #address-cells = <0>; + #size-cells = <0>; + #interrupt-cells = <2>; + }; + + plb { + compatible = "ibm,plb3"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + clock-frequency = <0>; /* Filled in by zImage */ + + SDRAM0: memory-controller { + compatible = "ibm,sdram-405ep"; + dcr-reg = <0x010 0x002>; + }; + + MAL: mcmal { + compatible = "ibm,mcmal-405ep", "ibm,mcmal"; + dcr-reg = <0x180 0x062>; + num-tx-chans = <4>; + num-rx-chans = <2>; + interrupt-parent = <&UIC0>; + interrupts = < + 0xb 0x4 /* TXEOB */ + 0xc 0x4 /* RXEOB */ + 0xa 0x4 /* SERR */ + 0xd 0x4 /* TXDE */ + 0xe 0x4 /* RXDE */>; + }; + + POB0: opb { + compatible = "ibm,opb-405ep", "ibm,opb"; + #address-cells = <1>; + #size-cells = <1>; + ranges = <0xef600000 0xef600000 0x00a00000>; + dcr-reg = <0x0a0 0x005>; + clock-frequency = <0>; /* Filled in by zImage */ + + /* Hotfoot has UART0/UART1 swapped */ + + UART0: serial@ef600400 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xef600400 0x00000008>; + virtual-reg = <0xef600400>; + clock-frequency = <0>; /* Filled in by zImage */ + current-speed = <0x9600>; + interrupt-parent = <&UIC0>; + interrupts = <0x1 0x4>; + }; + + UART1: serial@ef600300 { + device_type = "serial"; + compatible = "ns16550"; + reg = <0xef600300 0x00000008>; + virtual-reg = <0xef600300>; + clock-frequency = <0>; /* Filled in by zImage */ + current-speed = <0x9600>; + interrupt-parent = <&UIC0>; + interrupts = <0x0 0x4>; + }; + + IIC: i2c@ef600500 { + compatible = "ibm,iic-405ep", "ibm,iic"; + reg = <0xef600500 0x00000011>; + interrupt-parent = <&UIC0>; + interrupts = <0x2 0x4>; + + rtc@68 { + /* Actually a DS1339 */ + compatible = "dallas,ds1307"; + reg = <0x68>; + }; + + temp@4a { + /* Not present on all boards */ + compatible = "national,lm75"; + reg = <0x4a>; + }; + }; + + GPIO: gpio@ef600700 { + #gpio-cells = <2>; + compatible = "ibm,ppc4xx-gpio"; + reg = <0xef600700 0x00000020>; + gpio-controller; + }; + + gpio-leds { + compatible = "gpio-leds"; + status { + label = "Status"; + gpios = <&GPIO 1 0>; + }; + radiorx { + label = "Rx"; + gpios = <&GPIO 0xe 0>; + }; + }; + + EMAC0: ethernet@ef600800 { + linux,network-index = <0x0>; + device_type = "network"; + compatible = "ibm,emac-405ep", "ibm,emac"; + interrupt-parent = <&UIC0>; + interrupts = < + 0xf 0x4 /* Ethernet */ + 0x9 0x4 /* Ethernet Wake Up */>; + local-mac-address = [000000000000]; /* Filled in by zImage */ + reg = <0xef600800 0x00000070>; + mal-device = <&MAL>; + mal-tx-channel = <0>; + mal-rx-channel = <0>; + cell-index = <0>; + max-frame-size = <0x5dc>; + rx-fifo-size = <0x1000>; + tx-fifo-size = <0x800>; + phy-mode = "mii"; + phy-map = <0x00000000>; + }; + + EMAC1: ethernet@ef600900 { + linux,network-index = <0x1>; + device_type = "network"; + compatible = "ibm,emac-405ep", "ibm,emac"; + interrupt-parent = <&UIC0>; + interrupts = < + 0x11 0x4 /* Ethernet */ + 0x9 0x4 /* Ethernet Wake Up */>; + local-mac-address = [000000000000]; /* Filled in by zImage */ + reg = <0xef600900 0x00000070>; + mal-device = <&MAL>; + mal-tx-channel = <2>; + mal-rx-channel = <1>; + cell-index = <1>; + max-frame-size = <0x5dc>; + rx-fifo-size = <0x1000>; + tx-fifo-size = <0x800>; + mdio-device = <&EMAC0>; + phy-mode = "mii"; + phy-map = <0x0000001>; + }; + }; + + EBC0: ebc { + compatible = "ibm,ebc-405ep", "ibm,ebc"; + dcr-reg = <0x012 0x002>; + #address-cells = <2>; + #size-cells = <1>; + + /* The ranges property is supplied by the bootwrapper + * and is based on the firmware's configuration of the + * EBC bridge + */ + clock-frequency = <0>; /* Filled in by zImage */ + + nor_flash@0 { + compatible = "cfi-flash"; + bank-width = <2>; + reg = <0x0 0xff800000 0x00800000>; + #address-cells = <1>; + #size-cells = <1>; + + /* This mapping is for the 8M flash + 4M flash has all ofssets -= 4M, + and FeatFS partition is not present */ + partition@0 { + label = "Bootloader"; + reg = <0x7c0000 0x40000>; + /* read-only; */ + }; + partition@1 { + label = "Env_and_Config_Primary"; + reg = <0x400000 0x10000>; + }; + partition@2 { + label = "Kernel"; + reg = <0x420000 0x100000>; + }; + partition@3 { + label = "Filesystem"; + reg = <0x520000 0x2a0000>; + }; + partition@4 { + label = "Env_and_Config_Secondary"; + reg = <0x410000 0x10000>; + }; + partition@5 { + label = "FeatFS"; + reg = <0x000000 0x400000>; + }; + partition@6 { + label = "Bootloader_Env"; + reg = <0x7d0000 0x10000>; + }; + }; + }; + + PCI0: pci@ec000000 { + device_type = "pci"; + #interrupt-cells = <1>; + #size-cells = <2>; + #address-cells = <3>; + compatible = "ibm,plb405ep-pci", "ibm,plb-pci"; + primary; + reg = <0xeec00000 0x00000008 /* Config space access */ + 0xeed80000 0x00000004 /* IACK */ + 0xeed80000 0x00000004 /* Special cycle */ + 0xef480000 0x00000040>; /* Internal registers */ + + /* Outbound ranges, one memory and one IO, + * later cannot be changed. Chip supports a second + * IO range but we don't use it for now + */ + ranges = <0x02000000 0x00000000 0x80000000 0x80000000 0x00000000 0x20000000 + 0x01000000 0x00000000 0x00000000 0xe8000000 0x00000000 0x00010000>; + + /* Inbound 2GB range starting at 0 */ + dma-ranges = <0x42000000 0x0 0x0 0x0 0x0 0x80000000>; + + interrupt-parent = <&UIC0>; + interrupt-map-mask = <0xf800 0x0 0x0 0x7>; + interrupt-map = < + /* IDSEL 3 -- slot1 (optional) 27/29 A/B IRQ2/4 */ + 0x1800 0x0 0x0 0x1 &UIC0 0x1b 0x8 + 0x1800 0x0 0x0 0x2 &UIC0 0x1d 0x8 + + /* IDSEL 4 -- slot0, 26/28 A/B IRQ1/3 */ + 0x2000 0x0 0x0 0x1 &UIC0 0x1a 0x8 + 0x2000 0x0 0x0 0x2 &UIC0 0x1c 0x8 + >; + }; + }; + + chosen { + linux,stdout-path = &UART0; + }; +}; diff --git a/arch/powerpc/boot/ppcboot-hotfoot.h b/arch/powerpc/boot/ppcboot-hotfoot.h new file mode 100644 index 000000000000..1a3e80b533da --- /dev/null +++ b/arch/powerpc/boot/ppcboot-hotfoot.h @@ -0,0 +1,133 @@ +/* + * This interface is used for compatibility with old U-boots *ONLY*. + * Please do not imitate or extend this. + */ + +/* + * Unfortunately, the ESTeem Hotfoot board uses a mangled version of + * ppcboot.h for historical reasons, and in the interest of having a + * mainline kernel boot on the production board+bootloader, this was the + * least-offensive solution. Please direct all flames to: + * + * Solomon Peachy + * + * (This header is identical to ppcboot.h except for the + * TARGET_HOTFOOT bits) + */ + +/* + * (C) Copyright 2000, 2001 + * Wolfgang Denk, DENX Software Engineering, wd@denx.de. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + */ + +#ifndef __PPCBOOT_H__ +#define __PPCBOOT_H__ + +/* + * Board information passed to kernel from PPCBoot + * + * include/asm-ppc/ppcboot.h + */ + +#include "types.h" + +typedef struct bd_info { + unsigned long bi_memstart; /* start of DRAM memory */ + unsigned long bi_memsize; /* size of DRAM memory in bytes */ + unsigned long bi_flashstart; /* start of FLASH memory */ + unsigned long bi_flashsize; /* size of FLASH memory */ + unsigned long bi_flashoffset; /* reserved area for startup monitor */ + unsigned long bi_sramstart; /* start of SRAM memory */ + unsigned long bi_sramsize; /* size of SRAM memory */ +#if defined(TARGET_8xx) || defined(TARGET_CPM2) || defined(TARGET_85xx) ||\ + defined(TARGET_83xx) + unsigned long bi_immr_base; /* base of IMMR register */ +#endif +#if defined(TARGET_PPC_MPC52xx) + unsigned long bi_mbar_base; /* base of internal registers */ +#endif + unsigned long bi_bootflags; /* boot / reboot flag (for LynxOS) */ + unsigned long bi_ip_addr; /* IP Address */ + unsigned char bi_enetaddr[6]; /* Ethernet address */ +#if defined(TARGET_HOTFOOT) + /* second onboard ethernet port */ + unsigned char bi_enet1addr[6]; +#define HAVE_ENET1ADDR +#endif /* TARGET_HOOTFOOT */ + unsigned short bi_ethspeed; /* Ethernet speed in Mbps */ + unsigned long bi_intfreq; /* Internal Freq, in MHz */ + unsigned long bi_busfreq; /* Bus Freq, in MHz */ +#if defined(TARGET_CPM2) + unsigned long bi_cpmfreq; /* CPM_CLK Freq, in MHz */ + unsigned long bi_brgfreq; /* BRG_CLK Freq, in MHz */ + unsigned long bi_sccfreq; /* SCC_CLK Freq, in MHz */ + unsigned long bi_vco; /* VCO Out from PLL, in MHz */ +#endif +#if defined(TARGET_PPC_MPC52xx) + unsigned long bi_ipbfreq; /* IPB Bus Freq, in MHz */ + unsigned long bi_pcifreq; /* PCI Bus Freq, in MHz */ +#endif + unsigned long bi_baudrate; /* Console Baudrate */ +#if defined(TARGET_4xx) + unsigned char bi_s_version[4]; /* Version of this structure */ + unsigned char bi_r_version[32]; /* Version of the ROM (IBM) */ + unsigned int bi_procfreq; /* CPU (Internal) Freq, in Hz */ + unsigned int bi_plb_busfreq; /* PLB Bus speed, in Hz */ + unsigned int bi_pci_busfreq; /* PCI Bus speed, in Hz */ + unsigned char bi_pci_enetaddr[6]; /* PCI Ethernet MAC address */ +#endif +#if defined(TARGET_HOTFOOT) + unsigned int bi_pllouta_freq; /* PLL OUTA speed, in Hz */ +#endif +#if defined(TARGET_HYMOD) + hymod_conf_t bi_hymod_conf; /* hymod configuration information */ +#endif +#if defined(TARGET_EVB64260) || defined(TARGET_405EP) || defined(TARGET_44x) || \ + defined(TARGET_85xx) || defined(TARGET_83xx) || defined(TARGET_HAS_ETH1) + /* second onboard ethernet port */ + unsigned char bi_enet1addr[6]; +#define HAVE_ENET1ADDR +#endif +#if defined(TARGET_EVB64260) || defined(TARGET_440GX) || \ + defined(TARGET_85xx) || defined(TARGET_HAS_ETH2) + /* third onboard ethernet ports */ + unsigned char bi_enet2addr[6]; +#define HAVE_ENET2ADDR +#endif +#if defined(TARGET_440GX) || defined(TARGET_HAS_ETH3) + /* fourth onboard ethernet ports */ + unsigned char bi_enet3addr[6]; +#define HAVE_ENET3ADDR +#endif +#if defined(TARGET_HOTFOOT) + int bi_phynum[2]; /* Determines phy mapping */ + int bi_phymode[2]; /* Determines phy mode */ +#endif +#if defined(TARGET_4xx) + unsigned int bi_opbfreq; /* OB clock in Hz */ + int bi_iic_fast[2]; /* Use fast i2c mode */ +#endif +#if defined(TARGET_440GX) + int bi_phynum[4]; /* phy mapping */ + int bi_phymode[4]; /* phy mode */ +#endif +} bd_t; + +#define bi_tbfreq bi_intfreq + +#endif /* __PPCBOOT_H__ */ diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index a6e43cb6f825..ec64264f7a50 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -40,6 +40,16 @@ config HCU4 help This option enables support for the Nestal Maschinen HCU4 board. +config HOTFOOT + bool "Hotfoot" + depends on 40x + default n + select 405EP + select PPC40x_SIMPLE + select PCI + help + This option enables support for the ESTEEM 195E Hotfoot board. + config KILAUEA bool "Kilauea" depends on 40x diff --git a/arch/powerpc/platforms/40x/ppc40x_simple.c b/arch/powerpc/platforms/40x/ppc40x_simple.c index 5fd5a5974001..546bbc229d19 100644 --- a/arch/powerpc/platforms/40x/ppc40x_simple.c +++ b/arch/powerpc/platforms/40x/ppc40x_simple.c @@ -54,7 +54,8 @@ static char *board[] __initdata = { "amcc,acadia", "amcc,haleakala", "amcc,kilauea", - "amcc,makalu" + "amcc,makalu", + "est,hotfoot" }; static int __init ppc40x_probe(void) From 0484c1df473815020bdce4b46b861395759686e5 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Sat, 22 Aug 2009 16:03:43 +0000 Subject: [PATCH 0252/1662] powerpc/405ex: provide necessary fixup function to support cuImage For cuImage format it's necessary to provide clock fixups since u-boot will not pass necessary clock frequency into the dtb included into cuImage so we implement the clock fixups as defined in the technical documentation for the board and update header file with the basic register definitions. Signed-off-by: Tiejun Chen Signed-off-by: Josh Boyer --- arch/powerpc/boot/4xx.c | 142 ++++++++++++++++++++++++++++++++++++++++ arch/powerpc/boot/4xx.h | 1 + arch/powerpc/boot/dcr.h | 4 +- 3 files changed, 144 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/4xx.c b/arch/powerpc/boot/4xx.c index 325b310573b9..27db8938827a 100644 --- a/arch/powerpc/boot/4xx.c +++ b/arch/powerpc/boot/4xx.c @@ -8,6 +8,10 @@ * Eugene Surovegin or * Copyright (c) 2003, 2004 Zultys Technologies * + * Copyright (C) 2009 Wind River Systems, Inc. + * Updated for supporting PPC405EX on Kilauea. + * Tiejun Chen + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -659,3 +663,141 @@ void ibm405ep_fixup_clocks(unsigned int sys_clk) dt_fixup_clock("/plb/opb/serial@ef600300", uart0); dt_fixup_clock("/plb/opb/serial@ef600400", uart1); } + +static u8 ibm405ex_fwdv_multi_bits[] = { + /* values for: 1 - 16 */ + 0x01, 0x02, 0x0e, 0x09, 0x04, 0x0b, 0x10, 0x0d, 0x0c, 0x05, + 0x06, 0x0f, 0x0a, 0x07, 0x08, 0x03 +}; + +u32 ibm405ex_get_fwdva(unsigned long cpr_fwdv) +{ + u32 index; + + for (index = 0; index < ARRAY_SIZE(ibm405ex_fwdv_multi_bits); index++) + if (cpr_fwdv == (u32)ibm405ex_fwdv_multi_bits[index]) + return index + 1; + + return 0; +} + +static u8 ibm405ex_fbdv_multi_bits[] = { + /* values for: 1 - 100 */ + 0x00, 0xff, 0x7e, 0xfd, 0x7a, 0xf5, 0x6a, 0xd5, 0x2a, 0xd4, + 0x29, 0xd3, 0x26, 0xcc, 0x19, 0xb3, 0x67, 0xce, 0x1d, 0xbb, + 0x77, 0xee, 0x5d, 0xba, 0x74, 0xe9, 0x52, 0xa5, 0x4b, 0x96, + 0x2c, 0xd8, 0x31, 0xe3, 0x46, 0x8d, 0x1b, 0xb7, 0x6f, 0xde, + 0x3d, 0xfb, 0x76, 0xed, 0x5a, 0xb5, 0x6b, 0xd6, 0x2d, 0xdb, + 0x36, 0xec, 0x59, 0xb2, 0x64, 0xc9, 0x12, 0xa4, 0x48, 0x91, + 0x23, 0xc7, 0x0e, 0x9c, 0x38, 0xf0, 0x61, 0xc2, 0x05, 0x8b, + 0x17, 0xaf, 0x5f, 0xbe, 0x7c, 0xf9, 0x72, 0xe5, 0x4a, 0x95, + 0x2b, 0xd7, 0x2e, 0xdc, 0x39, 0xf3, 0x66, 0xcd, 0x1a, 0xb4, + 0x68, 0xd1, 0x22, 0xc4, 0x09, 0x93, 0x27, 0xcf, 0x1e, 0xbc, + /* values for: 101 - 200 */ + 0x78, 0xf1, 0x62, 0xc5, 0x0a, 0x94, 0x28, 0xd0, 0x21, 0xc3, + 0x06, 0x8c, 0x18, 0xb0, 0x60, 0xc1, 0x02, 0x84, 0x08, 0x90, + 0x20, 0xc0, 0x01, 0x83, 0x07, 0x8f, 0x1f, 0xbf, 0x7f, 0xfe, + 0x7d, 0xfa, 0x75, 0xea, 0x55, 0xaa, 0x54, 0xa9, 0x53, 0xa6, + 0x4c, 0x99, 0x33, 0xe7, 0x4e, 0x9d, 0x3b, 0xf7, 0x6e, 0xdd, + 0x3a, 0xf4, 0x69, 0xd2, 0x25, 0xcb, 0x16, 0xac, 0x58, 0xb1, + 0x63, 0xc6, 0x0d, 0x9b, 0x37, 0xef, 0x5e, 0xbd, 0x7b, 0xf6, + 0x6d, 0xda, 0x35, 0xeb, 0x56, 0xad, 0x5b, 0xb6, 0x6c, 0xd9, + 0x32, 0xe4, 0x49, 0x92, 0x24, 0xc8, 0x11, 0xa3, 0x47, 0x8e, + 0x1c, 0xb8, 0x70, 0xe1, 0x42, 0x85, 0x0b, 0x97, 0x2f, 0xdf, + /* values for: 201 - 255 */ + 0x3e, 0xfc, 0x79, 0xf2, 0x65, 0xca, 0x15, 0xab, 0x57, 0xae, + 0x5c, 0xb9, 0x73, 0xe6, 0x4d, 0x9a, 0x34, 0xe8, 0x51, 0xa2, + 0x44, 0x89, 0x13, 0xa7, 0x4f, 0x9e, 0x3c, 0xf8, 0x71, 0xe2, + 0x45, 0x8a, 0x14, 0xa8, 0x50, 0xa1, 0x43, 0x86, 0x0c, 0x98, + 0x30, 0xe0, 0x41, 0x82, 0x04, 0x88, 0x10, 0xa0, 0x40, 0x81, + 0x03, 0x87, 0x0f, 0x9f, 0x3f /* END */ +}; + +u32 ibm405ex_get_fbdv(unsigned long cpr_fbdv) +{ + u32 index; + + for (index = 0; index < ARRAY_SIZE(ibm405ex_fbdv_multi_bits); index++) + if (cpr_fbdv == (u32)ibm405ex_fbdv_multi_bits[index]) + return index + 1; + + return 0; +} + +void ibm405ex_fixup_clocks(unsigned int sys_clk, unsigned int uart_clk) +{ + /* PLL config */ + u32 pllc = CPR0_READ(DCRN_CPR0_PLLC); + u32 plld = CPR0_READ(DCRN_CPR0_PLLD); + u32 cpud = CPR0_READ(DCRN_CPR0_PRIMAD); + u32 plbd = CPR0_READ(DCRN_CPR0_PRIMBD); + u32 opbd = CPR0_READ(DCRN_CPR0_OPBD); + u32 perd = CPR0_READ(DCRN_CPR0_PERD); + + /* Dividers */ + u32 fbdv = ibm405ex_get_fbdv(__fix_zero((plld >> 24) & 0xff, 1)); + + u32 fwdva = ibm405ex_get_fwdva(__fix_zero((plld >> 16) & 0x0f, 1)); + + u32 cpudv0 = __fix_zero((cpud >> 24) & 7, 8); + + /* PLBDV0 is hardwared to 010. */ + u32 plbdv0 = 2; + u32 plb2xdv0 = __fix_zero((plbd >> 16) & 7, 8); + + u32 opbdv0 = __fix_zero((opbd >> 24) & 3, 4); + + u32 perdv0 = __fix_zero((perd >> 24) & 3, 4); + + /* Resulting clocks */ + u32 cpu, plb, opb, ebc, vco, tb, uart0, uart1; + + /* PLL's VCO is the source for primary forward ? */ + if (pllc & 0x40000000) { + u32 m; + + /* Feedback path */ + switch ((pllc >> 24) & 7) { + case 0: + /* PLLOUTx */ + m = fbdv; + break; + case 1: + /* CPU */ + m = fbdv * fwdva * cpudv0; + break; + case 5: + /* PERClk */ + m = fbdv * fwdva * plb2xdv0 * plbdv0 * opbdv0 * perdv0; + break; + default: + printf("WARNING ! Invalid PLL feedback source !\n"); + goto bypass; + } + + vco = (unsigned int)(sys_clk * m); + } else { +bypass: + /* Bypass system PLL */ + vco = 0; + } + + /* CPU = VCO / ( FWDVA x CPUDV0) */ + cpu = vco / (fwdva * cpudv0); + /* PLB = VCO / ( FWDVA x PLB2XDV0 x PLBDV0) */ + plb = vco / (fwdva * plb2xdv0 * plbdv0); + /* OPB = PLB / OPBDV0 */ + opb = plb / opbdv0; + /* EBC = OPB / PERDV0 */ + ebc = opb / perdv0; + + tb = cpu; + uart0 = uart1 = uart_clk; + + dt_fixup_cpu_clocks(cpu, tb, 0); + dt_fixup_clock("/plb", plb); + dt_fixup_clock("/plb/opb", opb); + dt_fixup_clock("/plb/opb/ebc", ebc); + dt_fixup_clock("/plb/opb/serial@ef600200", uart0); + dt_fixup_clock("/plb/opb/serial@ef600300", uart1); +} diff --git a/arch/powerpc/boot/4xx.h b/arch/powerpc/boot/4xx.h index 2606e64f0c4b..7dc5d45361bc 100644 --- a/arch/powerpc/boot/4xx.h +++ b/arch/powerpc/boot/4xx.h @@ -21,6 +21,7 @@ void ibm4xx_fixup_ebc_ranges(const char *ebc); void ibm405gp_fixup_clocks(unsigned int sys_clk, unsigned int ser_clk); void ibm405ep_fixup_clocks(unsigned int sys_clk); +void ibm405ex_fixup_clocks(unsigned int sys_clk, unsigned int uart_clk); void ibm440gp_fixup_clocks(unsigned int sys_clk, unsigned int ser_clk); void ibm440ep_fixup_clocks(unsigned int sys_clk, unsigned int ser_clk, unsigned int tmr_clk); diff --git a/arch/powerpc/boot/dcr.h b/arch/powerpc/boot/dcr.h index 95b9f5344016..645a7c964e5f 100644 --- a/arch/powerpc/boot/dcr.h +++ b/arch/powerpc/boot/dcr.h @@ -153,9 +153,7 @@ static const unsigned long sdram_bxcr[] = { SDRAM0_B0CR, SDRAM0_B1CR, #define DCRN_CPC0_PLLMR1 0xf4 #define DCRN_CPC0_UCR 0xf5 -/* 440GX Clock control etc */ - - +/* 440GX/405EX Clock Control reg */ #define DCRN_CPR0_CLKUPD 0x020 #define DCRN_CPR0_PLLC 0x040 #define DCRN_CPR0_PLLD 0x060 From c5b20d3926dfc9616265b8ff5967cb7a476f9344 Mon Sep 17 00:00:00 2001 From: Tiejun Chen Date: Sat, 22 Aug 2009 16:03:44 +0000 Subject: [PATCH 0253/1662] powerpc/405ex: support cuImage via included dtb To support cuImage, we need to initialize the required sections and ensure that it is built. Signed-off-by: Tiejun Chen Signed-off-by: Josh Boyer --- arch/powerpc/boot/Makefile | 2 +- arch/powerpc/boot/cuboot-kilauea.c | 49 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/boot/cuboot-kilauea.c diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 5a109a93753a..7bfc8ad87798 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -76,7 +76,7 @@ src-plat := of.c cuboot-52xx.c cuboot-824x.c cuboot-83xx.c cuboot-85xx.c holly.c cuboot-katmai.c cuboot-rainier.c redboot-8xx.c ep8248e.c \ cuboot-warp.c cuboot-85xx-cpm2.c cuboot-yosemite.c simpleboot.c \ virtex405-head.S virtex.c redboot-83xx.c cuboot-sam440ep.c \ - cuboot-acadia.c cuboot-amigaone.c + cuboot-acadia.c cuboot-amigaone.c cuboot-kilauea.c src-boot := $(src-wlib) $(src-plat) empty.c src-boot := $(addprefix $(obj)/, $(src-boot)) diff --git a/arch/powerpc/boot/cuboot-kilauea.c b/arch/powerpc/boot/cuboot-kilauea.c new file mode 100644 index 000000000000..80cdad6bbc3f --- /dev/null +++ b/arch/powerpc/boot/cuboot-kilauea.c @@ -0,0 +1,49 @@ +/* + * Old U-boot compatibility for PPC405EX. This image is already included + * a dtb. + * + * Author: Tiejun Chen + * + * Copyright (C) 2009 Wind River Systems, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include "ops.h" +#include "io.h" +#include "dcr.h" +#include "stdio.h" +#include "4xx.h" +#include "44x.h" +#include "cuboot.h" + +#define TARGET_4xx +#define TARGET_44x +#include "ppcboot.h" + +#define KILAUEA_SYS_EXT_SERIAL_CLOCK 11059200 /* ext. 11.059MHz clk */ + +static bd_t bd; + +static void kilauea_fixups(void) +{ + unsigned long sysclk = 33333333; + + ibm405ex_fixup_clocks(sysclk, KILAUEA_SYS_EXT_SERIAL_CLOCK); + dt_fixup_memory(bd.bi_memstart, bd.bi_memsize); + ibm4xx_fixup_ebc_ranges("/plb/opb/ebc"); + dt_fixup_mac_address_by_alias("ethernet0", bd.bi_enetaddr); + dt_fixup_mac_address_by_alias("ethernet1", bd.bi_enet1addr); +} + +void platform_init(unsigned long r3, unsigned long r4, unsigned long r5, + unsigned long r6, unsigned long r7) +{ + CUBOOT_INIT(); + platform_ops.fixups = kilauea_fixups; + platform_ops.exit = ibm40x_dbcr_reset; + fdt_init(_dtb_start); + serial_console_init(); +} From a269cca9926faf8e44b340b017be0d884203141b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 31 Aug 2009 11:17:44 -0700 Subject: [PATCH 0254/1662] mm: remove !NUMA condition from PAGEFLAGS_EXTENDED condition set CONFIG_PAGEFLAGS_EXTENDED disables a trick to conserve pageflags. This trick is indended to be enabled when the pressure on page flags is very high. The previous condition was: - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM ... however, the sparsemem code already has a way to crowd out the node number from the pageflags, which means that !NUMA actually doesn't contribute to hard pageflags exhaustion. This is required for the new PG_uncached flag to not cause pageflags exhaustion on x86_32 + PAE + SPARSEMEM + !NUMA. Signed-off-by: H. Peter Anvin LKML-Reference: <4A9828F4.4040905@zytor.com> Cc: Venkatesh Pallipadi Cc: Suresh Siddha --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..fe221c7df752 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE # config PAGEFLAGS_EXTENDED def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address From d96f8f891f69ac1dc8c7bd82e27525de220c04e1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Jul 2009 00:09:33 -0500 Subject: [PATCH 0255/1662] xfs: add more statics & drop some unused functions A lot more functions could be made static, but they need forward declarations; this does some easy ones, and also found a few unused functions in the process. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 15 ------------- fs/xfs/linux-2.6/xfs_sync.h | 1 - fs/xfs/xfs_ag.h | 3 --- fs/xfs/xfs_alloc.c | 2 +- fs/xfs/xfs_bmap.c | 2 +- fs/xfs/xfs_bmap.h | 11 ---------- fs/xfs/xfs_bmap_btree.c | 20 ++++++++--------- fs/xfs/xfs_bmap_btree.h | 1 - fs/xfs/xfs_btree.c | 42 +----------------------------------- fs/xfs/xfs_btree.h | 15 ------------- fs/xfs/xfs_inode.c | 8 +++---- fs/xfs/xfs_inode.h | 5 ----- fs/xfs/xfs_itable.c | 2 +- fs/xfs/xfs_itable.h | 5 ----- fs/xfs/xfs_log_priv.h | 2 -- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_mount.c | 2 +- fs/xfs/xfs_mount.h | 3 --- fs/xfs/xfs_mru_cache.c | 29 ------------------------- fs/xfs/xfs_mru_cache.h | 1 - fs/xfs/xfs_rw.h | 6 ------ fs/xfs/xfs_vnodeops.c | 2 +- 23 files changed, 22 insertions(+), 159 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a220d36f789b..c709ed61a53e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -687,7 +687,7 @@ xfs_barrier_test( return error; } -void +STATIC void xfs_mountfs_check_barriers(xfs_mount_t *mp) { int error; diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 98ef624d9baf..320be6aea492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag( XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); } -void -xfs_inode_clear_reclaim_tag( - xfs_inode_t *ip) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); - - read_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - spin_unlock(&ip->i_flags_lock); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); -} - STATIC int xfs_reclaim_inode_now( struct xfs_inode *ip, diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 59120602588a..27920eb7a820 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); -void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f24b50b68d03..af3cfeb18b0d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -91,9 +91,6 @@ typedef struct xfs_agf { #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) -extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); - /* * Size of the unlinked inode hash table in the agi. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 2cf944eb796d..6316004922d7 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist( /* * Read in the allocation group header (free/alloc section). */ -int /* error */ +STATIC int /* error */ xfs_read_agf( struct xfs_mount *mp, /* mount point structure */ struct xfs_trans *tp, /* transaction pointer */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 8ee5b5a76a2a..8971fb09d387 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3713,7 +3713,7 @@ xfs_bmap_local_to_extents( * entry (null if none). Else, *lastxp will be set to the index * of the found entry; *gotp will contain the entry. */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ xfs_bmap_search_multi_extents( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_fileoff_t bno, /* block number searched for */ diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 1b8ff9256bd0..56f62d2edc35 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -392,17 +392,6 @@ xfs_bmap_count_blocks( int whichfork, int *count); -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * -xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, - xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 5c1ade06578e..eb7b702d0690 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -202,16 +202,6 @@ xfs_bmbt_get_state( ext_flag); } -/* Endian flipping versions of the bmbt extraction functions */ -void -xfs_bmbt_disk_get_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), - get_unaligned_be64(&r->l1), s); -} - /* * Extract the blockcount field from an on disk bmap extent record. */ @@ -816,6 +806,16 @@ xfs_bmbt_trace_key( *l1 = 0; } +/* Endian flipping versions of the bmbt extraction functions */ +STATIC void +xfs_bmbt_disk_get_all( + xfs_bmbt_rec_t *r, + xfs_bmbt_irec_t *s) +{ + __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), + get_unaligned_be64(&r->l1), s); +} + STATIC void xfs_bmbt_trace_record( struct xfs_btree_cur *cur, diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e8df007615e..5549d495947f 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 26717388acf5..52b5f14d0c32 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -645,46 +645,6 @@ xfs_btree_read_bufl( return 0; } -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for agno/agbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { - return error; - } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { - switch (refval) { - case XFS_ALLOC_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - break; - case XFS_INO_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); - break; - } - } - *bpp = bp; - return 0; -} - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. @@ -2951,7 +2911,7 @@ xfs_btree_insert( * inode we have to copy the single block it was pointing to into the * inode. */ -int +STATIC int xfs_btree_kill_iroot( struct xfs_btree_cur *cur) { diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 4f852b735b96..7fa07062bdda 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -378,20 +378,6 @@ xfs_btree_read_bufl( struct xfs_buf **bpp, /* buffer for fsbno */ int refval);/* ref count value for buffer */ -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - struct xfs_buf **bpp, /* buffer for agno/agbno */ - int refval);/* ref count value for buffer */ - /* * Read-ahead the block, don't wait for it, don't return a buffer. * Long-form addressing. @@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); -int xfs_btree_kill_iroot(struct xfs_btree_cur *); int xfs_btree_insert(struct xfs_btree_cur *, int *); int xfs_btree_delete(struct xfs_btree_cur *, int *); int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index da428b3fe0f5..c1dc7ef5a1d8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -651,7 +651,7 @@ xfs_iformat_btree( return 0; } -void +STATIC void xfs_dinode_from_disk( xfs_icdinode_t *to, xfs_dinode_t *from) @@ -1247,7 +1247,7 @@ xfs_isize_check( * In that case the pages will still be in memory, but the inode size * will never have been updated. */ -xfs_fsize_t +STATIC xfs_fsize_t xfs_file_last_byte( xfs_inode_t *ip) { @@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct( /* * Resize an extent indirection array to new_size bytes. */ -void +STATIC void xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ @@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect( /* * Switch from indirection array to linear (direct) extent allocations. */ -void +STATIC void xfs_iext_indirect_to_direct( xfs_ifork_t *ifp) /* inode fork pointer */ { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 65f24a3cc992..00e8505bde2d 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,6 @@ void xfs_ipin(xfs_inode_t *); void xfs_iunpin(xfs_inode_t *); int xfs_iflush(xfs_inode_t *, uint); void xfs_ichgtime(xfs_inode_t *, int); -xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); @@ -572,8 +571,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, struct xfs_buf **, uint); int xfs_iread(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, xfs_daddr_t, uint); -void xfs_dinode_from_disk(struct xfs_icdinode *, - struct xfs_dinode *); void xfs_dinode_to_disk(struct xfs_dinode *, struct xfs_icdinode *); void xfs_idestroy_fork(struct xfs_inode *, int); @@ -592,8 +589,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_realloc_indirect(xfs_ifork_t *, int); -void xfs_iext_indirect_to_direct(xfs_ifork_t *); void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); void xfs_iext_inline_to_direct(xfs_ifork_t *, int); void xfs_iext_destroy(xfs_ifork_t *); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index aeb2d2221c7d..c471122d2234 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -39,7 +39,7 @@ #include "xfs_error.h" #include "xfs_btree.h" -int +STATIC int xfs_internal_inum( xfs_mount_t *mp, xfs_ino_t ino) diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1fb04e7deb61..20792bf45946 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -99,11 +99,6 @@ xfs_bulkstat_one( void *dibuff, int *stat); -int -xfs_internal_inum( - xfs_mount_t *mp, - xfs_ino_t ino); - typedef int (*inumbers_fmt_pf)( void __user *ubuffer, /* buffer to write to */ const xfs_inogrp_t *buffer, /* buffer to read from */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index bcad5f4c1fd1..679c7c4926a2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log, extern int xlog_recover(xlog_t *log); extern int xlog_recover_finish(xlog_t *log); extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern void xlog_recover_process_iunlinks(xlog_t *log); - extern struct xfs_buf *xlog_get_bp(xlog_t *, int); extern void xlog_put_bp(struct xfs_buf *); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 47da2fb45377..1099395d7d6c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink( * freeing of the inode and its removal from the list must be * atomic. */ -void +STATIC void xlog_recover_process_iunlinks( xlog_t *log) { diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5c6f092659c1..8b6c9e807efb 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) * * The m_sb_lock must be held when this routine is called. */ -int +STATIC int xfs_mod_incore_sb_unlocked( xfs_mount_t *mp, xfs_sb_field_t field, diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5122382afde..a6c023bc0fb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -414,13 +414,10 @@ typedef struct xfs_mod_sb { extern int xfs_log_sbcount(xfs_mount_t *, uint); extern int xfs_mountfs(xfs_mount_t *mp); -extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); extern int xfs_unmountfs_writesb(xfs_mount_t *); extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index afee7eb24323..4b0613d99faa 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -563,35 +563,6 @@ xfs_mru_cache_lookup( return elem ? elem->value : NULL; } -/* - * To look up an element using its key, but leave its location in the internal - * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this - * function returns NULL. - * - * See the comments above the declaration of the xfs_mru_cache_lookup() function - * for important locking information pertaining to this call. - */ -void * -xfs_mru_cache_peek( - xfs_mru_cache_t *mru, - unsigned long key) -{ - xfs_mru_cache_elem_t *elem; - - ASSERT(mru && mru->lists); - if (!mru || !mru->lists) - return NULL; - - spin_lock(&mru->lock); - elem = radix_tree_lookup(&mru->store, key); - if (!elem) - spin_unlock(&mru->lock); - else - __release(mru_lock); /* help sparse not be stupid */ - - return elem ? elem->value : NULL; -} - /* * To release the internal data structure spinlock after having performed an * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index dd58ea1bbebe..5d439f34b0c9 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h @@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); -void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); void xfs_mru_cache_done(struct xfs_mru_cache *mru); #endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f76c003ec55d..ae65f0df87da 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -78,10 +78,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, xfs_buf_t *bp, xfs_daddr_t blkno); -/* - * Prototypes for functions in xfs_vnodeops.c. - */ -extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, - int flags); - #endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 492d75bae2bf..ceecafd1f9c1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -718,7 +718,7 @@ xfs_fsync( * when the link count isn't zero and by xfs_dm_punch_hole() when * punching a hole to EOF. */ -int +STATIC int xfs_free_eofblocks( xfs_mount_t *mp, xfs_inode_t *ip, From fef1111ecdd2b5198afe91537a5cd4c6be80a255 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 2 Jul 2009 21:35:43 -0500 Subject: [PATCH 0256/1662] un-static xfs_read_agf CONFIG_XFS_DEBUG builds still need xfs_read_agf to be non-static, oops. Signed-off-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ag.h | 3 +++ fs/xfs/xfs_alloc.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index af3cfeb18b0d..f24b50b68d03 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -91,6 +91,9 @@ typedef struct xfs_agf { #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) #define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); + /* * Size of the unlinked inode hash table in the agi. */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 6316004922d7..2cf944eb796d 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2248,7 +2248,7 @@ xfs_alloc_put_freelist( /* * Read in the allocation group header (free/alloc section). */ -STATIC int /* error */ +int /* error */ xfs_read_agf( struct xfs_mount *mp, /* mount point structure */ struct xfs_trans *tp, /* transaction pointer */ From eb00457d624c70a2492c319e7e05ed7e067b2794 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 5 Jul 2009 12:23:35 -0500 Subject: [PATCH 0257/1662] xfs: remove XFS_INO64_OFFSET Commit a19d9f887d81106d52cacbc9930207b487e07e0e removed the ino64 option but left the XFS_INO64_OFFSET define it used in place - just remove it. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inum.h | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index 7a28191cb0de..b8e4ee4e89a4 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -72,7 +72,6 @@ struct xfs_mount; #if XFS_BIG_INUMS #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) #else #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) #endif From a0f7bfd3429fc48f4d1aa5364b9428d75798ba04 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 27 Jul 2009 18:15:25 +0200 Subject: [PATCH 0258/1662] fs/xfs: Correct redundant test bp was tested for NULL a few lines before, followed by a return, and there is no intervening modification of its value. A simplified version of the semantic match that finds this problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; expression E; position p1,p2; @@ if (x == NULL || ...) { ... when forall return ...; } ... when != \(x=E\|x--\|x++\|--x\|++x\|x-=E\|x+=E\|x|=E\|x&=E\|&x\) ( *x == NULL | *x != NULL ) // Signed-off-by: Julia Lawall Acked-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_trans_buf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee2f8c8b0a6..218829e6a152 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -307,7 +307,7 @@ xfs_trans_read_buf( return (flags & XFS_BUF_TRYLOCK) ? EAGAIN : XFS_ERROR(ENOMEM); - if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { + if (XFS_BUF_GETERROR(bp) != 0) { xfs_ioerror_alert("xfs_trans_read_buf", mp, bp, blkno); error = XFS_BUF_GETERROR(bp); @@ -315,7 +315,7 @@ xfs_trans_read_buf( return error; } #ifdef DEBUG - if (xfs_do_error && (bp != NULL)) { + if (xfs_do_error) { if (xfs_error_target == target) { if (((xfs_req_num++) % xfs_error_mod) == 0) { xfs_buf_relse(bp); From 85c0b2ab5e69ca6133380ead1c50e0840d136b39 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Aug 2009 20:56:51 -0300 Subject: [PATCH 0259/1662] xfs: factor out inode initialisation Factor out code to initialize new inode clusters into a function of it's own. This keeps xfs_ialloc_ag_alloc smaller and better structured and enables a future inode cluster initialization transaction. Also initialize the agno variable earlier in xfs_ialloc_ag_alloc to avoid repeated byte swaps. [hch: The original patch is from Dave from his unpublished inode create transaction patch series, with some modifcations by me to apply stand-alone] Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 175 ++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 3120a3a5e20f..ce9edf7f4cb4 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -152,6 +152,87 @@ xfs_inobt_get_rec( return error; } +/* + * Initialise a new set of inodes. + */ +STATIC void +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int blks_per_cluster, nbufs, ninodes; + int version; + int i, j; + xfs_daddr_t d; + + /* + * Loop over the new block(s), filling in the inodes. + * For small block sizes, manipulate the inodes in buffers + * which are multiples of the blocks size. + */ + if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { + blks_per_cluster = 1; + nbufs = length; + ninodes = mp->m_sb.sb_inopblock; + } else { + blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / + mp->m_sb.sb_blocksize; + nbufs = length / blks_per_cluster; + ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; + } + + /* + * Figure out what version number to use in the inodes we create. + * If the superblock version has caught up to the one that supports + * the new inode format, then use the new inode version. Otherwise + * use the old version so that old kernels will continue to be + * able to use the file system. + */ + if (xfs_sb_version_hasnlink(&mp->m_sb)) + version = 2; + else + version = 1; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XFS_BUF_LOCK); + ASSERT(fbuf); + ASSERT(!XFS_BUF_GETERROR(fbuf)); + + /* + * Initialize all inodes in this buffer and then log them. + * + * XXX: It would be much better if we had just one transaction + * to log a whole cluster of inodes instead of all the + * individual transactions causing a lot of log traffic. + */ + xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); + for (i = 0; i < ninodes; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = sizeof(struct xfs_dinode); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); + } + xfs_trans_inode_alloc_buf(tp, fbuf); + } +} + /* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. @@ -164,24 +245,15 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - int blks_per_cluster; /* fs blocks per inode cluster */ xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_daddr_t d; /* disk addr of buffer */ xfs_agnumber_t agno; int error; - xfs_buf_t *fbuf; /* new free inodes' buffer */ - xfs_dinode_t *free; /* new free inode structure */ - int i; /* inode counter */ - int j; /* block counter */ - int nbufs; /* num bufs of new inodes */ + int i; xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - int ninodes; /* num inodes per buf */ xfs_agino_t thisino; /* current inode number, for loop */ - int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ - unsigned int gen; args.tp = tp; args.mp = tp->t_mountp; @@ -202,12 +274,12 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + XFS_IALLOC_BLOCKS(args.mp); if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; args.mod = args.total = args.wasdel = args.isfl = args.userdata = args.minalignslop = 0; @@ -258,8 +330,7 @@ xfs_ialloc_ag_alloc( * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); /* * Allocate a fixed-size extent of inodes. */ @@ -282,8 +353,7 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.alignment = xfs_ialloc_cluster_alignment(&args); if ((error = xfs_alloc_vextent(&args))) return error; @@ -294,85 +364,30 @@ xfs_ialloc_ag_alloc( return 0; } ASSERT(args.len == args.minlen); - /* - * Convert the results. - */ - newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); - /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. - */ - if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) { - blks_per_cluster = 1; - nbufs = (int)args.len; - ninodes = args.mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) / - args.mp->m_sb.sb_blocksize; - nbufs = (int)args.len / blks_per_cluster; - ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock; - } - /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. - */ - if (xfs_sb_version_hasnlink(&args.mp->m_sb)) - version = 2; - else - version = 1; /* + * Stamp and write the inode buffers. + * * Seed the new inode cluster with a random generation number. This * prevents short-term reuse of generation numbers if a chunk is * freed and then immediately reallocated. We use random numbers * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - gen = random32(); - for (j = 0; j < nbufs; j++) { - /* - * Get the block. - */ - d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno), - args.agbno + (j * blks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d, - args.mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); + xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, + random32()); - /* - * Initialize all inodes in this buffer and then log them. - * - * XXX: It would be much better if we had just one transaction to - * log a whole cluster of inodes instead of all the individual - * transactions causing a lot of log traffic. - */ - xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); - for (i = 0; i < ninodes; i++) { - int ioffset = i << args.mp->m_sb.sb_inodelog; - uint isize = sizeof(struct xfs_dinode); - - free = xfs_make_iptr(args.mp, fbuf, i); - free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); - free->di_version = version; - free->di_gen = cpu_to_be32(gen); - free->di_next_unlinked = cpu_to_be32(NULLAGINO); - xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); - } - xfs_trans_inode_alloc_buf(tp, fbuf); - } + /* + * Convert the results. + */ + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); - agno = be32_to_cpu(agi->agi_seqno); down_read(&args.mp->m_peraglock); args.mp->m_perag[agno].pagi_freecount += newlen; up_read(&args.mp->m_peraglock); agi->agi_newino = cpu_to_be32(newino); + /* * Insert records describing the new inode chunk into the btree. */ From 2e287a731e0607e0371dc6165b7dd3ebc67fa8e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 20:56:58 -0300 Subject: [PATCH 0260/1662] xfs: improve xfs_inobt_get_rec prototype Most callers of xfs_inobt_get_rec need to fill a xfs_inobt_rec_incore_t, and those who don't yet are fine with a xfs_inobt_rec_incore_t, instead of the three individual variables, too. So just change xfs_inobt_get_rec to write the output into a xfs_inobt_rec_incore_t directly. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 81 +++++++++++++++++-------------------------- fs/xfs/xfs_ialloc.h | 4 +-- fs/xfs/xfs_itable.c | 84 ++++++++++++++++++++++++--------------------- 3 files changed, 77 insertions(+), 92 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index ce9edf7f4cb4..72fa3bfc56eb 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -135,9 +135,7 @@ xfs_inobt_update( int /* error */ xfs_inobt_get_rec( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t *ino, /* output: starting inode of chunk */ - __int32_t *fcnt, /* output: number of free inodes */ - xfs_inofree_t *free, /* output: free inode mask */ + xfs_inobt_rec_incore_t *irec, /* btree record */ int *stat) /* output: success/failure */ { union xfs_btree_rec *rec; @@ -145,9 +143,9 @@ xfs_inobt_get_rec( error = xfs_btree_get_rec(cur, &rec, stat); if (!error && *stat == 1) { - *ino = be32_to_cpu(rec->inobt.ir_startino); - *fcnt = be32_to_cpu(rec->inobt.ir_freecount); - *free = be64_to_cpu(rec->inobt.ir_free); + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); } return error; } @@ -746,8 +744,8 @@ xfs_dialloc( goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); freecount += rec.ir_freecount; @@ -766,8 +764,7 @@ xfs_dialloc( if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) goto error0; if (i != 0 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && + (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 && j == 1 && rec.ir_freecount > 0) { /* @@ -799,10 +796,8 @@ xfs_dialloc( goto error1; doneleft = !i; if (!doneleft) { - if ((error = xfs_inobt_get_rec(tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) + error = xfs_inobt_get_rec(tcur, &trec, &i); + if (error) goto error1; XFS_WANT_CORRUPTED_GOTO(i == 1, error1); } @@ -813,10 +808,8 @@ xfs_dialloc( goto error1; doneright = !i; if (!doneright) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error1; XFS_WANT_CORRUPTED_GOTO(i == 1, error1); } @@ -876,11 +869,9 @@ xfs_dialloc( goto error1; doneleft = !i; if (!doneleft) { - if ((error = xfs_inobt_get_rec( - tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) + error = xfs_inobt_get_rec( + tcur, &trec, &i); + if (error) goto error1; XFS_WANT_CORRUPTED_GOTO(i == 1, error1); @@ -896,11 +887,9 @@ xfs_dialloc( goto error1; doneright = !i; if (!doneright) { - if ((error = xfs_inobt_get_rec( - cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) + error = xfs_inobt_get_rec( + cur, &rec, &i); + if (error) goto error1; XFS_WANT_CORRUPTED_GOTO(i == 1, error1); @@ -919,8 +908,7 @@ xfs_dialloc( be32_to_cpu(agi->agi_newino), 0, 0, &i))) goto error0; if (i == 1 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && + (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 && j == 1 && rec.ir_freecount > 0) { /* @@ -938,10 +926,8 @@ xfs_dialloc( goto error0; ASSERT(i == 1); for (;;) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, - &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (rec.ir_freecount > 0) @@ -975,8 +961,8 @@ xfs_dialloc( if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) goto error0; do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); freecount += rec.ir_freecount; @@ -1084,8 +1070,8 @@ xfs_difree( if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) goto error0; do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error0; if (i) { freecount += rec.ir_freecount; @@ -1107,8 +1093,8 @@ xfs_difree( goto error0; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, - &rec.ir_free, &i))) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { cmn_err(CE_WARN, "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", error, mp->m_fsname); @@ -1187,10 +1173,8 @@ xfs_difree( if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) goto error0; do { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) goto error0; if (i) { freecount += rec.ir_freecount; @@ -1312,9 +1296,7 @@ xfs_imap( chunk_agbno = agbno - offset_agbno; } else { xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_agino_t chunk_agino; /* first agino in inode chunk */ - __int32_t chunk_cnt; /* count of free inodes in chunk */ - xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ + xfs_inobt_rec_incore_t chunk_rec; xfs_buf_t *agbp; /* agi buffer */ int i; /* temp state */ @@ -1337,8 +1319,7 @@ xfs_imap( goto error0; } - error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, - &chunk_free, &i); + error = xfs_inobt_get_rec(cur, &chunk_rec, &i); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " "xfs_inobt_get_rec() failed"); @@ -1356,7 +1337,7 @@ xfs_imap( xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); if (error) return error; - chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); + chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino); offset_agbno = agbno - chunk_agbno; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index aeee8278f92c..52e72fe7e411 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -166,7 +166,7 @@ int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, /* * Get the data from the pointed-to record. */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, - __int32_t *fcnt, xfs_inofree_t *free, int *stat); +extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c471122d2234..3ec13e8a8c54 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -353,9 +353,6 @@ xfs_bulkstat( int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ int fmterror;/* bulkstat formatter result */ - __int32_t gcnt; /* current btree rec's count */ - xfs_inofree_t gfree; /* current btree rec's free mask */ - xfs_agino_t gino; /* current btree rec's start inode */ int i; /* loop index */ int icount; /* count of inodes good in irbuf */ size_t irbsize; /* size of irec buffer in bytes */ @@ -442,6 +439,8 @@ xfs_bulkstat( * we need to get the remainder of the chunk we're in. */ if (agino > 0) { + xfs_inobt_rec_incore_t r; + /* * Lookup the inode chunk that this inode lives in. */ @@ -449,33 +448,33 @@ xfs_bulkstat( if (!error && /* no I/O error */ tmp && /* lookup succeeded */ /* got the record, should always work */ - !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) && + !(error = xfs_inobt_get_rec(cur, &r, &i)) && i == 1 && /* this is the right chunk */ - agino < gino + XFS_INODES_PER_CHUNK && + agino < r.ir_startino + XFS_INODES_PER_CHUNK && /* lastino was not last in chunk */ - (chunkidx = agino - gino + 1) < + (chunkidx = agino - r.ir_startino + 1) < XFS_INODES_PER_CHUNK && /* there are some left allocated */ xfs_inobt_maskn(chunkidx, - XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { + XFS_INODES_PER_CHUNK - chunkidx) & + ~r.ir_free) { /* * Grab the chunk record. Mark all the * uninteresting inodes (because they're * before our start point) free. */ for (i = 0; i < chunkidx; i++) { - if (XFS_INOBT_MASK(i) & ~gfree) - gcnt++; + if (XFS_INOBT_MASK(i) & ~r.ir_free) + r.ir_freecount++; } - gfree |= xfs_inobt_maskn(0, chunkidx); - irbp->ir_startino = gino; - irbp->ir_freecount = gcnt; - irbp->ir_free = gfree; + r.ir_free |= xfs_inobt_maskn(0, chunkidx); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - agino = gino + XFS_INODES_PER_CHUNK; - icount = XFS_INODES_PER_CHUNK - gcnt; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - r.ir_freecount; } else { /* * If any of those tests failed, bump the @@ -501,6 +500,8 @@ xfs_bulkstat( * until we run out of inodes or space in the buffer. */ while (irbp < irbufend && icount < ubcount) { + xfs_inobt_rec_incore_t r; + /* * Loop as long as we're unable to read the * inode btree. @@ -518,43 +519,47 @@ xfs_bulkstat( * If ran off the end of the ag either with an error, * or the normal way, set end and stop collecting. */ - if (error || - (error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) || - i == 0) { + if (error) { end_of_ag = 1; break; } + + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + end_of_ag = 1; + break; + } + /* * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (gcnt < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { /* * Loop over all clusters in the next chunk. * Do a readahead if there are any allocated * inodes in that cluster. */ - for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), - chunkidx = 0; + agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); + for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK; chunkidx += nicluster, agbno += nbcluster) { - if (xfs_inobt_maskn(chunkidx, - nicluster) & ~gfree) + if (xfs_inobt_maskn(chunkidx, nicluster) + & ~r.ir_free) xfs_btree_reada_bufs(mp, agno, agbno, nbcluster); } - irbp->ir_startino = gino; - irbp->ir_freecount = gcnt; - irbp->ir_free = gfree; + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - gcnt; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; } /* * Set agino to after this chunk and bump the cursor. */ - agino = gino + XFS_INODES_PER_CHUNK; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; error = xfs_btree_increment(cur, 0, &tmp); cond_resched(); } @@ -820,9 +825,7 @@ xfs_inumbers( int bufidx; xfs_btree_cur_t *cur; int error; - __int32_t gcnt; - xfs_inofree_t gfree; - xfs_agino_t gino; + xfs_inobt_rec_incore_t r; int i; xfs_ino_t ino; int left; @@ -870,9 +873,8 @@ xfs_inumbers( continue; } } - if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, - &i)) || - i == 0) { + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { xfs_buf_relse(agbp); agbp = NULL; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); @@ -881,10 +883,12 @@ xfs_inumbers( agino = 0; continue; } - agino = gino + XFS_INODES_PER_CHUNK - 1; - buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); - buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; - buffer[bufidx].xi_allocmask = ~gfree; + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; bufidx++; left--; if (bufidx == bcount) { From afabc24a73bfee2656724b0a70395f1693eaa62b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 20:57:03 -0300 Subject: [PATCH 0261/1662] xfs: improve xfs_inobt_update prototype Both callers of xfs_inobt_update have the record in form of a xfs_inobt_rec_incore_t, so just pass a pointer to it instead of the individual variables. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 72fa3bfc56eb..8819cdacf702 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -110,22 +110,19 @@ xfs_inobt_lookup_le( } /* - * Update the record referred to by cur to the value given - * by [ino, fcnt, free]. + * Update the record referred to by cur to the value given. * This either works (return 0) or gets an EFSCORRUPTED error. */ STATIC int /* error */ xfs_inobt_update( struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free) /* free inode mask */ + xfs_inobt_rec_incore_t *irec) /* btree record */ { union xfs_btree_rec rec; - rec.inobt.ir_startino = cpu_to_be32(ino); - rec.inobt.ir_freecount = cpu_to_be32(fcnt); - rec.inobt.ir_free = cpu_to_be64(free); + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -946,8 +943,8 @@ xfs_dialloc( ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, - rec.ir_free))) + error = xfs_inobt_update(cur, &rec); + if (error) goto error0; be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); @@ -1149,12 +1146,14 @@ xfs_difree( } else { *delete = 0; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { + error = xfs_inobt_update(cur, &rec); + if (error) { cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", + "xfs_difree: xfs_inobt_update returned an error %d on %s.", error, mp->m_fsname); goto error0; } + /* * Change the inode free counts and log the ag/sb changes. */ From 0b48db80ba689edfd96ed06c3124d6cf1146de3f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Aug 2009 20:57:09 -0300 Subject: [PATCH 0262/1662] xfs: factor out debug checks from xfs_dialloc and xfs_difree Factor out a common helper from repeated debug checks in xfs_dialloc and xfs_difree. [hch: split out from Dave's dynamic allocation policy patches] Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 131 +++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 75 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 8819cdacf702..18bf6eece54d 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -147,6 +147,47 @@ xfs_inobt_get_rec( return error; } +/* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + /* * Initialise a new set of inodes. */ @@ -548,6 +589,7 @@ xfs_ialloc_ag_select( } } + /* * Visible inode allocation functions. */ @@ -733,27 +775,11 @@ xfs_dialloc( */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - do { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } while (i == 1); + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif /* * If in the same a.g. as the parent, try to get near the parent. */ @@ -951,25 +977,11 @@ xfs_dialloc( down_read(&mp->m_peraglock); mp->m_perag[tagno].pagi_freecount--; up_read(&mp->m_peraglock); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); *inop = ino; @@ -1060,26 +1072,11 @@ xfs_difree( * Initialize the cursor. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; + /* * Look for the entry describing this inode. */ @@ -1165,26 +1162,10 @@ xfs_difree( xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; From 4254b0bbb1c0826b7443ffa593576696bc591aa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 20:57:14 -0300 Subject: [PATCH 0263/1662] xfs: untangle xfs_dialloc Clarify the control flow in xfs_dialloc. Factor out a helper to go to the next node from the current one and improve the control flow by expanding composite if statements and using gotos. The xfs_ialloc_next_rec helper is borrowed from Dave Chinners dynamic allocation policy patches. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 301 +++++++++++++++++++++----------------------- 1 file changed, 143 insertions(+), 158 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 18bf6eece54d..7679c1633053 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -589,6 +589,37 @@ xfs_ialloc_ag_select( } } +/* + * Try to retrieve the next record to the left/right from the current one. + */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); + + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} + /* * Visible inode allocation functions. @@ -644,8 +675,8 @@ xfs_dialloc( int j; /* result code */ xfs_mount_t *mp; /* file system mount structure */ int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's a.g. relative inode # */ - xfs_agnumber_t pagno; /* parent's allocation group number */ + xfs_agino_t pagino; /* parent's AG relative inode # */ + xfs_agnumber_t pagno; /* parent's AG number */ xfs_inobt_rec_incore_t rec; /* inode allocation record */ xfs_agnumber_t tagno; /* testing allocation group number */ xfs_btree_cur_t *tcur; /* temp cursor */ @@ -781,186 +812,140 @@ xfs_dialloc( goto error0; /* - * If in the same a.g. as the parent, try to get near the parent. + * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { - if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + + error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i); + if (error) goto error0; - if (i != 0 && - (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (rec.ir_freecount > 0) { /* * Found a free inode in the same chunk - * as parent, done. + * as the parent, done. */ + goto alloc_inode; } + + /* - * In the same a.g. as parent, but parent's chunk is full. + * In the same AG as parent, but parent's chunk is full. */ - else { - int doneleft; /* done, to the left */ - int doneright; /* done, to the right */ + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) + goto error1; + + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + + /* + * Loop until we find an inode chunk with a free inode. + */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; + } + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } if (error) - goto error0; - ASSERT(i == 1); - ASSERT(j == 1); - /* - * Duplicate the cursor, search left & right - * simultaneously. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - goto error0; - /* - * Search left with tcur, back up 1 record. - */ - if ((error = xfs_btree_decrement(tcur, 0, &i))) goto error1; - doneleft = !i; - if (!doneleft) { - error = xfs_inobt_get_rec(tcur, &trec, &i); - if (error) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Search right with cur, go forward 1 record. - */ - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error1; - doneright = !i; - if (!doneright) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Loop until we find the closest inode chunk - * with a free one. - */ - while (!doneleft || !doneright) { - int useleft; /* using left inode - chunk this time */ - - /* - * Figure out which block is closer, - * if both are valid. - */ - if (!doneleft && !doneright) - useleft = - pagino - - (trec.ir_startino + - XFS_INODES_PER_CHUNK - 1) < - rec.ir_startino - pagino; - else - useleft = !doneleft; - /* - * If checking the left, does it have - * free inodes? - */ - if (useleft && trec.ir_freecount) { - /* - * Yes, set it up as the chunk to use. - */ - rec = trec; - xfs_btree_del_cursor(cur, - XFS_BTREE_NOERROR); - cur = tcur; - break; - } - /* - * If checking the right, does it have - * free inodes? - */ - if (!useleft && rec.ir_freecount) { - /* - * Yes, it's already set up. - */ - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - break; - } - /* - * If used the left, get another one - * further left. - */ - if (useleft) { - if ((error = xfs_btree_decrement(tcur, 0, - &i))) - goto error1; - doneleft = !i; - if (!doneleft) { - error = xfs_inobt_get_rec( - tcur, &trec, &i); - if (error) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } - /* - * If used the right, get another one - * further right. - */ - else { - if ((error = xfs_btree_increment(cur, 0, - &i))) - goto error1; - doneright = !i; - if (!doneright) { - error = xfs_inobt_get_rec( - cur, &rec, &i); - if (error) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } - } - ASSERT(!doneleft || !doneright); } + ASSERT(!doneleft || !doneright); } + /* - * In a different a.g. from the parent. + * In a different AG from the parent. * See if the most recently allocated block has any free. */ else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { - if ((error = xfs_inobt_lookup_eq(cur, - be32_to_cpu(agi->agi_newino), 0, 0, &i))) + error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino), + 0, 0, &i); + if (error) goto error0; - if (i == 1 && - (error = xfs_inobt_get_rec(cur, &rec, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { - /* - * The last chunk allocated in the group still has - * a free inode. - */ - } - /* - * None left in the last group, search the whole a.g. - */ - else { + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - ASSERT(i == 1); - for (;;) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (rec.ir_freecount > 0) - break; - if ((error = xfs_btree_increment(cur, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. + */ + goto alloc_inode; } } + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } } + +alloc_inode: offset = xfs_ialloc_find_free(&rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); From 2187550525d7bcb8c87689e4eca41b1955bf9ac3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 20:58:21 -0300 Subject: [PATCH 0264/1662] xfs: rationalize xfs_inobt_lookup* Currenly we have a xfs_inobt_lookup* variant for each comparism direction, and all these get all three fields of the inobt records passed, while the common case is just looking for the inode number and we have only marginally more callers than xfs_inobt_lookup* variants. So opencode a direct call to xfs_btree_lookup for the single case where we need all fields, and replace xfs_inobt_lookup* with a xfs_inobt_looku that just takes the inode number and the direction for all other callers. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ialloc.c | 77 +++++++++++++-------------------------------- fs/xfs/xfs_ialloc.h | 14 ++------- fs/xfs/xfs_itable.c | 12 ++++--- 3 files changed, 32 insertions(+), 71 deletions(-) diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 7679c1633053..748637cd70ff 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -57,56 +57,19 @@ xfs_ialloc_cluster_alignment( } /* - * Lookup the record equal to ino in the btree given by cur. + * Lookup a record by ino in the btree given by cur. */ STATIC int /* error */ -xfs_inobt_lookup_eq( +xfs_inobt_lookup( struct xfs_btree_cur *cur, /* btree cursor */ xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ + xfs_lookup_t dir, /* <=, >=, == */ int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); -} - -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_ge( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); -} - -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_le( - struct xfs_btree_cur *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); } /* @@ -162,7 +125,7 @@ xfs_check_agi_freecount( int error; int i; - error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; @@ -431,13 +394,17 @@ xfs_ialloc_ag_alloc( for (thisino = newino; thisino < newino + newlen; thisino += XFS_INODES_PER_CHUNK) { - if ((error = xfs_inobt_lookup_eq(cur, thisino, - XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { + cur->bc_rec.i.ir_startino = thisino; + cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; + cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; + error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); + if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } ASSERT(i == 0); - if ((error = xfs_btree_insert(cur, &i))) { + error = xfs_btree_insert(cur, &i); + if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); return error; } @@ -818,7 +785,7 @@ xfs_dialloc( int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i); + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); @@ -904,8 +871,8 @@ xfs_dialloc( * See if the most recently allocated block has any free. */ else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { - error = xfs_inobt_lookup_eq(cur, be32_to_cpu(agi->agi_newino), - 0, 0, &i); + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); if (error) goto error0; @@ -926,7 +893,7 @@ xfs_dialloc( /* * None left in the last group, search the whole AG */ - error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); @@ -1065,9 +1032,9 @@ xfs_difree( /* * Look for the entry describing this inode. */ - if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", + "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", error, mp->m_fsname); goto error0; } @@ -1277,10 +1244,10 @@ xfs_imap( } cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (error) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " - "xfs_inobt_lookup_le() failed"); + "xfs_inobt_lookup() failed"); goto error0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 52e72fe7e411..bb5385475e1f 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -150,18 +150,10 @@ xfs_ialloc_pagi_init( xfs_agnumber_t agno); /* allocation group number */ /* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. + * Lookup a record by ino in the btree given by cur. */ -int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); /* * Get the data from the pointed-to record. diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 3ec13e8a8c54..b68f9107e26c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -444,7 +444,8 @@ xfs_bulkstat( /* * Lookup the inode chunk that this inode lives in. */ - error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, + &tmp); if (!error && /* no I/O error */ tmp && /* lookup succeeded */ /* got the record, should always work */ @@ -492,7 +493,7 @@ xfs_bulkstat( /* * Start of ag. Lookup the first inode chunk. */ - error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); icount = 0; } /* @@ -511,8 +512,8 @@ xfs_bulkstat( if (XFS_AGINO_TO_AGBNO(mp, agino) >= be32_to_cpu(agi->agi_length)) break; - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, - &tmp); + error = xfs_inobt_lookup(cur, agino, + XFS_LOOKUP_GE, &tmp); cond_resched(); } /* @@ -858,7 +859,8 @@ xfs_inumbers( continue; } cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); cur = NULL; From bd169565993b39b9b4b102cdac8b13e0a259ce2f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 31 Aug 2009 20:58:28 -0300 Subject: [PATCH 0265/1662] xfs: speed up free inode search Don't search too far - abort if it is outside a certain radius and simply do a linear search for the first free inode. In AGs with a million inodes this can speed up allocation speed by 3-4x. [hch: ported to the new xfs_ialloc.c world order] Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_ag.h | 9 +++ fs/xfs/xfs_ialloc.c | 133 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 115 insertions(+), 27 deletions(-) diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f24b50b68d03..a5d54bf4931b 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -198,6 +198,15 @@ typedef struct xfs_perag xfs_agino_t pagi_count; /* number of allocated inodes */ int pagb_count; /* pagb slots in use */ xfs_perag_busy_t *pagb_list; /* unstable blocks */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; #ifdef __KERNEL__ spinlock_t pagb_lock; /* lock for pagb_list */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 748637cd70ff..d12d805f20a5 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -587,6 +587,30 @@ xfs_ialloc_next_rec( return 0; } +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + } + + return 0; +} /* * Visible inode allocation functions. @@ -766,6 +790,8 @@ xfs_dialloc( */ agno = tagno; *IO_agbp = NULL; + + restart_pagno: cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); /* * If pagino is 0 (this is the root inode allocation) use newino. @@ -782,8 +808,10 @@ xfs_dialloc( * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { + xfs_perag_t *pag = &mp->m_perag[agno]; int doneleft; /* done, to the left */ int doneright; /* done, to the right */ + int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -813,15 +841,33 @@ xfs_dialloc( if (error) goto error0; - /* search left with tcur, back up 1 record */ - error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); - if (error) - goto error1; + /* + * Skip to last blocks looked up if same parent inode. + */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft, 1); + if (error) + goto error1; - /* search right with cur, go forward 1 record. */ - error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); - if (error) - goto error1; + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright, 0); + if (error) + goto error1; + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) + goto error1; + + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } /* * Loop until we find an inode chunk with a free inode. @@ -829,6 +875,17 @@ xfs_dialloc( while (!doneleft || !doneright) { int useleft; /* using left inode chunk this time */ + if (!--searchdistance) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; + } + /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -843,12 +900,20 @@ xfs_dialloc( rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; goto alloc_inode; } /* free inodes to the right? */ if (!useleft && rec.ir_freecount) { xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; goto alloc_inode; } @@ -863,14 +928,28 @@ xfs_dialloc( if (error) goto error1; } - ASSERT(!doneleft || !doneright); + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ - else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { +newino: + if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); if (error) @@ -889,27 +968,27 @@ xfs_dialloc( goto alloc_inode; } } + } - /* - * None left in the last group, search the whole AG - */ - error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - - for (;;) { - error = xfs_inobt_get_rec(cur, &rec, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (rec.ir_freecount > 0) - break; - error = xfs_btree_increment(cur, 0, &i); - if (error) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } } alloc_inode: From 13e6d5cdde0e785aa943810f08b801cadd0935df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 21:00:31 -0300 Subject: [PATCH 0266/1662] xfs: merge fsync and O_SYNC handling The guarantees for O_SYNC are exactly the same as the ones we need to make for an fsync call (and given that Linux O_SYNC is O_DSYNC the equivalent is fdadatasync, but we treat both the same in XFS), except with a range data writeout. Jan Kara has started unifying these two path for filesystems using the generic helpers, and I've started to look at XFS. The actual transaction commited by xfs_fsync and xfs_write_sync_logforce has a different transaction number, but actually is exactly the same. We'll only use the fsync transaction going forward. One major difference is that xfs_write_sync_logforce never issues a cache flush unless we commit a transaction causing that as a side-effect, which is an obvious bug in the O_SYNC handling. Second all the locking and i_update_size vs i_update_core changes from 978b7237123d007b9fa983af6e0e2fa8f97f9934 never made it to xfs_write_sync_logforce, so we add them back. To make xfs_fsync easily usable from the O_SYNC path, the filemap_fdatawait call is moved up to xfs_file_fsync, so that we don't wait on the whole file after we already waited for our portion in xfs_write. We'll also use a plain call to filemap_write_and_wait_range instead of the previous sync_page_rang which did it in two steps including an half-hearted inode write out that doesn't help us. Once we're done with this also remove the now useless i_update_size tracking. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_aops.c | 1 - fs/xfs/linux-2.6/xfs_file.c | 19 ++++++--- fs/xfs/linux-2.6/xfs_lrw.c | 7 +++- fs/xfs/xfs_iget.c | 1 - fs/xfs/xfs_inode.h | 1 - fs/xfs/xfs_inode_item.c | 8 ---- fs/xfs/xfs_rw.c | 84 ------------------------------------- fs/xfs/xfs_rw.h | 1 - fs/xfs/xfs_trans.h | 2 +- fs/xfs/xfs_vnodeops.c | 11 ++--- 10 files changed, 23 insertions(+), 112 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index aecf2519db76..d5e5559e31db 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -216,7 +216,6 @@ xfs_setfilesize( if (ip->i_d.di_size < isize) { ip->i_d.di_size = isize; ip->i_update_core = 1; - ip->i_update_size = 1; xfs_mark_inode_dirty_sync(ip); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 0542fd507649..988d8f87bc0f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -172,12 +172,21 @@ xfs_file_release( */ STATIC int xfs_file_fsync( - struct file *filp, - struct dentry *dentry, - int datasync) + struct file *file, + struct dentry *dentry, + int datasync) { - xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); - return -xfs_fsync(XFS_I(dentry->d_inode)); + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + int error; + + /* capture size updates in I/O completion before writing the inode. */ + error = filemap_fdatawait(inode->i_mapping); + if (error) + return error; + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + return -xfs_fsync(ip); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6eee..49e4a6aea73c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -812,18 +812,21 @@ xfs_write( /* Handle various SYNC-type writes */ if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + loff_t end = pos + ret - 1; int error2; xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = sync_page_range(inode, mapping, pos, ret); + + error2 = filemap_write_and_wait_range(mapping, pos, end); if (!error) error = error2; if (need_i_mutex) mutex_lock(&inode->i_mutex); xfs_ilock(xip, iolock); - error2 = xfs_write_sync_logforce(mp, xip); + + error2 = xfs_fsync(xip); if (!error) error = error2; } diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index ecbf8b4d2e2e..3323826274b3 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -82,7 +82,6 @@ xfs_inode_alloc( memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); ip->i_flags = 0; ip->i_update_core = 0; - ip->i_update_size = 0; ip->i_delayed_blks = 0; memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); ip->i_size = 0; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 00e8505bde2d..ed566c248ae4 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -261,7 +261,6 @@ typedef struct xfs_inode { /* Miscellaneous state. */ unsigned short i_flags; /* see defined flags below */ unsigned char i_update_core; /* timestamps/size is dirty */ - unsigned char i_update_size; /* di_size field is dirty */ unsigned int i_delayed_blks; /* count of delay alloc blks */ xfs_icdinode_t i_d; /* most of ondisk inode */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 977c4aec587e..2e69412195e6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -262,14 +262,6 @@ xfs_inode_item_format( SYNCHRONIZE(); } - /* - * We don't have to worry about re-ordering here because - * the update_size field is protected by the inode lock - * and we have that held in exclusive mode. - */ - if (ip->i_update_size) - ip->i_update_size = 0; - /* * Make sure to get the latest atime from the Linux inode. */ diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index fea68615ed23..3f816ad7ff19 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c @@ -87,90 +87,6 @@ xfs_write_clear_setuid( return 0; } -/* - * Handle logging requirements of various synchronous types of write. - */ -int -xfs_write_sync_logforce( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - int error = 0; - - /* - * If we're treating this as O_DSYNC and we have not updated the - * size, force the log. - */ - if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && - !(ip->i_update_size)) { - xfs_inode_log_item_t *iip = ip->i_itemp; - - /* - * If an allocation transaction occurred - * without extending the size, then we have to force - * the log up the proper point to ensure that the - * allocation is permanent. We can't count on - * the fact that buffered writes lock out direct I/O - * writes - the direct I/O write could have extended - * the size nontransactionally, then finished before - * we started. xfs_write_file will think that the file - * didn't grow but the update isn't safe unless the - * size change is logged. - * - * Force the log if we've committed a transaction - * against the inode or if someone else has and - * the commit record hasn't gone to disk (e.g. - * the inode is pinned). This guarantees that - * all changes affecting the inode are permanent - * when we return. - */ - if (iip && iip->ili_last_lsn) { - error = _xfs_log_force(mp, iip->ili_last_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); - } else if (xfs_ipincount(ip) > 0) { - error = _xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); - } - - } else { - xfs_trans_t *tp; - - /* - * O_SYNC or O_DSYNC _with_ a size update are handled - * the same way. - * - * If the write was synchronous then we need to make - * sure that the inode modification time is permanent. - * We'll have updated the timestamp above, so here - * we use a synchronous transaction to log the inode. - * It's not fast, but it's necessary. - * - * If this a dsync write and the size got changed - * non-transactionally, then we need to ensure that - * the size change gets logged in a synchronous - * transaction. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); - if ((error = xfs_trans_reserve(tp, 0, - XFS_SWRITE_LOG_RES(mp), - 0, 0, 0))) { - /* Transaction reserve failed */ - xfs_trans_cancel(tp, 0); - } else { - /* Transaction reserve successful */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - } - - return error; -} - /* * Force a shutdown of the filesystem instantly while keeping * the filesystem consistent. We don't do an unmount here; just shutdown diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index ae65f0df87da..f5e4874c37d8 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h @@ -68,7 +68,6 @@ xfs_get_extsz_hint( * Prototypes for functions in xfs_rw.c. */ extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); extern int xfs_bioerror(struct xfs_buf *bp); extern int xfs_bioerror_relse(struct xfs_buf *bp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 775249a54f6f..ed47fc77759c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -68,7 +68,7 @@ typedef struct xfs_trans_header { #define XFS_TRANS_GROWFS 14 #define XFS_TRANS_STRAT_WRITE 15 #define XFS_TRANS_DIOSTRAT 16 -#define XFS_TRANS_WRITE_SYNC 17 +/* 17 was XFS_TRANS_WRITE_SYNC */ #define XFS_TRANS_WRITEID 18 #define XFS_TRANS_ADDAFORK 19 #define XFS_TRANS_ATTRINVAL 20 diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index ceecafd1f9c1..03d3100559ac 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -611,7 +611,7 @@ xfs_fsync( xfs_inode_t *ip) { xfs_trans_t *tp; - int error; + int error = 0; int log_flushed = 0, changed = 1; xfs_itrace_entry(ip); @@ -619,14 +619,9 @@ xfs_fsync( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return XFS_ERROR(EIO); - /* capture size updates in I/O completion before writing the inode. */ - error = xfs_wait_on_pages(ip, 0, -1); - if (error) - return XFS_ERROR(error); - /* * We always need to make sure that the required inode state is safe on - * disk. The vnode might be clean but we still might need to force the + * disk. The inode might be clean but we still might need to force the * log because of committed transactions that haven't hit the disk yet. * Likewise, there could be unflushed non-transactional changes to the * inode core that have to go to disk and this requires us to issue @@ -638,7 +633,7 @@ xfs_fsync( */ xfs_ilock(ip, XFS_ILOCK_SHARED); - if (!(ip->i_update_size || ip->i_update_core)) { + if (!ip->i_update_core) { /* * Timestamps/size haven't changed since last inode flush or * inode transaction commit. That means either nothing got From aa72a5cf00001d0b952c7c755be404b9118ceb2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 31 Aug 2009 21:51:52 -0300 Subject: [PATCH 0267/1662] xfs: simplify xfs_trans_iget xfs_trans_iget is a wrapper for xfs_iget that adds the inode to the transaction after it is read. Except when the inode already is in the inode cache, in which case it returns the existing locked inode with increment lock recursion counts. Now, no one in the tree every decrements these lock recursion counts, so any user of this gets a potential double unlock when both the original owner of the inode and the xfs_trans_iget caller unlock it. When looking back in a git bisect in the historic XFS tree there was only one place that decremented these counts, xfs_trans_iput. Introduced in commit ca25df7a840f426eb566d52667b6950b92bb84b5 by Adam Sweeney in 1993, and removed in commit 19f899a3ab155ff6a49c0c79b06f2f61059afaf3 by Steve Lord in 2003. And as long as it didn't slip through git bisects cracks never actually used in that time frame. A quick audit of the callers of xfs_trans_iget shows that no caller really relies on this behaviour fortunately - xfs_ialloc allows this inode from disk so it must not be there before, and all the RT allocator routines only every add each RT bitmap inode once. In addition to removing lots of code and reducing the size of the inode item this patch also avoids the double inode cache lookup in each create/mkdir/mknod transaction. Signed-off-by: Christoph Hellwig Reviewed-by: Alex Elder Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 26 ------------ fs/xfs/xfs_inode.h | 2 - fs/xfs/xfs_inode_item.c | 2 - fs/xfs/xfs_inode_item.h | 2 - fs/xfs/xfs_trans_inode.c | 86 +++------------------------------------- 5 files changed, 5 insertions(+), 113 deletions(-) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3323826274b3..80e526489be5 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -455,32 +455,6 @@ xfs_iget( return error; } - -/* - * Look for the inode corresponding to the given ino in the hash table. - * If it is there and its i_transp pointer matches tp, return it. - * Otherwise, return NULL. - */ -xfs_inode_t * -xfs_inode_incore(xfs_mount_t *mp, - xfs_ino_t ino, - xfs_trans_t *tp) -{ - xfs_inode_t *ip; - xfs_perag_t *pag; - - pag = xfs_get_perag(mp, ino); - read_lock(&pag->pag_ici_lock); - ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)); - read_unlock(&pag->pag_ici_lock); - xfs_put_perag(mp, pag); - - /* the returned inode must match the transaction */ - if (ip && (ip->i_transp != tp)) - return NULL; - return ip; -} - /* * Decrement reference count of an inode structure and unlock it. * diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index ed566c248ae4..0b38b9a869ec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -467,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) /* * xfs_iget.c prototypes. */ -xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, - struct xfs_trans *); int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, uint, uint, xfs_inode_t **, xfs_daddr_t); void xfs_iput(xfs_inode_t *, uint); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 2e69412195e6..47d5b663c37e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -704,8 +704,6 @@ xfs_inode_item_unlock( * Clear out the fields of the inode log item particular * to the current transaction. */ - iip->ili_ilock_recur = 0; - iip->ili_iolock_recur = 0; iip->ili_flags = 0; /* diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index a52ac125f055..65bae4c9b8bf 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item { struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_ilock_recur; /* lock recursion count */ - unsigned short ili_iolock_recur; /* lock recursion count */ unsigned short ili_flags; /* misc flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 23d276af2e0c..785ff101da0a 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug( /* - * Get and lock the inode for the caller if it is not already - * locked within the given transaction. If it is already locked - * within the transaction, just increment its lock recursion count - * and return a pointer to it. - * - * For an inode to be locked in a transaction, the inode lock, as - * opposed to the io lock, must be taken exclusively. This ensures - * that the inode can be involved in only 1 transaction at a time. - * Lock recursion is handled on the io lock, but only for lock modes - * of equal or lesser strength. That is, you can recur on the io lock - * held EXCL with a SHARED request but not vice versa. Also, if - * the inode is already a part of the transaction then you cannot - * go from not holding the io lock to having it EXCL or SHARED. - * - * Use the inode cache routine xfs_inode_incore() to find the inode - * if it is already owned by this transaction. - * - * If we don't already own the inode, use xfs_iget() to get it. - * Since the inode log item structure is embedded in the incore - * inode structure and is initialized when the inode is brought - * into memory, there is nothing to do with it here. - * - * If the given transaction pointer is NULL, just call xfs_iget(). - * This simplifies code which must handle both cases. + * Get an inode and join it to the transaction. */ int xfs_trans_iget( @@ -84,62 +61,11 @@ xfs_trans_iget( xfs_inode_t **ipp) { int error; - xfs_inode_t *ip; - /* - * If the transaction pointer is NULL, just call the normal - * xfs_iget(). - */ - if (tp == NULL) - return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0); - - /* - * If we find the inode in core with this transaction - * pointer in its i_transp field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the inode to the caller. - * Assert that the inode is already locked in the mode requested - * by the caller. We cannot do lock promotions yet, so - * die if someone gets this wrong. - */ - if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) { - /* - * Make sure that the inode lock is held EXCL and - * that the io lock is never upgraded when the inode - * is already a part of the transaction. - */ - ASSERT(ip->i_itemp != NULL); - ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); - - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ip->i_itemp->ili_iolock_recur++; - } - if (lock_flags & XFS_ILOCK_EXCL) { - ip->i_itemp->ili_ilock_recur++; - } - *ipp = ip; - return 0; - } - - ASSERT(lock_flags & XFS_ILOCK_EXCL); - error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0); - if (error) { - return error; - } - ASSERT(ip != NULL); - - xfs_trans_ijoin(tp, ip, lock_flags); - *ipp = ip; - return 0; + error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); + if (!error && tp) + xfs_trans_ijoin(tp, *ipp, lock_flags); + return error; } /* @@ -163,8 +89,6 @@ xfs_trans_ijoin( xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; ASSERT(iip->ili_flags == 0); - ASSERT(iip->ili_ilock_recur == 0); - ASSERT(iip->ili_iolock_recur == 0); /* * Get a log_item_desc to point at the new item. From f4378b6eaf63492c0f9a397d52813e0ae6b49e7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Sep 2009 14:03:08 -0400 Subject: [PATCH 0268/1662] xfs: actually enable the swapext compat handler Fix a small typo in the compat ioctl handler that cause the swapext compat handler to never be called. Signed-off-by: Christoph Hellwig Reviewed-by: Torsten Kaiser Tested-by: Torsten Kaiser Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_ioctl32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0882d166239a..eafcc7c18706 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -619,7 +619,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GETVERSION_32: cmd = _NATIVE_IOC(cmd, long); return xfs_file_ioctl(filp, cmd, p); - case XFS_IOC_SWAPEXT: { + case XFS_IOC_SWAPEXT_32: { struct xfs_swapext sxp; struct compat_xfs_swapext __user *sxu = arg; From 6d703a81ad5fdd102334751ddacb053ecc6ff046 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 1 Sep 2009 17:52:57 -0700 Subject: [PATCH 0269/1662] ide: convert to ->proc_fops ->read_proc, ->write_proc are going away, ->proc_fops should be used instead. The only tricky place is IDENTIFY handling: if for some reason taskfile_lib_get_identify() fails, buffer _is_ changed and at least first byte is overwritten. Emulate old behaviour with returning that first byte to userspace and reporting length=1 despite overall -E. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- drivers/ide/ide-cd.c | 28 ++- drivers/ide/ide-disk_proc.c | 135 +++++++++----- drivers/ide/ide-floppy_proc.c | 30 ++- drivers/ide/ide-proc.c | 338 ++++++++++++++++++++++------------ drivers/ide/ide-tape.c | 31 +++- include/linux/ide.h | 24 +-- 6 files changed, 372 insertions(+), 214 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index ad0ab0c0a493..b79ca419d8d9 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -1389,19 +1390,30 @@ static sector_t ide_cdrom_capacity(ide_drive_t *drive) return capacity * sectors_per_frame; } -static int proc_idecd_read_capacity(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int idecd_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = data; - int len; + ide_drive_t *drive = m->private; - len = sprintf(page, "%llu\n", (long long)ide_cdrom_capacity(drive)); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)ide_cdrom_capacity(drive)); + return 0; } +static int idecd_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idecd_capacity_proc_show, PDE(inode)->data); +} + +static const struct file_operations idecd_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idecd_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t idecd_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_idecd_read_capacity, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &idecd_capacity_proc_fops }, + {} }; static ide_proc_entry_t *ide_cd_proc_entries(ide_drive_t *drive) diff --git a/drivers/ide/ide-disk_proc.c b/drivers/ide/ide-disk_proc.c index 19f263bf0a9e..60b0590ccc9c 100644 --- a/drivers/ide/ide-disk_proc.c +++ b/drivers/ide/ide-disk_proc.c @@ -1,5 +1,6 @@ #include #include +#include #include "ide-disk.h" @@ -37,77 +38,117 @@ static int get_smart_data(ide_drive_t *drive, u8 *buf, u8 sub_cmd) return ide_raw_taskfile(drive, &cmd, buf, 1); } -static int proc_idedisk_read_cache - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_cache_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len; + ide_drive_t *drive = (ide_drive_t *) m->private; if (drive->dev_flags & IDE_DFLAG_ID_READ) - len = sprintf(out, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2); + seq_printf(m, "%i\n", drive->id[ATA_ID_BUF_SIZE] / 2); else - len = sprintf(out, "(none)\n"); - - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "(none)\n"); + return 0; } -static int proc_idedisk_read_capacity - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_cache_proc_open(struct inode *inode, struct file *file) { - ide_drive_t*drive = (ide_drive_t *)data; - int len; - - len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); - - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + return single_open(file, idedisk_cache_proc_show, PDE(inode)->data); } -static int proc_idedisk_read_smart(char *page, char **start, off_t off, - int count, int *eof, void *data, u8 sub_cmd) +static const struct file_operations idedisk_cache_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_cache_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int idedisk_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *)data; - int len = 0, i = 0; + ide_drive_t*drive = (ide_drive_t *)m->private; + + seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive)); + return 0; +} + +static int idedisk_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idedisk_capacity_proc_show, PDE(inode)->data); +} + +static const struct file_operations idedisk_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __idedisk_proc_show(struct seq_file *m, ide_drive_t *drive, u8 sub_cmd) +{ + u8 *buf; + + buf = kmalloc(SECTOR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; (void)smart_enable(drive); - if (get_smart_data(drive, page, sub_cmd) == 0) { - unsigned short *val = (unsigned short *) page; - char *out = (char *)val + SECTOR_SIZE; + if (get_smart_data(drive, buf, sub_cmd) == 0) { + __le16 *val = (__le16 *)buf; + int i; - page = out; - do { - out += sprintf(out, "%04x%c", le16_to_cpu(*val), - (++i & 7) ? ' ' : '\n'); - val += 1; - } while (i < SECTOR_SIZE / 2); - len = out - page; + for (i = 0; i < SECTOR_SIZE / 2; i++) { + seq_printf(m, "%04x%c", le16_to_cpu(val[i]), + (i % 8) == 7 ? '\n' : ' '); + } } - - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + kfree(buf); + return 0; } -static int proc_idedisk_read_sv - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_sv_proc_show(struct seq_file *m, void *v) { - return proc_idedisk_read_smart(page, start, off, count, eof, data, - ATA_SMART_READ_VALUES); + return __idedisk_proc_show(m, m->private, ATA_SMART_READ_VALUES); } -static int proc_idedisk_read_st - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idedisk_sv_proc_open(struct inode *inode, struct file *file) { - return proc_idedisk_read_smart(page, start, off, count, eof, data, - ATA_SMART_READ_THRESHOLDS); + return single_open(file, idedisk_sv_proc_show, PDE(inode)->data); } +static const struct file_operations idedisk_sv_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_sv_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int idedisk_st_proc_show(struct seq_file *m, void *v) +{ + return __idedisk_proc_show(m, m->private, ATA_SMART_READ_THRESHOLDS); +} + +static int idedisk_st_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idedisk_st_proc_show, PDE(inode)->data); +} + +static const struct file_operations idedisk_st_proc_fops = { + .owner = THIS_MODULE, + .open = idedisk_st_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + ide_proc_entry_t ide_disk_proc[] = { - { "cache", S_IFREG|S_IRUGO, proc_idedisk_read_cache, NULL }, - { "capacity", S_IFREG|S_IRUGO, proc_idedisk_read_capacity, NULL }, - { "geometry", S_IFREG|S_IRUGO, proc_ide_read_geometry, NULL }, - { "smart_values", S_IFREG|S_IRUSR, proc_idedisk_read_sv, NULL }, - { "smart_thresholds", S_IFREG|S_IRUSR, proc_idedisk_read_st, NULL }, - { NULL, 0, NULL, NULL } + { "cache", S_IFREG|S_IRUGO, &idedisk_cache_proc_fops }, + { "capacity", S_IFREG|S_IRUGO, &idedisk_capacity_proc_fops }, + { "geometry", S_IFREG|S_IRUGO, &ide_geometry_proc_fops }, + { "smart_values", S_IFREG|S_IRUSR, &idedisk_sv_proc_fops }, + { "smart_thresholds", S_IFREG|S_IRUSR, &idedisk_st_proc_fops }, + {} }; ide_devset_rw_field(bios_cyl, bios_cyl); diff --git a/drivers/ide/ide-floppy_proc.c b/drivers/ide/ide-floppy_proc.c index fcd4d8153df5..d711d9b883de 100644 --- a/drivers/ide/ide-floppy_proc.c +++ b/drivers/ide/ide-floppy_proc.c @@ -1,22 +1,34 @@ #include #include +#include #include "ide-floppy.h" -static int proc_idefloppy_read_capacity(char *page, char **start, off_t off, - int count, int *eof, void *data) +static int idefloppy_capacity_proc_show(struct seq_file *m, void *v) { - ide_drive_t*drive = (ide_drive_t *)data; - int len; + ide_drive_t*drive = (ide_drive_t *)m->private; - len = sprintf(page, "%llu\n", (long long)ide_gd_capacity(drive)); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)ide_gd_capacity(drive)); + return 0; } +static int idefloppy_capacity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idefloppy_capacity_proc_show, PDE(inode)->data); +} + +static const struct file_operations idefloppy_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = idefloppy_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + ide_proc_entry_t ide_floppy_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_idefloppy_read_capacity, NULL }, - { "geometry", S_IFREG|S_IRUGO, proc_ide_read_geometry, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &idefloppy_capacity_proc_fops }, + { "geometry", S_IFREG|S_IRUGO, &ide_geometry_proc_fops }, + {} }; ide_devset_rw_field(bios_cyl, bios_cyl); diff --git a/drivers/ide/ide-proc.c b/drivers/ide/ide-proc.c index 021de41655e6..28d09a5d8450 100644 --- a/drivers/ide/ide-proc.c +++ b/drivers/ide/ide-proc.c @@ -30,11 +30,9 @@ static struct proc_dir_entry *proc_ide_root; -static int proc_ide_read_imodel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_imodel_proc_show(struct seq_file *m, void *v) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; + ide_hwif_t *hwif = (ide_hwif_t *) m->private; const char *name; switch (hwif->chipset) { @@ -53,63 +51,108 @@ static int proc_ide_read_imodel case ide_acorn: name = "acorn"; break; default: name = "(unknown)"; break; } - len = sprintf(page, "%s\n", name); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%s\n", name); + return 0; } -static int proc_ide_read_mate - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_imodel_proc_open(struct inode *inode, struct file *file) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; + return single_open(file, ide_imodel_proc_show, PDE(inode)->data); +} + +static const struct file_operations ide_imodel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_imodel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_mate_proc_show(struct seq_file *m, void *v) +{ + ide_hwif_t *hwif = (ide_hwif_t *) m->private; if (hwif && hwif->mate) - len = sprintf(page, "%s\n", hwif->mate->name); + seq_printf(m, "%s\n", hwif->mate->name); else - len = sprintf(page, "(none)\n"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "(none)\n"); + return 0; } -static int proc_ide_read_channel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_mate_proc_open(struct inode *inode, struct file *file) { - ide_hwif_t *hwif = (ide_hwif_t *) data; - int len; - - page[0] = hwif->channel ? '1' : '0'; - page[1] = '\n'; - len = 2; - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + return single_open(file, ide_mate_proc_show, PDE(inode)->data); } -static int proc_ide_read_identify - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_mate_proc_fops = { + .owner = THIS_MODULE, + .open = ide_mate_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_channel_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *)data; - int len = 0, i = 0; - int err = 0; + ide_hwif_t *hwif = (ide_hwif_t *) m->private; - len = sprintf(page, "\n"); + seq_printf(m, "%c\n", hwif->channel ? '1' : '0'); + return 0; +} - if (drive) { - __le16 *val = (__le16 *)page; +static int ide_channel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_channel_proc_show, PDE(inode)->data); +} - err = taskfile_lib_get_identify(drive, page); - if (!err) { - char *out = (char *)page + SECTOR_SIZE; +static const struct file_operations ide_channel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_channel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; - page = out; - do { - out += sprintf(out, "%04x%c", - le16_to_cpup(val), (++i & 7) ? ' ' : '\n'); - val += 1; - } while (i < SECTOR_SIZE / 2); - len = out - page; - } +static int ide_identify_proc_show(struct seq_file *m, void *v) +{ + ide_drive_t *drive = (ide_drive_t *)m->private; + u8 *buf; + + if (!drive) { + seq_putc(m, '\n'); + return 0; } - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + + buf = kmalloc(SECTOR_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + if (taskfile_lib_get_identify(drive, buf) == 0) { + __le16 *val = (__le16 *)buf; + int i; + + for (i = 0; i < SECTOR_SIZE / 2; i++) { + seq_printf(m, "%04x%c", le16_to_cpu(val[i]), + (i % 8) == 7 ? '\n' : ' '); + } + } else + seq_putc(m, buf[0]); + kfree(buf); + return 0; } +static int ide_identify_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_identify_proc_show, PDE(inode)->data); +} + +static const struct file_operations ide_identify_proc_fops = { + .owner = THIS_MODULE, + .open = ide_identify_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /** * ide_find_setting - find a specific setting * @st: setting table pointer @@ -240,22 +283,20 @@ static void proc_ide_settings_warn(void) warned = 1; } -static int proc_ide_read_settings - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_settings_proc_show(struct seq_file *m, void *v) { const struct ide_proc_devset *setting, *g, *d; const struct ide_devset *ds; - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len, rc, mul_factor, div_factor; + ide_drive_t *drive = (ide_drive_t *) m->private; + int rc, mul_factor, div_factor; proc_ide_settings_warn(); mutex_lock(&ide_setting_mtx); g = ide_generic_settings; d = drive->settings; - out += sprintf(out, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); - out += sprintf(out, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); + seq_printf(m, "name\t\t\tvalue\t\tmin\t\tmax\t\tmode\n"); + seq_printf(m, "----\t\t\t-----\t\t---\t\t---\t\t----\n"); while (g->name || (d && d->name)) { /* read settings in the alphabetical order */ if (g->name && d && d->name) { @@ -269,31 +310,35 @@ static int proc_ide_read_settings setting = g++; mul_factor = setting->mulf ? setting->mulf(drive) : 1; div_factor = setting->divf ? setting->divf(drive) : 1; - out += sprintf(out, "%-24s", setting->name); + seq_printf(m, "%-24s", setting->name); rc = ide_read_setting(drive, setting); if (rc >= 0) - out += sprintf(out, "%-16d", rc * mul_factor / div_factor); + seq_printf(m, "%-16d", rc * mul_factor / div_factor); else - out += sprintf(out, "%-16s", "write-only"); - out += sprintf(out, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor); + seq_printf(m, "%-16s", "write-only"); + seq_printf(m, "%-16d%-16d", (setting->min * mul_factor + div_factor - 1) / div_factor, setting->max * mul_factor / div_factor); ds = setting->setting; if (ds->get) - out += sprintf(out, "r"); + seq_printf(m, "r"); if (ds->set) - out += sprintf(out, "w"); - out += sprintf(out, "\n"); + seq_printf(m, "w"); + seq_printf(m, "\n"); } - len = out - page; mutex_unlock(&ide_setting_mtx); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + return 0; +} + +static int ide_settings_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_settings_proc_show, PDE(inode)->data); } #define MAX_LEN 30 -static int proc_ide_write_settings(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t ide_settings_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data; char name[MAX_LEN + 1]; int for_real = 0, mul_factor, div_factor; unsigned long n; @@ -388,63 +433,104 @@ static int proc_ide_write_settings(struct file *file, const char __user *buffer, return count; parse_error: free_page((unsigned long)buf); - printk("proc_ide_write_settings(): parse error\n"); + printk("%s(): parse error\n", __func__); return -EINVAL; } -int proc_ide_read_capacity - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_settings_proc_fops = { + .owner = THIS_MODULE, + .open = ide_settings_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ide_settings_proc_write, +}; + +static int ide_capacity_proc_show(struct seq_file *m, void *v) { - int len = sprintf(page, "%llu\n", (long long)0x7fffffff); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%llu\n", (long long)0x7fffffff); + return 0; } -EXPORT_SYMBOL_GPL(proc_ide_read_capacity); - -int proc_ide_read_geometry - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_capacity_proc_open(struct inode *inode, struct file *file) { - ide_drive_t *drive = (ide_drive_t *) data; - char *out = page; - int len; + return single_open(file, ide_capacity_proc_show, NULL); +} - out += sprintf(out, "physical %d/%d/%d\n", +const struct file_operations ide_capacity_proc_fops = { + .owner = THIS_MODULE, + .open = ide_capacity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL_GPL(ide_capacity_proc_fops); + +static int ide_geometry_proc_show(struct seq_file *m, void *v) +{ + ide_drive_t *drive = (ide_drive_t *) m->private; + + seq_printf(m, "physical %d/%d/%d\n", drive->cyl, drive->head, drive->sect); - out += sprintf(out, "logical %d/%d/%d\n", + seq_printf(m, "logical %d/%d/%d\n", drive->bios_cyl, drive->bios_head, drive->bios_sect); - - len = out - page; - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + return 0; } -EXPORT_SYMBOL(proc_ide_read_geometry); - -static int proc_ide_read_dmodel - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_geometry_proc_open(struct inode *inode, struct file *file) { - ide_drive_t *drive = (ide_drive_t *) data; + return single_open(file, ide_geometry_proc_show, PDE(inode)->data); +} + +const struct file_operations ide_geometry_proc_fops = { + .owner = THIS_MODULE, + .open = ide_geometry_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +EXPORT_SYMBOL(ide_geometry_proc_fops); + +static int ide_dmodel_proc_show(struct seq_file *seq, void *v) +{ + ide_drive_t *drive = (ide_drive_t *) seq->private; char *m = (char *)&drive->id[ATA_ID_PROD]; - int len; - len = sprintf(page, "%.40s\n", m[0] ? m : "(none)"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(seq, "%.40s\n", m[0] ? m : "(none)"); + return 0; } -static int proc_ide_read_driver - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int ide_dmodel_proc_open(struct inode *inode, struct file *file) { - ide_drive_t *drive = (ide_drive_t *)data; + return single_open(file, ide_dmodel_proc_show, PDE(inode)->data); +} + +static const struct file_operations ide_dmodel_proc_fops = { + .owner = THIS_MODULE, + .open = ide_dmodel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int ide_driver_proc_show(struct seq_file *m, void *v) +{ + ide_drive_t *drive = (ide_drive_t *)m->private; struct device *dev = &drive->gendev; struct ide_driver *ide_drv; - int len; if (dev->driver) { ide_drv = to_ide_driver(dev->driver); - len = sprintf(page, "%s version %s\n", + seq_printf(m, "%s version %s\n", dev->driver->name, ide_drv->version); } else - len = sprintf(page, "ide-default version 0.9.newide\n"); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "ide-default version 0.9.newide\n"); + return 0; +} + +static int ide_driver_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_driver_proc_show, PDE(inode)->data); } static int ide_replace_subdriver(ide_drive_t *drive, const char *driver) @@ -474,10 +560,10 @@ static int ide_replace_subdriver(ide_drive_t *drive, const char *driver) return ret; } -static int proc_ide_write_driver - (struct file *file, const char __user *buffer, unsigned long count, void *data) +static ssize_t ide_driver_proc_write(struct file *file, const char __user *buffer, + size_t count, loff_t *pos) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) PDE(file->f_path.dentry->d_inode)->data; char name[32]; if (!capable(CAP_SYS_ADMIN)) @@ -492,12 +578,19 @@ static int proc_ide_write_driver return count; } -static int proc_ide_read_media - (char *page, char **start, off_t off, int count, int *eof, void *data) +static const struct file_operations ide_driver_proc_fops = { + .owner = THIS_MODULE, + .open = ide_driver_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ide_driver_proc_write, +}; + +static int ide_media_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) m->private; const char *media; - int len; switch (drive->media) { case ide_disk: media = "disk\n"; break; @@ -507,20 +600,30 @@ static int proc_ide_read_media case ide_optical: media = "optical\n"; break; default: media = "UNKNOWN\n"; break; } - strcpy(page, media); - len = strlen(media); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_puts(m, media); + return 0; } +static int ide_media_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ide_media_proc_show, PDE(inode)->data); +} + +static const struct file_operations ide_media_proc_fops = { + .owner = THIS_MODULE, + .open = ide_media_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t generic_drive_entries[] = { - { "driver", S_IFREG|S_IRUGO, proc_ide_read_driver, - proc_ide_write_driver }, - { "identify", S_IFREG|S_IRUSR, proc_ide_read_identify, NULL }, - { "media", S_IFREG|S_IRUGO, proc_ide_read_media, NULL }, - { "model", S_IFREG|S_IRUGO, proc_ide_read_dmodel, NULL }, - { "settings", S_IFREG|S_IRUSR|S_IWUSR, proc_ide_read_settings, - proc_ide_write_settings }, - { NULL, 0, NULL, NULL } + { "driver", S_IFREG|S_IRUGO, &ide_driver_proc_fops }, + { "identify", S_IFREG|S_IRUSR, &ide_identify_proc_fops}, + { "media", S_IFREG|S_IRUGO, &ide_media_proc_fops }, + { "model", S_IFREG|S_IRUGO, &ide_dmodel_proc_fops }, + { "settings", S_IFREG|S_IRUSR|S_IWUSR, &ide_settings_proc_fops}, + {} }; static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p, void *data) @@ -530,11 +633,8 @@ static void ide_add_proc_entries(struct proc_dir_entry *dir, ide_proc_entry_t *p if (!dir || !p) return; while (p->name != NULL) { - ent = create_proc_entry(p->name, p->mode, dir); + ent = proc_create_data(p->name, p->mode, dir, p->proc_fops, data); if (!ent) return; - ent->data = data; - ent->read_proc = p->read_proc; - ent->write_proc = p->write_proc; p++; } } @@ -617,10 +717,10 @@ void ide_proc_unregister_device(ide_drive_t *drive) } static ide_proc_entry_t hwif_entries[] = { - { "channel", S_IFREG|S_IRUGO, proc_ide_read_channel, NULL }, - { "mate", S_IFREG|S_IRUGO, proc_ide_read_mate, NULL }, - { "model", S_IFREG|S_IRUGO, proc_ide_read_imodel, NULL }, - { NULL, 0, NULL, NULL } + { "channel", S_IFREG|S_IRUGO, &ide_channel_proc_fops }, + { "mate", S_IFREG|S_IRUGO, &ide_mate_proc_fops }, + { "model", S_IFREG|S_IRUGO, &ide_imodel_proc_fops }, + {} }; void ide_proc_register_port(ide_hwif_t *hwif) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 7b2032bc357b..9d6f62baac27 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -1816,22 +1817,32 @@ static void ide_tape_release(struct device *dev) } #ifdef CONFIG_IDE_PROC_FS -static int proc_idetape_read_name - (char *page, char **start, off_t off, int count, int *eof, void *data) +static int idetape_name_proc_show(struct seq_file *m, void *v) { - ide_drive_t *drive = (ide_drive_t *) data; + ide_drive_t *drive = (ide_drive_t *) m->private; idetape_tape_t *tape = drive->driver_data; - char *out = page; - int len; - len = sprintf(out, "%s\n", tape->name); - PROC_IDE_READ_RETURN(page, start, off, count, eof, len); + seq_printf(m, "%s\n", tape->name); + return 0; } +static int idetape_name_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, idetape_name_proc_show, PDE(inode)->data); +} + +static const struct file_operations idetape_name_proc_fops = { + .owner = THIS_MODULE, + .open = idetape_name_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static ide_proc_entry_t idetape_proc[] = { - { "capacity", S_IFREG|S_IRUGO, proc_ide_read_capacity, NULL }, - { "name", S_IFREG|S_IRUGO, proc_idetape_read_name, NULL }, - { NULL, 0, NULL, NULL } + { "capacity", S_IFREG|S_IRUGO, &ide_capacity_proc_fops }, + { "name", S_IFREG|S_IRUGO, &idetape_name_proc_fops }, + {} }; static ide_proc_entry_t *ide_tape_proc_entries(ide_drive_t *drive) diff --git a/include/linux/ide.h b/include/linux/ide.h index 803c1ae31237..e4135d6e0556 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -919,8 +919,7 @@ __IDE_PROC_DEVSET(_name, _min, _max, NULL, NULL) typedef struct { const char *name; mode_t mode; - read_proc_t *read_proc; - write_proc_t *write_proc; + const struct file_operations *proc_fops; } ide_proc_entry_t; void proc_ide_create(void); @@ -932,24 +931,8 @@ void ide_proc_unregister_port(ide_hwif_t *); void ide_proc_register_driver(ide_drive_t *, struct ide_driver *); void ide_proc_unregister_driver(ide_drive_t *, struct ide_driver *); -read_proc_t proc_ide_read_capacity; -read_proc_t proc_ide_read_geometry; - -/* - * Standard exit stuff: - */ -#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) \ -{ \ - len -= off; \ - if (len < count) { \ - *eof = 1; \ - if (len <= 0) \ - return 0; \ - } else \ - len = count; \ - *start = page + off; \ - return len; \ -} +extern const struct file_operations ide_capacity_proc_fops; +extern const struct file_operations ide_geometry_proc_fops; #else static inline void proc_ide_create(void) { ; } static inline void proc_ide_destroy(void) { ; } @@ -961,7 +944,6 @@ static inline void ide_proc_register_driver(ide_drive_t *drive, struct ide_driver *driver) { ; } static inline void ide_proc_unregister_driver(ide_drive_t *drive, struct ide_driver *driver) { ; } -#define PROC_IDE_READ_RETURN(page,start,off,count,eof,len) return 0; #endif enum { From 69575d388603365f2afbf4166df93152df59b165 Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Tue, 1 Sep 2009 18:25:07 -0700 Subject: [PATCH 0270/1662] x86, intel_txt: clean up the impact on generic code, unbreak non-x86 Move tboot.h from asm to linux to fix the build errors of intel_txt patch on non-X86 platforms. Remove the tboot code from generic code init/main.c and kernel/cpu.c. Signed-off-by: Shane Wang Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 ++ arch/x86/kernel/reboot.c | 3 +- arch/x86/kernel/setup.c | 3 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/tboot.c | 58 +++++++++++++++---- drivers/acpi/acpica/hwsleep.c | 2 +- drivers/pci/dmar.c | 2 +- drivers/pci/intel-iommu.c | 2 +- .../x86/include/asm => include/linux}/tboot.h | 57 ++++-------------- init/main.c | 3 - kernel/cpu.c | 6 +- security/Kconfig | 2 +- 12 files changed, 70 insertions(+), 74 deletions(-) rename {arch/x86/include/asm => include/linux}/tboot.h (83%) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 738bdc6b0f8b..b66f2102c35d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -178,6 +178,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_INTEL_TXT + def_bool y + depends on EXPERIMENTAL && DMAR && ACPI + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 9de01c5d9794..18ce5c04242a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -24,8 +25,6 @@ # include #endif -#include - /* * Power off function, if any */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 80d6e9e32483..6ce0d6f38f7f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -66,6 +66,7 @@ #include #include +#include #include