From 9394c1c65e61eb6f4c1c99f342b49e451ec337b6 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 11 Mar 2013 13:52:12 +0100
Subject: [PATCH 01/31] ARM: 7669/1: keep __my_cpu_offset consistent with
 generic one

Commit 14318efb(ARM: 7587/1: implement optimized percpu variable access)
introduces arm's __my_cpu_offset to optimize percpu vaiable access,
which really works well on hackbench, but will cause __my_cpu_offset
to return garbage value before it is initialized in cpu_init() called
by setup_arch, so accessing percpu variable before setup_arch may cause
kernel hang. But generic __my_cpu_offset always returns zero before
percpu area is brought up, and won't hang kernel.

So the patch tries to clear __my_cpu_offset on boot CPU early
to avoid boot hang.

At least now percpu variable is accessed by lockdep before
setup_arch(), and enabling CONFIG_LOCK_STAT or CONFIG_DEBUG_LOCKDEP
can trigger kernel hang.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/setup.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 1522c7ae31b0..dd1c6aacbaf9 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -456,6 +456,13 @@ void __init smp_setup_processor_id(void)
 	for (i = 1; i < nr_cpu_ids; ++i)
 		cpu_logical_map(i) = i == cpu ? 0 : i;
 
+	/*
+	 * clear __my_cpu_offset on boot CPU to avoid hang caused by
+	 * using percpu variable early, for example, lockdep will
+	 * access percpu variable inside lock_release
+	 */
+	set_my_cpu_offset(0);
+
 	printk(KERN_INFO "Booting Linux on physical CPU 0x%x\n", mpidr);
 }
 

From 049f3e84d34bfc37f25c91715f2e24a90fb8665e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 29 Apr 2013 16:06:10 +0100
Subject: [PATCH 02/31] ARM: 7705/1: use optimized do_div only for EABI

In OABI configurations, some uses of the do_div function
cause gcc to run out of registers. To work around that,
we can force the use of the out-of-line version for
configurations that build a OABI kernel.

Without this patch, building netx_defconfig results in:

net/core/pktgen.c: In function 'pktgen_if_show':
net/core/pktgen.c:682:2775: error: can't find a register in class 'GENERAL_REGS' while reloading 'asm'
net/core/pktgen.c:682:3153: error: can't find a register in class 'GENERAL_REGS' while reloading 'asm'
net/core/pktgen.c:682:2775: error: 'asm' operand has impossible constraints
net/core/pktgen.c:682:3153: error: 'asm' operand has impossible constraints

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/div64.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index fe92ccf1d0b0..191ada6e4d2d 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -46,7 +46,7 @@
 	__rem;							\
 })
 
-#if __GNUC__ < 4
+#if __GNUC__ < 4 || !defined(CONFIG_AEABI)
 
 /*
  * gcc versions earlier than 4.0 are simply too problematic for the

From faefd550c45d8d314e8f260f21565320355c947f Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@free-electrons.com>
Date: Wed, 15 May 2013 09:39:17 +0100
Subject: [PATCH 03/31] ARM: 7722/1: zImage: Convert 32bits memory size and
 address from ATAG to 64bits DTB

When CONFIG_ARM_APPENDED_DTB is selected, if the bootloader provides
an ATAG_MEM it replaces the memory size and the memory address in the
memory node of the device tree. In the case of a system which can
handle more than 4GB, the memory node cell size is 4: each data
(memory size and memory address) are 64 bits and then use 2 cells.

The current code in atags_to_fdt.c made the assumption of a cell size
of 2 (one cell for the memory size and one cell for the memory
address), this leads to an improper write of the data and ends with a
boot hang.

This patch writes the memory size and the memory address on the memory
node in the device tree depending of the size of the memory node (32
bits or 64 bits).

It has been tested in the 2 cases:
- with a dtb using skeleton.dtsi
- and with a dtb using skeleton64.dtsi

Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Acked-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/boot/compressed/atags_to_fdt.c | 44 +++++++++++++++++++++----
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/arch/arm/boot/compressed/atags_to_fdt.c b/arch/arm/boot/compressed/atags_to_fdt.c
index aabc02a68482..d1153c8a765a 100644
--- a/arch/arm/boot/compressed/atags_to_fdt.c
+++ b/arch/arm/boot/compressed/atags_to_fdt.c
@@ -53,6 +53,17 @@ static const void *getprop(const void *fdt, const char *node_path,
 	return fdt_getprop(fdt, offset, property, len);
 }
 
+static uint32_t get_cell_size(const void *fdt)
+{
+	int len;
+	uint32_t cell_size = 1;
+	const uint32_t *size_len =  getprop(fdt, "/", "#size-cells", &len);
+
+	if (size_len)
+		cell_size = fdt32_to_cpu(*size_len);
+	return cell_size;
+}
+
 static void merge_fdt_bootargs(void *fdt, const char *fdt_cmdline)
 {
 	char cmdline[COMMAND_LINE_SIZE];
@@ -95,9 +106,11 @@ static void merge_fdt_bootargs(void *fdt, const char *fdt_cmdline)
 int atags_to_fdt(void *atag_list, void *fdt, int total_space)
 {
 	struct tag *atag = atag_list;
-	uint32_t mem_reg_property[2 * NR_BANKS];
+	/* In the case of 64 bits memory size, need to reserve 2 cells for
+	 * address and size for each bank */
+	uint32_t mem_reg_property[2 * 2 * NR_BANKS];
 	int memcount = 0;
-	int ret;
+	int ret, memsize;
 
 	/* make sure we've got an aligned pointer */
 	if ((u32)atag_list & 0x3)
@@ -137,8 +150,25 @@ int atags_to_fdt(void *atag_list, void *fdt, int total_space)
 				continue;
 			if (!atag->u.mem.size)
 				continue;
-			mem_reg_property[memcount++] = cpu_to_fdt32(atag->u.mem.start);
-			mem_reg_property[memcount++] = cpu_to_fdt32(atag->u.mem.size);
+			memsize = get_cell_size(fdt);
+
+			if (memsize == 2) {
+				/* if memsize is 2, that means that
+				 * each data needs 2 cells of 32 bits,
+				 * so the data are 64 bits */
+				uint64_t *mem_reg_prop64 =
+					(uint64_t *)mem_reg_property;
+				mem_reg_prop64[memcount++] =
+					cpu_to_fdt64(atag->u.mem.start);
+				mem_reg_prop64[memcount++] =
+					cpu_to_fdt64(atag->u.mem.size);
+			} else {
+				mem_reg_property[memcount++] =
+					cpu_to_fdt32(atag->u.mem.start);
+				mem_reg_property[memcount++] =
+					cpu_to_fdt32(atag->u.mem.size);
+			}
+
 		} else if (atag->hdr.tag == ATAG_INITRD2) {
 			uint32_t initrd_start, initrd_size;
 			initrd_start = atag->u.initrd.start;
@@ -150,8 +180,10 @@ int atags_to_fdt(void *atag_list, void *fdt, int total_space)
 		}
 	}
 
-	if (memcount)
-		setprop(fdt, "/memory", "reg", mem_reg_property, 4*memcount);
+	if (memcount) {
+		setprop(fdt, "/memory", "reg", mem_reg_property,
+			4 * memcount * memsize);
+	}
 
 	return fdt_pack(fdt);
 }

From 3b656fed6ff65d6d268da9ed0760c2a58d125771 Mon Sep 17 00:00:00 2001
From: Christian Daudt <csd@broadcom.com>
Date: Thu, 9 May 2013 22:21:01 +0100
Subject: [PATCH 04/31] ARM: 7716/1: bcm281xx: Add L2 support for Rev A2 chips

Rev A2 SoCs have an unorthodox memory re-mapping and this needs
to be reflected in the cache operations.
This patch adds new outer cache functions for the l2x0 driver
to support this SoC revision. It also adds a new compatible
value for the cache to enable this functionality.

Updates from V1:
- remove section 1 altogether and note that in comments
- simplify section selection caused by section 1 removal
- BUG_ON just in case section 1 shows up

Signed-off-by: Christian Daudt <csd@broadcom.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 .../devicetree/bindings/arm/l2cc.txt          |   3 +
 arch/arm/boot/dts/bcm11351.dtsi               |   8 +-
 arch/arm/mm/cache-l2x0.c                      | 158 ++++++++++++++++++
 3 files changed, 165 insertions(+), 4 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/l2cc.txt b/Documentation/devicetree/bindings/arm/l2cc.txt
index cbef09b5c8a7..69ddf9fad2dc 100644
--- a/Documentation/devicetree/bindings/arm/l2cc.txt
+++ b/Documentation/devicetree/bindings/arm/l2cc.txt
@@ -16,6 +16,9 @@ Required properties:
      performs the same operation).
 	"marvell,"aurora-outer-cache: Marvell Controller designed to be
 	 compatible with the ARM one with outer cache mode.
+	"bcm,bcm11351-a2-pl310-cache": For Broadcom bcm11351 chipset where an
+	offset needs to be added to the address before passing down to the L2
+	cache controller
 - cache-unified : Specifies the cache is a unified cache.
 - cache-level : Should be set to 2 for a level 2 cache.
 - reg : Physical base address and size of cache controller's memory mapped
diff --git a/arch/arm/boot/dts/bcm11351.dtsi b/arch/arm/boot/dts/bcm11351.dtsi
index 41b2c6c33f09..5e48c85abc2f 100644
--- a/arch/arm/boot/dts/bcm11351.dtsi
+++ b/arch/arm/boot/dts/bcm11351.dtsi
@@ -47,10 +47,10 @@
 	};
 
 	L2: l2-cache {
-		    compatible = "arm,pl310-cache";
-		    reg = <0x3ff20000 0x1000>;
-		    cache-unified;
-		    cache-level = <2>;
+		compatible = "bcm,bcm11351-a2-pl310-cache";
+		reg = <0x3ff20000 0x1000>;
+		cache-unified;
+		cache-level = <2>;
 	};
 
 	timer@35006000 {
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index c465faca51b0..d70e0aba0c9d 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -523,6 +523,147 @@ static void aurora_flush_range(unsigned long start, unsigned long end)
 	}
 }
 
+/*
+ * For certain Broadcom SoCs, depending on the address range, different offsets
+ * need to be added to the address before passing it to L2 for
+ * invalidation/clean/flush
+ *
+ * Section Address Range              Offset        EMI
+ *   1     0x00000000 - 0x3FFFFFFF    0x80000000    VC
+ *   2     0x40000000 - 0xBFFFFFFF    0x40000000    SYS
+ *   3     0xC0000000 - 0xFFFFFFFF    0x80000000    VC
+ *
+ * When the start and end addresses have crossed two different sections, we
+ * need to break the L2 operation into two, each within its own section.
+ * For example, if we need to invalidate addresses starts at 0xBFFF0000 and
+ * ends at 0xC0001000, we need do invalidate 1) 0xBFFF0000 - 0xBFFFFFFF and 2)
+ * 0xC0000000 - 0xC0001000
+ *
+ * Note 1:
+ * By breaking a single L2 operation into two, we may potentially suffer some
+ * performance hit, but keep in mind the cross section case is very rare
+ *
+ * Note 2:
+ * We do not need to handle the case when the start address is in
+ * Section 1 and the end address is in Section 3, since it is not a valid use
+ * case
+ *
+ * Note 3:
+ * Section 1 in practical terms can no longer be used on rev A2. Because of
+ * that the code does not need to handle section 1 at all.
+ *
+ */
+#define BCM_SYS_EMI_START_ADDR        0x40000000UL
+#define BCM_VC_EMI_SEC3_START_ADDR    0xC0000000UL
+
+#define BCM_SYS_EMI_OFFSET            0x40000000UL
+#define BCM_VC_EMI_OFFSET             0x80000000UL
+
+static inline int bcm_addr_is_sys_emi(unsigned long addr)
+{
+	return (addr >= BCM_SYS_EMI_START_ADDR) &&
+		(addr < BCM_VC_EMI_SEC3_START_ADDR);
+}
+
+static inline unsigned long bcm_l2_phys_addr(unsigned long addr)
+{
+	if (bcm_addr_is_sys_emi(addr))
+		return addr + BCM_SYS_EMI_OFFSET;
+	else
+		return addr + BCM_VC_EMI_OFFSET;
+}
+
+static void bcm_inv_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2x0_inv_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2x0_inv_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2x0_inv_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
+static void bcm_clean_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	if ((end - start) >= l2x0_size) {
+		l2x0_clean_all();
+		return;
+	}
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2x0_clean_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2x0_clean_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2x0_clean_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
+static void bcm_flush_range(unsigned long start, unsigned long end)
+{
+	unsigned long new_start, new_end;
+
+	BUG_ON(start < BCM_SYS_EMI_START_ADDR);
+
+	if (unlikely(end <= start))
+		return;
+
+	if ((end - start) >= l2x0_size) {
+		l2x0_flush_all();
+		return;
+	}
+
+	new_start = bcm_l2_phys_addr(start);
+	new_end = bcm_l2_phys_addr(end);
+
+	/* normal case, no cross section between start and end */
+	if (likely(bcm_addr_is_sys_emi(end) || !bcm_addr_is_sys_emi(start))) {
+		l2x0_flush_range(new_start, new_end);
+		return;
+	}
+
+	/* They cross sections, so it can only be a cross from section
+	 * 2 to section 3
+	 */
+	l2x0_flush_range(new_start,
+		bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR-1));
+	l2x0_flush_range(bcm_l2_phys_addr(BCM_VC_EMI_SEC3_START_ADDR),
+		new_end);
+}
+
 static void __init l2x0_of_setup(const struct device_node *np,
 				 u32 *aux_val, u32 *aux_mask)
 {
@@ -765,6 +906,21 @@ static const struct l2x0_of_data aurora_no_outer_data = {
 	},
 };
 
+static const struct l2x0_of_data bcm_l2x0_data = {
+	.setup = pl310_of_setup,
+	.save  = pl310_save,
+	.outer_cache = {
+		.resume      = pl310_resume,
+		.inv_range   = bcm_inv_range,
+		.clean_range = bcm_clean_range,
+		.flush_range = bcm_flush_range,
+		.sync        = l2x0_cache_sync,
+		.flush_all   = l2x0_flush_all,
+		.inv_all     = l2x0_inv_all,
+		.disable     = l2x0_disable,
+	},
+};
+
 static const struct of_device_id l2x0_ids[] __initconst = {
 	{ .compatible = "arm,pl310-cache", .data = (void *)&pl310_data },
 	{ .compatible = "arm,l220-cache", .data = (void *)&l2x0_data },
@@ -773,6 +929,8 @@ static const struct of_device_id l2x0_ids[] __initconst = {
 	  .data = (void *)&aurora_no_outer_data},
 	{ .compatible = "marvell,aurora-outer-cache",
 	  .data = (void *)&aurora_with_outer_data},
+	{ .compatible = "bcm,bcm11351-a2-pl310-cache",
+	  .data = (void *)&bcm_l2x0_data},
 	{}
 };
 

From 665ba56fdab1670ef686ec35569aa1de0ec4ef4d Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 13 May 2013 15:39:17 +0100
Subject: [PATCH 05/31] ARM: 7717/1: mmc: mmci: Use devm_clk_get API

Converting to devm_clk_get simplifies error handling in probe
and we can remove other corresponding calls to clk_put.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index f4f3038c1df0..d57ce4332f83 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -1362,16 +1362,15 @@ static int mmci_probe(struct amba_device *dev,
 	dev_dbg(mmc_dev(mmc), "designer ID = 0x%02x\n", host->hw_designer);
 	dev_dbg(mmc_dev(mmc), "revision = 0x%01x\n", host->hw_revision);
 
-	host->clk = clk_get(&dev->dev, NULL);
+	host->clk = devm_clk_get(&dev->dev, NULL);
 	if (IS_ERR(host->clk)) {
 		ret = PTR_ERR(host->clk);
-		host->clk = NULL;
 		goto host_free;
 	}
 
 	ret = clk_prepare_enable(host->clk);
 	if (ret)
-		goto clk_free;
+		goto host_free;
 
 	host->plat = plat;
 	host->variant = variant;
@@ -1576,8 +1575,6 @@ static int mmci_probe(struct amba_device *dev,
 	iounmap(host->base);
  clk_disable:
 	clk_disable_unprepare(host->clk);
- clk_free:
-	clk_put(host->clk);
  host_free:
 	mmc_free_host(mmc);
  rel_regions:
@@ -1623,7 +1620,6 @@ static int mmci_remove(struct amba_device *dev)
 
 		iounmap(host->base);
 		clk_disable_unprepare(host->clk);
-		clk_put(host->clk);
 
 		mmc_free_host(mmc);
 

From c58a85090c8fbf7274af5ed0fe30828320c40ad6 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 13 May 2013 15:40:03 +0100
Subject: [PATCH 06/31] ARM: 7718/1: mmc: mmci: Set actual clock for debug
 purpose

Update cclk to the acutal used value and copy it to the actual_clock
variable in the mmc host for debug purpose.

Signed-off-by: Fredrik Soderstedt <fredrik.soderstedt@stericsson.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index d57ce4332f83..5f53cf8b8f5a 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -196,6 +196,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
 	struct variant_data *variant = host->variant;
 	u32 clk = variant->clkreg;
 
+	/* Make sure cclk reflects the current calculated clock */
+	host->cclk = 0;
+
 	if (desired) {
 		if (desired >= host->mclk) {
 			clk = MCI_CLK_BYPASS;
@@ -230,6 +233,9 @@ static void mmci_set_clkreg(struct mmci_host *host, unsigned int desired)
 		/* clk |= MCI_CLK_PWRSAVE; */
 	}
 
+	/* Set actual clock for debug */
+	host->mmc->actual_clock = host->cclk;
+
 	if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_4)
 		clk |= MCI_4BIT_BUS;
 	if (host->mmc->ios.bus_width == MMC_BUS_WIDTH_8)

From 024629c62ffc25f267d57bf588cc10c96ccc0ce5 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 13 May 2013 15:40:56 +0100
Subject: [PATCH 07/31] ARM: 7719/1: mmc: mmci: Support for CMD23

Support added for transmission of CMD23 during multi block read or
write. In order to activate this feature, MMC_CAP_CMD23 flag needs
to be enabled in the capabilities field. Note that CMD23 support is
mandatory to support features like reliable write, data tag, context
ID, packed command.

This patch is based upon a patch from Saugata Das.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 5f53cf8b8f5a..9df8b84145b1 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -848,7 +848,7 @@ mmci_data_irq(struct mmci_host *host, struct mmc_data *data,
 			/* The error clause is handled above, success! */
 			data->bytes_xfered = data->blksz * data->blocks;
 
-		if (!data->stop) {
+		if (!data->stop || host->mrq->sbc) {
 			mmci_request_end(host, data->mrq);
 		} else {
 			mmci_start_command(host, data->stop, 0);
@@ -861,6 +861,7 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 	     unsigned int status)
 {
 	void __iomem *base = host->base;
+	bool sbc = (cmd == host->mrq->sbc);
 
 	host->cmd = NULL;
 
@@ -875,7 +876,7 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 		cmd->resp[3] = readl(base + MMCIRESPONSE3);
 	}
 
-	if (!cmd->data || cmd->error) {
+	if ((!sbc && !cmd->data) || cmd->error) {
 		if (host->data) {
 			/* Terminate the DMA transfer */
 			if (dma_inprogress(host)) {
@@ -884,7 +885,9 @@ mmci_cmd_irq(struct mmci_host *host, struct mmc_command *cmd,
 			}
 			mmci_stop_data(host);
 		}
-		mmci_request_end(host, cmd->mrq);
+		mmci_request_end(host, host->mrq);
+	} else if (sbc) {
+		mmci_start_command(host, host->mrq->cmd, 0);
 	} else if (!(cmd->data->flags & MMC_DATA_READ)) {
 		mmci_start_data(host, cmd->data);
 	}
@@ -1125,7 +1128,10 @@ static void mmci_request(struct mmc_host *mmc, struct mmc_request *mrq)
 	if (mrq->data && mrq->data->flags & MMC_DATA_READ)
 		mmci_start_data(host, mrq->data);
 
-	mmci_start_command(host, mrq->cmd, 0);
+	if (mrq->sbc)
+		mmci_start_command(host, mrq->sbc, 0);
+	else
+		mmci_start_command(host, mrq->cmd, 0);
 
 	spin_unlock_irqrestore(&host->lock, flags);
 }

From 1fd83f0ecf87e33ab560e8229842cf10f91552ee Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 3 May 2013 12:51:17 +0100
Subject: [PATCH 08/31] ARM: 7713/1: mmc: mmci: Allow MMCI to request channels
 with information acquired from DT

Currently, if DMA information isn't passed from platform data, then DMA
will not be used. This patch allows DMA information obtained though Device
Tree to be used as well.

Cc: Chris Ball <cjb@laptop.org>
Cc: linux-mmc@vger.kernel.org
Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 43 +++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index 9df8b84145b1..c6d8b6216069 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -310,10 +310,8 @@ static void mmci_dma_setup(struct mmci_host *host)
 	const char *rxname, *txname;
 	dma_cap_mask_t mask;
 
-	if (!plat || !plat->dma_filter) {
-		dev_info(mmc_dev(host->mmc), "no DMA platform data\n");
-		return;
-	}
+	host->dma_rx_channel = dma_request_slave_channel(mmc_dev(host->mmc), "rx");
+	host->dma_tx_channel = dma_request_slave_channel(mmc_dev(host->mmc), "tx");
 
 	/* initialize pre request cookie */
 	host->next_data.cookie = 1;
@@ -322,29 +320,32 @@ static void mmci_dma_setup(struct mmci_host *host)
 	dma_cap_zero(mask);
 	dma_cap_set(DMA_SLAVE, mask);
 
+	if (plat && plat->dma_filter) {
+		if (!host->dma_rx_channel && plat->dma_rx_param) {
+			host->dma_rx_channel = dma_request_channel(mask,
+							   plat->dma_filter,
+							   plat->dma_rx_param);
+			/* E.g if no DMA hardware is present */
+			if (!host->dma_rx_channel)
+				dev_err(mmc_dev(host->mmc), "no RX DMA channel\n");
+		}
+
+		if (!host->dma_tx_channel && plat->dma_tx_param) {
+			host->dma_tx_channel = dma_request_channel(mask,
+							   plat->dma_filter,
+							   plat->dma_tx_param);
+			if (!host->dma_tx_channel)
+				dev_warn(mmc_dev(host->mmc), "no TX DMA channel\n");
+		}
+	}
+
 	/*
 	 * If only an RX channel is specified, the driver will
 	 * attempt to use it bidirectionally, however if it is
 	 * is specified but cannot be located, DMA will be disabled.
 	 */
-	if (plat->dma_rx_param) {
-		host->dma_rx_channel = dma_request_channel(mask,
-							   plat->dma_filter,
-							   plat->dma_rx_param);
-		/* E.g if no DMA hardware is present */
-		if (!host->dma_rx_channel)
-			dev_err(mmc_dev(host->mmc), "no RX DMA channel\n");
-	}
-
-	if (plat->dma_tx_param) {
-		host->dma_tx_channel = dma_request_channel(mask,
-							   plat->dma_filter,
-							   plat->dma_tx_param);
-		if (!host->dma_tx_channel)
-			dev_warn(mmc_dev(host->mmc), "no TX DMA channel\n");
-	} else {
+	if (host->dma_rx_channel && !host->dma_tx_channel)
 		host->dma_tx_channel = host->dma_rx_channel;
-	}
 
 	if (host->dma_rx_channel)
 		rxname = dma_chan_name(host->dma_rx_channel);

From 7c0136ef773c206e242b9718740377a45747bd70 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Tue, 14 May 2013 13:53:10 +0100
Subject: [PATCH 09/31] ARM: 7721/1: mmc: mmci: Fixup regulator handling for
 vqmmc

We can not rely on regulator_is_enabled to decide whether to
enable|disable the regulator. It would mean that the reference
counter for it is not balanced properly.

Instead keep track of our internal state by using a new flag in
the host struct, so we can take correct decisions.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 10 ++++++----
 drivers/mmc/host/mmci.h |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index c6d8b6216069..a8bbdd1c4314 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -1156,9 +1156,10 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 		if (!IS_ERR(mmc->supply.vmmc))
 			mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, 0);
 
-		if (!IS_ERR(mmc->supply.vqmmc) &&
-		    regulator_is_enabled(mmc->supply.vqmmc))
+		if (!IS_ERR(mmc->supply.vqmmc) && host->vqmmc_enabled) {
 			regulator_disable(mmc->supply.vqmmc);
+			host->vqmmc_enabled = false;
+		}
 
 		break;
 	case MMC_POWER_UP:
@@ -1174,12 +1175,13 @@ static void mmci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
 
 		break;
 	case MMC_POWER_ON:
-		if (!IS_ERR(mmc->supply.vqmmc) &&
-		    !regulator_is_enabled(mmc->supply.vqmmc)) {
+		if (!IS_ERR(mmc->supply.vqmmc) && !host->vqmmc_enabled) {
 			ret = regulator_enable(mmc->supply.vqmmc);
 			if (ret < 0)
 				dev_err(mmc_dev(mmc),
 					"failed to enable vqmmc regulator\n");
+			else
+				host->vqmmc_enabled = true;
 		}
 
 		pwr |= MCI_PWR_ON;
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 1f33ad5333a0..1383c9ce2646 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -183,6 +183,7 @@ struct mmci_host {
 	unsigned int		cclk;
 	u32			pwr_reg;
 	u32			clk_reg;
+	bool			vqmmc_enabled;
 	struct mmci_platform_data *plat;
 	struct variant_data	*variant;
 

From 0f3ed7f75cf1a16df9309f3a9ffaf62a3fc1f0bb Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 15 May 2013 20:47:33 +0100
Subject: [PATCH 10/31] ARM: 7724/1: mmc: mmci: Support signal voltage switch
 for UHS cards

Add .start_signal_voltage_switch callback to be able to support UHS cards.
The voltage switch requires the optional vqmmc regulator to exist since
the actual voltage switch will be performed directly on it.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index a8bbdd1c4314..cb9e562b3571 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -1266,6 +1266,39 @@ static int mmci_get_cd(struct mmc_host *mmc)
 	return status;
 }
 
+static int mmci_sig_volt_switch(struct mmc_host *mmc, struct mmc_ios *ios)
+{
+	int ret = 0;
+
+	if (!IS_ERR(mmc->supply.vqmmc)) {
+
+		pm_runtime_get_sync(mmc_dev(mmc));
+
+		switch (ios->signal_voltage) {
+		case MMC_SIGNAL_VOLTAGE_330:
+			ret = regulator_set_voltage(mmc->supply.vqmmc,
+						2700000, 3600000);
+			break;
+		case MMC_SIGNAL_VOLTAGE_180:
+			ret = regulator_set_voltage(mmc->supply.vqmmc,
+						1700000, 1950000);
+			break;
+		case MMC_SIGNAL_VOLTAGE_120:
+			ret = regulator_set_voltage(mmc->supply.vqmmc,
+						1100000, 1300000);
+			break;
+		}
+
+		if (ret)
+			dev_warn(mmc_dev(mmc), "Voltage switch failed\n");
+
+		pm_runtime_mark_last_busy(mmc_dev(mmc));
+		pm_runtime_put_autosuspend(mmc_dev(mmc));
+	}
+
+	return ret;
+}
+
 static irqreturn_t mmci_cd_irq(int irq, void *dev_id)
 {
 	struct mmci_host *host = dev_id;
@@ -1282,6 +1315,7 @@ static const struct mmc_host_ops mmci_ops = {
 	.set_ios	= mmci_set_ios,
 	.get_ro		= mmci_get_ro,
 	.get_cd		= mmci_get_cd,
+	.start_signal_voltage_switch = mmci_sig_volt_switch,
 };
 
 #ifdef CONFIG_OF

From 9cc639a20fdc0b935e55d4992f93963f95233ca4 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 15 May 2013 20:48:23 +0100
Subject: [PATCH 11/31] ARM: 7725/1: mmc: mmci: Cache MMCIDATACTRL register

Add a cache variable in the host struct that reflects the current data in
the MMCIDATACTRL register. This patch will not introduce any functional
change but instead provide an easy option to keep specific bits in the
register between each data transfer.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 17 ++++++++++++++---
 drivers/mmc/host/mmci.h |  1 +
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index cb9e562b3571..ccfe0bc62f78 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -188,6 +188,17 @@ static void mmci_write_pwrreg(struct mmci_host *host, u32 pwr)
 	}
 }
 
+/*
+ * This must be called with host->lock held
+ */
+static void mmci_write_datactrlreg(struct mmci_host *host, u32 datactrl)
+{
+	if (host->datactrl_reg != datactrl) {
+		host->datactrl_reg = datactrl;
+		writel(datactrl, host->base + MMCIDATACTRL);
+	}
+}
+
 /*
  * This must be called with host->lock held
  */
@@ -281,7 +292,7 @@ static void mmci_set_mask1(struct mmci_host *host, unsigned int mask)
 
 static void mmci_stop_data(struct mmci_host *host)
 {
-	writel(0, host->base + MMCIDATACTRL);
+	mmci_write_datactrlreg(host, 0);
 	mmci_set_mask1(host, 0);
 	host->data = NULL;
 }
@@ -559,7 +570,7 @@ static int mmci_dma_start_data(struct mmci_host *host, unsigned int datactrl)
 	datactrl |= MCI_DPSM_DMAENABLE;
 
 	/* Trigger the DMA transfer */
-	writel(datactrl, host->base + MMCIDATACTRL);
+	mmci_write_datactrlreg(host, datactrl);
 
 	/*
 	 * Let the MMCI say when the data is ended and it's time
@@ -757,7 +768,7 @@ static void mmci_start_data(struct mmci_host *host, struct mmc_data *data)
 		irqmask = MCI_TXFIFOHALFEMPTYMASK;
 	}
 
-	writel(datactrl, base + MMCIDATACTRL);
+	mmci_write_datactrlreg(host, datactrl);
 	writel(readl(base + MMCIMASK0) & ~MCI_DATAENDMASK, base + MMCIMASK0);
 	mmci_set_mask1(host, irqmask);
 }
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 1383c9ce2646..0b6cc54be966 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -183,6 +183,7 @@ struct mmci_host {
 	unsigned int		cclk;
 	u32			pwr_reg;
 	u32			clk_reg;
+	u32			datactrl_reg;
 	bool			vqmmc_enabled;
 	struct mmci_platform_data *plat;
 	struct variant_data	*variant;

From 0125962000777cd1e2ce53deefb99779d5ee5199 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Wed, 15 May 2013 20:53:22 +0100
Subject: [PATCH 12/31] ARM: 7726/1: mmc: mmci: Add card_busy function to
 improve UHS card support

To verify a signal voltage switch at initialization of UHS cards the
.card_busy callback is used. For some of the ST-variants, card busy
detection on the DAT0 pin is supported.

We extend the variant struct with a busy_detect flag to indicate
support for it. A corresponding busy detect function, which polls the
busy status bit, is then set to the .card_busy callback.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/mmc/host/mmci.c | 33 ++++++++++++++++++++++++++++++++-
 drivers/mmc/host/mmci.h |  2 ++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c
index ccfe0bc62f78..c3785edc0e92 100644
--- a/drivers/mmc/host/mmci.c
+++ b/drivers/mmc/host/mmci.c
@@ -61,6 +61,7 @@ static unsigned int fmax = 515633;
  * @pwrreg_powerup: power up value for MMCIPOWER register
  * @signal_direction: input/out direction of bus signals can be indicated
  * @pwrreg_clkgate: MMCIPOWER register must be used to gate the clock
+ * @busy_detect: true if busy detection on dat0 is supported
  */
 struct variant_data {
 	unsigned int		clkreg;
@@ -74,6 +75,7 @@ struct variant_data {
 	u32			pwrreg_powerup;
 	bool			signal_direction;
 	bool			pwrreg_clkgate;
+	bool			busy_detect;
 };
 
 static struct variant_data variant_arm = {
@@ -132,6 +134,7 @@ static struct variant_data variant_ux500 = {
 	.pwrreg_powerup		= MCI_PWR_ON,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
+	.busy_detect		= true,
 };
 
 static struct variant_data variant_ux500v2 = {
@@ -146,8 +149,28 @@ static struct variant_data variant_ux500v2 = {
 	.pwrreg_powerup		= MCI_PWR_ON,
 	.signal_direction	= true,
 	.pwrreg_clkgate		= true,
+	.busy_detect		= true,
 };
 
+static int mmci_card_busy(struct mmc_host *mmc)
+{
+	struct mmci_host *host = mmc_priv(mmc);
+	unsigned long flags;
+	int busy = 0;
+
+	pm_runtime_get_sync(mmc_dev(mmc));
+
+	spin_lock_irqsave(&host->lock, flags);
+	if (readl(host->base + MMCISTATUS) & MCI_ST_CARDBUSY)
+		busy = 1;
+	spin_unlock_irqrestore(&host->lock, flags);
+
+	pm_runtime_mark_last_busy(mmc_dev(mmc));
+	pm_runtime_put_autosuspend(mmc_dev(mmc));
+
+	return busy;
+}
+
 /*
  * Validate mmc prerequisites
  */
@@ -193,6 +216,9 @@ static void mmci_write_pwrreg(struct mmci_host *host, u32 pwr)
  */
 static void mmci_write_datactrlreg(struct mmci_host *host, u32 datactrl)
 {
+	/* Keep ST Micro busy mode if enabled */
+	datactrl |= host->datactrl_reg & MCI_ST_DPSM_BUSYMODE;
+
 	if (host->datactrl_reg != datactrl) {
 		host->datactrl_reg = datactrl;
 		writel(datactrl, host->base + MMCIDATACTRL);
@@ -1319,7 +1345,7 @@ static irqreturn_t mmci_cd_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static const struct mmc_host_ops mmci_ops = {
+static struct mmc_host_ops mmci_ops = {
 	.request	= mmci_request,
 	.pre_req	= mmci_pre_request,
 	.post_req	= mmci_post_request,
@@ -1455,6 +1481,11 @@ static int mmci_probe(struct amba_device *dev,
 		goto clk_disable;
 	}
 
+	if (variant->busy_detect) {
+		mmci_ops.card_busy = mmci_card_busy;
+		mmci_write_datactrlreg(host, MCI_ST_DPSM_BUSYMODE);
+	}
+
 	mmc->ops = &mmci_ops;
 	/*
 	 * The ARM and ST versions of the block have slightly different
diff --git a/drivers/mmc/host/mmci.h b/drivers/mmc/host/mmci.h
index 0b6cc54be966..69080fab6375 100644
--- a/drivers/mmc/host/mmci.h
+++ b/drivers/mmc/host/mmci.h
@@ -94,6 +94,7 @@
 /* Extended status bits for the ST Micro variants */
 #define MCI_ST_SDIOIT		(1 << 22)
 #define MCI_ST_CEATAEND		(1 << 23)
+#define MCI_ST_CARDBUSY		(1 << 24)
 
 #define MMCICLEAR		0x038
 #define MCI_CMDCRCFAILCLR	(1 << 0)
@@ -110,6 +111,7 @@
 /* Extended status bits for the ST Micro variants */
 #define MCI_ST_SDIOITC		(1 << 22)
 #define MCI_ST_CEATAENDC	(1 << 23)
+#define MCI_ST_BUSYENDC		(1 << 24)
 
 #define MMCIMASK0		0x03c
 #define MCI_CMDCRCFAILMASK	(1 << 0)

From 9b97173e785a54c5df0aa23d1e1f680f61e36e43 Mon Sep 17 00:00:00 2001
From: Laura Abbott <lauraa@codeaurora.org>
Date: Thu, 16 May 2013 19:40:22 +0100
Subject: [PATCH 13/31] ARM: 7728/1: mm: Use phys_addr_t properly for ioremap
 functions

Several of the ioremap functions use unsigned long in places
resulting in truncation if physical addresses greater than
4G are passed in. Change the types of the functions and the
callers accordingly.

Cc: Krzysztof Halasa <khc@pm.waw.pl>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/io.h     |  8 ++++----
 arch/arm/mach-ebsa110/core.c  |  2 +-
 arch/arm/mach-imx/mm-imx3.c   |  2 +-
 arch/arm/mach-iop13xx/io.c    |  2 +-
 arch/arm/mach-ixp4xx/common.c |  2 +-
 arch/arm/mach-msm/common.h    |  2 +-
 arch/arm/mach-msm/io.c        |  2 +-
 arch/arm/mm/ioremap.c         | 10 +++++-----
 arch/arm/mm/nommu.c           |  6 +++---
 9 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h
index 652b56086de7..d070741b2b37 100644
--- a/arch/arm/include/asm/io.h
+++ b/arch/arm/include/asm/io.h
@@ -130,16 +130,16 @@ static inline u32 __raw_readl(const volatile void __iomem *addr)
  */
 extern void __iomem *__arm_ioremap_pfn_caller(unsigned long, unsigned long,
 	size_t, unsigned int, void *);
-extern void __iomem *__arm_ioremap_caller(unsigned long, size_t, unsigned int,
+extern void __iomem *__arm_ioremap_caller(phys_addr_t, size_t, unsigned int,
 	void *);
 
 extern void __iomem *__arm_ioremap_pfn(unsigned long, unsigned long, size_t, unsigned int);
-extern void __iomem *__arm_ioremap(unsigned long, size_t, unsigned int);
-extern void __iomem *__arm_ioremap_exec(unsigned long, size_t, bool cached);
+extern void __iomem *__arm_ioremap(phys_addr_t, size_t, unsigned int);
+extern void __iomem *__arm_ioremap_exec(phys_addr_t, size_t, bool cached);
 extern void __iounmap(volatile void __iomem *addr);
 extern void __arm_iounmap(volatile void __iomem *addr);
 
-extern void __iomem * (*arch_ioremap_caller)(unsigned long, size_t,
+extern void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t,
 	unsigned int, void *);
 extern void (*arch_iounmap)(volatile void __iomem *);
 
diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c
index b13cc74114db..8a53f346cdb3 100644
--- a/arch/arm/mach-ebsa110/core.c
+++ b/arch/arm/mach-ebsa110/core.c
@@ -116,7 +116,7 @@ static void __init ebsa110_map_io(void)
 	iotable_init(ebsa110_io_desc, ARRAY_SIZE(ebsa110_io_desc));
 }
 
-static void __iomem *ebsa110_ioremap_caller(unsigned long cookie, size_t size,
+static void __iomem *ebsa110_ioremap_caller(phys_addr_t cookie, size_t size,
 					    unsigned int flags, void *caller)
 {
 	return (void __iomem *)cookie;
diff --git a/arch/arm/mach-imx/mm-imx3.c b/arch/arm/mach-imx/mm-imx3.c
index e0e69a682174..eed32ca0b8ab 100644
--- a/arch/arm/mach-imx/mm-imx3.c
+++ b/arch/arm/mach-imx/mm-imx3.c
@@ -65,7 +65,7 @@ static void imx3_idle(void)
 		: "=r" (reg));
 }
 
-static void __iomem *imx3_ioremap_caller(unsigned long phys_addr, size_t size,
+static void __iomem *imx3_ioremap_caller(phys_addr_t phys_addr, size_t size,
 					 unsigned int mtype, void *caller)
 {
 	if (mtype == MT_DEVICE) {
diff --git a/arch/arm/mach-iop13xx/io.c b/arch/arm/mach-iop13xx/io.c
index 183dc8b5511b..faaf7d4482c5 100644
--- a/arch/arm/mach-iop13xx/io.c
+++ b/arch/arm/mach-iop13xx/io.c
@@ -23,7 +23,7 @@
 
 #include "pci.h"
 
-static void __iomem *__iop13xx_ioremap_caller(unsigned long cookie,
+static void __iomem *__iop13xx_ioremap_caller(phys_addr_t cookie,
 	size_t size, unsigned int mtype, void *caller)
 {
 	void __iomem * retval;
diff --git a/arch/arm/mach-ixp4xx/common.c b/arch/arm/mach-ixp4xx/common.c
index 6600cff6bd92..d7223b3b81f3 100644
--- a/arch/arm/mach-ixp4xx/common.c
+++ b/arch/arm/mach-ixp4xx/common.c
@@ -559,7 +559,7 @@ void ixp4xx_restart(char mode, const char *cmd)
  * fallback to the default.
  */
 
-static void __iomem *ixp4xx_ioremap_caller(unsigned long addr, size_t size,
+static void __iomem *ixp4xx_ioremap_caller(phys_addr_t addr, size_t size,
 					   unsigned int mtype, void *caller)
 {
 	if (!is_pci_memory(addr))
diff --git a/arch/arm/mach-msm/common.h b/arch/arm/mach-msm/common.h
index ce8215a269e5..421cf7751a80 100644
--- a/arch/arm/mach-msm/common.h
+++ b/arch/arm/mach-msm/common.h
@@ -23,7 +23,7 @@ extern void msm_map_msm8x60_io(void);
 extern void msm_map_msm8960_io(void);
 extern void msm_map_qsd8x50_io(void);
 
-extern void __iomem *__msm_ioremap_caller(unsigned long phys_addr, size_t size,
+extern void __iomem *__msm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 					  unsigned int mtype, void *caller);
 
 extern struct smp_operations msm_smp_ops;
diff --git a/arch/arm/mach-msm/io.c b/arch/arm/mach-msm/io.c
index 123ef9cbce1b..fd65b6d42cde 100644
--- a/arch/arm/mach-msm/io.c
+++ b/arch/arm/mach-msm/io.c
@@ -172,7 +172,7 @@ void __init msm_map_msm7x30_io(void)
 }
 #endif /* CONFIG_ARCH_MSM7X30 */
 
-void __iomem *__msm_ioremap_caller(unsigned long phys_addr, size_t size,
+void __iomem *__msm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 				   unsigned int mtype, void *caller)
 {
 	if (mtype == MT_DEVICE) {
diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 04d9006eab1f..f123d6eb074b 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -331,10 +331,10 @@ void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
 	return (void __iomem *) (offset + addr);
 }
 
-void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 	unsigned int mtype, void *caller)
 {
-	unsigned long last_addr;
+	phys_addr_t last_addr;
  	unsigned long offset = phys_addr & ~PAGE_MASK;
  	unsigned long pfn = __phys_to_pfn(phys_addr);
 
@@ -367,12 +367,12 @@ __arm_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size,
 }
 EXPORT_SYMBOL(__arm_ioremap_pfn);
 
-void __iomem * (*arch_ioremap_caller)(unsigned long, size_t,
+void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t,
 				      unsigned int, void *) =
 	__arm_ioremap_caller;
 
 void __iomem *
-__arm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype)
+__arm_ioremap(phys_addr_t phys_addr, size_t size, unsigned int mtype)
 {
 	return arch_ioremap_caller(phys_addr, size, mtype,
 		__builtin_return_address(0));
@@ -387,7 +387,7 @@ EXPORT_SYMBOL(__arm_ioremap);
  * CONFIG_GENERIC_ALLOCATOR for allocating external memory.
  */
 void __iomem *
-__arm_ioremap_exec(unsigned long phys_addr, size_t size, bool cached)
+__arm_ioremap_exec(phys_addr_t phys_addr, size_t size, bool cached)
 {
 	unsigned int mtype;
 
diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c
index d51225f90ae2..31bd77b1d6df 100644
--- a/arch/arm/mm/nommu.c
+++ b/arch/arm/mm/nommu.c
@@ -81,16 +81,16 @@ void __iomem *__arm_ioremap_pfn_caller(unsigned long pfn, unsigned long offset,
 	return __arm_ioremap_pfn(pfn, offset, size, mtype);
 }
 
-void __iomem *__arm_ioremap(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap(phys_addr_t phys_addr, size_t size,
 			    unsigned int mtype)
 {
 	return (void __iomem *)phys_addr;
 }
 EXPORT_SYMBOL(__arm_ioremap);
 
-void __iomem * (*arch_ioremap_caller)(unsigned long, size_t, unsigned int, void *);
+void __iomem * (*arch_ioremap_caller)(phys_addr_t, size_t, unsigned int, void *);
 
-void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
+void __iomem *__arm_ioremap_caller(phys_addr_t phys_addr, size_t size,
 				   unsigned int mtype, void *caller)
 {
 	return __arm_ioremap(phys_addr, size, mtype);

From b2a234ed6417cf88bdbf4b22700e0ae6e08653ce Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sat, 18 May 2013 11:21:36 +0100
Subject: [PATCH 14/31] ARM: 7730/1: DMA-mapping: mark all !DMA_TO_DEVICE pages
 in unmapping as clean

It is common for one sg to include many pages, so mark all these
pages as clean to avoid unnecessary flushing on them in
set_pte_at() or update_mmu_cache().

The patch might improve loading performance of applciation code a bit.

On the below test code to read file(~1GByte size) from usb mass storage
disk to buffer created with mmap(PROT_READ | PROT_EXEC) on
Pandaboard, average ~1% improvement can be observed with the patch on
10 times test.

unsigned int sum = 0;
static unsigned long tv_diff(struct timeval *tv1, struct timeval *tv2)
{
	return (tv2->tv_sec - tv1->tv_sec) * 1000000 + (tv2->tv_usec - tv1->tv_usec);
}
int main(int argc, char *argv[])
{
	char *mbuffer;
	int fd;
	int i;
	unsigned long page_size, size;
	struct stat stat;
	struct timeval t1, t2;

	page_size = getpagesize();
	fd = open(argv[1], O_RDONLY);
	assert(fd >= 0);

	fstat(fd, &stat);
	size = stat.st_size;
	printf("%s: file %s, file size %lu, page size %lun", argv[0],
	        read_filename, size, page_size);

	gettimeofday(&t1, NULL);
	mbuffer = mmap(NULL, size, PROT_READ | PROT_EXEC, MAP_SHARED, fd, 0);
	for (i = 0 ; i < size ; i += page_size)
	        sum += mbuffer[i];
	munmap(mbuffer, page_size);
	gettimeofday(&t2, NULL);
	printf("tread mmaped time: %luusn", tv_diff(&t1, &t2));

	close(fd);
}

Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/dma-mapping.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index ef3e0f3aac96..c038ec0738ac 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -880,10 +880,24 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off,
 	dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
 
 	/*
-	 * Mark the D-cache clean for this page to avoid extra flushing.
+	 * Mark the D-cache clean for these pages to avoid extra flushing.
 	 */
-	if (dir != DMA_TO_DEVICE && off == 0 && size >= PAGE_SIZE)
-		set_bit(PG_dcache_clean, &page->flags);
+	if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) {
+		unsigned long pfn;
+		size_t left = size;
+
+		pfn = page_to_pfn(page) + off / PAGE_SIZE;
+		off %= PAGE_SIZE;
+		if (off) {
+			pfn++;
+			left -= PAGE_SIZE - off;
+		}
+		while (left >= PAGE_SIZE) {
+			page = pfn_to_page(pfn++);
+			set_bit(PG_dcache_clean, &page->flags);
+			left -= PAGE_SIZE;
+		}
+	}
 }
 
 /**

From 81f28946a8b066d484ca8935ff545d94ce730aa3 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Wed, 5 Jun 2013 02:44:00 +0100
Subject: [PATCH 15/31] ARM: 7746/1: mm: lazy cache flushing on non-mapped
 pages

Currently flush_dcache_page() thinks pages as non-mapped if
mapping_mapped(mapping) return false. This approach is very
coase:
	- mmap on part of file may cause all pages backed on
	the file being thought as mmaped

	- file-backed pages aren't mapped into user space actually
	if the memory mmaped on the file isn't accessed

This patch uses page_mapped() to decide if the page has been
mapped.

From the attached test code, I find there is much performance
improvement(>25%) when accessing page caches via read under this
situations, so memcpy benefits a lot from not flushing cache
under this situation.

No.   read time without the patch	No. read time with the patch
================================================================
No. 0, time  22615636 us		No. 0, time  22014717 us
No. 1, time  4387851 us 		No. 1, time  3113184 us
No. 2, time  4276535 us 		No. 2, time  3005244 us
No. 3, time  4259821 us 		No. 3, time  3001565 us
No. 4, time  4263811 us 		No. 4, time  3002748 us
No. 5, time  4258486 us 		No. 5, time  3004104 us
No. 6, time  4253009 us 		No. 6, time  3002188 us
No. 7, time  4262809 us 		No. 7, time  2998196 us
No. 8, time  4264525 us 		No. 8, time  3007255 us
No. 9, time  4267795 us 		No. 9, time  3005094 us

1), No.0. is to read the file from storage device, and others are
to read the file from page caches basically.
2), file size is 512M, and is on ext4 over usb mass storage.
3), the test is done on Pandaboard.

unsigned int  sum = 0;
unsigned long sum_val = 0;

static unsigned long tv_diff(struct timeval *tv1, struct timeval *tv2)
{
	return (tv2->tv_sec - tv1->tv_sec) * 1000000 +
		(tv2->tv_usec - tv1->tv_usec);
}

int main(int argc, char *argv[])
{
	char *mbuf, fbuf;
	int fd;
	int i;
	unsigned long page_size, size;
	struct stat stat;
	struct timeval t1, t2;
	unsigned char *rbuf = malloc(32 * page_size);

	if (!rbuf) {
		printf("	%sn", "malloc failed");
		exit(-1);
	}

	page_size = getpagesize();
	fd = open(argv[1], O_RDWR);
	assert(fd >= 0);

	fstat(fd, &stat);
	size = stat.st_size;
	printf("%s: file %s, size %lu, page size %lun",
		argv[0],
		argv[1], size, page_size);

	gettimeofday(&t1, NULL);
	mbuf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (!mbuf) {
		printf("	%sn", "mmap failed");
		exit(-1);
	}

	for (i = 0 ; i < size ; i += (page_size * 32)) {
		int rcnt;
		lseek(fd, i, SEEK_SET);
		rcnt = read(fd, rbuf, page_size * 32);
		if (rcnt != page_size * 32) {
			printf("%s: read faildn", __func__);
			exit(-1);
		}
	}
	free(rbuf);
	munmap(mbuf, size);
	gettimeofday(&t2, NULL);
	printf("tread mmaped time: %luusn", tv_diff(&t1, &t2));

	close(fd);
}

Cc: Michel Lespinasse <walken@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/flush.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 0d473cce501c..2ff66eb98ff0 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -287,7 +287,7 @@ void flush_dcache_page(struct page *page)
 	mapping = page_mapping(page);
 
 	if (!cache_ops_need_broadcast() &&
-	    mapping && !mapping_mapped(mapping))
+	    mapping && !page_mapped(page))
 		clear_bit(PG_dcache_clean, &page->flags);
 	else {
 		__flush_dcache_page(mapping, page);

From 0305ffd7536270befc3e772bb978a6e3c6d37150 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 31 May 2013 05:17:29 +0100
Subject: [PATCH 16/31] ARM: 7740/1: MCPM: prettify debug output

The CPU and cluster numbers (MPIDR affinity levels 0 and 1) may have
at most 8 bits each.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/mcpm_head.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 8178705c4b24..80f033614a1f 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -32,11 +32,11 @@
 1901:	adr	r0, 1902b
 	bl	printascii
 	mov	r0, r9
-	bl	printhex8
+	bl	printhex2
 	adr	r0, 1903b
 	bl	printascii
 	mov	r0, r10
-	bl	printhex8
+	bl	printhex2
 	adr	r0, 1904b
 	bl	printascii
 #endif

From 6c93dd438aad39c60630dbe47fde8193c845f8b8 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Fri, 31 May 2013 05:24:03 +0100
Subject: [PATCH 17/31] ARM: 7741/1: mcpm_platsmp.c: remove empty smp_init_cpus
 method

The smp_init_cpus method in the smp_operations structure is optional
and can be omitted entirely.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/common/mcpm_platsmp.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c
index 3caed0db6986..510e5b13aa2e 100644
--- a/arch/arm/common/mcpm_platsmp.c
+++ b/arch/arm/common/mcpm_platsmp.c
@@ -19,10 +19,6 @@
 #include <asm/smp.h>
 #include <asm/smp_plat.h>
 
-static void __init simple_smp_init_cpus(void)
-{
-}
-
 static int __cpuinit mcpm_boot_secondary(unsigned int cpu, struct task_struct *idle)
 {
 	unsigned int mpidr, pcpu, pcluster, ret;
@@ -74,7 +70,6 @@ static void mcpm_cpu_die(unsigned int cpu)
 #endif
 
 static struct smp_operations __initdata mcpm_smp_ops = {
-	.smp_init_cpus		= simple_smp_init_cpus,
 	.smp_boot_secondary	= mcpm_boot_secondary,
 	.smp_secondary_init	= mcpm_secondary_init,
 #ifdef CONFIG_HOTPLUG_CPU

From 1aa2b3b7a6c4f3dbd3671171113a20e6a6190e3b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 5 Jun 2013 11:25:13 +0100
Subject: [PATCH 18/31] ARM: 7748/1: oabi: handle faults when loading swi
 instruction from userspace

Running an OABI_COMPAT kernel on an SMP platform can lead to fun and
games with page aging.

If one CPU issues a swi instruction immediately before another CPU
decides to mkold the page containing the swi instruction, then we will
fault attempting to load the instruction during the vector_swi handler
in order to retrieve its immediate field. Since this fault is not
currently dealt with by our exception tables, this results in a panic:

  Unable to handle kernel paging request at virtual address 4020841c
  pgd = c490c000
  [4020841c] *pgd=84451831, *pte=bf05859d, *ppte=00000000
  Internal error: Oops: 17 [#1] PREEMPT SMP ARM
  Modules linked in: hid_sony(O)
  CPU: 1    Tainted: G        W  O  (3.4.0-perf-gf496dca-01162-gcbcc62b #1)
  PC is at vector_swi+0x28/0x88
  LR is at 0x40208420

This patch wraps all of the swi instruction loads with the USER macro
and provides a shared exception table entry which simply rewinds the
saved user PC and returns from the system call (without setting tbl, so
there's no worries with tracing or syscall restarting). Returning to
userspace will re-enter the page fault handler, from where we will
probably send SIGSEGV to the current task.

Reported-by: Wang, Yalin <yalin.wang@sonymobile.com>
Reviewed-by: Nicolas Pitre <nico@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/entry-common.S | 42 +++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
index bc5bc0a97131..4bc816a74a2e 100644
--- a/arch/arm/kernel/entry-common.S
+++ b/arch/arm/kernel/entry-common.S
@@ -362,6 +362,16 @@ ENTRY(vector_swi)
 	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
 	zero_fp
 
+#ifdef CONFIG_ALIGNMENT_TRAP
+	ldr	ip, __cr_alignment
+	ldr	ip, [ip]
+	mcr	p15, 0, ip, c1, c0		@ update control register
+#endif
+
+	enable_irq
+	ct_user_exit
+	get_thread_info tsk
+
 	/*
 	 * Get the system call number.
 	 */
@@ -375,9 +385,9 @@ ENTRY(vector_swi)
 #ifdef CONFIG_ARM_THUMB
 	tst	r8, #PSR_T_BIT
 	movne	r10, #0				@ no thumb OABI emulation
-	ldreq	r10, [lr, #-4]			@ get SWI instruction
+ USER(	ldreq	r10, [lr, #-4]		)	@ get SWI instruction
 #else
-	ldr	r10, [lr, #-4]			@ get SWI instruction
+ USER(	ldr	r10, [lr, #-4]		)	@ get SWI instruction
 #endif
 #ifdef CONFIG_CPU_ENDIAN_BE8
 	rev	r10, r10			@ little endian instruction
@@ -392,22 +402,13 @@ ENTRY(vector_swi)
 	/* Legacy ABI only, possibly thumb mode. */
 	tst	r8, #PSR_T_BIT			@ this is SPSR from save_user_regs
 	addne	scno, r7, #__NR_SYSCALL_BASE	@ put OS number in
-	ldreq	scno, [lr, #-4]
+ USER(	ldreq	scno, [lr, #-4]		)
 
 #else
 	/* Legacy ABI only. */
-	ldr	scno, [lr, #-4]			@ get SWI instruction
+ USER(	ldr	scno, [lr, #-4]		)	@ get SWI instruction
 #endif
 
-#ifdef CONFIG_ALIGNMENT_TRAP
-	ldr	ip, __cr_alignment
-	ldr	ip, [ip]
-	mcr	p15, 0, ip, c1, c0		@ update control register
-#endif
-	enable_irq
-	ct_user_exit
-
-	get_thread_info tsk
 	adr	tbl, sys_call_table		@ load syscall table pointer
 
 #if defined(CONFIG_OABI_COMPAT)
@@ -442,6 +443,21 @@ local_restart:
 	eor	r0, scno, #__NR_SYSCALL_BASE	@ put OS number back
 	bcs	arm_syscall	
 	b	sys_ni_syscall			@ not private func
+
+#if defined(CONFIG_OABI_COMPAT) || !defined(CONFIG_AEABI)
+	/*
+	 * We failed to handle a fault trying to access the page
+	 * containing the swi instruction, but we're not really in a
+	 * position to return -EFAULT. Instead, return back to the
+	 * instruction and re-enter the user fault handling path trying
+	 * to page it in. This will likely result in sending SEGV to the
+	 * current task.
+	 */
+9001:
+	sub	lr, lr, #4
+	str	lr, [sp, #S_PC]
+	b	ret_fast_syscall
+#endif
 ENDPROC(vector_swi)
 
 	/*

From 15e7e5c1ebf556cd620c9b091e121091ac760f6d Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 5 Jun 2013 11:27:26 +0100
Subject: [PATCH 19/31] ARM: 7749/1: spinlock: retry trylock operation if strex
 fails on free lock

An exclusive store instruction may fail for reasons other than lock
contention (e.g. a cache eviction during the critical section) so, in
line with other architectures using similar exclusive instructions
(alpha, mips, powerpc), retry the trylock operation if the lock appears
to be free but the strex reported failure.

Reported-by: Tony Thompson <anthony.thompson@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/spinlock.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 6220e9fdf4c7..f8b8965666e9 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -97,19 +97,22 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
 {
-	unsigned long tmp;
+	unsigned long contended, res;
 	u32 slock;
 
-	__asm__ __volatile__(
-"	ldrex	%0, [%2]\n"
-"	subs	%1, %0, %0, ror #16\n"
-"	addeq	%0, %0, %3\n"
-"	strexeq	%1, %0, [%2]"
-	: "=&r" (slock), "=&r" (tmp)
-	: "r" (&lock->slock), "I" (1 << TICKET_SHIFT)
-	: "cc");
+	do {
+		__asm__ __volatile__(
+		"	ldrex	%0, [%3]\n"
+		"	mov	%2, #0\n"
+		"	subs	%1, %0, %0, ror #16\n"
+		"	addeq	%0, %0, %4\n"
+		"	strexeq	%2, %0, [%3]"
+		: "=&r" (slock), "=&r" (contended), "=r" (res)
+		: "r" (&lock->slock), "I" (1 << TICKET_SHIFT)
+		: "cc");
+	} while (res);
 
-	if (tmp == 0) {
+	if (!contended) {
 		smp_mb();
 		return 1;
 	} else {

From 2874865c1271cc8e8b663804e5de4bc0c36273e1 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nicolas.pitre@linaro.org>
Date: Thu, 6 Jun 2013 05:13:48 +0100
Subject: [PATCH 20/31] ARM: 7751/1: zImage: don't overwrite ourself with a
 page table

When zImage is loaded into RAM at a low address but TEXT_OFFSET
is set higher, we risk overwriting ourself with the page table
needed to turn on the cache as it is located relative to the relocation
address.  Let's defer the cache setup after relocation in that case.

Signed-off-by: Nicolas Pitre <nico@linaro.org>
Reported-by: Stephen Boyd <sboyd@codeurora.org>
Tested-by: Stephen Boyd <sboyd@codeurora.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/boot/compressed/head.S | 35 ++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index fe4d9c3ad761..8e0d0ada62df 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -182,7 +182,19 @@ not_angel:
 		ldr	r4, =zreladdr
 #endif
 
-		bl	cache_on
+		/*
+		 * Set up a page table only if it won't overwrite ourself.
+		 * That means r4 < pc && r4 - 16k page directory > &_end.
+		 * Given that r4 > &_end is most unfrequent, we add a rough
+		 * additional 1MB of room for a possible appended DTB.
+		 */
+		mov	r0, pc
+		cmp	r0, r4
+		ldrcc	r0, LC0+32
+		addcc	r0, r0, pc
+		cmpcc	r4, r0
+		orrcc	r4, r4, #1		@ remember we skipped cache_on
+		blcs	cache_on
 
 restart:	adr	r0, LC0
 		ldmia	r0, {r1, r2, r3, r6, r10, r11, r12}
@@ -228,7 +240,7 @@ restart:	adr	r0, LC0
  *   r0  = delta
  *   r2  = BSS start
  *   r3  = BSS end
- *   r4  = final kernel address
+ *   r4  = final kernel address (possibly with LSB set)
  *   r5  = appended dtb size (still unknown)
  *   r6  = _edata
  *   r7  = architecture ID
@@ -276,6 +288,7 @@ restart:	adr	r0, LC0
 		 */
 		cmp	r0, #1
 		sub	r0, r4, #TEXT_OFFSET
+		bic	r0, r0, #1
 		add	r0, r0, #0x100
 		mov	r1, r6
 		sub	r2, sp, r6
@@ -322,12 +335,13 @@ dtb_check_done:
 
 /*
  * Check to see if we will overwrite ourselves.
- *   r4  = final kernel address
+ *   r4  = final kernel address (possibly with LSB set)
  *   r9  = size of decompressed image
  *   r10 = end of this image, including  bss/stack/malloc space if non XIP
  * We basically want:
  *   r4 - 16k page directory >= r10 -> OK
  *   r4 + image length <= address of wont_overwrite -> OK
+ * Note: the possible LSB in r4 is harmless here.
  */
 		add	r10, r10, #16384
 		cmp	r4, r10
@@ -389,7 +403,8 @@ dtb_check_done:
 		add	sp, sp, r6
 #endif
 
-		bl	cache_clean_flush
+		tst	r4, #1
+		bleq	cache_clean_flush
 
 		adr	r0, BSYM(restart)
 		add	r0, r0, r6
@@ -401,7 +416,7 @@ wont_overwrite:
  *   r0  = delta
  *   r2  = BSS start
  *   r3  = BSS end
- *   r4  = kernel execution address
+ *   r4  = kernel execution address (possibly with LSB set)
  *   r5  = appended dtb size (0 if not present)
  *   r7  = architecture ID
  *   r8  = atags pointer
@@ -464,6 +479,15 @@ not_relocated:	mov	r0, #0
 		cmp	r2, r3
 		blo	1b
 
+		/*
+		 * Did we skip the cache setup earlier?
+		 * That is indicated by the LSB in r4.
+		 * Do it now if so.
+		 */
+		tst	r4, #1
+		bic	r4, r4, #1
+		blne	cache_on
+
 /*
  * The C runtime environment should now be setup sufficiently.
  * Set up some pointers, and start decompressing.
@@ -512,6 +536,7 @@ LC0:		.word	LC0			@ r1
 		.word	_got_start		@ r11
 		.word	_got_end		@ ip
 		.word	.L_user_stack_end	@ sp
+		.word	_end - restart + 16384 + 1024*1024
 		.size	LC0, . - LC0
 
 #ifdef CONFIG_ARCH_RPC

From 621a0147d5c921f4cc33636ccd0602ad5d7cbfbc Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 12 Jun 2013 12:25:56 +0100
Subject: [PATCH 21/31] ARM: 7757/1: mm: don't flush icache in switch_mm with
 hardware broadcasting

When scheduling an mm on a CPU where it hasn't previously been used, we
flush the icache on that CPU so that any code loaded previously on
a different core can be safely executed.

For cores with hardware broadcasting of cache maintenance operations,
this is clearly unnecessary, since the inner-shareable invalidation in
__sync_icache_dcache will affect all CPUs.

This patch conditionalises the icache flush in switch_mm based on
cache_ops_need_broadcast().

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Albin Tonnerre <albin.tonnerre@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/mmu_context.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index a7b85e0d0cc1..2a45c33ebdc8 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -18,6 +18,7 @@
 #include <asm/cacheflush.h>
 #include <asm/cachetype.h>
 #include <asm/proc-fns.h>
+#include <asm/smp_plat.h>
 #include <asm-generic/mm_hooks.h>
 
 void __check_vmalloc_seq(struct mm_struct *mm);
@@ -98,12 +99,16 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #ifdef CONFIG_MMU
 	unsigned int cpu = smp_processor_id();
 
-#ifdef CONFIG_SMP
-	/* check for possible thread migration */
-	if (!cpumask_empty(mm_cpumask(next)) &&
+	/*
+	 * __sync_icache_dcache doesn't broadcast the I-cache invalidation,
+	 * so check for possible thread migration and invalidate the I-cache
+	 * if we're new to this CPU.
+	 */
+	if (cache_ops_need_broadcast() &&
+	    !cpumask_empty(mm_cpumask(next)) &&
 	    !cpumask_test_cpu(cpu, mm_cpumask(next)))
 		__flush_icache_all();
-#endif
+
 	if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) {
 		check_and_switch_context(next, tsk);
 		if (cache_is_vivt())

From 4a1b573346ee0d64d95beb78d49a5bbb574e6c6a Mon Sep 17 00:00:00 2001
From: Eduardo Valentin <eduardo.valentin@ti.com>
Date: Thu, 13 Jun 2013 22:58:52 +0100
Subject: [PATCH 22/31] ARM: 7758/1: introduce config HAS_BANDGAP

Bandgap is a device used to measure temperature on electronic
equipments.  It is widely used in digital integrated circuits.  It is
based on the dependency between silicon voltage and temperature.

This patch introduce HAS_BANDGAP config entry.  This config is a boolean
value so that arch code can flag if they feature a bandgap device.

This config entry follows the same idea behind ARCH_HAS_CPUFREQ.

Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-omap@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Fabio Stevam <festevam@gmail.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Acked-by: Amit Daniel Kachhap <amit.daniel@samsung.com>
Signed-off-by: Eduardo Valentin <eduardo.valentin@ti.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/Kconfig            | 3 +++
 arch/arm/mach-omap2/Kconfig | 1 +
 2 files changed, 4 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d423d58f938d..bcbdec95b506 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -174,6 +174,9 @@ config ARCH_HAS_CPUFREQ
 	  and that the relevant menu configurations are displayed for
 	  it.
 
+config ARCH_HAS_BANDGAP
+	bool
+
 config GENERIC_HWEIGHT
 	bool
 	default y
diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig
index f49cd51e162a..8620ab52a4de 100644
--- a/arch/arm/mach-omap2/Kconfig
+++ b/arch/arm/mach-omap2/Kconfig
@@ -4,6 +4,7 @@ config ARCH_OMAP
 config ARCH_OMAP2PLUS
 	bool "TI OMAP2/3/4/5 SoCs with device tree support" if (ARCH_MULTI_V6 || ARCH_MULTI_V7)
 	select ARCH_HAS_CPUFREQ
+	select ARCH_HAS_BANDGAP
 	select ARCH_HAS_HOLES_MEMORYMODEL
 	select ARCH_OMAP
 	select ARCH_REQUIRE_GPIOLIB

From a4780adeefd042482f624f5e0d577bf9cdcbb760 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Hentschel?= <nerv@dawncrow.de>
Date: Tue, 18 Jun 2013 23:23:26 +0100
Subject: [PATCH 23/31] ARM: 7735/2: Preserve the user r/w register TPIDRURW on
 context switch and fork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since commit 6a1c53124aa1 the user writeable TLS register was zeroed to
prevent it from being used as a covert channel between two tasks.

There are more and more applications coming to Windows RT,
Wine could support them, but mostly they expect to have
the thread environment block (TEB) in TPIDRURW.

This patch preserves that register per thread instead of clearing it.
Unlike the TPIDRURO, which is already switched, the TPIDRURW
can be updated from userspace so needs careful treatment in the case that we
modify TPIDRURW and call fork(). To avoid this we must always read
TPIDRURW in copy_thread.

Signed-off-by: André Hentschel <nerv@dawncrow.de>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Jonathan Austin <jonathan.austin@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/thread_info.h |  2 +-
 arch/arm/include/asm/tls.h         | 40 ++++++++++++++++++++----------
 arch/arm/kernel/entry-armv.S       |  5 ++--
 arch/arm/kernel/process.c          |  4 ++-
 arch/arm/kernel/ptrace.c           |  2 +-
 arch/arm/kernel/traps.c            |  4 +--
 6 files changed, 37 insertions(+), 20 deletions(-)

diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 1995d1a84060..214d4158089a 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -58,7 +58,7 @@ struct thread_info {
 	struct cpu_context_save	cpu_context;	/* cpu context */
 	__u32			syscall;	/* syscall number */
 	__u8			used_cp[16];	/* thread used copro */
-	unsigned long		tp_value;
+	unsigned long		tp_value[2];	/* TLS registers */
 #ifdef CONFIG_CRUNCH
 	struct crunch_state	crunchstate;
 #endif
diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 73409e6c0251..83259b873333 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -2,27 +2,30 @@
 #define __ASMARM_TLS_H
 
 #ifdef __ASSEMBLY__
-	.macro set_tls_none, tp, tmp1, tmp2
+#include <asm/asm-offsets.h>
+	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro set_tls_v6k, tp, tmp1, tmp2
+	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
-	mov	\tmp1, #0
-	mcr	p15, 0, \tmp1, c13, c0, 2	@ clear user r/w TLS register
+	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
+	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro set_tls_v6, tp, tmp1, tmp2
+	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
-	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
-	movne	\tmp1, #0
-	mcrne	p15, 0, \tmp1, c13, c0, 2	@ clear user r/w TLS register
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
+	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
+	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
+	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro set_tls_software, tp, tmp1, tmp2
+	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
@@ -31,19 +34,30 @@
 #ifdef CONFIG_TLS_REG_EMUL
 #define tls_emu		1
 #define has_tls_reg		1
-#define set_tls		set_tls_none
+#define switch_tls	switch_tls_none
 #elif defined(CONFIG_CPU_V6)
 #define tls_emu		0
 #define has_tls_reg		(elf_hwcap & HWCAP_TLS)
-#define set_tls		set_tls_v6
+#define switch_tls	switch_tls_v6
 #elif defined(CONFIG_CPU_32v6K)
 #define tls_emu		0
 #define has_tls_reg		1
-#define set_tls		set_tls_v6k
+#define switch_tls	switch_tls_v6k
 #else
 #define tls_emu		0
 #define has_tls_reg		0
-#define set_tls		set_tls_software
+#define switch_tls	switch_tls_software
 #endif
 
+#ifndef __ASSEMBLY__
+static inline unsigned long get_tpuser(void)
+{
+	unsigned long reg = 0;
+
+	if (has_tls_reg && !tls_emu)
+		__asm__("mrc p15, 0, %0, c13, c0, 2" : "=r" (reg));
+
+	return reg;
+}
+#endif
 #endif	/* __ASMARM_TLS_H */
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 582b405befc5..a39cfc2a1f90 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -685,15 +685,16 @@ ENTRY(__switch_to)
  UNWIND(.fnstart	)
  UNWIND(.cantunwind	)
 	add	ip, r1, #TI_CPU_SAVE
-	ldr	r3, [r2, #TI_TP_VALUE]
  ARM(	stmia	ip!, {r4 - sl, fp, sp, lr} )	@ Store most regs on stack
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
+	ldr	r4, [r2, #TI_TP_VALUE]
+	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	set_tls	r3, r4, r5
+	switch_tls r1, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index f21970316836..087064148ebf 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -39,6 +39,7 @@
 #include <asm/thread_notify.h>
 #include <asm/stacktrace.h>
 #include <asm/mach/time.h>
+#include <asm/tls.h>
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 #include <linux/stackprotector.h>
@@ -343,7 +344,8 @@ copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	clear_ptrace_hw_breakpoint(p);
 
 	if (clone_flags & CLONE_SETTLS)
-		thread->tp_value = childregs->ARM_r3;
+		thread->tp_value[0] = childregs->ARM_r3;
+	thread->tp_value[1] = get_tpuser();
 
 	thread_notify(THREAD_NOTIFY_COPY, thread);
 
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 03deeffd9f6d..2bc1514d6dbe 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -849,7 +849,7 @@ long arch_ptrace(struct task_struct *child, long request,
 #endif
 
 		case PTRACE_GET_THREAD_AREA:
-			ret = put_user(task_thread_info(child)->tp_value,
+			ret = put_user(task_thread_info(child)->tp_value[0],
 				       datap);
 			break;
 
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 18b32e8e4497..517bfd4da1c9 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -581,7 +581,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		return regs->ARM_r0;
 
 	case NR(set_tls):
-		thread->tp_value = regs->ARM_r0;
+		thread->tp_value[0] = regs->ARM_r0;
 		if (tls_emu)
 			return 0;
 		if (has_tls_reg) {
@@ -699,7 +699,7 @@ static int get_tp_trap(struct pt_regs *regs, unsigned int instr)
 	int reg = (instr >> 12) & 15;
 	if (reg == 15)
 		return 1;
-	regs->uregs[reg] = current_thread_info()->tp_value;
+	regs->uregs[reg] = current_thread_info()->tp_value[0];
 	regs->ARM_pc += 4;
 	return 0;
 }

From c5f927a6f62196226915f12194c9d0df4e2210d7 Mon Sep 17 00:00:00 2001
From: Jed Davis <jld@mozilla.com>
Date: Thu, 20 Jun 2013 10:16:29 +0100
Subject: [PATCH 24/31] ARM: 7765/1: perf: Record the user-mode PC in the call
 chain.

With this change, we no longer lose the innermost entry in the user-mode
part of the call chain.  See also the x86 port, which includes the ip.

It's possible to partially work around this problem by post-processing
the data to use the PERF_SAMPLE_IP value, but this works only if the CPU
wasn't in the kernel when the sample was taken.

Cc: <stable@vger.kernel.org>
Signed-off-by: Jed Davis <jld@mozilla.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/perf_event.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 8c3094d0f7b7..d9f5cd4e533f 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -569,6 +569,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 	}
 
+	perf_callchain_store(entry, regs->ARM_pc);
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;
 
 	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&

From 8121cf312a1908f67173ec3773da1688e04437e7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Thu, 20 Jun 2013 14:02:21 +0100
Subject: [PATCH 25/31] ARM: 7766/1: versatile: don't mark pen as __INIT

When booting fewer cores than are physically present on a versatile
platform (e.g. when passing maxcpus=N on the command line), some
secondary cores may remain in the holding pen, which is marked __INIT,
as each CPU's gic cpumask is initialised to 0xff, and thus an IPI to any
CPU will wake up *all* secondaries. This behaviour is crucial to the GIC
cpumask self-discovery. Late in the boot process, the memory comprising
the holding pen will be released to the kernel for more general use, and
may be overwritten with arbitrary data, which can cause the held
secondaries to start behaving unpredictably. This can lead to all manner
of odd behaviour from the kernel.

As preventing cpus from entering the pen would require invasive changes
to the GIC driver and to existing dts used in the wild, we instead
remove the __INIT marker from the pen, keeping it around and leaving the
unused secondary CPUs dormant.

Link: http://lists.infradead.org/pipermail/linux-arm-kernel/2013-June/175039.html

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Pawel Moll <pawel.moll@arm.com>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/plat-versatile/headsmp.S | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm/plat-versatile/headsmp.S b/arch/arm/plat-versatile/headsmp.S
index b178d44e9eaa..2677bc3762d7 100644
--- a/arch/arm/plat-versatile/headsmp.S
+++ b/arch/arm/plat-versatile/headsmp.S
@@ -11,8 +11,6 @@
 #include <linux/linkage.h>
 #include <linux/init.h>
 
-	__INIT
-
 /*
  * Realview/Versatile Express specific entry point for secondary CPUs.
  * This provides a "holding pen" into which all secondary cores are held

From ae120d9edfe96628f03d87634acda0bfa7110632 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <Marc.Zyngier@arm.com>
Date: Fri, 21 Jun 2013 12:06:19 +0100
Subject: [PATCH 26/31] ARM: 7767/1: let the ASID allocator handle suspended
 animation

When a CPU is running a process, the ASID for that process is
held in a per-CPU variable (the "active ASIDs" array). When
the ASID allocator handles a rollover, it copies the active
ASIDs into a "reserved ASIDs" array to ensure that a process
currently running on another CPU will continue to run unaffected.
The active array is zero-ed to indicate that a rollover occurred.

Because of this mechanism, a reserved ASID is only remembered for
a single rollover. A subsequent rollover will completely refill
the reserved ASIDs array.

In a severely oversubscribed environment where a CPU can be
prevented from running for extended periods of time (think virtual
machines), the above has a horrible side effect:

[P{a} denotes process P running with ASID a]

	CPU-0		CPU-1

	A{x}				[active = <x 0>]

	[suspended]	runs B{y}	[active = <x y>]

					[rollover:
					 active = <0 0>
					 reserved = <x y>]

			runs B{y}	[active = <0 y>
					 reserved = <x y>]

					[rollover:
					 active = <0 0>
					 reserved = <0 y>]

			runs C{x}	[active = <0 x>]

	[resumes]

	runs A{x}

At that stage, both A and C have the same ASID, with deadly
consequences.

The fix is to preserve reserved ASIDs across rollovers if
the CPU doesn't have an active ASID when the rollover occurs.

Cc: <stable@vger.kernel.org> # 3.9
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Catalin Carinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/context.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 2ac37372ef52..8e12fcbb2c63 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -128,6 +128,15 @@ static void flush_context(unsigned int cpu)
 			asid = 0;
 		} else {
 			asid = atomic64_xchg(&per_cpu(active_asids, i), 0);
+			/*
+			 * If this CPU has already been through a
+			 * rollover, but hasn't run another task in
+			 * the meantime, we must preserve its reserved
+			 * ASID, as this is the only trace we have of
+			 * the process it is still running.
+			 */
+			if (asid == 0)
+				asid = per_cpu(reserved_asids, i);
 			__set_bit(ASID_TO_IDX(asid), asid_map);
 		}
 		per_cpu(reserved_asids, i) = asid;

From b8e4a4740fa2b17c0a447b3ab783b3dc10702e27 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <Marc.Zyngier@arm.com>
Date: Fri, 21 Jun 2013 12:06:55 +0100
Subject: [PATCH 27/31] ARM: 7768/1: prevent risks of out-of-bound access in
 ASID allocator

On a CPU that never ran anything, both the active and reserved ASID
fields are set to zero. In this case the ASID_TO_IDX() macro will
return -1, which is not a very useful value to index a bitmap.

Instead of trying to offset the ASID so that ASID #1 is actually
bit 0 in the asid_map bitmap, just always ignore bit 0 and start
the search from bit 1. This makes the code a bit more readable,
and without risk of OoB access.

Cc: <stable@vger.kernel.org> # 3.9
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/mm/context.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 8e12fcbb2c63..83e09058f96f 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -39,10 +39,7 @@
  * non 64-bit operations.
  */
 #define ASID_FIRST_VERSION	(1ULL << ASID_BITS)
-#define NUM_USER_ASIDS		(ASID_FIRST_VERSION - 1)
-
-#define ASID_TO_IDX(asid)	((asid & ~ASID_MASK) - 1)
-#define IDX_TO_ASID(idx)	((idx + 1) & ~ASID_MASK)
+#define NUM_USER_ASIDS		ASID_FIRST_VERSION
 
 static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 static atomic64_t asid_generation = ATOMIC64_INIT(ASID_FIRST_VERSION);
@@ -137,7 +134,7 @@ static void flush_context(unsigned int cpu)
 			 */
 			if (asid == 0)
 				asid = per_cpu(reserved_asids, i);
-			__set_bit(ASID_TO_IDX(asid), asid_map);
+			__set_bit(asid & ~ASID_MASK, asid_map);
 		}
 		per_cpu(reserved_asids, i) = asid;
 	}
@@ -176,17 +173,19 @@ static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 		/*
 		 * Allocate a free ASID. If we can't find one, take a
 		 * note of the currently active ASIDs and mark the TLBs
-		 * as requiring flushes.
+		 * as requiring flushes. We always count from ASID #1,
+		 * as we reserve ASID #0 to switch via TTBR0 and indicate
+		 * rollover events.
 		 */
-		asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+		asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
 		if (asid == NUM_USER_ASIDS) {
 			generation = atomic64_add_return(ASID_FIRST_VERSION,
 							 &asid_generation);
 			flush_context(cpu);
-			asid = find_first_zero_bit(asid_map, NUM_USER_ASIDS);
+			asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
 		}
 		__set_bit(asid, asid_map);
-		asid = generation | IDX_TO_ASID(asid);
+		asid |= generation;
 		cpumask_clear(mm_cpumask(mm));
 	}
 

From 0d0752bca1f9a91fb646647aa4abbb21156f316c Mon Sep 17 00:00:00 2001
From: Marc Zyngier <Marc.Zyngier@arm.com>
Date: Fri, 21 Jun 2013 12:07:27 +0100
Subject: [PATCH 28/31] ARM: 7769/1: Cortex-A15: fix erratum 798181
 implementation

Looking into the active_asids array is not enough, as we also need
to look into the reserved_asids array (they both represent processes
that are currently running).

Also, not holding the ASID allocator lock is racy, as another CPU
could schedule that process and trigger a rollover, making the erratum
workaround miss an IPI.

Exposing this outside of context.c is a little ugly on the side, so
let's define a new entry point that the erratum workaround can call
to obtain the cpumask.

Cc: <stable@vger.kernel.org> # 3.9
Acked-by: Will Deacon <will.deacon@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/mmu_context.h | 10 +++++++++-
 arch/arm/kernel/smp_tlb.c          | 18 ++----------------
 arch/arm/mm/context.c              | 29 ++++++++++++++++++++++++++++-
 3 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index 2a45c33ebdc8..b5792b7fd8d3 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -28,7 +28,15 @@ void __check_vmalloc_seq(struct mm_struct *mm);
 void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk);
 #define init_new_context(tsk,mm)	({ atomic64_set(&mm->context.id, 0); 0; })
 
-DECLARE_PER_CPU(atomic64_t, active_asids);
+#ifdef CONFIG_ARM_ERRATA_798181
+void a15_erratum_get_cpumask(int this_cpu, struct mm_struct *mm,
+			     cpumask_t *mask);
+#else  /* !CONFIG_ARM_ERRATA_798181 */
+static inline void a15_erratum_get_cpumask(int this_cpu, struct mm_struct *mm,
+					   cpumask_t *mask)
+{
+}
+#endif /* CONFIG_ARM_ERRATA_798181 */
 
 #else	/* !CONFIG_CPU_HAS_ASID */
 
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index 9a52a07aa40e..a98b62dca2fa 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -103,7 +103,7 @@ static void broadcast_tlb_a15_erratum(void)
 
 static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
 {
-	int cpu, this_cpu;
+	int this_cpu;
 	cpumask_t mask = { CPU_BITS_NONE };
 
 	if (!erratum_a15_798181())
@@ -111,21 +111,7 @@ static void broadcast_tlb_mm_a15_erratum(struct mm_struct *mm)
 
 	dummy_flush_tlb_a15_erratum();
 	this_cpu = get_cpu();
-	for_each_online_cpu(cpu) {
-		if (cpu == this_cpu)
-			continue;
-		/*
-		 * We only need to send an IPI if the other CPUs are running
-		 * the same ASID as the one being invalidated. There is no
-		 * need for locking around the active_asids check since the
-		 * switch_mm() function has at least one dmb() (as required by
-		 * this workaround) in case a context switch happens on
-		 * another CPU after the condition below.
-		 */
-		if (atomic64_read(&mm->context.id) ==
-		    atomic64_read(&per_cpu(active_asids, cpu)))
-			cpumask_set_cpu(cpu, &mask);
-	}
+	a15_erratum_get_cpumask(this_cpu, mm, &mask);
 	smp_call_function_many(&mask, ipi_flush_tlb_a15_erratum, NULL, 1);
 	put_cpu();
 }
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 83e09058f96f..eeab06ebd06e 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -45,10 +45,37 @@ static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 static atomic64_t asid_generation = ATOMIC64_INIT(ASID_FIRST_VERSION);
 static DECLARE_BITMAP(asid_map, NUM_USER_ASIDS);
 
-DEFINE_PER_CPU(atomic64_t, active_asids);
+static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 static cpumask_t tlb_flush_pending;
 
+#ifdef CONFIG_ARM_ERRATA_798181
+void a15_erratum_get_cpumask(int this_cpu, struct mm_struct *mm,
+			     cpumask_t *mask)
+{
+	int cpu;
+	unsigned long flags;
+	u64 context_id, asid;
+
+	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+	context_id = mm->context.id.counter;
+	for_each_online_cpu(cpu) {
+		if (cpu == this_cpu)
+			continue;
+		/*
+		 * We only need to send an IPI if the other CPUs are
+		 * running the same ASID as the one being invalidated.
+		 */
+		asid = per_cpu(active_asids, cpu).counter;
+		if (asid == 0)
+			asid = per_cpu(reserved_asids, cpu);
+		if (context_id == asid)
+			cpumask_set_cpu(cpu, mask);
+	}
+	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+}
+#endif
+
 #ifdef CONFIG_ARM_LPAE
 static void cpu_set_reserved_ttbr0(void)
 {

From 52c08a9e399739979027ea1e463f5529476da104 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <Marc.Zyngier@arm.com>
Date: Fri, 21 Jun 2013 14:49:43 +0100
Subject: [PATCH 29/31] ARM: 7770/1: remove residual ARMv2 support from
 decompressor

arm26 support in Linux is long gone, yet it left an interresting,
fossilized trace in the decompressor.

Remove it so people won't get confused about what teqp is actually
doing here...

Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/boot/compressed/head.S | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 8e0d0ada62df..15f2f573d480 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -141,7 +141,6 @@ start:
 		mov	r7, r1			@ save architecture ID
 		mov	r8, r2			@ save atags pointer
 
-#ifndef __ARM_ARCH_2__
 		/*
 		 * Booting from Angel - need to enter SVC mode and disable
 		 * FIQs/IRQs (numeric definitions from angel arm.h source).
@@ -157,10 +156,6 @@ not_angel:
 		safe_svcmode_maskall r0
 		msr	spsr_cxsf, r9		@ Save the CPU boot mode in
 						@ SPSR
-#else
-		teqp	pc, #0x0c000003		@ turn off interrupts
-#endif
-
 		/*
 		 * Note that some cache flushing and other stuff may
 		 * be needed here - is there an Angel SWI call for this?

From 81793bab5d7cc0d0962dae47dab3a63ae4bd5265 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Mon, 24 Jun 2013 22:03:50 +0100
Subject: [PATCH 30/31] ARM: 7774/1: Fix dtb dependency to use order-only
 prerequisites

The %.dtb dependency is specified to depend on the PHONY "scripts".
That means that it'll build every time even if the underlying dtb file
hasn't been touched.  Use an order-only prerequisites to fix this.
Also mark "dtbs" as PHONY for correctness.

This was broken in (70b0476 ARM: 7513/1: Make sure dtc is built before
running it).

Reported-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Doug Anderson <dianders@chromium.org>
Acked-by: Olof Johansson <olof@lixom.net>
Reviewed-by: David Brown <davidb@codeaurora.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 47374085befd..10c08092378f 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -289,9 +289,10 @@ zImage Image xipImage bootpImage uImage: vmlinux
 zinstall uinstall install: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $@
 
-%.dtb: scripts
+%.dtb: | scripts
 	$(Q)$(MAKE) $(build)=$(boot)/dts MACHINE=$(MACHINE) $(boot)/dts/$@
 
+PHONY += dtbs
 dtbs: scripts
 	$(Q)$(MAKE) $(build)=$(boot)/dts MACHINE=$(MACHINE) dtbs
 

From 1b21376a737aeaa82320ae014b8e1c2c53cfc479 Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Thu, 27 Jun 2013 04:32:06 +0100
Subject: [PATCH 31/31] ARM: 7777/1: Avoid extra calls to the C compiler

Starting up the C compiler can be a slow operation on some systems.
Though these calls don't individually take a lot of time, they add up.
Rearrange the ARM Makefile a bit to avoid extra calls to the compiler
when they can be easily avoided.

When running with the Chrome OS ARM cross compiler
"armv7a-cros-linux-gnueabi-", this shaved .55 seconds (from 5.31
seconds to 4.76 seconds) off an incremental build of the kernel:
  time make -j32 ARCH=arm CROSS_COMPILE=armv7a-cros-linux-gnueabi-

Thanks to Mike Frysinger for the clean trick to make this work.

Signed-off-by: Doug Anderson <dianders@chromium.org>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/Makefile | 56 ++++++++++++++++++++++++++---------------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 10c08092378f..20559a089442 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -59,37 +59,43 @@ comma = ,
 # Note that GCC does not numerically define an architecture version
 # macro, but instead defines a whole series of macros which makes
 # testing for a specific architecture or later rather impossible.
-arch-$(CONFIG_CPU_32v7)		:=-D__LINUX_ARM_ARCH__=7 $(call cc-option,-march=armv7-a,-march=armv5t -Wa$(comma)-march=armv7-a)
-arch-$(CONFIG_CPU_32v6)		:=-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6,-march=armv5t -Wa$(comma)-march=armv6)
+arch-$(CONFIG_CPU_32v7)		=-D__LINUX_ARM_ARCH__=7 $(call cc-option,-march=armv7-a,-march=armv5t -Wa$(comma)-march=armv7-a)
+arch-$(CONFIG_CPU_32v6)		=-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6,-march=armv5t -Wa$(comma)-march=armv6)
 # Only override the compiler option if ARMv6. The ARMv6K extensions are
 # always available in ARMv7
 ifeq ($(CONFIG_CPU_32v6),y)
-arch-$(CONFIG_CPU_32v6K)	:=-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6k,-march=armv5t -Wa$(comma)-march=armv6k)
+arch-$(CONFIG_CPU_32v6K)	=-D__LINUX_ARM_ARCH__=6 $(call cc-option,-march=armv6k,-march=armv5t -Wa$(comma)-march=armv6k)
 endif
-arch-$(CONFIG_CPU_32v5)		:=-D__LINUX_ARM_ARCH__=5 $(call cc-option,-march=armv5te,-march=armv4t)
-arch-$(CONFIG_CPU_32v4T)	:=-D__LINUX_ARM_ARCH__=4 -march=armv4t
-arch-$(CONFIG_CPU_32v4)		:=-D__LINUX_ARM_ARCH__=4 -march=armv4
-arch-$(CONFIG_CPU_32v3)		:=-D__LINUX_ARM_ARCH__=3 -march=armv3
+arch-$(CONFIG_CPU_32v5)		=-D__LINUX_ARM_ARCH__=5 $(call cc-option,-march=armv5te,-march=armv4t)
+arch-$(CONFIG_CPU_32v4T)	=-D__LINUX_ARM_ARCH__=4 -march=armv4t
+arch-$(CONFIG_CPU_32v4)		=-D__LINUX_ARM_ARCH__=4 -march=armv4
+arch-$(CONFIG_CPU_32v3)		=-D__LINUX_ARM_ARCH__=3 -march=armv3
+
+# Evaluate arch cc-option calls now
+arch-y := $(arch-y)
 
 # This selects how we optimise for the processor.
-tune-$(CONFIG_CPU_ARM7TDMI)	:=-mtune=arm7tdmi
-tune-$(CONFIG_CPU_ARM720T)	:=-mtune=arm7tdmi
-tune-$(CONFIG_CPU_ARM740T)	:=-mtune=arm7tdmi
-tune-$(CONFIG_CPU_ARM9TDMI)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM940T)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM946E)	:=$(call cc-option,-mtune=arm9e,-mtune=arm9tdmi)
-tune-$(CONFIG_CPU_ARM920T)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM922T)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM925T)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_ARM926T)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_FA526)	:=-mtune=arm9tdmi
-tune-$(CONFIG_CPU_SA110)	:=-mtune=strongarm110
-tune-$(CONFIG_CPU_SA1100)	:=-mtune=strongarm1100
-tune-$(CONFIG_CPU_XSCALE)	:=$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
-tune-$(CONFIG_CPU_XSC3)		:=$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
-tune-$(CONFIG_CPU_FEROCEON)	:=$(call cc-option,-mtune=marvell-f,-mtune=xscale)
-tune-$(CONFIG_CPU_V6)		:=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
-tune-$(CONFIG_CPU_V6K)		:=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
+tune-$(CONFIG_CPU_ARM7TDMI)	=-mtune=arm7tdmi
+tune-$(CONFIG_CPU_ARM720T)	=-mtune=arm7tdmi
+tune-$(CONFIG_CPU_ARM740T)	=-mtune=arm7tdmi
+tune-$(CONFIG_CPU_ARM9TDMI)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_ARM940T)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_ARM946E)	=$(call cc-option,-mtune=arm9e,-mtune=arm9tdmi)
+tune-$(CONFIG_CPU_ARM920T)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_ARM922T)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_ARM925T)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_ARM926T)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_FA526)	=-mtune=arm9tdmi
+tune-$(CONFIG_CPU_SA110)	=-mtune=strongarm110
+tune-$(CONFIG_CPU_SA1100)	=-mtune=strongarm1100
+tune-$(CONFIG_CPU_XSCALE)	=$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
+tune-$(CONFIG_CPU_XSC3)		=$(call cc-option,-mtune=xscale,-mtune=strongarm110) -Wa,-mcpu=xscale
+tune-$(CONFIG_CPU_FEROCEON)	=$(call cc-option,-mtune=marvell-f,-mtune=xscale)
+tune-$(CONFIG_CPU_V6)		=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
+tune-$(CONFIG_CPU_V6K)		=$(call cc-option,-mtune=arm1136j-s,-mtune=strongarm)
+
+# Evaluate tune cc-option calls now
+tune-y := $(tune-y)
 
 ifeq ($(CONFIG_AEABI),y)
 CFLAGS_ABI	:=-mabi=aapcs-linux -mno-thumb-interwork