Merge "Merge android-4.19-stable.157 (8ee67bc
) into msm-4.19"
This commit is contained in:
commit
58fada0da7
662 changed files with 51036 additions and 5409 deletions
|
@@ -566,7 +566,7 @@
 			loops can be debugged more effectively on production
 			systems.

-	clearcpuid=BITNUM [X86]
+	clearcpuid=BITNUM[,BITNUM...] [X86]
 			Disable CPUID feature X for the kernel. See
 			arch/x86/include/asm/cpufeatures.h for the valid bit
 			numbers. Note the Linux specific bits are not necessarily
@@ -5302,6 +5302,14 @@
 			with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
 			Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.

+	xen.event_eoi_delay=	[XEN]
+			How long to delay EOI handling in case of event
+			storms (jiffies). Default is 10.
+
+	xen.event_loop_timeout=	[XEN]
+			After which time (jiffies) the event handling loop
+			should start to delay EOI handling. Default is 2.
+
 	xirc2ps_cs=	[NET,PCMCIA]
 			Format:
 			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
@@ -99,16 +99,20 @@ Coarse and fast_ns access

 Some additional variants exist for more specialized cases:

-.. c:function:: ktime_t ktime_get_coarse_boottime( void )
+.. c:function:: ktime_t ktime_get_coarse( void )
+	ktime_t ktime_get_coarse_boottime( void )
 	ktime_t ktime_get_coarse_real( void )
 	ktime_t ktime_get_coarse_clocktai( void )
 	ktime_t ktime_get_coarse_raw( void )

+.. c:function:: u64 ktime_get_coarse_ns( void )
+	u64 ktime_get_coarse_boottime_ns( void )
+	u64 ktime_get_coarse_real_ns( void )
+	u64 ktime_get_coarse_clocktai_ns( void )
+
 .. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
 	void ktime_get_coarse_boottime_ts64( struct timespec64 * )
 	void ktime_get_coarse_real_ts64( struct timespec64 * )
 	void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
 	void ktime_get_coarse_raw_ts64( struct timespec64 * )

 These are quicker than the non-coarse versions, but less accurate,
 corresponding to CLOCK_MONOTONIC_COARSE and CLOCK_REALTIME_COARSE
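For context, a minimal usage sketch (not part of the diff) of the coarse accessors documented above; the function names come from <linux/timekeeping.h>, while the surrounding driver code is assumed:

```c
/* Hedged sketch: sample the coarse clocks without a hardware clock read.
 * Coarse accessors return the time as of the last timer tick.
 */
#include <linux/timekeeping.h>
#include <linux/printk.h>

static void sample_coarse_clocks(void)
{
	ktime_t mono = ktime_get_coarse();		/* CLOCK_MONOTONIC_COARSE */
	u64 real_ns = ktime_get_coarse_real_ns();	/* CLOCK_REALTIME_COARSE, in ns */
	struct timespec64 ts;

	ktime_get_coarse_ts64(&ts);			/* monotonic coarse as timespec64 */
	pr_info("mono=%lld ns real=%llu ns ts=%lld.%09ld\n",
		ktime_to_ns(mono), real_ns, (s64)ts.tv_sec, ts.tv_nsec);
}
```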
@@ -29,8 +29,7 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
 :c:type:`v4l2_hsv_encoding` specifies which encoding is used.

 .. note:: The default R'G'B' quantization is full range for all
-	colorspaces except for BT.2020 which uses limited range R'G'B'
-	quantization.
+	colorspaces. HSV formats are always full range.

 .. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
@@ -162,8 +161,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
       - Details
     * - ``V4L2_QUANTIZATION_DEFAULT``
       - Use the default quantization encoding as defined by the
-	colorspace. This is always full range for R'G'B' (except for the
-	BT.2020 colorspace) and HSV. It is usually limited range for Y'CbCr.
+	colorspace. This is always full range for R'G'B' and HSV.
+	It is usually limited range for Y'CbCr.
     * - ``V4L2_QUANTIZATION_FULL_RANGE``
       - Use the full range quantization encoding. I.e. the range [0…1] is
 	mapped to [0…255] (with possible clipping to [1…254] to avoid the
@@ -173,4 +172,4 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
     * - ``V4L2_QUANTIZATION_LIM_RANGE``
       - Use the limited range quantization encoding. I.e. the range [0…1]
 	is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
-	[16…240].
+	[16…240]. Limited Range cannot be used with HSV.
@@ -370,9 +370,8 @@ Colorspace BT.2020 (V4L2_COLORSPACE_BT2020)
 The :ref:`itu2020` standard defines the colorspace used by Ultra-high
 definition television (UHDTV). The default transfer function is
 ``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
-``V4L2_YCBCR_ENC_BT2020``. The default R'G'B' quantization is limited
-range (!), and so is the default Y'CbCr quantization. The chromaticities
-of the primary colors and the white reference are:
+``V4L2_YCBCR_ENC_BT2020``. The default Y'CbCr quantization is limited range.
+The chromaticities of the primary colors and the white reference are:

@@ -949,12 +949,14 @@ icmp_ratelimit - INTEGER
 icmp_msgs_per_sec - INTEGER
 	Limit maximal number of ICMP packets sent per second from this host.
 	Only messages whose type matches icmp_ratemask (see below) are
-	controlled by this limit.
+	controlled by this limit. For security reasons, the precise count
+	of messages per second is randomized.
 	Default: 1000

 icmp_msgs_burst - INTEGER
 	icmp_msgs_per_sec controls number of ICMP packets sent per second,
 	while icmp_msgs_burst controls the burst size of these packets.
+	For security reasons, the precise burst size is randomized.
 	Default: 50

 icmp_ratemask - INTEGER
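A small sketch (not part of the diff) of inspecting the two rate-limit knobs documented above from userspace; the /proc/sys paths are the standard locations for these sysctls:

```c
/* Hedged sketch: read icmp_msgs_per_sec and icmp_msgs_burst. */
#include <stdio.h>

static long read_sysctl(const char *path)
{
	long val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	printf("icmp_msgs_per_sec = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/icmp_msgs_per_sec"));
	printf("icmp_msgs_burst   = %ld\n",
	       read_sysctl("/proc/sys/net/ipv4/icmp_msgs_burst"));
	return 0;
}
```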
@@ -3907,6 +3907,7 @@ F:	crypto/
 F:	drivers/crypto/
 F:	include/crypto/
 F:	include/linux/crypto*
+F:	lib/crypto/

 CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
 M:	Neil Horman <nhorman@tuxdriver.com>
@@ -15890,6 +15891,14 @@ L:	linux-gpio@vger.kernel.org
 S:	Maintained
 F:	drivers/gpio/gpio-ws16c48.c

+WIREGUARD SECURE NETWORK TUNNEL
+M:	Jason A. Donenfeld <Jason@zx2c4.com>
+S:	Maintained
+F:	drivers/net/wireguard/
+F:	tools/testing/selftests/wireguard/
+L:	wireguard@lists.zx2c4.com
+L:	netdev@vger.kernel.org
+
 WISTRON LAPTOP BUTTON DRIVER
 M:	Miloslav Trmac <mitr@volny.cz>
 S:	Maintained
Makefile (8 lines changed)
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 VERSION = 4
 PATCHLEVEL = 19
-SUBLEVEL = 152
+SUBLEVEL = 157
 EXTRAVERSION =
 NAME = "People's Front"

@@ -505,11 +505,7 @@ endif

 ifeq ($(cc-name),clang)
 ifneq ($(CROSS_COMPILE),)
-CLANG_TRIPLE	?= $(CROSS_COMPILE)
-CLANG_FLAGS	+= --target=$(notdir $(CLANG_TRIPLE:%-=%))
-ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
-$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
-endif
+CLANG_FLAGS	+= --target=$(notdir $(CROSS_COMPILE:%-=%))
 GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
 CLANG_FLAGS	+= --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
 GCC_TOOLCHAIN	:= $(realpath $(GCC_TOOLCHAIN_DIR)/..)
(file diff suppressed because it is too large)
@@ -2348,6 +2348,7 @@
 __sock_recv_ts_and_drops
 sock_wake_async
 sock_wfree
+timer_reduce
 unregister_net_sysctl_table
 __wake_up_sync_key
 __xfrm_policy_check
@@ -366,6 +366,13 @@ config HAVE_RCU_TABLE_FREE
 config HAVE_RCU_TABLE_INVALIDATE
 	bool

+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	bool
+	help
+	  Temporary select until all architectures can be converted to have
+	  irqs disabled over activate_mm. Architectures that do IPI based TLB
+	  shootdowns should enable this.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
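The help text above describes a race between exec-time mm activation and IPI-based TLB shootdown. A minimal sketch (assumptions: names follow fs/exec.c, but this is illustrative rather than the verbatim upstream change) of how core code can honour the option:

```c
/* Hedged sketch: keep IRQs disabled across activate_mm() when the
 * architecture selects ARCH_WANT_IRQS_OFF_ACTIVATE_MM, so a TLB
 * shootdown IPI cannot land between switching tsk->mm and activating
 * the new mm.
 */
static int exec_mmap_sketch(struct task_struct *tsk, struct mm_struct *mm)
{
	struct mm_struct *active_mm;

	task_lock(tsk);
	local_irq_disable();
	active_mm = tsk->active_mm;
	tsk->active_mm = mm;
	tsk->mm = mm;
	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	activate_mm(active_mm, mm);
	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	task_unlock(tsk);
	return 0;
}
```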
@@ -156,6 +156,7 @@ END(EV_Extension)
 tracesys:
 	; save EFA in case tracer wants the PC of traced task
 	; using ERET won't work since next-PC has already committed
+	lr  r12, [efa]
 	GET_CURR_TASK_FIELD_PTR   TASK_THREAD, r11
 	st  r12, [r11, THREAD_FAULT_ADDR]	; thread.fault_address
@@ -198,9 +199,15 @@ tracesys_exit:
 ; Breakpoint TRAP
 ; ---------------------------------------------
 trap_with_param:
-	mov r0, r12	; EFA in case ptracer/gdb wants stop_pc
+
+	; stop_pc info by gdb needs this info
+	lr  r0, [efa]
 	mov r1, sp

+	; Now that we have read EFA, it is safe to do "fake" rtie
+	; and get out of CPU exception mode
+	FAKE_RET_FROM_EXCPN
+
 	; Save callee regs in case gdb wants to have a look
 	; SP will grow up by size of CALLEE Reg-File
 	; NOTE: clobbers r12
@@ -227,10 +234,6 @@ ENTRY(EV_Trap)

 	EXCEPTION_PROLOGUE

-	lr  r12, [efa]
-
-	FAKE_RET_FROM_EXCPN
-
 	;============ TRAP 1   :breakpoints
 	; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
 	bmsk.f 0, r9, 7
@@ -238,6 +241,9 @@ ENTRY(EV_Trap)

 	;============ TRAP  (no param): syscall top level

+	; First return from Exception to pure K mode (Exception/IRQs renabled)
+	FAKE_RET_FROM_EXCPN
+
 	; If syscall tracing ongoing, invoke pre-post-hooks
 	GET_CURR_THR_INFO_FLAGS   r10
 	btst r10, TIF_SYSCALL_TRACE
@@ -115,7 +115,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
 	       int (*consumer_fn) (unsigned int, void *), void *arg)
 {
 #ifdef CONFIG_ARC_DW2_UNWIND
-	int ret = 0;
+	int ret = 0, cnt = 0;
 	unsigned int address;
 	struct unwind_frame_info frame_info;
@@ -135,6 +135,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
 			break;

 		frame_info.regs.r63 = frame_info.regs.r31;
+
+		if (cnt++ > 128) {
+			printk("unwinder looping too long, aborting !\n");
+			return 0;
+		}
 	}

 	return address;		/* return the last address it saw */
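The fix above bounds an unwind loop that may fail to terminate on corrupt frame data. A generic sketch of the same guard pattern (the 128-frame cap mirrors the diff; the `struct frame`/`next_frame` API is invented for illustration):

```c
/* Hedged sketch: cap a possibly non-terminating unwind loop. */
#define MAX_UNWIND_FRAMES 128

struct frame { unsigned int pc; /* ... unwinder state ... */ };
static int next_frame(struct frame *f);	/* hypothetical: advance one frame */

static unsigned int unwind_bounded(struct frame *f)
{
	unsigned int last = 0;
	int cnt = 0;

	while (next_frame(f)) {
		last = f->pc;
		if (cnt++ > MAX_UNWIND_FRAMES)
			return 0;	/* looping too long: give up */
	}
	return last;	/* last address seen */
}
```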
@@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
 	select ARC_HAS_ACCL_REGS
 	select ARC_IRQ_NO_AUTOSAVE
 	select CLK_HSDK
+	select RESET_CONTROLLER
 	select RESET_HSDK
 	select MIGHT_HAVE_PCI
@@ -622,8 +622,10 @@ config ARCH_S3C24XX
 	select HAVE_S3C2410_WATCHDOG if WATCHDOG
 	select HAVE_S3C_RTC if RTC_CLASS
 	select NEED_MACH_IO_H
+	select S3C2410_WATCHDOG
 	select SAMSUNG_ATAGS
 	select USE_OF
+	select WATCHDOG
 	help
 	  Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
 	  and S3C2450 SoCs based systems, such as the Simtec Electronics BAST
@@ -922,8 +922,10 @@
 			};

 			rngb: rngb@21b4000 {
+				compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
 				reg = <0x021b4000 0x4000>;
 				interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
+				clocks = <&clks IMX6SL_CLK_DUMMY>;
 			};

 			weim: weim@21b8000 {
@@ -192,6 +192,7 @@
 		fixed-link {
 			speed = <1000>;
 			full-duplex;
+			pause;
 		};
 	};
 };
@@ -516,7 +516,7 @@
 			status = "disabled";
 		};

-		target-module@56000000 {
+		sgx_module: target-module@56000000 {
 			compatible = "ti,sysc-omap4", "ti,sysc";
 			ti,hwmods = "gpu";
 			reg = <0x5601fc00 0x4>,
@@ -74,3 +74,13 @@
 };

 /include/ "omap443x-clocks.dtsi"
+
+/*
+ * Use dpll_per for sgx at 153.6MHz like droid4 stock v3.0.8 Android kernel
+ */
+&sgx_module {
+	assigned-clocks = <&l3_gfx_clkctrl OMAP4_GPU_CLKCTRL 24>,
+			  <&dpll_per_m7x2_ck>;
+	assigned-clock-rates = <0>, <153600000>;
+	assigned-clock-parents = <&dpll_per_m7x2_ck>;
+};
@@ -85,21 +85,21 @@
 		global_timer: timer@b0020200 {
 			compatible = "arm,cortex-a9-global-timer";
 			reg = <0xb0020200 0x100>;
-			interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
+			interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
 			status = "disabled";
 		};

 		twd_timer: timer@b0020600 {
 			compatible = "arm,cortex-a9-twd-timer";
 			reg = <0xb0020600 0x20>;
-			interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
+			interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
 			status = "disabled";
 		};

 		twd_wdt: wdt@b0020620 {
 			compatible = "arm,cortex-a9-twd-wdt";
 			reg = <0xb0020620 0xe0>;
-			interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
+			interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
 			status = "disabled";
 		};

@@ -98,19 +98,16 @@
 	};

 	clocks: clock-controller@e0100000 {
-		compatible = "samsung,s5pv210-clock", "simple-bus";
+		compatible = "samsung,s5pv210-clock";
 		reg = <0xe0100000 0x10000>;
 		clock-names = "xxti", "xusbxti";
 		clocks = <&xxti>, <&xusbxti>;
 		#clock-cells = <1>;
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges;
+	};

-		pmu_syscon: syscon@e0108000 {
-			compatible = "samsung-s5pv210-pmu", "syscon";
-			reg = <0xe0108000 0x8000>;
-		};
+	pmu_syscon: syscon@e0108000 {
+		compatible = "samsung-s5pv210-pmu", "syscon";
+		reg = <0xe0108000 0x8000>;
 	};

 	pinctrl0: pinctrl@e0200000 {
@@ -126,35 +123,28 @@
 		};
 	};

-	amba {
-		#address-cells = <1>;
-		#size-cells = <1>;
-		compatible = "simple-bus";
-		ranges;
-
-		pdma0: dma@e0900000 {
-			compatible = "arm,pl330", "arm,primecell";
-			reg = <0xe0900000 0x1000>;
-			interrupt-parent = <&vic0>;
-			interrupts = <19>;
-			clocks = <&clocks CLK_PDMA0>;
-			clock-names = "apb_pclk";
-			#dma-cells = <1>;
-			#dma-channels = <8>;
-			#dma-requests = <32>;
-		};
+	pdma0: dma@e0900000 {
+		compatible = "arm,pl330", "arm,primecell";
+		reg = <0xe0900000 0x1000>;
+		interrupt-parent = <&vic0>;
+		interrupts = <19>;
+		clocks = <&clocks CLK_PDMA0>;
+		clock-names = "apb_pclk";
+		#dma-cells = <1>;
+		#dma-channels = <8>;
+		#dma-requests = <32>;
+	};

-		pdma1: dma@e0a00000 {
-			compatible = "arm,pl330", "arm,primecell";
-			reg = <0xe0a00000 0x1000>;
-			interrupt-parent = <&vic0>;
-			interrupts = <20>;
-			clocks = <&clocks CLK_PDMA1>;
-			clock-names = "apb_pclk";
-			#dma-cells = <1>;
-			#dma-channels = <8>;
-			#dma-requests = <32>;
-		};
-	};
+	pdma1: dma@e0a00000 {
+		compatible = "arm,pl330", "arm,primecell";
+		reg = <0xe0a00000 0x1000>;
+		interrupt-parent = <&vic0>;
+		interrupts = <20>;
+		clocks = <&clocks CLK_PDMA1>;
+		clock-names = "apb_pclk";
+		#dma-cells = <1>;
+		#dma-channels = <8>;
+		#dma-requests = <32>;
+	};

 	spi0: spi@e1300000 {
@@ -227,43 +217,36 @@
 		status = "disabled";
 	};

-	audio-subsystem {
-		compatible = "samsung,s5pv210-audss", "simple-bus";
-		#address-cells = <1>;
-		#size-cells = <1>;
-		ranges;
-
-		clk_audss: clock-controller@eee10000 {
-			compatible = "samsung,s5pv210-audss-clock";
-			reg = <0xeee10000 0x1000>;
-			clock-names = "hclk", "xxti",
-				      "fout_epll",
-				      "sclk_audio0";
-			clocks = <&clocks DOUT_HCLKP>, <&xxti>,
-				 <&clocks FOUT_EPLL>,
-				 <&clocks SCLK_AUDIO0>;
-			#clock-cells = <1>;
-		};
+	clk_audss: clock-controller@eee10000 {
+		compatible = "samsung,s5pv210-audss-clock";
+		reg = <0xeee10000 0x1000>;
+		clock-names = "hclk", "xxti",
+			      "fout_epll",
+			      "sclk_audio0";
+		clocks = <&clocks DOUT_HCLKP>, <&xxti>,
+			 <&clocks FOUT_EPLL>,
+			 <&clocks SCLK_AUDIO0>;
+		#clock-cells = <1>;
+	};

-		i2s0: i2s@eee30000 {
-			compatible = "samsung,s5pv210-i2s";
-			reg = <0xeee30000 0x1000>;
-			interrupt-parent = <&vic2>;
-			interrupts = <16>;
-			dma-names = "rx", "tx", "tx-sec";
-			dmas = <&pdma1 9>, <&pdma1 10>, <&pdma1 11>;
-			clock-names = "iis",
-				      "i2s_opclk0",
-				      "i2s_opclk1";
-			clocks = <&clk_audss CLK_I2S>,
-				 <&clk_audss CLK_I2S>,
-				 <&clk_audss CLK_DOUT_AUD_BUS>;
-			samsung,idma-addr = <0xc0010000>;
-			pinctrl-names = "default";
-			pinctrl-0 = <&i2s0_bus>;
-			#sound-dai-cells = <0>;
-			status = "disabled";
-		};
-	};
+	i2s0: i2s@eee30000 {
+		compatible = "samsung,s5pv210-i2s";
+		reg = <0xeee30000 0x1000>;
+		interrupt-parent = <&vic2>;
+		interrupts = <16>;
+		dma-names = "rx", "tx", "tx-sec";
+		dmas = <&pdma1 9>, <&pdma1 10>, <&pdma1 11>;
+		clock-names = "iis",
+			      "i2s_opclk0",
+			      "i2s_opclk1";
+		clocks = <&clk_audss CLK_I2S>,
+			 <&clk_audss CLK_I2S>,
+			 <&clk_audss CLK_DOUT_AUD_BUS>;
+		samsung,idma-addr = <0xc0010000>;
+		pinctrl-names = "default";
+		pinctrl-0 = <&i2s0_bus>;
+		#sound-dai-cells = <0>;
+		status = "disabled";
+	};

 	i2s1: i2s@e2100000 {
@@ -143,7 +143,7 @@
 	trips {
 		cpu_alert0: cpu-alert0 {
 			/* milliCelsius */
-			temperature = <850000>;
+			temperature = <85000>;
 			hysteresis = <2000>;
 			type = "passive";
 		};
@@ -206,16 +206,16 @@
 };

 &reg_dc1sw {
-	regulator-min-microvolt = <3000000>;
-	regulator-max-microvolt = <3000000>;
+	regulator-min-microvolt = <3300000>;
+	regulator-max-microvolt = <3300000>;
 	regulator-name = "vcc-gmac-phy";
 };

 &reg_dcdc1 {
 	regulator-always-on;
-	regulator-min-microvolt = <3000000>;
-	regulator-max-microvolt = <3000000>;
-	regulator-name = "vcc-3v0";
+	regulator-min-microvolt = <3300000>;
+	regulator-max-microvolt = <3300000>;
+	regulator-name = "vcc-3v3";
 };

 &reg_dcdc2 {
arch/arm/crypto/.gitignore (vendored, 1 line changed)
@@ -1,3 +1,4 @@
 aesbs-core.S
 sha256-core.S
 sha512-core.S
+poly1305-core.S
@@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
 	select CRYPTO_HASH

 config CRYPTO_CHACHA20_NEON
-	tristate "NEON accelerated ChaCha stream cipher algorithms"
-	depends on KERNEL_MODE_NEON
+	tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
 	select CRYPTO_BLKCIPHER
 	select CRYPTO_CHACHA20
+	select CRYPTO_ARCH_HAVE_LIB_CHACHA
+
+config CRYPTO_POLY1305_ARM
+	tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
+	select CRYPTO_HASH
+	select CRYPTO_ARCH_HAVE_LIB_POLY1305

 config CRYPTO_NHPOLY1305_NEON
 	tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_NHPOLY1305

+config CRYPTO_CURVE25519_NEON
+	tristate "NEON accelerated Curve25519 scalar multiplication library"
+	depends on KERNEL_MODE_NEON
+	select CRYPTO_LIB_CURVE25519_GENERIC
+	select CRYPTO_ARCH_HAVE_LIB_CURVE25519
+
 endif
@@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
 obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
 obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
 obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
+obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
 obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
+obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o

 ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
 ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
@@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
 ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
 crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
 crc32-arm-ce-y := crc32-ce-core.o crc32-ce-glue.o
-chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
+chacha-neon-y := chacha-scalar-core.o chacha-glue.o
+chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
+poly1305-arm-y := poly1305-core.o poly1305-glue.o
 nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
+curve25519-neon-y := curve25519-core.o curve25519-glue.o

 ifdef REGENERATE_ARM_CRYPTO
 quiet_cmd_perl = PERL $@
 cmd_perl = $(PERL) $(<) > $(@)

+$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
+	$(call cmd,perl)
+
 $(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
 	$(call cmd,perl)
@@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
 	$(call cmd,perl)
 endif

-targets += sha256-core.S sha512-core.S
+targets += poly1305-core.S sha256-core.S sha512-core.S
+
+# massage the perlasm code a bit so we only get the NEON routine if we need it
+poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
+poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
+AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
arch/arm/crypto/chacha-glue.c (new file, 356 lines)
@@ -0,0 +1,356 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2015 Martin Willi
 */

#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
				      int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
				       int nrounds);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);

asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
			     const u32 *state, int nrounds);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);

static inline bool neon_usable(void)
{
	return static_branch_likely(&use_neon) && may_use_simd();
}

static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 buf[CHACHA_BLOCK_SIZE];

	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_neon(state, dst, src, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	while (bytes >= CHACHA_BLOCK_SIZE) {
		chacha_block_xor_neon(state, dst, src, nrounds);
		bytes -= CHACHA_BLOCK_SIZE;
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes) {
		memcpy(buf, src, bytes);
		chacha_block_xor_neon(state, buf, buf, nrounds);
		memcpy(dst, buf, bytes);
	}
}

void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
		hchacha_block_arm(state, stream, nrounds);
	} else {
		kernel_neon_begin();
		hchacha_block_neon(state, stream, nrounds);
		kernel_neon_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
	    bytes <= CHACHA_BLOCK_SIZE) {
		chacha_doarm(dst, src, bytes, state, nrounds);
		state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
		return;
	}

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, todo, nrounds);
		kernel_neon_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

static int chacha_stream_xor(struct skcipher_request *req,
			     const struct chacha_ctx *ctx, const u8 *iv,
			     bool neon)
{
	struct skcipher_walk walk;
	u32 state[16];
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	chacha_init_generic(state, ctx->key, iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
			chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
				     nbytes, state, ctx->nrounds);
			state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
		} else {
			kernel_neon_begin();
			chacha_doneon(state, walk.dst.virt.addr,
				      walk.src.virt.addr, nbytes, ctx->nrounds);
			kernel_neon_end();
		}
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}

static int do_chacha(struct skcipher_request *req, bool neon)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

	return chacha_stream_xor(req, ctx, req->iv, neon);
}

static int chacha_arm(struct skcipher_request *req)
{
	return do_chacha(req, false);
}

static int chacha_neon(struct skcipher_request *req)
{
	return do_chacha(req, neon_usable());
}

static int do_xchacha(struct skcipher_request *req, bool neon)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct chacha_ctx subctx;
	u32 state[16];
	u8 real_iv[16];

	chacha_init_generic(state, ctx->key, req->iv);

	if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
		hchacha_block_arm(state, subctx.key, ctx->nrounds);
	} else {
		kernel_neon_begin();
		hchacha_block_neon(state, subctx.key, ctx->nrounds);
		kernel_neon_end();
	}
	subctx.nrounds = ctx->nrounds;

	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	return chacha_stream_xor(req, &subctx, real_iv, neon);
}

static int xchacha_arm(struct skcipher_request *req)
{
	return do_xchacha(req, false);
}

static int xchacha_neon(struct skcipher_request *req)
{
	return do_xchacha(req, neon_usable());
}

static struct skcipher_alg arm_algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-arm",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_arm,
		.decrypt		= chacha_arm,
	}, {
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-arm",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_arm,
		.decrypt		= xchacha_arm,
	}, {
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-arm",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_arm,
		.decrypt		= xchacha_arm,
	},
};

static struct skcipher_alg neon_algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-neon",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.walksize		= 4 * CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_neon,
		.decrypt		= chacha_neon,
	}, {
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-neon",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.walksize		= 4 * CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_neon,
		.decrypt		= xchacha_neon,
	}, {
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-neon",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.walksize		= 4 * CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_neon,
		.decrypt		= xchacha_neon,
	}
};

static int __init chacha_simd_mod_init(void)
{
	int err = 0;

	if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
		err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
		if (err)
			return err;
	}

	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
		int i;

		switch (read_cpuid_part()) {
		case ARM_CPU_PART_CORTEX_A7:
		case ARM_CPU_PART_CORTEX_A5:
			/*
			 * The Cortex-A7 and Cortex-A5 do not perform well with
			 * the NEON implementation but do incredibly with the
			 * scalar one and use less power.
			 */
			for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
				neon_algs[i].base.cra_priority = 0;
			break;
		default:
			static_branch_enable(&use_neon);
		}

		if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
			err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
			if (err)
				crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
		}
	}
	return err;
}

static void __exit chacha_simd_mod_fini(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
		crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
		if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
			crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
	}
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-arm");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-arm");
#ifdef CONFIG_KERNEL_MODE_NEON
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha12-neon");
#endif
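A minimal usage sketch (not part of the diff) for the library interface this file exports; the header path and constants are assumed to follow <crypto/chacha.h>, and 20 selects the standard ChaCha20 round count:

```c
/* Hedged sketch: one-shot ChaCha20 encryption via the arch library API
 * above. chacha_init_arch() seeds state[16] from the 256-bit key and the
 * 16-byte IV (block counter + nonce); chacha_crypt_arch() then picks the
 * NEON or scalar path at run time.
 */
#include <crypto/chacha.h>

static void chacha20_encrypt(u8 *dst, const u8 *src, unsigned int len,
			     const u32 key[8], const u8 iv[16])
{
	u32 state[16];

	chacha_init_arch(state, key, iv);
	chacha_crypt_arch(state, dst, src, len, 20);
}
```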
arch/arm/crypto/chacha-scalar-core.S (new file, 460 lines)
@@ -0,0 +1,460 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used.  So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions.  This is faster than using explicit rotate
 * instructions.  To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount.  The rotation amount is then fixed up just in time
 * when the values are used.  'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

	// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.macro __rev		out, in,  t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev		\out, \in
.else
	lsl		\t0, \in, #24
	and		\t1, \in, #0xff00
	and		\t2, \in, #0xff0000
	orr		\out, \t0, \in, lsr #24
	orr		\out, \out, \t1, lsl #8
	orr		\out, \out, \t2, lsr #8
.endif
.endm

.macro _le32_bswap	x,  t0, t1, t2
#ifdef __ARMEB__
	__rev		\x, \x,  \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x	a, b, c, d,  t0, t1, t2
	_le32_bswap	\a,  \t0, \t1, \t2
	_le32_bswap	\b,  \t0, \t1, \t2
	_le32_bswap	\c,  \t0, \t1, \t2
	_le32_bswap	\d,  \t0, \t1, \t2
.endm

.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1,  a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	 _doubleround
	.endr
.endm

.macro _chacha		nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned.  Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3,  r8, r9, r10
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7,  r8, r9, r10
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7,  r8, r9, r10
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5,  r9, r10, r11
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
 *		     const u32 *state, int nrounds);
 */
ENTRY(chacha_doarm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	ldr		ip, [sp]
	cmp		ip, #12

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	add		X12, r3, #48
	ldm		X12, {X12,X13,X14,X15}
	push		{X12,X13,X14,X15}
	sub		sp, sp, #64

	__ldrd		X8_X10, X9_X11, r3, 40
	__strd		X8_X10, X9_X11, sp, 8
	__strd		X8_X10, X9_X11, sp, 56
	ldm		r3, {X0-X9_X11}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24
	__strd		X4, X5, sp, 32
	__strd		X6, X7, sp, 40
	__strd		X8_X10, X9_X11, sp, 48

	beq		1f
	_chacha		20

0:	add		sp, #76
	pop		{r4-r11, pc}

1:	_chacha		12
	b		0b
ENDPROC(chacha_doarm)

/*
 * void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
 */
ENTRY(hchacha_block_arm)
	push		{r1,r4-r11,lr}

	cmp		r2, #12			// ChaCha12 ?

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	beq		1f
	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
0:	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}

1:	_chacha_permute	12
	b		0b
ENDPROC(hchacha_block_arm)
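To make the design notes at the top of this file concrete, here is a plain C model (illustrative, not part of the diff) of the quarterround whose rotates the assembly defers via its brot/drot bookkeeping:

```c
#include <stdint.h>

/* Reference ChaCha quarterround. The assembly above computes exactly these
 * four add/xor/rotate steps, but leaves b and d "under-rotated" and folds
 * the pending rotation into the next instruction's ror operand, which is
 * why (brot, drot) end up as (25, 24) == (32-7, 32-8) after every round.
 */
static inline uint32_t rol32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

static void chacha_quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d = rol32(*d ^ *a, 16);
	*c += *d; *b = rol32(*b ^ *c, 12);
	*a += *b; *d = rol32(*d ^ *a, 8);
	*c += *d; *b = rol32(*b ^ *c, 7);
}
```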
arch/arm/crypto/curve25519-core.S (new file, 2062 lines; diff suppressed because it is too large)
arch/arm/crypto/curve25519-glue.c (new file, 135 lines)
@@ -0,0 +1,135 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
 * began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
 * manually reworked for use in kernel space.
 */

#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/kpp.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/scatterlist.h>
#include <crypto/curve25519.h>

asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
				const u8 secret[CURVE25519_KEY_SIZE],
				const u8 basepoint[CURVE25519_KEY_SIZE]);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);

void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
		     const u8 scalar[CURVE25519_KEY_SIZE],
		     const u8 point[CURVE25519_KEY_SIZE])
{
	if (static_branch_likely(&have_neon) && may_use_simd()) {
		kernel_neon_begin();
		curve25519_neon(out, scalar, point);
		kernel_neon_end();
	} else {
		curve25519_generic(out, scalar, point);
	}
}
EXPORT_SYMBOL(curve25519_arch);

void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
			  const u8 secret[CURVE25519_KEY_SIZE])
{
	return curve25519_arch(pub, secret, curve25519_base_point);
}
EXPORT_SYMBOL(curve25519_base_arch);

static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
				 unsigned int len)
{
	u8 *secret = kpp_tfm_ctx(tfm);

	if (!len)
		curve25519_generate_secret(secret);
	else if (len == CURVE25519_KEY_SIZE &&
		 crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
		memcpy(secret, buf, CURVE25519_KEY_SIZE);
	else
		return -EINVAL;
	return 0;
}

static int curve25519_compute_value(struct kpp_request *req)
{
	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
	const u8 *secret = kpp_tfm_ctx(tfm);
	u8 public_key[CURVE25519_KEY_SIZE];
	u8 buf[CURVE25519_KEY_SIZE];
	int copied, nbytes;
	u8 const *bp;

	if (req->src) {
		copied = sg_copy_to_buffer(req->src,
					   sg_nents_for_len(req->src,
							    CURVE25519_KEY_SIZE),
					   public_key, CURVE25519_KEY_SIZE);
		if (copied != CURVE25519_KEY_SIZE)
			return -EINVAL;
		bp = public_key;
	} else {
		bp = curve25519_base_point;
	}

	curve25519_arch(buf, secret, bp);

	/* might want less than we've got */
	nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
	copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
								nbytes),
				     buf, nbytes);
	if (copied != nbytes)
		return -EINVAL;
	return 0;
}

static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
{
	return CURVE25519_KEY_SIZE;
}

static struct kpp_alg curve25519_alg = {
	.base.cra_name		= "curve25519",
	.base.cra_driver_name	= "curve25519-neon",
	.base.cra_priority	= 200,
	.base.cra_module	= THIS_MODULE,
	.base.cra_ctxsize	= CURVE25519_KEY_SIZE,

	.set_secret		= curve25519_set_secret,
	.generate_public_key	= curve25519_compute_value,
	.compute_shared_secret	= curve25519_compute_value,
	.max_size		= curve25519_max_size,
};

static int __init mod_init(void)
{
	if (elf_hwcap & HWCAP_NEON) {
		static_branch_enable(&have_neon);
		return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
			crypto_register_kpp(&curve25519_alg) : 0;
	}
	return 0;
}

static void __exit mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
		crypto_unregister_kpp(&curve25519_alg);
}

module_init(mod_init);
module_exit(mod_exit);

MODULE_ALIAS_CRYPTO("curve25519");
MODULE_ALIAS_CRYPTO("curve25519-neon");
MODULE_LICENSE("GPL v2");
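A minimal usage sketch (not part of the diff) of computing an X25519 shared secret with the library routine exported above; the symbols come from <crypto/curve25519.h> and crypto_memneq from <crypto/algapi.h>, both referenced in the file itself:

```c
/* Hedged sketch: X25519 shared secret via curve25519_arch(). Real callers
 * clamp the private key per RFC 7748 (curve25519_generate_secret does).
 */
#include <crypto/algapi.h>
#include <crypto/curve25519.h>

static bool shared_secret(u8 out[CURVE25519_KEY_SIZE],
			  const u8 my_secret[CURVE25519_KEY_SIZE],
			  const u8 their_public[CURVE25519_KEY_SIZE])
{
	curve25519_arch(out, my_secret, their_public);
	/* an all-zero result means a low-order point; reject it */
	return crypto_memneq(out, curve25519_null_point, CURVE25519_KEY_SIZE);
}
```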
arch/arm/crypto/poly1305-armv4.pl (new file, 1236 lines; diff suppressed because it is too large)
arch/arm/crypto/poly1305-core.S_shipped (new file, 1158 lines; diff suppressed because it is too large)
arch/arm/crypto/poly1305-glue.c (new file, 272 lines)
@ -0,0 +1,272 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
|
||||
*
|
||||
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
void poly1305_init_arm(void *state, const u8 *key);
|
||||
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
|
||||
{
|
||||
}
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_init_arm(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static int arm_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
u32 len, u32 hibit, bool do_neon)
|
||||
{
|
||||
if (unlikely(!dctx->sset)) {
|
||||
if (!dctx->rset) {
|
||||
poly1305_init_arm(&dctx->h, src);
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
dctx->rset = 1;
|
||||
}
|
||||
if (len >= POLY1305_BLOCK_SIZE) {
|
||||
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
dctx->sset = true;
|
||||
}
|
||||
if (len < POLY1305_BLOCK_SIZE)
|
||||
return;
|
||||
}
|
||||
|
||||
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||
|
||||
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||
else
|
||||
poly1305_blocks_arm(&dctx->h, src, len, hibit);
|
||||
}
|
||||
|
||||
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, u32 len, bool do_neon)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
len -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
arm_poly1305_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1, false);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||
len %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(len)) {
|
||||
dctx->buflen = len;
|
||||
memcpy(dctx->buf, src, len);
|
||||
}
|
||||
}

static int arm_poly1305_update(struct shash_desc *desc,
			       const u8 *src, unsigned int srclen)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	arm_poly1305_do_update(dctx, src, srclen, false);
	return 0;
}

static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
						   const u8 *src,
						   unsigned int srclen)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
	bool do_neon = may_use_simd() && srclen > 128;

	if (static_branch_likely(&have_neon) && do_neon)
		kernel_neon_begin();
	arm_poly1305_do_update(dctx, src, srclen, do_neon);
	if (static_branch_likely(&have_neon) && do_neon)
		kernel_neon_end();
	return 0;
}

void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
			  unsigned int nbytes)
{
	bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
		       may_use_simd();

	if (unlikely(dctx->buflen)) {
		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);

		memcpy(dctx->buf + dctx->buflen, src, bytes);
		src += bytes;
		nbytes -= bytes;
		dctx->buflen += bytes;

		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
			poly1305_blocks_arm(&dctx->h, dctx->buf,
					    POLY1305_BLOCK_SIZE, 1);
			dctx->buflen = 0;
		}
	}

	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);

		if (static_branch_likely(&have_neon) && do_neon) {
			do {
				unsigned int todo = min_t(unsigned int, len, SZ_4K);

				kernel_neon_begin();
				poly1305_blocks_neon(&dctx->h, src, todo, 1);
				kernel_neon_end();

				len -= todo;
				src += todo;
			} while (len);
		} else {
			poly1305_blocks_arm(&dctx->h, src, len, 1);
			src += len;
		}
		nbytes %= POLY1305_BLOCK_SIZE;
	}

	if (unlikely(nbytes)) {
		dctx->buflen = nbytes;
		memcpy(dctx->buf, src, nbytes);
	}
}
EXPORT_SYMBOL(poly1305_update_arch);
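
Note the SZ_4K cap in the NEON branch above: kernel_neon_begin() disables preemption, so large requests are cut into 4 KiB pieces to bound scheduling latency. A sketch of that idiom in isolation, with hypothetical begin_simd()/end_simd()/simd_process() stand-ins for the NEON bracket and worker:

#define CHUNK_MAX 4096u

/* Hypothetical stand-ins for kernel_neon_begin()/kernel_neon_end()
 * and the SIMD routine they bracket. */
void begin_simd(void);
void end_simd(void);
void simd_process(const unsigned char *src, unsigned int len);

static void process_chunked(const unsigned char *src, unsigned int len)
{
	while (len) {
		unsigned int todo = len < CHUNK_MAX ? len : CHUNK_MAX;

		begin_simd();		/* preemption off from here ... */
		simd_process(src, todo);
		end_simd();		/* ... to here: at most 4 KiB worth */

		src += todo;
		len -= todo;
	}
}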

void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		dctx->buf[dctx->buflen++] = 1;
		memset(dctx->buf + dctx->buflen, 0,
		       POLY1305_BLOCK_SIZE - dctx->buflen);
		poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_arm(&dctx->h, dst, dctx->s);
	*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);

static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (unlikely(!dctx->sset))
		return -ENOKEY;

	poly1305_final_arch(dctx, dst);
	return 0;
}

static struct shash_alg arm_poly1305_algs[] = {{
	.init			= arm_poly1305_init,
	.update			= arm_poly1305_update,
	.final			= arm_poly1305_final,
	.digestsize		= POLY1305_DIGEST_SIZE,
	.descsize		= sizeof(struct poly1305_desc_ctx),

	.base.cra_name		= "poly1305",
	.base.cra_driver_name	= "poly1305-arm",
	.base.cra_priority	= 150,
	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
	.base.cra_module	= THIS_MODULE,
#ifdef CONFIG_KERNEL_MODE_NEON
}, {
	.init			= arm_poly1305_init,
	.update			= arm_poly1305_update_neon,
	.final			= arm_poly1305_final,
	.digestsize		= POLY1305_DIGEST_SIZE,
	.descsize		= sizeof(struct poly1305_desc_ctx),

	.base.cra_name		= "poly1305",
	.base.cra_driver_name	= "poly1305-neon",
	.base.cra_priority	= 200,
	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
	.base.cra_module	= THIS_MODULE,
#endif
}};

static int __init arm_poly1305_mod_init(void)
{
	if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
	    (elf_hwcap & HWCAP_NEON))
		static_branch_enable(&have_neon);
	else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
		/* register only the first entry */
		return crypto_register_shash(&arm_poly1305_algs[0]);

	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
		crypto_register_shashes(arm_poly1305_algs,
					ARRAY_SIZE(arm_poly1305_algs)) : 0;
}

static void __exit arm_poly1305_mod_exit(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
		return;
	if (!static_branch_likely(&have_neon)) {
		crypto_unregister_shash(&arm_poly1305_algs[0]);
		return;
	}
	crypto_unregister_shashes(arm_poly1305_algs,
				  ARRAY_SIZE(arm_poly1305_algs));
}

module_init(arm_poly1305_mod_init);
module_exit(arm_poly1305_mod_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-arm");
MODULE_ALIAS_CRYPTO("poly1305-neon");

@@ -688,6 +688,40 @@ static void disable_single_step(struct perf_event *bp)
	arch_install_hw_breakpoint(bp);
}

/*
 * Arm32 hardware does not always report a watchpoint hit address that matches
 * one of the watchpoints set. It can also report an address "near" the
 * watchpoint if a single instruction accesses both watched and unwatched
 * addresses. There is no straightforward way, short of disassembling the
 * offending instruction, to map that address back to the watchpoint. This
 * function computes the distance of the memory access from the watchpoint as a
 * heuristic for the likelihood that a given access triggered the watchpoint.
 *
 * See this same function in the arm64 platform code, which has the same
 * problem.
 *
 * The function returns the distance of the address from the bytes watched by
 * the watchpoint. In case of an exact match, it returns 0.
 */
static u32 get_distance_from_watchpoint(unsigned long addr, u32 val,
					struct arch_hw_breakpoint_ctrl *ctrl)
{
	u32 wp_low, wp_high;
	u32 lens, lene;

	lens = __ffs(ctrl->len);
	lene = __fls(ctrl->len);

	wp_low = val + lens;
	wp_high = val + lene;
	if (addr < wp_low)
		return wp_low - addr;
	else if (addr > wp_high)
		return addr - wp_high;
	else
		return 0;
}
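
A worked example of the heuristic (a standalone sketch, not part of the patch): for a 4-byte watchpoint at val whose byte-address-select mask is 0b1111, __ffs()/__fls() give 0 and 3, so the watched range is [val, val + 3] and the distance grows linearly outside it:

#include <assert.h>

/* Simplified stand-ins for __ffs()/__fls(); bas must be non-zero. */
static unsigned int first_set(unsigned int x)
{
	unsigned int i = 0;

	while (!(x & 1)) {
		x >>= 1;
		i++;
	}
	return i;
}

static unsigned int last_set(unsigned int x)
{
	unsigned int i = 0;

	while (x >>= 1)
		i++;
	return i;
}

static unsigned long wp_distance(unsigned long addr, unsigned long val,
				 unsigned int bas)
{
	unsigned long wp_low = val + first_set(bas);
	unsigned long wp_high = val + last_set(bas);

	if (addr < wp_low)
		return wp_low - addr;
	if (addr > wp_high)
		return addr - wp_high;
	return 0;
}

int main(void)
{
	/* 4-byte watchpoint at 0x1000: BAS = 0b1111 watches bytes 0..3. */
	assert(wp_distance(0x1002, 0x1000, 0xf) == 0); /* inside the range */
	assert(wp_distance(0x1006, 0x1000, 0xf) == 3); /* 3 bytes past end */
	assert(wp_distance(0x0fff, 0x1000, 0xf) == 1); /* 1 byte before it */
	return 0;
}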

static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
				       struct arch_hw_breakpoint *info)
{

@@ -697,23 +731,25 @@ static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
			       struct pt_regs *regs)
{
	int i, access;
	u32 val, ctrl_reg, alignment_mask;
	int i, access, closest_match = 0;
	u32 min_dist = -1, dist;
	u32 val, ctrl_reg;
	struct perf_event *wp, **slots;
	struct arch_hw_breakpoint *info;
	struct arch_hw_breakpoint_ctrl ctrl;

	slots = this_cpu_ptr(wp_on_reg);

	/*
	 * Find all watchpoints that match the reported address. If no exact
	 * match is found, attribute the hit to the closest watchpoint.
	 */
	rcu_read_lock();
	for (i = 0; i < core_num_wrps; ++i) {
		rcu_read_lock();

		wp = slots[i];

		if (wp == NULL)
			goto unlock;
			continue;

		info = counter_arch_bp(wp);
		/*
		 * The DFAR is an unknown value on debug architectures prior
		 * to 7.1. Since we only allow a single watchpoint on these

@@ -722,33 +758,31 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
		 */
		if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
			BUG_ON(i > 0);
			info = counter_arch_bp(wp);
			info->trigger = wp->attr.bp_addr;
		} else {
			if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
				alignment_mask = 0x7;
			else
				alignment_mask = 0x3;

			/* Check if the watchpoint value matches. */
			val = read_wb_reg(ARM_BASE_WVR + i);
			if (val != (addr & ~alignment_mask))
				goto unlock;

			/* Possible match, check the byte address select. */
			ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
			decode_ctrl_reg(ctrl_reg, &ctrl);
			if (!((1 << (addr & alignment_mask)) & ctrl.len))
				goto unlock;

			/* Check that the access type matches. */
			if (debug_exception_updates_fsr()) {
				access = (fsr & ARM_FSR_ACCESS_MASK) ?
					  HW_BREAKPOINT_W : HW_BREAKPOINT_R;
				if (!(access & hw_breakpoint_type(wp)))
					goto unlock;
					continue;
			}

			val = read_wb_reg(ARM_BASE_WVR + i);
			ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
			decode_ctrl_reg(ctrl_reg, &ctrl);
			dist = get_distance_from_watchpoint(addr, val, &ctrl);
			if (dist < min_dist) {
				min_dist = dist;
				closest_match = i;
			}
			/* Is this an exact match? */
			if (dist != 0)
				continue;

			/* We have a winner. */
			info = counter_arch_bp(wp);
			info->trigger = addr;
		}

@@ -770,13 +804,23 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
		 * we can single-step over the watchpoint trigger.
		 */
		if (!is_default_overflow_handler(wp))
			goto unlock;

			continue;
step:
		enable_single_step(wp, instruction_pointer(regs));
unlock:
		rcu_read_unlock();
	}

	if (min_dist > 0 && min_dist != -1) {
		/* No exact match found. */
		wp = slots[closest_match];
		info = counter_arch_bp(wp);
		info->trigger = addr;
		pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
		perf_bp_event(wp, regs);
		if (is_default_overflow_handler(wp))
			enable_single_step(wp, instruction_pointer(regs));
	}

	rcu_read_unlock();
}

static void watchpoint_single_step_handler(unsigned long pc)

@@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,

	ret = of_property_read_u32(np, "prefetch-data", &val);
	if (ret == 0) {
		if (val)
		if (val) {
			prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
		else
			*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
		} else {
			prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
			*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
		}
		*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
	} else if (ret != -EINVAL) {
		pr_err("L2C-310 OF prefetch-data property value is missing\n");
	}

	ret = of_property_read_u32(np, "prefetch-instr", &val);
	if (ret == 0) {
		if (val)
		if (val) {
			prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
		else
			*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
		} else {
			prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
			*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
		}
		*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
	} else if (ret != -EINVAL) {
		pr_err("L2C-310 OF prefetch-instr property value is missing\n");
	}
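
For context (an illustrative sketch, not from this diff): of_property_read_u32() returns 0 on success and -EINVAL when the property is absent, which the code above deliberately treats as "keep the firmware defaults"; only other errors, meaning the property is present but its value is unreadable, are reported:

#include <linux/of.h>
#include <linux/printk.h>

/* Hypothetical helper standing in for the prefetch register update. */
static void set_data_prefetch(u32 enable);

static void parse_prefetch_data(const struct device_node *np)
{
	u32 val;
	int ret;

	ret = of_property_read_u32(np, "prefetch-data", &val);
	if (ret == 0)
		set_data_prefetch(val);	/* property present: obey it */
	else if (ret != -EINVAL)
		pr_err("prefetch-data property value is missing\n");
	/* ret == -EINVAL: property absent, leave the firmware setting */
}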

@@ -240,6 +240,7 @@ config SAMSUNG_PM_DEBUG
	bool "Samsung PM Suspend debug"
	depends on PM && DEBUG_KERNEL
	depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
	depends on DEBUG_LL && MMU
	help
	  Say Y here if you want verbose debugging from the PM Suspend and
	  Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>

@@ -46,6 +46,7 @@ config ARCH_BCM_IPROC
config ARCH_BERLIN
	bool "Marvell Berlin SoC Family"
	select DW_APB_ICTL
	select DW_APB_TIMER_OF
	select GPIOLIB
	select PINCTRL
	help

@@ -10,7 +10,7 @@
#
# Copyright (C) 1995-2001 by Russell King

LDFLAGS_vmlinux	:=--no-undefined -X
LDFLAGS_vmlinux	:=--no-undefined -X -z norelro
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
GZFLAGS		:=-9

@@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
# for relative relocs, since this leads to better Image compression
# with the relocation offsets always being zero.
LDFLAGS_vmlinux		+= -shared -Bsymbolic -z notext -z norelro \
LDFLAGS_vmlinux		+= -shared -Bsymbolic -z notext \
			$(call ld-option, --no-apply-dynamic-relocs)
endif

@@ -21,6 +21,10 @@

	aliases {
		ethernet0 = &eth0;
		/* for dsa slave device */
		ethernet1 = &switch0port1;
		ethernet2 = &switch0port2;
		ethernet3 = &switch0port3;
		serial0 = &uart0;
		serial1 = &uart1;
	};

@@ -136,25 +140,25 @@
			#address-cells = <1>;
			#size-cells = <0>;

			port@0 {
			switch0port0: port@0 {
				reg = <0>;
				label = "cpu";
				ethernet = <&eth0>;
			};

			port@1 {
			switch0port1: port@1 {
				reg = <1>;
				label = "wan";
				phy-handle = <&switch0phy0>;
			};

			port@2 {
			switch0port2: port@2 {
				reg = <2>;
				label = "lan0";
				phy-handle = <&switch0phy1>;
			};

			port@3 {
			switch0port3: port@3 {
				reg = <3>;
				label = "lan1";
				phy-handle = <&switch0phy2>;

@@ -877,7 +877,7 @@
		reg-names = "mdp_phys";

		interrupt-parent = <&mdss>;
		interrupts = <0 0>;
		interrupts = <0>;

		clocks = <&gcc GCC_MDSS_AHB_CLK>,
			 <&gcc GCC_MDSS_AXI_CLK>,

@@ -909,7 +909,7 @@
		reg-names = "dsi_ctrl";

		interrupt-parent = <&mdss>;
		interrupts = <4 0>;
		interrupts = <4>;

		assigned-clocks = <&gcc BYTE0_CLK_SRC>,
				  <&gcc PCLK0_CLK_SRC>;

@@ -99,7 +99,7 @@

	wcd_codec: codec@f000 {
		compatible = "qcom,pm8916-wcd-analog-codec";
		reg = <0xf000 0x200>;
		reg = <0xf000>;
		reg-names = "pmic-codec-core";
		clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
		clock-names = "mclk";

@@ -430,6 +430,7 @@
	bus-width = <8>;
	mmc-hs200-1_8v;
	non-removable;
	full-pwr-cycle-in-suspend;
	status = "okay";
};

@@ -411,7 +411,7 @@
};

i2c0: i2c@ff020000 {
	compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
	compatible = "cdns,i2c-r1p14";
	status = "disabled";
	interrupt-parent = <&gic>;
	interrupts = <0 17 4>;

@@ -421,7 +421,7 @@
};

i2c1: i2c@ff030000 {
	compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
	compatible = "cdns,i2c-r1p14";
	status = "disabled";
	interrupt-parent = <&gic>;
	interrupts = <0 18 4>;

@@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
CONFIG_ARM_SCPI_PROTOCOL=y
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_JUMP_LABEL=y

@@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set

@@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_OTG=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y

@@ -503,6 +504,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y

arch/arm64/crypto/.gitignore (vendored)

@@ -1,2 +1,3 @@
sha256-core.S
sha512-core.S
poly1305-core.S

@@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
	select CRYPTO_SIMD

config CRYPTO_CHACHA20_NEON
	tristate "NEON accelerated ChaCha20 symmetric cipher"
	tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_BLKCIPHER
	select CRYPTO_CHACHA20
	select CRYPTO_LIB_CHACHA_GENERIC
	select CRYPTO_ARCH_HAVE_LIB_CHACHA

config CRYPTO_POLY1305_NEON
	tristate "Poly1305 hash function using scalar or NEON instructions"
	depends on KERNEL_MODE_NEON
	select CRYPTO_HASH
	select CRYPTO_ARCH_HAVE_LIB_POLY1305

config CRYPTO_AES_ARM64_BS
	tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"

@@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o

obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o

obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
poly1305-neon-y := poly1305-core.o poly1305-glue.o
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64

obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o

@@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
quiet_cmd_perlasm = PERLASM $@
      cmd_perlasm = $(PERL) $(<) void $(@)

$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
	$(call cmd,perlasm)

$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
	$(call cmd,perlasm)

@@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
	$(call cmd,perlasm)
endif

targets += sha256-core.S sha512-core.S
targets += poly1305-core.S sha256-core.S sha512-core.S
@@ -1,13 +1,13 @@
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi

@@ -19,29 +19,27 @@
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

	.text
	.align		6

ENTRY(chacha20_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
chacha_permute:

	//
	// This function encrypts one ChaCha20 block by loading the state matrix
	// in four NEON registers. It performs matrix operation on four words in
	// parallel, but requires shuffling to rearrange the words after each
	// round.
	//

	// x0..3 = s0..3
	adr		x3, ROT8
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
	ld1		{v12.4s}, [x3]

	mov		x3, #10
	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)

@@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		x3, x3, #1
	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)
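
For reference, the operation the code above evaluates four lanes at a time (rev32 for the rotate by 16, shl/sri pairs for 12 and 7, tbl against ROT8 for 8) is the standard ChaCha quarter-round; a plain C rendering of one double round:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

static void quarter_round(uint32_t x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 16);
	x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 12);
	x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 8);
	x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 7);
}

/* One double round = a column round followed by a diagonal round. */
static void double_round(uint32_t x[16])
{
	quarter_round(x, 0, 4,  8, 12);
	quarter_round(x, 1, 5,  9, 13);
	quarter_round(x, 2, 6, 10, 14);
	quarter_round(x, 3, 7, 11, 15);
	quarter_round(x, 0, 5, 10, 15);
	quarter_round(x, 1, 6, 11, 12);
	quarter_round(x, 2, 7,  8, 13);
	quarter_round(x, 3, 4,  9, 14);
}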

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)

@@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha20_block_xor_neon)
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

	a0		.req	w12
	a1		.req	w13
	a2		.req	w14
	a3		.req	w15
	a4		.req	w16
	a5		.req	w17
	a6		.req	w19
	a7		.req	w20
	a8		.req	w21
	a9		.req	w22
	a10		.req	w23
	a11		.req	w24
	a12		.req	w25
	a13		.req	w26
	a14		.req	w27
	a15		.req	w28

	.align		6
ENTRY(chacha20_4block_xor_neon)
ENTRY(chacha_4block_xor_neon)
	frame_push	10

	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	adr_l		x10, .Lpermute
	and		x5, x4, #63
	add		x10, x10, x5
	add		x11, x10, #64

	//
	// This function encrypts four consecutive ChaCha20 blocks by loading
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. For final XORing step we transpose the
	// matrix by interleaving 32- and then 64-bit words, which allows us to
	// do XOR in NEON registers.
	//
	adr		x3, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x3]
	// At the same time, a fifth block is encrypted in parallel using
	// scalar registers
	//
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..3[0..3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	// x12 += counter values 0-3
	mov		a0, v0.s[0]
	mov		a1, v1.s[0]
	mov		a2, v2.s[0]
	mov		a3, v3.s[0]
	mov		a4, v4.s[0]
	mov		a5, v5.s[0]
	mov		a6, v6.s[0]
	mov		a7, v7.s[0]
	mov		a8, v8.s[0]
	mov		a9, v9.s[0]
	mov		a10, v10.s[0]
	mov		a11, v11.s[0]
	mov		a12, v12.s[0]
	mov		a13, v13.s[0]
	mov		a14, v14.s[0]
	mov		a15, v15.s[0]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	mov		x3, #10

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16
	rev32		v15.8h, v15.8h
	ror		a15, a15, #16

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12

@@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	ror		a4, a4, #20
	sri		v5.4s, v17.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v18.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v19.4s, #20
	ror		a7, a7, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		a0, a0, a4
	add		v1.4s, v1.4s, v5.4s
	add		a1, a1, a5
	add		v2.4s, v2.4s, v6.4s
	add		a2, a2, a6
	add		v3.4s, v3.4s, v7.4s
	add		a3, a3, a7

	eor		v12.16b, v12.16b, v0.16b
	eor		a12, a12, a0
	eor		v13.16b, v13.16b, v1.16b
	eor		a13, a13, a1
	eor		v14.16b, v14.16b, v2.16b
	eor		a14, a14, a2
	eor		v15.16b, v15.16b, v3.16b
	eor		a15, a15, a3

	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24
	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		a8, a8, a12
	add		v9.4s, v9.4s, v13.4s
	add		a9, a9, a13
	add		v10.4s, v10.4s, v14.4s
	add		a10, a10, a14
	add		v11.4s, v11.4s, v15.4s
	add		a11, a11, a15

	eor		v16.16b, v4.16b, v8.16b
	eor		a4, a4, a8
	eor		v17.16b, v5.16b, v9.16b
	eor		a5, a5, a9
	eor		v18.16b, v6.16b, v10.16b
	eor		a6, a6, a10
	eor		v19.16b, v7.16b, v11.16b
	eor		a7, a7, a11

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7

@@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	ror		a4, a4, #25
	sri		v5.4s, v17.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v18.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v19.4s, #25
	ror		a7, a7, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	rev32		v15.8h, v15.8h
	ror		a15, a15, #16
	rev32		v12.8h, v12.8h
	ror		a12, a12, #16
	rev32		v13.8h, v13.8h
	ror		a13, a13, #16
	rev32		v14.8h, v14.8h
	ror		a14, a14, #16

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12

@@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	ror		a5, a5, #20
	sri		v6.4s, v17.4s, #20
	ror		a6, a6, #20
	sri		v7.4s, v18.4s, #20
	ror		a7, a7, #20
	sri		v4.4s, v19.4s, #20
	ror		a4, a4, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		a0, a0, a5
	add		v1.4s, v1.4s, v6.4s
	add		a1, a1, a6
	add		v2.4s, v2.4s, v7.4s
	add		a2, a2, a7
	add		v3.4s, v3.4s, v4.4s
	add		a3, a3, a4

	eor		v15.16b, v15.16b, v0.16b
	eor		a15, a15, a0
	eor		v12.16b, v12.16b, v1.16b
	eor		a12, a12, a1
	eor		v13.16b, v13.16b, v2.16b
	eor		a13, a13, a2
	eor		v14.16b, v14.16b, v3.16b
	eor		a14, a14, a3

	tbl		v15.16b, {v15.16b}, v31.16b
	ror		a15, a15, #24
	tbl		v12.16b, {v12.16b}, v31.16b
	ror		a12, a12, #24
	tbl		v13.16b, {v13.16b}, v31.16b
	ror		a13, a13, #24
	tbl		v14.16b, {v14.16b}, v31.16b
	ror		a14, a14, #24

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		a10, a10, a15
	add		v11.4s, v11.4s, v12.4s
	add		a11, a11, a12
	add		v8.4s, v8.4s, v13.4s
	add		a8, a8, a13
	add		v9.4s, v9.4s, v14.4s
	add		a9, a9, a14

	eor		v16.16b, v5.16b, v10.16b
	eor		a5, a5, a10
	eor		v17.16b, v6.16b, v11.16b
	eor		a6, a6, a11
	eor		v18.16b, v7.16b, v8.16b
	eor		a7, a7, a8
	eor		v19.16b, v4.16b, v9.16b
	eor		a4, a4, a9

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7

@@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	ror		a5, a5, #25
	sri		v6.4s, v17.4s, #25
	ror		a6, a6, #25
	sri		v7.4s, v18.4s, #25
	ror		a7, a7, #25
	sri		v4.4s, v19.4s, #25
	ror		a4, a4, #25

	subs		x3, x3, #1
	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16

@@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
	// x2[0-3] += s0[2]
	// x3[0-3] += s0[3]
	add		v0.4s, v0.4s, v16.4s
	mov		w6, v16.s[0]
	mov		w7, v17.s[0]
	add		v1.4s, v1.4s, v17.4s
	mov		w8, v18.s[0]
	mov		w9, v19.s[0]
	add		v2.4s, v2.4s, v18.4s
	add		a0, a0, w6
	add		a1, a1, w7
	add		v3.4s, v3.4s, v19.4s
	add		a2, a2, w8
	add		a3, a3, w9
CPU_BE(	  rev		a0, a0	)
CPU_BE(	  rev		a1, a1	)
CPU_BE(	  rev		a2, a2	)
CPU_BE(	  rev		a3, a3	)

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

@@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
	// x6[0-3] += s1[2]
	// x7[0-3] += s1[3]
	add		v4.4s, v4.4s, v20.4s
	mov		w6, v20.s[0]
	mov		w7, v21.s[0]
	add		v5.4s, v5.4s, v21.4s
	mov		w8, v22.s[0]
	mov		w9, v23.s[0]
	add		v6.4s, v6.4s, v22.4s
	add		a4, a4, w6
	add		a5, a5, w7
	add		v7.4s, v7.4s, v23.4s
	add		a6, a6, w8
	add		a7, a7, w9
CPU_BE(	  rev		a4, a4	)
CPU_BE(	  rev		a5, a5	)
CPU_BE(	  rev		a6, a6	)
CPU_BE(	  rev		a7, a7	)

	// x8[0-3] += s2[0]
	// x9[0-3] += s2[1]
	// x10[0-3] += s2[2]
	// x11[0-3] += s2[3]
	add		v8.4s, v8.4s, v24.4s
	mov		w6, v24.s[0]
	mov		w7, v25.s[0]
	add		v9.4s, v9.4s, v25.4s
	mov		w8, v26.s[0]
	mov		w9, v27.s[0]
	add		v10.4s, v10.4s, v26.4s
	add		a8, a8, w6
	add		a9, a9, w7
	add		v11.4s, v11.4s, v27.4s
	add		a10, a10, w8
	add		a11, a11, w9
CPU_BE(	  rev		a8, a8	)
CPU_BE(	  rev		a9, a9	)
CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	// x12[0-3] += s3[0]
	// x13[0-3] += s3[1]
	// x14[0-3] += s3[2]
	// x15[0-3] += s3[3]
	add		v12.4s, v12.4s, v28.4s
	mov		w6, v28.s[0]
	mov		w7, v29.s[0]
	add		v13.4s, v13.4s, v29.4s
	mov		w8, v30.s[0]
	mov		w9, v31.s[0]
	add		v14.4s, v14.4s, v30.4s
	add		a12, a12, w6
	add		a13, a13, w7
	add		v15.4s, v15.4s, v31.4s
	add		a14, a14, w8
	add		a15, a15, w9
CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	eor		a0, a0, w6
	zip2		v17.4s, v0.4s, v1.4s
	eor		a1, a1, w7
	zip1		v18.4s, v2.4s, v3.4s
	eor		a2, a2, w8
	zip2		v19.4s, v2.4s, v3.4s
	eor		a3, a3, w9
	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	eor		a4, a4, w6
	zip2		v21.4s, v4.4s, v5.4s
	eor		a5, a5, w7
	zip1		v22.4s, v6.4s, v7.4s
	eor		a6, a6, w8
	zip2		v23.4s, v6.4s, v7.4s
	eor		a7, a7, w9
	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	eor		a8, a8, w6
	zip2		v25.4s, v8.4s, v9.4s
	eor		a9, a9, w7
	zip1		v26.4s, v10.4s, v11.4s
	eor		a10, a10, w8
	zip2		v27.4s, v10.4s, v11.4s
	eor		a11, a11, w9
	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	eor		a12, a12, w6
	zip2		v29.4s, v12.4s, v13.4s
	eor		a13, a13, w7
	zip1		v30.4s, v14.4s, v15.4s
	eor		a14, a14, w8
	zip2		v31.4s, v14.4s, v15.4s
	eor		a15, a15, w9

	mov		x3, #64
	subs		x5, x4, #128
	add		x6, x5, x2
	csel		x3, x3, xzr, ge
	csel		x2, x2, x6, ge

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64
	stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	subs		x6, x4, #192
	ccmp		x3, xzr, #4, lt
	add		x7, x6, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x7, eq

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64
	stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	subs		x7, x4, #256
	ccmp		x3, xzr, #4, lt
	add		x8, x7, x2
	csel		x3, x3, xzr, eq
	csel		x2, x2, x8, eq

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64
	stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	subs		x8, x4, #320
	ccmp		x3, xzr, #4, lt
	add		x9, x8, x2
	csel		x2, x2, x9, eq

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	tbnz		x5, #63, 0f
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	cbz		x5, .Lout

	tbnz		x6, #63, 1f
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	cbz		x6, .Lout

	tbnz		x7, #63, 2f
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	cbz		x7, .Lout

	tbnz		x8, #63, 3f
	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

.Lout:	frame_pop
	ret
ENDPROC(chacha20_4block_xor_neon)

CTRINC:	.word		0, 1, 2, 3
	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	sub		x2, x1, #64
	add		x1, x1, x5
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	ld1		{v9.16b}, [x11]
	movi		v10.16b, #16
	add		x1, x1, x6
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]
	b		.Lout

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x7
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]
	b		.Lout

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	ld1		{v5.16b}, [x11]
	movi		v6.16b, #16
	add		x1, x1, x8
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
	b		.Lout
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
.Lpermute:
	.set		.Li, 0
	.rept		192
	.byte		(.Li - 64)
	.set		.Li, .Li + 1
	.endr

CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f

@@ -1,8 +1,8 @@
/*
 * ARM NEON accelerated ChaCha and XChaCha stream ciphers,
 * ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as

@@ -20,8 +20,9 @@
 */

#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

@@ -29,40 +30,78 @@
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
				      int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
				       int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
				       int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);

static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
			  unsigned int bytes, int nrounds)
{
	u8 buf[CHACHA_BLOCK_SIZE];
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);

	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		chacha_4block_xor_neon(state, dst, src, nrounds);
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}
	while (bytes >= CHACHA_BLOCK_SIZE) {
		chacha_block_xor_neon(state, dst, src, nrounds);
		bytes -= CHACHA_BLOCK_SIZE;
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes) {
		memcpy(buf, src, bytes);
		chacha_block_xor_neon(state, buf, buf, nrounds);
		memcpy(dst, buf, bytes);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
			  int bytes, int nrounds)
{
	while (bytes > 0) {
		int l = min(bytes, CHACHA_BLOCK_SIZE * 5);

		if (l <= CHACHA_BLOCK_SIZE) {
			u8 buf[CHACHA_BLOCK_SIZE];

			memcpy(buf, src, l);
			chacha_block_xor_neon(state, buf, buf, nrounds);
			memcpy(dst, buf, l);
			state[12] += 1;
			break;
		}
		chacha_4block_xor_neon(state, dst, src, nrounds, l);
		bytes -= l;
		src += l;
		dst += l;
		state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
	}
}
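
The rewritten walk hands the NEON routine up to five blocks at a time and lets it handle a ragged tail itself; only a final sub-block is bounced through a stack buffer. A userspace sketch of the same control flow, with a hypothetical xor_blocks() in place of the NEON kernels:

#include <stdint.h>
#include <string.h>

#define BLK 64

/* Hypothetical keystream-XOR primitive: always sees whole buffers. */
void xor_blocks(uint32_t *state, uint8_t *dst, const uint8_t *src, int nbytes);

static void crypt_walk(uint32_t *state, uint8_t *dst, const uint8_t *src,
		       int bytes)
{
	while (bytes > 0) {
		int l = bytes < 5 * BLK ? bytes : 5 * BLK;

		if (l <= BLK) {
			uint8_t buf[BLK];	/* round the tail up via a bounce buffer */

			memcpy(buf, src, l);
			xor_blocks(state, buf, buf, BLK);
			memcpy(dst, buf, l);
			state[12] += 1;		/* one counter block consumed */
			break;
		}
		xor_blocks(state, dst, src, l);
		state[12] += (l + BLK - 1) / BLK;
		bytes -= l;
		src += l;
		dst += l;
	}
}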

void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
	if (!static_branch_likely(&have_neon) || !may_use_simd()) {
		hchacha_block_generic(state, stream, nrounds);
	} else {
		kernel_neon_begin();
		hchacha_block_neon(state, stream, nrounds);
		kernel_neon_end();
	}
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
		       int nrounds)
{
	if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
	    !may_use_simd())
		return chacha_crypt_generic(state, dst, src, bytes, nrounds);

	do {
		unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

		kernel_neon_begin();
		chacha_doneon(state, dst, src, todo, nrounds);
		kernel_neon_end();

		bytes -= todo;
		src += todo;
		dst += todo;
	} while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);
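
These _arch hooks back the ChaCha library interface introduced by this series; a hedged sketch of how a caller is expected to drive it, assuming the chacha_init()/chacha_crypt() wrappers from <crypto/chacha.h> that dispatch to the _arch variants above (key and nonce handling elided):

#include <crypto/chacha.h>

static void encrypt_buf(u8 *dst, const u8 *src, unsigned int len,
			const u32 key[8], const u8 iv[16])
{
	u32 state[16];

	chacha_init(state, key, iv);		/* constants, key, counter, nonce */
	chacha_crypt(state, dst, src, len, 20);	/* 20 rounds = ChaCha20 */
}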

static int chacha_neon_stream_xor(struct skcipher_request *req,
				  struct chacha_ctx *ctx, u8 *iv)
				  const struct chacha_ctx *ctx, const u8 *iv)
{
	struct skcipher_walk walk;
	u32 state[16];

@@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,

	err = skcipher_walk_virt(&walk, req, false);

	crypto_chacha_init(state, ctx, iv);
	chacha_init_generic(state, ctx->key, iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);
			nbytes = rounddown(nbytes, walk.stride);

		kernel_neon_begin();
		chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
			      nbytes, ctx->nrounds);
		kernel_neon_end();
		if (!static_branch_likely(&have_neon) ||
		    !may_use_simd()) {
			chacha_crypt_generic(state, walk.dst.virt.addr,
					     walk.src.virt.addr, nbytes,
					     ctx->nrounds);
		} else {
			kernel_neon_begin();
			chacha_doneon(state, walk.dst.virt.addr,
				      walk.src.virt.addr, nbytes, ctx->nrounds);
			kernel_neon_end();
		}
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

@@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
		return crypto_chacha_crypt(req);

	return chacha_neon_stream_xor(req, ctx, req->iv);
}

@@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
	u32 state[16];
	u8 real_iv[16];

	if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
		return crypto_xchacha_crypt(req);

	crypto_chacha_init(state, ctx, req->iv);

	kernel_neon_begin();
	hchacha_block_neon(state, subctx.key, ctx->nrounds);
	kernel_neon_end();
	chacha_init_generic(state, ctx->key, req->iv);
	hchacha_block_arch(state, subctx.key, ctx->nrounds);
	subctx.nrounds = ctx->nrounds;

	memcpy(&real_iv[0], req->iv + 24, 8);

@@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
	.max_keysize		= CHACHA_KEY_SIZE,
	.ivsize			= CHACHA_IV_SIZE,
	.chunksize		= CHACHA_BLOCK_SIZE,
	.walksize		= 4 * CHACHA_BLOCK_SIZE,
	.setkey			= crypto_chacha20_setkey,
	.walksize		= 5 * CHACHA_BLOCK_SIZE,
	.setkey			= chacha20_setkey,
	.encrypt		= chacha_neon,
	.decrypt		= chacha_neon,
}, {

@@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
	.max_keysize		= CHACHA_KEY_SIZE,
	.ivsize			= XCHACHA_IV_SIZE,
	.chunksize		= CHACHA_BLOCK_SIZE,
	.walksize		= 4 * CHACHA_BLOCK_SIZE,
	.setkey			= crypto_chacha20_setkey,
	.walksize		= 5 * CHACHA_BLOCK_SIZE,
	.setkey			= chacha20_setkey,
	.encrypt		= xchacha_neon,
	.decrypt		= xchacha_neon,
}, {

@@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
	.max_keysize		= CHACHA_KEY_SIZE,
	.ivsize			= XCHACHA_IV_SIZE,
	.chunksize		= CHACHA_BLOCK_SIZE,
	.walksize		= 4 * CHACHA_BLOCK_SIZE,
	.setkey			= crypto_chacha12_setkey,
	.walksize		= 5 * CHACHA_BLOCK_SIZE,
	.setkey			= chacha12_setkey,
	.encrypt		= xchacha_neon,
	.decrypt		= xchacha_neon,
}

@@ -176,15 +213,19 @@ static struct skcipher_alg algs[] = {

static int __init chacha_simd_mod_init(void)
{
	if (!(elf_hwcap & HWCAP_NEON))
		return -ENODEV;
	if (!(elf_hwcap & HWCAP_ASIMD))
		return 0;

	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
	static_branch_enable(&have_neon);

	return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
	       crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_simd_mod_fini(void)
{
	crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
	if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
@@ -1,133 +0,0 @@
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);

static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	u8 buf[CHACHA_BLOCK_SIZE];

	while (bytes >= CHACHA_BLOCK_SIZE * 4) {
		kernel_neon_begin();
		chacha20_4block_xor_neon(state, dst, src);
		kernel_neon_end();
		bytes -= CHACHA_BLOCK_SIZE * 4;
		src += CHACHA_BLOCK_SIZE * 4;
		dst += CHACHA_BLOCK_SIZE * 4;
		state[12] += 4;
	}

	if (!bytes)
		return;

	kernel_neon_begin();
	while (bytes >= CHACHA_BLOCK_SIZE) {
		chacha20_block_xor_neon(state, dst, src);
		bytes -= CHACHA_BLOCK_SIZE;
		src += CHACHA_BLOCK_SIZE;
		dst += CHACHA_BLOCK_SIZE;
		state[12]++;
	}
	if (bytes) {
		memcpy(buf, src, bytes);
		chacha20_block_xor_neon(state, buf, buf);
		memcpy(dst, buf, bytes);
	}
	kernel_neon_end();
}

static int chacha20_neon(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct skcipher_walk walk;
	u32 state[16];
	int err;

	if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
		return crypto_chacha_crypt(req);

	err = skcipher_walk_virt(&walk, req, false);

	crypto_chacha_init(state, ctx, walk.iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
				nbytes);
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}

static struct skcipher_alg alg = {
	.base.cra_name		= "chacha20",
	.base.cra_driver_name	= "chacha20-neon",
	.base.cra_priority	= 300,
	.base.cra_blocksize	= 1,
	.base.cra_ctxsize	= sizeof(struct chacha_ctx),
	.base.cra_module	= THIS_MODULE,

	.min_keysize		= CHACHA_KEY_SIZE,
	.max_keysize		= CHACHA_KEY_SIZE,
	.ivsize			= CHACHA_IV_SIZE,
	.chunksize		= CHACHA_BLOCK_SIZE,
	.walksize		= 4 * CHACHA_BLOCK_SIZE,
	.setkey			= crypto_chacha20_setkey,
	.encrypt		= chacha20_neon,
	.decrypt		= chacha20_neon,
};

static int __init chacha20_simd_mod_init(void)
{
	if (!(elf_hwcap & HWCAP_ASIMD))
		return -ENODEV;

	return crypto_register_skcipher(&alg);
}

static void __exit chacha20_simd_mod_fini(void)
{
	crypto_unregister_skcipher(&alg);
}

module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);

MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");

arch/arm64/crypto/poly1305-armv8.pl (new file, 913 lines)

@@ -0,0 +1,913 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# This module implements Poly1305 hash for ARMv8.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#		IALU/gcc-4.9	NEON
#
# Apple A7	1.86/+5%	0.72
# Cortex-A53	2.69/+58%	1.47
# Cortex-A57	2.70/+7%	1.14
# Denver	1.64/+50%	1.18(*)
# X-Gene	2.13/+68%	2.27
# Mongoose	1.77/+75%	1.12
# Kryo		2.70/+55%	1.13
# ThunderX2	1.17/+95%	1.36
#
# (*)	estimate based on resources availability is less than 1.0,
#	i.e. measured result is worse than expected, presumably binary
#	translator is not almighty;

$flavour=shift;
$output=shift;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armcap_P
#endif

.text

// forward "declarations" are required for Apple
.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	$inp,xzr
	stp	xzr,xzr,[$ctx]		// zero hash value
	stp	xzr,xzr,[$ctx,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
#endif

	ldp	$r0,$r1,[$inp]		// load key
	mov	$s1,#0xfffffffc0fffffff
	movk	$s1,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	$r0,$r0			// flip bytes
	rev	$r1,$r1
#endif
	and	$r0,$r0,$s1		// &=0ffffffc0fffffff
	and	$s1,$s1,#-4
	and	$r1,$r1,$s1		// &=0ffffffc0ffffffc
	mov	w#$s1,#-1
	stp	$r0,$r1,[$ctx,#32]	// save key value
	str	w#$s1,[$ctx,#48]	// impossible key power value

#ifndef	__KERNEL__
	tst	w17,#ARMV7_NEON

	adr	$d0,.Lpoly1305_blocks
	adr	$r0,.Lpoly1305_blocks_neon
	adr	$d1,.Lpoly1305_emit

	csel	$d0,$d0,$r0,eq

# ifdef	__ILP32__
	stp	w#$d0,w#$d1,[$len]
# else
	stp	$d0,$d1,[$len]
# endif
#endif
	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	$len,$len,#-16
	b.eq	.Lno_data

	ldp	$h0,$h1,[$ctx]		// load hash value
	ldp	$h2,x17,[$ctx,#16]	// [along with is_base2_26]
	ldp	$r0,$r1,[$ctx,#32]	// load key value

#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	cmp	x17,#0			// is_base2_26?
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

.Loop:
	ldp	$t0,$t1,[$inp],#16	// load input
	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
|
||||
|
||||
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
.Lpoly1305_emit:
|
||||
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
|
||||
ldp $t0,$t1,[$nonce] // load nonce
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr $d0,$h0,#32
|
||||
mov w#$d1,w#$h0
|
||||
lsr $d2,$h1,#32
|
||||
mov w15,w#$h1
|
||||
lsr x16,$h2,#32
|
||||
#else
|
||||
mov w#$d0,w#$h0
|
||||
lsr $d1,$h0,#32
|
||||
mov w#$d2,w#$h1
|
||||
lsr x15,$h1,#32
|
||||
mov w16,w#$h2
|
||||
#endif
|
||||
|
||||
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $d1,$d2,#12
|
||||
adds $d0,$d0,$d2,lsl#52
|
||||
add $d1,$d1,x15,lsl#14
|
||||
adc $d1,$d1,xzr
|
||||
lsr $d2,x16,#24
|
||||
adds $d1,$d1,x16,lsl#40
|
||||
adc $d2,$d2,xzr
|
||||
|
||||
cmp $r0,#0 // is_base2_26?
|
||||
csel $h0,$h0,$d0,eq // choose between radixes
|
||||
csel $h1,$h1,$d1,eq
|
||||
csel $h2,$h2,$d2,eq
|
||||
|
||||
adds $d0,$h0,#5 // compare to modulus
|
||||
adcs $d1,$h1,xzr
|
||||
adc $d2,$h2,xzr
|
||||
|
||||
tst $d2,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel $h0,$h0,$d0,eq
|
||||
csel $h1,$h1,$d1,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror $t0,$t0,#32 // flip nonce words
|
||||
ror $t1,$t1,#32
|
||||
#endif
|
||||
adds $h0,$h0,$t0 // accumulate nonce
|
||||
adc $h1,$h1,$t1
|
||||
#ifdef __AARCH64EB__
|
||||
rev $h0,$h0 // flip output bytes
|
||||
rev $h1,$h1
|
||||
#endif
|
||||
stp $h0,$h1,[$mac] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
___
|
||||
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
||||
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
||||
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
||||
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
||||
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
||||
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
||||
|
||||
my ($in2,$zeros)=("x16","x17");
|
||||
my $is_base2_26 = $zeros; # borrow
|
||||
|
||||
$code.=<<___;
|
||||
.type poly1305_mult,%function
|
||||
.align 5
|
||||
poly1305_mult:
|
||||
mul $d0,$h0,$r0 // h0*r0
|
||||
umulh $d1,$h0,$r0
|
||||
|
||||
mul $t0,$h1,$s1 // h1*5*r1
|
||||
umulh $t1,$h1,$s1
|
||||
|
||||
adds $d0,$d0,$t0
|
||||
mul $t0,$h0,$r1 // h0*r1
|
||||
adc $d1,$d1,$t1
|
||||
umulh $d2,$h0,$r1
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h1,$r0 // h1*r0
|
||||
adc $d2,$d2,xzr
|
||||
umulh $t1,$h1,$r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
mul $t0,$h2,$s1 // h2*5*r1
|
||||
adc $d2,$d2,$t1
|
||||
mul $t1,$h2,$r0 // h2*r0
|
||||
|
||||
adds $d1,$d1,$t0
|
||||
adc $d2,$d2,$t1
|
||||
|
||||
and $t0,$d2,#-4 // final reduction
|
||||
and $h2,$d2,#3
|
||||
add $t0,$t0,$d2,lsr#2
|
||||
adds $h0,$d0,$t0
|
||||
adcs $h1,$d1,xzr
|
||||
adc $h2,$h2,xzr
|
||||
|
||||
ret
|
||||
.size poly1305_mult,.-poly1305_mult
|
||||
|
||||
.type poly1305_splat,%function
|
||||
.align 4
|
||||
poly1305_splat:
|
||||
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,$h0,#26,#26
|
||||
extr x14,$h1,$h0,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,$h1,#14,#26
|
||||
extr x16,$h2,$h1,#40
|
||||
|
||||
str w12,[$ctx,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[$ctx,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[$ctx,#16*2] // s1
|
||||
str w14,[$ctx,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[$ctx,#16*4] // s2
|
||||
str w15,[$ctx,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[$ctx,#16*6] // s3
|
||||
str w16,[$ctx,#16*7] // r4
|
||||
str w15,[$ctx,#16*8] // s4
|
||||
|
||||
ret
|
||||
.size poly1305_splat,.-poly1305_splat
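As a plain-C illustration of the limb split poly1305_splat performs (a reader's sketch under the same bit layout; the helper name is hypothetical):

    #include <stdint.h>

    /* Split a 130-bit value held in h0/h1/h2 (base 2^64) into five 26-bit
     * limbs, mirroring the and/ubfx/extr sequence above. */
    static void radix64_to_radix26(uint32_t r[5], uint64_t h0, uint64_t h1,
                                   uint64_t h2)
    {
        r[0] = h0 & 0x03ffffff;                        /* bits   0..25  */
        r[1] = (h0 >> 26) & 0x03ffffff;                /* bits  26..51  */
        r[2] = ((h0 >> 52) | (h1 << 12)) & 0x03ffffff; /* bits  52..77  */
        r[3] = (h1 >> 14) & 0x03ffffff;                /* bits  78..103 */
        r[4] = (uint32_t)((h1 >> 40) | (h2 << 24));    /* bits 104..129 */
    }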
|
||||
|
||||
#ifdef __KERNEL__
|
||||
.globl poly1305_blocks_neon
|
||||
#endif
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
.Lpoly1305_blocks_neon:
|
||||
ldr $is_base2_26,[$ctx,#24]
|
||||
cmp $len,#128
|
||||
b.lo .Lpoly1305_blocks
|
||||
|
||||
.inst 0xd503233f // paciasp
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
cbz $is_base2_26,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||
ldp w12,w13,[$ctx,#8]
|
||||
ldr w14,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr $h1,x12,#12
|
||||
adds $h0,$h0,x12,lsl#52
|
||||
add $h1,$h1,x13,lsl#14
|
||||
adc $h1,$h1,xzr
|
||||
lsr $h2,x14,#24
|
||||
adds $h1,$h1,x14,lsl#40
|
||||
adc $d2,$h2,xzr // can be partially reduced...
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
b .Leven_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||
|
||||
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
||||
ldr $h2,[$ctx,#16]
|
||||
|
||||
tst $len,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp $d0,$d1,[$inp],#16 // load input
|
||||
sub $len,$len,#16
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev $d0,$d0
|
||||
rev $d1,$d1
|
||||
#endif
|
||||
adds $h0,$h0,$d0 // accumulate input
|
||||
adcs $h1,$h1,$d1
|
||||
adc $h2,$h2,$padbit
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
ldr w17,[$ctx,#48] // first table element
|
||||
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,$h0,#26,#26
|
||||
extr x12,$h1,$h0,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,$h1,#14,#26
|
||||
extr x14,$h2,$h1,#40
|
||||
|
||||
cmp w17,#-1 // is value impossible?
|
||||
b.ne .Leven_neon
|
||||
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov $h0,$r0 // r^1
|
||||
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov $h1,$r1
|
||||
mov $h2,xzr
|
||||
add $ctx,$ctx,#48+12
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^2
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^3
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^4
|
||||
sub $ctx,$ctx,#4
|
||||
bl poly1305_splat
|
||||
sub $ctx,$ctx,#48 // restore original $ctx
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
fmov ${H0},x10
|
||||
fmov ${H1},x11
|
||||
fmov ${H2},x12
|
||||
fmov ${H3},x13
|
||||
fmov ${H4},x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[$inp,#32] // inp[2:3]
|
||||
subs $len,$len,#64
|
||||
ldp x9,x13,[$inp,#48]
|
||||
add $in2,$inp,#96
|
||||
adr $zeros,.Lzeros
|
||||
|
||||
lsl $padbit,$padbit,#24
|
||||
add x15,$ctx,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN23_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN23_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov $IN23_2,x8
|
||||
fmov $IN23_3,x10
|
||||
fmov $IN23_4,x12
|
||||
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
ldp x9,x13,[$inp],#48
|
||||
|
||||
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
||||
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
||||
ld1 {$S4},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov $IN01_0,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,$padbit,x12,lsr#40
|
||||
add x13,$padbit,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov $IN01_1,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi $MASK.2d,#-1
|
||||
fmov $IN01_2,x8
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
ushr $MASK.2d,$MASK.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// \___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// \___________________/ \____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs $len,$len,#64
|
||||
umull $ACC4,$IN23_0,${R4}[2]
|
||||
csel $in2,$zeros,$in2,lo
|
||||
umull $ACC3,$IN23_0,${R3}[2]
|
||||
umull $ACC2,$IN23_0,${R2}[2]
|
||||
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||
umull $ACC1,$IN23_0,${R1}[2]
|
||||
ldp x9,x13,[$in2],#48
|
||||
umull $ACC0,$IN23_0,${R0}[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal $ACC4,$IN23_1,${R3}[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC3,$IN23_1,${R2}[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC2,$IN23_1,${R1}[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN23_1,${R0}[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal $ACC0,$IN23_1,${S4}[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal $ACC4,$IN23_2,${R2}[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC3,$IN23_2,${R1}[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC2,$IN23_2,${R0}[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC1,$IN23_2,${S4}[2]
|
||||
fmov $IN23_0,x4
|
||||
umlal $ACC0,$IN23_2,${S3}[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal $ACC4,$IN23_3,${R1}[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN23_3,${R0}[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC2,$IN23_3,${S4}[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC1,$IN23_3,${S3}[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC0,$IN23_3,${S2}[2]
|
||||
fmov $IN23_1,x6
|
||||
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
add x12,$padbit,x12,lsr#40
|
||||
umlal $ACC4,$IN23_4,${R0}[2]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC3,$IN23_4,${S4}[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC2,$IN23_4,${S3}[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN23_4,${S2}[2]
|
||||
fmov $IN23_2,x8
|
||||
umlal $ACC0,$IN23_4,${S1}[2]
|
||||
fmov $IN23_3,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
fmov $IN23_4,x12
|
||||
umlal $ACC3,$IN01_2,${R1}[0]
|
||||
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||
umlal $ACC0,$IN01_2,${S3}[0]
|
||||
ldp x9,x13,[$inp],#48
|
||||
umlal $ACC4,$IN01_2,${R2}[0]
|
||||
umlal $ACC1,$IN01_2,${S4}[0]
|
||||
umlal $ACC2,$IN01_2,${R0}[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}[0]
|
||||
umlal $ACC4,$IN01_0,${R4}[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal $ACC2,$IN01_0,${R2}[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal $ACC0,$IN01_0,${R0}[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal $ACC1,$IN01_0,${R1}[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal $ACC3,$IN01_1,${R2}[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal $ACC4,$IN01_1,${R3}[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal $ACC0,$IN01_1,${S4}[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal $ACC2,$IN01_1,${R1}[0]
|
||||
fmov $IN01_0,x4
|
||||
umlal $ACC1,$IN01_1,${R0}[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal $ACC3,$IN01_3,${R0}[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal $ACC0,$IN01_3,${S2}[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal $ACC4,$IN01_3,${R1}[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal $ACC1,$IN01_3,${S3}[0]
|
||||
fmov $IN01_1,x6
|
||||
umlal $ACC2,$IN01_3,${S4}[0]
|
||||
add x12,$padbit,x12,lsr#40
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}[0]
|
||||
add x13,$padbit,x13,lsr#40
|
||||
umlal $ACC0,$IN01_4,${S1}[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal $ACC4,$IN01_4,${R0}[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal $ACC1,$IN01_4,${S2}[0]
|
||||
fmov $IN01_2,x8
|
||||
umlal $ACC2,$IN01_4,${S3}[0]
|
||||
fmov $IN01_3,x10
|
||||
fmov $IN01_4,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
xtn $H3,$ACC3
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
xtn $H4,$ACC4
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
xtn $H1,$ACC1
|
||||
bic $H4,#0xfc,lsl#24
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
shrn $T1.2s,$ACC2,#26
|
||||
xtn $H2,$ACC2
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
bic $H1,#0xfc,lsl#24
|
||||
add $H3,$H3,$T1.2s // h2 -> h3
|
||||
bic $H2,#0xfc,lsl#24
|
||||
|
||||
shrn $T0.2s,$ACC0,#26
|
||||
xtn $H0,$ACC0
|
||||
ushr $T1.2s,$H3,#26
|
||||
bic $H3,#0xfc,lsl#24
|
||||
bic $H0,#0xfc,lsl#24
|
||||
add $H1,$H1,$T0.2s // h0 -> h1
|
||||
add $H4,$H4,$T1.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup $IN23_2,${IN23_2}[0]
|
||||
add $IN01_2,$IN01_2,$H2
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds $len,$len,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup $IN23_2,${IN01_2}[0]
|
||||
add $IN23_0,$IN01_0,$H0
|
||||
add $IN23_3,$IN01_3,$H3
|
||||
add $IN23_1,$IN01_1,$H1
|
||||
add $IN23_4,$IN01_4,$H4
|
||||
|
||||
.Long_tail:
|
||||
dup $IN23_0,${IN23_0}[0]
|
||||
umull2 $ACC0,$IN23_2,${S3}
|
||||
umull2 $ACC3,$IN23_2,${R1}
|
||||
umull2 $ACC4,$IN23_2,${R2}
|
||||
umull2 $ACC2,$IN23_2,${R0}
|
||||
umull2 $ACC1,$IN23_2,${S4}
|
||||
|
||||
dup $IN23_1,${IN23_1}[0]
|
||||
umlal2 $ACC0,$IN23_0,${R0}
|
||||
umlal2 $ACC2,$IN23_0,${R2}
|
||||
umlal2 $ACC3,$IN23_0,${R3}
|
||||
umlal2 $ACC4,$IN23_0,${R4}
|
||||
umlal2 $ACC1,$IN23_0,${R1}
|
||||
|
||||
dup $IN23_3,${IN23_3}[0]
|
||||
umlal2 $ACC0,$IN23_1,${S4}
|
||||
umlal2 $ACC3,$IN23_1,${R2}
|
||||
umlal2 $ACC2,$IN23_1,${R1}
|
||||
umlal2 $ACC4,$IN23_1,${R3}
|
||||
umlal2 $ACC1,$IN23_1,${R0}
|
||||
|
||||
dup $IN23_4,${IN23_4}[0]
|
||||
umlal2 $ACC3,$IN23_3,${R0}
|
||||
umlal2 $ACC4,$IN23_3,${R1}
|
||||
umlal2 $ACC0,$IN23_3,${S2}
|
||||
umlal2 $ACC1,$IN23_3,${S3}
|
||||
umlal2 $ACC2,$IN23_3,${S4}
|
||||
|
||||
umlal2 $ACC3,$IN23_4,${S4}
|
||||
umlal2 $ACC0,$IN23_4,${S1}
|
||||
umlal2 $ACC4,$IN23_4,${R0}
|
||||
umlal2 $ACC1,$IN23_4,${S2}
|
||||
umlal2 $ACC2,$IN23_4,${S3}
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add $IN01_0,$IN01_0,$H0
|
||||
umlal $ACC3,$IN01_2,${R1}
|
||||
umlal $ACC0,$IN01_2,${S3}
|
||||
umlal $ACC4,$IN01_2,${R2}
|
||||
umlal $ACC1,$IN01_2,${S4}
|
||||
umlal $ACC2,$IN01_2,${R0}
|
||||
|
||||
add $IN01_1,$IN01_1,$H1
|
||||
umlal $ACC3,$IN01_0,${R3}
|
||||
umlal $ACC0,$IN01_0,${R0}
|
||||
umlal $ACC4,$IN01_0,${R4}
|
||||
umlal $ACC1,$IN01_0,${R1}
|
||||
umlal $ACC2,$IN01_0,${R2}
|
||||
|
||||
add $IN01_3,$IN01_3,$H3
|
||||
umlal $ACC3,$IN01_1,${R2}
|
||||
umlal $ACC0,$IN01_1,${S4}
|
||||
umlal $ACC4,$IN01_1,${R3}
|
||||
umlal $ACC1,$IN01_1,${R0}
|
||||
umlal $ACC2,$IN01_1,${R1}
|
||||
|
||||
add $IN01_4,$IN01_4,$H4
|
||||
umlal $ACC3,$IN01_3,${R0}
|
||||
umlal $ACC0,$IN01_3,${S2}
|
||||
umlal $ACC4,$IN01_3,${R1}
|
||||
umlal $ACC1,$IN01_3,${S3}
|
||||
umlal $ACC2,$IN01_3,${S4}
|
||||
|
||||
umlal $ACC3,$IN01_4,${S4}
|
||||
umlal $ACC0,$IN01_4,${S1}
|
||||
umlal $ACC4,$IN01_4,${R0}
|
||||
umlal $ACC1,$IN01_4,${S2}
|
||||
umlal $ACC2,$IN01_4,${S3}
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp $ACC3,$ACC3,$ACC3
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp $ACC0,$ACC0,$ACC0
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp $ACC4,$ACC4,$ACC4
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp $ACC1,$ACC1,$ACC1
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp $ACC2,$ACC2,$ACC2
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr $T0.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
ushr $T1.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
|
||||
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||
|
||||
ushr $T0.2d,$ACC4,#26
|
||||
and $ACC4,$ACC4,$MASK.2d
|
||||
ushr $T1.2d,$ACC1,#26
|
||||
and $ACC1,$ACC1,$MASK.2d
|
||||
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||
|
||||
add $ACC0,$ACC0,$T0.2d
|
||||
shl $T0.2d,$T0.2d,#2
|
||||
ushr $T1.2d,$ACC2,#26
|
||||
and $ACC2,$ACC2,$MASK.2d
|
||||
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
||||
|
||||
ushr $T0.2d,$ACC0,#26
|
||||
and $ACC0,$ACC0,$MASK.2d
|
||||
ushr $T1.2d,$ACC3,#26
|
||||
and $ACC3,$ACC3,$MASK.2d
|
||||
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
||||
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
||||
mov x4,#1
|
||||
st1 {$ACC4}[0],[$ctx]
|
||||
str x4,[$ctx,#8] // set is_base2_26
|
||||
|
||||
ldr x29,[sp],#80
|
||||
.inst 0xd50323bf // autiasp
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
|
||||
.align 2
|
||||
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
||||
#endif
|
||||
___
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
|
||||
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
|
||||
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
|
||||
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
|
||||
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
|
||||
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
|
||||
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
|
||||
|
||||
s/\.[124]([sd])\[/.$1\[/;
|
||||
s/w#x([0-9]+)/w$1/g;
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT;
|
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
|
@ -0,0 +1,835 @@
|
|||
#ifndef __KERNEL__
|
||||
# include "arm_arch.h"
|
||||
.extern OPENSSL_armcap_P
|
||||
#endif
|
||||
|
||||
.text
|
||||
|
||||
// forward "declarations" are required for Apple
|
||||
.globl poly1305_blocks
|
||||
.globl poly1305_emit
|
||||
|
||||
.globl poly1305_init
|
||||
.type poly1305_init,%function
|
||||
.align 5
|
||||
poly1305_init:
|
||||
cmp x1,xzr
|
||||
stp xzr,xzr,[x0] // zero hash value
|
||||
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
|
||||
|
||||
csel x0,xzr,x0,eq
|
||||
b.eq .Lno_key
|
||||
|
||||
#ifndef __KERNEL__
|
||||
adrp x17,OPENSSL_armcap_P
|
||||
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||
#endif
|
||||
|
||||
ldp x7,x8,[x1] // load key
|
||||
mov x9,#0xfffffffc0fffffff
|
||||
movk x9,#0x0fff,lsl#48
|
||||
#ifdef __AARCH64EB__
|
||||
rev x7,x7 // flip bytes
|
||||
rev x8,x8
|
||||
#endif
|
||||
and x7,x7,x9 // &=0ffffffc0fffffff
|
||||
and x9,x9,#-4
|
||||
and x8,x8,x9 // &=0ffffffc0ffffffc
|
||||
mov w9,#-1
|
||||
stp x7,x8,[x0,#32] // save key value
|
||||
str w9,[x0,#48] // impossible key power value
|
||||
|
||||
#ifndef __KERNEL__
|
||||
tst w17,#ARMV7_NEON
|
||||
|
||||
adr x12,.Lpoly1305_blocks
|
||||
adr x7,.Lpoly1305_blocks_neon
|
||||
adr x13,.Lpoly1305_emit
|
||||
|
||||
csel x12,x12,x7,eq
|
||||
|
||||
# ifdef __ILP32__
|
||||
stp w12,w13,[x2]
|
||||
# else
|
||||
stp x12,x13,[x2]
|
||||
# endif
|
||||
#endif
|
||||
mov x0,#1
|
||||
.Lno_key:
|
||||
ret
|
||||
.size poly1305_init,.-poly1305_init
|
||||
|
||||
.type poly1305_blocks,%function
|
||||
.align 5
|
||||
poly1305_blocks:
|
||||
.Lpoly1305_blocks:
|
||||
ands x2,x2,#-16
|
||||
b.eq .Lno_data
|
||||
|
||||
ldp x4,x5,[x0] // load hash value
|
||||
ldp x6,x17,[x0,#16] // [along with is_base2_26]
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr x12,x4,#32
|
||||
mov w13,w4
|
||||
lsr x14,x5,#32
|
||||
mov w15,w5
|
||||
lsr x16,x6,#32
|
||||
#else
|
||||
mov w12,w4
|
||||
lsr x13,x4,#32
|
||||
mov w14,w5
|
||||
lsr x15,x5,#32
|
||||
mov w16,w6
|
||||
#endif
|
||||
|
||||
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x13,x14,#12
|
||||
adds x12,x12,x14,lsl#52
|
||||
add x13,x13,x15,lsl#14
|
||||
adc x13,x13,xzr
|
||||
lsr x14,x16,#24
|
||||
adds x13,x13,x16,lsl#40
|
||||
adc x14,x14,xzr
|
||||
|
||||
cmp x17,#0 // is_base2_26?
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
csel x4,x4,x12,eq // choose between radixes
|
||||
csel x5,x5,x13,eq
|
||||
csel x6,x6,x14,eq
|
||||
|
||||
.Loop:
|
||||
ldp x10,x11,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
#ifdef __AARCH64EB__
|
||||
rev x10,x10
|
||||
rev x11,x11
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate input
|
||||
adcs x5,x5,x11
|
||||
|
||||
mul x12,x4,x7 // h0*r0
|
||||
adc x6,x6,x3
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
cbnz x2,.Loop
|
||||
|
||||
stp x4,x5,[x0] // store hash value
|
||||
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
|
||||
|
||||
.Lno_data:
|
||||
ret
|
||||
.size poly1305_blocks,.-poly1305_blocks
|
||||
|
||||
.type poly1305_emit,%function
|
||||
.align 5
|
||||
poly1305_emit:
|
||||
.Lpoly1305_emit:
|
||||
ldp x4,x5,[x0] // load hash base 2^64
|
||||
ldp x6,x7,[x0,#16] // [along with is_base2_26]
|
||||
ldp x10,x11,[x2] // load nonce
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
lsr x12,x4,#32
|
||||
mov w13,w4
|
||||
lsr x14,x5,#32
|
||||
mov w15,w5
|
||||
lsr x16,x6,#32
|
||||
#else
|
||||
mov w12,w4
|
||||
lsr x13,x4,#32
|
||||
mov w14,w5
|
||||
lsr x15,x5,#32
|
||||
mov w16,w6
|
||||
#endif
|
||||
|
||||
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x13,x14,#12
|
||||
adds x12,x12,x14,lsl#52
|
||||
add x13,x13,x15,lsl#14
|
||||
adc x13,x13,xzr
|
||||
lsr x14,x16,#24
|
||||
adds x13,x13,x16,lsl#40
|
||||
adc x14,x14,xzr
|
||||
|
||||
cmp x7,#0 // is_base2_26?
|
||||
csel x4,x4,x12,eq // choose between radixes
|
||||
csel x5,x5,x13,eq
|
||||
csel x6,x6,x14,eq
|
||||
|
||||
adds x12,x4,#5 // compare to modulus
|
||||
adcs x13,x5,xzr
|
||||
adc x14,x6,xzr
|
||||
|
||||
tst x14,#-4 // see if it's carried/borrowed
|
||||
|
||||
csel x4,x4,x12,eq
|
||||
csel x5,x5,x13,eq
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
ror x10,x10,#32 // flip nonce words
|
||||
ror x11,x11,#32
|
||||
#endif
|
||||
adds x4,x4,x10 // accumulate nonce
|
||||
adc x5,x5,x11
|
||||
#ifdef __AARCH64EB__
|
||||
rev x4,x4 // flip output bytes
|
||||
rev x5,x5
|
||||
#endif
|
||||
stp x4,x5,[x1] // write result
|
||||
|
||||
ret
|
||||
.size poly1305_emit,.-poly1305_emit
|
||||
.type poly1305_mult,%function
|
||||
.align 5
|
||||
poly1305_mult:
|
||||
mul x12,x4,x7 // h0*r0
|
||||
umulh x13,x4,x7
|
||||
|
||||
mul x10,x5,x9 // h1*5*r1
|
||||
umulh x11,x5,x9
|
||||
|
||||
adds x12,x12,x10
|
||||
mul x10,x4,x8 // h0*r1
|
||||
adc x13,x13,x11
|
||||
umulh x14,x4,x8
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x5,x7 // h1*r0
|
||||
adc x14,x14,xzr
|
||||
umulh x11,x5,x7
|
||||
|
||||
adds x13,x13,x10
|
||||
mul x10,x6,x9 // h2*5*r1
|
||||
adc x14,x14,x11
|
||||
mul x11,x6,x7 // h2*r0
|
||||
|
||||
adds x13,x13,x10
|
||||
adc x14,x14,x11
|
||||
|
||||
and x10,x14,#-4 // final reduction
|
||||
and x6,x14,#3
|
||||
add x10,x10,x14,lsr#2
|
||||
adds x4,x12,x10
|
||||
adcs x5,x13,xzr
|
||||
adc x6,x6,xzr
|
||||
|
||||
ret
|
||||
.size poly1305_mult,.-poly1305_mult
|
||||
|
||||
.type poly1305_splat,%function
|
||||
.align 4
|
||||
poly1305_splat:
|
||||
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x13,x4,#26,#26
|
||||
extr x14,x5,x4,#52
|
||||
and x14,x14,#0x03ffffff
|
||||
ubfx x15,x5,#14,#26
|
||||
extr x16,x6,x5,#40
|
||||
|
||||
str w12,[x0,#16*0] // r0
|
||||
add w12,w13,w13,lsl#2 // r1*5
|
||||
str w13,[x0,#16*1] // r1
|
||||
add w13,w14,w14,lsl#2 // r2*5
|
||||
str w12,[x0,#16*2] // s1
|
||||
str w14,[x0,#16*3] // r2
|
||||
add w14,w15,w15,lsl#2 // r3*5
|
||||
str w13,[x0,#16*4] // s2
|
||||
str w15,[x0,#16*5] // r3
|
||||
add w15,w16,w16,lsl#2 // r4*5
|
||||
str w14,[x0,#16*6] // s3
|
||||
str w16,[x0,#16*7] // r4
|
||||
str w15,[x0,#16*8] // s4
|
||||
|
||||
ret
|
||||
.size poly1305_splat,.-poly1305_splat
|
||||
|
||||
#ifdef __KERNEL__
|
||||
.globl poly1305_blocks_neon
|
||||
#endif
|
||||
.type poly1305_blocks_neon,%function
|
||||
.align 5
|
||||
poly1305_blocks_neon:
|
||||
.Lpoly1305_blocks_neon:
|
||||
ldr x17,[x0,#24]
|
||||
cmp x2,#128
|
||||
b.lo .Lpoly1305_blocks
|
||||
|
||||
.inst 0xd503233f // paciasp
|
||||
stp x29,x30,[sp,#-80]!
|
||||
add x29,sp,#0
|
||||
|
||||
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||
stp d10,d11,[sp,#32]
|
||||
stp d12,d13,[sp,#48]
|
||||
stp d14,d15,[sp,#64]
|
||||
|
||||
cbz x17,.Lbase2_64_neon
|
||||
|
||||
ldp w10,w11,[x0] // load hash value base 2^26
|
||||
ldp w12,w13,[x0,#8]
|
||||
ldr w14,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Leven_neon
|
||||
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||
lsr x5,x12,#12
|
||||
adds x4,x4,x12,lsl#52
|
||||
add x5,x5,x13,lsl#14
|
||||
adc x5,x5,xzr
|
||||
lsr x6,x14,#24
|
||||
adds x5,x5,x14,lsl#40
|
||||
adc x14,x6,xzr // can be partially reduced...
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
b .Leven_neon
|
||||
|
||||
.align 4
|
||||
.Lbase2_64_neon:
|
||||
ldp x7,x8,[x0,#32] // load key value
|
||||
|
||||
ldp x4,x5,[x0] // load hash value base 2^64
|
||||
ldr x6,[x0,#16]
|
||||
|
||||
tst x2,#31
|
||||
b.eq .Linit_neon
|
||||
|
||||
ldp x12,x13,[x1],#16 // load input
|
||||
sub x2,x2,#16
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
#ifdef __AARCH64EB__
|
||||
rev x12,x12
|
||||
rev x13,x13
|
||||
#endif
|
||||
adds x4,x4,x12 // accumulate input
|
||||
adcs x5,x5,x13
|
||||
adc x6,x6,x3
|
||||
|
||||
bl poly1305_mult
|
||||
|
||||
.Linit_neon:
|
||||
ldr w17,[x0,#48] // first table element
|
||||
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||
ubfx x11,x4,#26,#26
|
||||
extr x12,x5,x4,#52
|
||||
and x12,x12,#0x03ffffff
|
||||
ubfx x13,x5,#14,#26
|
||||
extr x14,x6,x5,#40
|
||||
|
||||
cmp w17,#-1 // is value impossible?
|
||||
b.ne .Leven_neon
|
||||
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
////////////////////////////////// initialize r^n table
|
||||
mov x4,x7 // r^1
|
||||
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||
mov x5,x8
|
||||
mov x6,xzr
|
||||
add x0,x0,#48+12
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^2
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^3
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
|
||||
bl poly1305_mult // r^4
|
||||
sub x0,x0,#4
|
||||
bl poly1305_splat
|
||||
sub x0,x0,#48 // restore original x0
|
||||
b .Ldo_neon
|
||||
|
||||
.align 4
|
||||
.Leven_neon:
|
||||
fmov d24,x10
|
||||
fmov d25,x11
|
||||
fmov d26,x12
|
||||
fmov d27,x13
|
||||
fmov d28,x14
|
||||
|
||||
.Ldo_neon:
|
||||
ldp x8,x12,[x1,#32] // inp[2:3]
|
||||
subs x2,x2,#64
|
||||
ldp x9,x13,[x1,#48]
|
||||
add x16,x1,#96
|
||||
adr x17,.Lzeros
|
||||
|
||||
lsl x3,x3,#24
|
||||
add x15,x0,#48
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d14,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d15,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
fmov d16,x8
|
||||
fmov d17,x10
|
||||
fmov d18,x12
|
||||
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
ldp x9,x13,[x1],#48
|
||||
|
||||
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
|
||||
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
|
||||
ld1 {v8.4s},[x15]
|
||||
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
and x5,x9,#0x03ffffff
|
||||
ubfx x6,x8,#26,#26
|
||||
ubfx x7,x9,#26,#26
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
extr x8,x12,x8,#52
|
||||
extr x9,x13,x9,#52
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
fmov d9,x4
|
||||
and x8,x8,#0x03ffffff
|
||||
and x9,x9,#0x03ffffff
|
||||
ubfx x10,x12,#14,#26
|
||||
ubfx x11,x13,#14,#26
|
||||
add x12,x3,x12,lsr#40
|
||||
add x13,x3,x13,lsr#40
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
fmov d10,x6
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
movi v31.2d,#-1
|
||||
fmov d11,x8
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
ushr v31.2d,v31.2d,#38
|
||||
|
||||
b.ls .Lskip_loop
|
||||
|
||||
.align 4
|
||||
.Loop_neon:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||
// \___________________/
|
||||
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||
// \___________________/ \____________________/
|
||||
//
|
||||
// Note that we start with inp[2:3]*r^2. This is because it
|
||||
// doesn't depend on reduction in previous iteration.
|
||||
////////////////////////////////////////////////////////////////
|
||||
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||
|
||||
subs x2,x2,#64
|
||||
umull v23.2d,v14.2s,v7.s[2]
|
||||
csel x16,x17,x16,lo
|
||||
umull v22.2d,v14.2s,v5.s[2]
|
||||
umull v21.2d,v14.2s,v3.s[2]
|
||||
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||
umull v20.2d,v14.2s,v1.s[2]
|
||||
ldp x9,x13,[x16],#48
|
||||
umull v19.2d,v14.2s,v0.s[2]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
umlal v23.2d,v15.2s,v5.s[2]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v22.2d,v15.2s,v3.s[2]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v21.2d,v15.2s,v1.s[2]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v15.2s,v0.s[2]
|
||||
ubfx x7,x9,#26,#26
|
||||
umlal v19.2d,v15.2s,v8.s[2]
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
|
||||
umlal v23.2d,v16.2s,v3.s[2]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v22.2d,v16.2s,v1.s[2]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v21.2d,v16.2s,v0.s[2]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v20.2d,v16.2s,v8.s[2]
|
||||
fmov d14,x4
|
||||
umlal v19.2d,v16.2s,v6.s[2]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
umlal v23.2d,v17.2s,v1.s[2]
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v17.2s,v0.s[2]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v21.2d,v17.2s,v8.s[2]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v20.2d,v17.2s,v6.s[2]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v19.2d,v17.2s,v4.s[2]
|
||||
fmov d15,x6
|
||||
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
add x12,x3,x12,lsr#40
|
||||
umlal v23.2d,v18.2s,v0.s[2]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v22.2d,v18.2s,v8.s[2]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v21.2d,v18.2s,v6.s[2]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v18.2s,v4.s[2]
|
||||
fmov d16,x8
|
||||
umlal v19.2d,v18.2s,v2.s[2]
|
||||
fmov d17,x10
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
fmov d18,x12
|
||||
umlal v22.2d,v11.2s,v1.s[0]
|
||||
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||
umlal v19.2d,v11.2s,v6.s[0]
|
||||
ldp x9,x13,[x1],#48
|
||||
umlal v23.2d,v11.2s,v3.s[0]
|
||||
umlal v20.2d,v11.2s,v8.s[0]
|
||||
umlal v21.2d,v11.2s,v0.s[0]
|
||||
#ifdef __AARCH64EB__
|
||||
rev x8,x8
|
||||
rev x12,x12
|
||||
rev x9,x9
|
||||
rev x13,x13
|
||||
#endif
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.s[0]
|
||||
umlal v23.2d,v9.2s,v7.s[0]
|
||||
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||
umlal v21.2d,v9.2s,v3.s[0]
|
||||
and x5,x9,#0x03ffffff
|
||||
umlal v19.2d,v9.2s,v0.s[0]
|
||||
ubfx x6,x8,#26,#26
|
||||
umlal v20.2d,v9.2s,v1.s[0]
|
||||
ubfx x7,x9,#26,#26
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||
umlal v22.2d,v10.2s,v3.s[0]
|
||||
extr x8,x12,x8,#52
|
||||
umlal v23.2d,v10.2s,v5.s[0]
|
||||
extr x9,x13,x9,#52
|
||||
umlal v19.2d,v10.2s,v8.s[0]
|
||||
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||
umlal v21.2d,v10.2s,v1.s[0]
|
||||
fmov d9,x4
|
||||
umlal v20.2d,v10.2s,v0.s[0]
|
||||
and x8,x8,#0x03ffffff
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
and x9,x9,#0x03ffffff
|
||||
umlal v22.2d,v12.2s,v0.s[0]
|
||||
ubfx x10,x12,#14,#26
|
||||
umlal v19.2d,v12.2s,v4.s[0]
|
||||
ubfx x11,x13,#14,#26
|
||||
umlal v23.2d,v12.2s,v1.s[0]
|
||||
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||
umlal v20.2d,v12.2s,v6.s[0]
|
||||
fmov d10,x6
|
||||
umlal v21.2d,v12.2s,v8.s[0]
|
||||
add x12,x3,x12,lsr#40
|
||||
|
||||
umlal v22.2d,v13.2s,v8.s[0]
|
||||
add x13,x3,x13,lsr#40
|
||||
umlal v19.2d,v13.2s,v2.s[0]
|
||||
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||
umlal v23.2d,v13.2s,v0.s[0]
|
||||
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||
umlal v20.2d,v13.2s,v4.s[0]
|
||||
fmov d11,x8
|
||||
umlal v21.2d,v13.2s,v6.s[0]
|
||||
fmov d12,x10
|
||||
fmov d13,x12
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||
// and P. Schwabe
|
||||
//
|
||||
// [see discussion in poly1305-armv4 module]
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
xtn v27.2s,v22.2d
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
xtn v28.2s,v23.2d
|
||||
ushr v30.2d,v20.2d,#26
|
||||
xtn v25.2s,v20.2d
|
||||
bic v28.2s,#0xfc,lsl#24
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
shrn v30.2s,v21.2d,#26
|
||||
xtn v26.2s,v21.2d
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
bic v25.2s,#0xfc,lsl#24
|
||||
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||
bic v26.2s,#0xfc,lsl#24
|
||||
|
||||
shrn v29.2s,v19.2d,#26
|
||||
xtn v24.2s,v19.2d
|
||||
ushr v30.2s,v27.2s,#26
|
||||
bic v27.2s,#0xfc,lsl#24
|
||||
bic v24.2s,#0xfc,lsl#24
|
||||
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||
|
||||
b.hi .Loop_neon
|
||||
|
||||
.Lskip_loop:
|
||||
dup v16.2d,v16.d[0]
|
||||
add v11.2s,v11.2s,v26.2s
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||
|
||||
adds x2,x2,#32
|
||||
b.ne .Long_tail
|
||||
|
||||
dup v16.2d,v11.d[0]
|
||||
add v14.2s,v9.2s,v24.2s
|
||||
add v17.2s,v12.2s,v27.2s
|
||||
add v15.2s,v10.2s,v25.2s
|
||||
add v18.2s,v13.2s,v28.2s
|
||||
|
||||
.Long_tail:
|
||||
dup v14.2d,v14.d[0]
|
||||
umull2 v19.2d,v16.4s,v6.4s
|
||||
umull2 v22.2d,v16.4s,v1.4s
|
||||
umull2 v23.2d,v16.4s,v3.4s
|
||||
umull2 v21.2d,v16.4s,v0.4s
|
||||
umull2 v20.2d,v16.4s,v8.4s
|
||||
|
||||
dup v15.2d,v15.d[0]
|
||||
umlal2 v19.2d,v14.4s,v0.4s
|
||||
umlal2 v21.2d,v14.4s,v3.4s
|
||||
umlal2 v22.2d,v14.4s,v5.4s
|
||||
umlal2 v23.2d,v14.4s,v7.4s
|
||||
umlal2 v20.2d,v14.4s,v1.4s
|
||||
|
||||
dup v17.2d,v17.d[0]
|
||||
umlal2 v19.2d,v15.4s,v8.4s
|
||||
umlal2 v22.2d,v15.4s,v3.4s
|
||||
umlal2 v21.2d,v15.4s,v1.4s
|
||||
umlal2 v23.2d,v15.4s,v5.4s
|
||||
umlal2 v20.2d,v15.4s,v0.4s
|
||||
|
||||
dup v18.2d,v18.d[0]
|
||||
umlal2 v22.2d,v17.4s,v0.4s
|
||||
umlal2 v23.2d,v17.4s,v1.4s
|
||||
umlal2 v19.2d,v17.4s,v4.4s
|
||||
umlal2 v20.2d,v17.4s,v6.4s
|
||||
umlal2 v21.2d,v17.4s,v8.4s
|
||||
|
||||
umlal2 v22.2d,v18.4s,v8.4s
|
||||
umlal2 v19.2d,v18.4s,v2.4s
|
||||
umlal2 v23.2d,v18.4s,v0.4s
|
||||
umlal2 v20.2d,v18.4s,v4.4s
|
||||
umlal2 v21.2d,v18.4s,v6.4s
|
||||
|
||||
b.eq .Lshort_tail
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||
|
||||
add v9.2s,v9.2s,v24.2s
|
||||
umlal v22.2d,v11.2s,v1.2s
|
||||
umlal v19.2d,v11.2s,v6.2s
|
||||
umlal v23.2d,v11.2s,v3.2s
|
||||
umlal v20.2d,v11.2s,v8.2s
|
||||
umlal v21.2d,v11.2s,v0.2s
|
||||
|
||||
add v10.2s,v10.2s,v25.2s
|
||||
umlal v22.2d,v9.2s,v5.2s
|
||||
umlal v19.2d,v9.2s,v0.2s
|
||||
umlal v23.2d,v9.2s,v7.2s
|
||||
umlal v20.2d,v9.2s,v1.2s
|
||||
umlal v21.2d,v9.2s,v3.2s
|
||||
|
||||
add v12.2s,v12.2s,v27.2s
|
||||
umlal v22.2d,v10.2s,v3.2s
|
||||
umlal v19.2d,v10.2s,v8.2s
|
||||
umlal v23.2d,v10.2s,v5.2s
|
||||
umlal v20.2d,v10.2s,v0.2s
|
||||
umlal v21.2d,v10.2s,v1.2s
|
||||
|
||||
add v13.2s,v13.2s,v28.2s
|
||||
umlal v22.2d,v12.2s,v0.2s
|
||||
umlal v19.2d,v12.2s,v4.2s
|
||||
umlal v23.2d,v12.2s,v1.2s
|
||||
umlal v20.2d,v12.2s,v6.2s
|
||||
umlal v21.2d,v12.2s,v8.2s
|
||||
|
||||
umlal v22.2d,v13.2s,v8.2s
|
||||
umlal v19.2d,v13.2s,v2.2s
|
||||
umlal v23.2d,v13.2s,v0.2s
|
||||
umlal v20.2d,v13.2s,v4.2s
|
||||
umlal v21.2d,v13.2s,v6.2s
|
||||
|
||||
.Lshort_tail:
|
||||
////////////////////////////////////////////////////////////////
|
||||
// horizontal add
|
||||
|
||||
addp v22.2d,v22.2d,v22.2d
|
||||
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||
addp v19.2d,v19.2d,v19.2d
|
||||
ldp d10,d11,[sp,#32]
|
||||
addp v23.2d,v23.2d,v23.2d
|
||||
ldp d12,d13,[sp,#48]
|
||||
addp v20.2d,v20.2d,v20.2d
|
||||
ldp d14,d15,[sp,#64]
|
||||
addp v21.2d,v21.2d,v21.2d
|
||||
ldr x30,[sp,#8]
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// lazy reduction, but without narrowing
|
||||
|
||||
ushr v29.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
ushr v30.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
|
||||
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||
|
||||
ushr v29.2d,v23.2d,#26
|
||||
and v23.16b,v23.16b,v31.16b
|
||||
ushr v30.2d,v20.2d,#26
|
||||
and v20.16b,v20.16b,v31.16b
|
||||
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||
|
||||
add v19.2d,v19.2d,v29.2d
|
||||
shl v29.2d,v29.2d,#2
|
||||
ushr v30.2d,v21.2d,#26
|
||||
and v21.16b,v21.16b,v31.16b
|
||||
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||
|
||||
ushr v29.2d,v19.2d,#26
|
||||
and v19.16b,v19.16b,v31.16b
|
||||
ushr v30.2d,v22.2d,#26
|
||||
and v22.16b,v22.16b,v31.16b
|
||||
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// write the result, can be partially reduced
|
||||
|
||||
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||
mov x4,#1
|
||||
st1 {v23.s}[0],[x0]
|
||||
str x4,[x0,#8] // set is_base2_26
|
||||
|
||||
ldr x29,[sp],#80
|
||||
.inst 0xd50323bf // autiasp
|
||||
ret
|
||||
.size poly1305_blocks_neon,.-poly1305_blocks_neon
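For reference, a scalar C sketch of the lazy-reduction carry chain used in .Loop_neon above (an illustration only; the limb order follows the vector code, and the helper name is hypothetical):

    #include <stdint.h>

    /* One carry pass over five 26-bit limbs; the carry out of h4 wraps
     * into h0 multiplied by 5 (added once, then again shifted left by 2),
     * because 2^130 = 5 (mod 2^130 - 5). */
    static void poly1305_lazy_reduce(uint64_t h[5])
    {
        uint64_t c;

        c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;            /* h3 -> h4 */
        c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;            /* h0 -> h1 */
        c = h[4] >> 26; h[4] &= 0x03ffffff; h[0] += c + (c << 2); /* h4 -> h0 */
        c = h[1] >> 26; h[1] &= 0x03ffffff; h[2] += c;            /* h1 -> h2 */
        c = h[2] >> 26; h[2] &= 0x03ffffff; h[3] += c;            /* h2 -> h3 */
        c = h[0] >> 26; h[0] &= 0x03ffffff; h[1] += c;            /* h0 -> h1 */
        c = h[3] >> 26; h[3] &= 0x03ffffff; h[4] += c;            /* h3 -> h4 */
    }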
|
||||
|
||||
.align 5
|
||||
.Lzeros:
|
||||
.long 0,0,0,0,0,0,0,0
|
||||
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
|
||||
.align 2
|
||||
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
.hidden OPENSSL_armcap_P
|
||||
#endif
|
230
arch/arm64/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,230 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
|
||||
*
|
||||
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*/
|
||||
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/neon.h>
|
||||
#include <asm/simd.h>
|
||||
#include <asm/unaligned.h>
|
||||
#include <crypto/algapi.h>
|
||||
#include <crypto/internal/hash.h>
|
||||
#include <crypto/internal/poly1305.h>
|
||||
#include <linux/cpufeature.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
|
||||
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
|
||||
|
||||
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||
|
||||
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||
{
|
||||
poly1305_init_arm64(&dctx->h, key);
|
||||
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_init_arch);
|
||||
|
||||
static int neon_poly1305_init(struct shash_desc *desc)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
dctx->buflen = 0;
|
||||
dctx->rset = 0;
|
||||
dctx->sset = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
u32 len, u32 hibit, bool do_neon)
|
||||
{
|
||||
if (unlikely(!dctx->sset)) {
|
||||
if (!dctx->rset) {
|
||||
poly1305_init_arch(dctx, src);
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
dctx->rset = 1;
|
||||
}
|
||||
if (len >= POLY1305_BLOCK_SIZE) {
|
||||
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||
src += POLY1305_BLOCK_SIZE;
|
||||
len -= POLY1305_BLOCK_SIZE;
|
||||
dctx->sset = true;
|
||||
}
|
||||
if (len < POLY1305_BLOCK_SIZE)
|
||||
return;
|
||||
}
|
||||
|
||||
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||
|
||||
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||
else
|
||||
poly1305_blocks(&dctx->h, src, len, hibit);
|
||||
}
|
||||
|
||||
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||
const u8 *src, u32 len, bool do_neon)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
len -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
neon_poly1305_blocks(dctx, dctx->buf,
|
||||
POLY1305_BLOCK_SIZE, 1, false);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||
len %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(len)) {
|
||||
dctx->buflen = len;
|
||||
memcpy(dctx->buf, src, len);
|
||||
}
|
||||
}
|
||||
|
||||
static int neon_poly1305_update(struct shash_desc *desc,
|
||||
const u8 *src, unsigned int srclen)
|
||||
{
|
||||
bool do_neon = may_use_simd() && srclen > 128;
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
if (static_branch_likely(&have_neon) && do_neon)
|
||||
kernel_neon_begin();
|
||||
neon_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||
if (static_branch_likely(&have_neon) && do_neon)
|
||||
kernel_neon_end();
|
||||
return 0;
|
||||
}
|
||||
|
||||
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||
unsigned int nbytes)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
|
||||
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||
src += bytes;
|
||||
nbytes -= bytes;
|
||||
dctx->buflen += bytes;
|
||||
|
||||
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||
dctx->buflen = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||
|
||||
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||
do {
|
||||
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||
|
||||
kernel_neon_begin();
|
||||
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||
kernel_neon_end();
|
||||
|
||||
len -= todo;
|
||||
src += todo;
|
||||
} while (len);
|
||||
} else {
|
||||
poly1305_blocks(&dctx->h, src, len, 1);
|
||||
src += len;
|
||||
}
|
||||
nbytes %= POLY1305_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
if (unlikely(nbytes)) {
|
||||
dctx->buflen = nbytes;
|
||||
memcpy(dctx->buf, src, nbytes);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_update_arch);
|
||||
|
||||
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||
{
|
||||
if (unlikely(dctx->buflen)) {
|
||||
dctx->buf[dctx->buflen++] = 1;
|
||||
memset(dctx->buf + dctx->buflen, 0,
|
||||
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||
}
|
||||
|
||||
poly1305_emit(&dctx->h, dst, dctx->s);
|
||||
*dctx = (struct poly1305_desc_ctx){};
|
||||
}
|
||||
EXPORT_SYMBOL(poly1305_final_arch);
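A minimal usage sketch of the library interface exported above (hypothetical caller; it relies on the generic <crypto/poly1305.h> definitions this file builds on):

    #include <crypto/poly1305.h>

    /* Hypothetical one-shot MAC over a buffer via the arch-accelerated
     * library calls. */
    static void poly1305_demo(const u8 key[POLY1305_KEY_SIZE],
                              const u8 *msg, unsigned int len,
                              u8 mac[POLY1305_DIGEST_SIZE])
    {
        struct poly1305_desc_ctx ctx;

        poly1305_init_arch(&ctx, key);        /* clamped r plus nonce s */
        poly1305_update_arch(&ctx, msg, len);
        poly1305_final_arch(&ctx, mac);       /* writes h + s, wipes ctx */
    }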
|
||||
|
||||
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||
{
|
||||
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||
|
||||
if (unlikely(!dctx->sset))
|
||||
return -ENOKEY;
|
||||
|
||||
poly1305_final_arch(dctx, dst);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct shash_alg neon_poly1305_alg = {
|
||||
.init = neon_poly1305_init,
|
||||
.update = neon_poly1305_update,
|
||||
.final = neon_poly1305_final,
|
||||
.digestsize = POLY1305_DIGEST_SIZE,
|
||||
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||
|
||||
.base.cra_name = "poly1305",
|
||||
.base.cra_driver_name = "poly1305-neon",
|
||||
.base.cra_priority = 200,
|
||||
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||
.base.cra_module = THIS_MODULE,
|
||||
};
|
||||
|
||||
static int __init neon_poly1305_mod_init(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||
return 0;
|
||||
|
||||
static_branch_enable(&have_neon);
|
||||
|
||||
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||
crypto_register_shash(&neon_poly1305_alg) : 0;
|
||||
}
|
||||
|
||||
static void __exit neon_poly1305_mod_exit(void)
|
||||
{
|
||||
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && (elf_hwcap & HWCAP_ASIMD))
|
||||
crypto_unregister_shash(&neon_poly1305_alg);
|
||||
}
|
||||
|
||||
module_init(neon_poly1305_mod_init);
|
||||
module_exit(neon_poly1305_mod_exit);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
MODULE_ALIAS_CRYPTO("poly1305");
|
||||
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
|
@ -192,6 +192,7 @@ enum vcpu_sysreg {
|
|||
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
|
||||
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
|
||||
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
|
||||
#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
|
||||
|
||||
#define NR_COPRO_REGS (NR_SYS_REGS * 2)
|
||||
|
||||
|
|
|
@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node);
|
|||
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
|
||||
static inline const struct cpumask *cpumask_of_node(int node)
|
||||
{
|
||||
if (node == NUMA_NO_NODE)
|
||||
return cpu_all_mask;
|
||||
|
||||
return node_to_cpumask_map[node];
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -620,6 +620,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
|
|||
return (need_wa > 0);
|
||||
}
|
||||
|
||||
static void
|
||||
cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
|
||||
{
|
||||
cap->matches(cap, SCOPE_LOCAL_CPU);
|
||||
}
|
||||
|
||||
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
|
||||
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
|
||||
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
|
||||
|
@ -860,9 +866,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
|
|||
},
|
||||
#endif
|
||||
{
|
||||
.desc = "Branch predictor hardening",
|
||||
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
|
||||
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
|
||||
.matches = check_branch_predictor,
|
||||
.cpu_enable = cpu_enable_branch_predictor_hardening,
|
||||
},
|
||||
#ifdef CONFIG_HARDEN_EL2_VECTORS
|
||||
{
|
||||
|
|
|
@@ -290,21 +290,23 @@ void store_cpu_topology(unsigned int cpuid)
 	if (mpidr & MPIDR_UP_BITMASK)
 		return;

-	/* Create cpu topology mapping based on MPIDR. */
-	if (mpidr & MPIDR_MT_BITMASK) {
-		/* Multiprocessor system : Multi-threads per core */
-		cpuid_topo->thread_id  = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 1);
-		cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
-					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
-	} else {
-		/* Multiprocessor system : Single-thread per core */
-		cpuid_topo->thread_id  = -1;
-		cpuid_topo->core_id    = MPIDR_AFFINITY_LEVEL(mpidr, 0);
-		cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
-					 MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
-					 MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
-	}
+	/*
+	 * This would be the place to create cpu topology based on MPIDR.
+	 *
+	 * However, it cannot be trusted to depict the actual topology; some
+	 * pieces of the architecture enforce an artificial cap on Aff0 values
+	 * (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an
+	 * artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up
+	 * having absolutely no relationship to the actual underlying system
+	 * topology, and cannot be reasonably used as core / package ID.
+	 *
+	 * If the MT bit is set, Aff0 *could* be used to define a thread ID, but
+	 * we still wouldn't be able to obtain a sane core ID. This means we
+	 * need to entirely ignore MPIDR for any topology deduction.
+	 */
+	cpuid_topo->thread_id  = -1;
+	cpuid_topo->core_id    = cpuid;
+	cpuid_topo->package_id = cpu_to_node(cpuid);

 	pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
 		 cpuid, cpuid_topo->package_id, cpuid_topo->core_id,

@@ -1555,9 +1555,9 @@ static const struct sys_reg_desc cp14_regs[] = {
 	{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
 	DBG_BCR_BVR_WCR_WVR(1),
 	/* DBGDCCINT */
-	{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
 	/* DBGDSCRext */
-	{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
 	DBG_BCR_BVR_WCR_WVR(2),
 	/* DBGDTR[RT]Xint */
 	{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },

@@ -1572,7 +1572,7 @@ static const struct sys_reg_desc cp14_regs[] = {
 	{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
 	DBG_BCR_BVR_WCR_WVR(6),
 	/* DBGVCR */
-	{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 },
+	{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
 	DBG_BCR_BVR_WCR_WVR(7),
 	DBG_BCR_BVR_WCR_WVR(8),
 	DBG_BCR_BVR_WCR_WVR(9),

@@ -58,7 +58,11 @@ EXPORT_SYMBOL(node_to_cpumask_map);
  */
 const struct cpumask *cpumask_of_node(int node)
 {
-	if (WARN_ON(node >= nr_node_ids))
+	if (node == NUMA_NO_NODE)
+		return cpu_all_mask;
+
+	if (WARN_ON(node < 0 || node >= nr_node_ids))
 		return cpu_none_mask;

 	if (WARN_ON(node_to_cpumask_map[node] == NULL))

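Editorial aside: the two NUMA hunks above make cpumask_of_node(NUMA_NO_NODE) return cpu_all_mask instead of tripping the warning — `node` is signed, so the old comparison against the unsigned nr_node_ids treated -1 as a huge value and handed back cpu_none_mask. A minimal sketch (not part of this merge) of the caller pattern the guard protects, using only generic device/topology APIs:

#include <linux/device.h>
#include <linux/topology.h>

/* Hypothetical helper: pick a CPU near @dev, tolerating missing affinity. */
static int pick_cpu_near(struct device *dev)
{
	int node = dev_to_node(dev);	/* may be NUMA_NO_NODE (-1) */

	/*
	 * With the fix above this yields cpu_all_mask for NUMA_NO_NODE,
	 * so a valid CPU is always found instead of cpu_none_mask.
	 */
	return cpumask_first(cpumask_of_node(node));
}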
@@ -42,7 +42,7 @@ obj-y += esi_stub.o	# must be in kernel proper
 endif
 obj-$(CONFIG_INTEL_IOMMU)	+= pci-dma.o

-obj-$(CONFIG_BINFMT_ELF)	+= elfcore.o
+obj-$(CONFIG_ELF_CORE)		+= elfcore.o

 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
 CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31

@@ -409,83 +409,9 @@ static void kretprobe_trampoline(void)
 {
 }

 /*
  * At this point the target function has been tricked into
  * returning into our trampoline. Lookup the associated instance
  * and then:
  *    - call the handler function
  *    - cleanup by marking the instance as unused
  *    - long jump back to the original return address
  */
 int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
-	struct kretprobe_instance *ri = NULL;
-	struct hlist_head *head, empty_rp;
-	struct hlist_node *tmp;
-	unsigned long flags, orig_ret_address = 0;
-	unsigned long trampoline_address =
-		((struct fnptr *)kretprobe_trampoline)->ip;
-
-	INIT_HLIST_HEAD(&empty_rp);
-	kretprobe_hash_lock(current, &head, &flags);
-
-	/*
-	 * It is possible to have multiple instances associated with a given
-	 * task either because an multiple functions in the call path
-	 * have a return probe installed on them, and/or more than one return
-	 * return probe was registered for a target function.
-	 *
-	 * We can handle this because:
-	 *     - instances are always inserted at the head of the list
-	 *     - when multiple return probes are registered for the same
-	 *       function, the first instance's ret_addr will point to the
-	 *       real return address, and all the rest will point to
-	 *       kretprobe_trampoline
-	 */
-	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
-		if (ri->task != current)
-			/* another task is sharing our hash bucket */
-			continue;
-
-		orig_ret_address = (unsigned long)ri->ret_addr;
-		if (orig_ret_address != trampoline_address)
-			/*
-			 * This is the real return address. Any other
-			 * instances associated with this task are for
-			 * other calls deeper on the call stack
-			 */
-			break;
-	}
-
-	regs->cr_iip = orig_ret_address;
-
-	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
-		if (ri->task != current)
-			/* another task is sharing our hash bucket */
-			continue;
-
-		if (ri->rp && ri->rp->handler)
-			ri->rp->handler(ri, regs);
-
-		orig_ret_address = (unsigned long)ri->ret_addr;
-		recycle_rp_inst(ri, &empty_rp);
-
-		if (orig_ret_address != trampoline_address)
-			/*
-			 * This is the real return address. Any other
-			 * instances associated with this task are for
-			 * other calls deeper on the call stack
-			 */
-			break;
-	}
-	kretprobe_assert(ri, orig_ret_address, trampoline_address);
-
-	kretprobe_hash_unlock(current, &flags);
-
-	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
-		hlist_del(&ri->hlist);
-		kfree(ri);
-	}
+	regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL);
 	/*
 	 * By returning a non-zero value, we are telling
 	 * kprobe_handler() that we don't want the post_handler

@@ -498,6 +424,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
 	ri->ret_addr = (kprobe_opcode_t *)regs->b0;
+	ri->fp = NULL;

 	/* Replace the return addr with trampoline addr */
 	regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;

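Editorial aside: this hunk replaces ia64's hand-rolled instance walk with the generic __kretprobe_trampoline_handler(), which performs the same lookup/handler/recycle sequence for every architecture. A sketch of a return probe that exercises that path — the symbol name and handler are illustrative, not taken from this diff:

#include <linux/kprobes.h>
#include <linux/module.h>

static int on_return(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("probed function returned\n");
	return 0;
}

static struct kretprobe demo_rp = {
	.kp.symbol_name = "do_sys_open",	/* assumption: any probeable symbol */
	.handler = on_return,
	.maxactive = 16,
};

static int __init demo_init(void)
{
	return register_kretprobe(&demo_rp);
}

static void __exit demo_exit(void)
{
	unregister_kretprobe(&demo_rp);
}
module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");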
@@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
 # See arch/mips/Kbuild for content of core part of the kernel
 core-y += arch/mips/

-drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
+drivers-y += arch/mips/crypto/
 drivers-$(CONFIG_OPROFILE)	+= arch/mips/oprofile/

 # suspend and hibernation support

@@ -4,3 +4,21 @@
 #

 obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
+
+obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
+chacha-mips-y := chacha-core.o chacha-glue.o
+AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
+
+obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
+poly1305-mips-y := poly1305-core.o poly1305-glue.o
+
+perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
+perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
+
+$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
+	$(call if_changed,perlasm)
+
+targets += poly1305-core.S

arch/mips/crypto/chacha-core.S (new file, 497 lines)
@@ -0,0 +1,497 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#define MASK_U32		0x3c
#define CHACHA20_BLOCK_SIZE	64
#define STACK_SIZE		32

#define X0	$t0
#define X1	$t1
#define X2	$t2
#define X3	$t3
#define X4	$t4
#define X5	$t5
#define X6	$t6
#define X7	$t7
#define X8	$t8
#define X9	$t9
#define X10	$v1
#define X11	$s6
#define X12	$s5
#define X13	$s4
#define X14	$s3
#define X15	$s2
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
#define T0	$s1
#define T1	$s0
#define T(n)	T ## n
#define X(n)	X ## n

/* Input arguments */
#define STATE		$a0
#define OUT		$a1
#define IN		$a2
#define BYTES		$a3

/* Output argument */
/* NONCE[0] is kept in a register and not in memory.
 * We don't want to touch original value in memory.
 * Must be incremented every loop iteration.
 */
#define NONCE_0		$v0

/* SAVED_X and SAVED_CA are set in the jump table.
 * Use regs which are overwritten on exit else we don't leak clear data.
 * They are used to handling the last bytes which are not multiple of 4.
 */
#define SAVED_X		X15
#define SAVED_CA	$s7

#define IS_UNALIGNED	$s7

#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MSB 0
#define LSB 3
#define ROTx rotl
#define ROTR(n) rotr n, 24
#define	CPU_TO_LE32(n) \
	wsbh	n; \
	rotr	n, 16;
#else
#define MSB 3
#define LSB 0
#define ROTx rotr
#define CPU_TO_LE32(n)
#define ROTR(n)
#endif

#define FOR_EACH_WORD(x) \
	x( 0); \
	x( 1); \
	x( 2); \
	x( 3); \
	x( 4); \
	x( 5); \
	x( 6); \
	x( 7); \
	x( 8); \
	x( 9); \
	x(10); \
	x(11); \
	x(12); \
	x(13); \
	x(14); \
	x(15);

#define FOR_EACH_WORD_REV(x) \
	x(15); \
	x(14); \
	x(13); \
	x(12); \
	x(11); \
	x(10); \
	x( 9); \
	x( 8); \
	x( 7); \
	x( 6); \
	x( 5); \
	x( 4); \
	x( 3); \
	x( 2); \
	x( 1); \
	x( 0);

#define PLUS_ONE_0	1
#define PLUS_ONE_1	2
#define PLUS_ONE_2	3
#define PLUS_ONE_3	4
#define PLUS_ONE_4	5
#define PLUS_ONE_5	6
#define PLUS_ONE_6	7
#define PLUS_ONE_7	8
#define PLUS_ONE_8	9
#define PLUS_ONE_9	10
#define PLUS_ONE_10	11
#define PLUS_ONE_11	12
#define PLUS_ONE_12	13
#define PLUS_ONE_13	14
#define PLUS_ONE_14	15
#define PLUS_ONE_15	16
#define PLUS_ONE(x)	PLUS_ONE_ ## x
#define _CONCAT3(a,b,c)	a ## b ## c
#define CONCAT3(a,b,c)	_CONCAT3(a,b,c)

#define STORE_UNALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lwl	T1, (x*4)+MSB ## (IN); \
	lwr	T1, (x*4)+LSB ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	swl	X ## x, (x*4)+MSB ## (OUT); \
	swr	X ## x, (x*4)+LSB ## (OUT);

#define STORE_ALIGNED(x) \
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
	.if (x != 12); \
		lw	T0, (x*4)(STATE); \
	.endif; \
	lw	T1, (x*4) ## (IN); \
	.if (x == 12); \
		addu	X ## x, NONCE_0; \
	.else; \
		addu	X ## x, T0; \
	.endif; \
	CPU_TO_LE32(X ## x); \
	xor	X ## x, T1; \
	sw	X ## x, (x*4) ## (OUT);

/* Jump table macro.
 * Used for setup and handling the last bytes, which are not multiple of 4.
 * X15 is free to store Xn
 * Every jumptable entry must be equal in size.
 */
#define JMPTBL_ALIGNED(x) \
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_aligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define JMPTBL_UNALIGNED(x) \
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
	.set	noreorder; \
	b	.Lchacha_mips_xor_unaligned_ ## x ## _b; \
	.if (x == 12); \
		addu	SAVED_X, X ## x, NONCE_0; \
	.else; \
		addu	SAVED_X, X ## x, SAVED_CA; \
	.endif; \
	.set	reorder

#define AXR(A, B, C, D,  K, L, M, N,  V, W, Y, Z,  S) \
	addu	X(A), X(K); \
	addu	X(B), X(L); \
	addu	X(C), X(M); \
	addu	X(D), X(N); \
	xor	X(V), X(A); \
	xor	X(W), X(B); \
	xor	X(Y), X(C); \
	xor	X(Z), X(D); \
	rotl	X(V), S; \
	rotl	X(W), S; \
	rotl	X(Y), S; \
	rotl	X(Z), S;

.text
.set	reorder
.set	noat
.globl	chacha_crypt_arch
.ent	chacha_crypt_arch
chacha_crypt_arch:
	.frame	$sp, STACK_SIZE, $ra

	/* Load number of rounds */
	lw	$at, 16($sp)

	addiu	$sp, -STACK_SIZE

	/* Return bytes = 0. */
	beqz	BYTES, .Lchacha_mips_end

	lw	NONCE_0, 48(STATE)

	/* Save s0-s7 */
	sw	$s0,  0($sp)
	sw	$s1,  4($sp)
	sw	$s2,  8($sp)
	sw	$s3, 12($sp)
	sw	$s4, 16($sp)
	sw	$s5, 20($sp)
	sw	$s6, 24($sp)
	sw	$s7, 28($sp)

	/* Test IN or OUT is unaligned.
	 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
	 */
	or	IS_UNALIGNED, IN, OUT
	andi	IS_UNALIGNED, 0x3

	b	.Lchacha_rounds_start

.align 4
.Loop_chacha_rounds:
	addiu	IN,  CHACHA20_BLOCK_SIZE
	addiu	OUT, CHACHA20_BLOCK_SIZE
	addiu	NONCE_0, 1

.Lchacha_rounds_start:
	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3,  12(STATE)

	lw	X4,  16(STATE)
	lw	X5,  20(STATE)
	lw	X6,  24(STATE)
	lw	X7,  28(STATE)
	lw	X8,  32(STATE)
	lw	X9,  36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)

	move	X12, NONCE_0
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_chacha_xor_rounds:
	addiu	$at, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$at, .Loop_chacha_xor_rounds

	addiu	BYTES, -(CHACHA20_BLOCK_SIZE)

	/* Is data src/dst unaligned? Jump */
	bnez	IS_UNALIGNED, .Loop_chacha_unaligned

	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES < 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_aligned

	FOR_EACH_WORD_REV(STORE_ALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1

	/* BYTES < 0? Handle last bytes */
	bltz	BYTES, .Lchacha_mips_xor_bytes

.Lchacha_mips_xor_done:
	/* Restore used registers */
	lw	$s0,  0($sp)
	lw	$s1,  4($sp)
	lw	$s2,  8($sp)
	lw	$s3, 12($sp)
	lw	$s4, 16($sp)
	lw	$s5, 20($sp)
	lw	$s6, 24($sp)
	lw	$s7, 28($sp)

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

.Lchacha_mips_end:
	addiu	$sp, STACK_SIZE
	jr	$ra

.Lchacha_mips_no_full_block_aligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_aligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_aligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_ALIGNED)


.Loop_chacha_unaligned:
	/* Set number rounds here to fill delayslot. */
	lw	$at, (STACK_SIZE+16)($sp)

	/* BYTES > 0, it has no full block. */
	bltz	BYTES, .Lchacha_mips_no_full_block_unaligned

	FOR_EACH_WORD_REV(STORE_UNALIGNED)

	/* BYTES > 0? Loop again. */
	bgtz	BYTES, .Loop_chacha_rounds

	/* Write NONCE_0 back to right location in state */
	sw	NONCE_0, 48(STATE)

	.set noreorder
	/* Fall through to byte handling */
	bgez	BYTES, .Lchacha_mips_xor_done
.Lchacha_mips_xor_unaligned_0_b:
.Lchacha_mips_xor_aligned_0_b:
	/* Place this here to fill delay slot */
	addiu	NONCE_0, 1
	.set reorder

.Lchacha_mips_xor_bytes:
	addu	IN, $at
	addu	OUT, $at
	/* First byte */
	lbu	T1, 0(IN)
	addiu	$at, BYTES, 1
	CPU_TO_LE32(SAVED_X)
	ROTR(SAVED_X)
	xor	T1, SAVED_X
	sb	T1, 0(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Second byte */
	lbu	T1, 1(IN)
	addiu	$at, BYTES, 2
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 1(OUT)
	beqz	$at, .Lchacha_mips_xor_done
	/* Third byte */
	lbu	T1, 2(IN)
	ROTx	SAVED_X, 8
	xor	T1, SAVED_X
	sb	T1, 2(OUT)
	b	.Lchacha_mips_xor_done

.Lchacha_mips_no_full_block_unaligned:
	/* Restore the offset on BYTES */
	addiu	BYTES, CHACHA20_BLOCK_SIZE

	/* Get number of full WORDS */
	andi	$at, BYTES, MASK_U32

	/* Load upper half of jump table addr */
	lui	T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)

	/* Calculate lower half jump table offset */
	ins	T0, $at, 1, 6

	/* Add offset to STATE */
	addu	T1, STATE, $at

	/* Add lower half jump table addr */
	addiu	T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)

	/* Read value from STATE */
	lw	SAVED_CA, 0(T1)

	/* Store remaining bytecounter as negative value */
	subu	BYTES, $at, BYTES

	jr	T0

	/* Jump table */
	FOR_EACH_WORD(JMPTBL_UNALIGNED)
.end chacha_crypt_arch
.set at

/* Input arguments
 * STATE	$a0
 * OUT		$a1
 * NROUND	$a2
 */

#undef X12
#undef X13
#undef X14
#undef X15

#define X12	$a3
#define X13	$at
#define X14	$v0
#define X15	STATE

.set noat
.globl	hchacha_block_arch
.ent	hchacha_block_arch
hchacha_block_arch:
	.frame	$sp, STACK_SIZE, $ra

	addiu	$sp, -STACK_SIZE

	/* Save X11(s6) */
	sw	X11, 0($sp)

	lw	X0,  0(STATE)
	lw	X1,  4(STATE)
	lw	X2,  8(STATE)
	lw	X3, 12(STATE)
	lw	X4, 16(STATE)
	lw	X5, 20(STATE)
	lw	X6, 24(STATE)
	lw	X7, 28(STATE)
	lw	X8, 32(STATE)
	lw	X9, 36(STATE)
	lw	X10, 40(STATE)
	lw	X11, 44(STATE)
	lw	X12, 48(STATE)
	lw	X13, 52(STATE)
	lw	X14, 56(STATE)
	lw	X15, 60(STATE)

.Loop_hchacha_xor_rounds:
	addiu	$a2, -2
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15, 16);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7, 12);
	AXR( 0, 1, 2, 3,  4, 5, 6, 7, 12,13,14,15,  8);
	AXR( 8, 9,10,11, 12,13,14,15,  4, 5, 6, 7,  7);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14, 16);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4, 12);
	AXR( 0, 1, 2, 3,  5, 6, 7, 4, 15,12,13,14,  8);
	AXR(10,11, 8, 9, 15,12,13,14,  5, 6, 7, 4,  7);
	bnez	$a2, .Loop_hchacha_xor_rounds

	/* Restore used register */
	lw	X11, 0($sp)

	sw	X0,  0(OUT)
	sw	X1,  4(OUT)
	sw	X2,  8(OUT)
	sw	X3, 12(OUT)
	sw	X12, 16(OUT)
	sw	X13, 20(OUT)
	sw	X14, 24(OUT)
	sw	X15, 28(OUT)

	addiu	$sp, STACK_SIZE
	jr	$ra
.end hchacha_block_arch
.set at
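Editorial aside for readers not fluent in MIPS assembly: the AXR macro above performs one add/xor/rotate step of four ChaCha quarter rounds in parallel, with the rotation amount (16, 12, 8 or 7) supplied as its last argument. A plain-C reference of a single quarter round (per RFC 7539), for comparison only — this is not code from the merge:

#include <linux/types.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter round over state words a, b, c, d. */
static void chacha_quarter_round(u32 x[16], int a, int b, int c, int d)
{
	x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 16);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 12);
	x[a] += x[b]; x[d] ^= x[a]; x[d] = ROTL32(x[d], 8);
	x[c] += x[d]; x[b] ^= x[c]; x[b] = ROTL32(x[b], 7);
}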
arch/mips/crypto/chacha-glue.c (new file, 152 lines)
@@ -0,0 +1,152 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * MIPS accelerated ChaCha and XChaCha stream ciphers,
 * including ChaCha20 (RFC7539)
 *
 * Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 */

#include <asm/byteorder.h>
#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>

asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
				  unsigned int bytes, int nrounds);
EXPORT_SYMBOL(chacha_crypt_arch);

asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
	chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);

static int chacha_mips_stream_xor(struct skcipher_request *req,
				  const struct chacha_ctx *ctx, const u8 *iv)
{
	struct skcipher_walk walk;
	u32 state[16];
	int err;

	err = skcipher_walk_virt(&walk, req, false);

	chacha_init_generic(state, ctx->key, iv);

	while (walk.nbytes > 0) {
		unsigned int nbytes = walk.nbytes;

		if (nbytes < walk.total)
			nbytes = round_down(nbytes, walk.stride);

		chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
			     nbytes, ctx->nrounds);
		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
	}

	return err;
}

static int chacha_mips(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

	return chacha_mips_stream_xor(req, ctx, req->iv);
}

static int xchacha_mips(struct skcipher_request *req)
{
	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
	struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
	struct chacha_ctx subctx;
	u32 state[16];
	u8 real_iv[16];

	chacha_init_generic(state, ctx->key, req->iv);

	hchacha_block(state, subctx.key, ctx->nrounds);
	subctx.nrounds = ctx->nrounds;

	memcpy(&real_iv[0], req->iv + 24, 8);
	memcpy(&real_iv[8], req->iv + 16, 8);
	return chacha_mips_stream_xor(req, &subctx, real_iv);
}

static struct skcipher_alg algs[] = {
	{
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_mips,
		.decrypt		= chacha_mips,
	}, {
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_mips,
		.decrypt		= xchacha_mips,
	}, {
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-mips",
		.base.cra_priority	= 200,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_mips,
		.decrypt		= xchacha_mips,
	}
};

static int __init chacha_simd_mod_init(void)
{
	return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
		crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_simd_mod_fini(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-mips");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-mips");
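Editorial aside: besides the three skcipher algorithms, this glue file exports the arch hooks behind the ChaCha library interface. A minimal sketch of one-shot library use, assuming the <crypto/chacha.h> helpers that ship with this backport (buffers, key and nonce are placeholders, not from the diff):

#include <crypto/chacha.h>

static void chacha20_xor_once(u8 *dst, const u8 *src, unsigned int len,
			      const u32 key[8], const u8 iv[16])
{
	u32 state[16];

	chacha_init(state, key, iv);		/* constants + key + counter/nonce */
	chacha_crypt(state, dst, src, len, 20);	/* 20 rounds = ChaCha20 */
}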
arch/mips/crypto/poly1305-glue.c (new file, 191 lines)
@@ -0,0 +1,191 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
 *
 * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>

asmlinkage void poly1305_init_mips(void *state, const u8 *key);
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);

void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
	poly1305_init_mips(&dctx->h, key);
	dctx->s[0] = get_unaligned_le32(key + 16);
	dctx->s[1] = get_unaligned_le32(key + 20);
	dctx->s[2] = get_unaligned_le32(key + 24);
	dctx->s[3] = get_unaligned_le32(key + 28);
	dctx->buflen = 0;
}
EXPORT_SYMBOL(poly1305_init_arch);

static int mips_poly1305_init(struct shash_desc *desc)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	dctx->buflen = 0;
	dctx->rset = 0;
	dctx->sset = false;

	return 0;
}

static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit)
{
	if (unlikely(!dctx->sset)) {
		if (!dctx->rset) {
			poly1305_init_mips(&dctx->h, src);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->rset = 1;
		}
		if (len >= POLY1305_BLOCK_SIZE) {
			dctx->s[0] = get_unaligned_le32(src +  0);
			dctx->s[1] = get_unaligned_le32(src +  4);
			dctx->s[2] = get_unaligned_le32(src +  8);
			dctx->s[3] = get_unaligned_le32(src + 12);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->sset = true;
		}
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	len &= ~(POLY1305_BLOCK_SIZE - 1);

	poly1305_blocks_mips(&dctx->h, src, len, hibit);
}

static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
				unsigned int len)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (unlikely(dctx->buflen)) {
		u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);

		memcpy(dctx->buf + dctx->buflen, src, bytes);
		src += bytes;
		len -= bytes;
		dctx->buflen += bytes;

		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
			mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
			dctx->buflen = 0;
		}
	}

	if (likely(len >= POLY1305_BLOCK_SIZE)) {
		mips_poly1305_blocks(dctx, src, len, 1);
		src += round_down(len, POLY1305_BLOCK_SIZE);
		len %= POLY1305_BLOCK_SIZE;
	}

	if (unlikely(len)) {
		dctx->buflen = len;
		memcpy(dctx->buf, src, len);
	}
	return 0;
}

void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
			  unsigned int nbytes)
{
	if (unlikely(dctx->buflen)) {
		u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);

		memcpy(dctx->buf + dctx->buflen, src, bytes);
		src += bytes;
		nbytes -= bytes;
		dctx->buflen += bytes;

		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
			poly1305_blocks_mips(&dctx->h, dctx->buf,
					     POLY1305_BLOCK_SIZE, 1);
			dctx->buflen = 0;
		}
	}

	if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
		unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);

		poly1305_blocks_mips(&dctx->h, src, len, 1);
		src += len;
		nbytes %= POLY1305_BLOCK_SIZE;
	}

	if (unlikely(nbytes)) {
		dctx->buflen = nbytes;
		memcpy(dctx->buf, src, nbytes);
	}
}
EXPORT_SYMBOL(poly1305_update_arch);

void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		dctx->buf[dctx->buflen++] = 1;
		memset(dctx->buf + dctx->buflen, 0,
		       POLY1305_BLOCK_SIZE - dctx->buflen);
		poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit_mips(&dctx->h, dst, dctx->s);
	*dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);

static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (unlikely(!dctx->sset))
		return -ENOKEY;

	poly1305_final_arch(dctx, dst);
	return 0;
}

static struct shash_alg mips_poly1305_alg = {
	.init			= mips_poly1305_init,
	.update			= mips_poly1305_update,
	.final			= mips_poly1305_final,
	.digestsize		= POLY1305_DIGEST_SIZE,
	.descsize		= sizeof(struct poly1305_desc_ctx),

	.base.cra_name		= "poly1305",
	.base.cra_driver_name	= "poly1305-mips",
	.base.cra_priority	= 200,
	.base.cra_blocksize	= POLY1305_BLOCK_SIZE,
	.base.cra_module	= THIS_MODULE,
};

static int __init mips_poly1305_mod_init(void)
{
	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
		crypto_register_shash(&mips_poly1305_alg) : 0;
}

static void __exit mips_poly1305_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
		crypto_unregister_shash(&mips_poly1305_alg);
}

module_init(mips_poly1305_mod_init);
module_exit(mips_poly1305_mod_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-mips");
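Editorial aside: the same file doubles as the MIPS provider for the Poly1305 library API. A minimal usage sketch, assuming the <crypto/poly1305.h> context type used above (key and message buffers are placeholders, not from the diff):

#include <crypto/poly1305.h>

static void poly1305_tag(u8 tag[POLY1305_DIGEST_SIZE], const u8 *msg,
			 unsigned int len, const u8 key[POLY1305_KEY_SIZE])
{
	struct poly1305_desc_ctx ctx;

	poly1305_init_arch(&ctx, key);		/* loads r (clamped) and s */
	poly1305_update_arch(&ctx, msg, len);
	poly1305_final_arch(&ctx, tag);		/* 16-byte authenticator */
}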
arch/mips/crypto/poly1305-mips.pl (new file, 1273 lines; diff suppressed because it is too large)
@@ -152,6 +152,7 @@ config PPC
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF		if PPC64
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
 	select BUILDTIME_EXTABLE_SORT

@@ -1009,6 +1010,19 @@ config FSL_RIO

 source "drivers/rapidio/Kconfig"

+config PPC_RTAS_FILTER
+	bool "Enable filtering of RTAS syscalls"
+	default y
+	depends on PPC_RTAS
+	help
+	  The RTAS syscall API has security issues that could be used to
+	  compromise system integrity. This option enforces restrictions on the
+	  RTAS calls and arguments passed by userspace programs to mitigate
+	  these issues.
+
+	  Say Y unless you know what you are doing and the filter is causing
+	  problems for you.
+
 endmenu

 config NONSTATIC_KERNEL

@@ -12,6 +12,8 @@
 #ifndef _ASM_POWERPC_LMB_H
 #define _ASM_POWERPC_LMB_H

+#include <linux/sched.h>
+
 struct drmem_lmb {
 	u64	base_addr;
 	u32	drc_index;

@@ -22,13 +24,27 @@ struct drmem_lmb {
 struct drmem_lmb_info {
 	struct drmem_lmb	*lmbs;
 	int			n_lmbs;
-	u32			lmb_size;
+	u64			lmb_size;
 };

 extern struct drmem_lmb_info *drmem_info;

+static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
+					       const struct drmem_lmb *start)
+{
+	/*
+	 * DLPAR code paths can take several milliseconds per element
+	 * when interacting with firmware. Ensure that we don't
+	 * unfairly monopolize the CPU.
+	 */
+	if (((++lmb - start) % 16) == 0)
+		cond_resched();
+
+	return lmb;
+}
+
 #define for_each_drmem_lmb_in_range(lmb, start, end)		\
-	for ((lmb) = (start); (lmb) < (end); (lmb)++)
+	for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))

 #define for_each_drmem_lmb(lmb)					\
 	for_each_drmem_lmb_in_range((lmb),			\

@@ -67,7 +83,7 @@ struct of_drconf_cell_v2 {
 #define DRCONF_MEM_AI_INVALID	0x00000040
 #define DRCONF_MEM_RESERVED	0x00000080

-static inline u32 drmem_lmb_size(void)
+static inline u64 drmem_lmb_size(void)
 {
 	return drmem_info->lmb_size;
 }

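Editorial aside: drmem_lmb_next() is a general-purpose pattern worth noting — when each loop step can spend milliseconds in firmware, reschedule periodically rather than monopolizing the CPU. A standalone sketch of the idiom under assumed names (struct elem and process_slowly() are hypothetical, not kernel APIs):

#include <linux/sched.h>

static void walk_slow_elements(struct elem *first, struct elem *last)
{
	struct elem *e;
	unsigned long n = 0;

	for (e = first; e < last; e++) {
		process_slowly(e);	/* hypothetical: slow firmware call */
		if ((++n % 16) == 0)	/* same batch factor as above */
			cond_resched();
	}
}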
@@ -204,7 +204,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  */
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
-	switch_mm(prev, next, current);
+	switch_mm_irqs_off(prev, next, current);
 }

 /* We don't currently use enter_lazy_tlb() for anything */

@@ -788,7 +788,7 @@
 #define   THRM1_TIN	(1 << 31)
 #define   THRM1_TIV	(1 << 30)
 #define   THRM1_THRES(x)	((x&0x7f)<<23)
-#define   THRM3_SITV(x)	((x&0x3fff)<<1)
+#define   THRM3_SITV(x)	((x & 0x1fff) << 1)
 #define   THRM1_TID	(1<<2)
 #define   THRM1_TIE	(1<<1)
 #define   THRM1_V	(1<<0)

@@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
 		return false;
 	return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
 }
-static inline void mm_reset_thread_local(struct mm_struct *mm)
-{
-	WARN_ON(atomic_read(&mm->context.copros) > 0);
-	/*
-	 * It's possible for mm_access to take a reference on mm_users to
-	 * access the remote mm from another thread, but it's not allowed
-	 * to set mm_cpumask, so mm_users may be > 1 here.
-	 */
-	WARN_ON(current->mm != mm);
-	atomic_set(&mm->context.active_cpus, 1);
-	cpumask_clear(mm_cpumask(mm));
-	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-}
 #else /* CONFIG_PPC_BOOK3S_64 */
 static inline int mm_is_thread_local(struct mm_struct *mm)
 {

@@ -1057,6 +1057,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
 	return NULL;
 }

+#ifdef CONFIG_PPC_RTAS_FILTER
+
+/*
+ * The sys_rtas syscall, as originally designed, allows root to pass
+ * arbitrary physical addresses to RTAS calls. A number of RTAS calls
+ * can be abused to write to arbitrary memory and do other things that
+ * are potentially harmful to system integrity, and thus should only
+ * be used inside the kernel and not exposed to userspace.
+ *
+ * All known legitimate users of the sys_rtas syscall will only ever
+ * pass addresses that fall within the RMO buffer, and use a known
+ * subset of RTAS calls.
+ *
+ * Accordingly, we filter RTAS requests to check that the call is
+ * permitted, and that provided pointers fall within the RMO buffer.
+ * The rtas_filters list contains an entry for each permitted call,
+ * with the indexes of the parameters which are expected to contain
+ * addresses and sizes of buffers allocated inside the RMO buffer.
+ */
+struct rtas_filter {
+	const char *name;
+	int token;
+	/* Indexes into the args buffer, -1 if not used */
+	int buf_idx1;
+	int size_idx1;
+	int buf_idx2;
+	int size_idx2;
+
+	int fixed_size;
+};
+
+static struct rtas_filter rtas_filters[] __ro_after_init = {
+	{ "ibm,activate-firmware", -1, -1, -1, -1, -1 },
+	{ "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 },	/* Special cased */
+	{ "display-character", -1, -1, -1, -1, -1 },
+	{ "ibm,display-message", -1, 0, -1, -1, -1 },
+	{ "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
+	{ "ibm,close-errinjct", -1, -1, -1, -1, -1 },
+	{ "ibm,open-errinct", -1, -1, -1, -1, -1 },
+	{ "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
+	{ "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
+	{ "ibm,get-indices", -1, 2, 3, -1, -1 },
+	{ "get-power-level", -1, -1, -1, -1, -1 },
+	{ "get-sensor-state", -1, -1, -1, -1, -1 },
+	{ "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
+	{ "get-time-of-day", -1, -1, -1, -1, -1 },
+	{ "ibm,get-vpd", -1, 0, -1, 1, 2 },
+	{ "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
+	{ "ibm,platform-dump", -1, 4, 5, -1, -1 },
+	{ "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
+	{ "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
+	{ "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
+	{ "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
+	{ "set-indicator", -1, -1, -1, -1, -1 },
+	{ "set-power-level", -1, -1, -1, -1, -1 },
+	{ "set-time-for-power-on", -1, -1, -1, -1, -1 },
+	{ "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
+	{ "set-time-of-day", -1, -1, -1, -1, -1 },
+	{ "ibm,suspend-me", -1, -1, -1, -1, -1 },
+	{ "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
+	{ "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
+	{ "ibm,physical-attestation", -1, 0, 1, -1, -1 },
+};
+
+static bool in_rmo_buf(u32 base, u32 end)
+{
+	return base >= rtas_rmo_buf &&
+		base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) &&
+		base <= end &&
+		end >= rtas_rmo_buf &&
+		end < (rtas_rmo_buf + RTAS_RMOBUF_MAX);
+}
+
+static bool block_rtas_call(int token, int nargs,
+			    struct rtas_args *args)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
+		struct rtas_filter *f = &rtas_filters[i];
+		u32 base, size, end;
+
+		if (token != f->token)
+			continue;
+
+		if (f->buf_idx1 != -1) {
+			base = be32_to_cpu(args->args[f->buf_idx1]);
+			if (f->size_idx1 != -1)
+				size = be32_to_cpu(args->args[f->size_idx1]);
+			else if (f->fixed_size)
+				size = f->fixed_size;
+			else
+				size = 1;
+
+			end = base + size - 1;
+			if (!in_rmo_buf(base, end))
+				goto err;
+		}
+
+		if (f->buf_idx2 != -1) {
+			base = be32_to_cpu(args->args[f->buf_idx2]);
+			if (f->size_idx2 != -1)
+				size = be32_to_cpu(args->args[f->size_idx2]);
+			else if (f->fixed_size)
+				size = f->fixed_size;
+			else
+				size = 1;
+			end = base + size - 1;
+
+			/*
+			 * Special case for ibm,configure-connector where the
+			 * address can be 0
+			 */
+			if (!strcmp(f->name, "ibm,configure-connector") &&
+			    base == 0)
+				return false;
+
+			if (!in_rmo_buf(base, end))
+				goto err;
+		}
+
+		return false;
+	}
+
+err:
+	pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
+	pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
+			   token, nargs, current->comm);
+	return true;
+}
+
+#else
+
+static bool block_rtas_call(int token, int nargs,
+			    struct rtas_args *args)
+{
+	return false;
+}
+
+#endif /* CONFIG_PPC_RTAS_FILTER */
+
 /* We assume to be passed big endian arguments */
 SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
 {

@@ -1094,6 +1235,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
 	args.rets = &args.args[nargs];
 	memset(args.rets, 0, nret * sizeof(rtas_arg_t));

+	if (block_rtas_call(token, nargs, &args))
+		return -EINVAL;
+
 	/* Need to handle ibm,suspend_me call specially */
 	if (token == ibm_suspend_me_token) {

@@ -1155,6 +1299,9 @@ void __init rtas_initialize(void)
 	unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
 	u32 base, size, entry;
 	int no_base, no_size, no_entry;
+#ifdef CONFIG_PPC_RTAS_FILTER
+	int i;
+#endif

 	/* Get RTAS dev node and fill up our "rtas" structure with infos
 	 * about it.

@@ -1190,6 +1337,12 @@ void __init rtas_initialize(void)
 #ifdef CONFIG_RTAS_ERROR_LOGGING
 	rtas_last_error_token = rtas_token("rtas-last-error");
 #endif
+
+#ifdef CONFIG_PPC_RTAS_FILTER
+	for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
+		rtas_filters[i].token = rtas_token(rtas_filters[i].name);
+	}
+#endif
 }

 int __init early_init_dt_scan_rtas(unsigned long node,

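Editorial aside: to make the filter's address check easy to reason about in isolation, in_rmo_buf() requires both endpoints of [base, end] to fall inside the RMO window. A standalone sketch with the window passed explicitly (parameter names are ours, not the kernel's):

#include <linux/types.h>

static bool range_in_window(u32 base, u32 end, u32 win_start, u32 win_len)
{
	return base >= win_start &&
	       base < (win_start + win_len) &&
	       base <= end &&
	       end >= win_start &&
	       end < (win_start + win_len);
}

For example, for the { "ibm,get-system-parameter", -1, 1, 2, -1, -1 } entry above, args[1] is the buffer address and args[2] its length, so the call is blocked unless range_in_window(args[1], args[1] + args[2] - 1, rtas_rmo_buf, RTAS_RMOBUF_MAX) holds.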
@@ -29,29 +29,27 @@

 static DEFINE_PER_CPU(struct cpu, cpu_devices);

-/*
- * SMT snooze delay stuff, 64-bit only for now
- */
-
 #ifdef CONFIG_PPC64

-/* Time in microseconds we delay before sleeping in the idle loop */
-static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
+/*
+ * Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
+ * smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
+ * 2014:
+ *
+ *  "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
+ *  up the kernel code."
+ *
+ * powerpc-utils stopped using it as of 1.3.8. At some point in the future this
+ * code should be removed.
+ */

 static ssize_t store_smt_snooze_delay(struct device *dev,
 				      struct device_attribute *attr,
 				      const char *buf,
 				      size_t count)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, dev);
-	ssize_t ret;
-	long snooze;
-
-	ret = sscanf(buf, "%ld", &snooze);
-	if (ret != 1)
-		return -EINVAL;
-
-	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
+	pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n",
+		     current->comm, current->pid);
 	return count;
 }

@@ -59,9 +57,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev,
 				     struct device_attribute *attr,
 				     char *buf)
 {
-	struct cpu *cpu = container_of(dev, struct cpu, dev);
-
-	return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
+	pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n",
+		     current->comm, current->pid);
+	return sprintf(buf, "100\n");
 }

 static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,

@@ -69,16 +67,10 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,

 static int __init setup_smt_snooze_delay(char *str)
 {
-	unsigned int cpu;
-	long snooze;
-
 	if (!cpu_has_feature(CPU_FTR_SMT))
 		return 1;

-	snooze = simple_strtol(str, NULL, 10);
-	for_each_possible_cpu(cpu)
-		per_cpu(smt_snooze_delay, cpu) = snooze;
-
+	pr_warn("smt-snooze-delay command line option has no effect\n");
 	return 1;
 }
 __setup("smt-snooze-delay=", setup_smt_snooze_delay);

@@ -13,13 +13,14 @@
  */

 #include <linux/errno.h>
-#include <linux/jiffies.h>
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>

 #include <asm/io.h>
 #include <asm/reg.h>

@@ -39,9 +40,7 @@ static struct tau_temp
 	unsigned char grew;
 } tau[NR_CPUS];

-struct timer_list tau_timer;
-
-#undef DEBUG
+static bool tau_int_enable;

 /* TODO: put these in a /proc interface, with some sanity checks, and maybe
  * dynamic adjustment to minimize # of interrupts */

@@ -50,72 +49,49 @@ struct timer_list tau_timer;
 #define step_size		2	/* step size when temp goes out of range */
 #define window_expand		1	/* expand the window by this much */
 /* configurable values for shrinking the window */
-#define shrink_timer	2*HZ	/* period between shrinking the window */
+#define shrink_timer	2000	/* period between shrinking the window */
 #define min_window	2	/* minimum window size, degrees C */

 static void set_thresholds(unsigned long cpu)
 {
-#ifdef CONFIG_TAU_INT
-	/*
-	 * setup THRM1,
-	 * threshold, valid bit, enable interrupts, interrupt when below threshold
-	 */
-	mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
+	u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;

-	/* setup THRM2,
-	 * threshold, valid bit, enable interrupts, interrupt when above threshold
-	 */
-	mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
-#else
-	/* same thing but don't enable interrupts */
-	mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
-	mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
-#endif
+	/* setup THRM1, threshold, valid bit, interrupt when below threshold */
+	mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
+
+	/* setup THRM2, threshold, valid bit, interrupt when above threshold */
+	mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
 }

 static void TAUupdate(int cpu)
 {
-	unsigned thrm;
-
-#ifdef DEBUG
-	printk("TAUupdate ");
-#endif
+	u32 thrm;
+	u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;

 	/* if both thresholds are crossed, the step_sizes cancel out
 	 * and the window winds up getting expanded twice. */
-	if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
-		if(thrm & THRM1_TIN){ /* crossed low threshold */
-			if (tau[cpu].low >= step_size){
-				tau[cpu].low -= step_size;
-				tau[cpu].high -= (step_size - window_expand);
-			}
-			tau[cpu].grew = 1;
-#ifdef DEBUG
-			printk("low threshold crossed ");
-#endif
-		}
+	thrm = mfspr(SPRN_THRM1);
+	if ((thrm & bits) == bits) {
+		mtspr(SPRN_THRM1, 0);
+
+		if (tau[cpu].low >= step_size) {
+			tau[cpu].low -= step_size;
+			tau[cpu].high -= (step_size - window_expand);
+		}
+		tau[cpu].grew = 1;
+		pr_debug("%s: low threshold crossed\n", __func__);
 	}
-	if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
-		if(thrm & THRM1_TIN){ /* crossed high threshold */
-			if (tau[cpu].high <= 127-step_size){
-				tau[cpu].low += (step_size - window_expand);
-				tau[cpu].high += step_size;
-			}
-			tau[cpu].grew = 1;
-#ifdef DEBUG
-			printk("high threshold crossed ");
-#endif
-		}
+	thrm = mfspr(SPRN_THRM2);
+	if ((thrm & bits) == bits) {
+		mtspr(SPRN_THRM2, 0);
+
+		if (tau[cpu].high <= 127 - step_size) {
+			tau[cpu].low += (step_size - window_expand);
+			tau[cpu].high += step_size;
+		}
+		tau[cpu].grew = 1;
+		pr_debug("%s: high threshold crossed\n", __func__);
 	}
-
-#ifdef DEBUG
-	printk("grew = %d\n", tau[cpu].grew);
-#endif
-
-#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
-	set_thresholds(cpu);
-#endif
-
 }

@@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
 static void tau_timeout(void * info)
 {
 	int cpu;
-	unsigned long flags;
 	int size;
 	int shrink;

-	/* disabling interrupts *should* be okay */
-	local_irq_save(flags);
 	cpu = smp_processor_id();

-#ifndef CONFIG_TAU_INT
-	TAUupdate(cpu);
-#endif
+	if (!tau_int_enable)
+		TAUupdate(cpu);
+
+	/* Stop thermal sensor comparisons and interrupts */
+	mtspr(SPRN_THRM3, 0);

 	size = tau[cpu].high - tau[cpu].low;
 	if (size > min_window && ! tau[cpu].grew) {

@@ -173,32 +148,26 @@ static void tau_timeout(void * info)

 	set_thresholds(cpu);

-	/*
-	 * Do the enable every time, since otherwise a bunch of (relatively)
-	 * complex sleep code needs to be added. One mtspr every time
-	 * tau_timeout is called is probably not a big deal.
-	 *
-	 * Enable thermal sensor and set up sample interval timer
-	 * need 20 us to do the compare.. until a nice 'cpu_speed' function
-	 * call is implemented, just assume a 500 mhz clock. It doesn't really
-	 * matter if we take too long for a compare since it's all interrupt
-	 * driven anyway.
-	 *
-	 * use a extra long time.. (60 us @ 500 mhz)
+	/* Restart thermal sensor comparisons and interrupts.
+	 * The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
+	 * recommends that "the maximum value be set in THRM3 under all
+	 * conditions."
 	 */
-	mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
-
-	local_irq_restore(flags);
+	mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
 }

-static void tau_timeout_smp(struct timer_list *unused)
+static struct workqueue_struct *tau_workq;
+
+static void tau_work_func(struct work_struct *work)
 {
-
-	/* schedule ourselves to be run again */
-	mod_timer(&tau_timer, jiffies + shrink_timer) ;
+	msleep(shrink_timer);
 	on_each_cpu(tau_timeout, NULL, 0);
+	/* schedule ourselves to be run again */
+	queue_work(tau_workq, work);
 }

+DECLARE_WORK(tau_work, tau_work_func);
+
 /*
  * setup the TAU
  *

@@ -231,21 +200,19 @@ static int __init TAU_init(void)
 		return 1;
 	}

+	tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
+			 !strcmp(cur_cpu_spec->platform, "ppc750");
+
-	/* first, set up the window shrinking timer */
-	timer_setup(&tau_timer, tau_timeout_smp, 0);
-	tau_timer.expires = jiffies + shrink_timer;
-	add_timer(&tau_timer);
+	tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
+	if (!tau_workq)
+		return -ENOMEM;

 	on_each_cpu(TAU_init_smp, NULL, 0);

-	printk("Thermal assist unit ");
-#ifdef CONFIG_TAU_INT
-	printk("using interrupts, ");
-#else
-	printk("using timers, ");
-#endif
-	printk("shrink_timer: %d jiffies\n", shrink_timer);
+	queue_work(tau_workq, &tau_work);
+
+	pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
+		tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
 	tau_initialized = 1;

 	return 0;

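Editorial aside: the TAU rework above is a textbook timer-to-workqueue conversion — work items run in process context, so they may msleep() and then re-queue themselves, which a timer callback cannot. A minimal sketch of the self-rearming pattern (names are illustrative, not from the diff):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

static struct workqueue_struct *poll_wq;

static void poll_fn(struct work_struct *work)
{
	msleep(2000);			/* fine here: process context */
	/* ... periodic work goes here ... */
	queue_work(poll_wq, work);	/* re-arm */
}
static DECLARE_WORK(poll_work, poll_fn);

static int __init poll_start(void)
{
	poll_wq = alloc_workqueue("poll", WQ_UNBOUND, 1);
	if (!poll_wq)
		return -ENOMEM;
	queue_work(poll_wq, &poll_work);
	return 0;
}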
@@ -794,7 +794,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
 {
 	unsigned int ra, rb, t, i, sel, instr, rc;
 	const void __user *addr;
-	u8 vbuf[16], *vdst;
+	u8 vbuf[16] __aligned(16), *vdst;
 	unsigned long ea, msr, msr_mask;
 	bool swap;

@@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
 	struct mm_struct *mm = arg;
 	unsigned long pid = mm->context.id;

+	/*
+	 * A kthread could have done a mmget_not_zero() after the flushing CPU
+	 * checked mm_is_singlethreaded, and be in the process of
+	 * kthread_use_mm when interrupted here. In that case, current->mm will
+	 * be set to mm, because kthread_use_mm() setting ->mm and switching to
+	 * the mm is done with interrupts off.
+	 */
 	if (current->mm == mm)
-		return; /* Local CPU */
+		goto out_flush;

 	if (current->active_mm == mm) {
-		/*
-		 * Must be a kernel thread because sender is single-threaded.
-		 */
-		BUG_ON(current->mm);
+		WARN_ON_ONCE(current->mm != NULL);
+		/* Is a kernel thread and is using mm as the lazy tlb */
 		mmgrab(&init_mm);
-		switch_mm(mm, &init_mm, current);
 		current->active_mm = &init_mm;
+		switch_mm_irqs_off(mm, &init_mm, current);
 		mmdrop(mm);
 	}

 	atomic_dec(&mm->context.active_cpus);
 	cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));

+out_flush:
 	_tlbiel_pid(pid, RIC_FLUSH_ALL);
 }

@@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
 	 */
 	smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
 				(void *)mm, 1);
-	mm_reset_thread_local(mm);
 }

 void radix__flush_tlb_mm(struct mm_struct *mm)

@@ -95,7 +95,7 @@ REQUEST(__field(0,	8,	partition_id)

 #define REQUEST_NAME system_performance_capabilities
 #define REQUEST_NUM 0x40
-#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
+#define REQUEST_IDX_KIND "starting_index=0xffffffff"
 #include I(REQUEST_BEGIN)
 REQUEST(__field(0,	1,	perf_collect_privileged)
 	__field(0x1,	1,	capability_mask)

@@ -223,7 +223,7 @@ REQUEST(__field(0,	2,	partition_id)

 #define REQUEST_NAME system_hypervisor_times
 #define REQUEST_NUM 0xF0
-#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
+#define REQUEST_IDX_KIND "starting_index=0xffffffff"
 #include I(REQUEST_BEGIN)
 REQUEST(__count(0,	8,	time_spent_to_dispatch_virtual_processors)
 	__count(0x8,	8,	time_spent_processing_virtual_processor_timers)

@@ -234,7 +234,7 @@ REQUEST(__count(0,	8,	time_spent_to_dispatch_virtual_processors)

 #define REQUEST_NAME system_tlbie_count_and_time
 #define REQUEST_NUM 0xF4
-#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
+#define REQUEST_IDX_KIND "starting_index=0xffffffff"
 #include I(REQUEST_BEGIN)
 REQUEST(__count(0,	8,	tlbie_instructions_issued)
 	/*

@@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)

 		mask  |= CNST_PMC_MASK(pmc);
 		value |= CNST_PMC_VAL(pmc);
+
+		/*
+		 * PMC5 and PMC6 are used to count cycles and instructions and
+		 * they do not support most of the constraint bits. Add a check
+		 * to exclude PMC5/6 from most of the constraints except for
+		 * EBB/BHRB.
+		 */
+		if (pmc >= 5)
+			goto ebb_bhrb;
 	}

 	if (pmc <= 4) {

@@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 		}
 	}

+ebb_bhrb:
 	if (!pmc && ebb)
 		/* EBB events must specify the PMC */
 		return -1;

@@ -238,12 +238,11 @@ config TAU
 	  temperature within 2-4 degrees Celsius. This option shows the current
 	  on-die temperature in /proc/cpuinfo if the cpu supports it.

-	  Unfortunately, on some chip revisions, this sensor is very inaccurate
-	  and in many cases, does not work at all, so don't assume the cpu
-	  temp is actually what /proc/cpuinfo says it is.
+	  Unfortunately, this sensor is very inaccurate when uncalibrated, so
+	  don't assume the cpu temp is actually what /proc/cpuinfo says it is.

 config TAU_INT
-	bool "Interrupt driven TAU driver (DANGEROUS)"
+	bool "Interrupt driven TAU driver (EXPERIMENTAL)"
 	depends on TAU
 	---help---
 	  The TAU supports an interrupt driven mode which causes an interrupt

@@ -251,12 +250,7 @@ config TAU_INT
 	  to get notified the temp has exceeded a range. With this option off,
 	  a timer is used to re-check the temperature periodically.

-	  However, on some cpus it appears that the TAU interrupt hardware
-	  is buggy and can cause a situation which would lead unexplained hard
-	  lockups.
-
-	  Unless you are extending the TAU driver, or enjoy kernel/hardware
-	  debugging, leave this option off.
+	  If in doubt, say N here.

 config TAU_AVERAGE
 	bool "Average high and low temp"

@@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
	return count;
}

-static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
-					uint32_t type)
+static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
{
	struct dump_obj *dump;
	int rc;

	dump = kzalloc(sizeof(*dump), GFP_KERNEL);
	if (!dump)
-		return NULL;
+		return;

	dump->kobj.kset = dump_kset;

@@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
	rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
	if (rc) {
		kobject_put(&dump->kobj);
-		return NULL;
+		return;
	}

+	/*
+	 * As soon as the sysfs file for this dump is created/activated there is
+	 * a chance the opal_errd daemon (or any userspace) might read and
+	 * acknowledge the dump before kobject_uevent() is called. If that
+	 * happens then there is a potential race between
+	 * dump_ack_store->kobject_put() and kobject_uevent() which leads to a
+	 * use-after-free of a kernfs object resulting in a kernel crash.
+	 *
+	 * To avoid that, we need to take a reference on behalf of the bin file,
+	 * so that our reference remains valid while we call kobject_uevent().
+	 * We then drop our reference before exiting the function, leaving the
+	 * bin file to drop the last reference (if it hasn't already).
+	 */
+
+	/* Take a reference for the bin file */
+	kobject_get(&dump->kobj);
	rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
-	if (rc) {
+	if (rc == 0) {
+		kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+		pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+			__func__, dump->id, dump->size);
+	} else {
+		/* Drop reference count taken for bin file */
		kobject_put(&dump->kobj);
-		return NULL;
	}

-	pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
-		__func__, dump->id, dump->size);
-
-	kobject_uevent(&dump->kobj, KOBJ_ADD);
-
-	return dump;
+	/* Drop our reference */
+	kobject_put(&dump->kobj);
+	return;
}

static irqreturn_t process_dump(int irq, void *data)
@@ -183,14 +183,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
	return count;
}

-static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
{
	struct elog_obj *elog;
	int rc;

	elog = kzalloc(sizeof(*elog), GFP_KERNEL);
	if (!elog)
-		return NULL;
+		return;

	elog->kobj.kset = elog_kset;

@@ -223,18 +223,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
	rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
	if (rc) {
		kobject_put(&elog->kobj);
-		return NULL;
+		return;
	}

+	/*
+	 * As soon as the sysfs file for this elog is created/activated there is
+	 * a chance the opal_errd daemon (or any userspace) might read and
+	 * acknowledge the elog before kobject_uevent() is called. If that
+	 * happens then there is a potential race between
+	 * elog_ack_store->kobject_put() and kobject_uevent() which leads to a
+	 * use-after-free of a kernfs object resulting in a kernel crash.
+	 *
+	 * To avoid that, we need to take a reference on behalf of the bin file,
+	 * so that our reference remains valid while we call kobject_uevent().
+	 * We then drop our reference before exiting the function, leaving the
+	 * bin file to drop the last reference (if it hasn't already).
+	 */
+
+	/* Take a reference for the bin file */
+	kobject_get(&elog->kobj);
	rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
-	if (rc) {
+	if (rc == 0) {
+		kobject_uevent(&elog->kobj, KOBJ_ADD);
+	} else {
+		/* Drop the reference taken for the bin file */
		kobject_put(&elog->kobj);
-		return NULL;
	}

-	kobject_uevent(&elog->kobj, KOBJ_ADD);
+	/* Drop our reference */
+	kobject_put(&elog->kobj);

-	return elog;
+	return;
}

static irqreturn_t elog_event(int irq, void *data)
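The dump and elog hunks above fix the same use-after-free with the same reference dance. Boiled down to its essentials — publish_obj() is a hypothetical name for this note, not a kernel API:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static void publish_obj(struct kobject *kobj,
			const struct bin_attribute *attr)
{
	/* Pin the kobject on behalf of the bin file before userspace can
	 * see it, so an early acknowledgement cannot drop the last
	 * reference while kobject_uevent() is still running. */
	kobject_get(kobj);
	if (sysfs_create_bin_file(kobj, attr) == 0)
		kobject_uevent(kobj, KOBJ_ADD);
	else
		kobject_put(kobj);	/* no bin file: drop its reference */

	/* Drop the creator's reference; the bin file (or the error path
	 * above) now owns whatever remains. */
	kobject_put(kobj);
}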
@@ -47,7 +47,7 @@
#include <asm/udbg.h>
#define DBG(fmt...) udbg_printf(fmt)
#else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
#endif

static void pnv_smp_setup_cpu(int cpu)
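The DBG() change above swaps an empty expansion for a do { } while (0) body. An empty macro leaves a bare semicolon behind, which trips -Wempty-body and binds surprisingly in unbraced if/else chains; a minimal sketch (macro names here are illustrative):

#define DBG_EMPTY(fmt...)
#define DBG_STMT(fmt...)	do { } while (0)

static inline void example(int failed)
{
	if (failed)
		DBG_STMT("failed\n");	/* expands to one real statement, */
	else
		DBG_STMT("ok\n");	/* so the else still binds correctly */
}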
@@ -40,6 +40,7 @@ static __init int rng_init(void)

	ppc_md.get_random_seed = pseries_get_random_long;

+	of_node_put(dn);
	return 0;
}
machine_subsys_initcall(pseries, rng_init);
@@ -179,6 +179,7 @@ int icp_hv_init(void)

	icp_ops = &icp_hv_ops;

+	of_node_put(np);
	return 0;
}

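The two of_node_put() hunks above plug the same leak: of_find_*() returns a device_node with its refcount raised, and the caller owns that reference. A minimal sketch of the expected pattern — the compatible string is illustrative, not taken from these drivers:

#include <linux/errno.h>
#include <linux/of.h>

static int __init probe_example_node(void)
{
	struct device_node *dn;

	dn = of_find_compatible_node(NULL, NULL, "example,device");
	if (!dn)
		return -ENODEV;

	/* ... consume properties, install hooks ... */

	of_node_put(dn);	/* balance the reference from of_find_* */
	return 0;
}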
@@ -21,4 +21,7 @@
/* vDSO location */
#define AT_SYSINFO_EHDR 33

+/* entries in ARCH_DLINFO */
+#define AT_VECTOR_SIZE_ARCH 1
+
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */
@@ -354,8 +354,9 @@ static DEFINE_PER_CPU(atomic_t, clock_sync_word);
static DEFINE_MUTEX(clock_sync_mutex);
static unsigned long clock_sync_flags;

-#define CLOCK_SYNC_HAS_STP	0
-#define CLOCK_SYNC_STP		1
+#define CLOCK_SYNC_HAS_STP		0
+#define CLOCK_SYNC_STP			1
+#define CLOCK_SYNC_STPINFO_VALID	2

/*
 * The get_clock function for the physical clock. It will get the current
@@ -592,6 +593,22 @@ void stp_queue_work(void)
	queue_work(time_sync_wq, &stp_work);
}

+static int __store_stpinfo(void)
+{
+	int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+
+	if (rc)
+		clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+	else
+		set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+	return rc;
+}
+
+static int stpinfo_valid(void)
+{
+	return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
+}
+
static int stp_sync_clock(void *data)
{
	struct clock_sync_data *sync = data;
@@ -613,8 +630,7 @@ static int stp_sync_clock(void *data)
	if (rc == 0) {
		sync->clock_delta = clock_delta;
		clock_sync_global(clock_delta);
-		rc = chsc_sstpi(stp_page, &stp_info,
-				sizeof(struct stp_sstpi));
+		rc = __store_stpinfo();
		if (rc == 0 && stp_info.tmd != 2)
			rc = -EAGAIN;
	}
@@ -659,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
	if (rc)
		goto out_unlock;

-	rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+	rc = __store_stpinfo();
	if (rc || stp_info.c == 0)
		goto out_unlock;

@@ -696,10 +712,14 @@ static ssize_t stp_ctn_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online)
-		return -ENODATA;
-	return sprintf(buf, "%016llx\n",
-		       *(unsigned long long *) stp_info.ctnid);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid())
+		ret = sprintf(buf, "%016llx\n",
+			      *(unsigned long long *) stp_info.ctnid);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
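Every stp_*_show() hunk that follows applies the same treatment as stp_ctn_id_show() above. The shared shape, condensed into one sketch (the attribute name and printed field are placeholders; the mutex and validity helper come from the hunks above):

static ssize_t stp_example_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	ssize_t ret = -ENODATA;

	mutex_lock(&stp_work_mutex);	/* serialize against stp_work_fn() */
	if (stpinfo_valid())		/* cached stp_info is trustworthy */
		ret = sprintf(buf, "%i\n", stp_info.tmd);
	mutex_unlock(&stp_work_mutex);
	return ret;
}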
@@ -708,9 +728,13 @@ static ssize_t stp_ctn_type_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online)
-		return -ENODATA;
-	return sprintf(buf, "%i\n", stp_info.ctn);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid())
+		ret = sprintf(buf, "%i\n", stp_info.ctn);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
@@ -719,9 +743,13 @@ static ssize_t stp_dst_offset_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online || !(stp_info.vbits & 0x2000))
-		return -ENODATA;
-	return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid() && (stp_info.vbits & 0x2000))
+		ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
@@ -730,9 +758,13 @@ static ssize_t stp_leap_seconds_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online || !(stp_info.vbits & 0x8000))
-		return -ENODATA;
-	return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid() && (stp_info.vbits & 0x8000))
+		ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
@@ -741,9 +773,13 @@ static ssize_t stp_stratum_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online)
-		return -ENODATA;
-	return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid())
+		ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
@@ -752,9 +788,13 @@ static ssize_t stp_time_offset_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online || !(stp_info.vbits & 0x0800))
-		return -ENODATA;
-	return sprintf(buf, "%i\n", (int) stp_info.tto);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid() && (stp_info.vbits & 0x0800))
+		ret = sprintf(buf, "%i\n", (int) stp_info.tto);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
@@ -763,9 +803,13 @@ static ssize_t stp_time_zone_offset_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online || !(stp_info.vbits & 0x4000))
-		return -ENODATA;
-	return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid() && (stp_info.vbits & 0x4000))
+		ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(time_zone_offset, 0400,
@@ -775,9 +819,13 @@ static ssize_t stp_timing_mode_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online)
-		return -ENODATA;
-	return sprintf(buf, "%i\n", stp_info.tmd);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid())
+		ret = sprintf(buf, "%i\n", stp_info.tmd);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
@@ -786,9 +834,13 @@ static ssize_t stp_timing_state_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
-	if (!stp_online)
-		return -ENODATA;
-	return sprintf(buf, "%i\n", stp_info.tst);
+	ssize_t ret = -ENODATA;
+
+	mutex_lock(&stp_work_mutex);
+	if (stpinfo_valid())
+		ret = sprintf(buf, "%i\n", stp_info.tst);
+	mutex_unlock(&stp_work_mutex);
+	return ret;
}

static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
@@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void)
 * are flush_tlb_*() routines, and these run after flush_cache_*()
 * which performs the flushw.
 *
- * The SMP TLB coherency scheme we use works as follows:
- *
- * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
- *    space has (potentially) executed on, this is the heuristic
- *    we use to avoid doing cross calls.
- *
- *    Also, for flushing from kswapd and also for clones, we
- *    use cpu_vm_mask as the list of cpus to make run the TLB.
- *
- * 2) TLB context numbers are shared globally across all processors
- *    in the system, this allows us to play several games to avoid
- *    cross calls.
- *
- *    One invariant is that when a cpu switches to a process, and
- *    that processes tsk->active_mm->cpu_vm_mask does not have the
- *    current cpu's bit set, that tlb context is flushed locally.
- *
- *    If the address space is non-shared (ie. mm->count == 1) we avoid
- *    cross calls when we want to flush the currently running process's
- *    tlb state. This is done by clearing all cpu bits except the current
- *    processor's in current->mm->cpu_vm_mask and performing the
- *    flush locally only. This will force any subsequent cpus which run
- *    this task to flush the context from the local tlb if the process
- *    migrates to another cpu (again).
- *
- * 3) For shared address spaces (threads) and swapping we bite the
- *    bullet for most cases and perform the cross call (but only to
- *    the cpus listed in cpu_vm_mask).
- *
- * The performance gain from "optimizing" away the cross call for threads is
- * questionable (in theory the big win for threads is the massive sharing of
- * address space state across processors).
+ * mm->cpu_vm_mask is a bit mask of which cpus an address
+ * space has (potentially) executed on, this is the heuristic
+ * we use to limit cross calls.
 */

/* This currently is only used by the hugetlb arch pre-fault
@@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void)
void smp_flush_tlb_mm(struct mm_struct *mm)
{
	u32 ctx = CTX_HWBITS(mm->context);
-	int cpu = get_cpu();

-	if (atomic_read(&mm->mm_users) == 1) {
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-		goto local_flush_and_out;
-	}
+	get_cpu();

	smp_cross_call_masked(&xcall_flush_tlb_mm,
			      ctx, 0, 0,
			      mm_cpumask(mm));

-local_flush_and_out:
	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);

	put_cpu();
@@ -1114,17 +1080,15 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
{
	u32 ctx = CTX_HWBITS(mm->context);
	struct tlb_pending_info info;
-	int cpu = get_cpu();
+
+	get_cpu();

	info.ctx = ctx;
	info.nr = nr;
	info.vaddrs = vaddrs;

-	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-	else
-		smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
-				       &info, 1);
+	smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
+			       &info, 1);

	__flush_tlb_pending(ctx, nr, vaddrs);

@@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
{
	unsigned long context = CTX_HWBITS(mm->context);
-	int cpu = get_cpu();
-
-	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-	else
-		smp_cross_call_masked(&xcall_flush_tlb_page,
-				      context, vaddr, 0,
-				      mm_cpumask(mm));
+	get_cpu();
+
+	smp_cross_call_masked(&xcall_flush_tlb_page,
+			      context, vaddr, 0,
+			      mm_cpumask(mm));

	__flush_tlb_page(context, vaddr);

	put_cpu();
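All three sparc64 flush paths above now follow one shape: stop trimming mm_cpumask() and simply hold the CPU across the cross call plus the local flush. A condensed sketch of that bracket — remote_flush()/local_flush() are hypothetical stand-ins, not kernel functions:

#include <linux/cpumask.h>
#include <linux/mm_types.h>
#include <linux/smp.h>

static void remote_flush(const struct cpumask *mask, u32 ctx);	/* stand-in */
static void local_flush(u32 ctx);				/* stand-in */

static void flush_mm_everywhere(struct mm_struct *mm, u32 ctx)
{
	get_cpu();	/* disable preemption: stay on this CPU throughout */

	remote_flush(mm_cpumask(mm), ctx);	/* cross call to other CPUs */
	local_flush(ctx);			/* then flush our own TLB */

	put_cpu();	/* re-enable preemption */
}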
@@ -36,14 +36,14 @@ int write_sigio_irq(int fd)
}

/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
-static DEFINE_SPINLOCK(sigio_spinlock);
+static DEFINE_MUTEX(sigio_mutex);

void sigio_lock(void)
{
-	spin_lock(&sigio_spinlock);
+	mutex_lock(&sigio_mutex);
}

void sigio_unlock(void)
{
-	spin_unlock(&sigio_spinlock);
+	mutex_unlock(&sigio_mutex);
}
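The sigio lock above becomes a mutex, presumably because the protected paths can sleep; a spinlock may never be held across a sleeping operation. A minimal sketch of the distinction:

#include <linux/mutex.h>
#include <linux/slab.h>

static DEFINE_MUTEX(example_mutex);

static void *example_locked_alloc(void)
{
	void *p;

	mutex_lock(&example_mutex);
	p = kmalloc(128, GFP_KERNEL);	/* may sleep: legal under a mutex,
					 * a bug under a held spinlock */
	mutex_unlock(&example_mutex);
	return p;
}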
@@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
+adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)

-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)

KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)

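The as-instr probe above turns assembler support for ADX into the C macro CONFIG_AS_ADX, which code can then test at compile time. A hedged sketch of how such a gate is typically consumed — the function names are hypothetical:

#include <linux/types.h>

void mul_generic(u64 *out, const u64 *a, const u64 *b);	/* stand-in */
#ifdef CONFIG_AS_ADX
void mul_adx(u64 *out, const u64 *a, const u64 *b);	/* asm, stand-in */
#endif

static void mul(u64 *out, const u64 *a, const u64 *b)
{
#ifdef CONFIG_AS_ADX
	mul_adx(out, a, b);	/* the assembler understands adox/adcx */
#else
	mul_generic(out, a, b);	/* portable fallback */
#endif
}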
@@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
# CONFIG_SLAB_MERGE_DEFAULT is not set
CONFIG_PROFILING=y
CONFIG_SMP=y
+CONFIG_X86_X2APIC=y
CONFIG_HYPERVISOR_GUEST=y
CONFIG_PARAVIRT=y
CONFIG_NR_CPUS=32

@@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
CONFIG_DM_BOW=y
CONFIG_NETDEVICES=y
CONFIG_DUMMY=y
+CONFIG_WIREGUARD=y
CONFIG_TUN=y
CONFIG_VETH=y
# CONFIG_ETHERNET is not set

@@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
CONFIG_HID_SONY=y
CONFIG_HID_STEAM=y
CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
CONFIG_USB_XHCI_HCD=y
CONFIG_USB_GADGET=y
CONFIG_USB_GADGET_VBUS_DRAW=500

@@ -436,6 +439,7 @@ CONFIG_CRC8=y
CONFIG_XZ_DEC=y
CONFIG_PRINTK_TIME=y
CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
# CONFIG_ENABLE_MUST_CHECK is not set
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
CONFIG_MAGIC_SYSRQ=y
arch/x86/crypto/.gitignore (new file, vendored, 1 line)
@@ -0,0 +1 @@
poly1305-x86_64-cryptogams.S
@@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
			$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)

obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o

@@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o

@@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o

+# These modules require the assembler to support ADX.
+ifeq ($(adx_supported),yes)
+	obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+endif
+
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \

@@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
	obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
	obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
	obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
+	obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
endif

# These modules require assembler to support AVX2.

@@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o

aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o

@@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o

+blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
+targets += poly1305-x86_64-cryptogams.S
+endif
+
ifeq ($(avx_supported),yes)
	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
					camellia_aesni_avx_glue.o

@@ -100,20 +114,22 @@ endif

ifeq ($(avx2_supported),yes)
	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
-	chacha20-x86_64-y += chacha20-avx2-x86_64.o
+	chacha-x86_64-y += chacha-avx2-x86_64.o
	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o

	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
endif

+ifeq ($(avx512_supported),yes)
+	chacha-x86_64-y += chacha-avx512vl-x86_64.o
+endif
+
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
ifeq ($(avx2_supported),yes)
	sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-	poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
ifeq ($(sha1_ni_supported),yes)
	sha1-ssse3-y += sha1_ni_asm.o

@@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $< > $@
+$(obj)/%.S: $(src)/%.pl FORCE
+	$(call if_changed,perlasm)
arch/x86/crypto/blake2s-core.S (new file, 258 lines)
@@ -0,0 +1,258 @@
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */

.text
#ifdef CONFIG_AS_SSSE3
ENTRY(blake2s_compress_ssse3)
	testq %rdx,%rdx
	je .Lendofloop
	movdqu (%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqa ROT16(%rip),%xmm12
	movdqa ROR328(%rip),%xmm13
	movdqu 0x20(%rdi),%xmm14
	movq %rcx,%xmm15
	leaq SIGMA+0xa0(%rip),%r8
	jmp .Lbeginofloop
	.align 32
.Lbeginofloop:
	movdqa %xmm0,%xmm10
	movdqa %xmm1,%xmm11
	paddq %xmm15,%xmm14
	movdqa IV(%rip),%xmm2
	movdqa %xmm14,%xmm3
	pxor IV+0x10(%rip),%xmm3
	leaq SIGMA(%rip),%rcx
.Lroundloop:
	movzbl (%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0x1(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x2(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x3(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	punpckldq %xmm5,%xmm4
	punpckldq %xmm7,%xmm6
	punpcklqdq %xmm6,%xmm4
	paddd %xmm4,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1
	movzbl 0x4(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0x5(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x6(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0x7(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	punpckldq %xmm6,%xmm5
	punpckldq %xmm4,%xmm7
	punpcklqdq %xmm7,%xmm5
	paddd %xmm5,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1
	pshufd $0x93,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x39,%xmm2,%xmm2
	movzbl 0x8(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	movzbl 0x9(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xa(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xb(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	punpckldq %xmm7,%xmm6
	punpckldq %xmm5,%xmm4
	punpcklqdq %xmm4,%xmm6
	paddd %xmm6,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm12,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0xc,%xmm1
	pslld $0x14,%xmm8
	por %xmm8,%xmm1
	movzbl 0xc(%rcx),%eax
	movd (%rsi,%rax,4),%xmm7
	movzbl 0xd(%rcx),%eax
	movd (%rsi,%rax,4),%xmm4
	movzbl 0xe(%rcx),%eax
	movd (%rsi,%rax,4),%xmm5
	movzbl 0xf(%rcx),%eax
	movd (%rsi,%rax,4),%xmm6
	punpckldq %xmm4,%xmm7
	punpckldq %xmm6,%xmm5
	punpcklqdq %xmm5,%xmm7
	paddd %xmm7,%xmm0
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
	pshufb %xmm13,%xmm3
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm8
	psrld $0x7,%xmm1
	pslld $0x19,%xmm8
	por %xmm8,%xmm1
	pshufd $0x39,%xmm0,%xmm0
	pshufd $0x4e,%xmm3,%xmm3
	pshufd $0x93,%xmm2,%xmm2
	addq $0x10,%rcx
	cmpq %r8,%rcx
	jnz .Lroundloop
	pxor %xmm2,%xmm0
	pxor %xmm3,%xmm1
	pxor %xmm10,%xmm0
	pxor %xmm11,%xmm1
	addq $0x40,%rsi
	decq %rdx
	jnz .Lbeginofloop
	movdqu %xmm0,(%rdi)
	movdqu %xmm1,0x10(%rdi)
	movdqu %xmm14,0x20(%rdi)
.Lendofloop:
	ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */

#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
	vmovdqu (%rdi),%xmm0
	vmovdqu 0x10(%rdi),%xmm1
	vmovdqu 0x20(%rdi),%xmm4
	vmovq %rcx,%xmm5
	vmovdqa IV(%rip),%xmm14
	vmovdqa IV+16(%rip),%xmm15
	jmp .Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa %xmm0,%xmm10
	vmovdqa %xmm1,%xmm11
	vpaddq %xmm5,%xmm4,%xmm4
	vmovdqa %xmm14,%xmm2
	vpxor %xmm15,%xmm4,%xmm3
	vmovdqu (%rsi),%ymm6
	vmovdqu 0x20(%rsi),%ymm7
	addq $0x40,%rsi
	leaq SIGMA2(%rip),%rax
	movb $0xa,%cl
.Lblake2s_compress_avx512_roundloop:
	addq $0x40,%rax
	vmovdqa -0x40(%rax),%ymm8
	vmovdqa -0x20(%rax),%ymm9
	vpermi2d %ymm7,%ymm6,%ymm8
	vpermi2d %ymm7,%ymm6,%ymm9
	vmovdqa %ymm8,%ymm6
	vmovdqa %ymm9,%ymm7
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm8,%xmm8
	vpaddd %xmm8,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	vpshufd $0x93,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x39,%xmm2,%xmm2
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x10,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0xc,%xmm1,%xmm1
	vextracti128 $0x1,%ymm9,%xmm9
	vpaddd %xmm9,%xmm0,%xmm0
	vpaddd %xmm1,%xmm0,%xmm0
	vpxor %xmm0,%xmm3,%xmm3
	vprord $0x8,%xmm3,%xmm3
	vpaddd %xmm3,%xmm2,%xmm2
	vpxor %xmm2,%xmm1,%xmm1
	vprord $0x7,%xmm1,%xmm1
	vpshufd $0x39,%xmm0,%xmm0
	vpshufd $0x4e,%xmm3,%xmm3
	vpshufd $0x93,%xmm2,%xmm2
	decb %cl
	jne .Lblake2s_compress_avx512_roundloop
	vpxor %xmm10,%xmm0,%xmm0
	vpxor %xmm11,%xmm1,%xmm1
	vpxor %xmm2,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	decq %rdx
	jne .Lblake2s_compress_avx512_mainloop
	vmovdqu %xmm0,(%rdi)
	vmovdqu %xmm1,0x10(%rdi)
	vmovdqu %xmm4,0x20(%rdi)
	vzeroupper
	retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
arch/x86/crypto/blake2s-glue.c (new file, 232 lines)
@@ -0,0 +1,232 @@
// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <crypto/internal/blake2s.h>
#include <crypto/internal/hash.h>

#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>

#include <asm/cpufeature.h>
#include <asm/fpu/api.h>
#include <asm/processor.h>
#include <asm/simd.h>

asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
				       const u8 *block, const size_t nblocks,
				       const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
					const u8 *block, const size_t nblocks,
					const u32 inc);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);

void blake2s_compress_arch(struct blake2s_state *state,
			   const u8 *block, size_t nblocks,
			   const u32 inc)
{
	/* SIMD disables preemption, so relax after processing each page. */
	BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);

	if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
		blake2s_compress_generic(state, block, nblocks, inc);
		return;
	}

	do {
		const size_t blocks = min_t(size_t, nblocks,
					    SZ_4K / BLAKE2S_BLOCK_SIZE);

		kernel_fpu_begin();
		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    static_branch_likely(&blake2s_use_avx512))
			blake2s_compress_avx512(state, block, blocks, inc);
		else
			blake2s_compress_ssse3(state, block, blocks, inc);
		kernel_fpu_end();

		nblocks -= blocks;
		block += blocks * BLAKE2S_BLOCK_SIZE;
	} while (nblocks);
}
EXPORT_SYMBOL(blake2s_compress_arch);

static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
				 unsigned int keylen)
{
	struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);

	if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
		crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
		return -EINVAL;
	}

	memcpy(tctx->key, key, keylen);
	tctx->keylen = keylen;

	return 0;
}

static int crypto_blake2s_init(struct shash_desc *desc)
{
	struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
	struct blake2s_state *state = shash_desc_ctx(desc);
	const int outlen = crypto_shash_digestsize(desc->tfm);

	if (tctx->keylen)
		blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
	else
		blake2s_init(state, outlen);

	return 0;
}

static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
				 unsigned int inlen)
{
	struct blake2s_state *state = shash_desc_ctx(desc);
	const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;

	if (unlikely(!inlen))
		return 0;
	if (inlen > fill) {
		memcpy(state->buf + state->buflen, in, fill);
		blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
		state->buflen = 0;
		in += fill;
		inlen -= fill;
	}
	if (inlen > BLAKE2S_BLOCK_SIZE) {
		const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
		/* Hash one less (full) block than strictly possible */
		blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
		in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
		inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
	}
	memcpy(state->buf + state->buflen, in, inlen);
	state->buflen += inlen;

	return 0;
}

static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
{
	struct blake2s_state *state = shash_desc_ctx(desc);

	blake2s_set_lastblock(state);
	memset(state->buf + state->buflen, 0,
	       BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
	blake2s_compress_arch(state, state->buf, 1, state->buflen);
	cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
	memcpy(out, state->h, state->outlen);
	memzero_explicit(state, sizeof(*state));

	return 0;
}

static struct shash_alg blake2s_algs[] = {{
	.base.cra_name = "blake2s-128",
	.base.cra_driver_name = "blake2s-128-x86",
	.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
	.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
	.base.cra_priority = 200,
	.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
	.base.cra_module = THIS_MODULE,

	.digestsize = BLAKE2S_128_HASH_SIZE,
	.setkey = crypto_blake2s_setkey,
	.init = crypto_blake2s_init,
	.update = crypto_blake2s_update,
	.final = crypto_blake2s_final,
	.descsize = sizeof(struct blake2s_state),
}, {
	.base.cra_name = "blake2s-160",
	.base.cra_driver_name = "blake2s-160-x86",
	.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
	.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
	.base.cra_priority = 200,
	.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
	.base.cra_module = THIS_MODULE,

	.digestsize = BLAKE2S_160_HASH_SIZE,
	.setkey = crypto_blake2s_setkey,
	.init = crypto_blake2s_init,
	.update = crypto_blake2s_update,
	.final = crypto_blake2s_final,
	.descsize = sizeof(struct blake2s_state),
}, {
	.base.cra_name = "blake2s-224",
	.base.cra_driver_name = "blake2s-224-x86",
	.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
	.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
	.base.cra_priority = 200,
	.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
	.base.cra_module = THIS_MODULE,

	.digestsize = BLAKE2S_224_HASH_SIZE,
	.setkey = crypto_blake2s_setkey,
	.init = crypto_blake2s_init,
	.update = crypto_blake2s_update,
	.final = crypto_blake2s_final,
	.descsize = sizeof(struct blake2s_state),
}, {
	.base.cra_name = "blake2s-256",
	.base.cra_driver_name = "blake2s-256-x86",
	.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
	.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
	.base.cra_priority = 200,
	.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
	.base.cra_module = THIS_MODULE,

	.digestsize = BLAKE2S_256_HASH_SIZE,
	.setkey = crypto_blake2s_setkey,
	.init = crypto_blake2s_init,
	.update = crypto_blake2s_update,
	.final = crypto_blake2s_final,
	.descsize = sizeof(struct blake2s_state),
}};

static int __init blake2s_mod_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&blake2s_use_ssse3);

	if (IS_ENABLED(CONFIG_AS_AVX512) &&
	    boot_cpu_has(X86_FEATURE_AVX) &&
	    boot_cpu_has(X86_FEATURE_AVX2) &&
	    boot_cpu_has(X86_FEATURE_AVX512F) &&
	    boot_cpu_has(X86_FEATURE_AVX512VL) &&
	    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
			      XFEATURE_MASK_AVX512, NULL))
		static_branch_enable(&blake2s_use_avx512);

	return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
		crypto_register_shashes(blake2s_algs,
					ARRAY_SIZE(blake2s_algs)) : 0;
}

static void __exit blake2s_mod_exit(void)
{
	if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
		crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
}

module_init(blake2s_mod_init);
module_exit(blake2s_mod_exit);

MODULE_ALIAS_CRYPTO("blake2s-128");
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
MODULE_ALIAS_CRYPTO("blake2s-160");
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
MODULE_ALIAS_CRYPTO("blake2s-224");
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
MODULE_ALIAS_CRYPTO("blake2s-256");
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
MODULE_LICENSE("GPL v2");
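With the shashes above registered, any kernel user can reach the accelerated BLAKE2s through the generic hash API. A hedged usage sketch against the 4.19-era shash interface (error handling trimmed; not code from this merge):

#include <crypto/hash.h>

static int blake2s256_digest(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("blake2s-256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;
		ret = crypto_shash_digest(desc, data, len, out);
	}

	crypto_free_shash(tfm);
	return ret;
}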
arch/x86/crypto/chacha-avx2-x86_64.S (new file, vendored, 1025 lines)
[diff not shown: suppressed by the viewer as too large]
Normal file
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
|
@ -0,0 +1,836 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||
/*
|
||||
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
|
||||
*
|
||||
* Copyright (C) 2018 Martin Willi
|
||||
*/
|
||||
|
||||
#include <linux/linkage.h>
|
||||
|
||||
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR2BL: .octa 0x00000000000000000000000000000000
|
||||
.octa 0x00000000000000000000000000000001
|
||||
|
||||
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR4BL: .octa 0x00000000000000000000000000000002
|
||||
.octa 0x00000000000000000000000000000003
|
||||
|
||||
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
|
||||
.align 32
|
||||
CTR8BL: .octa 0x00000003000000020000000100000000
|
||||
.octa 0x00000007000000060000000500000004
|
||||
|
||||
.text
|
||||
|
||||
ENTRY(chacha_2block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 2 data blocks output, o
|
||||
# %rdx: up to 2 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts two ChaCha blocks by loading the state
|
||||
# matrix twice across four AVX registers. It performs matrix operations
|
||||
# on four words in each matrix in parallel, but requires shuffling to
|
||||
# rearrange the words after each round.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-2] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm8
|
||||
vmovdqa %ymm1,%ymm9
|
||||
vmovdqa %ymm2,%ymm10
|
||||
vmovdqa %ymm3,%ymm11
|
||||
|
||||
.Ldoubleround:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround
|
||||
|
||||
# o0 = i0 ^ (x0 + s0)
|
||||
vpaddd %ymm8,%ymm0,%ymm7
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x00(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x00(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1)
|
||||
vpaddd %ymm9,%ymm1,%ymm7
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x10(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x10(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2)
|
||||
vpaddd %ymm10,%ymm2,%ymm7
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x20(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x20(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3)
|
||||
vpaddd %ymm11,%ymm3,%ymm7
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x30(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x30(%rsi)
|
||||
vextracti128 $1,%ymm7,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm7
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x40(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm7
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x50(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm7
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x60(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm7
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart2
|
||||
vpxord 0x70(%rdx),%xmm7,%xmm6
|
||||
vmovdqu %xmm6,0x70(%rsi)
|
||||
|
||||
.Ldone2:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart2:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0xf,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0xf,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||
vpxord %xmm7,%xmm1,%xmm1
|
||||
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone2
|
||||
|
||||
ENDPROC(chacha_2block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_4block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 4 data blocks output, o
|
||||
# %rdx: up to 4 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts four ChaCha blocks by loading the state
|
||||
# matrix four times across eight AVX registers. It performs matrix
|
||||
# operations on four words in two matrices in parallel, sequentially
|
||||
# to the operations on the four words of the other two matrices. The
|
||||
# required word shuffling has a rather high latency, we can do the
|
||||
# arithmetic on two matrix-pairs without much slowdown.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..3[0-4] = s0..3
|
||||
vbroadcasti128 0x00(%rdi),%ymm0
|
||||
vbroadcasti128 0x10(%rdi),%ymm1
|
||||
vbroadcasti128 0x20(%rdi),%ymm2
|
||||
vbroadcasti128 0x30(%rdi),%ymm3
|
||||
|
||||
vmovdqa %ymm0,%ymm4
|
||||
vmovdqa %ymm1,%ymm5
|
||||
vmovdqa %ymm2,%ymm6
|
||||
vmovdqa %ymm3,%ymm7
|
||||
|
||||
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||
vpaddd CTR4BL(%rip),%ymm7,%ymm7
|
||||
|
||||
vmovdqa %ymm0,%ymm11
|
||||
vmovdqa %ymm1,%ymm12
|
||||
vmovdqa %ymm2,%ymm13
|
||||
vmovdqa %ymm3,%ymm14
|
||||
vmovdqa %ymm7,%ymm15
|
||||
|
||||
.Ldoubleround4:
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm1,%ymm1
|
||||
vpshufd $0x39,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm3,%ymm3
|
||||
vpshufd $0x93,%ymm7,%ymm7
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $16,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $16,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $12,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
|
||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||
vpaddd %ymm1,%ymm0,%ymm0
|
||||
vpxord %ymm0,%ymm3,%ymm3
|
||||
vprold $8,%ymm3,%ymm3
|
||||
|
||||
vpaddd %ymm5,%ymm4,%ymm4
|
||||
vpxord %ymm4,%ymm7,%ymm7
|
||||
vprold $8,%ymm7,%ymm7
|
||||
|
||||
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||
vpaddd %ymm3,%ymm2,%ymm2
|
||||
vpxord %ymm2,%ymm1,%ymm1
|
||||
vprold $7,%ymm1,%ymm1
|
||||
|
||||
vpaddd %ymm7,%ymm6,%ymm6
|
||||
vpxord %ymm6,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
|
||||
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||
vpshufd $0x93,%ymm1,%ymm1
|
||||
vpshufd $0x93,%ymm5,%ymm5
|
||||
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||
vpshufd $0x4e,%ymm2,%ymm2
|
||||
vpshufd $0x4e,%ymm6,%ymm6
|
||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||
vpshufd $0x39,%ymm3,%ymm3
|
||||
vpshufd $0x39,%ymm7,%ymm7
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround4
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), first block
|
||||
vpaddd %ymm11,%ymm0,%ymm10
|
||||
cmp $0x10,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x00(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x00(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm0
|
||||
# o1 = i1 ^ (x1 + s1), first block
|
||||
vpaddd %ymm12,%ymm1,%ymm10
|
||||
cmp $0x20,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x10(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x10(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm1
|
||||
# o2 = i2 ^ (x2 + s2), first block
|
||||
vpaddd %ymm13,%ymm2,%ymm10
|
||||
cmp $0x30,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x20(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x20(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm2
|
||||
# o3 = i3 ^ (x3 + s3), first block
|
||||
vpaddd %ymm14,%ymm3,%ymm10
|
||||
cmp $0x40,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x30(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x30(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm3
|
||||
|
||||
# xor and write second block
|
||||
vmovdqa %xmm0,%xmm10
|
||||
cmp $0x50,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x40(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x40(%rsi)
|
||||
|
||||
vmovdqa %xmm1,%xmm10
|
||||
cmp $0x60,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x50(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x50(%rsi)
|
||||
|
||||
vmovdqa %xmm2,%xmm10
|
||||
cmp $0x70,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x60(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x60(%rsi)
|
||||
|
||||
vmovdqa %xmm3,%xmm10
|
||||
cmp $0x80,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x70(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x70(%rsi)
|
||||
|
||||
# o0 = i0 ^ (x0 + s0), third block
|
||||
vpaddd %ymm11,%ymm4,%ymm10
|
||||
cmp $0x90,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x80(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x80(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm4
|
||||
# o1 = i1 ^ (x1 + s1), third block
|
||||
vpaddd %ymm12,%ymm5,%ymm10
|
||||
cmp $0xa0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0x90(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0x90(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm5
|
||||
# o2 = i2 ^ (x2 + s2), third block
|
||||
vpaddd %ymm13,%ymm6,%ymm10
|
||||
cmp $0xb0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xa0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xa0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm6
|
||||
# o3 = i3 ^ (x3 + s3), third block
|
||||
vpaddd %ymm15,%ymm7,%ymm10
|
||||
cmp $0xc0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xb0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xb0(%rsi)
|
||||
vextracti128 $1,%ymm10,%xmm7
|
||||
|
||||
# xor and write fourth block
|
||||
vmovdqa %xmm4,%xmm10
|
||||
cmp $0xd0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xc0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xc0(%rsi)
|
||||
|
||||
vmovdqa %xmm5,%xmm10
|
||||
cmp $0xe0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xd0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xd0(%rsi)
|
||||
|
||||
vmovdqa %xmm6,%xmm10
|
||||
cmp $0xf0,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xe0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xe0(%rsi)
|
||||
|
||||
vmovdqa %xmm7,%xmm10
|
||||
cmp $0x100,%rcx
|
||||
jl .Lxorpart4
|
||||
vpxord 0xf0(%rdx),%xmm10,%xmm9
|
||||
vmovdqu %xmm9,0xf0(%rsi)
|
||||
|
||||
.Ldone4:
|
||||
vzeroupper
|
||||
ret
|
||||
|
||||
.Lxorpart4:
|
||||
# xor remaining bytes from partial register into output
|
||||
mov %rcx,%rax
|
||||
and $0xf,%rcx
|
||||
jz .Ldone8
|
||||
mov %rax,%r9
|
||||
and $~0xf,%r9
|
||||
|
||||
mov $1,%rax
|
||||
shld %cl,%rax,%rax
|
||||
sub $1,%rax
|
||||
kmovq %rax,%k1
|
||||
|
||||
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||
vpxord %xmm10,%xmm1,%xmm1
|
||||
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||
|
||||
jmp .Ldone4
|
||||
|
||||
ENDPROC(chacha_4block_xor_avx512vl)
|
||||
|
||||
ENTRY(chacha_8block_xor_avx512vl)
|
||||
# %rdi: Input state matrix, s
|
||||
# %rsi: up to 8 data blocks output, o
|
||||
# %rdx: up to 8 data blocks input, i
|
||||
# %rcx: input/output length in bytes
|
||||
# %r8d: nrounds
|
||||
|
||||
# This function encrypts eight consecutive ChaCha blocks by loading
|
||||
# the state matrix in AVX registers eight times. Compared to AVX2, this
|
||||
# mostly benefits from the new rotate instructions in VL and the
|
||||
# additional registers.
|
||||
|
||||
vzeroupper
|
||||
|
||||
# x0..15[0-7] = s[0..15]
|
||||
vpbroadcastd 0x00(%rdi),%ymm0
|
||||
vpbroadcastd 0x04(%rdi),%ymm1
|
||||
vpbroadcastd 0x08(%rdi),%ymm2
|
||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||
vpbroadcastd 0x10(%rdi),%ymm4
|
||||
vpbroadcastd 0x14(%rdi),%ymm5
|
||||
vpbroadcastd 0x18(%rdi),%ymm6
|
||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||
vpbroadcastd 0x20(%rdi),%ymm8
|
||||
vpbroadcastd 0x24(%rdi),%ymm9
|
||||
vpbroadcastd 0x28(%rdi),%ymm10
|
||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||
vpbroadcastd 0x30(%rdi),%ymm12
|
||||
vpbroadcastd 0x34(%rdi),%ymm13
|
||||
vpbroadcastd 0x38(%rdi),%ymm14
|
||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||
|
||||
# x12 += counter values 0-3
|
||||
vpaddd CTR8BL(%rip),%ymm12,%ymm12
|
||||
|
||||
vmovdqa64 %ymm0,%ymm16
|
||||
vmovdqa64 %ymm1,%ymm17
|
||||
vmovdqa64 %ymm2,%ymm18
|
||||
vmovdqa64 %ymm3,%ymm19
|
||||
vmovdqa64 %ymm4,%ymm20
|
||||
vmovdqa64 %ymm5,%ymm21
|
||||
vmovdqa64 %ymm6,%ymm22
|
||||
vmovdqa64 %ymm7,%ymm23
|
||||
vmovdqa64 %ymm8,%ymm24
|
||||
vmovdqa64 %ymm9,%ymm25
|
||||
vmovdqa64 %ymm10,%ymm26
|
||||
vmovdqa64 %ymm11,%ymm27
|
||||
vmovdqa64 %ymm12,%ymm28
|
||||
vmovdqa64 %ymm13,%ymm29
|
||||
vmovdqa64 %ymm14,%ymm30
|
||||
vmovdqa64 %ymm15,%ymm31
|
||||
|
||||
.Ldoubleround8:
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
|
||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm4,%ymm0
|
||||
vpxord %ymm0,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm5,%ymm1
|
||||
vpxord %ymm1,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm6,%ymm2
|
||||
vpxord %ymm2,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm7,%ymm3
|
||||
vpxord %ymm3,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
|
||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||
vpaddd %ymm12,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||
vpaddd %ymm13,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||
vpaddd %ymm14,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||
vpaddd %ymm15,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $16,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $16,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $16,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $16,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $12,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $12,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $12,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $12,%ymm4,%ymm4
|
||||
|
||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||
vpaddd %ymm0,%ymm5,%ymm0
|
||||
vpxord %ymm0,%ymm15,%ymm15
|
||||
vprold $8,%ymm15,%ymm15
|
||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||
vpaddd %ymm1,%ymm6,%ymm1
|
||||
vpxord %ymm1,%ymm12,%ymm12
|
||||
vprold $8,%ymm12,%ymm12
|
||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||
vpaddd %ymm2,%ymm7,%ymm2
|
||||
vpxord %ymm2,%ymm13,%ymm13
|
||||
vprold $8,%ymm13,%ymm13
|
||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||
vpaddd %ymm3,%ymm4,%ymm3
|
||||
vpxord %ymm3,%ymm14,%ymm14
|
||||
vprold $8,%ymm14,%ymm14
|
||||
|
||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||
vpaddd %ymm15,%ymm10,%ymm10
|
||||
vpxord %ymm10,%ymm5,%ymm5
|
||||
vprold $7,%ymm5,%ymm5
|
||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||
vpaddd %ymm12,%ymm11,%ymm11
|
||||
vpxord %ymm11,%ymm6,%ymm6
|
||||
vprold $7,%ymm6,%ymm6
|
||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||
vpaddd %ymm13,%ymm8,%ymm8
|
||||
vpxord %ymm8,%ymm7,%ymm7
|
||||
vprold $7,%ymm7,%ymm7
|
||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||
vpaddd %ymm14,%ymm9,%ymm9
|
||||
vpxord %ymm9,%ymm4,%ymm4
|
||||
vprold $7,%ymm4,%ymm4
|
||||
|
||||
sub $2,%r8d
|
||||
jnz .Ldoubleround8
|
||||
|
||||
# x0..15[0-3] += s[0..15]
vpaddd %ymm16,%ymm0,%ymm0
vpaddd %ymm17,%ymm1,%ymm1
vpaddd %ymm18,%ymm2,%ymm2
vpaddd %ymm19,%ymm3,%ymm3
vpaddd %ymm20,%ymm4,%ymm4
vpaddd %ymm21,%ymm5,%ymm5
vpaddd %ymm22,%ymm6,%ymm6
vpaddd %ymm23,%ymm7,%ymm7
vpaddd %ymm24,%ymm8,%ymm8
vpaddd %ymm25,%ymm9,%ymm9
vpaddd %ymm26,%ymm10,%ymm10
vpaddd %ymm27,%ymm11,%ymm11
vpaddd %ymm28,%ymm12,%ymm12
vpaddd %ymm29,%ymm13,%ymm13
vpaddd %ymm30,%ymm14,%ymm14
vpaddd %ymm31,%ymm15,%ymm15

# interleave 32-bit words in state n, n+1
vpunpckldq %ymm1,%ymm0,%ymm16
vpunpckhdq %ymm1,%ymm0,%ymm17
vpunpckldq %ymm3,%ymm2,%ymm18
vpunpckhdq %ymm3,%ymm2,%ymm19
vpunpckldq %ymm5,%ymm4,%ymm20
vpunpckhdq %ymm5,%ymm4,%ymm21
vpunpckldq %ymm7,%ymm6,%ymm22
vpunpckhdq %ymm7,%ymm6,%ymm23
vpunpckldq %ymm9,%ymm8,%ymm24
vpunpckhdq %ymm9,%ymm8,%ymm25
vpunpckldq %ymm11,%ymm10,%ymm26
vpunpckhdq %ymm11,%ymm10,%ymm27
vpunpckldq %ymm13,%ymm12,%ymm28
vpunpckhdq %ymm13,%ymm12,%ymm29
vpunpckldq %ymm15,%ymm14,%ymm30
vpunpckhdq %ymm15,%ymm14,%ymm31

# interleave 64-bit words in state n, n+2
vpunpcklqdq %ymm18,%ymm16,%ymm0
vpunpcklqdq %ymm19,%ymm17,%ymm1
vpunpckhqdq %ymm18,%ymm16,%ymm2
vpunpckhqdq %ymm19,%ymm17,%ymm3
vpunpcklqdq %ymm22,%ymm20,%ymm4
vpunpcklqdq %ymm23,%ymm21,%ymm5
vpunpckhqdq %ymm22,%ymm20,%ymm6
vpunpckhqdq %ymm23,%ymm21,%ymm7
vpunpcklqdq %ymm26,%ymm24,%ymm8
vpunpcklqdq %ymm27,%ymm25,%ymm9
vpunpckhqdq %ymm26,%ymm24,%ymm10
vpunpckhqdq %ymm27,%ymm25,%ymm11
vpunpcklqdq %ymm30,%ymm28,%ymm12
vpunpcklqdq %ymm31,%ymm29,%ymm13
vpunpckhqdq %ymm30,%ymm28,%ymm14
vpunpckhqdq %ymm31,%ymm29,%ymm15

# interleave 128-bit words in state n, n+4
# xor/write first four blocks
vmovdqa64 %ymm0,%ymm16
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
cmp $0x0020,%rcx
jl .Lxorpart8
vpxord 0x0000(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0000(%rsi)
vmovdqa64 %ymm16,%ymm0
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4

vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
cmp $0x0040,%rcx
jl .Lxorpart8
vpxord 0x0020(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0020(%rsi)
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
cmp $0x0060,%rcx
jl .Lxorpart8
vpxord 0x0040(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0040(%rsi)
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6

vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
cmp $0x0080,%rcx
jl .Lxorpart8
vpxord 0x0060(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0060(%rsi)
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
cmp $0x00a0,%rcx
jl .Lxorpart8
vpxord 0x0080(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0080(%rsi)
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
cmp $0x00c0,%rcx
jl .Lxorpart8
vpxord 0x00a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00a0(%rsi)
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
cmp $0x00e0,%rcx
jl .Lxorpart8
vpxord 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00c0(%rsi)
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7

vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
cmp $0x0100,%rcx
jl .Lxorpart8
vpxord 0x00e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x00e0(%rsi)
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

# xor remaining blocks, write to output
vmovdqa64 %ymm4,%ymm0
cmp $0x0120,%rcx
jl .Lxorpart8
vpxord 0x0100(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0100(%rsi)

vmovdqa64 %ymm12,%ymm0
cmp $0x0140,%rcx
jl .Lxorpart8
vpxord 0x0120(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0120(%rsi)

vmovdqa64 %ymm6,%ymm0
cmp $0x0160,%rcx
jl .Lxorpart8
vpxord 0x0140(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0140(%rsi)

vmovdqa64 %ymm14,%ymm0
cmp $0x0180,%rcx
jl .Lxorpart8
vpxord 0x0160(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0160(%rsi)

vmovdqa64 %ymm5,%ymm0
cmp $0x01a0,%rcx
jl .Lxorpart8
vpxord 0x0180(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x0180(%rsi)

vmovdqa64 %ymm13,%ymm0
cmp $0x01c0,%rcx
jl .Lxorpart8
vpxord 0x01a0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01a0(%rsi)

vmovdqa64 %ymm7,%ymm0
cmp $0x01e0,%rcx
jl .Lxorpart8
vpxord 0x01c0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01c0(%rsi)

vmovdqa64 %ymm15,%ymm0
cmp $0x0200,%rcx
jl .Lxorpart8
vpxord 0x01e0(%rdx),%ymm0,%ymm0
vmovdqu64 %ymm0,0x01e0(%rsi)

.Ldone8:
vzeroupper
ret

.Lxorpart8:
# xor remaining bytes from partial register into output
mov %rcx,%rax
and $0x1f,%rcx
jz .Ldone8
mov %rax,%r9
and $~0x1f,%r9

mov $1,%rax
shld %cl,%rax,%rax
sub $1,%rax
kmovq %rax,%k1

vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
vpxord %ymm0,%ymm1,%ymm1
vmovdqu8 %ymm1,(%rsi,%r9){%k1}

jmp .Ldone8

ENDPROC(chacha_8block_xor_avx512vl)

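Throughout this function, vector register n holds word n of all eight blocks (one block per 32-bit lane), which is what lets every round operate on eight blocks with no word shuffling. The three interleave passes (32-, 64-, then 128-bit) are therefore an 8x16 transpose back to block-major order, so whole blocks can be XORed against the input. A scalar model of what the transpose computes (array names are illustrative, not from the source):

#include <stdint.h>

/* in[w][b]  = word w of block b (lane-major, as kept in the ymm lanes)
 * out[b][w] = word w of block b (block-major, as written to the output) */
static void transpose_model(const uint32_t in[16][8], uint32_t out[8][16])
{
        for (int w = 0; w < 16; w++)
                for (int b = 0; b < 8; b++)
                        out[b][w] = in[w][b];
}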
@@ -1,5 +1,5 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
@@ -10,6 +10,7 @@
*/

#include <linux/linkage.h>
#include <asm/frame.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000

.text

ENTRY(chacha20_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: 1 data block output, o
# %rdx: 1 data block input, i

# This function encrypts one ChaCha20 block by loading the state matrix
# in four SSE registers. It performs matrix operation on four words in
# parallel, but requireds shuffling to rearrange the words after each
# round. 8/16-bit word rotation is done with the slightly better
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
# traditional shift+OR.

# x0..3 = s0..3
movdqa 0x00(%rdi),%xmm0
movdqa 0x10(%rdi),%xmm1
movdqa 0x20(%rdi),%xmm2
movdqa 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
* function performs matrix operations on four words in parallel, but requires
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
* rotation uses traditional shift+OR.
*
* The round count is given in %r8d.
*
* Clobbers: %r8d, %xmm4-%xmm7
*/
chacha_permute:

movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5

mov $10,%ecx

.Ldoubleround:

# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3

dec %ecx
sub $2,%r8d
jnz .Ldoubleround

ret
ENDPROC(chacha_permute)

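chacha_permute runs the standard ChaCha double round: a column round followed by a diagonal round, each made of four quarter rounds with rotations of 16, 12, 8 and 7 bits, exactly as the per-line comments describe. For reference, a plain C version of the same double round (a generic sketch, not the kernel's generic implementation verbatim):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
        return (v << n) | (v >> (32 - n));
}

#define QR(a, b, c, d) do {                \
        a += b; d = rotl32(d ^ a, 16);     \
        c += d; b = rotl32(b ^ c, 12);     \
        a += b; d = rotl32(d ^ a, 8);      \
        c += d; b = rotl32(b ^ c, 7);      \
} while (0)

static void chacha_doubleround(uint32_t x[16])
{
        /* column round */
        QR(x[0], x[4], x[8],  x[12]);
        QR(x[1], x[5], x[9],  x[13]);
        QR(x[2], x[6], x[10], x[14]);
        QR(x[3], x[7], x[11], x[15]);
        /* diagonal round */
        QR(x[0], x[5], x[10], x[15]);
        QR(x[1], x[6], x[11], x[12]);
        QR(x[2], x[7], x[8],  x[13]);
        QR(x[3], x[4], x[9],  x[14]);
}

Twenty-round ChaCha20 is ten applications of this double round, which is why the new code counts down by two (sub $2,%r8d) from the nrounds argument instead of using a fixed count of ten.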
ENTRY(chacha_block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 1 data block output, o
# %rdx: up to 1 data block input, i
# %rcx: input/output length in bytes
# %r8d: nrounds
FRAME_BEGIN

# x0..3 = s0..3
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3
movdqa %xmm0,%xmm8
movdqa %xmm1,%xmm9
movdqa %xmm2,%xmm10
movdqa %xmm3,%xmm11

mov %rcx,%rax
call chacha_permute

# o0 = i0 ^ (x0 + s0)
movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
cmp $0x10,%rax
jl .Lxorpart
movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
pxor %xmm5,%xmm1
movdqu %xmm1,0x10(%rsi)
movdqa %xmm1,%xmm0
cmp $0x20,%rax
jl .Lxorpart
movdqu 0x10(%rdx),%xmm0
pxor %xmm1,%xmm0
movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
pxor %xmm6,%xmm2
movdqu %xmm2,0x20(%rsi)
movdqa %xmm2,%xmm0
cmp $0x30,%rax
jl .Lxorpart
movdqu 0x20(%rdx),%xmm0
pxor %xmm2,%xmm0
movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
pxor %xmm7,%xmm3
movdqu %xmm3,0x30(%rsi)
movdqa %xmm3,%xmm0
cmp $0x40,%rax
jl .Lxorpart
movdqu 0x30(%rdx),%xmm0
pxor %xmm3,%xmm0
movdqu %xmm0,0x30(%rsi)

.Ldone:
FRAME_END
ret
ENDPROC(chacha20_block_xor_ssse3)

ENTRY(chacha20_4block_xor_ssse3)
.Lxorpart:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone
and $~0x0f,%rax

mov %rsi,%r11

lea 8(%rsp),%r10
sub $0x10,%rsp
and $~31,%rsp

lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb

pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)

mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb

lea -8(%r10),%rsp
jmp .Ldone

ENDPROC(chacha_block_xor_ssse3)

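With no AVX-512 byte masks available, the SSSE3 .Lxorpart path bounces the tail through a stack buffer: rep movsb copies the remaining len & 0xf input bytes into an aligned scratch slot, the keystream in %xmm0 is XORed in with one full 16-byte pxor, and a second rep movsb copies just the valid bytes back out. A C model of that sequence (buffer handling simplified; names are illustrative, not from the source):

#include <stdint.h>
#include <string.h>

static void xor_tail_ssse3_model(uint8_t *dst, const uint8_t *src,
                                 const uint8_t keystream[16], unsigned int len)
{
        unsigned int rem  = len & 15;        /* and $0x0f,%r9   */
        unsigned int base = len & ~15u;      /* and $~0x0f,%rax */
        uint8_t buf[16] = { 0 };

        if (!rem)
                return;                      /* jz .Ldone           */
        memcpy(buf, src + base, rem);        /* rep movsb (copy in)  */
        for (int i = 0; i < 16; i++)
                buf[i] ^= keystream[i];      /* pxor 0x00(%rsp),%xmm0 */
        memcpy(dst + base, buf, rem);        /* rep movsb (copy out) */
}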
ENTRY(hchacha_block_ssse3)
# %rdi: Input state matrix, s
# %rsi: 4 data blocks output, o
# %rdx: 4 data blocks input, i
# %rsi: output (8 32-bit words)
# %edx: nrounds
FRAME_BEGIN

# This function encrypts four consecutive ChaCha20 blocks by loading the
movdqu 0x00(%rdi),%xmm0
movdqu 0x10(%rdi),%xmm1
movdqu 0x20(%rdi),%xmm2
movdqu 0x30(%rdi),%xmm3

mov %edx,%r8d
call chacha_permute

movdqu %xmm0,0x00(%rsi)
movdqu %xmm3,0x10(%rsi)

FRAME_END
ret
ENDPROC(hchacha_block_ssse3)

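hchacha_block_ssse3 runs the same permutation but, per the HChaCha construction, never adds the input state back in; it emits only words 0-3 and 12-15 of the permuted state (%xmm0 and %xmm3) as the derived 256-bit subkey. A scalar model of the output selection, building on the chacha_doubleround sketch above (names are illustrative):

#include <stdint.h>
#include <string.h>

/* HChaCha: permute, then keep the first and last four state words. */
static void hchacha_model(const uint32_t state[16], uint32_t out[8],
                          int nrounds)
{
        uint32_t x[16];

        memcpy(x, state, sizeof(x));
        for (int i = 0; i < nrounds; i += 2)
                chacha_doubleround(x);                  /* sub $2,%r8d; jnz */
        memcpy(&out[0], &x[0],  4 * sizeof(uint32_t));  /* movdqu %xmm0 */
        memcpy(&out[4], &x[12], 4 * sizeof(uint32_t));  /* movdqu %xmm3 */
}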
ENTRY(chacha_4block_xor_ssse3)
# %rdi: Input state matrix, s
# %rsi: up to 4 data blocks output, o
# %rdx: up to 4 data blocks input, i
# %rcx: input/output length in bytes
# %r8d: nrounds

# This function encrypts four consecutive ChaCha blocks by loading
# the state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each

@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
mov %rcx,%rax

# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1

@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
# x12 += counter values 0-3
paddd %xmm1,%xmm12

mov $10,%ecx

.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0

@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
psrld $25,%xmm4
por %xmm0,%xmm4

dec %ecx
sub $2,%r8d
jnz .Ldoubleround4

# x0[0-3] += s0[0]

@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)

# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
cmp $0x10,%rax
jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
movdqa 0x10(%rsp),%xmm0
movdqu 0x80(%rdx),%xmm1

movdqu %xmm4,%xmm0
cmp $0x20,%rax
jl .Lxorpart4
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)
movdqu %xmm0,0x10(%rsi)

movdqu %xmm8,%xmm0
cmp $0x30,%rax
jl .Lxorpart4
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x20(%rsi)

movdqu %xmm12,%xmm0
cmp $0x40,%rax
jl .Lxorpart4
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x30(%rsi)

movdqa 0x20(%rsp),%xmm0
cmp $0x50,%rax
jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)

movdqu %xmm6,%xmm0
cmp $0x60,%rax
jl .Lxorpart4
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x50(%rsi)

movdqu %xmm10,%xmm0
cmp $0x70,%rax
jl .Lxorpart4
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x60(%rsi)

movdqu %xmm14,%xmm0
cmp $0x80,%rax
jl .Lxorpart4
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x70(%rsi)

movdqa 0x10(%rsp),%xmm0
cmp $0x90,%rax
jl .Lxorpart4
movdqu 0x80(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x80(%rsi)

movdqu %xmm5,%xmm0
cmp $0xa0,%rax
jl .Lxorpart4
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x90(%rsi)

movdqu %xmm9,%xmm0
cmp $0xb0,%rax
jl .Lxorpart4
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xa0(%rsi)

movdqu %xmm13,%xmm0
cmp $0xc0,%rax
jl .Lxorpart4
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xb0(%rsi)

movdqa 0x30(%rsp),%xmm0
cmp $0xd0,%rax
jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm4
movdqu %xmm4,0x10(%rsi)
movdqu 0x90(%rdx),%xmm1
pxor %xmm1,%xmm5
movdqu %xmm5,0x90(%rsi)
movdqu 0x50(%rdx),%xmm1
pxor %xmm1,%xmm6
movdqu %xmm6,0x50(%rsi)
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm7
movdqu %xmm7,0xd0(%rsi)
movdqu 0x20(%rdx),%xmm1
pxor %xmm1,%xmm8
movdqu %xmm8,0x20(%rsi)
movdqu 0xa0(%rdx),%xmm1
pxor %xmm1,%xmm9
movdqu %xmm9,0xa0(%rsi)
movdqu 0x60(%rdx),%xmm1
pxor %xmm1,%xmm10
movdqu %xmm10,0x60(%rsi)
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm11
movdqu %xmm11,0xe0(%rsi)
movdqu 0x30(%rdx),%xmm1
pxor %xmm1,%xmm12
movdqu %xmm12,0x30(%rsi)
movdqu 0xb0(%rdx),%xmm1
pxor %xmm1,%xmm13
movdqu %xmm13,0xb0(%rsi)
movdqu 0x70(%rdx),%xmm1
pxor %xmm1,%xmm14
movdqu %xmm14,0x70(%rsi)
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm15
movdqu %xmm15,0xf0(%rsi)

movdqu %xmm7,%xmm0
cmp $0xe0,%rax
jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xd0(%rsi)

movdqu %xmm11,%xmm0
cmp $0xf0,%rax
jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xe0(%rsi)

movdqu %xmm15,%xmm0
cmp $0x100,%rax
jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xf0(%rsi)

.Ldone4:
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_4block_xor_ssse3)

.Lxorpart4:
# xor remaining bytes from partial register into output
mov %rax,%r9
and $0x0f,%r9
jz .Ldone4
and $~0x0f,%rax

mov %rsi,%r11

lea (%rdx,%rax),%rsi
mov %rsp,%rdi
mov %r9,%rcx
rep movsb

pxor 0x00(%rsp),%xmm0
movdqa %xmm0,0x00(%rsp)

mov %rsp,%rsi
lea (%r11,%rax),%rdi
mov %r9,%rcx
rep movsb

jmp .Ldone4

ENDPROC(chacha_4block_xor_ssse3)

@@ -1,448 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
# %rdi: Input state matrix, s
# %rsi: 8 data blocks output, o
# %rdx: 8 data blocks input, i

# This function encrypts eight consecutive ChaCha20 blocks by loading
# the state matrix in AVX registers eight times. As we need some
# scratch registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
# state matrix, hence requires no word shuffling. For final XORing step
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
# words, which allows us to do XOR in AVX registers. 8/16-bit word
# rotation is done with the slightly better performing byte shuffling,
# 7/12-bit word rotation uses traditional shift+OR.

vzeroupper
# 4 * 32 byte stack, 32-byte aligned
lea 8(%rsp),%r10
and $~31, %rsp
sub $0x80, %rsp

# x0..15[0-7] = s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpbroadcastd 0x04(%rdi),%ymm1
vpbroadcastd 0x08(%rdi),%ymm2
vpbroadcastd 0x0c(%rdi),%ymm3
vpbroadcastd 0x10(%rdi),%ymm4
vpbroadcastd 0x14(%rdi),%ymm5
vpbroadcastd 0x18(%rdi),%ymm6
vpbroadcastd 0x1c(%rdi),%ymm7
vpbroadcastd 0x20(%rdi),%ymm8
vpbroadcastd 0x24(%rdi),%ymm9
vpbroadcastd 0x28(%rdi),%ymm10
vpbroadcastd 0x2c(%rdi),%ymm11
vpbroadcastd 0x30(%rdi),%ymm12
vpbroadcastd 0x34(%rdi),%ymm13
vpbroadcastd 0x38(%rdi),%ymm14
vpbroadcastd 0x3c(%rdi),%ymm15
# x0..3 on stack
vmovdqa %ymm0,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm3,0x60(%rsp)

vmovdqa CTRINC(%rip),%ymm1
vmovdqa ROT8(%rip),%ymm2
vmovdqa ROT16(%rip),%ymm3

# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12

mov $10,%ecx

.Ldoubleround8:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7

# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15

# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
vpaddd %ymm12,%ymm8,%ymm8
vpxor %ymm8,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
vpaddd %ymm13,%ymm9,%ymm9
vpxor %ymm9,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
vpaddd %ymm14,%ymm10,%ymm10
vpxor %ymm10,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vpaddd %ymm15,%ymm11,%ymm11
vpxor %ymm11,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7

# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm3,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm3,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm3,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm3,%ymm14,%ymm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $12,%ymm5,%ymm0
vpsrld $20,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $12,%ymm6,%ymm0
vpsrld $20,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $12,%ymm7,%ymm0
vpsrld $20,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $12,%ymm4,%ymm0
vpsrld $20,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4

# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
vpaddd 0x00(%rsp),%ymm5,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpxor %ymm0,%ymm15,%ymm15
vpshufb %ymm2,%ymm15,%ymm15
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
vpaddd 0x20(%rsp),%ymm6,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpxor %ymm0,%ymm12,%ymm12
vpshufb %ymm2,%ymm12,%ymm12
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
vpaddd 0x40(%rsp),%ymm7,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpxor %ymm0,%ymm13,%ymm13
vpshufb %ymm2,%ymm13,%ymm13
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vpaddd 0x60(%rsp),%ymm4,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpxor %ymm0,%ymm14,%ymm14
vpshufb %ymm2,%ymm14,%ymm14

# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
vpaddd %ymm15,%ymm10,%ymm10
vpxor %ymm10,%ymm5,%ymm5
vpslld $7,%ymm5,%ymm0
vpsrld $25,%ymm5,%ymm5
vpor %ymm0,%ymm5,%ymm5
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
vpaddd %ymm12,%ymm11,%ymm11
vpxor %ymm11,%ymm6,%ymm6
vpslld $7,%ymm6,%ymm0
vpsrld $25,%ymm6,%ymm6
vpor %ymm0,%ymm6,%ymm6
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
vpaddd %ymm13,%ymm8,%ymm8
vpxor %ymm8,%ymm7,%ymm7
vpslld $7,%ymm7,%ymm0
vpsrld $25,%ymm7,%ymm7
vpor %ymm0,%ymm7,%ymm7
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vpaddd %ymm14,%ymm9,%ymm9
vpxor %ymm9,%ymm4,%ymm4
vpslld $7,%ymm4,%ymm0
vpsrld $25,%ymm4,%ymm4
vpor %ymm0,%ymm4,%ymm4

dec %ecx
jnz .Ldoubleround8

# x0..15[0-3] += s[0..15]
vpbroadcastd 0x00(%rdi),%ymm0
vpaddd 0x00(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x00(%rsp)
vpbroadcastd 0x04(%rdi),%ymm0
vpaddd 0x20(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x20(%rsp)
vpbroadcastd 0x08(%rdi),%ymm0
vpaddd 0x40(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x40(%rsp)
vpbroadcastd 0x0c(%rdi),%ymm0
vpaddd 0x60(%rsp),%ymm0,%ymm0
vmovdqa %ymm0,0x60(%rsp)
vpbroadcastd 0x10(%rdi),%ymm0
vpaddd %ymm0,%ymm4,%ymm4
vpbroadcastd 0x14(%rdi),%ymm0
vpaddd %ymm0,%ymm5,%ymm5
vpbroadcastd 0x18(%rdi),%ymm0
vpaddd %ymm0,%ymm6,%ymm6
vpbroadcastd 0x1c(%rdi),%ymm0
vpaddd %ymm0,%ymm7,%ymm7
vpbroadcastd 0x20(%rdi),%ymm0
vpaddd %ymm0,%ymm8,%ymm8
vpbroadcastd 0x24(%rdi),%ymm0
vpaddd %ymm0,%ymm9,%ymm9
vpbroadcastd 0x28(%rdi),%ymm0
vpaddd %ymm0,%ymm10,%ymm10
vpbroadcastd 0x2c(%rdi),%ymm0
vpaddd %ymm0,%ymm11,%ymm11
vpbroadcastd 0x30(%rdi),%ymm0
vpaddd %ymm0,%ymm12,%ymm12
vpbroadcastd 0x34(%rdi),%ymm0
vpaddd %ymm0,%ymm13,%ymm13
vpbroadcastd 0x38(%rdi),%ymm0
vpaddd %ymm0,%ymm14,%ymm14
vpbroadcastd 0x3c(%rdi),%ymm0
vpaddd %ymm0,%ymm15,%ymm15

# x12 += counter values 0-3
vpaddd %ymm1,%ymm12,%ymm12

# interleave 32-bit words in state n, n+1
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x20(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x00(%rsp)
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm1
vpunpckldq %ymm1,%ymm0,%ymm2
vpunpckhdq %ymm1,%ymm0,%ymm1
vmovdqa %ymm2,0x40(%rsp)
vmovdqa %ymm1,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpckldq %ymm5,%ymm0,%ymm4
vpunpckhdq %ymm5,%ymm0,%ymm5
vmovdqa %ymm6,%ymm0
vpunpckldq %ymm7,%ymm0,%ymm6
vpunpckhdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpckldq %ymm9,%ymm0,%ymm8
vpunpckhdq %ymm9,%ymm0,%ymm9
vmovdqa %ymm10,%ymm0
vpunpckldq %ymm11,%ymm0,%ymm10
vpunpckhdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpckldq %ymm13,%ymm0,%ymm12
vpunpckhdq %ymm13,%ymm0,%ymm13
vmovdqa %ymm14,%ymm0
vpunpckldq %ymm15,%ymm0,%ymm14
vpunpckhdq %ymm15,%ymm0,%ymm15

# interleave 64-bit words in state n, n+2
vmovdqa 0x00(%rsp),%ymm0
vmovdqa 0x40(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x00(%rsp)
vmovdqa %ymm2,0x40(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vmovdqa 0x60(%rsp),%ymm2
vpunpcklqdq %ymm2,%ymm0,%ymm1
vpunpckhqdq %ymm2,%ymm0,%ymm2
vmovdqa %ymm1,0x20(%rsp)
vmovdqa %ymm2,0x60(%rsp)
vmovdqa %ymm4,%ymm0
vpunpcklqdq %ymm6,%ymm0,%ymm4
vpunpckhqdq %ymm6,%ymm0,%ymm6
vmovdqa %ymm5,%ymm0
vpunpcklqdq %ymm7,%ymm0,%ymm5
vpunpckhqdq %ymm7,%ymm0,%ymm7
vmovdqa %ymm8,%ymm0
vpunpcklqdq %ymm10,%ymm0,%ymm8
vpunpckhqdq %ymm10,%ymm0,%ymm10
vmovdqa %ymm9,%ymm0
vpunpcklqdq %ymm11,%ymm0,%ymm9
vpunpckhqdq %ymm11,%ymm0,%ymm11
vmovdqa %ymm12,%ymm0
vpunpcklqdq %ymm14,%ymm0,%ymm12
vpunpckhqdq %ymm14,%ymm0,%ymm14
vmovdqa %ymm13,%ymm0
vpunpcklqdq %ymm15,%ymm0,%ymm13
vpunpckhqdq %ymm15,%ymm0,%ymm15

# interleave 128-bit words in state n, n+4
vmovdqa 0x00(%rsp),%ymm0
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
vmovdqa %ymm1,0x00(%rsp)
vmovdqa 0x20(%rsp),%ymm0
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
vmovdqa %ymm1,0x20(%rsp)
vmovdqa 0x40(%rsp),%ymm0
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
vmovdqa %ymm1,0x40(%rsp)
vmovdqa 0x60(%rsp),%ymm0
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
vmovdqa %ymm1,0x60(%rsp)
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
vmovdqa %ymm0,%ymm8
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
vmovdqa %ymm0,%ymm9
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
vmovdqa %ymm0,%ymm10
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
vmovdqa %ymm0,%ymm11

# xor with corresponding input, write to output
vmovdqa 0x00(%rsp),%ymm0
vpxor 0x0000(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0000(%rsi)
vmovdqa 0x20(%rsp),%ymm0
vpxor 0x0080(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0080(%rsi)
vmovdqa 0x40(%rsp),%ymm0
vpxor 0x0040(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x0040(%rsi)
vmovdqa 0x60(%rsp),%ymm0
vpxor 0x00c0(%rdx),%ymm0,%ymm0
vmovdqu %ymm0,0x00c0(%rsi)
vpxor 0x0100(%rdx),%ymm4,%ymm4
vmovdqu %ymm4,0x0100(%rsi)
vpxor 0x0180(%rdx),%ymm5,%ymm5
vmovdqu %ymm5,0x0180(%rsi)
vpxor 0x0140(%rdx),%ymm6,%ymm6
vmovdqu %ymm6,0x0140(%rsi)
vpxor 0x01c0(%rdx),%ymm7,%ymm7
vmovdqu %ymm7,0x01c0(%rsi)
vpxor 0x0020(%rdx),%ymm8,%ymm8
vmovdqu %ymm8,0x0020(%rsi)
vpxor 0x00a0(%rdx),%ymm9,%ymm9
vmovdqu %ymm9,0x00a0(%rsi)
vpxor 0x0060(%rdx),%ymm10,%ymm10
vmovdqu %ymm10,0x0060(%rsi)
vpxor 0x00e0(%rdx),%ymm11,%ymm11
vmovdqu %ymm11,0x00e0(%rsi)
vpxor 0x0120(%rdx),%ymm12,%ymm12
vmovdqu %ymm12,0x0120(%rsi)
vpxor 0x01a0(%rdx),%ymm13,%ymm13
vmovdqu %ymm13,0x01a0(%rsi)
vpxor 0x0160(%rdx),%ymm14,%ymm14
vmovdqu %ymm14,0x0160(%rsi)
vpxor 0x01e0(%rdx),%ymm15,%ymm15
vmovdqu %ymm15,0x01e0(%rsi)

vzeroupper
lea -8(%r10),%rsp
ret
ENDPROC(chacha20_8block_xor_avx2)

@@ -1,146 +0,0 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/

#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>

#define CHACHA20_STATE_ALIGN 16

asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
#ifdef CONFIG_AS_AVX2
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
static bool chacha20_use_avx2;
#endif

static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
                            unsigned int bytes)
{
        u8 buf[CHACHA_BLOCK_SIZE];

#ifdef CONFIG_AS_AVX2
        if (chacha20_use_avx2) {
                while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                        chacha20_8block_xor_avx2(state, dst, src);
                        bytes -= CHACHA_BLOCK_SIZE * 8;
                        src += CHACHA_BLOCK_SIZE * 8;
                        dst += CHACHA_BLOCK_SIZE * 8;
                        state[12] += 8;
                }
        }
#endif
        while (bytes >= CHACHA_BLOCK_SIZE * 4) {
                chacha20_4block_xor_ssse3(state, dst, src);
                bytes -= CHACHA_BLOCK_SIZE * 4;
                src += CHACHA_BLOCK_SIZE * 4;
                dst += CHACHA_BLOCK_SIZE * 4;
                state[12] += 4;
        }
        while (bytes >= CHACHA_BLOCK_SIZE) {
                chacha20_block_xor_ssse3(state, dst, src);
                bytes -= CHACHA_BLOCK_SIZE;
                src += CHACHA_BLOCK_SIZE;
                dst += CHACHA_BLOCK_SIZE;
                state[12]++;
        }
        if (bytes) {
                memcpy(buf, src, bytes);
                chacha20_block_xor_ssse3(state, buf, buf);
                memcpy(dst, buf, bytes);
        }
}

static int chacha20_simd(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
        u32 *state, state_buf[16 + 2] __aligned(8);
        struct skcipher_walk walk;
        int err;

        BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
        state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);

        if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
                return crypto_chacha_crypt(req);

        err = skcipher_walk_virt(&walk, req, true);

        crypto_chacha_init(state, ctx, walk.iv);

        kernel_fpu_begin();

        while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
                chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
                                rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
                err = skcipher_walk_done(&walk,
                                         walk.nbytes % CHACHA_BLOCK_SIZE);
        }

        if (walk.nbytes) {
                chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
                                walk.nbytes);
                err = skcipher_walk_done(&walk, 0);
        }

        kernel_fpu_end();

        return err;
}

static struct skcipher_alg alg = {
        .base.cra_name = "chacha20",
        .base.cra_driver_name = "chacha20-simd",
        .base.cra_priority = 300,
        .base.cra_blocksize = 1,
        .base.cra_ctxsize = sizeof(struct chacha_ctx),
        .base.cra_module = THIS_MODULE,

        .min_keysize = CHACHA_KEY_SIZE,
        .max_keysize = CHACHA_KEY_SIZE,
        .ivsize = CHACHA_IV_SIZE,
        .chunksize = CHACHA_BLOCK_SIZE,
        .setkey = crypto_chacha20_setkey,
        .encrypt = chacha20_simd,
        .decrypt = chacha20_simd,
};

static int __init chacha20_simd_mod_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_SSSE3))
                return -ENODEV;

#ifdef CONFIG_AS_AVX2
        chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
                            boot_cpu_has(X86_FEATURE_AVX2) &&
                            cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
#endif
        return crypto_register_skcipher(&alg);
}

static void __exit chacha20_simd_mod_fini(void)
{
        crypto_unregister_skcipher(&alg);
}

module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
arch/x86/crypto/chacha_glue.c (new file, 322 lines)

@@ -0,0 +1,322 @@
/*
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/

#include <crypto/algapi.h>
#include <crypto/internal/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/fpu/api.h>
#include <asm/simd.h>

asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
                                        unsigned int len, int nrounds);
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);

asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
                                       unsigned int len, int nrounds);

asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
                                           unsigned int len, int nrounds);

static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);

static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
{
        len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
        return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
}

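chacha_advance computes how far to bump the block counter after one of the multi-block helpers has consumed a possibly partial tail: the length is clamped to what the helper can emit, then rounded up to whole 64-byte blocks. A few sample values (CHACHA_BLOCK_SIZE is 64; values worked out from the definition above):

/* chacha_advance(len, maxblocks) == blocks actually generated:
 *   chacha_advance(64, 4)  -> 1   (exactly one full block)
 *   chacha_advance(100, 4) -> 2   (one full block plus a partial one)
 *   chacha_advance(500, 4) -> 4   (clamped to maxblocks * 64 = 256 bytes)
 */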
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
                          unsigned int bytes, int nrounds)
{
        if (IS_ENABLED(CONFIG_AS_AVX512) &&
            static_branch_likely(&chacha_use_avx512vl)) {
                while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                        chacha_8block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        bytes -= CHACHA_BLOCK_SIZE * 8;
                        src += CHACHA_BLOCK_SIZE * 8;
                        dst += CHACHA_BLOCK_SIZE * 8;
                        state[12] += 8;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 4) {
                        chacha_8block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 8);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 2) {
                        chacha_4block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 4);
                        return;
                }
                if (bytes) {
                        chacha_2block_xor_avx512vl(state, dst, src, bytes,
                                                   nrounds);
                        state[12] += chacha_advance(bytes, 2);
                        return;
                }
        }

        if (IS_ENABLED(CONFIG_AS_AVX2) &&
            static_branch_likely(&chacha_use_avx2)) {
                while (bytes >= CHACHA_BLOCK_SIZE * 8) {
                        chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
                        bytes -= CHACHA_BLOCK_SIZE * 8;
                        src += CHACHA_BLOCK_SIZE * 8;
                        dst += CHACHA_BLOCK_SIZE * 8;
                        state[12] += 8;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 4) {
                        chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 8);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE * 2) {
                        chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 4);
                        return;
                }
                if (bytes > CHACHA_BLOCK_SIZE) {
                        chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
                        state[12] += chacha_advance(bytes, 2);
                        return;
                }
        }

        while (bytes >= CHACHA_BLOCK_SIZE * 4) {
                chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
                bytes -= CHACHA_BLOCK_SIZE * 4;
                src += CHACHA_BLOCK_SIZE * 4;
                dst += CHACHA_BLOCK_SIZE * 4;
                state[12] += 4;
        }
        if (bytes > CHACHA_BLOCK_SIZE) {
                chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
                state[12] += chacha_advance(bytes, 4);
                return;
        }
        if (bytes) {
                chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
                state[12]++;
        }
}

void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
{
        if (!static_branch_likely(&chacha_use_simd) || !may_use_simd()) {
                hchacha_block_generic(state, stream, nrounds);
        } else {
                kernel_fpu_begin();
                hchacha_block_ssse3(state, stream, nrounds);
                kernel_fpu_end();
        }
}
EXPORT_SYMBOL(hchacha_block_arch);

void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
{
        chacha_init_generic(state, key, iv);
}
EXPORT_SYMBOL(chacha_init_arch);

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
                       int nrounds)
{
        if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
            bytes <= CHACHA_BLOCK_SIZE)
                return chacha_crypt_generic(state, dst, src, bytes, nrounds);

        do {
                unsigned int todo = min_t(unsigned int, bytes, SZ_4K);

                kernel_fpu_begin();
                chacha_dosimd(state, dst, src, todo, nrounds);
                kernel_fpu_end();

                bytes -= todo;
                src += todo;
                dst += todo;
        } while (bytes);
}
EXPORT_SYMBOL(chacha_crypt_arch);

static int chacha_simd_stream_xor(struct skcipher_request *req,
                                  const struct chacha_ctx *ctx, const u8 *iv)
{
        u32 state[CHACHA_STATE_WORDS] __aligned(8);
        struct skcipher_walk walk;
        int err;

        err = skcipher_walk_virt(&walk, req, false);

        chacha_init_generic(state, ctx->key, iv);

        while (walk.nbytes > 0) {
                unsigned int nbytes = walk.nbytes;

                if (nbytes < walk.total)
                        nbytes = round_down(nbytes, walk.stride);

                if (!static_branch_likely(&chacha_use_simd) ||
                    !may_use_simd()) {
                        chacha_crypt_generic(state, walk.dst.virt.addr,
                                             walk.src.virt.addr, nbytes,
                                             ctx->nrounds);
                } else {
                        kernel_fpu_begin();
                        chacha_dosimd(state, walk.dst.virt.addr,
                                      walk.src.virt.addr, nbytes,
                                      ctx->nrounds);
                        kernel_fpu_end();
                }
                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }

        return err;
}

static int chacha_simd(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);

        return chacha_simd_stream_xor(req, ctx, req->iv);
}

static int xchacha_simd(struct skcipher_request *req)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
        u32 state[CHACHA_STATE_WORDS] __aligned(8);
        struct chacha_ctx subctx;
        u8 real_iv[16];

        chacha_init_generic(state, ctx->key, req->iv);

        if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
                kernel_fpu_begin();
                hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
                kernel_fpu_end();
        } else {
                hchacha_block_generic(state, subctx.key, ctx->nrounds);
        }
        subctx.nrounds = ctx->nrounds;

        memcpy(&real_iv[0], req->iv + 24, 8);
        memcpy(&real_iv[8], req->iv + 16, 8);
        return chacha_simd_stream_xor(req, &subctx, real_iv);
}

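xchacha_simd follows the XChaCha construction: the first 16 bytes of the 32-byte IV feed HChaCha to derive the one-time subkey, bytes 16-23 carry the remaining 64 nonce bits, and bytes 24-31 the stream position, so real_iv is assembled as [position | remaining nonce] before being handed to the inner ChaCha. A sketch of the resulting layout (derived from the two memcpy calls above; illustrative only):

/* 32-byte XChaCha IV as consumed by xchacha_simd:
 *
 *   iv[ 0..15]  -> HChaCha input        (derives subctx.key)
 *   iv[16..23]  -> real_iv[8..15]       (remaining 64 nonce bits)
 *   iv[24..31]  -> real_iv[0..7]        (stream position / counter)
 */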
static struct skcipher_alg algs[] = {
        {
                .base.cra_name = "chacha20",
                .base.cra_driver_name = "chacha20-simd",
                .base.cra_priority = 300,
                .base.cra_blocksize = 1,
                .base.cra_ctxsize = sizeof(struct chacha_ctx),
                .base.cra_module = THIS_MODULE,

                .min_keysize = CHACHA_KEY_SIZE,
                .max_keysize = CHACHA_KEY_SIZE,
                .ivsize = CHACHA_IV_SIZE,
                .chunksize = CHACHA_BLOCK_SIZE,
                .setkey = chacha20_setkey,
                .encrypt = chacha_simd,
                .decrypt = chacha_simd,
        }, {
                .base.cra_name = "xchacha20",
                .base.cra_driver_name = "xchacha20-simd",
                .base.cra_priority = 300,
                .base.cra_blocksize = 1,
                .base.cra_ctxsize = sizeof(struct chacha_ctx),
                .base.cra_module = THIS_MODULE,

                .min_keysize = CHACHA_KEY_SIZE,
                .max_keysize = CHACHA_KEY_SIZE,
                .ivsize = XCHACHA_IV_SIZE,
                .chunksize = CHACHA_BLOCK_SIZE,
                .setkey = chacha20_setkey,
                .encrypt = xchacha_simd,
                .decrypt = xchacha_simd,
        }, {
                .base.cra_name = "xchacha12",
                .base.cra_driver_name = "xchacha12-simd",
                .base.cra_priority = 300,
                .base.cra_blocksize = 1,
                .base.cra_ctxsize = sizeof(struct chacha_ctx),
                .base.cra_module = THIS_MODULE,

                .min_keysize = CHACHA_KEY_SIZE,
                .max_keysize = CHACHA_KEY_SIZE,
                .ivsize = XCHACHA_IV_SIZE,
                .chunksize = CHACHA_BLOCK_SIZE,
                .setkey = chacha12_setkey,
                .encrypt = xchacha_simd,
                .decrypt = xchacha_simd,
        },
};

static int __init chacha_simd_mod_init(void)
{
        if (!boot_cpu_has(X86_FEATURE_SSSE3))
                return 0;

        static_branch_enable(&chacha_use_simd);

        if (IS_ENABLED(CONFIG_AS_AVX2) &&
            boot_cpu_has(X86_FEATURE_AVX) &&
            boot_cpu_has(X86_FEATURE_AVX2) &&
            cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
                static_branch_enable(&chacha_use_avx2);

                if (IS_ENABLED(CONFIG_AS_AVX512) &&
                    boot_cpu_has(X86_FEATURE_AVX512VL) &&
                    boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */
                        static_branch_enable(&chacha_use_avx512vl);
        }
        return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
                crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
}

static void __exit chacha_simd_mod_fini(void)
{
        if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && boot_cpu_has(X86_FEATURE_SSSE3))
                crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}

module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");
Some files were not shown because too many files have changed in this diff.