Merge android-4.19-stable.157 (8ee67bc) into msm-4.19
* refs/heads/tmp-8ee67bc Revert "nl80211: fix non-split wiphy information" Reverting usb changes Linux 4.19.157 powercap: restrict energy meter to root access Revert "ANDROID: Kbuild, LLVMLinux: allow overriding clang target triple" Linux 4.19.156 arm64: dts: marvell: espressobin: Add ethernet switch aliases net: dsa: read mac address from DT for slave device tools: perf: Fix build error in v4.19.y perf/core: Fix a memory leak in perf_event_parse_addr_filter() PM: runtime: Resume the device earlier in __device_release_driver() Revert "ARC: entry: fix potential EFA clobber when TIF_SYSCALL_TRACE" ARC: stack unwinding: avoid indefinite looping usb: mtu3: fix panic in mtu3_gadget_stop() USB: Add NO_LPM quirk for Kingston flash drive USB: serial: option: add Telit FN980 composition 0x1055 USB: serial: option: add LE910Cx compositions 0x1203, 0x1230, 0x1231 USB: serial: option: add Quectel EC200T module support USB: serial: cyberjack: fix write-URB completion race serial: txx9: add missing platform_driver_unregister() on error in serial_txx9_init serial: 8250_mtk: Fix uart_get_baud_rate warning fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent vt: Disable KD_FONT_OP_COPY ACPI: NFIT: Fix comparison to '-ENXIO' drm/vc4: drv: Add error handding for bind vsock: use ns_capable_noaudit() on socket create scsi: core: Don't start concurrent async scan on same host blk-cgroup: Pre-allocate tree node on blkg_conf_prep blk-cgroup: Fix memleak on error path of: Fix reserved-memory overlap detection x86/kexec: Use up-to-dated screen_info copy to fill boot params ARM: dts: sun4i-a10: fix cpu_alert temperature futex: Handle transient "ownerless" rtmutex state correctly tracing: Fix out of bounds write in get_trace_buf ftrace: Handle tracing when switching between context ftrace: Fix recursion check for NMI test ring-buffer: Fix recursion protection transitions between interrupt context gfs2: Wake up when sd_glock_disposal becomes zero mm: always have 
io_remap_pfn_range() set pgprot_decrypted() kthread_worker: prevent queuing delayed work from timer_fn when it is being canceled lib/crc32test: remove extra local_irq_disable/enable mm: mempolicy: fix potential pte_unmap_unlock pte error ALSA: usb-audio: Add implicit feedback quirk for MODX ALSA: usb-audio: Add implicit feedback quirk for Qu-16 ALSA: usb-audio: add usb vendor id as DSD-capable for Khadas devices ALSA: usb-audio: Add implicit feedback quirk for Zoom UAC-2 Fonts: Replace discarded const qualifier btrfs: tree-checker: fix the error message for transid error btrfs: tree-checker: Verify inode item btrfs: tree-checker: Enhance chunk checker to validate chunk profile btrfs: tree-checker: Fix wrong check on max devid btrfs: tree-checker: Verify dev item btrfs: tree-checker: Check chunk item at tree block read time btrfs: tree-checker: Make btrfs_check_chunk_valid() return EUCLEAN instead of EIO btrfs: tree-checker: Make chunk item checker messages more readable btrfs: Move btrfs_check_chunk_valid() to tree-check.[ch] and export it btrfs: Don't submit any btree write bio if the fs has errors Btrfs: fix unwritten extent buffers and hangs on future writeback attempts btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io() btrfs: extent_io: Handle errors better in btree_write_cache_pages() btrfs: extent_io: Handle errors better in extent_write_full_page() btrfs: flush write bio if we loop in extent_write_cache_pages Revert "btrfs: flush write bio if we loop in extent_write_cache_pages" btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up btrfs: extent_io: Kill the forward declaration of flush_write_bio blktrace: fix debugfs use after free sfp: Fix error handing in sfp_probe() sctp: Fix COMM_LOST/CANT_STR_ASSOC err reporting on big-endian platforms net: usb: qmi_wwan: add Telit LE910Cx 0x1230 composition gianfar: Account for Tx PTP timestamp in the skb headroom gianfar: Replace skb_realloc_headroom with skb_cow_head for PTP 
chelsio/chtls: fix always leaking ctrl_skb chelsio/chtls: fix memory leaks caused by a race cadence: force nonlinear buffers to be cloned ptrace: fix task_join_group_stop() for the case when current is traced tipc: fix use-after-free in tipc_bcast_get_mode drm/i915: Break up error capture compression loops with cond_resched() ANDROID: fuse: Add support for d_canonical_path ANDROID: vfs: add d_canonical_path for stacked filesystem support ANDROID: Temporarily disable XFRM_USER_COMPAT filtering Linux 4.19.155 staging: octeon: Drop on uncorrectable alignment or FCS error staging: octeon: repair "fixed-link" support staging: comedi: cb_pcidas: Allow 2-channel commands for AO subdevice KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR device property: Don't clear secondary pointer for shared primary firmware node device property: Keep secondary firmware node secondary by type ARM: s3c24xx: fix missing system reset ARM: samsung: fix PM debug build with DEBUG_LL but !MMU arm: dts: mt7623: add missing pause for switchport hil/parisc: Disable HIL driver when it gets stuck cachefiles: Handle readpage error correctly arm64: berlin: Select DW_APB_TIMER_OF tty: make FONTX ioctl use the tty pointer they were actually passed rtc: rx8010: don't modify the global rtc ops drm/ttm: fix eviction valuable range check. 
ext4: fix invalid inode checksum ext4: fix error handling code in add_new_gdb ext4: fix leaking sysfs kobject after failed mount vringh: fix __vringh_iov() when riov and wiov are different ring-buffer: Return 0 on success from ring_buffer_resize() 9P: Cast to loff_t before multiplying libceph: clear con->out_msg on Policy::stateful_server faults ceph: promote to unsigned long long before shifting drm/amd/display: Don't invoke kgdb_breakpoint() unconditionally drm/amdgpu: don't map BO in reserved region i2c: imx: Fix external abort on interrupt in exit paths ia64: fix build error with !COREDUMP ubi: check kthread_should_stop() after the setting of task state perf python scripting: Fix printable strings in python3 scripts ubifs: dent: Fix some potential memory leaks while iterating entries NFSD: Add missing NFSv2 .pc_func methods NFSv4.2: support EXCHGID4_FLAG_SUPP_FENCE_OPS 4.2 EXCHANGE_ID flag powerpc: Fix undetected data corruption with P9N DD2.1 VSX CI load emulation powerpc/powernv/elog: Fix race while processing OPAL error log event. powerpc: Warn about use of smt_snooze_delay powerpc/rtas: Restrict RTAS requests from userspace s390/stp: add locking to sysfs functions powerpc/drmem: Make lmb_size 64 bit iio:gyro:itg3200: Fix timestamp alignment and prevent data leak. iio:adc:ti-adc12138 Fix alignment issue with timestamp iio:adc:ti-adc0832 Fix alignment issue with timestamp iio:light:si1145: Fix timestamp alignment and prevent data leak. 
dmaengine: dma-jz4780: Fix race in jz4780_dma_tx_status udf: Fix memory leak when mounting HID: wacom: Avoid entering wacom_wac_pen_report for pad / battery vt: keyboard, extend func_buf_lock to readers vt: keyboard, simplify vt_kdgkbsent drm/i915: Force VT'd workarounds when running as a guest OS usb: host: fsl-mph-dr-of: check return of dma_set_mask() usb: typec: tcpm: reset hard_reset_count for any disconnect usb: cdc-acm: fix cooldown mechanism usb: dwc3: core: don't trigger runtime pm when remove driver usb: dwc3: core: add phy cleanup for probe error handling usb: dwc3: gadget: Check MPS of the request length usb: dwc3: ep0: Fix ZLP for OUT ep0 requests usb: xhci: Workaround for S3 issue on AMD SNPS 3.0 xHC btrfs: fix use-after-free on readahead extent after failure to create it btrfs: cleanup cow block on error btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send() btrfs: send, recompute reference path after orphanization of a directory btrfs: reschedule if necessary when logging directory items btrfs: improve device scanning messages btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode scsi: qla2xxx: Fix crash on session cleanup with unload scsi: mptfusion: Fix null pointer dereferences in mptscsih_remove() w1: mxc_w1: Fix timeout resolution problem leading to bus error acpi-cpufreq: Honor _PSD table setting on new AMD CPUs ACPI: debug: don't allow debugging when ACPI is disabled ACPI: video: use ACPI backlight for HP 635 Notebook ACPI / extlog: Check for RDMSR failure ACPI: button: fix handling lid state changes when input device closed NFS: fix nfs_path in case of a rename retry fs: Don't invalidate page buffers in block_write_full_page() media: uvcvideo: Fix uvc_ctrl_fixup_xu_info() not having any effect leds: bcm6328, bcm6358: use devres LED registering function perf/x86/amd/ibs: Fix raw sample data accumulation perf/x86/amd/ibs: Don't include randomized bits in get_ibs_op_count() mmc: sdhci-acpi: AMDI0040: Set 
SDHCI_QUIRK2_PRESET_VALUE_BROKEN md/raid5: fix oops during stripe resizing nvme-rdma: fix crash when connect rejected sgl_alloc_order: fix memory leak nbd: make the config put is called before the notifying the waiter ARM: dts: s5pv210: remove dedicated 'audio-subsystem' node ARM: dts: s5pv210: move PMU node out of clock controller ARM: dts: s5pv210: remove DMA controller bus node name to fix dtschema warnings memory: emif: Remove bogus debugfs error handling ARM: dts: omap4: Fix sgx clock rate for 4430 arm64: dts: renesas: ulcb: add full-pwr-cycle-in-suspend into eMMC nodes cifs: handle -EINTR in cifs_setattr gfs2: add validation checks for size of superblock ext4: Detect already used quota file early drivers: watchdog: rdc321x_wdt: Fix race condition bugs net: 9p: initialize sun_server.sun_path to have addr's value only when addr is valid clk: ti: clockdomain: fix static checker warning rpmsg: glink: Use complete_all for open states bnxt_en: Log unknown link speed appropriately. md/bitmap: md_bitmap_get_counter returns wrong blocks btrfs: fix replace of seed device drm/amd/display: HDMI remote sink need mode validation for Linux power: supply: test_power: add missing newlines when printing parameters by sysfs bus/fsl_mc: Do not rely on caller to provide non NULL mc_io drivers/net/wan/hdlc_fr: Correctly handle special skb->protocol values ACPI: Add out of bounds and numa_off protections to pxm_to_node() xfs: don't free rt blocks when we're doing a REMAP bunmapi call arm64/mm: return cpu_all_mask when node is NUMA_NO_NODE usb: xhci: omit duplicate actions when suspending a runtime suspended host. 
uio: free uio id after uio file node is freed USB: adutux: fix debugging cpufreq: sti-cpufreq: add stih418 support riscv: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO media: uvcvideo: Fix dereference of out-of-bound list iterator kgdb: Make "kgdbcon" work properly with "kgdb_earlycon" ia64: kprobes: Use generic kretprobe trampoline handler printk: reduce LOG_BUF_SHIFT range for H8300 arm64: topology: Stop using MPIDR for topology information drm/bridge/synopsys: dsi: add support for non-continuous HS clock mmc: via-sdmmc: Fix data race bug media: imx274: fix frame interval handling media: tw5864: check status of tw5864_frameinterval_get usb: typec: tcpm: During PR_SWAP, source caps should be sent only after tSwapSourceStart media: platform: Improve queue set up flow for bug fixing media: videodev2.h: RGB BT2020 and HSV are always full range drm/brige/megachips: Add checking if ge_b850v3_lvds_init() is working correctly ath10k: fix VHT NSS calculation when STBC is enabled ath10k: start recovery process when payload length exceeds max htc length for sdio video: fbdev: pvr2fb: initialize variables xfs: fix realtime bitmap/summary file truncation when growing rt volume power: supply: bq27xxx: report "not charging" on all types ARM: 8997/2: hw_breakpoint: Handle inexact watchpoint addresses um: change sigio_spinlock to a mutex f2fs: fix to check segment boundary during SIT page readahead f2fs: fix uninit-value in f2fs_lookup f2fs: add trace exit in exception path sparc64: remove mm_cpumask clearing to fix kthread_use_mm race powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race powerpc/powernv/smp: Fix spurious DBG() warning futex: Fix incorrect should_fail_futex() handling ata: sata_nv: Fix retrieving of active qcs RDMA/qedr: Fix memory leak in iWARP CM mlxsw: core: Fix use-after-free in mlxsw_emad_trans_finish() x86/unwind/orc: Fix inactive tasks with stack pointer in %sp on GCC 10 compiled kernels 
xen/events: block rogue events for some time xen/events: defer eoi in case of excessive number of events xen/events: use a common cpu hotplug hook for event channels xen/events: switch user event channels to lateeoi model xen/pciback: use lateeoi irq binding xen/pvcallsback: use lateeoi irq binding xen/scsiback: use lateeoi irq binding xen/netback: use lateeoi irq binding xen/blkback: use lateeoi irq binding xen/events: add a new "late EOI" evtchn framework xen/events: fix race in evtchn_fifo_unmask() xen/events: add a proper barrier to 2-level uevent unmasking xen/events: avoid removing an event channel while handling it xen/events: don't use chip_data for legacy IRQs Revert "block: ratelimit handle_bad_sector() message" fscrypt: fix race where ->lookup() marks plaintext dentry as ciphertext fscrypt: only set dentry_operations on ciphertext dentries fs, fscrypt: clear DCACHE_ENCRYPTED_NAME when unaliasing directory fscrypt: fix race allowing rename() and link() of ciphertext dentries fscrypt: clean up and improve dentry revalidation fscrypt: return -EXDEV for incompatible rename or link into encrypted dir ata: sata_rcar: Fix DMA boundary mask serial: pl011: Fix lockdep splat when handling magic-sysrq interrupt mtd: lpddr: Fix bad logic in print_drs_error RDMA/addr: Fix race with netevent_callback()/rdma_addr_cancel() cxl: Rework error message for incompatible slots p54: avoid accessing the data mapped to streaming DMA evm: Check size of security.evm before using it bpf: Fix comment for helper bpf_current_task_under_cgroup() fuse: fix page dereference after free x86/xen: disable Firmware First mode for correctable memory errors arch/x86/amd/ibs: Fix re-arming IBS Fetch cxgb4: set up filter action after rewrites r8169: fix issue with forced threading in combination with shared interrupts tipc: fix memory leak caused by tipc_buf_append() tcp: Prevent low rmem stalls with SO_RCVLOWAT. 
ravb: Fix bit fields checking in ravb_hwtstamp_get() netem: fix zero division in tabledist mlxsw: core: Fix memory leak on module removal gtp: fix an use-before-init in gtp_newlink() chelsio/chtls: fix tls record info to user chelsio/chtls: fix memory leaks in CPL handlers chelsio/chtls: fix deadlock issue efivarfs: Replace invalid slashes with exclamation marks in dentries. x86/PCI: Fix intel_mid_pci.c build error when ACPI is not enabled arm64: link with -z norelro regardless of CONFIG_RELOCATABLE arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs scripts/setlocalversion: make git describe output more reliable objtool: Support Clang non-section symbols in ORC generation ANDROID: GKI: Enable DEBUG_INFO_DWARF4 UPSTREAM: mm/sl[uo]b: export __kmalloc_track(_node)_caller BACKPORT: xfrm/compat: Translate 32-bit user_policy from sockptr BACKPORT: xfrm/compat: Add 32=>64-bit messages translator UPSTREAM: xfrm/compat: Attach xfrm dumps to 64=>32 bit translator UPSTREAM: xfrm/compat: Add 64=>32-bit messages translator BACKPORT: xfrm: Provide API to register translator module ANDROID: Publish uncompressed Image on aarch64 FROMLIST: crypto: arm64/poly1305-neon - reorder PAC authentication with SP update UPSTREAM: crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian UPSTREAM: crypto: arm64/chacha - fix hchacha_block_neon() for big endian Linux 4.19.154 usb: gadget: f_ncm: allow using NCM in SuperSpeed Plus gadgets. eeprom: at25: set minimum read/write access stride to 1 USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync(). 
usb: cdc-acm: add quirk to blacklist ETAS ES58X devices tty: serial: fsl_lpuart: fix lpuart32_poll_get_char net: korina: cast KSEG0 address to pointer in kfree ath10k: check idx validity in __ath10k_htt_rx_ring_fill_n() scsi: ufs: ufs-qcom: Fix race conditions caused by ufs_qcom_testbus_config() usb: core: Solve race condition in anchor cleanup functions brcm80211: fix possible memleak in brcmf_proto_msgbuf_attach mwifiex: don't call del_timer_sync() on uninitialized timer reiserfs: Fix memory leak in reiserfs_parse_options() ipvs: Fix uninit-value in do_ip_vs_set_ctl() tty: ipwireless: fix error handling scsi: qedi: Fix list_del corruption while removing active I/O scsi: qedi: Protect active command list to avoid list corruption Fix use after free in get_capset_info callback. rtl8xxxu: prevent potential memory leak brcmsmac: fix memory leak in wlc_phy_attach_lcnphy scsi: ibmvfc: Fix error return in ibmvfc_probe() Bluetooth: Only mark socket zapped after unlocking usb: ohci: Default to per-port over-current protection xfs: make sure the rt allocator doesn't run off the end reiserfs: only call unlock_new_inode() if I_NEW misc: rtsx: Fix memory leak in rtsx_pci_probe ath9k: hif_usb: fix race condition between usb_get_urb() and usb_kill_anchored_urbs() can: flexcan: flexcan_chip_stop(): add error handling and propagate error value usb: dwc3: simple: add support for Hikey 970 USB: cdc-acm: handle broken union descriptors udf: Avoid accessing uninitialized data on failed inode read udf: Limit sparing table size usb: gadget: function: printer: fix use-after-free in __lock_acquire misc: vop: add round_up(x,4) for vring_size to avoid kernel panic mic: vop: copy data to kernel space then write to io memory scsi: target: core: Add CONTROL field for trace events scsi: mvumi: Fix error return in mvumi_io_attach() PM: hibernate: remove the bogus call to get_gendisk() in software_resume() mac80211: handle lack of sband->bitrates in rates ip_gre: set dev->hard_header_len and 
dev->needed_headroom properly ntfs: add check for mft record size in superblock media: venus: core: Fix runtime PM imbalance in venus_probe fs: dlm: fix configfs memory leak media: saa7134: avoid a shift overflow mmc: sdio: Check for CISTPL_VERS_1 buffer size media: uvcvideo: Ensure all probed info is returned to v4l2 media: media/pci: prevent memory leak in bttv_probe media: bdisp: Fix runtime PM imbalance on error media: platform: sti: hva: Fix runtime PM imbalance on error media: platform: s3c-camif: Fix runtime PM imbalance on error media: vsp1: Fix runtime PM imbalance on error media: exynos4-is: Fix a reference count leak media: exynos4-is: Fix a reference count leak due to pm_runtime_get_sync media: exynos4-is: Fix several reference count leaks due to pm_runtime_get_sync media: sti: Fix reference count leaks media: st-delta: Fix reference count leak in delta_run_work media: ati_remote: sanity check for both endpoints media: firewire: fix memory leak crypto: ccp - fix error handling block: ratelimit handle_bad_sector() message i2c: core: Restore acpi_walk_dep_device_list() getting called after registering the ACPI i2c devs perf: correct SNOOPX field offset sched/features: Fix !CONFIG_JUMP_LABEL case NTB: hw: amd: fix an issue about leak system resources nvmet: fix uninitialized work for zero kato powerpc/powernv/dump: Fix race while processing OPAL dump arm64: dts: zynqmp: Remove additional compatible string for i2c IPs ARM: dts: owl-s500: Fix incorrect PPI interrupt specifiers arm64: dts: qcom: msm8916: Fix MDP/DSI interrupts arm64: dts: qcom: pm8916: Remove invalid reg size from wcd_codec memory: fsl-corenet-cf: Fix handling of platform_get_irq() error memory: omap-gpmc: Fix build error without CONFIG_OF memory: omap-gpmc: Fix a couple off by ones ARM: dts: sun8i: r40: bananapi-m2-ultra: Fix dcdc1 regulator ARM: dts: imx6sl: fix rng node netfilter: nf_fwd_netdev: clear timestamp in forwarding path netfilter: conntrack: connection timeout after re-register 
KVM: x86: emulating RDPID failure shall return #UD rather than #GP Input: sun4i-ps2 - fix handling of platform_get_irq() error Input: twl4030_keypad - fix handling of platform_get_irq() error Input: omap4-keypad - fix handling of platform_get_irq() error Input: ep93xx_keypad - fix handling of platform_get_irq() error Input: stmfts - fix a & vs && typo Input: imx6ul_tsc - clean up some errors in imx6ul_tsc_resume() SUNRPC: fix copying of multiple pages in gss_read_proxy_verf() vfio iommu type1: Fix memory leak in vfio_iommu_type1_pin_pages vfio/pci: Clear token on bypass registration failure ext4: limit entries returned when counting fsmap records svcrdma: fix bounce buffers for unaligned offsets and multiple pages watchdog: sp5100: Fix definition of EFCH_PM_DECODEEN3 watchdog: Use put_device on error watchdog: Fix memleak in watchdog_cdev_register clk: bcm2835: add missing release if devm_clk_hw_register fails clk: at91: clk-main: update key before writing AT91_CKGR_MOR clk: rockchip: Initialize hw to error to avoid undefined behavior pwm: img: Fix null pointer access in probe rpmsg: smd: Fix a kobj leak in in qcom_smd_parse_edge() PCI: iproc: Set affinity mask on MSI interrupts i2c: rcar: Auto select RESET_CONTROLLER mailbox: avoid timer start from callback rapidio: fix the missed put_device() for rio_mport_add_riodev rapidio: fix error handling path ramfs: fix nommu mmap with gaps in the page cache lib/crc32.c: fix trivial typo in preprocessor condition f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info IB/rdmavt: Fix sizeof mismatch cpufreq: powernv: Fix frame-size-overflow in powernv_cpufreq_reboot_notifier powerpc/perf/hv-gpci: Fix starting index value powerpc/perf: Exclude pmc5/6 from the irrelevant PMU group constraints overflow: Include header file with SIZE_MAX declaration kdb: Fix pager search for multi-line strings RDMA/hns: Fix missing sq_sig_type when querying QP RDMA/hns: Set the unsupported wr opcode perf intel-pt: Fix "context_switch 
event has no tid" error RDMA/cma: Consolidate the destruction of a cma_multicast in one place RDMA/cma: Remove dead code for kernel rdmacm multicast powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm powerpc/tau: Disable TAU between measurements powerpc/tau: Check processor type before enabling TAU interrupt ANDROID: GKI: update the ABI xml Linux 4.19.153 powerpc/tau: Remove duplicated set_thresholds() call powerpc/tau: Convert from timer to workqueue powerpc/tau: Use appropriate temperature sample interval RDMA/qedr: Fix inline size returned for iWARP RDMA/qedr: Fix use of uninitialized field xfs: fix high key handling in the rt allocator's query_range function xfs: limit entries returned when counting fsmap records arc: plat-hsdk: fix kconfig dependency warning when !RESET_CONTROLLER ARM: 9007/1: l2c: fix prefetch bits init in L2X0_AUX_CTRL using DT values mtd: mtdoops: Don't write panic data twice powerpc/pseries: explicitly reschedule during drmem_lmb list traversal mtd: lpddr: fix excessive stack usage with clang RDMA/ucma: Add missing locking around rdma_leave_multicast() RDMA/ucma: Fix locking for ctx->events_reported powerpc/icp-hv: Fix missing of_node_put() in success path powerpc/pseries: Fix missing of_node_put() in rng_init() IB/mlx4: Adjust delayed work when a dup is observed IB/mlx4: Fix starvation in paravirt mux/demux mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary mm/memcg: fix device private memcg accounting netfilter: nf_log: missing vlan offload tag and proto net: korina: fix kfree of rx/tx descriptor array ipvs: clear skb->tstamp in forwarding path mwifiex: fix double free platform/x86: mlx-platform: Remove PSU EEPROM configuration scsi: be2iscsi: Fix a theoretical leak in beiscsi_create_eqs() scsi: target: tcmu: Fix warning: 'page' may be used uninitialized usb: dwc2: Fix INTR OUT transfers in DDMA mode. 
nl80211: fix non-split wiphy information usb: gadget: u_ether: enable qmult on SuperSpeed Plus as well usb: gadget: f_ncm: fix ncm_bitrate for SuperSpeed and above. iwlwifi: mvm: split a print to avoid a WARNING in ROC mfd: sm501: Fix leaks in probe() net: enic: Cure the enic api locking trainwreck qtnfmac: fix resource leaks on unsupported iftype error return path HID: hid-input: fix stylus battery reporting slimbus: qcom-ngd-ctrl: disable ngd in qmi server down callback slimbus: core: do not enter to clock pause mode in core slimbus: core: check get_addr before removing laddr ida quota: clear padding in v2r1_mem2diskdqb() usb: dwc2: Fix parameter type in function pointer prototype ALSA: seq: oss: Avoid mutex lock for a long-time ioctl misc: mic: scif: Fix error handling path ath6kl: wmi: prevent a shift wrapping bug in ath6kl_wmi_delete_pstream_cmd() net: dsa: rtl8366rb: Support all 4096 VLANs net: dsa: rtl8366: Skip PVID setting if not requested net: dsa: rtl8366: Refactor VLAN/PVID init net: dsa: rtl8366: Check validity of passed VLANs cpufreq: armada-37xx: Add missing MODULE_DEVICE_TABLE net: stmmac: use netif_tx_start|stop_all_queues() function net/mlx5: Don't call timecounter cyc2time directly from 1PPS flow pinctrl: mcp23s08: Fix mcp23x17 precious range pinctrl: mcp23s08: Fix mcp23x17_regmap initialiser HID: roccat: add bounds checking in kone_sysfs_write_settings() video: fbdev: radeon: Fix memleak in radeonfb_pci_register video: fbdev: sis: fix null ptr dereference video: fbdev: vga16fb: fix setting of pixclock because a pass-by-value error drivers/virt/fsl_hypervisor: Fix error handling path pwm: lpss: Add range limit check for the base_unit register value pwm: lpss: Fix off by one error in base_unit math in pwm_lpss_prepare() pty: do tty_flip_buffer_push without port->lock in pty_write tty: hvcs: Don't NULL tty->driver_data until hvcs_cleanup() tty: serial: earlycon dependency VMCI: check return value of get_user_pages_fast() for errors backlight: 
sky81452-backlight: Fix refcount imbalance on error scsi: csiostor: Fix wrong return value in csio_hw_prep_fw() scsi: qla2xxx: Fix wrong return value in qla_nvme_register_hba() scsi: qla4xxx: Fix an error handling path in 'qla4xxx_get_host_stats()' drm/gma500: fix error check staging: rtl8192u: Do not use GFP_KERNEL in atomic context mwifiex: Do not use GFP_KERNEL in atomic context brcmfmac: check ndev pointer ASoC: qcom: lpass-cpu: fix concurrency issue ASoC: qcom: lpass-platform: fix memory leak wcn36xx: Fix reported 802.11n rx_highest rate wcn3660/wcn3680 ath10k: Fix the size used in a 'dma_free_coherent()' call in an error handling path ath9k: Fix potential out of bounds in ath9k_htc_txcompletion_cb() ath6kl: prevent potential array overflow in ath6kl_add_new_sta() Bluetooth: hci_uart: Cancel init work before unregistering ath10k: provide survey info as accumulated data spi: spi-s3c64xx: Check return values spi: spi-s3c64xx: swap s3c64xx_spi_set_cs() and s3c64xx_enable_datapath() pinctrl: bcm: fix kconfig dependency warning when !GPIOLIB regulator: resolve supply after creating regulator media: ti-vpe: Fix a missing check and reference count leak media: stm32-dcmi: Fix a reference count leak media: s5p-mfc: Fix a reference count leak media: camss: Fix a reference count leak. media: platform: fcp: Fix a reference count leak. media: rockchip/rga: Fix a reference count leak. media: rcar-vin: Fix a reference count leak. 
media: tc358743: cleanup tc358743_cec_isr media: tc358743: initialize variable media: mx2_emmaprp: Fix memleak in emmaprp_probe cypto: mediatek - fix leaks in mtk_desc_ring_alloc hwmon: (pmbus/max34440) Fix status register reads for MAX344{51,60,61} crypto: omap-sham - fix digcnt register handling with export/import media: omap3isp: Fix memleak in isp_probe media: uvcvideo: Silence shift-out-of-bounds warning media: uvcvideo: Set media controller entity functions media: m5mols: Check function pointer in m5mols_sensor_power media: Revert "media: exynos4-is: Add missed check for pinctrl_lookup_state()" media: tuner-simple: fix regression in simple_set_radio_freq crypto: picoxcell - Fix potential race condition bug crypto: ixp4xx - Fix the size used in a 'dma_free_coherent()' call crypto: mediatek - Fix wrong return value in mtk_desc_ring_alloc() crypto: algif_skcipher - EBUSY on aio should be an error x86/events/amd/iommu: Fix sizeof mismatch x86/nmi: Fix nmi_handle() duration miscalculation drivers/perf: xgene_pmu: Fix uninitialized resource struct x86/fpu: Allow multiple bits in clearcpuid= parameter EDAC/ti: Fix handling of platform_get_irq() error EDAC/i5100: Fix error handling order in i5100_init_one() crypto: algif_aead - Do not set MAY_BACKLOG on the async path ima: Don't ignore errors from crypto_shash_update() KVM: SVM: Initialize prev_ga_tag before use KVM: x86/mmu: Commit zap of remaining invalid pages when recovering lpages cifs: Return the error from crypt_message when enc/dec key not found. 
cifs: remove bogus debug code ALSA: hda/realtek: Enable audio jacks of ASUS D700SA with ALC887 icmp: randomize the global rate limiter r8169: fix operation under forced interrupt threading tcp: fix to update snd_wl1 in bulk receiver fast path nfc: Ensure presence of NFC_ATTR_FIRMWARE_NAME attribute in nfc_genl_fw_download() net/sched: act_tunnel_key: fix OOB write in case of IPv6 ERSPAN tunnels net: hdlc_raw_eth: Clear the IFF_TX_SKB_SHARING flag after calling ether_setup net: hdlc: In hdlc_rcv, check to make sure dev is an HDLC device chelsio/chtls: correct function return and return type chelsio/chtls: correct netdevice for vlan interface chelsio/chtls: fix socket lock ALSA: bebob: potential info leak in hwdep_read() binder: fix UAF when releasing todo list net/tls: sendfile fails with ktls offload r8169: fix data corruption issue on RTL8402 net/ipv4: always honour route mtu during forwarding tipc: fix the skb_unshare() in tipc_buf_append() net: usb: qmi_wwan: add Cellient MPL200 card net/smc: fix valid DMBE buffer sizes net: fix pos incrementment in ipv6_route_seq_next net: fec: Fix PHY init after phy_reset_after_clk_enable() net: fec: Fix phy_device lookup for phy_reset_after_clk_enable() mlx4: handle non-napi callers to napi_poll ipv4: Restore flowi4_oif update before call to xfrm_lookup_route ibmveth: Identify ingress large send packets. ibmveth: Switch order of ibmveth_helper calls. 
ANDROID: clang: update to 11.0.5 FROMLIST: arm64: link with -z norelro regardless of CONFIG_RELOCATABLE ANDROID: GKI: enable CONFIG_WIREGUARD UPSTREAM: wireguard: peerlookup: take lock before checking hash in replace operation UPSTREAM: wireguard: noise: take lock when removing handshake entry from table UPSTREAM: wireguard: queueing: make use of ip_tunnel_parse_protocol UPSTREAM: net: ip_tunnel: add header_ops for layer 3 devices UPSTREAM: wireguard: receive: account for napi_gro_receive never returning GRO_DROP UPSTREAM: wireguard: device: avoid circular netns references UPSTREAM: wireguard: noise: do not assign initiation time in if condition UPSTREAM: wireguard: noise: separate receive counter from send counter UPSTREAM: wireguard: queueing: preserve flow hash across packet scrubbing UPSTREAM: wireguard: noise: read preshared key while taking lock UPSTREAM: wireguard: selftests: use newer iproute2 for gcc-10 UPSTREAM: wireguard: send/receive: use explicit unlikely branch instead of implicit coalescing UPSTREAM: wireguard: selftests: initalize ipv6 members to NULL to squelch clang warning UPSTREAM: wireguard: send/receive: cond_resched() when processing worker ringbuffers UPSTREAM: wireguard: socket: remove errant restriction on looping to self UPSTREAM: wireguard: selftests: use normal kernel stack size on ppc64 UPSTREAM: wireguard: receive: use tunnel helpers for decapsulating ECN markings UPSTREAM: wireguard: queueing: cleanup ptr_ring in error path of packet_queue_init UPSTREAM: wireguard: send: remove errant newline from packet_encrypt_worker UPSTREAM: wireguard: noise: error out precomputed DH during handshake rather than config UPSTREAM: wireguard: receive: remove dead code from default packet type case UPSTREAM: wireguard: queueing: account for skb->protocol==0 UPSTREAM: wireguard: selftests: remove duplicated include <sys/types.h> UPSTREAM: wireguard: socket: remove extra call to synchronize_net UPSTREAM: wireguard: send: account for mtu=0 devices 
UPSTREAM: wireguard: receive: reset last_under_load to zero UPSTREAM: wireguard: selftests: reduce complexity and fix make races UPSTREAM: wireguard: device: use icmp_ndo_send helper UPSTREAM: wireguard: selftests: tie socket waiting to target pid UPSTREAM: wireguard: selftests: ensure non-addition of peers with failed precomputation UPSTREAM: wireguard: noise: reject peers with low order public keys UPSTREAM: wireguard: allowedips: fix use-after-free in root_remove_peer_lists UPSTREAM: net: skbuff: disambiguate argument and member for skb_list_walk_safe helper UPSTREAM: net: introduce skb_list_walk_safe for skb segment walking UPSTREAM: wireguard: socket: mark skbs as not on list when receiving via gro UPSTREAM: wireguard: queueing: do not account for pfmemalloc when clearing skb header UPSTREAM: wireguard: selftests: remove ancient kernel compatibility code UPSTREAM: wireguard: allowedips: use kfree_rcu() instead of call_rcu() UPSTREAM: wireguard: main: remove unused include <linux/version.h> UPSTREAM: wireguard: global: fix spelling mistakes in comments UPSTREAM: wireguard: Kconfig: select parent dependency for crypto UPSTREAM: wireguard: selftests: import harness makefile for test suite UPSTREAM: net: WireGuard secure network tunnel UPSTREAM: timekeeping: Boot should be boottime for coarse ns accessor UPSTREAM: timekeeping: Add missing _ns functions for coarse accessors UPSTREAM: icmp: introduce helper for nat'd source address in network device context UPSTREAM: crypto: poly1305-x86_64 - Use XORL r32,32 UPSTREAM: crypto: curve25519-x86_64 - Use XORL r32,32 UPSTREAM: crypto: arm/poly1305 - Add prototype for poly1305_blocks_neon UPSTREAM: crypto: arm/curve25519 - include <linux/scatterlist.h> UPSTREAM: crypto: x86/curve25519 - Remove unused carry variables UPSTREAM: crypto: x86/chacha-sse3 - use unaligned loads for state array UPSTREAM: crypto: lib/chacha20poly1305 - Add missing function declaration UPSTREAM: crypto: arch/lib - limit simd usage to 4k chunks 
UPSTREAM: crypto: arm[64]/poly1305 - add artifact to .gitignore files UPSTREAM: crypto: x86/curve25519 - leave r12 as spare register UPSTREAM: crypto: x86/curve25519 - replace with formally verified implementation UPSTREAM: crypto: arm64/chacha - correctly walk through blocks UPSTREAM: crypto: x86/curve25519 - support assemblers with no adx support UPSTREAM: crypto: chacha20poly1305 - prevent integer overflow on large input UPSTREAM: crypto: Kconfig - allow tests to be disabled when manager is disabled UPSTREAM: crypto: arm/chacha - fix build failured when kernel mode NEON is disabled UPSTREAM: crypto: x86/poly1305 - emit does base conversion itself UPSTREAM: crypto: chacha20poly1305 - add back missing test vectors and test chunking UPSTREAM: crypto: x86/poly1305 - fix .gitignore typo UPSTREAM: crypto: curve25519 - Fix selftest build error UPSTREAM: crypto: {arm,arm64,mips}/poly1305 - remove redundant non-reduction from emit UPSTREAM: crypto: x86/poly1305 - wire up faster implementations for kernel UPSTREAM: crypto: x86/poly1305 - import unmodified cryptogams implementation UPSTREAM: crypto: poly1305 - add new 32 and 64-bit generic versions UPSTREAM: crypto: lib/curve25519 - re-add selftests UPSTREAM: crypto: arm/curve25519 - add arch-specific key generation function UPSTREAM: crypto: chacha - fix warning message in header file UPSTREAM: crypto: arch - conditionalize crypto api in arch glue for lib code UPSTREAM: crypto: lib/chacha20poly1305 - use chacha20_crypt() UPSTREAM: crypto: x86/chacha - only unregister algorithms if registered UPSTREAM: crypto: chacha_generic - remove unnecessary setkey() functions UPSTREAM: crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine UPSTREAM: crypto: chacha20poly1305 - import construction and selftest from Zinc UPSTREAM: crypto: arm/curve25519 - wire up NEON implementation UPSTREAM: crypto: arm/curve25519 - import Bernstein and Schwabe's Curve25519 ARM implementation UPSTREAM: crypto: curve25519 - x86_64 library and 
KPP implementations UPSTREAM: crypto: lib/curve25519 - work around Clang stack spilling issue UPSTREAM: crypto: curve25519 - implement generic KPP driver UPSTREAM: crypto: curve25519 - add kpp selftest UPSTREAM: crypto: curve25519 - generic C library implementations UPSTREAM: crypto: blake2s - x86_64 SIMD implementation UPSTREAM: crypto: blake2s - implement generic shash driver UPSTREAM: crypto: testmgr - add test cases for Blake2s UPSTREAM: crypto: blake2s - generic C library implementation and selftest UPSTREAM: crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation UPSTREAM: crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation UPSTREAM: crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation UPSTREAM: crypto: x86/poly1305 - expose existing driver as poly1305 library UPSTREAM: crypto: x86/poly1305 - depend on generic library not generic shash UPSTREAM: crypto: poly1305 - expose init/update/final library interface UPSTREAM: crypto: x86/poly1305 - unify Poly1305 state struct with generic code UPSTREAM: crypto: poly1305 - move core routines into a separate library UPSTREAM: crypto: chacha - unexport chacha_generic routines UPSTREAM: crypto: mips/chacha - wire up accelerated 32r2 code from Zinc UPSTREAM: crypto: mips/chacha - import 32r2 ChaCha code from Zinc UPSTREAM: crypto: arm/chacha - expose ARM ChaCha routine as library function UPSTREAM: crypto: arm/chacha - remove dependency on generic ChaCha driver UPSTREAM: crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code UPSTREAM: crypto: arm64/chacha - expose arm64 ChaCha routine as library function UPSTREAM: crypto: arm64/chacha - depend on generic chacha library instead of crypto driver UPSTREAM: crypto: arm64/chacha - use combined SIMD/ALU routine for more speed UPSTREAM: crypto: arm64/chacha - optimize for arbitrary length inputs UPSTREAM: crypto: x86/chacha - expose SIMD ChaCha routine as library function UPSTREAM: crypto: 
x86/chacha - depend on generic chacha library instead of crypto driver UPSTREAM: crypto: chacha - move existing library code into lib/crypto UPSTREAM: crypto: lib - tidy up lib/crypto Kconfig and Makefile UPSTREAM: crypto: chacha - constify ctx and iv arguments UPSTREAM: crypto: x86/poly1305 - Clear key material from stack in SSE2 variant UPSTREAM: crypto: xchacha20 - fix comments for test vectors UPSTREAM: crypto: xchacha - add test vector from XChaCha20 draft RFC UPSTREAM: crypto: arm64/chacha - add XChaCha12 support UPSTREAM: crypto: arm64/chacha20 - refactor to allow varying number of rounds UPSTREAM: crypto: arm64/chacha20 - add XChaCha20 support UPSTREAM: crypto: x86/chacha - avoid sleeping under kernel_fpu_begin() UPSTREAM: crypto: x86/chacha - yield the FPU occasionally UPSTREAM: crypto: x86/chacha - add XChaCha12 support UPSTREAM: crypto: x86/chacha20 - refactor to allow varying number of rounds UPSTREAM: crypto: x86/chacha20 - add XChaCha20 support UPSTREAM: crypto: x86/chacha20 - Add a 4-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 2-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 8-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 4-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Add a 2-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Use larger block functions more aggressively UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant ANDROID: GKI: Enable CONFIG_USB_ANNOUNCE_NEW_DEVICES ANDROID: GKI: Enable CONFIG_X86_X2APIC ANDROID: move builds to use gas prebuilts UPSTREAM: binder: fix UAF when releasing todo list Conflicts: crypto/algif_aead.c drivers/rpmsg/qcom_glink_native.c drivers/scsi/ufs/ufs-qcom.c drivers/slimbus/qcom-ngd-ctrl.c fs/notify/inotify/inotify_user.c include/linux/dcache.h 
include/linux/fsnotify.h mm/oom_kill.c Fixed build errors: fs/fuse/dir.c Change-Id: I95bdbb1b183fa2c569023f18e09799d9cb96fc9f Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
This commit is contained in:
commit
20912a8acc
662 changed files with 51036 additions and 5409 deletions
|
@ -566,7 +566,7 @@
|
||||||
loops can be debugged more effectively on production
|
loops can be debugged more effectively on production
|
||||||
systems.
|
systems.
|
||||||
|
|
||||||
clearcpuid=BITNUM [X86]
|
clearcpuid=BITNUM[,BITNUM...] [X86]
|
||||||
Disable CPUID feature X for the kernel. See
|
Disable CPUID feature X for the kernel. See
|
||||||
arch/x86/include/asm/cpufeatures.h for the valid bit
|
arch/x86/include/asm/cpufeatures.h for the valid bit
|
||||||
numbers. Note the Linux specific bits are not necessarily
|
numbers. Note the Linux specific bits are not necessarily
|
||||||
|
@ -5302,6 +5302,14 @@
|
||||||
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
||||||
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
||||||
|
|
||||||
|
xen.event_eoi_delay= [XEN]
|
||||||
|
How long to delay EOI handling in case of event
|
||||||
|
storms (jiffies). Default is 10.
|
||||||
|
|
||||||
|
xen.event_loop_timeout= [XEN]
|
||||||
|
After which time (jiffies) the event handling loop
|
||||||
|
should start to delay EOI handling. Default is 2.
|
||||||
|
|
||||||
xirc2ps_cs= [NET,PCMCIA]
|
xirc2ps_cs= [NET,PCMCIA]
|
||||||
Format:
|
Format:
|
||||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||||
|
|
|
@ -99,16 +99,20 @@ Coarse and fast_ns access
|
||||||
|
|
||||||
Some additional variants exist for more specialized cases:
|
Some additional variants exist for more specialized cases:
|
||||||
|
|
||||||
.. c:function:: ktime_t ktime_get_coarse_boottime( void )
|
.. c:function:: ktime_t ktime_get_coarse( void )
|
||||||
|
ktime_t ktime_get_coarse_boottime( void )
|
||||||
ktime_t ktime_get_coarse_real( void )
|
ktime_t ktime_get_coarse_real( void )
|
||||||
ktime_t ktime_get_coarse_clocktai( void )
|
ktime_t ktime_get_coarse_clocktai( void )
|
||||||
ktime_t ktime_get_coarse_raw( void )
|
|
||||||
|
.. c:function:: u64 ktime_get_coarse_ns( void )
|
||||||
|
u64 ktime_get_coarse_boottime_ns( void )
|
||||||
|
u64 ktime_get_coarse_real_ns( void )
|
||||||
|
u64 ktime_get_coarse_clocktai_ns( void )
|
||||||
|
|
||||||
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
|
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
|
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_real_ts64( struct timespec64 * )
|
void ktime_get_coarse_real_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
|
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_raw_ts64( struct timespec64 * )
|
|
||||||
|
|
||||||
These are quicker than the non-coarse versions, but less accurate,
|
These are quicker than the non-coarse versions, but less accurate,
|
||||||
corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
|
corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
|
||||||
|
|
|
@ -29,8 +29,7 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
:c:type:`v4l2_hsv_encoding` specifies which encoding is used.
|
:c:type:`v4l2_hsv_encoding` specifies which encoding is used.
|
||||||
|
|
||||||
.. note:: The default R'G'B' quantization is full range for all
|
.. note:: The default R'G'B' quantization is full range for all
|
||||||
colorspaces except for BT.2020 which uses limited range R'G'B'
|
colorspaces. HSV formats are always full range.
|
||||||
quantization.
|
|
||||||
|
|
||||||
.. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
|
.. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
|
||||||
|
|
||||||
|
@ -162,8 +161,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
- Details
|
- Details
|
||||||
* - ``V4L2_QUANTIZATION_DEFAULT``
|
* - ``V4L2_QUANTIZATION_DEFAULT``
|
||||||
- Use the default quantization encoding as defined by the
|
- Use the default quantization encoding as defined by the
|
||||||
colorspace. This is always full range for R'G'B' (except for the
|
colorspace. This is always full range for R'G'B' and HSV.
|
||||||
BT.2020 colorspace) and HSV. It is usually limited range for Y'CbCr.
|
It is usually limited range for Y'CbCr.
|
||||||
* - ``V4L2_QUANTIZATION_FULL_RANGE``
|
* - ``V4L2_QUANTIZATION_FULL_RANGE``
|
||||||
- Use the full range quantization encoding. I.e. the range [0…1] is
|
- Use the full range quantization encoding. I.e. the range [0…1] is
|
||||||
mapped to [0…255] (with possible clipping to [1…254] to avoid the
|
mapped to [0…255] (with possible clipping to [1…254] to avoid the
|
||||||
|
@ -173,4 +172,4 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
* - ``V4L2_QUANTIZATION_LIM_RANGE``
|
* - ``V4L2_QUANTIZATION_LIM_RANGE``
|
||||||
- Use the limited range quantization encoding. I.e. the range [0…1]
|
- Use the limited range quantization encoding. I.e. the range [0…1]
|
||||||
is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
|
is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
|
||||||
[16…240].
|
[16…240]. Limited Range cannot be used with HSV.
|
||||||
|
|
|
@ -370,9 +370,8 @@ Colorspace BT.2020 (V4L2_COLORSPACE_BT2020)
|
||||||
The :ref:`itu2020` standard defines the colorspace used by Ultra-high
|
The :ref:`itu2020` standard defines the colorspace used by Ultra-high
|
||||||
definition television (UHDTV). The default transfer function is
|
definition television (UHDTV). The default transfer function is
|
||||||
``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
|
``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
|
||||||
``V4L2_YCBCR_ENC_BT2020``. The default R'G'B' quantization is limited
|
``V4L2_YCBCR_ENC_BT2020``. The default Y'CbCr quantization is limited range.
|
||||||
range (!), and so is the default Y'CbCr quantization. The chromaticities
|
The chromaticities of the primary colors and the white reference are:
|
||||||
of the primary colors and the white reference are:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -949,12 +949,14 @@ icmp_ratelimit - INTEGER
|
||||||
icmp_msgs_per_sec - INTEGER
|
icmp_msgs_per_sec - INTEGER
|
||||||
Limit maximal number of ICMP packets sent per second from this host.
|
Limit maximal number of ICMP packets sent per second from this host.
|
||||||
Only messages whose type matches icmp_ratemask (see below) are
|
Only messages whose type matches icmp_ratemask (see below) are
|
||||||
controlled by this limit.
|
controlled by this limit. For security reasons, the precise count
|
||||||
|
of messages per second is randomized.
|
||||||
Default: 1000
|
Default: 1000
|
||||||
|
|
||||||
icmp_msgs_burst - INTEGER
|
icmp_msgs_burst - INTEGER
|
||||||
icmp_msgs_per_sec controls number of ICMP packets sent per second,
|
icmp_msgs_per_sec controls number of ICMP packets sent per second,
|
||||||
while icmp_msgs_burst controls the burst size of these packets.
|
while icmp_msgs_burst controls the burst size of these packets.
|
||||||
|
For security reasons, the precise burst size is randomized.
|
||||||
Default: 50
|
Default: 50
|
||||||
|
|
||||||
icmp_ratemask - INTEGER
|
icmp_ratemask - INTEGER
|
||||||
|
|
|
@ -3907,6 +3907,7 @@ F: crypto/
|
||||||
F: drivers/crypto/
|
F: drivers/crypto/
|
||||||
F: include/crypto/
|
F: include/crypto/
|
||||||
F: include/linux/crypto*
|
F: include/linux/crypto*
|
||||||
|
F: lib/crypto/
|
||||||
|
|
||||||
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
|
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
|
||||||
M: Neil Horman <nhorman@tuxdriver.com>
|
M: Neil Horman <nhorman@tuxdriver.com>
|
||||||
|
@ -15890,6 +15891,14 @@ L: linux-gpio@vger.kernel.org
|
||||||
S: Maintained
|
S: Maintained
|
||||||
F: drivers/gpio/gpio-ws16c48.c
|
F: drivers/gpio/gpio-ws16c48.c
|
||||||
|
|
||||||
|
WIREGUARD SECURE NETWORK TUNNEL
|
||||||
|
M: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||||
|
S: Maintained
|
||||||
|
F: drivers/net/wireguard/
|
||||||
|
F: tools/testing/selftests/wireguard/
|
||||||
|
L: wireguard@lists.zx2c4.com
|
||||||
|
L: netdev@vger.kernel.org
|
||||||
|
|
||||||
WISTRON LAPTOP BUTTON DRIVER
|
WISTRON LAPTOP BUTTON DRIVER
|
||||||
M: Miloslav Trmac <mitr@volny.cz>
|
M: Miloslav Trmac <mitr@volny.cz>
|
||||||
S: Maintained
|
S: Maintained
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -1,7 +1,7 @@
|
||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
VERSION = 4
|
VERSION = 4
|
||||||
PATCHLEVEL = 19
|
PATCHLEVEL = 19
|
||||||
SUBLEVEL = 152
|
SUBLEVEL = 157
|
||||||
EXTRAVERSION =
|
EXTRAVERSION =
|
||||||
NAME = "People's Front"
|
NAME = "People's Front"
|
||||||
|
|
||||||
|
@ -505,11 +505,7 @@ endif
|
||||||
|
|
||||||
ifeq ($(cc-name),clang)
|
ifeq ($(cc-name),clang)
|
||||||
ifneq ($(CROSS_COMPILE),)
|
ifneq ($(CROSS_COMPILE),)
|
||||||
CLANG_TRIPLE ?= $(CROSS_COMPILE)
|
CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
|
||||||
CLANG_FLAGS += --target=$(notdir $(CLANG_TRIPLE:%-=%))
|
|
||||||
ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
|
|
||||||
$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
|
|
||||||
endif
|
|
||||||
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
|
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
|
||||||
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
|
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
|
||||||
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
|
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -2348,6 +2348,7 @@
|
||||||
__sock_recv_ts_and_drops
|
__sock_recv_ts_and_drops
|
||||||
sock_wake_async
|
sock_wake_async
|
||||||
sock_wfree
|
sock_wfree
|
||||||
|
timer_reduce
|
||||||
unregister_net_sysctl_table
|
unregister_net_sysctl_table
|
||||||
__wake_up_sync_key
|
__wake_up_sync_key
|
||||||
__xfrm_policy_check
|
__xfrm_policy_check
|
||||||
|
|
|
@ -366,6 +366,13 @@ config HAVE_RCU_TABLE_FREE
|
||||||
config HAVE_RCU_TABLE_INVALIDATE
|
config HAVE_RCU_TABLE_INVALIDATE
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
|
||||||
|
bool
|
||||||
|
help
|
||||||
|
Temporary select until all architectures can be converted to have
|
||||||
|
irqs disabled over activate_mm. Architectures that do IPI based TLB
|
||||||
|
shootdowns should enable this.
|
||||||
|
|
||||||
config ARCH_HAVE_NMI_SAFE_CMPXCHG
|
config ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
|
|
@ -156,6 +156,7 @@ END(EV_Extension)
|
||||||
tracesys:
|
tracesys:
|
||||||
; save EFA in case tracer wants the PC of traced task
|
; save EFA in case tracer wants the PC of traced task
|
||||||
; using ERET won't work since next-PC has already committed
|
; using ERET won't work since next-PC has already committed
|
||||||
|
lr r12, [efa]
|
||||||
GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11
|
GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11
|
||||||
st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address
|
st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address
|
||||||
|
|
||||||
|
@ -198,9 +199,15 @@ tracesys_exit:
|
||||||
; Breakpoint TRAP
|
; Breakpoint TRAP
|
||||||
; ---------------------------------------------
|
; ---------------------------------------------
|
||||||
trap_with_param:
|
trap_with_param:
|
||||||
mov r0, r12 ; EFA in case ptracer/gdb wants stop_pc
|
|
||||||
|
; stop_pc info by gdb needs this info
|
||||||
|
lr r0, [efa]
|
||||||
mov r1, sp
|
mov r1, sp
|
||||||
|
|
||||||
|
; Now that we have read EFA, it is safe to do "fake" rtie
|
||||||
|
; and get out of CPU exception mode
|
||||||
|
FAKE_RET_FROM_EXCPN
|
||||||
|
|
||||||
; Save callee regs in case gdb wants to have a look
|
; Save callee regs in case gdb wants to have a look
|
||||||
; SP will grow up by size of CALLEE Reg-File
|
; SP will grow up by size of CALLEE Reg-File
|
||||||
; NOTE: clobbers r12
|
; NOTE: clobbers r12
|
||||||
|
@ -227,10 +234,6 @@ ENTRY(EV_Trap)
|
||||||
|
|
||||||
EXCEPTION_PROLOGUE
|
EXCEPTION_PROLOGUE
|
||||||
|
|
||||||
lr r12, [efa]
|
|
||||||
|
|
||||||
FAKE_RET_FROM_EXCPN
|
|
||||||
|
|
||||||
;============ TRAP 1 :breakpoints
|
;============ TRAP 1 :breakpoints
|
||||||
; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
|
; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
|
||||||
bmsk.f 0, r9, 7
|
bmsk.f 0, r9, 7
|
||||||
|
@ -238,6 +241,9 @@ ENTRY(EV_Trap)
|
||||||
|
|
||||||
;============ TRAP (no param): syscall top level
|
;============ TRAP (no param): syscall top level
|
||||||
|
|
||||||
|
; First return from Exception to pure K mode (Exception/IRQs renabled)
|
||||||
|
FAKE_RET_FROM_EXCPN
|
||||||
|
|
||||||
; If syscall tracing ongoing, invoke pre-post-hooks
|
; If syscall tracing ongoing, invoke pre-post-hooks
|
||||||
GET_CURR_THR_INFO_FLAGS r10
|
GET_CURR_THR_INFO_FLAGS r10
|
||||||
btst r10, TIF_SYSCALL_TRACE
|
btst r10, TIF_SYSCALL_TRACE
|
||||||
|
|
|
@ -115,7 +115,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
|
||||||
int (*consumer_fn) (unsigned int, void *), void *arg)
|
int (*consumer_fn) (unsigned int, void *), void *arg)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_ARC_DW2_UNWIND
|
#ifdef CONFIG_ARC_DW2_UNWIND
|
||||||
int ret = 0;
|
int ret = 0, cnt = 0;
|
||||||
unsigned int address;
|
unsigned int address;
|
||||||
struct unwind_frame_info frame_info;
|
struct unwind_frame_info frame_info;
|
||||||
|
|
||||||
|
@ -135,6 +135,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
|
||||||
break;
|
break;
|
||||||
|
|
||||||
frame_info.regs.r63 = frame_info.regs.r31;
|
frame_info.regs.r63 = frame_info.regs.r31;
|
||||||
|
|
||||||
|
if (cnt++ > 128) {
|
||||||
|
printk("unwinder looping too long, aborting !\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return address; /* return the last address it saw */
|
return address; /* return the last address it saw */
|
||||||
|
|
|
@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
|
||||||
select ARC_HAS_ACCL_REGS
|
select ARC_HAS_ACCL_REGS
|
||||||
select ARC_IRQ_NO_AUTOSAVE
|
select ARC_IRQ_NO_AUTOSAVE
|
||||||
select CLK_HSDK
|
select CLK_HSDK
|
||||||
|
select RESET_CONTROLLER
|
||||||
select RESET_HSDK
|
select RESET_HSDK
|
||||||
select MIGHT_HAVE_PCI
|
select MIGHT_HAVE_PCI
|
||||||
|
|
|
@ -622,8 +622,10 @@ config ARCH_S3C24XX
|
||||||
select HAVE_S3C2410_WATCHDOG if WATCHDOG
|
select HAVE_S3C2410_WATCHDOG if WATCHDOG
|
||||||
select HAVE_S3C_RTC if RTC_CLASS
|
select HAVE_S3C_RTC if RTC_CLASS
|
||||||
select NEED_MACH_IO_H
|
select NEED_MACH_IO_H
|
||||||
|
select S3C2410_WATCHDOG
|
||||||
select SAMSUNG_ATAGS
|
select SAMSUNG_ATAGS
|
||||||
select USE_OF
|
select USE_OF
|
||||||
|
select WATCHDOG
|
||||||
help
|
help
|
||||||
Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
|
Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
|
||||||
and S3C2450 SoCs based systems, such as the Simtec Electronics BAST
|
and S3C2450 SoCs based systems, such as the Simtec Electronics BAST
|
||||||
|
|
|
@ -922,8 +922,10 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
rngb: rngb@21b4000 {
|
rngb: rngb@21b4000 {
|
||||||
|
compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
|
||||||
reg = <0x021b4000 0x4000>;
|
reg = <0x021b4000 0x4000>;
|
||||||
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
|
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
|
||||||
|
clocks = <&clks IMX6SL_CLK_DUMMY>;
|
||||||
};
|
};
|
||||||
|
|
||||||
weim: weim@21b8000 {
|
weim: weim@21b8000 {
|
||||||
|
|
|
@ -192,6 +192,7 @@
|
||||||
fixed-link {
|
fixed-link {
|
||||||
speed = <1000>;
|
speed = <1000>;
|
||||||
full-duplex;
|
full-duplex;
|
||||||
|
pause;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -516,7 +516,7 @@
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
target-module@56000000 {
|
sgx_module: target-module@56000000 {
|
||||||
compatible = "ti,sysc-omap4", "ti,sysc";
|
compatible = "ti,sysc-omap4", "ti,sysc";
|
||||||
ti,hwmods = "gpu";
|
ti,hwmods = "gpu";
|
||||||
reg = <0x5601fc00 0x4>,
|
reg = <0x5601fc00 0x4>,
|
||||||
|
|
|
@ -74,3 +74,13 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
/include/ "omap443x-clocks.dtsi"
|
/include/ "omap443x-clocks.dtsi"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use dpll_per for sgx at 153.6MHz like droid4 stock v3.0.8 Android kernel
|
||||||
|
*/
|
||||||
|
&sgx_module {
|
||||||
|
assigned-clocks = <&l3_gfx_clkctrl OMAP4_GPU_CLKCTRL 24>,
|
||||||
|
<&dpll_per_m7x2_ck>;
|
||||||
|
assigned-clock-rates = <0>, <153600000>;
|
||||||
|
assigned-clock-parents = <&dpll_per_m7x2_ck>;
|
||||||
|
};
|
||||||
|
|
|
@ -85,21 +85,21 @@
|
||||||
global_timer: timer@b0020200 {
|
global_timer: timer@b0020200 {
|
||||||
compatible = "arm,cortex-a9-global-timer";
|
compatible = "arm,cortex-a9-global-timer";
|
||||||
reg = <0xb0020200 0x100>;
|
reg = <0xb0020200 0x100>;
|
||||||
interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
twd_timer: timer@b0020600 {
|
twd_timer: timer@b0020600 {
|
||||||
compatible = "arm,cortex-a9-twd-timer";
|
compatible = "arm,cortex-a9-twd-timer";
|
||||||
reg = <0xb0020600 0x20>;
|
reg = <0xb0020600 0x20>;
|
||||||
interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
twd_wdt: wdt@b0020620 {
|
twd_wdt: wdt@b0020620 {
|
||||||
compatible = "arm,cortex-a9-twd-wdt";
|
compatible = "arm,cortex-a9-twd-wdt";
|
||||||
reg = <0xb0020620 0xe0>;
|
reg = <0xb0020620 0xe0>;
|
||||||
interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -98,20 +98,17 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
clocks: clock-controller@e0100000 {
|
clocks: clock-controller@e0100000 {
|
||||||
compatible = "samsung,s5pv210-clock", "simple-bus";
|
compatible = "samsung,s5pv210-clock";
|
||||||
reg = <0xe0100000 0x10000>;
|
reg = <0xe0100000 0x10000>;
|
||||||
clock-names = "xxti", "xusbxti";
|
clock-names = "xxti", "xusbxti";
|
||||||
clocks = <&xxti>, <&xusbxti>;
|
clocks = <&xxti>, <&xusbxti>;
|
||||||
#clock-cells = <1>;
|
#clock-cells = <1>;
|
||||||
#address-cells = <1>;
|
};
|
||||||
#size-cells = <1>;
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
pmu_syscon: syscon@e0108000 {
|
pmu_syscon: syscon@e0108000 {
|
||||||
compatible = "samsung-s5pv210-pmu", "syscon";
|
compatible = "samsung-s5pv210-pmu", "syscon";
|
||||||
reg = <0xe0108000 0x8000>;
|
reg = <0xe0108000 0x8000>;
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
pinctrl0: pinctrl@e0200000 {
|
pinctrl0: pinctrl@e0200000 {
|
||||||
compatible = "samsung,s5pv210-pinctrl";
|
compatible = "samsung,s5pv210-pinctrl";
|
||||||
|
@ -126,12 +123,6 @@
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
amba {
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
compatible = "simple-bus";
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
pdma0: dma@e0900000 {
|
pdma0: dma@e0900000 {
|
||||||
compatible = "arm,pl330", "arm,primecell";
|
compatible = "arm,pl330", "arm,primecell";
|
||||||
reg = <0xe0900000 0x1000>;
|
reg = <0xe0900000 0x1000>;
|
||||||
|
@ -155,7 +146,6 @@
|
||||||
#dma-channels = <8>;
|
#dma-channels = <8>;
|
||||||
#dma-requests = <32>;
|
#dma-requests = <32>;
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
spi0: spi@e1300000 {
|
spi0: spi@e1300000 {
|
||||||
compatible = "samsung,s5pv210-spi";
|
compatible = "samsung,s5pv210-spi";
|
||||||
|
@ -227,12 +217,6 @@
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
audio-subsystem {
|
|
||||||
compatible = "samsung,s5pv210-audss", "simple-bus";
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
clk_audss: clock-controller@eee10000 {
|
clk_audss: clock-controller@eee10000 {
|
||||||
compatible = "samsung,s5pv210-audss-clock";
|
compatible = "samsung,s5pv210-audss-clock";
|
||||||
reg = <0xeee10000 0x1000>;
|
reg = <0xeee10000 0x1000>;
|
||||||
|
@ -264,7 +248,6 @@
|
||||||
#sound-dai-cells = <0>;
|
#sound-dai-cells = <0>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
i2s1: i2s@e2100000 {
|
i2s1: i2s@e2100000 {
|
||||||
compatible = "samsung,s3c6410-i2s";
|
compatible = "samsung,s3c6410-i2s";
|
||||||
|
|
|
@ -143,7 +143,7 @@
|
||||||
trips {
|
trips {
|
||||||
cpu_alert0: cpu-alert0 {
|
cpu_alert0: cpu-alert0 {
|
||||||
/* milliCelsius */
|
/* milliCelsius */
|
||||||
temperature = <850000>;
|
temperature = <85000>;
|
||||||
hysteresis = <2000>;
|
hysteresis = <2000>;
|
||||||
type = "passive";
|
type = "passive";
|
||||||
};
|
};
|
||||||
|
|
|
@ -206,16 +206,16 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dc1sw {
|
®_dc1sw {
|
||||||
regulator-min-microvolt = <3000000>;
|
regulator-min-microvolt = <3300000>;
|
||||||
regulator-max-microvolt = <3000000>;
|
regulator-max-microvolt = <3300000>;
|
||||||
regulator-name = "vcc-gmac-phy";
|
regulator-name = "vcc-gmac-phy";
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dcdc1 {
|
®_dcdc1 {
|
||||||
regulator-always-on;
|
regulator-always-on;
|
||||||
regulator-min-microvolt = <3000000>;
|
regulator-min-microvolt = <3300000>;
|
||||||
regulator-max-microvolt = <3000000>;
|
regulator-max-microvolt = <3300000>;
|
||||||
regulator-name = "vcc-3v0";
|
regulator-name = "vcc-3v3";
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dcdc2 {
|
®_dcdc2 {
|
||||||
|
|
1
arch/arm/crypto/.gitignore
vendored
1
arch/arm/crypto/.gitignore
vendored
|
@ -1,3 +1,4 @@
|
||||||
aesbs-core.S
|
aesbs-core.S
|
||||||
sha256-core.S
|
sha256-core.S
|
||||||
sha512-core.S
|
sha512-core.S
|
||||||
|
poly1305-core.S
|
||||||
|
|
|
@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
|
||||||
select CRYPTO_HASH
|
select CRYPTO_HASH
|
||||||
|
|
||||||
config CRYPTO_CHACHA20_NEON
|
config CRYPTO_CHACHA20_NEON
|
||||||
tristate "NEON accelerated ChaCha stream cipher algorithms"
|
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
|
||||||
depends on KERNEL_MODE_NEON
|
|
||||||
select CRYPTO_BLKCIPHER
|
select CRYPTO_BLKCIPHER
|
||||||
select CRYPTO_CHACHA20
|
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||||
|
|
||||||
|
config CRYPTO_POLY1305_ARM
|
||||||
|
tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
|
||||||
|
select CRYPTO_HASH
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||||
|
|
||||||
config CRYPTO_NHPOLY1305_NEON
|
config CRYPTO_NHPOLY1305_NEON
|
||||||
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
|
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
|
||||||
depends on KERNEL_MODE_NEON
|
depends on KERNEL_MODE_NEON
|
||||||
select CRYPTO_NHPOLY1305
|
select CRYPTO_NHPOLY1305
|
||||||
|
|
||||||
|
config CRYPTO_CURVE25519_NEON
|
||||||
|
tristate "NEON accelerated Curve25519 scalar multiplication library"
|
||||||
|
depends on KERNEL_MODE_NEON
|
||||||
|
select CRYPTO_LIB_CURVE25519_GENERIC
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
|
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
|
||||||
|
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
|
||||||
|
|
||||||
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
||||||
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
||||||
|
@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
|
||||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
||||||
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
||||||
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
||||||
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
chacha-neon-y := chacha-scalar-core.o chacha-glue.o
|
||||||
|
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
|
||||||
|
poly1305-arm-y := poly1305-core.o poly1305-glue.o
|
||||||
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
|
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
|
||||||
|
curve25519-neon-y := curve25519-core.o curve25519-glue.o
|
||||||
|
|
||||||
ifdef REGENERATE_ARM_CRYPTO
|
ifdef REGENERATE_ARM_CRYPTO
|
||||||
quiet_cmd_perl = PERL $@
|
quiet_cmd_perl = PERL $@
|
||||||
cmd_perl = $(PERL) $(<) > $(@)
|
cmd_perl = $(PERL) $(<) > $(@)
|
||||||
|
|
||||||
|
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
|
||||||
|
$(call cmd,perl)
|
||||||
|
|
||||||
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
||||||
$(call cmd,perl)
|
$(call cmd,perl)
|
||||||
|
|
||||||
|
@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
|
||||||
$(call cmd,perl)
|
$(call cmd,perl)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
targets += sha256-core.S sha512-core.S
|
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||||
|
|
||||||
|
# massage the perlasm code a bit so we only get the NEON routine if we need it
|
||||||
|
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
|
||||||
|
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
|
||||||
|
AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
|
||||||
|
|
356
arch/arm/crypto/chacha-glue.c
Normal file
356
arch/arm/crypto/chacha-glue.c
Normal file
|
@ -0,0 +1,356 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
#include <asm/cputype.h>
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds);
|
||||||
|
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
|
||||||
|
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
const u32 *state, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
|
||||||
|
|
||||||
|
static inline bool neon_usable(void)
|
||||||
|
{
|
||||||
|
return static_branch_likely(&use_neon) && may_use_simd();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds)
|
||||||
|
{
|
||||||
|
u8 buf[CHACHA_BLOCK_SIZE];
|
||||||
|
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_4block_xor_neon(state, dst, src, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
state[12] += 4;
|
||||||
|
}
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_block_xor_neon(state, dst, src, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE;
|
||||||
|
src += CHACHA_BLOCK_SIZE;
|
||||||
|
dst += CHACHA_BLOCK_SIZE;
|
||||||
|
state[12]++;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
memcpy(buf, src, bytes);
|
||||||
|
chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||||
|
memcpy(dst, buf, bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
|
||||||
|
hchacha_block_arm(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, stream, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
|
||||||
|
bytes <= CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_doarm(dst, src, bytes, state, nrounds);
|
||||||
|
state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, dst, src, todo, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
static int chacha_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv,
|
||||||
|
bool neon)
|
||||||
|
{
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
u32 state[16];
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||||
|
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
|
||||||
|
nbytes, state, ctx->nrounds);
|
||||||
|
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int do_chacha(struct skcipher_request *req, bool neon)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_stream_xor(req, ctx, req->iv, neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_arm(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_chacha(req, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_neon(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_chacha(req, neon_usable());
|
||||||
|
}
|
||||||
|
|
||||||
|
static int do_xchacha(struct skcipher_request *req, bool neon)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u32 state[16];
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||||
|
hchacha_block_arm(state, subctx.key, ctx->nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, subctx.key, ctx->nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_stream_xor(req, &subctx, real_iv, neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_arm(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_xchacha(req, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_neon(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_xchacha(req, neon_usable());
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct skcipher_alg arm_algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_arm,
|
||||||
|
.decrypt = chacha_arm,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_arm,
|
||||||
|
.decrypt = xchacha_arm,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_arm,
|
||||||
|
.decrypt = xchacha_arm,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct skcipher_alg neon_algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_neon,
|
||||||
|
.decrypt = chacha_neon,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_neon,
|
||||||
|
.decrypt = xchacha_neon,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_neon,
|
||||||
|
.decrypt = xchacha_neon,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init chacha_simd_mod_init(void)
|
||||||
|
{
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
|
||||||
|
int i;
|
||||||
|
|
||||||
|
switch (read_cpuid_part()) {
|
||||||
|
case ARM_CPU_PART_CORTEX_A7:
|
||||||
|
case ARM_CPU_PART_CORTEX_A5:
|
||||||
|
/*
|
||||||
|
* The Cortex-A7 and Cortex-A5 do not perform well with
|
||||||
|
* the NEON implementation but do incredibly with the
|
||||||
|
* scalar one and use less power.
|
||||||
|
*/
|
||||||
|
for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
|
||||||
|
neon_algs[i].base.cra_priority = 0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
static_branch_enable(&use_neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
|
||||||
|
if (err)
|
||||||
|
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
|
||||||
|
crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
|
||||||
|
module_exit(chacha_simd_mod_fini);
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
|
||||||
|
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-arm");
|
||||||
|
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-neon");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-neon");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-neon");
|
||||||
|
#endif
|
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
|
@ -0,0 +1,460 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2018 Google, Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/assembler.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Design notes:
|
||||||
|
*
|
||||||
|
* 16 registers would be needed to hold the state matrix, but only 14 are
|
||||||
|
* available because 'sp' and 'pc' cannot be used. So we spill the elements
|
||||||
|
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
|
||||||
|
* 'ldrd' and one 'strd' instruction per round.
|
||||||
|
*
|
||||||
|
* All rotates are performed using the implicit rotate operand accepted by the
|
||||||
|
* 'add' and 'eor' instructions. This is faster than using explicit rotate
|
||||||
|
* instructions. To make this work, we allow the values in the second and last
|
||||||
|
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
|
||||||
|
* wrong rotation amount. The rotation amount is then fixed up just in time
|
||||||
|
* when the values are used. 'brot' is the number of bits the values in row 'b'
|
||||||
|
* need to be rotated right to arrive at the correct values, and 'drot'
|
||||||
|
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
|
||||||
|
* that they end up as (25, 24) after every round.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ChaCha state registers
|
||||||
|
X0 .req r0
|
||||||
|
X1 .req r1
|
||||||
|
X2 .req r2
|
||||||
|
X3 .req r3
|
||||||
|
X4 .req r4
|
||||||
|
X5 .req r5
|
||||||
|
X6 .req r6
|
||||||
|
X7 .req r7
|
||||||
|
X8_X10 .req r8 // shared by x8 and x10
|
||||||
|
X9_X11 .req r9 // shared by x9 and x11
|
||||||
|
X12 .req r10
|
||||||
|
X13 .req r11
|
||||||
|
X14 .req r12
|
||||||
|
X15 .req r14
|
||||||
|
|
||||||
|
.macro __rev out, in, t0, t1, t2
|
||||||
|
.if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
rev \out, \in
|
||||||
|
.else
|
||||||
|
lsl \t0, \in, #24
|
||||||
|
and \t1, \in, #0xff00
|
||||||
|
and \t2, \in, #0xff0000
|
||||||
|
orr \out, \t0, \in, lsr #24
|
||||||
|
orr \out, \out, \t1, lsl #8
|
||||||
|
orr \out, \out, \t2, lsr #8
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _le32_bswap x, t0, t1, t2
|
||||||
|
#ifdef __ARMEB__
|
||||||
|
__rev \x, \x, \t0, \t1, \t2
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
|
||||||
|
_le32_bswap \a, \t0, \t1, \t2
|
||||||
|
_le32_bswap \b, \t0, \t1, \t2
|
||||||
|
_le32_bswap \c, \t0, \t1, \t2
|
||||||
|
_le32_bswap \d, \t0, \t1, \t2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro __ldrd a, b, src, offset
|
||||||
|
#if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
ldrd \a, \b, [\src, #\offset]
|
||||||
|
#else
|
||||||
|
ldr \a, [\src, #\offset]
|
||||||
|
ldr \b, [\src, #\offset + 4]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro __strd a, b, dst, offset
|
||||||
|
#if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
strd \a, \b, [\dst, #\offset]
|
||||||
|
#else
|
||||||
|
str \a, [\dst, #\offset]
|
||||||
|
str \b, [\dst, #\offset + 4]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
|
||||||
|
|
||||||
|
// a += b; d ^= a; d = rol(d, 16);
|
||||||
|
add \a1, \a1, \b1, ror #brot
|
||||||
|
add \a2, \a2, \b2, ror #brot
|
||||||
|
eor \d1, \a1, \d1, ror #drot
|
||||||
|
eor \d2, \a2, \d2, ror #drot
|
||||||
|
// drot == 32 - 16 == 16
|
||||||
|
|
||||||
|
// c += d; b ^= c; b = rol(b, 12);
|
||||||
|
add \c1, \c1, \d1, ror #16
|
||||||
|
add \c2, \c2, \d2, ror #16
|
||||||
|
eor \b1, \c1, \b1, ror #brot
|
||||||
|
eor \b2, \c2, \b2, ror #brot
|
||||||
|
// brot == 32 - 12 == 20
|
||||||
|
|
||||||
|
// a += b; d ^= a; d = rol(d, 8);
|
||||||
|
add \a1, \a1, \b1, ror #20
|
||||||
|
add \a2, \a2, \b2, ror #20
|
||||||
|
eor \d1, \a1, \d1, ror #16
|
||||||
|
eor \d2, \a2, \d2, ror #16
|
||||||
|
// drot == 32 - 8 == 24
|
||||||
|
|
||||||
|
// c += d; b ^= c; b = rol(b, 7);
|
||||||
|
add \c1, \c1, \d1, ror #24
|
||||||
|
add \c2, \c2, \d2, ror #24
|
||||||
|
eor \b1, \c1, \b1, ror #20
|
||||||
|
eor \b2, \c2, \b2, ror #20
|
||||||
|
// brot == 32 - 7 == 25
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _doubleround
|
||||||
|
|
||||||
|
// column round
|
||||||
|
|
||||||
|
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
|
||||||
|
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
|
||||||
|
|
||||||
|
// save (x8, x9); restore (x10, x11)
|
||||||
|
__strd X8_X10, X9_X11, sp, 0
|
||||||
|
__ldrd X8_X10, X9_X11, sp, 8
|
||||||
|
|
||||||
|
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
|
||||||
|
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
|
||||||
|
|
||||||
|
.set brot, 25
|
||||||
|
.set drot, 24
|
||||||
|
|
||||||
|
// diagonal round
|
||||||
|
|
||||||
|
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
|
||||||
|
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
|
||||||
|
|
||||||
|
// save (x10, x11); restore (x8, x9)
|
||||||
|
__strd X8_X10, X9_X11, sp, 8
|
||||||
|
__ldrd X8_X10, X9_X11, sp, 0
|
||||||
|
|
||||||
|
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
|
||||||
|
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _chacha_permute nrounds
|
||||||
|
.set brot, 0
|
||||||
|
.set drot, 0
|
||||||
|
.rept \nrounds / 2
|
||||||
|
_doubleround
|
||||||
|
.endr
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _chacha nrounds
|
||||||
|
|
||||||
|
.Lnext_block\@:
|
||||||
|
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
|
||||||
|
// Registers contain x0-x9,x12-x15.
|
||||||
|
|
||||||
|
// Do the core ChaCha permutation to update x0-x15.
|
||||||
|
_chacha_permute \nrounds
|
||||||
|
|
||||||
|
add sp, #8
|
||||||
|
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers contain x0-x9,x12-x15.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
|
||||||
|
push {X8_X10, X9_X11, X12, X13, X14, X15}
|
||||||
|
|
||||||
|
// Load (OUT, IN, LEN).
|
||||||
|
ldr r14, [sp, #96]
|
||||||
|
ldr r12, [sp, #100]
|
||||||
|
ldr r11, [sp, #104]
|
||||||
|
|
||||||
|
orr r10, r14, r12
|
||||||
|
|
||||||
|
// Use slow path if fewer than 64 bytes remain.
|
||||||
|
cmp r11, #64
|
||||||
|
blt .Lxor_slowpath\@
|
||||||
|
|
||||||
|
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
|
||||||
|
// ARMv6+, since ldmia and stmia (used below) still require alignment.
|
||||||
|
tst r10, #3
|
||||||
|
bne .Lxor_slowpath\@
|
||||||
|
|
||||||
|
// Fast path: XOR 64 bytes of aligned data.
|
||||||
|
|
||||||
|
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// x0-x3
|
||||||
|
__ldrd r8, r9, sp, 32
|
||||||
|
__ldrd r10, r11, sp, 40
|
||||||
|
add X0, X0, r8
|
||||||
|
add X1, X1, r9
|
||||||
|
add X2, X2, r10
|
||||||
|
add X3, X3, r11
|
||||||
|
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||||
|
ldmia r12!, {r8-r11}
|
||||||
|
eor X0, X0, r8
|
||||||
|
eor X1, X1, r9
|
||||||
|
eor X2, X2, r10
|
||||||
|
eor X3, X3, r11
|
||||||
|
stmia r14!, {X0-X3}
|
||||||
|
|
||||||
|
// x4-x7
|
||||||
|
__ldrd r8, r9, sp, 48
|
||||||
|
__ldrd r10, r11, sp, 56
|
||||||
|
add X4, r8, X4, ror #brot
|
||||||
|
add X5, r9, X5, ror #brot
|
||||||
|
ldmia r12!, {X0-X3}
|
||||||
|
add X6, r10, X6, ror #brot
|
||||||
|
add X7, r11, X7, ror #brot
|
||||||
|
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||||
|
eor X4, X4, X0
|
||||||
|
eor X5, X5, X1
|
||||||
|
eor X6, X6, X2
|
||||||
|
eor X7, X7, X3
|
||||||
|
stmia r14!, {X4-X7}
|
||||||
|
|
||||||
|
// x8-x15
|
||||||
|
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||||
|
__ldrd r8, r9, sp, 32
|
||||||
|
__ldrd r10, r11, sp, 40
|
||||||
|
add r0, r0, r8 // x8
|
||||||
|
add r1, r1, r9 // x9
|
||||||
|
add r6, r6, r10 // x10
|
||||||
|
add r7, r7, r11 // x11
|
||||||
|
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||||
|
ldmia r12!, {r8-r11}
|
||||||
|
eor r0, r0, r8 // x8
|
||||||
|
eor r1, r1, r9 // x9
|
||||||
|
eor r6, r6, r10 // x10
|
||||||
|
eor r7, r7, r11 // x11
|
||||||
|
stmia r14!, {r0,r1,r6,r7}
|
||||||
|
ldmia r12!, {r0,r1,r6,r7}
|
||||||
|
__ldrd r8, r9, sp, 48
|
||||||
|
__ldrd r10, r11, sp, 56
|
||||||
|
add r2, r8, r2, ror #drot // x12
|
||||||
|
add r3, r9, r3, ror #drot // x13
|
||||||
|
add r4, r10, r4, ror #drot // x14
|
||||||
|
add r5, r11, r5, ror #drot // x15
|
||||||
|
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||||
|
ldr r9, [sp, #72] // load LEN
|
||||||
|
eor r2, r2, r0 // x12
|
||||||
|
eor r3, r3, r1 // x13
|
||||||
|
eor r4, r4, r6 // x14
|
||||||
|
eor r5, r5, r7 // x15
|
||||||
|
subs r9, #64 // decrement and check LEN
|
||||||
|
stmia r14!, {r2-r5}
|
||||||
|
|
||||||
|
beq .Ldone\@
|
||||||
|
|
||||||
|
.Lprepare_for_next_block\@:
|
||||||
|
|
||||||
|
// Stack: x0-x15 OUT IN LEN
|
||||||
|
|
||||||
|
// Increment block counter (x12)
|
||||||
|
add r8, #1
|
||||||
|
|
||||||
|
// Store updated (OUT, IN, LEN)
|
||||||
|
str r14, [sp, #64]
|
||||||
|
str r12, [sp, #68]
|
||||||
|
str r9, [sp, #72]
|
||||||
|
|
||||||
|
mov r14, sp
|
||||||
|
|
||||||
|
// Store updated block counter (x12)
|
||||||
|
str r8, [sp, #48]
|
||||||
|
|
||||||
|
sub sp, #16
|
||||||
|
|
||||||
|
// Reload state and do next block
|
||||||
|
ldmia r14!, {r0-r11} // load x0-x11
|
||||||
|
__strd r10, r11, sp, 8 // store x10-x11 before state
|
||||||
|
ldmia r14, {r10-r12,r14} // load x12-x15
|
||||||
|
b .Lnext_block\@
|
||||||
|
|
||||||
|
.Lxor_slowpath\@:
|
||||||
|
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
|
||||||
|
// We handle it by storing the 64 bytes of keystream to the stack, then
|
||||||
|
// XOR-ing the needed portion with the data.
|
||||||
|
|
||||||
|
// Allocate keystream buffer
|
||||||
|
sub sp, #64
|
||||||
|
mov r14, sp
|
||||||
|
|
||||||
|
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// Save keystream for x0-x3
|
||||||
|
__ldrd r8, r9, sp, 96
|
||||||
|
__ldrd r10, r11, sp, 104
|
||||||
|
add X0, X0, r8
|
||||||
|
add X1, X1, r9
|
||||||
|
add X2, X2, r10
|
||||||
|
add X3, X3, r11
|
||||||
|
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||||
|
stmia r14!, {X0-X3}
|
||||||
|
|
||||||
|
// Save keystream for x4-x7
|
||||||
|
__ldrd r8, r9, sp, 112
|
||||||
|
__ldrd r10, r11, sp, 120
|
||||||
|
add X4, r8, X4, ror #brot
|
||||||
|
add X5, r9, X5, ror #brot
|
||||||
|
add X6, r10, X6, ror #brot
|
||||||
|
add X7, r11, X7, ror #brot
|
||||||
|
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||||
|
add r8, sp, #64
|
||||||
|
stmia r14!, {X4-X7}
|
||||||
|
|
||||||
|
// Save keystream for x8-x15
|
||||||
|
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||||
|
__ldrd r8, r9, sp, 128
|
||||||
|
__ldrd r10, r11, sp, 136
|
||||||
|
add r0, r0, r8 // x8
|
||||||
|
add r1, r1, r9 // x9
|
||||||
|
add r6, r6, r10 // x10
|
||||||
|
add r7, r7, r11 // x11
|
||||||
|
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||||
|
stmia r14!, {r0,r1,r6,r7}
|
||||||
|
__ldrd r8, r9, sp, 144
|
||||||
|
__ldrd r10, r11, sp, 152
|
||||||
|
add r2, r8, r2, ror #drot // x12
|
||||||
|
add r3, r9, r3, ror #drot // x13
|
||||||
|
add r4, r10, r4, ror #drot // x14
|
||||||
|
add r5, r11, r5, ror #drot // x15
|
||||||
|
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||||
|
stmia r14, {r2-r5}
|
||||||
|
|
||||||
|
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
|
||||||
|
// Registers: r8 is block counter, r12 is IN.
|
||||||
|
|
||||||
|
ldr r9, [sp, #168] // LEN
|
||||||
|
ldr r14, [sp, #160] // OUT
|
||||||
|
cmp r9, #64
|
||||||
|
mov r0, sp
|
||||||
|
movle r1, r9
|
||||||
|
movgt r1, #64
|
||||||
|
// r1 is number of bytes to XOR, in range [1, 64]
|
||||||
|
|
||||||
|
.if __LINUX_ARM_ARCH__ < 6
|
||||||
|
orr r2, r12, r14
|
||||||
|
tst r2, #3 // IN or OUT misaligned?
|
||||||
|
bne .Lxor_next_byte\@
|
||||||
|
.endif
|
||||||
|
|
||||||
|
// XOR a word at a time
|
||||||
|
.rept 16
|
||||||
|
subs r1, #4
|
||||||
|
blt .Lxor_words_done\@
|
||||||
|
ldr r2, [r12], #4
|
||||||
|
ldr r3, [r0], #4
|
||||||
|
eor r2, r2, r3
|
||||||
|
str r2, [r14], #4
|
||||||
|
.endr
|
||||||
|
b .Lxor_slowpath_done\@
|
||||||
|
.Lxor_words_done\@:
|
||||||
|
ands r1, r1, #3
|
||||||
|
beq .Lxor_slowpath_done\@
|
||||||
|
|
||||||
|
// XOR a byte at a time
|
||||||
|
.Lxor_next_byte\@:
|
||||||
|
ldrb r2, [r12], #1
|
||||||
|
ldrb r3, [r0], #1
|
||||||
|
eor r2, r2, r3
|
||||||
|
strb r2, [r14], #1
|
||||||
|
subs r1, #1
|
||||||
|
bne .Lxor_next_byte\@
|
||||||
|
|
||||||
|
.Lxor_slowpath_done\@:
|
||||||
|
subs r9, #64
|
||||||
|
add sp, #96
|
||||||
|
bgt .Lprepare_for_next_block\@
|
||||||
|
|
||||||
|
.Ldone\@:
|
||||||
|
.endm // _chacha
|
||||||
|
|
||||||
|
/*
|
||||||
|
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
* const u32 *state, int nrounds);
|
||||||
|
*/
|
||||||
|
ENTRY(chacha_doarm)
|
||||||
|
cmp r2, #0 // len == 0?
|
||||||
|
reteq lr
|
||||||
|
|
||||||
|
ldr ip, [sp]
|
||||||
|
cmp ip, #12
|
||||||
|
|
||||||
|
push {r0-r2,r4-r11,lr}
|
||||||
|
|
||||||
|
// Push state x0-x15 onto stack.
|
||||||
|
// Also store an extra copy of x10-x11 just before the state.
|
||||||
|
|
||||||
|
add X12, r3, #48
|
||||||
|
ldm X12, {X12,X13,X14,X15}
|
||||||
|
push {X12,X13,X14,X15}
|
||||||
|
sub sp, sp, #64
|
||||||
|
|
||||||
|
__ldrd X8_X10, X9_X11, r3, 40
|
||||||
|
__strd X8_X10, X9_X11, sp, 8
|
||||||
|
__strd X8_X10, X9_X11, sp, 56
|
||||||
|
ldm r3, {X0-X9_X11}
|
||||||
|
__strd X0, X1, sp, 16
|
||||||
|
__strd X2, X3, sp, 24
|
||||||
|
__strd X4, X5, sp, 32
|
||||||
|
__strd X6, X7, sp, 40
|
||||||
|
__strd X8_X10, X9_X11, sp, 48
|
||||||
|
|
||||||
|
beq 1f
|
||||||
|
_chacha 20
|
||||||
|
|
||||||
|
0: add sp, #76
|
||||||
|
pop {r4-r11, pc}
|
||||||
|
|
||||||
|
1: _chacha 12
|
||||||
|
b 0b
|
||||||
|
ENDPROC(chacha_doarm)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
|
||||||
|
*/
|
||||||
|
ENTRY(hchacha_block_arm)
|
||||||
|
push {r1,r4-r11,lr}
|
||||||
|
|
||||||
|
cmp r2, #12 // ChaCha12 ?
|
||||||
|
|
||||||
|
mov r14, r0
|
||||||
|
ldmia r14!, {r0-r11} // load x0-x11
|
||||||
|
push {r10-r11} // store x10-x11 to stack
|
||||||
|
ldm r14, {r10-r12,r14} // load x12-x15
|
||||||
|
sub sp, #8
|
||||||
|
|
||||||
|
beq 1f
|
||||||
|
_chacha_permute 20
|
||||||
|
|
||||||
|
// Skip over (unused0-unused1, x10-x11)
|
||||||
|
0: add sp, #16
|
||||||
|
|
||||||
|
// Fix up rotations of x12-x15
|
||||||
|
ror X12, X12, #drot
|
||||||
|
ror X13, X13, #drot
|
||||||
|
pop {r4} // load 'out'
|
||||||
|
ror X14, X14, #drot
|
||||||
|
ror X15, X15, #drot
|
||||||
|
|
||||||
|
// Store (x0-x3,x12-x15) to 'out'
|
||||||
|
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
|
||||||
|
|
||||||
|
pop {r4-r11,pc}
|
||||||
|
|
||||||
|
1: _chacha_permute 12
|
||||||
|
b 0b
|
||||||
|
ENDPROC(hchacha_block_arm)
|
2062
arch/arm/crypto/curve25519-core.S
Normal file
2062
arch/arm/crypto/curve25519-core.S
Normal file
File diff suppressed because it is too large
Load diff
135
arch/arm/crypto/curve25519-glue.c
Normal file
135
arch/arm/crypto/curve25519-glue.c
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
|
||||||
|
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
|
||||||
|
* manually reworked for use in kernel space.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <crypto/internal/kpp.h>
|
||||||
|
#include <linux/types.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <linux/init.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/scatterlist.h>
|
||||||
|
#include <crypto/curve25519.h>
|
||||||
|
|
||||||
|
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 secret[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 basepoint[CURVE25519_KEY_SIZE]);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 scalar[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 point[CURVE25519_KEY_SIZE])
|
||||||
|
{
|
||||||
|
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||||
|
kernel_neon_begin();
|
||||||
|
curve25519_neon(out, scalar, point);
|
||||||
|
kernel_neon_end();
|
||||||
|
} else {
|
||||||
|
curve25519_generic(out, scalar, point);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(curve25519_arch);
|
||||||
|
|
||||||
|
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 secret[CURVE25519_KEY_SIZE])
|
||||||
|
{
|
||||||
|
return curve25519_arch(pub, secret, curve25519_base_point);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(curve25519_base_arch);
|
||||||
|
|
||||||
|
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
|
||||||
|
unsigned int len)
|
||||||
|
{
|
||||||
|
u8 *secret = kpp_tfm_ctx(tfm);
|
||||||
|
|
||||||
|
if (!len)
|
||||||
|
curve25519_generate_secret(secret);
|
||||||
|
else if (len == CURVE25519_KEY_SIZE &&
|
||||||
|
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
|
||||||
|
memcpy(secret, buf, CURVE25519_KEY_SIZE);
|
||||||
|
else
|
||||||
|
return -EINVAL;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int curve25519_compute_value(struct kpp_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
|
||||||
|
const u8 *secret = kpp_tfm_ctx(tfm);
|
||||||
|
u8 public_key[CURVE25519_KEY_SIZE];
|
||||||
|
u8 buf[CURVE25519_KEY_SIZE];
|
||||||
|
int copied, nbytes;
|
||||||
|
u8 const *bp;
|
||||||
|
|
||||||
|
if (req->src) {
|
||||||
|
copied = sg_copy_to_buffer(req->src,
|
||||||
|
sg_nents_for_len(req->src,
|
||||||
|
CURVE25519_KEY_SIZE),
|
||||||
|
public_key, CURVE25519_KEY_SIZE);
|
||||||
|
if (copied != CURVE25519_KEY_SIZE)
|
||||||
|
return -EINVAL;
|
||||||
|
bp = public_key;
|
||||||
|
} else {
|
||||||
|
bp = curve25519_base_point;
|
||||||
|
}
|
||||||
|
|
||||||
|
curve25519_arch(buf, secret, bp);
|
||||||
|
|
||||||
|
/* might want less than we've got */
|
||||||
|
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
|
||||||
|
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
|
||||||
|
nbytes),
|
||||||
|
buf, nbytes);
|
||||||
|
if (copied != nbytes)
|
||||||
|
return -EINVAL;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
|
||||||
|
{
|
||||||
|
return CURVE25519_KEY_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct kpp_alg curve25519_alg = {
|
||||||
|
.base.cra_name = "curve25519",
|
||||||
|
.base.cra_driver_name = "curve25519-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
|
||||||
|
|
||||||
|
.set_secret = curve25519_set_secret,
|
||||||
|
.generate_public_key = curve25519_compute_value,
|
||||||
|
.compute_shared_secret = curve25519_compute_value,
|
||||||
|
.max_size = curve25519_max_size,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init mod_init(void)
|
||||||
|
{
|
||||||
|
if (elf_hwcap & HWCAP_NEON) {
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
|
||||||
|
crypto_register_kpp(&curve25519_alg) : 0;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
|
||||||
|
crypto_unregister_kpp(&curve25519_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(mod_init);
|
||||||
|
module_exit(mod_exit);
|
||||||
|
|
||||||
|
MODULE_ALIAS_CRYPTO("curve25519");
|
||||||
|
MODULE_ALIAS_CRYPTO("curve25519-neon");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
File diff suppressed because it is too large
Load diff
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
File diff suppressed because it is too large
Load diff
272
arch/arm/crypto/poly1305-glue.c
Normal file
272
arch/arm/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,272 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
void poly1305_init_arm(void *state, const u8 *key);
|
||||||
|
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_arm(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int arm_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_arm(&dctx->h, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||||
|
else
|
||||||
|
poly1305_blocks_arm(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||||
|
const u8 *src, u32 len, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
arm_poly1305_blocks(dctx, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1, false);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int arm_poly1305_update(struct shash_desc *desc,
|
||||||
|
const u8 *src, unsigned int srclen)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
arm_poly1305_do_update(dctx, src, srclen, false);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
|
||||||
|
const u8 *src,
|
||||||
|
unsigned int srclen)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
bool do_neon = may_use_simd() && srclen > 128;
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_begin();
|
||||||
|
arm_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_end();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||||
|
may_use_simd();
|
||||||
|
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks_arm(&dctx->h, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon) {
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
len -= todo;
|
||||||
|
src += todo;
|
||||||
|
} while (len);
|
||||||
|
} else {
|
||||||
|
poly1305_blocks_arm(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
}
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit_arm(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg arm_poly1305_algs[] = {{
|
||||||
|
.init = arm_poly1305_init,
|
||||||
|
.update = arm_poly1305_update,
|
||||||
|
.final = arm_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-arm",
|
||||||
|
.base.cra_priority = 150,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||||
|
}, {
|
||||||
|
.init = arm_poly1305_init,
|
||||||
|
.update = arm_poly1305_update_neon,
|
||||||
|
.final = arm_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
#endif
|
||||||
|
}};
|
||||||
|
|
||||||
|
static int __init arm_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||||
|
(elf_hwcap & HWCAP_NEON))
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
/* register only the first entry */
|
||||||
|
return crypto_register_shash(&arm_poly1305_algs[0]);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shashes(arm_poly1305_algs,
|
||||||
|
ARRAY_SIZE(arm_poly1305_algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit arm_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
return;
|
||||||
|
if (!static_branch_likely(&have_neon)) {
|
||||||
|
crypto_unregister_shash(&arm_poly1305_algs[0]);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
crypto_unregister_shashes(arm_poly1305_algs,
|
||||||
|
ARRAY_SIZE(arm_poly1305_algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(arm_poly1305_mod_init);
|
||||||
|
module_exit(arm_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
|
@ -688,6 +688,40 @@ static void disable_single_step(struct perf_event *bp)
|
||||||
arch_install_hw_breakpoint(bp);
|
arch_install_hw_breakpoint(bp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Arm32 hardware does not always report a watchpoint hit address that matches
|
||||||
|
* one of the watchpoints set. It can also report an address "near" the
|
||||||
|
* watchpoint if a single instruction access both watched and unwatched
|
||||||
|
* addresses. There is no straight-forward way, short of disassembling the
|
||||||
|
* offending instruction, to map that address back to the watchpoint. This
|
||||||
|
* function computes the distance of the memory access from the watchpoint as a
|
||||||
|
* heuristic for the likelyhood that a given access triggered the watchpoint.
|
||||||
|
*
|
||||||
|
* See this same function in the arm64 platform code, which has the same
|
||||||
|
* problem.
|
||||||
|
*
|
||||||
|
* The function returns the distance of the address from the bytes watched by
|
||||||
|
* the watchpoint. In case of an exact match, it returns 0.
|
||||||
|
*/
|
||||||
|
static u32 get_distance_from_watchpoint(unsigned long addr, u32 val,
|
||||||
|
struct arch_hw_breakpoint_ctrl *ctrl)
|
||||||
|
{
|
||||||
|
u32 wp_low, wp_high;
|
||||||
|
u32 lens, lene;
|
||||||
|
|
||||||
|
lens = __ffs(ctrl->len);
|
||||||
|
lene = __fls(ctrl->len);
|
||||||
|
|
||||||
|
wp_low = val + lens;
|
||||||
|
wp_high = val + lene;
|
||||||
|
if (addr < wp_low)
|
||||||
|
return wp_low - addr;
|
||||||
|
else if (addr > wp_high)
|
||||||
|
return addr - wp_high;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
||||||
struct arch_hw_breakpoint *info)
|
struct arch_hw_breakpoint *info)
|
||||||
{
|
{
|
||||||
|
@ -697,23 +731,25 @@ static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
||||||
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
struct pt_regs *regs)
|
struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
int i, access;
|
int i, access, closest_match = 0;
|
||||||
u32 val, ctrl_reg, alignment_mask;
|
u32 min_dist = -1, dist;
|
||||||
|
u32 val, ctrl_reg;
|
||||||
struct perf_event *wp, **slots;
|
struct perf_event *wp, **slots;
|
||||||
struct arch_hw_breakpoint *info;
|
struct arch_hw_breakpoint *info;
|
||||||
struct arch_hw_breakpoint_ctrl ctrl;
|
struct arch_hw_breakpoint_ctrl ctrl;
|
||||||
|
|
||||||
slots = this_cpu_ptr(wp_on_reg);
|
slots = this_cpu_ptr(wp_on_reg);
|
||||||
|
|
||||||
for (i = 0; i < core_num_wrps; ++i) {
|
/*
|
||||||
|
* Find all watchpoints that match the reported address. If no exact
|
||||||
|
* match is found. Attribute the hit to the closest watchpoint.
|
||||||
|
*/
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
|
for (i = 0; i < core_num_wrps; ++i) {
|
||||||
wp = slots[i];
|
wp = slots[i];
|
||||||
|
|
||||||
if (wp == NULL)
|
if (wp == NULL)
|
||||||
goto unlock;
|
continue;
|
||||||
|
|
||||||
info = counter_arch_bp(wp);
|
|
||||||
/*
|
/*
|
||||||
* The DFAR is an unknown value on debug architectures prior
|
* The DFAR is an unknown value on debug architectures prior
|
||||||
* to 7.1. Since we only allow a single watchpoint on these
|
* to 7.1. Since we only allow a single watchpoint on these
|
||||||
|
@ -722,33 +758,31 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
*/
|
*/
|
||||||
if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
|
if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
|
||||||
BUG_ON(i > 0);
|
BUG_ON(i > 0);
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
info->trigger = wp->attr.bp_addr;
|
info->trigger = wp->attr.bp_addr;
|
||||||
} else {
|
} else {
|
||||||
if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
|
|
||||||
alignment_mask = 0x7;
|
|
||||||
else
|
|
||||||
alignment_mask = 0x3;
|
|
||||||
|
|
||||||
/* Check if the watchpoint value matches. */
|
|
||||||
val = read_wb_reg(ARM_BASE_WVR + i);
|
|
||||||
if (val != (addr & ~alignment_mask))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Possible match, check the byte address select. */
|
|
||||||
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
|
|
||||||
decode_ctrl_reg(ctrl_reg, &ctrl);
|
|
||||||
if (!((1 << (addr & alignment_mask)) & ctrl.len))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Check that the access type matches. */
|
/* Check that the access type matches. */
|
||||||
if (debug_exception_updates_fsr()) {
|
if (debug_exception_updates_fsr()) {
|
||||||
access = (fsr & ARM_FSR_ACCESS_MASK) ?
|
access = (fsr & ARM_FSR_ACCESS_MASK) ?
|
||||||
HW_BREAKPOINT_W : HW_BREAKPOINT_R;
|
HW_BREAKPOINT_W : HW_BREAKPOINT_R;
|
||||||
if (!(access & hw_breakpoint_type(wp)))
|
if (!(access & hw_breakpoint_type(wp)))
|
||||||
goto unlock;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val = read_wb_reg(ARM_BASE_WVR + i);
|
||||||
|
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
|
||||||
|
decode_ctrl_reg(ctrl_reg, &ctrl);
|
||||||
|
dist = get_distance_from_watchpoint(addr, val, &ctrl);
|
||||||
|
if (dist < min_dist) {
|
||||||
|
min_dist = dist;
|
||||||
|
closest_match = i;
|
||||||
|
}
|
||||||
|
/* Is this an exact match? */
|
||||||
|
if (dist != 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
/* We have a winner. */
|
/* We have a winner. */
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
info->trigger = addr;
|
info->trigger = addr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -770,13 +804,23 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
* we can single-step over the watchpoint trigger.
|
* we can single-step over the watchpoint trigger.
|
||||||
*/
|
*/
|
||||||
if (!is_default_overflow_handler(wp))
|
if (!is_default_overflow_handler(wp))
|
||||||
goto unlock;
|
continue;
|
||||||
|
|
||||||
step:
|
step:
|
||||||
enable_single_step(wp, instruction_pointer(regs));
|
enable_single_step(wp, instruction_pointer(regs));
|
||||||
unlock:
|
|
||||||
rcu_read_unlock();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (min_dist > 0 && min_dist != -1) {
|
||||||
|
/* No exact match found. */
|
||||||
|
wp = slots[closest_match];
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
|
info->trigger = addr;
|
||||||
|
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
|
||||||
|
perf_bp_event(wp, regs);
|
||||||
|
if (is_default_overflow_handler(wp))
|
||||||
|
enable_single_step(wp, instruction_pointer(regs));
|
||||||
|
}
|
||||||
|
|
||||||
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void watchpoint_single_step_handler(unsigned long pc)
|
static void watchpoint_single_step_handler(unsigned long pc)
|
||||||
|
|
|
@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,
|
||||||
|
|
||||||
ret = of_property_read_u32(np, "prefetch-data", &val);
|
ret = of_property_read_u32(np, "prefetch-data", &val);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
if (val)
|
if (val) {
|
||||||
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
else
|
*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
} else {
|
||||||
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
}
|
||||||
|
*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
} else if (ret != -EINVAL) {
|
} else if (ret != -EINVAL) {
|
||||||
pr_err("L2C-310 OF prefetch-data property value is missing\n");
|
pr_err("L2C-310 OF prefetch-data property value is missing\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = of_property_read_u32(np, "prefetch-instr", &val);
|
ret = of_property_read_u32(np, "prefetch-instr", &val);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
if (val)
|
if (val) {
|
||||||
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
else
|
*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
} else {
|
||||||
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
}
|
||||||
|
*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
} else if (ret != -EINVAL) {
|
} else if (ret != -EINVAL) {
|
||||||
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
|
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -240,6 +240,7 @@ config SAMSUNG_PM_DEBUG
|
||||||
bool "Samsung PM Suspend debug"
|
bool "Samsung PM Suspend debug"
|
||||||
depends on PM && DEBUG_KERNEL
|
depends on PM && DEBUG_KERNEL
|
||||||
depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
|
depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
|
||||||
|
depends on DEBUG_LL && MMU
|
||||||
help
|
help
|
||||||
Say Y here if you want verbose debugging from the PM Suspend and
|
Say Y here if you want verbose debugging from the PM Suspend and
|
||||||
Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>
|
Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>
|
||||||
|
|
|
@ -46,6 +46,7 @@ config ARCH_BCM_IPROC
|
||||||
config ARCH_BERLIN
|
config ARCH_BERLIN
|
||||||
bool "Marvell Berlin SoC Family"
|
bool "Marvell Berlin SoC Family"
|
||||||
select DW_APB_ICTL
|
select DW_APB_ICTL
|
||||||
|
select DW_APB_TIMER_OF
|
||||||
select GPIOLIB
|
select GPIOLIB
|
||||||
select PINCTRL
|
select PINCTRL
|
||||||
help
|
help
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
#
|
#
|
||||||
# Copyright (C) 1995-2001 by Russell King
|
# Copyright (C) 1995-2001 by Russell King
|
||||||
|
|
||||||
LDFLAGS_vmlinux :=--no-undefined -X
|
LDFLAGS_vmlinux :=--no-undefined -X -z norelro
|
||||||
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
|
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
|
||||||
GZFLAGS :=-9
|
GZFLAGS :=-9
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
|
||||||
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
|
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
|
||||||
# for relative relocs, since this leads to better Image compression
|
# for relative relocs, since this leads to better Image compression
|
||||||
# with the relocation offsets always being zero.
|
# with the relocation offsets always being zero.
|
||||||
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
|
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
|
||||||
$(call ld-option, --no-apply-dynamic-relocs)
|
$(call ld-option, --no-apply-dynamic-relocs)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,10 @@
|
||||||
|
|
||||||
aliases {
|
aliases {
|
||||||
ethernet0 = ð0;
|
ethernet0 = ð0;
|
||||||
|
/* for dsa slave device */
|
||||||
|
ethernet1 = &switch0port1;
|
||||||
|
ethernet2 = &switch0port2;
|
||||||
|
ethernet3 = &switch0port3;
|
||||||
serial0 = &uart0;
|
serial0 = &uart0;
|
||||||
serial1 = &uart1;
|
serial1 = &uart1;
|
||||||
};
|
};
|
||||||
|
@ -136,25 +140,25 @@
|
||||||
#address-cells = <1>;
|
#address-cells = <1>;
|
||||||
#size-cells = <0>;
|
#size-cells = <0>;
|
||||||
|
|
||||||
port@0 {
|
switch0port0: port@0 {
|
||||||
reg = <0>;
|
reg = <0>;
|
||||||
label = "cpu";
|
label = "cpu";
|
||||||
ethernet = <ð0>;
|
ethernet = <ð0>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@1 {
|
switch0port1: port@1 {
|
||||||
reg = <1>;
|
reg = <1>;
|
||||||
label = "wan";
|
label = "wan";
|
||||||
phy-handle = <&switch0phy0>;
|
phy-handle = <&switch0phy0>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@2 {
|
switch0port2: port@2 {
|
||||||
reg = <2>;
|
reg = <2>;
|
||||||
label = "lan0";
|
label = "lan0";
|
||||||
phy-handle = <&switch0phy1>;
|
phy-handle = <&switch0phy1>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@3 {
|
switch0port3: port@3 {
|
||||||
reg = <3>;
|
reg = <3>;
|
||||||
label = "lan1";
|
label = "lan1";
|
||||||
phy-handle = <&switch0phy2>;
|
phy-handle = <&switch0phy2>;
|
||||||
|
|
|
@ -877,7 +877,7 @@
|
||||||
reg-names = "mdp_phys";
|
reg-names = "mdp_phys";
|
||||||
|
|
||||||
interrupt-parent = <&mdss>;
|
interrupt-parent = <&mdss>;
|
||||||
interrupts = <0 0>;
|
interrupts = <0>;
|
||||||
|
|
||||||
clocks = <&gcc GCC_MDSS_AHB_CLK>,
|
clocks = <&gcc GCC_MDSS_AHB_CLK>,
|
||||||
<&gcc GCC_MDSS_AXI_CLK>,
|
<&gcc GCC_MDSS_AXI_CLK>,
|
||||||
|
@ -909,7 +909,7 @@
|
||||||
reg-names = "dsi_ctrl";
|
reg-names = "dsi_ctrl";
|
||||||
|
|
||||||
interrupt-parent = <&mdss>;
|
interrupt-parent = <&mdss>;
|
||||||
interrupts = <4 0>;
|
interrupts = <4>;
|
||||||
|
|
||||||
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
|
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
|
||||||
<&gcc PCLK0_CLK_SRC>;
|
<&gcc PCLK0_CLK_SRC>;
|
||||||
|
|
|
@ -99,7 +99,7 @@
|
||||||
|
|
||||||
wcd_codec: codec@f000 {
|
wcd_codec: codec@f000 {
|
||||||
compatible = "qcom,pm8916-wcd-analog-codec";
|
compatible = "qcom,pm8916-wcd-analog-codec";
|
||||||
reg = <0xf000 0x200>;
|
reg = <0xf000>;
|
||||||
reg-names = "pmic-codec-core";
|
reg-names = "pmic-codec-core";
|
||||||
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
|
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
|
||||||
clock-names = "mclk";
|
clock-names = "mclk";
|
||||||
|
|
|
@ -430,6 +430,7 @@
|
||||||
bus-width = <8>;
|
bus-width = <8>;
|
||||||
mmc-hs200-1_8v;
|
mmc-hs200-1_8v;
|
||||||
non-removable;
|
non-removable;
|
||||||
|
full-pwr-cycle-in-suspend;
|
||||||
status = "okay";
|
status = "okay";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -411,7 +411,7 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
i2c0: i2c@ff020000 {
|
i2c0: i2c@ff020000 {
|
||||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
compatible = "cdns,i2c-r1p14";
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
interrupt-parent = <&gic>;
|
interrupt-parent = <&gic>;
|
||||||
interrupts = <0 17 4>;
|
interrupts = <0 17 4>;
|
||||||
|
@ -421,7 +421,7 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
i2c1: i2c@ff030000 {
|
i2c1: i2c@ff030000 {
|
||||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
compatible = "cdns,i2c-r1p14";
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
interrupt-parent = <&gic>;
|
interrupt-parent = <&gic>;
|
||||||
interrupts = <0 18 4>;
|
interrupts = <0 18 4>;
|
||||||
|
|
|
@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
|
||||||
CONFIG_ARM_SCPI_PROTOCOL=y
|
CONFIG_ARM_SCPI_PROTOCOL=y
|
||||||
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
|
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
|
||||||
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
|
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
|
||||||
CONFIG_ARM64_CRYPTO=y
|
|
||||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||||
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
||||||
CONFIG_JUMP_LABEL=y
|
CONFIG_JUMP_LABEL=y
|
||||||
|
@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||||
CONFIG_DM_BOW=y
|
CONFIG_DM_BOW=y
|
||||||
CONFIG_NETDEVICES=y
|
CONFIG_NETDEVICES=y
|
||||||
CONFIG_DUMMY=y
|
CONFIG_DUMMY=y
|
||||||
|
CONFIG_WIREGUARD=y
|
||||||
CONFIG_TUN=y
|
CONFIG_TUN=y
|
||||||
CONFIG_VETH=y
|
CONFIG_VETH=y
|
||||||
# CONFIG_ETHERNET is not set
|
# CONFIG_ETHERNET is not set
|
||||||
|
@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
|
||||||
CONFIG_HID_SONY=y
|
CONFIG_HID_SONY=y
|
||||||
CONFIG_HID_STEAM=y
|
CONFIG_HID_STEAM=y
|
||||||
CONFIG_USB_HIDDEV=y
|
CONFIG_USB_HIDDEV=y
|
||||||
|
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||||
CONFIG_USB_OTG=y
|
CONFIG_USB_OTG=y
|
||||||
CONFIG_USB_XHCI_HCD=y
|
CONFIG_USB_XHCI_HCD=y
|
||||||
CONFIG_USB_GADGET=y
|
CONFIG_USB_GADGET=y
|
||||||
|
@ -503,6 +504,7 @@ CONFIG_CRC8=y
|
||||||
CONFIG_XZ_DEC=y
|
CONFIG_XZ_DEC=y
|
||||||
CONFIG_PRINTK_TIME=y
|
CONFIG_PRINTK_TIME=y
|
||||||
CONFIG_DEBUG_INFO=y
|
CONFIG_DEBUG_INFO=y
|
||||||
|
CONFIG_DEBUG_INFO_DWARF4=y
|
||||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||||
CONFIG_MAGIC_SYSRQ=y
|
CONFIG_MAGIC_SYSRQ=y
|
||||||
|
|
1
arch/arm64/crypto/.gitignore
vendored
1
arch/arm64/crypto/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
sha256-core.S
|
sha256-core.S
|
||||||
sha512-core.S
|
sha512-core.S
|
||||||
|
poly1305-core.S
|
||||||
|
|
|
@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
|
||||||
select CRYPTO_SIMD
|
select CRYPTO_SIMD
|
||||||
|
|
||||||
config CRYPTO_CHACHA20_NEON
|
config CRYPTO_CHACHA20_NEON
|
||||||
tristate "NEON accelerated ChaCha20 symmetric cipher"
|
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
|
||||||
depends on KERNEL_MODE_NEON
|
depends on KERNEL_MODE_NEON
|
||||||
select CRYPTO_BLKCIPHER
|
select CRYPTO_BLKCIPHER
|
||||||
select CRYPTO_CHACHA20
|
select CRYPTO_LIB_CHACHA_GENERIC
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||||
|
|
||||||
|
config CRYPTO_POLY1305_NEON
|
||||||
|
tristate "Poly1305 hash function using scalar or NEON instructions"
|
||||||
|
depends on KERNEL_MODE_NEON
|
||||||
|
select CRYPTO_HASH
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||||
|
|
||||||
config CRYPTO_AES_ARM64_BS
|
config CRYPTO_AES_ARM64_BS
|
||||||
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
|
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
|
||||||
|
|
|
@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
|
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
|
||||||
sha512-arm64-y := sha512-glue.o sha512-core.o
|
sha512-arm64-y := sha512-glue.o sha512-core.o
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||||
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
|
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
|
||||||
|
poly1305-neon-y := poly1305-core.o poly1305-glue.o
|
||||||
|
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
|
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
|
||||||
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
|
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
|
||||||
|
@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
|
||||||
quiet_cmd_perlasm = PERLASM $@
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
cmd_perlasm = $(PERL) $(<) void $(@)
|
cmd_perlasm = $(PERL) $(<) void $(@)
|
||||||
|
|
||||||
|
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
|
||||||
|
$(call cmd,perlasm)
|
||||||
|
|
||||||
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
|
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
|
||||||
$(call cmd,perlasm)
|
$(call cmd,perlasm)
|
||||||
|
|
||||||
|
@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
|
||||||
$(call cmd,perlasm)
|
$(call cmd,perlasm)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
targets += sha256-core.S sha512-core.S
|
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
/*
|
/*
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
* ChaCha/XChaCha NEON helper functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
* published by the Free Software Foundation.
|
* published by the Free Software Foundation.
|
||||||
*
|
*
|
||||||
* Based on:
|
* Originally based on:
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2015 Martin Willi
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
@ -19,29 +19,27 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/assembler.h>
|
||||||
|
#include <asm/cache.h>
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.align 6
|
.align 6
|
||||||
|
|
||||||
ENTRY(chacha20_block_xor_neon)
|
/*
|
||||||
// x0: Input state matrix, s
|
* chacha_permute - permute one block
|
||||||
// x1: 1 data block output, o
|
*
|
||||||
// x2: 1 data block input, i
|
* Permute one 64-byte block where the state matrix is stored in the four NEON
|
||||||
|
* registers v0-v3. It performs matrix operations on four words in parallel,
|
||||||
|
* but requires shuffling to rearrange the words after each round.
|
||||||
|
*
|
||||||
|
* The round count is given in w3.
|
||||||
|
*
|
||||||
|
* Clobbers: w3, x10, v4, v12
|
||||||
|
*/
|
||||||
|
chacha_permute:
|
||||||
|
|
||||||
//
|
adr_l x10, ROT8
|
||||||
// This function encrypts one ChaCha20 block by loading the state matrix
|
ld1 {v12.4s}, [x10]
|
||||||
// in four NEON registers. It performs matrix operation on four words in
|
|
||||||
// parallel, but requires shuffling to rearrange the words after each
|
|
||||||
// round.
|
|
||||||
//
|
|
||||||
|
|
||||||
// x0..3 = s0..3
|
|
||||||
adr x3, ROT8
|
|
||||||
ld1 {v0.4s-v3.4s}, [x0]
|
|
||||||
ld1 {v8.4s-v11.4s}, [x0]
|
|
||||||
ld1 {v12.4s}, [x3]
|
|
||||||
|
|
||||||
mov x3, #10
|
|
||||||
|
|
||||||
.Ldoubleround:
|
.Ldoubleround:
|
||||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
|
||||||
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
ext v3.16b, v3.16b, v3.16b, #4
|
ext v3.16b, v3.16b, v3.16b, #4
|
||||||
|
|
||||||
subs x3, x3, #1
|
subs w3, w3, #2
|
||||||
b.ne .Ldoubleround
|
b.ne .Ldoubleround
|
||||||
|
|
||||||
|
ret
|
||||||
|
ENDPROC(chacha_permute)
|
||||||
|
|
||||||
|
ENTRY(chacha_block_xor_neon)
|
||||||
|
// x0: Input state matrix, s
|
||||||
|
// x1: 1 data block output, o
|
||||||
|
// x2: 1 data block input, i
|
||||||
|
// w3: nrounds
|
||||||
|
|
||||||
|
stp x29, x30, [sp, #-16]!
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
// x0..3 = s0..3
|
||||||
|
ld1 {v0.4s-v3.4s}, [x0]
|
||||||
|
ld1 {v8.4s-v11.4s}, [x0]
|
||||||
|
|
||||||
|
bl chacha_permute
|
||||||
|
|
||||||
ld1 {v4.16b-v7.16b}, [x2]
|
ld1 {v4.16b-v7.16b}, [x2]
|
||||||
|
|
||||||
// o0 = i0 ^ (x0 + s0)
|
// o0 = i0 ^ (x0 + s0)
|
||||||
|
@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
|
||||||
|
|
||||||
st1 {v0.16b-v3.16b}, [x1]
|
st1 {v0.16b-v3.16b}, [x1]
|
||||||
|
|
||||||
|
ldp x29, x30, [sp], #16
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_block_xor_neon)
|
ENDPROC(chacha_block_xor_neon)
|
||||||
|
|
||||||
|
ENTRY(hchacha_block_neon)
|
||||||
|
// x0: Input state matrix, s
|
||||||
|
// x1: output (8 32-bit words)
|
||||||
|
// w2: nrounds
|
||||||
|
|
||||||
|
stp x29, x30, [sp, #-16]!
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
ld1 {v0.4s-v3.4s}, [x0]
|
||||||
|
|
||||||
|
mov w3, w2
|
||||||
|
bl chacha_permute
|
||||||
|
|
||||||
|
st1 {v0.4s}, [x1], #16
|
||||||
|
st1 {v3.4s}, [x1]
|
||||||
|
|
||||||
|
ldp x29, x30, [sp], #16
|
||||||
|
ret
|
||||||
|
ENDPROC(hchacha_block_neon)
|
||||||
|
|
||||||
|
a0 .req w12
|
||||||
|
a1 .req w13
|
||||||
|
a2 .req w14
|
||||||
|
a3 .req w15
|
||||||
|
a4 .req w16
|
||||||
|
a5 .req w17
|
||||||
|
a6 .req w19
|
||||||
|
a7 .req w20
|
||||||
|
a8 .req w21
|
||||||
|
a9 .req w22
|
||||||
|
a10 .req w23
|
||||||
|
a11 .req w24
|
||||||
|
a12 .req w25
|
||||||
|
a13 .req w26
|
||||||
|
a14 .req w27
|
||||||
|
a15 .req w28
|
||||||
|
|
||||||
.align 6
|
.align 6
|
||||||
ENTRY(chacha20_4block_xor_neon)
|
ENTRY(chacha_4block_xor_neon)
|
||||||
|
frame_push 10
|
||||||
|
|
||||||
// x0: Input state matrix, s
|
// x0: Input state matrix, s
|
||||||
// x1: 4 data blocks output, o
|
// x1: 4 data blocks output, o
|
||||||
// x2: 4 data blocks input, i
|
// x2: 4 data blocks input, i
|
||||||
|
// w3: nrounds
|
||||||
|
// x4: byte count
|
||||||
|
|
||||||
|
adr_l x10, .Lpermute
|
||||||
|
and x5, x4, #63
|
||||||
|
add x10, x10, x5
|
||||||
|
add x11, x10, #64
|
||||||
|
|
||||||
//
|
//
|
||||||
// This function encrypts four consecutive ChaCha20 blocks by loading
|
// This function encrypts four consecutive ChaCha blocks by loading
|
||||||
// the state matrix in NEON registers four times. The algorithm performs
|
// the state matrix in NEON registers four times. The algorithm performs
|
||||||
// each operation on the corresponding word of each state matrix, hence
|
// each operation on the corresponding word of each state matrix, hence
|
||||||
// requires no word shuffling. For final XORing step we transpose the
|
// requires no word shuffling. For final XORing step we transpose the
|
||||||
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
||||||
// do XOR in NEON registers.
|
// do XOR in NEON registers.
|
||||||
//
|
//
|
||||||
adr x3, CTRINC // ... and ROT8
|
// At the same time, a fifth block is encrypted in parallel using
|
||||||
ld1 {v30.4s-v31.4s}, [x3]
|
// scalar registers
|
||||||
|
//
|
||||||
|
adr_l x9, CTRINC // ... and ROT8
|
||||||
|
ld1 {v30.4s-v31.4s}, [x9]
|
||||||
|
|
||||||
// x0..15[0-3] = s0..3[0..3]
|
// x0..15[0-3] = s0..3[0..3]
|
||||||
mov x4, x0
|
add x8, x0, #16
|
||||||
ld4r { v0.4s- v3.4s}, [x4], #16
|
ld4r { v0.4s- v3.4s}, [x0]
|
||||||
ld4r { v4.4s- v7.4s}, [x4], #16
|
ld4r { v4.4s- v7.4s}, [x8], #16
|
||||||
ld4r { v8.4s-v11.4s}, [x4], #16
|
ld4r { v8.4s-v11.4s}, [x8], #16
|
||||||
ld4r {v12.4s-v15.4s}, [x4]
|
ld4r {v12.4s-v15.4s}, [x8]
|
||||||
|
|
||||||
// x12 += counter values 0-3
|
mov a0, v0.s[0]
|
||||||
|
mov a1, v1.s[0]
|
||||||
|
mov a2, v2.s[0]
|
||||||
|
mov a3, v3.s[0]
|
||||||
|
mov a4, v4.s[0]
|
||||||
|
mov a5, v5.s[0]
|
||||||
|
mov a6, v6.s[0]
|
||||||
|
mov a7, v7.s[0]
|
||||||
|
mov a8, v8.s[0]
|
||||||
|
mov a9, v9.s[0]
|
||||||
|
mov a10, v10.s[0]
|
||||||
|
mov a11, v11.s[0]
|
||||||
|
mov a12, v12.s[0]
|
||||||
|
mov a13, v13.s[0]
|
||||||
|
mov a14, v14.s[0]
|
||||||
|
mov a15, v15.s[0]
|
||||||
|
|
||||||
|
// x12 += counter values 1-4
|
||||||
add v12.4s, v12.4s, v30.4s
|
add v12.4s, v12.4s, v30.4s
|
||||||
|
|
||||||
mov x3, #10
|
|
||||||
|
|
||||||
.Ldoubleround4:
|
.Ldoubleround4:
|
||||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||||
add v0.4s, v0.4s, v4.4s
|
add v0.4s, v0.4s, v4.4s
|
||||||
|
add a0, a0, a4
|
||||||
add v1.4s, v1.4s, v5.4s
|
add v1.4s, v1.4s, v5.4s
|
||||||
|
add a1, a1, a5
|
||||||
add v2.4s, v2.4s, v6.4s
|
add v2.4s, v2.4s, v6.4s
|
||||||
|
add a2, a2, a6
|
||||||
add v3.4s, v3.4s, v7.4s
|
add v3.4s, v3.4s, v7.4s
|
||||||
|
add a3, a3, a7
|
||||||
|
|
||||||
eor v12.16b, v12.16b, v0.16b
|
eor v12.16b, v12.16b, v0.16b
|
||||||
|
eor a12, a12, a0
|
||||||
eor v13.16b, v13.16b, v1.16b
|
eor v13.16b, v13.16b, v1.16b
|
||||||
|
eor a13, a13, a1
|
||||||
eor v14.16b, v14.16b, v2.16b
|
eor v14.16b, v14.16b, v2.16b
|
||||||
|
eor a14, a14, a2
|
||||||
eor v15.16b, v15.16b, v3.16b
|
eor v15.16b, v15.16b, v3.16b
|
||||||
|
eor a15, a15, a3
|
||||||
|
|
||||||
rev32 v12.8h, v12.8h
|
rev32 v12.8h, v12.8h
|
||||||
|
ror a12, a12, #16
|
||||||
rev32 v13.8h, v13.8h
|
rev32 v13.8h, v13.8h
|
||||||
|
ror a13, a13, #16
|
||||||
rev32 v14.8h, v14.8h
|
rev32 v14.8h, v14.8h
|
||||||
|
ror a14, a14, #16
|
||||||
rev32 v15.8h, v15.8h
|
rev32 v15.8h, v15.8h
|
||||||
|
ror a15, a15, #16
|
||||||
|
|
||||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||||
add v8.4s, v8.4s, v12.4s
|
add v8.4s, v8.4s, v12.4s
|
||||||
|
add a8, a8, a12
|
||||||
add v9.4s, v9.4s, v13.4s
|
add v9.4s, v9.4s, v13.4s
|
||||||
|
add a9, a9, a13
|
||||||
add v10.4s, v10.4s, v14.4s
|
add v10.4s, v10.4s, v14.4s
|
||||||
|
add a10, a10, a14
|
||||||
add v11.4s, v11.4s, v15.4s
|
add v11.4s, v11.4s, v15.4s
|
||||||
|
add a11, a11, a15
|
||||||
|
|
||||||
eor v16.16b, v4.16b, v8.16b
|
eor v16.16b, v4.16b, v8.16b
|
||||||
|
eor a4, a4, a8
|
||||||
eor v17.16b, v5.16b, v9.16b
|
eor v17.16b, v5.16b, v9.16b
|
||||||
|
eor a5, a5, a9
|
||||||
eor v18.16b, v6.16b, v10.16b
|
eor v18.16b, v6.16b, v10.16b
|
||||||
|
eor a6, a6, a10
|
||||||
eor v19.16b, v7.16b, v11.16b
|
eor v19.16b, v7.16b, v11.16b
|
||||||
|
eor a7, a7, a11
|
||||||
|
|
||||||
shl v4.4s, v16.4s, #12
|
shl v4.4s, v16.4s, #12
|
||||||
shl v5.4s, v17.4s, #12
|
shl v5.4s, v17.4s, #12
|
||||||
|
@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v7.4s, v19.4s, #12
|
shl v7.4s, v19.4s, #12
|
||||||
|
|
||||||
sri v4.4s, v16.4s, #20
|
sri v4.4s, v16.4s, #20
|
||||||
|
ror a4, a4, #20
|
||||||
sri v5.4s, v17.4s, #20
|
sri v5.4s, v17.4s, #20
|
||||||
|
ror a5, a5, #20
|
||||||
sri v6.4s, v18.4s, #20
|
sri v6.4s, v18.4s, #20
|
||||||
|
ror a6, a6, #20
|
||||||
sri v7.4s, v19.4s, #20
|
sri v7.4s, v19.4s, #20
|
||||||
|
ror a7, a7, #20
|
||||||
|
|
||||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||||
add v0.4s, v0.4s, v4.4s
|
add v0.4s, v0.4s, v4.4s
|
||||||
|
add a0, a0, a4
|
||||||
add v1.4s, v1.4s, v5.4s
|
add v1.4s, v1.4s, v5.4s
|
||||||
|
add a1, a1, a5
|
||||||
add v2.4s, v2.4s, v6.4s
|
add v2.4s, v2.4s, v6.4s
|
||||||
|
add a2, a2, a6
|
||||||
add v3.4s, v3.4s, v7.4s
|
add v3.4s, v3.4s, v7.4s
|
||||||
|
add a3, a3, a7
|
||||||
|
|
||||||
eor v12.16b, v12.16b, v0.16b
|
eor v12.16b, v12.16b, v0.16b
|
||||||
|
eor a12, a12, a0
|
||||||
eor v13.16b, v13.16b, v1.16b
|
eor v13.16b, v13.16b, v1.16b
|
||||||
|
eor a13, a13, a1
|
||||||
eor v14.16b, v14.16b, v2.16b
|
eor v14.16b, v14.16b, v2.16b
|
||||||
|
eor a14, a14, a2
|
||||||
eor v15.16b, v15.16b, v3.16b
|
eor v15.16b, v15.16b, v3.16b
|
||||||
|
eor a15, a15, a3
|
||||||
|
|
||||||
tbl v12.16b, {v12.16b}, v31.16b
|
tbl v12.16b, {v12.16b}, v31.16b
|
||||||
|
ror a12, a12, #24
|
||||||
tbl v13.16b, {v13.16b}, v31.16b
|
tbl v13.16b, {v13.16b}, v31.16b
|
||||||
|
ror a13, a13, #24
|
||||||
tbl v14.16b, {v14.16b}, v31.16b
|
tbl v14.16b, {v14.16b}, v31.16b
|
||||||
|
ror a14, a14, #24
|
||||||
tbl v15.16b, {v15.16b}, v31.16b
|
tbl v15.16b, {v15.16b}, v31.16b
|
||||||
|
ror a15, a15, #24
|
||||||
|
|
||||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||||
add v8.4s, v8.4s, v12.4s
|
add v8.4s, v8.4s, v12.4s
|
||||||
|
add a8, a8, a12
|
||||||
add v9.4s, v9.4s, v13.4s
|
add v9.4s, v9.4s, v13.4s
|
||||||
|
add a9, a9, a13
|
||||||
add v10.4s, v10.4s, v14.4s
|
add v10.4s, v10.4s, v14.4s
|
||||||
|
add a10, a10, a14
|
||||||
add v11.4s, v11.4s, v15.4s
|
add v11.4s, v11.4s, v15.4s
|
||||||
|
add a11, a11, a15
|
||||||
|
|
||||||
eor v16.16b, v4.16b, v8.16b
|
eor v16.16b, v4.16b, v8.16b
|
||||||
|
eor a4, a4, a8
|
||||||
eor v17.16b, v5.16b, v9.16b
|
eor v17.16b, v5.16b, v9.16b
|
||||||
|
eor a5, a5, a9
|
||||||
eor v18.16b, v6.16b, v10.16b
|
eor v18.16b, v6.16b, v10.16b
|
||||||
|
eor a6, a6, a10
|
||||||
eor v19.16b, v7.16b, v11.16b
|
eor v19.16b, v7.16b, v11.16b
|
||||||
|
eor a7, a7, a11
|
||||||
|
|
||||||
shl v4.4s, v16.4s, #7
|
shl v4.4s, v16.4s, #7
|
||||||
shl v5.4s, v17.4s, #7
|
shl v5.4s, v17.4s, #7
|
||||||
|
@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v7.4s, v19.4s, #7
|
shl v7.4s, v19.4s, #7
|
||||||
|
|
||||||
sri v4.4s, v16.4s, #25
|
sri v4.4s, v16.4s, #25
|
||||||
|
ror a4, a4, #25
|
||||||
sri v5.4s, v17.4s, #25
|
sri v5.4s, v17.4s, #25
|
||||||
|
ror a5, a5, #25
|
||||||
sri v6.4s, v18.4s, #25
|
sri v6.4s, v18.4s, #25
|
||||||
|
ror a6, a6, #25
|
||||||
sri v7.4s, v19.4s, #25
|
sri v7.4s, v19.4s, #25
|
||||||
|
ror a7, a7, #25
|
||||||
|
|
||||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||||
add v0.4s, v0.4s, v5.4s
|
add v0.4s, v0.4s, v5.4s
|
||||||
|
add a0, a0, a5
|
||||||
add v1.4s, v1.4s, v6.4s
|
add v1.4s, v1.4s, v6.4s
|
||||||
|
add a1, a1, a6
|
||||||
add v2.4s, v2.4s, v7.4s
|
add v2.4s, v2.4s, v7.4s
|
||||||
|
add a2, a2, a7
|
||||||
add v3.4s, v3.4s, v4.4s
|
add v3.4s, v3.4s, v4.4s
|
||||||
|
add a3, a3, a4
|
||||||
|
|
||||||
eor v15.16b, v15.16b, v0.16b
|
eor v15.16b, v15.16b, v0.16b
|
||||||
|
eor a15, a15, a0
|
||||||
eor v12.16b, v12.16b, v1.16b
|
eor v12.16b, v12.16b, v1.16b
|
||||||
|
eor a12, a12, a1
|
||||||
eor v13.16b, v13.16b, v2.16b
|
eor v13.16b, v13.16b, v2.16b
|
||||||
|
eor a13, a13, a2
|
||||||
eor v14.16b, v14.16b, v3.16b
|
eor v14.16b, v14.16b, v3.16b
|
||||||
|
eor a14, a14, a3
|
||||||
|
|
||||||
rev32 v15.8h, v15.8h
|
rev32 v15.8h, v15.8h
|
||||||
|
ror a15, a15, #16
|
||||||
rev32 v12.8h, v12.8h
|
rev32 v12.8h, v12.8h
|
||||||
|
ror a12, a12, #16
|
||||||
rev32 v13.8h, v13.8h
|
rev32 v13.8h, v13.8h
|
||||||
|
ror a13, a13, #16
|
||||||
rev32 v14.8h, v14.8h
|
rev32 v14.8h, v14.8h
|
||||||
|
ror a14, a14, #16
|
||||||
|
|
||||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||||
add v10.4s, v10.4s, v15.4s
|
add v10.4s, v10.4s, v15.4s
|
||||||
|
add a10, a10, a15
|
||||||
add v11.4s, v11.4s, v12.4s
|
add v11.4s, v11.4s, v12.4s
|
||||||
|
add a11, a11, a12
|
||||||
add v8.4s, v8.4s, v13.4s
|
add v8.4s, v8.4s, v13.4s
|
||||||
|
add a8, a8, a13
|
||||||
add v9.4s, v9.4s, v14.4s
|
add v9.4s, v9.4s, v14.4s
|
||||||
|
add a9, a9, a14
|
||||||
|
|
||||||
eor v16.16b, v5.16b, v10.16b
|
eor v16.16b, v5.16b, v10.16b
|
||||||
|
eor a5, a5, a10
|
||||||
eor v17.16b, v6.16b, v11.16b
|
eor v17.16b, v6.16b, v11.16b
|
||||||
|
eor a6, a6, a11
|
||||||
eor v18.16b, v7.16b, v8.16b
|
eor v18.16b, v7.16b, v8.16b
|
||||||
|
eor a7, a7, a8
|
||||||
eor v19.16b, v4.16b, v9.16b
|
eor v19.16b, v4.16b, v9.16b
|
||||||
|
eor a4, a4, a9
|
||||||
|
|
||||||
shl v5.4s, v16.4s, #12
|
shl v5.4s, v16.4s, #12
|
||||||
shl v6.4s, v17.4s, #12
|
shl v6.4s, v17.4s, #12
|
||||||
|
@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v4.4s, v19.4s, #12
|
shl v4.4s, v19.4s, #12
|
||||||
|
|
||||||
sri v5.4s, v16.4s, #20
|
sri v5.4s, v16.4s, #20
|
||||||
|
ror a5, a5, #20
|
||||||
sri v6.4s, v17.4s, #20
|
sri v6.4s, v17.4s, #20
|
||||||
|
ror a6, a6, #20
|
||||||
sri v7.4s, v18.4s, #20
|
sri v7.4s, v18.4s, #20
|
||||||
|
ror a7, a7, #20
|
||||||
sri v4.4s, v19.4s, #20
|
sri v4.4s, v19.4s, #20
|
||||||
|
ror a4, a4, #20
|
||||||
|
|
||||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||||
add v0.4s, v0.4s, v5.4s
|
add v0.4s, v0.4s, v5.4s
|
||||||
|
add a0, a0, a5
|
||||||
add v1.4s, v1.4s, v6.4s
|
add v1.4s, v1.4s, v6.4s
|
||||||
|
add a1, a1, a6
|
||||||
add v2.4s, v2.4s, v7.4s
|
add v2.4s, v2.4s, v7.4s
|
||||||
|
add a2, a2, a7
|
||||||
add v3.4s, v3.4s, v4.4s
|
add v3.4s, v3.4s, v4.4s
|
||||||
|
add a3, a3, a4
|
||||||
|
|
||||||
eor v15.16b, v15.16b, v0.16b
|
eor v15.16b, v15.16b, v0.16b
|
||||||
|
eor a15, a15, a0
|
||||||
eor v12.16b, v12.16b, v1.16b
|
eor v12.16b, v12.16b, v1.16b
|
||||||
|
eor a12, a12, a1
|
||||||
eor v13.16b, v13.16b, v2.16b
|
eor v13.16b, v13.16b, v2.16b
|
||||||
|
eor a13, a13, a2
|
||||||
eor v14.16b, v14.16b, v3.16b
|
eor v14.16b, v14.16b, v3.16b
|
||||||
|
eor a14, a14, a3
|
||||||
|
|
||||||
tbl v15.16b, {v15.16b}, v31.16b
|
tbl v15.16b, {v15.16b}, v31.16b
|
||||||
|
ror a15, a15, #24
|
||||||
tbl v12.16b, {v12.16b}, v31.16b
|
tbl v12.16b, {v12.16b}, v31.16b
|
||||||
|
ror a12, a12, #24
|
||||||
tbl v13.16b, {v13.16b}, v31.16b
|
tbl v13.16b, {v13.16b}, v31.16b
|
||||||
|
ror a13, a13, #24
|
||||||
tbl v14.16b, {v14.16b}, v31.16b
|
tbl v14.16b, {v14.16b}, v31.16b
|
||||||
|
ror a14, a14, #24
|
||||||
|
|
||||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||||
add v10.4s, v10.4s, v15.4s
|
add v10.4s, v10.4s, v15.4s
|
||||||
|
add a10, a10, a15
|
||||||
add v11.4s, v11.4s, v12.4s
|
add v11.4s, v11.4s, v12.4s
|
||||||
|
add a11, a11, a12
|
||||||
add v8.4s, v8.4s, v13.4s
|
add v8.4s, v8.4s, v13.4s
|
||||||
|
add a8, a8, a13
|
||||||
add v9.4s, v9.4s, v14.4s
|
add v9.4s, v9.4s, v14.4s
|
||||||
|
add a9, a9, a14
|
||||||
|
|
||||||
eor v16.16b, v5.16b, v10.16b
|
eor v16.16b, v5.16b, v10.16b
|
||||||
|
eor a5, a5, a10
|
||||||
eor v17.16b, v6.16b, v11.16b
|
eor v17.16b, v6.16b, v11.16b
|
||||||
|
eor a6, a6, a11
|
||||||
eor v18.16b, v7.16b, v8.16b
|
eor v18.16b, v7.16b, v8.16b
|
||||||
|
eor a7, a7, a8
|
||||||
eor v19.16b, v4.16b, v9.16b
|
eor v19.16b, v4.16b, v9.16b
|
||||||
|
eor a4, a4, a9
|
||||||
|
|
||||||
shl v5.4s, v16.4s, #7
|
shl v5.4s, v16.4s, #7
|
||||||
shl v6.4s, v17.4s, #7
|
shl v6.4s, v17.4s, #7
|
||||||
|
@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v4.4s, v19.4s, #7
|
shl v4.4s, v19.4s, #7
|
||||||
|
|
||||||
sri v5.4s, v16.4s, #25
|
sri v5.4s, v16.4s, #25
|
||||||
|
ror a5, a5, #25
|
||||||
sri v6.4s, v17.4s, #25
|
sri v6.4s, v17.4s, #25
|
||||||
|
ror a6, a6, #25
|
||||||
sri v7.4s, v18.4s, #25
|
sri v7.4s, v18.4s, #25
|
||||||
|
ror a7, a7, #25
|
||||||
sri v4.4s, v19.4s, #25
|
sri v4.4s, v19.4s, #25
|
||||||
|
ror a4, a4, #25
|
||||||
|
|
||||||
subs x3, x3, #1
|
subs w3, w3, #2
|
||||||
b.ne .Ldoubleround4
|
b.ne .Ldoubleround4
|
||||||
|
|
||||||
ld4r {v16.4s-v19.4s}, [x0], #16
|
ld4r {v16.4s-v19.4s}, [x0], #16
|
||||||
|
@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
// x2[0-3] += s0[2]
|
// x2[0-3] += s0[2]
|
||||||
// x3[0-3] += s0[3]
|
// x3[0-3] += s0[3]
|
||||||
add v0.4s, v0.4s, v16.4s
|
add v0.4s, v0.4s, v16.4s
|
||||||
|
mov w6, v16.s[0]
|
||||||
|
mov w7, v17.s[0]
|
||||||
add v1.4s, v1.4s, v17.4s
|
add v1.4s, v1.4s, v17.4s
|
||||||
|
mov w8, v18.s[0]
|
||||||
|
mov w9, v19.s[0]
|
||||||
add v2.4s, v2.4s, v18.4s
|
add v2.4s, v2.4s, v18.4s
|
||||||
|
add a0, a0, w6
|
||||||
|
add a1, a1, w7
|
||||||
add v3.4s, v3.4s, v19.4s
|
add v3.4s, v3.4s, v19.4s
|
||||||
|
add a2, a2, w8
|
||||||
|
add a3, a3, w9
|
||||||
|
CPU_BE( rev a0, a0 )
|
||||||
|
CPU_BE( rev a1, a1 )
|
||||||
|
CPU_BE( rev a2, a2 )
|
||||||
|
CPU_BE( rev a3, a3 )
|
||||||
|
|
||||||
ld4r {v24.4s-v27.4s}, [x0], #16
|
ld4r {v24.4s-v27.4s}, [x0], #16
|
||||||
ld4r {v28.4s-v31.4s}, [x0]
|
ld4r {v28.4s-v31.4s}, [x0]
|
||||||
|
@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
// x6[0-3] += s1[2]
|
// x6[0-3] += s1[2]
|
||||||
// x7[0-3] += s1[3]
|
// x7[0-3] += s1[3]
|
||||||
add v4.4s, v4.4s, v20.4s
|
add v4.4s, v4.4s, v20.4s
|
||||||
|
mov w6, v20.s[0]
|
||||||
|
mov w7, v21.s[0]
|
||||||
add v5.4s, v5.4s, v21.4s
|
add v5.4s, v5.4s, v21.4s
|
||||||
|
mov w8, v22.s[0]
|
||||||
|
mov w9, v23.s[0]
|
||||||
add v6.4s, v6.4s, v22.4s
|
add v6.4s, v6.4s, v22.4s
|
||||||
|
add a4, a4, w6
|
||||||
|
add a5, a5, w7
|
||||||
add v7.4s, v7.4s, v23.4s
|
add v7.4s, v7.4s, v23.4s
|
||||||
|
add a6, a6, w8
|
||||||
|
add a7, a7, w9
|
||||||
|
CPU_BE( rev a4, a4 )
|
||||||
|
CPU_BE( rev a5, a5 )
|
||||||
|
CPU_BE( rev a6, a6 )
|
||||||
|
CPU_BE( rev a7, a7 )
|
||||||
|
|
||||||
// x8[0-3] += s2[0]
|
// x8[0-3] += s2[0]
|
||||||
// x9[0-3] += s2[1]
|
// x9[0-3] += s2[1]
|
||||||
// x10[0-3] += s2[2]
|
// x10[0-3] += s2[2]
|
||||||
// x11[0-3] += s2[3]
|
// x11[0-3] += s2[3]
|
||||||
add v8.4s, v8.4s, v24.4s
|
add v8.4s, v8.4s, v24.4s
|
||||||
|
mov w6, v24.s[0]
|
||||||
|
mov w7, v25.s[0]
|
||||||
add v9.4s, v9.4s, v25.4s
|
add v9.4s, v9.4s, v25.4s
|
||||||
|
mov w8, v26.s[0]
|
||||||
|
mov w9, v27.s[0]
|
||||||
add v10.4s, v10.4s, v26.4s
|
add v10.4s, v10.4s, v26.4s
|
||||||
|
add a8, a8, w6
|
||||||
|
add a9, a9, w7
|
||||||
add v11.4s, v11.4s, v27.4s
|
add v11.4s, v11.4s, v27.4s
|
||||||
|
add a10, a10, w8
|
||||||
|
add a11, a11, w9
|
||||||
|
CPU_BE( rev a8, a8 )
|
||||||
|
CPU_BE( rev a9, a9 )
|
||||||
|
CPU_BE( rev a10, a10 )
|
||||||
|
CPU_BE( rev a11, a11 )
|
||||||
|
|
||||||
// x12[0-3] += s3[0]
|
// x12[0-3] += s3[0]
|
||||||
// x13[0-3] += s3[1]
|
// x13[0-3] += s3[1]
|
||||||
// x14[0-3] += s3[2]
|
// x14[0-3] += s3[2]
|
||||||
// x15[0-3] += s3[3]
|
// x15[0-3] += s3[3]
|
||||||
add v12.4s, v12.4s, v28.4s
|
add v12.4s, v12.4s, v28.4s
|
||||||
|
mov w6, v28.s[0]
|
||||||
|
mov w7, v29.s[0]
|
||||||
add v13.4s, v13.4s, v29.4s
|
add v13.4s, v13.4s, v29.4s
|
||||||
|
mov w8, v30.s[0]
|
||||||
|
mov w9, v31.s[0]
|
||||||
add v14.4s, v14.4s, v30.4s
|
add v14.4s, v14.4s, v30.4s
|
||||||
|
add a12, a12, w6
|
||||||
|
add a13, a13, w7
|
||||||
add v15.4s, v15.4s, v31.4s
|
add v15.4s, v15.4s, v31.4s
|
||||||
|
add a14, a14, w8
|
||||||
|
add a15, a15, w9
|
||||||
|
CPU_BE( rev a12, a12 )
|
||||||
|
CPU_BE( rev a13, a13 )
|
||||||
|
CPU_BE( rev a14, a14 )
|
||||||
|
CPU_BE( rev a15, a15 )
|
||||||
|
|
||||||
// interleave 32-bit words in state n, n+1
|
// interleave 32-bit words in state n, n+1
|
||||||
|
ldp w6, w7, [x2], #64
|
||||||
zip1 v16.4s, v0.4s, v1.4s
|
zip1 v16.4s, v0.4s, v1.4s
|
||||||
|
ldp w8, w9, [x2, #-56]
|
||||||
|
eor a0, a0, w6
|
||||||
zip2 v17.4s, v0.4s, v1.4s
|
zip2 v17.4s, v0.4s, v1.4s
|
||||||
|
eor a1, a1, w7
|
||||||
zip1 v18.4s, v2.4s, v3.4s
|
zip1 v18.4s, v2.4s, v3.4s
|
||||||
|
eor a2, a2, w8
|
||||||
zip2 v19.4s, v2.4s, v3.4s
|
zip2 v19.4s, v2.4s, v3.4s
|
||||||
|
eor a3, a3, w9
|
||||||
|
ldp w6, w7, [x2, #-48]
|
||||||
zip1 v20.4s, v4.4s, v5.4s
|
zip1 v20.4s, v4.4s, v5.4s
|
||||||
|
ldp w8, w9, [x2, #-40]
|
||||||
|
eor a4, a4, w6
|
||||||
zip2 v21.4s, v4.4s, v5.4s
|
zip2 v21.4s, v4.4s, v5.4s
|
||||||
|
eor a5, a5, w7
|
||||||
zip1 v22.4s, v6.4s, v7.4s
|
zip1 v22.4s, v6.4s, v7.4s
|
||||||
|
eor a6, a6, w8
|
||||||
zip2 v23.4s, v6.4s, v7.4s
|
zip2 v23.4s, v6.4s, v7.4s
|
||||||
|
eor a7, a7, w9
|
||||||
|
ldp w6, w7, [x2, #-32]
|
||||||
zip1 v24.4s, v8.4s, v9.4s
|
zip1 v24.4s, v8.4s, v9.4s
|
||||||
|
ldp w8, w9, [x2, #-24]
|
||||||
|
eor a8, a8, w6
|
||||||
zip2 v25.4s, v8.4s, v9.4s
|
zip2 v25.4s, v8.4s, v9.4s
|
||||||
|
eor a9, a9, w7
|
||||||
zip1 v26.4s, v10.4s, v11.4s
|
zip1 v26.4s, v10.4s, v11.4s
|
||||||
|
eor a10, a10, w8
|
||||||
zip2 v27.4s, v10.4s, v11.4s
|
zip2 v27.4s, v10.4s, v11.4s
|
||||||
|
eor a11, a11, w9
|
||||||
|
ldp w6, w7, [x2, #-16]
|
||||||
zip1 v28.4s, v12.4s, v13.4s
|
zip1 v28.4s, v12.4s, v13.4s
|
||||||
|
ldp w8, w9, [x2, #-8]
|
||||||
|
eor a12, a12, w6
|
||||||
zip2 v29.4s, v12.4s, v13.4s
|
zip2 v29.4s, v12.4s, v13.4s
|
||||||
|
eor a13, a13, w7
|
||||||
zip1 v30.4s, v14.4s, v15.4s
|
zip1 v30.4s, v14.4s, v15.4s
|
||||||
|
eor a14, a14, w8
|
||||||
zip2 v31.4s, v14.4s, v15.4s
|
zip2 v31.4s, v14.4s, v15.4s
|
||||||
|
eor a15, a15, w9
|
||||||
|
|
||||||
|
mov x3, #64
|
||||||
|
subs x5, x4, #128
|
||||||
|
add x6, x5, x2
|
||||||
|
csel x3, x3, xzr, ge
|
||||||
|
csel x2, x2, x6, ge
|
||||||
|
|
||||||
// interleave 64-bit words in state n, n+2
|
// interleave 64-bit words in state n, n+2
|
||||||
zip1 v0.2d, v16.2d, v18.2d
|
zip1 v0.2d, v16.2d, v18.2d
|
||||||
zip2 v4.2d, v16.2d, v18.2d
|
zip2 v4.2d, v16.2d, v18.2d
|
||||||
|
stp a0, a1, [x1], #64
|
||||||
zip1 v8.2d, v17.2d, v19.2d
|
zip1 v8.2d, v17.2d, v19.2d
|
||||||
zip2 v12.2d, v17.2d, v19.2d
|
zip2 v12.2d, v17.2d, v19.2d
|
||||||
ld1 {v16.16b-v19.16b}, [x2], #64
|
stp a2, a3, [x1, #-56]
|
||||||
|
ld1 {v16.16b-v19.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x6, x4, #192
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x7, x6, x2
|
||||||
|
csel x3, x3, xzr, eq
|
||||||
|
csel x2, x2, x7, eq
|
||||||
|
|
||||||
zip1 v1.2d, v20.2d, v22.2d
|
zip1 v1.2d, v20.2d, v22.2d
|
||||||
zip2 v5.2d, v20.2d, v22.2d
|
zip2 v5.2d, v20.2d, v22.2d
|
||||||
|
stp a4, a5, [x1, #-48]
|
||||||
zip1 v9.2d, v21.2d, v23.2d
|
zip1 v9.2d, v21.2d, v23.2d
|
||||||
zip2 v13.2d, v21.2d, v23.2d
|
zip2 v13.2d, v21.2d, v23.2d
|
||||||
ld1 {v20.16b-v23.16b}, [x2], #64
|
stp a6, a7, [x1, #-40]
|
||||||
|
ld1 {v20.16b-v23.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x7, x4, #256
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x8, x7, x2
|
||||||
|
csel x3, x3, xzr, eq
|
||||||
|
csel x2, x2, x8, eq
|
||||||
|
|
||||||
zip1 v2.2d, v24.2d, v26.2d
|
zip1 v2.2d, v24.2d, v26.2d
|
||||||
zip2 v6.2d, v24.2d, v26.2d
|
zip2 v6.2d, v24.2d, v26.2d
|
||||||
|
stp a8, a9, [x1, #-32]
|
||||||
zip1 v10.2d, v25.2d, v27.2d
|
zip1 v10.2d, v25.2d, v27.2d
|
||||||
zip2 v14.2d, v25.2d, v27.2d
|
zip2 v14.2d, v25.2d, v27.2d
|
||||||
ld1 {v24.16b-v27.16b}, [x2], #64
|
stp a10, a11, [x1, #-24]
|
||||||
|
ld1 {v24.16b-v27.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x8, x4, #320
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x9, x8, x2
|
||||||
|
csel x2, x2, x9, eq
|
||||||
|
|
||||||
zip1 v3.2d, v28.2d, v30.2d
|
zip1 v3.2d, v28.2d, v30.2d
|
||||||
zip2 v7.2d, v28.2d, v30.2d
|
zip2 v7.2d, v28.2d, v30.2d
|
||||||
|
stp a12, a13, [x1, #-16]
|
||||||
zip1 v11.2d, v29.2d, v31.2d
|
zip1 v11.2d, v29.2d, v31.2d
|
||||||
zip2 v15.2d, v29.2d, v31.2d
|
zip2 v15.2d, v29.2d, v31.2d
|
||||||
|
stp a14, a15, [x1, #-8]
|
||||||
ld1 {v28.16b-v31.16b}, [x2]
|
ld1 {v28.16b-v31.16b}, [x2]
|
||||||
|
|
||||||
// xor with corresponding input, write to output
|
// xor with corresponding input, write to output
|
||||||
|
tbnz x5, #63, 0f
|
||||||
eor v16.16b, v16.16b, v0.16b
|
eor v16.16b, v16.16b, v0.16b
|
||||||
eor v17.16b, v17.16b, v1.16b
|
eor v17.16b, v17.16b, v1.16b
|
||||||
eor v18.16b, v18.16b, v2.16b
|
eor v18.16b, v18.16b, v2.16b
|
||||||
eor v19.16b, v19.16b, v3.16b
|
eor v19.16b, v19.16b, v3.16b
|
||||||
|
st1 {v16.16b-v19.16b}, [x1], #64
|
||||||
|
cbz x5, .Lout
|
||||||
|
|
||||||
|
tbnz x6, #63, 1f
|
||||||
eor v20.16b, v20.16b, v4.16b
|
eor v20.16b, v20.16b, v4.16b
|
||||||
eor v21.16b, v21.16b, v5.16b
|
eor v21.16b, v21.16b, v5.16b
|
||||||
st1 {v16.16b-v19.16b}, [x1], #64
|
|
||||||
eor v22.16b, v22.16b, v6.16b
|
eor v22.16b, v22.16b, v6.16b
|
||||||
eor v23.16b, v23.16b, v7.16b
|
eor v23.16b, v23.16b, v7.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1], #64
|
||||||
|
cbz x6, .Lout
|
||||||
|
|
||||||
|
tbnz x7, #63, 2f
|
||||||
eor v24.16b, v24.16b, v8.16b
|
eor v24.16b, v24.16b, v8.16b
|
||||||
eor v25.16b, v25.16b, v9.16b
|
eor v25.16b, v25.16b, v9.16b
|
||||||
st1 {v20.16b-v23.16b}, [x1], #64
|
|
||||||
eor v26.16b, v26.16b, v10.16b
|
eor v26.16b, v26.16b, v10.16b
|
||||||
eor v27.16b, v27.16b, v11.16b
|
eor v27.16b, v27.16b, v11.16b
|
||||||
eor v28.16b, v28.16b, v12.16b
|
|
||||||
st1 {v24.16b-v27.16b}, [x1], #64
|
st1 {v24.16b-v27.16b}, [x1], #64
|
||||||
|
cbz x7, .Lout
|
||||||
|
|
||||||
|
tbnz x8, #63, 3f
|
||||||
|
eor v28.16b, v28.16b, v12.16b
|
||||||
eor v29.16b, v29.16b, v13.16b
|
eor v29.16b, v29.16b, v13.16b
|
||||||
eor v30.16b, v30.16b, v14.16b
|
eor v30.16b, v30.16b, v14.16b
|
||||||
eor v31.16b, v31.16b, v15.16b
|
eor v31.16b, v31.16b, v15.16b
|
||||||
st1 {v28.16b-v31.16b}, [x1]
|
st1 {v28.16b-v31.16b}, [x1]
|
||||||
|
|
||||||
|
.Lout: frame_pop
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_4block_xor_neon)
|
|
||||||
|
|
||||||
CTRINC: .word 0, 1, 2, 3
|
// fewer than 128 bytes of in/output
|
||||||
|
0: ld1 {v8.16b}, [x10]
|
||||||
|
ld1 {v9.16b}, [x11]
|
||||||
|
movi v10.16b, #16
|
||||||
|
sub x2, x1, #64
|
||||||
|
add x1, x1, x5
|
||||||
|
ld1 {v16.16b-v19.16b}, [x2]
|
||||||
|
tbl v4.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v5.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v6.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v7.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
|
||||||
|
eor v20.16b, v20.16b, v4.16b
|
||||||
|
eor v21.16b, v21.16b, v5.16b
|
||||||
|
eor v22.16b, v22.16b, v6.16b
|
||||||
|
eor v23.16b, v23.16b, v7.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 192 bytes of in/output
|
||||||
|
1: ld1 {v8.16b}, [x10]
|
||||||
|
ld1 {v9.16b}, [x11]
|
||||||
|
movi v10.16b, #16
|
||||||
|
add x1, x1, x6
|
||||||
|
tbl v0.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v1.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v2.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v3.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
|
||||||
|
eor v20.16b, v20.16b, v0.16b
|
||||||
|
eor v21.16b, v21.16b, v1.16b
|
||||||
|
eor v22.16b, v22.16b, v2.16b
|
||||||
|
eor v23.16b, v23.16b, v3.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 256 bytes of in/output
|
||||||
|
2: ld1 {v4.16b}, [x10]
|
||||||
|
ld1 {v5.16b}, [x11]
|
||||||
|
movi v6.16b, #16
|
||||||
|
add x1, x1, x7
|
||||||
|
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v24.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v1.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v25.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v2.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v26.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v3.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v27.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
|
||||||
|
eor v24.16b, v24.16b, v0.16b
|
||||||
|
eor v25.16b, v25.16b, v1.16b
|
||||||
|
eor v26.16b, v26.16b, v2.16b
|
||||||
|
eor v27.16b, v27.16b, v3.16b
|
||||||
|
st1 {v24.16b-v27.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 320 bytes of in/output
|
||||||
|
3: ld1 {v4.16b}, [x10]
|
||||||
|
ld1 {v5.16b}, [x11]
|
||||||
|
movi v6.16b, #16
|
||||||
|
add x1, x1, x8
|
||||||
|
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v28.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v1.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v29.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v2.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v30.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v3.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v31.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
|
||||||
|
eor v28.16b, v28.16b, v0.16b
|
||||||
|
eor v29.16b, v29.16b, v1.16b
|
||||||
|
eor v30.16b, v30.16b, v2.16b
|
||||||
|
eor v31.16b, v31.16b, v3.16b
|
||||||
|
st1 {v28.16b-v31.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
ENDPROC(chacha_4block_xor_neon)
|
||||||
|
|
||||||
|
.section ".rodata", "a", %progbits
|
||||||
|
.align L1_CACHE_SHIFT
|
||||||
|
.Lpermute:
|
||||||
|
.set .Li, 0
|
||||||
|
.rept 192
|
||||||
|
.byte (.Li - 64)
|
||||||
|
.set .Li, .Li + 1
|
||||||
|
.endr
|
||||||
|
|
||||||
|
CTRINC: .word 1, 2, 3, 4
|
||||||
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
|
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
|
|
@ -1,8 +1,8 @@
|
||||||
/*
|
/*
|
||||||
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
* ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
|
||||||
* including ChaCha20 (RFC7539)
|
* including ChaCha20 (RFC7539)
|
||||||
*
|
*
|
||||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
|
@ -20,8 +20,9 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
#include <crypto/algapi.h>
|
||||||
#include <crypto/chacha.h>
|
#include <crypto/internal/chacha.h>
|
||||||
#include <crypto/internal/skcipher.h>
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
@ -29,40 +30,78 @@
|
||||||
#include <asm/neon.h>
|
#include <asm/neon.h>
|
||||||
#include <asm/simd.h>
|
#include <asm/simd.h>
|
||||||
|
|
||||||
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||||
int nrounds);
|
|
||||||
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
|
||||||
int nrounds);
|
int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds, int bytes);
|
||||||
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||||
unsigned int bytes, int nrounds)
|
int bytes, int nrounds)
|
||||||
{
|
{
|
||||||
|
while (bytes > 0) {
|
||||||
|
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
|
||||||
|
|
||||||
|
if (l <= CHACHA_BLOCK_SIZE) {
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
u8 buf[CHACHA_BLOCK_SIZE];
|
||||||
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
memcpy(buf, src, l);
|
||||||
chacha_4block_xor_neon(state, dst, src, nrounds);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha_block_xor_neon(state, dst, src, nrounds);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha_block_xor_neon(state, buf, buf, nrounds);
|
chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||||
memcpy(dst, buf, bytes);
|
memcpy(dst, buf, l);
|
||||||
|
state[12] += 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
chacha_4block_xor_neon(state, dst, src, nrounds, l);
|
||||||
|
bytes -= l;
|
||||||
|
src += l;
|
||||||
|
dst += l;
|
||||||
|
state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&have_neon) || !may_use_simd()) {
|
||||||
|
hchacha_block_generic(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, stream, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
|
||||||
|
!may_use_simd())
|
||||||
|
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, dst, src, todo, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
static int chacha_neon_stream_xor(struct skcipher_request *req,
|
static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||||
struct chacha_ctx *ctx, u8 *iv)
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
{
|
{
|
||||||
struct skcipher_walk walk;
|
struct skcipher_walk walk;
|
||||||
u32 state[16];
|
u32 state[16];
|
||||||
|
@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, false);
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, iv);
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
while (walk.nbytes > 0) {
|
while (walk.nbytes > 0) {
|
||||||
unsigned int nbytes = walk.nbytes;
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
if (nbytes < walk.total)
|
if (nbytes < walk.total)
|
||||||
nbytes = round_down(nbytes, walk.stride);
|
nbytes = rounddown(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&have_neon) ||
|
||||||
|
!may_use_simd()) {
|
||||||
|
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
} else {
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
chacha_doneon(state, walk.dst.virt.addr,
|
||||||
nbytes, ctx->nrounds);
|
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
|
}
|
||||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
return chacha_neon_stream_xor(req, ctx, req->iv);
|
return chacha_neon_stream_xor(req, ctx, req->iv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
|
||||||
u32 state[16];
|
u32 state[16];
|
||||||
u8 real_iv[16];
|
u8 real_iv[16];
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
return crypto_xchacha_crypt(req);
|
hchacha_block_arch(state, subctx.key, ctx->nrounds);
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, req->iv);
|
|
||||||
|
|
||||||
kernel_neon_begin();
|
|
||||||
hchacha_block_neon(state, subctx.key, ctx->nrounds);
|
|
||||||
kernel_neon_end();
|
|
||||||
subctx.nrounds = ctx->nrounds;
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha20_setkey,
|
.setkey = chacha20_setkey,
|
||||||
.encrypt = chacha_neon,
|
.encrypt = chacha_neon,
|
||||||
.decrypt = chacha_neon,
|
.decrypt = chacha_neon,
|
||||||
}, {
|
}, {
|
||||||
|
@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = XCHACHA_IV_SIZE,
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha20_setkey,
|
.setkey = chacha20_setkey,
|
||||||
.encrypt = xchacha_neon,
|
.encrypt = xchacha_neon,
|
||||||
.decrypt = xchacha_neon,
|
.decrypt = xchacha_neon,
|
||||||
}, {
|
}, {
|
||||||
|
@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = XCHACHA_IV_SIZE,
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha12_setkey,
|
.setkey = chacha12_setkey,
|
||||||
.encrypt = xchacha_neon,
|
.encrypt = xchacha_neon,
|
||||||
.decrypt = xchacha_neon,
|
.decrypt = xchacha_neon,
|
||||||
}
|
}
|
||||||
|
@ -176,14 +213,18 @@ static struct skcipher_alg algs[] = {
|
||||||
|
|
||||||
static int __init chacha_simd_mod_init(void)
|
static int __init chacha_simd_mod_init(void)
|
||||||
{
|
{
|
||||||
if (!(elf_hwcap & HWCAP_NEON))
|
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||||
return -ENODEV;
|
return 0;
|
||||||
|
|
||||||
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
|
static_branch_enable(&have_neon);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
|
||||||
|
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __exit chacha_simd_mod_fini(void)
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
{
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
|
||||||
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,133 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
|
||||||
*
|
|
||||||
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
|
||||||
* published by the Free Software Foundation.
|
|
||||||
*
|
|
||||||
* Based on:
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
|
||||||
#include <crypto/chacha.h>
|
|
||||||
#include <crypto/internal/skcipher.h>
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/module.h>
|
|
||||||
|
|
||||||
#include <asm/hwcap.h>
|
|
||||||
#include <asm/neon.h>
|
|
||||||
#include <asm/simd.h>
|
|
||||||
|
|
||||||
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
|
|
||||||
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
|
|
||||||
unsigned int bytes)
|
|
||||||
{
|
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
|
||||||
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
|
||||||
kernel_neon_begin();
|
|
||||||
chacha20_4block_xor_neon(state, dst, src);
|
|
||||||
kernel_neon_end();
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!bytes)
|
|
||||||
return;
|
|
||||||
|
|
||||||
kernel_neon_begin();
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_block_xor_neon(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha20_block_xor_neon(state, buf, buf);
|
|
||||||
memcpy(dst, buf, bytes);
|
|
||||||
}
|
|
||||||
kernel_neon_end();
|
|
||||||
}
|
|
||||||
|
|
||||||
static int chacha20_neon(struct skcipher_request *req)
|
|
||||||
{
|
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
||||||
struct skcipher_walk walk;
|
|
||||||
u32 state[16];
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, false);
|
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, walk.iv);
|
|
||||||
|
|
||||||
while (walk.nbytes > 0) {
|
|
||||||
unsigned int nbytes = walk.nbytes;
|
|
||||||
|
|
||||||
if (nbytes < walk.total)
|
|
||||||
nbytes = round_down(nbytes, walk.stride);
|
|
||||||
|
|
||||||
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
nbytes);
|
|
||||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct skcipher_alg alg = {
|
|
||||||
.base.cra_name = "chacha20",
|
|
||||||
.base.cra_driver_name = "chacha20-neon",
|
|
||||||
.base.cra_priority = 300,
|
|
||||||
.base.cra_blocksize = 1,
|
|
||||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
|
||||||
.base.cra_module = THIS_MODULE,
|
|
||||||
|
|
||||||
.min_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
|
||||||
.setkey = crypto_chacha20_setkey,
|
|
||||||
.encrypt = chacha20_neon,
|
|
||||||
.decrypt = chacha20_neon,
|
|
||||||
};
|
|
||||||
|
|
||||||
static int __init chacha20_simd_mod_init(void)
|
|
||||||
{
|
|
||||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
return crypto_register_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __exit chacha20_simd_mod_fini(void)
|
|
||||||
{
|
|
||||||
crypto_unregister_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
module_init(chacha20_simd_mod_init);
|
|
||||||
module_exit(chacha20_simd_mod_fini);
|
|
||||||
|
|
||||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
|
||||||
MODULE_LICENSE("GPL v2");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20");
|
|
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
|
@ -0,0 +1,913 @@
|
||||||
|
#!/usr/bin/env perl
|
||||||
|
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
|
||||||
|
#
|
||||||
|
# ====================================================================
|
||||||
|
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
|
||||||
|
# project.
|
||||||
|
# ====================================================================
|
||||||
|
#
|
||||||
|
# This module implements Poly1305 hash for ARMv8.
|
||||||
|
#
|
||||||
|
# June 2015
|
||||||
|
#
|
||||||
|
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||||||
|
#
|
||||||
|
# IALU/gcc-4.9 NEON
|
||||||
|
#
|
||||||
|
# Apple A7 1.86/+5% 0.72
|
||||||
|
# Cortex-A53 2.69/+58% 1.47
|
||||||
|
# Cortex-A57 2.70/+7% 1.14
|
||||||
|
# Denver 1.64/+50% 1.18(*)
|
||||||
|
# X-Gene 2.13/+68% 2.27
|
||||||
|
# Mongoose 1.77/+75% 1.12
|
||||||
|
# Kryo 2.70/+55% 1.13
|
||||||
|
# ThunderX2 1.17/+95% 1.36
|
||||||
|
#
|
||||||
|
# (*) estimate based on resources availability is less than 1.0,
|
||||||
|
# i.e. measured result is worse than expected, presumably binary
|
||||||
|
# translator is not almighty;
|
||||||
|
|
||||||
|
$flavour=shift;
|
||||||
|
$output=shift;
|
||||||
|
|
||||||
|
if ($flavour && $flavour ne "void") {
|
||||||
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||||
|
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||||
|
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||||
|
die "can't locate arm-xlate.pl";
|
||||||
|
|
||||||
|
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||||
|
} else {
|
||||||
|
open STDOUT,">$output";
|
||||||
|
}
|
||||||
|
|
||||||
|
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
|
||||||
|
my ($mac,$nonce)=($inp,$len);
|
||||||
|
|
||||||
|
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
# include "arm_arch.h"
|
||||||
|
.extern OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// forward "declarations" are required for Apple
|
||||||
|
.globl poly1305_blocks
|
||||||
|
.globl poly1305_emit
|
||||||
|
|
||||||
|
.globl poly1305_init
|
||||||
|
.type poly1305_init,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_init:
|
||||||
|
cmp $inp,xzr
|
||||||
|
stp xzr,xzr,[$ctx] // zero hash value
|
||||||
|
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
|
||||||
|
csel x0,xzr,x0,eq
|
||||||
|
b.eq .Lno_key
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
adrp x17,OPENSSL_armcap_P
|
||||||
|
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ldp $r0,$r1,[$inp] // load key
|
||||||
|
mov $s1,#0xfffffffc0fffffff
|
||||||
|
movk $s1,#0x0fff,lsl#48
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $r0,$r0 // flip bytes
|
||||||
|
rev $r1,$r1
|
||||||
|
#endif
|
||||||
|
and $r0,$r0,$s1 // &=0ffffffc0fffffff
|
||||||
|
and $s1,$s1,#-4
|
||||||
|
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
|
||||||
|
mov w#$s1,#-1
|
||||||
|
stp $r0,$r1,[$ctx,#32] // save key value
|
||||||
|
str w#$s1,[$ctx,#48] // impossible key power value
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
tst w17,#ARMV7_NEON
|
||||||
|
|
||||||
|
adr $d0,.Lpoly1305_blocks
|
||||||
|
adr $r0,.Lpoly1305_blocks_neon
|
||||||
|
adr $d1,.Lpoly1305_emit
|
||||||
|
|
||||||
|
csel $d0,$d0,$r0,eq
|
||||||
|
|
||||||
|
# ifdef __ILP32__
|
||||||
|
stp w#$d0,w#$d1,[$len]
|
||||||
|
# else
|
||||||
|
stp $d0,$d1,[$len]
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
mov x0,#1
|
||||||
|
.Lno_key:
|
||||||
|
ret
|
||||||
|
.size poly1305_init,.-poly1305_init
|
||||||
|
|
||||||
|
.type poly1305_blocks,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks:
|
||||||
|
.Lpoly1305_blocks:
|
||||||
|
ands $len,$len,#-16
|
||||||
|
b.eq .Lno_data
|
||||||
|
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash value
|
||||||
|
ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr $d0,$h0,#32
|
||||||
|
mov w#$d1,w#$h0
|
||||||
|
lsr $d2,$h1,#32
|
||||||
|
mov w15,w#$h1
|
||||||
|
lsr x16,$h2,#32
|
||||||
|
#else
|
||||||
|
mov w#$d0,w#$h0
|
||||||
|
lsr $d1,$h0,#32
|
||||||
|
mov w#$d2,w#$h1
|
||||||
|
lsr x15,$h1,#32
|
||||||
|
mov w16,w#$h2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $d1,$d2,#12
|
||||||
|
adds $d0,$d0,$d2,lsl#52
|
||||||
|
add $d1,$d1,x15,lsl#14
|
||||||
|
adc $d1,$d1,xzr
|
||||||
|
lsr $d2,x16,#24
|
||||||
|
adds $d1,$d1,x16,lsl#40
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
|
||||||
|
cmp x17,#0 // is_base2_26?
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
csel $h0,$h0,$d0,eq // choose between radixes
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
csel $h2,$h2,$d2,eq
|
||||||
|
|
||||||
|
.Loop:
|
||||||
|
ldp $t0,$t1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $t0,$t0
|
||||||
|
rev $t1,$t1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$t0 // accumulate input
|
||||||
|
adcs $h1,$h1,$t1
|
||||||
|
|
||||||
|
mul $d0,$h0,$r0 // h0*r0
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
umulh $d1,$h0,$r0
|
||||||
|
|
||||||
|
mul $t0,$h1,$s1 // h1*5*r1
|
||||||
|
umulh $t1,$h1,$s1
|
||||||
|
|
||||||
|
adds $d0,$d0,$t0
|
||||||
|
mul $t0,$h0,$r1 // h0*r1
|
||||||
|
adc $d1,$d1,$t1
|
||||||
|
umulh $d2,$h0,$r1
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h1,$r0 // h1*r0
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
umulh $t1,$h1,$r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h2,$s1 // h2*5*r1
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
mul $t1,$h2,$r0 // h2*r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
|
||||||
|
and $t0,$d2,#-4 // final reduction
|
||||||
|
and $h2,$d2,#3
|
||||||
|
add $t0,$t0,$d2,lsr#2
|
||||||
|
adds $h0,$d0,$t0
|
||||||
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
|
cbnz $len,.Loop
|
||||||
|
|
||||||
|
stp $h0,$h1,[$ctx] // store hash value
|
||||||
|
stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
|
||||||
|
|
||||||
|
.Lno_data:
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks,.-poly1305_blocks
|
||||||
|
|
||||||
|
.type poly1305_emit,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_emit:
|
||||||
|
.Lpoly1305_emit:
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||||
|
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
ldp $t0,$t1,[$nonce] // load nonce
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr $d0,$h0,#32
|
||||||
|
mov w#$d1,w#$h0
|
||||||
|
lsr $d2,$h1,#32
|
||||||
|
mov w15,w#$h1
|
||||||
|
lsr x16,$h2,#32
|
||||||
|
#else
|
||||||
|
mov w#$d0,w#$h0
|
||||||
|
lsr $d1,$h0,#32
|
||||||
|
mov w#$d2,w#$h1
|
||||||
|
lsr x15,$h1,#32
|
||||||
|
mov w16,w#$h2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $d1,$d2,#12
|
||||||
|
adds $d0,$d0,$d2,lsl#52
|
||||||
|
add $d1,$d1,x15,lsl#14
|
||||||
|
adc $d1,$d1,xzr
|
||||||
|
lsr $d2,x16,#24
|
||||||
|
adds $d1,$d1,x16,lsl#40
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
|
||||||
|
cmp $r0,#0 // is_base2_26?
|
||||||
|
csel $h0,$h0,$d0,eq // choose between radixes
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
csel $h2,$h2,$d2,eq
|
||||||
|
|
||||||
|
adds $d0,$h0,#5 // compare to modulus
|
||||||
|
adcs $d1,$h1,xzr
|
||||||
|
adc $d2,$h2,xzr
|
||||||
|
|
||||||
|
tst $d2,#-4 // see if it's carried/borrowed
|
||||||
|
|
||||||
|
csel $h0,$h0,$d0,eq
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
ror $t0,$t0,#32 // flip nonce words
|
||||||
|
ror $t1,$t1,#32
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$t0 // accumulate nonce
|
||||||
|
adc $h1,$h1,$t1
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $h0,$h0 // flip output bytes
|
||||||
|
rev $h1,$h1
|
||||||
|
#endif
|
||||||
|
stp $h0,$h1,[$mac] // write result
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_emit,.-poly1305_emit
|
||||||
|
___
|
||||||
|
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
||||||
|
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
||||||
|
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
||||||
|
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
||||||
|
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
||||||
|
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
||||||
|
|
||||||
|
my ($in2,$zeros)=("x16","x17");
|
||||||
|
my $is_base2_26 = $zeros; # borrow
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
.type poly1305_mult,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_mult:
|
||||||
|
mul $d0,$h0,$r0 // h0*r0
|
||||||
|
umulh $d1,$h0,$r0
|
||||||
|
|
||||||
|
mul $t0,$h1,$s1 // h1*5*r1
|
||||||
|
umulh $t1,$h1,$s1
|
||||||
|
|
||||||
|
adds $d0,$d0,$t0
|
||||||
|
mul $t0,$h0,$r1 // h0*r1
|
||||||
|
adc $d1,$d1,$t1
|
||||||
|
umulh $d2,$h0,$r1
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h1,$r0 // h1*r0
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
umulh $t1,$h1,$r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h2,$s1 // h2*5*r1
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
mul $t1,$h2,$r0 // h2*r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
|
||||||
|
and $t0,$d2,#-4 // final reduction
|
||||||
|
and $h2,$d2,#3
|
||||||
|
add $t0,$t0,$d2,lsr#2
|
||||||
|
adds $h0,$d0,$t0
|
||||||
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_mult,.-poly1305_mult
|
||||||
|
|
||||||
|
.type poly1305_splat,%function
|
||||||
|
.align 4
|
||||||
|
poly1305_splat:
|
||||||
|
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x13,$h0,#26,#26
|
||||||
|
extr x14,$h1,$h0,#52
|
||||||
|
and x14,x14,#0x03ffffff
|
||||||
|
ubfx x15,$h1,#14,#26
|
||||||
|
extr x16,$h2,$h1,#40
|
||||||
|
|
||||||
|
str w12,[$ctx,#16*0] // r0
|
||||||
|
add w12,w13,w13,lsl#2 // r1*5
|
||||||
|
str w13,[$ctx,#16*1] // r1
|
||||||
|
add w13,w14,w14,lsl#2 // r2*5
|
||||||
|
str w12,[$ctx,#16*2] // s1
|
||||||
|
str w14,[$ctx,#16*3] // r2
|
||||||
|
add w14,w15,w15,lsl#2 // r3*5
|
||||||
|
str w13,[$ctx,#16*4] // s2
|
||||||
|
str w15,[$ctx,#16*5] // r3
|
||||||
|
add w15,w16,w16,lsl#2 // r4*5
|
||||||
|
str w14,[$ctx,#16*6] // s3
|
||||||
|
str w16,[$ctx,#16*7] // r4
|
||||||
|
str w15,[$ctx,#16*8] // s4
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_splat,.-poly1305_splat
|
||||||
|
|
||||||
|
#ifdef __KERNEL__
|
||||||
|
.globl poly1305_blocks_neon
|
||||||
|
#endif
|
||||||
|
.type poly1305_blocks_neon,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks_neon:
|
||||||
|
.Lpoly1305_blocks_neon:
|
||||||
|
ldr $is_base2_26,[$ctx,#24]
|
||||||
|
cmp $len,#128
|
||||||
|
b.lo .Lpoly1305_blocks
|
||||||
|
|
||||||
|
.inst 0xd503233f // paciasp
|
||||||
|
stp x29,x30,[sp,#-80]!
|
||||||
|
add x29,sp,#0
|
||||||
|
|
||||||
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
stp d10,d11,[sp,#32]
|
||||||
|
stp d12,d13,[sp,#48]
|
||||||
|
stp d14,d15,[sp,#64]
|
||||||
|
|
||||||
|
cbz $is_base2_26,.Lbase2_64_neon
|
||||||
|
|
||||||
|
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||||
|
ldp w12,w13,[$ctx,#8]
|
||||||
|
ldr w14,[$ctx,#16]
|
||||||
|
|
||||||
|
tst $len,#31
|
||||||
|
b.eq .Leven_neon
|
||||||
|
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $h1,x12,#12
|
||||||
|
adds $h0,$h0,x12,lsl#52
|
||||||
|
add $h1,$h1,x13,lsl#14
|
||||||
|
adc $h1,$h1,xzr
|
||||||
|
lsr $h2,x14,#24
|
||||||
|
adds $h1,$h1,x14,lsl#40
|
||||||
|
adc $d2,$h2,xzr // can be partially reduced...
|
||||||
|
|
||||||
|
ldp $d0,$d1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $d0,$d0
|
||||||
|
rev $d1,$d1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$d0 // accumulate input
|
||||||
|
adcs $h1,$h1,$d1
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,$h0,#26,#26
|
||||||
|
extr x12,$h1,$h0,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,$h1,#14,#26
|
||||||
|
extr x14,$h2,$h1,#40
|
||||||
|
|
||||||
|
b .Leven_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Lbase2_64_neon:
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
||||||
|
ldr $h2,[$ctx,#16]
|
||||||
|
|
||||||
|
tst $len,#31
|
||||||
|
b.eq .Linit_neon
|
||||||
|
|
||||||
|
ldp $d0,$d1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $d0,$d0
|
||||||
|
rev $d1,$d1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$d0 // accumulate input
|
||||||
|
adcs $h1,$h1,$d1
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
.Linit_neon:
|
||||||
|
ldr w17,[$ctx,#48] // first table element
|
||||||
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,$h0,#26,#26
|
||||||
|
extr x12,$h1,$h0,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,$h1,#14,#26
|
||||||
|
extr x14,$h2,$h1,#40
|
||||||
|
|
||||||
|
cmp w17,#-1 // is value impossible?
|
||||||
|
b.ne .Leven_neon
|
||||||
|
|
||||||
|
fmov ${H0},x10
|
||||||
|
fmov ${H1},x11
|
||||||
|
fmov ${H2},x12
|
||||||
|
fmov ${H3},x13
|
||||||
|
fmov ${H4},x14
|
||||||
|
|
||||||
|
////////////////////////////////// initialize r^n table
|
||||||
|
mov $h0,$r0 // r^1
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
mov $h1,$r1
|
||||||
|
mov $h2,xzr
|
||||||
|
add $ctx,$ctx,#48+12
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^2
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^3
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^4
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
sub $ctx,$ctx,#48 // restore original $ctx
|
||||||
|
b .Ldo_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Leven_neon:
|
||||||
|
fmov ${H0},x10
|
||||||
|
fmov ${H1},x11
|
||||||
|
fmov ${H2},x12
|
||||||
|
fmov ${H3},x13
|
||||||
|
fmov ${H4},x14
|
||||||
|
|
||||||
|
.Ldo_neon:
|
||||||
|
ldp x8,x12,[$inp,#32] // inp[2:3]
|
||||||
|
subs $len,$len,#64
|
||||||
|
ldp x9,x13,[$inp,#48]
|
||||||
|
add $in2,$inp,#96
|
||||||
|
adr $zeros,.Lzeros
|
||||||
|
|
||||||
|
lsl $padbit,$padbit,#24
|
||||||
|
add x15,$ctx,#48
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov $IN23_0,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov $IN23_1,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
fmov $IN23_2,x8
|
||||||
|
fmov $IN23_3,x10
|
||||||
|
fmov $IN23_4,x12
|
||||||
|
|
||||||
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||||
|
ldp x9,x13,[$inp],#48
|
||||||
|
|
||||||
|
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
||||||
|
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
||||||
|
ld1 {$S4},[x15]
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov $IN01_0,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov $IN01_1,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
movi $MASK.2d,#-1
|
||||||
|
fmov $IN01_2,x8
|
||||||
|
fmov $IN01_3,x10
|
||||||
|
fmov $IN01_4,x12
|
||||||
|
ushr $MASK.2d,$MASK.2d,#38
|
||||||
|
|
||||||
|
b.ls .Lskip_loop
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_neon:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||||
|
// \___________________/
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||||
|
// \___________________/ \____________________/
|
||||||
|
//
|
||||||
|
// Note that we start with inp[2:3]*r^2. This is because it
|
||||||
|
// doesn't depend on reduction in previous iteration.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||||
|
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||||
|
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||||
|
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||||
|
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||||
|
|
||||||
|
subs $len,$len,#64
|
||||||
|
umull $ACC4,$IN23_0,${R4}[2]
|
||||||
|
csel $in2,$zeros,$in2,lo
|
||||||
|
umull $ACC3,$IN23_0,${R3}[2]
|
||||||
|
umull $ACC2,$IN23_0,${R2}[2]
|
||||||
|
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||||
|
umull $ACC1,$IN23_0,${R1}[2]
|
||||||
|
ldp x9,x13,[$in2],#48
|
||||||
|
umull $ACC0,$IN23_0,${R0}[2]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_1,${R3}[2]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal $ACC3,$IN23_1,${R2}[2]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal $ACC2,$IN23_1,${R1}[2]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal $ACC1,$IN23_1,${R0}[2]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
umlal $ACC0,$IN23_1,${S4}[2]
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_2,${R2}[2]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal $ACC3,$IN23_2,${R1}[2]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal $ACC2,$IN23_2,${R0}[2]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal $ACC1,$IN23_2,${S4}[2]
|
||||||
|
fmov $IN23_0,x4
|
||||||
|
umlal $ACC0,$IN23_2,${S3}[2]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_3,${R1}[2]
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal $ACC3,$IN23_3,${R0}[2]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal $ACC2,$IN23_3,${S4}[2]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal $ACC1,$IN23_3,${S3}[2]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal $ACC0,$IN23_3,${S2}[2]
|
||||||
|
fmov $IN23_1,x6
|
||||||
|
|
||||||
|
add $IN01_2,$IN01_2,$H2
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
umlal $ACC4,$IN23_4,${R0}[2]
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
umlal $ACC3,$IN23_4,${S4}[2]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal $ACC2,$IN23_4,${S3}[2]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal $ACC1,$IN23_4,${S2}[2]
|
||||||
|
fmov $IN23_2,x8
|
||||||
|
umlal $ACC0,$IN23_4,${S1}[2]
|
||||||
|
fmov $IN23_3,x10
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4 and accumulate
|
||||||
|
|
||||||
|
add $IN01_0,$IN01_0,$H0
|
||||||
|
fmov $IN23_4,x12
|
||||||
|
umlal $ACC3,$IN01_2,${R1}[0]
|
||||||
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||||
|
umlal $ACC0,$IN01_2,${S3}[0]
|
||||||
|
ldp x9,x13,[$inp],#48
|
||||||
|
umlal $ACC4,$IN01_2,${R2}[0]
|
||||||
|
umlal $ACC1,$IN01_2,${S4}[0]
|
||||||
|
umlal $ACC2,$IN01_2,${R0}[0]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $IN01_1,$IN01_1,$H1
|
||||||
|
umlal $ACC3,$IN01_0,${R3}[0]
|
||||||
|
umlal $ACC4,$IN01_0,${R4}[0]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal $ACC2,$IN01_0,${R2}[0]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal $ACC0,$IN01_0,${R0}[0]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal $ACC1,$IN01_0,${R1}[0]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
|
||||||
|
add $IN01_3,$IN01_3,$H3
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
umlal $ACC3,$IN01_1,${R2}[0]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal $ACC4,$IN01_1,${R3}[0]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal $ACC0,$IN01_1,${S4}[0]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal $ACC2,$IN01_1,${R1}[0]
|
||||||
|
fmov $IN01_0,x4
|
||||||
|
umlal $ACC1,$IN01_1,${R0}[0]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
add $IN01_4,$IN01_4,$H4
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal $ACC3,$IN01_3,${R0}[0]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal $ACC0,$IN01_3,${S2}[0]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal $ACC4,$IN01_3,${R1}[0]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal $ACC1,$IN01_3,${S3}[0]
|
||||||
|
fmov $IN01_1,x6
|
||||||
|
umlal $ACC2,$IN01_3,${S4}[0]
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
|
||||||
|
umlal $ACC3,$IN01_4,${S4}[0]
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
umlal $ACC0,$IN01_4,${S1}[0]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal $ACC4,$IN01_4,${R0}[0]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal $ACC1,$IN01_4,${S2}[0]
|
||||||
|
fmov $IN01_2,x8
|
||||||
|
umlal $ACC2,$IN01_4,${S3}[0]
|
||||||
|
fmov $IN01_3,x10
|
||||||
|
fmov $IN01_4,x12
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||||
|
// and P. Schwabe
|
||||||
|
//
|
||||||
|
// [see discussion in poly1305-armv4 module]
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC3,#26
|
||||||
|
xtn $H3,$ACC3
|
||||||
|
ushr $T1.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||||
|
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
||||||
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC4,#26
|
||||||
|
xtn $H4,$ACC4
|
||||||
|
ushr $T1.2d,$ACC1,#26
|
||||||
|
xtn $H1,$ACC1
|
||||||
|
bic $H4,#0xfc,lsl#24
|
||||||
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||||
|
|
||||||
|
add $ACC0,$ACC0,$T0.2d
|
||||||
|
shl $T0.2d,$T0.2d,#2
|
||||||
|
shrn $T1.2s,$ACC2,#26
|
||||||
|
xtn $H2,$ACC2
|
||||||
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||||
|
bic $H1,#0xfc,lsl#24
|
||||||
|
add $H3,$H3,$T1.2s // h2 -> h3
|
||||||
|
bic $H2,#0xfc,lsl#24
|
||||||
|
|
||||||
|
shrn $T0.2s,$ACC0,#26
|
||||||
|
xtn $H0,$ACC0
|
||||||
|
ushr $T1.2s,$H3,#26
|
||||||
|
bic $H3,#0xfc,lsl#24
|
||||||
|
bic $H0,#0xfc,lsl#24
|
||||||
|
add $H1,$H1,$T0.2s // h0 -> h1
|
||||||
|
add $H4,$H4,$T1.2s // h3 -> h4
|
||||||
|
|
||||||
|
b.hi .Loop_neon
|
||||||
|
|
||||||
|
.Lskip_loop:
|
||||||
|
dup $IN23_2,${IN23_2}[0]
|
||||||
|
add $IN01_2,$IN01_2,$H2
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||||
|
|
||||||
|
adds $len,$len,#32
|
||||||
|
b.ne .Long_tail
|
||||||
|
|
||||||
|
dup $IN23_2,${IN01_2}[0]
|
||||||
|
add $IN23_0,$IN01_0,$H0
|
||||||
|
add $IN23_3,$IN01_3,$H3
|
||||||
|
add $IN23_1,$IN01_1,$H1
|
||||||
|
add $IN23_4,$IN01_4,$H4
|
||||||
|
|
||||||
|
.Long_tail:
|
||||||
|
dup $IN23_0,${IN23_0}[0]
|
||||||
|
umull2 $ACC0,$IN23_2,${S3}
|
||||||
|
umull2 $ACC3,$IN23_2,${R1}
|
||||||
|
umull2 $ACC4,$IN23_2,${R2}
|
||||||
|
umull2 $ACC2,$IN23_2,${R0}
|
||||||
|
umull2 $ACC1,$IN23_2,${S4}
|
||||||
|
|
||||||
|
dup $IN23_1,${IN23_1}[0]
|
||||||
|
umlal2 $ACC0,$IN23_0,${R0}
|
||||||
|
umlal2 $ACC2,$IN23_0,${R2}
|
||||||
|
umlal2 $ACC3,$IN23_0,${R3}
|
||||||
|
umlal2 $ACC4,$IN23_0,${R4}
|
||||||
|
umlal2 $ACC1,$IN23_0,${R1}
|
||||||
|
|
||||||
|
dup $IN23_3,${IN23_3}[0]
|
||||||
|
umlal2 $ACC0,$IN23_1,${S4}
|
||||||
|
umlal2 $ACC3,$IN23_1,${R2}
|
||||||
|
umlal2 $ACC2,$IN23_1,${R1}
|
||||||
|
umlal2 $ACC4,$IN23_1,${R3}
|
||||||
|
umlal2 $ACC1,$IN23_1,${R0}
|
||||||
|
|
||||||
|
dup $IN23_4,${IN23_4}[0]
|
||||||
|
umlal2 $ACC3,$IN23_3,${R0}
|
||||||
|
umlal2 $ACC4,$IN23_3,${R1}
|
||||||
|
umlal2 $ACC0,$IN23_3,${S2}
|
||||||
|
umlal2 $ACC1,$IN23_3,${S3}
|
||||||
|
umlal2 $ACC2,$IN23_3,${S4}
|
||||||
|
|
||||||
|
umlal2 $ACC3,$IN23_4,${S4}
|
||||||
|
umlal2 $ACC0,$IN23_4,${S1}
|
||||||
|
umlal2 $ACC4,$IN23_4,${R0}
|
||||||
|
umlal2 $ACC1,$IN23_4,${S2}
|
||||||
|
umlal2 $ACC2,$IN23_4,${S3}
|
||||||
|
|
||||||
|
b.eq .Lshort_tail
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||||
|
|
||||||
|
add $IN01_0,$IN01_0,$H0
|
||||||
|
umlal $ACC3,$IN01_2,${R1}
|
||||||
|
umlal $ACC0,$IN01_2,${S3}
|
||||||
|
umlal $ACC4,$IN01_2,${R2}
|
||||||
|
umlal $ACC1,$IN01_2,${S4}
|
||||||
|
umlal $ACC2,$IN01_2,${R0}
|
||||||
|
|
||||||
|
add $IN01_1,$IN01_1,$H1
|
||||||
|
umlal $ACC3,$IN01_0,${R3}
|
||||||
|
umlal $ACC0,$IN01_0,${R0}
|
||||||
|
umlal $ACC4,$IN01_0,${R4}
|
||||||
|
umlal $ACC1,$IN01_0,${R1}
|
||||||
|
umlal $ACC2,$IN01_0,${R2}
|
||||||
|
|
||||||
|
add $IN01_3,$IN01_3,$H3
|
||||||
|
umlal $ACC3,$IN01_1,${R2}
|
||||||
|
umlal $ACC0,$IN01_1,${S4}
|
||||||
|
umlal $ACC4,$IN01_1,${R3}
|
||||||
|
umlal $ACC1,$IN01_1,${R0}
|
||||||
|
umlal $ACC2,$IN01_1,${R1}
|
||||||
|
|
||||||
|
add $IN01_4,$IN01_4,$H4
|
||||||
|
umlal $ACC3,$IN01_3,${R0}
|
||||||
|
umlal $ACC0,$IN01_3,${S2}
|
||||||
|
umlal $ACC4,$IN01_3,${R1}
|
||||||
|
umlal $ACC1,$IN01_3,${S3}
|
||||||
|
umlal $ACC2,$IN01_3,${S4}
|
||||||
|
|
||||||
|
umlal $ACC3,$IN01_4,${S4}
|
||||||
|
umlal $ACC0,$IN01_4,${S1}
|
||||||
|
umlal $ACC4,$IN01_4,${R0}
|
||||||
|
umlal $ACC1,$IN01_4,${S2}
|
||||||
|
umlal $ACC2,$IN01_4,${S3}
|
||||||
|
|
||||||
|
.Lshort_tail:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// horizontal add
|
||||||
|
|
||||||
|
addp $ACC3,$ACC3,$ACC3
|
||||||
|
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
addp $ACC0,$ACC0,$ACC0
|
||||||
|
ldp d10,d11,[sp,#32]
|
||||||
|
addp $ACC4,$ACC4,$ACC4
|
||||||
|
ldp d12,d13,[sp,#48]
|
||||||
|
addp $ACC1,$ACC1,$ACC1
|
||||||
|
ldp d14,d15,[sp,#64]
|
||||||
|
addp $ACC2,$ACC2,$ACC2
|
||||||
|
ldr x30,[sp,#8]
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction, but without narrowing
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC3,#26
|
||||||
|
and $ACC3,$ACC3,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
|
||||||
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||||
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC4,#26
|
||||||
|
and $ACC4,$ACC4,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC1,#26
|
||||||
|
and $ACC1,$ACC1,$MASK.2d
|
||||||
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||||
|
|
||||||
|
add $ACC0,$ACC0,$T0.2d
|
||||||
|
shl $T0.2d,$T0.2d,#2
|
||||||
|
ushr $T1.2d,$ACC2,#26
|
||||||
|
and $ACC2,$ACC2,$MASK.2d
|
||||||
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||||
|
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC3,#26
|
||||||
|
and $ACC3,$ACC3,$MASK.2d
|
||||||
|
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
||||||
|
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// write the result, can be partially reduced
|
||||||
|
|
||||||
|
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
||||||
|
mov x4,#1
|
||||||
|
st1 {$ACC4}[0],[$ctx]
|
||||||
|
str x4,[$ctx,#8] // set is_base2_26
|
||||||
|
|
||||||
|
ldr x29,[sp],#80
|
||||||
|
.inst 0xd50323bf // autiasp
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lzeros:
|
||||||
|
.long 0,0,0,0,0,0,0,0
|
||||||
|
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
|
||||||
|
.align 2
|
||||||
|
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||||
|
.comm OPENSSL_armcap_P,4,4
|
||||||
|
.hidden OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
___
|
||||||
|
|
||||||
|
foreach (split("\n",$code)) {
|
||||||
|
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
|
||||||
|
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
|
||||||
|
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
|
||||||
|
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
|
||||||
|
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
|
||||||
|
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
|
||||||
|
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
|
||||||
|
|
||||||
|
s/\.[124]([sd])\[/.$1\[/;
|
||||||
|
s/w#x([0-9]+)/w$1/g;
|
||||||
|
|
||||||
|
print $_,"\n";
|
||||||
|
}
|
||||||
|
close STDOUT;
|
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
|
@ -0,0 +1,835 @@
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
# include "arm_arch.h"
|
||||||
|
.extern OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// forward "declarations" are required for Apple
|
||||||
|
.globl poly1305_blocks
|
||||||
|
.globl poly1305_emit
|
||||||
|
|
||||||
|
.globl poly1305_init
|
||||||
|
.type poly1305_init,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_init:
|
||||||
|
cmp x1,xzr
|
||||||
|
stp xzr,xzr,[x0] // zero hash value
|
||||||
|
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
|
||||||
|
|
||||||
|
csel x0,xzr,x0,eq
|
||||||
|
b.eq .Lno_key
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
adrp x17,OPENSSL_armcap_P
|
||||||
|
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ldp x7,x8,[x1] // load key
|
||||||
|
mov x9,#0xfffffffc0fffffff
|
||||||
|
movk x9,#0x0fff,lsl#48
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x7,x7 // flip bytes
|
||||||
|
rev x8,x8
|
||||||
|
#endif
|
||||||
|
and x7,x7,x9 // &=0ffffffc0fffffff
|
||||||
|
and x9,x9,#-4
|
||||||
|
and x8,x8,x9 // &=0ffffffc0ffffffc
|
||||||
|
mov w9,#-1
|
||||||
|
stp x7,x8,[x0,#32] // save key value
|
||||||
|
str w9,[x0,#48] // impossible key power value
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
tst w17,#ARMV7_NEON
|
||||||
|
|
||||||
|
adr x12,.Lpoly1305_blocks
|
||||||
|
adr x7,.Lpoly1305_blocks_neon
|
||||||
|
adr x13,.Lpoly1305_emit
|
||||||
|
|
||||||
|
csel x12,x12,x7,eq
|
||||||
|
|
||||||
|
# ifdef __ILP32__
|
||||||
|
stp w12,w13,[x2]
|
||||||
|
# else
|
||||||
|
stp x12,x13,[x2]
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
mov x0,#1
|
||||||
|
.Lno_key:
|
||||||
|
ret
|
||||||
|
.size poly1305_init,.-poly1305_init
|
||||||
|
|
||||||
|
.type poly1305_blocks,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks:
|
||||||
|
.Lpoly1305_blocks:
|
||||||
|
ands x2,x2,#-16
|
||||||
|
b.eq .Lno_data
|
||||||
|
|
||||||
|
ldp x4,x5,[x0] // load hash value
|
||||||
|
ldp x6,x17,[x0,#16] // [along with is_base2_26]
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr x12,x4,#32
|
||||||
|
mov w13,w4
|
||||||
|
lsr x14,x5,#32
|
||||||
|
mov w15,w5
|
||||||
|
lsr x16,x6,#32
|
||||||
|
#else
|
||||||
|
mov w12,w4
|
||||||
|
lsr x13,x4,#32
|
||||||
|
mov w14,w5
|
||||||
|
lsr x15,x5,#32
|
||||||
|
mov w16,w6
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x13,x14,#12
|
||||||
|
adds x12,x12,x14,lsl#52
|
||||||
|
add x13,x13,x15,lsl#14
|
||||||
|
adc x13,x13,xzr
|
||||||
|
lsr x14,x16,#24
|
||||||
|
adds x13,x13,x16,lsl#40
|
||||||
|
adc x14,x14,xzr
|
||||||
|
|
||||||
|
cmp x17,#0 // is_base2_26?
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
csel x4,x4,x12,eq // choose between radixes
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
csel x6,x6,x14,eq
|
||||||
|
|
||||||
|
.Loop:
|
||||||
|
ldp x10,x11,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x10,x10
|
||||||
|
rev x11,x11
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x10 // accumulate input
|
||||||
|
adcs x5,x5,x11
|
||||||
|
|
||||||
|
mul x12,x4,x7 // h0*r0
|
||||||
|
adc x6,x6,x3
|
||||||
|
umulh x13,x4,x7
|
||||||
|
|
||||||
|
mul x10,x5,x9 // h1*5*r1
|
||||||
|
umulh x11,x5,x9
|
||||||
|
|
||||||
|
adds x12,x12,x10
|
||||||
|
mul x10,x4,x8 // h0*r1
|
||||||
|
adc x13,x13,x11
|
||||||
|
umulh x14,x4,x8
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x5,x7 // h1*r0
|
||||||
|
adc x14,x14,xzr
|
||||||
|
umulh x11,x5,x7
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x6,x9 // h2*5*r1
|
||||||
|
adc x14,x14,x11
|
||||||
|
mul x11,x6,x7 // h2*r0
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
adc x14,x14,x11
|
||||||
|
|
||||||
|
and x10,x14,#-4 // final reduction
|
||||||
|
and x6,x14,#3
|
||||||
|
add x10,x10,x14,lsr#2
|
||||||
|
adds x4,x12,x10
|
||||||
|
adcs x5,x13,xzr
|
||||||
|
adc x6,x6,xzr
|
||||||
|
|
||||||
|
cbnz x2,.Loop
|
||||||
|
|
||||||
|
stp x4,x5,[x0] // store hash value
|
||||||
|
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
|
||||||
|
|
||||||
|
.Lno_data:
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks,.-poly1305_blocks
|
||||||
|
|
||||||
|
.type poly1305_emit,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_emit:
|
||||||
|
.Lpoly1305_emit:
|
||||||
|
ldp x4,x5,[x0] // load hash base 2^64
|
||||||
|
ldp x6,x7,[x0,#16] // [along with is_base2_26]
|
||||||
|
ldp x10,x11,[x2] // load nonce
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr x12,x4,#32
|
||||||
|
mov w13,w4
|
||||||
|
lsr x14,x5,#32
|
||||||
|
mov w15,w5
|
||||||
|
lsr x16,x6,#32
|
||||||
|
#else
|
||||||
|
mov w12,w4
|
||||||
|
lsr x13,x4,#32
|
||||||
|
mov w14,w5
|
||||||
|
lsr x15,x5,#32
|
||||||
|
mov w16,w6
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x13,x14,#12
|
||||||
|
adds x12,x12,x14,lsl#52
|
||||||
|
add x13,x13,x15,lsl#14
|
||||||
|
adc x13,x13,xzr
|
||||||
|
lsr x14,x16,#24
|
||||||
|
adds x13,x13,x16,lsl#40
|
||||||
|
adc x14,x14,xzr
|
||||||
|
|
||||||
|
cmp x7,#0 // is_base2_26?
|
||||||
|
csel x4,x4,x12,eq // choose between radixes
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
csel x6,x6,x14,eq
|
||||||
|
|
||||||
|
adds x12,x4,#5 // compare to modulus
|
||||||
|
adcs x13,x5,xzr
|
||||||
|
adc x14,x6,xzr
|
||||||
|
|
||||||
|
tst x14,#-4 // see if it's carried/borrowed
|
||||||
|
|
||||||
|
csel x4,x4,x12,eq
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
ror x10,x10,#32 // flip nonce words
|
||||||
|
ror x11,x11,#32
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x10 // accumulate nonce
|
||||||
|
adc x5,x5,x11
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x4,x4 // flip output bytes
|
||||||
|
rev x5,x5
|
||||||
|
#endif
|
||||||
|
stp x4,x5,[x1] // write result
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_emit,.-poly1305_emit
|
||||||
|
.type poly1305_mult,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_mult:
|
||||||
|
mul x12,x4,x7 // h0*r0
|
||||||
|
umulh x13,x4,x7
|
||||||
|
|
||||||
|
mul x10,x5,x9 // h1*5*r1
|
||||||
|
umulh x11,x5,x9
|
||||||
|
|
||||||
|
adds x12,x12,x10
|
||||||
|
mul x10,x4,x8 // h0*r1
|
||||||
|
adc x13,x13,x11
|
||||||
|
umulh x14,x4,x8
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x5,x7 // h1*r0
|
||||||
|
adc x14,x14,xzr
|
||||||
|
umulh x11,x5,x7
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x6,x9 // h2*5*r1
|
||||||
|
adc x14,x14,x11
|
||||||
|
mul x11,x6,x7 // h2*r0
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
adc x14,x14,x11
|
||||||
|
|
||||||
|
and x10,x14,#-4 // final reduction
|
||||||
|
and x6,x14,#3
|
||||||
|
add x10,x10,x14,lsr#2
|
||||||
|
adds x4,x12,x10
|
||||||
|
adcs x5,x13,xzr
|
||||||
|
adc x6,x6,xzr
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_mult,.-poly1305_mult
|
||||||
|
|
||||||
|
.type poly1305_splat,%function
|
||||||
|
.align 4
|
||||||
|
poly1305_splat:
|
||||||
|
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x13,x4,#26,#26
|
||||||
|
extr x14,x5,x4,#52
|
||||||
|
and x14,x14,#0x03ffffff
|
||||||
|
ubfx x15,x5,#14,#26
|
||||||
|
extr x16,x6,x5,#40
|
||||||
|
|
||||||
|
str w12,[x0,#16*0] // r0
|
||||||
|
add w12,w13,w13,lsl#2 // r1*5
|
||||||
|
str w13,[x0,#16*1] // r1
|
||||||
|
add w13,w14,w14,lsl#2 // r2*5
|
||||||
|
str w12,[x0,#16*2] // s1
|
||||||
|
str w14,[x0,#16*3] // r2
|
||||||
|
add w14,w15,w15,lsl#2 // r3*5
|
||||||
|
str w13,[x0,#16*4] // s2
|
||||||
|
str w15,[x0,#16*5] // r3
|
||||||
|
add w15,w16,w16,lsl#2 // r4*5
|
||||||
|
str w14,[x0,#16*6] // s3
|
||||||
|
str w16,[x0,#16*7] // r4
|
||||||
|
str w15,[x0,#16*8] // s4
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_splat,.-poly1305_splat
|
||||||
|
|
||||||
|
#ifdef __KERNEL__
|
||||||
|
.globl poly1305_blocks_neon
|
||||||
|
#endif
|
||||||
|
.type poly1305_blocks_neon,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks_neon:
|
||||||
|
.Lpoly1305_blocks_neon:
|
||||||
|
ldr x17,[x0,#24]
|
||||||
|
cmp x2,#128
|
||||||
|
b.lo .Lpoly1305_blocks
|
||||||
|
|
||||||
|
.inst 0xd503233f // paciasp
|
||||||
|
stp x29,x30,[sp,#-80]!
|
||||||
|
add x29,sp,#0
|
||||||
|
|
||||||
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
stp d10,d11,[sp,#32]
|
||||||
|
stp d12,d13,[sp,#48]
|
||||||
|
stp d14,d15,[sp,#64]
|
||||||
|
|
||||||
|
cbz x17,.Lbase2_64_neon
|
||||||
|
|
||||||
|
ldp w10,w11,[x0] // load hash value base 2^26
|
||||||
|
ldp w12,w13,[x0,#8]
|
||||||
|
ldr w14,[x0,#16]
|
||||||
|
|
||||||
|
tst x2,#31
|
||||||
|
b.eq .Leven_neon
|
||||||
|
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x5,x12,#12
|
||||||
|
adds x4,x4,x12,lsl#52
|
||||||
|
add x5,x5,x13,lsl#14
|
||||||
|
adc x5,x5,xzr
|
||||||
|
lsr x6,x14,#24
|
||||||
|
adds x5,x5,x14,lsl#40
|
||||||
|
adc x14,x6,xzr // can be partially reduced...
|
||||||
|
|
||||||
|
ldp x12,x13,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x12,x12
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x12 // accumulate input
|
||||||
|
adcs x5,x5,x13
|
||||||
|
adc x6,x6,x3
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,x4,#26,#26
|
||||||
|
extr x12,x5,x4,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,x5,#14,#26
|
||||||
|
extr x14,x6,x5,#40
|
||||||
|
|
||||||
|
b .Leven_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Lbase2_64_neon:
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
ldp x4,x5,[x0] // load hash value base 2^64
|
||||||
|
ldr x6,[x0,#16]
|
||||||
|
|
||||||
|
tst x2,#31
|
||||||
|
b.eq .Linit_neon
|
||||||
|
|
||||||
|
ldp x12,x13,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x12,x12
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x12 // accumulate input
|
||||||
|
adcs x5,x5,x13
|
||||||
|
adc x6,x6,x3
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
.Linit_neon:
|
||||||
|
ldr w17,[x0,#48] // first table element
|
||||||
|
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,x4,#26,#26
|
||||||
|
extr x12,x5,x4,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,x5,#14,#26
|
||||||
|
extr x14,x6,x5,#40
|
||||||
|
|
||||||
|
cmp w17,#-1 // is value impossible?
|
||||||
|
b.ne .Leven_neon
|
||||||
|
|
||||||
|
fmov d24,x10
|
||||||
|
fmov d25,x11
|
||||||
|
fmov d26,x12
|
||||||
|
fmov d27,x13
|
||||||
|
fmov d28,x14
|
||||||
|
|
||||||
|
////////////////////////////////// initialize r^n table
|
||||||
|
mov x4,x7 // r^1
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
mov x5,x8
|
||||||
|
mov x6,xzr
|
||||||
|
add x0,x0,#48+12
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^2
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^3
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^4
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
sub x0,x0,#48 // restore original x0
|
||||||
|
b .Ldo_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Leven_neon:
|
||||||
|
fmov d24,x10
|
||||||
|
fmov d25,x11
|
||||||
|
fmov d26,x12
|
||||||
|
fmov d27,x13
|
||||||
|
fmov d28,x14
|
||||||
|
|
||||||
|
.Ldo_neon:
|
||||||
|
ldp x8,x12,[x1,#32] // inp[2:3]
|
||||||
|
subs x2,x2,#64
|
||||||
|
ldp x9,x13,[x1,#48]
|
||||||
|
add x16,x1,#96
|
||||||
|
adr x17,.Lzeros
|
||||||
|
|
||||||
|
lsl x3,x3,#24
|
||||||
|
add x15,x0,#48
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov d14,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov d15,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
fmov d16,x8
|
||||||
|
fmov d17,x10
|
||||||
|
fmov d18,x12
|
||||||
|
|
||||||
|
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||||
|
ldp x9,x13,[x1],#48
|
||||||
|
|
||||||
|
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
|
||||||
|
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
|
||||||
|
ld1 {v8.4s},[x15]
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov d9,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov d10,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
movi v31.2d,#-1
|
||||||
|
fmov d11,x8
|
||||||
|
fmov d12,x10
|
||||||
|
fmov d13,x12
|
||||||
|
ushr v31.2d,v31.2d,#38
|
||||||
|
|
||||||
|
b.ls .Lskip_loop
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_neon:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||||
|
// ___________________/
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||||
|
// ___________________/ ____________________/
|
||||||
|
//
|
||||||
|
// Note that we start with inp[2:3]*r^2. This is because it
|
||||||
|
// doesn't depend on reduction in previous iteration.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||||
|
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||||
|
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||||
|
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||||
|
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||||
|
|
||||||
|
subs x2,x2,#64
|
||||||
|
umull v23.2d,v14.2s,v7.s[2]
|
||||||
|
csel x16,x17,x16,lo
|
||||||
|
umull v22.2d,v14.2s,v5.s[2]
|
||||||
|
umull v21.2d,v14.2s,v3.s[2]
|
||||||
|
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||||
|
umull v20.2d,v14.2s,v1.s[2]
|
||||||
|
ldp x9,x13,[x16],#48
|
||||||
|
umull v19.2d,v14.2s,v0.s[2]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
umlal v23.2d,v15.2s,v5.s[2]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal v22.2d,v15.2s,v3.s[2]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal v21.2d,v15.2s,v1.s[2]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal v20.2d,v15.2s,v0.s[2]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
umlal v19.2d,v15.2s,v8.s[2]
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
|
||||||
|
umlal v23.2d,v16.2s,v3.s[2]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal v22.2d,v16.2s,v1.s[2]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal v21.2d,v16.2s,v0.s[2]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal v20.2d,v16.2s,v8.s[2]
|
||||||
|
fmov d14,x4
|
||||||
|
umlal v19.2d,v16.2s,v6.s[2]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
umlal v23.2d,v17.2s,v1.s[2]
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal v22.2d,v17.2s,v0.s[2]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal v21.2d,v17.2s,v8.s[2]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal v20.2d,v17.2s,v6.s[2]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal v19.2d,v17.2s,v4.s[2]
|
||||||
|
fmov d15,x6
|
||||||
|
|
||||||
|
add v11.2s,v11.2s,v26.2s
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
umlal v23.2d,v18.2s,v0.s[2]
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
umlal v22.2d,v18.2s,v8.s[2]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal v21.2d,v18.2s,v6.s[2]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal v20.2d,v18.2s,v4.s[2]
|
||||||
|
fmov d16,x8
|
||||||
|
umlal v19.2d,v18.2s,v2.s[2]
|
||||||
|
fmov d17,x10
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4 and accumulate
|
||||||
|
|
||||||
|
add v9.2s,v9.2s,v24.2s
|
||||||
|
fmov d18,x12
|
||||||
|
umlal v22.2d,v11.2s,v1.s[0]
|
||||||
|
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||||
|
umlal v19.2d,v11.2s,v6.s[0]
|
||||||
|
ldp x9,x13,[x1],#48
|
||||||
|
umlal v23.2d,v11.2s,v3.s[0]
|
||||||
|
umlal v20.2d,v11.2s,v8.s[0]
|
||||||
|
umlal v21.2d,v11.2s,v0.s[0]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add v10.2s,v10.2s,v25.2s
|
||||||
|
umlal v22.2d,v9.2s,v5.s[0]
|
||||||
|
umlal v23.2d,v9.2s,v7.s[0]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal v21.2d,v9.2s,v3.s[0]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal v19.2d,v9.2s,v0.s[0]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal v20.2d,v9.2s,v1.s[0]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
|
||||||
|
add v12.2s,v12.2s,v27.2s
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
umlal v22.2d,v10.2s,v3.s[0]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal v23.2d,v10.2s,v5.s[0]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal v19.2d,v10.2s,v8.s[0]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal v21.2d,v10.2s,v1.s[0]
|
||||||
|
fmov d9,x4
|
||||||
|
umlal v20.2d,v10.2s,v0.s[0]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
add v13.2s,v13.2s,v28.2s
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal v22.2d,v12.2s,v0.s[0]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal v19.2d,v12.2s,v4.s[0]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal v23.2d,v12.2s,v1.s[0]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal v20.2d,v12.2s,v6.s[0]
|
||||||
|
fmov d10,x6
|
||||||
|
umlal v21.2d,v12.2s,v8.s[0]
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
|
||||||
|
umlal v22.2d,v13.2s,v8.s[0]
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
umlal v19.2d,v13.2s,v2.s[0]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal v23.2d,v13.2s,v0.s[0]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal v20.2d,v13.2s,v4.s[0]
|
||||||
|
fmov d11,x8
|
||||||
|
umlal v21.2d,v13.2s,v6.s[0]
|
||||||
|
fmov d12,x10
|
||||||
|
fmov d13,x12
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||||
|
// and P. Schwabe
|
||||||
|
//
|
||||||
|
// [see discussion in poly1305-armv4 module]
|
||||||
|
|
||||||
|
ushr v29.2d,v22.2d,#26
|
||||||
|
xtn v27.2s,v22.2d
|
||||||
|
ushr v30.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||||
|
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||||
|
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr v29.2d,v23.2d,#26
|
||||||
|
xtn v28.2s,v23.2d
|
||||||
|
ushr v30.2d,v20.2d,#26
|
||||||
|
xtn v25.2s,v20.2d
|
||||||
|
bic v28.2s,#0xfc,lsl#24
|
||||||
|
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||||
|
|
||||||
|
add v19.2d,v19.2d,v29.2d
|
||||||
|
shl v29.2d,v29.2d,#2
|
||||||
|
shrn v30.2s,v21.2d,#26
|
||||||
|
xtn v26.2s,v21.2d
|
||||||
|
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||||
|
bic v25.2s,#0xfc,lsl#24
|
||||||
|
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||||
|
bic v26.2s,#0xfc,lsl#24
|
||||||
|
|
||||||
|
shrn v29.2s,v19.2d,#26
|
||||||
|
xtn v24.2s,v19.2d
|
||||||
|
ushr v30.2s,v27.2s,#26
|
||||||
|
bic v27.2s,#0xfc,lsl#24
|
||||||
|
bic v24.2s,#0xfc,lsl#24
|
||||||
|
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||||
|
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||||
|
|
||||||
|
b.hi .Loop_neon
|
||||||
|
|
||||||
|
.Lskip_loop:
|
||||||
|
dup v16.2d,v16.d[0]
|
||||||
|
add v11.2s,v11.2s,v26.2s
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||||
|
|
||||||
|
adds x2,x2,#32
|
||||||
|
b.ne .Long_tail
|
||||||
|
|
||||||
|
dup v16.2d,v11.d[0]
|
||||||
|
add v14.2s,v9.2s,v24.2s
|
||||||
|
add v17.2s,v12.2s,v27.2s
|
||||||
|
add v15.2s,v10.2s,v25.2s
|
||||||
|
add v18.2s,v13.2s,v28.2s
|
||||||
|
|
||||||
|
.Long_tail:
|
||||||
|
dup v14.2d,v14.d[0]
|
||||||
|
umull2 v19.2d,v16.4s,v6.4s
|
||||||
|
umull2 v22.2d,v16.4s,v1.4s
|
||||||
|
umull2 v23.2d,v16.4s,v3.4s
|
||||||
|
umull2 v21.2d,v16.4s,v0.4s
|
||||||
|
umull2 v20.2d,v16.4s,v8.4s
|
||||||
|
|
||||||
|
dup v15.2d,v15.d[0]
|
||||||
|
umlal2 v19.2d,v14.4s,v0.4s
|
||||||
|
umlal2 v21.2d,v14.4s,v3.4s
|
||||||
|
umlal2 v22.2d,v14.4s,v5.4s
|
||||||
|
umlal2 v23.2d,v14.4s,v7.4s
|
||||||
|
umlal2 v20.2d,v14.4s,v1.4s
|
||||||
|
|
||||||
|
dup v17.2d,v17.d[0]
|
||||||
|
umlal2 v19.2d,v15.4s,v8.4s
|
||||||
|
umlal2 v22.2d,v15.4s,v3.4s
|
||||||
|
umlal2 v21.2d,v15.4s,v1.4s
|
||||||
|
umlal2 v23.2d,v15.4s,v5.4s
|
||||||
|
umlal2 v20.2d,v15.4s,v0.4s
|
||||||
|
|
||||||
|
dup v18.2d,v18.d[0]
|
||||||
|
umlal2 v22.2d,v17.4s,v0.4s
|
||||||
|
umlal2 v23.2d,v17.4s,v1.4s
|
||||||
|
umlal2 v19.2d,v17.4s,v4.4s
|
||||||
|
umlal2 v20.2d,v17.4s,v6.4s
|
||||||
|
umlal2 v21.2d,v17.4s,v8.4s
|
||||||
|
|
||||||
|
umlal2 v22.2d,v18.4s,v8.4s
|
||||||
|
umlal2 v19.2d,v18.4s,v2.4s
|
||||||
|
umlal2 v23.2d,v18.4s,v0.4s
|
||||||
|
umlal2 v20.2d,v18.4s,v4.4s
|
||||||
|
umlal2 v21.2d,v18.4s,v6.4s
|
||||||
|
|
||||||
|
b.eq .Lshort_tail
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||||
|
|
||||||
|
add v9.2s,v9.2s,v24.2s
|
||||||
|
umlal v22.2d,v11.2s,v1.2s
|
||||||
|
umlal v19.2d,v11.2s,v6.2s
|
||||||
|
umlal v23.2d,v11.2s,v3.2s
|
||||||
|
umlal v20.2d,v11.2s,v8.2s
|
||||||
|
umlal v21.2d,v11.2s,v0.2s
|
||||||
|
|
||||||
|
add v10.2s,v10.2s,v25.2s
|
||||||
|
umlal v22.2d,v9.2s,v5.2s
|
||||||
|
umlal v19.2d,v9.2s,v0.2s
|
||||||
|
umlal v23.2d,v9.2s,v7.2s
|
||||||
|
umlal v20.2d,v9.2s,v1.2s
|
||||||
|
umlal v21.2d,v9.2s,v3.2s
|
||||||
|
|
||||||
|
add v12.2s,v12.2s,v27.2s
|
||||||
|
umlal v22.2d,v10.2s,v3.2s
|
||||||
|
umlal v19.2d,v10.2s,v8.2s
|
||||||
|
umlal v23.2d,v10.2s,v5.2s
|
||||||
|
umlal v20.2d,v10.2s,v0.2s
|
||||||
|
umlal v21.2d,v10.2s,v1.2s
|
||||||
|
|
||||||
|
add v13.2s,v13.2s,v28.2s
|
||||||
|
umlal v22.2d,v12.2s,v0.2s
|
||||||
|
umlal v19.2d,v12.2s,v4.2s
|
||||||
|
umlal v23.2d,v12.2s,v1.2s
|
||||||
|
umlal v20.2d,v12.2s,v6.2s
|
||||||
|
umlal v21.2d,v12.2s,v8.2s
|
||||||
|
|
||||||
|
umlal v22.2d,v13.2s,v8.2s
|
||||||
|
umlal v19.2d,v13.2s,v2.2s
|
||||||
|
umlal v23.2d,v13.2s,v0.2s
|
||||||
|
umlal v20.2d,v13.2s,v4.2s
|
||||||
|
umlal v21.2d,v13.2s,v6.2s
|
||||||
|
|
||||||
|
.Lshort_tail:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// horizontal add
|
||||||
|
|
||||||
|
addp v22.2d,v22.2d,v22.2d
|
||||||
|
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
addp v19.2d,v19.2d,v19.2d
|
||||||
|
ldp d10,d11,[sp,#32]
|
||||||
|
addp v23.2d,v23.2d,v23.2d
|
||||||
|
ldp d12,d13,[sp,#48]
|
||||||
|
addp v20.2d,v20.2d,v20.2d
|
||||||
|
ldp d14,d15,[sp,#64]
|
||||||
|
addp v21.2d,v21.2d,v21.2d
|
||||||
|
ldr x30,[sp,#8]
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction, but without narrowing
|
||||||
|
|
||||||
|
ushr v29.2d,v22.2d,#26
|
||||||
|
and v22.16b,v22.16b,v31.16b
|
||||||
|
ushr v30.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
|
||||||
|
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||||
|
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr v29.2d,v23.2d,#26
|
||||||
|
and v23.16b,v23.16b,v31.16b
|
||||||
|
ushr v30.2d,v20.2d,#26
|
||||||
|
and v20.16b,v20.16b,v31.16b
|
||||||
|
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||||
|
|
||||||
|
add v19.2d,v19.2d,v29.2d
|
||||||
|
shl v29.2d,v29.2d,#2
|
||||||
|
ushr v30.2d,v21.2d,#26
|
||||||
|
and v21.16b,v21.16b,v31.16b
|
||||||
|
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||||
|
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||||
|
|
||||||
|
ushr v29.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
ushr v30.2d,v22.2d,#26
|
||||||
|
and v22.16b,v22.16b,v31.16b
|
||||||
|
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||||
|
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// write the result, can be partially reduced
|
||||||
|
|
||||||
|
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||||
|
mov x4,#1
|
||||||
|
st1 {v23.s}[0],[x0]
|
||||||
|
str x4,[x0,#8] // set is_base2_26
|
||||||
|
|
||||||
|
ldr x29,[sp],#80
|
||||||
|
.inst 0xd50323bf // autiasp
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lzeros:
|
||||||
|
.long 0,0,0,0,0,0,0,0
|
||||||
|
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
|
||||||
|
.align 2
|
||||||
|
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||||
|
.comm OPENSSL_armcap_P,4,4
|
||||||
|
.hidden OPENSSL_armcap_P
|
||||||
|
#endif
|
230
arch/arm64/crypto/poly1305-glue.c
Normal file
230
arch/arm64/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,230 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
|
||||||
|
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_arm64(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int neon_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_arch(dctx, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||||
|
else
|
||||||
|
poly1305_blocks(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||||
|
const u8 *src, u32 len, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
neon_poly1305_blocks(dctx, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1, false);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int neon_poly1305_update(struct shash_desc *desc,
|
||||||
|
const u8 *src, unsigned int srclen)
|
||||||
|
{
|
||||||
|
bool do_neon = may_use_simd() && srclen > 128;
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_begin();
|
||||||
|
neon_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_end();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
len -= todo;
|
||||||
|
src += todo;
|
||||||
|
} while (len);
|
||||||
|
} else {
|
||||||
|
poly1305_blocks(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
}
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg neon_poly1305_alg = {
|
||||||
|
.init = neon_poly1305_init,
|
||||||
|
.update = neon_poly1305_update,
|
||||||
|
.final = neon_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init neon_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shash(&neon_poly1305_alg) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit neon_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && (elf_hwcap & HWCAP_ASIMD))
|
||||||
|
crypto_unregister_shash(&neon_poly1305_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(neon_poly1305_mod_init);
|
||||||
|
module_exit(neon_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
|
@ -192,6 +192,7 @@ enum vcpu_sysreg {
|
||||||
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
|
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
|
||||||
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
|
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
|
||||||
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
|
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
|
||||||
|
#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
|
||||||
|
|
||||||
#define NR_COPRO_REGS (NR_SYS_REGS * 2)
|
#define NR_COPRO_REGS (NR_SYS_REGS * 2)
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node);
|
||||||
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
|
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
|
||||||
static inline const struct cpumask *cpumask_of_node(int node)
|
static inline const struct cpumask *cpumask_of_node(int node)
|
||||||
{
|
{
|
||||||
|
if (node == NUMA_NO_NODE)
|
||||||
|
return cpu_all_mask;
|
||||||
|
|
||||||
return node_to_cpumask_map[node];
|
return node_to_cpumask_map[node];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -620,6 +620,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
|
||||||
return (need_wa > 0);
|
return (need_wa > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
|
||||||
|
{
|
||||||
|
cap->matches(cap, SCOPE_LOCAL_CPU);
|
||||||
|
}
|
||||||
|
|
||||||
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
|
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
|
||||||
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
|
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
|
||||||
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
|
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
|
||||||
|
@ -860,9 +866,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
|
||||||
},
|
},
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
|
.desc = "Branch predictor hardening",
|
||||||
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
|
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
|
||||||
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
|
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
|
||||||
.matches = check_branch_predictor,
|
.matches = check_branch_predictor,
|
||||||
|
.cpu_enable = cpu_enable_branch_predictor_hardening,
|
||||||
},
|
},
|
||||||
#ifdef CONFIG_HARDEN_EL2_VECTORS
|
#ifdef CONFIG_HARDEN_EL2_VECTORS
|
||||||
{
|
{
|
||||||
|
|
|
@ -290,21 +290,23 @@ void store_cpu_topology(unsigned int cpuid)
|
||||||
if (mpidr & MPIDR_UP_BITMASK)
|
if (mpidr & MPIDR_UP_BITMASK)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* Create cpu topology mapping based on MPIDR. */
|
/*
|
||||||
if (mpidr & MPIDR_MT_BITMASK) {
|
* This would be the place to create cpu topology based on MPIDR.
|
||||||
/* Multiprocessor system : Multi-threads per core */
|
*
|
||||||
cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
|
* However, it cannot be trusted to depict the actual topology; some
|
||||||
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
|
* pieces of the architecture enforce an artificial cap on Aff0 values
|
||||||
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
|
* (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
|
* artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up
|
||||||
} else {
|
* having absolutely no relationship to the actual underlying system
|
||||||
/* Multiprocessor system : Single-thread per core */
|
* topology, and cannot be reasonably used as core / package ID.
|
||||||
|
*
|
||||||
|
* If the MT bit is set, Aff0 *could* be used to define a thread ID, but
|
||||||
|
* we still wouldn't be able to obtain a sane core ID. This means we
|
||||||
|
* need to entirely ignore MPIDR for any topology deduction.
|
||||||
|
*/
|
||||||
cpuid_topo->thread_id = -1;
|
cpuid_topo->thread_id = -1;
|
||||||
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
|
cpuid_topo->core_id = cpuid;
|
||||||
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
|
cpuid_topo->package_id = cpu_to_node(cpuid);
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
|
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
|
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
|
||||||
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
|
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
|
||||||
|
|
|
@ -1555,9 +1555,9 @@ static const struct sys_reg_desc cp14_regs[] = {
|
||||||
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
|
||||||
DBG_BCR_BVR_WCR_WVR(1),
|
DBG_BCR_BVR_WCR_WVR(1),
|
||||||
/* DBGDCCINT */
|
/* DBGDCCINT */
|
||||||
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
|
||||||
/* DBGDSCRext */
|
/* DBGDSCRext */
|
||||||
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
|
||||||
DBG_BCR_BVR_WCR_WVR(2),
|
DBG_BCR_BVR_WCR_WVR(2),
|
||||||
/* DBGDTR[RT]Xint */
|
/* DBGDTR[RT]Xint */
|
||||||
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
|
||||||
|
@ -1572,7 +1572,7 @@ static const struct sys_reg_desc cp14_regs[] = {
|
||||||
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
|
||||||
DBG_BCR_BVR_WCR_WVR(6),
|
DBG_BCR_BVR_WCR_WVR(6),
|
||||||
/* DBGVCR */
|
/* DBGVCR */
|
||||||
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
|
||||||
DBG_BCR_BVR_WCR_WVR(7),
|
DBG_BCR_BVR_WCR_WVR(7),
|
||||||
DBG_BCR_BVR_WCR_WVR(8),
|
DBG_BCR_BVR_WCR_WVR(8),
|
||||||
DBG_BCR_BVR_WCR_WVR(9),
|
DBG_BCR_BVR_WCR_WVR(9),
|
||||||
|
|
|
@ -58,7 +58,11 @@ EXPORT_SYMBOL(node_to_cpumask_map);
|
||||||
*/
|
*/
|
||||||
const struct cpumask *cpumask_of_node(int node)
|
const struct cpumask *cpumask_of_node(int node)
|
||||||
{
|
{
|
||||||
if (WARN_ON(node >= nr_node_ids))
|
|
||||||
|
if (node == NUMA_NO_NODE)
|
||||||
|
return cpu_all_mask;
|
||||||
|
|
||||||
|
if (WARN_ON(node < 0 || node >= nr_node_ids))
|
||||||
return cpu_none_mask;
|
return cpu_none_mask;
|
||||||
|
|
||||||
if (WARN_ON(node_to_cpumask_map[node] == NULL))
|
if (WARN_ON(node_to_cpumask_map[node] == NULL))
|
||||||
|
|
|
@ -42,7 +42,7 @@ obj-y += esi_stub.o # must be in kernel proper
|
||||||
endif
|
endif
|
||||||
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
|
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
|
||||||
|
|
||||||
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
|
obj-$(CONFIG_ELF_CORE) += elfcore.o
|
||||||
|
|
||||||
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
|
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
|
||||||
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
|
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
|
||||||
|
|
|
@ -409,83 +409,9 @@ static void kretprobe_trampoline(void)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* At this point the target function has been tricked into
|
|
||||||
* returning into our trampoline. Lookup the associated instance
|
|
||||||
* and then:
|
|
||||||
* - call the handler function
|
|
||||||
* - cleanup by marking the instance as unused
|
|
||||||
* - long jump back to the original return address
|
|
||||||
*/
|
|
||||||
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
|
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
struct kretprobe_instance *ri = NULL;
|
regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL);
|
||||||
struct hlist_head *head, empty_rp;
|
|
||||||
struct hlist_node *tmp;
|
|
||||||
unsigned long flags, orig_ret_address = 0;
|
|
||||||
unsigned long trampoline_address =
|
|
||||||
((struct fnptr *)kretprobe_trampoline)->ip;
|
|
||||||
|
|
||||||
INIT_HLIST_HEAD(&empty_rp);
|
|
||||||
kretprobe_hash_lock(current, &head, &flags);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* It is possible to have multiple instances associated with a given
|
|
||||||
* task either because an multiple functions in the call path
|
|
||||||
* have a return probe installed on them, and/or more than one return
|
|
||||||
* return probe was registered for a target function.
|
|
||||||
*
|
|
||||||
* We can handle this because:
|
|
||||||
* - instances are always inserted at the head of the list
|
|
||||||
* - when multiple return probes are registered for the same
|
|
||||||
* function, the first instance's ret_addr will point to the
|
|
||||||
* real return address, and all the rest will point to
|
|
||||||
* kretprobe_trampoline
|
|
||||||
*/
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
|
|
||||||
if (ri->task != current)
|
|
||||||
/* another task is sharing our hash bucket */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
|
||||||
if (orig_ret_address != trampoline_address)
|
|
||||||
/*
|
|
||||||
* This is the real return address. Any other
|
|
||||||
* instances associated with this task are for
|
|
||||||
* other calls deeper on the call stack
|
|
||||||
*/
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
regs->cr_iip = orig_ret_address;
|
|
||||||
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
|
|
||||||
if (ri->task != current)
|
|
||||||
/* another task is sharing our hash bucket */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (ri->rp && ri->rp->handler)
|
|
||||||
ri->rp->handler(ri, regs);
|
|
||||||
|
|
||||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
|
||||||
recycle_rp_inst(ri, &empty_rp);
|
|
||||||
|
|
||||||
if (orig_ret_address != trampoline_address)
|
|
||||||
/*
|
|
||||||
* This is the real return address. Any other
|
|
||||||
* instances associated with this task are for
|
|
||||||
* other calls deeper on the call stack
|
|
||||||
*/
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
kretprobe_assert(ri, orig_ret_address, trampoline_address);
|
|
||||||
|
|
||||||
kretprobe_hash_unlock(current, &flags);
|
|
||||||
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
|
|
||||||
hlist_del(&ri->hlist);
|
|
||||||
kfree(ri);
|
|
||||||
}
|
|
||||||
/*
|
/*
|
||||||
* By returning a non-zero value, we are telling
|
* By returning a non-zero value, we are telling
|
||||||
* kprobe_handler() that we don't want the post_handler
|
* kprobe_handler() that we don't want the post_handler
|
||||||
|
@ -498,6 +424,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
|
||||||
struct pt_regs *regs)
|
struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
|
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
|
||||||
|
ri->fp = NULL;
|
||||||
|
|
||||||
/* Replace the return addr with trampoline addr */
|
/* Replace the return addr with trampoline addr */
|
||||||
regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
|
regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
|
||||||
|
|
|
@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
|
||||||
# See arch/mips/Kbuild for content of core part of the kernel
|
# See arch/mips/Kbuild for content of core part of the kernel
|
||||||
core-y += arch/mips/
|
core-y += arch/mips/
|
||||||
|
|
||||||
drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
|
drivers-y += arch/mips/crypto/
|
||||||
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
|
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
|
||||||
|
|
||||||
# suspend and hibernation support
|
# suspend and hibernation support
|
||||||
|
|
|
@ -4,3 +4,21 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
|
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
|
||||||
|
chacha-mips-y := chacha-core.o chacha-glue.o
|
||||||
|
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
|
||||||
|
poly1305-mips-y := poly1305-core.o poly1305-glue.o
|
||||||
|
|
||||||
|
perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
|
||||||
|
perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
|
||||||
|
|
||||||
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
|
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
|
||||||
|
|
||||||
|
$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
|
||||||
|
$(call if_changed,perlasm)
|
||||||
|
|
||||||
|
targets += poly1305-core.S
|
||||||
|
|
497
arch/mips/crypto/chacha-core.S
Normal file
497
arch/mips/crypto/chacha-core.S
Normal file
|
@ -0,0 +1,497 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define MASK_U32 0x3c
|
||||||
|
#define CHACHA20_BLOCK_SIZE 64
|
||||||
|
#define STACK_SIZE 32
|
||||||
|
|
||||||
|
#define X0 $t0
|
||||||
|
#define X1 $t1
|
||||||
|
#define X2 $t2
|
||||||
|
#define X3 $t3
|
||||||
|
#define X4 $t4
|
||||||
|
#define X5 $t5
|
||||||
|
#define X6 $t6
|
||||||
|
#define X7 $t7
|
||||||
|
#define X8 $t8
|
||||||
|
#define X9 $t9
|
||||||
|
#define X10 $v1
|
||||||
|
#define X11 $s6
|
||||||
|
#define X12 $s5
|
||||||
|
#define X13 $s4
|
||||||
|
#define X14 $s3
|
||||||
|
#define X15 $s2
|
||||||
|
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
|
||||||
|
#define T0 $s1
|
||||||
|
#define T1 $s0
|
||||||
|
#define T(n) T ## n
|
||||||
|
#define X(n) X ## n
|
||||||
|
|
||||||
|
/* Input arguments */
|
||||||
|
#define STATE $a0
|
||||||
|
#define OUT $a1
|
||||||
|
#define IN $a2
|
||||||
|
#define BYTES $a3
|
||||||
|
|
||||||
|
/* Output argument */
|
||||||
|
/* NONCE[0] is kept in a register and not in memory.
|
||||||
|
* We don't want to touch original value in memory.
|
||||||
|
* Must be incremented every loop iteration.
|
||||||
|
*/
|
||||||
|
#define NONCE_0 $v0
|
||||||
|
|
||||||
|
/* SAVED_X and SAVED_CA are set in the jump table.
|
||||||
|
* Use regs which are overwritten on exit else we don't leak clear data.
|
||||||
|
* They are used to handling the last bytes which are not multiple of 4.
|
||||||
|
*/
|
||||||
|
#define SAVED_X X15
|
||||||
|
#define SAVED_CA $s7
|
||||||
|
|
||||||
|
#define IS_UNALIGNED $s7
|
||||||
|
|
||||||
|
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||||
|
#define MSB 0
|
||||||
|
#define LSB 3
|
||||||
|
#define ROTx rotl
|
||||||
|
#define ROTR(n) rotr n, 24
|
||||||
|
#define CPU_TO_LE32(n) \
|
||||||
|
wsbh n; \
|
||||||
|
rotr n, 16;
|
||||||
|
#else
|
||||||
|
#define MSB 3
|
||||||
|
#define LSB 0
|
||||||
|
#define ROTx rotr
|
||||||
|
#define CPU_TO_LE32(n)
|
||||||
|
#define ROTR(n)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FOR_EACH_WORD(x) \
|
||||||
|
x( 0); \
|
||||||
|
x( 1); \
|
||||||
|
x( 2); \
|
||||||
|
x( 3); \
|
||||||
|
x( 4); \
|
||||||
|
x( 5); \
|
||||||
|
x( 6); \
|
||||||
|
x( 7); \
|
||||||
|
x( 8); \
|
||||||
|
x( 9); \
|
||||||
|
x(10); \
|
||||||
|
x(11); \
|
||||||
|
x(12); \
|
||||||
|
x(13); \
|
||||||
|
x(14); \
|
||||||
|
x(15);
|
||||||
|
|
||||||
|
#define FOR_EACH_WORD_REV(x) \
|
||||||
|
x(15); \
|
||||||
|
x(14); \
|
||||||
|
x(13); \
|
||||||
|
x(12); \
|
||||||
|
x(11); \
|
||||||
|
x(10); \
|
||||||
|
x( 9); \
|
||||||
|
x( 8); \
|
||||||
|
x( 7); \
|
||||||
|
x( 6); \
|
||||||
|
x( 5); \
|
||||||
|
x( 4); \
|
||||||
|
x( 3); \
|
||||||
|
x( 2); \
|
||||||
|
x( 1); \
|
||||||
|
x( 0);
|
||||||
|
|
||||||
|
#define PLUS_ONE_0 1
|
||||||
|
#define PLUS_ONE_1 2
|
||||||
|
#define PLUS_ONE_2 3
|
||||||
|
#define PLUS_ONE_3 4
|
||||||
|
#define PLUS_ONE_4 5
|
||||||
|
#define PLUS_ONE_5 6
|
||||||
|
#define PLUS_ONE_6 7
|
||||||
|
#define PLUS_ONE_7 8
|
||||||
|
#define PLUS_ONE_8 9
|
||||||
|
#define PLUS_ONE_9 10
|
||||||
|
#define PLUS_ONE_10 11
|
||||||
|
#define PLUS_ONE_11 12
|
||||||
|
#define PLUS_ONE_12 13
|
||||||
|
#define PLUS_ONE_13 14
|
||||||
|
#define PLUS_ONE_14 15
|
||||||
|
#define PLUS_ONE_15 16
|
||||||
|
#define PLUS_ONE(x) PLUS_ONE_ ## x
|
||||||
|
#define _CONCAT3(a,b,c) a ## b ## c
|
||||||
|
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
|
||||||
|
|
||||||
|
#define STORE_UNALIGNED(x) \
|
||||||
|
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
|
||||||
|
.if (x != 12); \
|
||||||
|
lw T0, (x*4)(STATE); \
|
||||||
|
.endif; \
|
||||||
|
lwl T1, (x*4)+MSB ## (IN); \
|
||||||
|
lwr T1, (x*4)+LSB ## (IN); \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu X ## x, T0; \
|
||||||
|
.endif; \
|
||||||
|
CPU_TO_LE32(X ## x); \
|
||||||
|
xor X ## x, T1; \
|
||||||
|
swl X ## x, (x*4)+MSB ## (OUT); \
|
||||||
|
swr X ## x, (x*4)+LSB ## (OUT);
|
||||||
|
|
||||||
|
#define STORE_ALIGNED(x) \
|
||||||
|
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
|
||||||
|
.if (x != 12); \
|
||||||
|
lw T0, (x*4)(STATE); \
|
||||||
|
.endif; \
|
||||||
|
lw T1, (x*4) ## (IN); \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu X ## x, T0; \
|
||||||
|
.endif; \
|
||||||
|
CPU_TO_LE32(X ## x); \
|
||||||
|
xor X ## x, T1; \
|
||||||
|
sw X ## x, (x*4) ## (OUT);
|
||||||
|
|
||||||
|
/* Jump table macro.
|
||||||
|
* Used for setup and handling the last bytes, which are not multiple of 4.
|
||||||
|
* X15 is free to store Xn
|
||||||
|
* Every jumptable entry must be equal in size.
|
||||||
|
*/
|
||||||
|
#define JMPTBL_ALIGNED(x) \
|
||||||
|
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
|
||||||
|
.set noreorder; \
|
||||||
|
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu SAVED_X, X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu SAVED_X, X ## x, SAVED_CA; \
|
||||||
|
.endif; \
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
#define JMPTBL_UNALIGNED(x) \
|
||||||
|
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
|
||||||
|
.set noreorder; \
|
||||||
|
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu SAVED_X, X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu SAVED_X, X ## x, SAVED_CA; \
|
||||||
|
.endif; \
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
|
||||||
|
addu X(A), X(K); \
|
||||||
|
addu X(B), X(L); \
|
||||||
|
addu X(C), X(M); \
|
||||||
|
addu X(D), X(N); \
|
||||||
|
xor X(V), X(A); \
|
||||||
|
xor X(W), X(B); \
|
||||||
|
xor X(Y), X(C); \
|
||||||
|
xor X(Z), X(D); \
|
||||||
|
rotl X(V), S; \
|
||||||
|
rotl X(W), S; \
|
||||||
|
rotl X(Y), S; \
|
||||||
|
rotl X(Z), S;
|
||||||
|
|
||||||
|
.text
|
||||||
|
.set reorder
|
||||||
|
.set noat
|
||||||
|
.globl chacha_crypt_arch
|
||||||
|
.ent chacha_crypt_arch
|
||||||
|
chacha_crypt_arch:
|
||||||
|
.frame $sp, STACK_SIZE, $ra
|
||||||
|
|
||||||
|
/* Load number of rounds */
|
||||||
|
lw $at, 16($sp)
|
||||||
|
|
||||||
|
addiu $sp, -STACK_SIZE
|
||||||
|
|
||||||
|
/* Return bytes = 0. */
|
||||||
|
beqz BYTES, .Lchacha_mips_end
|
||||||
|
|
||||||
|
lw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
/* Save s0-s7 */
|
||||||
|
sw $s0, 0($sp)
|
||||||
|
sw $s1, 4($sp)
|
||||||
|
sw $s2, 8($sp)
|
||||||
|
sw $s3, 12($sp)
|
||||||
|
sw $s4, 16($sp)
|
||||||
|
sw $s5, 20($sp)
|
||||||
|
sw $s6, 24($sp)
|
||||||
|
sw $s7, 28($sp)
|
||||||
|
|
||||||
|
/* Test IN or OUT is unaligned.
|
||||||
|
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
|
||||||
|
*/
|
||||||
|
or IS_UNALIGNED, IN, OUT
|
||||||
|
andi IS_UNALIGNED, 0x3
|
||||||
|
|
||||||
|
b .Lchacha_rounds_start
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_chacha_rounds:
|
||||||
|
addiu IN, CHACHA20_BLOCK_SIZE
|
||||||
|
addiu OUT, CHACHA20_BLOCK_SIZE
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
|
||||||
|
.Lchacha_rounds_start:
|
||||||
|
lw X0, 0(STATE)
|
||||||
|
lw X1, 4(STATE)
|
||||||
|
lw X2, 8(STATE)
|
||||||
|
lw X3, 12(STATE)
|
||||||
|
|
||||||
|
lw X4, 16(STATE)
|
||||||
|
lw X5, 20(STATE)
|
||||||
|
lw X6, 24(STATE)
|
||||||
|
lw X7, 28(STATE)
|
||||||
|
lw X8, 32(STATE)
|
||||||
|
lw X9, 36(STATE)
|
||||||
|
lw X10, 40(STATE)
|
||||||
|
lw X11, 44(STATE)
|
||||||
|
|
||||||
|
move X12, NONCE_0
|
||||||
|
lw X13, 52(STATE)
|
||||||
|
lw X14, 56(STATE)
|
||||||
|
lw X15, 60(STATE)
|
||||||
|
|
||||||
|
.Loop_chacha_xor_rounds:
|
||||||
|
addiu $at, -2
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||||
|
bnez $at, .Loop_chacha_xor_rounds
|
||||||
|
|
||||||
|
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
|
||||||
|
|
||||||
|
/* Is data src/dst unaligned? Jump */
|
||||||
|
bnez IS_UNALIGNED, .Loop_chacha_unaligned
|
||||||
|
|
||||||
|
/* Set number rounds here to fill delayslot. */
|
||||||
|
lw $at, (STACK_SIZE+16)($sp)
|
||||||
|
|
||||||
|
/* BYTES < 0, it has no full block. */
|
||||||
|
bltz BYTES, .Lchacha_mips_no_full_block_aligned
|
||||||
|
|
||||||
|
FOR_EACH_WORD_REV(STORE_ALIGNED)
|
||||||
|
|
||||||
|
/* BYTES > 0? Loop again. */
|
||||||
|
bgtz BYTES, .Loop_chacha_rounds
|
||||||
|
|
||||||
|
/* Place this here to fill delay slot */
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
|
||||||
|
/* BYTES < 0? Handle last bytes */
|
||||||
|
bltz BYTES, .Lchacha_mips_xor_bytes
|
||||||
|
|
||||||
|
.Lchacha_mips_xor_done:
|
||||||
|
/* Restore used registers */
|
||||||
|
lw $s0, 0($sp)
|
||||||
|
lw $s1, 4($sp)
|
||||||
|
lw $s2, 8($sp)
|
||||||
|
lw $s3, 12($sp)
|
||||||
|
lw $s4, 16($sp)
|
||||||
|
lw $s5, 20($sp)
|
||||||
|
lw $s6, 24($sp)
|
||||||
|
lw $s7, 28($sp)
|
||||||
|
|
||||||
|
/* Write NONCE_0 back to right location in state */
|
||||||
|
sw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
.Lchacha_mips_end:
|
||||||
|
addiu $sp, STACK_SIZE
|
||||||
|
jr $ra
|
||||||
|
|
||||||
|
.Lchacha_mips_no_full_block_aligned:
|
||||||
|
/* Restore the offset on BYTES */
|
||||||
|
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||||
|
|
||||||
|
/* Get number of full WORDS */
|
||||||
|
andi $at, BYTES, MASK_U32
|
||||||
|
|
||||||
|
/* Load upper half of jump table addr */
|
||||||
|
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
|
||||||
|
|
||||||
|
/* Calculate lower half jump table offset */
|
||||||
|
ins T0, $at, 1, 6
|
||||||
|
|
||||||
|
/* Add offset to STATE */
|
||||||
|
addu T1, STATE, $at
|
||||||
|
|
||||||
|
/* Add lower half jump table addr */
|
||||||
|
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
|
||||||
|
|
||||||
|
/* Read value from STATE */
|
||||||
|
lw SAVED_CA, 0(T1)
|
||||||
|
|
||||||
|
/* Store remaining bytecounter as negative value */
|
||||||
|
subu BYTES, $at, BYTES
|
||||||
|
|
||||||
|
jr T0
|
||||||
|
|
||||||
|
/* Jump table */
|
||||||
|
FOR_EACH_WORD(JMPTBL_ALIGNED)
|
||||||
|
|
||||||
|
|
||||||
|
.Loop_chacha_unaligned:
|
||||||
|
/* Set number rounds here to fill delayslot. */
|
||||||
|
lw $at, (STACK_SIZE+16)($sp)
|
||||||
|
|
||||||
|
/* BYTES > 0, it has no full block. */
|
||||||
|
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
|
||||||
|
|
||||||
|
FOR_EACH_WORD_REV(STORE_UNALIGNED)
|
||||||
|
|
||||||
|
/* BYTES > 0? Loop again. */
|
||||||
|
bgtz BYTES, .Loop_chacha_rounds
|
||||||
|
|
||||||
|
/* Write NONCE_0 back to right location in state */
|
||||||
|
sw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
.set noreorder
|
||||||
|
/* Fall through to byte handling */
|
||||||
|
bgez BYTES, .Lchacha_mips_xor_done
|
||||||
|
.Lchacha_mips_xor_unaligned_0_b:
|
||||||
|
.Lchacha_mips_xor_aligned_0_b:
|
||||||
|
/* Place this here to fill delay slot */
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
.Lchacha_mips_xor_bytes:
|
||||||
|
addu IN, $at
|
||||||
|
addu OUT, $at
|
||||||
|
/* First byte */
|
||||||
|
lbu T1, 0(IN)
|
||||||
|
addiu $at, BYTES, 1
|
||||||
|
CPU_TO_LE32(SAVED_X)
|
||||||
|
ROTR(SAVED_X)
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 0(OUT)
|
||||||
|
beqz $at, .Lchacha_mips_xor_done
|
||||||
|
/* Second byte */
|
||||||
|
lbu T1, 1(IN)
|
||||||
|
addiu $at, BYTES, 2
|
||||||
|
ROTx SAVED_X, 8
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 1(OUT)
|
||||||
|
beqz $at, .Lchacha_mips_xor_done
|
||||||
|
/* Third byte */
|
||||||
|
lbu T1, 2(IN)
|
||||||
|
ROTx SAVED_X, 8
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 2(OUT)
|
||||||
|
b .Lchacha_mips_xor_done
|
||||||
|
|
||||||
|
.Lchacha_mips_no_full_block_unaligned:
|
||||||
|
/* Restore the offset on BYTES */
|
||||||
|
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||||
|
|
||||||
|
/* Get number of full WORDS */
|
||||||
|
andi $at, BYTES, MASK_U32
|
||||||
|
|
||||||
|
/* Load upper half of jump table addr */
|
||||||
|
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
|
||||||
|
|
||||||
|
/* Calculate lower half jump table offset */
|
||||||
|
ins T0, $at, 1, 6
|
||||||
|
|
||||||
|
/* Add offset to STATE */
|
||||||
|
addu T1, STATE, $at
|
||||||
|
|
||||||
|
/* Add lower half jump table addr */
|
||||||
|
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
|
||||||
|
|
||||||
|
/* Read value from STATE */
|
||||||
|
lw SAVED_CA, 0(T1)
|
||||||
|
|
||||||
|
/* Store remaining bytecounter as negative value */
|
||||||
|
subu BYTES, $at, BYTES
|
||||||
|
|
||||||
|
jr T0
|
||||||
|
|
||||||
|
/* Jump table */
|
||||||
|
FOR_EACH_WORD(JMPTBL_UNALIGNED)
|
||||||
|
.end chacha_crypt_arch
|
||||||
|
.set at
|
||||||
|
|
||||||
|
/* Input arguments
|
||||||
|
* STATE $a0
|
||||||
|
* OUT $a1
|
||||||
|
* NROUND $a2
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef X12
|
||||||
|
#undef X13
|
||||||
|
#undef X14
|
||||||
|
#undef X15
|
||||||
|
|
||||||
|
#define X12 $a3
|
||||||
|
#define X13 $at
|
||||||
|
#define X14 $v0
|
||||||
|
#define X15 STATE
|
||||||
|
|
||||||
|
.set noat
|
||||||
|
.globl hchacha_block_arch
|
||||||
|
.ent hchacha_block_arch
|
||||||
|
hchacha_block_arch:
|
||||||
|
.frame $sp, STACK_SIZE, $ra
|
||||||
|
|
||||||
|
addiu $sp, -STACK_SIZE
|
||||||
|
|
||||||
|
/* Save X11(s6) */
|
||||||
|
sw X11, 0($sp)
|
||||||
|
|
||||||
|
lw X0, 0(STATE)
|
||||||
|
lw X1, 4(STATE)
|
||||||
|
lw X2, 8(STATE)
|
||||||
|
lw X3, 12(STATE)
|
||||||
|
lw X4, 16(STATE)
|
||||||
|
lw X5, 20(STATE)
|
||||||
|
lw X6, 24(STATE)
|
||||||
|
lw X7, 28(STATE)
|
||||||
|
lw X8, 32(STATE)
|
||||||
|
lw X9, 36(STATE)
|
||||||
|
lw X10, 40(STATE)
|
||||||
|
lw X11, 44(STATE)
|
||||||
|
lw X12, 48(STATE)
|
||||||
|
lw X13, 52(STATE)
|
||||||
|
lw X14, 56(STATE)
|
||||||
|
lw X15, 60(STATE)
|
||||||
|
|
||||||
|
.Loop_hchacha_xor_rounds:
|
||||||
|
addiu $a2, -2
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||||
|
bnez $a2, .Loop_hchacha_xor_rounds
|
||||||
|
|
||||||
|
/* Restore used register */
|
||||||
|
lw X11, 0($sp)
|
||||||
|
|
||||||
|
sw X0, 0(OUT)
|
||||||
|
sw X1, 4(OUT)
|
||||||
|
sw X2, 8(OUT)
|
||||||
|
sw X3, 12(OUT)
|
||||||
|
sw X12, 16(OUT)
|
||||||
|
sw X13, 20(OUT)
|
||||||
|
sw X14, 24(OUT)
|
||||||
|
sw X15, 28(OUT)
|
||||||
|
|
||||||
|
addiu $sp, STACK_SIZE
|
||||||
|
jr $ra
|
||||||
|
.end hchacha_block_arch
|
||||||
|
.set at
|
152
arch/mips/crypto/chacha-glue.c
Normal file
152
arch/mips/crypto/chacha-glue.c
Normal file
|
@ -0,0 +1,152 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* MIPS accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/byteorder.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds);
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
static int chacha_mips_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
|
{
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
u32 state[16];
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||||
|
nbytes, ctx->nrounds);
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_mips(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_mips_stream_xor(req, ctx, req->iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_mips(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u32 state[16];
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
hchacha_block(state, subctx.key, ctx->nrounds);
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_mips_stream_xor(req, &subctx, real_iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct skcipher_alg algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_mips,
|
||||||
|
.decrypt = chacha_mips,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_mips,
|
||||||
|
.decrypt = xchacha_mips,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_mips,
|
||||||
|
.decrypt = xchacha_mips,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init chacha_simd_mod_init(void)
|
||||||
|
{
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
|
||||||
|
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
|
||||||
|
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
|
||||||
|
module_exit(chacha_simd_mod_fini);
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
|
||||||
|
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-mips");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-mips");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-mips");
|
191
arch/mips/crypto/poly1305-glue.c
Normal file
191
arch/mips/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
|
||||||
|
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_mips(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int mips_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_mips(&dctx->h, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
poly1305_blocks_mips(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
|
||||||
|
unsigned int len)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
mips_poly1305_blocks(dctx, src, len, 1);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks_mips(&dctx->h, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
poly1305_blocks_mips(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit_mips(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg mips_poly1305_alg = {
|
||||||
|
.init = mips_poly1305_init,
|
||||||
|
.update = mips_poly1305_update,
|
||||||
|
.final = mips_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init mips_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shash(&mips_poly1305_alg) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit mips_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
crypto_unregister_shash(&mips_poly1305_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(mips_poly1305_mod_init);
|
||||||
|
module_exit(mips_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-mips");
|
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
File diff suppressed because it is too large
Load diff
|
@ -152,6 +152,7 @@ config PPC
|
||||||
select ARCH_USE_BUILTIN_BSWAP
|
select ARCH_USE_BUILTIN_BSWAP
|
||||||
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
|
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
|
||||||
select ARCH_WANT_IPC_PARSE_VERSION
|
select ARCH_WANT_IPC_PARSE_VERSION
|
||||||
|
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
|
||||||
select ARCH_WEAK_RELEASE_ACQUIRE
|
select ARCH_WEAK_RELEASE_ACQUIRE
|
||||||
select BINFMT_ELF
|
select BINFMT_ELF
|
||||||
select BUILDTIME_EXTABLE_SORT
|
select BUILDTIME_EXTABLE_SORT
|
||||||
|
@ -1009,6 +1010,19 @@ config FSL_RIO
|
||||||
|
|
||||||
source "drivers/rapidio/Kconfig"
|
source "drivers/rapidio/Kconfig"
|
||||||
|
|
||||||
|
config PPC_RTAS_FILTER
|
||||||
|
bool "Enable filtering of RTAS syscalls"
|
||||||
|
default y
|
||||||
|
depends on PPC_RTAS
|
||||||
|
help
|
||||||
|
The RTAS syscall API has security issues that could be used to
|
||||||
|
compromise system integrity. This option enforces restrictions on the
|
||||||
|
RTAS calls and arguments passed by userspace programs to mitigate
|
||||||
|
these issues.
|
||||||
|
|
||||||
|
Say Y unless you know what you are doing and the filter is causing
|
||||||
|
problems for you.
|
||||||
|
|
||||||
endmenu
|
endmenu
|
||||||
|
|
||||||
config NONSTATIC_KERNEL
|
config NONSTATIC_KERNEL
|
||||||
|
|
|
@ -12,6 +12,8 @@
|
||||||
#ifndef _ASM_POWERPC_LMB_H
|
#ifndef _ASM_POWERPC_LMB_H
|
||||||
#define _ASM_POWERPC_LMB_H
|
#define _ASM_POWERPC_LMB_H
|
||||||
|
|
||||||
|
#include <linux/sched.h>
|
||||||
|
|
||||||
struct drmem_lmb {
|
struct drmem_lmb {
|
||||||
u64 base_addr;
|
u64 base_addr;
|
||||||
u32 drc_index;
|
u32 drc_index;
|
||||||
|
@ -22,13 +24,27 @@ struct drmem_lmb {
|
||||||
struct drmem_lmb_info {
|
struct drmem_lmb_info {
|
||||||
struct drmem_lmb *lmbs;
|
struct drmem_lmb *lmbs;
|
||||||
int n_lmbs;
|
int n_lmbs;
|
||||||
u32 lmb_size;
|
u64 lmb_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct drmem_lmb_info *drmem_info;
|
extern struct drmem_lmb_info *drmem_info;
|
||||||
|
|
||||||
|
static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
|
||||||
|
const struct drmem_lmb *start)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* DLPAR code paths can take several milliseconds per element
|
||||||
|
* when interacting with firmware. Ensure that we don't
|
||||||
|
* unfairly monopolize the CPU.
|
||||||
|
*/
|
||||||
|
if (((++lmb - start) % 16) == 0)
|
||||||
|
cond_resched();
|
||||||
|
|
||||||
|
return lmb;
|
||||||
|
}
|
||||||
|
|
||||||
#define for_each_drmem_lmb_in_range(lmb, start, end) \
|
#define for_each_drmem_lmb_in_range(lmb, start, end) \
|
||||||
for ((lmb) = (start); (lmb) < (end); (lmb)++)
|
for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))
|
||||||
|
|
||||||
#define for_each_drmem_lmb(lmb) \
|
#define for_each_drmem_lmb(lmb) \
|
||||||
for_each_drmem_lmb_in_range((lmb), \
|
for_each_drmem_lmb_in_range((lmb), \
|
||||||
|
@ -67,7 +83,7 @@ struct of_drconf_cell_v2 {
|
||||||
#define DRCONF_MEM_AI_INVALID 0x00000040
|
#define DRCONF_MEM_AI_INVALID 0x00000040
|
||||||
#define DRCONF_MEM_RESERVED 0x00000080
|
#define DRCONF_MEM_RESERVED 0x00000080
|
||||||
|
|
||||||
static inline u32 drmem_lmb_size(void)
|
static inline u64 drmem_lmb_size(void)
|
||||||
{
|
{
|
||||||
return drmem_info->lmb_size;
|
return drmem_info->lmb_size;
|
||||||
}
|
}
|
||||||
|
|
|
@ -204,7 +204,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||||
*/
|
*/
|
||||||
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
|
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
|
||||||
{
|
{
|
||||||
switch_mm(prev, next, current);
|
switch_mm_irqs_off(prev, next, current);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We don't currently use enter_lazy_tlb() for anything */
|
/* We don't currently use enter_lazy_tlb() for anything */
|
||||||
|
|
|
@ -788,7 +788,7 @@
|
||||||
#define THRM1_TIN (1 << 31)
|
#define THRM1_TIN (1 << 31)
|
||||||
#define THRM1_TIV (1 << 30)
|
#define THRM1_TIV (1 << 30)
|
||||||
#define THRM1_THRES(x) ((x&0x7f)<<23)
|
#define THRM1_THRES(x) ((x&0x7f)<<23)
|
||||||
#define THRM3_SITV(x) ((x&0x3fff)<<1)
|
#define THRM3_SITV(x) ((x & 0x1fff) << 1)
|
||||||
#define THRM1_TID (1<<2)
|
#define THRM1_TID (1<<2)
|
||||||
#define THRM1_TIE (1<<1)
|
#define THRM1_TIE (1<<1)
|
||||||
#define THRM1_V (1<<0)
|
#define THRM1_V (1<<0)
|
||||||
|
|
|
@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||||
return false;
|
return false;
|
||||||
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
|
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||||
}
|
}
|
||||||
static inline void mm_reset_thread_local(struct mm_struct *mm)
|
|
||||||
{
|
|
||||||
WARN_ON(atomic_read(&mm->context.copros) > 0);
|
|
||||||
/*
|
|
||||||
* It's possible for mm_access to take a reference on mm_users to
|
|
||||||
* access the remote mm from another thread, but it's not allowed
|
|
||||||
* to set mm_cpumask, so mm_users may be > 1 here.
|
|
||||||
*/
|
|
||||||
WARN_ON(current->mm != mm);
|
|
||||||
atomic_set(&mm->context.active_cpus, 1);
|
|
||||||
cpumask_clear(mm_cpumask(mm));
|
|
||||||
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
|
|
||||||
}
|
|
||||||
#else /* CONFIG_PPC_BOOK3S_64 */
|
#else /* CONFIG_PPC_BOOK3S_64 */
|
||||||
static inline int mm_is_thread_local(struct mm_struct *mm)
|
static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
|
|
|
@ -1057,6 +1057,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The sys_rtas syscall, as originally designed, allows root to pass
|
||||||
|
* arbitrary physical addresses to RTAS calls. A number of RTAS calls
|
||||||
|
* can be abused to write to arbitrary memory and do other things that
|
||||||
|
* are potentially harmful to system integrity, and thus should only
|
||||||
|
* be used inside the kernel and not exposed to userspace.
|
||||||
|
*
|
||||||
|
* All known legitimate users of the sys_rtas syscall will only ever
|
||||||
|
* pass addresses that fall within the RMO buffer, and use a known
|
||||||
|
* subset of RTAS calls.
|
||||||
|
*
|
||||||
|
* Accordingly, we filter RTAS requests to check that the call is
|
||||||
|
* permitted, and that provided pointers fall within the RMO buffer.
|
||||||
|
* The rtas_filters list contains an entry for each permitted call,
|
||||||
|
* with the indexes of the parameters which are expected to contain
|
||||||
|
* addresses and sizes of buffers allocated inside the RMO buffer.
|
||||||
|
*/
|
||||||
|
struct rtas_filter {
|
||||||
|
const char *name;
|
||||||
|
int token;
|
||||||
|
/* Indexes into the args buffer, -1 if not used */
|
||||||
|
int buf_idx1;
|
||||||
|
int size_idx1;
|
||||||
|
int buf_idx2;
|
||||||
|
int size_idx2;
|
||||||
|
|
||||||
|
int fixed_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct rtas_filter rtas_filters[] __ro_after_init = {
|
||||||
|
{ "ibm,activate-firmware", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */
|
||||||
|
{ "display-character", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,display-message", -1, 0, -1, -1, -1 },
|
||||||
|
{ "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
|
||||||
|
{ "ibm,close-errinjct", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,open-errinct", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-indices", -1, 2, 3, -1, -1 },
|
||||||
|
{ "get-power-level", -1, -1, -1, -1, -1 },
|
||||||
|
{ "get-sensor-state", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
|
||||||
|
{ "get-time-of-day", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-vpd", -1, 0, -1, 1, 2 },
|
||||||
|
{ "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
|
||||||
|
{ "ibm,platform-dump", -1, 4, 5, -1, -1 },
|
||||||
|
{ "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
|
||||||
|
{ "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
|
||||||
|
{ "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-indicator", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-power-level", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-time-for-power-on", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
|
||||||
|
{ "set-time-of-day", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,suspend-me", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
|
||||||
|
{ "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
|
||||||
|
{ "ibm,physical-attestation", -1, 0, 1, -1, -1 },
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool in_rmo_buf(u32 base, u32 end)
|
||||||
|
{
|
||||||
|
return base >= rtas_rmo_buf &&
|
||||||
|
base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) &&
|
||||||
|
base <= end &&
|
||||||
|
end >= rtas_rmo_buf &&
|
||||||
|
end < (rtas_rmo_buf + RTAS_RMOBUF_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool block_rtas_call(int token, int nargs,
|
||||||
|
struct rtas_args *args)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
|
||||||
|
struct rtas_filter *f = &rtas_filters[i];
|
||||||
|
u32 base, size, end;
|
||||||
|
|
||||||
|
if (token != f->token)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (f->buf_idx1 != -1) {
|
||||||
|
base = be32_to_cpu(args->args[f->buf_idx1]);
|
||||||
|
if (f->size_idx1 != -1)
|
||||||
|
size = be32_to_cpu(args->args[f->size_idx1]);
|
||||||
|
else if (f->fixed_size)
|
||||||
|
size = f->fixed_size;
|
||||||
|
else
|
||||||
|
size = 1;
|
||||||
|
|
||||||
|
end = base + size - 1;
|
||||||
|
if (!in_rmo_buf(base, end))
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (f->buf_idx2 != -1) {
|
||||||
|
base = be32_to_cpu(args->args[f->buf_idx2]);
|
||||||
|
if (f->size_idx2 != -1)
|
||||||
|
size = be32_to_cpu(args->args[f->size_idx2]);
|
||||||
|
else if (f->fixed_size)
|
||||||
|
size = f->fixed_size;
|
||||||
|
else
|
||||||
|
size = 1;
|
||||||
|
end = base + size - 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Special case for ibm,configure-connector where the
|
||||||
|
* address can be 0
|
||||||
|
*/
|
||||||
|
if (!strcmp(f->name, "ibm,configure-connector") &&
|
||||||
|
base == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!in_rmo_buf(base, end))
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
err:
|
||||||
|
pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
|
||||||
|
pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
|
||||||
|
token, nargs, current->comm);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
static bool block_rtas_call(int token, int nargs,
|
||||||
|
struct rtas_args *args)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* CONFIG_PPC_RTAS_FILTER */
|
||||||
|
|
||||||
/* We assume to be passed big endian arguments */
|
/* We assume to be passed big endian arguments */
|
||||||
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
||||||
{
|
{
|
||||||
|
@ -1094,6 +1235,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
||||||
args.rets = &args.args[nargs];
|
args.rets = &args.args[nargs];
|
||||||
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
|
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
|
||||||
|
|
||||||
|
if (block_rtas_call(token, nargs, &args))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
/* Need to handle ibm,suspend_me call specially */
|
/* Need to handle ibm,suspend_me call specially */
|
||||||
if (token == ibm_suspend_me_token) {
|
if (token == ibm_suspend_me_token) {
|
||||||
|
|
||||||
|
@ -1155,6 +1299,9 @@ void __init rtas_initialize(void)
|
||||||
unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
|
unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
|
||||||
u32 base, size, entry;
|
u32 base, size, entry;
|
||||||
int no_base, no_size, no_entry;
|
int no_base, no_size, no_entry;
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
int i;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Get RTAS dev node and fill up our "rtas" structure with infos
|
/* Get RTAS dev node and fill up our "rtas" structure with infos
|
||||||
* about it.
|
* about it.
|
||||||
|
@ -1190,6 +1337,12 @@ void __init rtas_initialize(void)
|
||||||
#ifdef CONFIG_RTAS_ERROR_LOGGING
|
#ifdef CONFIG_RTAS_ERROR_LOGGING
|
||||||
rtas_last_error_token = rtas_token("rtas-last-error");
|
rtas_last_error_token = rtas_token("rtas-last-error");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
|
||||||
|
rtas_filters[i].token = rtas_token(rtas_filters[i].name);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int __init early_init_dt_scan_rtas(unsigned long node,
|
int __init early_init_dt_scan_rtas(unsigned long node,
|
||||||
|
|
|
@ -29,29 +29,27 @@
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct cpu, cpu_devices);
|
static DEFINE_PER_CPU(struct cpu, cpu_devices);
|
||||||
|
|
||||||
/*
|
|
||||||
* SMT snooze delay stuff, 64-bit only for now
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifdef CONFIG_PPC64
|
#ifdef CONFIG_PPC64
|
||||||
|
|
||||||
/* Time in microseconds we delay before sleeping in the idle loop */
|
/*
|
||||||
static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
|
* Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
|
||||||
|
* smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
|
||||||
|
* 2014:
|
||||||
|
*
|
||||||
|
* "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
|
||||||
|
* up the kernel code."
|
||||||
|
*
|
||||||
|
* powerpc-utils stopped using it as of 1.3.8. At some point in the future this
|
||||||
|
* code should be removed.
|
||||||
|
*/
|
||||||
|
|
||||||
static ssize_t store_smt_snooze_delay(struct device *dev,
|
static ssize_t store_smt_snooze_delay(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
const char *buf,
|
const char *buf,
|
||||||
size_t count)
|
size_t count)
|
||||||
{
|
{
|
||||||
struct cpu *cpu = container_of(dev, struct cpu, dev);
|
pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n",
|
||||||
ssize_t ret;
|
current->comm, current->pid);
|
||||||
long snooze;
|
|
||||||
|
|
||||||
ret = sscanf(buf, "%ld", &snooze);
|
|
||||||
if (ret != 1)
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,9 +57,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
struct cpu *cpu = container_of(dev, struct cpu, dev);
|
pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n",
|
||||||
|
current->comm, current->pid);
|
||||||
return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
|
return sprintf(buf, "100\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
||||||
|
@ -69,16 +67,10 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
||||||
|
|
||||||
static int __init setup_smt_snooze_delay(char *str)
|
static int __init setup_smt_snooze_delay(char *str)
|
||||||
{
|
{
|
||||||
unsigned int cpu;
|
|
||||||
long snooze;
|
|
||||||
|
|
||||||
if (!cpu_has_feature(CPU_FTR_SMT))
|
if (!cpu_has_feature(CPU_FTR_SMT))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
snooze = simple_strtol(str, NULL, 10);
|
pr_warn("smt-snooze-delay command line option has no effect\n");
|
||||||
for_each_possible_cpu(cpu)
|
|
||||||
per_cpu(smt_snooze_delay, cpu) = snooze;
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
__setup("smt-snooze-delay=", setup_smt_snooze_delay);
|
__setup("smt-snooze-delay=", setup_smt_snooze_delay);
|
||||||
|
|
|
@ -13,13 +13,14 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/errno.h>
|
#include <linux/errno.h>
|
||||||
#include <linux/jiffies.h>
|
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/param.h>
|
#include <linux/param.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
#include <linux/interrupt.h>
|
#include <linux/interrupt.h>
|
||||||
#include <linux/init.h>
|
#include <linux/init.h>
|
||||||
|
#include <linux/delay.h>
|
||||||
|
#include <linux/workqueue.h>
|
||||||
|
|
||||||
#include <asm/io.h>
|
#include <asm/io.h>
|
||||||
#include <asm/reg.h>
|
#include <asm/reg.h>
|
||||||
|
@ -39,9 +40,7 @@ static struct tau_temp
|
||||||
unsigned char grew;
|
unsigned char grew;
|
||||||
} tau[NR_CPUS];
|
} tau[NR_CPUS];
|
||||||
|
|
||||||
struct timer_list tau_timer;
|
static bool tau_int_enable;
|
||||||
|
|
||||||
#undef DEBUG
|
|
||||||
|
|
||||||
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
|
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
|
||||||
* dynamic adjustment to minimize # of interrupts */
|
* dynamic adjustment to minimize # of interrupts */
|
||||||
|
@ -50,74 +49,51 @@ struct timer_list tau_timer;
|
||||||
#define step_size 2 /* step size when temp goes out of range */
|
#define step_size 2 /* step size when temp goes out of range */
|
||||||
#define window_expand 1 /* expand the window by this much */
|
#define window_expand 1 /* expand the window by this much */
|
||||||
/* configurable values for shrinking the window */
|
/* configurable values for shrinking the window */
|
||||||
#define shrink_timer 2*HZ /* period between shrinking the window */
|
#define shrink_timer 2000 /* period between shrinking the window */
|
||||||
#define min_window 2 /* minimum window size, degrees C */
|
#define min_window 2 /* minimum window size, degrees C */
|
||||||
|
|
||||||
static void set_thresholds(unsigned long cpu)
|
static void set_thresholds(unsigned long cpu)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_TAU_INT
|
u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
|
||||||
/*
|
|
||||||
* setup THRM1,
|
|
||||||
* threshold, valid bit, enable interrupts, interrupt when below threshold
|
|
||||||
*/
|
|
||||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
|
|
||||||
|
|
||||||
/* setup THRM2,
|
/* setup THRM1, threshold, valid bit, interrupt when below threshold */
|
||||||
* threshold, valid bit, enable interrupts, interrupt when above threshold
|
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
|
||||||
*/
|
|
||||||
mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
|
/* setup THRM2, threshold, valid bit, interrupt when above threshold */
|
||||||
#else
|
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
|
||||||
/* same thing but don't enable interrupts */
|
|
||||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
|
|
||||||
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TAUupdate(int cpu)
|
static void TAUupdate(int cpu)
|
||||||
{
|
{
|
||||||
unsigned thrm;
|
u32 thrm;
|
||||||
|
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
|
||||||
#ifdef DEBUG
|
|
||||||
printk("TAUupdate ");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* if both thresholds are crossed, the step_sizes cancel out
|
/* if both thresholds are crossed, the step_sizes cancel out
|
||||||
* and the window winds up getting expanded twice. */
|
* and the window winds up getting expanded twice. */
|
||||||
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
|
thrm = mfspr(SPRN_THRM1);
|
||||||
if(thrm & THRM1_TIN){ /* crossed low threshold */
|
if ((thrm & bits) == bits) {
|
||||||
|
mtspr(SPRN_THRM1, 0);
|
||||||
|
|
||||||
if (tau[cpu].low >= step_size) {
|
if (tau[cpu].low >= step_size) {
|
||||||
tau[cpu].low -= step_size;
|
tau[cpu].low -= step_size;
|
||||||
tau[cpu].high -= (step_size - window_expand);
|
tau[cpu].high -= (step_size - window_expand);
|
||||||
}
|
}
|
||||||
tau[cpu].grew = 1;
|
tau[cpu].grew = 1;
|
||||||
#ifdef DEBUG
|
pr_debug("%s: low threshold crossed\n", __func__);
|
||||||
printk("low threshold crossed ");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
thrm = mfspr(SPRN_THRM2);
|
||||||
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
|
if ((thrm & bits) == bits) {
|
||||||
if(thrm & THRM1_TIN){ /* crossed high threshold */
|
mtspr(SPRN_THRM2, 0);
|
||||||
|
|
||||||
if (tau[cpu].high <= 127 - step_size) {
|
if (tau[cpu].high <= 127 - step_size) {
|
||||||
tau[cpu].low += (step_size - window_expand);
|
tau[cpu].low += (step_size - window_expand);
|
||||||
tau[cpu].high += step_size;
|
tau[cpu].high += step_size;
|
||||||
}
|
}
|
||||||
tau[cpu].grew = 1;
|
tau[cpu].grew = 1;
|
||||||
#ifdef DEBUG
|
pr_debug("%s: high threshold crossed\n", __func__);
|
||||||
printk("high threshold crossed ");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUG
|
|
||||||
printk("grew = %d\n", tau[cpu].grew);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
|
|
||||||
set_thresholds(cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_TAU_INT
|
#ifdef CONFIG_TAU_INT
|
||||||
/*
|
/*
|
||||||
* TAU interrupts - called when we have a thermal assist unit interrupt
|
* TAU interrupts - called when we have a thermal assist unit interrupt
|
||||||
|
@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
|
||||||
static void tau_timeout(void * info)
|
static void tau_timeout(void * info)
|
||||||
{
|
{
|
||||||
int cpu;
|
int cpu;
|
||||||
unsigned long flags;
|
|
||||||
int size;
|
int size;
|
||||||
int shrink;
|
int shrink;
|
||||||
|
|
||||||
/* disabling interrupts *should* be okay */
|
|
||||||
local_irq_save(flags);
|
|
||||||
cpu = smp_processor_id();
|
cpu = smp_processor_id();
|
||||||
|
|
||||||
#ifndef CONFIG_TAU_INT
|
if (!tau_int_enable)
|
||||||
TAUupdate(cpu);
|
TAUupdate(cpu);
|
||||||
#endif
|
|
||||||
|
/* Stop thermal sensor comparisons and interrupts */
|
||||||
|
mtspr(SPRN_THRM3, 0);
|
||||||
|
|
||||||
size = tau[cpu].high - tau[cpu].low;
|
size = tau[cpu].high - tau[cpu].low;
|
||||||
if (size > min_window && ! tau[cpu].grew) {
|
if (size > min_window && ! tau[cpu].grew) {
|
||||||
|
@ -173,32 +148,26 @@ static void tau_timeout(void * info)
|
||||||
|
|
||||||
set_thresholds(cpu);
|
set_thresholds(cpu);
|
||||||
|
|
||||||
/*
|
/* Restart thermal sensor comparisons and interrupts.
|
||||||
* Do the enable every time, since otherwise a bunch of (relatively)
|
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
|
||||||
* complex sleep code needs to be added. One mtspr every time
|
* recommends that "the maximum value be set in THRM3 under all
|
||||||
* tau_timeout is called is probably not a big deal.
|
* conditions."
|
||||||
*
|
|
||||||
* Enable thermal sensor and set up sample interval timer
|
|
||||||
* need 20 us to do the compare.. until a nice 'cpu_speed' function
|
|
||||||
* call is implemented, just assume a 500 mhz clock. It doesn't really
|
|
||||||
* matter if we take too long for a compare since it's all interrupt
|
|
||||||
* driven anyway.
|
|
||||||
*
|
|
||||||
* use a extra long time.. (60 us @ 500 mhz)
|
|
||||||
*/
|
*/
|
||||||
mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
|
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
|
||||||
|
|
||||||
local_irq_restore(flags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void tau_timeout_smp(struct timer_list *unused)
|
static struct workqueue_struct *tau_workq;
|
||||||
|
|
||||||
|
static void tau_work_func(struct work_struct *work)
|
||||||
{
|
{
|
||||||
|
msleep(shrink_timer);
|
||||||
/* schedule ourselves to be run again */
|
|
||||||
mod_timer(&tau_timer, jiffies + shrink_timer) ;
|
|
||||||
on_each_cpu(tau_timeout, NULL, 0);
|
on_each_cpu(tau_timeout, NULL, 0);
|
||||||
|
/* schedule ourselves to be run again */
|
||||||
|
queue_work(tau_workq, work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DECLARE_WORK(tau_work, tau_work_func);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* setup the TAU
|
* setup the TAU
|
||||||
*
|
*
|
||||||
|
@ -231,21 +200,19 @@ static int __init TAU_init(void)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
|
||||||
|
!strcmp(cur_cpu_spec->platform, "ppc750");
|
||||||
|
|
||||||
/* first, set up the window shrinking timer */
|
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
|
||||||
timer_setup(&tau_timer, tau_timeout_smp, 0);
|
if (!tau_workq)
|
||||||
tau_timer.expires = jiffies + shrink_timer;
|
return -ENOMEM;
|
||||||
add_timer(&tau_timer);
|
|
||||||
|
|
||||||
on_each_cpu(TAU_init_smp, NULL, 0);
|
on_each_cpu(TAU_init_smp, NULL, 0);
|
||||||
|
|
||||||
printk("Thermal assist unit ");
|
queue_work(tau_workq, &tau_work);
|
||||||
#ifdef CONFIG_TAU_INT
|
|
||||||
printk("using interrupts, ");
|
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
|
||||||
#else
|
tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
|
||||||
printk("using timers, ");
|
|
||||||
#endif
|
|
||||||
printk("shrink_timer: %d jiffies\n", shrink_timer);
|
|
||||||
tau_initialized = 1;
|
tau_initialized = 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -794,7 +794,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
unsigned int ra, rb, t, i, sel, instr, rc;
|
unsigned int ra, rb, t, i, sel, instr, rc;
|
||||||
const void __user *addr;
|
const void __user *addr;
|
||||||
u8 vbuf[16], *vdst;
|
u8 vbuf[16] __aligned(16), *vdst;
|
||||||
unsigned long ea, msr, msr_mask;
|
unsigned long ea, msr, msr_mask;
|
||||||
bool swap;
|
bool swap;
|
||||||
|
|
||||||
|
|
|
@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
|
||||||
struct mm_struct *mm = arg;
|
struct mm_struct *mm = arg;
|
||||||
unsigned long pid = mm->context.id;
|
unsigned long pid = mm->context.id;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A kthread could have done a mmget_not_zero() after the flushing CPU
|
||||||
|
* checked mm_is_singlethreaded, and be in the process of
|
||||||
|
* kthread_use_mm when interrupted here. In that case, current->mm will
|
||||||
|
* be set to mm, because kthread_use_mm() setting ->mm and switching to
|
||||||
|
* the mm is done with interrupts off.
|
||||||
|
*/
|
||||||
if (current->mm == mm)
|
if (current->mm == mm)
|
||||||
return; /* Local CPU */
|
goto out_flush;
|
||||||
|
|
||||||
if (current->active_mm == mm) {
|
if (current->active_mm == mm) {
|
||||||
/*
|
WARN_ON_ONCE(current->mm != NULL);
|
||||||
* Must be a kernel thread because sender is single-threaded.
|
/* Is a kernel thread and is using mm as the lazy tlb */
|
||||||
*/
|
|
||||||
BUG_ON(current->mm);
|
|
||||||
mmgrab(&init_mm);
|
mmgrab(&init_mm);
|
||||||
switch_mm(mm, &init_mm, current);
|
|
||||||
current->active_mm = &init_mm;
|
current->active_mm = &init_mm;
|
||||||
|
switch_mm_irqs_off(mm, &init_mm, current);
|
||||||
mmdrop(mm);
|
mmdrop(mm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic_dec(&mm->context.active_cpus);
|
||||||
|
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||||
|
|
||||||
|
out_flush:
|
||||||
_tlbiel_pid(pid, RIC_FLUSH_ALL);
|
_tlbiel_pid(pid, RIC_FLUSH_ALL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
|
||||||
*/
|
*/
|
||||||
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
|
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
|
||||||
(void *)mm, 1);
|
(void *)mm, 1);
|
||||||
mm_reset_thread_local(mm);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void radix__flush_tlb_mm(struct mm_struct *mm)
|
void radix__flush_tlb_mm(struct mm_struct *mm)
|
||||||
|
|
|
@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id)
|
||||||
|
|
||||||
#define REQUEST_NAME system_performance_capabilities
|
#define REQUEST_NAME system_performance_capabilities
|
||||||
#define REQUEST_NUM 0x40
|
#define REQUEST_NUM 0x40
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__field(0, 1, perf_collect_privileged)
|
REQUEST(__field(0, 1, perf_collect_privileged)
|
||||||
__field(0x1, 1, capability_mask)
|
__field(0x1, 1, capability_mask)
|
||||||
|
@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id)
|
||||||
|
|
||||||
#define REQUEST_NAME system_hypervisor_times
|
#define REQUEST_NAME system_hypervisor_times
|
||||||
#define REQUEST_NUM 0xF0
|
#define REQUEST_NUM 0xF0
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||||
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
|
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
|
||||||
|
@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||||
|
|
||||||
#define REQUEST_NAME system_tlbie_count_and_time
|
#define REQUEST_NAME system_tlbie_count_and_time
|
||||||
#define REQUEST_NUM 0xF4
|
#define REQUEST_NUM 0xF4
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__count(0, 8, tlbie_instructions_issued)
|
REQUEST(__count(0, 8, tlbie_instructions_issued)
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||||
|
|
||||||
mask |= CNST_PMC_MASK(pmc);
|
mask |= CNST_PMC_MASK(pmc);
|
||||||
value |= CNST_PMC_VAL(pmc);
|
value |= CNST_PMC_VAL(pmc);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* PMC5 and PMC6 are used to count cycles and instructions and
|
||||||
|
* they do not support most of the constraint bits. Add a check
|
||||||
|
* to exclude PMC5/6 from most of the constraints except for
|
||||||
|
* EBB/BHRB.
|
||||||
|
*/
|
||||||
|
if (pmc >= 5)
|
||||||
|
goto ebb_bhrb;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pmc <= 4) {
|
if (pmc <= 4) {
|
||||||
|
@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ebb_bhrb:
|
||||||
if (!pmc && ebb)
|
if (!pmc && ebb)
|
||||||
/* EBB events must specify the PMC */
|
/* EBB events must specify the PMC */
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -238,12 +238,11 @@ config TAU
|
||||||
temperature within 2-4 degrees Celsius. This option shows the current
|
temperature within 2-4 degrees Celsius. This option shows the current
|
||||||
on-die temperature in /proc/cpuinfo if the cpu supports it.
|
on-die temperature in /proc/cpuinfo if the cpu supports it.
|
||||||
|
|
||||||
Unfortunately, on some chip revisions, this sensor is very inaccurate
|
Unfortunately, this sensor is very inaccurate when uncalibrated, so
|
||||||
and in many cases, does not work at all, so don't assume the cpu
|
don't assume the cpu temp is actually what /proc/cpuinfo says it is.
|
||||||
temp is actually what /proc/cpuinfo says it is.
|
|
||||||
|
|
||||||
config TAU_INT
|
config TAU_INT
|
||||||
bool "Interrupt driven TAU driver (DANGEROUS)"
|
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
|
||||||
depends on TAU
|
depends on TAU
|
||||||
---help---
|
---help---
|
||||||
The TAU supports an interrupt driven mode which causes an interrupt
|
The TAU supports an interrupt driven mode which causes an interrupt
|
||||||
|
@ -251,12 +250,7 @@ config TAU_INT
|
||||||
to get notified the temp has exceeded a range. With this option off,
|
to get notified the temp has exceeded a range. With this option off,
|
||||||
a timer is used to re-check the temperature periodically.
|
a timer is used to re-check the temperature periodically.
|
||||||
|
|
||||||
However, on some cpus it appears that the TAU interrupt hardware
|
If in doubt, say N here.
|
||||||
is buggy and can cause a situation which would lead unexplained hard
|
|
||||||
lockups.
|
|
||||||
|
|
||||||
Unless you are extending the TAU driver, or enjoy kernel/hardware
|
|
||||||
debugging, leave this option off.
|
|
||||||
|
|
||||||
config TAU_AVERAGE
|
config TAU_AVERAGE
|
||||||
bool "Average high and low temp"
|
bool "Average high and low temp"
|
||||||
|
|
|
@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
|
||||||
uint32_t type)
|
|
||||||
{
|
{
|
||||||
struct dump_obj *dump;
|
struct dump_obj *dump;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
|
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
|
||||||
if (!dump)
|
if (!dump)
|
||||||
return NULL;
|
return;
|
||||||
|
|
||||||
dump->kobj.kset = dump_kset;
|
dump->kobj.kset = dump_kset;
|
||||||
|
|
||||||
|
@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
||||||
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
|
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
kobject_put(&dump->kobj);
|
kobject_put(&dump->kobj);
|
||||||
return NULL;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As soon as the sysfs file for this dump is created/activated there is
|
||||||
|
* a chance the opal_errd daemon (or any userspace) might read and
|
||||||
|
* acknowledge the dump before kobject_uevent() is called. If that
|
||||||
|
* happens then there is a potential race between
|
||||||
|
* dump_ack_store->kobject_put() and kobject_uevent() which leads to a
|
||||||
|
* use-after-free of a kernfs object resulting in a kernel crash.
|
||||||
|
*
|
||||||
|
* To avoid that, we need to take a reference on behalf of the bin file,
|
||||||
|
* so that our reference remains valid while we call kobject_uevent().
|
||||||
|
* We then drop our reference before exiting the function, leaving the
|
||||||
|
* bin file to drop the last reference (if it hasn't already).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Take a reference for the bin file */
|
||||||
|
kobject_get(&dump->kobj);
|
||||||
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
|
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
|
||||||
if (rc) {
|
if (rc == 0) {
|
||||||
kobject_put(&dump->kobj);
|
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
||||||
__func__, dump->id, dump->size);
|
__func__, dump->id, dump->size);
|
||||||
|
} else {
|
||||||
|
/* Drop reference count taken for bin file */
|
||||||
|
kobject_put(&dump->kobj);
|
||||||
|
}
|
||||||
|
|
||||||
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
/* Drop our reference */
|
||||||
|
kobject_put(&dump->kobj);
|
||||||
return dump;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static irqreturn_t process_dump(int irq, void *data)
|
static irqreturn_t process_dump(int irq, void *data)
|
||||||
|
|
|
@ -183,14 +183,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
||||||
{
|
{
|
||||||
struct elog_obj *elog;
|
struct elog_obj *elog;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
|
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
|
||||||
if (!elog)
|
if (!elog)
|
||||||
return NULL;
|
return;
|
||||||
|
|
||||||
elog->kobj.kset = elog_kset;
|
elog->kobj.kset = elog_kset;
|
||||||
|
|
||||||
|
@ -223,18 +223,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
||||||
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
|
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
kobject_put(&elog->kobj);
|
kobject_put(&elog->kobj);
|
||||||
return NULL;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As soon as the sysfs file for this elog is created/activated there is
|
||||||
|
* a chance the opal_errd daemon (or any userspace) might read and
|
||||||
|
* acknowledge the elog before kobject_uevent() is called. If that
|
||||||
|
* happens then there is a potential race between
|
||||||
|
* elog_ack_store->kobject_put() and kobject_uevent() which leads to a
|
||||||
|
* use-after-free of a kernfs object resulting in a kernel crash.
|
||||||
|
*
|
||||||
|
* To avoid that, we need to take a reference on behalf of the bin file,
|
||||||
|
* so that our reference remains valid while we call kobject_uevent().
|
||||||
|
* We then drop our reference before exiting the function, leaving the
|
||||||
|
* bin file to drop the last reference (if it hasn't already).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Take a reference for the bin file */
|
||||||
|
kobject_get(&elog->kobj);
|
||||||
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
|
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
|
||||||
if (rc) {
|
if (rc == 0) {
|
||||||
|
kobject_uevent(&elog->kobj, KOBJ_ADD);
|
||||||
|
} else {
|
||||||
|
/* Drop the reference taken for the bin file */
|
||||||
kobject_put(&elog->kobj);
|
kobject_put(&elog->kobj);
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kobject_uevent(&elog->kobj, KOBJ_ADD);
|
/* Drop our reference */
|
||||||
|
kobject_put(&elog->kobj);
|
||||||
|
|
||||||
return elog;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static irqreturn_t elog_event(int irq, void *data)
|
static irqreturn_t elog_event(int irq, void *data)
|
||||||
|
|
|
@ -47,7 +47,7 @@
|
||||||
#include <asm/udbg.h>
|
#include <asm/udbg.h>
|
||||||
#define DBG(fmt...) udbg_printf(fmt)
|
#define DBG(fmt...) udbg_printf(fmt)
|
||||||
#else
|
#else
|
||||||
#define DBG(fmt...)
|
#define DBG(fmt...) do { } while (0)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void pnv_smp_setup_cpu(int cpu)
|
static void pnv_smp_setup_cpu(int cpu)
|
||||||
|
|
|
@ -40,6 +40,7 @@ static __init int rng_init(void)
|
||||||
|
|
||||||
ppc_md.get_random_seed = pseries_get_random_long;
|
ppc_md.get_random_seed = pseries_get_random_long;
|
||||||
|
|
||||||
|
of_node_put(dn);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
machine_subsys_initcall(pseries, rng_init);
|
machine_subsys_initcall(pseries, rng_init);
|
||||||
|
|
|
@ -179,6 +179,7 @@ int icp_hv_init(void)
|
||||||
|
|
||||||
icp_ops = &icp_hv_ops;
|
icp_ops = &icp_hv_ops;
|
||||||
|
|
||||||
|
of_node_put(np);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,4 +21,7 @@
|
||||||
/* vDSO location */
|
/* vDSO location */
|
||||||
#define AT_SYSINFO_EHDR 33
|
#define AT_SYSINFO_EHDR 33
|
||||||
|
|
||||||
|
/* entries in ARCH_DLINFO */
|
||||||
|
#define AT_VECTOR_SIZE_ARCH 1
|
||||||
|
|
||||||
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */
|
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */
|
||||||
|
|
|
@ -356,6 +356,7 @@ static unsigned long clock_sync_flags;
|
||||||
|
|
||||||
#define CLOCK_SYNC_HAS_STP 0
|
#define CLOCK_SYNC_HAS_STP 0
|
||||||
#define CLOCK_SYNC_STP 1
|
#define CLOCK_SYNC_STP 1
|
||||||
|
#define CLOCK_SYNC_STPINFO_VALID 2
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The get_clock function for the physical clock. It will get the current
|
* The get_clock function for the physical clock. It will get the current
|
||||||
|
@ -592,6 +593,22 @@ void stp_queue_work(void)
|
||||||
queue_work(time_sync_wq, &stp_work);
|
queue_work(time_sync_wq, &stp_work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __store_stpinfo(void)
|
||||||
|
{
|
||||||
|
int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
|
||||||
|
|
||||||
|
if (rc)
|
||||||
|
clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
else
|
||||||
|
set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stpinfo_valid(void)
|
||||||
|
{
|
||||||
|
return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
}
|
||||||
|
|
||||||
static int stp_sync_clock(void *data)
|
static int stp_sync_clock(void *data)
|
||||||
{
|
{
|
||||||
struct clock_sync_data *sync = data;
|
struct clock_sync_data *sync = data;
|
||||||
|
@ -613,8 +630,7 @@ static int stp_sync_clock(void *data)
|
||||||
if (rc == 0) {
|
if (rc == 0) {
|
||||||
sync->clock_delta = clock_delta;
|
sync->clock_delta = clock_delta;
|
||||||
clock_sync_global(clock_delta);
|
clock_sync_global(clock_delta);
|
||||||
rc = chsc_sstpi(stp_page, &stp_info,
|
rc = __store_stpinfo();
|
||||||
sizeof(struct stp_sstpi));
|
|
||||||
if (rc == 0 && stp_info.tmd != 2)
|
if (rc == 0 && stp_info.tmd != 2)
|
||||||
rc = -EAGAIN;
|
rc = -EAGAIN;
|
||||||
}
|
}
|
||||||
|
@ -659,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
|
||||||
if (rc)
|
if (rc)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
||||||
rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
|
rc = __store_stpinfo();
|
||||||
if (rc || stp_info.c == 0)
|
if (rc || stp_info.c == 0)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
||||||
|
@ -696,10 +712,14 @@ static ssize_t stp_ctn_id_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%016llx\n",
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%016llx\n",
|
||||||
*(unsigned long long *) stp_info.ctnid);
|
*(unsigned long long *) stp_info.ctnid);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
|
static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
|
||||||
|
@ -708,9 +728,13 @@ static ssize_t stp_ctn_type_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.ctn);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.ctn);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
|
static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
|
||||||
|
@ -719,9 +743,13 @@ static ssize_t stp_dst_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x2000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x2000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
|
static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
|
||||||
|
@ -730,9 +758,13 @@ static ssize_t stp_leap_seconds_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x8000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x8000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
|
static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
|
||||||
|
@ -741,9 +773,13 @@ static ssize_t stp_stratum_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
|
static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
|
||||||
|
@ -752,9 +788,13 @@ static ssize_t stp_time_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x0800))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int) stp_info.tto);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x0800))
|
||||||
|
ret = sprintf(buf, "%i\n", (int) stp_info.tto);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
|
static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
|
||||||
|
@ -763,9 +803,13 @@ static ssize_t stp_time_zone_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x4000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x4000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(time_zone_offset, 0400,
|
static DEVICE_ATTR(time_zone_offset, 0400,
|
||||||
|
@ -775,9 +819,13 @@ static ssize_t stp_timing_mode_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.tmd);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.tmd);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
|
static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
|
||||||
|
@ -786,9 +834,13 @@ static ssize_t stp_timing_state_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.tst);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.tst);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
|
static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
|
||||||
|
|
|
@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void)
|
||||||
* are flush_tlb_*() routines, and these run after flush_cache_*()
|
* are flush_tlb_*() routines, and these run after flush_cache_*()
|
||||||
* which performs the flushw.
|
* which performs the flushw.
|
||||||
*
|
*
|
||||||
* The SMP TLB coherency scheme we use works as follows:
|
* mm->cpu_vm_mask is a bit mask of which cpus an address
|
||||||
*
|
|
||||||
* 1) mm->cpu_vm_mask is a bit mask of which cpus an address
|
|
||||||
* space has (potentially) executed on, this is the heuristic
|
* space has (potentially) executed on, this is the heuristic
|
||||||
* we use to avoid doing cross calls.
|
* we use to limit cross calls.
|
||||||
*
|
|
||||||
* Also, for flushing from kswapd and also for clones, we
|
|
||||||
* use cpu_vm_mask as the list of cpus to make run the TLB.
|
|
||||||
*
|
|
||||||
* 2) TLB context numbers are shared globally across all processors
|
|
||||||
* in the system, this allows us to play several games to avoid
|
|
||||||
* cross calls.
|
|
||||||
*
|
|
||||||
* One invariant is that when a cpu switches to a process, and
|
|
||||||
* that processes tsk->active_mm->cpu_vm_mask does not have the
|
|
||||||
* current cpu's bit set, that tlb context is flushed locally.
|
|
||||||
*
|
|
||||||
* If the address space is non-shared (ie. mm->count == 1) we avoid
|
|
||||||
* cross calls when we want to flush the currently running process's
|
|
||||||
* tlb state. This is done by clearing all cpu bits except the current
|
|
||||||
* processor's in current->mm->cpu_vm_mask and performing the
|
|
||||||
* flush locally only. This will force any subsequent cpus which run
|
|
||||||
* this task to flush the context from the local tlb if the process
|
|
||||||
* migrates to another cpu (again).
|
|
||||||
*
|
|
||||||
* 3) For shared address spaces (threads) and swapping we bite the
|
|
||||||
* bullet for most cases and perform the cross call (but only to
|
|
||||||
* the cpus listed in cpu_vm_mask).
|
|
||||||
*
|
|
||||||
* The performance gain from "optimizing" away the cross call for threads is
|
|
||||||
* questionable (in theory the big win for threads is the massive sharing of
|
|
||||||
* address space state across processors).
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* This currently is only used by the hugetlb arch pre-fault
|
/* This currently is only used by the hugetlb arch pre-fault
|
||||||
|
@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void)
|
||||||
void smp_flush_tlb_mm(struct mm_struct *mm)
|
void smp_flush_tlb_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
u32 ctx = CTX_HWBITS(mm->context);
|
u32 ctx = CTX_HWBITS(mm->context);
|
||||||
int cpu = get_cpu();
|
|
||||||
|
|
||||||
if (atomic_read(&mm->mm_users) == 1) {
|
get_cpu();
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
goto local_flush_and_out;
|
|
||||||
}
|
|
||||||
|
|
||||||
smp_cross_call_masked(&xcall_flush_tlb_mm,
|
smp_cross_call_masked(&xcall_flush_tlb_mm,
|
||||||
ctx, 0, 0,
|
ctx, 0, 0,
|
||||||
mm_cpumask(mm));
|
mm_cpumask(mm));
|
||||||
|
|
||||||
local_flush_and_out:
|
|
||||||
__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
|
__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
|
||||||
|
|
||||||
put_cpu();
|
put_cpu();
|
||||||
|
@ -1114,15 +1080,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
|
||||||
{
|
{
|
||||||
u32 ctx = CTX_HWBITS(mm->context);
|
u32 ctx = CTX_HWBITS(mm->context);
|
||||||
struct tlb_pending_info info;
|
struct tlb_pending_info info;
|
||||||
int cpu = get_cpu();
|
|
||||||
|
get_cpu();
|
||||||
|
|
||||||
info.ctx = ctx;
|
info.ctx = ctx;
|
||||||
info.nr = nr;
|
info.nr = nr;
|
||||||
info.vaddrs = vaddrs;
|
info.vaddrs = vaddrs;
|
||||||
|
|
||||||
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
|
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
else
|
|
||||||
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
|
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
|
||||||
&info, 1);
|
&info, 1);
|
||||||
|
|
||||||
|
@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
|
||||||
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
|
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
|
||||||
{
|
{
|
||||||
unsigned long context = CTX_HWBITS(mm->context);
|
unsigned long context = CTX_HWBITS(mm->context);
|
||||||
int cpu = get_cpu();
|
|
||||||
|
|
||||||
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
|
get_cpu();
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
else
|
|
||||||
smp_cross_call_masked(&xcall_flush_tlb_page,
|
smp_cross_call_masked(&xcall_flush_tlb_page,
|
||||||
context, vaddr, 0,
|
context, vaddr, 0,
|
||||||
mm_cpumask(mm));
|
mm_cpumask(mm));
|
||||||
|
|
||||||
__flush_tlb_page(context, vaddr);
|
__flush_tlb_page(context, vaddr);
|
||||||
|
|
||||||
put_cpu();
|
put_cpu();
|
||||||
|
|
|
@ -36,14 +36,14 @@ int write_sigio_irq(int fd)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
|
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
|
||||||
static DEFINE_SPINLOCK(sigio_spinlock);
|
static DEFINE_MUTEX(sigio_mutex);
|
||||||
|
|
||||||
void sigio_lock(void)
|
void sigio_lock(void)
|
||||||
{
|
{
|
||||||
spin_lock(&sigio_spinlock);
|
mutex_lock(&sigio_mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
void sigio_unlock(void)
|
void sigio_unlock(void)
|
||||||
{
|
{
|
||||||
spin_unlock(&sigio_spinlock);
|
mutex_unlock(&sigio_mutex);
|
||||||
}
|
}
|
||||||
|
|
|
@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
|
||||||
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
|
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
|
||||||
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
|
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
|
||||||
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
|
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
|
||||||
|
adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
|
||||||
|
|
||||||
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||||
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||||
|
|
||||||
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
|
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
|
||||||
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
||||||
CONFIG_PROFILING=y
|
CONFIG_PROFILING=y
|
||||||
CONFIG_SMP=y
|
CONFIG_SMP=y
|
||||||
|
CONFIG_X86_X2APIC=y
|
||||||
CONFIG_HYPERVISOR_GUEST=y
|
CONFIG_HYPERVISOR_GUEST=y
|
||||||
CONFIG_PARAVIRT=y
|
CONFIG_PARAVIRT=y
|
||||||
CONFIG_NR_CPUS=32
|
CONFIG_NR_CPUS=32
|
||||||
|
@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||||
CONFIG_DM_BOW=y
|
CONFIG_DM_BOW=y
|
||||||
CONFIG_NETDEVICES=y
|
CONFIG_NETDEVICES=y
|
||||||
CONFIG_DUMMY=y
|
CONFIG_DUMMY=y
|
||||||
|
CONFIG_WIREGUARD=y
|
||||||
CONFIG_TUN=y
|
CONFIG_TUN=y
|
||||||
CONFIG_VETH=y
|
CONFIG_VETH=y
|
||||||
# CONFIG_ETHERNET is not set
|
# CONFIG_ETHERNET is not set
|
||||||
|
@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
|
||||||
CONFIG_HID_SONY=y
|
CONFIG_HID_SONY=y
|
||||||
CONFIG_HID_STEAM=y
|
CONFIG_HID_STEAM=y
|
||||||
CONFIG_USB_HIDDEV=y
|
CONFIG_USB_HIDDEV=y
|
||||||
|
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||||
CONFIG_USB_XHCI_HCD=y
|
CONFIG_USB_XHCI_HCD=y
|
||||||
CONFIG_USB_GADGET=y
|
CONFIG_USB_GADGET=y
|
||||||
CONFIG_USB_GADGET_VBUS_DRAW=500
|
CONFIG_USB_GADGET_VBUS_DRAW=500
|
||||||
|
@ -436,6 +439,7 @@ CONFIG_CRC8=y
|
||||||
CONFIG_XZ_DEC=y
|
CONFIG_XZ_DEC=y
|
||||||
CONFIG_PRINTK_TIME=y
|
CONFIG_PRINTK_TIME=y
|
||||||
CONFIG_DEBUG_INFO=y
|
CONFIG_DEBUG_INFO=y
|
||||||
|
CONFIG_DEBUG_INFO_DWARF4=y
|
||||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||||
CONFIG_MAGIC_SYSRQ=y
|
CONFIG_MAGIC_SYSRQ=y
|
||||||
|
|
1
arch/x86/crypto/.gitignore
vendored
Normal file
1
arch/x86/crypto/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
poly1305-x86_64-cryptogams.S
|
|
@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
|
||||||
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
||||||
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
||||||
$(comma)4)$(comma)%ymm2,yes,no)
|
$(comma)4)$(comma)%ymm2,yes,no)
|
||||||
|
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
|
||||||
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
|
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||||
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
|
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||||
|
adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
||||||
|
|
||||||
|
@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
||||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||||
|
@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
|
||||||
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
|
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
|
||||||
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
|
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
|
||||||
|
|
||||||
|
# These modules require the assembler to support ADX.
|
||||||
|
ifeq ($(adx_supported),yes)
|
||||||
|
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
|
||||||
|
endif
|
||||||
|
|
||||||
# These modules require assembler to support AVX.
|
# These modules require assembler to support AVX.
|
||||||
ifeq ($(avx_supported),yes)
|
ifeq ($(avx_supported),yes)
|
||||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
||||||
|
@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
|
||||||
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
||||||
|
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# These modules require assembler to support AVX2.
|
# These modules require assembler to support AVX2.
|
||||||
|
@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
|
||||||
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
||||||
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
||||||
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
||||||
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
|
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
|
||||||
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
||||||
|
|
||||||
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
||||||
|
@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
|
||||||
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
|
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
|
||||||
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
|
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
|
||||||
|
|
||||||
|
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
|
||||||
|
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
|
||||||
|
ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
|
||||||
|
targets += poly1305-x86_64-cryptogams.S
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(avx_supported),yes)
|
ifeq ($(avx_supported),yes)
|
||||||
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
||||||
camellia_aesni_avx_glue.o
|
camellia_aesni_avx_glue.o
|
||||||
|
@ -100,20 +114,22 @@ endif
|
||||||
|
|
||||||
ifeq ($(avx2_supported),yes)
|
ifeq ($(avx2_supported),yes)
|
||||||
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
||||||
chacha20-x86_64-y += chacha20-avx2-x86_64.o
|
chacha-x86_64-y += chacha-avx2-x86_64.o
|
||||||
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
||||||
|
|
||||||
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
|
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(avx512_supported),yes)
|
||||||
|
chacha-x86_64-y += chacha-avx512vl-x86_64.o
|
||||||
|
endif
|
||||||
|
|
||||||
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
||||||
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
||||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||||
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
||||||
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
|
|
||||||
ifeq ($(avx2_supported),yes)
|
ifeq ($(avx2_supported),yes)
|
||||||
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
||||||
poly1305-x86_64-y += poly1305-avx2-x86_64.o
|
|
||||||
endif
|
endif
|
||||||
ifeq ($(sha1_ni_supported),yes)
|
ifeq ($(sha1_ni_supported),yes)
|
||||||
sha1-ssse3-y += sha1_ni_asm.o
|
sha1-ssse3-y += sha1_ni_asm.o
|
||||||
|
@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
|
||||||
endif
|
endif
|
||||||
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
||||||
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
|
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
|
||||||
|
|
||||||
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
|
cmd_perlasm = $(PERL) $< > $@
|
||||||
|
$(obj)/%.S: $(src)/%.pl FORCE
|
||||||
|
$(call if_changed,perlasm)
|
||||||
|
|
258
arch/x86/crypto/blake2s-core.S
Normal file
258
arch/x86/crypto/blake2s-core.S
Normal file
|
@ -0,0 +1,258 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
|
||||||
|
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
|
||||||
|
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
|
||||||
|
.section .rodata.cst16.ROT16, "aM", @progbits, 16
|
||||||
|
.align 16
|
||||||
|
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
|
||||||
|
.section .rodata.cst16.ROR328, "aM", @progbits, 16
|
||||||
|
.align 16
|
||||||
|
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
|
||||||
|
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
|
||||||
|
.align 64
|
||||||
|
SIGMA:
|
||||||
|
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||||
|
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
|
||||||
|
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
|
||||||
|
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
|
||||||
|
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
|
||||||
|
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
|
||||||
|
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
|
||||||
|
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
|
||||||
|
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
|
||||||
|
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
|
||||||
|
#ifdef CONFIG_AS_AVX512
|
||||||
|
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
|
||||||
|
.align 64
|
||||||
|
SIGMA2:
|
||||||
|
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||||
|
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
|
||||||
|
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
|
||||||
|
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
|
||||||
|
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
|
||||||
|
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
|
||||||
|
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
|
||||||
|
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
|
||||||
|
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
|
||||||
|
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
|
||||||
|
#endif /* CONFIG_AS_AVX512 */
|
||||||
|
|
||||||
|
.text
|
||||||
|
#ifdef CONFIG_AS_SSSE3
|
||||||
|
ENTRY(blake2s_compress_ssse3)
|
||||||
|
testq %rdx,%rdx
|
||||||
|
je .Lendofloop
|
||||||
|
movdqu (%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqa ROT16(%rip),%xmm12
|
||||||
|
movdqa ROR328(%rip),%xmm13
|
||||||
|
movdqu 0x20(%rdi),%xmm14
|
||||||
|
movq %rcx,%xmm15
|
||||||
|
leaq SIGMA+0xa0(%rip),%r8
|
||||||
|
jmp .Lbeginofloop
|
||||||
|
.align 32
|
||||||
|
.Lbeginofloop:
|
||||||
|
movdqa %xmm0,%xmm10
|
||||||
|
movdqa %xmm1,%xmm11
|
||||||
|
paddq %xmm15,%xmm14
|
||||||
|
movdqa IV(%rip),%xmm2
|
||||||
|
movdqa %xmm14,%xmm3
|
||||||
|
pxor IV+0x10(%rip),%xmm3
|
||||||
|
leaq SIGMA(%rip),%rcx
|
||||||
|
.Lroundloop:
|
||||||
|
movzbl (%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0x1(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0x2(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x3(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
punpckldq %xmm5,%xmm4
|
||||||
|
punpckldq %xmm7,%xmm6
|
||||||
|
punpcklqdq %xmm6,%xmm4
|
||||||
|
paddd %xmm4,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm12,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0xc,%xmm1
|
||||||
|
pslld $0x14,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
movzbl 0x4(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0x5(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x6(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0x7(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
punpckldq %xmm6,%xmm5
|
||||||
|
punpckldq %xmm4,%xmm7
|
||||||
|
punpcklqdq %xmm7,%xmm5
|
||||||
|
paddd %xmm5,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm13,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0x7,%xmm1
|
||||||
|
pslld $0x19,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
pshufd $0x93,%xmm0,%xmm0
|
||||||
|
pshufd $0x4e,%xmm3,%xmm3
|
||||||
|
pshufd $0x39,%xmm2,%xmm2
|
||||||
|
movzbl 0x8(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x9(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0xa(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0xb(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
punpckldq %xmm7,%xmm6
|
||||||
|
punpckldq %xmm5,%xmm4
|
||||||
|
punpcklqdq %xmm4,%xmm6
|
||||||
|
paddd %xmm6,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm12,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0xc,%xmm1
|
||||||
|
pslld $0x14,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
movzbl 0xc(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0xd(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0xe(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0xf(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
punpckldq %xmm4,%xmm7
|
||||||
|
punpckldq %xmm6,%xmm5
|
||||||
|
punpcklqdq %xmm5,%xmm7
|
||||||
|
paddd %xmm7,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm13,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0x7,%xmm1
|
||||||
|
pslld $0x19,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
pshufd $0x39,%xmm0,%xmm0
|
||||||
|
pshufd $0x4e,%xmm3,%xmm3
|
||||||
|
pshufd $0x93,%xmm2,%xmm2
|
||||||
|
addq $0x10,%rcx
|
||||||
|
cmpq %r8,%rcx
|
||||||
|
jnz .Lroundloop
|
||||||
|
pxor %xmm2,%xmm0
|
||||||
|
pxor %xmm3,%xmm1
|
||||||
|
pxor %xmm10,%xmm0
|
||||||
|
pxor %xmm11,%xmm1
|
||||||
|
addq $0x40,%rsi
|
||||||
|
decq %rdx
|
||||||
|
jnz .Lbeginofloop
|
||||||
|
movdqu %xmm0,(%rdi)
|
||||||
|
movdqu %xmm1,0x10(%rdi)
|
||||||
|
movdqu %xmm14,0x20(%rdi)
|
||||||
|
.Lendofloop:
|
||||||
|
ret
|
||||||
|
ENDPROC(blake2s_compress_ssse3)
|
||||||
|
#endif /* CONFIG_AS_SSSE3 */
|
||||||
|
|
||||||
|
#ifdef CONFIG_AS_AVX512
|
||||||
|
ENTRY(blake2s_compress_avx512)
|
||||||
|
vmovdqu (%rdi),%xmm0
|
||||||
|
vmovdqu 0x10(%rdi),%xmm1
|
||||||
|
vmovdqu 0x20(%rdi),%xmm4
|
||||||
|
vmovq %rcx,%xmm5
|
||||||
|
vmovdqa IV(%rip),%xmm14
|
||||||
|
vmovdqa IV+16(%rip),%xmm15
|
||||||
|
jmp .Lblake2s_compress_avx512_mainloop
|
||||||
|
.align 32
|
||||||
|
.Lblake2s_compress_avx512_mainloop:
|
||||||
|
vmovdqa %xmm0,%xmm10
|
||||||
|
vmovdqa %xmm1,%xmm11
|
||||||
|
vpaddq %xmm5,%xmm4,%xmm4
|
||||||
|
vmovdqa %xmm14,%xmm2
|
||||||
|
vpxor %xmm15,%xmm4,%xmm3
|
||||||
|
vmovdqu (%rsi),%ymm6
|
||||||
|
vmovdqu 0x20(%rsi),%ymm7
|
||||||
|
addq $0x40,%rsi
|
||||||
|
leaq SIGMA2(%rip),%rax
|
||||||
|
movb $0xa,%cl
|
||||||
|
.Lblake2s_compress_avx512_roundloop:
|
||||||
|
addq $0x40,%rax
|
||||||
|
vmovdqa -0x40(%rax),%ymm8
|
||||||
|
vmovdqa -0x20(%rax),%ymm9
|
||||||
|
vpermi2d %ymm7,%ymm6,%ymm8
|
||||||
|
vpermi2d %ymm7,%ymm6,%ymm9
|
||||||
|
vmovdqa %ymm8,%ymm6
|
||||||
|
vmovdqa %ymm9,%ymm7
|
||||||
|
vpaddd %xmm8,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x10,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0xc,%xmm1,%xmm1
|
||||||
|
vextracti128 $0x1,%ymm8,%xmm8
|
||||||
|
vpaddd %xmm8,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x8,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0x7,%xmm1,%xmm1
|
||||||
|
vpshufd $0x93,%xmm0,%xmm0
|
||||||
|
vpshufd $0x4e,%xmm3,%xmm3
|
||||||
|
vpshufd $0x39,%xmm2,%xmm2
|
||||||
|
vpaddd %xmm9,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x10,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0xc,%xmm1,%xmm1
|
||||||
|
vextracti128 $0x1,%ymm9,%xmm9
|
||||||
|
vpaddd %xmm9,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x8,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0x7,%xmm1,%xmm1
|
||||||
|
vpshufd $0x39,%xmm0,%xmm0
|
||||||
|
vpshufd $0x4e,%xmm3,%xmm3
|
||||||
|
vpshufd $0x93,%xmm2,%xmm2
|
||||||
|
decb %cl
|
||||||
|
jne .Lblake2s_compress_avx512_roundloop
|
||||||
|
vpxor %xmm10,%xmm0,%xmm0
|
||||||
|
vpxor %xmm11,%xmm1,%xmm1
|
||||||
|
vpxor %xmm2,%xmm0,%xmm0
|
||||||
|
vpxor %xmm3,%xmm1,%xmm1
|
||||||
|
decq %rdx
|
||||||
|
jne .Lblake2s_compress_avx512_mainloop
|
||||||
|
vmovdqu %xmm0,(%rdi)
|
||||||
|
vmovdqu %xmm1,0x10(%rdi)
|
||||||
|
vmovdqu %xmm4,0x20(%rdi)
|
||||||
|
vzeroupper
|
||||||
|
retq
|
||||||
|
ENDPROC(blake2s_compress_avx512)
|
||||||
|
#endif /* CONFIG_AS_AVX512 */
|
232
arch/x86/crypto/blake2s-glue.c
Normal file
232
arch/x86/crypto/blake2s-glue.c
Normal file
|
@ -0,0 +1,232 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/internal/blake2s.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
|
||||||
|
#include <linux/types.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
#include <asm/cpufeature.h>
|
||||||
|
#include <asm/fpu/api.h>
|
||||||
|
#include <asm/processor.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
|
||||||
|
const u8 *block, const size_t nblocks,
|
||||||
|
const u32 inc);
|
||||||
|
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
|
||||||
|
const u8 *block, const size_t nblocks,
|
||||||
|
const u32 inc);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
||||||
|
|
||||||
|
void blake2s_compress_arch(struct blake2s_state *state,
|
||||||
|
const u8 *block, size_t nblocks,
|
||||||
|
const u32 inc)
|
||||||
|
{
|
||||||
|
/* SIMD disables preemption, so relax after processing each page. */
|
||||||
|
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
|
||||||
|
blake2s_compress_generic(state, block, nblocks, inc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
const size_t blocks = min_t(size_t, nblocks,
|
||||||
|
SZ_4K / BLAKE2S_BLOCK_SIZE);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
static_branch_likely(&blake2s_use_avx512))
|
||||||
|
blake2s_compress_avx512(state, block, blocks, inc);
|
||||||
|
else
|
||||||
|
blake2s_compress_ssse3(state, block, blocks, inc);
|
||||||
|
kernel_fpu_end();
|
||||||
|
|
||||||
|
nblocks -= blocks;
|
||||||
|
block += blocks * BLAKE2S_BLOCK_SIZE;
|
||||||
|
} while (nblocks);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(blake2s_compress_arch);
|
||||||
|
|
||||||
|
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
|
||||||
|
unsigned int keylen)
|
||||||
|
{
|
||||||
|
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
||||||
|
|
||||||
|
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
|
||||||
|
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(tctx->key, key, keylen);
|
||||||
|
tctx->keylen = keylen;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
const int outlen = crypto_shash_digestsize(desc->tfm);
|
||||||
|
|
||||||
|
if (tctx->keylen)
|
||||||
|
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
|
||||||
|
else
|
||||||
|
blake2s_init(state, outlen);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
|
||||||
|
unsigned int inlen)
|
||||||
|
{
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
|
||||||
|
|
||||||
|
if (unlikely(!inlen))
|
||||||
|
return 0;
|
||||||
|
if (inlen > fill) {
|
||||||
|
memcpy(state->buf + state->buflen, in, fill);
|
||||||
|
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
|
||||||
|
state->buflen = 0;
|
||||||
|
in += fill;
|
||||||
|
inlen -= fill;
|
||||||
|
}
|
||||||
|
if (inlen > BLAKE2S_BLOCK_SIZE) {
|
||||||
|
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
|
||||||
|
/* Hash one less (full) block than strictly possible */
|
||||||
|
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
|
||||||
|
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||||
|
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||||
|
}
|
||||||
|
memcpy(state->buf + state->buflen, in, inlen);
|
||||||
|
state->buflen += inlen;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
|
||||||
|
{
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
blake2s_set_lastblock(state);
|
||||||
|
memset(state->buf + state->buflen, 0,
|
||||||
|
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
|
||||||
|
blake2s_compress_arch(state, state->buf, 1, state->buflen);
|
||||||
|
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
|
||||||
|
memcpy(out, state->h, state->outlen);
|
||||||
|
memzero_explicit(state, sizeof(*state));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg blake2s_algs[] = {{
|
||||||
|
.base.cra_name = "blake2s-128",
|
||||||
|
.base.cra_driver_name = "blake2s-128-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_128_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-160",
|
||||||
|
.base.cra_driver_name = "blake2s-160-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_160_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-224",
|
||||||
|
.base.cra_driver_name = "blake2s-224-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_224_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-256",
|
||||||
|
.base.cra_driver_name = "blake2s-256-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_256_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}};
|
||||||
|
|
||||||
|
static int __init blake2s_mod_init(void)
|
||||||
|
{
|
||||||
|
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
static_branch_enable(&blake2s_use_ssse3);
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX512F) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX512VL) &&
|
||||||
|
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
|
||||||
|
XFEATURE_MASK_AVX512, NULL))
|
||||||
|
static_branch_enable(&blake2s_use_avx512);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shashes(blake2s_algs,
|
||||||
|
ARRAY_SIZE(blake2s_algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit blake2s_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
|
||||||
|
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(blake2s_mod_init);
|
||||||
|
module_exit(blake2s_mod_exit);
|
||||||
|
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-128");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-160");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-224");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-256");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
File diff suppressed because it is too large
Load diff
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
|
@ -0,0 +1,836 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||||
|
/*
|
||||||
|
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
|
||||||
|
*
|
||||||
|
* Copyright (C) 2018 Martin Willi
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR2BL: .octa 0x00000000000000000000000000000000
|
||||||
|
.octa 0x00000000000000000000000000000001
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR4BL: .octa 0x00000000000000000000000000000002
|
||||||
|
.octa 0x00000000000000000000000000000003
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR8BL: .octa 0x00000003000000020000000100000000
|
||||||
|
.octa 0x00000007000000060000000500000004
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
ENTRY(chacha_2block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 2 data blocks output, o
|
||||||
|
# %rdx: up to 2 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts two ChaCha blocks by loading the state
|
||||||
|
# matrix twice across four AVX registers. It performs matrix operations
|
||||||
|
# on four words in each matrix in parallel, but requires shuffling to
|
||||||
|
# rearrange the words after each round.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..3[0-2] = s0..3
|
||||||
|
vbroadcasti128 0x00(%rdi),%ymm0
|
||||||
|
vbroadcasti128 0x10(%rdi),%ymm1
|
||||||
|
vbroadcasti128 0x20(%rdi),%ymm2
|
||||||
|
vbroadcasti128 0x30(%rdi),%ymm3
|
||||||
|
|
||||||
|
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm8
|
||||||
|
vmovdqa %ymm1,%ymm9
|
||||||
|
vmovdqa %ymm2,%ymm10
|
||||||
|
vmovdqa %ymm3,%ymm11
|
||||||
|
|
||||||
|
.Ldoubleround:
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm1,%ymm1
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm1,%ymm1
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm3,%ymm3
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0)
|
||||||
|
vpaddd %ymm8,%ymm0,%ymm7
|
||||||
|
cmp $0x10,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x00(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x00(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm0
|
||||||
|
# o1 = i1 ^ (x1 + s1)
|
||||||
|
vpaddd %ymm9,%ymm1,%ymm7
|
||||||
|
cmp $0x20,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x10(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x10(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm1
|
||||||
|
# o2 = i2 ^ (x2 + s2)
|
||||||
|
vpaddd %ymm10,%ymm2,%ymm7
|
||||||
|
cmp $0x30,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x20(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x20(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm2
|
||||||
|
# o3 = i3 ^ (x3 + s3)
|
||||||
|
vpaddd %ymm11,%ymm3,%ymm7
|
||||||
|
cmp $0x40,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x30(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x30(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm3
|
||||||
|
|
||||||
|
# xor and write second block
|
||||||
|
vmovdqa %xmm0,%xmm7
|
||||||
|
cmp $0x50,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x40(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x40(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm1,%xmm7
|
||||||
|
cmp $0x60,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x50(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x50(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm2,%xmm7
|
||||||
|
cmp $0x70,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x60(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x60(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm3,%xmm7
|
||||||
|
cmp $0x80,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x70(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x70(%rsi)
|
||||||
|
|
||||||
|
.Ldone2:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart2:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0xf,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0xf,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||||
|
vpxord %xmm7,%xmm1,%xmm1
|
||||||
|
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone2
|
||||||
|
|
||||||
|
ENDPROC(chacha_2block_xor_avx512vl)
|
||||||
|
|
||||||
|
ENTRY(chacha_4block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 4 data blocks output, o
|
||||||
|
# %rdx: up to 4 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts four ChaCha blocks by loading the state
|
||||||
|
# matrix four times across eight AVX registers. It performs matrix
|
||||||
|
# operations on four words in two matrices in parallel, sequentially
|
||||||
|
# to the operations on the four words of the other two matrices. The
|
||||||
|
# required word shuffling has a rather high latency, we can do the
|
||||||
|
# arithmetic on two matrix-pairs without much slowdown.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..3[0-4] = s0..3
|
||||||
|
vbroadcasti128 0x00(%rdi),%ymm0
|
||||||
|
vbroadcasti128 0x10(%rdi),%ymm1
|
||||||
|
vbroadcasti128 0x20(%rdi),%ymm2
|
||||||
|
vbroadcasti128 0x30(%rdi),%ymm3
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm4
|
||||||
|
vmovdqa %ymm1,%ymm5
|
||||||
|
vmovdqa %ymm2,%ymm6
|
||||||
|
vmovdqa %ymm3,%ymm7
|
||||||
|
|
||||||
|
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||||
|
vpaddd CTR4BL(%rip),%ymm7,%ymm7
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm11
|
||||||
|
vmovdqa %ymm1,%ymm12
|
||||||
|
vmovdqa %ymm2,%ymm13
|
||||||
|
vmovdqa %ymm3,%ymm14
|
||||||
|
vmovdqa %ymm7,%ymm15
|
||||||
|
|
||||||
|
.Ldoubleround4:
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $16,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $8,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm1,%ymm1
|
||||||
|
vpshufd $0x39,%ymm5,%ymm5
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
vpshufd $0x4e,%ymm6,%ymm6
|
||||||
|
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm3,%ymm3
|
||||||
|
vpshufd $0x93,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $16,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $8,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm1,%ymm1
|
||||||
|
vpshufd $0x93,%ymm5,%ymm5
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
vpshufd $0x4e,%ymm6,%ymm6
|
||||||
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm3,%ymm3
|
||||||
|
vpshufd $0x39,%ymm7,%ymm7
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround4
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0), first block
|
||||||
|
vpaddd %ymm11,%ymm0,%ymm10
|
||||||
|
cmp $0x10,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x00(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x00(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm0
|
||||||
|
# o1 = i1 ^ (x1 + s1), first block
|
||||||
|
vpaddd %ymm12,%ymm1,%ymm10
|
||||||
|
cmp $0x20,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x10(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x10(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm1
|
||||||
|
# o2 = i2 ^ (x2 + s2), first block
|
||||||
|
vpaddd %ymm13,%ymm2,%ymm10
|
||||||
|
cmp $0x30,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x20(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x20(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm2
|
||||||
|
# o3 = i3 ^ (x3 + s3), first block
|
||||||
|
vpaddd %ymm14,%ymm3,%ymm10
|
||||||
|
cmp $0x40,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x30(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x30(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm3
|
||||||
|
|
||||||
|
# xor and write second block
|
||||||
|
vmovdqa %xmm0,%xmm10
|
||||||
|
cmp $0x50,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x40(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x40(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm1,%xmm10
|
||||||
|
cmp $0x60,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x50(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x50(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm2,%xmm10
|
||||||
|
cmp $0x70,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x60(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x60(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm3,%xmm10
|
||||||
|
cmp $0x80,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x70(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x70(%rsi)
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0), third block
|
||||||
|
vpaddd %ymm11,%ymm4,%ymm10
|
||||||
|
cmp $0x90,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x80(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x80(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm4
|
||||||
|
# o1 = i1 ^ (x1 + s1), third block
|
||||||
|
vpaddd %ymm12,%ymm5,%ymm10
|
||||||
|
cmp $0xa0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x90(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x90(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm5
|
||||||
|
# o2 = i2 ^ (x2 + s2), third block
|
||||||
|
vpaddd %ymm13,%ymm6,%ymm10
|
||||||
|
cmp $0xb0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xa0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xa0(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm6
|
||||||
|
# o3 = i3 ^ (x3 + s3), third block
|
||||||
|
vpaddd %ymm15,%ymm7,%ymm10
|
||||||
|
cmp $0xc0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xb0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xb0(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm7
|
||||||
|
|
||||||
|
# xor and write fourth block
|
||||||
|
vmovdqa %xmm4,%xmm10
|
||||||
|
cmp $0xd0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xc0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xc0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm5,%xmm10
|
||||||
|
cmp $0xe0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xd0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xd0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm6,%xmm10
|
||||||
|
cmp $0xf0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xe0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xe0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm7,%xmm10
|
||||||
|
cmp $0x100,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xf0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xf0(%rsi)
|
||||||
|
|
||||||
|
.Ldone4:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart4:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0xf,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0xf,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||||
|
vpxord %xmm10,%xmm1,%xmm1
|
||||||
|
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone4
|
||||||
|
|
||||||
|
ENDPROC(chacha_4block_xor_avx512vl)
|
||||||
|
|
||||||
|
ENTRY(chacha_8block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 8 data blocks output, o
|
||||||
|
# %rdx: up to 8 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts eight consecutive ChaCha blocks by loading
|
||||||
|
# the state matrix in AVX registers eight times. Compared to AVX2, this
|
||||||
|
# mostly benefits from the new rotate instructions in VL and the
|
||||||
|
# additional registers.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..15[0-7] = s[0..15]
|
||||||
|
vpbroadcastd 0x00(%rdi),%ymm0
|
||||||
|
vpbroadcastd 0x04(%rdi),%ymm1
|
||||||
|
vpbroadcastd 0x08(%rdi),%ymm2
|
||||||
|
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||||
|
vpbroadcastd 0x10(%rdi),%ymm4
|
||||||
|
vpbroadcastd 0x14(%rdi),%ymm5
|
||||||
|
vpbroadcastd 0x18(%rdi),%ymm6
|
||||||
|
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||||
|
vpbroadcastd 0x20(%rdi),%ymm8
|
||||||
|
vpbroadcastd 0x24(%rdi),%ymm9
|
||||||
|
vpbroadcastd 0x28(%rdi),%ymm10
|
||||||
|
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||||
|
vpbroadcastd 0x30(%rdi),%ymm12
|
||||||
|
vpbroadcastd 0x34(%rdi),%ymm13
|
||||||
|
vpbroadcastd 0x38(%rdi),%ymm14
|
||||||
|
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||||
|
|
||||||
|
# x12 += counter values 0-3
|
||||||
|
vpaddd CTR8BL(%rip),%ymm12,%ymm12
|
||||||
|
|
||||||
|
vmovdqa64 %ymm0,%ymm16
|
||||||
|
vmovdqa64 %ymm1,%ymm17
|
||||||
|
vmovdqa64 %ymm2,%ymm18
|
||||||
|
vmovdqa64 %ymm3,%ymm19
|
||||||
|
vmovdqa64 %ymm4,%ymm20
|
||||||
|
vmovdqa64 %ymm5,%ymm21
|
||||||
|
vmovdqa64 %ymm6,%ymm22
|
||||||
|
vmovdqa64 %ymm7,%ymm23
|
||||||
|
vmovdqa64 %ymm8,%ymm24
|
||||||
|
vmovdqa64 %ymm9,%ymm25
|
||||||
|
vmovdqa64 %ymm10,%ymm26
|
||||||
|
vmovdqa64 %ymm11,%ymm27
|
||||||
|
vmovdqa64 %ymm12,%ymm28
|
||||||
|
vmovdqa64 %ymm13,%ymm29
|
||||||
|
vmovdqa64 %ymm14,%ymm30
|
||||||
|
vmovdqa64 %ymm15,%ymm31
|
||||||
|
|
||||||
|
.Ldoubleround8:
|
||||||
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
|
vpaddd %ymm0,%ymm4,%ymm0
|
||||||
|
vpxord %ymm0,%ymm12,%ymm12
|
||||||
|
vprold $16,%ymm12,%ymm12
|
||||||
|
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||||
|
vpaddd %ymm1,%ymm5,%ymm1
|
||||||
|
vpxord %ymm1,%ymm13,%ymm13
|
||||||
|
vprold $16,%ymm13,%ymm13
|
||||||
|
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||||
|
vpaddd %ymm2,%ymm6,%ymm2
|
||||||
|
vpxord %ymm2,%ymm14,%ymm14
|
||||||
|
vprold $16,%ymm14,%ymm14
|
||||||
|
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||||
|
vpaddd %ymm3,%ymm7,%ymm3
|
||||||
|
vpxord %ymm3,%ymm15,%ymm15
|
||||||
|
vprold $16,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||||
|
vpaddd %ymm12,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm4,%ymm4
|
||||||
|
vprold $12,%ymm4,%ymm4
|
||||||
|
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||||
|
vpaddd %ymm13,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||||
|
vpaddd %ymm14,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm6,%ymm6
|
||||||
|
vprold $12,%ymm6,%ymm6
|
||||||
|
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||||
|
vpaddd %ymm15,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm7,%ymm7
|
||||||
|
vprold $12,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||||
|
vpaddd %ymm0,%ymm4,%ymm0
|
||||||
|
vpxord %ymm0,%ymm12,%ymm12
|
||||||
|
vprold $8,%ymm12,%ymm12
|
||||||
|
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||||
|
vpaddd %ymm1,%ymm5,%ymm1
|
||||||
|
vpxord %ymm1,%ymm13,%ymm13
|
||||||
|
vprold $8,%ymm13,%ymm13
|
||||||
|
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||||
|
vpaddd %ymm2,%ymm6,%ymm2
|
||||||
|
vpxord %ymm2,%ymm14,%ymm14
|
||||||
|
vprold $8,%ymm14,%ymm14
|
||||||
|
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||||
|
vpaddd %ymm3,%ymm7,%ymm3
|
||||||
|
vpxord %ymm3,%ymm15,%ymm15
|
||||||
|
vprold $8,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||||
|
vpaddd %ymm12,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm4,%ymm4
|
||||||
|
vprold $7,%ymm4,%ymm4
|
||||||
|
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||||
|
vpaddd %ymm13,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||||
|
vpaddd %ymm14,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm6,%ymm6
|
||||||
|
vprold $7,%ymm6,%ymm6
|
||||||
|
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||||
|
vpaddd %ymm15,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm7,%ymm7
|
||||||
|
vprold $7,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||||
|
vpaddd %ymm0,%ymm5,%ymm0
|
||||||
|
vpxord %ymm0,%ymm15,%ymm15
|
||||||
|
vprold $16,%ymm15,%ymm15
|
||||||
|
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||||
|
vpaddd %ymm1,%ymm6,%ymm1
|
||||||
|
vpxord %ymm1,%ymm12,%ymm12
|
||||||
|
vprold $16,%ymm12,%ymm12
|
||||||
|
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||||
|
vpaddd %ymm2,%ymm7,%ymm2
|
||||||
|
vpxord %ymm2,%ymm13,%ymm13
|
||||||
|
vprold $16,%ymm13,%ymm13
|
||||||
|
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||||
|
vpaddd %ymm3,%ymm4,%ymm3
|
||||||
|
vpxord %ymm3,%ymm14,%ymm14
|
||||||
|
vprold $16,%ymm14,%ymm14
|
||||||
|
|
||||||
|
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||||
|
vpaddd %ymm15,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||||
|
vpaddd %ymm12,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm6,%ymm6
|
||||||
|
vprold $12,%ymm6,%ymm6
|
||||||
|
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||||
|
vpaddd %ymm13,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm7,%ymm7
|
||||||
|
vprold $12,%ymm7,%ymm7
|
||||||
|
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||||
|
vpaddd %ymm14,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm4,%ymm4
|
||||||
|
vprold $12,%ymm4,%ymm4
|
||||||
|
|
||||||
|
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||||
|
vpaddd %ymm0,%ymm5,%ymm0
|
||||||
|
vpxord %ymm0,%ymm15,%ymm15
|
||||||
|
vprold $8,%ymm15,%ymm15
|
||||||
|
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||||
|
vpaddd %ymm1,%ymm6,%ymm1
|
||||||
|
vpxord %ymm1,%ymm12,%ymm12
|
||||||
|
vprold $8,%ymm12,%ymm12
|
||||||
|
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||||
|
vpaddd %ymm2,%ymm7,%ymm2
|
||||||
|
vpxord %ymm2,%ymm13,%ymm13
|
||||||
|
vprold $8,%ymm13,%ymm13
|
||||||
|
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||||
|
vpaddd %ymm3,%ymm4,%ymm3
|
||||||
|
vpxord %ymm3,%ymm14,%ymm14
|
||||||
|
vprold $8,%ymm14,%ymm14
|
||||||
|
|
||||||
|
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||||
|
vpaddd %ymm15,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||||
|
vpaddd %ymm12,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm6,%ymm6
|
||||||
|
vprold $7,%ymm6,%ymm6
|
||||||
|
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||||
|
vpaddd %ymm13,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm7,%ymm7
|
||||||
|
vprold $7,%ymm7,%ymm7
|
||||||
|
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||||
|
vpaddd %ymm14,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm4,%ymm4
|
||||||
|
vprold $7,%ymm4,%ymm4
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround8
|
||||||
|
|
||||||
|
# x0..15[0-3] += s[0..15]
|
||||||
|
vpaddd %ymm16,%ymm0,%ymm0
|
||||||
|
vpaddd %ymm17,%ymm1,%ymm1
|
||||||
|
vpaddd %ymm18,%ymm2,%ymm2
|
||||||
|
vpaddd %ymm19,%ymm3,%ymm3
|
||||||
|
vpaddd %ymm20,%ymm4,%ymm4
|
||||||
|
vpaddd %ymm21,%ymm5,%ymm5
|
||||||
|
vpaddd %ymm22,%ymm6,%ymm6
|
||||||
|
vpaddd %ymm23,%ymm7,%ymm7
|
||||||
|
vpaddd %ymm24,%ymm8,%ymm8
|
||||||
|
vpaddd %ymm25,%ymm9,%ymm9
|
||||||
|
vpaddd %ymm26,%ymm10,%ymm10
|
||||||
|
vpaddd %ymm27,%ymm11,%ymm11
|
||||||
|
vpaddd %ymm28,%ymm12,%ymm12
|
||||||
|
vpaddd %ymm29,%ymm13,%ymm13
|
||||||
|
vpaddd %ymm30,%ymm14,%ymm14
|
||||||
|
vpaddd %ymm31,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# interleave 32-bit words in state n, n+1
|
||||||
|
vpunpckldq %ymm1,%ymm0,%ymm16
|
||||||
|
vpunpckhdq %ymm1,%ymm0,%ymm17
|
||||||
|
vpunpckldq %ymm3,%ymm2,%ymm18
|
||||||
|
vpunpckhdq %ymm3,%ymm2,%ymm19
|
||||||
|
vpunpckldq %ymm5,%ymm4,%ymm20
|
||||||
|
vpunpckhdq %ymm5,%ymm4,%ymm21
|
||||||
|
vpunpckldq %ymm7,%ymm6,%ymm22
|
||||||
|
vpunpckhdq %ymm7,%ymm6,%ymm23
|
||||||
|
vpunpckldq %ymm9,%ymm8,%ymm24
|
||||||
|
vpunpckhdq %ymm9,%ymm8,%ymm25
|
||||||
|
vpunpckldq %ymm11,%ymm10,%ymm26
|
||||||
|
vpunpckhdq %ymm11,%ymm10,%ymm27
|
||||||
|
vpunpckldq %ymm13,%ymm12,%ymm28
|
||||||
|
vpunpckhdq %ymm13,%ymm12,%ymm29
|
||||||
|
vpunpckldq %ymm15,%ymm14,%ymm30
|
||||||
|
vpunpckhdq %ymm15,%ymm14,%ymm31
|
||||||
|
|
||||||
|
# interleave 64-bit words in state n, n+2
|
||||||
|
vpunpcklqdq %ymm18,%ymm16,%ymm0
|
||||||
|
vpunpcklqdq %ymm19,%ymm17,%ymm1
|
||||||
|
vpunpckhqdq %ymm18,%ymm16,%ymm2
|
||||||
|
vpunpckhqdq %ymm19,%ymm17,%ymm3
|
||||||
|
vpunpcklqdq %ymm22,%ymm20,%ymm4
|
||||||
|
vpunpcklqdq %ymm23,%ymm21,%ymm5
|
||||||
|
vpunpckhqdq %ymm22,%ymm20,%ymm6
|
||||||
|
vpunpckhqdq %ymm23,%ymm21,%ymm7
|
||||||
|
vpunpcklqdq %ymm26,%ymm24,%ymm8
|
||||||
|
vpunpcklqdq %ymm27,%ymm25,%ymm9
|
||||||
|
vpunpckhqdq %ymm26,%ymm24,%ymm10
|
||||||
|
vpunpckhqdq %ymm27,%ymm25,%ymm11
|
||||||
|
vpunpcklqdq %ymm30,%ymm28,%ymm12
|
||||||
|
vpunpcklqdq %ymm31,%ymm29,%ymm13
|
||||||
|
vpunpckhqdq %ymm30,%ymm28,%ymm14
|
||||||
|
vpunpckhqdq %ymm31,%ymm29,%ymm15
|
||||||
|
|
||||||
|
# interleave 128-bit words in state n, n+4
|
||||||
|
# xor/write first four blocks
|
||||||
|
vmovdqa64 %ymm0,%ymm16
|
||||||
|
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
|
||||||
|
cmp $0x0020,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0000(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0000(%rsi)
|
||||||
|
vmovdqa64 %ymm16,%ymm0
|
||||||
|
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||||
|
cmp $0x0040,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0020(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0020(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
|
||||||
|
cmp $0x0060,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0040(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0040(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||||
|
cmp $0x0080,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0060(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0060(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
|
||||||
|
cmp $0x00a0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0080(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0080(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||||
|
cmp $0x00c0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00a0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00a0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
|
||||||
|
cmp $0x00e0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00c0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00c0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||||
|
cmp $0x0100,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00e0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00e0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||||
|
|
||||||
|
# xor remaining blocks, write to output
|
||||||
|
vmovdqa64 %ymm4,%ymm0
|
||||||
|
cmp $0x0120,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0100(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0100(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm12,%ymm0
|
||||||
|
cmp $0x0140,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0120(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0120(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm6,%ymm0
|
||||||
|
cmp $0x0160,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0140(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0140(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm14,%ymm0
|
||||||
|
cmp $0x0180,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0160(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0160(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm5,%ymm0
|
||||||
|
cmp $0x01a0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0180(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0180(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm13,%ymm0
|
||||||
|
cmp $0x01c0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01a0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01a0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm7,%ymm0
|
||||||
|
cmp $0x01e0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01c0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01c0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm15,%ymm0
|
||||||
|
cmp $0x0200,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01e0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01e0(%rsi)
|
||||||
|
|
||||||
|
.Ldone8:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart8:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0x1f,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0x1f,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
|
||||||
|
vpxord %ymm0,%ymm1,%ymm1
|
||||||
|
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone8
|
||||||
|
|
||||||
|
ENDPROC(chacha_8block_xor_avx512vl)
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2015 Martin Willi
|
* Copyright (C) 2015 Martin Willi
|
||||||
*
|
*
|
||||||
|
@ -10,6 +10,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/frame.h>
|
||||||
|
|
||||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||||
.align 16
|
.align 16
|
||||||
|
@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
|
||||||
|
|
||||||
.text
|
.text
|
||||||
|
|
||||||
ENTRY(chacha20_block_xor_ssse3)
|
/*
|
||||||
# %rdi: Input state matrix, s
|
* chacha_permute - permute one block
|
||||||
# %rsi: 1 data block output, o
|
*
|
||||||
# %rdx: 1 data block input, i
|
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
|
||||||
|
* function performs matrix operations on four words in parallel, but requires
|
||||||
# This function encrypts one ChaCha20 block by loading the state matrix
|
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
|
||||||
# in four SSE registers. It performs matrix operation on four words in
|
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
|
||||||
# parallel, but requireds shuffling to rearrange the words after each
|
* rotation uses traditional shift+OR.
|
||||||
# round. 8/16-bit word rotation is done with the slightly better
|
*
|
||||||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
|
* The round count is given in %r8d.
|
||||||
# traditional shift+OR.
|
*
|
||||||
|
* Clobbers: %r8d, %xmm4-%xmm7
|
||||||
# x0..3 = s0..3
|
*/
|
||||||
movdqa 0x00(%rdi),%xmm0
|
chacha_permute:
|
||||||
movdqa 0x10(%rdi),%xmm1
|
|
||||||
movdqa 0x20(%rdi),%xmm2
|
|
||||||
movdqa 0x30(%rdi),%xmm3
|
|
||||||
movdqa %xmm0,%xmm8
|
|
||||||
movdqa %xmm1,%xmm9
|
|
||||||
movdqa %xmm2,%xmm10
|
|
||||||
movdqa %xmm3,%xmm11
|
|
||||||
|
|
||||||
movdqa ROT8(%rip),%xmm4
|
movdqa ROT8(%rip),%xmm4
|
||||||
movdqa ROT16(%rip),%xmm5
|
movdqa ROT16(%rip),%xmm5
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround:
|
.Ldoubleround:
|
||||||
|
|
||||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
paddd %xmm1,%xmm0
|
paddd %xmm1,%xmm0
|
||||||
pxor %xmm0,%xmm3
|
pxor %xmm0,%xmm3
|
||||||
|
@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
pshufd $0x39,%xmm3,%xmm3
|
pshufd $0x39,%xmm3,%xmm3
|
||||||
|
|
||||||
dec %ecx
|
sub $2,%r8d
|
||||||
jnz .Ldoubleround
|
jnz .Ldoubleround
|
||||||
|
|
||||||
|
ret
|
||||||
|
ENDPROC(chacha_permute)
|
||||||
|
|
||||||
|
ENTRY(chacha_block_xor_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 1 data block output, o
|
||||||
|
# %rdx: up to 1 data block input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
|
# x0..3 = s0..3
|
||||||
|
movdqu 0x00(%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqu 0x20(%rdi),%xmm2
|
||||||
|
movdqu 0x30(%rdi),%xmm3
|
||||||
|
movdqa %xmm0,%xmm8
|
||||||
|
movdqa %xmm1,%xmm9
|
||||||
|
movdqa %xmm2,%xmm10
|
||||||
|
movdqa %xmm3,%xmm11
|
||||||
|
|
||||||
|
mov %rcx,%rax
|
||||||
|
call chacha_permute
|
||||||
|
|
||||||
# o0 = i0 ^ (x0 + s0)
|
# o0 = i0 ^ (x0 + s0)
|
||||||
movdqu 0x00(%rdx),%xmm4
|
|
||||||
paddd %xmm8,%xmm0
|
paddd %xmm8,%xmm0
|
||||||
|
cmp $0x10,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x00(%rdx),%xmm4
|
||||||
pxor %xmm4,%xmm0
|
pxor %xmm4,%xmm0
|
||||||
movdqu %xmm0,0x00(%rsi)
|
movdqu %xmm0,0x00(%rsi)
|
||||||
# o1 = i1 ^ (x1 + s1)
|
# o1 = i1 ^ (x1 + s1)
|
||||||
movdqu 0x10(%rdx),%xmm5
|
|
||||||
paddd %xmm9,%xmm1
|
paddd %xmm9,%xmm1
|
||||||
pxor %xmm5,%xmm1
|
movdqa %xmm1,%xmm0
|
||||||
movdqu %xmm1,0x10(%rsi)
|
cmp $0x20,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x10(%rdx),%xmm0
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x10(%rsi)
|
||||||
# o2 = i2 ^ (x2 + s2)
|
# o2 = i2 ^ (x2 + s2)
|
||||||
movdqu 0x20(%rdx),%xmm6
|
|
||||||
paddd %xmm10,%xmm2
|
paddd %xmm10,%xmm2
|
||||||
pxor %xmm6,%xmm2
|
movdqa %xmm2,%xmm0
|
||||||
movdqu %xmm2,0x20(%rsi)
|
cmp $0x30,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x20(%rdx),%xmm0
|
||||||
|
pxor %xmm2,%xmm0
|
||||||
|
movdqu %xmm0,0x20(%rsi)
|
||||||
# o3 = i3 ^ (x3 + s3)
|
# o3 = i3 ^ (x3 + s3)
|
||||||
movdqu 0x30(%rdx),%xmm7
|
|
||||||
paddd %xmm11,%xmm3
|
paddd %xmm11,%xmm3
|
||||||
pxor %xmm7,%xmm3
|
movdqa %xmm3,%xmm0
|
||||||
movdqu %xmm3,0x30(%rsi)
|
cmp $0x40,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x30(%rdx),%xmm0
|
||||||
|
pxor %xmm3,%xmm0
|
||||||
|
movdqu %xmm0,0x30(%rsi)
|
||||||
|
|
||||||
|
.Ldone:
|
||||||
|
FRAME_END
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_block_xor_ssse3)
|
|
||||||
|
|
||||||
ENTRY(chacha20_4block_xor_ssse3)
|
.Lxorpart:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rax,%r9
|
||||||
|
and $0x0f,%r9
|
||||||
|
jz .Ldone
|
||||||
|
and $~0x0f,%rax
|
||||||
|
|
||||||
|
mov %rsi,%r11
|
||||||
|
|
||||||
|
lea 8(%rsp),%r10
|
||||||
|
sub $0x10,%rsp
|
||||||
|
and $~31,%rsp
|
||||||
|
|
||||||
|
lea (%rdx,%rax),%rsi
|
||||||
|
mov %rsp,%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
pxor 0x00(%rsp),%xmm0
|
||||||
|
movdqa %xmm0,0x00(%rsp)
|
||||||
|
|
||||||
|
mov %rsp,%rsi
|
||||||
|
lea (%r11,%rax),%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
lea -8(%r10),%rsp
|
||||||
|
jmp .Ldone
|
||||||
|
|
||||||
|
ENDPROC(chacha_block_xor_ssse3)
|
||||||
|
|
||||||
|
ENTRY(hchacha_block_ssse3)
|
||||||
# %rdi: Input state matrix, s
|
# %rdi: Input state matrix, s
|
||||||
# %rsi: 4 data blocks output, o
|
# %rsi: output (8 32-bit words)
|
||||||
# %rdx: 4 data blocks input, i
|
# %edx: nrounds
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
# This function encrypts four consecutive ChaCha20 blocks by loading the
|
movdqu 0x00(%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqu 0x20(%rdi),%xmm2
|
||||||
|
movdqu 0x30(%rdi),%xmm3
|
||||||
|
|
||||||
|
mov %edx,%r8d
|
||||||
|
call chacha_permute
|
||||||
|
|
||||||
|
movdqu %xmm0,0x00(%rsi)
|
||||||
|
movdqu %xmm3,0x10(%rsi)
|
||||||
|
|
||||||
|
FRAME_END
|
||||||
|
ret
|
||||||
|
ENDPROC(hchacha_block_ssse3)
|
||||||
|
|
||||||
|
ENTRY(chacha_4block_xor_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 4 data blocks output, o
|
||||||
|
# %rdx: up to 4 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts four consecutive ChaCha blocks by loading the
|
||||||
# the state matrix in SSE registers four times. As we need some scratch
|
# the state matrix in SSE registers four times. As we need some scratch
|
||||||
# registers, we save the first four registers on the stack. The
|
# registers, we save the first four registers on the stack. The
|
||||||
# algorithm performs each operation on the corresponding word of each
|
# algorithm performs each operation on the corresponding word of each
|
||||||
|
@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
lea 8(%rsp),%r10
|
lea 8(%rsp),%r10
|
||||||
sub $0x80,%rsp
|
sub $0x80,%rsp
|
||||||
and $~63,%rsp
|
and $~63,%rsp
|
||||||
|
mov %rcx,%rax
|
||||||
|
|
||||||
# x0..15[0-3] = s0..3[0..3]
|
# x0..15[0-3] = s0..3[0..3]
|
||||||
movq 0x00(%rdi),%xmm1
|
movq 0x00(%rdi),%xmm1
|
||||||
|
@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
# x12 += counter values 0-3
|
# x12 += counter values 0-3
|
||||||
paddd %xmm1,%xmm12
|
paddd %xmm1,%xmm12
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround4:
|
.Ldoubleround4:
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
movdqa 0x00(%rsp),%xmm0
|
movdqa 0x00(%rsp),%xmm0
|
||||||
|
@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
psrld $25,%xmm4
|
psrld $25,%xmm4
|
||||||
por %xmm0,%xmm4
|
por %xmm0,%xmm4
|
||||||
|
|
||||||
dec %ecx
|
sub $2,%r8d
|
||||||
jnz .Ldoubleround4
|
jnz .Ldoubleround4
|
||||||
|
|
||||||
# x0[0-3] += s0[0]
|
# x0[0-3] += s0[0]
|
||||||
|
@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
|
|
||||||
# xor with corresponding input, write to output
|
# xor with corresponding input, write to output
|
||||||
movdqa 0x00(%rsp),%xmm0
|
movdqa 0x00(%rsp),%xmm0
|
||||||
|
cmp $0x10,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0x00(%rdx),%xmm1
|
movdqu 0x00(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x00(%rsi)
|
movdqu %xmm0,0x00(%rsi)
|
||||||
movdqa 0x10(%rsp),%xmm0
|
|
||||||
movdqu 0x80(%rdx),%xmm1
|
movdqu %xmm4,%xmm0
|
||||||
|
cmp $0x20,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x10(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x80(%rsi)
|
movdqu %xmm0,0x10(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm8,%xmm0
|
||||||
|
cmp $0x30,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x20(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x20(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm12,%xmm0
|
||||||
|
cmp $0x40,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x30(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x30(%rsi)
|
||||||
|
|
||||||
movdqa 0x20(%rsp),%xmm0
|
movdqa 0x20(%rsp),%xmm0
|
||||||
|
cmp $0x50,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0x40(%rdx),%xmm1
|
movdqu 0x40(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x40(%rsi)
|
movdqu %xmm0,0x40(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm6,%xmm0
|
||||||
|
cmp $0x60,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x50(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x50(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm10,%xmm0
|
||||||
|
cmp $0x70,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x60(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x60(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm14,%xmm0
|
||||||
|
cmp $0x80,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x70(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x70(%rsi)
|
||||||
|
|
||||||
|
movdqa 0x10(%rsp),%xmm0
|
||||||
|
cmp $0x90,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x80(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x80(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm5,%xmm0
|
||||||
|
cmp $0xa0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x90(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x90(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm9,%xmm0
|
||||||
|
cmp $0xb0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xa0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xa0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm13,%xmm0
|
||||||
|
cmp $0xc0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xb0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xb0(%rsi)
|
||||||
|
|
||||||
movdqa 0x30(%rsp),%xmm0
|
movdqa 0x30(%rsp),%xmm0
|
||||||
|
cmp $0xd0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0xc0(%rdx),%xmm1
|
movdqu 0xc0(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0xc0(%rsi)
|
movdqu %xmm0,0xc0(%rsi)
|
||||||
movdqu 0x10(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm4
|
|
||||||
movdqu %xmm4,0x10(%rsi)
|
|
||||||
movdqu 0x90(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm5
|
|
||||||
movdqu %xmm5,0x90(%rsi)
|
|
||||||
movdqu 0x50(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm6
|
|
||||||
movdqu %xmm6,0x50(%rsi)
|
|
||||||
movdqu 0xd0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm7
|
|
||||||
movdqu %xmm7,0xd0(%rsi)
|
|
||||||
movdqu 0x20(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm8
|
|
||||||
movdqu %xmm8,0x20(%rsi)
|
|
||||||
movdqu 0xa0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm9
|
|
||||||
movdqu %xmm9,0xa0(%rsi)
|
|
||||||
movdqu 0x60(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm10
|
|
||||||
movdqu %xmm10,0x60(%rsi)
|
|
||||||
movdqu 0xe0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm11
|
|
||||||
movdqu %xmm11,0xe0(%rsi)
|
|
||||||
movdqu 0x30(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm12
|
|
||||||
movdqu %xmm12,0x30(%rsi)
|
|
||||||
movdqu 0xb0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm13
|
|
||||||
movdqu %xmm13,0xb0(%rsi)
|
|
||||||
movdqu 0x70(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm14
|
|
||||||
movdqu %xmm14,0x70(%rsi)
|
|
||||||
movdqu 0xf0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm15
|
|
||||||
movdqu %xmm15,0xf0(%rsi)
|
|
||||||
|
|
||||||
|
movdqu %xmm7,%xmm0
|
||||||
|
cmp $0xe0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xd0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xd0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm11,%xmm0
|
||||||
|
cmp $0xf0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xe0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xe0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm15,%xmm0
|
||||||
|
cmp $0x100,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xf0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xf0(%rsi)
|
||||||
|
|
||||||
|
.Ldone4:
|
||||||
lea -8(%r10),%rsp
|
lea -8(%r10),%rsp
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_4block_xor_ssse3)
|
|
||||||
|
.Lxorpart4:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rax,%r9
|
||||||
|
and $0x0f,%r9
|
||||||
|
jz .Ldone4
|
||||||
|
and $~0x0f,%rax
|
||||||
|
|
||||||
|
mov %rsi,%r11
|
||||||
|
|
||||||
|
lea (%rdx,%rax),%rsi
|
||||||
|
mov %rsp,%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
pxor 0x00(%rsp),%xmm0
|
||||||
|
movdqa %xmm0,0x00(%rsp)
|
||||||
|
|
||||||
|
mov %rsp,%rsi
|
||||||
|
lea (%r11,%rax),%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
jmp .Ldone4
|
||||||
|
|
||||||
|
ENDPROC(chacha_4block_xor_ssse3)
|
|
@ -1,448 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
|
||||||
|
|
||||||
.section .rodata.cst32.ROT8, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
|
||||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
|
||||||
|
|
||||||
.section .rodata.cst32.ROT16, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
|
||||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
|
||||||
|
|
||||||
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
CTRINC: .octa 0x00000003000000020000000100000000
|
|
||||||
.octa 0x00000007000000060000000500000004
|
|
||||||
|
|
||||||
.text
|
|
||||||
|
|
||||||
ENTRY(chacha20_8block_xor_avx2)
|
|
||||||
# %rdi: Input state matrix, s
|
|
||||||
# %rsi: 8 data blocks output, o
|
|
||||||
# %rdx: 8 data blocks input, i
|
|
||||||
|
|
||||||
# This function encrypts eight consecutive ChaCha20 blocks by loading
|
|
||||||
# the state matrix in AVX registers eight times. As we need some
|
|
||||||
# scratch registers, we save the first four registers on the stack. The
|
|
||||||
# algorithm performs each operation on the corresponding word of each
|
|
||||||
# state matrix, hence requires no word shuffling. For final XORing step
|
|
||||||
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
|
|
||||||
# words, which allows us to do XOR in AVX registers. 8/16-bit word
|
|
||||||
# rotation is done with the slightly better performing byte shuffling,
|
|
||||||
# 7/12-bit word rotation uses traditional shift+OR.
|
|
||||||
|
|
||||||
vzeroupper
|
|
||||||
# 4 * 32 byte stack, 32-byte aligned
|
|
||||||
lea 8(%rsp),%r10
|
|
||||||
and $~31, %rsp
|
|
||||||
sub $0x80, %rsp
|
|
||||||
|
|
||||||
# x0..15[0-7] = s[0..15]
|
|
||||||
vpbroadcastd 0x00(%rdi),%ymm0
|
|
||||||
vpbroadcastd 0x04(%rdi),%ymm1
|
|
||||||
vpbroadcastd 0x08(%rdi),%ymm2
|
|
||||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
|
||||||
vpbroadcastd 0x10(%rdi),%ymm4
|
|
||||||
vpbroadcastd 0x14(%rdi),%ymm5
|
|
||||||
vpbroadcastd 0x18(%rdi),%ymm6
|
|
||||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
|
||||||
vpbroadcastd 0x20(%rdi),%ymm8
|
|
||||||
vpbroadcastd 0x24(%rdi),%ymm9
|
|
||||||
vpbroadcastd 0x28(%rdi),%ymm10
|
|
||||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
|
||||||
vpbroadcastd 0x30(%rdi),%ymm12
|
|
||||||
vpbroadcastd 0x34(%rdi),%ymm13
|
|
||||||
vpbroadcastd 0x38(%rdi),%ymm14
|
|
||||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
|
||||||
# x0..3 on stack
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa %ymm3,0x60(%rsp)
|
|
||||||
|
|
||||||
vmovdqa CTRINC(%rip),%ymm1
|
|
||||||
vmovdqa ROT8(%rip),%ymm2
|
|
||||||
vmovdqa ROT16(%rip),%ymm3
|
|
||||||
|
|
||||||
# x12 += counter values 0-3
|
|
||||||
vpaddd %ymm1,%ymm12,%ymm12
|
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround8:
|
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
|
||||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm3,%ymm12,%ymm12
|
|
||||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
|
||||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm3,%ymm13,%ymm13
|
|
||||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
|
||||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm3,%ymm14,%ymm14
|
|
||||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
|
||||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm3,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
|
||||||
vpaddd %ymm12,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm4,%ymm4
|
|
||||||
vpslld $12,%ymm4,%ymm0
|
|
||||||
vpsrld $20,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
|
||||||
vpaddd %ymm13,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm5,%ymm5
|
|
||||||
vpslld $12,%ymm5,%ymm0
|
|
||||||
vpsrld $20,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
|
||||||
vpaddd %ymm14,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm6,%ymm6
|
|
||||||
vpslld $12,%ymm6,%ymm0
|
|
||||||
vpsrld $20,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
|
||||||
vpaddd %ymm15,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm7,%ymm7
|
|
||||||
vpslld $12,%ymm7,%ymm0
|
|
||||||
vpsrld $20,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
|
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
|
||||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm2,%ymm12,%ymm12
|
|
||||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
|
||||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm2,%ymm13,%ymm13
|
|
||||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
|
||||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm2,%ymm14,%ymm14
|
|
||||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
|
||||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm2,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
|
||||||
vpaddd %ymm12,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm4,%ymm4
|
|
||||||
vpslld $7,%ymm4,%ymm0
|
|
||||||
vpsrld $25,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
|
||||||
vpaddd %ymm13,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm5,%ymm5
|
|
||||||
vpslld $7,%ymm5,%ymm0
|
|
||||||
vpsrld $25,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
|
||||||
vpaddd %ymm14,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm6,%ymm6
|
|
||||||
vpslld $7,%ymm6,%ymm0
|
|
||||||
vpsrld $25,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
|
||||||
vpaddd %ymm15,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm7,%ymm7
|
|
||||||
vpslld $7,%ymm7,%ymm0
|
|
||||||
vpsrld $25,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
|
|
||||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
|
||||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm3,%ymm15,%ymm15
|
|
||||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
|
|
||||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm3,%ymm12,%ymm12
|
|
||||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
|
||||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm3,%ymm13,%ymm13
|
|
||||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
|
||||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm3,%ymm14,%ymm14
|
|
||||||
|
|
||||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
|
||||||
vpaddd %ymm15,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm5,%ymm5
|
|
||||||
vpslld $12,%ymm5,%ymm0
|
|
||||||
vpsrld $20,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
|
||||||
vpaddd %ymm12,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm6,%ymm6
|
|
||||||
vpslld $12,%ymm6,%ymm0
|
|
||||||
vpsrld $20,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
|
||||||
vpaddd %ymm13,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm7,%ymm7
|
|
||||||
vpslld $12,%ymm7,%ymm0
|
|
||||||
vpsrld $20,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
|
||||||
vpaddd %ymm14,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm4,%ymm4
|
|
||||||
vpslld $12,%ymm4,%ymm0
|
|
||||||
vpsrld $20,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
|
|
||||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
|
||||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm2,%ymm15,%ymm15
|
|
||||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
|
||||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm2,%ymm12,%ymm12
|
|
||||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
|
||||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm2,%ymm13,%ymm13
|
|
||||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
|
||||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm2,%ymm14,%ymm14
|
|
||||||
|
|
||||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
|
||||||
vpaddd %ymm15,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm5,%ymm5
|
|
||||||
vpslld $7,%ymm5,%ymm0
|
|
||||||
vpsrld $25,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
|
||||||
vpaddd %ymm12,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm6,%ymm6
|
|
||||||
vpslld $7,%ymm6,%ymm0
|
|
||||||
vpsrld $25,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
|
||||||
vpaddd %ymm13,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm7,%ymm7
|
|
||||||
vpslld $7,%ymm7,%ymm0
|
|
||||||
vpsrld $25,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
|
||||||
vpaddd %ymm14,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm4,%ymm4
|
|
||||||
vpslld $7,%ymm4,%ymm0
|
|
||||||
vpsrld $25,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
|
|
||||||
dec %ecx
|
|
||||||
jnz .Ldoubleround8
|
|
||||||
|
|
||||||
# x0..15[0-3] += s[0..15]
|
|
||||||
vpbroadcastd 0x00(%rdi),%ymm0
|
|
||||||
vpaddd 0x00(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpbroadcastd 0x04(%rdi),%ymm0
|
|
||||||
vpaddd 0x20(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpbroadcastd 0x08(%rdi),%ymm0
|
|
||||||
vpaddd 0x40(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpbroadcastd 0x0c(%rdi),%ymm0
|
|
||||||
vpaddd 0x60(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpbroadcastd 0x10(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm4,%ymm4
|
|
||||||
vpbroadcastd 0x14(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm5,%ymm5
|
|
||||||
vpbroadcastd 0x18(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm6,%ymm6
|
|
||||||
vpbroadcastd 0x1c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm7,%ymm7
|
|
||||||
vpbroadcastd 0x20(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm8,%ymm8
|
|
||||||
vpbroadcastd 0x24(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm9,%ymm9
|
|
||||||
vpbroadcastd 0x28(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm10,%ymm10
|
|
||||||
vpbroadcastd 0x2c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm11,%ymm11
|
|
||||||
vpbroadcastd 0x30(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm12,%ymm12
|
|
||||||
vpbroadcastd 0x34(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm13,%ymm13
|
|
||||||
vpbroadcastd 0x38(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm14,%ymm14
|
|
||||||
vpbroadcastd 0x3c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x12 += counter values 0-3
|
|
||||||
vpaddd %ymm1,%ymm12,%ymm12
|
|
||||||
|
|
||||||
# interleave 32-bit words in state n, n+1
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vmovdqa 0x20(%rsp),%ymm1
|
|
||||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
|
||||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
|
||||||
vmovdqa %ymm2,0x00(%rsp)
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vmovdqa 0x60(%rsp),%ymm1
|
|
||||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
|
||||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa %ymm1,0x60(%rsp)
|
|
||||||
vmovdqa %ymm4,%ymm0
|
|
||||||
vpunpckldq %ymm5,%ymm0,%ymm4
|
|
||||||
vpunpckhdq %ymm5,%ymm0,%ymm5
|
|
||||||
vmovdqa %ymm6,%ymm0
|
|
||||||
vpunpckldq %ymm7,%ymm0,%ymm6
|
|
||||||
vpunpckhdq %ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm8,%ymm0
|
|
||||||
vpunpckldq %ymm9,%ymm0,%ymm8
|
|
||||||
vpunpckhdq %ymm9,%ymm0,%ymm9
|
|
||||||
vmovdqa %ymm10,%ymm0
|
|
||||||
vpunpckldq %ymm11,%ymm0,%ymm10
|
|
||||||
vpunpckhdq %ymm11,%ymm0,%ymm11
|
|
||||||
vmovdqa %ymm12,%ymm0
|
|
||||||
vpunpckldq %ymm13,%ymm0,%ymm12
|
|
||||||
vpunpckhdq %ymm13,%ymm0,%ymm13
|
|
||||||
vmovdqa %ymm14,%ymm0
|
|
||||||
vpunpckldq %ymm15,%ymm0,%ymm14
|
|
||||||
vpunpckhdq %ymm15,%ymm0,%ymm15
|
|
||||||
|
|
||||||
# interleave 64-bit words in state n, n+2
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vmovdqa 0x40(%rsp),%ymm2
|
|
||||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
|
||||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
|
||||||
vmovdqa %ymm1,0x00(%rsp)
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vmovdqa 0x60(%rsp),%ymm2
|
|
||||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
|
||||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa %ymm2,0x60(%rsp)
|
|
||||||
vmovdqa %ymm4,%ymm0
|
|
||||||
vpunpcklqdq %ymm6,%ymm0,%ymm4
|
|
||||||
vpunpckhqdq %ymm6,%ymm0,%ymm6
|
|
||||||
vmovdqa %ymm5,%ymm0
|
|
||||||
vpunpcklqdq %ymm7,%ymm0,%ymm5
|
|
||||||
vpunpckhqdq %ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm8,%ymm0
|
|
||||||
vpunpcklqdq %ymm10,%ymm0,%ymm8
|
|
||||||
vpunpckhqdq %ymm10,%ymm0,%ymm10
|
|
||||||
vmovdqa %ymm9,%ymm0
|
|
||||||
vpunpcklqdq %ymm11,%ymm0,%ymm9
|
|
||||||
vpunpckhqdq %ymm11,%ymm0,%ymm11
|
|
||||||
vmovdqa %ymm12,%ymm0
|
|
||||||
vpunpcklqdq %ymm14,%ymm0,%ymm12
|
|
||||||
vpunpckhqdq %ymm14,%ymm0,%ymm14
|
|
||||||
vmovdqa %ymm13,%ymm0
|
|
||||||
vpunpcklqdq %ymm15,%ymm0,%ymm13
|
|
||||||
vpunpckhqdq %ymm15,%ymm0,%ymm15
|
|
||||||
|
|
||||||
# interleave 128-bit words in state n, n+4
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
|
||||||
vmovdqa %ymm1,0x00(%rsp)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
|
|
||||||
vmovdqa %ymm1,0x40(%rsp)
|
|
||||||
vmovdqa 0x60(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm1,0x60(%rsp)
|
|
||||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
|
||||||
vmovdqa %ymm0,%ymm8
|
|
||||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
|
||||||
vmovdqa %ymm0,%ymm9
|
|
||||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
|
||||||
vmovdqa %ymm0,%ymm10
|
|
||||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
|
||||||
vmovdqa %ymm0,%ymm11
|
|
||||||
|
|
||||||
# xor with corresponding input, write to output
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vpxor 0x0000(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0000(%rsi)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vpxor 0x0080(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0080(%rsi)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vpxor 0x0040(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0040(%rsi)
|
|
||||||
vmovdqa 0x60(%rsp),%ymm0
|
|
||||||
vpxor 0x00c0(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x00c0(%rsi)
|
|
||||||
vpxor 0x0100(%rdx),%ymm4,%ymm4
|
|
||||||
vmovdqu %ymm4,0x0100(%rsi)
|
|
||||||
vpxor 0x0180(%rdx),%ymm5,%ymm5
|
|
||||||
vmovdqu %ymm5,0x00180(%rsi)
|
|
||||||
vpxor 0x0140(%rdx),%ymm6,%ymm6
|
|
||||||
vmovdqu %ymm6,0x0140(%rsi)
|
|
||||||
vpxor 0x01c0(%rdx),%ymm7,%ymm7
|
|
||||||
vmovdqu %ymm7,0x01c0(%rsi)
|
|
||||||
vpxor 0x0020(%rdx),%ymm8,%ymm8
|
|
||||||
vmovdqu %ymm8,0x0020(%rsi)
|
|
||||||
vpxor 0x00a0(%rdx),%ymm9,%ymm9
|
|
||||||
vmovdqu %ymm9,0x00a0(%rsi)
|
|
||||||
vpxor 0x0060(%rdx),%ymm10,%ymm10
|
|
||||||
vmovdqu %ymm10,0x0060(%rsi)
|
|
||||||
vpxor 0x00e0(%rdx),%ymm11,%ymm11
|
|
||||||
vmovdqu %ymm11,0x00e0(%rsi)
|
|
||||||
vpxor 0x0120(%rdx),%ymm12,%ymm12
|
|
||||||
vmovdqu %ymm12,0x0120(%rsi)
|
|
||||||
vpxor 0x01a0(%rdx),%ymm13,%ymm13
|
|
||||||
vmovdqu %ymm13,0x01a0(%rsi)
|
|
||||||
vpxor 0x0160(%rdx),%ymm14,%ymm14
|
|
||||||
vmovdqu %ymm14,0x0160(%rsi)
|
|
||||||
vpxor 0x01e0(%rdx),%ymm15,%ymm15
|
|
||||||
vmovdqu %ymm15,0x01e0(%rsi)
|
|
||||||
|
|
||||||
vzeroupper
|
|
||||||
lea -8(%r10),%rsp
|
|
||||||
ret
|
|
||||||
ENDPROC(chacha20_8block_xor_avx2)
|
|
|
@ -1,146 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
|
||||||
#include <crypto/chacha.h>
|
|
||||||
#include <crypto/internal/skcipher.h>
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <asm/fpu/api.h>
|
|
||||||
#include <asm/simd.h>
|
|
||||||
|
|
||||||
#define CHACHA20_STATE_ALIGN 16
|
|
||||||
|
|
||||||
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
static bool chacha20_use_avx2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
|
||||||
unsigned int bytes)
|
|
||||||
{
|
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
|
||||||
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
if (chacha20_use_avx2) {
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
|
||||||
chacha20_8block_xor_avx2(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 8;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 8;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 8;
|
|
||||||
state[12] += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
|
||||||
chacha20_4block_xor_ssse3(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_block_xor_ssse3(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha20_block_xor_ssse3(state, buf, buf);
|
|
||||||
memcpy(dst, buf, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int chacha20_simd(struct skcipher_request *req)
|
|
||||||
{
|
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
||||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
|
||||||
struct skcipher_walk walk;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
|
||||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, true);
|
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, walk.iv);
|
|
||||||
|
|
||||||
kernel_fpu_begin();
|
|
||||||
|
|
||||||
while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
|
|
||||||
err = skcipher_walk_done(&walk,
|
|
||||||
walk.nbytes % CHACHA_BLOCK_SIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (walk.nbytes) {
|
|
||||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
walk.nbytes);
|
|
||||||
err = skcipher_walk_done(&walk, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
kernel_fpu_end();
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct skcipher_alg alg = {
|
|
||||||
.base.cra_name = "chacha20",
|
|
||||||
.base.cra_driver_name = "chacha20-simd",
|
|
||||||
.base.cra_priority = 300,
|
|
||||||
.base.cra_blocksize = 1,
|
|
||||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
|
||||||
.base.cra_module = THIS_MODULE,
|
|
||||||
|
|
||||||
.min_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
|
||||||
.setkey = crypto_chacha20_setkey,
|
|
||||||
.encrypt = chacha20_simd,
|
|
||||||
.decrypt = chacha20_simd,
|
|
||||||
};
|
|
||||||
|
|
||||||
static int __init chacha20_simd_mod_init(void)
|
|
||||||
{
|
|
||||||
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
|
|
||||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
|
||||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
|
||||||
#endif
|
|
||||||
return crypto_register_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __exit chacha20_simd_mod_fini(void)
|
|
||||||
{
|
|
||||||
crypto_unregister_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
module_init(chacha20_simd_mod_init);
|
|
||||||
module_exit(chacha20_simd_mod_fini);
|
|
||||||
|
|
||||||
MODULE_LICENSE("GPL");
|
|
||||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
|
||||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
|
322
arch/x86/crypto/chacha_glue.c
Normal file
322
arch/x86/crypto/chacha_glue.c
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
/*
|
||||||
|
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <asm/fpu/api.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
|
||||||
|
|
||||||
|
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
|
||||||
|
{
|
||||||
|
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
|
||||||
|
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
static_branch_likely(&chacha_use_avx512vl)) {
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||||
|
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
state[12] += 8;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 8);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||||
|
chacha_4block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
chacha_2block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX2) &&
|
||||||
|
static_branch_likely(&chacha_use_avx2)) {
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||||
|
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
state[12] += 8;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 8);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||||
|
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
state[12] += 4;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
state[12]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd()) {
|
||||||
|
hchacha_block_generic(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
hchacha_block_ssse3(state, stream, nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
|
||||||
|
bytes <= CHACHA_BLOCK_SIZE)
|
||||||
|
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
chacha_dosimd(state, dst, src, todo, nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
static int chacha_simd_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
|
{
|
||||||
|
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) ||
|
||||||
|
!may_use_simd()) {
|
||||||
|
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
chacha_dosimd(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
}
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_simd_stream_xor(req, ctx, req->iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
} else {
|
||||||
|
hchacha_block_generic(state, subctx.key, ctx->nrounds);
|
||||||
|
}
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_simd_stream_xor(req, &subctx, real_iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * skcipher algorithm registrations for the SIMD-accelerated ChaCha family.
 * All three are stream ciphers (cra_blocksize = 1, chunksize = one ChaCha
 * block); encrypt and decrypt map to the same handler since the cipher is
 * its own inverse.
 */
static struct skcipher_alg algs[] = {
	{
		/* ChaCha20: 96-bit IV, 20 rounds (set via chacha20_setkey). */
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_simd,
		.decrypt		= chacha_simd,
	}, {
		/* XChaCha20: extended 192-bit nonce, 20 rounds. */
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	}, {
		/* XChaCha12: extended nonce, reduced 12-round variant. */
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	},
};
|
||||||
|
|
||||||
|
/*
 * Module init: probe CPU features, flip the matching static keys, and
 * register the skcipher algorithms.
 *
 * SSSE3 is the baseline for every SIMD path; without it the module loads
 * as a no-op (returns 0 without registering anything).  AVX2 and
 * AVX-512VL are layered on top when both the toolchain (CONFIG_AS_*) and
 * the CPU support them.
 */
static int __init chacha_simd_mod_init(void)
{
	bool have_avx2;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&chacha_use_simd);

	have_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
		    boot_cpu_has(X86_FEATURE_AVX) &&
		    boot_cpu_has(X86_FEATURE_AVX2) &&
		    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
	if (have_avx2) {
		static_branch_enable(&chacha_use_avx2);

		/* The AVX-512VL code also requires AVX512BW, for kmovq. */
		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW))
			static_branch_enable(&chacha_use_avx512vl);
	}

	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return 0;

	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||||
|
|
||||||
|
/*
 * Module exit: undo registration, mirroring the init path.  The SSSE3
 * check matches init's early return, so we never unregister algorithms
 * that were never registered.
 */
static void __exit chacha_simd_mod_fini(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return;

	if (boot_cpu_has(X86_FEATURE_SSSE3))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
/* Aliases so the crypto API can autoload this module by algorithm name. */
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue