Merge android-4.19-stable.157 (8ee67bc) into msm-4.19
* refs/heads/tmp-8ee67bc Revert "nl80211: fix non-split wiphy information" Reverting usb changes Linux 4.19.157 powercap: restrict energy meter to root access Revert "ANDROID: Kbuild, LLVMLinux: allow overriding clang target triple" Linux 4.19.156 arm64: dts: marvell: espressobin: Add ethernet switch aliases net: dsa: read mac address from DT for slave device tools: perf: Fix build error in v4.19.y perf/core: Fix a memory leak in perf_event_parse_addr_filter() PM: runtime: Resume the device earlier in __device_release_driver() Revert "ARC: entry: fix potential EFA clobber when TIF_SYSCALL_TRACE" ARC: stack unwinding: avoid indefinite looping usb: mtu3: fix panic in mtu3_gadget_stop() USB: Add NO_LPM quirk for Kingston flash drive USB: serial: option: add Telit FN980 composition 0x1055 USB: serial: option: add LE910Cx compositions 0x1203, 0x1230, 0x1231 USB: serial: option: add Quectel EC200T module support USB: serial: cyberjack: fix write-URB completion race serial: txx9: add missing platform_driver_unregister() on error in serial_txx9_init serial: 8250_mtk: Fix uart_get_baud_rate warning fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent vt: Disable KD_FONT_OP_COPY ACPI: NFIT: Fix comparison to '-ENXIO' drm/vc4: drv: Add error handding for bind vsock: use ns_capable_noaudit() on socket create scsi: core: Don't start concurrent async scan on same host blk-cgroup: Pre-allocate tree node on blkg_conf_prep blk-cgroup: Fix memleak on error path of: Fix reserved-memory overlap detection x86/kexec: Use up-to-dated screen_info copy to fill boot params ARM: dts: sun4i-a10: fix cpu_alert temperature futex: Handle transient "ownerless" rtmutex state correctly tracing: Fix out of bounds write in get_trace_buf ftrace: Handle tracing when switching between context ftrace: Fix recursion check for NMI test ring-buffer: Fix recursion protection transitions between interrupt context gfs2: Wake up when sd_glock_disposal becomes zero mm: always have 
io_remap_pfn_range() set pgprot_decrypted() kthread_worker: prevent queuing delayed work from timer_fn when it is being canceled lib/crc32test: remove extra local_irq_disable/enable mm: mempolicy: fix potential pte_unmap_unlock pte error ALSA: usb-audio: Add implicit feedback quirk for MODX ALSA: usb-audio: Add implicit feedback quirk for Qu-16 ALSA: usb-audio: add usb vendor id as DSD-capable for Khadas devices ALSA: usb-audio: Add implicit feedback quirk for Zoom UAC-2 Fonts: Replace discarded const qualifier btrfs: tree-checker: fix the error message for transid error btrfs: tree-checker: Verify inode item btrfs: tree-checker: Enhance chunk checker to validate chunk profile btrfs: tree-checker: Fix wrong check on max devid btrfs: tree-checker: Verify dev item btrfs: tree-checker: Check chunk item at tree block read time btrfs: tree-checker: Make btrfs_check_chunk_valid() return EUCLEAN instead of EIO btrfs: tree-checker: Make chunk item checker messages more readable btrfs: Move btrfs_check_chunk_valid() to tree-check.[ch] and export it btrfs: Don't submit any btree write bio if the fs has errors Btrfs: fix unwritten extent buffers and hangs on future writeback attempts btrfs: extent_io: add proper error handling to lock_extent_buffer_for_io() btrfs: extent_io: Handle errors better in btree_write_cache_pages() btrfs: extent_io: Handle errors better in extent_write_full_page() btrfs: flush write bio if we loop in extent_write_cache_pages Revert "btrfs: flush write bio if we loop in extent_write_cache_pages" btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up btrfs: extent_io: Kill the forward declaration of flush_write_bio blktrace: fix debugfs use after free sfp: Fix error handing in sfp_probe() sctp: Fix COMM_LOST/CANT_STR_ASSOC err reporting on big-endian platforms net: usb: qmi_wwan: add Telit LE910Cx 0x1230 composition gianfar: Account for Tx PTP timestamp in the skb headroom gianfar: Replace skb_realloc_headroom with skb_cow_head for PTP 
chelsio/chtls: fix always leaking ctrl_skb chelsio/chtls: fix memory leaks caused by a race cadence: force nonlinear buffers to be cloned ptrace: fix task_join_group_stop() for the case when current is traced tipc: fix use-after-free in tipc_bcast_get_mode drm/i915: Break up error capture compression loops with cond_resched() ANDROID: fuse: Add support for d_canonical_path ANDROID: vfs: add d_canonical_path for stacked filesystem support ANDROID: Temporarily disable XFRM_USER_COMPAT filtering Linux 4.19.155 staging: octeon: Drop on uncorrectable alignment or FCS error staging: octeon: repair "fixed-link" support staging: comedi: cb_pcidas: Allow 2-channel commands for AO subdevice KVM: arm64: Fix AArch32 handling of DBGD{CCINT,SCRext} and DBGVCR device property: Don't clear secondary pointer for shared primary firmware node device property: Keep secondary firmware node secondary by type ARM: s3c24xx: fix missing system reset ARM: samsung: fix PM debug build with DEBUG_LL but !MMU arm: dts: mt7623: add missing pause for switchport hil/parisc: Disable HIL driver when it gets stuck cachefiles: Handle readpage error correctly arm64: berlin: Select DW_APB_TIMER_OF tty: make FONTX ioctl use the tty pointer they were actually passed rtc: rx8010: don't modify the global rtc ops drm/ttm: fix eviction valuable range check. 
ext4: fix invalid inode checksum ext4: fix error handling code in add_new_gdb ext4: fix leaking sysfs kobject after failed mount vringh: fix __vringh_iov() when riov and wiov are different ring-buffer: Return 0 on success from ring_buffer_resize() 9P: Cast to loff_t before multiplying libceph: clear con->out_msg on Policy::stateful_server faults ceph: promote to unsigned long long before shifting drm/amd/display: Don't invoke kgdb_breakpoint() unconditionally drm/amdgpu: don't map BO in reserved region i2c: imx: Fix external abort on interrupt in exit paths ia64: fix build error with !COREDUMP ubi: check kthread_should_stop() after the setting of task state perf python scripting: Fix printable strings in python3 scripts ubifs: dent: Fix some potential memory leaks while iterating entries NFSD: Add missing NFSv2 .pc_func methods NFSv4.2: support EXCHGID4_FLAG_SUPP_FENCE_OPS 4.2 EXCHANGE_ID flag powerpc: Fix undetected data corruption with P9N DD2.1 VSX CI load emulation powerpc/powernv/elog: Fix race while processing OPAL error log event. powerpc: Warn about use of smt_snooze_delay powerpc/rtas: Restrict RTAS requests from userspace s390/stp: add locking to sysfs functions powerpc/drmem: Make lmb_size 64 bit iio:gyro:itg3200: Fix timestamp alignment and prevent data leak. iio:adc:ti-adc12138 Fix alignment issue with timestamp iio:adc:ti-adc0832 Fix alignment issue with timestamp iio:light:si1145: Fix timestamp alignment and prevent data leak. 
dmaengine: dma-jz4780: Fix race in jz4780_dma_tx_status udf: Fix memory leak when mounting HID: wacom: Avoid entering wacom_wac_pen_report for pad / battery vt: keyboard, extend func_buf_lock to readers vt: keyboard, simplify vt_kdgkbsent drm/i915: Force VT'd workarounds when running as a guest OS usb: host: fsl-mph-dr-of: check return of dma_set_mask() usb: typec: tcpm: reset hard_reset_count for any disconnect usb: cdc-acm: fix cooldown mechanism usb: dwc3: core: don't trigger runtime pm when remove driver usb: dwc3: core: add phy cleanup for probe error handling usb: dwc3: gadget: Check MPS of the request length usb: dwc3: ep0: Fix ZLP for OUT ep0 requests usb: xhci: Workaround for S3 issue on AMD SNPS 3.0 xHC btrfs: fix use-after-free on readahead extent after failure to create it btrfs: cleanup cow block on error btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send() btrfs: send, recompute reference path after orphanization of a directory btrfs: reschedule if necessary when logging directory items btrfs: improve device scanning messages btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode scsi: qla2xxx: Fix crash on session cleanup with unload scsi: mptfusion: Fix null pointer dereferences in mptscsih_remove() w1: mxc_w1: Fix timeout resolution problem leading to bus error acpi-cpufreq: Honor _PSD table setting on new AMD CPUs ACPI: debug: don't allow debugging when ACPI is disabled ACPI: video: use ACPI backlight for HP 635 Notebook ACPI / extlog: Check for RDMSR failure ACPI: button: fix handling lid state changes when input device closed NFS: fix nfs_path in case of a rename retry fs: Don't invalidate page buffers in block_write_full_page() media: uvcvideo: Fix uvc_ctrl_fixup_xu_info() not having any effect leds: bcm6328, bcm6358: use devres LED registering function perf/x86/amd/ibs: Fix raw sample data accumulation perf/x86/amd/ibs: Don't include randomized bits in get_ibs_op_count() mmc: sdhci-acpi: AMDI0040: Set 
SDHCI_QUIRK2_PRESET_VALUE_BROKEN md/raid5: fix oops during stripe resizing nvme-rdma: fix crash when connect rejected sgl_alloc_order: fix memory leak nbd: make the config put is called before the notifying the waiter ARM: dts: s5pv210: remove dedicated 'audio-subsystem' node ARM: dts: s5pv210: move PMU node out of clock controller ARM: dts: s5pv210: remove DMA controller bus node name to fix dtschema warnings memory: emif: Remove bogus debugfs error handling ARM: dts: omap4: Fix sgx clock rate for 4430 arm64: dts: renesas: ulcb: add full-pwr-cycle-in-suspend into eMMC nodes cifs: handle -EINTR in cifs_setattr gfs2: add validation checks for size of superblock ext4: Detect already used quota file early drivers: watchdog: rdc321x_wdt: Fix race condition bugs net: 9p: initialize sun_server.sun_path to have addr's value only when addr is valid clk: ti: clockdomain: fix static checker warning rpmsg: glink: Use complete_all for open states bnxt_en: Log unknown link speed appropriately. md/bitmap: md_bitmap_get_counter returns wrong blocks btrfs: fix replace of seed device drm/amd/display: HDMI remote sink need mode validation for Linux power: supply: test_power: add missing newlines when printing parameters by sysfs bus/fsl_mc: Do not rely on caller to provide non NULL mc_io drivers/net/wan/hdlc_fr: Correctly handle special skb->protocol values ACPI: Add out of bounds and numa_off protections to pxm_to_node() xfs: don't free rt blocks when we're doing a REMAP bunmapi call arm64/mm: return cpu_all_mask when node is NUMA_NO_NODE usb: xhci: omit duplicate actions when suspending a runtime suspended host. 
uio: free uio id after uio file node is freed USB: adutux: fix debugging cpufreq: sti-cpufreq: add stih418 support riscv: Define AT_VECTOR_SIZE_ARCH for ARCH_DLINFO media: uvcvideo: Fix dereference of out-of-bound list iterator kgdb: Make "kgdbcon" work properly with "kgdb_earlycon" ia64: kprobes: Use generic kretprobe trampoline handler printk: reduce LOG_BUF_SHIFT range for H8300 arm64: topology: Stop using MPIDR for topology information drm/bridge/synopsys: dsi: add support for non-continuous HS clock mmc: via-sdmmc: Fix data race bug media: imx274: fix frame interval handling media: tw5864: check status of tw5864_frameinterval_get usb: typec: tcpm: During PR_SWAP, source caps should be sent only after tSwapSourceStart media: platform: Improve queue set up flow for bug fixing media: videodev2.h: RGB BT2020 and HSV are always full range drm/brige/megachips: Add checking if ge_b850v3_lvds_init() is working correctly ath10k: fix VHT NSS calculation when STBC is enabled ath10k: start recovery process when payload length exceeds max htc length for sdio video: fbdev: pvr2fb: initialize variables xfs: fix realtime bitmap/summary file truncation when growing rt volume power: supply: bq27xxx: report "not charging" on all types ARM: 8997/2: hw_breakpoint: Handle inexact watchpoint addresses um: change sigio_spinlock to a mutex f2fs: fix to check segment boundary during SIT page readahead f2fs: fix uninit-value in f2fs_lookup f2fs: add trace exit in exception path sparc64: remove mm_cpumask clearing to fix kthread_use_mm race powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race powerpc/powernv/smp: Fix spurious DBG() warning futex: Fix incorrect should_fail_futex() handling ata: sata_nv: Fix retrieving of active qcs RDMA/qedr: Fix memory leak in iWARP CM mlxsw: core: Fix use-after-free in mlxsw_emad_trans_finish() x86/unwind/orc: Fix inactive tasks with stack pointer in %sp on GCC 10 compiled kernels 
xen/events: block rogue events for some time xen/events: defer eoi in case of excessive number of events xen/events: use a common cpu hotplug hook for event channels xen/events: switch user event channels to lateeoi model xen/pciback: use lateeoi irq binding xen/pvcallsback: use lateeoi irq binding xen/scsiback: use lateeoi irq binding xen/netback: use lateeoi irq binding xen/blkback: use lateeoi irq binding xen/events: add a new "late EOI" evtchn framework xen/events: fix race in evtchn_fifo_unmask() xen/events: add a proper barrier to 2-level uevent unmasking xen/events: avoid removing an event channel while handling it xen/events: don't use chip_data for legacy IRQs Revert "block: ratelimit handle_bad_sector() message" fscrypt: fix race where ->lookup() marks plaintext dentry as ciphertext fscrypt: only set dentry_operations on ciphertext dentries fs, fscrypt: clear DCACHE_ENCRYPTED_NAME when unaliasing directory fscrypt: fix race allowing rename() and link() of ciphertext dentries fscrypt: clean up and improve dentry revalidation fscrypt: return -EXDEV for incompatible rename or link into encrypted dir ata: sata_rcar: Fix DMA boundary mask serial: pl011: Fix lockdep splat when handling magic-sysrq interrupt mtd: lpddr: Fix bad logic in print_drs_error RDMA/addr: Fix race with netevent_callback()/rdma_addr_cancel() cxl: Rework error message for incompatible slots p54: avoid accessing the data mapped to streaming DMA evm: Check size of security.evm before using it bpf: Fix comment for helper bpf_current_task_under_cgroup() fuse: fix page dereference after free x86/xen: disable Firmware First mode for correctable memory errors arch/x86/amd/ibs: Fix re-arming IBS Fetch cxgb4: set up filter action after rewrites r8169: fix issue with forced threading in combination with shared interrupts tipc: fix memory leak caused by tipc_buf_append() tcp: Prevent low rmem stalls with SO_RCVLOWAT. 
ravb: Fix bit fields checking in ravb_hwtstamp_get() netem: fix zero division in tabledist mlxsw: core: Fix memory leak on module removal gtp: fix an use-before-init in gtp_newlink() chelsio/chtls: fix tls record info to user chelsio/chtls: fix memory leaks in CPL handlers chelsio/chtls: fix deadlock issue efivarfs: Replace invalid slashes with exclamation marks in dentries. x86/PCI: Fix intel_mid_pci.c build error when ACPI is not enabled arm64: link with -z norelro regardless of CONFIG_RELOCATABLE arm64: Run ARCH_WORKAROUND_1 enabling code on all CPUs scripts/setlocalversion: make git describe output more reliable objtool: Support Clang non-section symbols in ORC generation ANDROID: GKI: Enable DEBUG_INFO_DWARF4 UPSTREAM: mm/sl[uo]b: export __kmalloc_track(_node)_caller BACKPORT: xfrm/compat: Translate 32-bit user_policy from sockptr BACKPORT: xfrm/compat: Add 32=>64-bit messages translator UPSTREAM: xfrm/compat: Attach xfrm dumps to 64=>32 bit translator UPSTREAM: xfrm/compat: Add 64=>32-bit messages translator BACKPORT: xfrm: Provide API to register translator module ANDROID: Publish uncompressed Image on aarch64 FROMLIST: crypto: arm64/poly1305-neon - reorder PAC authentication with SP update UPSTREAM: crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian UPSTREAM: crypto: arm64/chacha - fix hchacha_block_neon() for big endian Linux 4.19.154 usb: gadget: f_ncm: allow using NCM in SuperSpeed Plus gadgets. eeprom: at25: set minimum read/write access stride to 1 USB: cdc-wdm: Make wdm_flush() interruptible and add wdm_fsync(). 
usb: cdc-acm: add quirk to blacklist ETAS ES58X devices tty: serial: fsl_lpuart: fix lpuart32_poll_get_char net: korina: cast KSEG0 address to pointer in kfree ath10k: check idx validity in __ath10k_htt_rx_ring_fill_n() scsi: ufs: ufs-qcom: Fix race conditions caused by ufs_qcom_testbus_config() usb: core: Solve race condition in anchor cleanup functions brcm80211: fix possible memleak in brcmf_proto_msgbuf_attach mwifiex: don't call del_timer_sync() on uninitialized timer reiserfs: Fix memory leak in reiserfs_parse_options() ipvs: Fix uninit-value in do_ip_vs_set_ctl() tty: ipwireless: fix error handling scsi: qedi: Fix list_del corruption while removing active I/O scsi: qedi: Protect active command list to avoid list corruption Fix use after free in get_capset_info callback. rtl8xxxu: prevent potential memory leak brcmsmac: fix memory leak in wlc_phy_attach_lcnphy scsi: ibmvfc: Fix error return in ibmvfc_probe() Bluetooth: Only mark socket zapped after unlocking usb: ohci: Default to per-port over-current protection xfs: make sure the rt allocator doesn't run off the end reiserfs: only call unlock_new_inode() if I_NEW misc: rtsx: Fix memory leak in rtsx_pci_probe ath9k: hif_usb: fix race condition between usb_get_urb() and usb_kill_anchored_urbs() can: flexcan: flexcan_chip_stop(): add error handling and propagate error value usb: dwc3: simple: add support for Hikey 970 USB: cdc-acm: handle broken union descriptors udf: Avoid accessing uninitialized data on failed inode read udf: Limit sparing table size usb: gadget: function: printer: fix use-after-free in __lock_acquire misc: vop: add round_up(x,4) for vring_size to avoid kernel panic mic: vop: copy data to kernel space then write to io memory scsi: target: core: Add CONTROL field for trace events scsi: mvumi: Fix error return in mvumi_io_attach() PM: hibernate: remove the bogus call to get_gendisk() in software_resume() mac80211: handle lack of sband->bitrates in rates ip_gre: set dev->hard_header_len and 
dev->needed_headroom properly ntfs: add check for mft record size in superblock media: venus: core: Fix runtime PM imbalance in venus_probe fs: dlm: fix configfs memory leak media: saa7134: avoid a shift overflow mmc: sdio: Check for CISTPL_VERS_1 buffer size media: uvcvideo: Ensure all probed info is returned to v4l2 media: media/pci: prevent memory leak in bttv_probe media: bdisp: Fix runtime PM imbalance on error media: platform: sti: hva: Fix runtime PM imbalance on error media: platform: s3c-camif: Fix runtime PM imbalance on error media: vsp1: Fix runtime PM imbalance on error media: exynos4-is: Fix a reference count leak media: exynos4-is: Fix a reference count leak due to pm_runtime_get_sync media: exynos4-is: Fix several reference count leaks due to pm_runtime_get_sync media: sti: Fix reference count leaks media: st-delta: Fix reference count leak in delta_run_work media: ati_remote: sanity check for both endpoints media: firewire: fix memory leak crypto: ccp - fix error handling block: ratelimit handle_bad_sector() message i2c: core: Restore acpi_walk_dep_device_list() getting called after registering the ACPI i2c devs perf: correct SNOOPX field offset sched/features: Fix !CONFIG_JUMP_LABEL case NTB: hw: amd: fix an issue about leak system resources nvmet: fix uninitialized work for zero kato powerpc/powernv/dump: Fix race while processing OPAL dump arm64: dts: zynqmp: Remove additional compatible string for i2c IPs ARM: dts: owl-s500: Fix incorrect PPI interrupt specifiers arm64: dts: qcom: msm8916: Fix MDP/DSI interrupts arm64: dts: qcom: pm8916: Remove invalid reg size from wcd_codec memory: fsl-corenet-cf: Fix handling of platform_get_irq() error memory: omap-gpmc: Fix build error without CONFIG_OF memory: omap-gpmc: Fix a couple off by ones ARM: dts: sun8i: r40: bananapi-m2-ultra: Fix dcdc1 regulator ARM: dts: imx6sl: fix rng node netfilter: nf_fwd_netdev: clear timestamp in forwarding path netfilter: conntrack: connection timeout after re-register 
KVM: x86: emulating RDPID failure shall return #UD rather than #GP Input: sun4i-ps2 - fix handling of platform_get_irq() error Input: twl4030_keypad - fix handling of platform_get_irq() error Input: omap4-keypad - fix handling of platform_get_irq() error Input: ep93xx_keypad - fix handling of platform_get_irq() error Input: stmfts - fix a & vs && typo Input: imx6ul_tsc - clean up some errors in imx6ul_tsc_resume() SUNRPC: fix copying of multiple pages in gss_read_proxy_verf() vfio iommu type1: Fix memory leak in vfio_iommu_type1_pin_pages vfio/pci: Clear token on bypass registration failure ext4: limit entries returned when counting fsmap records svcrdma: fix bounce buffers for unaligned offsets and multiple pages watchdog: sp5100: Fix definition of EFCH_PM_DECODEEN3 watchdog: Use put_device on error watchdog: Fix memleak in watchdog_cdev_register clk: bcm2835: add missing release if devm_clk_hw_register fails clk: at91: clk-main: update key before writing AT91_CKGR_MOR clk: rockchip: Initialize hw to error to avoid undefined behavior pwm: img: Fix null pointer access in probe rpmsg: smd: Fix a kobj leak in in qcom_smd_parse_edge() PCI: iproc: Set affinity mask on MSI interrupts i2c: rcar: Auto select RESET_CONTROLLER mailbox: avoid timer start from callback rapidio: fix the missed put_device() for rio_mport_add_riodev rapidio: fix error handling path ramfs: fix nommu mmap with gaps in the page cache lib/crc32.c: fix trivial typo in preprocessor condition f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info IB/rdmavt: Fix sizeof mismatch cpufreq: powernv: Fix frame-size-overflow in powernv_cpufreq_reboot_notifier powerpc/perf/hv-gpci: Fix starting index value powerpc/perf: Exclude pmc5/6 from the irrelevant PMU group constraints overflow: Include header file with SIZE_MAX declaration kdb: Fix pager search for multi-line strings RDMA/hns: Fix missing sq_sig_type when querying QP RDMA/hns: Set the unsupported wr opcode perf intel-pt: Fix "context_switch 
event has no tid" error RDMA/cma: Consolidate the destruction of a cma_multicast in one place RDMA/cma: Remove dead code for kernel rdmacm multicast powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm powerpc/tau: Disable TAU between measurements powerpc/tau: Check processor type before enabling TAU interrupt ANDROID: GKI: update the ABI xml Linux 4.19.153 powerpc/tau: Remove duplicated set_thresholds() call powerpc/tau: Convert from timer to workqueue powerpc/tau: Use appropriate temperature sample interval RDMA/qedr: Fix inline size returned for iWARP RDMA/qedr: Fix use of uninitialized field xfs: fix high key handling in the rt allocator's query_range function xfs: limit entries returned when counting fsmap records arc: plat-hsdk: fix kconfig dependency warning when !RESET_CONTROLLER ARM: 9007/1: l2c: fix prefetch bits init in L2X0_AUX_CTRL using DT values mtd: mtdoops: Don't write panic data twice powerpc/pseries: explicitly reschedule during drmem_lmb list traversal mtd: lpddr: fix excessive stack usage with clang RDMA/ucma: Add missing locking around rdma_leave_multicast() RDMA/ucma: Fix locking for ctx->events_reported powerpc/icp-hv: Fix missing of_node_put() in success path powerpc/pseries: Fix missing of_node_put() in rng_init() IB/mlx4: Adjust delayed work when a dup is observed IB/mlx4: Fix starvation in paravirt mux/demux mm, oom_adj: don't loop through tasks in __set_oom_adj when not necessary mm/memcg: fix device private memcg accounting netfilter: nf_log: missing vlan offload tag and proto net: korina: fix kfree of rx/tx descriptor array ipvs: clear skb->tstamp in forwarding path mwifiex: fix double free platform/x86: mlx-platform: Remove PSU EEPROM configuration scsi: be2iscsi: Fix a theoretical leak in beiscsi_create_eqs() scsi: target: tcmu: Fix warning: 'page' may be used uninitialized usb: dwc2: Fix INTR OUT transfers in DDMA mode. 
nl80211: fix non-split wiphy information usb: gadget: u_ether: enable qmult on SuperSpeed Plus as well usb: gadget: f_ncm: fix ncm_bitrate for SuperSpeed and above. iwlwifi: mvm: split a print to avoid a WARNING in ROC mfd: sm501: Fix leaks in probe() net: enic: Cure the enic api locking trainwreck qtnfmac: fix resource leaks on unsupported iftype error return path HID: hid-input: fix stylus battery reporting slimbus: qcom-ngd-ctrl: disable ngd in qmi server down callback slimbus: core: do not enter to clock pause mode in core slimbus: core: check get_addr before removing laddr ida quota: clear padding in v2r1_mem2diskdqb() usb: dwc2: Fix parameter type in function pointer prototype ALSA: seq: oss: Avoid mutex lock for a long-time ioctl misc: mic: scif: Fix error handling path ath6kl: wmi: prevent a shift wrapping bug in ath6kl_wmi_delete_pstream_cmd() net: dsa: rtl8366rb: Support all 4096 VLANs net: dsa: rtl8366: Skip PVID setting if not requested net: dsa: rtl8366: Refactor VLAN/PVID init net: dsa: rtl8366: Check validity of passed VLANs cpufreq: armada-37xx: Add missing MODULE_DEVICE_TABLE net: stmmac: use netif_tx_start|stop_all_queues() function net/mlx5: Don't call timecounter cyc2time directly from 1PPS flow pinctrl: mcp23s08: Fix mcp23x17 precious range pinctrl: mcp23s08: Fix mcp23x17_regmap initialiser HID: roccat: add bounds checking in kone_sysfs_write_settings() video: fbdev: radeon: Fix memleak in radeonfb_pci_register video: fbdev: sis: fix null ptr dereference video: fbdev: vga16fb: fix setting of pixclock because a pass-by-value error drivers/virt/fsl_hypervisor: Fix error handling path pwm: lpss: Add range limit check for the base_unit register value pwm: lpss: Fix off by one error in base_unit math in pwm_lpss_prepare() pty: do tty_flip_buffer_push without port->lock in pty_write tty: hvcs: Don't NULL tty->driver_data until hvcs_cleanup() tty: serial: earlycon dependency VMCI: check return value of get_user_pages_fast() for errors backlight: 
sky81452-backlight: Fix refcount imbalance on error scsi: csiostor: Fix wrong return value in csio_hw_prep_fw() scsi: qla2xxx: Fix wrong return value in qla_nvme_register_hba() scsi: qla4xxx: Fix an error handling path in 'qla4xxx_get_host_stats()' drm/gma500: fix error check staging: rtl8192u: Do not use GFP_KERNEL in atomic context mwifiex: Do not use GFP_KERNEL in atomic context brcmfmac: check ndev pointer ASoC: qcom: lpass-cpu: fix concurrency issue ASoC: qcom: lpass-platform: fix memory leak wcn36xx: Fix reported 802.11n rx_highest rate wcn3660/wcn3680 ath10k: Fix the size used in a 'dma_free_coherent()' call in an error handling path ath9k: Fix potential out of bounds in ath9k_htc_txcompletion_cb() ath6kl: prevent potential array overflow in ath6kl_add_new_sta() Bluetooth: hci_uart: Cancel init work before unregistering ath10k: provide survey info as accumulated data spi: spi-s3c64xx: Check return values spi: spi-s3c64xx: swap s3c64xx_spi_set_cs() and s3c64xx_enable_datapath() pinctrl: bcm: fix kconfig dependency warning when !GPIOLIB regulator: resolve supply after creating regulator media: ti-vpe: Fix a missing check and reference count leak media: stm32-dcmi: Fix a reference count leak media: s5p-mfc: Fix a reference count leak media: camss: Fix a reference count leak. media: platform: fcp: Fix a reference count leak. media: rockchip/rga: Fix a reference count leak. media: rcar-vin: Fix a reference count leak. 
media: tc358743: cleanup tc358743_cec_isr media: tc358743: initialize variable media: mx2_emmaprp: Fix memleak in emmaprp_probe cypto: mediatek - fix leaks in mtk_desc_ring_alloc hwmon: (pmbus/max34440) Fix status register reads for MAX344{51,60,61} crypto: omap-sham - fix digcnt register handling with export/import media: omap3isp: Fix memleak in isp_probe media: uvcvideo: Silence shift-out-of-bounds warning media: uvcvideo: Set media controller entity functions media: m5mols: Check function pointer in m5mols_sensor_power media: Revert "media: exynos4-is: Add missed check for pinctrl_lookup_state()" media: tuner-simple: fix regression in simple_set_radio_freq crypto: picoxcell - Fix potential race condition bug crypto: ixp4xx - Fix the size used in a 'dma_free_coherent()' call crypto: mediatek - Fix wrong return value in mtk_desc_ring_alloc() crypto: algif_skcipher - EBUSY on aio should be an error x86/events/amd/iommu: Fix sizeof mismatch x86/nmi: Fix nmi_handle() duration miscalculation drivers/perf: xgene_pmu: Fix uninitialized resource struct x86/fpu: Allow multiple bits in clearcpuid= parameter EDAC/ti: Fix handling of platform_get_irq() error EDAC/i5100: Fix error handling order in i5100_init_one() crypto: algif_aead - Do not set MAY_BACKLOG on the async path ima: Don't ignore errors from crypto_shash_update() KVM: SVM: Initialize prev_ga_tag before use KVM: x86/mmu: Commit zap of remaining invalid pages when recovering lpages cifs: Return the error from crypt_message when enc/dec key not found. 
cifs: remove bogus debug code ALSA: hda/realtek: Enable audio jacks of ASUS D700SA with ALC887 icmp: randomize the global rate limiter r8169: fix operation under forced interrupt threading tcp: fix to update snd_wl1 in bulk receiver fast path nfc: Ensure presence of NFC_ATTR_FIRMWARE_NAME attribute in nfc_genl_fw_download() net/sched: act_tunnel_key: fix OOB write in case of IPv6 ERSPAN tunnels net: hdlc_raw_eth: Clear the IFF_TX_SKB_SHARING flag after calling ether_setup net: hdlc: In hdlc_rcv, check to make sure dev is an HDLC device chelsio/chtls: correct function return and return type chelsio/chtls: correct netdevice for vlan interface chelsio/chtls: fix socket lock ALSA: bebob: potential info leak in hwdep_read() binder: fix UAF when releasing todo list net/tls: sendfile fails with ktls offload r8169: fix data corruption issue on RTL8402 net/ipv4: always honour route mtu during forwarding tipc: fix the skb_unshare() in tipc_buf_append() net: usb: qmi_wwan: add Cellient MPL200 card net/smc: fix valid DMBE buffer sizes net: fix pos incrementment in ipv6_route_seq_next net: fec: Fix PHY init after phy_reset_after_clk_enable() net: fec: Fix phy_device lookup for phy_reset_after_clk_enable() mlx4: handle non-napi callers to napi_poll ipv4: Restore flowi4_oif update before call to xfrm_lookup_route ibmveth: Identify ingress large send packets. ibmveth: Switch order of ibmveth_helper calls. 
ANDROID: clang: update to 11.0.5 FROMLIST: arm64: link with -z norelro regardless of CONFIG_RELOCATABLE ANDROID: GKI: enable CONFIG_WIREGUARD UPSTREAM: wireguard: peerlookup: take lock before checking hash in replace operation UPSTREAM: wireguard: noise: take lock when removing handshake entry from table UPSTREAM: wireguard: queueing: make use of ip_tunnel_parse_protocol UPSTREAM: net: ip_tunnel: add header_ops for layer 3 devices UPSTREAM: wireguard: receive: account for napi_gro_receive never returning GRO_DROP UPSTREAM: wireguard: device: avoid circular netns references UPSTREAM: wireguard: noise: do not assign initiation time in if condition UPSTREAM: wireguard: noise: separate receive counter from send counter UPSTREAM: wireguard: queueing: preserve flow hash across packet scrubbing UPSTREAM: wireguard: noise: read preshared key while taking lock UPSTREAM: wireguard: selftests: use newer iproute2 for gcc-10 UPSTREAM: wireguard: send/receive: use explicit unlikely branch instead of implicit coalescing UPSTREAM: wireguard: selftests: initalize ipv6 members to NULL to squelch clang warning UPSTREAM: wireguard: send/receive: cond_resched() when processing worker ringbuffers UPSTREAM: wireguard: socket: remove errant restriction on looping to self UPSTREAM: wireguard: selftests: use normal kernel stack size on ppc64 UPSTREAM: wireguard: receive: use tunnel helpers for decapsulating ECN markings UPSTREAM: wireguard: queueing: cleanup ptr_ring in error path of packet_queue_init UPSTREAM: wireguard: send: remove errant newline from packet_encrypt_worker UPSTREAM: wireguard: noise: error out precomputed DH during handshake rather than config UPSTREAM: wireguard: receive: remove dead code from default packet type case UPSTREAM: wireguard: queueing: account for skb->protocol==0 UPSTREAM: wireguard: selftests: remove duplicated include <sys/types.h> UPSTREAM: wireguard: socket: remove extra call to synchronize_net UPSTREAM: wireguard: send: account for mtu=0 devices 
UPSTREAM: wireguard: receive: reset last_under_load to zero UPSTREAM: wireguard: selftests: reduce complexity and fix make races UPSTREAM: wireguard: device: use icmp_ndo_send helper UPSTREAM: wireguard: selftests: tie socket waiting to target pid UPSTREAM: wireguard: selftests: ensure non-addition of peers with failed precomputation UPSTREAM: wireguard: noise: reject peers with low order public keys UPSTREAM: wireguard: allowedips: fix use-after-free in root_remove_peer_lists UPSTREAM: net: skbuff: disambiguate argument and member for skb_list_walk_safe helper UPSTREAM: net: introduce skb_list_walk_safe for skb segment walking UPSTREAM: wireguard: socket: mark skbs as not on list when receiving via gro UPSTREAM: wireguard: queueing: do not account for pfmemalloc when clearing skb header UPSTREAM: wireguard: selftests: remove ancient kernel compatibility code UPSTREAM: wireguard: allowedips: use kfree_rcu() instead of call_rcu() UPSTREAM: wireguard: main: remove unused include <linux/version.h> UPSTREAM: wireguard: global: fix spelling mistakes in comments UPSTREAM: wireguard: Kconfig: select parent dependency for crypto UPSTREAM: wireguard: selftests: import harness makefile for test suite UPSTREAM: net: WireGuard secure network tunnel UPSTREAM: timekeeping: Boot should be boottime for coarse ns accessor UPSTREAM: timekeeping: Add missing _ns functions for coarse accessors UPSTREAM: icmp: introduce helper for nat'd source address in network device context UPSTREAM: crypto: poly1305-x86_64 - Use XORL r32,32 UPSTREAM: crypto: curve25519-x86_64 - Use XORL r32,32 UPSTREAM: crypto: arm/poly1305 - Add prototype for poly1305_blocks_neon UPSTREAM: crypto: arm/curve25519 - include <linux/scatterlist.h> UPSTREAM: crypto: x86/curve25519 - Remove unused carry variables UPSTREAM: crypto: x86/chacha-sse3 - use unaligned loads for state array UPSTREAM: crypto: lib/chacha20poly1305 - Add missing function declaration UPSTREAM: crypto: arch/lib - limit simd usage to 4k chunks 
UPSTREAM: crypto: arm[64]/poly1305 - add artifact to .gitignore files UPSTREAM: crypto: x86/curve25519 - leave r12 as spare register UPSTREAM: crypto: x86/curve25519 - replace with formally verified implementation UPSTREAM: crypto: arm64/chacha - correctly walk through blocks UPSTREAM: crypto: x86/curve25519 - support assemblers with no adx support UPSTREAM: crypto: chacha20poly1305 - prevent integer overflow on large input UPSTREAM: crypto: Kconfig - allow tests to be disabled when manager is disabled UPSTREAM: crypto: arm/chacha - fix build failured when kernel mode NEON is disabled UPSTREAM: crypto: x86/poly1305 - emit does base conversion itself UPSTREAM: crypto: chacha20poly1305 - add back missing test vectors and test chunking UPSTREAM: crypto: x86/poly1305 - fix .gitignore typo UPSTREAM: crypto: curve25519 - Fix selftest build error UPSTREAM: crypto: {arm,arm64,mips}/poly1305 - remove redundant non-reduction from emit UPSTREAM: crypto: x86/poly1305 - wire up faster implementations for kernel UPSTREAM: crypto: x86/poly1305 - import unmodified cryptogams implementation UPSTREAM: crypto: poly1305 - add new 32 and 64-bit generic versions UPSTREAM: crypto: lib/curve25519 - re-add selftests UPSTREAM: crypto: arm/curve25519 - add arch-specific key generation function UPSTREAM: crypto: chacha - fix warning message in header file UPSTREAM: crypto: arch - conditionalize crypto api in arch glue for lib code UPSTREAM: crypto: lib/chacha20poly1305 - use chacha20_crypt() UPSTREAM: crypto: x86/chacha - only unregister algorithms if registered UPSTREAM: crypto: chacha_generic - remove unnecessary setkey() functions UPSTREAM: crypto: lib/chacha20poly1305 - reimplement crypt_from_sg() routine UPSTREAM: crypto: chacha20poly1305 - import construction and selftest from Zinc UPSTREAM: crypto: arm/curve25519 - wire up NEON implementation UPSTREAM: crypto: arm/curve25519 - import Bernstein and Schwabe's Curve25519 ARM implementation UPSTREAM: crypto: curve25519 - x86_64 library and 
KPP implementations UPSTREAM: crypto: lib/curve25519 - work around Clang stack spilling issue UPSTREAM: crypto: curve25519 - implement generic KPP driver UPSTREAM: crypto: curve25519 - add kpp selftest UPSTREAM: crypto: curve25519 - generic C library implementations UPSTREAM: crypto: blake2s - x86_64 SIMD implementation UPSTREAM: crypto: blake2s - implement generic shash driver UPSTREAM: crypto: testmgr - add test cases for Blake2s UPSTREAM: crypto: blake2s - generic C library implementation and selftest UPSTREAM: crypto: mips/poly1305 - incorporate OpenSSL/CRYPTOGAMS optimized implementation UPSTREAM: crypto: arm/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation UPSTREAM: crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation UPSTREAM: crypto: x86/poly1305 - expose existing driver as poly1305 library UPSTREAM: crypto: x86/poly1305 - depend on generic library not generic shash UPSTREAM: crypto: poly1305 - expose init/update/final library interface UPSTREAM: crypto: x86/poly1305 - unify Poly1305 state struct with generic code UPSTREAM: crypto: poly1305 - move core routines into a separate library UPSTREAM: crypto: chacha - unexport chacha_generic routines UPSTREAM: crypto: mips/chacha - wire up accelerated 32r2 code from Zinc UPSTREAM: crypto: mips/chacha - import 32r2 ChaCha code from Zinc UPSTREAM: crypto: arm/chacha - expose ARM ChaCha routine as library function UPSTREAM: crypto: arm/chacha - remove dependency on generic ChaCha driver UPSTREAM: crypto: arm/chacha - import Eric Biggers's scalar accelerated ChaCha code UPSTREAM: crypto: arm64/chacha - expose arm64 ChaCha routine as library function UPSTREAM: crypto: arm64/chacha - depend on generic chacha library instead of crypto driver UPSTREAM: crypto: arm64/chacha - use combined SIMD/ALU routine for more speed UPSTREAM: crypto: arm64/chacha - optimize for arbitrary length inputs UPSTREAM: crypto: x86/chacha - expose SIMD ChaCha routine as library function UPSTREAM: crypto: 
x86/chacha - depend on generic chacha library instead of crypto driver UPSTREAM: crypto: chacha - move existing library code into lib/crypto UPSTREAM: crypto: lib - tidy up lib/crypto Kconfig and Makefile UPSTREAM: crypto: chacha - constify ctx and iv arguments UPSTREAM: crypto: x86/poly1305 - Clear key material from stack in SSE2 variant UPSTREAM: crypto: xchacha20 - fix comments for test vectors UPSTREAM: crypto: xchacha - add test vector from XChaCha20 draft RFC UPSTREAM: crypto: arm64/chacha - add XChaCha12 support UPSTREAM: crypto: arm64/chacha20 - refactor to allow varying number of rounds UPSTREAM: crypto: arm64/chacha20 - add XChaCha20 support UPSTREAM: crypto: x86/chacha - avoid sleeping under kernel_fpu_begin() UPSTREAM: crypto: x86/chacha - yield the FPU occasionally UPSTREAM: crypto: x86/chacha - add XChaCha12 support UPSTREAM: crypto: x86/chacha20 - refactor to allow varying number of rounds UPSTREAM: crypto: x86/chacha20 - add XChaCha20 support UPSTREAM: crypto: x86/chacha20 - Add a 4-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 2-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 8-block AVX-512VL variant UPSTREAM: crypto: x86/chacha20 - Add a 4-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Add a 2-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Use larger block functions more aggressively UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 8-block AVX2 variant UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 4-block SSSE3 variant UPSTREAM: crypto: x86/chacha20 - Support partial lengths in 1-block SSSE3 variant ANDROID: GKI: Enable CONFIG_USB_ANNOUNCE_NEW_DEVICES ANDROID: GKI: Enable CONFIG_X86_X2APIC ANDROID: move builds to use gas prebuilts UPSTREAM: binder: fix UAF when releasing todo list Conflicts: crypto/algif_aead.c drivers/rpmsg/qcom_glink_native.c drivers/scsi/ufs/ufs-qcom.c drivers/slimbus/qcom-ngd-ctrl.c fs/notify/inotify/inotify_user.c include/linux/dcache.h 
include/linux/fsnotify.h mm/oom_kill.c Fixed build errors: fs/fuse/dir.c Change-Id: I95bdbb1b183fa2c569023f18e09799d9cb96fc9f Signed-off-by: Srinivasarao P <spathi@codeaurora.org>
This commit is contained in:
commit
20912a8acc
662 changed files with 51036 additions and 5409 deletions
|
@ -566,7 +566,7 @@
|
||||||
loops can be debugged more effectively on production
|
loops can be debugged more effectively on production
|
||||||
systems.
|
systems.
|
||||||
|
|
||||||
clearcpuid=BITNUM [X86]
|
clearcpuid=BITNUM[,BITNUM...] [X86]
|
||||||
Disable CPUID feature X for the kernel. See
|
Disable CPUID feature X for the kernel. See
|
||||||
arch/x86/include/asm/cpufeatures.h for the valid bit
|
arch/x86/include/asm/cpufeatures.h for the valid bit
|
||||||
numbers. Note the Linux specific bits are not necessarily
|
numbers. Note the Linux specific bits are not necessarily
|
||||||
|
@ -5302,6 +5302,14 @@
|
||||||
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
with /sys/devices/system/xen_memory/xen_memory0/scrub_pages.
|
||||||
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT.
|
||||||
|
|
||||||
|
xen.event_eoi_delay= [XEN]
|
||||||
|
How long to delay EOI handling in case of event
|
||||||
|
storms (jiffies). Default is 10.
|
||||||
|
|
||||||
|
xen.event_loop_timeout= [XEN]
|
||||||
|
After which time (jiffies) the event handling loop
|
||||||
|
should start to delay EOI handling. Default is 2.
|
||||||
|
|
||||||
xirc2ps_cs= [NET,PCMCIA]
|
xirc2ps_cs= [NET,PCMCIA]
|
||||||
Format:
|
Format:
|
||||||
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
|
||||||
|
|
|
@ -99,16 +99,20 @@ Coarse and fast_ns access
|
||||||
|
|
||||||
Some additional variants exist for more specialized cases:
|
Some additional variants exist for more specialized cases:
|
||||||
|
|
||||||
.. c:function:: ktime_t ktime_get_coarse_boottime( void )
|
.. c:function:: ktime_t ktime_get_coarse( void )
|
||||||
|
ktime_t ktime_get_coarse_boottime( void )
|
||||||
ktime_t ktime_get_coarse_real( void )
|
ktime_t ktime_get_coarse_real( void )
|
||||||
ktime_t ktime_get_coarse_clocktai( void )
|
ktime_t ktime_get_coarse_clocktai( void )
|
||||||
ktime_t ktime_get_coarse_raw( void )
|
|
||||||
|
.. c:function:: u64 ktime_get_coarse_ns( void )
|
||||||
|
u64 ktime_get_coarse_boottime_ns( void )
|
||||||
|
u64 ktime_get_coarse_real_ns( void )
|
||||||
|
u64 ktime_get_coarse_clocktai_ns( void )
|
||||||
|
|
||||||
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
|
.. c:function:: void ktime_get_coarse_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
|
void ktime_get_coarse_boottime_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_real_ts64( struct timespec64 * )
|
void ktime_get_coarse_real_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
|
void ktime_get_coarse_clocktai_ts64( struct timespec64 * )
|
||||||
void ktime_get_coarse_raw_ts64( struct timespec64 * )
|
|
||||||
|
|
||||||
These are quicker than the non-coarse versions, but less accurate,
|
These are quicker than the non-coarse versions, but less accurate,
|
||||||
corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
|
corresponding to CLOCK_MONONOTNIC_COARSE and CLOCK_REALTIME_COARSE
|
||||||
|
|
|
@ -29,8 +29,7 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
:c:type:`v4l2_hsv_encoding` specifies which encoding is used.
|
:c:type:`v4l2_hsv_encoding` specifies which encoding is used.
|
||||||
|
|
||||||
.. note:: The default R'G'B' quantization is full range for all
|
.. note:: The default R'G'B' quantization is full range for all
|
||||||
colorspaces except for BT.2020 which uses limited range R'G'B'
|
colorspaces. HSV formats are always full range.
|
||||||
quantization.
|
|
||||||
|
|
||||||
.. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
|
.. tabularcolumns:: |p{6.0cm}|p{11.5cm}|
|
||||||
|
|
||||||
|
@ -162,8 +161,8 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
- Details
|
- Details
|
||||||
* - ``V4L2_QUANTIZATION_DEFAULT``
|
* - ``V4L2_QUANTIZATION_DEFAULT``
|
||||||
- Use the default quantization encoding as defined by the
|
- Use the default quantization encoding as defined by the
|
||||||
colorspace. This is always full range for R'G'B' (except for the
|
colorspace. This is always full range for R'G'B' and HSV.
|
||||||
BT.2020 colorspace) and HSV. It is usually limited range for Y'CbCr.
|
It is usually limited range for Y'CbCr.
|
||||||
* - ``V4L2_QUANTIZATION_FULL_RANGE``
|
* - ``V4L2_QUANTIZATION_FULL_RANGE``
|
||||||
- Use the full range quantization encoding. I.e. the range [0…1] is
|
- Use the full range quantization encoding. I.e. the range [0…1] is
|
||||||
mapped to [0…255] (with possible clipping to [1…254] to avoid the
|
mapped to [0…255] (with possible clipping to [1…254] to avoid the
|
||||||
|
@ -173,4 +172,4 @@ whole range, 0-255, dividing the angular value by 1.41. The enum
|
||||||
* - ``V4L2_QUANTIZATION_LIM_RANGE``
|
* - ``V4L2_QUANTIZATION_LIM_RANGE``
|
||||||
- Use the limited range quantization encoding. I.e. the range [0…1]
|
- Use the limited range quantization encoding. I.e. the range [0…1]
|
||||||
is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
|
is mapped to [16…235]. Cb and Cr are mapped from [-0.5…0.5] to
|
||||||
[16…240].
|
[16…240]. Limited Range cannot be used with HSV.
|
||||||
|
|
|
@ -370,9 +370,8 @@ Colorspace BT.2020 (V4L2_COLORSPACE_BT2020)
|
||||||
The :ref:`itu2020` standard defines the colorspace used by Ultra-high
|
The :ref:`itu2020` standard defines the colorspace used by Ultra-high
|
||||||
definition television (UHDTV). The default transfer function is
|
definition television (UHDTV). The default transfer function is
|
||||||
``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
|
``V4L2_XFER_FUNC_709``. The default Y'CbCr encoding is
|
||||||
``V4L2_YCBCR_ENC_BT2020``. The default R'G'B' quantization is limited
|
``V4L2_YCBCR_ENC_BT2020``. The default Y'CbCr quantization is limited range.
|
||||||
range (!), and so is the default Y'CbCr quantization. The chromaticities
|
The chromaticities of the primary colors and the white reference are:
|
||||||
of the primary colors and the white reference are:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -949,12 +949,14 @@ icmp_ratelimit - INTEGER
|
||||||
icmp_msgs_per_sec - INTEGER
|
icmp_msgs_per_sec - INTEGER
|
||||||
Limit maximal number of ICMP packets sent per second from this host.
|
Limit maximal number of ICMP packets sent per second from this host.
|
||||||
Only messages whose type matches icmp_ratemask (see below) are
|
Only messages whose type matches icmp_ratemask (see below) are
|
||||||
controlled by this limit.
|
controlled by this limit. For security reasons, the precise count
|
||||||
|
of messages per second is randomized.
|
||||||
Default: 1000
|
Default: 1000
|
||||||
|
|
||||||
icmp_msgs_burst - INTEGER
|
icmp_msgs_burst - INTEGER
|
||||||
icmp_msgs_per_sec controls number of ICMP packets sent per second,
|
icmp_msgs_per_sec controls number of ICMP packets sent per second,
|
||||||
while icmp_msgs_burst controls the burst size of these packets.
|
while icmp_msgs_burst controls the burst size of these packets.
|
||||||
|
For security reasons, the precise burst size is randomized.
|
||||||
Default: 50
|
Default: 50
|
||||||
|
|
||||||
icmp_ratemask - INTEGER
|
icmp_ratemask - INTEGER
|
||||||
|
|
|
@ -3907,6 +3907,7 @@ F: crypto/
|
||||||
F: drivers/crypto/
|
F: drivers/crypto/
|
||||||
F: include/crypto/
|
F: include/crypto/
|
||||||
F: include/linux/crypto*
|
F: include/linux/crypto*
|
||||||
|
F: lib/crypto/
|
||||||
|
|
||||||
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
|
CRYPTOGRAPHIC RANDOM NUMBER GENERATOR
|
||||||
M: Neil Horman <nhorman@tuxdriver.com>
|
M: Neil Horman <nhorman@tuxdriver.com>
|
||||||
|
@ -15890,6 +15891,14 @@ L: linux-gpio@vger.kernel.org
|
||||||
S: Maintained
|
S: Maintained
|
||||||
F: drivers/gpio/gpio-ws16c48.c
|
F: drivers/gpio/gpio-ws16c48.c
|
||||||
|
|
||||||
|
WIREGUARD SECURE NETWORK TUNNEL
|
||||||
|
M: Jason A. Donenfeld <Jason@zx2c4.com>
|
||||||
|
S: Maintained
|
||||||
|
F: drivers/net/wireguard/
|
||||||
|
F: tools/testing/selftests/wireguard/
|
||||||
|
L: wireguard@lists.zx2c4.com
|
||||||
|
L: netdev@vger.kernel.org
|
||||||
|
|
||||||
WISTRON LAPTOP BUTTON DRIVER
|
WISTRON LAPTOP BUTTON DRIVER
|
||||||
M: Miloslav Trmac <mitr@volny.cz>
|
M: Miloslav Trmac <mitr@volny.cz>
|
||||||
S: Maintained
|
S: Maintained
|
||||||
|
|
8
Makefile
8
Makefile
|
@ -1,7 +1,7 @@
|
||||||
# SPDX-License-Identifier: GPL-2.0
|
# SPDX-License-Identifier: GPL-2.0
|
||||||
VERSION = 4
|
VERSION = 4
|
||||||
PATCHLEVEL = 19
|
PATCHLEVEL = 19
|
||||||
SUBLEVEL = 152
|
SUBLEVEL = 157
|
||||||
EXTRAVERSION =
|
EXTRAVERSION =
|
||||||
NAME = "People's Front"
|
NAME = "People's Front"
|
||||||
|
|
||||||
|
@ -505,11 +505,7 @@ endif
|
||||||
|
|
||||||
ifeq ($(cc-name),clang)
|
ifeq ($(cc-name),clang)
|
||||||
ifneq ($(CROSS_COMPILE),)
|
ifneq ($(CROSS_COMPILE),)
|
||||||
CLANG_TRIPLE ?= $(CROSS_COMPILE)
|
CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
|
||||||
CLANG_FLAGS += --target=$(notdir $(CLANG_TRIPLE:%-=%))
|
|
||||||
ifeq ($(shell $(srctree)/scripts/clang-android.sh $(CC) $(CLANG_FLAGS)), y)
|
|
||||||
$(error "Clang with Android --target detected. Did you specify CLANG_TRIPLE?")
|
|
||||||
endif
|
|
||||||
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
|
GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
|
||||||
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
|
CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
|
||||||
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
|
GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -2348,6 +2348,7 @@
|
||||||
__sock_recv_ts_and_drops
|
__sock_recv_ts_and_drops
|
||||||
sock_wake_async
|
sock_wake_async
|
||||||
sock_wfree
|
sock_wfree
|
||||||
|
timer_reduce
|
||||||
unregister_net_sysctl_table
|
unregister_net_sysctl_table
|
||||||
__wake_up_sync_key
|
__wake_up_sync_key
|
||||||
__xfrm_policy_check
|
__xfrm_policy_check
|
||||||
|
|
|
@ -366,6 +366,13 @@ config HAVE_RCU_TABLE_FREE
|
||||||
config HAVE_RCU_TABLE_INVALIDATE
|
config HAVE_RCU_TABLE_INVALIDATE
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
|
||||||
|
bool
|
||||||
|
help
|
||||||
|
Temporary select until all architectures can be converted to have
|
||||||
|
irqs disabled over activate_mm. Architectures that do IPI based TLB
|
||||||
|
shootdowns should enable this.
|
||||||
|
|
||||||
config ARCH_HAVE_NMI_SAFE_CMPXCHG
|
config ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
|
|
@ -156,6 +156,7 @@ END(EV_Extension)
|
||||||
tracesys:
|
tracesys:
|
||||||
; save EFA in case tracer wants the PC of traced task
|
; save EFA in case tracer wants the PC of traced task
|
||||||
; using ERET won't work since next-PC has already committed
|
; using ERET won't work since next-PC has already committed
|
||||||
|
lr r12, [efa]
|
||||||
GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11
|
GET_CURR_TASK_FIELD_PTR TASK_THREAD, r11
|
||||||
st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address
|
st r12, [r11, THREAD_FAULT_ADDR] ; thread.fault_address
|
||||||
|
|
||||||
|
@ -198,9 +199,15 @@ tracesys_exit:
|
||||||
; Breakpoint TRAP
|
; Breakpoint TRAP
|
||||||
; ---------------------------------------------
|
; ---------------------------------------------
|
||||||
trap_with_param:
|
trap_with_param:
|
||||||
mov r0, r12 ; EFA in case ptracer/gdb wants stop_pc
|
|
||||||
|
; stop_pc info by gdb needs this info
|
||||||
|
lr r0, [efa]
|
||||||
mov r1, sp
|
mov r1, sp
|
||||||
|
|
||||||
|
; Now that we have read EFA, it is safe to do "fake" rtie
|
||||||
|
; and get out of CPU exception mode
|
||||||
|
FAKE_RET_FROM_EXCPN
|
||||||
|
|
||||||
; Save callee regs in case gdb wants to have a look
|
; Save callee regs in case gdb wants to have a look
|
||||||
; SP will grow up by size of CALLEE Reg-File
|
; SP will grow up by size of CALLEE Reg-File
|
||||||
; NOTE: clobbers r12
|
; NOTE: clobbers r12
|
||||||
|
@ -227,10 +234,6 @@ ENTRY(EV_Trap)
|
||||||
|
|
||||||
EXCEPTION_PROLOGUE
|
EXCEPTION_PROLOGUE
|
||||||
|
|
||||||
lr r12, [efa]
|
|
||||||
|
|
||||||
FAKE_RET_FROM_EXCPN
|
|
||||||
|
|
||||||
;============ TRAP 1 :breakpoints
|
;============ TRAP 1 :breakpoints
|
||||||
; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
|
; Check ECR for trap with arg (PROLOGUE ensures r9 has ECR)
|
||||||
bmsk.f 0, r9, 7
|
bmsk.f 0, r9, 7
|
||||||
|
@ -238,6 +241,9 @@ ENTRY(EV_Trap)
|
||||||
|
|
||||||
;============ TRAP (no param): syscall top level
|
;============ TRAP (no param): syscall top level
|
||||||
|
|
||||||
|
; First return from Exception to pure K mode (Exception/IRQs renabled)
|
||||||
|
FAKE_RET_FROM_EXCPN
|
||||||
|
|
||||||
; If syscall tracing ongoing, invoke pre-post-hooks
|
; If syscall tracing ongoing, invoke pre-post-hooks
|
||||||
GET_CURR_THR_INFO_FLAGS r10
|
GET_CURR_THR_INFO_FLAGS r10
|
||||||
btst r10, TIF_SYSCALL_TRACE
|
btst r10, TIF_SYSCALL_TRACE
|
||||||
|
|
|
@ -115,7 +115,7 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
|
||||||
int (*consumer_fn) (unsigned int, void *), void *arg)
|
int (*consumer_fn) (unsigned int, void *), void *arg)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_ARC_DW2_UNWIND
|
#ifdef CONFIG_ARC_DW2_UNWIND
|
||||||
int ret = 0;
|
int ret = 0, cnt = 0;
|
||||||
unsigned int address;
|
unsigned int address;
|
||||||
struct unwind_frame_info frame_info;
|
struct unwind_frame_info frame_info;
|
||||||
|
|
||||||
|
@ -135,6 +135,11 @@ arc_unwind_core(struct task_struct *tsk, struct pt_regs *regs,
|
||||||
break;
|
break;
|
||||||
|
|
||||||
frame_info.regs.r63 = frame_info.regs.r31;
|
frame_info.regs.r63 = frame_info.regs.r31;
|
||||||
|
|
||||||
|
if (cnt++ > 128) {
|
||||||
|
printk("unwinder looping too long, aborting !\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return address; /* return the last address it saw */
|
return address; /* return the last address it saw */
|
||||||
|
|
|
@ -11,5 +11,6 @@ menuconfig ARC_SOC_HSDK
|
||||||
select ARC_HAS_ACCL_REGS
|
select ARC_HAS_ACCL_REGS
|
||||||
select ARC_IRQ_NO_AUTOSAVE
|
select ARC_IRQ_NO_AUTOSAVE
|
||||||
select CLK_HSDK
|
select CLK_HSDK
|
||||||
|
select RESET_CONTROLLER
|
||||||
select RESET_HSDK
|
select RESET_HSDK
|
||||||
select MIGHT_HAVE_PCI
|
select MIGHT_HAVE_PCI
|
||||||
|
|
|
@ -622,8 +622,10 @@ config ARCH_S3C24XX
|
||||||
select HAVE_S3C2410_WATCHDOG if WATCHDOG
|
select HAVE_S3C2410_WATCHDOG if WATCHDOG
|
||||||
select HAVE_S3C_RTC if RTC_CLASS
|
select HAVE_S3C_RTC if RTC_CLASS
|
||||||
select NEED_MACH_IO_H
|
select NEED_MACH_IO_H
|
||||||
|
select S3C2410_WATCHDOG
|
||||||
select SAMSUNG_ATAGS
|
select SAMSUNG_ATAGS
|
||||||
select USE_OF
|
select USE_OF
|
||||||
|
select WATCHDOG
|
||||||
help
|
help
|
||||||
Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
|
Samsung S3C2410, S3C2412, S3C2413, S3C2416, S3C2440, S3C2442, S3C2443
|
||||||
and S3C2450 SoCs based systems, such as the Simtec Electronics BAST
|
and S3C2450 SoCs based systems, such as the Simtec Electronics BAST
|
||||||
|
|
|
@ -922,8 +922,10 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
rngb: rngb@21b4000 {
|
rngb: rngb@21b4000 {
|
||||||
|
compatible = "fsl,imx6sl-rngb", "fsl,imx25-rngb";
|
||||||
reg = <0x021b4000 0x4000>;
|
reg = <0x021b4000 0x4000>;
|
||||||
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
|
interrupts = <0 5 IRQ_TYPE_LEVEL_HIGH>;
|
||||||
|
clocks = <&clks IMX6SL_CLK_DUMMY>;
|
||||||
};
|
};
|
||||||
|
|
||||||
weim: weim@21b8000 {
|
weim: weim@21b8000 {
|
||||||
|
|
|
@ -192,6 +192,7 @@
|
||||||
fixed-link {
|
fixed-link {
|
||||||
speed = <1000>;
|
speed = <1000>;
|
||||||
full-duplex;
|
full-duplex;
|
||||||
|
pause;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
|
@ -516,7 +516,7 @@
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
target-module@56000000 {
|
sgx_module: target-module@56000000 {
|
||||||
compatible = "ti,sysc-omap4", "ti,sysc";
|
compatible = "ti,sysc-omap4", "ti,sysc";
|
||||||
ti,hwmods = "gpu";
|
ti,hwmods = "gpu";
|
||||||
reg = <0x5601fc00 0x4>,
|
reg = <0x5601fc00 0x4>,
|
||||||
|
|
|
@ -74,3 +74,13 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
/include/ "omap443x-clocks.dtsi"
|
/include/ "omap443x-clocks.dtsi"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Use dpll_per for sgx at 153.6MHz like droid4 stock v3.0.8 Android kernel
|
||||||
|
*/
|
||||||
|
&sgx_module {
|
||||||
|
assigned-clocks = <&l3_gfx_clkctrl OMAP4_GPU_CLKCTRL 24>,
|
||||||
|
<&dpll_per_m7x2_ck>;
|
||||||
|
assigned-clock-rates = <0>, <153600000>;
|
||||||
|
assigned-clock-parents = <&dpll_per_m7x2_ck>;
|
||||||
|
};
|
||||||
|
|
|
@ -85,21 +85,21 @@
|
||||||
global_timer: timer@b0020200 {
|
global_timer: timer@b0020200 {
|
||||||
compatible = "arm,cortex-a9-global-timer";
|
compatible = "arm,cortex-a9-global-timer";
|
||||||
reg = <0xb0020200 0x100>;
|
reg = <0xb0020200 0x100>;
|
||||||
interrupts = <GIC_PPI 0 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
twd_timer: timer@b0020600 {
|
twd_timer: timer@b0020600 {
|
||||||
compatible = "arm,cortex-a9-twd-timer";
|
compatible = "arm,cortex-a9-twd-timer";
|
||||||
reg = <0xb0020600 0x20>;
|
reg = <0xb0020600 0x20>;
|
||||||
interrupts = <GIC_PPI 2 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
twd_wdt: wdt@b0020620 {
|
twd_wdt: wdt@b0020620 {
|
||||||
compatible = "arm,cortex-a9-twd-wdt";
|
compatible = "arm,cortex-a9-twd-wdt";
|
||||||
reg = <0xb0020620 0xe0>;
|
reg = <0xb0020620 0xe0>;
|
||||||
interrupts = <GIC_PPI 3 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
interrupts = <GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_EDGE_RISING)>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -98,20 +98,17 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
clocks: clock-controller@e0100000 {
|
clocks: clock-controller@e0100000 {
|
||||||
compatible = "samsung,s5pv210-clock", "simple-bus";
|
compatible = "samsung,s5pv210-clock";
|
||||||
reg = <0xe0100000 0x10000>;
|
reg = <0xe0100000 0x10000>;
|
||||||
clock-names = "xxti", "xusbxti";
|
clock-names = "xxti", "xusbxti";
|
||||||
clocks = <&xxti>, <&xusbxti>;
|
clocks = <&xxti>, <&xusbxti>;
|
||||||
#clock-cells = <1>;
|
#clock-cells = <1>;
|
||||||
#address-cells = <1>;
|
};
|
||||||
#size-cells = <1>;
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
pmu_syscon: syscon@e0108000 {
|
pmu_syscon: syscon@e0108000 {
|
||||||
compatible = "samsung-s5pv210-pmu", "syscon";
|
compatible = "samsung-s5pv210-pmu", "syscon";
|
||||||
reg = <0xe0108000 0x8000>;
|
reg = <0xe0108000 0x8000>;
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
pinctrl0: pinctrl@e0200000 {
|
pinctrl0: pinctrl@e0200000 {
|
||||||
compatible = "samsung,s5pv210-pinctrl";
|
compatible = "samsung,s5pv210-pinctrl";
|
||||||
|
@ -126,12 +123,6 @@
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
amba {
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
compatible = "simple-bus";
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
pdma0: dma@e0900000 {
|
pdma0: dma@e0900000 {
|
||||||
compatible = "arm,pl330", "arm,primecell";
|
compatible = "arm,pl330", "arm,primecell";
|
||||||
reg = <0xe0900000 0x1000>;
|
reg = <0xe0900000 0x1000>;
|
||||||
|
@ -155,7 +146,6 @@
|
||||||
#dma-channels = <8>;
|
#dma-channels = <8>;
|
||||||
#dma-requests = <32>;
|
#dma-requests = <32>;
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
spi0: spi@e1300000 {
|
spi0: spi@e1300000 {
|
||||||
compatible = "samsung,s5pv210-spi";
|
compatible = "samsung,s5pv210-spi";
|
||||||
|
@ -227,12 +217,6 @@
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
|
|
||||||
audio-subsystem {
|
|
||||||
compatible = "samsung,s5pv210-audss", "simple-bus";
|
|
||||||
#address-cells = <1>;
|
|
||||||
#size-cells = <1>;
|
|
||||||
ranges;
|
|
||||||
|
|
||||||
clk_audss: clock-controller@eee10000 {
|
clk_audss: clock-controller@eee10000 {
|
||||||
compatible = "samsung,s5pv210-audss-clock";
|
compatible = "samsung,s5pv210-audss-clock";
|
||||||
reg = <0xeee10000 0x1000>;
|
reg = <0xeee10000 0x1000>;
|
||||||
|
@ -264,7 +248,6 @@
|
||||||
#sound-dai-cells = <0>;
|
#sound-dai-cells = <0>;
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
};
|
};
|
||||||
};
|
|
||||||
|
|
||||||
i2s1: i2s@e2100000 {
|
i2s1: i2s@e2100000 {
|
||||||
compatible = "samsung,s3c6410-i2s";
|
compatible = "samsung,s3c6410-i2s";
|
||||||
|
|
|
@ -143,7 +143,7 @@
|
||||||
trips {
|
trips {
|
||||||
cpu_alert0: cpu-alert0 {
|
cpu_alert0: cpu-alert0 {
|
||||||
/* milliCelsius */
|
/* milliCelsius */
|
||||||
temperature = <850000>;
|
temperature = <85000>;
|
||||||
hysteresis = <2000>;
|
hysteresis = <2000>;
|
||||||
type = "passive";
|
type = "passive";
|
||||||
};
|
};
|
||||||
|
|
|
@ -206,16 +206,16 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dc1sw {
|
®_dc1sw {
|
||||||
regulator-min-microvolt = <3000000>;
|
regulator-min-microvolt = <3300000>;
|
||||||
regulator-max-microvolt = <3000000>;
|
regulator-max-microvolt = <3300000>;
|
||||||
regulator-name = "vcc-gmac-phy";
|
regulator-name = "vcc-gmac-phy";
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dcdc1 {
|
®_dcdc1 {
|
||||||
regulator-always-on;
|
regulator-always-on;
|
||||||
regulator-min-microvolt = <3000000>;
|
regulator-min-microvolt = <3300000>;
|
||||||
regulator-max-microvolt = <3000000>;
|
regulator-max-microvolt = <3300000>;
|
||||||
regulator-name = "vcc-3v0";
|
regulator-name = "vcc-3v3";
|
||||||
};
|
};
|
||||||
|
|
||||||
®_dcdc2 {
|
®_dcdc2 {
|
||||||
|
|
1
arch/arm/crypto/.gitignore
vendored
1
arch/arm/crypto/.gitignore
vendored
|
@ -1,3 +1,4 @@
|
||||||
aesbs-core.S
|
aesbs-core.S
|
||||||
sha256-core.S
|
sha256-core.S
|
||||||
sha512-core.S
|
sha512-core.S
|
||||||
|
poly1305-core.S
|
||||||
|
|
|
@ -125,14 +125,24 @@ config CRYPTO_CRC32_ARM_CE
|
||||||
select CRYPTO_HASH
|
select CRYPTO_HASH
|
||||||
|
|
||||||
config CRYPTO_CHACHA20_NEON
|
config CRYPTO_CHACHA20_NEON
|
||||||
tristate "NEON accelerated ChaCha stream cipher algorithms"
|
tristate "NEON and scalar accelerated ChaCha stream cipher algorithms"
|
||||||
depends on KERNEL_MODE_NEON
|
|
||||||
select CRYPTO_BLKCIPHER
|
select CRYPTO_BLKCIPHER
|
||||||
select CRYPTO_CHACHA20
|
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||||
|
|
||||||
|
config CRYPTO_POLY1305_ARM
|
||||||
|
tristate "Accelerated scalar and SIMD Poly1305 hash implementations"
|
||||||
|
select CRYPTO_HASH
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||||
|
|
||||||
config CRYPTO_NHPOLY1305_NEON
|
config CRYPTO_NHPOLY1305_NEON
|
||||||
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
|
tristate "NEON accelerated NHPoly1305 hash function (for Adiantum)"
|
||||||
depends on KERNEL_MODE_NEON
|
depends on KERNEL_MODE_NEON
|
||||||
select CRYPTO_NHPOLY1305
|
select CRYPTO_NHPOLY1305
|
||||||
|
|
||||||
|
config CRYPTO_CURVE25519_NEON
|
||||||
|
tristate "NEON accelerated Curve25519 scalar multiplication library"
|
||||||
|
depends on KERNEL_MODE_NEON
|
||||||
|
select CRYPTO_LIB_CURVE25519_GENERIC
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_CURVE25519
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -10,7 +10,9 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
|
||||||
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
|
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
|
||||||
|
obj-$(CONFIG_CRYPTO_CURVE25519_NEON) += curve25519-neon.o
|
||||||
|
|
||||||
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
|
||||||
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
|
||||||
|
@ -53,13 +55,19 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
|
||||||
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
|
||||||
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
|
||||||
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
|
||||||
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
chacha-neon-y := chacha-scalar-core.o chacha-glue.o
|
||||||
|
chacha-neon-$(CONFIG_KERNEL_MODE_NEON) += chacha-neon-core.o
|
||||||
|
poly1305-arm-y := poly1305-core.o poly1305-glue.o
|
||||||
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
|
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
|
||||||
|
curve25519-neon-y := curve25519-core.o curve25519-glue.o
|
||||||
|
|
||||||
ifdef REGENERATE_ARM_CRYPTO
|
ifdef REGENERATE_ARM_CRYPTO
|
||||||
quiet_cmd_perl = PERL $@
|
quiet_cmd_perl = PERL $@
|
||||||
cmd_perl = $(PERL) $(<) > $(@)
|
cmd_perl = $(PERL) $(<) > $(@)
|
||||||
|
|
||||||
|
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv4.pl
|
||||||
|
$(call cmd,perl)
|
||||||
|
|
||||||
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
|
||||||
$(call cmd,perl)
|
$(call cmd,perl)
|
||||||
|
|
||||||
|
@ -67,4 +75,9 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv4.pl
|
||||||
$(call cmd,perl)
|
$(call cmd,perl)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
targets += sha256-core.S sha512-core.S
|
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||||
|
|
||||||
|
# massage the perlasm code a bit so we only get the NEON routine if we need it
|
||||||
|
poly1305-aflags-$(CONFIG_CPU_V7) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=5
|
||||||
|
poly1305-aflags-$(CONFIG_KERNEL_MODE_NEON) := -U__LINUX_ARM_ARCH__ -D__LINUX_ARM_ARCH__=7
|
||||||
|
AFLAGS_poly1305-core.o += $(poly1305-aflags-y)
|
||||||
|
|
356
arch/arm/crypto/chacha-glue.c
Normal file
356
arch/arm/crypto/chacha-glue.c
Normal file
|
@ -0,0 +1,356 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2016-2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
#include <asm/cputype.h>
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds);
|
||||||
|
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
|
||||||
|
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
const u32 *state, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(use_neon);
|
||||||
|
|
||||||
|
static inline bool neon_usable(void)
|
||||||
|
{
|
||||||
|
return static_branch_likely(&use_neon) && may_use_simd();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds)
|
||||||
|
{
|
||||||
|
u8 buf[CHACHA_BLOCK_SIZE];
|
||||||
|
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_4block_xor_neon(state, dst, src, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
state[12] += 4;
|
||||||
|
}
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_block_xor_neon(state, dst, src, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE;
|
||||||
|
src += CHACHA_BLOCK_SIZE;
|
||||||
|
dst += CHACHA_BLOCK_SIZE;
|
||||||
|
state[12]++;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
memcpy(buf, src, bytes);
|
||||||
|
chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||||
|
memcpy(dst, buf, bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable()) {
|
||||||
|
hchacha_block_arm(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, stream, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon_usable() ||
|
||||||
|
bytes <= CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_doarm(dst, src, bytes, state, nrounds);
|
||||||
|
state[12] += DIV_ROUND_UP(bytes, CHACHA_BLOCK_SIZE);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, dst, src, todo, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
static int chacha_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv,
|
||||||
|
bool neon)
|
||||||
|
{
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
u32 state[16];
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||||
|
chacha_doarm(walk.dst.virt.addr, walk.src.virt.addr,
|
||||||
|
nbytes, state, ctx->nrounds);
|
||||||
|
state[12] += DIV_ROUND_UP(nbytes, CHACHA_BLOCK_SIZE);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int do_chacha(struct skcipher_request *req, bool neon)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_stream_xor(req, ctx, req->iv, neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_arm(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_chacha(req, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_neon(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_chacha(req, neon_usable());
|
||||||
|
}
|
||||||
|
|
||||||
|
static int do_xchacha(struct skcipher_request *req, bool neon)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u32 state[16];
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
if (!IS_ENABLED(CONFIG_KERNEL_MODE_NEON) || !neon) {
|
||||||
|
hchacha_block_arm(state, subctx.key, ctx->nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, subctx.key, ctx->nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_stream_xor(req, &subctx, real_iv, neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_arm(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_xchacha(req, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_neon(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
return do_xchacha(req, neon_usable());
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct skcipher_alg arm_algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_arm,
|
||||||
|
.decrypt = chacha_arm,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_arm,
|
||||||
|
.decrypt = xchacha_arm,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-arm",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_arm,
|
||||||
|
.decrypt = xchacha_arm,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct skcipher_alg neon_algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_neon,
|
||||||
|
.decrypt = chacha_neon,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_neon,
|
||||||
|
.decrypt = xchacha_neon,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-neon",
|
||||||
|
.base.cra_priority = 300,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_neon,
|
||||||
|
.decrypt = xchacha_neon,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init chacha_simd_mod_init(void)
|
||||||
|
{
|
||||||
|
int err = 0;
|
||||||
|
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
err = crypto_register_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
if (err)
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON)) {
|
||||||
|
int i;
|
||||||
|
|
||||||
|
switch (read_cpuid_part()) {
|
||||||
|
case ARM_CPU_PART_CORTEX_A7:
|
||||||
|
case ARM_CPU_PART_CORTEX_A5:
|
||||||
|
/*
|
||||||
|
* The Cortex-A7 and Cortex-A5 do not perform well with
|
||||||
|
* the NEON implementation but do incredibly with the
|
||||||
|
* scalar one and use less power.
|
||||||
|
*/
|
||||||
|
for (i = 0; i < ARRAY_SIZE(neon_algs); i++)
|
||||||
|
neon_algs[i].base.cra_priority = 0;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
static_branch_enable(&use_neon);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
err = crypto_register_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
|
||||||
|
if (err)
|
||||||
|
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER)) {
|
||||||
|
crypto_unregister_skciphers(arm_algs, ARRAY_SIZE(arm_algs));
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_NEON))
|
||||||
|
crypto_unregister_skciphers(neon_algs, ARRAY_SIZE(neon_algs));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
|
||||||
|
module_exit(chacha_simd_mod_fini);
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (scalar and NEON accelerated)");
|
||||||
|
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-arm");
|
||||||
|
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-neon");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-neon");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-neon");
|
||||||
|
#endif
|
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
460
arch/arm/crypto/chacha-scalar-core.S
Normal file
|
@ -0,0 +1,460 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2018 Google, Inc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/assembler.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Design notes:
|
||||||
|
*
|
||||||
|
* 16 registers would be needed to hold the state matrix, but only 14 are
|
||||||
|
* available because 'sp' and 'pc' cannot be used. So we spill the elements
|
||||||
|
* (x8, x9) to the stack and swap them out with (x10, x11). This adds one
|
||||||
|
* 'ldrd' and one 'strd' instruction per round.
|
||||||
|
*
|
||||||
|
* All rotates are performed using the implicit rotate operand accepted by the
|
||||||
|
* 'add' and 'eor' instructions. This is faster than using explicit rotate
|
||||||
|
* instructions. To make this work, we allow the values in the second and last
|
||||||
|
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
|
||||||
|
* wrong rotation amount. The rotation amount is then fixed up just in time
|
||||||
|
* when the values are used. 'brot' is the number of bits the values in row 'b'
|
||||||
|
* need to be rotated right to arrive at the correct values, and 'drot'
|
||||||
|
* similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
|
||||||
|
* that they end up as (25, 24) after every round.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ChaCha state registers
|
||||||
|
X0 .req r0
|
||||||
|
X1 .req r1
|
||||||
|
X2 .req r2
|
||||||
|
X3 .req r3
|
||||||
|
X4 .req r4
|
||||||
|
X5 .req r5
|
||||||
|
X6 .req r6
|
||||||
|
X7 .req r7
|
||||||
|
X8_X10 .req r8 // shared by x8 and x10
|
||||||
|
X9_X11 .req r9 // shared by x9 and x11
|
||||||
|
X12 .req r10
|
||||||
|
X13 .req r11
|
||||||
|
X14 .req r12
|
||||||
|
X15 .req r14
|
||||||
|
|
||||||
|
.macro __rev out, in, t0, t1, t2
|
||||||
|
.if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
rev \out, \in
|
||||||
|
.else
|
||||||
|
lsl \t0, \in, #24
|
||||||
|
and \t1, \in, #0xff00
|
||||||
|
and \t2, \in, #0xff0000
|
||||||
|
orr \out, \t0, \in, lsr #24
|
||||||
|
orr \out, \out, \t1, lsl #8
|
||||||
|
orr \out, \out, \t2, lsr #8
|
||||||
|
.endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _le32_bswap x, t0, t1, t2
|
||||||
|
#ifdef __ARMEB__
|
||||||
|
__rev \x, \x, \t0, \t1, \t2
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _le32_bswap_4x a, b, c, d, t0, t1, t2
|
||||||
|
_le32_bswap \a, \t0, \t1, \t2
|
||||||
|
_le32_bswap \b, \t0, \t1, \t2
|
||||||
|
_le32_bswap \c, \t0, \t1, \t2
|
||||||
|
_le32_bswap \d, \t0, \t1, \t2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro __ldrd a, b, src, offset
|
||||||
|
#if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
ldrd \a, \b, [\src, #\offset]
|
||||||
|
#else
|
||||||
|
ldr \a, [\src, #\offset]
|
||||||
|
ldr \b, [\src, #\offset + 4]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro __strd a, b, dst, offset
|
||||||
|
#if __LINUX_ARM_ARCH__ >= 6
|
||||||
|
strd \a, \b, [\dst, #\offset]
|
||||||
|
#else
|
||||||
|
str \a, [\dst, #\offset]
|
||||||
|
str \b, [\dst, #\offset + 4]
|
||||||
|
#endif
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _halfround a1, b1, c1, d1, a2, b2, c2, d2
|
||||||
|
|
||||||
|
// a += b; d ^= a; d = rol(d, 16);
|
||||||
|
add \a1, \a1, \b1, ror #brot
|
||||||
|
add \a2, \a2, \b2, ror #brot
|
||||||
|
eor \d1, \a1, \d1, ror #drot
|
||||||
|
eor \d2, \a2, \d2, ror #drot
|
||||||
|
// drot == 32 - 16 == 16
|
||||||
|
|
||||||
|
// c += d; b ^= c; b = rol(b, 12);
|
||||||
|
add \c1, \c1, \d1, ror #16
|
||||||
|
add \c2, \c2, \d2, ror #16
|
||||||
|
eor \b1, \c1, \b1, ror #brot
|
||||||
|
eor \b2, \c2, \b2, ror #brot
|
||||||
|
// brot == 32 - 12 == 20
|
||||||
|
|
||||||
|
// a += b; d ^= a; d = rol(d, 8);
|
||||||
|
add \a1, \a1, \b1, ror #20
|
||||||
|
add \a2, \a2, \b2, ror #20
|
||||||
|
eor \d1, \a1, \d1, ror #16
|
||||||
|
eor \d2, \a2, \d2, ror #16
|
||||||
|
// drot == 32 - 8 == 24
|
||||||
|
|
||||||
|
// c += d; b ^= c; b = rol(b, 7);
|
||||||
|
add \c1, \c1, \d1, ror #24
|
||||||
|
add \c2, \c2, \d2, ror #24
|
||||||
|
eor \b1, \c1, \b1, ror #20
|
||||||
|
eor \b2, \c2, \b2, ror #20
|
||||||
|
// brot == 32 - 7 == 25
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _doubleround
|
||||||
|
|
||||||
|
// column round
|
||||||
|
|
||||||
|
// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
|
||||||
|
_halfround X0, X4, X8_X10, X12, X1, X5, X9_X11, X13
|
||||||
|
|
||||||
|
// save (x8, x9); restore (x10, x11)
|
||||||
|
__strd X8_X10, X9_X11, sp, 0
|
||||||
|
__ldrd X8_X10, X9_X11, sp, 8
|
||||||
|
|
||||||
|
// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
|
||||||
|
_halfround X2, X6, X8_X10, X14, X3, X7, X9_X11, X15
|
||||||
|
|
||||||
|
.set brot, 25
|
||||||
|
.set drot, 24
|
||||||
|
|
||||||
|
// diagonal round
|
||||||
|
|
||||||
|
// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
|
||||||
|
_halfround X0, X5, X8_X10, X15, X1, X6, X9_X11, X12
|
||||||
|
|
||||||
|
// save (x10, x11); restore (x8, x9)
|
||||||
|
__strd X8_X10, X9_X11, sp, 8
|
||||||
|
__ldrd X8_X10, X9_X11, sp, 0
|
||||||
|
|
||||||
|
// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
|
||||||
|
_halfround X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _chacha_permute nrounds
|
||||||
|
.set brot, 0
|
||||||
|
.set drot, 0
|
||||||
|
.rept \nrounds / 2
|
||||||
|
_doubleround
|
||||||
|
.endr
|
||||||
|
.endm
|
||||||
|
|
||||||
|
.macro _chacha nrounds
|
||||||
|
|
||||||
|
.Lnext_block\@:
|
||||||
|
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
|
||||||
|
// Registers contain x0-x9,x12-x15.
|
||||||
|
|
||||||
|
// Do the core ChaCha permutation to update x0-x15.
|
||||||
|
_chacha_permute \nrounds
|
||||||
|
|
||||||
|
add sp, #8
|
||||||
|
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers contain x0-x9,x12-x15.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
|
||||||
|
push {X8_X10, X9_X11, X12, X13, X14, X15}
|
||||||
|
|
||||||
|
// Load (OUT, IN, LEN).
|
||||||
|
ldr r14, [sp, #96]
|
||||||
|
ldr r12, [sp, #100]
|
||||||
|
ldr r11, [sp, #104]
|
||||||
|
|
||||||
|
orr r10, r14, r12
|
||||||
|
|
||||||
|
// Use slow path if fewer than 64 bytes remain.
|
||||||
|
cmp r11, #64
|
||||||
|
blt .Lxor_slowpath\@
|
||||||
|
|
||||||
|
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
|
||||||
|
// ARMv6+, since ldmia and stmia (used below) still require alignment.
|
||||||
|
tst r10, #3
|
||||||
|
bne .Lxor_slowpath\@
|
||||||
|
|
||||||
|
// Fast path: XOR 64 bytes of aligned data.
|
||||||
|
|
||||||
|
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// x0-x3
|
||||||
|
__ldrd r8, r9, sp, 32
|
||||||
|
__ldrd r10, r11, sp, 40
|
||||||
|
add X0, X0, r8
|
||||||
|
add X1, X1, r9
|
||||||
|
add X2, X2, r10
|
||||||
|
add X3, X3, r11
|
||||||
|
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||||
|
ldmia r12!, {r8-r11}
|
||||||
|
eor X0, X0, r8
|
||||||
|
eor X1, X1, r9
|
||||||
|
eor X2, X2, r10
|
||||||
|
eor X3, X3, r11
|
||||||
|
stmia r14!, {X0-X3}
|
||||||
|
|
||||||
|
// x4-x7
|
||||||
|
__ldrd r8, r9, sp, 48
|
||||||
|
__ldrd r10, r11, sp, 56
|
||||||
|
add X4, r8, X4, ror #brot
|
||||||
|
add X5, r9, X5, ror #brot
|
||||||
|
ldmia r12!, {X0-X3}
|
||||||
|
add X6, r10, X6, ror #brot
|
||||||
|
add X7, r11, X7, ror #brot
|
||||||
|
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||||
|
eor X4, X4, X0
|
||||||
|
eor X5, X5, X1
|
||||||
|
eor X6, X6, X2
|
||||||
|
eor X7, X7, X3
|
||||||
|
stmia r14!, {X4-X7}
|
||||||
|
|
||||||
|
// x8-x15
|
||||||
|
pop {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||||
|
__ldrd r8, r9, sp, 32
|
||||||
|
__ldrd r10, r11, sp, 40
|
||||||
|
add r0, r0, r8 // x8
|
||||||
|
add r1, r1, r9 // x9
|
||||||
|
add r6, r6, r10 // x10
|
||||||
|
add r7, r7, r11 // x11
|
||||||
|
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||||
|
ldmia r12!, {r8-r11}
|
||||||
|
eor r0, r0, r8 // x8
|
||||||
|
eor r1, r1, r9 // x9
|
||||||
|
eor r6, r6, r10 // x10
|
||||||
|
eor r7, r7, r11 // x11
|
||||||
|
stmia r14!, {r0,r1,r6,r7}
|
||||||
|
ldmia r12!, {r0,r1,r6,r7}
|
||||||
|
__ldrd r8, r9, sp, 48
|
||||||
|
__ldrd r10, r11, sp, 56
|
||||||
|
add r2, r8, r2, ror #drot // x12
|
||||||
|
add r3, r9, r3, ror #drot // x13
|
||||||
|
add r4, r10, r4, ror #drot // x14
|
||||||
|
add r5, r11, r5, ror #drot // x15
|
||||||
|
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||||
|
ldr r9, [sp, #72] // load LEN
|
||||||
|
eor r2, r2, r0 // x12
|
||||||
|
eor r3, r3, r1 // x13
|
||||||
|
eor r4, r4, r6 // x14
|
||||||
|
eor r5, r5, r7 // x15
|
||||||
|
subs r9, #64 // decrement and check LEN
|
||||||
|
stmia r14!, {r2-r5}
|
||||||
|
|
||||||
|
beq .Ldone\@
|
||||||
|
|
||||||
|
.Lprepare_for_next_block\@:
|
||||||
|
|
||||||
|
// Stack: x0-x15 OUT IN LEN
|
||||||
|
|
||||||
|
// Increment block counter (x12)
|
||||||
|
add r8, #1
|
||||||
|
|
||||||
|
// Store updated (OUT, IN, LEN)
|
||||||
|
str r14, [sp, #64]
|
||||||
|
str r12, [sp, #68]
|
||||||
|
str r9, [sp, #72]
|
||||||
|
|
||||||
|
mov r14, sp
|
||||||
|
|
||||||
|
// Store updated block counter (x12)
|
||||||
|
str r8, [sp, #48]
|
||||||
|
|
||||||
|
sub sp, #16
|
||||||
|
|
||||||
|
// Reload state and do next block
|
||||||
|
ldmia r14!, {r0-r11} // load x0-x11
|
||||||
|
__strd r10, r11, sp, 8 // store x10-x11 before state
|
||||||
|
ldmia r14, {r10-r12,r14} // load x12-x15
|
||||||
|
b .Lnext_block\@
|
||||||
|
|
||||||
|
.Lxor_slowpath\@:
|
||||||
|
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
|
||||||
|
// We handle it by storing the 64 bytes of keystream to the stack, then
|
||||||
|
// XOR-ing the needed portion with the data.
|
||||||
|
|
||||||
|
// Allocate keystream buffer
|
||||||
|
sub sp, #64
|
||||||
|
mov r14, sp
|
||||||
|
|
||||||
|
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
|
||||||
|
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
|
||||||
|
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
|
||||||
|
|
||||||
|
// Save keystream for x0-x3
|
||||||
|
__ldrd r8, r9, sp, 96
|
||||||
|
__ldrd r10, r11, sp, 104
|
||||||
|
add X0, X0, r8
|
||||||
|
add X1, X1, r9
|
||||||
|
add X2, X2, r10
|
||||||
|
add X3, X3, r11
|
||||||
|
_le32_bswap_4x X0, X1, X2, X3, r8, r9, r10
|
||||||
|
stmia r14!, {X0-X3}
|
||||||
|
|
||||||
|
// Save keystream for x4-x7
|
||||||
|
__ldrd r8, r9, sp, 112
|
||||||
|
__ldrd r10, r11, sp, 120
|
||||||
|
add X4, r8, X4, ror #brot
|
||||||
|
add X5, r9, X5, ror #brot
|
||||||
|
add X6, r10, X6, ror #brot
|
||||||
|
add X7, r11, X7, ror #brot
|
||||||
|
_le32_bswap_4x X4, X5, X6, X7, r8, r9, r10
|
||||||
|
add r8, sp, #64
|
||||||
|
stmia r14!, {X4-X7}
|
||||||
|
|
||||||
|
// Save keystream for x8-x15
|
||||||
|
ldm r8, {r0-r7} // (x8-x9,x12-x15,x10-x11)
|
||||||
|
__ldrd r8, r9, sp, 128
|
||||||
|
__ldrd r10, r11, sp, 136
|
||||||
|
add r0, r0, r8 // x8
|
||||||
|
add r1, r1, r9 // x9
|
||||||
|
add r6, r6, r10 // x10
|
||||||
|
add r7, r7, r11 // x11
|
||||||
|
_le32_bswap_4x r0, r1, r6, r7, r8, r9, r10
|
||||||
|
stmia r14!, {r0,r1,r6,r7}
|
||||||
|
__ldrd r8, r9, sp, 144
|
||||||
|
__ldrd r10, r11, sp, 152
|
||||||
|
add r2, r8, r2, ror #drot // x12
|
||||||
|
add r3, r9, r3, ror #drot // x13
|
||||||
|
add r4, r10, r4, ror #drot // x14
|
||||||
|
add r5, r11, r5, ror #drot // x15
|
||||||
|
_le32_bswap_4x r2, r3, r4, r5, r9, r10, r11
|
||||||
|
stmia r14, {r2-r5}
|
||||||
|
|
||||||
|
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
|
||||||
|
// Registers: r8 is block counter, r12 is IN.
|
||||||
|
|
||||||
|
ldr r9, [sp, #168] // LEN
|
||||||
|
ldr r14, [sp, #160] // OUT
|
||||||
|
cmp r9, #64
|
||||||
|
mov r0, sp
|
||||||
|
movle r1, r9
|
||||||
|
movgt r1, #64
|
||||||
|
// r1 is number of bytes to XOR, in range [1, 64]
|
||||||
|
|
||||||
|
.if __LINUX_ARM_ARCH__ < 6
|
||||||
|
orr r2, r12, r14
|
||||||
|
tst r2, #3 // IN or OUT misaligned?
|
||||||
|
bne .Lxor_next_byte\@
|
||||||
|
.endif
|
||||||
|
|
||||||
|
// XOR a word at a time
|
||||||
|
.rept 16
|
||||||
|
subs r1, #4
|
||||||
|
blt .Lxor_words_done\@
|
||||||
|
ldr r2, [r12], #4
|
||||||
|
ldr r3, [r0], #4
|
||||||
|
eor r2, r2, r3
|
||||||
|
str r2, [r14], #4
|
||||||
|
.endr
|
||||||
|
b .Lxor_slowpath_done\@
|
||||||
|
.Lxor_words_done\@:
|
||||||
|
ands r1, r1, #3
|
||||||
|
beq .Lxor_slowpath_done\@
|
||||||
|
|
||||||
|
// XOR a byte at a time
|
||||||
|
.Lxor_next_byte\@:
|
||||||
|
ldrb r2, [r12], #1
|
||||||
|
ldrb r3, [r0], #1
|
||||||
|
eor r2, r2, r3
|
||||||
|
strb r2, [r14], #1
|
||||||
|
subs r1, #1
|
||||||
|
bne .Lxor_next_byte\@
|
||||||
|
|
||||||
|
.Lxor_slowpath_done\@:
|
||||||
|
subs r9, #64
|
||||||
|
add sp, #96
|
||||||
|
bgt .Lprepare_for_next_block\@
|
||||||
|
|
||||||
|
.Ldone\@:
|
||||||
|
.endm // _chacha
|
||||||
|
|
||||||
|
/*
|
||||||
|
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
* const u32 *state, int nrounds);
|
||||||
|
*/
|
||||||
|
ENTRY(chacha_doarm)
|
||||||
|
cmp r2, #0 // len == 0?
|
||||||
|
reteq lr
|
||||||
|
|
||||||
|
ldr ip, [sp]
|
||||||
|
cmp ip, #12
|
||||||
|
|
||||||
|
push {r0-r2,r4-r11,lr}
|
||||||
|
|
||||||
|
// Push state x0-x15 onto stack.
|
||||||
|
// Also store an extra copy of x10-x11 just before the state.
|
||||||
|
|
||||||
|
add X12, r3, #48
|
||||||
|
ldm X12, {X12,X13,X14,X15}
|
||||||
|
push {X12,X13,X14,X15}
|
||||||
|
sub sp, sp, #64
|
||||||
|
|
||||||
|
__ldrd X8_X10, X9_X11, r3, 40
|
||||||
|
__strd X8_X10, X9_X11, sp, 8
|
||||||
|
__strd X8_X10, X9_X11, sp, 56
|
||||||
|
ldm r3, {X0-X9_X11}
|
||||||
|
__strd X0, X1, sp, 16
|
||||||
|
__strd X2, X3, sp, 24
|
||||||
|
__strd X4, X5, sp, 32
|
||||||
|
__strd X6, X7, sp, 40
|
||||||
|
__strd X8_X10, X9_X11, sp, 48
|
||||||
|
|
||||||
|
beq 1f
|
||||||
|
_chacha 20
|
||||||
|
|
||||||
|
0: add sp, #76
|
||||||
|
pop {r4-r11, pc}
|
||||||
|
|
||||||
|
1: _chacha 12
|
||||||
|
b 0b
|
||||||
|
ENDPROC(chacha_doarm)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* void hchacha_block_arm(const u32 state[16], u32 out[8], int nrounds);
|
||||||
|
*/
|
||||||
|
ENTRY(hchacha_block_arm)
|
||||||
|
push {r1,r4-r11,lr}
|
||||||
|
|
||||||
|
cmp r2, #12 // ChaCha12 ?
|
||||||
|
|
||||||
|
mov r14, r0
|
||||||
|
ldmia r14!, {r0-r11} // load x0-x11
|
||||||
|
push {r10-r11} // store x10-x11 to stack
|
||||||
|
ldm r14, {r10-r12,r14} // load x12-x15
|
||||||
|
sub sp, #8
|
||||||
|
|
||||||
|
beq 1f
|
||||||
|
_chacha_permute 20
|
||||||
|
|
||||||
|
// Skip over (unused0-unused1, x10-x11)
|
||||||
|
0: add sp, #16
|
||||||
|
|
||||||
|
// Fix up rotations of x12-x15
|
||||||
|
ror X12, X12, #drot
|
||||||
|
ror X13, X13, #drot
|
||||||
|
pop {r4} // load 'out'
|
||||||
|
ror X14, X14, #drot
|
||||||
|
ror X15, X15, #drot
|
||||||
|
|
||||||
|
// Store (x0-x3,x12-x15) to 'out'
|
||||||
|
stm r4, {X0,X1,X2,X3,X12,X13,X14,X15}
|
||||||
|
|
||||||
|
pop {r4-r11,pc}
|
||||||
|
|
||||||
|
1: _chacha_permute 12
|
||||||
|
b 0b
|
||||||
|
ENDPROC(hchacha_block_arm)
|
2062
arch/arm/crypto/curve25519-core.S
Normal file
2062
arch/arm/crypto/curve25519-core.S
Normal file
File diff suppressed because it is too large
Load diff
135
arch/arm/crypto/curve25519-glue.c
Normal file
135
arch/arm/crypto/curve25519-glue.c
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Based on public domain code from Daniel J. Bernstein and Peter Schwabe. This
|
||||||
|
* began from SUPERCOP's curve25519/neon2/scalarmult.s, but has subsequently been
|
||||||
|
* manually reworked for use in kernel space.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <crypto/internal/kpp.h>
|
||||||
|
#include <linux/types.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <linux/init.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/scatterlist.h>
|
||||||
|
#include <crypto/curve25519.h>
|
||||||
|
|
||||||
|
asmlinkage void curve25519_neon(u8 mypublic[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 secret[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 basepoint[CURVE25519_KEY_SIZE]);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void curve25519_arch(u8 out[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 scalar[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 point[CURVE25519_KEY_SIZE])
|
||||||
|
{
|
||||||
|
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||||
|
kernel_neon_begin();
|
||||||
|
curve25519_neon(out, scalar, point);
|
||||||
|
kernel_neon_end();
|
||||||
|
} else {
|
||||||
|
curve25519_generic(out, scalar, point);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(curve25519_arch);
|
||||||
|
|
||||||
|
void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
|
||||||
|
const u8 secret[CURVE25519_KEY_SIZE])
|
||||||
|
{
|
||||||
|
return curve25519_arch(pub, secret, curve25519_base_point);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(curve25519_base_arch);
|
||||||
|
|
||||||
|
static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf,
|
||||||
|
unsigned int len)
|
||||||
|
{
|
||||||
|
u8 *secret = kpp_tfm_ctx(tfm);
|
||||||
|
|
||||||
|
if (!len)
|
||||||
|
curve25519_generate_secret(secret);
|
||||||
|
else if (len == CURVE25519_KEY_SIZE &&
|
||||||
|
crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE))
|
||||||
|
memcpy(secret, buf, CURVE25519_KEY_SIZE);
|
||||||
|
else
|
||||||
|
return -EINVAL;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int curve25519_compute_value(struct kpp_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
|
||||||
|
const u8 *secret = kpp_tfm_ctx(tfm);
|
||||||
|
u8 public_key[CURVE25519_KEY_SIZE];
|
||||||
|
u8 buf[CURVE25519_KEY_SIZE];
|
||||||
|
int copied, nbytes;
|
||||||
|
u8 const *bp;
|
||||||
|
|
||||||
|
if (req->src) {
|
||||||
|
copied = sg_copy_to_buffer(req->src,
|
||||||
|
sg_nents_for_len(req->src,
|
||||||
|
CURVE25519_KEY_SIZE),
|
||||||
|
public_key, CURVE25519_KEY_SIZE);
|
||||||
|
if (copied != CURVE25519_KEY_SIZE)
|
||||||
|
return -EINVAL;
|
||||||
|
bp = public_key;
|
||||||
|
} else {
|
||||||
|
bp = curve25519_base_point;
|
||||||
|
}
|
||||||
|
|
||||||
|
curve25519_arch(buf, secret, bp);
|
||||||
|
|
||||||
|
/* might want less than we've got */
|
||||||
|
nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len);
|
||||||
|
copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst,
|
||||||
|
nbytes),
|
||||||
|
buf, nbytes);
|
||||||
|
if (copied != nbytes)
|
||||||
|
return -EINVAL;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int curve25519_max_size(struct crypto_kpp *tfm)
|
||||||
|
{
|
||||||
|
return CURVE25519_KEY_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct kpp_alg curve25519_alg = {
|
||||||
|
.base.cra_name = "curve25519",
|
||||||
|
.base.cra_driver_name = "curve25519-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
.base.cra_ctxsize = CURVE25519_KEY_SIZE,
|
||||||
|
|
||||||
|
.set_secret = curve25519_set_secret,
|
||||||
|
.generate_public_key = curve25519_compute_value,
|
||||||
|
.compute_shared_secret = curve25519_compute_value,
|
||||||
|
.max_size = curve25519_max_size,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init mod_init(void)
|
||||||
|
{
|
||||||
|
if (elf_hwcap & HWCAP_NEON) {
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_KPP) ?
|
||||||
|
crypto_register_kpp(&curve25519_alg) : 0;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_KPP) && elf_hwcap & HWCAP_NEON)
|
||||||
|
crypto_unregister_kpp(&curve25519_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(mod_init);
|
||||||
|
module_exit(mod_exit);
|
||||||
|
|
||||||
|
MODULE_ALIAS_CRYPTO("curve25519");
|
||||||
|
MODULE_ALIAS_CRYPTO("curve25519-neon");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
1236
arch/arm/crypto/poly1305-armv4.pl
Normal file
File diff suppressed because it is too large
Load diff
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
1158
arch/arm/crypto/poly1305-core.S_shipped
Normal file
File diff suppressed because it is too large
Load diff
272
arch/arm/crypto/poly1305-glue.c
Normal file
272
arch/arm/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,272 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for ARM
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
void poly1305_init_arm(void *state, const u8 *key);
|
||||||
|
void poly1305_blocks_arm(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
void poly1305_emit_arm(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
void __weak poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_arm(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int arm_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_arm(&dctx->h, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||||
|
else
|
||||||
|
poly1305_blocks_arm(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void arm_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||||
|
const u8 *src, u32 len, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
arm_poly1305_blocks(dctx, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1, false);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
arm_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int arm_poly1305_update(struct shash_desc *desc,
|
||||||
|
const u8 *src, unsigned int srclen)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
arm_poly1305_do_update(dctx, src, srclen, false);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __maybe_unused arm_poly1305_update_neon(struct shash_desc *desc,
|
||||||
|
const u8 *src,
|
||||||
|
unsigned int srclen)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
bool do_neon = may_use_simd() && srclen > 128;
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_begin();
|
||||||
|
arm_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_end();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
bool do_neon = IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||||
|
may_use_simd();
|
||||||
|
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks_arm(&dctx->h, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon) {
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
len -= todo;
|
||||||
|
src += todo;
|
||||||
|
} while (len);
|
||||||
|
} else {
|
||||||
|
poly1305_blocks_arm(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
}
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks_arm(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit_arm(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int arm_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg arm_poly1305_algs[] = {{
|
||||||
|
.init = arm_poly1305_init,
|
||||||
|
.update = arm_poly1305_update,
|
||||||
|
.final = arm_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-arm",
|
||||||
|
.base.cra_priority = 150,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
#ifdef CONFIG_KERNEL_MODE_NEON
|
||||||
|
}, {
|
||||||
|
.init = arm_poly1305_init,
|
||||||
|
.update = arm_poly1305_update_neon,
|
||||||
|
.final = arm_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
#endif
|
||||||
|
}};
|
||||||
|
|
||||||
|
static int __init arm_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) &&
|
||||||
|
(elf_hwcap & HWCAP_NEON))
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
else if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
/* register only the first entry */
|
||||||
|
return crypto_register_shash(&arm_poly1305_algs[0]);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shashes(arm_poly1305_algs,
|
||||||
|
ARRAY_SIZE(arm_poly1305_algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit arm_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (!IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
return;
|
||||||
|
if (!static_branch_likely(&have_neon)) {
|
||||||
|
crypto_unregister_shash(&arm_poly1305_algs[0]);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
crypto_unregister_shashes(arm_poly1305_algs,
|
||||||
|
ARRAY_SIZE(arm_poly1305_algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(arm_poly1305_mod_init);
|
||||||
|
module_exit(arm_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-arm");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
|
@ -688,6 +688,40 @@ static void disable_single_step(struct perf_event *bp)
|
||||||
arch_install_hw_breakpoint(bp);
|
arch_install_hw_breakpoint(bp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Arm32 hardware does not always report a watchpoint hit address that matches
|
||||||
|
* one of the watchpoints set. It can also report an address "near" the
|
||||||
|
* watchpoint if a single instruction access both watched and unwatched
|
||||||
|
* addresses. There is no straight-forward way, short of disassembling the
|
||||||
|
* offending instruction, to map that address back to the watchpoint. This
|
||||||
|
* function computes the distance of the memory access from the watchpoint as a
|
||||||
|
* heuristic for the likelyhood that a given access triggered the watchpoint.
|
||||||
|
*
|
||||||
|
* See this same function in the arm64 platform code, which has the same
|
||||||
|
* problem.
|
||||||
|
*
|
||||||
|
* The function returns the distance of the address from the bytes watched by
|
||||||
|
* the watchpoint. In case of an exact match, it returns 0.
|
||||||
|
*/
|
||||||
|
static u32 get_distance_from_watchpoint(unsigned long addr, u32 val,
|
||||||
|
struct arch_hw_breakpoint_ctrl *ctrl)
|
||||||
|
{
|
||||||
|
u32 wp_low, wp_high;
|
||||||
|
u32 lens, lene;
|
||||||
|
|
||||||
|
lens = __ffs(ctrl->len);
|
||||||
|
lene = __fls(ctrl->len);
|
||||||
|
|
||||||
|
wp_low = val + lens;
|
||||||
|
wp_high = val + lene;
|
||||||
|
if (addr < wp_low)
|
||||||
|
return wp_low - addr;
|
||||||
|
else if (addr > wp_high)
|
||||||
|
return addr - wp_high;
|
||||||
|
else
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
||||||
struct arch_hw_breakpoint *info)
|
struct arch_hw_breakpoint *info)
|
||||||
{
|
{
|
||||||
|
@ -697,23 +731,25 @@ static int watchpoint_fault_on_uaccess(struct pt_regs *regs,
|
||||||
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
struct pt_regs *regs)
|
struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
int i, access;
|
int i, access, closest_match = 0;
|
||||||
u32 val, ctrl_reg, alignment_mask;
|
u32 min_dist = -1, dist;
|
||||||
|
u32 val, ctrl_reg;
|
||||||
struct perf_event *wp, **slots;
|
struct perf_event *wp, **slots;
|
||||||
struct arch_hw_breakpoint *info;
|
struct arch_hw_breakpoint *info;
|
||||||
struct arch_hw_breakpoint_ctrl ctrl;
|
struct arch_hw_breakpoint_ctrl ctrl;
|
||||||
|
|
||||||
slots = this_cpu_ptr(wp_on_reg);
|
slots = this_cpu_ptr(wp_on_reg);
|
||||||
|
|
||||||
for (i = 0; i < core_num_wrps; ++i) {
|
/*
|
||||||
|
* Find all watchpoints that match the reported address. If no exact
|
||||||
|
* match is found. Attribute the hit to the closest watchpoint.
|
||||||
|
*/
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
|
for (i = 0; i < core_num_wrps; ++i) {
|
||||||
wp = slots[i];
|
wp = slots[i];
|
||||||
|
|
||||||
if (wp == NULL)
|
if (wp == NULL)
|
||||||
goto unlock;
|
continue;
|
||||||
|
|
||||||
info = counter_arch_bp(wp);
|
|
||||||
/*
|
/*
|
||||||
* The DFAR is an unknown value on debug architectures prior
|
* The DFAR is an unknown value on debug architectures prior
|
||||||
* to 7.1. Since we only allow a single watchpoint on these
|
* to 7.1. Since we only allow a single watchpoint on these
|
||||||
|
@ -722,33 +758,31 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
*/
|
*/
|
||||||
if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
|
if (debug_arch < ARM_DEBUG_ARCH_V7_1) {
|
||||||
BUG_ON(i > 0);
|
BUG_ON(i > 0);
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
info->trigger = wp->attr.bp_addr;
|
info->trigger = wp->attr.bp_addr;
|
||||||
} else {
|
} else {
|
||||||
if (info->ctrl.len == ARM_BREAKPOINT_LEN_8)
|
|
||||||
alignment_mask = 0x7;
|
|
||||||
else
|
|
||||||
alignment_mask = 0x3;
|
|
||||||
|
|
||||||
/* Check if the watchpoint value matches. */
|
|
||||||
val = read_wb_reg(ARM_BASE_WVR + i);
|
|
||||||
if (val != (addr & ~alignment_mask))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Possible match, check the byte address select. */
|
|
||||||
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
|
|
||||||
decode_ctrl_reg(ctrl_reg, &ctrl);
|
|
||||||
if (!((1 << (addr & alignment_mask)) & ctrl.len))
|
|
||||||
goto unlock;
|
|
||||||
|
|
||||||
/* Check that the access type matches. */
|
/* Check that the access type matches. */
|
||||||
if (debug_exception_updates_fsr()) {
|
if (debug_exception_updates_fsr()) {
|
||||||
access = (fsr & ARM_FSR_ACCESS_MASK) ?
|
access = (fsr & ARM_FSR_ACCESS_MASK) ?
|
||||||
HW_BREAKPOINT_W : HW_BREAKPOINT_R;
|
HW_BREAKPOINT_W : HW_BREAKPOINT_R;
|
||||||
if (!(access & hw_breakpoint_type(wp)))
|
if (!(access & hw_breakpoint_type(wp)))
|
||||||
goto unlock;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val = read_wb_reg(ARM_BASE_WVR + i);
|
||||||
|
ctrl_reg = read_wb_reg(ARM_BASE_WCR + i);
|
||||||
|
decode_ctrl_reg(ctrl_reg, &ctrl);
|
||||||
|
dist = get_distance_from_watchpoint(addr, val, &ctrl);
|
||||||
|
if (dist < min_dist) {
|
||||||
|
min_dist = dist;
|
||||||
|
closest_match = i;
|
||||||
|
}
|
||||||
|
/* Is this an exact match? */
|
||||||
|
if (dist != 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
/* We have a winner. */
|
/* We have a winner. */
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
info->trigger = addr;
|
info->trigger = addr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -770,13 +804,23 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
|
||||||
* we can single-step over the watchpoint trigger.
|
* we can single-step over the watchpoint trigger.
|
||||||
*/
|
*/
|
||||||
if (!is_default_overflow_handler(wp))
|
if (!is_default_overflow_handler(wp))
|
||||||
goto unlock;
|
continue;
|
||||||
|
|
||||||
step:
|
step:
|
||||||
enable_single_step(wp, instruction_pointer(regs));
|
enable_single_step(wp, instruction_pointer(regs));
|
||||||
unlock:
|
|
||||||
rcu_read_unlock();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (min_dist > 0 && min_dist != -1) {
|
||||||
|
/* No exact match found. */
|
||||||
|
wp = slots[closest_match];
|
||||||
|
info = counter_arch_bp(wp);
|
||||||
|
info->trigger = addr;
|
||||||
|
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
|
||||||
|
perf_bp_event(wp, regs);
|
||||||
|
if (is_default_overflow_handler(wp))
|
||||||
|
enable_single_step(wp, instruction_pointer(regs));
|
||||||
|
}
|
||||||
|
|
||||||
|
rcu_read_unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void watchpoint_single_step_handler(unsigned long pc)
|
static void watchpoint_single_step_handler(unsigned long pc)
|
||||||
|
|
|
@ -1261,20 +1261,28 @@ static void __init l2c310_of_parse(const struct device_node *np,
|
||||||
|
|
||||||
ret = of_property_read_u32(np, "prefetch-data", &val);
|
ret = of_property_read_u32(np, "prefetch-data", &val);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
if (val)
|
if (val) {
|
||||||
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
prefetch |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
else
|
*aux_val |= L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
} else {
|
||||||
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
prefetch &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
*aux_val &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
|
}
|
||||||
|
*aux_mask &= ~L310_PREFETCH_CTRL_DATA_PREFETCH;
|
||||||
} else if (ret != -EINVAL) {
|
} else if (ret != -EINVAL) {
|
||||||
pr_err("L2C-310 OF prefetch-data property value is missing\n");
|
pr_err("L2C-310 OF prefetch-data property value is missing\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = of_property_read_u32(np, "prefetch-instr", &val);
|
ret = of_property_read_u32(np, "prefetch-instr", &val);
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
if (val)
|
if (val) {
|
||||||
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
prefetch |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
else
|
*aux_val |= L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
} else {
|
||||||
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
prefetch &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
*aux_val &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
|
}
|
||||||
|
*aux_mask &= ~L310_PREFETCH_CTRL_INSTR_PREFETCH;
|
||||||
} else if (ret != -EINVAL) {
|
} else if (ret != -EINVAL) {
|
||||||
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
|
pr_err("L2C-310 OF prefetch-instr property value is missing\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -240,6 +240,7 @@ config SAMSUNG_PM_DEBUG
|
||||||
bool "Samsung PM Suspend debug"
|
bool "Samsung PM Suspend debug"
|
||||||
depends on PM && DEBUG_KERNEL
|
depends on PM && DEBUG_KERNEL
|
||||||
depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
|
depends on DEBUG_EXYNOS_UART || DEBUG_S3C24XX_UART || DEBUG_S3C2410_UART
|
||||||
|
depends on DEBUG_LL && MMU
|
||||||
help
|
help
|
||||||
Say Y here if you want verbose debugging from the PM Suspend and
|
Say Y here if you want verbose debugging from the PM Suspend and
|
||||||
Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>
|
Resume code. See <file:Documentation/arm/Samsung-S3C24XX/Suspend.txt>
|
||||||
|
|
|
@ -46,6 +46,7 @@ config ARCH_BCM_IPROC
|
||||||
config ARCH_BERLIN
|
config ARCH_BERLIN
|
||||||
bool "Marvell Berlin SoC Family"
|
bool "Marvell Berlin SoC Family"
|
||||||
select DW_APB_ICTL
|
select DW_APB_ICTL
|
||||||
|
select DW_APB_TIMER_OF
|
||||||
select GPIOLIB
|
select GPIOLIB
|
||||||
select PINCTRL
|
select PINCTRL
|
||||||
help
|
help
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
#
|
#
|
||||||
# Copyright (C) 1995-2001 by Russell King
|
# Copyright (C) 1995-2001 by Russell King
|
||||||
|
|
||||||
LDFLAGS_vmlinux :=--no-undefined -X
|
LDFLAGS_vmlinux :=--no-undefined -X -z norelro
|
||||||
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
|
CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET)
|
||||||
GZFLAGS :=-9
|
GZFLAGS :=-9
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ ifeq ($(CONFIG_RELOCATABLE), y)
|
||||||
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
|
# Pass --no-apply-dynamic-relocs to restore pre-binutils-2.27 behaviour
|
||||||
# for relative relocs, since this leads to better Image compression
|
# for relative relocs, since this leads to better Image compression
|
||||||
# with the relocation offsets always being zero.
|
# with the relocation offsets always being zero.
|
||||||
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext -z norelro \
|
LDFLAGS_vmlinux += -shared -Bsymbolic -z notext \
|
||||||
$(call ld-option, --no-apply-dynamic-relocs)
|
$(call ld-option, --no-apply-dynamic-relocs)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,10 @@
|
||||||
|
|
||||||
aliases {
|
aliases {
|
||||||
ethernet0 = ð0;
|
ethernet0 = ð0;
|
||||||
|
/* for dsa slave device */
|
||||||
|
ethernet1 = &switch0port1;
|
||||||
|
ethernet2 = &switch0port2;
|
||||||
|
ethernet3 = &switch0port3;
|
||||||
serial0 = &uart0;
|
serial0 = &uart0;
|
||||||
serial1 = &uart1;
|
serial1 = &uart1;
|
||||||
};
|
};
|
||||||
|
@ -136,25 +140,25 @@
|
||||||
#address-cells = <1>;
|
#address-cells = <1>;
|
||||||
#size-cells = <0>;
|
#size-cells = <0>;
|
||||||
|
|
||||||
port@0 {
|
switch0port0: port@0 {
|
||||||
reg = <0>;
|
reg = <0>;
|
||||||
label = "cpu";
|
label = "cpu";
|
||||||
ethernet = <ð0>;
|
ethernet = <ð0>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@1 {
|
switch0port1: port@1 {
|
||||||
reg = <1>;
|
reg = <1>;
|
||||||
label = "wan";
|
label = "wan";
|
||||||
phy-handle = <&switch0phy0>;
|
phy-handle = <&switch0phy0>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@2 {
|
switch0port2: port@2 {
|
||||||
reg = <2>;
|
reg = <2>;
|
||||||
label = "lan0";
|
label = "lan0";
|
||||||
phy-handle = <&switch0phy1>;
|
phy-handle = <&switch0phy1>;
|
||||||
};
|
};
|
||||||
|
|
||||||
port@3 {
|
switch0port3: port@3 {
|
||||||
reg = <3>;
|
reg = <3>;
|
||||||
label = "lan1";
|
label = "lan1";
|
||||||
phy-handle = <&switch0phy2>;
|
phy-handle = <&switch0phy2>;
|
||||||
|
|
|
@ -877,7 +877,7 @@
|
||||||
reg-names = "mdp_phys";
|
reg-names = "mdp_phys";
|
||||||
|
|
||||||
interrupt-parent = <&mdss>;
|
interrupt-parent = <&mdss>;
|
||||||
interrupts = <0 0>;
|
interrupts = <0>;
|
||||||
|
|
||||||
clocks = <&gcc GCC_MDSS_AHB_CLK>,
|
clocks = <&gcc GCC_MDSS_AHB_CLK>,
|
||||||
<&gcc GCC_MDSS_AXI_CLK>,
|
<&gcc GCC_MDSS_AXI_CLK>,
|
||||||
|
@ -909,7 +909,7 @@
|
||||||
reg-names = "dsi_ctrl";
|
reg-names = "dsi_ctrl";
|
||||||
|
|
||||||
interrupt-parent = <&mdss>;
|
interrupt-parent = <&mdss>;
|
||||||
interrupts = <4 0>;
|
interrupts = <4>;
|
||||||
|
|
||||||
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
|
assigned-clocks = <&gcc BYTE0_CLK_SRC>,
|
||||||
<&gcc PCLK0_CLK_SRC>;
|
<&gcc PCLK0_CLK_SRC>;
|
||||||
|
|
|
@ -99,7 +99,7 @@
|
||||||
|
|
||||||
wcd_codec: codec@f000 {
|
wcd_codec: codec@f000 {
|
||||||
compatible = "qcom,pm8916-wcd-analog-codec";
|
compatible = "qcom,pm8916-wcd-analog-codec";
|
||||||
reg = <0xf000 0x200>;
|
reg = <0xf000>;
|
||||||
reg-names = "pmic-codec-core";
|
reg-names = "pmic-codec-core";
|
||||||
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
|
clocks = <&gcc GCC_CODEC_DIGCODEC_CLK>;
|
||||||
clock-names = "mclk";
|
clock-names = "mclk";
|
||||||
|
|
|
@ -430,6 +430,7 @@
|
||||||
bus-width = <8>;
|
bus-width = <8>;
|
||||||
mmc-hs200-1_8v;
|
mmc-hs200-1_8v;
|
||||||
non-removable;
|
non-removable;
|
||||||
|
full-pwr-cycle-in-suspend;
|
||||||
status = "okay";
|
status = "okay";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -411,7 +411,7 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
i2c0: i2c@ff020000 {
|
i2c0: i2c@ff020000 {
|
||||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
compatible = "cdns,i2c-r1p14";
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
interrupt-parent = <&gic>;
|
interrupt-parent = <&gic>;
|
||||||
interrupts = <0 17 4>;
|
interrupts = <0 17 4>;
|
||||||
|
@ -421,7 +421,7 @@
|
||||||
};
|
};
|
||||||
|
|
||||||
i2c1: i2c@ff030000 {
|
i2c1: i2c@ff030000 {
|
||||||
compatible = "cdns,i2c-r1p14", "cdns,i2c-r1p10";
|
compatible = "cdns,i2c-r1p14";
|
||||||
status = "disabled";
|
status = "disabled";
|
||||||
interrupt-parent = <&gic>;
|
interrupt-parent = <&gic>;
|
||||||
interrupts = <0 18 4>;
|
interrupts = <0 18 4>;
|
||||||
|
|
|
@ -77,7 +77,6 @@ CONFIG_ARM_SCMI_PROTOCOL=y
|
||||||
CONFIG_ARM_SCPI_PROTOCOL=y
|
CONFIG_ARM_SCPI_PROTOCOL=y
|
||||||
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
|
# CONFIG_ARM_SCPI_POWER_DOMAIN is not set
|
||||||
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
|
# CONFIG_EFI_ARMSTUB_DTB_LOADER is not set
|
||||||
CONFIG_ARM64_CRYPTO=y
|
|
||||||
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
CONFIG_CRYPTO_SHA2_ARM64_CE=y
|
||||||
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
|
||||||
CONFIG_JUMP_LABEL=y
|
CONFIG_JUMP_LABEL=y
|
||||||
|
@ -246,6 +245,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||||
CONFIG_DM_BOW=y
|
CONFIG_DM_BOW=y
|
||||||
CONFIG_NETDEVICES=y
|
CONFIG_NETDEVICES=y
|
||||||
CONFIG_DUMMY=y
|
CONFIG_DUMMY=y
|
||||||
|
CONFIG_WIREGUARD=y
|
||||||
CONFIG_TUN=y
|
CONFIG_TUN=y
|
||||||
CONFIG_VETH=y
|
CONFIG_VETH=y
|
||||||
# CONFIG_ETHERNET is not set
|
# CONFIG_ETHERNET is not set
|
||||||
|
@ -358,6 +358,7 @@ CONFIG_HID_NINTENDO=y
|
||||||
CONFIG_HID_SONY=y
|
CONFIG_HID_SONY=y
|
||||||
CONFIG_HID_STEAM=y
|
CONFIG_HID_STEAM=y
|
||||||
CONFIG_USB_HIDDEV=y
|
CONFIG_USB_HIDDEV=y
|
||||||
|
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||||
CONFIG_USB_OTG=y
|
CONFIG_USB_OTG=y
|
||||||
CONFIG_USB_XHCI_HCD=y
|
CONFIG_USB_XHCI_HCD=y
|
||||||
CONFIG_USB_GADGET=y
|
CONFIG_USB_GADGET=y
|
||||||
|
@ -503,6 +504,7 @@ CONFIG_CRC8=y
|
||||||
CONFIG_XZ_DEC=y
|
CONFIG_XZ_DEC=y
|
||||||
CONFIG_PRINTK_TIME=y
|
CONFIG_PRINTK_TIME=y
|
||||||
CONFIG_DEBUG_INFO=y
|
CONFIG_DEBUG_INFO=y
|
||||||
|
CONFIG_DEBUG_INFO_DWARF4=y
|
||||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||||
CONFIG_MAGIC_SYSRQ=y
|
CONFIG_MAGIC_SYSRQ=y
|
||||||
|
|
1
arch/arm64/crypto/.gitignore
vendored
1
arch/arm64/crypto/.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
sha256-core.S
|
sha256-core.S
|
||||||
sha512-core.S
|
sha512-core.S
|
||||||
|
poly1305-core.S
|
||||||
|
|
|
@ -106,10 +106,17 @@ config CRYPTO_AES_ARM64_NEON_BLK
|
||||||
select CRYPTO_SIMD
|
select CRYPTO_SIMD
|
||||||
|
|
||||||
config CRYPTO_CHACHA20_NEON
|
config CRYPTO_CHACHA20_NEON
|
||||||
tristate "NEON accelerated ChaCha20 symmetric cipher"
|
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
|
||||||
depends on KERNEL_MODE_NEON
|
depends on KERNEL_MODE_NEON
|
||||||
select CRYPTO_BLKCIPHER
|
select CRYPTO_BLKCIPHER
|
||||||
select CRYPTO_CHACHA20
|
select CRYPTO_LIB_CHACHA_GENERIC
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_CHACHA
|
||||||
|
|
||||||
|
config CRYPTO_POLY1305_NEON
|
||||||
|
tristate "Poly1305 hash function using scalar or NEON instructions"
|
||||||
|
depends on KERNEL_MODE_NEON
|
||||||
|
select CRYPTO_HASH
|
||||||
|
select CRYPTO_ARCH_HAVE_LIB_POLY1305
|
||||||
|
|
||||||
config CRYPTO_AES_ARM64_BS
|
config CRYPTO_AES_ARM64_BS
|
||||||
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
|
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
|
||||||
|
|
|
@ -53,8 +53,12 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
|
||||||
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
|
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
|
||||||
sha512-arm64-y := sha512-glue.o sha512-core.o
|
sha512-arm64-y := sha512-glue.o sha512-core.o
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
|
||||||
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
|
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_NEON) += poly1305-neon.o
|
||||||
|
poly1305-neon-y := poly1305-core.o poly1305-glue.o
|
||||||
|
AFLAGS_poly1305-core.o += -Dpoly1305_init=poly1305_init_arm64
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
|
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
|
||||||
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
|
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o
|
||||||
|
@ -71,6 +75,9 @@ ifdef REGENERATE_ARM64_CRYPTO
|
||||||
quiet_cmd_perlasm = PERLASM $@
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
cmd_perlasm = $(PERL) $(<) void $(@)
|
cmd_perlasm = $(PERL) $(<) void $(@)
|
||||||
|
|
||||||
|
$(src)/poly1305-core.S_shipped: $(src)/poly1305-armv8.pl
|
||||||
|
$(call cmd,perlasm)
|
||||||
|
|
||||||
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
|
$(src)/sha256-core.S_shipped: $(src)/sha512-armv8.pl
|
||||||
$(call cmd,perlasm)
|
$(call cmd,perlasm)
|
||||||
|
|
||||||
|
@ -78,4 +85,4 @@ $(src)/sha512-core.S_shipped: $(src)/sha512-armv8.pl
|
||||||
$(call cmd,perlasm)
|
$(call cmd,perlasm)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
targets += sha256-core.S sha512-core.S
|
targets += poly1305-core.S sha256-core.S sha512-core.S
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
/*
|
/*
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
* ChaCha/XChaCha NEON helper functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
* published by the Free Software Foundation.
|
* published by the Free Software Foundation.
|
||||||
*
|
*
|
||||||
* Based on:
|
* Originally based on:
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2015 Martin Willi
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
@ -19,29 +19,27 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/assembler.h>
|
||||||
|
#include <asm/cache.h>
|
||||||
|
|
||||||
.text
|
.text
|
||||||
.align 6
|
.align 6
|
||||||
|
|
||||||
ENTRY(chacha20_block_xor_neon)
|
/*
|
||||||
// x0: Input state matrix, s
|
* chacha_permute - permute one block
|
||||||
// x1: 1 data block output, o
|
*
|
||||||
// x2: 1 data block input, i
|
* Permute one 64-byte block where the state matrix is stored in the four NEON
|
||||||
|
* registers v0-v3. It performs matrix operations on four words in parallel,
|
||||||
|
* but requires shuffling to rearrange the words after each round.
|
||||||
|
*
|
||||||
|
* The round count is given in w3.
|
||||||
|
*
|
||||||
|
* Clobbers: w3, x10, v4, v12
|
||||||
|
*/
|
||||||
|
chacha_permute:
|
||||||
|
|
||||||
//
|
adr_l x10, ROT8
|
||||||
// This function encrypts one ChaCha20 block by loading the state matrix
|
ld1 {v12.4s}, [x10]
|
||||||
// in four NEON registers. It performs matrix operation on four words in
|
|
||||||
// parallel, but requires shuffling to rearrange the words after each
|
|
||||||
// round.
|
|
||||||
//
|
|
||||||
|
|
||||||
// x0..3 = s0..3
|
|
||||||
adr x3, ROT8
|
|
||||||
ld1 {v0.4s-v3.4s}, [x0]
|
|
||||||
ld1 {v8.4s-v11.4s}, [x0]
|
|
||||||
ld1 {v12.4s}, [x3]
|
|
||||||
|
|
||||||
mov x3, #10
|
|
||||||
|
|
||||||
.Ldoubleround:
|
.Ldoubleround:
|
||||||
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
@ -102,9 +100,27 @@ ENTRY(chacha20_block_xor_neon)
|
||||||
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
ext v3.16b, v3.16b, v3.16b, #4
|
ext v3.16b, v3.16b, v3.16b, #4
|
||||||
|
|
||||||
subs x3, x3, #1
|
subs w3, w3, #2
|
||||||
b.ne .Ldoubleround
|
b.ne .Ldoubleround
|
||||||
|
|
||||||
|
ret
|
||||||
|
ENDPROC(chacha_permute)
|
||||||
|
|
||||||
|
ENTRY(chacha_block_xor_neon)
|
||||||
|
// x0: Input state matrix, s
|
||||||
|
// x1: 1 data block output, o
|
||||||
|
// x2: 1 data block input, i
|
||||||
|
// w3: nrounds
|
||||||
|
|
||||||
|
stp x29, x30, [sp, #-16]!
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
// x0..3 = s0..3
|
||||||
|
ld1 {v0.4s-v3.4s}, [x0]
|
||||||
|
ld1 {v8.4s-v11.4s}, [x0]
|
||||||
|
|
||||||
|
bl chacha_permute
|
||||||
|
|
||||||
ld1 {v4.16b-v7.16b}, [x2]
|
ld1 {v4.16b-v7.16b}, [x2]
|
||||||
|
|
||||||
// o0 = i0 ^ (x0 + s0)
|
// o0 = i0 ^ (x0 + s0)
|
||||||
|
@ -125,71 +141,156 @@ ENTRY(chacha20_block_xor_neon)
|
||||||
|
|
||||||
st1 {v0.16b-v3.16b}, [x1]
|
st1 {v0.16b-v3.16b}, [x1]
|
||||||
|
|
||||||
|
ldp x29, x30, [sp], #16
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_block_xor_neon)
|
ENDPROC(chacha_block_xor_neon)
|
||||||
|
|
||||||
|
ENTRY(hchacha_block_neon)
|
||||||
|
// x0: Input state matrix, s
|
||||||
|
// x1: output (8 32-bit words)
|
||||||
|
// w2: nrounds
|
||||||
|
|
||||||
|
stp x29, x30, [sp, #-16]!
|
||||||
|
mov x29, sp
|
||||||
|
|
||||||
|
ld1 {v0.4s-v3.4s}, [x0]
|
||||||
|
|
||||||
|
mov w3, w2
|
||||||
|
bl chacha_permute
|
||||||
|
|
||||||
|
st1 {v0.4s}, [x1], #16
|
||||||
|
st1 {v3.4s}, [x1]
|
||||||
|
|
||||||
|
ldp x29, x30, [sp], #16
|
||||||
|
ret
|
||||||
|
ENDPROC(hchacha_block_neon)
|
||||||
|
|
||||||
|
a0 .req w12
|
||||||
|
a1 .req w13
|
||||||
|
a2 .req w14
|
||||||
|
a3 .req w15
|
||||||
|
a4 .req w16
|
||||||
|
a5 .req w17
|
||||||
|
a6 .req w19
|
||||||
|
a7 .req w20
|
||||||
|
a8 .req w21
|
||||||
|
a9 .req w22
|
||||||
|
a10 .req w23
|
||||||
|
a11 .req w24
|
||||||
|
a12 .req w25
|
||||||
|
a13 .req w26
|
||||||
|
a14 .req w27
|
||||||
|
a15 .req w28
|
||||||
|
|
||||||
.align 6
|
.align 6
|
||||||
ENTRY(chacha20_4block_xor_neon)
|
ENTRY(chacha_4block_xor_neon)
|
||||||
|
frame_push 10
|
||||||
|
|
||||||
// x0: Input state matrix, s
|
// x0: Input state matrix, s
|
||||||
// x1: 4 data blocks output, o
|
// x1: 4 data blocks output, o
|
||||||
// x2: 4 data blocks input, i
|
// x2: 4 data blocks input, i
|
||||||
|
// w3: nrounds
|
||||||
|
// x4: byte count
|
||||||
|
|
||||||
|
adr_l x10, .Lpermute
|
||||||
|
and x5, x4, #63
|
||||||
|
add x10, x10, x5
|
||||||
|
add x11, x10, #64
|
||||||
|
|
||||||
//
|
//
|
||||||
// This function encrypts four consecutive ChaCha20 blocks by loading
|
// This function encrypts four consecutive ChaCha blocks by loading
|
||||||
// the state matrix in NEON registers four times. The algorithm performs
|
// the state matrix in NEON registers four times. The algorithm performs
|
||||||
// each operation on the corresponding word of each state matrix, hence
|
// each operation on the corresponding word of each state matrix, hence
|
||||||
// requires no word shuffling. For final XORing step we transpose the
|
// requires no word shuffling. For final XORing step we transpose the
|
||||||
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
// matrix by interleaving 32- and then 64-bit words, which allows us to
|
||||||
// do XOR in NEON registers.
|
// do XOR in NEON registers.
|
||||||
//
|
//
|
||||||
adr x3, CTRINC // ... and ROT8
|
// At the same time, a fifth block is encrypted in parallel using
|
||||||
ld1 {v30.4s-v31.4s}, [x3]
|
// scalar registers
|
||||||
|
//
|
||||||
|
adr_l x9, CTRINC // ... and ROT8
|
||||||
|
ld1 {v30.4s-v31.4s}, [x9]
|
||||||
|
|
||||||
// x0..15[0-3] = s0..3[0..3]
|
// x0..15[0-3] = s0..3[0..3]
|
||||||
mov x4, x0
|
add x8, x0, #16
|
||||||
ld4r { v0.4s- v3.4s}, [x4], #16
|
ld4r { v0.4s- v3.4s}, [x0]
|
||||||
ld4r { v4.4s- v7.4s}, [x4], #16
|
ld4r { v4.4s- v7.4s}, [x8], #16
|
||||||
ld4r { v8.4s-v11.4s}, [x4], #16
|
ld4r { v8.4s-v11.4s}, [x8], #16
|
||||||
ld4r {v12.4s-v15.4s}, [x4]
|
ld4r {v12.4s-v15.4s}, [x8]
|
||||||
|
|
||||||
// x12 += counter values 0-3
|
mov a0, v0.s[0]
|
||||||
|
mov a1, v1.s[0]
|
||||||
|
mov a2, v2.s[0]
|
||||||
|
mov a3, v3.s[0]
|
||||||
|
mov a4, v4.s[0]
|
||||||
|
mov a5, v5.s[0]
|
||||||
|
mov a6, v6.s[0]
|
||||||
|
mov a7, v7.s[0]
|
||||||
|
mov a8, v8.s[0]
|
||||||
|
mov a9, v9.s[0]
|
||||||
|
mov a10, v10.s[0]
|
||||||
|
mov a11, v11.s[0]
|
||||||
|
mov a12, v12.s[0]
|
||||||
|
mov a13, v13.s[0]
|
||||||
|
mov a14, v14.s[0]
|
||||||
|
mov a15, v15.s[0]
|
||||||
|
|
||||||
|
// x12 += counter values 1-4
|
||||||
add v12.4s, v12.4s, v30.4s
|
add v12.4s, v12.4s, v30.4s
|
||||||
|
|
||||||
mov x3, #10
|
|
||||||
|
|
||||||
.Ldoubleround4:
|
.Ldoubleround4:
|
||||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||||
add v0.4s, v0.4s, v4.4s
|
add v0.4s, v0.4s, v4.4s
|
||||||
|
add a0, a0, a4
|
||||||
add v1.4s, v1.4s, v5.4s
|
add v1.4s, v1.4s, v5.4s
|
||||||
|
add a1, a1, a5
|
||||||
add v2.4s, v2.4s, v6.4s
|
add v2.4s, v2.4s, v6.4s
|
||||||
|
add a2, a2, a6
|
||||||
add v3.4s, v3.4s, v7.4s
|
add v3.4s, v3.4s, v7.4s
|
||||||
|
add a3, a3, a7
|
||||||
|
|
||||||
eor v12.16b, v12.16b, v0.16b
|
eor v12.16b, v12.16b, v0.16b
|
||||||
|
eor a12, a12, a0
|
||||||
eor v13.16b, v13.16b, v1.16b
|
eor v13.16b, v13.16b, v1.16b
|
||||||
|
eor a13, a13, a1
|
||||||
eor v14.16b, v14.16b, v2.16b
|
eor v14.16b, v14.16b, v2.16b
|
||||||
|
eor a14, a14, a2
|
||||||
eor v15.16b, v15.16b, v3.16b
|
eor v15.16b, v15.16b, v3.16b
|
||||||
|
eor a15, a15, a3
|
||||||
|
|
||||||
rev32 v12.8h, v12.8h
|
rev32 v12.8h, v12.8h
|
||||||
|
ror a12, a12, #16
|
||||||
rev32 v13.8h, v13.8h
|
rev32 v13.8h, v13.8h
|
||||||
|
ror a13, a13, #16
|
||||||
rev32 v14.8h, v14.8h
|
rev32 v14.8h, v14.8h
|
||||||
|
ror a14, a14, #16
|
||||||
rev32 v15.8h, v15.8h
|
rev32 v15.8h, v15.8h
|
||||||
|
ror a15, a15, #16
|
||||||
|
|
||||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||||
add v8.4s, v8.4s, v12.4s
|
add v8.4s, v8.4s, v12.4s
|
||||||
|
add a8, a8, a12
|
||||||
add v9.4s, v9.4s, v13.4s
|
add v9.4s, v9.4s, v13.4s
|
||||||
|
add a9, a9, a13
|
||||||
add v10.4s, v10.4s, v14.4s
|
add v10.4s, v10.4s, v14.4s
|
||||||
|
add a10, a10, a14
|
||||||
add v11.4s, v11.4s, v15.4s
|
add v11.4s, v11.4s, v15.4s
|
||||||
|
add a11, a11, a15
|
||||||
|
|
||||||
eor v16.16b, v4.16b, v8.16b
|
eor v16.16b, v4.16b, v8.16b
|
||||||
|
eor a4, a4, a8
|
||||||
eor v17.16b, v5.16b, v9.16b
|
eor v17.16b, v5.16b, v9.16b
|
||||||
|
eor a5, a5, a9
|
||||||
eor v18.16b, v6.16b, v10.16b
|
eor v18.16b, v6.16b, v10.16b
|
||||||
|
eor a6, a6, a10
|
||||||
eor v19.16b, v7.16b, v11.16b
|
eor v19.16b, v7.16b, v11.16b
|
||||||
|
eor a7, a7, a11
|
||||||
|
|
||||||
shl v4.4s, v16.4s, #12
|
shl v4.4s, v16.4s, #12
|
||||||
shl v5.4s, v17.4s, #12
|
shl v5.4s, v17.4s, #12
|
||||||
|
@ -197,42 +298,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v7.4s, v19.4s, #12
|
shl v7.4s, v19.4s, #12
|
||||||
|
|
||||||
sri v4.4s, v16.4s, #20
|
sri v4.4s, v16.4s, #20
|
||||||
|
ror a4, a4, #20
|
||||||
sri v5.4s, v17.4s, #20
|
sri v5.4s, v17.4s, #20
|
||||||
|
ror a5, a5, #20
|
||||||
sri v6.4s, v18.4s, #20
|
sri v6.4s, v18.4s, #20
|
||||||
|
ror a6, a6, #20
|
||||||
sri v7.4s, v19.4s, #20
|
sri v7.4s, v19.4s, #20
|
||||||
|
ror a7, a7, #20
|
||||||
|
|
||||||
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||||
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||||
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||||
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||||
add v0.4s, v0.4s, v4.4s
|
add v0.4s, v0.4s, v4.4s
|
||||||
|
add a0, a0, a4
|
||||||
add v1.4s, v1.4s, v5.4s
|
add v1.4s, v1.4s, v5.4s
|
||||||
|
add a1, a1, a5
|
||||||
add v2.4s, v2.4s, v6.4s
|
add v2.4s, v2.4s, v6.4s
|
||||||
|
add a2, a2, a6
|
||||||
add v3.4s, v3.4s, v7.4s
|
add v3.4s, v3.4s, v7.4s
|
||||||
|
add a3, a3, a7
|
||||||
|
|
||||||
eor v12.16b, v12.16b, v0.16b
|
eor v12.16b, v12.16b, v0.16b
|
||||||
|
eor a12, a12, a0
|
||||||
eor v13.16b, v13.16b, v1.16b
|
eor v13.16b, v13.16b, v1.16b
|
||||||
|
eor a13, a13, a1
|
||||||
eor v14.16b, v14.16b, v2.16b
|
eor v14.16b, v14.16b, v2.16b
|
||||||
|
eor a14, a14, a2
|
||||||
eor v15.16b, v15.16b, v3.16b
|
eor v15.16b, v15.16b, v3.16b
|
||||||
|
eor a15, a15, a3
|
||||||
|
|
||||||
tbl v12.16b, {v12.16b}, v31.16b
|
tbl v12.16b, {v12.16b}, v31.16b
|
||||||
|
ror a12, a12, #24
|
||||||
tbl v13.16b, {v13.16b}, v31.16b
|
tbl v13.16b, {v13.16b}, v31.16b
|
||||||
|
ror a13, a13, #24
|
||||||
tbl v14.16b, {v14.16b}, v31.16b
|
tbl v14.16b, {v14.16b}, v31.16b
|
||||||
|
ror a14, a14, #24
|
||||||
tbl v15.16b, {v15.16b}, v31.16b
|
tbl v15.16b, {v15.16b}, v31.16b
|
||||||
|
ror a15, a15, #24
|
||||||
|
|
||||||
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||||
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||||
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||||
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||||
add v8.4s, v8.4s, v12.4s
|
add v8.4s, v8.4s, v12.4s
|
||||||
|
add a8, a8, a12
|
||||||
add v9.4s, v9.4s, v13.4s
|
add v9.4s, v9.4s, v13.4s
|
||||||
|
add a9, a9, a13
|
||||||
add v10.4s, v10.4s, v14.4s
|
add v10.4s, v10.4s, v14.4s
|
||||||
|
add a10, a10, a14
|
||||||
add v11.4s, v11.4s, v15.4s
|
add v11.4s, v11.4s, v15.4s
|
||||||
|
add a11, a11, a15
|
||||||
|
|
||||||
eor v16.16b, v4.16b, v8.16b
|
eor v16.16b, v4.16b, v8.16b
|
||||||
|
eor a4, a4, a8
|
||||||
eor v17.16b, v5.16b, v9.16b
|
eor v17.16b, v5.16b, v9.16b
|
||||||
|
eor a5, a5, a9
|
||||||
eor v18.16b, v6.16b, v10.16b
|
eor v18.16b, v6.16b, v10.16b
|
||||||
|
eor a6, a6, a10
|
||||||
eor v19.16b, v7.16b, v11.16b
|
eor v19.16b, v7.16b, v11.16b
|
||||||
|
eor a7, a7, a11
|
||||||
|
|
||||||
shl v4.4s, v16.4s, #7
|
shl v4.4s, v16.4s, #7
|
||||||
shl v5.4s, v17.4s, #7
|
shl v5.4s, v17.4s, #7
|
||||||
|
@ -240,42 +365,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v7.4s, v19.4s, #7
|
shl v7.4s, v19.4s, #7
|
||||||
|
|
||||||
sri v4.4s, v16.4s, #25
|
sri v4.4s, v16.4s, #25
|
||||||
|
ror a4, a4, #25
|
||||||
sri v5.4s, v17.4s, #25
|
sri v5.4s, v17.4s, #25
|
||||||
|
ror a5, a5, #25
|
||||||
sri v6.4s, v18.4s, #25
|
sri v6.4s, v18.4s, #25
|
||||||
|
ror a6, a6, #25
|
||||||
sri v7.4s, v19.4s, #25
|
sri v7.4s, v19.4s, #25
|
||||||
|
ror a7, a7, #25
|
||||||
|
|
||||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||||
add v0.4s, v0.4s, v5.4s
|
add v0.4s, v0.4s, v5.4s
|
||||||
|
add a0, a0, a5
|
||||||
add v1.4s, v1.4s, v6.4s
|
add v1.4s, v1.4s, v6.4s
|
||||||
|
add a1, a1, a6
|
||||||
add v2.4s, v2.4s, v7.4s
|
add v2.4s, v2.4s, v7.4s
|
||||||
|
add a2, a2, a7
|
||||||
add v3.4s, v3.4s, v4.4s
|
add v3.4s, v3.4s, v4.4s
|
||||||
|
add a3, a3, a4
|
||||||
|
|
||||||
eor v15.16b, v15.16b, v0.16b
|
eor v15.16b, v15.16b, v0.16b
|
||||||
|
eor a15, a15, a0
|
||||||
eor v12.16b, v12.16b, v1.16b
|
eor v12.16b, v12.16b, v1.16b
|
||||||
|
eor a12, a12, a1
|
||||||
eor v13.16b, v13.16b, v2.16b
|
eor v13.16b, v13.16b, v2.16b
|
||||||
|
eor a13, a13, a2
|
||||||
eor v14.16b, v14.16b, v3.16b
|
eor v14.16b, v14.16b, v3.16b
|
||||||
|
eor a14, a14, a3
|
||||||
|
|
||||||
rev32 v15.8h, v15.8h
|
rev32 v15.8h, v15.8h
|
||||||
|
ror a15, a15, #16
|
||||||
rev32 v12.8h, v12.8h
|
rev32 v12.8h, v12.8h
|
||||||
|
ror a12, a12, #16
|
||||||
rev32 v13.8h, v13.8h
|
rev32 v13.8h, v13.8h
|
||||||
|
ror a13, a13, #16
|
||||||
rev32 v14.8h, v14.8h
|
rev32 v14.8h, v14.8h
|
||||||
|
ror a14, a14, #16
|
||||||
|
|
||||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||||
add v10.4s, v10.4s, v15.4s
|
add v10.4s, v10.4s, v15.4s
|
||||||
|
add a10, a10, a15
|
||||||
add v11.4s, v11.4s, v12.4s
|
add v11.4s, v11.4s, v12.4s
|
||||||
|
add a11, a11, a12
|
||||||
add v8.4s, v8.4s, v13.4s
|
add v8.4s, v8.4s, v13.4s
|
||||||
|
add a8, a8, a13
|
||||||
add v9.4s, v9.4s, v14.4s
|
add v9.4s, v9.4s, v14.4s
|
||||||
|
add a9, a9, a14
|
||||||
|
|
||||||
eor v16.16b, v5.16b, v10.16b
|
eor v16.16b, v5.16b, v10.16b
|
||||||
|
eor a5, a5, a10
|
||||||
eor v17.16b, v6.16b, v11.16b
|
eor v17.16b, v6.16b, v11.16b
|
||||||
|
eor a6, a6, a11
|
||||||
eor v18.16b, v7.16b, v8.16b
|
eor v18.16b, v7.16b, v8.16b
|
||||||
|
eor a7, a7, a8
|
||||||
eor v19.16b, v4.16b, v9.16b
|
eor v19.16b, v4.16b, v9.16b
|
||||||
|
eor a4, a4, a9
|
||||||
|
|
||||||
shl v5.4s, v16.4s, #12
|
shl v5.4s, v16.4s, #12
|
||||||
shl v6.4s, v17.4s, #12
|
shl v6.4s, v17.4s, #12
|
||||||
|
@ -283,42 +432,66 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v4.4s, v19.4s, #12
|
shl v4.4s, v19.4s, #12
|
||||||
|
|
||||||
sri v5.4s, v16.4s, #20
|
sri v5.4s, v16.4s, #20
|
||||||
|
ror a5, a5, #20
|
||||||
sri v6.4s, v17.4s, #20
|
sri v6.4s, v17.4s, #20
|
||||||
|
ror a6, a6, #20
|
||||||
sri v7.4s, v18.4s, #20
|
sri v7.4s, v18.4s, #20
|
||||||
|
ror a7, a7, #20
|
||||||
sri v4.4s, v19.4s, #20
|
sri v4.4s, v19.4s, #20
|
||||||
|
ror a4, a4, #20
|
||||||
|
|
||||||
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||||
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||||
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||||
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||||
add v0.4s, v0.4s, v5.4s
|
add v0.4s, v0.4s, v5.4s
|
||||||
|
add a0, a0, a5
|
||||||
add v1.4s, v1.4s, v6.4s
|
add v1.4s, v1.4s, v6.4s
|
||||||
|
add a1, a1, a6
|
||||||
add v2.4s, v2.4s, v7.4s
|
add v2.4s, v2.4s, v7.4s
|
||||||
|
add a2, a2, a7
|
||||||
add v3.4s, v3.4s, v4.4s
|
add v3.4s, v3.4s, v4.4s
|
||||||
|
add a3, a3, a4
|
||||||
|
|
||||||
eor v15.16b, v15.16b, v0.16b
|
eor v15.16b, v15.16b, v0.16b
|
||||||
|
eor a15, a15, a0
|
||||||
eor v12.16b, v12.16b, v1.16b
|
eor v12.16b, v12.16b, v1.16b
|
||||||
|
eor a12, a12, a1
|
||||||
eor v13.16b, v13.16b, v2.16b
|
eor v13.16b, v13.16b, v2.16b
|
||||||
|
eor a13, a13, a2
|
||||||
eor v14.16b, v14.16b, v3.16b
|
eor v14.16b, v14.16b, v3.16b
|
||||||
|
eor a14, a14, a3
|
||||||
|
|
||||||
tbl v15.16b, {v15.16b}, v31.16b
|
tbl v15.16b, {v15.16b}, v31.16b
|
||||||
|
ror a15, a15, #24
|
||||||
tbl v12.16b, {v12.16b}, v31.16b
|
tbl v12.16b, {v12.16b}, v31.16b
|
||||||
|
ror a12, a12, #24
|
||||||
tbl v13.16b, {v13.16b}, v31.16b
|
tbl v13.16b, {v13.16b}, v31.16b
|
||||||
|
ror a13, a13, #24
|
||||||
tbl v14.16b, {v14.16b}, v31.16b
|
tbl v14.16b, {v14.16b}, v31.16b
|
||||||
|
ror a14, a14, #24
|
||||||
|
|
||||||
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||||
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||||
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||||
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||||
add v10.4s, v10.4s, v15.4s
|
add v10.4s, v10.4s, v15.4s
|
||||||
|
add a10, a10, a15
|
||||||
add v11.4s, v11.4s, v12.4s
|
add v11.4s, v11.4s, v12.4s
|
||||||
|
add a11, a11, a12
|
||||||
add v8.4s, v8.4s, v13.4s
|
add v8.4s, v8.4s, v13.4s
|
||||||
|
add a8, a8, a13
|
||||||
add v9.4s, v9.4s, v14.4s
|
add v9.4s, v9.4s, v14.4s
|
||||||
|
add a9, a9, a14
|
||||||
|
|
||||||
eor v16.16b, v5.16b, v10.16b
|
eor v16.16b, v5.16b, v10.16b
|
||||||
|
eor a5, a5, a10
|
||||||
eor v17.16b, v6.16b, v11.16b
|
eor v17.16b, v6.16b, v11.16b
|
||||||
|
eor a6, a6, a11
|
||||||
eor v18.16b, v7.16b, v8.16b
|
eor v18.16b, v7.16b, v8.16b
|
||||||
|
eor a7, a7, a8
|
||||||
eor v19.16b, v4.16b, v9.16b
|
eor v19.16b, v4.16b, v9.16b
|
||||||
|
eor a4, a4, a9
|
||||||
|
|
||||||
shl v5.4s, v16.4s, #7
|
shl v5.4s, v16.4s, #7
|
||||||
shl v6.4s, v17.4s, #7
|
shl v6.4s, v17.4s, #7
|
||||||
|
@ -326,11 +499,15 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
shl v4.4s, v19.4s, #7
|
shl v4.4s, v19.4s, #7
|
||||||
|
|
||||||
sri v5.4s, v16.4s, #25
|
sri v5.4s, v16.4s, #25
|
||||||
|
ror a5, a5, #25
|
||||||
sri v6.4s, v17.4s, #25
|
sri v6.4s, v17.4s, #25
|
||||||
|
ror a6, a6, #25
|
||||||
sri v7.4s, v18.4s, #25
|
sri v7.4s, v18.4s, #25
|
||||||
|
ror a7, a7, #25
|
||||||
sri v4.4s, v19.4s, #25
|
sri v4.4s, v19.4s, #25
|
||||||
|
ror a4, a4, #25
|
||||||
|
|
||||||
subs x3, x3, #1
|
subs w3, w3, #2
|
||||||
b.ne .Ldoubleround4
|
b.ne .Ldoubleround4
|
||||||
|
|
||||||
ld4r {v16.4s-v19.4s}, [x0], #16
|
ld4r {v16.4s-v19.4s}, [x0], #16
|
||||||
|
@ -344,9 +521,21 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
// x2[0-3] += s0[2]
|
// x2[0-3] += s0[2]
|
||||||
// x3[0-3] += s0[3]
|
// x3[0-3] += s0[3]
|
||||||
add v0.4s, v0.4s, v16.4s
|
add v0.4s, v0.4s, v16.4s
|
||||||
|
mov w6, v16.s[0]
|
||||||
|
mov w7, v17.s[0]
|
||||||
add v1.4s, v1.4s, v17.4s
|
add v1.4s, v1.4s, v17.4s
|
||||||
|
mov w8, v18.s[0]
|
||||||
|
mov w9, v19.s[0]
|
||||||
add v2.4s, v2.4s, v18.4s
|
add v2.4s, v2.4s, v18.4s
|
||||||
|
add a0, a0, w6
|
||||||
|
add a1, a1, w7
|
||||||
add v3.4s, v3.4s, v19.4s
|
add v3.4s, v3.4s, v19.4s
|
||||||
|
add a2, a2, w8
|
||||||
|
add a3, a3, w9
|
||||||
|
CPU_BE( rev a0, a0 )
|
||||||
|
CPU_BE( rev a1, a1 )
|
||||||
|
CPU_BE( rev a2, a2 )
|
||||||
|
CPU_BE( rev a3, a3 )
|
||||||
|
|
||||||
ld4r {v24.4s-v27.4s}, [x0], #16
|
ld4r {v24.4s-v27.4s}, [x0], #16
|
||||||
ld4r {v28.4s-v31.4s}, [x0]
|
ld4r {v28.4s-v31.4s}, [x0]
|
||||||
|
@ -356,95 +545,316 @@ ENTRY(chacha20_4block_xor_neon)
|
||||||
// x6[0-3] += s1[2]
|
// x6[0-3] += s1[2]
|
||||||
// x7[0-3] += s1[3]
|
// x7[0-3] += s1[3]
|
||||||
add v4.4s, v4.4s, v20.4s
|
add v4.4s, v4.4s, v20.4s
|
||||||
|
mov w6, v20.s[0]
|
||||||
|
mov w7, v21.s[0]
|
||||||
add v5.4s, v5.4s, v21.4s
|
add v5.4s, v5.4s, v21.4s
|
||||||
|
mov w8, v22.s[0]
|
||||||
|
mov w9, v23.s[0]
|
||||||
add v6.4s, v6.4s, v22.4s
|
add v6.4s, v6.4s, v22.4s
|
||||||
|
add a4, a4, w6
|
||||||
|
add a5, a5, w7
|
||||||
add v7.4s, v7.4s, v23.4s
|
add v7.4s, v7.4s, v23.4s
|
||||||
|
add a6, a6, w8
|
||||||
|
add a7, a7, w9
|
||||||
|
CPU_BE( rev a4, a4 )
|
||||||
|
CPU_BE( rev a5, a5 )
|
||||||
|
CPU_BE( rev a6, a6 )
|
||||||
|
CPU_BE( rev a7, a7 )
|
||||||
|
|
||||||
// x8[0-3] += s2[0]
|
// x8[0-3] += s2[0]
|
||||||
// x9[0-3] += s2[1]
|
// x9[0-3] += s2[1]
|
||||||
// x10[0-3] += s2[2]
|
// x10[0-3] += s2[2]
|
||||||
// x11[0-3] += s2[3]
|
// x11[0-3] += s2[3]
|
||||||
add v8.4s, v8.4s, v24.4s
|
add v8.4s, v8.4s, v24.4s
|
||||||
|
mov w6, v24.s[0]
|
||||||
|
mov w7, v25.s[0]
|
||||||
add v9.4s, v9.4s, v25.4s
|
add v9.4s, v9.4s, v25.4s
|
||||||
|
mov w8, v26.s[0]
|
||||||
|
mov w9, v27.s[0]
|
||||||
add v10.4s, v10.4s, v26.4s
|
add v10.4s, v10.4s, v26.4s
|
||||||
|
add a8, a8, w6
|
||||||
|
add a9, a9, w7
|
||||||
add v11.4s, v11.4s, v27.4s
|
add v11.4s, v11.4s, v27.4s
|
||||||
|
add a10, a10, w8
|
||||||
|
add a11, a11, w9
|
||||||
|
CPU_BE( rev a8, a8 )
|
||||||
|
CPU_BE( rev a9, a9 )
|
||||||
|
CPU_BE( rev a10, a10 )
|
||||||
|
CPU_BE( rev a11, a11 )
|
||||||
|
|
||||||
// x12[0-3] += s3[0]
|
// x12[0-3] += s3[0]
|
||||||
// x13[0-3] += s3[1]
|
// x13[0-3] += s3[1]
|
||||||
// x14[0-3] += s3[2]
|
// x14[0-3] += s3[2]
|
||||||
// x15[0-3] += s3[3]
|
// x15[0-3] += s3[3]
|
||||||
add v12.4s, v12.4s, v28.4s
|
add v12.4s, v12.4s, v28.4s
|
||||||
|
mov w6, v28.s[0]
|
||||||
|
mov w7, v29.s[0]
|
||||||
add v13.4s, v13.4s, v29.4s
|
add v13.4s, v13.4s, v29.4s
|
||||||
|
mov w8, v30.s[0]
|
||||||
|
mov w9, v31.s[0]
|
||||||
add v14.4s, v14.4s, v30.4s
|
add v14.4s, v14.4s, v30.4s
|
||||||
|
add a12, a12, w6
|
||||||
|
add a13, a13, w7
|
||||||
add v15.4s, v15.4s, v31.4s
|
add v15.4s, v15.4s, v31.4s
|
||||||
|
add a14, a14, w8
|
||||||
|
add a15, a15, w9
|
||||||
|
CPU_BE( rev a12, a12 )
|
||||||
|
CPU_BE( rev a13, a13 )
|
||||||
|
CPU_BE( rev a14, a14 )
|
||||||
|
CPU_BE( rev a15, a15 )
|
||||||
|
|
||||||
// interleave 32-bit words in state n, n+1
|
// interleave 32-bit words in state n, n+1
|
||||||
|
ldp w6, w7, [x2], #64
|
||||||
zip1 v16.4s, v0.4s, v1.4s
|
zip1 v16.4s, v0.4s, v1.4s
|
||||||
|
ldp w8, w9, [x2, #-56]
|
||||||
|
eor a0, a0, w6
|
||||||
zip2 v17.4s, v0.4s, v1.4s
|
zip2 v17.4s, v0.4s, v1.4s
|
||||||
|
eor a1, a1, w7
|
||||||
zip1 v18.4s, v2.4s, v3.4s
|
zip1 v18.4s, v2.4s, v3.4s
|
||||||
|
eor a2, a2, w8
|
||||||
zip2 v19.4s, v2.4s, v3.4s
|
zip2 v19.4s, v2.4s, v3.4s
|
||||||
|
eor a3, a3, w9
|
||||||
|
ldp w6, w7, [x2, #-48]
|
||||||
zip1 v20.4s, v4.4s, v5.4s
|
zip1 v20.4s, v4.4s, v5.4s
|
||||||
|
ldp w8, w9, [x2, #-40]
|
||||||
|
eor a4, a4, w6
|
||||||
zip2 v21.4s, v4.4s, v5.4s
|
zip2 v21.4s, v4.4s, v5.4s
|
||||||
|
eor a5, a5, w7
|
||||||
zip1 v22.4s, v6.4s, v7.4s
|
zip1 v22.4s, v6.4s, v7.4s
|
||||||
|
eor a6, a6, w8
|
||||||
zip2 v23.4s, v6.4s, v7.4s
|
zip2 v23.4s, v6.4s, v7.4s
|
||||||
|
eor a7, a7, w9
|
||||||
|
ldp w6, w7, [x2, #-32]
|
||||||
zip1 v24.4s, v8.4s, v9.4s
|
zip1 v24.4s, v8.4s, v9.4s
|
||||||
|
ldp w8, w9, [x2, #-24]
|
||||||
|
eor a8, a8, w6
|
||||||
zip2 v25.4s, v8.4s, v9.4s
|
zip2 v25.4s, v8.4s, v9.4s
|
||||||
|
eor a9, a9, w7
|
||||||
zip1 v26.4s, v10.4s, v11.4s
|
zip1 v26.4s, v10.4s, v11.4s
|
||||||
|
eor a10, a10, w8
|
||||||
zip2 v27.4s, v10.4s, v11.4s
|
zip2 v27.4s, v10.4s, v11.4s
|
||||||
|
eor a11, a11, w9
|
||||||
|
ldp w6, w7, [x2, #-16]
|
||||||
zip1 v28.4s, v12.4s, v13.4s
|
zip1 v28.4s, v12.4s, v13.4s
|
||||||
|
ldp w8, w9, [x2, #-8]
|
||||||
|
eor a12, a12, w6
|
||||||
zip2 v29.4s, v12.4s, v13.4s
|
zip2 v29.4s, v12.4s, v13.4s
|
||||||
|
eor a13, a13, w7
|
||||||
zip1 v30.4s, v14.4s, v15.4s
|
zip1 v30.4s, v14.4s, v15.4s
|
||||||
|
eor a14, a14, w8
|
||||||
zip2 v31.4s, v14.4s, v15.4s
|
zip2 v31.4s, v14.4s, v15.4s
|
||||||
|
eor a15, a15, w9
|
||||||
|
|
||||||
|
mov x3, #64
|
||||||
|
subs x5, x4, #128
|
||||||
|
add x6, x5, x2
|
||||||
|
csel x3, x3, xzr, ge
|
||||||
|
csel x2, x2, x6, ge
|
||||||
|
|
||||||
// interleave 64-bit words in state n, n+2
|
// interleave 64-bit words in state n, n+2
|
||||||
zip1 v0.2d, v16.2d, v18.2d
|
zip1 v0.2d, v16.2d, v18.2d
|
||||||
zip2 v4.2d, v16.2d, v18.2d
|
zip2 v4.2d, v16.2d, v18.2d
|
||||||
|
stp a0, a1, [x1], #64
|
||||||
zip1 v8.2d, v17.2d, v19.2d
|
zip1 v8.2d, v17.2d, v19.2d
|
||||||
zip2 v12.2d, v17.2d, v19.2d
|
zip2 v12.2d, v17.2d, v19.2d
|
||||||
ld1 {v16.16b-v19.16b}, [x2], #64
|
stp a2, a3, [x1, #-56]
|
||||||
|
ld1 {v16.16b-v19.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x6, x4, #192
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x7, x6, x2
|
||||||
|
csel x3, x3, xzr, eq
|
||||||
|
csel x2, x2, x7, eq
|
||||||
|
|
||||||
zip1 v1.2d, v20.2d, v22.2d
|
zip1 v1.2d, v20.2d, v22.2d
|
||||||
zip2 v5.2d, v20.2d, v22.2d
|
zip2 v5.2d, v20.2d, v22.2d
|
||||||
|
stp a4, a5, [x1, #-48]
|
||||||
zip1 v9.2d, v21.2d, v23.2d
|
zip1 v9.2d, v21.2d, v23.2d
|
||||||
zip2 v13.2d, v21.2d, v23.2d
|
zip2 v13.2d, v21.2d, v23.2d
|
||||||
ld1 {v20.16b-v23.16b}, [x2], #64
|
stp a6, a7, [x1, #-40]
|
||||||
|
ld1 {v20.16b-v23.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x7, x4, #256
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x8, x7, x2
|
||||||
|
csel x3, x3, xzr, eq
|
||||||
|
csel x2, x2, x8, eq
|
||||||
|
|
||||||
zip1 v2.2d, v24.2d, v26.2d
|
zip1 v2.2d, v24.2d, v26.2d
|
||||||
zip2 v6.2d, v24.2d, v26.2d
|
zip2 v6.2d, v24.2d, v26.2d
|
||||||
|
stp a8, a9, [x1, #-32]
|
||||||
zip1 v10.2d, v25.2d, v27.2d
|
zip1 v10.2d, v25.2d, v27.2d
|
||||||
zip2 v14.2d, v25.2d, v27.2d
|
zip2 v14.2d, v25.2d, v27.2d
|
||||||
ld1 {v24.16b-v27.16b}, [x2], #64
|
stp a10, a11, [x1, #-24]
|
||||||
|
ld1 {v24.16b-v27.16b}, [x2], x3
|
||||||
|
|
||||||
|
subs x8, x4, #320
|
||||||
|
ccmp x3, xzr, #4, lt
|
||||||
|
add x9, x8, x2
|
||||||
|
csel x2, x2, x9, eq
|
||||||
|
|
||||||
zip1 v3.2d, v28.2d, v30.2d
|
zip1 v3.2d, v28.2d, v30.2d
|
||||||
zip2 v7.2d, v28.2d, v30.2d
|
zip2 v7.2d, v28.2d, v30.2d
|
||||||
|
stp a12, a13, [x1, #-16]
|
||||||
zip1 v11.2d, v29.2d, v31.2d
|
zip1 v11.2d, v29.2d, v31.2d
|
||||||
zip2 v15.2d, v29.2d, v31.2d
|
zip2 v15.2d, v29.2d, v31.2d
|
||||||
|
stp a14, a15, [x1, #-8]
|
||||||
ld1 {v28.16b-v31.16b}, [x2]
|
ld1 {v28.16b-v31.16b}, [x2]
|
||||||
|
|
||||||
// xor with corresponding input, write to output
|
// xor with corresponding input, write to output
|
||||||
|
tbnz x5, #63, 0f
|
||||||
eor v16.16b, v16.16b, v0.16b
|
eor v16.16b, v16.16b, v0.16b
|
||||||
eor v17.16b, v17.16b, v1.16b
|
eor v17.16b, v17.16b, v1.16b
|
||||||
eor v18.16b, v18.16b, v2.16b
|
eor v18.16b, v18.16b, v2.16b
|
||||||
eor v19.16b, v19.16b, v3.16b
|
eor v19.16b, v19.16b, v3.16b
|
||||||
|
st1 {v16.16b-v19.16b}, [x1], #64
|
||||||
|
cbz x5, .Lout
|
||||||
|
|
||||||
|
tbnz x6, #63, 1f
|
||||||
eor v20.16b, v20.16b, v4.16b
|
eor v20.16b, v20.16b, v4.16b
|
||||||
eor v21.16b, v21.16b, v5.16b
|
eor v21.16b, v21.16b, v5.16b
|
||||||
st1 {v16.16b-v19.16b}, [x1], #64
|
|
||||||
eor v22.16b, v22.16b, v6.16b
|
eor v22.16b, v22.16b, v6.16b
|
||||||
eor v23.16b, v23.16b, v7.16b
|
eor v23.16b, v23.16b, v7.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1], #64
|
||||||
|
cbz x6, .Lout
|
||||||
|
|
||||||
|
tbnz x7, #63, 2f
|
||||||
eor v24.16b, v24.16b, v8.16b
|
eor v24.16b, v24.16b, v8.16b
|
||||||
eor v25.16b, v25.16b, v9.16b
|
eor v25.16b, v25.16b, v9.16b
|
||||||
st1 {v20.16b-v23.16b}, [x1], #64
|
|
||||||
eor v26.16b, v26.16b, v10.16b
|
eor v26.16b, v26.16b, v10.16b
|
||||||
eor v27.16b, v27.16b, v11.16b
|
eor v27.16b, v27.16b, v11.16b
|
||||||
eor v28.16b, v28.16b, v12.16b
|
|
||||||
st1 {v24.16b-v27.16b}, [x1], #64
|
st1 {v24.16b-v27.16b}, [x1], #64
|
||||||
|
cbz x7, .Lout
|
||||||
|
|
||||||
|
tbnz x8, #63, 3f
|
||||||
|
eor v28.16b, v28.16b, v12.16b
|
||||||
eor v29.16b, v29.16b, v13.16b
|
eor v29.16b, v29.16b, v13.16b
|
||||||
eor v30.16b, v30.16b, v14.16b
|
eor v30.16b, v30.16b, v14.16b
|
||||||
eor v31.16b, v31.16b, v15.16b
|
eor v31.16b, v31.16b, v15.16b
|
||||||
st1 {v28.16b-v31.16b}, [x1]
|
st1 {v28.16b-v31.16b}, [x1]
|
||||||
|
|
||||||
|
.Lout: frame_pop
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_4block_xor_neon)
|
|
||||||
|
|
||||||
CTRINC: .word 0, 1, 2, 3
|
// fewer than 128 bytes of in/output
|
||||||
|
0: ld1 {v8.16b}, [x10]
|
||||||
|
ld1 {v9.16b}, [x11]
|
||||||
|
movi v10.16b, #16
|
||||||
|
sub x2, x1, #64
|
||||||
|
add x1, x1, x5
|
||||||
|
ld1 {v16.16b-v19.16b}, [x2]
|
||||||
|
tbl v4.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v5.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v6.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v7.16b, {v0.16b-v3.16b}, v8.16b
|
||||||
|
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
|
||||||
|
eor v20.16b, v20.16b, v4.16b
|
||||||
|
eor v21.16b, v21.16b, v5.16b
|
||||||
|
eor v22.16b, v22.16b, v6.16b
|
||||||
|
eor v23.16b, v23.16b, v7.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 192 bytes of in/output
|
||||||
|
1: ld1 {v8.16b}, [x10]
|
||||||
|
ld1 {v9.16b}, [x11]
|
||||||
|
movi v10.16b, #16
|
||||||
|
add x1, x1, x6
|
||||||
|
tbl v0.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v20.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v1.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v21.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v2.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v22.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
add v8.16b, v8.16b, v10.16b
|
||||||
|
add v9.16b, v9.16b, v10.16b
|
||||||
|
tbl v3.16b, {v4.16b-v7.16b}, v8.16b
|
||||||
|
tbx v23.16b, {v16.16b-v19.16b}, v9.16b
|
||||||
|
|
||||||
|
eor v20.16b, v20.16b, v0.16b
|
||||||
|
eor v21.16b, v21.16b, v1.16b
|
||||||
|
eor v22.16b, v22.16b, v2.16b
|
||||||
|
eor v23.16b, v23.16b, v3.16b
|
||||||
|
st1 {v20.16b-v23.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 256 bytes of in/output
|
||||||
|
2: ld1 {v4.16b}, [x10]
|
||||||
|
ld1 {v5.16b}, [x11]
|
||||||
|
movi v6.16b, #16
|
||||||
|
add x1, x1, x7
|
||||||
|
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v24.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v1.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v25.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v2.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v26.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v3.16b, {v8.16b-v11.16b}, v4.16b
|
||||||
|
tbx v27.16b, {v20.16b-v23.16b}, v5.16b
|
||||||
|
|
||||||
|
eor v24.16b, v24.16b, v0.16b
|
||||||
|
eor v25.16b, v25.16b, v1.16b
|
||||||
|
eor v26.16b, v26.16b, v2.16b
|
||||||
|
eor v27.16b, v27.16b, v3.16b
|
||||||
|
st1 {v24.16b-v27.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
|
||||||
|
// fewer than 320 bytes of in/output
|
||||||
|
3: ld1 {v4.16b}, [x10]
|
||||||
|
ld1 {v5.16b}, [x11]
|
||||||
|
movi v6.16b, #16
|
||||||
|
add x1, x1, x8
|
||||||
|
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v28.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v1.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v29.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v2.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v30.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
add v4.16b, v4.16b, v6.16b
|
||||||
|
add v5.16b, v5.16b, v6.16b
|
||||||
|
tbl v3.16b, {v12.16b-v15.16b}, v4.16b
|
||||||
|
tbx v31.16b, {v24.16b-v27.16b}, v5.16b
|
||||||
|
|
||||||
|
eor v28.16b, v28.16b, v0.16b
|
||||||
|
eor v29.16b, v29.16b, v1.16b
|
||||||
|
eor v30.16b, v30.16b, v2.16b
|
||||||
|
eor v31.16b, v31.16b, v3.16b
|
||||||
|
st1 {v28.16b-v31.16b}, [x1]
|
||||||
|
b .Lout
|
||||||
|
ENDPROC(chacha_4block_xor_neon)
|
||||||
|
|
||||||
|
.section ".rodata", "a", %progbits
|
||||||
|
.align L1_CACHE_SHIFT
|
||||||
|
.Lpermute:
|
||||||
|
.set .Li, 0
|
||||||
|
.rept 192
|
||||||
|
.byte (.Li - 64)
|
||||||
|
.set .Li, .Li + 1
|
||||||
|
.endr
|
||||||
|
|
||||||
|
CTRINC: .word 1, 2, 3, 4
|
||||||
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
|
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
|
|
@ -1,8 +1,8 @@
|
||||||
/*
|
/*
|
||||||
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
|
* ARM NEON and scalar accelerated ChaCha and XChaCha stream ciphers,
|
||||||
* including ChaCha20 (RFC7539)
|
* including ChaCha20 (RFC7539)
|
||||||
*
|
*
|
||||||
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
*
|
*
|
||||||
* This program is free software; you can redistribute it and/or modify
|
* This program is free software; you can redistribute it and/or modify
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
* it under the terms of the GNU General Public License version 2 as
|
||||||
|
@ -20,8 +20,9 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
#include <crypto/algapi.h>
|
||||||
#include <crypto/chacha.h>
|
#include <crypto/internal/chacha.h>
|
||||||
#include <crypto/internal/skcipher.h>
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/module.h>
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
@ -29,40 +30,78 @@
|
||||||
#include <asm/neon.h>
|
#include <asm/neon.h>
|
||||||
#include <asm/simd.h>
|
#include <asm/simd.h>
|
||||||
|
|
||||||
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||||
int nrounds);
|
|
||||||
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
|
|
||||||
int nrounds);
|
int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
int nrounds, int bytes);
|
||||||
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
|
||||||
unsigned int bytes, int nrounds)
|
int bytes, int nrounds)
|
||||||
{
|
{
|
||||||
|
while (bytes > 0) {
|
||||||
|
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
|
||||||
|
|
||||||
|
if (l <= CHACHA_BLOCK_SIZE) {
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
u8 buf[CHACHA_BLOCK_SIZE];
|
||||||
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
memcpy(buf, src, l);
|
||||||
chacha_4block_xor_neon(state, dst, src, nrounds);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha_block_xor_neon(state, dst, src, nrounds);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha_block_xor_neon(state, buf, buf, nrounds);
|
chacha_block_xor_neon(state, buf, buf, nrounds);
|
||||||
memcpy(dst, buf, bytes);
|
memcpy(dst, buf, l);
|
||||||
|
state[12] += 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
chacha_4block_xor_neon(state, dst, src, nrounds, l);
|
||||||
|
bytes -= l;
|
||||||
|
src += l;
|
||||||
|
dst += l;
|
||||||
|
state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&have_neon) || !may_use_simd()) {
|
||||||
|
hchacha_block_generic(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_neon_begin();
|
||||||
|
hchacha_block_neon(state, stream, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&have_neon) || bytes <= CHACHA_BLOCK_SIZE ||
|
||||||
|
!may_use_simd())
|
||||||
|
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
chacha_doneon(state, dst, src, todo, nrounds);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
static int chacha_neon_stream_xor(struct skcipher_request *req,
|
static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||||
struct chacha_ctx *ctx, u8 *iv)
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
{
|
{
|
||||||
struct skcipher_walk walk;
|
struct skcipher_walk walk;
|
||||||
u32 state[16];
|
u32 state[16];
|
||||||
|
@ -70,18 +109,25 @@ static int chacha_neon_stream_xor(struct skcipher_request *req,
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, false);
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, iv);
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
while (walk.nbytes > 0) {
|
while (walk.nbytes > 0) {
|
||||||
unsigned int nbytes = walk.nbytes;
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
if (nbytes < walk.total)
|
if (nbytes < walk.total)
|
||||||
nbytes = round_down(nbytes, walk.stride);
|
nbytes = rounddown(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&have_neon) ||
|
||||||
|
!may_use_simd()) {
|
||||||
|
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
} else {
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
chacha_doneon(state, walk.dst.virt.addr,
|
||||||
nbytes, ctx->nrounds);
|
walk.src.virt.addr, nbytes, ctx->nrounds);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
|
}
|
||||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -93,9 +139,6 @@ static int chacha_neon(struct skcipher_request *req)
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
return chacha_neon_stream_xor(req, ctx, req->iv);
|
return chacha_neon_stream_xor(req, ctx, req->iv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -107,14 +150,8 @@ static int xchacha_neon(struct skcipher_request *req)
|
||||||
u32 state[16];
|
u32 state[16];
|
||||||
u8 real_iv[16];
|
u8 real_iv[16];
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
return crypto_xchacha_crypt(req);
|
hchacha_block_arch(state, subctx.key, ctx->nrounds);
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, req->iv);
|
|
||||||
|
|
||||||
kernel_neon_begin();
|
|
||||||
hchacha_block_neon(state, subctx.key, ctx->nrounds);
|
|
||||||
kernel_neon_end();
|
|
||||||
subctx.nrounds = ctx->nrounds;
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
memcpy(&real_iv[0], req->iv + 24, 8);
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
@ -135,8 +172,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha20_setkey,
|
.setkey = chacha20_setkey,
|
||||||
.encrypt = chacha_neon,
|
.encrypt = chacha_neon,
|
||||||
.decrypt = chacha_neon,
|
.decrypt = chacha_neon,
|
||||||
}, {
|
}, {
|
||||||
|
@ -151,8 +188,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = XCHACHA_IV_SIZE,
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha20_setkey,
|
.setkey = chacha20_setkey,
|
||||||
.encrypt = xchacha_neon,
|
.encrypt = xchacha_neon,
|
||||||
.decrypt = xchacha_neon,
|
.decrypt = xchacha_neon,
|
||||||
}, {
|
}, {
|
||||||
|
@ -167,8 +204,8 @@ static struct skcipher_alg algs[] = {
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
.ivsize = XCHACHA_IV_SIZE,
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
.walksize = 5 * CHACHA_BLOCK_SIZE,
|
||||||
.setkey = crypto_chacha12_setkey,
|
.setkey = chacha12_setkey,
|
||||||
.encrypt = xchacha_neon,
|
.encrypt = xchacha_neon,
|
||||||
.decrypt = xchacha_neon,
|
.decrypt = xchacha_neon,
|
||||||
}
|
}
|
||||||
|
@ -176,14 +213,18 @@ static struct skcipher_alg algs[] = {
|
||||||
|
|
||||||
static int __init chacha_simd_mod_init(void)
|
static int __init chacha_simd_mod_init(void)
|
||||||
{
|
{
|
||||||
if (!(elf_hwcap & HWCAP_NEON))
|
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||||
return -ENODEV;
|
return 0;
|
||||||
|
|
||||||
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
|
static_branch_enable(&have_neon);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
|
||||||
|
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __exit chacha_simd_mod_fini(void)
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
{
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) && (elf_hwcap & HWCAP_ASIMD))
|
||||||
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,133 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
|
|
||||||
*
|
|
||||||
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License version 2 as
|
|
||||||
* published by the Free Software Foundation.
|
|
||||||
*
|
|
||||||
* Based on:
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
|
||||||
#include <crypto/chacha.h>
|
|
||||||
#include <crypto/internal/skcipher.h>
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/module.h>
|
|
||||||
|
|
||||||
#include <asm/hwcap.h>
|
|
||||||
#include <asm/neon.h>
|
|
||||||
#include <asm/simd.h>
|
|
||||||
|
|
||||||
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
|
|
||||||
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
|
|
||||||
unsigned int bytes)
|
|
||||||
{
|
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
|
||||||
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
|
||||||
kernel_neon_begin();
|
|
||||||
chacha20_4block_xor_neon(state, dst, src);
|
|
||||||
kernel_neon_end();
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!bytes)
|
|
||||||
return;
|
|
||||||
|
|
||||||
kernel_neon_begin();
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_block_xor_neon(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha20_block_xor_neon(state, buf, buf);
|
|
||||||
memcpy(dst, buf, bytes);
|
|
||||||
}
|
|
||||||
kernel_neon_end();
|
|
||||||
}
|
|
||||||
|
|
||||||
static int chacha20_neon(struct skcipher_request *req)
|
|
||||||
{
|
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
||||||
struct skcipher_walk walk;
|
|
||||||
u32 state[16];
|
|
||||||
int err;
|
|
||||||
|
|
||||||
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, false);
|
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, walk.iv);
|
|
||||||
|
|
||||||
while (walk.nbytes > 0) {
|
|
||||||
unsigned int nbytes = walk.nbytes;
|
|
||||||
|
|
||||||
if (nbytes < walk.total)
|
|
||||||
nbytes = round_down(nbytes, walk.stride);
|
|
||||||
|
|
||||||
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
nbytes);
|
|
||||||
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct skcipher_alg alg = {
|
|
||||||
.base.cra_name = "chacha20",
|
|
||||||
.base.cra_driver_name = "chacha20-neon",
|
|
||||||
.base.cra_priority = 300,
|
|
||||||
.base.cra_blocksize = 1,
|
|
||||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
|
||||||
.base.cra_module = THIS_MODULE,
|
|
||||||
|
|
||||||
.min_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
|
||||||
.walksize = 4 * CHACHA_BLOCK_SIZE,
|
|
||||||
.setkey = crypto_chacha20_setkey,
|
|
||||||
.encrypt = chacha20_neon,
|
|
||||||
.decrypt = chacha20_neon,
|
|
||||||
};
|
|
||||||
|
|
||||||
static int __init chacha20_simd_mod_init(void)
|
|
||||||
{
|
|
||||||
if (!(elf_hwcap & HWCAP_ASIMD))
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
return crypto_register_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __exit chacha20_simd_mod_fini(void)
|
|
||||||
{
|
|
||||||
crypto_unregister_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
module_init(chacha20_simd_mod_init);
|
|
||||||
module_exit(chacha20_simd_mod_fini);
|
|
||||||
|
|
||||||
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
|
||||||
MODULE_LICENSE("GPL v2");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20");
|
|
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
913
arch/arm64/crypto/poly1305-armv8.pl
Normal file
|
@ -0,0 +1,913 @@
|
||||||
|
#!/usr/bin/env perl
|
||||||
|
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
|
||||||
|
#
|
||||||
|
# ====================================================================
|
||||||
|
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
|
||||||
|
# project.
|
||||||
|
# ====================================================================
|
||||||
|
#
|
||||||
|
# This module implements Poly1305 hash for ARMv8.
|
||||||
|
#
|
||||||
|
# June 2015
|
||||||
|
#
|
||||||
|
# Numbers are cycles per processed byte with poly1305_blocks alone.
|
||||||
|
#
|
||||||
|
# IALU/gcc-4.9 NEON
|
||||||
|
#
|
||||||
|
# Apple A7 1.86/+5% 0.72
|
||||||
|
# Cortex-A53 2.69/+58% 1.47
|
||||||
|
# Cortex-A57 2.70/+7% 1.14
|
||||||
|
# Denver 1.64/+50% 1.18(*)
|
||||||
|
# X-Gene 2.13/+68% 2.27
|
||||||
|
# Mongoose 1.77/+75% 1.12
|
||||||
|
# Kryo 2.70/+55% 1.13
|
||||||
|
# ThunderX2 1.17/+95% 1.36
|
||||||
|
#
|
||||||
|
# (*) estimate based on resources availability is less than 1.0,
|
||||||
|
# i.e. measured result is worse than expected, presumably binary
|
||||||
|
# translator is not almighty;
|
||||||
|
|
||||||
|
$flavour=shift;
|
||||||
|
$output=shift;
|
||||||
|
|
||||||
|
if ($flavour && $flavour ne "void") {
|
||||||
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||||
|
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
|
||||||
|
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
|
||||||
|
die "can't locate arm-xlate.pl";
|
||||||
|
|
||||||
|
open STDOUT,"| \"$^X\" $xlate $flavour $output";
|
||||||
|
} else {
|
||||||
|
open STDOUT,">$output";
|
||||||
|
}
|
||||||
|
|
||||||
|
my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3));
|
||||||
|
my ($mac,$nonce)=($inp,$len);
|
||||||
|
|
||||||
|
my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14));
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
# include "arm_arch.h"
|
||||||
|
.extern OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// forward "declarations" are required for Apple
|
||||||
|
.globl poly1305_blocks
|
||||||
|
.globl poly1305_emit
|
||||||
|
|
||||||
|
.globl poly1305_init
|
||||||
|
.type poly1305_init,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_init:
|
||||||
|
cmp $inp,xzr
|
||||||
|
stp xzr,xzr,[$ctx] // zero hash value
|
||||||
|
stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
|
||||||
|
csel x0,xzr,x0,eq
|
||||||
|
b.eq .Lno_key
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
adrp x17,OPENSSL_armcap_P
|
||||||
|
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ldp $r0,$r1,[$inp] // load key
|
||||||
|
mov $s1,#0xfffffffc0fffffff
|
||||||
|
movk $s1,#0x0fff,lsl#48
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $r0,$r0 // flip bytes
|
||||||
|
rev $r1,$r1
|
||||||
|
#endif
|
||||||
|
and $r0,$r0,$s1 // &=0ffffffc0fffffff
|
||||||
|
and $s1,$s1,#-4
|
||||||
|
and $r1,$r1,$s1 // &=0ffffffc0ffffffc
|
||||||
|
mov w#$s1,#-1
|
||||||
|
stp $r0,$r1,[$ctx,#32] // save key value
|
||||||
|
str w#$s1,[$ctx,#48] // impossible key power value
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
tst w17,#ARMV7_NEON
|
||||||
|
|
||||||
|
adr $d0,.Lpoly1305_blocks
|
||||||
|
adr $r0,.Lpoly1305_blocks_neon
|
||||||
|
adr $d1,.Lpoly1305_emit
|
||||||
|
|
||||||
|
csel $d0,$d0,$r0,eq
|
||||||
|
|
||||||
|
# ifdef __ILP32__
|
||||||
|
stp w#$d0,w#$d1,[$len]
|
||||||
|
# else
|
||||||
|
stp $d0,$d1,[$len]
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
mov x0,#1
|
||||||
|
.Lno_key:
|
||||||
|
ret
|
||||||
|
.size poly1305_init,.-poly1305_init
|
||||||
|
|
||||||
|
.type poly1305_blocks,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks:
|
||||||
|
.Lpoly1305_blocks:
|
||||||
|
ands $len,$len,#-16
|
||||||
|
b.eq .Lno_data
|
||||||
|
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash value
|
||||||
|
ldp $h2,x17,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr $d0,$h0,#32
|
||||||
|
mov w#$d1,w#$h0
|
||||||
|
lsr $d2,$h1,#32
|
||||||
|
mov w15,w#$h1
|
||||||
|
lsr x16,$h2,#32
|
||||||
|
#else
|
||||||
|
mov w#$d0,w#$h0
|
||||||
|
lsr $d1,$h0,#32
|
||||||
|
mov w#$d2,w#$h1
|
||||||
|
lsr x15,$h1,#32
|
||||||
|
mov w16,w#$h2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $d1,$d2,#12
|
||||||
|
adds $d0,$d0,$d2,lsl#52
|
||||||
|
add $d1,$d1,x15,lsl#14
|
||||||
|
adc $d1,$d1,xzr
|
||||||
|
lsr $d2,x16,#24
|
||||||
|
adds $d1,$d1,x16,lsl#40
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
|
||||||
|
cmp x17,#0 // is_base2_26?
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
csel $h0,$h0,$d0,eq // choose between radixes
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
csel $h2,$h2,$d2,eq
|
||||||
|
|
||||||
|
.Loop:
|
||||||
|
ldp $t0,$t1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $t0,$t0
|
||||||
|
rev $t1,$t1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$t0 // accumulate input
|
||||||
|
adcs $h1,$h1,$t1
|
||||||
|
|
||||||
|
mul $d0,$h0,$r0 // h0*r0
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
umulh $d1,$h0,$r0
|
||||||
|
|
||||||
|
mul $t0,$h1,$s1 // h1*5*r1
|
||||||
|
umulh $t1,$h1,$s1
|
||||||
|
|
||||||
|
adds $d0,$d0,$t0
|
||||||
|
mul $t0,$h0,$r1 // h0*r1
|
||||||
|
adc $d1,$d1,$t1
|
||||||
|
umulh $d2,$h0,$r1
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h1,$r0 // h1*r0
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
umulh $t1,$h1,$r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h2,$s1 // h2*5*r1
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
mul $t1,$h2,$r0 // h2*r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
|
||||||
|
and $t0,$d2,#-4 // final reduction
|
||||||
|
and $h2,$d2,#3
|
||||||
|
add $t0,$t0,$d2,lsr#2
|
||||||
|
adds $h0,$d0,$t0
|
||||||
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
|
cbnz $len,.Loop
|
||||||
|
|
||||||
|
stp $h0,$h1,[$ctx] // store hash value
|
||||||
|
stp $h2,xzr,[$ctx,#16] // [and clear is_base2_26]
|
||||||
|
|
||||||
|
.Lno_data:
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks,.-poly1305_blocks
|
||||||
|
|
||||||
|
.type poly1305_emit,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_emit:
|
||||||
|
.Lpoly1305_emit:
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash base 2^64
|
||||||
|
ldp $h2,$r0,[$ctx,#16] // [along with is_base2_26]
|
||||||
|
ldp $t0,$t1,[$nonce] // load nonce
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr $d0,$h0,#32
|
||||||
|
mov w#$d1,w#$h0
|
||||||
|
lsr $d2,$h1,#32
|
||||||
|
mov w15,w#$h1
|
||||||
|
lsr x16,$h2,#32
|
||||||
|
#else
|
||||||
|
mov w#$d0,w#$h0
|
||||||
|
lsr $d1,$h0,#32
|
||||||
|
mov w#$d2,w#$h1
|
||||||
|
lsr x15,$h1,#32
|
||||||
|
mov w16,w#$h2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $d0,$d0,$d1,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $d1,$d2,#12
|
||||||
|
adds $d0,$d0,$d2,lsl#52
|
||||||
|
add $d1,$d1,x15,lsl#14
|
||||||
|
adc $d1,$d1,xzr
|
||||||
|
lsr $d2,x16,#24
|
||||||
|
adds $d1,$d1,x16,lsl#40
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
|
||||||
|
cmp $r0,#0 // is_base2_26?
|
||||||
|
csel $h0,$h0,$d0,eq // choose between radixes
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
csel $h2,$h2,$d2,eq
|
||||||
|
|
||||||
|
adds $d0,$h0,#5 // compare to modulus
|
||||||
|
adcs $d1,$h1,xzr
|
||||||
|
adc $d2,$h2,xzr
|
||||||
|
|
||||||
|
tst $d2,#-4 // see if it's carried/borrowed
|
||||||
|
|
||||||
|
csel $h0,$h0,$d0,eq
|
||||||
|
csel $h1,$h1,$d1,eq
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
ror $t0,$t0,#32 // flip nonce words
|
||||||
|
ror $t1,$t1,#32
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$t0 // accumulate nonce
|
||||||
|
adc $h1,$h1,$t1
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $h0,$h0 // flip output bytes
|
||||||
|
rev $h1,$h1
|
||||||
|
#endif
|
||||||
|
stp $h0,$h1,[$mac] // write result
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_emit,.-poly1305_emit
|
||||||
|
___
|
||||||
|
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("v$_.4s",(0..8));
|
||||||
|
my ($IN01_0,$IN01_1,$IN01_2,$IN01_3,$IN01_4) = map("v$_.2s",(9..13));
|
||||||
|
my ($IN23_0,$IN23_1,$IN23_2,$IN23_3,$IN23_4) = map("v$_.2s",(14..18));
|
||||||
|
my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4) = map("v$_.2d",(19..23));
|
||||||
|
my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28));
|
||||||
|
my ($T0,$T1,$MASK) = map("v$_",(29..31));
|
||||||
|
|
||||||
|
my ($in2,$zeros)=("x16","x17");
|
||||||
|
my $is_base2_26 = $zeros; # borrow
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
.type poly1305_mult,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_mult:
|
||||||
|
mul $d0,$h0,$r0 // h0*r0
|
||||||
|
umulh $d1,$h0,$r0
|
||||||
|
|
||||||
|
mul $t0,$h1,$s1 // h1*5*r1
|
||||||
|
umulh $t1,$h1,$s1
|
||||||
|
|
||||||
|
adds $d0,$d0,$t0
|
||||||
|
mul $t0,$h0,$r1 // h0*r1
|
||||||
|
adc $d1,$d1,$t1
|
||||||
|
umulh $d2,$h0,$r1
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h1,$r0 // h1*r0
|
||||||
|
adc $d2,$d2,xzr
|
||||||
|
umulh $t1,$h1,$r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
mul $t0,$h2,$s1 // h2*5*r1
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
mul $t1,$h2,$r0 // h2*r0
|
||||||
|
|
||||||
|
adds $d1,$d1,$t0
|
||||||
|
adc $d2,$d2,$t1
|
||||||
|
|
||||||
|
and $t0,$d2,#-4 // final reduction
|
||||||
|
and $h2,$d2,#3
|
||||||
|
add $t0,$t0,$d2,lsr#2
|
||||||
|
adds $h0,$d0,$t0
|
||||||
|
adcs $h1,$d1,xzr
|
||||||
|
adc $h2,$h2,xzr
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_mult,.-poly1305_mult
|
||||||
|
|
||||||
|
.type poly1305_splat,%function
|
||||||
|
.align 4
|
||||||
|
poly1305_splat:
|
||||||
|
and x12,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x13,$h0,#26,#26
|
||||||
|
extr x14,$h1,$h0,#52
|
||||||
|
and x14,x14,#0x03ffffff
|
||||||
|
ubfx x15,$h1,#14,#26
|
||||||
|
extr x16,$h2,$h1,#40
|
||||||
|
|
||||||
|
str w12,[$ctx,#16*0] // r0
|
||||||
|
add w12,w13,w13,lsl#2 // r1*5
|
||||||
|
str w13,[$ctx,#16*1] // r1
|
||||||
|
add w13,w14,w14,lsl#2 // r2*5
|
||||||
|
str w12,[$ctx,#16*2] // s1
|
||||||
|
str w14,[$ctx,#16*3] // r2
|
||||||
|
add w14,w15,w15,lsl#2 // r3*5
|
||||||
|
str w13,[$ctx,#16*4] // s2
|
||||||
|
str w15,[$ctx,#16*5] // r3
|
||||||
|
add w15,w16,w16,lsl#2 // r4*5
|
||||||
|
str w14,[$ctx,#16*6] // s3
|
||||||
|
str w16,[$ctx,#16*7] // r4
|
||||||
|
str w15,[$ctx,#16*8] // s4
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_splat,.-poly1305_splat
|
||||||
|
|
||||||
|
#ifdef __KERNEL__
|
||||||
|
.globl poly1305_blocks_neon
|
||||||
|
#endif
|
||||||
|
.type poly1305_blocks_neon,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks_neon:
|
||||||
|
.Lpoly1305_blocks_neon:
|
||||||
|
ldr $is_base2_26,[$ctx,#24]
|
||||||
|
cmp $len,#128
|
||||||
|
b.lo .Lpoly1305_blocks
|
||||||
|
|
||||||
|
.inst 0xd503233f // paciasp
|
||||||
|
stp x29,x30,[sp,#-80]!
|
||||||
|
add x29,sp,#0
|
||||||
|
|
||||||
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
stp d10,d11,[sp,#32]
|
||||||
|
stp d12,d13,[sp,#48]
|
||||||
|
stp d14,d15,[sp,#64]
|
||||||
|
|
||||||
|
cbz $is_base2_26,.Lbase2_64_neon
|
||||||
|
|
||||||
|
ldp w10,w11,[$ctx] // load hash value base 2^26
|
||||||
|
ldp w12,w13,[$ctx,#8]
|
||||||
|
ldr w14,[$ctx,#16]
|
||||||
|
|
||||||
|
tst $len,#31
|
||||||
|
b.eq .Leven_neon
|
||||||
|
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
add $h0,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr $h1,x12,#12
|
||||||
|
adds $h0,$h0,x12,lsl#52
|
||||||
|
add $h1,$h1,x13,lsl#14
|
||||||
|
adc $h1,$h1,xzr
|
||||||
|
lsr $h2,x14,#24
|
||||||
|
adds $h1,$h1,x14,lsl#40
|
||||||
|
adc $d2,$h2,xzr // can be partially reduced...
|
||||||
|
|
||||||
|
ldp $d0,$d1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $d0,$d0
|
||||||
|
rev $d1,$d1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$d0 // accumulate input
|
||||||
|
adcs $h1,$h1,$d1
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,$h0,#26,#26
|
||||||
|
extr x12,$h1,$h0,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,$h1,#14,#26
|
||||||
|
extr x14,$h2,$h1,#40
|
||||||
|
|
||||||
|
b .Leven_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Lbase2_64_neon:
|
||||||
|
ldp $r0,$r1,[$ctx,#32] // load key value
|
||||||
|
|
||||||
|
ldp $h0,$h1,[$ctx] // load hash value base 2^64
|
||||||
|
ldr $h2,[$ctx,#16]
|
||||||
|
|
||||||
|
tst $len,#31
|
||||||
|
b.eq .Linit_neon
|
||||||
|
|
||||||
|
ldp $d0,$d1,[$inp],#16 // load input
|
||||||
|
sub $len,$len,#16
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev $d0,$d0
|
||||||
|
rev $d1,$d1
|
||||||
|
#endif
|
||||||
|
adds $h0,$h0,$d0 // accumulate input
|
||||||
|
adcs $h1,$h1,$d1
|
||||||
|
adc $h2,$h2,$padbit
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
.Linit_neon:
|
||||||
|
ldr w17,[$ctx,#48] // first table element
|
||||||
|
and x10,$h0,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,$h0,#26,#26
|
||||||
|
extr x12,$h1,$h0,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,$h1,#14,#26
|
||||||
|
extr x14,$h2,$h1,#40
|
||||||
|
|
||||||
|
cmp w17,#-1 // is value impossible?
|
||||||
|
b.ne .Leven_neon
|
||||||
|
|
||||||
|
fmov ${H0},x10
|
||||||
|
fmov ${H1},x11
|
||||||
|
fmov ${H2},x12
|
||||||
|
fmov ${H3},x13
|
||||||
|
fmov ${H4},x14
|
||||||
|
|
||||||
|
////////////////////////////////// initialize r^n table
|
||||||
|
mov $h0,$r0 // r^1
|
||||||
|
add $s1,$r1,$r1,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
mov $h1,$r1
|
||||||
|
mov $h2,xzr
|
||||||
|
add $ctx,$ctx,#48+12
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^2
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^3
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^4
|
||||||
|
sub $ctx,$ctx,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
sub $ctx,$ctx,#48 // restore original $ctx
|
||||||
|
b .Ldo_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Leven_neon:
|
||||||
|
fmov ${H0},x10
|
||||||
|
fmov ${H1},x11
|
||||||
|
fmov ${H2},x12
|
||||||
|
fmov ${H3},x13
|
||||||
|
fmov ${H4},x14
|
||||||
|
|
||||||
|
.Ldo_neon:
|
||||||
|
ldp x8,x12,[$inp,#32] // inp[2:3]
|
||||||
|
subs $len,$len,#64
|
||||||
|
ldp x9,x13,[$inp,#48]
|
||||||
|
add $in2,$inp,#96
|
||||||
|
adr $zeros,.Lzeros
|
||||||
|
|
||||||
|
lsl $padbit,$padbit,#24
|
||||||
|
add x15,$ctx,#48
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov $IN23_0,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov $IN23_1,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
fmov $IN23_2,x8
|
||||||
|
fmov $IN23_3,x10
|
||||||
|
fmov $IN23_4,x12
|
||||||
|
|
||||||
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||||
|
ldp x9,x13,[$inp],#48
|
||||||
|
|
||||||
|
ld1 {$R0,$R1,$S1,$R2},[x15],#64
|
||||||
|
ld1 {$S2,$R3,$S3,$R4},[x15],#64
|
||||||
|
ld1 {$S4},[x15]
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov $IN01_0,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov $IN01_1,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
movi $MASK.2d,#-1
|
||||||
|
fmov $IN01_2,x8
|
||||||
|
fmov $IN01_3,x10
|
||||||
|
fmov $IN01_4,x12
|
||||||
|
ushr $MASK.2d,$MASK.2d,#38
|
||||||
|
|
||||||
|
b.ls .Lskip_loop
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_neon:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||||
|
// \___________________/
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||||
|
// \___________________/ \____________________/
|
||||||
|
//
|
||||||
|
// Note that we start with inp[2:3]*r^2. This is because it
|
||||||
|
// doesn't depend on reduction in previous iteration.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||||
|
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||||
|
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||||
|
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||||
|
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||||
|
|
||||||
|
subs $len,$len,#64
|
||||||
|
umull $ACC4,$IN23_0,${R4}[2]
|
||||||
|
csel $in2,$zeros,$in2,lo
|
||||||
|
umull $ACC3,$IN23_0,${R3}[2]
|
||||||
|
umull $ACC2,$IN23_0,${R2}[2]
|
||||||
|
ldp x8,x12,[$in2],#16 // inp[2:3] (or zero)
|
||||||
|
umull $ACC1,$IN23_0,${R1}[2]
|
||||||
|
ldp x9,x13,[$in2],#48
|
||||||
|
umull $ACC0,$IN23_0,${R0}[2]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_1,${R3}[2]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal $ACC3,$IN23_1,${R2}[2]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal $ACC2,$IN23_1,${R1}[2]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal $ACC1,$IN23_1,${R0}[2]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
umlal $ACC0,$IN23_1,${S4}[2]
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_2,${R2}[2]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal $ACC3,$IN23_2,${R1}[2]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal $ACC2,$IN23_2,${R0}[2]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal $ACC1,$IN23_2,${S4}[2]
|
||||||
|
fmov $IN23_0,x4
|
||||||
|
umlal $ACC0,$IN23_2,${S3}[2]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
umlal $ACC4,$IN23_3,${R1}[2]
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal $ACC3,$IN23_3,${R0}[2]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal $ACC2,$IN23_3,${S4}[2]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal $ACC1,$IN23_3,${S3}[2]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal $ACC0,$IN23_3,${S2}[2]
|
||||||
|
fmov $IN23_1,x6
|
||||||
|
|
||||||
|
add $IN01_2,$IN01_2,$H2
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
umlal $ACC4,$IN23_4,${R0}[2]
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
umlal $ACC3,$IN23_4,${S4}[2]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal $ACC2,$IN23_4,${S3}[2]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal $ACC1,$IN23_4,${S2}[2]
|
||||||
|
fmov $IN23_2,x8
|
||||||
|
umlal $ACC0,$IN23_4,${S1}[2]
|
||||||
|
fmov $IN23_3,x10
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4 and accumulate
|
||||||
|
|
||||||
|
add $IN01_0,$IN01_0,$H0
|
||||||
|
fmov $IN23_4,x12
|
||||||
|
umlal $ACC3,$IN01_2,${R1}[0]
|
||||||
|
ldp x8,x12,[$inp],#16 // inp[0:1]
|
||||||
|
umlal $ACC0,$IN01_2,${S3}[0]
|
||||||
|
ldp x9,x13,[$inp],#48
|
||||||
|
umlal $ACC4,$IN01_2,${R2}[0]
|
||||||
|
umlal $ACC1,$IN01_2,${S4}[0]
|
||||||
|
umlal $ACC2,$IN01_2,${R0}[0]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add $IN01_1,$IN01_1,$H1
|
||||||
|
umlal $ACC3,$IN01_0,${R3}[0]
|
||||||
|
umlal $ACC4,$IN01_0,${R4}[0]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal $ACC2,$IN01_0,${R2}[0]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal $ACC0,$IN01_0,${R0}[0]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal $ACC1,$IN01_0,${R1}[0]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
|
||||||
|
add $IN01_3,$IN01_3,$H3
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
umlal $ACC3,$IN01_1,${R2}[0]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal $ACC4,$IN01_1,${R3}[0]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal $ACC0,$IN01_1,${S4}[0]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal $ACC2,$IN01_1,${R1}[0]
|
||||||
|
fmov $IN01_0,x4
|
||||||
|
umlal $ACC1,$IN01_1,${R0}[0]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
add $IN01_4,$IN01_4,$H4
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal $ACC3,$IN01_3,${R0}[0]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal $ACC0,$IN01_3,${S2}[0]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal $ACC4,$IN01_3,${R1}[0]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal $ACC1,$IN01_3,${S3}[0]
|
||||||
|
fmov $IN01_1,x6
|
||||||
|
umlal $ACC2,$IN01_3,${S4}[0]
|
||||||
|
add x12,$padbit,x12,lsr#40
|
||||||
|
|
||||||
|
umlal $ACC3,$IN01_4,${S4}[0]
|
||||||
|
add x13,$padbit,x13,lsr#40
|
||||||
|
umlal $ACC0,$IN01_4,${S1}[0]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal $ACC4,$IN01_4,${R0}[0]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal $ACC1,$IN01_4,${S2}[0]
|
||||||
|
fmov $IN01_2,x8
|
||||||
|
umlal $ACC2,$IN01_4,${S3}[0]
|
||||||
|
fmov $IN01_3,x10
|
||||||
|
fmov $IN01_4,x12
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||||
|
// and P. Schwabe
|
||||||
|
//
|
||||||
|
// [see discussion in poly1305-armv4 module]
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC3,#26
|
||||||
|
xtn $H3,$ACC3
|
||||||
|
ushr $T1.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||||
|
bic $H3,#0xfc,lsl#24 // &=0x03ffffff
|
||||||
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC4,#26
|
||||||
|
xtn $H4,$ACC4
|
||||||
|
ushr $T1.2d,$ACC1,#26
|
||||||
|
xtn $H1,$ACC1
|
||||||
|
bic $H4,#0xfc,lsl#24
|
||||||
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||||
|
|
||||||
|
add $ACC0,$ACC0,$T0.2d
|
||||||
|
shl $T0.2d,$T0.2d,#2
|
||||||
|
shrn $T1.2s,$ACC2,#26
|
||||||
|
xtn $H2,$ACC2
|
||||||
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||||
|
bic $H1,#0xfc,lsl#24
|
||||||
|
add $H3,$H3,$T1.2s // h2 -> h3
|
||||||
|
bic $H2,#0xfc,lsl#24
|
||||||
|
|
||||||
|
shrn $T0.2s,$ACC0,#26
|
||||||
|
xtn $H0,$ACC0
|
||||||
|
ushr $T1.2s,$H3,#26
|
||||||
|
bic $H3,#0xfc,lsl#24
|
||||||
|
bic $H0,#0xfc,lsl#24
|
||||||
|
add $H1,$H1,$T0.2s // h0 -> h1
|
||||||
|
add $H4,$H4,$T1.2s // h3 -> h4
|
||||||
|
|
||||||
|
b.hi .Loop_neon
|
||||||
|
|
||||||
|
.Lskip_loop:
|
||||||
|
dup $IN23_2,${IN23_2}[0]
|
||||||
|
add $IN01_2,$IN01_2,$H2
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||||
|
|
||||||
|
adds $len,$len,#32
|
||||||
|
b.ne .Long_tail
|
||||||
|
|
||||||
|
dup $IN23_2,${IN01_2}[0]
|
||||||
|
add $IN23_0,$IN01_0,$H0
|
||||||
|
add $IN23_3,$IN01_3,$H3
|
||||||
|
add $IN23_1,$IN01_1,$H1
|
||||||
|
add $IN23_4,$IN01_4,$H4
|
||||||
|
|
||||||
|
.Long_tail:
|
||||||
|
dup $IN23_0,${IN23_0}[0]
|
||||||
|
umull2 $ACC0,$IN23_2,${S3}
|
||||||
|
umull2 $ACC3,$IN23_2,${R1}
|
||||||
|
umull2 $ACC4,$IN23_2,${R2}
|
||||||
|
umull2 $ACC2,$IN23_2,${R0}
|
||||||
|
umull2 $ACC1,$IN23_2,${S4}
|
||||||
|
|
||||||
|
dup $IN23_1,${IN23_1}[0]
|
||||||
|
umlal2 $ACC0,$IN23_0,${R0}
|
||||||
|
umlal2 $ACC2,$IN23_0,${R2}
|
||||||
|
umlal2 $ACC3,$IN23_0,${R3}
|
||||||
|
umlal2 $ACC4,$IN23_0,${R4}
|
||||||
|
umlal2 $ACC1,$IN23_0,${R1}
|
||||||
|
|
||||||
|
dup $IN23_3,${IN23_3}[0]
|
||||||
|
umlal2 $ACC0,$IN23_1,${S4}
|
||||||
|
umlal2 $ACC3,$IN23_1,${R2}
|
||||||
|
umlal2 $ACC2,$IN23_1,${R1}
|
||||||
|
umlal2 $ACC4,$IN23_1,${R3}
|
||||||
|
umlal2 $ACC1,$IN23_1,${R0}
|
||||||
|
|
||||||
|
dup $IN23_4,${IN23_4}[0]
|
||||||
|
umlal2 $ACC3,$IN23_3,${R0}
|
||||||
|
umlal2 $ACC4,$IN23_3,${R1}
|
||||||
|
umlal2 $ACC0,$IN23_3,${S2}
|
||||||
|
umlal2 $ACC1,$IN23_3,${S3}
|
||||||
|
umlal2 $ACC2,$IN23_3,${S4}
|
||||||
|
|
||||||
|
umlal2 $ACC3,$IN23_4,${S4}
|
||||||
|
umlal2 $ACC0,$IN23_4,${S1}
|
||||||
|
umlal2 $ACC4,$IN23_4,${R0}
|
||||||
|
umlal2 $ACC1,$IN23_4,${S2}
|
||||||
|
umlal2 $ACC2,$IN23_4,${S3}
|
||||||
|
|
||||||
|
b.eq .Lshort_tail
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||||
|
|
||||||
|
add $IN01_0,$IN01_0,$H0
|
||||||
|
umlal $ACC3,$IN01_2,${R1}
|
||||||
|
umlal $ACC0,$IN01_2,${S3}
|
||||||
|
umlal $ACC4,$IN01_2,${R2}
|
||||||
|
umlal $ACC1,$IN01_2,${S4}
|
||||||
|
umlal $ACC2,$IN01_2,${R0}
|
||||||
|
|
||||||
|
add $IN01_1,$IN01_1,$H1
|
||||||
|
umlal $ACC3,$IN01_0,${R3}
|
||||||
|
umlal $ACC0,$IN01_0,${R0}
|
||||||
|
umlal $ACC4,$IN01_0,${R4}
|
||||||
|
umlal $ACC1,$IN01_0,${R1}
|
||||||
|
umlal $ACC2,$IN01_0,${R2}
|
||||||
|
|
||||||
|
add $IN01_3,$IN01_3,$H3
|
||||||
|
umlal $ACC3,$IN01_1,${R2}
|
||||||
|
umlal $ACC0,$IN01_1,${S4}
|
||||||
|
umlal $ACC4,$IN01_1,${R3}
|
||||||
|
umlal $ACC1,$IN01_1,${R0}
|
||||||
|
umlal $ACC2,$IN01_1,${R1}
|
||||||
|
|
||||||
|
add $IN01_4,$IN01_4,$H4
|
||||||
|
umlal $ACC3,$IN01_3,${R0}
|
||||||
|
umlal $ACC0,$IN01_3,${S2}
|
||||||
|
umlal $ACC4,$IN01_3,${R1}
|
||||||
|
umlal $ACC1,$IN01_3,${S3}
|
||||||
|
umlal $ACC2,$IN01_3,${S4}
|
||||||
|
|
||||||
|
umlal $ACC3,$IN01_4,${S4}
|
||||||
|
umlal $ACC0,$IN01_4,${S1}
|
||||||
|
umlal $ACC4,$IN01_4,${R0}
|
||||||
|
umlal $ACC1,$IN01_4,${S2}
|
||||||
|
umlal $ACC2,$IN01_4,${S3}
|
||||||
|
|
||||||
|
.Lshort_tail:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// horizontal add
|
||||||
|
|
||||||
|
addp $ACC3,$ACC3,$ACC3
|
||||||
|
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
addp $ACC0,$ACC0,$ACC0
|
||||||
|
ldp d10,d11,[sp,#32]
|
||||||
|
addp $ACC4,$ACC4,$ACC4
|
||||||
|
ldp d12,d13,[sp,#48]
|
||||||
|
addp $ACC1,$ACC1,$ACC1
|
||||||
|
ldp d14,d15,[sp,#64]
|
||||||
|
addp $ACC2,$ACC2,$ACC2
|
||||||
|
ldr x30,[sp,#8]
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction, but without narrowing
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC3,#26
|
||||||
|
and $ACC3,$ACC3,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
|
||||||
|
add $ACC4,$ACC4,$T0.2d // h3 -> h4
|
||||||
|
add $ACC1,$ACC1,$T1.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC4,#26
|
||||||
|
and $ACC4,$ACC4,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC1,#26
|
||||||
|
and $ACC1,$ACC1,$MASK.2d
|
||||||
|
add $ACC2,$ACC2,$T1.2d // h1 -> h2
|
||||||
|
|
||||||
|
add $ACC0,$ACC0,$T0.2d
|
||||||
|
shl $T0.2d,$T0.2d,#2
|
||||||
|
ushr $T1.2d,$ACC2,#26
|
||||||
|
and $ACC2,$ACC2,$MASK.2d
|
||||||
|
add $ACC0,$ACC0,$T0.2d // h4 -> h0
|
||||||
|
add $ACC3,$ACC3,$T1.2d // h2 -> h3
|
||||||
|
|
||||||
|
ushr $T0.2d,$ACC0,#26
|
||||||
|
and $ACC0,$ACC0,$MASK.2d
|
||||||
|
ushr $T1.2d,$ACC3,#26
|
||||||
|
and $ACC3,$ACC3,$MASK.2d
|
||||||
|
add $ACC1,$ACC1,$T0.2d // h0 -> h1
|
||||||
|
add $ACC4,$ACC4,$T1.2d // h3 -> h4
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// write the result, can be partially reduced
|
||||||
|
|
||||||
|
st4 {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
|
||||||
|
mov x4,#1
|
||||||
|
st1 {$ACC4}[0],[$ctx]
|
||||||
|
str x4,[$ctx,#8] // set is_base2_26
|
||||||
|
|
||||||
|
ldr x29,[sp],#80
|
||||||
|
.inst 0xd50323bf // autiasp
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lzeros:
|
||||||
|
.long 0,0,0,0,0,0,0,0
|
||||||
|
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by \@dot-asm"
|
||||||
|
.align 2
|
||||||
|
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||||
|
.comm OPENSSL_armcap_P,4,4
|
||||||
|
.hidden OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
___
|
||||||
|
|
||||||
|
foreach (split("\n",$code)) {
|
||||||
|
s/\b(shrn\s+v[0-9]+)\.[24]d/$1.2s/ or
|
||||||
|
s/\b(fmov\s+)v([0-9]+)[^,]*,\s*x([0-9]+)/$1d$2,x$3/ or
|
||||||
|
(m/\bdup\b/ and (s/\.[24]s/.2d/g or 1)) or
|
||||||
|
(m/\b(eor|and)/ and (s/\.[248][sdh]/.16b/g or 1)) or
|
||||||
|
(m/\bum(ul|la)l\b/ and (s/\.4s/.2s/g or 1)) or
|
||||||
|
(m/\bum(ul|la)l2\b/ and (s/\.2s/.4s/g or 1)) or
|
||||||
|
(m/\bst[1-4]\s+{[^}]+}\[/ and (s/\.[24]d/.s/g or 1));
|
||||||
|
|
||||||
|
s/\.[124]([sd])\[/.$1\[/;
|
||||||
|
s/w#x([0-9]+)/w$1/g;
|
||||||
|
|
||||||
|
print $_,"\n";
|
||||||
|
}
|
||||||
|
close STDOUT;
|
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
835
arch/arm64/crypto/poly1305-core.S_shipped
Normal file
|
@ -0,0 +1,835 @@
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
# include "arm_arch.h"
|
||||||
|
.extern OPENSSL_armcap_P
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
// forward "declarations" are required for Apple
|
||||||
|
.globl poly1305_blocks
|
||||||
|
.globl poly1305_emit
|
||||||
|
|
||||||
|
.globl poly1305_init
|
||||||
|
.type poly1305_init,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_init:
|
||||||
|
cmp x1,xzr
|
||||||
|
stp xzr,xzr,[x0] // zero hash value
|
||||||
|
stp xzr,xzr,[x0,#16] // [along with is_base2_26]
|
||||||
|
|
||||||
|
csel x0,xzr,x0,eq
|
||||||
|
b.eq .Lno_key
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
adrp x17,OPENSSL_armcap_P
|
||||||
|
ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ldp x7,x8,[x1] // load key
|
||||||
|
mov x9,#0xfffffffc0fffffff
|
||||||
|
movk x9,#0x0fff,lsl#48
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x7,x7 // flip bytes
|
||||||
|
rev x8,x8
|
||||||
|
#endif
|
||||||
|
and x7,x7,x9 // &=0ffffffc0fffffff
|
||||||
|
and x9,x9,#-4
|
||||||
|
and x8,x8,x9 // &=0ffffffc0ffffffc
|
||||||
|
mov w9,#-1
|
||||||
|
stp x7,x8,[x0,#32] // save key value
|
||||||
|
str w9,[x0,#48] // impossible key power value
|
||||||
|
|
||||||
|
#ifndef __KERNEL__
|
||||||
|
tst w17,#ARMV7_NEON
|
||||||
|
|
||||||
|
adr x12,.Lpoly1305_blocks
|
||||||
|
adr x7,.Lpoly1305_blocks_neon
|
||||||
|
adr x13,.Lpoly1305_emit
|
||||||
|
|
||||||
|
csel x12,x12,x7,eq
|
||||||
|
|
||||||
|
# ifdef __ILP32__
|
||||||
|
stp w12,w13,[x2]
|
||||||
|
# else
|
||||||
|
stp x12,x13,[x2]
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
mov x0,#1
|
||||||
|
.Lno_key:
|
||||||
|
ret
|
||||||
|
.size poly1305_init,.-poly1305_init
|
||||||
|
|
||||||
|
.type poly1305_blocks,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks:
|
||||||
|
.Lpoly1305_blocks:
|
||||||
|
ands x2,x2,#-16
|
||||||
|
b.eq .Lno_data
|
||||||
|
|
||||||
|
ldp x4,x5,[x0] // load hash value
|
||||||
|
ldp x6,x17,[x0,#16] // [along with is_base2_26]
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr x12,x4,#32
|
||||||
|
mov w13,w4
|
||||||
|
lsr x14,x5,#32
|
||||||
|
mov w15,w5
|
||||||
|
lsr x16,x6,#32
|
||||||
|
#else
|
||||||
|
mov w12,w4
|
||||||
|
lsr x13,x4,#32
|
||||||
|
mov w14,w5
|
||||||
|
lsr x15,x5,#32
|
||||||
|
mov w16,w6
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x13,x14,#12
|
||||||
|
adds x12,x12,x14,lsl#52
|
||||||
|
add x13,x13,x15,lsl#14
|
||||||
|
adc x13,x13,xzr
|
||||||
|
lsr x14,x16,#24
|
||||||
|
adds x13,x13,x16,lsl#40
|
||||||
|
adc x14,x14,xzr
|
||||||
|
|
||||||
|
cmp x17,#0 // is_base2_26?
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
csel x4,x4,x12,eq // choose between radixes
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
csel x6,x6,x14,eq
|
||||||
|
|
||||||
|
.Loop:
|
||||||
|
ldp x10,x11,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x10,x10
|
||||||
|
rev x11,x11
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x10 // accumulate input
|
||||||
|
adcs x5,x5,x11
|
||||||
|
|
||||||
|
mul x12,x4,x7 // h0*r0
|
||||||
|
adc x6,x6,x3
|
||||||
|
umulh x13,x4,x7
|
||||||
|
|
||||||
|
mul x10,x5,x9 // h1*5*r1
|
||||||
|
umulh x11,x5,x9
|
||||||
|
|
||||||
|
adds x12,x12,x10
|
||||||
|
mul x10,x4,x8 // h0*r1
|
||||||
|
adc x13,x13,x11
|
||||||
|
umulh x14,x4,x8
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x5,x7 // h1*r0
|
||||||
|
adc x14,x14,xzr
|
||||||
|
umulh x11,x5,x7
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x6,x9 // h2*5*r1
|
||||||
|
adc x14,x14,x11
|
||||||
|
mul x11,x6,x7 // h2*r0
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
adc x14,x14,x11
|
||||||
|
|
||||||
|
and x10,x14,#-4 // final reduction
|
||||||
|
and x6,x14,#3
|
||||||
|
add x10,x10,x14,lsr#2
|
||||||
|
adds x4,x12,x10
|
||||||
|
adcs x5,x13,xzr
|
||||||
|
adc x6,x6,xzr
|
||||||
|
|
||||||
|
cbnz x2,.Loop
|
||||||
|
|
||||||
|
stp x4,x5,[x0] // store hash value
|
||||||
|
stp x6,xzr,[x0,#16] // [and clear is_base2_26]
|
||||||
|
|
||||||
|
.Lno_data:
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks,.-poly1305_blocks
|
||||||
|
|
||||||
|
.type poly1305_emit,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_emit:
|
||||||
|
.Lpoly1305_emit:
|
||||||
|
ldp x4,x5,[x0] // load hash base 2^64
|
||||||
|
ldp x6,x7,[x0,#16] // [along with is_base2_26]
|
||||||
|
ldp x10,x11,[x2] // load nonce
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
lsr x12,x4,#32
|
||||||
|
mov w13,w4
|
||||||
|
lsr x14,x5,#32
|
||||||
|
mov w15,w5
|
||||||
|
lsr x16,x6,#32
|
||||||
|
#else
|
||||||
|
mov w12,w4
|
||||||
|
lsr x13,x4,#32
|
||||||
|
mov w14,w5
|
||||||
|
lsr x15,x5,#32
|
||||||
|
mov w16,w6
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x13,x14,#12
|
||||||
|
adds x12,x12,x14,lsl#52
|
||||||
|
add x13,x13,x15,lsl#14
|
||||||
|
adc x13,x13,xzr
|
||||||
|
lsr x14,x16,#24
|
||||||
|
adds x13,x13,x16,lsl#40
|
||||||
|
adc x14,x14,xzr
|
||||||
|
|
||||||
|
cmp x7,#0 // is_base2_26?
|
||||||
|
csel x4,x4,x12,eq // choose between radixes
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
csel x6,x6,x14,eq
|
||||||
|
|
||||||
|
adds x12,x4,#5 // compare to modulus
|
||||||
|
adcs x13,x5,xzr
|
||||||
|
adc x14,x6,xzr
|
||||||
|
|
||||||
|
tst x14,#-4 // see if it's carried/borrowed
|
||||||
|
|
||||||
|
csel x4,x4,x12,eq
|
||||||
|
csel x5,x5,x13,eq
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
ror x10,x10,#32 // flip nonce words
|
||||||
|
ror x11,x11,#32
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x10 // accumulate nonce
|
||||||
|
adc x5,x5,x11
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x4,x4 // flip output bytes
|
||||||
|
rev x5,x5
|
||||||
|
#endif
|
||||||
|
stp x4,x5,[x1] // write result
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_emit,.-poly1305_emit
|
||||||
|
.type poly1305_mult,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_mult:
|
||||||
|
mul x12,x4,x7 // h0*r0
|
||||||
|
umulh x13,x4,x7
|
||||||
|
|
||||||
|
mul x10,x5,x9 // h1*5*r1
|
||||||
|
umulh x11,x5,x9
|
||||||
|
|
||||||
|
adds x12,x12,x10
|
||||||
|
mul x10,x4,x8 // h0*r1
|
||||||
|
adc x13,x13,x11
|
||||||
|
umulh x14,x4,x8
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x5,x7 // h1*r0
|
||||||
|
adc x14,x14,xzr
|
||||||
|
umulh x11,x5,x7
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
mul x10,x6,x9 // h2*5*r1
|
||||||
|
adc x14,x14,x11
|
||||||
|
mul x11,x6,x7 // h2*r0
|
||||||
|
|
||||||
|
adds x13,x13,x10
|
||||||
|
adc x14,x14,x11
|
||||||
|
|
||||||
|
and x10,x14,#-4 // final reduction
|
||||||
|
and x6,x14,#3
|
||||||
|
add x10,x10,x14,lsr#2
|
||||||
|
adds x4,x12,x10
|
||||||
|
adcs x5,x13,xzr
|
||||||
|
adc x6,x6,xzr
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_mult,.-poly1305_mult
|
||||||
|
|
||||||
|
.type poly1305_splat,%function
|
||||||
|
.align 4
|
||||||
|
poly1305_splat:
|
||||||
|
and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x13,x4,#26,#26
|
||||||
|
extr x14,x5,x4,#52
|
||||||
|
and x14,x14,#0x03ffffff
|
||||||
|
ubfx x15,x5,#14,#26
|
||||||
|
extr x16,x6,x5,#40
|
||||||
|
|
||||||
|
str w12,[x0,#16*0] // r0
|
||||||
|
add w12,w13,w13,lsl#2 // r1*5
|
||||||
|
str w13,[x0,#16*1] // r1
|
||||||
|
add w13,w14,w14,lsl#2 // r2*5
|
||||||
|
str w12,[x0,#16*2] // s1
|
||||||
|
str w14,[x0,#16*3] // r2
|
||||||
|
add w14,w15,w15,lsl#2 // r3*5
|
||||||
|
str w13,[x0,#16*4] // s2
|
||||||
|
str w15,[x0,#16*5] // r3
|
||||||
|
add w15,w16,w16,lsl#2 // r4*5
|
||||||
|
str w14,[x0,#16*6] // s3
|
||||||
|
str w16,[x0,#16*7] // r4
|
||||||
|
str w15,[x0,#16*8] // s4
|
||||||
|
|
||||||
|
ret
|
||||||
|
.size poly1305_splat,.-poly1305_splat
|
||||||
|
|
||||||
|
#ifdef __KERNEL__
|
||||||
|
.globl poly1305_blocks_neon
|
||||||
|
#endif
|
||||||
|
.type poly1305_blocks_neon,%function
|
||||||
|
.align 5
|
||||||
|
poly1305_blocks_neon:
|
||||||
|
.Lpoly1305_blocks_neon:
|
||||||
|
ldr x17,[x0,#24]
|
||||||
|
cmp x2,#128
|
||||||
|
b.lo .Lpoly1305_blocks
|
||||||
|
|
||||||
|
.inst 0xd503233f // paciasp
|
||||||
|
stp x29,x30,[sp,#-80]!
|
||||||
|
add x29,sp,#0
|
||||||
|
|
||||||
|
stp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
stp d10,d11,[sp,#32]
|
||||||
|
stp d12,d13,[sp,#48]
|
||||||
|
stp d14,d15,[sp,#64]
|
||||||
|
|
||||||
|
cbz x17,.Lbase2_64_neon
|
||||||
|
|
||||||
|
ldp w10,w11,[x0] // load hash value base 2^26
|
||||||
|
ldp w12,w13,[x0,#8]
|
||||||
|
ldr w14,[x0,#16]
|
||||||
|
|
||||||
|
tst x2,#31
|
||||||
|
b.eq .Leven_neon
|
||||||
|
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
|
||||||
|
lsr x5,x12,#12
|
||||||
|
adds x4,x4,x12,lsl#52
|
||||||
|
add x5,x5,x13,lsl#14
|
||||||
|
adc x5,x5,xzr
|
||||||
|
lsr x6,x14,#24
|
||||||
|
adds x5,x5,x14,lsl#40
|
||||||
|
adc x14,x6,xzr // can be partially reduced...
|
||||||
|
|
||||||
|
ldp x12,x13,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x12,x12
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x12 // accumulate input
|
||||||
|
adcs x5,x5,x13
|
||||||
|
adc x6,x6,x3
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,x4,#26,#26
|
||||||
|
extr x12,x5,x4,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,x5,#14,#26
|
||||||
|
extr x14,x6,x5,#40
|
||||||
|
|
||||||
|
b .Leven_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Lbase2_64_neon:
|
||||||
|
ldp x7,x8,[x0,#32] // load key value
|
||||||
|
|
||||||
|
ldp x4,x5,[x0] // load hash value base 2^64
|
||||||
|
ldr x6,[x0,#16]
|
||||||
|
|
||||||
|
tst x2,#31
|
||||||
|
b.eq .Linit_neon
|
||||||
|
|
||||||
|
ldp x12,x13,[x1],#16 // load input
|
||||||
|
sub x2,x2,#16
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x12,x12
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
adds x4,x4,x12 // accumulate input
|
||||||
|
adcs x5,x5,x13
|
||||||
|
adc x6,x6,x3
|
||||||
|
|
||||||
|
bl poly1305_mult
|
||||||
|
|
||||||
|
.Linit_neon:
|
||||||
|
ldr w17,[x0,#48] // first table element
|
||||||
|
and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
ubfx x11,x4,#26,#26
|
||||||
|
extr x12,x5,x4,#52
|
||||||
|
and x12,x12,#0x03ffffff
|
||||||
|
ubfx x13,x5,#14,#26
|
||||||
|
extr x14,x6,x5,#40
|
||||||
|
|
||||||
|
cmp w17,#-1 // is value impossible?
|
||||||
|
b.ne .Leven_neon
|
||||||
|
|
||||||
|
fmov d24,x10
|
||||||
|
fmov d25,x11
|
||||||
|
fmov d26,x12
|
||||||
|
fmov d27,x13
|
||||||
|
fmov d28,x14
|
||||||
|
|
||||||
|
////////////////////////////////// initialize r^n table
|
||||||
|
mov x4,x7 // r^1
|
||||||
|
add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
|
||||||
|
mov x5,x8
|
||||||
|
mov x6,xzr
|
||||||
|
add x0,x0,#48+12
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^2
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^3
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
|
||||||
|
bl poly1305_mult // r^4
|
||||||
|
sub x0,x0,#4
|
||||||
|
bl poly1305_splat
|
||||||
|
sub x0,x0,#48 // restore original x0
|
||||||
|
b .Ldo_neon
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Leven_neon:
|
||||||
|
fmov d24,x10
|
||||||
|
fmov d25,x11
|
||||||
|
fmov d26,x12
|
||||||
|
fmov d27,x13
|
||||||
|
fmov d28,x14
|
||||||
|
|
||||||
|
.Ldo_neon:
|
||||||
|
ldp x8,x12,[x1,#32] // inp[2:3]
|
||||||
|
subs x2,x2,#64
|
||||||
|
ldp x9,x13,[x1,#48]
|
||||||
|
add x16,x1,#96
|
||||||
|
adr x17,.Lzeros
|
||||||
|
|
||||||
|
lsl x3,x3,#24
|
||||||
|
add x15,x0,#48
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov d14,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov d15,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
fmov d16,x8
|
||||||
|
fmov d17,x10
|
||||||
|
fmov d18,x12
|
||||||
|
|
||||||
|
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||||
|
ldp x9,x13,[x1],#48
|
||||||
|
|
||||||
|
ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
|
||||||
|
ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
|
||||||
|
ld1 {v8.4s},[x15]
|
||||||
|
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
fmov d9,x4
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
fmov d10,x6
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
movi v31.2d,#-1
|
||||||
|
fmov d11,x8
|
||||||
|
fmov d12,x10
|
||||||
|
fmov d13,x12
|
||||||
|
ushr v31.2d,v31.2d,#38
|
||||||
|
|
||||||
|
b.ls .Lskip_loop
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_neon:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
||||||
|
// ___________________/
|
||||||
|
// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
||||||
|
// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
||||||
|
// ___________________/ ____________________/
|
||||||
|
//
|
||||||
|
// Note that we start with inp[2:3]*r^2. This is because it
|
||||||
|
// doesn't depend on reduction in previous iteration.
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
|
||||||
|
// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
|
||||||
|
// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
|
||||||
|
// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
|
||||||
|
// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
|
||||||
|
|
||||||
|
subs x2,x2,#64
|
||||||
|
umull v23.2d,v14.2s,v7.s[2]
|
||||||
|
csel x16,x17,x16,lo
|
||||||
|
umull v22.2d,v14.2s,v5.s[2]
|
||||||
|
umull v21.2d,v14.2s,v3.s[2]
|
||||||
|
ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
|
||||||
|
umull v20.2d,v14.2s,v1.s[2]
|
||||||
|
ldp x9,x13,[x16],#48
|
||||||
|
umull v19.2d,v14.2s,v0.s[2]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
umlal v23.2d,v15.2s,v5.s[2]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal v22.2d,v15.2s,v3.s[2]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal v21.2d,v15.2s,v1.s[2]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal v20.2d,v15.2s,v0.s[2]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
umlal v19.2d,v15.2s,v8.s[2]
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
|
||||||
|
umlal v23.2d,v16.2s,v3.s[2]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal v22.2d,v16.2s,v1.s[2]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal v21.2d,v16.2s,v0.s[2]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal v20.2d,v16.2s,v8.s[2]
|
||||||
|
fmov d14,x4
|
||||||
|
umlal v19.2d,v16.2s,v6.s[2]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
umlal v23.2d,v17.2s,v1.s[2]
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal v22.2d,v17.2s,v0.s[2]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal v21.2d,v17.2s,v8.s[2]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal v20.2d,v17.2s,v6.s[2]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal v19.2d,v17.2s,v4.s[2]
|
||||||
|
fmov d15,x6
|
||||||
|
|
||||||
|
add v11.2s,v11.2s,v26.2s
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
umlal v23.2d,v18.2s,v0.s[2]
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
umlal v22.2d,v18.2s,v8.s[2]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal v21.2d,v18.2s,v6.s[2]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal v20.2d,v18.2s,v4.s[2]
|
||||||
|
fmov d16,x8
|
||||||
|
umlal v19.2d,v18.2s,v2.s[2]
|
||||||
|
fmov d17,x10
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4 and accumulate
|
||||||
|
|
||||||
|
add v9.2s,v9.2s,v24.2s
|
||||||
|
fmov d18,x12
|
||||||
|
umlal v22.2d,v11.2s,v1.s[0]
|
||||||
|
ldp x8,x12,[x1],#16 // inp[0:1]
|
||||||
|
umlal v19.2d,v11.2s,v6.s[0]
|
||||||
|
ldp x9,x13,[x1],#48
|
||||||
|
umlal v23.2d,v11.2s,v3.s[0]
|
||||||
|
umlal v20.2d,v11.2s,v8.s[0]
|
||||||
|
umlal v21.2d,v11.2s,v0.s[0]
|
||||||
|
#ifdef __AARCH64EB__
|
||||||
|
rev x8,x8
|
||||||
|
rev x12,x12
|
||||||
|
rev x9,x9
|
||||||
|
rev x13,x13
|
||||||
|
#endif
|
||||||
|
|
||||||
|
add v10.2s,v10.2s,v25.2s
|
||||||
|
umlal v22.2d,v9.2s,v5.s[0]
|
||||||
|
umlal v23.2d,v9.2s,v7.s[0]
|
||||||
|
and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
|
||||||
|
umlal v21.2d,v9.2s,v3.s[0]
|
||||||
|
and x5,x9,#0x03ffffff
|
||||||
|
umlal v19.2d,v9.2s,v0.s[0]
|
||||||
|
ubfx x6,x8,#26,#26
|
||||||
|
umlal v20.2d,v9.2s,v1.s[0]
|
||||||
|
ubfx x7,x9,#26,#26
|
||||||
|
|
||||||
|
add v12.2s,v12.2s,v27.2s
|
||||||
|
add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
|
||||||
|
umlal v22.2d,v10.2s,v3.s[0]
|
||||||
|
extr x8,x12,x8,#52
|
||||||
|
umlal v23.2d,v10.2s,v5.s[0]
|
||||||
|
extr x9,x13,x9,#52
|
||||||
|
umlal v19.2d,v10.2s,v8.s[0]
|
||||||
|
add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
|
||||||
|
umlal v21.2d,v10.2s,v1.s[0]
|
||||||
|
fmov d9,x4
|
||||||
|
umlal v20.2d,v10.2s,v0.s[0]
|
||||||
|
and x8,x8,#0x03ffffff
|
||||||
|
|
||||||
|
add v13.2s,v13.2s,v28.2s
|
||||||
|
and x9,x9,#0x03ffffff
|
||||||
|
umlal v22.2d,v12.2s,v0.s[0]
|
||||||
|
ubfx x10,x12,#14,#26
|
||||||
|
umlal v19.2d,v12.2s,v4.s[0]
|
||||||
|
ubfx x11,x13,#14,#26
|
||||||
|
umlal v23.2d,v12.2s,v1.s[0]
|
||||||
|
add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
|
||||||
|
umlal v20.2d,v12.2s,v6.s[0]
|
||||||
|
fmov d10,x6
|
||||||
|
umlal v21.2d,v12.2s,v8.s[0]
|
||||||
|
add x12,x3,x12,lsr#40
|
||||||
|
|
||||||
|
umlal v22.2d,v13.2s,v8.s[0]
|
||||||
|
add x13,x3,x13,lsr#40
|
||||||
|
umlal v19.2d,v13.2s,v2.s[0]
|
||||||
|
add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
|
||||||
|
umlal v23.2d,v13.2s,v0.s[0]
|
||||||
|
add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
|
||||||
|
umlal v20.2d,v13.2s,v4.s[0]
|
||||||
|
fmov d11,x8
|
||||||
|
umlal v21.2d,v13.2s,v6.s[0]
|
||||||
|
fmov d12,x10
|
||||||
|
fmov d13,x12
|
||||||
|
|
||||||
|
/////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
||||||
|
// and P. Schwabe
|
||||||
|
//
|
||||||
|
// [see discussion in poly1305-armv4 module]
|
||||||
|
|
||||||
|
ushr v29.2d,v22.2d,#26
|
||||||
|
xtn v27.2s,v22.2d
|
||||||
|
ushr v30.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||||
|
bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
|
||||||
|
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr v29.2d,v23.2d,#26
|
||||||
|
xtn v28.2s,v23.2d
|
||||||
|
ushr v30.2d,v20.2d,#26
|
||||||
|
xtn v25.2s,v20.2d
|
||||||
|
bic v28.2s,#0xfc,lsl#24
|
||||||
|
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||||
|
|
||||||
|
add v19.2d,v19.2d,v29.2d
|
||||||
|
shl v29.2d,v29.2d,#2
|
||||||
|
shrn v30.2s,v21.2d,#26
|
||||||
|
xtn v26.2s,v21.2d
|
||||||
|
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||||
|
bic v25.2s,#0xfc,lsl#24
|
||||||
|
add v27.2s,v27.2s,v30.2s // h2 -> h3
|
||||||
|
bic v26.2s,#0xfc,lsl#24
|
||||||
|
|
||||||
|
shrn v29.2s,v19.2d,#26
|
||||||
|
xtn v24.2s,v19.2d
|
||||||
|
ushr v30.2s,v27.2s,#26
|
||||||
|
bic v27.2s,#0xfc,lsl#24
|
||||||
|
bic v24.2s,#0xfc,lsl#24
|
||||||
|
add v25.2s,v25.2s,v29.2s // h0 -> h1
|
||||||
|
add v28.2s,v28.2s,v30.2s // h3 -> h4
|
||||||
|
|
||||||
|
b.hi .Loop_neon
|
||||||
|
|
||||||
|
.Lskip_loop:
|
||||||
|
dup v16.2d,v16.d[0]
|
||||||
|
add v11.2s,v11.2s,v26.2s
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
||||||
|
|
||||||
|
adds x2,x2,#32
|
||||||
|
b.ne .Long_tail
|
||||||
|
|
||||||
|
dup v16.2d,v11.d[0]
|
||||||
|
add v14.2s,v9.2s,v24.2s
|
||||||
|
add v17.2s,v12.2s,v27.2s
|
||||||
|
add v15.2s,v10.2s,v25.2s
|
||||||
|
add v18.2s,v13.2s,v28.2s
|
||||||
|
|
||||||
|
.Long_tail:
|
||||||
|
dup v14.2d,v14.d[0]
|
||||||
|
umull2 v19.2d,v16.4s,v6.4s
|
||||||
|
umull2 v22.2d,v16.4s,v1.4s
|
||||||
|
umull2 v23.2d,v16.4s,v3.4s
|
||||||
|
umull2 v21.2d,v16.4s,v0.4s
|
||||||
|
umull2 v20.2d,v16.4s,v8.4s
|
||||||
|
|
||||||
|
dup v15.2d,v15.d[0]
|
||||||
|
umlal2 v19.2d,v14.4s,v0.4s
|
||||||
|
umlal2 v21.2d,v14.4s,v3.4s
|
||||||
|
umlal2 v22.2d,v14.4s,v5.4s
|
||||||
|
umlal2 v23.2d,v14.4s,v7.4s
|
||||||
|
umlal2 v20.2d,v14.4s,v1.4s
|
||||||
|
|
||||||
|
dup v17.2d,v17.d[0]
|
||||||
|
umlal2 v19.2d,v15.4s,v8.4s
|
||||||
|
umlal2 v22.2d,v15.4s,v3.4s
|
||||||
|
umlal2 v21.2d,v15.4s,v1.4s
|
||||||
|
umlal2 v23.2d,v15.4s,v5.4s
|
||||||
|
umlal2 v20.2d,v15.4s,v0.4s
|
||||||
|
|
||||||
|
dup v18.2d,v18.d[0]
|
||||||
|
umlal2 v22.2d,v17.4s,v0.4s
|
||||||
|
umlal2 v23.2d,v17.4s,v1.4s
|
||||||
|
umlal2 v19.2d,v17.4s,v4.4s
|
||||||
|
umlal2 v20.2d,v17.4s,v6.4s
|
||||||
|
umlal2 v21.2d,v17.4s,v8.4s
|
||||||
|
|
||||||
|
umlal2 v22.2d,v18.4s,v8.4s
|
||||||
|
umlal2 v19.2d,v18.4s,v2.4s
|
||||||
|
umlal2 v23.2d,v18.4s,v0.4s
|
||||||
|
umlal2 v20.2d,v18.4s,v4.4s
|
||||||
|
umlal2 v21.2d,v18.4s,v6.4s
|
||||||
|
|
||||||
|
b.eq .Lshort_tail
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// (hash+inp[0:1])*r^4:r^3 and accumulate
|
||||||
|
|
||||||
|
add v9.2s,v9.2s,v24.2s
|
||||||
|
umlal v22.2d,v11.2s,v1.2s
|
||||||
|
umlal v19.2d,v11.2s,v6.2s
|
||||||
|
umlal v23.2d,v11.2s,v3.2s
|
||||||
|
umlal v20.2d,v11.2s,v8.2s
|
||||||
|
umlal v21.2d,v11.2s,v0.2s
|
||||||
|
|
||||||
|
add v10.2s,v10.2s,v25.2s
|
||||||
|
umlal v22.2d,v9.2s,v5.2s
|
||||||
|
umlal v19.2d,v9.2s,v0.2s
|
||||||
|
umlal v23.2d,v9.2s,v7.2s
|
||||||
|
umlal v20.2d,v9.2s,v1.2s
|
||||||
|
umlal v21.2d,v9.2s,v3.2s
|
||||||
|
|
||||||
|
add v12.2s,v12.2s,v27.2s
|
||||||
|
umlal v22.2d,v10.2s,v3.2s
|
||||||
|
umlal v19.2d,v10.2s,v8.2s
|
||||||
|
umlal v23.2d,v10.2s,v5.2s
|
||||||
|
umlal v20.2d,v10.2s,v0.2s
|
||||||
|
umlal v21.2d,v10.2s,v1.2s
|
||||||
|
|
||||||
|
add v13.2s,v13.2s,v28.2s
|
||||||
|
umlal v22.2d,v12.2s,v0.2s
|
||||||
|
umlal v19.2d,v12.2s,v4.2s
|
||||||
|
umlal v23.2d,v12.2s,v1.2s
|
||||||
|
umlal v20.2d,v12.2s,v6.2s
|
||||||
|
umlal v21.2d,v12.2s,v8.2s
|
||||||
|
|
||||||
|
umlal v22.2d,v13.2s,v8.2s
|
||||||
|
umlal v19.2d,v13.2s,v2.2s
|
||||||
|
umlal v23.2d,v13.2s,v0.2s
|
||||||
|
umlal v20.2d,v13.2s,v4.2s
|
||||||
|
umlal v21.2d,v13.2s,v6.2s
|
||||||
|
|
||||||
|
.Lshort_tail:
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// horizontal add
|
||||||
|
|
||||||
|
addp v22.2d,v22.2d,v22.2d
|
||||||
|
ldp d8,d9,[sp,#16] // meet ABI requirements
|
||||||
|
addp v19.2d,v19.2d,v19.2d
|
||||||
|
ldp d10,d11,[sp,#32]
|
||||||
|
addp v23.2d,v23.2d,v23.2d
|
||||||
|
ldp d12,d13,[sp,#48]
|
||||||
|
addp v20.2d,v20.2d,v20.2d
|
||||||
|
ldp d14,d15,[sp,#64]
|
||||||
|
addp v21.2d,v21.2d,v21.2d
|
||||||
|
ldr x30,[sp,#8]
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// lazy reduction, but without narrowing
|
||||||
|
|
||||||
|
ushr v29.2d,v22.2d,#26
|
||||||
|
and v22.16b,v22.16b,v31.16b
|
||||||
|
ushr v30.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
|
||||||
|
add v23.2d,v23.2d,v29.2d // h3 -> h4
|
||||||
|
add v20.2d,v20.2d,v30.2d // h0 -> h1
|
||||||
|
|
||||||
|
ushr v29.2d,v23.2d,#26
|
||||||
|
and v23.16b,v23.16b,v31.16b
|
||||||
|
ushr v30.2d,v20.2d,#26
|
||||||
|
and v20.16b,v20.16b,v31.16b
|
||||||
|
add v21.2d,v21.2d,v30.2d // h1 -> h2
|
||||||
|
|
||||||
|
add v19.2d,v19.2d,v29.2d
|
||||||
|
shl v29.2d,v29.2d,#2
|
||||||
|
ushr v30.2d,v21.2d,#26
|
||||||
|
and v21.16b,v21.16b,v31.16b
|
||||||
|
add v19.2d,v19.2d,v29.2d // h4 -> h0
|
||||||
|
add v22.2d,v22.2d,v30.2d // h2 -> h3
|
||||||
|
|
||||||
|
ushr v29.2d,v19.2d,#26
|
||||||
|
and v19.16b,v19.16b,v31.16b
|
||||||
|
ushr v30.2d,v22.2d,#26
|
||||||
|
and v22.16b,v22.16b,v31.16b
|
||||||
|
add v20.2d,v20.2d,v29.2d // h0 -> h1
|
||||||
|
add v23.2d,v23.2d,v30.2d // h3 -> h4
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////
|
||||||
|
// write the result, can be partially reduced
|
||||||
|
|
||||||
|
st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
|
||||||
|
mov x4,#1
|
||||||
|
st1 {v23.s}[0],[x0]
|
||||||
|
str x4,[x0,#8] // set is_base2_26
|
||||||
|
|
||||||
|
ldr x29,[sp],#80
|
||||||
|
.inst 0xd50323bf // autiasp
|
||||||
|
ret
|
||||||
|
.size poly1305_blocks_neon,.-poly1305_blocks_neon
|
||||||
|
|
||||||
|
.align 5
|
||||||
|
.Lzeros:
|
||||||
|
.long 0,0,0,0,0,0,0,0
|
||||||
|
.asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
|
||||||
|
.align 2
|
||||||
|
#if !defined(__KERNEL__) && !defined(_WIN64)
|
||||||
|
.comm OPENSSL_armcap_P,4,4
|
||||||
|
.hidden OPENSSL_armcap_P
|
||||||
|
#endif
|
230
arch/arm64/crypto/poly1305-glue.c
Normal file
230
arch/arm64/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,230 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/hwcap.h>
|
||||||
|
#include <asm/neon.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
|
||||||
|
asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_arm64(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int neon_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_arch(dctx, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && likely(do_neon))
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, len, hibit);
|
||||||
|
else
|
||||||
|
poly1305_blocks(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
|
||||||
|
const u8 *src, u32 len, bool do_neon)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
neon_poly1305_blocks(dctx, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1, false);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
neon_poly1305_blocks(dctx, src, len, 1, do_neon);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int neon_poly1305_update(struct shash_desc *desc,
|
||||||
|
const u8 *src, unsigned int srclen)
|
||||||
|
{
|
||||||
|
bool do_neon = may_use_simd() && srclen > 128;
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_begin();
|
||||||
|
neon_poly1305_do_update(dctx, src, srclen, do_neon);
|
||||||
|
if (static_branch_likely(&have_neon) && do_neon)
|
||||||
|
kernel_neon_end();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
if (static_branch_likely(&have_neon) && may_use_simd()) {
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, len, SZ_4K);
|
||||||
|
|
||||||
|
kernel_neon_begin();
|
||||||
|
poly1305_blocks_neon(&dctx->h, src, todo, 1);
|
||||||
|
kernel_neon_end();
|
||||||
|
|
||||||
|
len -= todo;
|
||||||
|
src += todo;
|
||||||
|
} while (len);
|
||||||
|
} else {
|
||||||
|
poly1305_blocks(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
}
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg neon_poly1305_alg = {
|
||||||
|
.init = neon_poly1305_init,
|
||||||
|
.update = neon_poly1305_update,
|
||||||
|
.final = neon_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-neon",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init neon_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
if (!(elf_hwcap & HWCAP_ASIMD))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
static_branch_enable(&have_neon);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shash(&neon_poly1305_alg) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit neon_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && (elf_hwcap & HWCAP_ASIMD))
|
||||||
|
crypto_unregister_shash(&neon_poly1305_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(neon_poly1305_mod_init);
|
||||||
|
module_exit(neon_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-neon");
|
|
@ -192,6 +192,7 @@ enum vcpu_sysreg {
|
||||||
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
|
#define cp14_DBGWCR0 (DBGWCR0_EL1 * 2)
|
||||||
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
|
#define cp14_DBGWVR0 (DBGWVR0_EL1 * 2)
|
||||||
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
|
#define cp14_DBGDCCINT (MDCCINT_EL1 * 2)
|
||||||
|
#define cp14_DBGVCR (DBGVCR32_EL2 * 2)
|
||||||
|
|
||||||
#define NR_COPRO_REGS (NR_SYS_REGS * 2)
|
#define NR_COPRO_REGS (NR_SYS_REGS * 2)
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,9 @@ const struct cpumask *cpumask_of_node(int node);
|
||||||
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
|
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
|
||||||
static inline const struct cpumask *cpumask_of_node(int node)
|
static inline const struct cpumask *cpumask_of_node(int node)
|
||||||
{
|
{
|
||||||
|
if (node == NUMA_NO_NODE)
|
||||||
|
return cpu_all_mask;
|
||||||
|
|
||||||
return node_to_cpumask_map[node];
|
return node_to_cpumask_map[node];
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -620,6 +620,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
|
||||||
return (need_wa > 0);
|
return (need_wa > 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
|
||||||
|
{
|
||||||
|
cap->matches(cap, SCOPE_LOCAL_CPU);
|
||||||
|
}
|
||||||
|
|
||||||
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
|
static const __maybe_unused struct midr_range tx2_family_cpus[] = {
|
||||||
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
|
MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
|
||||||
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
|
MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
|
||||||
|
@ -860,9 +866,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
|
||||||
},
|
},
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
|
.desc = "Branch predictor hardening",
|
||||||
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
|
.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
|
||||||
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
|
.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
|
||||||
.matches = check_branch_predictor,
|
.matches = check_branch_predictor,
|
||||||
|
.cpu_enable = cpu_enable_branch_predictor_hardening,
|
||||||
},
|
},
|
||||||
#ifdef CONFIG_HARDEN_EL2_VECTORS
|
#ifdef CONFIG_HARDEN_EL2_VECTORS
|
||||||
{
|
{
|
||||||
|
|
|
@ -290,21 +290,23 @@ void store_cpu_topology(unsigned int cpuid)
|
||||||
if (mpidr & MPIDR_UP_BITMASK)
|
if (mpidr & MPIDR_UP_BITMASK)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/* Create cpu topology mapping based on MPIDR. */
|
/*
|
||||||
if (mpidr & MPIDR_MT_BITMASK) {
|
* This would be the place to create cpu topology based on MPIDR.
|
||||||
/* Multiprocessor system : Multi-threads per core */
|
*
|
||||||
cpuid_topo->thread_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
|
* However, it cannot be trusted to depict the actual topology; some
|
||||||
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
|
* pieces of the architecture enforce an artificial cap on Aff0 values
|
||||||
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 2) |
|
* (e.g. GICv3's ICC_SGI1R_EL1 limits it to 15), leading to an
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 8;
|
* artificial cycling of Aff1, Aff2 and Aff3 values. IOW, these end up
|
||||||
} else {
|
* having absolutely no relationship to the actual underlying system
|
||||||
/* Multiprocessor system : Single-thread per core */
|
* topology, and cannot be reasonably used as core / package ID.
|
||||||
|
*
|
||||||
|
* If the MT bit is set, Aff0 *could* be used to define a thread ID, but
|
||||||
|
* we still wouldn't be able to obtain a sane core ID. This means we
|
||||||
|
* need to entirely ignore MPIDR for any topology deduction.
|
||||||
|
*/
|
||||||
cpuid_topo->thread_id = -1;
|
cpuid_topo->thread_id = -1;
|
||||||
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
|
cpuid_topo->core_id = cpuid;
|
||||||
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1) |
|
cpuid_topo->package_id = cpu_to_node(cpuid);
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8 |
|
|
||||||
MPIDR_AFFINITY_LEVEL(mpidr, 3) << 16;
|
|
||||||
}
|
|
||||||
|
|
||||||
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
|
pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
|
||||||
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
|
cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
|
||||||
|
|
|
@ -1555,9 +1555,9 @@ static const struct sys_reg_desc cp14_regs[] = {
|
||||||
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
|
||||||
DBG_BCR_BVR_WCR_WVR(1),
|
DBG_BCR_BVR_WCR_WVR(1),
|
||||||
/* DBGDCCINT */
|
/* DBGDCCINT */
|
||||||
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug32, NULL, cp14_DBGDCCINT },
|
||||||
/* DBGDSCRext */
|
/* DBGDSCRext */
|
||||||
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug32, NULL, cp14_DBGDSCRext },
|
||||||
DBG_BCR_BVR_WCR_WVR(2),
|
DBG_BCR_BVR_WCR_WVR(2),
|
||||||
/* DBGDTR[RT]Xint */
|
/* DBGDTR[RT]Xint */
|
||||||
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
|
||||||
|
@ -1572,7 +1572,7 @@ static const struct sys_reg_desc cp14_regs[] = {
|
||||||
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
|
{ Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
|
||||||
DBG_BCR_BVR_WCR_WVR(6),
|
DBG_BCR_BVR_WCR_WVR(6),
|
||||||
/* DBGVCR */
|
/* DBGVCR */
|
||||||
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32 },
|
{ Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug32, NULL, cp14_DBGVCR },
|
||||||
DBG_BCR_BVR_WCR_WVR(7),
|
DBG_BCR_BVR_WCR_WVR(7),
|
||||||
DBG_BCR_BVR_WCR_WVR(8),
|
DBG_BCR_BVR_WCR_WVR(8),
|
||||||
DBG_BCR_BVR_WCR_WVR(9),
|
DBG_BCR_BVR_WCR_WVR(9),
|
||||||
|
|
|
@ -58,7 +58,11 @@ EXPORT_SYMBOL(node_to_cpumask_map);
|
||||||
*/
|
*/
|
||||||
const struct cpumask *cpumask_of_node(int node)
|
const struct cpumask *cpumask_of_node(int node)
|
||||||
{
|
{
|
||||||
if (WARN_ON(node >= nr_node_ids))
|
|
||||||
|
if (node == NUMA_NO_NODE)
|
||||||
|
return cpu_all_mask;
|
||||||
|
|
||||||
|
if (WARN_ON(node < 0 || node >= nr_node_ids))
|
||||||
return cpu_none_mask;
|
return cpu_none_mask;
|
||||||
|
|
||||||
if (WARN_ON(node_to_cpumask_map[node] == NULL))
|
if (WARN_ON(node_to_cpumask_map[node] == NULL))
|
||||||
|
|
|
@ -42,7 +42,7 @@ obj-y += esi_stub.o # must be in kernel proper
|
||||||
endif
|
endif
|
||||||
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
|
obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o
|
||||||
|
|
||||||
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
|
obj-$(CONFIG_ELF_CORE) += elfcore.o
|
||||||
|
|
||||||
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
|
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
|
||||||
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
|
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
|
||||||
|
|
|
@ -409,83 +409,9 @@ static void kretprobe_trampoline(void)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* At this point the target function has been tricked into
|
|
||||||
* returning into our trampoline. Lookup the associated instance
|
|
||||||
* and then:
|
|
||||||
* - call the handler function
|
|
||||||
* - cleanup by marking the instance as unused
|
|
||||||
* - long jump back to the original return address
|
|
||||||
*/
|
|
||||||
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
|
int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
struct kretprobe_instance *ri = NULL;
|
regs->cr_iip = __kretprobe_trampoline_handler(regs, kretprobe_trampoline, NULL);
|
||||||
struct hlist_head *head, empty_rp;
|
|
||||||
struct hlist_node *tmp;
|
|
||||||
unsigned long flags, orig_ret_address = 0;
|
|
||||||
unsigned long trampoline_address =
|
|
||||||
((struct fnptr *)kretprobe_trampoline)->ip;
|
|
||||||
|
|
||||||
INIT_HLIST_HEAD(&empty_rp);
|
|
||||||
kretprobe_hash_lock(current, &head, &flags);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* It is possible to have multiple instances associated with a given
|
|
||||||
* task either because an multiple functions in the call path
|
|
||||||
* have a return probe installed on them, and/or more than one return
|
|
||||||
* return probe was registered for a target function.
|
|
||||||
*
|
|
||||||
* We can handle this because:
|
|
||||||
* - instances are always inserted at the head of the list
|
|
||||||
* - when multiple return probes are registered for the same
|
|
||||||
* function, the first instance's ret_addr will point to the
|
|
||||||
* real return address, and all the rest will point to
|
|
||||||
* kretprobe_trampoline
|
|
||||||
*/
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
|
|
||||||
if (ri->task != current)
|
|
||||||
/* another task is sharing our hash bucket */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
|
||||||
if (orig_ret_address != trampoline_address)
|
|
||||||
/*
|
|
||||||
* This is the real return address. Any other
|
|
||||||
* instances associated with this task are for
|
|
||||||
* other calls deeper on the call stack
|
|
||||||
*/
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
regs->cr_iip = orig_ret_address;
|
|
||||||
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, head, hlist) {
|
|
||||||
if (ri->task != current)
|
|
||||||
/* another task is sharing our hash bucket */
|
|
||||||
continue;
|
|
||||||
|
|
||||||
if (ri->rp && ri->rp->handler)
|
|
||||||
ri->rp->handler(ri, regs);
|
|
||||||
|
|
||||||
orig_ret_address = (unsigned long)ri->ret_addr;
|
|
||||||
recycle_rp_inst(ri, &empty_rp);
|
|
||||||
|
|
||||||
if (orig_ret_address != trampoline_address)
|
|
||||||
/*
|
|
||||||
* This is the real return address. Any other
|
|
||||||
* instances associated with this task are for
|
|
||||||
* other calls deeper on the call stack
|
|
||||||
*/
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
kretprobe_assert(ri, orig_ret_address, trampoline_address);
|
|
||||||
|
|
||||||
kretprobe_hash_unlock(current, &flags);
|
|
||||||
|
|
||||||
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
|
|
||||||
hlist_del(&ri->hlist);
|
|
||||||
kfree(ri);
|
|
||||||
}
|
|
||||||
/*
|
/*
|
||||||
* By returning a non-zero value, we are telling
|
* By returning a non-zero value, we are telling
|
||||||
* kprobe_handler() that we don't want the post_handler
|
* kprobe_handler() that we don't want the post_handler
|
||||||
|
@ -498,6 +424,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
|
||||||
struct pt_regs *regs)
|
struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
|
ri->ret_addr = (kprobe_opcode_t *)regs->b0;
|
||||||
|
ri->fp = NULL;
|
||||||
|
|
||||||
/* Replace the return addr with trampoline addr */
|
/* Replace the return addr with trampoline addr */
|
||||||
regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
|
regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip;
|
||||||
|
|
|
@ -339,7 +339,7 @@ libs-y += arch/mips/math-emu/
|
||||||
# See arch/mips/Kbuild for content of core part of the kernel
|
# See arch/mips/Kbuild for content of core part of the kernel
|
||||||
core-y += arch/mips/
|
core-y += arch/mips/
|
||||||
|
|
||||||
drivers-$(CONFIG_MIPS_CRC_SUPPORT) += arch/mips/crypto/
|
drivers-y += arch/mips/crypto/
|
||||||
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
|
drivers-$(CONFIG_OPROFILE) += arch/mips/oprofile/
|
||||||
|
|
||||||
# suspend and hibernation support
|
# suspend and hibernation support
|
||||||
|
|
|
@ -4,3 +4,21 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
|
obj-$(CONFIG_CRYPTO_CRC32_MIPS) += crc32-mips.o
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_CHACHA_MIPS) += chacha-mips.o
|
||||||
|
chacha-mips-y := chacha-core.o chacha-glue.o
|
||||||
|
AFLAGS_chacha-core.o += -O2 # needed to fill branch delay slots
|
||||||
|
|
||||||
|
obj-$(CONFIG_CRYPTO_POLY1305_MIPS) += poly1305-mips.o
|
||||||
|
poly1305-mips-y := poly1305-core.o poly1305-glue.o
|
||||||
|
|
||||||
|
perlasm-flavour-$(CONFIG_CPU_MIPS32) := o32
|
||||||
|
perlasm-flavour-$(CONFIG_CPU_MIPS64) := 64
|
||||||
|
|
||||||
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
|
cmd_perlasm = $(PERL) $(<) $(perlasm-flavour-y) $(@)
|
||||||
|
|
||||||
|
$(obj)/poly1305-core.S: $(src)/poly1305-mips.pl FORCE
|
||||||
|
$(call if_changed,perlasm)
|
||||||
|
|
||||||
|
targets += poly1305-core.S
|
||||||
|
|
497
arch/mips/crypto/chacha-core.S
Normal file
497
arch/mips/crypto/chacha-core.S
Normal file
|
@ -0,0 +1,497 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define MASK_U32 0x3c
|
||||||
|
#define CHACHA20_BLOCK_SIZE 64
|
||||||
|
#define STACK_SIZE 32
|
||||||
|
|
||||||
|
#define X0 $t0
|
||||||
|
#define X1 $t1
|
||||||
|
#define X2 $t2
|
||||||
|
#define X3 $t3
|
||||||
|
#define X4 $t4
|
||||||
|
#define X5 $t5
|
||||||
|
#define X6 $t6
|
||||||
|
#define X7 $t7
|
||||||
|
#define X8 $t8
|
||||||
|
#define X9 $t9
|
||||||
|
#define X10 $v1
|
||||||
|
#define X11 $s6
|
||||||
|
#define X12 $s5
|
||||||
|
#define X13 $s4
|
||||||
|
#define X14 $s3
|
||||||
|
#define X15 $s2
|
||||||
|
/* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
|
||||||
|
#define T0 $s1
|
||||||
|
#define T1 $s0
|
||||||
|
#define T(n) T ## n
|
||||||
|
#define X(n) X ## n
|
||||||
|
|
||||||
|
/* Input arguments */
|
||||||
|
#define STATE $a0
|
||||||
|
#define OUT $a1
|
||||||
|
#define IN $a2
|
||||||
|
#define BYTES $a3
|
||||||
|
|
||||||
|
/* Output argument */
|
||||||
|
/* NONCE[0] is kept in a register and not in memory.
|
||||||
|
* We don't want to touch original value in memory.
|
||||||
|
* Must be incremented every loop iteration.
|
||||||
|
*/
|
||||||
|
#define NONCE_0 $v0
|
||||||
|
|
||||||
|
/* SAVED_X and SAVED_CA are set in the jump table.
|
||||||
|
* Use regs which are overwritten on exit else we don't leak clear data.
|
||||||
|
* They are used to handling the last bytes which are not multiple of 4.
|
||||||
|
*/
|
||||||
|
#define SAVED_X X15
|
||||||
|
#define SAVED_CA $s7
|
||||||
|
|
||||||
|
#define IS_UNALIGNED $s7
|
||||||
|
|
||||||
|
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||||
|
#define MSB 0
|
||||||
|
#define LSB 3
|
||||||
|
#define ROTx rotl
|
||||||
|
#define ROTR(n) rotr n, 24
|
||||||
|
#define CPU_TO_LE32(n) \
|
||||||
|
wsbh n; \
|
||||||
|
rotr n, 16;
|
||||||
|
#else
|
||||||
|
#define MSB 3
|
||||||
|
#define LSB 0
|
||||||
|
#define ROTx rotr
|
||||||
|
#define CPU_TO_LE32(n)
|
||||||
|
#define ROTR(n)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FOR_EACH_WORD(x) \
|
||||||
|
x( 0); \
|
||||||
|
x( 1); \
|
||||||
|
x( 2); \
|
||||||
|
x( 3); \
|
||||||
|
x( 4); \
|
||||||
|
x( 5); \
|
||||||
|
x( 6); \
|
||||||
|
x( 7); \
|
||||||
|
x( 8); \
|
||||||
|
x( 9); \
|
||||||
|
x(10); \
|
||||||
|
x(11); \
|
||||||
|
x(12); \
|
||||||
|
x(13); \
|
||||||
|
x(14); \
|
||||||
|
x(15);
|
||||||
|
|
||||||
|
#define FOR_EACH_WORD_REV(x) \
|
||||||
|
x(15); \
|
||||||
|
x(14); \
|
||||||
|
x(13); \
|
||||||
|
x(12); \
|
||||||
|
x(11); \
|
||||||
|
x(10); \
|
||||||
|
x( 9); \
|
||||||
|
x( 8); \
|
||||||
|
x( 7); \
|
||||||
|
x( 6); \
|
||||||
|
x( 5); \
|
||||||
|
x( 4); \
|
||||||
|
x( 3); \
|
||||||
|
x( 2); \
|
||||||
|
x( 1); \
|
||||||
|
x( 0);
|
||||||
|
|
||||||
|
#define PLUS_ONE_0 1
|
||||||
|
#define PLUS_ONE_1 2
|
||||||
|
#define PLUS_ONE_2 3
|
||||||
|
#define PLUS_ONE_3 4
|
||||||
|
#define PLUS_ONE_4 5
|
||||||
|
#define PLUS_ONE_5 6
|
||||||
|
#define PLUS_ONE_6 7
|
||||||
|
#define PLUS_ONE_7 8
|
||||||
|
#define PLUS_ONE_8 9
|
||||||
|
#define PLUS_ONE_9 10
|
||||||
|
#define PLUS_ONE_10 11
|
||||||
|
#define PLUS_ONE_11 12
|
||||||
|
#define PLUS_ONE_12 13
|
||||||
|
#define PLUS_ONE_13 14
|
||||||
|
#define PLUS_ONE_14 15
|
||||||
|
#define PLUS_ONE_15 16
|
||||||
|
#define PLUS_ONE(x) PLUS_ONE_ ## x
|
||||||
|
#define _CONCAT3(a,b,c) a ## b ## c
|
||||||
|
#define CONCAT3(a,b,c) _CONCAT3(a,b,c)
|
||||||
|
|
||||||
|
#define STORE_UNALIGNED(x) \
|
||||||
|
CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
|
||||||
|
.if (x != 12); \
|
||||||
|
lw T0, (x*4)(STATE); \
|
||||||
|
.endif; \
|
||||||
|
lwl T1, (x*4)+MSB ## (IN); \
|
||||||
|
lwr T1, (x*4)+LSB ## (IN); \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu X ## x, T0; \
|
||||||
|
.endif; \
|
||||||
|
CPU_TO_LE32(X ## x); \
|
||||||
|
xor X ## x, T1; \
|
||||||
|
swl X ## x, (x*4)+MSB ## (OUT); \
|
||||||
|
swr X ## x, (x*4)+LSB ## (OUT);
|
||||||
|
|
||||||
|
#define STORE_ALIGNED(x) \
|
||||||
|
CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
|
||||||
|
.if (x != 12); \
|
||||||
|
lw T0, (x*4)(STATE); \
|
||||||
|
.endif; \
|
||||||
|
lw T1, (x*4) ## (IN); \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu X ## x, T0; \
|
||||||
|
.endif; \
|
||||||
|
CPU_TO_LE32(X ## x); \
|
||||||
|
xor X ## x, T1; \
|
||||||
|
sw X ## x, (x*4) ## (OUT);
|
||||||
|
|
||||||
|
/* Jump table macro.
|
||||||
|
* Used for setup and handling the last bytes, which are not multiple of 4.
|
||||||
|
* X15 is free to store Xn
|
||||||
|
* Every jumptable entry must be equal in size.
|
||||||
|
*/
|
||||||
|
#define JMPTBL_ALIGNED(x) \
|
||||||
|
.Lchacha_mips_jmptbl_aligned_ ## x: ; \
|
||||||
|
.set noreorder; \
|
||||||
|
b .Lchacha_mips_xor_aligned_ ## x ## _b; \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu SAVED_X, X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu SAVED_X, X ## x, SAVED_CA; \
|
||||||
|
.endif; \
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
#define JMPTBL_UNALIGNED(x) \
|
||||||
|
.Lchacha_mips_jmptbl_unaligned_ ## x: ; \
|
||||||
|
.set noreorder; \
|
||||||
|
b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
|
||||||
|
.if (x == 12); \
|
||||||
|
addu SAVED_X, X ## x, NONCE_0; \
|
||||||
|
.else; \
|
||||||
|
addu SAVED_X, X ## x, SAVED_CA; \
|
||||||
|
.endif; \
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
#define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
|
||||||
|
addu X(A), X(K); \
|
||||||
|
addu X(B), X(L); \
|
||||||
|
addu X(C), X(M); \
|
||||||
|
addu X(D), X(N); \
|
||||||
|
xor X(V), X(A); \
|
||||||
|
xor X(W), X(B); \
|
||||||
|
xor X(Y), X(C); \
|
||||||
|
xor X(Z), X(D); \
|
||||||
|
rotl X(V), S; \
|
||||||
|
rotl X(W), S; \
|
||||||
|
rotl X(Y), S; \
|
||||||
|
rotl X(Z), S;
|
||||||
|
|
||||||
|
.text
|
||||||
|
.set reorder
|
||||||
|
.set noat
|
||||||
|
.globl chacha_crypt_arch
|
||||||
|
.ent chacha_crypt_arch
|
||||||
|
chacha_crypt_arch:
|
||||||
|
.frame $sp, STACK_SIZE, $ra
|
||||||
|
|
||||||
|
/* Load number of rounds */
|
||||||
|
lw $at, 16($sp)
|
||||||
|
|
||||||
|
addiu $sp, -STACK_SIZE
|
||||||
|
|
||||||
|
/* Return bytes = 0. */
|
||||||
|
beqz BYTES, .Lchacha_mips_end
|
||||||
|
|
||||||
|
lw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
/* Save s0-s7 */
|
||||||
|
sw $s0, 0($sp)
|
||||||
|
sw $s1, 4($sp)
|
||||||
|
sw $s2, 8($sp)
|
||||||
|
sw $s3, 12($sp)
|
||||||
|
sw $s4, 16($sp)
|
||||||
|
sw $s5, 20($sp)
|
||||||
|
sw $s6, 24($sp)
|
||||||
|
sw $s7, 28($sp)
|
||||||
|
|
||||||
|
/* Test IN or OUT is unaligned.
|
||||||
|
* IS_UNALIGNED = ( IN | OUT ) & 0x00000003
|
||||||
|
*/
|
||||||
|
or IS_UNALIGNED, IN, OUT
|
||||||
|
andi IS_UNALIGNED, 0x3
|
||||||
|
|
||||||
|
b .Lchacha_rounds_start
|
||||||
|
|
||||||
|
.align 4
|
||||||
|
.Loop_chacha_rounds:
|
||||||
|
addiu IN, CHACHA20_BLOCK_SIZE
|
||||||
|
addiu OUT, CHACHA20_BLOCK_SIZE
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
|
||||||
|
.Lchacha_rounds_start:
|
||||||
|
lw X0, 0(STATE)
|
||||||
|
lw X1, 4(STATE)
|
||||||
|
lw X2, 8(STATE)
|
||||||
|
lw X3, 12(STATE)
|
||||||
|
|
||||||
|
lw X4, 16(STATE)
|
||||||
|
lw X5, 20(STATE)
|
||||||
|
lw X6, 24(STATE)
|
||||||
|
lw X7, 28(STATE)
|
||||||
|
lw X8, 32(STATE)
|
||||||
|
lw X9, 36(STATE)
|
||||||
|
lw X10, 40(STATE)
|
||||||
|
lw X11, 44(STATE)
|
||||||
|
|
||||||
|
move X12, NONCE_0
|
||||||
|
lw X13, 52(STATE)
|
||||||
|
lw X14, 56(STATE)
|
||||||
|
lw X15, 60(STATE)
|
||||||
|
|
||||||
|
.Loop_chacha_xor_rounds:
|
||||||
|
addiu $at, -2
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||||
|
bnez $at, .Loop_chacha_xor_rounds
|
||||||
|
|
||||||
|
addiu BYTES, -(CHACHA20_BLOCK_SIZE)
|
||||||
|
|
||||||
|
/* Is data src/dst unaligned? Jump */
|
||||||
|
bnez IS_UNALIGNED, .Loop_chacha_unaligned
|
||||||
|
|
||||||
|
/* Set number rounds here to fill delayslot. */
|
||||||
|
lw $at, (STACK_SIZE+16)($sp)
|
||||||
|
|
||||||
|
/* BYTES < 0, it has no full block. */
|
||||||
|
bltz BYTES, .Lchacha_mips_no_full_block_aligned
|
||||||
|
|
||||||
|
FOR_EACH_WORD_REV(STORE_ALIGNED)
|
||||||
|
|
||||||
|
/* BYTES > 0? Loop again. */
|
||||||
|
bgtz BYTES, .Loop_chacha_rounds
|
||||||
|
|
||||||
|
/* Place this here to fill delay slot */
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
|
||||||
|
/* BYTES < 0? Handle last bytes */
|
||||||
|
bltz BYTES, .Lchacha_mips_xor_bytes
|
||||||
|
|
||||||
|
.Lchacha_mips_xor_done:
|
||||||
|
/* Restore used registers */
|
||||||
|
lw $s0, 0($sp)
|
||||||
|
lw $s1, 4($sp)
|
||||||
|
lw $s2, 8($sp)
|
||||||
|
lw $s3, 12($sp)
|
||||||
|
lw $s4, 16($sp)
|
||||||
|
lw $s5, 20($sp)
|
||||||
|
lw $s6, 24($sp)
|
||||||
|
lw $s7, 28($sp)
|
||||||
|
|
||||||
|
/* Write NONCE_0 back to right location in state */
|
||||||
|
sw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
.Lchacha_mips_end:
|
||||||
|
addiu $sp, STACK_SIZE
|
||||||
|
jr $ra
|
||||||
|
|
||||||
|
.Lchacha_mips_no_full_block_aligned:
|
||||||
|
/* Restore the offset on BYTES */
|
||||||
|
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||||
|
|
||||||
|
/* Get number of full WORDS */
|
||||||
|
andi $at, BYTES, MASK_U32
|
||||||
|
|
||||||
|
/* Load upper half of jump table addr */
|
||||||
|
lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
|
||||||
|
|
||||||
|
/* Calculate lower half jump table offset */
|
||||||
|
ins T0, $at, 1, 6
|
||||||
|
|
||||||
|
/* Add offset to STATE */
|
||||||
|
addu T1, STATE, $at
|
||||||
|
|
||||||
|
/* Add lower half jump table addr */
|
||||||
|
addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
|
||||||
|
|
||||||
|
/* Read value from STATE */
|
||||||
|
lw SAVED_CA, 0(T1)
|
||||||
|
|
||||||
|
/* Store remaining bytecounter as negative value */
|
||||||
|
subu BYTES, $at, BYTES
|
||||||
|
|
||||||
|
jr T0
|
||||||
|
|
||||||
|
/* Jump table */
|
||||||
|
FOR_EACH_WORD(JMPTBL_ALIGNED)
|
||||||
|
|
||||||
|
|
||||||
|
.Loop_chacha_unaligned:
|
||||||
|
/* Set number rounds here to fill delayslot. */
|
||||||
|
lw $at, (STACK_SIZE+16)($sp)
|
||||||
|
|
||||||
|
/* BYTES > 0, it has no full block. */
|
||||||
|
bltz BYTES, .Lchacha_mips_no_full_block_unaligned
|
||||||
|
|
||||||
|
FOR_EACH_WORD_REV(STORE_UNALIGNED)
|
||||||
|
|
||||||
|
/* BYTES > 0? Loop again. */
|
||||||
|
bgtz BYTES, .Loop_chacha_rounds
|
||||||
|
|
||||||
|
/* Write NONCE_0 back to right location in state */
|
||||||
|
sw NONCE_0, 48(STATE)
|
||||||
|
|
||||||
|
.set noreorder
|
||||||
|
/* Fall through to byte handling */
|
||||||
|
bgez BYTES, .Lchacha_mips_xor_done
|
||||||
|
.Lchacha_mips_xor_unaligned_0_b:
|
||||||
|
.Lchacha_mips_xor_aligned_0_b:
|
||||||
|
/* Place this here to fill delay slot */
|
||||||
|
addiu NONCE_0, 1
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
.Lchacha_mips_xor_bytes:
|
||||||
|
addu IN, $at
|
||||||
|
addu OUT, $at
|
||||||
|
/* First byte */
|
||||||
|
lbu T1, 0(IN)
|
||||||
|
addiu $at, BYTES, 1
|
||||||
|
CPU_TO_LE32(SAVED_X)
|
||||||
|
ROTR(SAVED_X)
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 0(OUT)
|
||||||
|
beqz $at, .Lchacha_mips_xor_done
|
||||||
|
/* Second byte */
|
||||||
|
lbu T1, 1(IN)
|
||||||
|
addiu $at, BYTES, 2
|
||||||
|
ROTx SAVED_X, 8
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 1(OUT)
|
||||||
|
beqz $at, .Lchacha_mips_xor_done
|
||||||
|
/* Third byte */
|
||||||
|
lbu T1, 2(IN)
|
||||||
|
ROTx SAVED_X, 8
|
||||||
|
xor T1, SAVED_X
|
||||||
|
sb T1, 2(OUT)
|
||||||
|
b .Lchacha_mips_xor_done
|
||||||
|
|
||||||
|
.Lchacha_mips_no_full_block_unaligned:
|
||||||
|
/* Restore the offset on BYTES */
|
||||||
|
addiu BYTES, CHACHA20_BLOCK_SIZE
|
||||||
|
|
||||||
|
/* Get number of full WORDS */
|
||||||
|
andi $at, BYTES, MASK_U32
|
||||||
|
|
||||||
|
/* Load upper half of jump table addr */
|
||||||
|
lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
|
||||||
|
|
||||||
|
/* Calculate lower half jump table offset */
|
||||||
|
ins T0, $at, 1, 6
|
||||||
|
|
||||||
|
/* Add offset to STATE */
|
||||||
|
addu T1, STATE, $at
|
||||||
|
|
||||||
|
/* Add lower half jump table addr */
|
||||||
|
addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
|
||||||
|
|
||||||
|
/* Read value from STATE */
|
||||||
|
lw SAVED_CA, 0(T1)
|
||||||
|
|
||||||
|
/* Store remaining bytecounter as negative value */
|
||||||
|
subu BYTES, $at, BYTES
|
||||||
|
|
||||||
|
jr T0
|
||||||
|
|
||||||
|
/* Jump table */
|
||||||
|
FOR_EACH_WORD(JMPTBL_UNALIGNED)
|
||||||
|
.end chacha_crypt_arch
|
||||||
|
.set at
|
||||||
|
|
||||||
|
/* Input arguments
|
||||||
|
* STATE $a0
|
||||||
|
* OUT $a1
|
||||||
|
* NROUND $a2
|
||||||
|
*/
|
||||||
|
|
||||||
|
#undef X12
|
||||||
|
#undef X13
|
||||||
|
#undef X14
|
||||||
|
#undef X15
|
||||||
|
|
||||||
|
#define X12 $a3
|
||||||
|
#define X13 $at
|
||||||
|
#define X14 $v0
|
||||||
|
#define X15 STATE
|
||||||
|
|
||||||
|
.set noat
|
||||||
|
.globl hchacha_block_arch
|
||||||
|
.ent hchacha_block_arch
|
||||||
|
hchacha_block_arch:
|
||||||
|
.frame $sp, STACK_SIZE, $ra
|
||||||
|
|
||||||
|
addiu $sp, -STACK_SIZE
|
||||||
|
|
||||||
|
/* Save X11(s6) */
|
||||||
|
sw X11, 0($sp)
|
||||||
|
|
||||||
|
lw X0, 0(STATE)
|
||||||
|
lw X1, 4(STATE)
|
||||||
|
lw X2, 8(STATE)
|
||||||
|
lw X3, 12(STATE)
|
||||||
|
lw X4, 16(STATE)
|
||||||
|
lw X5, 20(STATE)
|
||||||
|
lw X6, 24(STATE)
|
||||||
|
lw X7, 28(STATE)
|
||||||
|
lw X8, 32(STATE)
|
||||||
|
lw X9, 36(STATE)
|
||||||
|
lw X10, 40(STATE)
|
||||||
|
lw X11, 44(STATE)
|
||||||
|
lw X12, 48(STATE)
|
||||||
|
lw X13, 52(STATE)
|
||||||
|
lw X14, 56(STATE)
|
||||||
|
lw X15, 60(STATE)
|
||||||
|
|
||||||
|
.Loop_hchacha_xor_rounds:
|
||||||
|
addiu $a2, -2
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
|
||||||
|
AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
|
||||||
|
AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
|
||||||
|
AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
|
||||||
|
bnez $a2, .Loop_hchacha_xor_rounds
|
||||||
|
|
||||||
|
/* Restore used register */
|
||||||
|
lw X11, 0($sp)
|
||||||
|
|
||||||
|
sw X0, 0(OUT)
|
||||||
|
sw X1, 4(OUT)
|
||||||
|
sw X2, 8(OUT)
|
||||||
|
sw X3, 12(OUT)
|
||||||
|
sw X12, 16(OUT)
|
||||||
|
sw X13, 20(OUT)
|
||||||
|
sw X14, 24(OUT)
|
||||||
|
sw X15, 28(OUT)
|
||||||
|
|
||||||
|
addiu $sp, STACK_SIZE
|
||||||
|
jr $ra
|
||||||
|
.end hchacha_block_arch
|
||||||
|
.set at
|
152
arch/mips/crypto/chacha-glue.c
Normal file
152
arch/mips/crypto/chacha-glue.c
Normal file
|
@ -0,0 +1,152 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* MIPS accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro, Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/byteorder.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds);
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
asmlinkage void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds);
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
static int chacha_mips_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
|
{
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
u32 state[16];
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
chacha_crypt(state, walk.dst.virt.addr, walk.src.virt.addr,
|
||||||
|
nbytes, ctx->nrounds);
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_mips(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_mips_stream_xor(req, ctx, req->iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_mips(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u32 state[16];
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
hchacha_block(state, subctx.key, ctx->nrounds);
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_mips_stream_xor(req, &subctx, real_iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct skcipher_alg algs[] = {
|
||||||
|
{
|
||||||
|
.base.cra_name = "chacha20",
|
||||||
|
.base.cra_driver_name = "chacha20-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = CHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = chacha_mips,
|
||||||
|
.decrypt = chacha_mips,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha20",
|
||||||
|
.base.cra_driver_name = "xchacha20-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha20_setkey,
|
||||||
|
.encrypt = xchacha_mips,
|
||||||
|
.decrypt = xchacha_mips,
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "xchacha12",
|
||||||
|
.base.cra_driver_name = "xchacha12-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = 1,
|
||||||
|
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.min_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.max_keysize = CHACHA_KEY_SIZE,
|
||||||
|
.ivsize = XCHACHA_IV_SIZE,
|
||||||
|
.chunksize = CHACHA_BLOCK_SIZE,
|
||||||
|
.setkey = chacha12_setkey,
|
||||||
|
.encrypt = xchacha_mips,
|
||||||
|
.decrypt = xchacha_mips,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init chacha_simd_mod_init(void)
|
||||||
|
{
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER) ?
|
||||||
|
crypto_register_skciphers(algs, ARRAY_SIZE(algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit chacha_simd_mod_fini(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
|
||||||
|
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
|
||||||
|
module_exit(chacha_simd_mod_fini);
|
||||||
|
|
||||||
|
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (MIPS accelerated)");
|
||||||
|
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("chacha20-mips");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha20-mips");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12");
|
||||||
|
MODULE_ALIAS_CRYPTO("xchacha12-mips");
|
191
arch/mips/crypto/poly1305-glue.c
Normal file
191
arch/mips/crypto/poly1305-glue.c
Normal file
|
@ -0,0 +1,191 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0
|
||||||
|
/*
|
||||||
|
* OpenSSL/Cryptogams accelerated Poly1305 transform for MIPS
|
||||||
|
*
|
||||||
|
* Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <asm/unaligned.h>
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
#include <crypto/internal/poly1305.h>
|
||||||
|
#include <linux/cpufeature.h>
|
||||||
|
#include <linux/crypto.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
asmlinkage void poly1305_init_mips(void *state, const u8 *key);
|
||||||
|
asmlinkage void poly1305_blocks_mips(void *state, const u8 *src, u32 len, u32 hibit);
|
||||||
|
asmlinkage void poly1305_emit_mips(void *state, u8 *digest, const u32 *nonce);
|
||||||
|
|
||||||
|
void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
|
||||||
|
{
|
||||||
|
poly1305_init_mips(&dctx->h, key);
|
||||||
|
dctx->s[0] = get_unaligned_le32(key + 16);
|
||||||
|
dctx->s[1] = get_unaligned_le32(key + 20);
|
||||||
|
dctx->s[2] = get_unaligned_le32(key + 24);
|
||||||
|
dctx->s[3] = get_unaligned_le32(key + 28);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_init_arch);
|
||||||
|
|
||||||
|
static int mips_poly1305_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
dctx->buflen = 0;
|
||||||
|
dctx->rset = 0;
|
||||||
|
dctx->sset = false;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mips_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
u32 len, u32 hibit)
|
||||||
|
{
|
||||||
|
if (unlikely(!dctx->sset)) {
|
||||||
|
if (!dctx->rset) {
|
||||||
|
poly1305_init_mips(&dctx->h, src);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->rset = 1;
|
||||||
|
}
|
||||||
|
if (len >= POLY1305_BLOCK_SIZE) {
|
||||||
|
dctx->s[0] = get_unaligned_le32(src + 0);
|
||||||
|
dctx->s[1] = get_unaligned_le32(src + 4);
|
||||||
|
dctx->s[2] = get_unaligned_le32(src + 8);
|
||||||
|
dctx->s[3] = get_unaligned_le32(src + 12);
|
||||||
|
src += POLY1305_BLOCK_SIZE;
|
||||||
|
len -= POLY1305_BLOCK_SIZE;
|
||||||
|
dctx->sset = true;
|
||||||
|
}
|
||||||
|
if (len < POLY1305_BLOCK_SIZE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
len &= ~(POLY1305_BLOCK_SIZE - 1);
|
||||||
|
|
||||||
|
poly1305_blocks_mips(&dctx->h, src, len, hibit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int mips_poly1305_update(struct shash_desc *desc, const u8 *src,
|
||||||
|
unsigned int len)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
len -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
mips_poly1305_blocks(dctx, dctx->buf, POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(len >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
mips_poly1305_blocks(dctx, src, len, 1);
|
||||||
|
src += round_down(len, POLY1305_BLOCK_SIZE);
|
||||||
|
len %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(len)) {
|
||||||
|
dctx->buflen = len;
|
||||||
|
memcpy(dctx->buf, src, len);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
|
||||||
|
unsigned int nbytes)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
|
||||||
|
memcpy(dctx->buf + dctx->buflen, src, bytes);
|
||||||
|
src += bytes;
|
||||||
|
nbytes -= bytes;
|
||||||
|
dctx->buflen += bytes;
|
||||||
|
|
||||||
|
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
|
||||||
|
poly1305_blocks_mips(&dctx->h, dctx->buf,
|
||||||
|
POLY1305_BLOCK_SIZE, 1);
|
||||||
|
dctx->buflen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
|
||||||
|
unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
|
||||||
|
|
||||||
|
poly1305_blocks_mips(&dctx->h, src, len, 1);
|
||||||
|
src += len;
|
||||||
|
nbytes %= POLY1305_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(nbytes)) {
|
||||||
|
dctx->buflen = nbytes;
|
||||||
|
memcpy(dctx->buf, src, nbytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_update_arch);
|
||||||
|
|
||||||
|
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
|
||||||
|
{
|
||||||
|
if (unlikely(dctx->buflen)) {
|
||||||
|
dctx->buf[dctx->buflen++] = 1;
|
||||||
|
memset(dctx->buf + dctx->buflen, 0,
|
||||||
|
POLY1305_BLOCK_SIZE - dctx->buflen);
|
||||||
|
poly1305_blocks_mips(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
poly1305_emit_mips(&dctx->h, dst, dctx->s);
|
||||||
|
*dctx = (struct poly1305_desc_ctx){};
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(poly1305_final_arch);
|
||||||
|
|
||||||
|
static int mips_poly1305_final(struct shash_desc *desc, u8 *dst)
|
||||||
|
{
|
||||||
|
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
if (unlikely(!dctx->sset))
|
||||||
|
return -ENOKEY;
|
||||||
|
|
||||||
|
poly1305_final_arch(dctx, dst);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg mips_poly1305_alg = {
|
||||||
|
.init = mips_poly1305_init,
|
||||||
|
.update = mips_poly1305_update,
|
||||||
|
.final = mips_poly1305_final,
|
||||||
|
.digestsize = POLY1305_DIGEST_SIZE,
|
||||||
|
.descsize = sizeof(struct poly1305_desc_ctx),
|
||||||
|
|
||||||
|
.base.cra_name = "poly1305",
|
||||||
|
.base.cra_driver_name = "poly1305-mips",
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = POLY1305_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int __init mips_poly1305_mod_init(void)
|
||||||
|
{
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shash(&mips_poly1305_alg) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit mips_poly1305_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
|
||||||
|
crypto_unregister_shash(&mips_poly1305_alg);
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(mips_poly1305_mod_init);
|
||||||
|
module_exit(mips_poly1305_mod_exit);
|
||||||
|
|
||||||
|
MODULE_LICENSE("GPL v2");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305");
|
||||||
|
MODULE_ALIAS_CRYPTO("poly1305-mips");
|
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
1273
arch/mips/crypto/poly1305-mips.pl
Normal file
File diff suppressed because it is too large
Load diff
|
@ -152,6 +152,7 @@ config PPC
|
||||||
select ARCH_USE_BUILTIN_BSWAP
|
select ARCH_USE_BUILTIN_BSWAP
|
||||||
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
|
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
|
||||||
select ARCH_WANT_IPC_PARSE_VERSION
|
select ARCH_WANT_IPC_PARSE_VERSION
|
||||||
|
select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
|
||||||
select ARCH_WEAK_RELEASE_ACQUIRE
|
select ARCH_WEAK_RELEASE_ACQUIRE
|
||||||
select BINFMT_ELF
|
select BINFMT_ELF
|
||||||
select BUILDTIME_EXTABLE_SORT
|
select BUILDTIME_EXTABLE_SORT
|
||||||
|
@ -1009,6 +1010,19 @@ config FSL_RIO
|
||||||
|
|
||||||
source "drivers/rapidio/Kconfig"
|
source "drivers/rapidio/Kconfig"
|
||||||
|
|
||||||
|
config PPC_RTAS_FILTER
|
||||||
|
bool "Enable filtering of RTAS syscalls"
|
||||||
|
default y
|
||||||
|
depends on PPC_RTAS
|
||||||
|
help
|
||||||
|
The RTAS syscall API has security issues that could be used to
|
||||||
|
compromise system integrity. This option enforces restrictions on the
|
||||||
|
RTAS calls and arguments passed by userspace programs to mitigate
|
||||||
|
these issues.
|
||||||
|
|
||||||
|
Say Y unless you know what you are doing and the filter is causing
|
||||||
|
problems for you.
|
||||||
|
|
||||||
endmenu
|
endmenu
|
||||||
|
|
||||||
config NONSTATIC_KERNEL
|
config NONSTATIC_KERNEL
|
||||||
|
|
|
@ -12,6 +12,8 @@
|
||||||
#ifndef _ASM_POWERPC_LMB_H
|
#ifndef _ASM_POWERPC_LMB_H
|
||||||
#define _ASM_POWERPC_LMB_H
|
#define _ASM_POWERPC_LMB_H
|
||||||
|
|
||||||
|
#include <linux/sched.h>
|
||||||
|
|
||||||
struct drmem_lmb {
|
struct drmem_lmb {
|
||||||
u64 base_addr;
|
u64 base_addr;
|
||||||
u32 drc_index;
|
u32 drc_index;
|
||||||
|
@ -22,13 +24,27 @@ struct drmem_lmb {
|
||||||
struct drmem_lmb_info {
|
struct drmem_lmb_info {
|
||||||
struct drmem_lmb *lmbs;
|
struct drmem_lmb *lmbs;
|
||||||
int n_lmbs;
|
int n_lmbs;
|
||||||
u32 lmb_size;
|
u64 lmb_size;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct drmem_lmb_info *drmem_info;
|
extern struct drmem_lmb_info *drmem_info;
|
||||||
|
|
||||||
|
static inline struct drmem_lmb *drmem_lmb_next(struct drmem_lmb *lmb,
|
||||||
|
const struct drmem_lmb *start)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* DLPAR code paths can take several milliseconds per element
|
||||||
|
* when interacting with firmware. Ensure that we don't
|
||||||
|
* unfairly monopolize the CPU.
|
||||||
|
*/
|
||||||
|
if (((++lmb - start) % 16) == 0)
|
||||||
|
cond_resched();
|
||||||
|
|
||||||
|
return lmb;
|
||||||
|
}
|
||||||
|
|
||||||
#define for_each_drmem_lmb_in_range(lmb, start, end) \
|
#define for_each_drmem_lmb_in_range(lmb, start, end) \
|
||||||
for ((lmb) = (start); (lmb) < (end); (lmb)++)
|
for ((lmb) = (start); (lmb) < (end); lmb = drmem_lmb_next(lmb, start))
|
||||||
|
|
||||||
#define for_each_drmem_lmb(lmb) \
|
#define for_each_drmem_lmb(lmb) \
|
||||||
for_each_drmem_lmb_in_range((lmb), \
|
for_each_drmem_lmb_in_range((lmb), \
|
||||||
|
@ -67,7 +83,7 @@ struct of_drconf_cell_v2 {
|
||||||
#define DRCONF_MEM_AI_INVALID 0x00000040
|
#define DRCONF_MEM_AI_INVALID 0x00000040
|
||||||
#define DRCONF_MEM_RESERVED 0x00000080
|
#define DRCONF_MEM_RESERVED 0x00000080
|
||||||
|
|
||||||
static inline u32 drmem_lmb_size(void)
|
static inline u64 drmem_lmb_size(void)
|
||||||
{
|
{
|
||||||
return drmem_info->lmb_size;
|
return drmem_info->lmb_size;
|
||||||
}
|
}
|
||||||
|
|
|
@ -204,7 +204,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
||||||
*/
|
*/
|
||||||
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
|
static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
|
||||||
{
|
{
|
||||||
switch_mm(prev, next, current);
|
switch_mm_irqs_off(prev, next, current);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* We don't currently use enter_lazy_tlb() for anything */
|
/* We don't currently use enter_lazy_tlb() for anything */
|
||||||
|
|
|
@ -788,7 +788,7 @@
|
||||||
#define THRM1_TIN (1 << 31)
|
#define THRM1_TIN (1 << 31)
|
||||||
#define THRM1_TIV (1 << 30)
|
#define THRM1_TIV (1 << 30)
|
||||||
#define THRM1_THRES(x) ((x&0x7f)<<23)
|
#define THRM1_THRES(x) ((x&0x7f)<<23)
|
||||||
#define THRM3_SITV(x) ((x&0x3fff)<<1)
|
#define THRM3_SITV(x) ((x & 0x1fff) << 1)
|
||||||
#define THRM1_TID (1<<2)
|
#define THRM1_TID (1<<2)
|
||||||
#define THRM1_TIE (1<<1)
|
#define THRM1_TIE (1<<1)
|
||||||
#define THRM1_V (1<<0)
|
#define THRM1_V (1<<0)
|
||||||
|
|
|
@ -76,19 +76,6 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||||
return false;
|
return false;
|
||||||
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
|
return cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||||
}
|
}
|
||||||
static inline void mm_reset_thread_local(struct mm_struct *mm)
|
|
||||||
{
|
|
||||||
WARN_ON(atomic_read(&mm->context.copros) > 0);
|
|
||||||
/*
|
|
||||||
* It's possible for mm_access to take a reference on mm_users to
|
|
||||||
* access the remote mm from another thread, but it's not allowed
|
|
||||||
* to set mm_cpumask, so mm_users may be > 1 here.
|
|
||||||
*/
|
|
||||||
WARN_ON(current->mm != mm);
|
|
||||||
atomic_set(&mm->context.active_cpus, 1);
|
|
||||||
cpumask_clear(mm_cpumask(mm));
|
|
||||||
cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
|
|
||||||
}
|
|
||||||
#else /* CONFIG_PPC_BOOK3S_64 */
|
#else /* CONFIG_PPC_BOOK3S_64 */
|
||||||
static inline int mm_is_thread_local(struct mm_struct *mm)
|
static inline int mm_is_thread_local(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
|
|
|
@ -1057,6 +1057,147 @@ struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The sys_rtas syscall, as originally designed, allows root to pass
|
||||||
|
* arbitrary physical addresses to RTAS calls. A number of RTAS calls
|
||||||
|
* can be abused to write to arbitrary memory and do other things that
|
||||||
|
* are potentially harmful to system integrity, and thus should only
|
||||||
|
* be used inside the kernel and not exposed to userspace.
|
||||||
|
*
|
||||||
|
* All known legitimate users of the sys_rtas syscall will only ever
|
||||||
|
* pass addresses that fall within the RMO buffer, and use a known
|
||||||
|
* subset of RTAS calls.
|
||||||
|
*
|
||||||
|
* Accordingly, we filter RTAS requests to check that the call is
|
||||||
|
* permitted, and that provided pointers fall within the RMO buffer.
|
||||||
|
* The rtas_filters list contains an entry for each permitted call,
|
||||||
|
* with the indexes of the parameters which are expected to contain
|
||||||
|
* addresses and sizes of buffers allocated inside the RMO buffer.
|
||||||
|
*/
|
||||||
|
struct rtas_filter {
|
||||||
|
const char *name;
|
||||||
|
int token;
|
||||||
|
/* Indexes into the args buffer, -1 if not used */
|
||||||
|
int buf_idx1;
|
||||||
|
int size_idx1;
|
||||||
|
int buf_idx2;
|
||||||
|
int size_idx2;
|
||||||
|
|
||||||
|
int fixed_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
static struct rtas_filter rtas_filters[] __ro_after_init = {
|
||||||
|
{ "ibm,activate-firmware", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,configure-connector", -1, 0, -1, 1, -1, 4096 }, /* Special cased */
|
||||||
|
{ "display-character", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,display-message", -1, 0, -1, -1, -1 },
|
||||||
|
{ "ibm,errinjct", -1, 2, -1, -1, -1, 1024 },
|
||||||
|
{ "ibm,close-errinjct", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,open-errinct", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-config-addr-info2", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-dynamic-sensor-state", -1, 1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-indices", -1, 2, 3, -1, -1 },
|
||||||
|
{ "get-power-level", -1, -1, -1, -1, -1 },
|
||||||
|
{ "get-sensor-state", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-system-parameter", -1, 1, 2, -1, -1 },
|
||||||
|
{ "get-time-of-day", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,get-vpd", -1, 0, -1, 1, 2 },
|
||||||
|
{ "ibm,lpar-perftools", -1, 2, 3, -1, -1 },
|
||||||
|
{ "ibm,platform-dump", -1, 4, 5, -1, -1 },
|
||||||
|
{ "ibm,read-slot-reset-state", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,scan-log-dump", -1, 0, 1, -1, -1 },
|
||||||
|
{ "ibm,set-dynamic-indicator", -1, 2, -1, -1, -1 },
|
||||||
|
{ "ibm,set-eeh-option", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-indicator", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-power-level", -1, -1, -1, -1, -1 },
|
||||||
|
{ "set-time-for-power-on", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,set-system-parameter", -1, 1, -1, -1, -1 },
|
||||||
|
{ "set-time-of-day", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,suspend-me", -1, -1, -1, -1, -1 },
|
||||||
|
{ "ibm,update-nodes", -1, 0, -1, -1, -1, 4096 },
|
||||||
|
{ "ibm,update-properties", -1, 0, -1, -1, -1, 4096 },
|
||||||
|
{ "ibm,physical-attestation", -1, 0, 1, -1, -1 },
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool in_rmo_buf(u32 base, u32 end)
|
||||||
|
{
|
||||||
|
return base >= rtas_rmo_buf &&
|
||||||
|
base < (rtas_rmo_buf + RTAS_RMOBUF_MAX) &&
|
||||||
|
base <= end &&
|
||||||
|
end >= rtas_rmo_buf &&
|
||||||
|
end < (rtas_rmo_buf + RTAS_RMOBUF_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool block_rtas_call(int token, int nargs,
|
||||||
|
struct rtas_args *args)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
|
||||||
|
struct rtas_filter *f = &rtas_filters[i];
|
||||||
|
u32 base, size, end;
|
||||||
|
|
||||||
|
if (token != f->token)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (f->buf_idx1 != -1) {
|
||||||
|
base = be32_to_cpu(args->args[f->buf_idx1]);
|
||||||
|
if (f->size_idx1 != -1)
|
||||||
|
size = be32_to_cpu(args->args[f->size_idx1]);
|
||||||
|
else if (f->fixed_size)
|
||||||
|
size = f->fixed_size;
|
||||||
|
else
|
||||||
|
size = 1;
|
||||||
|
|
||||||
|
end = base + size - 1;
|
||||||
|
if (!in_rmo_buf(base, end))
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (f->buf_idx2 != -1) {
|
||||||
|
base = be32_to_cpu(args->args[f->buf_idx2]);
|
||||||
|
if (f->size_idx2 != -1)
|
||||||
|
size = be32_to_cpu(args->args[f->size_idx2]);
|
||||||
|
else if (f->fixed_size)
|
||||||
|
size = f->fixed_size;
|
||||||
|
else
|
||||||
|
size = 1;
|
||||||
|
end = base + size - 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Special case for ibm,configure-connector where the
|
||||||
|
* address can be 0
|
||||||
|
*/
|
||||||
|
if (!strcmp(f->name, "ibm,configure-connector") &&
|
||||||
|
base == 0)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
if (!in_rmo_buf(base, end))
|
||||||
|
goto err;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
err:
|
||||||
|
pr_err_ratelimited("sys_rtas: RTAS call blocked - exploit attempt?\n");
|
||||||
|
pr_err_ratelimited("sys_rtas: token=0x%x, nargs=%d (called by %s)\n",
|
||||||
|
token, nargs, current->comm);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
static bool block_rtas_call(int token, int nargs,
|
||||||
|
struct rtas_args *args)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* CONFIG_PPC_RTAS_FILTER */
|
||||||
|
|
||||||
/* We assume to be passed big endian arguments */
|
/* We assume to be passed big endian arguments */
|
||||||
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
||||||
{
|
{
|
||||||
|
@ -1094,6 +1235,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args __user *, uargs)
|
||||||
args.rets = &args.args[nargs];
|
args.rets = &args.args[nargs];
|
||||||
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
|
memset(args.rets, 0, nret * sizeof(rtas_arg_t));
|
||||||
|
|
||||||
|
if (block_rtas_call(token, nargs, &args))
|
||||||
|
return -EINVAL;
|
||||||
|
|
||||||
/* Need to handle ibm,suspend_me call specially */
|
/* Need to handle ibm,suspend_me call specially */
|
||||||
if (token == ibm_suspend_me_token) {
|
if (token == ibm_suspend_me_token) {
|
||||||
|
|
||||||
|
@ -1155,6 +1299,9 @@ void __init rtas_initialize(void)
|
||||||
unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
|
unsigned long rtas_region = RTAS_INSTANTIATE_MAX;
|
||||||
u32 base, size, entry;
|
u32 base, size, entry;
|
||||||
int no_base, no_size, no_entry;
|
int no_base, no_size, no_entry;
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
int i;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Get RTAS dev node and fill up our "rtas" structure with infos
|
/* Get RTAS dev node and fill up our "rtas" structure with infos
|
||||||
* about it.
|
* about it.
|
||||||
|
@ -1190,6 +1337,12 @@ void __init rtas_initialize(void)
|
||||||
#ifdef CONFIG_RTAS_ERROR_LOGGING
|
#ifdef CONFIG_RTAS_ERROR_LOGGING
|
||||||
rtas_last_error_token = rtas_token("rtas-last-error");
|
rtas_last_error_token = rtas_token("rtas-last-error");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef CONFIG_PPC_RTAS_FILTER
|
||||||
|
for (i = 0; i < ARRAY_SIZE(rtas_filters); i++) {
|
||||||
|
rtas_filters[i].token = rtas_token(rtas_filters[i].name);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int __init early_init_dt_scan_rtas(unsigned long node,
|
int __init early_init_dt_scan_rtas(unsigned long node,
|
||||||
|
|
|
@ -29,29 +29,27 @@
|
||||||
|
|
||||||
static DEFINE_PER_CPU(struct cpu, cpu_devices);
|
static DEFINE_PER_CPU(struct cpu, cpu_devices);
|
||||||
|
|
||||||
/*
|
|
||||||
* SMT snooze delay stuff, 64-bit only for now
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifdef CONFIG_PPC64
|
#ifdef CONFIG_PPC64
|
||||||
|
|
||||||
/* Time in microseconds we delay before sleeping in the idle loop */
|
/*
|
||||||
static DEFINE_PER_CPU(long, smt_snooze_delay) = { 100 };
|
* Snooze delay has not been hooked up since 3fa8cad82b94 ("powerpc/pseries/cpuidle:
|
||||||
|
* smt-snooze-delay cleanup.") and has been broken even longer. As was foretold in
|
||||||
|
* 2014:
|
||||||
|
*
|
||||||
|
* "ppc64_util currently utilises it. Once we fix ppc64_util, propose to clean
|
||||||
|
* up the kernel code."
|
||||||
|
*
|
||||||
|
* powerpc-utils stopped using it as of 1.3.8. At some point in the future this
|
||||||
|
* code should be removed.
|
||||||
|
*/
|
||||||
|
|
||||||
static ssize_t store_smt_snooze_delay(struct device *dev,
|
static ssize_t store_smt_snooze_delay(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
const char *buf,
|
const char *buf,
|
||||||
size_t count)
|
size_t count)
|
||||||
{
|
{
|
||||||
struct cpu *cpu = container_of(dev, struct cpu, dev);
|
pr_warn_once("%s (%d) stored to unsupported smt_snooze_delay, which has no effect.\n",
|
||||||
ssize_t ret;
|
current->comm, current->pid);
|
||||||
long snooze;
|
|
||||||
|
|
||||||
ret = sscanf(buf, "%ld", &snooze);
|
|
||||||
if (ret != 1)
|
|
||||||
return -EINVAL;
|
|
||||||
|
|
||||||
per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,9 +57,9 @@ static ssize_t show_smt_snooze_delay(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
struct cpu *cpu = container_of(dev, struct cpu, dev);
|
pr_warn_once("%s (%d) read from unsupported smt_snooze_delay\n",
|
||||||
|
current->comm, current->pid);
|
||||||
return sprintf(buf, "%ld\n", per_cpu(smt_snooze_delay, cpu->dev.id));
|
return sprintf(buf, "100\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
||||||
|
@ -69,16 +67,10 @@ static DEVICE_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
|
||||||
|
|
||||||
static int __init setup_smt_snooze_delay(char *str)
|
static int __init setup_smt_snooze_delay(char *str)
|
||||||
{
|
{
|
||||||
unsigned int cpu;
|
|
||||||
long snooze;
|
|
||||||
|
|
||||||
if (!cpu_has_feature(CPU_FTR_SMT))
|
if (!cpu_has_feature(CPU_FTR_SMT))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
snooze = simple_strtol(str, NULL, 10);
|
pr_warn("smt-snooze-delay command line option has no effect\n");
|
||||||
for_each_possible_cpu(cpu)
|
|
||||||
per_cpu(smt_snooze_delay, cpu) = snooze;
|
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
__setup("smt-snooze-delay=", setup_smt_snooze_delay);
|
__setup("smt-snooze-delay=", setup_smt_snooze_delay);
|
||||||
|
|
|
@ -13,13 +13,14 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/errno.h>
|
#include <linux/errno.h>
|
||||||
#include <linux/jiffies.h>
|
|
||||||
#include <linux/kernel.h>
|
#include <linux/kernel.h>
|
||||||
#include <linux/param.h>
|
#include <linux/param.h>
|
||||||
#include <linux/string.h>
|
#include <linux/string.h>
|
||||||
#include <linux/mm.h>
|
#include <linux/mm.h>
|
||||||
#include <linux/interrupt.h>
|
#include <linux/interrupt.h>
|
||||||
#include <linux/init.h>
|
#include <linux/init.h>
|
||||||
|
#include <linux/delay.h>
|
||||||
|
#include <linux/workqueue.h>
|
||||||
|
|
||||||
#include <asm/io.h>
|
#include <asm/io.h>
|
||||||
#include <asm/reg.h>
|
#include <asm/reg.h>
|
||||||
|
@ -39,9 +40,7 @@ static struct tau_temp
|
||||||
unsigned char grew;
|
unsigned char grew;
|
||||||
} tau[NR_CPUS];
|
} tau[NR_CPUS];
|
||||||
|
|
||||||
struct timer_list tau_timer;
|
static bool tau_int_enable;
|
||||||
|
|
||||||
#undef DEBUG
|
|
||||||
|
|
||||||
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
|
/* TODO: put these in a /proc interface, with some sanity checks, and maybe
|
||||||
* dynamic adjustment to minimize # of interrupts */
|
* dynamic adjustment to minimize # of interrupts */
|
||||||
|
@ -50,74 +49,51 @@ struct timer_list tau_timer;
|
||||||
#define step_size 2 /* step size when temp goes out of range */
|
#define step_size 2 /* step size when temp goes out of range */
|
||||||
#define window_expand 1 /* expand the window by this much */
|
#define window_expand 1 /* expand the window by this much */
|
||||||
/* configurable values for shrinking the window */
|
/* configurable values for shrinking the window */
|
||||||
#define shrink_timer 2*HZ /* period between shrinking the window */
|
#define shrink_timer 2000 /* period between shrinking the window */
|
||||||
#define min_window 2 /* minimum window size, degrees C */
|
#define min_window 2 /* minimum window size, degrees C */
|
||||||
|
|
||||||
static void set_thresholds(unsigned long cpu)
|
static void set_thresholds(unsigned long cpu)
|
||||||
{
|
{
|
||||||
#ifdef CONFIG_TAU_INT
|
u32 maybe_tie = tau_int_enable ? THRM1_TIE : 0;
|
||||||
/*
|
|
||||||
* setup THRM1,
|
|
||||||
* threshold, valid bit, enable interrupts, interrupt when below threshold
|
|
||||||
*/
|
|
||||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TIE | THRM1_TID);
|
|
||||||
|
|
||||||
/* setup THRM2,
|
/* setup THRM1, threshold, valid bit, interrupt when below threshold */
|
||||||
* threshold, valid bit, enable interrupts, interrupt when above threshold
|
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | maybe_tie | THRM1_TID);
|
||||||
*/
|
|
||||||
mtspr (SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | THRM1_TIE);
|
/* setup THRM2, threshold, valid bit, interrupt when above threshold */
|
||||||
#else
|
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V | maybe_tie);
|
||||||
/* same thing but don't enable interrupts */
|
|
||||||
mtspr(SPRN_THRM1, THRM1_THRES(tau[cpu].low) | THRM1_V | THRM1_TID);
|
|
||||||
mtspr(SPRN_THRM2, THRM1_THRES(tau[cpu].high) | THRM1_V);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void TAUupdate(int cpu)
|
static void TAUupdate(int cpu)
|
||||||
{
|
{
|
||||||
unsigned thrm;
|
u32 thrm;
|
||||||
|
u32 bits = THRM1_TIV | THRM1_TIN | THRM1_V;
|
||||||
#ifdef DEBUG
|
|
||||||
printk("TAUupdate ");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* if both thresholds are crossed, the step_sizes cancel out
|
/* if both thresholds are crossed, the step_sizes cancel out
|
||||||
* and the window winds up getting expanded twice. */
|
* and the window winds up getting expanded twice. */
|
||||||
if((thrm = mfspr(SPRN_THRM1)) & THRM1_TIV){ /* is valid? */
|
thrm = mfspr(SPRN_THRM1);
|
||||||
if(thrm & THRM1_TIN){ /* crossed low threshold */
|
if ((thrm & bits) == bits) {
|
||||||
|
mtspr(SPRN_THRM1, 0);
|
||||||
|
|
||||||
if (tau[cpu].low >= step_size) {
|
if (tau[cpu].low >= step_size) {
|
||||||
tau[cpu].low -= step_size;
|
tau[cpu].low -= step_size;
|
||||||
tau[cpu].high -= (step_size - window_expand);
|
tau[cpu].high -= (step_size - window_expand);
|
||||||
}
|
}
|
||||||
tau[cpu].grew = 1;
|
tau[cpu].grew = 1;
|
||||||
#ifdef DEBUG
|
pr_debug("%s: low threshold crossed\n", __func__);
|
||||||
printk("low threshold crossed ");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
thrm = mfspr(SPRN_THRM2);
|
||||||
if((thrm = mfspr(SPRN_THRM2)) & THRM1_TIV){ /* is valid? */
|
if ((thrm & bits) == bits) {
|
||||||
if(thrm & THRM1_TIN){ /* crossed high threshold */
|
mtspr(SPRN_THRM2, 0);
|
||||||
|
|
||||||
if (tau[cpu].high <= 127 - step_size) {
|
if (tau[cpu].high <= 127 - step_size) {
|
||||||
tau[cpu].low += (step_size - window_expand);
|
tau[cpu].low += (step_size - window_expand);
|
||||||
tau[cpu].high += step_size;
|
tau[cpu].high += step_size;
|
||||||
}
|
}
|
||||||
tau[cpu].grew = 1;
|
tau[cpu].grew = 1;
|
||||||
#ifdef DEBUG
|
pr_debug("%s: high threshold crossed\n", __func__);
|
||||||
printk("high threshold crossed ");
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef DEBUG
|
|
||||||
printk("grew = %d\n", tau[cpu].grew);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef CONFIG_TAU_INT /* tau_timeout will do this if not using interrupts */
|
|
||||||
set_thresholds(cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef CONFIG_TAU_INT
|
#ifdef CONFIG_TAU_INT
|
||||||
/*
|
/*
|
||||||
* TAU interrupts - called when we have a thermal assist unit interrupt
|
* TAU interrupts - called when we have a thermal assist unit interrupt
|
||||||
|
@ -140,17 +116,16 @@ void TAUException(struct pt_regs * regs)
|
||||||
static void tau_timeout(void * info)
|
static void tau_timeout(void * info)
|
||||||
{
|
{
|
||||||
int cpu;
|
int cpu;
|
||||||
unsigned long flags;
|
|
||||||
int size;
|
int size;
|
||||||
int shrink;
|
int shrink;
|
||||||
|
|
||||||
/* disabling interrupts *should* be okay */
|
|
||||||
local_irq_save(flags);
|
|
||||||
cpu = smp_processor_id();
|
cpu = smp_processor_id();
|
||||||
|
|
||||||
#ifndef CONFIG_TAU_INT
|
if (!tau_int_enable)
|
||||||
TAUupdate(cpu);
|
TAUupdate(cpu);
|
||||||
#endif
|
|
||||||
|
/* Stop thermal sensor comparisons and interrupts */
|
||||||
|
mtspr(SPRN_THRM3, 0);
|
||||||
|
|
||||||
size = tau[cpu].high - tau[cpu].low;
|
size = tau[cpu].high - tau[cpu].low;
|
||||||
if (size > min_window && ! tau[cpu].grew) {
|
if (size > min_window && ! tau[cpu].grew) {
|
||||||
|
@ -173,32 +148,26 @@ static void tau_timeout(void * info)
|
||||||
|
|
||||||
set_thresholds(cpu);
|
set_thresholds(cpu);
|
||||||
|
|
||||||
/*
|
/* Restart thermal sensor comparisons and interrupts.
|
||||||
* Do the enable every time, since otherwise a bunch of (relatively)
|
* The "PowerPC 740 and PowerPC 750 Microprocessor Datasheet"
|
||||||
* complex sleep code needs to be added. One mtspr every time
|
* recommends that "the maximum value be set in THRM3 under all
|
||||||
* tau_timeout is called is probably not a big deal.
|
* conditions."
|
||||||
*
|
|
||||||
* Enable thermal sensor and set up sample interval timer
|
|
||||||
* need 20 us to do the compare.. until a nice 'cpu_speed' function
|
|
||||||
* call is implemented, just assume a 500 mhz clock. It doesn't really
|
|
||||||
* matter if we take too long for a compare since it's all interrupt
|
|
||||||
* driven anyway.
|
|
||||||
*
|
|
||||||
* use a extra long time.. (60 us @ 500 mhz)
|
|
||||||
*/
|
*/
|
||||||
mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
|
mtspr(SPRN_THRM3, THRM3_SITV(0x1fff) | THRM3_E);
|
||||||
|
|
||||||
local_irq_restore(flags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void tau_timeout_smp(struct timer_list *unused)
|
static struct workqueue_struct *tau_workq;
|
||||||
|
|
||||||
|
static void tau_work_func(struct work_struct *work)
|
||||||
{
|
{
|
||||||
|
msleep(shrink_timer);
|
||||||
/* schedule ourselves to be run again */
|
|
||||||
mod_timer(&tau_timer, jiffies + shrink_timer) ;
|
|
||||||
on_each_cpu(tau_timeout, NULL, 0);
|
on_each_cpu(tau_timeout, NULL, 0);
|
||||||
|
/* schedule ourselves to be run again */
|
||||||
|
queue_work(tau_workq, work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DECLARE_WORK(tau_work, tau_work_func);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* setup the TAU
|
* setup the TAU
|
||||||
*
|
*
|
||||||
|
@ -231,21 +200,19 @@ static int __init TAU_init(void)
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tau_int_enable = IS_ENABLED(CONFIG_TAU_INT) &&
|
||||||
|
!strcmp(cur_cpu_spec->platform, "ppc750");
|
||||||
|
|
||||||
/* first, set up the window shrinking timer */
|
tau_workq = alloc_workqueue("tau", WQ_UNBOUND, 1);
|
||||||
timer_setup(&tau_timer, tau_timeout_smp, 0);
|
if (!tau_workq)
|
||||||
tau_timer.expires = jiffies + shrink_timer;
|
return -ENOMEM;
|
||||||
add_timer(&tau_timer);
|
|
||||||
|
|
||||||
on_each_cpu(TAU_init_smp, NULL, 0);
|
on_each_cpu(TAU_init_smp, NULL, 0);
|
||||||
|
|
||||||
printk("Thermal assist unit ");
|
queue_work(tau_workq, &tau_work);
|
||||||
#ifdef CONFIG_TAU_INT
|
|
||||||
printk("using interrupts, ");
|
pr_info("Thermal assist unit using %s, shrink_timer: %d ms\n",
|
||||||
#else
|
tau_int_enable ? "interrupts" : "workqueue", shrink_timer);
|
||||||
printk("using timers, ");
|
|
||||||
#endif
|
|
||||||
printk("shrink_timer: %d jiffies\n", shrink_timer);
|
|
||||||
tau_initialized = 1;
|
tau_initialized = 1;
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -794,7 +794,7 @@ static void p9_hmi_special_emu(struct pt_regs *regs)
|
||||||
{
|
{
|
||||||
unsigned int ra, rb, t, i, sel, instr, rc;
|
unsigned int ra, rb, t, i, sel, instr, rc;
|
||||||
const void __user *addr;
|
const void __user *addr;
|
||||||
u8 vbuf[16], *vdst;
|
u8 vbuf[16] __aligned(16), *vdst;
|
||||||
unsigned long ea, msr, msr_mask;
|
unsigned long ea, msr, msr_mask;
|
||||||
bool swap;
|
bool swap;
|
||||||
|
|
||||||
|
|
|
@ -598,19 +598,29 @@ static void do_exit_flush_lazy_tlb(void *arg)
|
||||||
struct mm_struct *mm = arg;
|
struct mm_struct *mm = arg;
|
||||||
unsigned long pid = mm->context.id;
|
unsigned long pid = mm->context.id;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* A kthread could have done a mmget_not_zero() after the flushing CPU
|
||||||
|
* checked mm_is_singlethreaded, and be in the process of
|
||||||
|
* kthread_use_mm when interrupted here. In that case, current->mm will
|
||||||
|
* be set to mm, because kthread_use_mm() setting ->mm and switching to
|
||||||
|
* the mm is done with interrupts off.
|
||||||
|
*/
|
||||||
if (current->mm == mm)
|
if (current->mm == mm)
|
||||||
return; /* Local CPU */
|
goto out_flush;
|
||||||
|
|
||||||
if (current->active_mm == mm) {
|
if (current->active_mm == mm) {
|
||||||
/*
|
WARN_ON_ONCE(current->mm != NULL);
|
||||||
* Must be a kernel thread because sender is single-threaded.
|
/* Is a kernel thread and is using mm as the lazy tlb */
|
||||||
*/
|
|
||||||
BUG_ON(current->mm);
|
|
||||||
mmgrab(&init_mm);
|
mmgrab(&init_mm);
|
||||||
switch_mm(mm, &init_mm, current);
|
|
||||||
current->active_mm = &init_mm;
|
current->active_mm = &init_mm;
|
||||||
|
switch_mm_irqs_off(mm, &init_mm, current);
|
||||||
mmdrop(mm);
|
mmdrop(mm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic_dec(&mm->context.active_cpus);
|
||||||
|
cpumask_clear_cpu(smp_processor_id(), mm_cpumask(mm));
|
||||||
|
|
||||||
|
out_flush:
|
||||||
_tlbiel_pid(pid, RIC_FLUSH_ALL);
|
_tlbiel_pid(pid, RIC_FLUSH_ALL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -625,7 +635,6 @@ static void exit_flush_lazy_tlbs(struct mm_struct *mm)
|
||||||
*/
|
*/
|
||||||
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
|
smp_call_function_many(mm_cpumask(mm), do_exit_flush_lazy_tlb,
|
||||||
(void *)mm, 1);
|
(void *)mm, 1);
|
||||||
mm_reset_thread_local(mm);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void radix__flush_tlb_mm(struct mm_struct *mm)
|
void radix__flush_tlb_mm(struct mm_struct *mm)
|
||||||
|
|
|
@ -95,7 +95,7 @@ REQUEST(__field(0, 8, partition_id)
|
||||||
|
|
||||||
#define REQUEST_NAME system_performance_capabilities
|
#define REQUEST_NAME system_performance_capabilities
|
||||||
#define REQUEST_NUM 0x40
|
#define REQUEST_NUM 0x40
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__field(0, 1, perf_collect_privileged)
|
REQUEST(__field(0, 1, perf_collect_privileged)
|
||||||
__field(0x1, 1, capability_mask)
|
__field(0x1, 1, capability_mask)
|
||||||
|
@ -223,7 +223,7 @@ REQUEST(__field(0, 2, partition_id)
|
||||||
|
|
||||||
#define REQUEST_NAME system_hypervisor_times
|
#define REQUEST_NAME system_hypervisor_times
|
||||||
#define REQUEST_NUM 0xF0
|
#define REQUEST_NUM 0xF0
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||||
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
|
__count(0x8, 8, time_spent_processing_virtual_processor_timers)
|
||||||
|
@ -234,7 +234,7 @@ REQUEST(__count(0, 8, time_spent_to_dispatch_virtual_processors)
|
||||||
|
|
||||||
#define REQUEST_NAME system_tlbie_count_and_time
|
#define REQUEST_NAME system_tlbie_count_and_time
|
||||||
#define REQUEST_NUM 0xF4
|
#define REQUEST_NUM 0xF4
|
||||||
#define REQUEST_IDX_KIND "starting_index=0xffffffffffffffff"
|
#define REQUEST_IDX_KIND "starting_index=0xffffffff"
|
||||||
#include I(REQUEST_BEGIN)
|
#include I(REQUEST_BEGIN)
|
||||||
REQUEST(__count(0, 8, tlbie_instructions_issued)
|
REQUEST(__count(0, 8, tlbie_instructions_issued)
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -273,6 +273,15 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||||
|
|
||||||
mask |= CNST_PMC_MASK(pmc);
|
mask |= CNST_PMC_MASK(pmc);
|
||||||
value |= CNST_PMC_VAL(pmc);
|
value |= CNST_PMC_VAL(pmc);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* PMC5 and PMC6 are used to count cycles and instructions and
|
||||||
|
* they do not support most of the constraint bits. Add a check
|
||||||
|
* to exclude PMC5/6 from most of the constraints except for
|
||||||
|
* EBB/BHRB.
|
||||||
|
*/
|
||||||
|
if (pmc >= 5)
|
||||||
|
goto ebb_bhrb;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pmc <= 4) {
|
if (pmc <= 4) {
|
||||||
|
@ -331,6 +340,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ebb_bhrb:
|
||||||
if (!pmc && ebb)
|
if (!pmc && ebb)
|
||||||
/* EBB events must specify the PMC */
|
/* EBB events must specify the PMC */
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -238,12 +238,11 @@ config TAU
|
||||||
temperature within 2-4 degrees Celsius. This option shows the current
|
temperature within 2-4 degrees Celsius. This option shows the current
|
||||||
on-die temperature in /proc/cpuinfo if the cpu supports it.
|
on-die temperature in /proc/cpuinfo if the cpu supports it.
|
||||||
|
|
||||||
Unfortunately, on some chip revisions, this sensor is very inaccurate
|
Unfortunately, this sensor is very inaccurate when uncalibrated, so
|
||||||
and in many cases, does not work at all, so don't assume the cpu
|
don't assume the cpu temp is actually what /proc/cpuinfo says it is.
|
||||||
temp is actually what /proc/cpuinfo says it is.
|
|
||||||
|
|
||||||
config TAU_INT
|
config TAU_INT
|
||||||
bool "Interrupt driven TAU driver (DANGEROUS)"
|
bool "Interrupt driven TAU driver (EXPERIMENTAL)"
|
||||||
depends on TAU
|
depends on TAU
|
||||||
---help---
|
---help---
|
||||||
The TAU supports an interrupt driven mode which causes an interrupt
|
The TAU supports an interrupt driven mode which causes an interrupt
|
||||||
|
@ -251,12 +250,7 @@ config TAU_INT
|
||||||
to get notified the temp has exceeded a range. With this option off,
|
to get notified the temp has exceeded a range. With this option off,
|
||||||
a timer is used to re-check the temperature periodically.
|
a timer is used to re-check the temperature periodically.
|
||||||
|
|
||||||
However, on some cpus it appears that the TAU interrupt hardware
|
If in doubt, say N here.
|
||||||
is buggy and can cause a situation which would lead unexplained hard
|
|
||||||
lockups.
|
|
||||||
|
|
||||||
Unless you are extending the TAU driver, or enjoy kernel/hardware
|
|
||||||
debugging, leave this option off.
|
|
||||||
|
|
||||||
config TAU_AVERAGE
|
config TAU_AVERAGE
|
||||||
bool "Average high and low temp"
|
bool "Average high and low temp"
|
||||||
|
|
|
@ -322,15 +322,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
|
||||||
uint32_t type)
|
|
||||||
{
|
{
|
||||||
struct dump_obj *dump;
|
struct dump_obj *dump;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
|
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
|
||||||
if (!dump)
|
if (!dump)
|
||||||
return NULL;
|
return;
|
||||||
|
|
||||||
dump->kobj.kset = dump_kset;
|
dump->kobj.kset = dump_kset;
|
||||||
|
|
||||||
|
@ -350,21 +349,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
|
||||||
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
|
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
kobject_put(&dump->kobj);
|
kobject_put(&dump->kobj);
|
||||||
return NULL;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As soon as the sysfs file for this dump is created/activated there is
|
||||||
|
* a chance the opal_errd daemon (or any userspace) might read and
|
||||||
|
* acknowledge the dump before kobject_uevent() is called. If that
|
||||||
|
* happens then there is a potential race between
|
||||||
|
* dump_ack_store->kobject_put() and kobject_uevent() which leads to a
|
||||||
|
* use-after-free of a kernfs object resulting in a kernel crash.
|
||||||
|
*
|
||||||
|
* To avoid that, we need to take a reference on behalf of the bin file,
|
||||||
|
* so that our reference remains valid while we call kobject_uevent().
|
||||||
|
* We then drop our reference before exiting the function, leaving the
|
||||||
|
* bin file to drop the last reference (if it hasn't already).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Take a reference for the bin file */
|
||||||
|
kobject_get(&dump->kobj);
|
||||||
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
|
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
|
||||||
if (rc) {
|
if (rc == 0) {
|
||||||
kobject_put(&dump->kobj);
|
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
|
||||||
__func__, dump->id, dump->size);
|
__func__, dump->id, dump->size);
|
||||||
|
} else {
|
||||||
|
/* Drop reference count taken for bin file */
|
||||||
|
kobject_put(&dump->kobj);
|
||||||
|
}
|
||||||
|
|
||||||
kobject_uevent(&dump->kobj, KOBJ_ADD);
|
/* Drop our reference */
|
||||||
|
kobject_put(&dump->kobj);
|
||||||
return dump;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static irqreturn_t process_dump(int irq, void *data)
|
static irqreturn_t process_dump(int irq, void *data)
|
||||||
|
|
|
@ -183,14 +183,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
||||||
{
|
{
|
||||||
struct elog_obj *elog;
|
struct elog_obj *elog;
|
||||||
int rc;
|
int rc;
|
||||||
|
|
||||||
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
|
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
|
||||||
if (!elog)
|
if (!elog)
|
||||||
return NULL;
|
return;
|
||||||
|
|
||||||
elog->kobj.kset = elog_kset;
|
elog->kobj.kset = elog_kset;
|
||||||
|
|
||||||
|
@ -223,18 +223,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
|
||||||
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
|
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
kobject_put(&elog->kobj);
|
kobject_put(&elog->kobj);
|
||||||
return NULL;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* As soon as the sysfs file for this elog is created/activated there is
|
||||||
|
* a chance the opal_errd daemon (or any userspace) might read and
|
||||||
|
* acknowledge the elog before kobject_uevent() is called. If that
|
||||||
|
* happens then there is a potential race between
|
||||||
|
* elog_ack_store->kobject_put() and kobject_uevent() which leads to a
|
||||||
|
* use-after-free of a kernfs object resulting in a kernel crash.
|
||||||
|
*
|
||||||
|
* To avoid that, we need to take a reference on behalf of the bin file,
|
||||||
|
* so that our reference remains valid while we call kobject_uevent().
|
||||||
|
* We then drop our reference before exiting the function, leaving the
|
||||||
|
* bin file to drop the last reference (if it hasn't already).
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Take a reference for the bin file */
|
||||||
|
kobject_get(&elog->kobj);
|
||||||
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
|
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
|
||||||
if (rc) {
|
if (rc == 0) {
|
||||||
|
kobject_uevent(&elog->kobj, KOBJ_ADD);
|
||||||
|
} else {
|
||||||
|
/* Drop the reference taken for the bin file */
|
||||||
kobject_put(&elog->kobj);
|
kobject_put(&elog->kobj);
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kobject_uevent(&elog->kobj, KOBJ_ADD);
|
/* Drop our reference */
|
||||||
|
kobject_put(&elog->kobj);
|
||||||
|
|
||||||
return elog;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
static irqreturn_t elog_event(int irq, void *data)
|
static irqreturn_t elog_event(int irq, void *data)
|
||||||
|
|
|
@ -47,7 +47,7 @@
|
||||||
#include <asm/udbg.h>
|
#include <asm/udbg.h>
|
||||||
#define DBG(fmt...) udbg_printf(fmt)
|
#define DBG(fmt...) udbg_printf(fmt)
|
||||||
#else
|
#else
|
||||||
#define DBG(fmt...)
|
#define DBG(fmt...) do { } while (0)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void pnv_smp_setup_cpu(int cpu)
|
static void pnv_smp_setup_cpu(int cpu)
|
||||||
|
|
|
@ -40,6 +40,7 @@ static __init int rng_init(void)
|
||||||
|
|
||||||
ppc_md.get_random_seed = pseries_get_random_long;
|
ppc_md.get_random_seed = pseries_get_random_long;
|
||||||
|
|
||||||
|
of_node_put(dn);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
machine_subsys_initcall(pseries, rng_init);
|
machine_subsys_initcall(pseries, rng_init);
|
||||||
|
|
|
@ -179,6 +179,7 @@ int icp_hv_init(void)
|
||||||
|
|
||||||
icp_ops = &icp_hv_ops;
|
icp_ops = &icp_hv_ops;
|
||||||
|
|
||||||
|
of_node_put(np);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,4 +21,7 @@
|
||||||
/* vDSO location */
|
/* vDSO location */
|
||||||
#define AT_SYSINFO_EHDR 33
|
#define AT_SYSINFO_EHDR 33
|
||||||
|
|
||||||
|
/* entries in ARCH_DLINFO */
|
||||||
|
#define AT_VECTOR_SIZE_ARCH 1
|
||||||
|
|
||||||
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */
|
#endif /* _UAPI_ASM_RISCV_AUXVEC_H */
|
||||||
|
|
|
@ -356,6 +356,7 @@ static unsigned long clock_sync_flags;
|
||||||
|
|
||||||
#define CLOCK_SYNC_HAS_STP 0
|
#define CLOCK_SYNC_HAS_STP 0
|
||||||
#define CLOCK_SYNC_STP 1
|
#define CLOCK_SYNC_STP 1
|
||||||
|
#define CLOCK_SYNC_STPINFO_VALID 2
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The get_clock function for the physical clock. It will get the current
|
* The get_clock function for the physical clock. It will get the current
|
||||||
|
@ -592,6 +593,22 @@ void stp_queue_work(void)
|
||||||
queue_work(time_sync_wq, &stp_work);
|
queue_work(time_sync_wq, &stp_work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int __store_stpinfo(void)
|
||||||
|
{
|
||||||
|
int rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
|
||||||
|
|
||||||
|
if (rc)
|
||||||
|
clear_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
else
|
||||||
|
set_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int stpinfo_valid(void)
|
||||||
|
{
|
||||||
|
return stp_online && test_bit(CLOCK_SYNC_STPINFO_VALID, &clock_sync_flags);
|
||||||
|
}
|
||||||
|
|
||||||
static int stp_sync_clock(void *data)
|
static int stp_sync_clock(void *data)
|
||||||
{
|
{
|
||||||
struct clock_sync_data *sync = data;
|
struct clock_sync_data *sync = data;
|
||||||
|
@ -613,8 +630,7 @@ static int stp_sync_clock(void *data)
|
||||||
if (rc == 0) {
|
if (rc == 0) {
|
||||||
sync->clock_delta = clock_delta;
|
sync->clock_delta = clock_delta;
|
||||||
clock_sync_global(clock_delta);
|
clock_sync_global(clock_delta);
|
||||||
rc = chsc_sstpi(stp_page, &stp_info,
|
rc = __store_stpinfo();
|
||||||
sizeof(struct stp_sstpi));
|
|
||||||
if (rc == 0 && stp_info.tmd != 2)
|
if (rc == 0 && stp_info.tmd != 2)
|
||||||
rc = -EAGAIN;
|
rc = -EAGAIN;
|
||||||
}
|
}
|
||||||
|
@ -659,7 +675,7 @@ static void stp_work_fn(struct work_struct *work)
|
||||||
if (rc)
|
if (rc)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
||||||
rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
|
rc = __store_stpinfo();
|
||||||
if (rc || stp_info.c == 0)
|
if (rc || stp_info.c == 0)
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
|
|
||||||
|
@ -696,10 +712,14 @@ static ssize_t stp_ctn_id_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%016llx\n",
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%016llx\n",
|
||||||
*(unsigned long long *) stp_info.ctnid);
|
*(unsigned long long *) stp_info.ctnid);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
|
static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
|
||||||
|
@ -708,9 +728,13 @@ static ssize_t stp_ctn_type_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.ctn);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.ctn);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
|
static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
|
||||||
|
@ -719,9 +743,13 @@ static ssize_t stp_dst_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x2000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x2000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
|
static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
|
||||||
|
@ -730,9 +758,13 @@ static ssize_t stp_leap_seconds_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x8000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x8000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
|
static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
|
||||||
|
@ -741,9 +773,13 @@ static ssize_t stp_stratum_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
|
static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
|
||||||
|
@ -752,9 +788,13 @@ static ssize_t stp_time_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x0800))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int) stp_info.tto);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x0800))
|
||||||
|
ret = sprintf(buf, "%i\n", (int) stp_info.tto);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
|
static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
|
||||||
|
@ -763,9 +803,13 @@ static ssize_t stp_time_zone_offset_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online || !(stp_info.vbits & 0x4000))
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid() && (stp_info.vbits & 0x4000))
|
||||||
|
ret = sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(time_zone_offset, 0400,
|
static DEVICE_ATTR(time_zone_offset, 0400,
|
||||||
|
@ -775,9 +819,13 @@ static ssize_t stp_timing_mode_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.tmd);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.tmd);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
|
static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
|
||||||
|
@ -786,9 +834,13 @@ static ssize_t stp_timing_state_show(struct device *dev,
|
||||||
struct device_attribute *attr,
|
struct device_attribute *attr,
|
||||||
char *buf)
|
char *buf)
|
||||||
{
|
{
|
||||||
if (!stp_online)
|
ssize_t ret = -ENODATA;
|
||||||
return -ENODATA;
|
|
||||||
return sprintf(buf, "%i\n", stp_info.tst);
|
mutex_lock(&stp_work_mutex);
|
||||||
|
if (stpinfo_valid())
|
||||||
|
ret = sprintf(buf, "%i\n", stp_info.tst);
|
||||||
|
mutex_unlock(&stp_work_mutex);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
|
static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
|
||||||
|
|
|
@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void)
|
||||||
* are flush_tlb_*() routines, and these run after flush_cache_*()
|
* are flush_tlb_*() routines, and these run after flush_cache_*()
|
||||||
* which performs the flushw.
|
* which performs the flushw.
|
||||||
*
|
*
|
||||||
* The SMP TLB coherency scheme we use works as follows:
|
* mm->cpu_vm_mask is a bit mask of which cpus an address
|
||||||
*
|
|
||||||
* 1) mm->cpu_vm_mask is a bit mask of which cpus an address
|
|
||||||
* space has (potentially) executed on, this is the heuristic
|
* space has (potentially) executed on, this is the heuristic
|
||||||
* we use to avoid doing cross calls.
|
* we use to limit cross calls.
|
||||||
*
|
|
||||||
* Also, for flushing from kswapd and also for clones, we
|
|
||||||
* use cpu_vm_mask as the list of cpus to make run the TLB.
|
|
||||||
*
|
|
||||||
* 2) TLB context numbers are shared globally across all processors
|
|
||||||
* in the system, this allows us to play several games to avoid
|
|
||||||
* cross calls.
|
|
||||||
*
|
|
||||||
* One invariant is that when a cpu switches to a process, and
|
|
||||||
* that processes tsk->active_mm->cpu_vm_mask does not have the
|
|
||||||
* current cpu's bit set, that tlb context is flushed locally.
|
|
||||||
*
|
|
||||||
* If the address space is non-shared (ie. mm->count == 1) we avoid
|
|
||||||
* cross calls when we want to flush the currently running process's
|
|
||||||
* tlb state. This is done by clearing all cpu bits except the current
|
|
||||||
* processor's in current->mm->cpu_vm_mask and performing the
|
|
||||||
* flush locally only. This will force any subsequent cpus which run
|
|
||||||
* this task to flush the context from the local tlb if the process
|
|
||||||
* migrates to another cpu (again).
|
|
||||||
*
|
|
||||||
* 3) For shared address spaces (threads) and swapping we bite the
|
|
||||||
* bullet for most cases and perform the cross call (but only to
|
|
||||||
* the cpus listed in cpu_vm_mask).
|
|
||||||
*
|
|
||||||
* The performance gain from "optimizing" away the cross call for threads is
|
|
||||||
* questionable (in theory the big win for threads is the massive sharing of
|
|
||||||
* address space state across processors).
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* This currently is only used by the hugetlb arch pre-fault
|
/* This currently is only used by the hugetlb arch pre-fault
|
||||||
|
@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void)
|
||||||
void smp_flush_tlb_mm(struct mm_struct *mm)
|
void smp_flush_tlb_mm(struct mm_struct *mm)
|
||||||
{
|
{
|
||||||
u32 ctx = CTX_HWBITS(mm->context);
|
u32 ctx = CTX_HWBITS(mm->context);
|
||||||
int cpu = get_cpu();
|
|
||||||
|
|
||||||
if (atomic_read(&mm->mm_users) == 1) {
|
get_cpu();
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
goto local_flush_and_out;
|
|
||||||
}
|
|
||||||
|
|
||||||
smp_cross_call_masked(&xcall_flush_tlb_mm,
|
smp_cross_call_masked(&xcall_flush_tlb_mm,
|
||||||
ctx, 0, 0,
|
ctx, 0, 0,
|
||||||
mm_cpumask(mm));
|
mm_cpumask(mm));
|
||||||
|
|
||||||
local_flush_and_out:
|
|
||||||
__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
|
__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
|
||||||
|
|
||||||
put_cpu();
|
put_cpu();
|
||||||
|
@ -1114,15 +1080,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
|
||||||
{
|
{
|
||||||
u32 ctx = CTX_HWBITS(mm->context);
|
u32 ctx = CTX_HWBITS(mm->context);
|
||||||
struct tlb_pending_info info;
|
struct tlb_pending_info info;
|
||||||
int cpu = get_cpu();
|
|
||||||
|
get_cpu();
|
||||||
|
|
||||||
info.ctx = ctx;
|
info.ctx = ctx;
|
||||||
info.nr = nr;
|
info.nr = nr;
|
||||||
info.vaddrs = vaddrs;
|
info.vaddrs = vaddrs;
|
||||||
|
|
||||||
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
|
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
else
|
|
||||||
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
|
smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
|
||||||
&info, 1);
|
&info, 1);
|
||||||
|
|
||||||
|
@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
|
||||||
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
|
void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
|
||||||
{
|
{
|
||||||
unsigned long context = CTX_HWBITS(mm->context);
|
unsigned long context = CTX_HWBITS(mm->context);
|
||||||
int cpu = get_cpu();
|
|
||||||
|
|
||||||
if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
|
get_cpu();
|
||||||
cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
|
|
||||||
else
|
|
||||||
smp_cross_call_masked(&xcall_flush_tlb_page,
|
smp_cross_call_masked(&xcall_flush_tlb_page,
|
||||||
context, vaddr, 0,
|
context, vaddr, 0,
|
||||||
mm_cpumask(mm));
|
mm_cpumask(mm));
|
||||||
|
|
||||||
__flush_tlb_page(context, vaddr);
|
__flush_tlb_page(context, vaddr);
|
||||||
|
|
||||||
put_cpu();
|
put_cpu();
|
||||||
|
|
|
@ -36,14 +36,14 @@ int write_sigio_irq(int fd)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
|
/* These are called from os-Linux/sigio.c to protect its pollfds arrays. */
|
||||||
static DEFINE_SPINLOCK(sigio_spinlock);
|
static DEFINE_MUTEX(sigio_mutex);
|
||||||
|
|
||||||
void sigio_lock(void)
|
void sigio_lock(void)
|
||||||
{
|
{
|
||||||
spin_lock(&sigio_spinlock);
|
mutex_lock(&sigio_mutex);
|
||||||
}
|
}
|
||||||
|
|
||||||
void sigio_unlock(void)
|
void sigio_unlock(void)
|
||||||
{
|
{
|
||||||
spin_unlock(&sigio_spinlock);
|
mutex_unlock(&sigio_mutex);
|
||||||
}
|
}
|
||||||
|
|
|
@ -200,9 +200,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
|
||||||
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
|
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
|
||||||
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
|
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
|
||||||
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
|
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
|
||||||
|
adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
|
||||||
|
|
||||||
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||||
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
|
KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
|
||||||
|
|
||||||
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
|
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,7 @@ CONFIG_EMBEDDED=y
|
||||||
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
# CONFIG_SLAB_MERGE_DEFAULT is not set
|
||||||
CONFIG_PROFILING=y
|
CONFIG_PROFILING=y
|
||||||
CONFIG_SMP=y
|
CONFIG_SMP=y
|
||||||
|
CONFIG_X86_X2APIC=y
|
||||||
CONFIG_HYPERVISOR_GUEST=y
|
CONFIG_HYPERVISOR_GUEST=y
|
||||||
CONFIG_PARAVIRT=y
|
CONFIG_PARAVIRT=y
|
||||||
CONFIG_NR_CPUS=32
|
CONFIG_NR_CPUS=32
|
||||||
|
@ -213,6 +214,7 @@ CONFIG_DM_VERITY_FEC=y
|
||||||
CONFIG_DM_BOW=y
|
CONFIG_DM_BOW=y
|
||||||
CONFIG_NETDEVICES=y
|
CONFIG_NETDEVICES=y
|
||||||
CONFIG_DUMMY=y
|
CONFIG_DUMMY=y
|
||||||
|
CONFIG_WIREGUARD=y
|
||||||
CONFIG_TUN=y
|
CONFIG_TUN=y
|
||||||
CONFIG_VETH=y
|
CONFIG_VETH=y
|
||||||
# CONFIG_ETHERNET is not set
|
# CONFIG_ETHERNET is not set
|
||||||
|
@ -310,6 +312,7 @@ CONFIG_HID_NINTENDO=y
|
||||||
CONFIG_HID_SONY=y
|
CONFIG_HID_SONY=y
|
||||||
CONFIG_HID_STEAM=y
|
CONFIG_HID_STEAM=y
|
||||||
CONFIG_USB_HIDDEV=y
|
CONFIG_USB_HIDDEV=y
|
||||||
|
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
|
||||||
CONFIG_USB_XHCI_HCD=y
|
CONFIG_USB_XHCI_HCD=y
|
||||||
CONFIG_USB_GADGET=y
|
CONFIG_USB_GADGET=y
|
||||||
CONFIG_USB_GADGET_VBUS_DRAW=500
|
CONFIG_USB_GADGET_VBUS_DRAW=500
|
||||||
|
@ -436,6 +439,7 @@ CONFIG_CRC8=y
|
||||||
CONFIG_XZ_DEC=y
|
CONFIG_XZ_DEC=y
|
||||||
CONFIG_PRINTK_TIME=y
|
CONFIG_PRINTK_TIME=y
|
||||||
CONFIG_DEBUG_INFO=y
|
CONFIG_DEBUG_INFO=y
|
||||||
|
CONFIG_DEBUG_INFO_DWARF4=y
|
||||||
# CONFIG_ENABLE_MUST_CHECK is not set
|
# CONFIG_ENABLE_MUST_CHECK is not set
|
||||||
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
|
||||||
CONFIG_MAGIC_SYSRQ=y
|
CONFIG_MAGIC_SYSRQ=y
|
||||||
|
|
1
arch/x86/crypto/.gitignore
vendored
Normal file
1
arch/x86/crypto/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
poly1305-x86_64-cryptogams.S
|
|
@ -8,8 +8,10 @@ OBJECT_FILES_NON_STANDARD := y
|
||||||
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
|
||||||
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
|
||||||
$(comma)4)$(comma)%ymm2,yes,no)
|
$(comma)4)$(comma)%ymm2,yes,no)
|
||||||
|
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
|
||||||
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
|
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||||
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
|
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
|
||||||
|
adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
|
||||||
|
|
||||||
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
|
||||||
|
|
||||||
|
@ -23,7 +25,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
|
||||||
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
|
obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
|
||||||
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
|
||||||
|
@ -46,6 +48,11 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
|
||||||
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
|
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
|
||||||
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
|
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
|
||||||
|
|
||||||
|
# These modules require the assembler to support ADX.
|
||||||
|
ifeq ($(adx_supported),yes)
|
||||||
|
obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
|
||||||
|
endif
|
||||||
|
|
||||||
# These modules require assembler to support AVX.
|
# These modules require assembler to support AVX.
|
||||||
ifeq ($(avx_supported),yes)
|
ifeq ($(avx_supported),yes)
|
||||||
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
|
||||||
|
@ -54,6 +61,7 @@ ifeq ($(avx_supported),yes)
|
||||||
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o
|
||||||
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o
|
||||||
|
obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# These modules require assembler to support AVX2.
|
# These modules require assembler to support AVX2.
|
||||||
|
@ -77,7 +85,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
|
||||||
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
|
||||||
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
|
||||||
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
|
||||||
chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
|
chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
|
||||||
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
|
||||||
|
|
||||||
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
|
||||||
|
@ -87,6 +95,12 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
|
||||||
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
|
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
|
||||||
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
|
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
|
||||||
|
|
||||||
|
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
|
||||||
|
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
|
||||||
|
ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
|
||||||
|
targets += poly1305-x86_64-cryptogams.S
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(avx_supported),yes)
|
ifeq ($(avx_supported),yes)
|
||||||
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
|
||||||
camellia_aesni_avx_glue.o
|
camellia_aesni_avx_glue.o
|
||||||
|
@ -100,20 +114,22 @@ endif
|
||||||
|
|
||||||
ifeq ($(avx2_supported),yes)
|
ifeq ($(avx2_supported),yes)
|
||||||
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
|
||||||
chacha20-x86_64-y += chacha20-avx2-x86_64.o
|
chacha-x86_64-y += chacha-avx2-x86_64.o
|
||||||
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
|
||||||
|
|
||||||
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
|
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(avx512_supported),yes)
|
||||||
|
chacha-x86_64-y += chacha-avx512vl-x86_64.o
|
||||||
|
endif
|
||||||
|
|
||||||
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
|
||||||
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
|
||||||
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
|
||||||
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
|
||||||
poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
|
|
||||||
ifeq ($(avx2_supported),yes)
|
ifeq ($(avx2_supported),yes)
|
||||||
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
|
||||||
poly1305-x86_64-y += poly1305-avx2-x86_64.o
|
|
||||||
endif
|
endif
|
||||||
ifeq ($(sha1_ni_supported),yes)
|
ifeq ($(sha1_ni_supported),yes)
|
||||||
sha1-ssse3-y += sha1_ni_asm.o
|
sha1-ssse3-y += sha1_ni_asm.o
|
||||||
|
@ -127,3 +143,8 @@ sha256-ssse3-y += sha256_ni_asm.o
|
||||||
endif
|
endif
|
||||||
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
|
||||||
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
|
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
|
||||||
|
|
||||||
|
quiet_cmd_perlasm = PERLASM $@
|
||||||
|
cmd_perlasm = $(PERL) $< > $@
|
||||||
|
$(obj)/%.S: $(src)/%.pl FORCE
|
||||||
|
$(call if_changed,perlasm)
|
||||||
|
|
258
arch/x86/crypto/blake2s-core.S
Normal file
258
arch/x86/crypto/blake2s-core.S
Normal file
|
@ -0,0 +1,258 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0 OR MIT */
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
|
||||||
|
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667
|
||||||
|
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
|
||||||
|
.section .rodata.cst16.ROT16, "aM", @progbits, 16
|
||||||
|
.align 16
|
||||||
|
ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302
|
||||||
|
.section .rodata.cst16.ROR328, "aM", @progbits, 16
|
||||||
|
.align 16
|
||||||
|
ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
|
||||||
|
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
|
||||||
|
.align 64
|
||||||
|
SIGMA:
|
||||||
|
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||||
|
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
|
||||||
|
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
|
||||||
|
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
|
||||||
|
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
|
||||||
|
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
|
||||||
|
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
|
||||||
|
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
|
||||||
|
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
|
||||||
|
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
|
||||||
|
#ifdef CONFIG_AS_AVX512
|
||||||
|
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
|
||||||
|
.align 64
|
||||||
|
SIGMA2:
|
||||||
|
.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
|
||||||
|
.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
|
||||||
|
.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
|
||||||
|
.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
|
||||||
|
.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
|
||||||
|
.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
|
||||||
|
.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
|
||||||
|
.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
|
||||||
|
.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
|
||||||
|
.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
|
||||||
|
#endif /* CONFIG_AS_AVX512 */
|
||||||
|
|
||||||
|
.text
|
||||||
|
#ifdef CONFIG_AS_SSSE3
|
||||||
|
ENTRY(blake2s_compress_ssse3)
|
||||||
|
testq %rdx,%rdx
|
||||||
|
je .Lendofloop
|
||||||
|
movdqu (%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqa ROT16(%rip),%xmm12
|
||||||
|
movdqa ROR328(%rip),%xmm13
|
||||||
|
movdqu 0x20(%rdi),%xmm14
|
||||||
|
movq %rcx,%xmm15
|
||||||
|
leaq SIGMA+0xa0(%rip),%r8
|
||||||
|
jmp .Lbeginofloop
|
||||||
|
.align 32
|
||||||
|
.Lbeginofloop:
|
||||||
|
movdqa %xmm0,%xmm10
|
||||||
|
movdqa %xmm1,%xmm11
|
||||||
|
paddq %xmm15,%xmm14
|
||||||
|
movdqa IV(%rip),%xmm2
|
||||||
|
movdqa %xmm14,%xmm3
|
||||||
|
pxor IV+0x10(%rip),%xmm3
|
||||||
|
leaq SIGMA(%rip),%rcx
|
||||||
|
.Lroundloop:
|
||||||
|
movzbl (%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0x1(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0x2(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x3(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
punpckldq %xmm5,%xmm4
|
||||||
|
punpckldq %xmm7,%xmm6
|
||||||
|
punpcklqdq %xmm6,%xmm4
|
||||||
|
paddd %xmm4,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm12,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0xc,%xmm1
|
||||||
|
pslld $0x14,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
movzbl 0x4(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0x5(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x6(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0x7(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
punpckldq %xmm6,%xmm5
|
||||||
|
punpckldq %xmm4,%xmm7
|
||||||
|
punpcklqdq %xmm7,%xmm5
|
||||||
|
paddd %xmm5,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm13,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0x7,%xmm1
|
||||||
|
pslld $0x19,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
pshufd $0x93,%xmm0,%xmm0
|
||||||
|
pshufd $0x4e,%xmm3,%xmm3
|
||||||
|
pshufd $0x39,%xmm2,%xmm2
|
||||||
|
movzbl 0x8(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
movzbl 0x9(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0xa(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0xb(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
punpckldq %xmm7,%xmm6
|
||||||
|
punpckldq %xmm5,%xmm4
|
||||||
|
punpcklqdq %xmm4,%xmm6
|
||||||
|
paddd %xmm6,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm12,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0xc,%xmm1
|
||||||
|
pslld $0x14,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
movzbl 0xc(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm7
|
||||||
|
movzbl 0xd(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm4
|
||||||
|
movzbl 0xe(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm5
|
||||||
|
movzbl 0xf(%rcx),%eax
|
||||||
|
movd (%rsi,%rax,4),%xmm6
|
||||||
|
punpckldq %xmm4,%xmm7
|
||||||
|
punpckldq %xmm6,%xmm5
|
||||||
|
punpcklqdq %xmm5,%xmm7
|
||||||
|
paddd %xmm7,%xmm0
|
||||||
|
paddd %xmm1,%xmm0
|
||||||
|
pxor %xmm0,%xmm3
|
||||||
|
pshufb %xmm13,%xmm3
|
||||||
|
paddd %xmm3,%xmm2
|
||||||
|
pxor %xmm2,%xmm1
|
||||||
|
movdqa %xmm1,%xmm8
|
||||||
|
psrld $0x7,%xmm1
|
||||||
|
pslld $0x19,%xmm8
|
||||||
|
por %xmm8,%xmm1
|
||||||
|
pshufd $0x39,%xmm0,%xmm0
|
||||||
|
pshufd $0x4e,%xmm3,%xmm3
|
||||||
|
pshufd $0x93,%xmm2,%xmm2
|
||||||
|
addq $0x10,%rcx
|
||||||
|
cmpq %r8,%rcx
|
||||||
|
jnz .Lroundloop
|
||||||
|
pxor %xmm2,%xmm0
|
||||||
|
pxor %xmm3,%xmm1
|
||||||
|
pxor %xmm10,%xmm0
|
||||||
|
pxor %xmm11,%xmm1
|
||||||
|
addq $0x40,%rsi
|
||||||
|
decq %rdx
|
||||||
|
jnz .Lbeginofloop
|
||||||
|
movdqu %xmm0,(%rdi)
|
||||||
|
movdqu %xmm1,0x10(%rdi)
|
||||||
|
movdqu %xmm14,0x20(%rdi)
|
||||||
|
.Lendofloop:
|
||||||
|
ret
|
||||||
|
ENDPROC(blake2s_compress_ssse3)
|
||||||
|
#endif /* CONFIG_AS_SSSE3 */
|
||||||
|
|
||||||
|
#ifdef CONFIG_AS_AVX512
|
||||||
|
ENTRY(blake2s_compress_avx512)
|
||||||
|
vmovdqu (%rdi),%xmm0
|
||||||
|
vmovdqu 0x10(%rdi),%xmm1
|
||||||
|
vmovdqu 0x20(%rdi),%xmm4
|
||||||
|
vmovq %rcx,%xmm5
|
||||||
|
vmovdqa IV(%rip),%xmm14
|
||||||
|
vmovdqa IV+16(%rip),%xmm15
|
||||||
|
jmp .Lblake2s_compress_avx512_mainloop
|
||||||
|
.align 32
|
||||||
|
.Lblake2s_compress_avx512_mainloop:
|
||||||
|
vmovdqa %xmm0,%xmm10
|
||||||
|
vmovdqa %xmm1,%xmm11
|
||||||
|
vpaddq %xmm5,%xmm4,%xmm4
|
||||||
|
vmovdqa %xmm14,%xmm2
|
||||||
|
vpxor %xmm15,%xmm4,%xmm3
|
||||||
|
vmovdqu (%rsi),%ymm6
|
||||||
|
vmovdqu 0x20(%rsi),%ymm7
|
||||||
|
addq $0x40,%rsi
|
||||||
|
leaq SIGMA2(%rip),%rax
|
||||||
|
movb $0xa,%cl
|
||||||
|
.Lblake2s_compress_avx512_roundloop:
|
||||||
|
addq $0x40,%rax
|
||||||
|
vmovdqa -0x40(%rax),%ymm8
|
||||||
|
vmovdqa -0x20(%rax),%ymm9
|
||||||
|
vpermi2d %ymm7,%ymm6,%ymm8
|
||||||
|
vpermi2d %ymm7,%ymm6,%ymm9
|
||||||
|
vmovdqa %ymm8,%ymm6
|
||||||
|
vmovdqa %ymm9,%ymm7
|
||||||
|
vpaddd %xmm8,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x10,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0xc,%xmm1,%xmm1
|
||||||
|
vextracti128 $0x1,%ymm8,%xmm8
|
||||||
|
vpaddd %xmm8,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x8,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0x7,%xmm1,%xmm1
|
||||||
|
vpshufd $0x93,%xmm0,%xmm0
|
||||||
|
vpshufd $0x4e,%xmm3,%xmm3
|
||||||
|
vpshufd $0x39,%xmm2,%xmm2
|
||||||
|
vpaddd %xmm9,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x10,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0xc,%xmm1,%xmm1
|
||||||
|
vextracti128 $0x1,%ymm9,%xmm9
|
||||||
|
vpaddd %xmm9,%xmm0,%xmm0
|
||||||
|
vpaddd %xmm1,%xmm0,%xmm0
|
||||||
|
vpxor %xmm0,%xmm3,%xmm3
|
||||||
|
vprord $0x8,%xmm3,%xmm3
|
||||||
|
vpaddd %xmm3,%xmm2,%xmm2
|
||||||
|
vpxor %xmm2,%xmm1,%xmm1
|
||||||
|
vprord $0x7,%xmm1,%xmm1
|
||||||
|
vpshufd $0x39,%xmm0,%xmm0
|
||||||
|
vpshufd $0x4e,%xmm3,%xmm3
|
||||||
|
vpshufd $0x93,%xmm2,%xmm2
|
||||||
|
decb %cl
|
||||||
|
jne .Lblake2s_compress_avx512_roundloop
|
||||||
|
vpxor %xmm10,%xmm0,%xmm0
|
||||||
|
vpxor %xmm11,%xmm1,%xmm1
|
||||||
|
vpxor %xmm2,%xmm0,%xmm0
|
||||||
|
vpxor %xmm3,%xmm1,%xmm1
|
||||||
|
decq %rdx
|
||||||
|
jne .Lblake2s_compress_avx512_mainloop
|
||||||
|
vmovdqu %xmm0,(%rdi)
|
||||||
|
vmovdqu %xmm1,0x10(%rdi)
|
||||||
|
vmovdqu %xmm4,0x20(%rdi)
|
||||||
|
vzeroupper
|
||||||
|
retq
|
||||||
|
ENDPROC(blake2s_compress_avx512)
|
||||||
|
#endif /* CONFIG_AS_AVX512 */
|
232
arch/x86/crypto/blake2s-glue.c
Normal file
232
arch/x86/crypto/blake2s-glue.c
Normal file
|
@ -0,0 +1,232 @@
|
||||||
|
// SPDX-License-Identifier: GPL-2.0 OR MIT
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/internal/blake2s.h>
|
||||||
|
#include <crypto/internal/hash.h>
|
||||||
|
|
||||||
|
#include <linux/types.h>
|
||||||
|
#include <linux/jump_label.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
|
||||||
|
#include <asm/cpufeature.h>
|
||||||
|
#include <asm/fpu/api.h>
|
||||||
|
#include <asm/processor.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
|
||||||
|
const u8 *block, const size_t nblocks,
|
||||||
|
const u32 inc);
|
||||||
|
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
|
||||||
|
const u8 *block, const size_t nblocks,
|
||||||
|
const u32 inc);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512);
|
||||||
|
|
||||||
|
void blake2s_compress_arch(struct blake2s_state *state,
|
||||||
|
const u8 *block, size_t nblocks,
|
||||||
|
const u32 inc)
|
||||||
|
{
|
||||||
|
/* SIMD disables preemption, so relax after processing each page. */
|
||||||
|
BUILD_BUG_ON(SZ_4K / BLAKE2S_BLOCK_SIZE < 8);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&blake2s_use_ssse3) || !may_use_simd()) {
|
||||||
|
blake2s_compress_generic(state, block, nblocks, inc);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
const size_t blocks = min_t(size_t, nblocks,
|
||||||
|
SZ_4K / BLAKE2S_BLOCK_SIZE);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
static_branch_likely(&blake2s_use_avx512))
|
||||||
|
blake2s_compress_avx512(state, block, blocks, inc);
|
||||||
|
else
|
||||||
|
blake2s_compress_ssse3(state, block, blocks, inc);
|
||||||
|
kernel_fpu_end();
|
||||||
|
|
||||||
|
nblocks -= blocks;
|
||||||
|
block += blocks * BLAKE2S_BLOCK_SIZE;
|
||||||
|
} while (nblocks);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(blake2s_compress_arch);
|
||||||
|
|
||||||
|
static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
|
||||||
|
unsigned int keylen)
|
||||||
|
{
|
||||||
|
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
|
||||||
|
|
||||||
|
if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
|
||||||
|
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpy(tctx->key, key, keylen);
|
||||||
|
tctx->keylen = keylen;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_init(struct shash_desc *desc)
|
||||||
|
{
|
||||||
|
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
const int outlen = crypto_shash_digestsize(desc->tfm);
|
||||||
|
|
||||||
|
if (tctx->keylen)
|
||||||
|
blake2s_init_key(state, outlen, tctx->key, tctx->keylen);
|
||||||
|
else
|
||||||
|
blake2s_init(state, outlen);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in,
|
||||||
|
unsigned int inlen)
|
||||||
|
{
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen;
|
||||||
|
|
||||||
|
if (unlikely(!inlen))
|
||||||
|
return 0;
|
||||||
|
if (inlen > fill) {
|
||||||
|
memcpy(state->buf + state->buflen, in, fill);
|
||||||
|
blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE);
|
||||||
|
state->buflen = 0;
|
||||||
|
in += fill;
|
||||||
|
inlen -= fill;
|
||||||
|
}
|
||||||
|
if (inlen > BLAKE2S_BLOCK_SIZE) {
|
||||||
|
const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE);
|
||||||
|
/* Hash one less (full) block than strictly possible */
|
||||||
|
blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE);
|
||||||
|
in += BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||||
|
inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1);
|
||||||
|
}
|
||||||
|
memcpy(state->buf + state->buflen, in, inlen);
|
||||||
|
state->buflen += inlen;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int crypto_blake2s_final(struct shash_desc *desc, u8 *out)
|
||||||
|
{
|
||||||
|
struct blake2s_state *state = shash_desc_ctx(desc);
|
||||||
|
|
||||||
|
blake2s_set_lastblock(state);
|
||||||
|
memset(state->buf + state->buflen, 0,
|
||||||
|
BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */
|
||||||
|
blake2s_compress_arch(state, state->buf, 1, state->buflen);
|
||||||
|
cpu_to_le32_array(state->h, ARRAY_SIZE(state->h));
|
||||||
|
memcpy(out, state->h, state->outlen);
|
||||||
|
memzero_explicit(state, sizeof(*state));
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct shash_alg blake2s_algs[] = {{
|
||||||
|
.base.cra_name = "blake2s-128",
|
||||||
|
.base.cra_driver_name = "blake2s-128-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_128_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-160",
|
||||||
|
.base.cra_driver_name = "blake2s-160-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_160_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-224",
|
||||||
|
.base.cra_driver_name = "blake2s-224-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_224_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}, {
|
||||||
|
.base.cra_name = "blake2s-256",
|
||||||
|
.base.cra_driver_name = "blake2s-256-x86",
|
||||||
|
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
|
||||||
|
.base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx),
|
||||||
|
.base.cra_priority = 200,
|
||||||
|
.base.cra_blocksize = BLAKE2S_BLOCK_SIZE,
|
||||||
|
.base.cra_module = THIS_MODULE,
|
||||||
|
|
||||||
|
.digestsize = BLAKE2S_256_HASH_SIZE,
|
||||||
|
.setkey = crypto_blake2s_setkey,
|
||||||
|
.init = crypto_blake2s_init,
|
||||||
|
.update = crypto_blake2s_update,
|
||||||
|
.final = crypto_blake2s_final,
|
||||||
|
.descsize = sizeof(struct blake2s_state),
|
||||||
|
}};
|
||||||
|
|
||||||
|
static int __init blake2s_mod_init(void)
|
||||||
|
{
|
||||||
|
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
static_branch_enable(&blake2s_use_ssse3);
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX2) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX512F) &&
|
||||||
|
boot_cpu_has(X86_FEATURE_AVX512VL) &&
|
||||||
|
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
|
||||||
|
XFEATURE_MASK_AVX512, NULL))
|
||||||
|
static_branch_enable(&blake2s_use_avx512);
|
||||||
|
|
||||||
|
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
|
||||||
|
crypto_register_shashes(blake2s_algs,
|
||||||
|
ARRAY_SIZE(blake2s_algs)) : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void __exit blake2s_mod_exit(void)
|
||||||
|
{
|
||||||
|
if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && boot_cpu_has(X86_FEATURE_SSSE3))
|
||||||
|
crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs));
|
||||||
|
}
|
||||||
|
|
||||||
|
module_init(blake2s_mod_init);
|
||||||
|
module_exit(blake2s_mod_exit);
|
||||||
|
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-128");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-128-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-160");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-160-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-224");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-224-x86");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-256");
|
||||||
|
MODULE_ALIAS_CRYPTO("blake2s-256-x86");
|
||||||
|
MODULE_LICENSE("GPL v2");
|
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
1025
arch/x86/crypto/chacha-avx2-x86_64.S
Normal file
File diff suppressed because it is too large
Load diff
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
836
arch/x86/crypto/chacha-avx512vl-x86_64.S
Normal file
|
@ -0,0 +1,836 @@
|
||||||
|
/* SPDX-License-Identifier: GPL-2.0+ */
|
||||||
|
/*
|
||||||
|
* ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
|
||||||
|
*
|
||||||
|
* Copyright (C) 2018 Martin Willi
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <linux/linkage.h>
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR2BL: .octa 0x00000000000000000000000000000000
|
||||||
|
.octa 0x00000000000000000000000000000001
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR4BL: .octa 0x00000000000000000000000000000002
|
||||||
|
.octa 0x00000000000000000000000000000003
|
||||||
|
|
||||||
|
.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
|
||||||
|
.align 32
|
||||||
|
CTR8BL: .octa 0x00000003000000020000000100000000
|
||||||
|
.octa 0x00000007000000060000000500000004
|
||||||
|
|
||||||
|
.text
|
||||||
|
|
||||||
|
ENTRY(chacha_2block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 2 data blocks output, o
|
||||||
|
# %rdx: up to 2 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts two ChaCha blocks by loading the state
|
||||||
|
# matrix twice across four AVX registers. It performs matrix operations
|
||||||
|
# on four words in each matrix in parallel, but requires shuffling to
|
||||||
|
# rearrange the words after each round.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..3[0-2] = s0..3
|
||||||
|
vbroadcasti128 0x00(%rdi),%ymm0
|
||||||
|
vbroadcasti128 0x10(%rdi),%ymm1
|
||||||
|
vbroadcasti128 0x20(%rdi),%ymm2
|
||||||
|
vbroadcasti128 0x30(%rdi),%ymm3
|
||||||
|
|
||||||
|
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm8
|
||||||
|
vmovdqa %ymm1,%ymm9
|
||||||
|
vmovdqa %ymm2,%ymm10
|
||||||
|
vmovdqa %ymm3,%ymm11
|
||||||
|
|
||||||
|
.Ldoubleround:
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm1,%ymm1
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm1,%ymm1
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm3,%ymm3
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0)
|
||||||
|
vpaddd %ymm8,%ymm0,%ymm7
|
||||||
|
cmp $0x10,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x00(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x00(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm0
|
||||||
|
# o1 = i1 ^ (x1 + s1)
|
||||||
|
vpaddd %ymm9,%ymm1,%ymm7
|
||||||
|
cmp $0x20,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x10(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x10(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm1
|
||||||
|
# o2 = i2 ^ (x2 + s2)
|
||||||
|
vpaddd %ymm10,%ymm2,%ymm7
|
||||||
|
cmp $0x30,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x20(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x20(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm2
|
||||||
|
# o3 = i3 ^ (x3 + s3)
|
||||||
|
vpaddd %ymm11,%ymm3,%ymm7
|
||||||
|
cmp $0x40,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x30(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x30(%rsi)
|
||||||
|
vextracti128 $1,%ymm7,%xmm3
|
||||||
|
|
||||||
|
# xor and write second block
|
||||||
|
vmovdqa %xmm0,%xmm7
|
||||||
|
cmp $0x50,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x40(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x40(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm1,%xmm7
|
||||||
|
cmp $0x60,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x50(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x50(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm2,%xmm7
|
||||||
|
cmp $0x70,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x60(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x60(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm3,%xmm7
|
||||||
|
cmp $0x80,%rcx
|
||||||
|
jl .Lxorpart2
|
||||||
|
vpxord 0x70(%rdx),%xmm7,%xmm6
|
||||||
|
vmovdqu %xmm6,0x70(%rsi)
|
||||||
|
|
||||||
|
.Ldone2:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart2:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0xf,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0xf,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||||
|
vpxord %xmm7,%xmm1,%xmm1
|
||||||
|
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone2
|
||||||
|
|
||||||
|
ENDPROC(chacha_2block_xor_avx512vl)
|
||||||
|
|
||||||
|
ENTRY(chacha_4block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 4 data blocks output, o
|
||||||
|
# %rdx: up to 4 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts four ChaCha blocks by loading the state
|
||||||
|
# matrix four times across eight AVX registers. It performs matrix
|
||||||
|
# operations on four words in two matrices in parallel, sequentially
|
||||||
|
# to the operations on the four words of the other two matrices. The
|
||||||
|
# required word shuffling has a rather high latency, we can do the
|
||||||
|
# arithmetic on two matrix-pairs without much slowdown.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..3[0-4] = s0..3
|
||||||
|
vbroadcasti128 0x00(%rdi),%ymm0
|
||||||
|
vbroadcasti128 0x10(%rdi),%ymm1
|
||||||
|
vbroadcasti128 0x20(%rdi),%ymm2
|
||||||
|
vbroadcasti128 0x30(%rdi),%ymm3
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm4
|
||||||
|
vmovdqa %ymm1,%ymm5
|
||||||
|
vmovdqa %ymm2,%ymm6
|
||||||
|
vmovdqa %ymm3,%ymm7
|
||||||
|
|
||||||
|
vpaddd CTR2BL(%rip),%ymm3,%ymm3
|
||||||
|
vpaddd CTR4BL(%rip),%ymm7,%ymm7
|
||||||
|
|
||||||
|
vmovdqa %ymm0,%ymm11
|
||||||
|
vmovdqa %ymm1,%ymm12
|
||||||
|
vmovdqa %ymm2,%ymm13
|
||||||
|
vmovdqa %ymm3,%ymm14
|
||||||
|
vmovdqa %ymm7,%ymm15
|
||||||
|
|
||||||
|
.Ldoubleround4:
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $16,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $8,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm1,%ymm1
|
||||||
|
vpshufd $0x39,%ymm5,%ymm5
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
vpshufd $0x4e,%ymm6,%ymm6
|
||||||
|
# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm3,%ymm3
|
||||||
|
vpshufd $0x93,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $16,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $16,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $12,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
|
||||||
|
vpaddd %ymm1,%ymm0,%ymm0
|
||||||
|
vpxord %ymm0,%ymm3,%ymm3
|
||||||
|
vprold $8,%ymm3,%ymm3
|
||||||
|
|
||||||
|
vpaddd %ymm5,%ymm4,%ymm4
|
||||||
|
vpxord %ymm4,%ymm7,%ymm7
|
||||||
|
vprold $8,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
|
||||||
|
vpaddd %ymm3,%ymm2,%ymm2
|
||||||
|
vpxord %ymm2,%ymm1,%ymm1
|
||||||
|
vprold $7,%ymm1,%ymm1
|
||||||
|
|
||||||
|
vpaddd %ymm7,%ymm6,%ymm6
|
||||||
|
vpxord %ymm6,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
|
||||||
|
# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
|
||||||
|
vpshufd $0x93,%ymm1,%ymm1
|
||||||
|
vpshufd $0x93,%ymm5,%ymm5
|
||||||
|
# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
|
||||||
|
vpshufd $0x4e,%ymm2,%ymm2
|
||||||
|
vpshufd $0x4e,%ymm6,%ymm6
|
||||||
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
|
vpshufd $0x39,%ymm3,%ymm3
|
||||||
|
vpshufd $0x39,%ymm7,%ymm7
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround4
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0), first block
|
||||||
|
vpaddd %ymm11,%ymm0,%ymm10
|
||||||
|
cmp $0x10,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x00(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x00(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm0
|
||||||
|
# o1 = i1 ^ (x1 + s1), first block
|
||||||
|
vpaddd %ymm12,%ymm1,%ymm10
|
||||||
|
cmp $0x20,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x10(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x10(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm1
|
||||||
|
# o2 = i2 ^ (x2 + s2), first block
|
||||||
|
vpaddd %ymm13,%ymm2,%ymm10
|
||||||
|
cmp $0x30,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x20(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x20(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm2
|
||||||
|
# o3 = i3 ^ (x3 + s3), first block
|
||||||
|
vpaddd %ymm14,%ymm3,%ymm10
|
||||||
|
cmp $0x40,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x30(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x30(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm3
|
||||||
|
|
||||||
|
# xor and write second block
|
||||||
|
vmovdqa %xmm0,%xmm10
|
||||||
|
cmp $0x50,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x40(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x40(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm1,%xmm10
|
||||||
|
cmp $0x60,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x50(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x50(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm2,%xmm10
|
||||||
|
cmp $0x70,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x60(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x60(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm3,%xmm10
|
||||||
|
cmp $0x80,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x70(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x70(%rsi)
|
||||||
|
|
||||||
|
# o0 = i0 ^ (x0 + s0), third block
|
||||||
|
vpaddd %ymm11,%ymm4,%ymm10
|
||||||
|
cmp $0x90,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x80(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x80(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm4
|
||||||
|
# o1 = i1 ^ (x1 + s1), third block
|
||||||
|
vpaddd %ymm12,%ymm5,%ymm10
|
||||||
|
cmp $0xa0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0x90(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0x90(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm5
|
||||||
|
# o2 = i2 ^ (x2 + s2), third block
|
||||||
|
vpaddd %ymm13,%ymm6,%ymm10
|
||||||
|
cmp $0xb0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xa0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xa0(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm6
|
||||||
|
# o3 = i3 ^ (x3 + s3), third block
|
||||||
|
vpaddd %ymm15,%ymm7,%ymm10
|
||||||
|
cmp $0xc0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xb0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xb0(%rsi)
|
||||||
|
vextracti128 $1,%ymm10,%xmm7
|
||||||
|
|
||||||
|
# xor and write fourth block
|
||||||
|
vmovdqa %xmm4,%xmm10
|
||||||
|
cmp $0xd0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xc0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xc0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm5,%xmm10
|
||||||
|
cmp $0xe0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xd0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xd0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm6,%xmm10
|
||||||
|
cmp $0xf0,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xe0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xe0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa %xmm7,%xmm10
|
||||||
|
cmp $0x100,%rcx
|
||||||
|
jl .Lxorpart4
|
||||||
|
vpxord 0xf0(%rdx),%xmm10,%xmm9
|
||||||
|
vmovdqu %xmm9,0xf0(%rsi)
|
||||||
|
|
||||||
|
.Ldone4:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart4:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0xf,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0xf,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
|
||||||
|
vpxord %xmm10,%xmm1,%xmm1
|
||||||
|
vmovdqu8 %xmm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone4
|
||||||
|
|
||||||
|
ENDPROC(chacha_4block_xor_avx512vl)
|
||||||
|
|
||||||
|
ENTRY(chacha_8block_xor_avx512vl)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 8 data blocks output, o
|
||||||
|
# %rdx: up to 8 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts eight consecutive ChaCha blocks by loading
|
||||||
|
# the state matrix in AVX registers eight times. Compared to AVX2, this
|
||||||
|
# mostly benefits from the new rotate instructions in VL and the
|
||||||
|
# additional registers.
|
||||||
|
|
||||||
|
vzeroupper
|
||||||
|
|
||||||
|
# x0..15[0-7] = s[0..15]
|
||||||
|
vpbroadcastd 0x00(%rdi),%ymm0
|
||||||
|
vpbroadcastd 0x04(%rdi),%ymm1
|
||||||
|
vpbroadcastd 0x08(%rdi),%ymm2
|
||||||
|
vpbroadcastd 0x0c(%rdi),%ymm3
|
||||||
|
vpbroadcastd 0x10(%rdi),%ymm4
|
||||||
|
vpbroadcastd 0x14(%rdi),%ymm5
|
||||||
|
vpbroadcastd 0x18(%rdi),%ymm6
|
||||||
|
vpbroadcastd 0x1c(%rdi),%ymm7
|
||||||
|
vpbroadcastd 0x20(%rdi),%ymm8
|
||||||
|
vpbroadcastd 0x24(%rdi),%ymm9
|
||||||
|
vpbroadcastd 0x28(%rdi),%ymm10
|
||||||
|
vpbroadcastd 0x2c(%rdi),%ymm11
|
||||||
|
vpbroadcastd 0x30(%rdi),%ymm12
|
||||||
|
vpbroadcastd 0x34(%rdi),%ymm13
|
||||||
|
vpbroadcastd 0x38(%rdi),%ymm14
|
||||||
|
vpbroadcastd 0x3c(%rdi),%ymm15
|
||||||
|
|
||||||
|
# x12 += counter values 0-3
|
||||||
|
vpaddd CTR8BL(%rip),%ymm12,%ymm12
|
||||||
|
|
||||||
|
vmovdqa64 %ymm0,%ymm16
|
||||||
|
vmovdqa64 %ymm1,%ymm17
|
||||||
|
vmovdqa64 %ymm2,%ymm18
|
||||||
|
vmovdqa64 %ymm3,%ymm19
|
||||||
|
vmovdqa64 %ymm4,%ymm20
|
||||||
|
vmovdqa64 %ymm5,%ymm21
|
||||||
|
vmovdqa64 %ymm6,%ymm22
|
||||||
|
vmovdqa64 %ymm7,%ymm23
|
||||||
|
vmovdqa64 %ymm8,%ymm24
|
||||||
|
vmovdqa64 %ymm9,%ymm25
|
||||||
|
vmovdqa64 %ymm10,%ymm26
|
||||||
|
vmovdqa64 %ymm11,%ymm27
|
||||||
|
vmovdqa64 %ymm12,%ymm28
|
||||||
|
vmovdqa64 %ymm13,%ymm29
|
||||||
|
vmovdqa64 %ymm14,%ymm30
|
||||||
|
vmovdqa64 %ymm15,%ymm31
|
||||||
|
|
||||||
|
.Ldoubleround8:
|
||||||
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
|
vpaddd %ymm0,%ymm4,%ymm0
|
||||||
|
vpxord %ymm0,%ymm12,%ymm12
|
||||||
|
vprold $16,%ymm12,%ymm12
|
||||||
|
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
||||||
|
vpaddd %ymm1,%ymm5,%ymm1
|
||||||
|
vpxord %ymm1,%ymm13,%ymm13
|
||||||
|
vprold $16,%ymm13,%ymm13
|
||||||
|
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
||||||
|
vpaddd %ymm2,%ymm6,%ymm2
|
||||||
|
vpxord %ymm2,%ymm14,%ymm14
|
||||||
|
vprold $16,%ymm14,%ymm14
|
||||||
|
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
||||||
|
vpaddd %ymm3,%ymm7,%ymm3
|
||||||
|
vpxord %ymm3,%ymm15,%ymm15
|
||||||
|
vprold $16,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
||||||
|
vpaddd %ymm12,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm4,%ymm4
|
||||||
|
vprold $12,%ymm4,%ymm4
|
||||||
|
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
||||||
|
vpaddd %ymm13,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
||||||
|
vpaddd %ymm14,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm6,%ymm6
|
||||||
|
vprold $12,%ymm6,%ymm6
|
||||||
|
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
||||||
|
vpaddd %ymm15,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm7,%ymm7
|
||||||
|
vprold $12,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
||||||
|
vpaddd %ymm0,%ymm4,%ymm0
|
||||||
|
vpxord %ymm0,%ymm12,%ymm12
|
||||||
|
vprold $8,%ymm12,%ymm12
|
||||||
|
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
||||||
|
vpaddd %ymm1,%ymm5,%ymm1
|
||||||
|
vpxord %ymm1,%ymm13,%ymm13
|
||||||
|
vprold $8,%ymm13,%ymm13
|
||||||
|
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
||||||
|
vpaddd %ymm2,%ymm6,%ymm2
|
||||||
|
vpxord %ymm2,%ymm14,%ymm14
|
||||||
|
vprold $8,%ymm14,%ymm14
|
||||||
|
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
||||||
|
vpaddd %ymm3,%ymm7,%ymm3
|
||||||
|
vpxord %ymm3,%ymm15,%ymm15
|
||||||
|
vprold $8,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
||||||
|
vpaddd %ymm12,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm4,%ymm4
|
||||||
|
vprold $7,%ymm4,%ymm4
|
||||||
|
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
||||||
|
vpaddd %ymm13,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
||||||
|
vpaddd %ymm14,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm6,%ymm6
|
||||||
|
vprold $7,%ymm6,%ymm6
|
||||||
|
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
||||||
|
vpaddd %ymm15,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm7,%ymm7
|
||||||
|
vprold $7,%ymm7,%ymm7
|
||||||
|
|
||||||
|
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
||||||
|
vpaddd %ymm0,%ymm5,%ymm0
|
||||||
|
vpxord %ymm0,%ymm15,%ymm15
|
||||||
|
vprold $16,%ymm15,%ymm15
|
||||||
|
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
|
||||||
|
vpaddd %ymm1,%ymm6,%ymm1
|
||||||
|
vpxord %ymm1,%ymm12,%ymm12
|
||||||
|
vprold $16,%ymm12,%ymm12
|
||||||
|
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
||||||
|
vpaddd %ymm2,%ymm7,%ymm2
|
||||||
|
vpxord %ymm2,%ymm13,%ymm13
|
||||||
|
vprold $16,%ymm13,%ymm13
|
||||||
|
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
||||||
|
vpaddd %ymm3,%ymm4,%ymm3
|
||||||
|
vpxord %ymm3,%ymm14,%ymm14
|
||||||
|
vprold $16,%ymm14,%ymm14
|
||||||
|
|
||||||
|
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
||||||
|
vpaddd %ymm15,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm5,%ymm5
|
||||||
|
vprold $12,%ymm5,%ymm5
|
||||||
|
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
||||||
|
vpaddd %ymm12,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm6,%ymm6
|
||||||
|
vprold $12,%ymm6,%ymm6
|
||||||
|
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
||||||
|
vpaddd %ymm13,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm7,%ymm7
|
||||||
|
vprold $12,%ymm7,%ymm7
|
||||||
|
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
||||||
|
vpaddd %ymm14,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm4,%ymm4
|
||||||
|
vprold $12,%ymm4,%ymm4
|
||||||
|
|
||||||
|
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
||||||
|
vpaddd %ymm0,%ymm5,%ymm0
|
||||||
|
vpxord %ymm0,%ymm15,%ymm15
|
||||||
|
vprold $8,%ymm15,%ymm15
|
||||||
|
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
||||||
|
vpaddd %ymm1,%ymm6,%ymm1
|
||||||
|
vpxord %ymm1,%ymm12,%ymm12
|
||||||
|
vprold $8,%ymm12,%ymm12
|
||||||
|
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
||||||
|
vpaddd %ymm2,%ymm7,%ymm2
|
||||||
|
vpxord %ymm2,%ymm13,%ymm13
|
||||||
|
vprold $8,%ymm13,%ymm13
|
||||||
|
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
||||||
|
vpaddd %ymm3,%ymm4,%ymm3
|
||||||
|
vpxord %ymm3,%ymm14,%ymm14
|
||||||
|
vprold $8,%ymm14,%ymm14
|
||||||
|
|
||||||
|
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
||||||
|
vpaddd %ymm15,%ymm10,%ymm10
|
||||||
|
vpxord %ymm10,%ymm5,%ymm5
|
||||||
|
vprold $7,%ymm5,%ymm5
|
||||||
|
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
||||||
|
vpaddd %ymm12,%ymm11,%ymm11
|
||||||
|
vpxord %ymm11,%ymm6,%ymm6
|
||||||
|
vprold $7,%ymm6,%ymm6
|
||||||
|
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
||||||
|
vpaddd %ymm13,%ymm8,%ymm8
|
||||||
|
vpxord %ymm8,%ymm7,%ymm7
|
||||||
|
vprold $7,%ymm7,%ymm7
|
||||||
|
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
||||||
|
vpaddd %ymm14,%ymm9,%ymm9
|
||||||
|
vpxord %ymm9,%ymm4,%ymm4
|
||||||
|
vprold $7,%ymm4,%ymm4
|
||||||
|
|
||||||
|
sub $2,%r8d
|
||||||
|
jnz .Ldoubleround8
|
||||||
|
|
||||||
|
# x0..15[0-3] += s[0..15]
|
||||||
|
vpaddd %ymm16,%ymm0,%ymm0
|
||||||
|
vpaddd %ymm17,%ymm1,%ymm1
|
||||||
|
vpaddd %ymm18,%ymm2,%ymm2
|
||||||
|
vpaddd %ymm19,%ymm3,%ymm3
|
||||||
|
vpaddd %ymm20,%ymm4,%ymm4
|
||||||
|
vpaddd %ymm21,%ymm5,%ymm5
|
||||||
|
vpaddd %ymm22,%ymm6,%ymm6
|
||||||
|
vpaddd %ymm23,%ymm7,%ymm7
|
||||||
|
vpaddd %ymm24,%ymm8,%ymm8
|
||||||
|
vpaddd %ymm25,%ymm9,%ymm9
|
||||||
|
vpaddd %ymm26,%ymm10,%ymm10
|
||||||
|
vpaddd %ymm27,%ymm11,%ymm11
|
||||||
|
vpaddd %ymm28,%ymm12,%ymm12
|
||||||
|
vpaddd %ymm29,%ymm13,%ymm13
|
||||||
|
vpaddd %ymm30,%ymm14,%ymm14
|
||||||
|
vpaddd %ymm31,%ymm15,%ymm15
|
||||||
|
|
||||||
|
# interleave 32-bit words in state n, n+1
|
||||||
|
vpunpckldq %ymm1,%ymm0,%ymm16
|
||||||
|
vpunpckhdq %ymm1,%ymm0,%ymm17
|
||||||
|
vpunpckldq %ymm3,%ymm2,%ymm18
|
||||||
|
vpunpckhdq %ymm3,%ymm2,%ymm19
|
||||||
|
vpunpckldq %ymm5,%ymm4,%ymm20
|
||||||
|
vpunpckhdq %ymm5,%ymm4,%ymm21
|
||||||
|
vpunpckldq %ymm7,%ymm6,%ymm22
|
||||||
|
vpunpckhdq %ymm7,%ymm6,%ymm23
|
||||||
|
vpunpckldq %ymm9,%ymm8,%ymm24
|
||||||
|
vpunpckhdq %ymm9,%ymm8,%ymm25
|
||||||
|
vpunpckldq %ymm11,%ymm10,%ymm26
|
||||||
|
vpunpckhdq %ymm11,%ymm10,%ymm27
|
||||||
|
vpunpckldq %ymm13,%ymm12,%ymm28
|
||||||
|
vpunpckhdq %ymm13,%ymm12,%ymm29
|
||||||
|
vpunpckldq %ymm15,%ymm14,%ymm30
|
||||||
|
vpunpckhdq %ymm15,%ymm14,%ymm31
|
||||||
|
|
||||||
|
# interleave 64-bit words in state n, n+2
|
||||||
|
vpunpcklqdq %ymm18,%ymm16,%ymm0
|
||||||
|
vpunpcklqdq %ymm19,%ymm17,%ymm1
|
||||||
|
vpunpckhqdq %ymm18,%ymm16,%ymm2
|
||||||
|
vpunpckhqdq %ymm19,%ymm17,%ymm3
|
||||||
|
vpunpcklqdq %ymm22,%ymm20,%ymm4
|
||||||
|
vpunpcklqdq %ymm23,%ymm21,%ymm5
|
||||||
|
vpunpckhqdq %ymm22,%ymm20,%ymm6
|
||||||
|
vpunpckhqdq %ymm23,%ymm21,%ymm7
|
||||||
|
vpunpcklqdq %ymm26,%ymm24,%ymm8
|
||||||
|
vpunpcklqdq %ymm27,%ymm25,%ymm9
|
||||||
|
vpunpckhqdq %ymm26,%ymm24,%ymm10
|
||||||
|
vpunpckhqdq %ymm27,%ymm25,%ymm11
|
||||||
|
vpunpcklqdq %ymm30,%ymm28,%ymm12
|
||||||
|
vpunpcklqdq %ymm31,%ymm29,%ymm13
|
||||||
|
vpunpckhqdq %ymm30,%ymm28,%ymm14
|
||||||
|
vpunpckhqdq %ymm31,%ymm29,%ymm15
|
||||||
|
|
||||||
|
# interleave 128-bit words in state n, n+4
|
||||||
|
# xor/write first four blocks
|
||||||
|
vmovdqa64 %ymm0,%ymm16
|
||||||
|
vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
|
||||||
|
cmp $0x0020,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0000(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0000(%rsi)
|
||||||
|
vmovdqa64 %ymm16,%ymm0
|
||||||
|
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
||||||
|
cmp $0x0040,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0020(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0020(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
|
||||||
|
cmp $0x0060,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0040(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0040(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
||||||
|
cmp $0x0080,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0060(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0060(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
|
||||||
|
cmp $0x00a0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0080(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0080(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
||||||
|
cmp $0x00c0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00a0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00a0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
|
||||||
|
cmp $0x00e0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00c0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00c0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
|
||||||
|
|
||||||
|
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
||||||
|
cmp $0x0100,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x00e0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x00e0(%rsi)
|
||||||
|
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
||||||
|
|
||||||
|
# xor remaining blocks, write to output
|
||||||
|
vmovdqa64 %ymm4,%ymm0
|
||||||
|
cmp $0x0120,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0100(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0100(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm12,%ymm0
|
||||||
|
cmp $0x0140,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0120(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0120(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm6,%ymm0
|
||||||
|
cmp $0x0160,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0140(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0140(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm14,%ymm0
|
||||||
|
cmp $0x0180,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0160(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0160(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm5,%ymm0
|
||||||
|
cmp $0x01a0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x0180(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x0180(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm13,%ymm0
|
||||||
|
cmp $0x01c0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01a0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01a0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm7,%ymm0
|
||||||
|
cmp $0x01e0,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01c0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01c0(%rsi)
|
||||||
|
|
||||||
|
vmovdqa64 %ymm15,%ymm0
|
||||||
|
cmp $0x0200,%rcx
|
||||||
|
jl .Lxorpart8
|
||||||
|
vpxord 0x01e0(%rdx),%ymm0,%ymm0
|
||||||
|
vmovdqu64 %ymm0,0x01e0(%rsi)
|
||||||
|
|
||||||
|
.Ldone8:
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
|
||||||
|
.Lxorpart8:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rcx,%rax
|
||||||
|
and $0x1f,%rcx
|
||||||
|
jz .Ldone8
|
||||||
|
mov %rax,%r9
|
||||||
|
and $~0x1f,%r9
|
||||||
|
|
||||||
|
mov $1,%rax
|
||||||
|
shld %cl,%rax,%rax
|
||||||
|
sub $1,%rax
|
||||||
|
kmovq %rax,%k1
|
||||||
|
|
||||||
|
vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
|
||||||
|
vpxord %ymm0,%ymm1,%ymm1
|
||||||
|
vmovdqu8 %ymm1,(%rsi,%r9){%k1}
|
||||||
|
|
||||||
|
jmp .Ldone8
|
||||||
|
|
||||||
|
ENDPROC(chacha_8block_xor_avx512vl)
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
|
* ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
|
||||||
*
|
*
|
||||||
* Copyright (C) 2015 Martin Willi
|
* Copyright (C) 2015 Martin Willi
|
||||||
*
|
*
|
||||||
|
@ -10,6 +10,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
#include <linux/linkage.h>
|
||||||
|
#include <asm/frame.h>
|
||||||
|
|
||||||
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
.section .rodata.cst16.ROT8, "aM", @progbits, 16
|
||||||
.align 16
|
.align 16
|
||||||
|
@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
|
||||||
|
|
||||||
.text
|
.text
|
||||||
|
|
||||||
ENTRY(chacha20_block_xor_ssse3)
|
/*
|
||||||
# %rdi: Input state matrix, s
|
* chacha_permute - permute one block
|
||||||
# %rsi: 1 data block output, o
|
*
|
||||||
# %rdx: 1 data block input, i
|
* Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
|
||||||
|
* function performs matrix operations on four words in parallel, but requires
|
||||||
# This function encrypts one ChaCha20 block by loading the state matrix
|
* shuffling to rearrange the words after each round. 8/16-bit word rotation is
|
||||||
# in four SSE registers. It performs matrix operation on four words in
|
* done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
|
||||||
# parallel, but requireds shuffling to rearrange the words after each
|
* rotation uses traditional shift+OR.
|
||||||
# round. 8/16-bit word rotation is done with the slightly better
|
*
|
||||||
# performing SSSE3 byte shuffling, 7/12-bit word rotation uses
|
* The round count is given in %r8d.
|
||||||
# traditional shift+OR.
|
*
|
||||||
|
* Clobbers: %r8d, %xmm4-%xmm7
|
||||||
# x0..3 = s0..3
|
*/
|
||||||
movdqa 0x00(%rdi),%xmm0
|
chacha_permute:
|
||||||
movdqa 0x10(%rdi),%xmm1
|
|
||||||
movdqa 0x20(%rdi),%xmm2
|
|
||||||
movdqa 0x30(%rdi),%xmm3
|
|
||||||
movdqa %xmm0,%xmm8
|
|
||||||
movdqa %xmm1,%xmm9
|
|
||||||
movdqa %xmm2,%xmm10
|
|
||||||
movdqa %xmm3,%xmm11
|
|
||||||
|
|
||||||
movdqa ROT8(%rip),%xmm4
|
movdqa ROT8(%rip),%xmm4
|
||||||
movdqa ROT16(%rip),%xmm5
|
movdqa ROT16(%rip),%xmm5
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround:
|
.Ldoubleround:
|
||||||
|
|
||||||
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
|
||||||
paddd %xmm1,%xmm0
|
paddd %xmm1,%xmm0
|
||||||
pxor %xmm0,%xmm3
|
pxor %xmm0,%xmm3
|
||||||
|
@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
|
||||||
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
|
||||||
pshufd $0x39,%xmm3,%xmm3
|
pshufd $0x39,%xmm3,%xmm3
|
||||||
|
|
||||||
dec %ecx
|
sub $2,%r8d
|
||||||
jnz .Ldoubleround
|
jnz .Ldoubleround
|
||||||
|
|
||||||
|
ret
|
||||||
|
ENDPROC(chacha_permute)
|
||||||
|
|
||||||
|
ENTRY(chacha_block_xor_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 1 data block output, o
|
||||||
|
# %rdx: up to 1 data block input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
|
# x0..3 = s0..3
|
||||||
|
movdqu 0x00(%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqu 0x20(%rdi),%xmm2
|
||||||
|
movdqu 0x30(%rdi),%xmm3
|
||||||
|
movdqa %xmm0,%xmm8
|
||||||
|
movdqa %xmm1,%xmm9
|
||||||
|
movdqa %xmm2,%xmm10
|
||||||
|
movdqa %xmm3,%xmm11
|
||||||
|
|
||||||
|
mov %rcx,%rax
|
||||||
|
call chacha_permute
|
||||||
|
|
||||||
# o0 = i0 ^ (x0 + s0)
|
# o0 = i0 ^ (x0 + s0)
|
||||||
movdqu 0x00(%rdx),%xmm4
|
|
||||||
paddd %xmm8,%xmm0
|
paddd %xmm8,%xmm0
|
||||||
|
cmp $0x10,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x00(%rdx),%xmm4
|
||||||
pxor %xmm4,%xmm0
|
pxor %xmm4,%xmm0
|
||||||
movdqu %xmm0,0x00(%rsi)
|
movdqu %xmm0,0x00(%rsi)
|
||||||
# o1 = i1 ^ (x1 + s1)
|
# o1 = i1 ^ (x1 + s1)
|
||||||
movdqu 0x10(%rdx),%xmm5
|
|
||||||
paddd %xmm9,%xmm1
|
paddd %xmm9,%xmm1
|
||||||
pxor %xmm5,%xmm1
|
movdqa %xmm1,%xmm0
|
||||||
movdqu %xmm1,0x10(%rsi)
|
cmp $0x20,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x10(%rdx),%xmm0
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x10(%rsi)
|
||||||
# o2 = i2 ^ (x2 + s2)
|
# o2 = i2 ^ (x2 + s2)
|
||||||
movdqu 0x20(%rdx),%xmm6
|
|
||||||
paddd %xmm10,%xmm2
|
paddd %xmm10,%xmm2
|
||||||
pxor %xmm6,%xmm2
|
movdqa %xmm2,%xmm0
|
||||||
movdqu %xmm2,0x20(%rsi)
|
cmp $0x30,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x20(%rdx),%xmm0
|
||||||
|
pxor %xmm2,%xmm0
|
||||||
|
movdqu %xmm0,0x20(%rsi)
|
||||||
# o3 = i3 ^ (x3 + s3)
|
# o3 = i3 ^ (x3 + s3)
|
||||||
movdqu 0x30(%rdx),%xmm7
|
|
||||||
paddd %xmm11,%xmm3
|
paddd %xmm11,%xmm3
|
||||||
pxor %xmm7,%xmm3
|
movdqa %xmm3,%xmm0
|
||||||
movdqu %xmm3,0x30(%rsi)
|
cmp $0x40,%rax
|
||||||
|
jl .Lxorpart
|
||||||
|
movdqu 0x30(%rdx),%xmm0
|
||||||
|
pxor %xmm3,%xmm0
|
||||||
|
movdqu %xmm0,0x30(%rsi)
|
||||||
|
|
||||||
|
.Ldone:
|
||||||
|
FRAME_END
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_block_xor_ssse3)
|
|
||||||
|
|
||||||
ENTRY(chacha20_4block_xor_ssse3)
|
.Lxorpart:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rax,%r9
|
||||||
|
and $0x0f,%r9
|
||||||
|
jz .Ldone
|
||||||
|
and $~0x0f,%rax
|
||||||
|
|
||||||
|
mov %rsi,%r11
|
||||||
|
|
||||||
|
lea 8(%rsp),%r10
|
||||||
|
sub $0x10,%rsp
|
||||||
|
and $~31,%rsp
|
||||||
|
|
||||||
|
lea (%rdx,%rax),%rsi
|
||||||
|
mov %rsp,%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
pxor 0x00(%rsp),%xmm0
|
||||||
|
movdqa %xmm0,0x00(%rsp)
|
||||||
|
|
||||||
|
mov %rsp,%rsi
|
||||||
|
lea (%r11,%rax),%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
lea -8(%r10),%rsp
|
||||||
|
jmp .Ldone
|
||||||
|
|
||||||
|
ENDPROC(chacha_block_xor_ssse3)
|
||||||
|
|
||||||
|
ENTRY(hchacha_block_ssse3)
|
||||||
# %rdi: Input state matrix, s
|
# %rdi: Input state matrix, s
|
||||||
# %rsi: 4 data blocks output, o
|
# %rsi: output (8 32-bit words)
|
||||||
# %rdx: 4 data blocks input, i
|
# %edx: nrounds
|
||||||
|
FRAME_BEGIN
|
||||||
|
|
||||||
# This function encrypts four consecutive ChaCha20 blocks by loading the
|
movdqu 0x00(%rdi),%xmm0
|
||||||
|
movdqu 0x10(%rdi),%xmm1
|
||||||
|
movdqu 0x20(%rdi),%xmm2
|
||||||
|
movdqu 0x30(%rdi),%xmm3
|
||||||
|
|
||||||
|
mov %edx,%r8d
|
||||||
|
call chacha_permute
|
||||||
|
|
||||||
|
movdqu %xmm0,0x00(%rsi)
|
||||||
|
movdqu %xmm3,0x10(%rsi)
|
||||||
|
|
||||||
|
FRAME_END
|
||||||
|
ret
|
||||||
|
ENDPROC(hchacha_block_ssse3)
|
||||||
|
|
||||||
|
ENTRY(chacha_4block_xor_ssse3)
|
||||||
|
# %rdi: Input state matrix, s
|
||||||
|
# %rsi: up to 4 data blocks output, o
|
||||||
|
# %rdx: up to 4 data blocks input, i
|
||||||
|
# %rcx: input/output length in bytes
|
||||||
|
# %r8d: nrounds
|
||||||
|
|
||||||
|
# This function encrypts four consecutive ChaCha blocks by loading the
|
||||||
# the state matrix in SSE registers four times. As we need some scratch
|
# the state matrix in SSE registers four times. As we need some scratch
|
||||||
# registers, we save the first four registers on the stack. The
|
# registers, we save the first four registers on the stack. The
|
||||||
# algorithm performs each operation on the corresponding word of each
|
# algorithm performs each operation on the corresponding word of each
|
||||||
|
@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
lea 8(%rsp),%r10
|
lea 8(%rsp),%r10
|
||||||
sub $0x80,%rsp
|
sub $0x80,%rsp
|
||||||
and $~63,%rsp
|
and $~63,%rsp
|
||||||
|
mov %rcx,%rax
|
||||||
|
|
||||||
# x0..15[0-3] = s0..3[0..3]
|
# x0..15[0-3] = s0..3[0..3]
|
||||||
movq 0x00(%rdi),%xmm1
|
movq 0x00(%rdi),%xmm1
|
||||||
|
@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
# x12 += counter values 0-3
|
# x12 += counter values 0-3
|
||||||
paddd %xmm1,%xmm12
|
paddd %xmm1,%xmm12
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround4:
|
.Ldoubleround4:
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
||||||
movdqa 0x00(%rsp),%xmm0
|
movdqa 0x00(%rsp),%xmm0
|
||||||
|
@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
psrld $25,%xmm4
|
psrld $25,%xmm4
|
||||||
por %xmm0,%xmm4
|
por %xmm0,%xmm4
|
||||||
|
|
||||||
dec %ecx
|
sub $2,%r8d
|
||||||
jnz .Ldoubleround4
|
jnz .Ldoubleround4
|
||||||
|
|
||||||
# x0[0-3] += s0[0]
|
# x0[0-3] += s0[0]
|
||||||
|
@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
|
||||||
|
|
||||||
# xor with corresponding input, write to output
|
# xor with corresponding input, write to output
|
||||||
movdqa 0x00(%rsp),%xmm0
|
movdqa 0x00(%rsp),%xmm0
|
||||||
|
cmp $0x10,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0x00(%rdx),%xmm1
|
movdqu 0x00(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x00(%rsi)
|
movdqu %xmm0,0x00(%rsi)
|
||||||
movdqa 0x10(%rsp),%xmm0
|
|
||||||
movdqu 0x80(%rdx),%xmm1
|
movdqu %xmm4,%xmm0
|
||||||
|
cmp $0x20,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x10(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x80(%rsi)
|
movdqu %xmm0,0x10(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm8,%xmm0
|
||||||
|
cmp $0x30,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x20(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x20(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm12,%xmm0
|
||||||
|
cmp $0x40,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x30(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x30(%rsi)
|
||||||
|
|
||||||
movdqa 0x20(%rsp),%xmm0
|
movdqa 0x20(%rsp),%xmm0
|
||||||
|
cmp $0x50,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0x40(%rdx),%xmm1
|
movdqu 0x40(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0x40(%rsi)
|
movdqu %xmm0,0x40(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm6,%xmm0
|
||||||
|
cmp $0x60,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x50(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x50(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm10,%xmm0
|
||||||
|
cmp $0x70,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x60(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x60(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm14,%xmm0
|
||||||
|
cmp $0x80,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x70(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x70(%rsi)
|
||||||
|
|
||||||
|
movdqa 0x10(%rsp),%xmm0
|
||||||
|
cmp $0x90,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x80(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x80(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm5,%xmm0
|
||||||
|
cmp $0xa0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0x90(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0x90(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm9,%xmm0
|
||||||
|
cmp $0xb0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xa0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xa0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm13,%xmm0
|
||||||
|
cmp $0xc0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xb0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xb0(%rsi)
|
||||||
|
|
||||||
movdqa 0x30(%rsp),%xmm0
|
movdqa 0x30(%rsp),%xmm0
|
||||||
|
cmp $0xd0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
movdqu 0xc0(%rdx),%xmm1
|
movdqu 0xc0(%rdx),%xmm1
|
||||||
pxor %xmm1,%xmm0
|
pxor %xmm1,%xmm0
|
||||||
movdqu %xmm0,0xc0(%rsi)
|
movdqu %xmm0,0xc0(%rsi)
|
||||||
movdqu 0x10(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm4
|
|
||||||
movdqu %xmm4,0x10(%rsi)
|
|
||||||
movdqu 0x90(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm5
|
|
||||||
movdqu %xmm5,0x90(%rsi)
|
|
||||||
movdqu 0x50(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm6
|
|
||||||
movdqu %xmm6,0x50(%rsi)
|
|
||||||
movdqu 0xd0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm7
|
|
||||||
movdqu %xmm7,0xd0(%rsi)
|
|
||||||
movdqu 0x20(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm8
|
|
||||||
movdqu %xmm8,0x20(%rsi)
|
|
||||||
movdqu 0xa0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm9
|
|
||||||
movdqu %xmm9,0xa0(%rsi)
|
|
||||||
movdqu 0x60(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm10
|
|
||||||
movdqu %xmm10,0x60(%rsi)
|
|
||||||
movdqu 0xe0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm11
|
|
||||||
movdqu %xmm11,0xe0(%rsi)
|
|
||||||
movdqu 0x30(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm12
|
|
||||||
movdqu %xmm12,0x30(%rsi)
|
|
||||||
movdqu 0xb0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm13
|
|
||||||
movdqu %xmm13,0xb0(%rsi)
|
|
||||||
movdqu 0x70(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm14
|
|
||||||
movdqu %xmm14,0x70(%rsi)
|
|
||||||
movdqu 0xf0(%rdx),%xmm1
|
|
||||||
pxor %xmm1,%xmm15
|
|
||||||
movdqu %xmm15,0xf0(%rsi)
|
|
||||||
|
|
||||||
|
movdqu %xmm7,%xmm0
|
||||||
|
cmp $0xe0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xd0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xd0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm11,%xmm0
|
||||||
|
cmp $0xf0,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xe0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xe0(%rsi)
|
||||||
|
|
||||||
|
movdqu %xmm15,%xmm0
|
||||||
|
cmp $0x100,%rax
|
||||||
|
jl .Lxorpart4
|
||||||
|
movdqu 0xf0(%rdx),%xmm1
|
||||||
|
pxor %xmm1,%xmm0
|
||||||
|
movdqu %xmm0,0xf0(%rsi)
|
||||||
|
|
||||||
|
.Ldone4:
|
||||||
lea -8(%r10),%rsp
|
lea -8(%r10),%rsp
|
||||||
ret
|
ret
|
||||||
ENDPROC(chacha20_4block_xor_ssse3)
|
|
||||||
|
.Lxorpart4:
|
||||||
|
# xor remaining bytes from partial register into output
|
||||||
|
mov %rax,%r9
|
||||||
|
and $0x0f,%r9
|
||||||
|
jz .Ldone4
|
||||||
|
and $~0x0f,%rax
|
||||||
|
|
||||||
|
mov %rsi,%r11
|
||||||
|
|
||||||
|
lea (%rdx,%rax),%rsi
|
||||||
|
mov %rsp,%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
pxor 0x00(%rsp),%xmm0
|
||||||
|
movdqa %xmm0,0x00(%rsp)
|
||||||
|
|
||||||
|
mov %rsp,%rsi
|
||||||
|
lea (%r11,%rax),%rdi
|
||||||
|
mov %r9,%rcx
|
||||||
|
rep movsb
|
||||||
|
|
||||||
|
jmp .Ldone4
|
||||||
|
|
||||||
|
ENDPROC(chacha_4block_xor_ssse3)
|
|
@ -1,448 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <linux/linkage.h>
|
|
||||||
|
|
||||||
.section .rodata.cst32.ROT8, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
|
|
||||||
.octa 0x0e0d0c0f0a09080b0605040702010003
|
|
||||||
|
|
||||||
.section .rodata.cst32.ROT16, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
|
|
||||||
.octa 0x0d0c0f0e09080b0a0504070601000302
|
|
||||||
|
|
||||||
.section .rodata.cst32.CTRINC, "aM", @progbits, 32
|
|
||||||
.align 32
|
|
||||||
CTRINC: .octa 0x00000003000000020000000100000000
|
|
||||||
.octa 0x00000007000000060000000500000004
|
|
||||||
|
|
||||||
.text
|
|
||||||
|
|
||||||
ENTRY(chacha20_8block_xor_avx2)
|
|
||||||
# %rdi: Input state matrix, s
|
|
||||||
# %rsi: 8 data blocks output, o
|
|
||||||
# %rdx: 8 data blocks input, i
|
|
||||||
|
|
||||||
# This function encrypts eight consecutive ChaCha20 blocks by loading
|
|
||||||
# the state matrix in AVX registers eight times. As we need some
|
|
||||||
# scratch registers, we save the first four registers on the stack. The
|
|
||||||
# algorithm performs each operation on the corresponding word of each
|
|
||||||
# state matrix, hence requires no word shuffling. For final XORing step
|
|
||||||
# we transpose the matrix by interleaving 32-, 64- and then 128-bit
|
|
||||||
# words, which allows us to do XOR in AVX registers. 8/16-bit word
|
|
||||||
# rotation is done with the slightly better performing byte shuffling,
|
|
||||||
# 7/12-bit word rotation uses traditional shift+OR.
|
|
||||||
|
|
||||||
vzeroupper
|
|
||||||
# 4 * 32 byte stack, 32-byte aligned
|
|
||||||
lea 8(%rsp),%r10
|
|
||||||
and $~31, %rsp
|
|
||||||
sub $0x80, %rsp
|
|
||||||
|
|
||||||
# x0..15[0-7] = s[0..15]
|
|
||||||
vpbroadcastd 0x00(%rdi),%ymm0
|
|
||||||
vpbroadcastd 0x04(%rdi),%ymm1
|
|
||||||
vpbroadcastd 0x08(%rdi),%ymm2
|
|
||||||
vpbroadcastd 0x0c(%rdi),%ymm3
|
|
||||||
vpbroadcastd 0x10(%rdi),%ymm4
|
|
||||||
vpbroadcastd 0x14(%rdi),%ymm5
|
|
||||||
vpbroadcastd 0x18(%rdi),%ymm6
|
|
||||||
vpbroadcastd 0x1c(%rdi),%ymm7
|
|
||||||
vpbroadcastd 0x20(%rdi),%ymm8
|
|
||||||
vpbroadcastd 0x24(%rdi),%ymm9
|
|
||||||
vpbroadcastd 0x28(%rdi),%ymm10
|
|
||||||
vpbroadcastd 0x2c(%rdi),%ymm11
|
|
||||||
vpbroadcastd 0x30(%rdi),%ymm12
|
|
||||||
vpbroadcastd 0x34(%rdi),%ymm13
|
|
||||||
vpbroadcastd 0x38(%rdi),%ymm14
|
|
||||||
vpbroadcastd 0x3c(%rdi),%ymm15
|
|
||||||
# x0..3 on stack
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa %ymm3,0x60(%rsp)
|
|
||||||
|
|
||||||
vmovdqa CTRINC(%rip),%ymm1
|
|
||||||
vmovdqa ROT8(%rip),%ymm2
|
|
||||||
vmovdqa ROT16(%rip),%ymm3
|
|
||||||
|
|
||||||
# x12 += counter values 0-3
|
|
||||||
vpaddd %ymm1,%ymm12,%ymm12
|
|
||||||
|
|
||||||
mov $10,%ecx
|
|
||||||
|
|
||||||
.Ldoubleround8:
|
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
|
|
||||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm3,%ymm12,%ymm12
|
|
||||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
|
|
||||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm3,%ymm13,%ymm13
|
|
||||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
|
|
||||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm3,%ymm14,%ymm14
|
|
||||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
|
|
||||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm3,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
|
|
||||||
vpaddd %ymm12,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm4,%ymm4
|
|
||||||
vpslld $12,%ymm4,%ymm0
|
|
||||||
vpsrld $20,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
|
|
||||||
vpaddd %ymm13,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm5,%ymm5
|
|
||||||
vpslld $12,%ymm5,%ymm0
|
|
||||||
vpsrld $20,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
|
|
||||||
vpaddd %ymm14,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm6,%ymm6
|
|
||||||
vpslld $12,%ymm6,%ymm0
|
|
||||||
vpsrld $20,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
|
|
||||||
vpaddd %ymm15,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm7,%ymm7
|
|
||||||
vpslld $12,%ymm7,%ymm0
|
|
||||||
vpsrld $20,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
|
|
||||||
# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
|
|
||||||
vpaddd 0x00(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm2,%ymm12,%ymm12
|
|
||||||
# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
|
|
||||||
vpaddd 0x20(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm2,%ymm13,%ymm13
|
|
||||||
# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
|
|
||||||
vpaddd 0x40(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm2,%ymm14,%ymm14
|
|
||||||
# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
|
|
||||||
vpaddd 0x60(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm2,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
|
|
||||||
vpaddd %ymm12,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm4,%ymm4
|
|
||||||
vpslld $7,%ymm4,%ymm0
|
|
||||||
vpsrld $25,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
|
|
||||||
vpaddd %ymm13,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm5,%ymm5
|
|
||||||
vpslld $7,%ymm5,%ymm0
|
|
||||||
vpsrld $25,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
|
|
||||||
vpaddd %ymm14,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm6,%ymm6
|
|
||||||
vpslld $7,%ymm6,%ymm0
|
|
||||||
vpsrld $25,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
|
|
||||||
vpaddd %ymm15,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm7,%ymm7
|
|
||||||
vpslld $7,%ymm7,%ymm0
|
|
||||||
vpsrld $25,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
|
|
||||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
|
|
||||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm3,%ymm15,%ymm15
|
|
||||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
|
|
||||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm3,%ymm12,%ymm12
|
|
||||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
|
|
||||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm3,%ymm13,%ymm13
|
|
||||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
|
|
||||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm3,%ymm14,%ymm14
|
|
||||||
|
|
||||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
|
|
||||||
vpaddd %ymm15,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm5,%ymm5
|
|
||||||
vpslld $12,%ymm5,%ymm0
|
|
||||||
vpsrld $20,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
|
|
||||||
vpaddd %ymm12,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm6,%ymm6
|
|
||||||
vpslld $12,%ymm6,%ymm0
|
|
||||||
vpsrld $20,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
|
|
||||||
vpaddd %ymm13,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm7,%ymm7
|
|
||||||
vpslld $12,%ymm7,%ymm0
|
|
||||||
vpsrld $20,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
|
|
||||||
vpaddd %ymm14,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm4,%ymm4
|
|
||||||
vpslld $12,%ymm4,%ymm0
|
|
||||||
vpsrld $20,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
|
|
||||||
# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
|
|
||||||
vpaddd 0x00(%rsp),%ymm5,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpxor %ymm0,%ymm15,%ymm15
|
|
||||||
vpshufb %ymm2,%ymm15,%ymm15
|
|
||||||
# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
|
|
||||||
vpaddd 0x20(%rsp),%ymm6,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpxor %ymm0,%ymm12,%ymm12
|
|
||||||
vpshufb %ymm2,%ymm12,%ymm12
|
|
||||||
# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
|
|
||||||
vpaddd 0x40(%rsp),%ymm7,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpxor %ymm0,%ymm13,%ymm13
|
|
||||||
vpshufb %ymm2,%ymm13,%ymm13
|
|
||||||
# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
|
|
||||||
vpaddd 0x60(%rsp),%ymm4,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpxor %ymm0,%ymm14,%ymm14
|
|
||||||
vpshufb %ymm2,%ymm14,%ymm14
|
|
||||||
|
|
||||||
# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
|
|
||||||
vpaddd %ymm15,%ymm10,%ymm10
|
|
||||||
vpxor %ymm10,%ymm5,%ymm5
|
|
||||||
vpslld $7,%ymm5,%ymm0
|
|
||||||
vpsrld $25,%ymm5,%ymm5
|
|
||||||
vpor %ymm0,%ymm5,%ymm5
|
|
||||||
# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
|
|
||||||
vpaddd %ymm12,%ymm11,%ymm11
|
|
||||||
vpxor %ymm11,%ymm6,%ymm6
|
|
||||||
vpslld $7,%ymm6,%ymm0
|
|
||||||
vpsrld $25,%ymm6,%ymm6
|
|
||||||
vpor %ymm0,%ymm6,%ymm6
|
|
||||||
# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
|
|
||||||
vpaddd %ymm13,%ymm8,%ymm8
|
|
||||||
vpxor %ymm8,%ymm7,%ymm7
|
|
||||||
vpslld $7,%ymm7,%ymm0
|
|
||||||
vpsrld $25,%ymm7,%ymm7
|
|
||||||
vpor %ymm0,%ymm7,%ymm7
|
|
||||||
# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
|
|
||||||
vpaddd %ymm14,%ymm9,%ymm9
|
|
||||||
vpxor %ymm9,%ymm4,%ymm4
|
|
||||||
vpslld $7,%ymm4,%ymm0
|
|
||||||
vpsrld $25,%ymm4,%ymm4
|
|
||||||
vpor %ymm0,%ymm4,%ymm4
|
|
||||||
|
|
||||||
dec %ecx
|
|
||||||
jnz .Ldoubleround8
|
|
||||||
|
|
||||||
# x0..15[0-3] += s[0..15]
|
|
||||||
vpbroadcastd 0x00(%rdi),%ymm0
|
|
||||||
vpaddd 0x00(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x00(%rsp)
|
|
||||||
vpbroadcastd 0x04(%rdi),%ymm0
|
|
||||||
vpaddd 0x20(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x20(%rsp)
|
|
||||||
vpbroadcastd 0x08(%rdi),%ymm0
|
|
||||||
vpaddd 0x40(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x40(%rsp)
|
|
||||||
vpbroadcastd 0x0c(%rdi),%ymm0
|
|
||||||
vpaddd 0x60(%rsp),%ymm0,%ymm0
|
|
||||||
vmovdqa %ymm0,0x60(%rsp)
|
|
||||||
vpbroadcastd 0x10(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm4,%ymm4
|
|
||||||
vpbroadcastd 0x14(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm5,%ymm5
|
|
||||||
vpbroadcastd 0x18(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm6,%ymm6
|
|
||||||
vpbroadcastd 0x1c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm7,%ymm7
|
|
||||||
vpbroadcastd 0x20(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm8,%ymm8
|
|
||||||
vpbroadcastd 0x24(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm9,%ymm9
|
|
||||||
vpbroadcastd 0x28(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm10,%ymm10
|
|
||||||
vpbroadcastd 0x2c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm11,%ymm11
|
|
||||||
vpbroadcastd 0x30(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm12,%ymm12
|
|
||||||
vpbroadcastd 0x34(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm13,%ymm13
|
|
||||||
vpbroadcastd 0x38(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm14,%ymm14
|
|
||||||
vpbroadcastd 0x3c(%rdi),%ymm0
|
|
||||||
vpaddd %ymm0,%ymm15,%ymm15
|
|
||||||
|
|
||||||
# x12 += counter values 0-3
|
|
||||||
vpaddd %ymm1,%ymm12,%ymm12
|
|
||||||
|
|
||||||
# interleave 32-bit words in state n, n+1
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vmovdqa 0x20(%rsp),%ymm1
|
|
||||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
|
||||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
|
||||||
vmovdqa %ymm2,0x00(%rsp)
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vmovdqa 0x60(%rsp),%ymm1
|
|
||||||
vpunpckldq %ymm1,%ymm0,%ymm2
|
|
||||||
vpunpckhdq %ymm1,%ymm0,%ymm1
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa %ymm1,0x60(%rsp)
|
|
||||||
vmovdqa %ymm4,%ymm0
|
|
||||||
vpunpckldq %ymm5,%ymm0,%ymm4
|
|
||||||
vpunpckhdq %ymm5,%ymm0,%ymm5
|
|
||||||
vmovdqa %ymm6,%ymm0
|
|
||||||
vpunpckldq %ymm7,%ymm0,%ymm6
|
|
||||||
vpunpckhdq %ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm8,%ymm0
|
|
||||||
vpunpckldq %ymm9,%ymm0,%ymm8
|
|
||||||
vpunpckhdq %ymm9,%ymm0,%ymm9
|
|
||||||
vmovdqa %ymm10,%ymm0
|
|
||||||
vpunpckldq %ymm11,%ymm0,%ymm10
|
|
||||||
vpunpckhdq %ymm11,%ymm0,%ymm11
|
|
||||||
vmovdqa %ymm12,%ymm0
|
|
||||||
vpunpckldq %ymm13,%ymm0,%ymm12
|
|
||||||
vpunpckhdq %ymm13,%ymm0,%ymm13
|
|
||||||
vmovdqa %ymm14,%ymm0
|
|
||||||
vpunpckldq %ymm15,%ymm0,%ymm14
|
|
||||||
vpunpckhdq %ymm15,%ymm0,%ymm15
|
|
||||||
|
|
||||||
# interleave 64-bit words in state n, n+2
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vmovdqa 0x40(%rsp),%ymm2
|
|
||||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
|
||||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
|
||||||
vmovdqa %ymm1,0x00(%rsp)
|
|
||||||
vmovdqa %ymm2,0x40(%rsp)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vmovdqa 0x60(%rsp),%ymm2
|
|
||||||
vpunpcklqdq %ymm2,%ymm0,%ymm1
|
|
||||||
vpunpckhqdq %ymm2,%ymm0,%ymm2
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa %ymm2,0x60(%rsp)
|
|
||||||
vmovdqa %ymm4,%ymm0
|
|
||||||
vpunpcklqdq %ymm6,%ymm0,%ymm4
|
|
||||||
vpunpckhqdq %ymm6,%ymm0,%ymm6
|
|
||||||
vmovdqa %ymm5,%ymm0
|
|
||||||
vpunpcklqdq %ymm7,%ymm0,%ymm5
|
|
||||||
vpunpckhqdq %ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm8,%ymm0
|
|
||||||
vpunpcklqdq %ymm10,%ymm0,%ymm8
|
|
||||||
vpunpckhqdq %ymm10,%ymm0,%ymm10
|
|
||||||
vmovdqa %ymm9,%ymm0
|
|
||||||
vpunpcklqdq %ymm11,%ymm0,%ymm9
|
|
||||||
vpunpckhqdq %ymm11,%ymm0,%ymm11
|
|
||||||
vmovdqa %ymm12,%ymm0
|
|
||||||
vpunpcklqdq %ymm14,%ymm0,%ymm12
|
|
||||||
vpunpckhqdq %ymm14,%ymm0,%ymm14
|
|
||||||
vmovdqa %ymm13,%ymm0
|
|
||||||
vpunpcklqdq %ymm15,%ymm0,%ymm13
|
|
||||||
vpunpckhqdq %ymm15,%ymm0,%ymm15
|
|
||||||
|
|
||||||
# interleave 128-bit words in state n, n+4
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
|
|
||||||
vmovdqa %ymm1,0x00(%rsp)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
|
|
||||||
vmovdqa %ymm1,0x20(%rsp)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
|
|
||||||
vmovdqa %ymm1,0x40(%rsp)
|
|
||||||
vmovdqa 0x60(%rsp),%ymm0
|
|
||||||
vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
|
|
||||||
vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
|
|
||||||
vmovdqa %ymm1,0x60(%rsp)
|
|
||||||
vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
|
|
||||||
vmovdqa %ymm0,%ymm8
|
|
||||||
vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
|
|
||||||
vmovdqa %ymm0,%ymm9
|
|
||||||
vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
|
|
||||||
vmovdqa %ymm0,%ymm10
|
|
||||||
vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
|
|
||||||
vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
|
|
||||||
vmovdqa %ymm0,%ymm11
|
|
||||||
|
|
||||||
# xor with corresponding input, write to output
|
|
||||||
vmovdqa 0x00(%rsp),%ymm0
|
|
||||||
vpxor 0x0000(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0000(%rsi)
|
|
||||||
vmovdqa 0x20(%rsp),%ymm0
|
|
||||||
vpxor 0x0080(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0080(%rsi)
|
|
||||||
vmovdqa 0x40(%rsp),%ymm0
|
|
||||||
vpxor 0x0040(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x0040(%rsi)
|
|
||||||
vmovdqa 0x60(%rsp),%ymm0
|
|
||||||
vpxor 0x00c0(%rdx),%ymm0,%ymm0
|
|
||||||
vmovdqu %ymm0,0x00c0(%rsi)
|
|
||||||
vpxor 0x0100(%rdx),%ymm4,%ymm4
|
|
||||||
vmovdqu %ymm4,0x0100(%rsi)
|
|
||||||
vpxor 0x0180(%rdx),%ymm5,%ymm5
|
|
||||||
vmovdqu %ymm5,0x00180(%rsi)
|
|
||||||
vpxor 0x0140(%rdx),%ymm6,%ymm6
|
|
||||||
vmovdqu %ymm6,0x0140(%rsi)
|
|
||||||
vpxor 0x01c0(%rdx),%ymm7,%ymm7
|
|
||||||
vmovdqu %ymm7,0x01c0(%rsi)
|
|
||||||
vpxor 0x0020(%rdx),%ymm8,%ymm8
|
|
||||||
vmovdqu %ymm8,0x0020(%rsi)
|
|
||||||
vpxor 0x00a0(%rdx),%ymm9,%ymm9
|
|
||||||
vmovdqu %ymm9,0x00a0(%rsi)
|
|
||||||
vpxor 0x0060(%rdx),%ymm10,%ymm10
|
|
||||||
vmovdqu %ymm10,0x0060(%rsi)
|
|
||||||
vpxor 0x00e0(%rdx),%ymm11,%ymm11
|
|
||||||
vmovdqu %ymm11,0x00e0(%rsi)
|
|
||||||
vpxor 0x0120(%rdx),%ymm12,%ymm12
|
|
||||||
vmovdqu %ymm12,0x0120(%rsi)
|
|
||||||
vpxor 0x01a0(%rdx),%ymm13,%ymm13
|
|
||||||
vmovdqu %ymm13,0x01a0(%rsi)
|
|
||||||
vpxor 0x0160(%rdx),%ymm14,%ymm14
|
|
||||||
vmovdqu %ymm14,0x0160(%rsi)
|
|
||||||
vpxor 0x01e0(%rdx),%ymm15,%ymm15
|
|
||||||
vmovdqu %ymm15,0x01e0(%rsi)
|
|
||||||
|
|
||||||
vzeroupper
|
|
||||||
lea -8(%r10),%rsp
|
|
||||||
ret
|
|
||||||
ENDPROC(chacha20_8block_xor_avx2)
|
|
|
@ -1,146 +0,0 @@
|
||||||
/*
|
|
||||||
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
|
|
||||||
*
|
|
||||||
* Copyright (C) 2015 Martin Willi
|
|
||||||
*
|
|
||||||
* This program is free software; you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU General Public License as published by
|
|
||||||
* the Free Software Foundation; either version 2 of the License, or
|
|
||||||
* (at your option) any later version.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <crypto/algapi.h>
|
|
||||||
#include <crypto/chacha.h>
|
|
||||||
#include <crypto/internal/skcipher.h>
|
|
||||||
#include <linux/kernel.h>
|
|
||||||
#include <linux/module.h>
|
|
||||||
#include <asm/fpu/api.h>
|
|
||||||
#include <asm/simd.h>
|
|
||||||
|
|
||||||
#define CHACHA20_STATE_ALIGN 16
|
|
||||||
|
|
||||||
asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
|
|
||||||
static bool chacha20_use_avx2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
|
|
||||||
unsigned int bytes)
|
|
||||||
{
|
|
||||||
u8 buf[CHACHA_BLOCK_SIZE];
|
|
||||||
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
if (chacha20_use_avx2) {
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
|
||||||
chacha20_8block_xor_avx2(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 8;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 8;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 8;
|
|
||||||
state[12] += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
|
||||||
chacha20_4block_xor_ssse3(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE * 4;
|
|
||||||
src += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
dst += CHACHA_BLOCK_SIZE * 4;
|
|
||||||
state[12] += 4;
|
|
||||||
}
|
|
||||||
while (bytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_block_xor_ssse3(state, dst, src);
|
|
||||||
bytes -= CHACHA_BLOCK_SIZE;
|
|
||||||
src += CHACHA_BLOCK_SIZE;
|
|
||||||
dst += CHACHA_BLOCK_SIZE;
|
|
||||||
state[12]++;
|
|
||||||
}
|
|
||||||
if (bytes) {
|
|
||||||
memcpy(buf, src, bytes);
|
|
||||||
chacha20_block_xor_ssse3(state, buf, buf);
|
|
||||||
memcpy(dst, buf, bytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static int chacha20_simd(struct skcipher_request *req)
|
|
||||||
{
|
|
||||||
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
|
||||||
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
|
||||||
u32 *state, state_buf[16 + 2] __aligned(8);
|
|
||||||
struct skcipher_walk walk;
|
|
||||||
int err;
|
|
||||||
|
|
||||||
BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
|
|
||||||
state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
|
|
||||||
|
|
||||||
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
|
|
||||||
return crypto_chacha_crypt(req);
|
|
||||||
|
|
||||||
err = skcipher_walk_virt(&walk, req, true);
|
|
||||||
|
|
||||||
crypto_chacha_init(state, ctx, walk.iv);
|
|
||||||
|
|
||||||
kernel_fpu_begin();
|
|
||||||
|
|
||||||
while (walk.nbytes >= CHACHA_BLOCK_SIZE) {
|
|
||||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
rounddown(walk.nbytes, CHACHA_BLOCK_SIZE));
|
|
||||||
err = skcipher_walk_done(&walk,
|
|
||||||
walk.nbytes % CHACHA_BLOCK_SIZE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (walk.nbytes) {
|
|
||||||
chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
|
|
||||||
walk.nbytes);
|
|
||||||
err = skcipher_walk_done(&walk, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
kernel_fpu_end();
|
|
||||||
|
|
||||||
return err;
|
|
||||||
}
|
|
||||||
|
|
||||||
static struct skcipher_alg alg = {
|
|
||||||
.base.cra_name = "chacha20",
|
|
||||||
.base.cra_driver_name = "chacha20-simd",
|
|
||||||
.base.cra_priority = 300,
|
|
||||||
.base.cra_blocksize = 1,
|
|
||||||
.base.cra_ctxsize = sizeof(struct chacha_ctx),
|
|
||||||
.base.cra_module = THIS_MODULE,
|
|
||||||
|
|
||||||
.min_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.max_keysize = CHACHA_KEY_SIZE,
|
|
||||||
.ivsize = CHACHA_IV_SIZE,
|
|
||||||
.chunksize = CHACHA_BLOCK_SIZE,
|
|
||||||
.setkey = crypto_chacha20_setkey,
|
|
||||||
.encrypt = chacha20_simd,
|
|
||||||
.decrypt = chacha20_simd,
|
|
||||||
};
|
|
||||||
|
|
||||||
static int __init chacha20_simd_mod_init(void)
|
|
||||||
{
|
|
||||||
if (!boot_cpu_has(X86_FEATURE_SSSE3))
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
#ifdef CONFIG_AS_AVX2
|
|
||||||
chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
|
|
||||||
boot_cpu_has(X86_FEATURE_AVX2) &&
|
|
||||||
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
|
|
||||||
#endif
|
|
||||||
return crypto_register_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __exit chacha20_simd_mod_fini(void)
|
|
||||||
{
|
|
||||||
crypto_unregister_skcipher(&alg);
|
|
||||||
}
|
|
||||||
|
|
||||||
module_init(chacha20_simd_mod_init);
|
|
||||||
module_exit(chacha20_simd_mod_fini);
|
|
||||||
|
|
||||||
MODULE_LICENSE("GPL");
|
|
||||||
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
|
|
||||||
MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20");
|
|
||||||
MODULE_ALIAS_CRYPTO("chacha20-simd");
|
|
322
arch/x86/crypto/chacha_glue.c
Normal file
322
arch/x86/crypto/chacha_glue.c
Normal file
|
@ -0,0 +1,322 @@
|
||||||
|
/*
|
||||||
|
* x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
|
||||||
|
* including ChaCha20 (RFC7539)
|
||||||
|
*
|
||||||
|
* Copyright (C) 2015 Martin Willi
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; either version 2 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <crypto/algapi.h>
|
||||||
|
#include <crypto/internal/chacha.h>
|
||||||
|
#include <crypto/internal/skcipher.h>
|
||||||
|
#include <linux/kernel.h>
|
||||||
|
#include <linux/module.h>
|
||||||
|
#include <asm/fpu/api.h>
|
||||||
|
#include <asm/simd.h>
|
||||||
|
|
||||||
|
asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
|
||||||
|
asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int len, int nrounds);
|
||||||
|
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2);
|
||||||
|
static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl);
|
||||||
|
|
||||||
|
static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
|
||||||
|
{
|
||||||
|
len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
|
||||||
|
return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
|
||||||
|
unsigned int bytes, int nrounds)
|
||||||
|
{
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX512) &&
|
||||||
|
static_branch_likely(&chacha_use_avx512vl)) {
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||||
|
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
state[12] += 8;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_8block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 8);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||||
|
chacha_4block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
chacha_2block_xor_avx512vl(state, dst, src, bytes,
|
||||||
|
nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (IS_ENABLED(CONFIG_AS_AVX2) &&
|
||||||
|
static_branch_likely(&chacha_use_avx2)) {
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 8) {
|
||||||
|
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 8;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 8;
|
||||||
|
state[12] += 8;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 8);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE * 2) {
|
||||||
|
chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 2);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
|
||||||
|
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
bytes -= CHACHA_BLOCK_SIZE * 4;
|
||||||
|
src += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
dst += CHACHA_BLOCK_SIZE * 4;
|
||||||
|
state[12] += 4;
|
||||||
|
}
|
||||||
|
if (bytes > CHACHA_BLOCK_SIZE) {
|
||||||
|
chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
state[12] += chacha_advance(bytes, 4);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (bytes) {
|
||||||
|
chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
|
||||||
|
state[12]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd()) {
|
||||||
|
hchacha_block_generic(state, stream, nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
hchacha_block_ssse3(state, stream, nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(hchacha_block_arch);
|
||||||
|
|
||||||
|
void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv)
|
||||||
|
{
|
||||||
|
chacha_init_generic(state, key, iv);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_init_arch);
|
||||||
|
|
||||||
|
void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes,
|
||||||
|
int nrounds)
|
||||||
|
{
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) || !may_use_simd() ||
|
||||||
|
bytes <= CHACHA_BLOCK_SIZE)
|
||||||
|
return chacha_crypt_generic(state, dst, src, bytes, nrounds);
|
||||||
|
|
||||||
|
do {
|
||||||
|
unsigned int todo = min_t(unsigned int, bytes, SZ_4K);
|
||||||
|
|
||||||
|
kernel_fpu_begin();
|
||||||
|
chacha_dosimd(state, dst, src, todo, nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
|
||||||
|
bytes -= todo;
|
||||||
|
src += todo;
|
||||||
|
dst += todo;
|
||||||
|
} while (bytes);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL(chacha_crypt_arch);
|
||||||
|
|
||||||
|
static int chacha_simd_stream_xor(struct skcipher_request *req,
|
||||||
|
const struct chacha_ctx *ctx, const u8 *iv)
|
||||||
|
{
|
||||||
|
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||||
|
struct skcipher_walk walk;
|
||||||
|
int err;
|
||||||
|
|
||||||
|
err = skcipher_walk_virt(&walk, req, false);
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, iv);
|
||||||
|
|
||||||
|
while (walk.nbytes > 0) {
|
||||||
|
unsigned int nbytes = walk.nbytes;
|
||||||
|
|
||||||
|
if (nbytes < walk.total)
|
||||||
|
nbytes = round_down(nbytes, walk.stride);
|
||||||
|
|
||||||
|
if (!static_branch_likely(&chacha_use_simd) ||
|
||||||
|
!may_use_simd()) {
|
||||||
|
chacha_crypt_generic(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
} else {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
chacha_dosimd(state, walk.dst.virt.addr,
|
||||||
|
walk.src.virt.addr, nbytes,
|
||||||
|
ctx->nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
}
|
||||||
|
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int chacha_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
|
||||||
|
return chacha_simd_stream_xor(req, ctx, req->iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int xchacha_simd(struct skcipher_request *req)
|
||||||
|
{
|
||||||
|
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
|
||||||
|
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
|
||||||
|
u32 state[CHACHA_STATE_WORDS] __aligned(8);
|
||||||
|
struct chacha_ctx subctx;
|
||||||
|
u8 real_iv[16];
|
||||||
|
|
||||||
|
chacha_init_generic(state, ctx->key, req->iv);
|
||||||
|
|
||||||
|
if (req->cryptlen > CHACHA_BLOCK_SIZE && irq_fpu_usable()) {
|
||||||
|
kernel_fpu_begin();
|
||||||
|
hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
|
||||||
|
kernel_fpu_end();
|
||||||
|
} else {
|
||||||
|
hchacha_block_generic(state, subctx.key, ctx->nrounds);
|
||||||
|
}
|
||||||
|
subctx.nrounds = ctx->nrounds;
|
||||||
|
|
||||||
|
memcpy(&real_iv[0], req->iv + 24, 8);
|
||||||
|
memcpy(&real_iv[8], req->iv + 16, 8);
|
||||||
|
return chacha_simd_stream_xor(req, &subctx, real_iv);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * skcipher algorithm registrations for the SIMD-accelerated ChaCha family.
 * All three are stream ciphers (cra_blocksize = 1, chunksize = one ChaCha
 * block); encrypt and decrypt map to the same handler since the cipher is
 * its own inverse.
 */
static struct skcipher_alg algs[] = {
	{
		/* ChaCha20: 96-bit IV, 20 rounds (set via chacha20_setkey). */
		.base.cra_name		= "chacha20",
		.base.cra_driver_name	= "chacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= CHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= chacha_simd,
		.decrypt		= chacha_simd,
	}, {
		/* XChaCha20: extended 192-bit nonce, 20 rounds. */
		.base.cra_name		= "xchacha20",
		.base.cra_driver_name	= "xchacha20-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha20_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	}, {
		/* XChaCha12: extended nonce, reduced 12-round variant. */
		.base.cra_name		= "xchacha12",
		.base.cra_driver_name	= "xchacha12-simd",
		.base.cra_priority	= 300,
		.base.cra_blocksize	= 1,
		.base.cra_ctxsize	= sizeof(struct chacha_ctx),
		.base.cra_module	= THIS_MODULE,

		.min_keysize		= CHACHA_KEY_SIZE,
		.max_keysize		= CHACHA_KEY_SIZE,
		.ivsize			= XCHACHA_IV_SIZE,
		.chunksize		= CHACHA_BLOCK_SIZE,
		.setkey			= chacha12_setkey,
		.encrypt		= xchacha_simd,
		.decrypt		= xchacha_simd,
	},
};
|
||||||
|
|
||||||
|
/*
 * Module init: probe CPU features, flip the matching static keys, and
 * register the skcipher algorithms.
 *
 * SSSE3 is the baseline for every SIMD path; without it the module loads
 * as a no-op (returns 0 without registering anything).  AVX2 and
 * AVX-512VL are layered on top when both the toolchain (CONFIG_AS_*) and
 * the CPU support them.
 */
static int __init chacha_simd_mod_init(void)
{
	bool have_avx2;

	if (!boot_cpu_has(X86_FEATURE_SSSE3))
		return 0;

	static_branch_enable(&chacha_use_simd);

	have_avx2 = IS_ENABLED(CONFIG_AS_AVX2) &&
		    boot_cpu_has(X86_FEATURE_AVX) &&
		    boot_cpu_has(X86_FEATURE_AVX2) &&
		    cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
	if (have_avx2) {
		static_branch_enable(&chacha_use_avx2);

		/* The AVX-512VL code also requires AVX512BW, for kmovq. */
		if (IS_ENABLED(CONFIG_AS_AVX512) &&
		    boot_cpu_has(X86_FEATURE_AVX512VL) &&
		    boot_cpu_has(X86_FEATURE_AVX512BW))
			static_branch_enable(&chacha_use_avx512vl);
	}

	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return 0;

	return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||||
|
|
||||||
|
/*
 * Module exit: undo registration, mirroring the init path.  The SSSE3
 * check matches init's early return, so we never unregister algorithms
 * that were never registered.
 */
static void __exit chacha_simd_mod_fini(void)
{
	if (!IS_REACHABLE(CONFIG_CRYPTO_BLKCIPHER))
		return;

	if (boot_cpu_has(X86_FEATURE_SSSE3))
		crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
|
||||||
|
|
||||||
|
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
/* Aliases so the crypto API can autoload this module by algorithm name. */
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-simd");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-simd");
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue