UPSTREAM: crypto: x86/chacha20 - Add a 2-block AVX2 variant

This variant uses the same principle as the single block SSSE3 variant
by shuffling the state matrix after each round. With the wider AVX
registers, we can do two blocks in parallel, though.
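For illustration only, here is a minimal scalar C sketch of that shuffle idea
(not the kernel code; names and layout are mine): after the four column
quarter-rounds, rows 1-3 of the 4x4 state are rotated so that the diagonal
quarter-rounds can again be computed column-wise, then the rotation is undone.
The SIMD variants do these rotations with register shuffles instead of loops,
and the 2-block AVX2 variant presumably keeps the corresponding row of each
block in one 128-bit lane of a ymm register, so both blocks take every round
in lockstep:

#include <stdint.h>
#include <stdio.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

/* Rotate row r of the 4x4 word matrix left by n positions. */
static void rotate_row(uint32_t x[16], int r, int n)
{
	uint32_t tmp[4];
	int i;

	for (i = 0; i < 4; i++)
		tmp[i] = x[r * 4 + (i + n) % 4];
	for (i = 0; i < 4; i++)
		x[r * 4 + i] = tmp[i];
}

static void chacha20_doubleround(uint32_t x[16])
{
	int i;

	/* Column quarter-rounds. */
	for (i = 0; i < 4; i++)
		quarterround(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);

	/* Shuffle rows 1-3 so the diagonals line up as columns ... */
	rotate_row(x, 1, 1);
	rotate_row(x, 2, 2);
	rotate_row(x, 3, 3);

	/* ... making the diagonal rounds plain column quarter-rounds. */
	for (i = 0; i < 4; i++)
		quarterround(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);

	/* Undo the shuffle before the next double round. */
	rotate_row(x, 1, 3);
	rotate_row(x, 2, 2);
	rotate_row(x, 3, 1);
}

int main(void)
{
	/* "expa" "nd 3" "2-by" "te k" constants; rest left zero for the demo. */
	uint32_t x[16] = { 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 };
	int i;

	for (i = 0; i < 10; i++)	/* ChaCha20 runs 10 double rounds */
		chacha20_doubleround(x);
	printf("%08x\n", x[0]);
	return 0;
}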

This function can increase performance and efficiency significantly for
lengths that would otherwise require a 4-block function.
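For a sense of the accounting involved: a 100-byte request needs two blocks of
keystream, which previously meant invoking a 4-block function; with this patch
the AVX2 glue hands it to the 2-block kernel and bumps the block counter via
chacha20_advance(bytes, 2), as the hunk below shows. That helper is not part of
this diff; the version here is only a sketch of its apparent intent, assuming
it rounds the remaining length up to whole blocks and caps the result at the
variant's width:

#include <stdio.h>

#define CHACHA_BLOCK_SIZE 64

/*
 * Sketch only (the real helper lives elsewhere in the glue code):
 * round a partial length up to whole 64-byte blocks, capped at the
 * number of blocks the chosen SIMD variant generates, so that
 * state[12] (the block counter) stays in sync with the keystream
 * actually consumed.
 */
static unsigned int chacha20_advance(unsigned int len, unsigned int maxblocks)
{
	unsigned int cap = maxblocks * CHACHA_BLOCK_SIZE;

	if (len > cap)
		len = cap;
	return (len + CHACHA_BLOCK_SIZE - 1) / CHACHA_BLOCK_SIZE;
}

int main(void)
{
	printf("%u\n", chacha20_advance(100, 2));	/* 2: 100 bytes use two blocks */
	printf("%u\n", chacha20_advance(64, 2));	/* 1: exactly one block */
	return 0;
}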

Signed-off-by: Martin Willi <martin@strongswan.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
(cherry picked from commit a5dd97f86211e91219807db607d740f9896b8e0b)
Bug: 152722841
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: Ie0011f26fd59257ace84299f7a65a6c11da13b47

@@ -24,6 +24,8 @@ asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
 					  unsigned int len);
 #ifdef CONFIG_AS_AVX2
+asmlinkage void chacha20_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+					 unsigned int len);
 asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
 					 unsigned int len);
 static bool chacha20_use_avx2;
@@ -52,6 +54,11 @@ static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
 			state[12] += chacha20_advance(bytes, 8);
 			return;
 		}
+		if (bytes > CHACHA_BLOCK_SIZE) {
+			chacha20_2block_xor_avx2(state, dst, src, bytes);
+			state[12] += chacha20_advance(bytes, 2);
+			return;
+		}
 	}
 #endif
 	while (bytes >= CHACHA_BLOCK_SIZE * 4) {