[SQUASH] treewide: Revert backported crypto algos

* breaks VoWiFi; besides, I saw little to no improvement
  in Geekbench AES-XTS scores with these
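
For reference, one quick way to confirm which xts(aes) backend the crypto API ends up
selecting after this revert is to compare the driver priorities reported in /proc/crypto.
The helper below is only a minimal user-space sketch for that check (a hypothetical
example, not part of this change; it assumes /proc/crypto is enabled and the XTS
implementations are already registered):

    /* check_xts.c - hypothetical helper, not part of this change.
     * Lists the registered xts(aes) implementations and their priorities
     * from /proc/crypto; the highest-priority entry is the driver the
     * crypto API will normally hand out.
     */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        FILE *f = fopen("/proc/crypto", "r");
        char line[256], name[128] = "", driver[128] = "";

        if (!f) {
            perror("/proc/crypto");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* each algorithm entry is a block of "key : value" lines */
            sscanf(line, "name : %127s", name);
            sscanf(line, "driver : %127s", driver);
            /* print the driver and priority of every xts(aes) entry */
            if (!strcmp(name, "xts(aes)") && !strncmp(line, "priority", 8))
                printf("%-24s%s", driver, strchr(line, ':') + 1);
        }
        fclose(f);
        return 0;
    }

In this tree the CE glue registers at priority 300 and the NEON glue at 200 (see the
PRIO defines in the aes-glue diff further down), so the output makes it obvious which
implementation actually serves xts(aes).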

Revert "ARM64/configs: surya: Enable backported crypto algos"

This reverts commit 53cf765af430ea1ce8b99cfdebfaf1fbd05ce9e7.

Revert "arm64: crypto: aes-glue: always clear out defines before assuming"

This reverts commit 6bdf40bd12304f3c26c7278be8172dae4d355d23.

Revert "lib: crypto: move out aes generic library into arm64"

This reverts commit 6f53883b2cac7ec3818464e90738182c7da727b4.

Revert "crypto: bring back blkcipher"

This reverts commit 49ab3ad968fdf0f3b15a7014d6958cc46e50c4a3.

Revert "crypto: lib/aes - export sbox and inverse sbox"

This reverts commit 6d7509597228bb2824b229b6f92200621af17039.

Revert "crypto: lib/aes add aes generic"

This reverts commit cd956b7d9c2533f72a41eaa1421d7896c867137c.

Revert "HACK: include: crypto: aes.h: redirect aes function definitions to current ones"

This reverts commit 90035f7567d034bebd0672724d2ee341d9ef789a.

Revert "arm64: assembler: add utility macros to push/pop stack frames"

This reverts commit 5899671c86c750a23f68fce359ac4574f289b252.

Revert "crypto: skcipher - add the ability to abort a skcipher walk"

This reverts commit 6bfa9de12bea39c63115d573b292e6352a85ef69.

Revert "crypto: aes/fixed-time - align key schedule with other implementations"

This reverts commit 6b930b634c0bab896b127835ffba9a3c2d796760.

Revert "crypto: sm4 - export encrypt/decrypt routines to other drivers"

This reverts commit 306ce112895e48e8c67f5531dcc7794b8c9799d2.

Revert "crypto: sm4 - introduce SM4 symmetric cipher algorithm"

This reverts commit 9e677e8a2fe26fd4694dfff61279bd85237aacc8.

Revert "crypto: hash - introduce crypto_shash_tfm_digest()"

This reverts commit 2e972aa2655e7e50b4362d92e13b78723ab32c01.

Revert "crypto: ctr - add helper for performing a CTR encryption walk"

This reverts commit 20d0ffb036d9f88eb7b2c056202a9e6f4cece6bf.

Revert "crypto: sm3 - export crypto_sm3_final function"

This reverts commit d6983c76e3da30db111421795beda5c703955aa9.

Revert "crypto: sm3 - add OSCCA SM3 secure hash"

This reverts commit d6a29406704f117dfd2cc99b78078831d1fe287e.

Revert "crypto: don't optimize keccakf()"

This reverts commit 598d21bacff6319be4aa058025c632b107413f32.

Revert "crypto: sha3-generic - Use __optimize to support old compilers"

This reverts commit 734a8b2ae19a5e6e1ca1ada7ccb333a9655a0673.

Revert "crypto: sha3-generic - deal with oversize stack frames"

This reverts commit 83f16c21f6e058890a258a2a5b765046f6d1230c.

Revert "crypto: sha3-generic - export init/update/final routines"

This reverts commit 591f7afb9d0890c2ee9460360112aa4e448a3924.

Revert "crypto: sha3-generic - simplify code"

This reverts commit 6a9ea8f69fb0baa3d403fbbb82e052d684bc9fec.

Revert "crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize"

This reverts commit f7f038880d90b321a48788160beea14fa294f109.

Revert "arm64: crypto: fallback to may_use_yield"

This reverts commit 5dc3c1306cd51f5ca293e8b76f7f382ba68da2c2.

Revert "arm64: assembler: add cond_yield macro"

This reverts commit 10ec3894fbd69c75638c44925408b4a618a6afa8.

Revert "crypto: arm64/aes-ce - deal with oversight in new CTR carry code"

This reverts commit 769190a03e8e1373a4e6ebfffa409bd89a123e85.

Revert "crypto: arm64/crc-t10dif - move NEON yield to C code"

This reverts commit ae264b6f57eb729aaeb9f8c0cb64d6da7d2a61ae.

Revert "crypto: arm64/aes-ce-mac - simplify NEON yield"

This reverts commit f497267b2d962c1d1e3227b68ed47450235c0134.

Revert "crypto: arm64/aes-neonbs - remove NEON yield calls"

This reverts commit e3bf48533edde9d66e0de81df758b75e5daf7649.

Revert "crypto: arm64/sha512-ce - simplify NEON yield"

This reverts commit 223c7a38e2597442d1bfa9b8cf1d22d4e5bc975b.

Revert "crypto: arm64/sha3-ce - simplify NEON yield"

This reverts commit c4ab0f1013d7f75e77bbb190d2d0a45931777e06.

Revert "crypto: arm64/sha2-ce - simplify NEON yield"

This reverts commit c194acd3691aa17bb4bfff16124b2a3f476d0f4f.

Revert "crypto: arm64/sha1-ce - simplify NEON yield"

This reverts commit 2c13ddfe897ccda89cdbbb77d44522b7a1b52d71.

Revert "crypto: arm64/sha - add missing module aliases"

This reverts commit 0ccfa37388086d02858f9a829a8424ea6fe91369.

Revert "crypto: arm64/aes-ctr - improve tail handling"

This reverts commit 76050bf6b606b2a634d5d2615e3e37da30de1ccd.

Revert "crypto: arm64/aes-ce - really hide slower algos when faster ones are enabled"

This reverts commit 5648ba13a3c67b2f15544c73d75525f4b47af4f6.

Revert "crypto: arm64/gcm - move authentication tag check to SIMD domain"

This reverts commit 1efe38494e3a102b79f267e85359dad696483790.

Revert "crypto: arm64/chacha - simplify tail block handling"

This reverts commit c5f4710f95bbc3b530f22d6752cd1ff3d2006f67.

Revert "crypto: hash - Use memzero_explicit() for clearing state"

This reverts commit 1ec2a7f09f56c06773a754b1037d9463d44cf5b0.

Revert "crypto: arm64: Use x16 with indirect branch to bti_c"

This reverts commit a8c3cc4987341e820cf33589fb8be1e80d6a1455.

Revert "crypto: arm64/gcm - Fix endianness warnings"

This reverts commit 02adba8b71b749dbd0d4912e233e18b20f5a8a9d.

Revert "crypto: arm64/sha - Add declarations for assembly variables"

This reverts commit 435ec992f88cefba133194f1ee249805d482d8b2.

Revert "crypto: arm64/gcm - use inline helper to suppress indirect calls"

This reverts commit bc92b86fadeb48ae4bfef2b0d3c5b9ba078a4328.

Revert "crypto: arm64/gcm - use variably sized key struct"

This reverts commit 0aa3bc4ef6a0400c29fa82bf222996ed63a221f9.

Revert "crypto: arm64/gcm - disentangle ghash and gcm setkey() routines"

This reverts commit 4ff08ef5dd27b020b66fbe45f61282f20942c7cb.

Revert "crypto: arm64/ghash - drop PMULL based shash"

This reverts commit 313a347394fc9593669d088608fa327c21481d16.

Revert "crypto: arm64/aes-glue - use crypto_shash_tfm_digest()"

This reverts commit fb5c6d2fd18e01024fc8a509945545ec6925cdd7.

Revert "crypto: arm64 - Consistently enable extension"

This reverts commit 6eae8549471bc9b81b8d25859e6683e9be4cc67c.

Revert "crypto: arm/neon - memzero_explicit aes-cbc key"

This reverts commit fda9fcfafd679f7ed2ef75f8859ff91680b9dd6b.

Revert "arm64: crypto: Modernize names for AES function macros"

This reverts commit 2d88fdbd9de0f95ef5b8e135474a40004b0b63c3.

Revert "arm64: crypto: Modernize some extra assembly annotations"

This reverts commit d6b0bf996b79a03a37c407bfe93a8c80adbd7821.

Revert "crypto: arm64/sha-ce - implement export/import"

This reverts commit 62e0842054a83d169791e1fddb0699589b82079d.

Revert "crypto: arm64 - Use modern annotations for assembly functions"

This reverts commit cecb3c804506cae6693fa140435dc47a5ee9154c.

Revert "crypto: arm64/ghash-neon - bump priority to 150"

This reverts commit c1c96a11a6618724bd9f2c094d22118a6244439a.

Revert "crypto: arm64/sha - fix function types"

This reverts commit b2a6b1e16dc2abd35ec6cb0fee1bb5681fc27e95.

Revert "crypto: skcipher - rename the crypto_blkcipher module and kconfig option"

This reverts commit 515e10b92b2b7bf161703c0ffbdbb7273626de1d.

Revert "crypto: arm64/aes-neonbs - add return value of skcipher_walk_done() in __xts_crypt()"

This reverts commit 06cd75589703d1b6bce80320a182b595ed42ab8e.

Revert "crypto: arm64/gcm-ce - implement 4 way interleave"

This reverts commit c14ebb4667dd5e494867ee129a997c6d2198507d.

Revert "crypto: arm64/aes-neonbs - implement ciphertext stealing for XTS"

This reverts commit d5397cf5dc32382901d611cd724eec6c3817656c.

Revert "crypto: arm64/aes - implement support for XTS ciphertext stealing"

This reverts commit 192933ba203e238cae5b16ab599e4cfa6b698d51.

Revert "crypto: arm64/aes-cts-cbc - move request context data to the stack"

This reverts commit f71bf2534f6d610950b010790b8e5675ed5455c7.

Revert "crypto: arm64/aes-cts-cbc-ce - performance tweak"

This reverts commit 7e8f61db4e78db217a3364ee47cfca764cbc5f0d.

Revert "crypto: arm64/aes-neon - limit exposed routines if faster driver is enabled"

This reverts commit ecd648682b2dc6bed3214b2e43a0eca35b63094d.

Revert "crypto: arm64/aes-neonbs - replace tweak mask literal with composition"

This reverts commit 5be595c6ba8edae081cdb63dca0353115d2484d1.

Revert "crypto: arm64/aes - Use PTR_ERR_OR_ZERO rather than its implementation."

This reverts commit aab710faff1ec9001c805568e6b8831103e3ecb6.

Revert "crypto: arm64 - Rename functions to avoid conflict with crypto/sha256.h"

This reverts commit caf246ef54b23c9675585fa555d4bb541bd59c92.

Revert "crypto: arm64/aes - implement accelerated ESSIV/CBC mode"

This reverts commit f8779aee4500c50d6d94bd980a2842308abeb79c.

Revert "crypto: arm64/aes-cts-cbc - factor out CBC en/decryption of a walk"

This reverts commit e0a1f4f64a8d1f4edf497ff327e016fbffadf66f.

Revert "crypto: arm64/aes-cipher - switch to shared AES inverse Sbox"

This reverts commit 186ffd910b0d5b0489d89ba6ca191c0cd376f05c.

Revert "crypto: arm64/aes-neon - switch to shared AES Sboxes"

This reverts commit 59e5b082956da502c60ac5760a915b486d1b8e5a.

Revert "crypto: arm64/aes-ce-cipher - use AES library as fallback"

This reverts commit a700606c4170512da6f9b4c96aba4f6df8ccf8a7.

Revert "crypto: aes - move sync ctr(aes) to AES library and generic helper"

This reverts commit 48b821c53fc518c80c410047748577bea1983e71.

Revert "crypto: arm64/aes-ce - switch to library version of key expansion routine"

This reverts commit cdd168c7cd95008f372dfadc2ba3861fc906ca60.

Revert "crypto: arm64/aes-neonbs - switch to library version of key expansion routine"

This reverts commit e2eb9cb4f948c5541bf39bbe3c5b4298c8a92909.

Revert "crypto: arm64/aes-ccm - switch to AES library"

This reverts commit 9fd25afeced4df2f1da1ecdebab4833b2aa98c3a.

Revert "crypto: arm64/ghash - switch to AES library"

This reverts commit 8790d3e83a3f13e887eadd033719c08e1156ad52.

Revert "crypto: aes - rename local routines to prevent future clashes"

This reverts commit cef928c589372b9fc280695fcaecfb6c21b7ee83.

Revert "crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR"

This reverts commit fdce296ea17bff501601164bb25b783798b4a402.

Revert "crypto: arm64/aes-ce - add 5 way interleave routines"

This reverts commit 917c3db52e00f145613a362143cb01fa16e72676.

Revert "crypto: chacha - constify ctx and iv arguments"

This reverts commit c83b6ad8ff05e72d0142471ed848023d984da356.

Revert "arm64: HWCAP: add support for AT_HWCAP2"

This reverts commit 9bb745a4cea3de91bf69fe01af7532f2877f6ac3.

Revert "crypto: arm64/cbcmac - handle empty messages in same way as template"

This reverts commit 4e90675cbb0ff662a9027f73d1f13fd6ec3d694a.

Revert "crypto: arm64 - convert to use crypto_simd_usable()"

This reverts commit 3b5ca4665ab8281d8ccec34709aafa8e43dc2e6d.

Revert "crypto: arm64/gcm-aes-ce - fix no-NEON fallback code"

This reverts commit 924a1110fe35f605fb92804200bf21b4594479d3.

Revert "crypto: arm64/chacha - fix hchacha_block_neon() for big endian"

This reverts commit 6ecb0ec105fdd67f4f60736e166ae33bc96c3821.

Revert "crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian"

This reverts commit e0c9ed3235e0d9460acc931eff40d02244553204.

Revert "crypto: arm64/aes-blk - update IV after partial final CTR block"

This reverts commit 18b10273b19d1e5e9e29d27948f68fb9a9291a28.

Revert "crypto: arm64/aes-neonbs - fix returning final keystream block"

This reverts commit 9f297390e264ffa3f1a00ff46f21c656dde26df9.

Revert "crypto: arm64/crct10dif-ce - cleanup and optimizations"

This reverts commit 2aae1e3d62019544b8716d1b69ebf90614e8cdab.

Revert "crypto: arm64/crct10dif - register PMULL variants as separate algos"

This reverts commit bb84013e29730f4f68d227d2668a67a23eea8187.

Revert "crypto: arm64/crct10dif - remove dead code"

This reverts commit e9a91b8f781aa9b9ce9b8cba96b84f564aa5c9f4.

Revert "crypto: arm64/ghash - register PMULL variants as separate algos"

This reverts commit 7d747b051d69639da21c96e934909611c9f81549.

Revert "crypto: arm64/aes-ccm - don't use an atomic walk needlessly"

This reverts commit 25b54060800d593a4ffdb34579210fc804b5489a.

Revert "crypto: arm64/aes-ccm - fix logical bug in AAD MAC handling"

This reverts commit 8b995fb58cc5c8c6bf7489dab8fe8142301ac1da.

Revert "crypto: arm64/chacha - use combined SIMD/ALU routine for more speed"

This reverts commit fefef573014532d1deb1bdc428d3a71306d0e558.

Revert "crypto: arm64/chacha - optimize for arbitrary length inputs"

This reverts commit d7764657bd5578c796275ba5dbbac2ad713cfdca.

Revert "crypto: arm64/chacha - add XChaCha12 support"

This reverts commit e6c5e231ae1cd27d51526fafa4a7b081f332b019.

Revert "crypto: arm64/chacha20 - refactor to allow varying number of rounds"

This reverts commit fe2906d03233b94e1ddc91c505a6d2f385cc6679.

Revert "crypto: arm64/chacha20 - add XChaCha20 support"

This reverts commit b4e2ca051adf26b8cafad012b16b39d96cb77d3f.

Revert "crypto: arm64/nhpoly1305 - add NEON-accelerated NHPoly1305"

This reverts commit f5d169f37be30ca6360232e88357d715a578174f.

Revert "crypto: arm64/aes-blk - ensure XTS mask is always loaded"

This reverts commit a21d4e59e8d54c315ccbc196def1aa15ee6538c9.

Revert "crypto: arm64/aes - fix handling sub-block CTS-CBC inputs"

This reverts commit 2211e34cc97c80548ab831166373fefc7dc2f29d.

Revert "crypto: arm64/aes-blk - improve XTS mask handling"

This reverts commit 3fef20ad89773585e81febe97e7337664b488450.

Revert "crypto: arm64/aes-blk - add support for CTS-CBC mode"

This reverts commit 54f8af3c79f88da71c3173be25cbd790c8bd07ec.

Revert "crypto: arm64/aes-blk - revert NEON yield for skciphers"

This reverts commit 4007f6d321639b70146b5f3c1e4612c8f8158fee.

Revert "crypto: arm64/aes-blk - remove pointless (u8 *) casts"

This reverts commit 3e38de9d524ebc2d509f4af77a1ef53d71579f6b.

Revert "crypto: arm64/crct10dif - implement non-Crypto Extensions alternative"

This reverts commit 7546637992dddb0d5ed8ec161f5aa00b5e13d5e6.

Revert "crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version"

This reverts commit 4a0e214b37f461ea570b3e23982447bb2cad31a5.

Revert "crypto: arm64/crc32 - remove PMULL based CRC32 driver"

This reverts commit 7362148d02a38e69e11f2c5f9985ff7375d4bd29.

Revert "crypto: arm64/aes-modes - get rid of literal load of addend vector"

This reverts commit ca48e35e57323d4ce03694240616a9d967c8ebfd.

Revert "crypto: arm64/aes-gcm-ce - fix scatterwalk API violation"

This reverts commit 842ecc8548a5ba41706ce1943f3caabed69ac06a.

Revert "crypto: arm64/sm4-ce - check for the right CPU feature bit"

This reverts commit 0d9a3ccbc34f63956c254406faf28ba2ae22fe6f.

Revert "crypto: arm64/ghash-ce - implement 4-way aggregation"

This reverts commit c2b969ace9aade2625ae01f466289f05762192ff.

Revert "crypto: arm64/ghash-ce - replace NEON yield check with block limit"

This reverts commit 811b2c5d4d1ade3c71dfc62da2ed082490acae6a.

Revert "crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable"

This reverts commit e873ab3d42c7ad1906ec869900a0507fabc0d517.

Revert "crypto: arm64/aes-ce-gcm - implement 2-way aggregation"

This reverts commit 94ccba0f7c486355122d25da43c39097d81a02c7.

Revert "crypto: arm64/aes-ce-gcm - operate on two input blocks at a time"

This reverts commit f2105c4e7e500b80b28d8ebc2f17db32c1d8bfd6.

Revert "crypto: arm64 - revert NEON yield for fast AEAD implementations"

This reverts commit 39e662f0f9d3db97eccef09f36f2fefc9a618bbd.

Revert "crypto/arm64: aes-ce-gcm - add missing kernel_neon_begin/end pair"

This reverts commit 15d003b13075a16712dcbdb177c41b2bddbcbf0e.

Revert "crypto: arm64/sha256 - increase cra_priority of scalar implementations"

This reverts commit 2ef73cb76eb12bb45bf0f89a49b7dc7126fadd47.

Revert "crypto: shash - remove useless setting of type flags"

This reverts commit c1d4b72ca1321bfcbee7b86390b808428c3a27fa.

Revert "crypto: arm64/aes-blk - fix and move skcipher_walk_done out of kernel_neon_begin, _end"

This reverts commit ac65336b8bd522c96fd50f49ef9c4ceee3603487.

Revert "crypto: clarify licensing of OpenSSL asm code"

This reverts commit e989520a26841372c045c493e7e2c8f7bbda4f59.

Revert "crypto: arm64/sha512-ce - yield NEON after every block of input"

This reverts commit 3bdd591e9b4da8c93d9ef7ae6049cf28385098a0.

Revert "crypto: arm64/sha3-ce - yield NEON after every block of input"

This reverts commit e3d3203e0645587a72da93e482f38695f107c3a4.

Revert "crypto: arm64/crct10dif-ce - yield NEON after every block of input"

This reverts commit 859ffc1c5d38d109ca9c57f0703e792f3a9582c2.

Revert "crypto: arm64/crc32-ce - yield NEON after every block of input"

This reverts commit 5f1c71918b39f512e4c1f4daeeb3c9d5706351a8.

Revert "crypto: arm64/aes-ghash - yield NEON after every block of input"

This reverts commit 3125f780eb7ac53a4cdbeed97ae91e8ec5945ea1.

Revert "crypto: arm64/aes-bs - yield NEON after every block of input"

This reverts commit ad2d657e731bd4d462cedbcd6ef5662a265d75a0.

Revert "Revert "crypto: arm64/aes-neonbs - fix returning final keystream block""

This reverts commit 1525d077c18b78b2953cf0d1868c444c37d3c682.

Revert "crypto: arm64/aes-blk - yield NEON after every block of input"

This reverts commit 5accaa34eea96c527aa8ec741383d3f38042ff69.

Revert "crypto: arm64/aes-ccm - yield NEON after every block of input"

This reverts commit c8f4e1b6574b2d4fdc06cd352b6950a8da2bef6c.

Revert "Revert "crypto: arm64/aes-ccm - fix logical bug in AAD MAC handling""

This reverts commit 1629c18dda48bbac7154022dccdf9eb952ba8643.

Revert "crypto: arm64/sha2-ce - yield NEON after every block of input"

This reverts commit 343779e3c70586299ab67f3f9216562ab9bd6001.

Revert "crypto: arm64/sha1-ce - yield NEON after every block of input"

This reverts commit 68106b7893e6e78da5dc39601cba4d7a60071dea.

Revert "crypto: arm64 - add support for SM4 encryption using special instructions"

This reverts commit b9547b0195e0c89e59f41a631eadad77220d8993.

Revert "crypto: arm64/sha256-neon - play nice with CONFIG_PREEMPT kernels"

This reverts commit 33045c52eb89af1dca46aa2f31b9ace6848eb1fd.

Revert "crypto: arm64/aes-blk - add 4 way interleave to CBC-MAC encrypt path"

This reverts commit 2624ccfec2c746fd5c38e52be423d81d772ab785.

Revert "crypto: arm64/aes-blk - add 4 way interleave to CBC encrypt path"

This reverts commit 26acced8924afe42757e2f20420d2b412fd5122d.

Revert "crypto: arm64/aes-blk - remove configurable interleave"

This reverts commit aade51c37fda96baf6708504fae38dd7e8f3b525.

Revert "crypto: arm64/chacha20 - move kernel mode neon en/disable into loop"

This reverts commit 563fdb9d8718c999feff21a56e855d7d368bfe38.

Revert "crypto: arm64/aes-bs - move kernel mode neon en/disable into loop"

This reverts commit 7693fb31e5ec7a3a442abc82ce6dc223e0070c63.

Revert "crypto: arm64/aes-blk - move kernel mode neon en/disable into loop"

This reverts commit f3d47872ae322555f1933b89deaedf15f01a1630.

Revert "crypto: arm64/aes-ce-ccm - move kernel mode neon en/disable into loop"

This reverts commit 208213ffed71f368561cddafd96b8d726a96d628.

Revert "crypto: arm64/speck - add NEON-accelerated implementation of Speck-XTS"

This reverts commit a53ebc68fb69e48a0670b74f2107339937af236b.

Revert "crypto: arm64/sha512 - fix/improve new v8.2 Crypto Extensions code"

This reverts commit 3fe60009d5c1a9619412ff4e8f3a3963b90b03ee.

Revert "crypto: arm64/sm3 - new v8.2 Crypto Extensions implementation"

This reverts commit f12a399dc824600b5df583418a96759d7b76df3d.

Revert "crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation"

This reverts commit 0224378dd15ddc1e5bf6999b26f6a722be125604.

Revert "crypto: arm64/sha1-ce - get rid of literal pool"

This reverts commit 537e00dd0c79e6c31c4b3b2adde9727aa9f2590f.

Revert "crypto: arm64/sha2-ce - move the round constant table to .rodata section"

This reverts commit 11f38eee4f30ae202563dce4742be2819cf5ffe7.

Revert "crypto: arm64/crct10dif - move literal data to .rodata section"

This reverts commit 89710355b705b55f2c410c698ee8cd510c769672.

Revert "crypto: arm64/crc32 - move literal data to .rodata section"

This reverts commit d3fada575171f56e3dbbc402b119db0a7e106b65.

Revert "crypto: arm64/aes-neon - move literal data to .rodata section"

This reverts commit b73eae3af478173d89be32546d3db209c241a2ac.

Revert "crypto: arm64/aes-cipher - move S-box to .rodata section"

This reverts commit fc79c79af1578456030b820c8855887f79460eec.

Revert "crypto: arm64 - implement SHA-512 using special instructions"

This reverts commit f85618f0d28e37abd7fbb83acaa3de5e32849c5e.

Revert "crypto: arm64/aes - do not call crypto_unregister_skcipher twice on error"

This reverts commit 095b634144846d00932b2d68b212373b203d8265.

Revert "[SQUASH] arm64: crypto: Revert old backports"

This reverts commit 62e858fc144999fd13942977e9de8f0ab4b17041.
Adithya R 2021-08-13 13:08:37 +05:30
parent 147f696539
commit c65c2e9bc6
153 changed files with 2935 additions and 7140 deletions


@ -81,7 +81,7 @@ config CRYPTO_AES_ARM
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SIMD
select CRYPTO_AES
help
@ -97,7 +97,7 @@ config CRYPTO_AES_ARM_BS
config CRYPTO_AES_ARM_CE
tristate "Accelerated AES using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SIMD
help
Use an implementation of AES in CBC, CTR and XTS modes that uses
@ -127,7 +127,7 @@ config CRYPTO_CRC32_ARM_CE
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha stream cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_NHPOLY1305_NEON


@ -19,7 +19,7 @@ EXPORT_SYMBOL(__aes_arm_encrypt);
asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
EXPORT_SYMBOL(__aes_arm_decrypt);
static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -27,7 +27,7 @@ static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
}
static void aes_arm_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -47,8 +47,8 @@ static struct crypto_alg aes_alg = {
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
.cra_cipher.cia_encrypt = aes_arm_encrypt,
.cra_cipher.cia_decrypt = aes_arm_decrypt,
.cra_cipher.cia_encrypt = aes_encrypt,
.cra_cipher.cia_decrypt = aes_decrypt,
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
.cra_alignmask = 3,


@ -132,7 +132,6 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
memzero_explicit(&rk, sizeof(rk));
return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
}


@ -62,7 +62,7 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
}
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
struct chacha_ctx *ctx, u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];


@ -152,7 +152,7 @@ static struct shash_alg ghash_alg = {
.cra_name = "__ghash",
.cra_driver_name = "__driver-ghash-ce",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_INTERNAL,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_key),
.cra_module = THIS_MODULE,

View File

@ -1,14 +1,4 @@
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and


@ -75,6 +75,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -67,6 +67,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -83,6 +83,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -78,6 +78,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -92,6 +93,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -1,19 +1,12 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.


@ -1,18 +1,11 @@
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA256 block procedure for ARMv4. May 2007.


@ -71,6 +71,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -85,6 +86,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -79,6 +79,7 @@ struct shash_alg sha256_neon_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -93,6 +94,7 @@ struct shash_alg sha256_neon_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -1,19 +1,12 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.


@ -1,18 +1,11 @@
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.


@ -63,6 +63,7 @@ static struct shash_alg sha512_arm_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-arm",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -77,6 +78,7 @@ static struct shash_alg sha512_arm_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-arm",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -75,6 +75,7 @@ struct shash_alg sha512_neon_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-neon",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
@ -90,6 +91,7 @@ struct shash_alg sha512_neon_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-neon",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -596,6 +596,7 @@ CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
CONFIG_CRYPTO_CRC32_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=m


@ -727,13 +727,7 @@ CONFIG_CRYPTO_DEV_QCOM_ICE=y
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_SHA512_ARM64_CE=y
CONFIG_CRYPTO_SHA3_ARM64=y
CONFIG_CRYPTO_SM3_ARM64_CE=y
CONFIG_CRYPTO_SM4_ARM64_CE=y
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=y
CONFIG_CRYPTO_NHPOLY1305_NEON=y
CONFIG_CRYPTO_AES_ARM64_BS=y
CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y


@ -29,42 +29,24 @@ config CRYPTO_SHA2_ARM64_CE
select CRYPTO_HASH
select CRYPTO_SHA256_ARM64
config CRYPTO_SHA512_ARM64_CE
tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA512_ARM64
config CRYPTO_SHA3_ARM64
tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA3
config CRYPTO_SM3_ARM64_CE
tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SM3
config CRYPTO_SM4_ARM64_CE
tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_SM4
config CRYPTO_GHASH_ARM64_CE
tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_GF128MUL
select CRYPTO_LIB_AES
select CRYPTO_AES
select CRYPTO_AES_ARM64
config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
config CRYPTO_CRC32_ARM64_CE
tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
depends on CRC32
select CRYPTO_HASH
config CRYPTO_AES_ARM64
tristate "AES core cipher using scalar instructions"
select CRYPTO_AES
@ -73,20 +55,20 @@ config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
select CRYPTO_AES_ARM64
config CRYPTO_AES_ARM64_CE_CCM
tristate "AES in CCM mode using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_AEAD
select CRYPTO_LIB_AES
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
@ -94,35 +76,23 @@ config CRYPTO_AES_ARM64_CE_BLK
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_AES
select CRYPTO_SIMD
config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
tristate "NEON accelerated ChaCha20 symmetric cipher"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_SIMD
config CRYPTO_SPECK_NEON
tristate "NEON accelerated Speck cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_SPECK
endif


@ -8,32 +8,21 @@
# published by the Free Software Foundation.
#
obj-y := aes-lib.o
obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
@ -52,14 +41,8 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
speck-neon-y := speck-neon-core.o speck-neon-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o


@ -18,7 +18,7 @@
* void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
* u32 *macp, u8 const rk[], u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_auth_data)
ENTRY(ce_aes_ccm_auth_data)
ldr w8, [x3] /* leftover from prev round? */
ld1 {v0.16b}, [x0] /* load mac */
cbz w8, 1f
@ -84,13 +84,13 @@ SYM_FUNC_START(ce_aes_ccm_auth_data)
st1 {v0.16b}, [x0]
10: str w8, [x3]
ret
SYM_FUNC_END(ce_aes_ccm_auth_data)
ENDPROC(ce_aes_ccm_auth_data)
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_final)
ENTRY(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
@ -124,7 +124,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
ret
SYM_FUNC_END(ce_aes_ccm_final)
ENDPROC(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
ldr x8, [x6, #8] /* load lower ctr */
@ -215,10 +215,10 @@ CPU_LE( rev x8, x8 )
* u8 const rk[], u32 rounds, u8 mac[],
* u8 ctr[]);
*/
SYM_FUNC_START(ce_aes_ccm_encrypt)
ENTRY(ce_aes_ccm_encrypt)
aes_ccm_do_crypt 1
SYM_FUNC_END(ce_aes_ccm_encrypt)
ENDPROC(ce_aes_ccm_encrypt)
SYM_FUNC_START(ce_aes_ccm_decrypt)
ENTRY(ce_aes_ccm_decrypt)
aes_ccm_do_crypt 0
SYM_FUNC_END(ce_aes_ccm_decrypt)
ENDPROC(ce_aes_ccm_decrypt)


@ -14,7 +14,6 @@
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
@ -46,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
u32 rounds);
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
unsigned int key_len)
{
@ -106,13 +107,11 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
}
static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
u32 abytes, u32 *macp)
u32 abytes, u32 *macp, bool use_neon)
{
if (may_use_simd()) {
kernel_neon_begin();
if (likely(use_neon)) {
ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
num_rounds(key));
kernel_neon_end();
} else {
if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
int added = min(abytes, AES_BLOCK_SIZE - *macp);
@ -125,7 +124,8 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
}
while (abytes >= AES_BLOCK_SIZE) {
aes_encrypt(key, mac, mac);
__aes_arm64_encrypt(key->key_enc, mac, mac,
num_rounds(key));
crypto_xor(mac, in, AES_BLOCK_SIZE);
in += AES_BLOCK_SIZE;
@ -133,14 +133,16 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
}
if (abytes > 0) {
aes_encrypt(key, mac, mac);
__aes_arm64_encrypt(key->key_enc, mac, mac,
num_rounds(key));
crypto_xor(mac, in, abytes);
*macp = abytes;
}
}
}
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
bool use_neon)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
@ -159,7 +161,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
ltag.len = 6;
}
ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp);
ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp, use_neon);
scatterwalk_start(&walk, req->src);
do {
@ -171,7 +173,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
n = scatterwalk_clamp(&walk, len);
}
p = scatterwalk_map(&walk);
ccm_update_mac(ctx, mac, p, n, &macp);
ccm_update_mac(ctx, mac, p, n, &macp, use_neon);
len -= n;
scatterwalk_unmap(p);
@ -205,8 +207,10 @@ static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
bsize = nbytes;
crypto_inc(walk->iv, AES_BLOCK_SIZE);
aes_encrypt(ctx, buf, walk->iv);
aes_encrypt(ctx, mac, mac);
__aes_arm64_encrypt(ctx->key_enc, buf, walk->iv,
num_rounds(ctx));
__aes_arm64_encrypt(ctx->key_enc, mac, mac,
num_rounds(ctx));
if (enc)
crypto_xor(mac, src, bsize);
crypto_xor_cpy(dst, src, buf, bsize);
@ -221,8 +225,8 @@ static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
}
if (!err) {
aes_encrypt(ctx, buf, iv0);
aes_encrypt(ctx, mac, mac);
__aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx));
__aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx));
crypto_xor(mac, buf, AES_BLOCK_SIZE);
}
return err;
@ -236,42 +240,43 @@ static int ccm_encrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen;
bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
if (likely(use_neon))
kernel_neon_begin();
if (req->assoclen)
ccm_calculate_auth_mac(req, mac);
ccm_calculate_auth_mac(req, mac, use_neon);
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
err = skcipher_walk_aead_encrypt(&walk, req, true);
if (may_use_simd()) {
if (likely(use_neon)) {
while (walk.nbytes) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
if (walk.nbytes == walk.total)
tail = 0;
kernel_neon_begin();
ce_aes_ccm_encrypt(walk.dst.virt.addr,
walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, tail);
}
if (!err) {
kernel_neon_begin();
if (!err)
ce_aes_ccm_final(mac, buf, ctx->key_enc,
num_rounds(ctx));
kernel_neon_end();
}
kernel_neon_end();
} else {
err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
}
@ -294,42 +299,43 @@ static int ccm_decrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen - authsize;
bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
if (likely(use_neon))
kernel_neon_begin();
if (req->assoclen)
ccm_calculate_auth_mac(req, mac);
ccm_calculate_auth_mac(req, mac, use_neon);
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, false);
err = skcipher_walk_aead_decrypt(&walk, req, true);
if (may_use_simd()) {
if (likely(use_neon)) {
while (walk.nbytes) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
if (walk.nbytes == walk.total)
tail = 0;
kernel_neon_begin();
ce_aes_ccm_decrypt(walk.dst.virt.addr,
walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, tail);
}
if (!err) {
kernel_neon_begin();
if (!err)
ce_aes_ccm_final(mac, buf, ctx->key_enc,
num_rounds(ctx));
kernel_neon_end();
}
kernel_neon_end();
} else {
err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
}
@ -367,7 +373,7 @@ static struct aead_alg ccm_aes_alg = {
static int __init aes_mod_init(void)
{
if (!cpu_have_named_feature(AES))
if (!(elf_hwcap & HWCAP_AES))
return -ENODEV;
return crypto_register_aead(&ccm_aes_alg);
}


@ -11,7 +11,7 @@
.arch armv8-a+crypto
SYM_FUNC_START(__aes_ce_encrypt)
ENTRY(__aes_ce_encrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@ -37,9 +37,9 @@ SYM_FUNC_START(__aes_ce_encrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(__aes_ce_encrypt)
ENDPROC(__aes_ce_encrypt)
SYM_FUNC_START(__aes_ce_decrypt)
ENTRY(__aes_ce_decrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@ -65,23 +65,23 @@ SYM_FUNC_START(__aes_ce_decrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(__aes_ce_decrypt)
ENDPROC(__aes_ce_decrypt)
/*
* __aes_ce_sub() - use the aese instruction to perform the AES sbox
* substitution on each byte in 'input'
*/
SYM_FUNC_START(__aes_ce_sub)
ENTRY(__aes_ce_sub)
dup v1.4s, w0
movi v0.16b, #0
aese v0.16b, v1.16b
umov w0, v0.s[0]
ret
SYM_FUNC_END(__aes_ce_sub)
ENDPROC(__aes_ce_sub)
SYM_FUNC_START(__aes_ce_invert)
ENTRY(__aes_ce_invert)
ld1 {v0.4s}, [x1]
aesimc v1.16b, v0.16b
st1 {v1.4s}, [x0]
ret
SYM_FUNC_END(__aes_ce_invert)
ENDPROC(__aes_ce_invert)


@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
@ -23,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
@ -51,7 +53,7 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
aes_encrypt(ctx, dst, src);
__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
return;
}
@ -65,7 +67,7 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
aes_decrypt(ctx, dst, src);
__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
return;
}


@ -12,21 +12,11 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func)
#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func)
#define AES_ENTRY(func) ENTRY(ce_ ## func)
#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
.arch armv8-a+crypto
xtsmask .req v16
cbciv .req v16
vctr .req v16
.macro xts_reload_mask, tmp
.endm
.macro xts_cts_skip_tw, reg, lbl
.endm
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@ -40,24 +30,21 @@
.endm
/* prepare for encryption with key in rk[] */
.macro enc_prepare, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro enc_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for encryption (again) but with new key in rk[] */
.macro enc_switch_key, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro enc_switch_key, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for decryption with key in rk[] */
.macro dec_prepare, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro dec_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
aes\mc \i0\().16b, \i0\().16b
.ifnb \i1
@ -68,34 +55,27 @@
aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b
.ifnb \i4
aes\de \i4\().16b, \k\().16b
aes\mc \i4\().16b, \i4\().16b
.endif
.endif
.endif
.endm
/* up to 5 interleaved encryption rounds with the same round key */
.macro round_Nx, enc, k, i0, i1, i2, i3, i4
/* up to 4 interleaved encryption rounds with the same round key */
.macro round_Nx, enc, k, i0, i1, i2, i3
.ifc \enc, e
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
.else
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
.endif
.endm
/* up to 5 interleaved final rounds */
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
/* up to 4 interleaved final rounds */
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
.ifnb \i4
aes\de \i4\().16b, \k\().16b
.endif
.endif
.endif
eor \i0\().16b, \i0\().16b, \k2\().16b
@ -104,52 +84,47 @@
.ifnb \i3
eor \i2\().16b, \i2\().16b, \k2\().16b
eor \i3\().16b, \i3\().16b, \k2\().16b
.ifnb \i4
eor \i4\().16b, \i4\().16b, \k2\().16b
.endif
.endif
.endif
.endm
/* up to 5 interleaved blocks */
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
/* up to 4 interleaved blocks */
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v17, \i0, \i1, \i2, \i3
round_Nx \enc, v18, \i0, \i1, \i2, \i3
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
round_Nx \enc, v20, \i0, \i1, \i2, \i3
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, \key, \i0, \i1, \i2, \i3
.endr
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
.endm
.macro encrypt_block, in, rounds, t0, t1, t2
do_block_Nx e, \rounds, \in
.endm
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1
.endm
.macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.endm
.macro decrypt_block, in, rounds, t0, t1, t2
do_block_Nx d, \rounds, \in
.endm
.macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1
.endm
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
.endm
.macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4
.endm
#define MAX_STRIDE 5
#include "aes-modes.S"


@ -125,11 +125,48 @@ CPU_BE( rev w7, w7 )
ret
.endm
SYM_FUNC_START(__aes_arm64_encrypt)
.align L1_CACHE_SHIFT
.type __aes_arm64_inverse_sbox, %object
__aes_arm64_inverse_sbox:
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
ENTRY(__aes_arm64_encrypt)
do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)
ENDPROC(__aes_arm64_encrypt)
.align 5
SYM_FUNC_START(__aes_arm64_decrypt)
do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)
ENTRY(__aes_arm64_decrypt)
do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
ENDPROC(__aes_arm64_decrypt)


@ -13,9 +13,12 @@
#include <linux/module.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_encrypt);
static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_decrypt);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -23,7 +26,7 @@ static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
}
static void aes_arm64_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -43,8 +46,8 @@ static struct crypto_alg aes_alg = {
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
.cra_cipher.cia_encrypt = aes_arm64_encrypt,
.cra_cipher.cia_decrypt = aes_arm64_decrypt
.cra_cipher.cia_encrypt = aes_encrypt,
.cra_cipher.cia_decrypt = aes_decrypt
};
static int __init aes_init(void)


@ -0,0 +1,53 @@
/*
* Fallback for sync aes(ctr) in contexts where kernel mode NEON
* is not allowed
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <crypto/aes.h>
#include <crypto/internal/skcipher.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx,
struct skcipher_request *req)
{
struct skcipher_walk walk;
u8 buf[AES_BLOCK_SIZE];
int err;
err = skcipher_walk_virt(&walk, req, true);
while (walk.nbytes > 0) {
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
int nbytes = walk.nbytes;
int tail = 0;
if (nbytes < walk.total) {
nbytes = round_down(nbytes, AES_BLOCK_SIZE);
tail = walk.nbytes % AES_BLOCK_SIZE;
}
do {
int bsize = min(nbytes, AES_BLOCK_SIZE);
__aes_arm64_encrypt(ctx->key_enc, buf, walk.iv,
6 + ctx->key_length / 4);
crypto_xor_cpy(dst, src, buf, bsize);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
nbytes -= AES_BLOCK_SIZE;
} while (nbytes > 0);
err = skcipher_walk_done(&walk, tail);
}
return err;
}


@ -12,45 +12,25 @@
#include <asm/hwcap.h>
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/sha.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <crypto/xts.h>
#include "aes-ce-setkey.h"
#undef aes_expandkey
#undef aes_ecb_encrypt
#undef aes_ecb_decrypt
#undef aes_cbc_encrypt
#undef aes_cbc_decrypt
#undef aes_cbc_cts_encrypt
#undef aes_cbc_cts_decrypt
#undef aes_essiv_cbc_encrypt
#undef aes_essiv_cbc_decrypt
#undef aes_ctr_encrypt
#undef aes_xts_encrypt
#undef aes_xts_decrypt
#undef aes_mac_update
#include "aes-ctr-fallback.h"
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
#define STRIDE 5
#define aes_setkey ce_aes_setkey
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
#define aes_cbc_encrypt ce_aes_cbc_encrypt
#define aes_cbc_decrypt ce_aes_cbc_decrypt
#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt
#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
@ -59,84 +39,59 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
#define STRIDE 4
#define aes_setkey crypto_aes_set_key
#define aes_expandkey crypto_aes_expand_key
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
#define aes_cbc_decrypt neon_aes_cbc_decrypt
#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt
#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
#endif
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
#endif
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
MODULE_ALIAS_CRYPTO("cmac(aes)");
MODULE_ALIAS_CRYPTO("xcbc(aes)");
MODULE_ALIAS_CRYPTO("cbcmac(aes)");
#endif
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* defined in aes-modes.S */
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks);
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks);
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 ctr[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 ctr[], u8 finalbuf[]);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
int first);
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
int first);
asmlinkage void aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
};
struct crypto_aes_essiv_cbc_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
struct crypto_shash *hash;
};
struct mac_tfm_ctx {
struct crypto_aes_ctx key;
u8 __aligned(8) consts[];
@ -150,18 +105,11 @@ struct mac_desc_ctx {
static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
ret = aes_expandkey(ctx, in_key, key_len);
if (ret)
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return ret;
return aes_setkey(crypto_skcipher_tfm(tfm), in_key, key_len);
}
static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
const u8 *in_key, unsigned int key_len)
static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
@ -181,31 +129,7 @@ static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
return -EINVAL;
}
static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
u8 digest[SHA256_DIGEST_SIZE];
int ret;
ret = aes_expandkey(&ctx->key1, in_key, key_len);
if (ret)
goto out;
crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest);
ret = aes_expandkey(&ctx->key2, digest, sizeof(digest));
if (ret)
goto out;
return 0;
out:
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
static int ecb_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@ -218,14 +142,14 @@ static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, blocks);
(u8 *)ctx->key_enc, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused ecb_decrypt(struct skcipher_request *req)
static int ecb_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@ -238,243 +162,51 @@ static int __maybe_unused ecb_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, blocks);
(u8 *)ctx->key_dec, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int cbc_encrypt_walk(struct skcipher_request *req,
struct skcipher_walk *walk)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err = 0, rounds = 6 + ctx->key_length / 4;
unsigned int blocks;
while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr,
ctx->key_enc, rounds, blocks, walk->iv);
kernel_neon_end();
err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused cbc_encrypt(struct skcipher_request *req)
{
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
return cbc_encrypt_walk(req, &walk);
}
static int cbc_decrypt_walk(struct skcipher_request *req,
struct skcipher_walk *walk)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err = 0, rounds = 6 + ctx->key_length / 4;
unsigned int blocks;
while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr,
ctx->key_dec, rounds, blocks, walk->iv);
kernel_neon_end();
err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused cbc_decrypt(struct skcipher_request *req)
{
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
return cbc_decrypt_walk(req, &walk);
}
static int cts_cbc_encrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
unsigned int blocks;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
NULL, NULL);
err = skcipher_walk_virt(&walk, req, false);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (cbc_blocks > 0) {
skcipher_request_set_crypt(&subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false) ?:
cbc_encrypt_walk(&subreq, &walk);
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst,
subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int cts_cbc_decrypt(struct skcipher_request *req)
static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
NULL, NULL);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
}
if (cbc_blocks > 0) {
skcipher_request_set_crypt(&subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false) ?:
cbc_decrypt_walk(&subreq, &walk);
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst,
subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
}
static int __maybe_unused essiv_cbc_init_tfm(struct crypto_skcipher *tfm)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->hash = crypto_alloc_shash("sha256", 0, 0);
return PTR_ERR_OR_ZERO(ctx->hash);
}
static void __maybe_unused essiv_cbc_exit_tfm(struct crypto_skcipher *tfm)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_shash(ctx->hash);
}
static int __maybe_unused essiv_cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key1.key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
blocks = walk.nbytes / AES_BLOCK_SIZE;
if (blocks) {
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_essiv_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, blocks,
req->iv, ctx->key2.key_enc);
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err ?: cbc_encrypt_walk(req, &walk);
}
static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key1.key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
blocks = walk.nbytes / AES_BLOCK_SIZE;
if (blocks) {
kernel_neon_begin();
aes_essiv_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, blocks,
req->iv, ctx->key2.key_enc);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err ?: cbc_decrypt_walk(req, &walk);
return err;
}
static int ctr_encrypt(struct skcipher_request *req)
@ -483,211 +215,95 @@ static int ctr_encrypt(struct skcipher_request *req)
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
int blocks;
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) {
const u8 *src = walk.src.virt.addr;
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
kernel_neon_end();
}
if (walk.nbytes) {
u8 __aligned(8) tail[AES_BLOCK_SIZE];
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
u8 buf[AES_BLOCK_SIZE];
unsigned int tail;
u8 *tdst = walk.dst.virt.addr;
u8 *tsrc = walk.src.virt.addr;
if (unlikely(nbytes < AES_BLOCK_SIZE))
src = memcpy(buf, src, nbytes);
else if (nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
/*
* Tell aes_ctr_encrypt() to process a tail block.
*/
blocks = -1;
kernel_neon_begin();
aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
walk.iv, buf);
aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
blocks, walk.iv);
kernel_neon_end();
tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
if (tail > 0 && tail < AES_BLOCK_SIZE)
/*
* The final partial block could not be returned using
* an overlapping store, so it was passed via buf[]
* instead.
*/
memcpy(dst + nbytes - tail, buf, tail);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
return err;
}
static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
static int ctr_encrypt_sync(struct skcipher_request *req)
{
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
unsigned long flags;
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
/*
* Temporarily disable interrupts to avoid races where
* cachelines are evicted when the CPU is interrupted
* to do something else.
*/
local_irq_save(flags);
aes_encrypt(ctx, dst, src);
local_irq_restore(flags);
}
static int __maybe_unused ctr_encrypt_sync(struct skcipher_request *req)
{
if (!may_use_simd())
return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
return aes_ctr_encrypt_fallback(ctx, req);
return ctr_encrypt(req);
}
static int __maybe_unused xts_encrypt(struct skcipher_request *req)
static int xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
} else {
tail = 0;
}
for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
int nbytes = walk.nbytes;
if (walk.nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, nbytes,
ctx->key2.key_enc, walk.iv, first);
(u8 *)ctx->key1.key_enc, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, walk.nbytes,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int __maybe_unused xts_decrypt(struct skcipher_request *req)
static int xts_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
} else {
tail = 0;
}
for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
int nbytes = walk.nbytes;
if (walk.nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, nbytes,
ctx->key2.key_enc, walk.iv, first);
(u8 *)ctx->key1.key_dec, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, walk.nbytes,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static struct skcipher_alg aes_algs[] = { {
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
.cra_name = "__ecb(aes)",
.cra_driver_name = "__ecb-aes-" MODE,
@ -764,46 +380,9 @@ static struct skcipher_alg aes_algs[] = { {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
.setkey = xts_set_key,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
}, {
#endif
.base = {
.cra_name = "__cts(cbc(aes))",
.cra_driver_name = "__cts-cbc-aes-" MODE,
.cra_priority = PRIO,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
.encrypt = cts_cbc_encrypt,
.decrypt = cts_cbc_decrypt,
}, {
.base = {
.cra_name = "__essiv(cbc(aes),sha256)",
.cra_driver_name = "__essiv-cbc-aes-sha256-" MODE,
.cra_priority = PRIO + 1,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = essiv_cbc_set_key,
.encrypt = essiv_cbc_encrypt,
.decrypt = essiv_cbc_decrypt,
.init = essiv_cbc_init_tfm,
.exit = essiv_cbc_exit_tfm,
} };
static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
@ -833,6 +412,7 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
be128 *consts = (be128 *)ctx->consts;
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
int err;
@ -842,8 +422,7 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
/* encrypt the zero vector */
kernel_neon_begin();
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
rounds, 1);
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
kernel_neon_end();
cmac_gf128_mul_by_x(consts, consts);
@ -862,6 +441,7 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
};
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
u8 key[AES_BLOCK_SIZE];
int err;
@ -871,8 +451,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
return err;
kernel_neon_begin();
aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
kernel_neon_end();
return cbcmac_setkey(tfm, key, sizeof(key));
@ -894,27 +474,21 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
int rounds = 6 + ctx->key_length / 4;
if (may_use_simd()) {
int rem;
do {
kernel_neon_begin();
rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
dg, enc_before, enc_after);
kernel_neon_end();
in += (blocks - rem) * AES_BLOCK_SIZE;
blocks = rem;
enc_before = 0;
} while (blocks);
kernel_neon_begin();
aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
enc_after);
kernel_neon_end();
} else {
if (enc_before)
aes_encrypt(ctx, dg, dg);
__aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds);
while (blocks--) {
crypto_xor(dg, in, AES_BLOCK_SIZE);
in += AES_BLOCK_SIZE;
if (blocks || enc_after)
aes_encrypt(ctx, dg, dg);
__aes_arm64_encrypt(ctx->key_enc, dg, dg,
rounds);
}
}
}
@ -964,7 +538,7 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out)
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0);
mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0);
memcpy(out, ctx->dg, AES_BLOCK_SIZE);
@ -993,6 +567,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cmac(aes)",
.base.cra_driver_name = "cmac-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@ -1008,6 +583,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "xcbc(aes)",
.base.cra_driver_name = "xcbc-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@ -1023,6 +599,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cbcmac(aes)",
.base.cra_driver_name = "cbcmac-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx),
.base.cra_module = THIS_MODULE,
@ -1085,7 +662,6 @@ static int __init aes_init(void)
unregister_simds:
aes_exit();
return err;
unregister_ciphers:
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
@ -1097,7 +673,5 @@ module_cpu_feature_match(AES, aes_init);
module_init(aes_init);
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
EXPORT_SYMBOL(neon_aes_xts_encrypt);
EXPORT_SYMBOL(neon_aes_xts_decrypt);
#endif
module_exit(aes_exit);

View File

@ -1,356 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017-2019 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
#include <crypto/aes.h>
#include <linux/crypto.h>
#include <linux/module.h>
#include <asm/unaligned.h>
/*
* Emit the sbox as volatile const to prevent the compiler from doing
* constant folding on sbox references involving fixed indexes.
*/
static volatile const u8 __cacheline_aligned aes_sbox[] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
};
static volatile const u8 __cacheline_aligned aes_inv_sbox[] = {
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
};
extern const u8 crypto_aes_sbox[256] __alias(aes_sbox);
extern const u8 crypto_aes_inv_sbox[256] __alias(aes_inv_sbox);
EXPORT_SYMBOL(crypto_aes_sbox);
EXPORT_SYMBOL(crypto_aes_inv_sbox);
static u32 mul_by_x(u32 w)
{
u32 x = w & 0x7f7f7f7f;
u32 y = w & 0x80808080;
/* multiply by polynomial 'x' (0b10) in GF(2^8) */
return (x << 1) ^ (y >> 7) * 0x1b;
}
static u32 mul_by_x2(u32 w)
{
u32 x = w & 0x3f3f3f3f;
u32 y = w & 0x80808080;
u32 z = w & 0x40404040;
/* multiply by polynomial 'x^2' (0b100) in GF(2^8) */
return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
}
static u32 mix_columns(u32 x)
{
/*
* Perform the following matrix multiplication in GF(2^8)
*
* | 0x2 0x3 0x1 0x1 | | x[0] |
* | 0x1 0x2 0x3 0x1 | | x[1] |
* | 0x1 0x1 0x2 0x3 | x | x[2] |
* | 0x3 0x1 0x1 0x2 | | x[3] |
*/
u32 y = mul_by_x(x) ^ ror32(x, 16);
return y ^ ror32(x ^ y, 8);
}
static u32 inv_mix_columns(u32 x)
{
/*
* Perform the following matrix multiplication in GF(2^8)
*
* | 0xe 0xb 0xd 0x9 | | x[0] |
* | 0x9 0xe 0xb 0xd | | x[1] |
* | 0xd 0x9 0xe 0xb | x | x[2] |
* | 0xb 0xd 0x9 0xe | | x[3] |
*
* which can conveniently be reduced to
*
* | 0x2 0x3 0x1 0x1 | | 0x5 0x0 0x4 0x0 | | x[0] |
* | 0x1 0x2 0x3 0x1 | | 0x0 0x5 0x0 0x4 | | x[1] |
* | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] |
* | 0x3 0x1 0x1 0x2 | | 0x0 0x4 0x0 0x5 | | x[3] |
*/
u32 y = mul_by_x2(x);
return mix_columns(x ^ y ^ ror32(y, 16));
}
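
For a quick sanity check of the GF(2^8) arithmetic described in the comments above, the two column transforms must be exact inverses of each other. The helpers can be lifted into a tiny standalone C program (re-declared here with ror32 spelled out so it compiles outside the kernel) that verifies inv_mix_columns(mix_columns(x)) == x for a few sample words:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copies of the helpers above. */
static uint32_t ror32(uint32_t w, unsigned int n)
{
	return (w >> n) | (w << (32 - n));
}

static uint32_t mul_by_x(uint32_t w)
{
	uint32_t x = w & 0x7f7f7f7f;
	uint32_t y = w & 0x80808080;

	/* multiply each byte by 'x' (0b10), reducing mod x^8+x^4+x^3+x+1 */
	return (x << 1) ^ (y >> 7) * 0x1b;
}

static uint32_t mul_by_x2(uint32_t w)
{
	uint32_t x = w & 0x3f3f3f3f;
	uint32_t y = w & 0x80808080;
	uint32_t z = w & 0x40404040;

	/* multiply each byte by 'x^2' (0b100) */
	return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
}

static uint32_t mix_columns(uint32_t x)
{
	uint32_t y = mul_by_x(x) ^ ror32(x, 16);

	return y ^ ror32(x ^ y, 8);
}

static uint32_t inv_mix_columns(uint32_t x)
{
	uint32_t y = mul_by_x2(x);

	return mix_columns(x ^ y ^ ror32(y, 16));
}

int main(void)
{
	/* The two matrices are inverses, so the transforms must round-trip. */
	uint32_t samples[] = { 0x00000000, 0xdb135345, 0xf20a225c, 0xdeadbeef };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		assert(inv_mix_columns(mix_columns(samples[i])) == samples[i]);

	/* 0x80 * x reduces to 0x1b, the classic xtime corner case */
	assert(mul_by_x(0x00000080) == 0x0000001b);
	puts("mix_columns / inv_mix_columns round-trip OK");
	return 0;
}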
static __always_inline u32 subshift(u32 in[], int pos)
{
return (aes_sbox[in[pos] & 0xff]) ^
(aes_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^
(aes_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
}
static __always_inline u32 inv_subshift(u32 in[], int pos)
{
return (aes_inv_sbox[in[pos] & 0xff]) ^
(aes_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^
(aes_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
}
static u32 subw(u32 in)
{
return (aes_sbox[in & 0xff]) ^
(aes_sbox[(in >> 8) & 0xff] << 8) ^
(aes_sbox[(in >> 16) & 0xff] << 16) ^
(aes_sbox[(in >> 24) & 0xff] << 24);
}
/**
* aes_expandkey - Expands the AES key as described in FIPS-197
* @ctx: The location where the computed key will be stored.
* @in_key: The supplied key.
* @key_len: The length of the supplied key.
*
* Returns 0 on success. The function fails only if an invalid key size (or
* pointer) is supplied.
* The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes
* key schedule plus a 16 bytes key which is used before the first round).
* The decryption key is prepared for the "Equivalent Inverse Cipher" as
* described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is
* for the initial combination, the second slot for the first round and so on.
*/
int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
{
u32 kwords = key_len / sizeof(u32);
u32 rc, i, j;
if (key_len != AES_KEYSIZE_128 &&
key_len != AES_KEYSIZE_192 &&
key_len != AES_KEYSIZE_256)
return -EINVAL;
ctx->key_length = key_len;
for (i = 0; i < kwords; i++)
ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;
rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
if (key_len == AES_KEYSIZE_192) {
if (i >= 7)
break;
rko[4] = rko[3] ^ rki[4];
rko[5] = rko[4] ^ rki[5];
} else if (key_len == AES_KEYSIZE_256) {
if (i >= 6)
break;
rko[4] = subw(rko[3]) ^ rki[4];
rko[5] = rko[4] ^ rki[5];
rko[6] = rko[5] ^ rki[6];
rko[7] = rko[6] ^ rki[7];
}
}
/*
* Generate the decryption keys for the Equivalent Inverse Cipher.
* This involves reversing the order of the round keys, and applying
* the Inverse Mix Columns transformation to all but the first and
* the last one.
*/
ctx->key_dec[0] = ctx->key_enc[key_len + 24];
ctx->key_dec[1] = ctx->key_enc[key_len + 25];
ctx->key_dec[2] = ctx->key_enc[key_len + 26];
ctx->key_dec[3] = ctx->key_enc[key_len + 27];
for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]);
ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
}
ctx->key_dec[i] = ctx->key_enc[0];
ctx->key_dec[i + 1] = ctx->key_enc[1];
ctx->key_dec[i + 2] = ctx->key_enc[2];
ctx->key_dec[i + 3] = ctx->key_enc[3];
return 0;
}
EXPORT_SYMBOL(aes_expandkey);
/**
* aes_encrypt - Encrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the ciphertext
* @in: Buffer containing the plaintext
*/
void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
{
const u32 *rkp = ctx->key_enc + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
*/
st0[0] ^= aes_sbox[ 0] ^ aes_sbox[ 64] ^ aes_sbox[134] ^ aes_sbox[195];
st0[1] ^= aes_sbox[16] ^ aes_sbox[ 82] ^ aes_sbox[158] ^ aes_sbox[221];
st0[2] ^= aes_sbox[32] ^ aes_sbox[ 96] ^ aes_sbox[160] ^ aes_sbox[234];
st0[3] ^= aes_sbox[48] ^ aes_sbox[112] ^ aes_sbox[186] ^ aes_sbox[241];
for (round = 0;; round += 2, rkp += 8) {
st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
if (round == rounds - 2)
break;
st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
}
EXPORT_SYMBOL(aes_encrypt);
/**
* aes_decrypt - Decrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the plaintext
* @in: Buffer containing the ciphertext
*/
void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
{
const u32 *rkp = ctx->key_dec + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
*/
st0[0] ^= aes_inv_sbox[ 0] ^ aes_inv_sbox[ 64] ^ aes_inv_sbox[129] ^ aes_inv_sbox[200];
st0[1] ^= aes_inv_sbox[16] ^ aes_inv_sbox[ 83] ^ aes_inv_sbox[150] ^ aes_inv_sbox[212];
st0[2] ^= aes_inv_sbox[32] ^ aes_inv_sbox[ 96] ^ aes_inv_sbox[160] ^ aes_inv_sbox[236];
st0[3] ^= aes_inv_sbox[48] ^ aes_inv_sbox[112] ^ aes_inv_sbox[187] ^ aes_inv_sbox[247];
for (round = 0;; round += 2, rkp += 8) {
st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
if (round == rounds - 2)
break;
st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
}
EXPORT_SYMBOL(aes_decrypt);
MODULE_DESCRIPTION("Generic AES library");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
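
Taken together, aes_expandkey(), aes_encrypt() and aes_decrypt() are the whole public surface of this library. A hedged sketch of how a caller could exercise it against the FIPS-197 appendix C.1 known-answer vector, written as a throwaway module init; it assumes a tree where this backported library is still present (i.e. before this revert) and that the declarations are exposed via <crypto/aes.h> as in mainline.

#include <crypto/aes.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

static int __init aes_lib_selftest(void)
{
	/* FIPS-197 appendix C.1: AES-128 known-answer test */
	static const u8 key[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	};
	static const u8 pt[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
	};
	static const u8 ct[16] = {
		0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
		0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
	};
	struct crypto_aes_ctx ctx;
	u8 buf[AES_BLOCK_SIZE];
	int err;

	err = aes_expandkey(&ctx, key, sizeof(key));
	if (err)
		return err;

	aes_encrypt(&ctx, buf, pt);
	if (memcmp(buf, ct, sizeof(ct)))
		return -EINVAL;

	aes_decrypt(&ctx, buf, ct);
	if (memcmp(buf, pt, sizeof(pt)))
		return -EINVAL;

	pr_info("generic AES library: FIPS-197 C.1 vector OK\n");
	return 0;
}
module_init(aes_lib_selftest);

static void __exit aes_lib_selftest_exit(void) { }
module_exit(aes_lib_selftest_exit);
MODULE_LICENSE("GPL v2");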

View File

@ -13,39 +13,15 @@
.text
.align 4
#ifndef MAX_STRIDE
#define MAX_STRIDE 4
#endif
#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_encrypt_block4x)
ENDPROC(aes_encrypt_block4x)
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
aes_decrypt_block4x:
decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_decrypt_block4x)
#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_encrypt_block5x)
SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
ENDPROC(aes_decrypt_block4x)
/*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@ -54,24 +30,21 @@ SYM_FUNC_END(aes_decrypt_block5x)
* int blocks)
*/
AES_FUNC_START(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x5
.LecbencloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lecbenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
ST4( bl aes_encrypt_block4x )
ST5( ld1 {v4.16b}, [x1], #16 )
ST5( bl aes_encrypt_block5x )
bl aes_encrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LecbencloopNx
.Lecbenc1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lecbencout
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
@ -82,27 +55,24 @@ ST5( st1 {v4.16b}, [x0], #16 )
.Lecbencout:
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_ecb_encrypt)
AES_ENDPROC(aes_ecb_encrypt)
AES_FUNC_START(aes_ecb_decrypt)
AES_ENTRY(aes_ecb_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
dec_prepare w3, x2, x5
.LecbdecloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lecbdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
ST4( bl aes_decrypt_block4x )
ST5( ld1 {v4.16b}, [x1], #16 )
ST5( bl aes_decrypt_block5x )
bl aes_decrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LecbdecloopNx
.Lecbdec1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lecbdecout
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
@ -113,7 +83,7 @@ ST5( st1 {v4.16b}, [x0], #16 )
.Lecbdecout:
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_ecb_decrypt)
AES_ENDPROC(aes_ecb_decrypt)
/*
@ -121,24 +91,9 @@ AES_FUNC_END(aes_ecb_decrypt)
* int blocks, u8 iv[])
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
* int rounds, int blocks, u8 iv[],
* u32 const rk2[]);
* aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
* int rounds, int blocks, u8 iv[],
* u32 const rk2[]);
*/
AES_FUNC_START(aes_essiv_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
enc_prepare w8, x6, x7
encrypt_block v4, w8, x6, x7, w9
enc_switch_key w3, x2, x6
b .Lcbcencloop4x
AES_FUNC_START(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
@ -170,360 +125,184 @@ AES_FUNC_START(aes_cbc_encrypt)
.Lcbcencout:
st1 {v4.16b}, [x5] /* return iv */
ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
AES_ENDPROC(aes_cbc_encrypt)
AES_FUNC_START(aes_essiv_cbc_decrypt)
AES_ENTRY(aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {cbciv.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
enc_prepare w8, x6, x7
encrypt_block cbciv, w8, x6, x7, w9
b .Lessivcbcdecstart
AES_FUNC_START(aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {cbciv.16b}, [x5] /* get iv */
.Lessivcbcdecstart:
ld1 {v7.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
.LcbcdecloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
#if MAX_STRIDE == 5
ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
mov v5.16b, v0.16b
mov v6.16b, v1.16b
mov v7.16b, v2.16b
bl aes_decrypt_block5x
sub x1, x1, #32
eor v0.16b, v0.16b, cbciv.16b
eor v1.16b, v1.16b, v5.16b
ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v7.16b
eor v4.16b, v4.16b, v5.16b
#else
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
sub x1, x1, #16
eor v0.16b, v0.16b, cbciv.16b
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b
ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
#endif
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LcbcdecloopNx
.Lcbcdec1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lcbcdecout
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
mov cbciv.16b, v1.16b /* ct is next iv */
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
st1 {cbciv.16b}, [x5] /* return iv */
st1 {v7.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
* aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
* aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
*/
AES_FUNC_START(aes_cbc_cts_encrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
eor v0.16b, v0.16b, v5.16b /* xor with iv */
tbl v1.16b, {v1.16b}, v4.16b
encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
tbl v0.16b, {v0.16b}, v3.16b
encrypt_block v1, w3, x2, x6, w7
add x4, x0, x4
st1 {v0.16b}, [x4] /* overlapping stores */
st1 {v1.16b}, [x0]
ret
AES_FUNC_END(aes_cbc_cts_encrypt)
AES_FUNC_START(aes_cbc_cts_decrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
decrypt_block v0, w3, x2, x6, w7
tbl v2.16b, {v0.16b}, v3.16b
eor v2.16b, v2.16b, v1.16b
tbx v0.16b, {v1.16b}, v4.16b
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v5.16b /* xor with iv */
add x4, x0, x4
st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
ret
AES_FUNC_END(aes_cbc_cts_decrypt)
.section ".rodata", "a"
.align 6
.Lcts_permute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.previous
AES_ENDPROC(aes_cbc_decrypt)
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int bytes, u8 ctr[], u8 finalbuf[])
* int blocks, u8 ctr[])
*/
AES_FUNC_START(aes_ctr_encrypt)
AES_ENTRY(aes_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x12
ld1 {vctr.16b}, [x5]
umov x12, vctr.d[1] /* keep swabbed ctr in reg */
rev x12, x12
enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
add w7, w4, #15
sub w4, w4, #MAX_STRIDE << 4
lsr w7, w7, #4
mov w8, #MAX_STRIDE
cmp w7, w8
csel w7, w7, w8, lt
adds x12, x12, x7
mov v0.16b, vctr.16b
mov v1.16b, vctr.16b
mov v2.16b, vctr.16b
mov v3.16b, vctr.16b
ST5( mov v4.16b, vctr.16b )
bcs 0f
.subsection 1
/* apply carry to outgoing counter */
0: umov x8, vctr.d[0]
rev x8, x8
add x8, x8, #1
rev x8, x8
ins vctr.d[0], x8
/* apply carry to N counter blocks for N := x12 */
cbz x12, 2f
adr x16, 1f
sub x16, x16, x12, lsl #3
br x16
hint 34 // bti c
mov v0.d[0], vctr.d[0]
hint 34 // bti c
mov v1.d[0], vctr.d[0]
hint 34 // bti c
mov v2.d[0], vctr.d[0]
hint 34 // bti c
mov v3.d[0], vctr.d[0]
ST5( hint 34 )
ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
2: rev x7, x12
ins vctr.d[1], x7
sub x7, x12, #MAX_STRIDE - 1
sub x8, x12, #MAX_STRIDE - 2
sub x9, x12, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
ST5( sub x10, x12, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
tbnz w4, #31, .Lctrtail
ld1 {v5.16b-v7.16b}, [x1], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
subs w4, w4, #4
bmi .Lctr1x
add w7, w6, #1
mov v0.16b, v4.16b
add w8, w6, #2
mov v1.16b, v4.16b
add w9, w6, #3
mov v2.16b, v4.16b
rev w7, w7
mov v3.16b, v4.16b
rev w8, w8
mov v1.s[3], w7
rev w9, w9
mov v2.s[3], w8
mov v3.s[3], w9
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b
ST4( ld1 {v5.16b}, [x1], #16 )
ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b
ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
ST5( eor v4.16b, v6.16b, v4.16b )
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
add x6, x6, #4
rev x7, x6
ins v4.d[1], x7
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
adds w4, w4, #4
beq .Lctrout
.Lctrloop:
mov v0.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
ins v4.d[1], x7
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
st1 {vctr.16b}, [x5] /* return next CTR value */
st1 {v4.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
.Lctrtail:
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
mov x16, #16
ands x13, x4, #0xf
csel x13, x13, x16, ne
.Lctrtailblock:
st1 {v0.16b}, [x0]
ldp x29, x30, [sp], #16
ret
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
ST5( csel x14, x16, xzr, gt )
cmp w4, #48 - (MAX_STRIDE << 4)
csel x15, x16, xzr, gt
cmp w4, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
cmp w4, #16 - (MAX_STRIDE << 4)
ble .Lctrtail1x
adr_l x12, .Lcts_permute_table
add x12, x12, x13
ST5( ld1 {v5.16b}, [x1], x14 )
ld1 {v6.16b}, [x1], x15
ld1 {v7.16b}, [x1], x16
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
ld1 {v8.16b}, [x1], x13
ld1 {v9.16b}, [x1]
ld1 {v10.16b}, [x12]
ST4( eor v6.16b, v6.16b, v0.16b )
ST4( eor v7.16b, v7.16b, v1.16b )
ST4( tbl v3.16b, {v3.16b}, v10.16b )
ST4( eor v8.16b, v8.16b, v2.16b )
ST4( eor v9.16b, v9.16b, v3.16b )
ST5( eor v5.16b, v5.16b, v0.16b )
ST5( eor v6.16b, v6.16b, v1.16b )
ST5( tbl v4.16b, {v4.16b}, v10.16b )
ST5( eor v7.16b, v7.16b, v2.16b )
ST5( eor v8.16b, v8.16b, v3.16b )
ST5( eor v9.16b, v9.16b, v4.16b )
ST5( st1 {v5.16b}, [x0], x14 )
st1 {v6.16b}, [x0], x15
st1 {v7.16b}, [x0], x16
add x13, x13, x0
st1 {v9.16b}, [x13] // overlapping stores
st1 {v8.16b}, [x0]
b .Lctrout
.Lctrtail1x:
csel x0, x0, x6, eq // use finalbuf if less than a full block
ld1 {v5.16b}, [x1]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
eor v5.16b, v5.16b, v3.16b
st1 {v5.16b}, [x0]
b .Lctrout
AES_FUNC_END(aes_ctr_encrypt)
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
ins v4.d[0], x7
b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
.ltorg
/*
* aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int bytes, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int bytes, u8 const rk2[], u8 iv[], int first)
* int blocks, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int blocks, u8 const rk2[], u8 iv[], int first)
*/
.macro next_tweak, out, in, tmp
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
and \tmp\().16b, \tmp\().16b, xtsmask.16b
and \tmp\().16b, \tmp\().16b, \const\().16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.macro xts_load_mask, tmp
movi xtsmask.2s, #0x1
movi \tmp\().2s, #0x87
uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
.endm
.Lxts_mul_x:
CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 )
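
Both variants of next_tweak above compute the same thing: the 128-bit tweak is doubled in GF(2^128), and if the top bit falls off it is folded back in with the 0x87 constant held in .Lxts_mul_x (old code) or xtsmask (new code). A byte-wise C sketch of that single step, assuming the usual little-endian XTS tweak layout:

#include <stdint.h>

/*
 * Multiply the XTS tweak by x in GF(2^128): shift the 128-bit value
 * left by one bit and, if a carry comes out of bit 127, reduce with
 * x^128 + x^7 + x^2 + x + 1 by XORing 0x87 into the low byte.
 * Byte 0 holds the least significant bits (little-endian layout).
 */
void xts_next_tweak(uint8_t t[16])
{
	unsigned int carry = 0;

	for (int i = 0; i < 16; i++) {
		unsigned int msb = t[i] >> 7;

		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;
}

Applying this repeatedly, starting from the encrypted IV, yields the per-block tweaks that the eor instructions below mix into each data block before and after the block cipher.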
AES_FUNC_START(aes_xts_encrypt)
AES_ENTRY(aes_xts_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v4.16b}, [x6]
xts_load_mask v8
cbz w7, .Lxtsencnotfirst
enc_prepare w3, x5, x8
xts_cts_skip_tw w7, .LxtsencNx
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
enc_switch_key w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsencNx
.Lxtsencnotfirst:
enc_prepare w3, x2, x8
.LxtsencloopNx:
next_tweak v4, v4, v8
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsencNx:
subs w4, w4, #64
subs w4, w4, #4
bmi .Lxtsenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v8
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v8
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v8
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
@ -532,91 +311,56 @@ AES_FUNC_START(aes_xts_encrypt)
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsencret
xts_reload_mask v8
cbz w4, .Lxtsencout
b .LxtsencloopNx
.Lxtsenc1x:
adds w4, w4, #64
adds w4, w4, #4
beq .Lxtsencout
subs w4, w4, #16
bmi .LxtsencctsNx
.Lxtsencloop:
ld1 {v0.16b}, [x1], #16
.Lxtsencctsout:
eor v0.16b, v0.16b, v4.16b
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
cbz w4, .Lxtsencout
subs w4, w4, #16
next_tweak v4, v4, v8
bmi .Lxtsenccts
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsencout
next_tweak v4, v4, v7, v8
b .Lxtsencloop
.Lxtsencout:
st1 {v0.16b}, [x0]
.Lxtsencret:
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_encrypt)
.LxtsencctsNx:
mov v0.16b, v3.16b
sub x0, x0, #16
.Lxtsenccts:
adr_l x8, .Lcts_permute_table
add x1, x1, w4, sxtw /* rewind input pointer */
add w4, w4, #16 /* # bytes in final block */
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
add x4, x0, x4 /* output address of final block */
ld1 {v1.16b}, [x1] /* load final block */
ld1 {v2.16b}, [x8]
ld1 {v3.16b}, [x9]
tbl v2.16b, {v0.16b}, v2.16b
tbx v0.16b, {v1.16b}, v3.16b
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
AES_FUNC_START(aes_xts_decrypt)
AES_ENTRY(aes_xts_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
/* subtract 16 bytes if we are doing CTS */
sub w8, w4, #0x10
tst w4, #0xf
csel w4, w4, w8, eq
ld1 {v4.16b}, [x6]
xts_load_mask v8
xts_cts_skip_tw w7, .Lxtsdecskiptw
cbz w7, .Lxtsdecnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
.Lxtsdecskiptw:
dec_prepare w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsdecNx
.Lxtsdecnotfirst:
dec_prepare w3, x2, x8
.LxtsdecloopNx:
next_tweak v4, v4, v8
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsdecNx:
subs w4, w4, #64
subs w4, w4, #4
bmi .Lxtsdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v8
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v8
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v8
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
@ -626,62 +370,31 @@ AES_FUNC_START(aes_xts_decrypt)
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsdecout
xts_reload_mask v8
b .LxtsdecloopNx
.Lxtsdec1x:
adds w4, w4, #64
adds w4, w4, #4
beq .Lxtsdecout
subs w4, w4, #16
.Lxtsdecloop:
ld1 {v0.16b}, [x1], #16
bmi .Lxtsdeccts
.Lxtsdecctsout:
eor v0.16b, v0.16b, v4.16b
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
cbz w4, .Lxtsdecout
subs w4, w4, #16
next_tweak v4, v4, v8
subs w4, w4, #1
beq .Lxtsdecout
next_tweak v4, v4, v7, v8
b .Lxtsdecloop
.Lxtsdecout:
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
.Lxtsdeccts:
adr_l x8, .Lcts_permute_table
add x1, x1, w4, sxtw /* rewind input pointer */
add w4, w4, #16 /* # bytes in final block */
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
add x4, x0, x4 /* output address of final block */
next_tweak v5, v4, v8
ld1 {v1.16b}, [x1] /* load final block */
ld1 {v2.16b}, [x8]
ld1 {v3.16b}, [x9]
eor v0.16b, v0.16b, v5.16b
decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v5.16b
tbl v2.16b, {v0.16b}, v2.16b
tbx v0.16b, {v1.16b}, v3.16b
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
AES_ENDPROC(aes_xts_decrypt)
/*
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
AES_FUNC_START(aes_mac_update)
AES_ENTRY(aes_mac_update)
ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
cbz w5, .Lmacloop4x
@ -703,8 +416,6 @@ AES_FUNC_START(aes_mac_update)
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
encrypt_block v0, w2, x1, x7, w8
st1 {v0.16b}, [x4] /* return dg */
cond_yield .Lmacout, x7
b .Lmacloop4x
.Lmac1x:
add w3, w3, #4
@ -717,12 +428,10 @@ AES_FUNC_START(aes_mac_update)
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
.Lmacenc:
encrypt_block v0, w2, x1, x7, w8
b .Lmacloop
.Lmacout:
st1 {v0.16b}, [x4] /* return dg */
mov w0, w3
ret
AES_FUNC_END(aes_mac_update)
AES_ENDPROC(aes_mac_update)

View File

@ -11,21 +11,8 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func)
xtsmask .req v7
cbciv .req v7
vctr .req v4
.macro xts_reload_mask, tmp
xts_load_mask \tmp
.endm
/* special case for the neon-bs driver calling into this one for CTS */
.macro xts_cts_skip_tw, reg, lbl
tbnz \reg, #1, \lbl
.endm
#define AES_ENTRY(func) ENTRY(neon_ ## func)
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
@ -45,10 +32,10 @@
/* preload the entire Sbox */
.macro prepare, sbox, shiftrows, temp
adr \temp, \sbox
movi v12.16b, #0x1b
ldr_l q13, \shiftrows, \temp
ldr_l q14, .Lror32by8, \temp
adr_l \temp, \sbox
ldr q13, \shiftrows
ldr q14, .Lror32by8
ld1 {v16.16b-v19.16b}, [\temp], #64
ld1 {v20.16b-v23.16b}, [\temp], #64
ld1 {v24.16b-v27.16b}, [\temp], #64
@ -57,7 +44,7 @@
/* do preload for encryption */
.macro enc_prepare, ignore0, ignore1, temp
prepare crypto_aes_sbox, .LForward_ShiftRows, \temp
prepare .LForward_Sbox, .LForward_ShiftRows, \temp
.endm
.macro enc_switch_key, ignore0, ignore1, temp
@ -66,7 +53,7 @@
/* do preload for decryption */
.macro dec_prepare, ignore0, ignore1, temp
prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
.endm
/* apply SubBytes transformation using the preloaded Sbox */
@ -124,9 +111,26 @@
/*
* Interleaved versions: functionally equivalent to the
* ones above, but applied to AES states in parallel.
* ones above, but applied to 2 or 4 AES states in parallel.
*/
.macro sub_bytes_2x, in0, in1
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
sub v9.16b, \in1\().16b, v15.16b
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
sub v10.16b, v8.16b, v15.16b
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
sub v11.16b, v9.16b, v15.16b
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
sub v8.16b, v10.16b, v15.16b
tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
sub v9.16b, v11.16b, v15.16b
tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
.endm
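Stripped of the tbl/tbx splitting across four 64-entry table registers, the lookup above is byte-wise SubBytes applied to two states at once. A small C sketch of the same substitution, assuming the 256-byte forward S-box emitted further down at .LForward_Sbox is available as a plain array (the array name here is illustrative):

#include <stdint.h>

/* assumed: the 256-byte table defined at .LForward_Sbox below */
extern const uint8_t aes_forward_sbox[256];

/* SubBytes applied to two 16-byte AES states "in parallel" */
static void sub_bytes_2x_ref(uint8_t s0[16], uint8_t s1[16])
{
    for (int i = 0; i < 16; i++) {
        s0[i] = aes_forward_sbox[s0[i]];
        s1[i] = aes_forward_sbox[s1[i]];
    }
}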
.macro sub_bytes_4x, in0, in1, in2, in3
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
@ -205,6 +209,25 @@
eor \in1\().16b, \in1\().16b, v11.16b
.endm
.macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
mov \i, \rounds
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
movi v15.16b, #0x40
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
sub_bytes_2x \in0, \in1
subs \i, \i, #1
ld1 {v15.4s}, [\rkp], #16
beq 2222f
mix_columns_2x \in0, \in1, \enc
b 1111b
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
.endm
.macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
@ -231,6 +254,14 @@
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
.endm
.macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
.endm
.macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
.endm
.macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
.endm
@ -241,8 +272,76 @@
#include "aes-modes.S"
.section ".rodata", "a"
.align 4
.text
.align 6
.LForward_Sbox:
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.LReverse_Sbox:
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.LForward_ShiftRows:
.octa 0x0b06010c07020d08030e09040f0a0500

View File

@ -383,7 +383,7 @@ ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
/*
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
SYM_FUNC_START(aesbs_convert_key)
ENTRY(aesbs_convert_key)
ld1 {v7.4s}, [x1], #16 // load round 0 key
ld1 {v17.4s}, [x1], #16 // load round 1 key
@ -428,10 +428,10 @@ SYM_FUNC_START(aesbs_convert_key)
eor v17.16b, v17.16b, v7.16b
str q17, [x0]
ret
SYM_FUNC_END(aesbs_convert_key)
ENDPROC(aesbs_convert_key)
.align 4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
aesbs_encrypt8:
ldr q9, [bskey], #16 // round 0 key
ldr q8, M0SR
ldr q24, SR
@ -491,10 +491,10 @@ SYM_FUNC_START_LOCAL(aesbs_encrypt8)
eor v2.16b, v2.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
SYM_FUNC_END(aesbs_encrypt8)
ENDPROC(aesbs_encrypt8)
.align 4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
aesbs_decrypt8:
lsl x9, rounds, #7
add bskey, bskey, x9
@ -556,7 +556,7 @@ SYM_FUNC_START_LOCAL(aesbs_decrypt8)
eor v3.16b, v3.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
SYM_FUNC_END(aesbs_decrypt8)
ENDPROC(aesbs_decrypt8)
/*
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@ -565,122 +565,110 @@ SYM_FUNC_END(aesbs_decrypt8)
* int blocks)
*/
.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
frame_push 5
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
stp x29, x30, [sp, #-16]!
mov x29, sp
99: mov x5, #1
lsl x5, x5, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x5, x5, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x5, x5, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
tbnz x5, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
tbnz x5, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
tbnz x5, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
tbnz x5, #4, 0f
ld1 {v4.16b}, [x20], #16
ld1 {v4.16b}, [x1], #16
tbnz x5, #5, 0f
ld1 {v5.16b}, [x20], #16
ld1 {v5.16b}, [x1], #16
tbnz x5, #6, 0f
ld1 {v6.16b}, [x20], #16
ld1 {v6.16b}, [x1], #16
tbnz x5, #7, 0f
ld1 {v7.16b}, [x20], #16
ld1 {v7.16b}, [x1], #16
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl \do8
st1 {\o0\().16b}, [x19], #16
st1 {\o0\().16b}, [x0], #16
tbnz x5, #1, 1f
st1 {\o1\().16b}, [x19], #16
st1 {\o1\().16b}, [x0], #16
tbnz x5, #2, 1f
st1 {\o2\().16b}, [x19], #16
st1 {\o2\().16b}, [x0], #16
tbnz x5, #3, 1f
st1 {\o3\().16b}, [x19], #16
st1 {\o3\().16b}, [x0], #16
tbnz x5, #4, 1f
st1 {\o4\().16b}, [x19], #16
st1 {\o4\().16b}, [x0], #16
tbnz x5, #5, 1f
st1 {\o5\().16b}, [x19], #16
st1 {\o5\().16b}, [x0], #16
tbnz x5, #6, 1f
st1 {\o6\().16b}, [x19], #16
st1 {\o6\().16b}, [x0], #16
tbnz x5, #7, 1f
st1 {\o7\().16b}, [x19], #16
st1 {\o7\().16b}, [x0], #16
cbz x23, 1f
b 99b
cbnz x4, 99b
1: frame_pop
1: ldp x29, x30, [sp], #16
ret
.endm
.align 4
SYM_FUNC_START(aesbs_ecb_encrypt)
ENTRY(aesbs_ecb_encrypt)
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)
ENDPROC(aesbs_ecb_encrypt)
.align 4
SYM_FUNC_START(aesbs_ecb_decrypt)
ENTRY(aesbs_ecb_decrypt)
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)
ENDPROC(aesbs_ecb_decrypt)
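Functionally, __ecb_crypt only feeds independent blocks through the 8-way bit-sliced core; without the interleaving, ECB reduces to the loop below. aes_encrypt_block() is an assumed stand-in for one invocation of the cipher, not a kernel function.

#include <stdint.h>
#include <stddef.h>

#define AES_BLOCK_SIZE 16

/* assumed helper: one AES block encryption with an expanded key */
void aes_encrypt_block(uint8_t out[AES_BLOCK_SIZE],
                       const uint8_t in[AES_BLOCK_SIZE],
                       const uint8_t *rk, int rounds);

static void ecb_encrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const uint8_t *rk, int rounds)
{
    /* every block is processed independently: no IV, no chaining */
    while (blocks--) {
        aes_encrypt_block(out, in, rk, rounds);
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }
}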
/*
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
.align 4
SYM_FUNC_START(aesbs_cbc_decrypt)
frame_push 6
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
ENTRY(aesbs_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
99: mov x6, #1
lsl x6, x6, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x6, x6, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x6, x6, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
mov v25.16b, v0.16b
tbnz x6, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
mov v26.16b, v1.16b
tbnz x6, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
mov v27.16b, v2.16b
tbnz x6, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
mov v28.16b, v3.16b
tbnz x6, #4, 0f
ld1 {v4.16b}, [x20], #16
ld1 {v4.16b}, [x1], #16
mov v29.16b, v4.16b
tbnz x6, #5, 0f
ld1 {v5.16b}, [x20], #16
ld1 {v5.16b}, [x1], #16
mov v30.16b, v5.16b
tbnz x6, #6, 0f
ld1 {v6.16b}, [x20], #16
ld1 {v6.16b}, [x1], #16
mov v31.16b, v6.16b
tbnz x6, #7, 0f
ld1 {v7.16b}, [x20]
ld1 {v7.16b}, [x1]
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl aesbs_decrypt8
ld1 {v24.16b}, [x24] // load IV
ld1 {v24.16b}, [x5] // load IV
eor v1.16b, v1.16b, v25.16b
eor v6.16b, v6.16b, v26.16b
@ -691,37 +679,36 @@ SYM_FUNC_START(aesbs_cbc_decrypt)
eor v3.16b, v3.16b, v30.16b
eor v5.16b, v5.16b, v31.16b
st1 {v0.16b}, [x19], #16
st1 {v0.16b}, [x0], #16
mov v24.16b, v25.16b
tbnz x6, #1, 1f
st1 {v1.16b}, [x19], #16
st1 {v1.16b}, [x0], #16
mov v24.16b, v26.16b
tbnz x6, #2, 1f
st1 {v6.16b}, [x19], #16
st1 {v6.16b}, [x0], #16
mov v24.16b, v27.16b
tbnz x6, #3, 1f
st1 {v4.16b}, [x19], #16
st1 {v4.16b}, [x0], #16
mov v24.16b, v28.16b
tbnz x6, #4, 1f
st1 {v2.16b}, [x19], #16
st1 {v2.16b}, [x0], #16
mov v24.16b, v29.16b
tbnz x6, #5, 1f
st1 {v7.16b}, [x19], #16
st1 {v7.16b}, [x0], #16
mov v24.16b, v30.16b
tbnz x6, #6, 1f
st1 {v3.16b}, [x19], #16
st1 {v3.16b}, [x0], #16
mov v24.16b, v31.16b
tbnz x6, #7, 1f
ld1 {v24.16b}, [x20], #16
st1 {v5.16b}, [x19], #16
1: st1 {v24.16b}, [x24] // store IV
ld1 {v24.16b}, [x1], #16
st1 {v5.16b}, [x0], #16
1: st1 {v24.16b}, [x5] // store IV
cbz x23, 2f
b 99b
cbnz x4, 99b
2: frame_pop
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(aesbs_cbc_decrypt)
ENDPROC(aesbs_cbc_decrypt)
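The interleaved loads, stores and the IV shuffling through v24-v31 above all serve one simple recurrence: CBC decryption computes P_i = D_K(C_i) ^ C_{i-1} with C_0 = IV, and the last ciphertext block becomes the next IV (the final "store IV"). A standalone sketch, with aes_decrypt_block() as an assumed helper:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

/* assumed helper: one AES block decryption with an expanded key */
void aes_decrypt_block(uint8_t out[AES_BLOCK_SIZE],
                       const uint8_t in[AES_BLOCK_SIZE],
                       const uint8_t *rk, int rounds);

static void cbc_decrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const uint8_t *rk, int rounds,
                            uint8_t iv[AES_BLOCK_SIZE])
{
    uint8_t prev[AES_BLOCK_SIZE], cur[AES_BLOCK_SIZE];

    memcpy(prev, iv, AES_BLOCK_SIZE);

    while (blocks--) {
        /* keep the ciphertext around: it is the next chaining value */
        memcpy(cur, in, AES_BLOCK_SIZE);

        aes_decrypt_block(out, in, rk, rounds);
        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            out[i] ^= prev[i];              /* P_i = D_K(C_i) ^ C_{i-1} */

        memcpy(prev, cur, AES_BLOCK_SIZE);
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }

    memcpy(iv, prev, AES_BLOCK_SIZE);       /* updated IV, as the asm stores */
}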
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
@ -731,103 +718,100 @@ SYM_FUNC_END(aesbs_cbc_decrypt)
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.align 4
.Lxts_mul_x:
CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 )
/*
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
SYM_FUNC_START_LOCAL(__xts_crypt8)
__xts_crypt8:
mov x6, #1
lsl x6, x6, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x6, x6, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x6, x6, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
tbnz x6, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
tbnz x6, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
tbnz x6, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
tbnz x6, #4, 0f
ld1 {v4.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset]
ld1 {v4.16b}, [x1], #16
str q29, [sp, #16]
eor v4.16b, v4.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #5, 0f
ld1 {v5.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 16]
ld1 {v5.16b}, [x1], #16
str q29, [sp, #32]
eor v5.16b, v5.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #6, 0f
ld1 {v6.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 32]
ld1 {v6.16b}, [x1], #16
str q29, [sp, #48]
eor v6.16b, v6.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #7, 0f
ld1 {v7.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 48]
ld1 {v7.16b}, [x1], #16
str q29, [sp, #64]
eor v7.16b, v7.16b, v29.16b
next_tweak v29, v29, v30, v31
0: mov bskey, x21
mov rounds, x22
br x16
SYM_FUNC_END(__xts_crypt8)
0: mov bskey, x2
mov rounds, x3
br x7
ENDPROC(__xts_crypt8)
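The next_tweak macro used throughout __xts_crypt8 multiplies the 128-bit tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, which is where the 0x87 in the .Lxts_mul_x constant comes from. In portable C, on a little-endian view of the tweak, that is:

#include <stdint.h>

/* tweak held as two little-endian 64-bit halves: t[0] = low, t[1] = high */
static void xts_next_tweak_ref(uint64_t t[2])
{
    /* if the top bit shifts out, fold the reduction constant 0x87 back in */
    uint64_t carry = t[1] >> 63;

    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}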
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
frame_push 6, 64
stp x29, x30, [sp, #-80]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
ldr q30, .Lxts_mul_x
ld1 {v25.16b}, [x5]
movi v30.2s, #0x1
movi v25.2s, #0x87
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
99: adr x16, \do8
99: adr x7, \do8
bl __xts_crypt8
ldp q16, q17, [sp, #.Lframe_local_offset]
ldp q18, q19, [sp, #.Lframe_local_offset + 32]
ldp q16, q17, [sp, #16]
ldp q18, q19, [sp, #48]
eor \o0\().16b, \o0\().16b, v25.16b
eor \o1\().16b, \o1\().16b, v26.16b
eor \o2\().16b, \o2\().16b, v27.16b
eor \o3\().16b, \o3\().16b, v28.16b
st1 {\o0\().16b}, [x19], #16
st1 {\o0\().16b}, [x0], #16
mov v25.16b, v26.16b
tbnz x6, #1, 1f
st1 {\o1\().16b}, [x19], #16
st1 {\o1\().16b}, [x0], #16
mov v25.16b, v27.16b
tbnz x6, #2, 1f
st1 {\o2\().16b}, [x19], #16
st1 {\o2\().16b}, [x0], #16
mov v25.16b, v28.16b
tbnz x6, #3, 1f
st1 {\o3\().16b}, [x19], #16
st1 {\o3\().16b}, [x0], #16
mov v25.16b, v29.16b
tbnz x6, #4, 1f
@ -836,31 +820,28 @@ SYM_FUNC_END(__xts_crypt8)
eor \o6\().16b, \o6\().16b, v18.16b
eor \o7\().16b, \o7\().16b, v19.16b
st1 {\o4\().16b}, [x19], #16
st1 {\o4\().16b}, [x0], #16
tbnz x6, #5, 1f
st1 {\o5\().16b}, [x19], #16
st1 {\o5\().16b}, [x0], #16
tbnz x6, #6, 1f
st1 {\o6\().16b}, [x19], #16
st1 {\o6\().16b}, [x0], #16
tbnz x6, #7, 1f
st1 {\o7\().16b}, [x19], #16
st1 {\o7\().16b}, [x0], #16
cbz x23, 1f
st1 {v25.16b}, [x24]
cbnz x4, 99b
b 99b
1: st1 {v25.16b}, [x24]
frame_pop
1: st1 {v25.16b}, [x5]
ldp x29, x30, [sp], #80
ret
.endm
SYM_FUNC_START(aesbs_xts_encrypt)
ENTRY(aesbs_xts_encrypt)
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)
ENDPROC(aesbs_xts_encrypt)
SYM_FUNC_START(aesbs_xts_decrypt)
ENTRY(aesbs_xts_decrypt)
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)
ENDPROC(aesbs_xts_decrypt)
.macro next_ctr, v
mov \v\().d[1], x8
@ -874,32 +855,25 @@ SYM_FUNC_END(aesbs_xts_decrypt)
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
* int rounds, int blocks, u8 iv[], u8 final[])
*/
SYM_FUNC_START(aesbs_ctr_encrypt)
frame_push 8
ENTRY(aesbs_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
mov x25, x6
cmp x6, #0
cset x10, ne
add x4, x4, x10 // do one extra block if final
cmp x25, #0
cset x26, ne
add x23, x23, x26 // do one extra block if final
ldp x7, x8, [x24]
ld1 {v0.16b}, [x24]
ldp x7, x8, [x5]
ld1 {v0.16b}, [x5]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
adds x8, x8, #1
adc x7, x7, xzr
99: mov x9, #1
lsl x9, x9, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x9, x9, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x9, x9, xzr, le
tbnz x9, #1, 0f
@ -917,88 +891,85 @@ CPU_LE( rev x8, x8 )
tbnz x9, #7, 0f
next_ctr v7
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl aesbs_encrypt8
lsr x9, x9, x26 // disregard the extra block
lsr x9, x9, x10 // disregard the extra block
tbnz x9, #0, 0f
ld1 {v8.16b}, [x20], #16
ld1 {v8.16b}, [x1], #16
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x19], #16
st1 {v0.16b}, [x0], #16
tbnz x9, #1, 1f
ld1 {v9.16b}, [x20], #16
ld1 {v9.16b}, [x1], #16
eor v1.16b, v1.16b, v9.16b
st1 {v1.16b}, [x19], #16
st1 {v1.16b}, [x0], #16
tbnz x9, #2, 2f
ld1 {v10.16b}, [x20], #16
ld1 {v10.16b}, [x1], #16
eor v4.16b, v4.16b, v10.16b
st1 {v4.16b}, [x19], #16
st1 {v4.16b}, [x0], #16
tbnz x9, #3, 3f
ld1 {v11.16b}, [x20], #16
ld1 {v11.16b}, [x1], #16
eor v6.16b, v6.16b, v11.16b
st1 {v6.16b}, [x19], #16
st1 {v6.16b}, [x0], #16
tbnz x9, #4, 4f
ld1 {v12.16b}, [x20], #16
ld1 {v12.16b}, [x1], #16
eor v3.16b, v3.16b, v12.16b
st1 {v3.16b}, [x19], #16
st1 {v3.16b}, [x0], #16
tbnz x9, #5, 5f
ld1 {v13.16b}, [x20], #16
ld1 {v13.16b}, [x1], #16
eor v7.16b, v7.16b, v13.16b
st1 {v7.16b}, [x19], #16
st1 {v7.16b}, [x0], #16
tbnz x9, #6, 6f
ld1 {v14.16b}, [x20], #16
ld1 {v14.16b}, [x1], #16
eor v2.16b, v2.16b, v14.16b
st1 {v2.16b}, [x19], #16
st1 {v2.16b}, [x0], #16
tbnz x9, #7, 7f
ld1 {v15.16b}, [x20], #16
ld1 {v15.16b}, [x1], #16
eor v5.16b, v5.16b, v15.16b
st1 {v5.16b}, [x19], #16
st1 {v5.16b}, [x0], #16
8: next_ctr v0
st1 {v0.16b}, [x24]
cbz x23, .Lctr_done
cbnz x4, 99b
b 99b
.Lctr_done:
frame_pop
st1 {v0.16b}, [x5]
ldp x29, x30, [sp], #16
ret
/*
* If we are handling the tail of the input (x6 != NULL), return the
* final keystream block back to the caller.
*/
0: cbz x25, 8b
st1 {v0.16b}, [x25]
0: cbz x6, 8b
st1 {v0.16b}, [x6]
b 8b
1: cbz x25, 8b
st1 {v1.16b}, [x25]
1: cbz x6, 8b
st1 {v1.16b}, [x6]
b 8b
2: cbz x25, 8b
st1 {v4.16b}, [x25]
2: cbz x6, 8b
st1 {v4.16b}, [x6]
b 8b
3: cbz x25, 8b
st1 {v6.16b}, [x25]
3: cbz x6, 8b
st1 {v6.16b}, [x6]
b 8b
4: cbz x25, 8b
st1 {v3.16b}, [x25]
4: cbz x6, 8b
st1 {v3.16b}, [x6]
b 8b
5: cbz x25, 8b
st1 {v7.16b}, [x25]
5: cbz x6, 8b
st1 {v7.16b}, [x6]
b 8b
6: cbz x25, 8b
st1 {v2.16b}, [x25]
6: cbz x6, 8b
st1 {v2.16b}, [x6]
b 8b
7: cbz x25, 8b
st1 {v5.16b}, [x25]
7: cbz x6, 8b
st1 {v5.16b}, [x6]
b 8b
SYM_FUNC_END(aesbs_ctr_encrypt)
ENDPROC(aesbs_ctr_encrypt)
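The counter handling at the top of aesbs_ctr_encrypt (the ldp/rev/adds/adc sequence) is a 128-bit big-endian increment of the IV block. A byte-wise sketch of the same operation:

#include <stdint.h>

/* increment a 128-bit big-endian CTR block, as the ldp/rev/adds/adc
 * sequence at the top of aesbs_ctr_encrypt does on two 64-bit halves */
static void ctr128_increment(uint8_t ctr[16])
{
    for (int i = 15; i >= 0; i--) {
        if (++ctr[i] != 0)      /* stop once a byte did not wrap */
            break;
    }
}

Each keystream block E_K(ctr) is then XORed into the corresponding input block, with any partial final block routed through the separate `final` buffer seen in the tail-handling labels above.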

View File

@ -11,13 +11,13 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <linux/module.h>
#include "aes-ctr-fallback.h"
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
@ -49,12 +49,6 @@ asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[],
u32 const rk1[], int rounds, int bytes,
u32 const rk2[], u8 iv[], int first);
asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[],
u32 const rk1[], int rounds, int bytes,
u32 const rk2[], u8 iv[], int first);
struct aesbs_ctx {
u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32];
@ -74,7 +68,6 @@ struct aesbs_ctr_ctx {
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
struct crypto_aes_ctx cts;
};
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@ -84,7 +77,7 @@ static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
struct crypto_aes_ctx rk;
int err;
err = aes_expandkey(&rk, in_key, key_len);
err = crypto_aes_expand_key(&rk, in_key, key_len);
if (err)
return err;
@ -106,8 +99,9 @@ static int __ecb_crypt(struct skcipher_request *req,
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@ -115,13 +109,12 @@ static int __ecb_crypt(struct skcipher_request *req,
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
kernel_neon_begin();
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
ctx->rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
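Every entry point in this glue file follows the same skcipher_walk shape; the sketch below shows that shape in one place, since the diff interleaves the old and new placement of kernel_neon_begin()/kernel_neon_end(). It relies on the includes and asmlinkage declarations already present in this file; the concrete call into aesbs_ecb_encrypt() is only illustrative, and the exact atomic-walk flag and NEON bracketing differ between the two versions shown above.

static int walk_pattern(struct skcipher_request *req)
{
    struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
    struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
    struct skcipher_walk walk;
    int err;

    err = skcipher_walk_virt(&walk, req, true);

    kernel_neon_begin();
    while (walk.nbytes >= AES_BLOCK_SIZE) {
        unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;

        /* process whole walk.stride chunks except on the final step */
        if (walk.nbytes < walk.total)
            blocks = round_down(blocks, walk.stride / AES_BLOCK_SIZE);

        /* hand 'blocks' full blocks to the NEON core */
        aesbs_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                          ctx->rk, ctx->rounds, blocks);

        err = skcipher_walk_done(&walk,
                                 walk.nbytes - blocks * AES_BLOCK_SIZE);
    }
    kernel_neon_end();

    return err;
}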
@ -143,7 +136,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
struct crypto_aes_ctx rk;
int err;
err = aes_expandkey(&rk, in_key, key_len);
err = crypto_aes_expand_key(&rk, in_key, key_len);
if (err)
return err;
@ -154,7 +147,6 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
memzero_explicit(&rk, sizeof(rk));
return 0;
}
@ -166,19 +158,19 @@ static int cbc_encrypt(struct skcipher_request *req)
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
/* fall back to the non-bitsliced NEON implementation */
kernel_neon_begin();
neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->enc, ctx->key.rounds, blocks,
walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -189,8 +181,9 @@ static int cbc_decrypt(struct skcipher_request *req)
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@ -198,14 +191,13 @@ static int cbc_decrypt(struct skcipher_request *req)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
kernel_neon_begin();
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -216,7 +208,7 @@ static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = aes_expandkey(&ctx->fallback, in_key, key_len);
err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len);
if (err)
return err;
@ -237,8 +229,9 @@ static int ctr_encrypt(struct skcipher_request *req)
u8 buf[AES_BLOCK_SIZE];
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
@ -249,10 +242,8 @@ static int ctr_encrypt(struct skcipher_request *req)
final = NULL;
}
kernel_neon_begin();
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->rk, ctx->rounds, blocks, walk.iv, final);
kernel_neon_end();
if (final) {
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
@ -267,6 +258,8 @@ static int ctr_encrypt(struct skcipher_request *req)
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -282,11 +275,7 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return err;
key_len /= 2;
err = aes_expandkey(&ctx->cts, in_key, key_len);
if (err)
return err;
err = aes_expandkey(&rk, in_key + key_len, key_len);
err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
if (err)
return err;
@ -295,142 +284,60 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}
static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
{
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
unsigned long flags;
/*
* Temporarily disable interrupts to avoid races where
* cachelines are evicted when the CPU is interrupted
* to do something else.
*/
local_irq_save(flags);
aes_encrypt(&ctx->fallback, dst, src);
local_irq_restore(flags);
}
static int ctr_encrypt_sync(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
if (!may_use_simd())
return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
return aes_ctr_encrypt_fallback(&ctx->fallback, req);
return ctr_encrypt(req);
}
static int __xts_crypt(struct skcipher_request *req, bool encrypt,
static int __xts_crypt(struct skcipher_request *req,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int tail = req->cryptlen % (8 * AES_BLOCK_SIZE);
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
int nbytes, err;
int first = 1;
u8 *out, *in;
int err;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
/* ensure that the cts tail is covered by a single step */
if (unlikely(tail > 0 && tail < AES_BLOCK_SIZE)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
} else {
tail = 0;
}
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
if (err)
return err;
kernel_neon_begin();
neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
ctx->key.rounds, 1);
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE)
if (walk.nbytes < walk.total)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
out = walk.dst.virt.addr;
in = walk.src.virt.addr;
nbytes = walk.nbytes;
kernel_neon_begin();
if (likely(blocks > 6)) { /* plain NEON is faster otherwise */
if (first)
neon_aes_ecb_encrypt(walk.iv, walk.iv,
ctx->twkey,
ctx->key.rounds, 1);
first = 0;
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
out += blocks * AES_BLOCK_SIZE;
in += blocks * AES_BLOCK_SIZE;
nbytes -= blocks * AES_BLOCK_SIZE;
}
if (walk.nbytes == walk.total && nbytes > 0)
goto xts_tail;
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes);
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
ctx->key.rounds, blocks, walk.iv);
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
/* handle ciphertext stealing */
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
out = walk.dst.virt.addr;
in = walk.src.virt.addr;
nbytes = walk.nbytes;
kernel_neon_begin();
xts_tail:
if (encrypt)
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2);
else
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int xts_encrypt(struct skcipher_request *req)
{
return __xts_crypt(req, true, aesbs_xts_encrypt);
return __xts_crypt(req, aesbs_xts_encrypt);
}
static int xts_decrypt(struct skcipher_request *req)
{
return __xts_crypt(req, false, aesbs_xts_decrypt);
return __xts_crypt(req, aesbs_xts_decrypt);
}
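For context, XTS uses two keys: the second half of the key material is expanded into twkey and used once to turn the IV into the initial tweak (the neon_aes_ecb_encrypt call on walk.iv above), after which every block is whitened with the tweak before and after the core cipher and the tweak is multiplied by x. A standalone sketch of the full-block case follows (the post-revert code does not carry the ciphertext-stealing tail handling that is being removed); encrypt_block(), encrypt_tweak() and gf128_mul_x() are assumed helpers, the last one being the doubling shown earlier for next_tweak.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

void encrypt_block(uint8_t blk[16], const void *k1);       /* data key  */
void encrypt_tweak(uint8_t blk[16], const void *k2);       /* tweak key */
void gf128_mul_x(uint8_t t[16]);                           /* T <- T * x */

static void xts_encrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const void *k1, const void *k2,
                            const uint8_t iv[16])
{
    uint8_t t[AES_BLOCK_SIZE];

    memcpy(t, iv, sizeof(t));
    encrypt_tweak(t, k2);                   /* T = E_K2(IV) */

    while (blocks--) {
        uint8_t b[AES_BLOCK_SIZE];

        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            b[i] = in[i] ^ t[i];            /* pre-whiten  */
        encrypt_block(b, k1);
        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            out[i] = b[i] ^ t[i];           /* post-whiten */

        gf128_mul_x(t);                     /* advance the tweak */
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }
}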
static struct skcipher_alg aes_algs[] = { {
@ -537,7 +444,7 @@ static int __init aes_init(void)
int err;
int i;
if (!cpu_have_named_feature(ASIMD))
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));

View File

@ -1,199 +0,0 @@
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
while (bytes > 0) {
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
if (l <= CHACHA_BLOCK_SIZE) {
u8 buf[CHACHA_BLOCK_SIZE];
memcpy(buf, src, l);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, l);
state[12] += 1;
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
bytes -= CHACHA_BLOCK_SIZE * 5;
src += CHACHA_BLOCK_SIZE * 5;
dst += CHACHA_BLOCK_SIZE * 5;
state[12] += 5;
}
}
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
return chacha_neon_stream_xor(req, ctx, req->iv);
}
static int xchacha_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_xchacha_crypt(req);
crypto_chacha_init(state, ctx, req->iv);
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_neon_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
};
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-neon");

View File

@ -1,13 +1,13 @@
/*
* ChaCha/XChaCha NEON helper functions
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Originally based on:
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
@ -19,27 +19,29 @@
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
.align 6
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers v0-v3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*
* The round count is given in w3.
*
* Clobbers: w3, x10, v4, v12
*/
SYM_FUNC_START_LOCAL(chacha_permute)
ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
adr_l x10, ROT8
ld1 {v12.4s}, [x10]
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// x0..3 = s0..3
adr x3, ROT8
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
ld1 {v12.4s}, [x3]
mov x3, #10
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@ -100,27 +102,9 @@ SYM_FUNC_START_LOCAL(chacha_permute)
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
subs w3, w3, #2
subs x3, x3, #1
b.ne .Ldoubleround
ret
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
// w3: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
bl chacha_permute
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
@ -141,155 +125,71 @@ SYM_FUNC_START(chacha_block_xor_neon)
st1 {v0.16b-v3.16b}, [x1]
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(chacha_block_xor_neon)
SYM_FUNC_START(hchacha_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)
// w2: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.4s-v3.4s}, [x0]
mov w3, w2
bl chacha_permute
st1 {v0.4s}, [x1], #16
st1 {v3.4s}, [x1]
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(hchacha_block_neon)
a0 .req w12
a1 .req w13
a2 .req w14
a3 .req w15
a4 .req w16
a5 .req w17
a6 .req w19
a7 .req w20
a8 .req w21
a9 .req w22
a10 .req w23
a11 .req w24
a12 .req w25
a13 .req w26
a14 .req w27
a15 .req w28
ENDPROC(chacha20_block_xor_neon)
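The double round described in the comments above is four quarter-rounds applied to the columns of the 4x4 state, then four applied to its diagonals; the ext-based word shuffles only rotate the rows so the column code can be reused for the diagonals. The scalar reference for one quarter-round, with the 16/12/8/7 rotations visible in the vector code, is:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* one ChaCha quarter-round on four words of the state */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}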
.align 6
SYM_FUNC_START(chacha_4block_xor_neon)
frame_push 10
ENTRY(chacha20_4block_xor_neon)
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
// x4: byte count
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
//
// This function encrypts four consecutive ChaCha blocks by loading
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
// At the same time, a fifth block is encrypted in parallel using
// scalar registers
//
adr_l x9, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
adr x3, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x3]
// x0..15[0-3] = s0..3[0..3]
add x8, x0, #16
ld4r { v0.4s- v3.4s}, [x0]
ld4r { v4.4s- v7.4s}, [x8], #16
ld4r { v8.4s-v11.4s}, [x8], #16
ld4r {v12.4s-v15.4s}, [x8]
mov x4, x0
ld4r { v0.4s- v3.4s}, [x4], #16
ld4r { v4.4s- v7.4s}, [x4], #16
ld4r { v8.4s-v11.4s}, [x4], #16
ld4r {v12.4s-v15.4s}, [x4]
mov a0, v0.s[0]
mov a1, v1.s[0]
mov a2, v2.s[0]
mov a3, v3.s[0]
mov a4, v4.s[0]
mov a5, v5.s[0]
mov a6, v6.s[0]
mov a7, v7.s[0]
mov a8, v8.s[0]
mov a9, v9.s[0]
mov a10, v10.s[0]
mov a11, v11.s[0]
mov a12, v12.s[0]
mov a13, v13.s[0]
mov a14, v14.s[0]
mov a15, v15.s[0]
// x12 += counter values 1-4
// x12 += counter values 0-3
add v12.4s, v12.4s, v30.4s
mov x3, #10
.Ldoubleround4:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
rev32 v15.8h, v15.8h
ror a15, a15, #16
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #12
shl v5.4s, v17.4s, #12
@ -297,66 +197,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v7.4s, v19.4s, #12
sri v4.4s, v16.4s, #20
ror a4, a4, #20
sri v5.4s, v17.4s, #20
ror a5, a5, #20
sri v6.4s, v18.4s, #20
ror a6, a6, #20
sri v7.4s, v19.4s, #20
ror a7, a7, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #7
shl v5.4s, v17.4s, #7
@ -364,66 +240,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v7.4s, v19.4s, #7
sri v4.4s, v16.4s, #25
ror a4, a4, #25
sri v5.4s, v17.4s, #25
ror a5, a5, #25
sri v6.4s, v18.4s, #25
ror a6, a6, #25
sri v7.4s, v19.4s, #25
ror a7, a7, #25
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
rev32 v15.8h, v15.8h
ror a15, a15, #16
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #12
shl v6.4s, v17.4s, #12
@ -431,66 +283,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v4.4s, v19.4s, #12
sri v5.4s, v16.4s, #20
ror a5, a5, #20
sri v6.4s, v17.4s, #20
ror a6, a6, #20
sri v7.4s, v18.4s, #20
ror a7, a7, #20
sri v4.4s, v19.4s, #20
ror a4, a4, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #7
shl v6.4s, v17.4s, #7
@ -498,15 +326,11 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v4.4s, v19.4s, #7
sri v5.4s, v16.4s, #25
ror a5, a5, #25
sri v6.4s, v17.4s, #25
ror a6, a6, #25
sri v7.4s, v18.4s, #25
ror a7, a7, #25
sri v4.4s, v19.4s, #25
ror a4, a4, #25
subs w3, w3, #2
subs x3, x3, #1
b.ne .Ldoubleround4
ld4r {v16.4s-v19.4s}, [x0], #16
@ -520,21 +344,9 @@ SYM_FUNC_START(chacha_4block_xor_neon)
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
add v0.4s, v0.4s, v16.4s
mov w6, v16.s[0]
mov w7, v17.s[0]
add v1.4s, v1.4s, v17.4s
mov w8, v18.s[0]
mov w9, v19.s[0]
add v2.4s, v2.4s, v18.4s
add a0, a0, w6
add a1, a1, w7
add v3.4s, v3.4s, v19.4s
add a2, a2, w8
add a3, a3, w9
CPU_BE( rev a0, a0 )
CPU_BE( rev a1, a1 )
CPU_BE( rev a2, a2 )
CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@ -544,154 +356,69 @@ CPU_BE( rev a3, a3 )
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
add v4.4s, v4.4s, v20.4s
mov w6, v20.s[0]
mov w7, v21.s[0]
add v5.4s, v5.4s, v21.4s
mov w8, v22.s[0]
mov w9, v23.s[0]
add v6.4s, v6.4s, v22.4s
add a4, a4, w6
add a5, a5, w7
add v7.4s, v7.4s, v23.4s
add a6, a6, w8
add a7, a7, w9
CPU_BE( rev a4, a4 )
CPU_BE( rev a5, a5 )
CPU_BE( rev a6, a6 )
CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
add v8.4s, v8.4s, v24.4s
mov w6, v24.s[0]
mov w7, v25.s[0]
add v9.4s, v9.4s, v25.4s
mov w8, v26.s[0]
mov w9, v27.s[0]
add v10.4s, v10.4s, v26.4s
add a8, a8, w6
add a9, a9, w7
add v11.4s, v11.4s, v27.4s
add a10, a10, w8
add a11, a11, w9
CPU_BE( rev a8, a8 )
CPU_BE( rev a9, a9 )
CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
add v12.4s, v12.4s, v28.4s
mov w6, v28.s[0]
mov w7, v29.s[0]
add v13.4s, v13.4s, v29.4s
mov w8, v30.s[0]
mov w9, v31.s[0]
add v14.4s, v14.4s, v30.4s
add a12, a12, w6
add a13, a13, w7
add v15.4s, v15.4s, v31.4s
add a14, a14, w8
add a15, a15, w9
CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64
zip1 v16.4s, v0.4s, v1.4s
ldp w8, w9, [x2, #-56]
eor a0, a0, w6
zip2 v17.4s, v0.4s, v1.4s
eor a1, a1, w7
zip1 v18.4s, v2.4s, v3.4s
eor a2, a2, w8
zip2 v19.4s, v2.4s, v3.4s
eor a3, a3, w9
ldp w6, w7, [x2, #-48]
zip1 v20.4s, v4.4s, v5.4s
ldp w8, w9, [x2, #-40]
eor a4, a4, w6
zip2 v21.4s, v4.4s, v5.4s
eor a5, a5, w7
zip1 v22.4s, v6.4s, v7.4s
eor a6, a6, w8
zip2 v23.4s, v6.4s, v7.4s
eor a7, a7, w9
ldp w6, w7, [x2, #-32]
zip1 v24.4s, v8.4s, v9.4s
ldp w8, w9, [x2, #-24]
eor a8, a8, w6
zip2 v25.4s, v8.4s, v9.4s
eor a9, a9, w7
zip1 v26.4s, v10.4s, v11.4s
eor a10, a10, w8
zip2 v27.4s, v10.4s, v11.4s
eor a11, a11, w9
ldp w6, w7, [x2, #-16]
zip1 v28.4s, v12.4s, v13.4s
ldp w8, w9, [x2, #-8]
eor a12, a12, w6
zip2 v29.4s, v12.4s, v13.4s
eor a13, a13, w7
zip1 v30.4s, v14.4s, v15.4s
eor a14, a14, w8
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
add x3, x2, x4
sub x3, x3, #128 // start of last block
subs x5, x4, #128
csel x2, x2, x3, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
stp a0, a1, [x1], #64
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
stp a2, a3, [x1, #-56]
subs x6, x4, #192
ld1 {v16.16b-v19.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
stp a4, a5, [x1, #-48]
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
stp a6, a7, [x1, #-40]
subs x7, x4, #256
ld1 {v20.16b-v23.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
stp a8, a9, [x1, #-32]
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
stp a10, a11, [x1, #-24]
subs x8, x4, #320
ld1 {v24.16b-v27.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
stp a12, a13, [x1, #-16]
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
@ -699,107 +426,25 @@ CPU_BE( rev a15, a15 )
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
tbnz x6, #63, .Lt192
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v16.16b-v19.16b}, [x1], #64
tbnz x7, #63, .Lt256
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
st1 {v20.16b-v23.16b}, [x1], #64
tbnz x8, #63, .Lt320
eor v28.16b, v28.16b, v12.16b
st1 {v24.16b-v27.16b}, [x1], #64
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
ENDPROC(chacha20_4block_xor_neon)
// fewer than 192 bytes of in/output
.Lt192: cbz x5, 1f // exactly 128 bytes?
ld1 {v28.16b-v31.16b}, [x10]
add x5, x5, x1
tbl v28.16b, {v4.16b-v7.16b}, v28.16b
tbl v29.16b, {v4.16b-v7.16b}, v29.16b
tbl v30.16b, {v4.16b-v7.16b}, v30.16b
tbl v31.16b, {v4.16b-v7.16b}, v31.16b
0: eor v20.16b, v20.16b, v28.16b
eor v21.16b, v21.16b, v29.16b
eor v22.16b, v22.16b, v30.16b
eor v23.16b, v23.16b, v31.16b
st1 {v20.16b-v23.16b}, [x5] // overlapping stores
1: st1 {v16.16b-v19.16b}, [x1]
b .Lout
// fewer than 128 bytes of in/output
.Lt128: ld1 {v28.16b-v31.16b}, [x10]
add x5, x5, x1
sub x1, x1, #64
tbl v28.16b, {v0.16b-v3.16b}, v28.16b
tbl v29.16b, {v0.16b-v3.16b}, v29.16b
tbl v30.16b, {v0.16b-v3.16b}, v30.16b
tbl v31.16b, {v0.16b-v3.16b}, v31.16b
ld1 {v16.16b-v19.16b}, [x1] // reload first output block
b 0b
// fewer than 256 bytes of in/output
.Lt256: cbz x6, 2f // exactly 192 bytes?
ld1 {v4.16b-v7.16b}, [x10]
add x6, x6, x1
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
tbl v1.16b, {v8.16b-v11.16b}, v5.16b
tbl v2.16b, {v8.16b-v11.16b}, v6.16b
tbl v3.16b, {v8.16b-v11.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x6] // overlapping stores
2: st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
.Lt320: cbz x7, 3f // exactly 256 bytes?
ld1 {v4.16b-v7.16b}, [x10]
add x7, x7, x1
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
tbl v1.16b, {v12.16b-v15.16b}, v5.16b
tbl v2.16b, {v12.16b-v15.16b}, v6.16b
tbl v3.16b, {v12.16b-v15.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x7] // overlapping stores
3: st1 {v24.16b-v27.16b}, [x1]
b .Lout
SYM_FUNC_END(chacha_4block_xor_neon)
.section ".rodata", "a", %progbits
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
.rept 128
.byte (.Li - 64)
.set .Li, .Li + 1
.endr
CTRINC: .word 1, 2, 3, 4
CTRINC: .word 0, 1, 2, 3
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f

View File

@ -0,0 +1,127 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha20_4block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_neon(state, buf, buf);
memcpy(dst, buf, bytes);
}
}
static int chacha20_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
u32 state[16];
int err;
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
crypto_chacha_init(state, ctx, walk.iv);
kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
kernel_neon_end();
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_neon,
.decrypt = chacha20_neon,
};
static int __init chacha20_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");

View File

@ -0,0 +1,266 @@
/*
* Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
/* GPL HEADER START
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 only,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is included
* in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; If not, see http://www.gnu.org/licenses
*
* Please visit http://www.xyratex.com/contact if you need additional
* information or have any questions.
*
* GPL HEADER END
*/
/*
* Copyright 2012 Xyratex Technology Limited
*
* Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
* calculation.
* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
* at:
* http://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2B: Instruction Set Reference, N-Z
*
* Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
* Alexander Boyko <Alexander_Boyko@xyratex.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
.align 6
.cpu generic+crypto+crc
.Lcrc32_constants:
/*
* [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
* #define CONSTANT_R1 0x154442bd4LL
*
* [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
* #define CONSTANT_R2 0x1c6e41596LL
*/
.octa 0x00000001c6e415960000000154442bd4
/*
* [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
* #define CONSTANT_R3 0x1751997d0LL
*
* [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
* #define CONSTANT_R4 0x0ccaa009eLL
*/
.octa 0x00000000ccaa009e00000001751997d0
/*
* [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
* #define CONSTANT_R5 0x163cd6124LL
*/
.quad 0x0000000163cd6124
.quad 0x00000000FFFFFFFF
/*
* #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
*
* Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
* = 0x1F7011641LL
* #define CONSTANT_RU 0x1F7011641LL
*/
.octa 0x00000001F701164100000001DB710641
.Lcrc32c_constants:
.octa 0x000000009e4addf800000000740eef02
.octa 0x000000014cd00bd600000000f20c0dfe
.quad 0x00000000dd45aab8
.quad 0x00000000FFFFFFFF
.octa 0x00000000dea713f10000000105ec76f0
vCONSTANT .req v0
dCONSTANT .req d0
qCONSTANT .req q0
BUF .req x0
LEN .req x1
CRC .req x2
vzr .req v9
/**
* Calculate crc32
* BUF - buffer
* LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
* CRC - initial crc32
* returns the crc32 in w0
* uint crc32_pmull_le(unsigned char const *buffer,
* size_t len, uint crc32)
*/
ENTRY(crc32_pmull_le)
adr x3, .Lcrc32_constants
b 0f
ENTRY(crc32c_pmull_le)
adr x3, .Lcrc32c_constants
0: bic LEN, LEN, #15
ld1 {v1.16b-v4.16b}, [BUF], #0x40
movi vzr.16b, #0
fmov dCONSTANT, CRC
eor v1.16b, v1.16b, vCONSTANT.16b
sub LEN, LEN, #0x40
cmp LEN, #0x40
b.lt less_64
ldr qCONSTANT, [x3]
loop_64: /* 64 bytes Full cache line folding */
sub LEN, LEN, #0x40
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull2 v6.1q, v2.2d, vCONSTANT.2d
pmull2 v7.1q, v3.2d, vCONSTANT.2d
pmull2 v8.1q, v4.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
pmull v2.1q, v2.1d, vCONSTANT.1d
pmull v3.1q, v3.1d, vCONSTANT.1d
pmull v4.1q, v4.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
ld1 {v5.16b}, [BUF], #0x10
eor v2.16b, v2.16b, v6.16b
ld1 {v6.16b}, [BUF], #0x10
eor v3.16b, v3.16b, v7.16b
ld1 {v7.16b}, [BUF], #0x10
eor v4.16b, v4.16b, v8.16b
ld1 {v8.16b}, [BUF], #0x10
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v7.16b
eor v4.16b, v4.16b, v8.16b
cmp LEN, #0x40
b.ge loop_64
less_64: /* Folding cache line into 128bit */
ldr qCONSTANT, [x3, #16]
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v3.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v4.16b
cbz LEN, fold_64
loop_16: /* Folding rest buffer into 128bit */
subs LEN, LEN, #0x10
ld1 {v2.16b}, [BUF], #0x10
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
b.ne loop_16
fold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
ext v2.16b, v1.16b, v1.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
ext v1.16b, v1.16b, vzr.16b, #8
eor v1.16b, v1.16b, v2.16b
/* final 32-bit fold */
ldr dCONSTANT, [x3, #32]
ldr d3, [x3, #40]
ext v2.16b, v1.16b, vzr.16b, #4
and v1.16b, v1.16b, v3.16b
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
ldr qCONSTANT, [x3, #48]
and v2.16b, v1.16b, v3.16b
ext v2.16b, vzr.16b, v2.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
and v2.16b, v2.16b, v3.16b
pmull v2.1q, v2.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
mov w0, v1.s[1]
ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
.macro __crc32, c
0: subs x2, x2, #16
b.mi 8f
ldp x3, x4, [x1], #16
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
b.ne 0b
ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
CPU_BE( rev x3, x3 )
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
CPU_BE( rev w3, w3 )
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
CPU_BE( rev16 w3, w3 )
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
crc32\c\()b w0, w0, w3
0: ret
.endm
.align 5
ENTRY(crc32_armv8_le)
__crc32
ENDPROC(crc32_armv8_le)
.align 5
ENTRY(crc32c_armv8_le)
__crc32 c
ENDPROC(crc32c_armv8_le)


@@ -0,0 +1,244 @@
/*
* Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/cpufeature.h>
#include <linux/crc32.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <crypto/internal/hash.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#define PMULL_MIN_LEN 64L /* minimum size of buffer
* for crc32_pmull_le_16 */
#define SCALE_F 16L /* size of NEON register */
asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = 0;
return 0;
}
static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
return 0;
}
static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
static int crc32_pmull_init(struct shash_desc *desc)
{
u32 *mctx = crypto_shash_ctx(desc->tfm);
u32 *crc = shash_desc_ctx(desc);
*crc = *mctx;
return 0;
}
static int crc32_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32_armv8_le(*crc, data, length);
return 0;
}
static int crc32c_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32c_armv8_le(*crc, data, length);
return 0;
}
static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0)
*crc = fallback_crc32(*crc, data, length);
return 0;
}
static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32c(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32c_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0) {
*crc = fallback_crc32c(*crc, data, length);
}
return 0;
}
static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(*crc, out);
return 0;
}
static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(~*crc, out);
return 0;
}
static struct shash_alg crc32_pmull_algs[] = { {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32_update,
.final = crc32_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32_pmull_cra_init,
.base.cra_name = "crc32",
.base.cra_driver_name = "crc32-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
}, {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32c_update,
.final = crc32c_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32c_pmull_cra_init,
.base.cra_name = "crc32c",
.base.cra_driver_name = "crc32c-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
} };
static int __init crc32_pmull_mod_init(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
crc32_pmull_algs[0].update = crc32_pmull_update;
crc32_pmull_algs[1].update = crc32c_pmull_update;
if (elf_hwcap & HWCAP_CRC32) {
fallback_crc32 = crc32_armv8_le;
fallback_crc32c = crc32c_armv8_le;
} else {
fallback_crc32 = crc32_le;
fallback_crc32c = __crc32c_le;
}
} else if (!(elf_hwcap & HWCAP_CRC32)) {
return -ENODEV;
}
return crypto_register_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static void __exit crc32_pmull_mod_exit(void)
{
crypto_unregister_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static const struct cpu_feature crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");


@@ -2,14 +2,12 @@
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -56,176 +54,109 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
// UINT16 crc_t10dif_pcl(
// UINT16 init_crc, //initial CRC value, 16 bits
// const unsigned char *buf, //buffer pointer to calculate CRC on
// UINT64 len //buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
.arch armv8-a+crypto
.cpu generic+crypto
init_crc .req w0
buf .req x1
len .req x2
fold_consts_ptr .req x3
arg1_low32 .req w0
arg2 .req x1
arg3 .req x2
fold_consts .req v10
vzr .req v13
ad .req v14
ENTRY(crc_t10dif_pmull)
movi vzr.16b, #0 // init zero register
k00_16 .req v15
k32_48 .req v16
// adjust the 16-bit initial_crc value, scale it to 32 bits
lsl arg1_low32, arg1_low32, #16
t3 .req v17
t4 .req v18
t5 .req v19
t6 .req v20
t7 .req v21
t8 .req v22
t9 .req v23
// check if smaller than 256
cmp arg3, #256
perm1 .req v24
perm2 .req v25
perm3 .req v26
perm4 .req v27
// for sizes less than 128, we can't fold 64B at a time...
b.lt _less_than_128
bd1 .req v28
bd2 .req v29
bd3 .req v30
bd4 .req v31
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
// to be moved to the high part of the register.
// because data will be byte-reflected and will align with
// initial crc at correct place.
movi v10.16b, #0
mov v10.s[3], arg1_low32 // initial crc
.macro __pmull_init_p64
.endm
// receive the initial 64B data, xor the initial crc value
ldp q0, q1, [arg2]
ldp q2, q3, [arg2, #0x20]
ldp q4, q5, [arg2, #0x40]
ldp q6, q7, [arg2, #0x60]
add arg2, arg2, #0x80
.macro __pmull_pre_p64, bd
.endm
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
CPU_LE( rev64 v3.16b, v3.16b )
CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
.macro __pmull_init_p8
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
mov k32_48.h[2], k32_48.h[0]
ushr k00_16.2d, k32_48.2d, #32
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// prepare the permutation vectors
mov_q x5, 0x080f0e0d0c0b0a09
movi perm4.8b, #8
dup perm1.2d, x5
eor perm1.16b, perm1.16b, perm4.16b
ushr perm2.2d, perm1.2d, #8
ushr perm3.2d, perm1.2d, #16
ushr perm4.2d, perm1.2d, #24
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli perm4.2d, perm1.2d, #40
.endm
// XOR the initial_crc value
eor v0.16b, v0.16b, v10.16b
.macro __pmull_pre_p8, bd
tbl bd1.16b, {\bd\().16b}, perm1.16b
tbl bd2.16b, {\bd\().16b}, perm2.16b
tbl bd3.16b, {\bd\().16b}, perm3.16b
tbl bd4.16b, {\bd\().16b}, perm4.16b
.endm
ldr q10, rk3 // xmm10 has rk3 and rk4
// type of pmull instruction
// will determine which constant to use
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
ext t4.8b, ad.8b, ad.8b, #1 // A1
ext t5.8b, ad.8b, ad.8b, #2 // A2
ext t6.8b, ad.8b, ad.8b, #3 // A3
//
// we subtract 256 instead of 128 to save one instruction from the loop
//
sub arg3, arg3, #256
pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
pmull t8.8h, ad.8b, bd1.8b // E = A*B1
pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
pmull t7.8h, ad.8b, bd2.8b // G = A*B2
pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
pmull t9.8h, ad.8b, bd3.8b // I = A*B3
pmull t3.8h, ad.8b, bd4.8b // K = A*B4
b 0f
// at this section of the code, there is 64*x+y (0<=y<64) bytes of
// buffer. The _fold_64_B_loop will fold 64B at a time
// until we have 64+y Bytes of buffer
.L__pmull_p8_core2:
tbl t4.16b, {ad.16b}, perm1.16b // A1
tbl t5.16b, {ad.16b}, perm2.16b // A2
tbl t6.16b, {ad.16b}, perm3.16b // A3
pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
// fold 64B at a time. This section of the code folds 4 vector
// registers in parallel
_fold_64_B_loop:
0: eor t4.16b, t4.16b, t8.16b // L = E + F
eor t5.16b, t5.16b, t7.16b // M = G + H
eor t6.16b, t6.16b, t9.16b // N = I + J
.macro fold64, reg1, reg2
ldp q11, q12, [arg2], #0x20
uzp1 t8.2d, t4.2d, t5.2d
uzp2 t4.2d, t4.2d, t5.2d
uzp1 t7.2d, t6.2d, t3.2d
uzp2 t6.2d, t6.2d, t3.2d
// t4 = (L) (P0 + P1) << 8
// t5 = (M) (P2 + P3) << 16
eor t8.16b, t8.16b, t4.16b
and t4.16b, t4.16b, k32_48.16b
// t6 = (N) (P4 + P5) << 24
// t7 = (K) (P6 + P7) << 32
eor t7.16b, t7.16b, t6.16b
and t6.16b, t6.16b, k00_16.16b
eor t8.16b, t8.16b, t4.16b
eor t7.16b, t7.16b, t6.16b
zip2 t5.2d, t8.2d, t4.2d
zip1 t4.2d, t8.2d, t4.2d
zip2 t3.2d, t7.2d, t6.2d
zip1 t6.2d, t7.2d, t6.2d
ext t4.16b, t4.16b, t4.16b, #15
ext t5.16b, t5.16b, t5.16b, #14
ext t6.16b, t6.16b, t6.16b, #13
ext t3.16b, t3.16b, t3.16b, #12
eor t4.16b, t4.16b, t5.16b
eor t6.16b, t6.16b, t3.16b
ret
SYM_FUNC_END(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, fold_consts
.err
.endif
mov ad.16b, \ad\().16b
.ifb \i
pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
.else
pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
.endif
bl .L__pmull_p8_core\i
eor \rq\().16b, \rq\().16b, t4.16b
eor \rq\().16b, \rq\().16b, t6.16b
.endm
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
__pmull_\p v8, \reg1, fold_consts, 2
__pmull_\p \reg1, \reg1, fold_consts
pmull2 v8.1q, \reg1\().2d, v10.2d
pmull \reg1\().1q, \reg1\().1d, v10.1d
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
__pmull_\p v9, \reg2, fold_consts, 2
__pmull_\p \reg2, \reg2, fold_consts
pmull2 v9.1q, \reg2\().2d, v10.2d
pmull \reg2\().1q, \reg2\().1d, v10.1d
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -236,279 +167,225 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
__pmull_\p v8, \src_reg, fold_consts
__pmull_\p \src_reg, \src_reg, fold_consts, 2
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
fold64 v0, v1
fold64 v2, v3
fold64 v4, v5
fold64 v6, v7
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
b.ge _fold_64_B_loop
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
// fold the 8 vector registers to 1 vector register with different
// constants
ldr q10, rk9
.macro fold16, reg, rk
pmull v8.1q, \reg\().1d, v10.1d
pmull2 \reg\().1q, \reg\().2d, v10.2d
.ifnb \rk
ldr q10, \rk
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
.macro __pmull_p64, rd, rn, rm, n
.ifb \n
pmull \rd\().1q, \rn\().1d, \rm\().1d
.else
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endif
.endm
.macro crc_t10dif_pmull, p
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
b.lt .Lless_than_256_bytes_\@
adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
// Load the first 128 data bytes. Byte swapping is necessary to make
// the bit order match the polynomial coefficient order.
ldp q0, q1, [buf]
ldp q2, q3, [buf, #0x20]
ldp q4, q5, [buf, #0x40]
ldp q6, q7, [buf, #0x60]
add buf, buf, #0x80
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
CPU_LE( rev64 v3.16b, v3.16b )
CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// XOR the first 16 data *bits* with the initial CRC value.
movi v8.16b, #0
mov v8.h[7], init_crc
eor v0.16b, v0.16b, v8.16b
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
sub len, len, #256
// While >= 128 data bytes remain (not counting v0-v7), fold the 128
// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
fold_32_bytes \p, v0, v1
fold_32_bytes \p, v2, v3
fold_32_bytes \p, v4, v5
fold_32_bytes \p, v6, v7
subs len, len, #128
b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
fold_16_bytes \p, v3, v7, 1
// Fold across 32 bytes.
fold_16_bytes \p, v4, v6
fold_16_bytes \p, v5, v7, 1
// Fold across 16 bytes.
fold_16_bytes \p, v6, v7
// Add 128 to get the correct number of data bytes remaining in 0...127
// (not counting v7), following the previous extra subtraction by 128.
// Then subtract 16 to simplify the termination condition of the
// following loop.
adds len, len, #(128-16)
// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
__pmull_\p v8, v7, fold_consts
__pmull_\p v7, v7, fold_consts, 2
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
eor v7.16b, v7.16b, \reg\().16b
.endm
fold16 v0, rk11
fold16 v1, rk13
fold16 v2, rk15
fold16 v3, rk17
fold16 v4, rk19
fold16 v5, rk1
fold16 v6
// instead of 64, we add 48 to the loop counter to save 1 instruction
// from the loop instead of a cmp instruction, we use the negative
// flag with the jl instruction
adds arg3, arg3, #(128-16)
b.lt _final_reduction_for_128
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
// continue folding 16B at a time
_16B_reduction_loop:
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
eor v7.16b, v7.16b, v8.16b
ldr q0, [arg2], #16
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
eor v7.16b, v7.16b, v0.16b
subs len, len, #16
b.ge .Lfold_16_bytes_loop_\@
subs arg3, arg3, #16
.Lfold_16_bytes_loop_done_\@:
// Add 16 to get the correct number of data bytes remaining in 0...15
// (not counting v7), following the previous extra subtraction by 16.
adds len, len, #16
b.eq .Lreduce_final_16_bytes_\@
// instead of a cmp instruction, we utilize the flags with the
// jge instruction equivalent of: cmp arg3, 16-16
// check if there is any more 16B in the buffer to be able to fold
b.ge _16B_reduction_loop
.Lhandle_partial_segment_\@:
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
// 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
// do this without needing a fold constant for each possible 'len',
// redivide the bytes into a first chunk of 'len' bytes and a second
// chunk of 16 bytes, then fold the first chunk into the second.
// now we have 16+z bytes left to reduce, where 0<= z < 16.
// first, we reduce the data in the xmm7 register
// v0 = last 16 original data bytes
add buf, buf, len
ldr q0, [buf, #-16]
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
_final_reduction_for_128:
// check if any more data to fold. If not, compute the CRC of
// the final 128 bits
adds arg3, arg3, #16
b.eq _128_done
// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
adr_l x4, .Lbyteshift_table + 16
sub x4, x4, len
ld1 {v2.16b}, [x4]
tbl v1.16b, {v7.16b}, v2.16b
// here we are getting data that is less than 16 bytes.
// since we know that there was data before the pointer, we can
// offset the input pointer before the actual point, to receive
// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
add arg2, arg2, arg3
ldr q1, [arg2, #-16]
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
// v3 = first chunk: v7 right-shifted by '16-len' bytes.
movi v3.16b, #0x80
eor v2.16b, v2.16b, v3.16b
tbl v3.16b, {v7.16b}, v2.16b
// get rid of the extra data that was loaded before
// load the shift constant
adr x4, tbl_shf_table + 16
sub x4, x4, arg3
ld1 {v0.16b}, [x4]
// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
sshr v2.16b, v2.16b, #7
// shift v2 to the left by arg3 bytes
tbl v2.16b, {v7.16b}, v0.16b
// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
// then '16-len' bytes from v1 (high-order bytes).
bsl v2.16b, v1.16b, v0.16b
// shift v7 to the right by 16-arg3 bytes
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
__pmull_\p v0, v3, fold_consts
__pmull_\p v7, v3, fold_consts, 2
// blend
sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
bsl v0.16b, v2.16b, v1.16b
// fold 16 Bytes
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
.Lreduce_final_16_bytes_\@:
// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
_128_done:
// compute crc of a 128-bit value
ldr q10, rk5 // rk5 and rk6 in xmm10
movi v2.16b, #0 // init zero register
// 64b fold
ext v0.16b, vzr.16b, v7.16b, #8
mov v7.d[0], v7.d[1]
pmull v7.1q, v7.1d, v10.1d
eor v7.16b, v7.16b, v0.16b
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
// 32b fold
ext v0.16b, v7.16b, vzr.16b, #4
mov v7.s[3], vzr.s[0]
pmull2 v0.1q, v0.2d, v10.2d
eor v7.16b, v7.16b, v0.16b
// Fold the high 64 bits into the low 64 bits, while also multiplying by
// x^64. This produces a 128-bit value congruent to x^64 * M(x) and
// whose low 48 bits are 0.
ext v0.16b, v2.16b, v7.16b, #8
__pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
eor v0.16b, v0.16b, v7.16b // + low bits * x^64
// barrett reduction
_barrett:
ldr q10, rk7
mov v0.d[0], v7.d[1]
// Fold the high 32 bits into the low 96 bits. This produces a 96-bit
// value congruent to x^64 * M(x) and whose low 48 bits are 0.
ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
mov v0.s[3], v2.s[0] // zero high 32 bits
__pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
eor v0.16b, v0.16b, v1.16b // + low bits
pmull v0.1q, v0.1d, v10.1d
ext v0.16b, vzr.16b, v0.16b, #12
pmull2 v0.1q, v0.2d, v10.2d
ext v0.16b, vzr.16b, v0.16b, #12
eor v7.16b, v7.16b, v0.16b
mov w0, v7.s[1]
// Load G(x) and floor(x^48 / G(x)).
ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_pre_\p fold_consts
// Use Barrett reduction to compute the final CRC value.
__pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
ushr v1.2d, v1.2d, #32 // /= x^32
__pmull_\p v1, v1, fold_consts // *= G(x)
ushr v0.2d, v0.2d, #48
eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
.ifc \p, p8
ldp x29, x30, [sp], #16
.endif
_cleanup:
// scale the result back to 16 bits
lsr x0, x0, #16
ret
.Lless_than_256_bytes_\@:
// Checksumming a buffer of length 16...255 bytes
_less_than_128:
cbz arg3, _cleanup
adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
movi v0.16b, #0
mov v0.s[3], arg1_low32 // get the initial crc value
// Load the first 16 data bytes.
ldr q7, [buf], #0x10
ldr q7, [arg2], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
// XOR the first 16 data *bits* with the initial CRC value.
movi v0.16b, #0
mov v0.h[7], init_crc
eor v7.16b, v7.16b, v0.16b
cmp arg3, #16
b.eq _128_done // exactly 16 left
b.lt _less_than_16_left
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
ldr q10, rk1 // rk1 and rk2 in xmm10
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16
subs len, len, #32
b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
add len, len, #16
b .Lhandle_partial_segment_\@ // 17 <= len <= 31
.endm
// update the counter. subtract 32 instead of 16 to save one
// instruction from the loop
subs arg3, arg3, #32
b.ge _16B_reduction_loop
//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
stp x29, x30, [sp, #-16]!
mov x29, sp
crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)
add arg3, arg3, #16
b _get_last_two_regs
.align 5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
SYM_FUNC_END(crc_t10dif_pmull_p64)
_less_than_16_left:
// shl r9, 4
adr x0, tbl_shf_table + 16
sub x0, x0, arg3
ld1 {v0.16b}, [x0]
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
b _128_done
ENDPROC(crc_t10dif_pmull)
.section ".rodata", "a"
// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
.align 4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
.quad 0x0000000000006123 // x^(8*128) mod G(x)
.quad 0x0000000000002295 // x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
.quad 0x0000000000001069 // x^(4*128) mod G(x)
.quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
.quad 0x000000000000857d // x^(2*128) mod G(x)
.quad 0x0000000000007acc // x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
.quad 0x000000000000a010 // x^(1*128) mod G(x)
.quad 0x0000000000001faa // x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
.quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
.quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
.quad 0x0000000000018bb7 // G(x)
.quad 0x00000001f65a57f8 // floor(x^48 / G(x))
rk1: .octa 0x06df0000000000002d56000000000000
rk3: .octa 0x7cf50000000000009d9d000000000000
rk5: .octa 0x13680000000000002d56000000000000
rk7: .octa 0x000000018bb7000000000001f65a57f8
rk9: .octa 0xbfd6000000000000ceae000000000000
rk11: .octa 0x713c0000000000001e16000000000000
rk13: .octa 0x80a6000000000000f7f9000000000000
rk15: .octa 0xe658000000000000044c000000000000
rk17: .octa 0xa497000000000000ad18000000000000
rk19: .octa 0xe7b50000000000006ee3000000000000
tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
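Every fold constant above is derived from the T10-DIF generator polynomial G(x) = 0x18BB7. As a reference point (not code from this tree), a bit-serial CRC-T10DIF in plain C that the crc_t10dif_pmull* entry points are expected to agree with for the same initial CRC:

#include <stddef.h>
#include <stdint.h>

/* MSB-first (non-reflected) CRC16 over G(x) = 0x18BB7, initial value 0 */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)*buf++ << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)(crc << 1) ^ 0x8bb7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* Check value: crc_t10dif_ref(0, (const uint8_t *)"123456789", 9) == 0xd0db */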


@@ -16,15 +16,13 @@
#include <linux/string.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <asm/neon.h>
#include <asm/simd.h>
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -34,49 +32,15 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
static int crct10dif_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
return 0;
}
static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
kernel_neon_begin();
*crc = crc_t10dif_pmull(*crc, data, length);
kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -92,22 +56,10 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
static struct shash_alg crc_t10dif_alg[] = {{
static struct shash_alg crc_t10dif_alg = {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_pmull_p8,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
.base.cra_name = "crct10dif",
.base.cra_driver_name = "crct10dif-arm64-neon",
.base.cra_priority = 100,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_pmull_p64,
.update = crct10dif_update,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -116,31 +68,20 @@ static struct shash_alg crc_t10dif_alg[] = {{
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}};
};
static int __init crc_t10dif_mod_init(void)
{
if (cpu_have_named_feature(PMULL))
return crypto_register_shashes(crc_t10dif_alg,
ARRAY_SIZE(crc_t10dif_alg));
else
/* only register the first array element */
return crypto_register_shash(crc_t10dif_alg);
return crypto_register_shash(&crc_t10dif_alg);
}
static void __exit crc_t10dif_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
crypto_unregister_shashes(crc_t10dif_alg,
ARRAY_SIZE(crc_t10dif_alg));
else
crypto_unregister_shash(crc_t10dif_alg);
crypto_unregister_shash(&crc_t10dif_alg);
}
module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");


@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -16,8 +16,8 @@
T1 .req v2
T2 .req v3
MASK .req v4
XM .req v5
XL .req v6
XL .req v5
XM .req v6
XH .req v7
IN1 .req v7
@@ -46,19 +46,6 @@
ss3 .req v26
ss4 .req v27
XL2 .req v8
XM2 .req v9
XH2 .req v10
XL3 .req v11
XM3 .req v12
XH3 .req v13
TT3 .req v14
TT4 .req v15
HH .req v16
HH3 .req v17
HH4 .req v18
HH34 .req v19
.text
.arch armv8-a+crypto
@@ -147,25 +134,11 @@
.endm
.macro __pmull_pre_p64
add x8, x3, #16
ld1 {HH.2d-HH4.2d}, [x8]
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
eor SHASH2.16b, SHASH2.16b, T1.16b
trn1 HH34.2d, HH3.2d, HH4.2d
trn2 T1.2d, HH3.2d, HH4.2d
eor HH34.16b, HH34.16b, T1.16b
movi MASK.16b, #0xe1
shl MASK.2d, MASK.2d, #57
.endm
.macro __pmull_pre_p8
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
@@ -242,86 +215,20 @@
.macro __pmull_ghash, pn
ld1 {SHASH.2d}, [x3]
ld1 {XL.2d}, [x1]
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
__pmull_pre_\pn
/* do the head block first, if supplied */
cbz x4, 0f
ld1 {T1.2d}, [x4]
mov x4, xzr
b 3f
b 1f
0: .ifc \pn, p64
tbnz w0, #0, 2f // skip until #blocks is a
tbnz w0, #1, 2f // round multiple of 4
1: ld1 {XM3.16b-TT4.16b}, [x2], #64
sub w0, w0, #4
rev64 T1.16b, XM3.16b
rev64 T2.16b, XH3.16b
rev64 TT4.16b, TT4.16b
rev64 TT3.16b, TT3.16b
ext IN1.16b, TT4.16b, TT4.16b, #8
ext XL3.16b, TT3.16b, TT3.16b, #8
eor TT4.16b, TT4.16b, IN1.16b
pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
eor TT3.16b, TT3.16b, XL3.16b
pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
ext IN1.16b, T2.16b, T2.16b, #8
eor XL2.16b, XL2.16b, XL3.16b
eor XH2.16b, XH2.16b, XH3.16b
eor XM2.16b, XM2.16b, XM3.16b
eor T2.16b, T2.16b, IN1.16b
pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
eor XL2.16b, XL2.16b, XL3.16b
eor XH2.16b, XH2.16b, XH3.16b
eor XM2.16b, XM2.16b, XM3.16b
ext IN1.16b, T1.16b, T1.16b, #8
ext TT3.16b, XL.16b, XL.16b, #8
eor XL.16b, XL.16b, IN1.16b
eor T1.16b, T1.16b, TT3.16b
pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
pmull XL.1q, HH4.1d, XL.1d // a0 * b0
pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
eor XL.16b, XL.16b, XL2.16b
eor XH.16b, XH.16b, XH2.16b
eor XM.16b, XM.16b, XM2.16b
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
__pmull_reduce_p64
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
cbz w0, 5f
b 1b
.endif
2: ld1 {T1.2d}, [x2], #16
0: ld1 {T1.2d}, [x2], #16
sub w0, w0, #1
3: /* multiply XL by SHASH in GF(2^128) */
1: /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64 T1.16b, T1.16b )
ext T2.16b, XL.16b, XL.16b, #8
@@ -334,7 +241,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
__pmull_\pn XL, XL, SHASH // a0 * b0
__pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
4: eor T2.16b, XL.16b, XH.16b
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
@@ -345,7 +252,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
cbnz w0, 0b
5: st1 {XL.2d}, [x1]
st1 {XL.2d}, [x1]
ret
.endm
@@ -353,45 +260,27 @@ CPU_LE( rev64 T1.16b, T1.16b )
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
* struct ghash_key const *k, const char *head)
*/
SYM_FUNC_START(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p64)
__pmull_ghash p64
SYM_FUNC_END(pmull_ghash_update_p64)
ENDPROC(pmull_ghash_update_p64)
SYM_FUNC_START(pmull_ghash_update_p8)
ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
SYM_FUNC_END(pmull_ghash_update_p8)
ENDPROC(pmull_ghash_update_p8)
KS0 .req v8
KS1 .req v9
KS2 .req v10
KS3 .req v11
KS .req v8
CTR .req v9
INP .req v10
INP0 .req v21
INP1 .req v22
INP2 .req v23
INP3 .req v24
K0 .req v25
K1 .req v26
K2 .req v27
K3 .req v28
K4 .req v12
K5 .req v13
K6 .req v4
K7 .req v5
K8 .req v14
K9 .req v15
KK .req v29
KL .req v30
KM .req v31
.macro load_round_keys, rounds, rk, tmp
add \tmp, \rk, #64
ld1 {K0.4s-K3.4s}, [\rk]
ld1 {K4.4s-K5.4s}, [\tmp]
add \tmp, \rk, \rounds, lsl #4
sub \tmp, \tmp, #32
ld1 {KK.4s-KM.4s}, [\tmp]
.macro load_round_keys, rounds, rk
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
ld1 {v17.4s-v18.4s}, [\rk], #32
1111: ld1 {v19.4s-v20.4s}, [\rk], #32
2222: ld1 {v21.4s-v24.4s}, [\rk], #64
ld1 {v25.4s-v28.4s}, [\rk], #64
ld1 {v29.4s-v31.4s}, [\rk]
.endm
.macro enc_round, state, key
@@ -399,382 +288,157 @@ SYM_FUNC_END(pmull_ghash_update_p8)
aesmc \state\().16b, \state\().16b
.endm
.macro enc_qround, s0, s1, s2, s3, key
enc_round \s0, \key
enc_round \s1, \key
enc_round \s2, \key
enc_round \s3, \key
.endm
.macro enc_block, state, rounds, rk, tmp
add \tmp, \rk, #96
ld1 {K6.4s-K7.4s}, [\tmp], #32
.irp key, K0, K1, K2, K3, K4, K5
.macro enc_block, state, rounds
cmp \rounds, #12
b.lo 2222f /* 128 bits */
b.eq 1111f /* 192 bits */
enc_round \state, v17
enc_round \state, v18
1111: enc_round \state, v19
enc_round \state, v20
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
enc_round \state, \key
.endr
tbnz \rounds, #2, .Lnot128_\@
.Lout256_\@:
enc_round \state, K6
enc_round \state, K7
.Lout192_\@:
enc_round \state, KK
aese \state\().16b, KL.16b
eor \state\().16b, \state\().16b, KM.16b
.subsection 1
.Lnot128_\@:
ld1 {K8.4s-K9.4s}, [\tmp], #32
enc_round \state, K6
enc_round \state, K7
ld1 {K6.4s-K7.4s}, [\tmp]
enc_round \state, K8
enc_round \state, K9
tbz \rounds, #1, .Lout192_\@
b .Lout256_\@
.previous
aese \state\().16b, v30.16b
eor \state\().16b, \state\().16b, v31.16b
.endm
.align 6
.macro pmull_gcm_do_crypt, enc
stp x29, x30, [sp, #-32]!
mov x29, sp
str x19, [sp, #24]
ld1 {SHASH.2d}, [x4]
ld1 {XL.2d}, [x1]
ldr x8, [x5, #8] // load lower counter
load_round_keys x7, x6, x8
ld1 {SHASH.2d}, [x3], #16
ld1 {HH.2d-HH4.2d}, [x3]
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
eor SHASH2.16b, SHASH2.16b, T1.16b
trn1 HH34.2d, HH3.2d, HH4.2d
trn2 T1.2d, HH3.2d, HH4.2d
eor HH34.16b, HH34.16b, T1.16b
ld1 {XL.2d}, [x4]
cbz x0, 3f // tag only?
ldr w8, [x5, #12] // load lower counter
CPU_LE( rev w8, w8 )
0: mov w9, #4 // max blocks per round
add x10, x0, #0xf
lsr x10, x10, #4 // remaining blocks
subs x0, x0, #64
csel w9, w10, w9, mi
add w8, w8, w9
bmi 1f
ld1 {INP0.16b-INP3.16b}, [x2], #64
.subsection 1
/*
* Populate the four input registers right to left with up to 63 bytes
* of data, using overlapping loads to avoid branches.
*
* INP0 INP1 INP2 INP3
* 1 byte | | | |x |
* 16 bytes | | | |xxxxxxxx|
* 17 bytes | | |xxxxxxxx|x |
* 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
* etc etc
*
* Note that this code may read up to 15 bytes before the start of
* the input. It is up to the calling code to ensure this is safe if
* this happens in the first iteration of the loop (i.e., when the
* input size is < 16 bytes)
*/
1: mov x15, #16
ands x19, x0, #0xf
csel x19, x19, x15, ne
adr_l x17, .Lpermute_table + 16
sub x11, x15, x19
add x12, x17, x11
sub x17, x17, x11
ld1 {T1.16b}, [x12]
sub x10, x1, x11
sub x11, x2, x11
cmp x0, #-16
csel x14, x15, xzr, gt
cmp x0, #-32
csel x15, x15, xzr, gt
cmp x0, #-48
csel x16, x19, xzr, gt
csel x1, x1, x10, gt
csel x2, x2, x11, gt
ld1 {INP0.16b}, [x2], x14
ld1 {INP1.16b}, [x2], x15
ld1 {INP2.16b}, [x2], x16
ld1 {INP3.16b}, [x2]
tbl INP3.16b, {INP3.16b}, T1.16b
b 2f
.previous
2: .if \enc == 0
bl pmull_gcm_ghash_4x
.endif
bl pmull_gcm_enc_4x
tbnz x0, #63, 6f
st1 {INP0.16b-INP3.16b}, [x1], #64
.if \enc == 1
bl pmull_gcm_ghash_4x
.endif
bne 0b
3: ldp x19, x10, [sp, #24]
cbz x10, 5f // output tag?
ld1 {INP3.16b}, [x10] // load lengths[]
mov w9, #1
bl pmull_gcm_ghash_4x
mov w11, #(0x1 << 24) // BE '1U'
ld1 {KS0.16b}, [x5]
mov KS0.s[3], w11
enc_block KS0, x7, x6, x12
ext XL.16b, XL.16b, XL.16b, #8
rev64 XL.16b, XL.16b
eor XL.16b, XL.16b, KS0.16b
.if \enc == 1
st1 {XL.16b}, [x10] // store tag
.else
ldp x11, x12, [sp, #40] // load tag pointer and authsize
adr_l x17, .Lpermute_table
ld1 {KS0.16b}, [x11] // load supplied tag
add x17, x17, x12
ld1 {KS1.16b}, [x17] // load permute vector
cmeq XL.16b, XL.16b, KS0.16b // compare tags
mvn XL.16b, XL.16b // -1 for fail, 0 for pass
tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
sminv b0, XL.16b // signed minimum across XL
smov w0, v0.b[0] // return b0
.endif
4: ldp x29, x30, [sp], #32
ret
5:
CPU_LE( rev w8, w8 )
str w8, [x5, #12] // store lower counter
st1 {XL.2d}, [x4]
b 4b
6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
sub x17, x17, x19, lsl #1
cmp w9, #1
beq 7f
.subsection 1
7: ld1 {INP2.16b}, [x1]
tbx INP2.16b, {INP3.16b}, T1.16b
mov INP3.16b, INP2.16b
b 8f
.previous
st1 {INP0.16b}, [x1], x14
st1 {INP1.16b}, [x1], x15
st1 {INP2.16b}, [x1], x16
tbl INP3.16b, {INP3.16b}, T1.16b
tbx INP3.16b, {INP2.16b}, T2.16b
8: st1 {INP3.16b}, [x1]
.if \enc == 1
ld1 {T1.16b}, [x17]
tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
bl pmull_gcm_ghash_4x
.endif
b 3b
.endm
/*
* void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
SYM_FUNC_START(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
SYM_FUNC_END(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
SYM_FUNC_START(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
SYM_FUNC_END(pmull_gcm_decrypt)
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b
rev64 T1.16b, INP0.16b
rev64 T2.16b, INP1.16b
rev64 TT3.16b, INP2.16b
rev64 TT4.16b, INP3.16b
.if \enc == 1
ld1 {KS.16b}, [x7]
.endif
ext XL.16b, XL.16b, XL.16b, #8
0: ld1 {CTR.8b}, [x5] // load upper counter
ld1 {INP.16b}, [x3], #16
rev x9, x8
add x8, x8, #1
sub w0, w0, #1
ins CTR.d[1], x9 // set lower counter
tbz w9, #2, 0f // <4 blocks?
.subsection 1
0: movi XH2.16b, #0
movi XM2.16b, #0
movi XL2.16b, #0
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
st1 {INP.16b}, [x2], #16
.endif
tbz w9, #0, 1f // 2 blocks?
tbz w9, #1, 2f // 1 block?
rev64 T1.16b, INP.16b
eor T2.16b, T2.16b, XL.16b
ext T1.16b, T2.16b, T2.16b, #8
b .Lgh3
cmp w6, #12
b.ge 2f // AES-192/256?
1: eor TT3.16b, TT3.16b, XL.16b
ext T2.16b, TT3.16b, TT3.16b, #8
b .Lgh2
1: enc_round CTR, v21
2: eor TT4.16b, TT4.16b, XL.16b
ext IN1.16b, TT4.16b, TT4.16b, #8
b .Lgh1
.previous
eor T1.16b, T1.16b, XL.16b
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
eor T1.16b, T1.16b, IN1.16b
pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
enc_round CTR, v22
ext T1.16b, T2.16b, T2.16b, #8
.Lgh3: eor T2.16b, T2.16b, T1.16b
pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
pmull XL.1q, HH3.1d, T1.1d // a0 * b0
pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
eor XH2.16b, XH2.16b, XH.16b
eor XL2.16b, XL2.16b, XL.16b
eor XM2.16b, XM2.16b, XM.16b
enc_round CTR, v23
ext T2.16b, TT3.16b, TT3.16b, #8
.Lgh2: eor TT3.16b, TT3.16b, T2.16b
pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
pmull XL.1q, HH.1d, T2.1d // a0 * b0
pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
eor XH2.16b, XH2.16b, XH.16b
eor XL2.16b, XL2.16b, XL.16b
eor XM2.16b, XM2.16b, XM.16b
enc_round CTR, v24
ext IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1: eor TT4.16b, TT4.16b, IN1.16b
pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
eor XH.16b, XH.16b, XH2.16b
eor XL.16b, XL.16b, XL2.16b
eor XM.16b, XM.16b, XM2.16b
enc_round CTR, v25
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
enc_round CTR, v26
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d
__pmull_reduce_p64
enc_round CTR, v27
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
enc_round CTR, v28
eor XL.16b, XM.16b, T2.16b
enc_round CTR, v29
ext T2.16b, XL.16b, XL.16b, #8
aese CTR.16b, v30.16b
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
eor KS.16b, CTR.16b, v31.16b
eor XL.16b, XL.16b, T2.16b
ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
st1 {INP.16b}, [x2], #16
.endif
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
ld1 {KS0.16b}, [x5] // load upper counter
sub w10, w8, #4
sub w11, w8, #3
sub w12, w8, #2
sub w13, w8, #1
rev w10, w10
rev w11, w11
rev w12, w12
rev w13, w13
mov KS1.16b, KS0.16b
mov KS2.16b, KS0.16b
mov KS3.16b, KS0.16b
ins KS0.s[3], w10 // set lower counter
ins KS1.s[3], w11
ins KS2.s[3], w12
ins KS3.s[3], w13
cbnz w0, 0b
add x10, x6, #96 // round key pointer
ld1 {K6.4s-K7.4s}, [x10], #32
.irp key, K0, K1, K2, K3, K4, K5
enc_qround KS0, KS1, KS2, KS3, \key
.endr
CPU_LE( rev x8, x8 )
st1 {XL.2d}, [x1]
str x8, [x5, #8] // store lower counter
tbnz x7, #2, .Lnot128
.subsection 1
.Lnot128:
ld1 {K8.4s-K9.4s}, [x10], #32
.irp key, K6, K7
enc_qround KS0, KS1, KS2, KS3, \key
.endr
ld1 {K6.4s-K7.4s}, [x10]
.irp key, K8, K9
enc_qround KS0, KS1, KS2, KS3, \key
.endr
tbz x7, #1, .Lout192
b .Lout256
.previous
.Lout256:
.irp key, K6, K7
enc_qround KS0, KS1, KS2, KS3, \key
.endr
.Lout192:
enc_qround KS0, KS1, KS2, KS3, KK
aese KS0.16b, KL.16b
aese KS1.16b, KL.16b
aese KS2.16b, KL.16b
aese KS3.16b, KL.16b
eor KS0.16b, KS0.16b, KM.16b
eor KS1.16b, KS1.16b, KM.16b
eor KS2.16b, KS2.16b, KM.16b
eor KS3.16b, KS3.16b, KM.16b
eor INP0.16b, INP0.16b, KS0.16b
eor INP1.16b, INP1.16b, KS1.16b
eor INP2.16b, INP2.16b, KS2.16b
eor INP3.16b, INP3.16b, KS3.16b
.if \enc == 1
st1 {KS.16b}, [x7]
.endif
ret
SYM_FUNC_END(pmull_gcm_enc_4x)
.section ".rodata", "a"
.align 6
.Lpermute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.previous
2: b.eq 3f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
3: enc_round CTR, v19
enc_round CTR, v20
b 1b
.endm
/*
* void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds, u8 ks[])
*/
ENTRY(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
ENDPROC(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds)
*/
ENTRY(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
ENDPROC(pmull_gcm_decrypt)
/*
* void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
*/
ENTRY(pmull_gcm_encrypt_block)
cbz x2, 0f
load_round_keys w3, x2
0: ld1 {v0.16b}, [x1]
enc_block v0, w3
st1 {v0.16b}, [x0]
ret
ENDPROC(pmull_gcm_encrypt_block)
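The three-multiply pattern behind the "(a1 + a0)(b1 + b0)" comments above is Karatsuba over GF(2): one 128x128-bit carryless product is assembled from three 64x64-bit PMULLs, and the modular reduction is done separately. A C sketch of just that decomposition, using a hypothetical clmul64() stand-in for PMULL/PMULL2; not code from this tree:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;	/* hypothetical 128-bit pair */

extern u128 clmul64(uint64_t a, uint64_t b);	/* assumed 64x64 -> 128 carryless multiply */

/* 128x128 -> 256 bit carryless multiply of a = a1:a0 and b = b1:b0 */
static void clmul128(uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0,
		     u128 *hi, u128 *lo)
{
	u128 xh = clmul64(a1, b1);		/* a1 * b1            */
	u128 xl = clmul64(a0, b0);		/* a0 * b0            */
	u128 xm = clmul64(a1 ^ a0, b1 ^ b0);	/* (a1 + a0)(b1 + b0) */

	/* middle term at bit offset 64 is xm + xh + xl (all additions are XOR) */
	xm.lo ^= xh.lo ^ xl.lo;
	xm.hi ^= xh.hi ^ xl.hi;

	lo->lo = xl.lo;
	lo->hi = xl.hi ^ xm.lo;
	hi->lo = xh.lo ^ xm.hi;
	hi->hi = xh.hi;
}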


@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -17,7 +17,6 @@
#include <crypto/gf128mul.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
@@ -34,8 +33,9 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GCM_IV_SIZE 12
struct ghash_key {
be128 k;
u64 h[][2];
u64 a;
u64 b;
be128 k;
};
struct ghash_desc_ctx {
@@ -50,18 +50,29 @@ struct gcm_aes_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
u64 const h[][2], const char *head);
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
u64 const h[][2], const char *head);
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
u64 const h[][2], u64 dg[], u8 ctr[],
u32 const rk[], int rounds, u8 tag[]);
asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
u64 const h[][2], u64 dg[], u8 ctr[],
u32 const rk[], int rounds, const u8 l[],
const u8 tag[], u64 authsize);
static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
u8 ctr[], int rounds, u8 ks[]);
asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
u8 ctr[], int rounds);
asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
u32 const rk[], int rounds);
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static int ghash_init(struct shash_desc *desc)
{
@@ -73,48 +84,34 @@ static int ghash_init(struct shash_desc *desc)
static void ghash_do_update(int blocks, u64 dg[], const char *src,
struct ghash_key *key, const char *head)
{
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
do {
const u8 *in = src;
if (head) {
in = head;
blocks++;
head = NULL;
} else {
src += GHASH_BLOCK_SIZE;
}
crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
gf128mul_lle(&dst, &key->k);
} while (--blocks);
dg[0] = be64_to_cpu(dst.b);
dg[1] = be64_to_cpu(dst.a);
}
static __always_inline
void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
struct ghash_key *key, const char *head,
void (*simd_update)(int blocks, u64 dg[],
const char *src,
u64 const h[][2],
const char *head))
{
if (likely(may_use_simd())) {
kernel_neon_begin();
simd_update(blocks, dg, src, key->h, head);
pmull_ghash_update(blocks, dg, src, key, head);
kernel_neon_end();
} else {
ghash_do_update(blocks, dg, src, key, head);
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
do {
const u8 *in = src;
if (head) {
in = head;
blocks++;
head = NULL;
} else {
src += GHASH_BLOCK_SIZE;
}
crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
gf128mul_lle(&dst, &key->k);
} while (--blocks);
dg[0] = be64_to_cpu(dst.b);
dg[1] = be64_to_cpu(dst.a);
}
}
/* avoid hogging the CPU for too long */
#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
static int ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
@@ -138,17 +135,11 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;
do {
int chunk = min(blocks, MAX_BLOCKS);
ghash_do_update(blocks, ctx->digest, src, key,
partial ? ctx->buf : NULL);
ghash_do_simd_update(chunk, ctx->digest, src, key,
partial ? ctx->buf : NULL,
pmull_ghash_update_p8);
blocks -= chunk;
src += chunk * GHASH_BLOCK_SIZE;
partial = 0;
} while (unlikely(blocks > 0));
src += blocks * GHASH_BLOCK_SIZE;
partial = 0;
}
if (len)
memcpy(ctx->buf + partial, src, len);
@@ -165,25 +156,34 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
pmull_ghash_update_p8);
ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
memzero_explicit(ctx, sizeof(*ctx));
*ctx = (struct ghash_desc_ctx){};
return 0;
}
static void ghash_reflect(u64 h[], const be128 *k)
static int __ghash_setkey(struct ghash_key *key,
const u8 *inkey, unsigned int keylen)
{
u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0;
u64 a, b;
h[0] = (be64_to_cpu(k->b) << 1) | carry;
h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
if (carry)
h[1] ^= 0xc200000000000000UL;
/* perform multiplication by 'x' in GF(2^128) */
b = get_unaligned_be64(inkey);
a = get_unaligned_be64(inkey + 8);
key->a = (a << 1) | (b >> 63);
key->b = (b << 1) | (a >> 63);
if (b >> 63)
key->b ^= 0xc200000000000000UL;
return 0;
}
static int ghash_setkey(struct crypto_shash *tfm,
@@ -196,19 +196,16 @@ static int ghash_setkey(struct crypto_shash *tfm,
return -EINVAL;
}
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
ghash_reflect(key->h[0], &key->k);
return 0;
return __ghash_setkey(key, inkey, keylen);
}
static struct shash_alg ghash_alg = {
.base.cra_name = "ghash",
.base.cra_driver_name = "ghash-neon",
.base.cra_priority = 150,
.base.cra_driver_name = "ghash-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_ctxsize = sizeof(struct ghash_key),
.base.cra_module = THIS_MODULE,
.digestsize = GHASH_DIGEST_SIZE,
@@ -236,33 +233,18 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
u8 key[GHASH_BLOCK_SIZE];
be128 h;
int ret;
ret = aes_expandkey(&ctx->aes_key, inkey, keylen);
ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
if (ret) {
tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
__aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
num_rounds(&ctx->aes_key));
/* needed for the fallback */
memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE);
ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k);
h = ctx->ghash_key.k;
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[1], &h);
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[2], &h);
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[3], &h);
return 0;
return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
}
static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -294,9 +276,8 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
int blocks = count / GHASH_BLOCK_SIZE;
ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
*buf_count ? buf : NULL,
pmull_ghash_update_p64);
ghash_do_update(blocks, dg, src, &ctx->ghash_key,
*buf_count ? buf : NULL);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -340,117 +321,121 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
pmull_ghash_update_p64);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
}
}
static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
u64 dg[], u8 tag[], int cryptlen)
{
u8 mac[AES_BLOCK_SIZE];
u128 lengths;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64(cryptlen * 8);
ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
put_unaligned_be64(dg[1], mac);
put_unaligned_be64(dg[0], mac + 8);
crypto_xor(tag, mac, AES_BLOCK_SIZE);
}
static int gcm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u8 ks[AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
be128 lengths;
u8 *tag;
int err;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64(req->cryptlen * 8);
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
if (likely(may_use_simd())) {
do {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int nbytes = walk.nbytes;
kernel_neon_begin();
tag = (u8 *)&lengths;
pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
pmull_gcm_encrypt_block(ks, iv, NULL,
num_rounds(&ctx->aes_key));
put_unaligned_be32(3, iv + GCM_IV_SIZE);
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
src = dst = memcpy(buf + sizeof(buf) - nbytes,
src, nbytes);
} else if (nbytes < walk.total) {
nbytes &= ~(AES_BLOCK_SIZE - 1);
tag = NULL;
}
err = skcipher_walk_aead_encrypt(&walk, req, true);
kernel_neon_begin();
pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
dg, iv, ctx->aes_key.key_enc, nrounds,
tag);
kernel_neon_end();
if (unlikely(!nbytes))
break;
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
memcpy(walk.dst.virt.addr,
buf + sizeof(buf) - nbytes, nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
} while (walk.nbytes);
} else {
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int remaining = blocks;
do {
aes_encrypt(&ctx->aes_key, buf, iv);
crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--remaining > 0);
ghash_do_update(blocks, dg, walk.dst.virt.addr,
&ctx->ghash_key, NULL);
pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
walk.src.virt.addr, &ctx->ghash_key,
iv, num_rounds(&ctx->aes_key), ks);
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
} else {
__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
/* handle the tail */
if (walk.nbytes) {
aes_encrypt(&ctx->aes_key, buf, iv);
err = skcipher_walk_aead_encrypt(&walk, req, true);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
buf, walk.nbytes);
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
memcpy(buf, walk.dst.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
do {
__aes_arm64_encrypt(ctx->aes_key.key_enc,
ks, iv,
num_rounds(&ctx->aes_key));
crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--blocks > 0);
ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
walk.dst.virt.addr, &ctx->ghash_key,
NULL);
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
walk.nbytes ? buf : NULL);
if (walk.nbytes)
err = skcipher_walk_done(&walk, 0);
__aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv,
num_rounds(&ctx->aes_key));
}
put_unaligned_be64(dg[1], tag);
put_unaligned_be64(dg[0], tag + 8);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
aes_encrypt(&ctx->aes_key, iv, iv);
crypto_xor(tag, iv, AES_BLOCK_SIZE);
/* handle the tail */
if (walk.nbytes) {
u8 buf[GHASH_BLOCK_SIZE];
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
walk.nbytes);
memcpy(buf, walk.dst.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
gcm_final(req, ctx, dg, tag, req->cryptlen);
/* copy authtag to end of dst */
scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
crypto_aead_authsize(aead), 1);
@@ -463,81 +448,62 @@ static int gcm_decrypt(struct aead_request *req)
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
u8 otag[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
u8 buf[GHASH_BLOCK_SIZE];
u64 dg[2] = {};
be128 lengths;
u8 *tag;
int err;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
put_unaligned_be32(2, iv + GCM_IV_SIZE);
scatterwalk_map_and_copy(otag, req->src,
req->assoclen + req->cryptlen - authsize,
authsize, 0);
err = skcipher_walk_aead_decrypt(&walk, req, false);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
if (likely(may_use_simd())) {
int ret;
kernel_neon_begin();
do {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int nbytes = walk.nbytes;
pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
tag = (u8 *)&lengths;
err = skcipher_walk_aead_decrypt(&walk, req, true);
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
walk.src.virt.addr, &ctx->ghash_key,
iv, num_rounds(&ctx->aes_key));
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
if (walk.nbytes)
pmull_gcm_encrypt_block(iv, iv, NULL,
num_rounds(&ctx->aes_key));
kernel_neon_end();
} else {
__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, true);
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
src = dst = memcpy(buf + sizeof(buf) - nbytes,
src, nbytes);
} else if (nbytes < walk.total) {
nbytes &= ~(AES_BLOCK_SIZE - 1);
tag = NULL;
}
kernel_neon_begin();
ret = pmull_gcm_decrypt(nbytes, dst, src,
ctx->ghash_key.h,
dg, iv, ctx->aes_key.key_enc,
nrounds, tag, otag, authsize);
kernel_neon_end();
if (unlikely(!nbytes))
break;
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
memcpy(walk.dst.virt.addr,
buf + sizeof(buf) - nbytes, nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
} while (walk.nbytes);
if (err)
return err;
if (ret)
return -EBADMSG;
} else {
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
ghash_do_update(blocks, dg, walk.src.virt.addr,
&ctx->ghash_key, NULL);
do {
aes_encrypt(&ctx->aes_key, buf, iv);
__aes_arm64_encrypt(ctx->aes_key.key_enc,
buf, iv,
num_rounds(&ctx->aes_key));
crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
@@ -548,40 +514,35 @@ static int gcm_decrypt(struct aead_request *req)
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
/* handle the tail */
if (walk.nbytes) {
memcpy(buf, walk.src.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
}
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
walk.nbytes ? buf : NULL);
if (walk.nbytes) {
aes_encrypt(&ctx->aes_key, buf, iv);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
buf, walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
put_unaligned_be64(dg[1], tag);
put_unaligned_be64(dg[0], tag + 8);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
aes_encrypt(&ctx->aes_key, iv, iv);
crypto_xor(tag, iv, AES_BLOCK_SIZE);
if (crypto_memneq(tag, otag, authsize)) {
memzero_explicit(tag, AES_BLOCK_SIZE);
return -EBADMSG;
}
if (walk.nbytes)
__aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv,
num_rounds(&ctx->aes_key));
}
/* handle the tail */
if (walk.nbytes) {
memcpy(buf, walk.src.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
/* compare calculated auth tag with the stored one */
scatterwalk_map_and_copy(buf, req->src,
req->assoclen + req->cryptlen - authsize,
authsize, 0);
if (crypto_memneq(tag, buf, authsize))
return -EBADMSG;
return 0;
}
@@ -598,28 +559,39 @@ static struct aead_alg gcm_aes_alg = {
.base.cra_driver_name = "gcm-aes-ce",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
4 * sizeof(u64[2]),
.base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
.base.cra_module = THIS_MODULE,
};
static int __init ghash_ce_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
int ret;
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
if (cpu_have_named_feature(PMULL))
return crypto_register_aead(&gcm_aes_alg);
if (elf_hwcap & HWCAP_PMULL)
pmull_ghash_update = pmull_ghash_update_p64;
return crypto_register_shash(&ghash_alg);
else
pmull_ghash_update = pmull_ghash_update_p8;
ret = crypto_register_shash(&ghash_alg);
if (ret)
return ret;
if (elf_hwcap & HWCAP_PMULL) {
ret = crypto_register_aead(&gcm_aes_alg);
if (ret)
crypto_unregister_shash(&ghash_alg);
}
return ret;
}
static void __exit ghash_ce_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
crypto_unregister_aead(&gcm_aes_alg);
else
crypto_unregister_shash(&ghash_alg);
crypto_unregister_shash(&ghash_alg);
crypto_unregister_aead(&gcm_aes_alg);
}
static const struct cpu_feature ghash_cpu_feature[] = {


@@ -1,103 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, ARM64 NEON accelerated version
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
KEY .req x0
MESSAGE .req x1
MESSAGE_LEN .req x2
HASH .req x3
PASS0_SUMS .req v0
PASS1_SUMS .req v1
PASS2_SUMS .req v2
PASS3_SUMS .req v3
K0 .req v4
K1 .req v5
K2 .req v6
K3 .req v7
T0 .req v8
T1 .req v9
T2 .req v10
T3 .req v11
T4 .req v12
T5 .req v13
T6 .req v14
T7 .req v15
.macro _nh_stride k0, k1, k2, k3
// Load next message stride
ld1 {T3.16b}, [MESSAGE], #16
// Load next key stride
ld1 {\k3\().4s}, [KEY], #16
// Add message words to key words
add T0.4s, T3.4s, \k0\().4s
add T1.4s, T3.4s, \k1\().4s
add T2.4s, T3.4s, \k2\().4s
add T3.4s, T3.4s, \k3\().4s
// Multiply 32x32 => 64 and accumulate
mov T4.d[0], T0.d[1]
mov T5.d[0], T1.d[1]
mov T6.d[0], T2.d[1]
mov T7.d[0], T3.d[1]
umlal PASS0_SUMS.2d, T0.2s, T4.2s
umlal PASS1_SUMS.2d, T1.2s, T5.2s
umlal PASS2_SUMS.2d, T2.2s, T6.2s
umlal PASS3_SUMS.2d, T3.2s, T7.2s
.endm
/*
* void nh_neon(const u32 *key, const u8 *message, size_t message_len,
* u8 hash[NH_HASH_BYTES])
*
* It's guaranteed that message_len % 16 == 0.
*/
SYM_FUNC_START(nh_neon)
ld1 {K0.4s,K1.4s}, [KEY], #32
movi PASS0_SUMS.2d, #0
movi PASS1_SUMS.2d, #0
ld1 {K2.4s}, [KEY], #16
movi PASS2_SUMS.2d, #0
movi PASS3_SUMS.2d, #0
subs MESSAGE_LEN, MESSAGE_LEN, #64
blt .Lloop4_done
.Lloop4:
_nh_stride K0, K1, K2, K3
_nh_stride K1, K2, K3, K0
_nh_stride K2, K3, K0, K1
_nh_stride K3, K0, K1, K2
subs MESSAGE_LEN, MESSAGE_LEN, #64
bge .Lloop4
.Lloop4_done:
ands MESSAGE_LEN, MESSAGE_LEN, #63
beq .Ldone
_nh_stride K0, K1, K2, K3
subs MESSAGE_LEN, MESSAGE_LEN, #16
beq .Ldone
_nh_stride K1, K2, K3, K0
subs MESSAGE_LEN, MESSAGE_LEN, #16
beq .Ldone
_nh_stride K2, K3, K0, K1
.Ldone:
// Sum the accumulators for each pass, then store the sums to 'hash'
addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
st1 {T0.16b,T1.16b}, [HASH]
ret
SYM_FUNC_END(nh_neon)


@@ -1,78 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
* (ARM64 NEON accelerated version)
*
* Copyright 2018 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES])
{
nh_neon(key, message, message_len, (u8 *)hash);
}
static int nhpoly1305_neon_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
if (srclen < 64 || !may_use_simd())
return crypto_nhpoly1305_update(desc, src, srclen);
do {
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
kernel_neon_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
kernel_neon_end();
src += n;
srclen -= n;
} while (srclen);
return 0;
}
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-neon",
.base.cra_priority = 200,
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
.base.cra_module = THIS_MODULE,
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_neon_update,
.final = crypto_nhpoly1305_final,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
static int __init nhpoly1305_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
return crypto_register_shash(&nhpoly1305_alg);
}
static void __exit nhpoly1305_mod_exit(void)
{
crypto_unregister_shash(&nhpoly1305_alg);
}
module_init(nhpoly1305_mod_init);
module_exit(nhpoly1305_mod_exit);
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("nhpoly1305");
MODULE_ALIAS_CRYPTO("nhpoly1305-neon");


@@ -58,22 +58,24 @@
sha1su1 v\s0\().4s, v\s3\().4s
.endm
.macro loadrc, k, val, tmp
movz \tmp, :abs_g0_nc:\val
movk \tmp, :abs_g1:\val
dup \k, \tmp
.endm
/*
* The SHA1 round constants
*/
.align 4
.Lsha1_rcon:
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
/*
* int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
* void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
*/
SYM_FUNC_START(sha1_ce_transform)
ENTRY(sha1_ce_transform)
/* load round constants */
loadrc k0.4s, 0x5a827999, w6
loadrc k1.4s, 0x6ed9eba1, w6
loadrc k2.4s, 0x8f1bbcdc, w6
loadrc k3.4s, 0xca62c1d6, w6
adr x6, .Lsha1_rcon
ld1r {k0.4s}, [x6], #4
ld1r {k1.4s}, [x6], #4
ld1r {k2.4s}, [x6], #4
ld1r {k3.4s}, [x6]
/* load state */
ld1 {dgav.4s}, [x0]
@@ -123,16 +125,14 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
cbz w2, 2f
cond_yield 3f, x5
b 0b
cbnz w2, 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
2: cbz x4, 3f
cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v9.2d, #0
@@ -148,6 +148,5 @@ CPU_LE( rev32 v11.16b, v11.16b )
/* store new state */
3: st1 {dgav.4s}, [x0]
str dgb, [x0, #16]
mov w0, w2
ret
SYM_FUNC_END(sha1_ce_transform)
ENDPROC(sha1_ce_transform)


@@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
@@ -22,18 +21,14 @@
MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha1");
struct sha1_ce_state {
struct sha1_state sst;
u32 finalize;
};
extern const u32 sha1_ce_offsetof_count;
extern const u32 sha1_ce_offsetof_finalize;
asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
#ifdef CONFIG_CFI_CLANG
static inline void __cfi_sha1_ce_transform(struct sha1_state *sst,
u8 const *src, int blocks)
@@ -43,23 +38,6 @@ static inline void __cfi_sha1_ce_transform(struct sha1_state *sst,
#define sha1_ce_transform __cfi_sha1_ce_transform
#endif
static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src,
int blocks)
{
sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
blocks);
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state,
sst), src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA1_BLOCK_SIZE;
blocks = rem;
}
}
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
@@ -72,7 +50,10 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
return crypto_sha1_update(desc, data, len);
sctx->finalize = 0;
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return 0;
}
@@ -92,9 +73,12 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
if (!finalize)
sha1_base_do_finalize(desc, __sha1_ce_transform);
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
}
@@ -106,41 +90,24 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
return crypto_sha1_finup(desc, NULL, 0, out);
sctx->finalize = 0;
sha1_base_do_finalize(desc, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
}
static int sha1_ce_export(struct shash_desc *desc, void *out)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
memcpy(out, &sctx->sst, sizeof(struct sha1_state));
return 0;
}
static int sha1_ce_import(struct shash_desc *desc, const void *in)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
memcpy(&sctx->sst, in, sizeof(struct sha1_state));
sctx->finalize = 0;
return 0;
}
static struct shash_alg alg = {
.init = sha1_base_init,
.update = sha1_ce_update,
.final = sha1_ce_final,
.finup = sha1_ce_finup,
.import = sha1_ce_import,
.export = sha1_ce_export,
.descsize = sizeof(struct sha1_ce_state),
.statesize = sizeof(struct sha1_state),
.digestsize = SHA1_DIGEST_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@@ -53,7 +53,6 @@
/*
* The SHA-256 round constants
*/
.section ".rodata", "a"
.align 4
.Lsha2_rcon:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -77,10 +76,9 @@
* void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sha2_ce_transform)
ENTRY(sha2_ce_transform)
/* load round constants */
adr_l x8, .Lsha2_rcon
adr x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
@@ -131,16 +129,14 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s
/* handled all input blocks? */
cbz w2, 2f
cond_yield 3f, x5
b 0b
cbnz w2, 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
2: cbz x4, 3f
cbz x4, 3f
ldr_l w4, sha256_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v17.2d, #0
@@ -155,6 +151,5 @@ CPU_LE( rev32 v19.16b, v19.16b )
/* store new state */
3: st1 {dgav.4s, dgbv.4s}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha2_ce_transform)
ENDPROC(sha2_ce_transform)


@@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
@@ -22,19 +21,14 @@
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha224");
MODULE_ALIAS_CRYPTO("sha256");
struct sha256_ce_state {
struct sha256_state sst;
u32 finalize;
};
extern const u32 sha256_ce_offsetof_count;
extern const u32 sha256_ce_offsetof_finalize;
asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
#ifdef CONFIG_CFI_CLANG
static inline void __cfi_sha2_ce_transform(struct sha256_state *sst,
u8 const *src, int blocks)
@@ -44,23 +38,6 @@ static inline void __cfi_sha2_ce_transform(struct sha256_state *sst,
#define sha2_ce_transform __cfi_sha2_ce_transform
#endif
static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src,
blocks);
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
sst), src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA256_BLOCK_SIZE;
blocks = rem;
}
}
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
sst.count);
const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
@@ -68,12 +45,6 @@ const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_data_order(sst->state, src, blocks);
}
static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@@ -81,10 +52,13 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
if (!may_use_simd())
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sctx->finalize = 0;
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return 0;
}
@@ -98,8 +72,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
if (!may_use_simd()) {
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
@@ -109,9 +84,13 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
if (!finalize)
sha256_base_do_finalize(desc, __sha2_ce_transform);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return sha256_base_finish(desc, out);
}
@@ -120,46 +99,30 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
if (!may_use_simd()) {
sha256_base_do_finalize(desc, __sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
sctx->finalize = 0;
sha256_base_do_finalize(desc, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return sha256_base_finish(desc, out);
}
static int sha256_ce_export(struct shash_desc *desc, void *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
memcpy(out, &sctx->sst, sizeof(struct sha256_state));
return 0;
}
static int sha256_ce_import(struct shash_desc *desc, const void *in)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
memcpy(&sctx->sst, in, sizeof(struct sha256_state));
sctx->finalize = 0;
return 0;
}
static struct shash_alg algs[] = { {
.init = sha224_base_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
.export = sha256_ce_export,
.import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
.statesize = sizeof(struct sha256_state),
.digestsize = SHA224_DIGEST_SIZE,
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -168,15 +131,13 @@ static struct shash_alg algs[] = { {
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
.export = sha256_ce_export,
.import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
.statesize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@@ -1,13 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
// This code is taken from the OpenSSL project but the author (Andy Polyakov)
// has relicensed it under the GPLv2. Therefore this program is free software;
// you can redistribute it and/or modify it under the terms of the GNU General
// Public License version 2 as published by the Free Software Foundation.
//
// The original headers, including the original license headers, are
// included below for completeness.
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
@@ -20,6 +10,8 @@
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.


@@ -14,7 +14,6 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <linux/cryptohash.h>
@@ -32,66 +31,57 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha256_block_data_order);
static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_data_order(sst->state, src, blocks);
}
asmlinkage void sha256_block_neon(u32 *digest, const void *data,
unsigned int num_blks);
static void __sha256_block_neon(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_neon(sst->state, src, blocks);
}
static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
static int sha256_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
}
static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
static int sha256_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out)
static int sha256_final(struct shash_desc *desc, u8 *out)
{
return crypto_sha256_arm64_finup(desc, NULL, 0, out);
return sha256_finup(desc, NULL, 0, out);
}
static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = crypto_sha256_arm64_update,
.final = crypto_sha256_arm64_final,
.finup = crypto_sha256_arm64_finup,
.update = sha256_update,
.final = sha256_final,
.finup = sha256_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64",
.base.cra_priority = 125,
.base.cra_priority = 100,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = crypto_sha256_arm64_update,
.final = crypto_sha256_arm64_final,
.finup = crypto_sha256_arm64_finup,
.update = sha256_update,
.final = sha256_final,
.finup = sha256_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64",
.base.cra_priority = 125,
.base.cra_priority = 100,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -99,31 +89,21 @@ static struct shash_alg algs[] = { {
static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
/*
* Stacking and unstacking a substantial slice of the NEON register
* file may significantly affect performance for small updates when
* executing in interrupt context, so fall back to the scalar code
* in that case.
*/
if (!may_use_simd())
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
while (len > 0) {
unsigned int chunk = len;
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_block_neon);
kernel_neon_end();
/*
* Don't hog the CPU for the entire time it takes to process all
* input when running on a preemptible kernel, but process the
* data block by block instead.
*/
if (IS_ENABLED(CONFIG_PREEMPT) &&
chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
chunk = SHA256_BLOCK_SIZE -
sctx->count % SHA256_BLOCK_SIZE;
kernel_neon_begin();
sha256_base_do_update(desc, data, chunk, __sha256_block_neon);
kernel_neon_end();
data += chunk;
len -= chunk;
}
return 0;
}
@@ -133,13 +113,16 @@ static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
if (!may_use_simd()) {
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
} else {
if (len)
sha256_update_neon(desc, data, len);
kernel_neon_begin();
sha256_base_do_finalize(desc, __sha256_block_neon);
if (len)
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_block_neon);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_neon);
kernel_neon_end();
}
return sha256_base_finish(desc, out);
@@ -160,6 +143,7 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64-neon",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -172,6 +156,7 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64-neon",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -182,7 +167,7 @@ static int __init sha256_mod_init(void)
if (ret)
return ret;
if (cpu_have_named_feature(ASIMD)) {
if (elf_hwcap & HWCAP_ASIMD) {
ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs));
if (ret)
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
@@ -192,7 +177,7 @@ static int __init sha256_mod_init(void)
static void __exit sha256_mod_fini(void)
{
if (cpu_have_named_feature(ASIMD))
if (elf_hwcap & HWCAP_ASIMD)
crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs));
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}


@@ -1,212 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.set .Lv\b\().2d, \b
.set .Lv\b\().16b, \b
.endr
/*
* ARMv8.2 Crypto Extensions instructions
*/
.macro eor3, rd, rn, rm, ra
.inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro rax1, rd, rn, rm
.inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro bcax, rd, rn, rm, ra
.inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro xar, rd, rn, rm, imm6
.inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
.endm
/*
* int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
*/
.text
SYM_FUNC_START(sha3_ce_transform)
/* load state */
add x8, x0, #32
ld1 { v0.1d- v3.1d}, [x0]
ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32
ld1 {v16.1d-v19.1d}, [x8], #32
ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8]
0: sub w2, w2, #1
mov w8, #24
adr_l x9, .Lsha3_rcon
/* load input */
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b-v31.8b}, [x1], #24
eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b
eor v3.8b, v3.8b, v28.8b
eor v4.8b, v4.8b, v29.8b
eor v5.8b, v5.8b, v30.8b
eor v6.8b, v6.8b, v31.8b
tbnz x3, #6, 2f // SHA3-512
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b-v30.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
eor v9.8b, v9.8b, v27.8b
eor v10.8b, v10.8b, v28.8b
eor v11.8b, v11.8b, v29.8b
eor v12.8b, v12.8b, v30.8b
tbnz x3, #4, 1f // SHA3-384 or SHA3-224
// SHA3-256
ld1 {v25.8b-v28.8b}, [x1], #32
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
b 3f
1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
// SHA3-224
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b}, [x1], #8
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
eor v17.8b, v17.8b, v29.8b
b 3f
// SHA3-512
2: ld1 {v25.8b-v26.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
3: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b
eor3 v28.16b, v3.16b, v8.16b, v13.16b
eor3 v25.16b, v0.16b, v5.16b, v10.16b
eor3 v27.16b, v2.16b, v7.16b, v12.16b
eor3 v29.16b, v29.16b, v19.16b, v24.16b
eor3 v26.16b, v26.16b, v16.16b, v21.16b
eor3 v28.16b, v28.16b, v18.16b, v23.16b
eor3 v25.16b, v25.16b, v15.16b, v20.16b
eor3 v27.16b, v27.16b, v17.16b, v22.16b
rax1 v30.2d, v29.2d, v26.2d // bc[0]
rax1 v26.2d, v26.2d, v28.2d // bc[2]
rax1 v28.2d, v28.2d, v25.2d // bc[4]
rax1 v25.2d, v25.2d, v27.2d // bc[1]
rax1 v27.2d, v27.2d, v29.2d // bc[3]
eor v0.16b, v0.16b, v30.16b
xar v29.2d, v1.2d, v25.2d, (64 - 1)
xar v1.2d, v6.2d, v25.2d, (64 - 44)
xar v6.2d, v9.2d, v28.2d, (64 - 20)
xar v9.2d, v22.2d, v26.2d, (64 - 61)
xar v22.2d, v14.2d, v28.2d, (64 - 39)
xar v14.2d, v20.2d, v30.2d, (64 - 18)
xar v31.2d, v2.2d, v26.2d, (64 - 62)
xar v2.2d, v12.2d, v26.2d, (64 - 43)
xar v12.2d, v13.2d, v27.2d, (64 - 25)
xar v13.2d, v19.2d, v28.2d, (64 - 8)
xar v19.2d, v23.2d, v27.2d, (64 - 56)
xar v23.2d, v15.2d, v30.2d, (64 - 41)
xar v15.2d, v4.2d, v28.2d, (64 - 27)
xar v28.2d, v24.2d, v28.2d, (64 - 14)
xar v24.2d, v21.2d, v25.2d, (64 - 2)
xar v8.2d, v8.2d, v27.2d, (64 - 55)
xar v4.2d, v16.2d, v25.2d, (64 - 45)
xar v16.2d, v5.2d, v30.2d, (64 - 36)
xar v5.2d, v3.2d, v27.2d, (64 - 28)
xar v27.2d, v18.2d, v27.2d, (64 - 21)
xar v3.2d, v17.2d, v26.2d, (64 - 15)
xar v25.2d, v11.2d, v25.2d, (64 - 10)
xar v26.2d, v7.2d, v26.2d, (64 - 6)
xar v30.2d, v10.2d, v30.2d, (64 - 3)
bcax v20.16b, v31.16b, v22.16b, v8.16b
bcax v21.16b, v8.16b, v23.16b, v22.16b
bcax v22.16b, v22.16b, v24.16b, v23.16b
bcax v23.16b, v23.16b, v31.16b, v24.16b
bcax v24.16b, v24.16b, v8.16b, v31.16b
ld1r {v31.2d}, [x9], #8
bcax v17.16b, v25.16b, v19.16b, v3.16b
bcax v18.16b, v3.16b, v15.16b, v19.16b
bcax v19.16b, v19.16b, v16.16b, v15.16b
bcax v15.16b, v15.16b, v25.16b, v16.16b
bcax v16.16b, v16.16b, v3.16b, v25.16b
bcax v10.16b, v29.16b, v12.16b, v26.16b
bcax v11.16b, v26.16b, v13.16b, v12.16b
bcax v12.16b, v12.16b, v14.16b, v13.16b
bcax v13.16b, v13.16b, v29.16b, v14.16b
bcax v14.16b, v14.16b, v26.16b, v29.16b
bcax v7.16b, v30.16b, v9.16b, v4.16b
bcax v8.16b, v4.16b, v5.16b, v9.16b
bcax v9.16b, v9.16b, v6.16b, v5.16b
bcax v5.16b, v5.16b, v30.16b, v6.16b
bcax v6.16b, v6.16b, v4.16b, v30.16b
bcax v3.16b, v27.16b, v0.16b, v28.16b
bcax v4.16b, v28.16b, v1.16b, v0.16b
bcax v0.16b, v0.16b, v2.16b, v1.16b
bcax v1.16b, v1.16b, v27.16b, v2.16b
bcax v2.16b, v2.16b, v28.16b, v27.16b
eor v0.16b, v0.16b, v31.16b
cbnz w8, 3b
cond_yield 3f, x8
cbnz w2, 0b
/* save state */
3: st1 { v0.1d- v3.1d}, [x0], #32
st1 { v4.1d- v7.1d}, [x0], #32
st1 { v8.1d-v11.1d}, [x0], #32
st1 {v12.1d-v15.1d}, [x0], #32
st1 {v16.1d-v19.1d}, [x0], #32
st1 {v20.1d-v23.1d}, [x0], #32
st1 {v24.1d}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha3_ce_transform)
.section ".rodata", "a"
.align 8
.Lsha3_rcon:
.quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
.quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
.quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
.quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
.quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008


@@ -1,166 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha3.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha3-224");
MODULE_ALIAS_CRYPTO("sha3-256");
MODULE_ALIAS_CRYPTO("sha3-384");
MODULE_ALIAS_CRYPTO("sha3-512");
asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
int md_len);
static int sha3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha3_state *sctx = shash_desc_ctx(desc);
unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
if (!may_use_simd())
return crypto_sha3_update(desc, data, len);
if ((sctx->partial + len) >= sctx->rsiz) {
int blocks;
if (sctx->partial) {
int p = sctx->rsiz - sctx->partial;
memcpy(sctx->buf + sctx->partial, data, p);
kernel_neon_begin();
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
kernel_neon_end();
data += p;
len -= p;
sctx->partial = 0;
}
blocks = len / sctx->rsiz;
len %= sctx->rsiz;
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha3_ce_transform(sctx->st, data, blocks,
digest_size);
kernel_neon_end();
data += (blocks - rem) * sctx->rsiz;
blocks = rem;
}
}
if (len) {
memcpy(sctx->buf + sctx->partial, data, len);
sctx->partial += len;
}
return 0;
}
static int sha3_final(struct shash_desc *desc, u8 *out)
{
struct sha3_state *sctx = shash_desc_ctx(desc);
unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
__le64 *digest = (__le64 *)out;
int i;
if (!may_use_simd())
return crypto_sha3_final(desc, out);
sctx->buf[sctx->partial++] = 0x06;
memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial);
sctx->buf[sctx->rsiz - 1] |= 0x80;
kernel_neon_begin();
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
kernel_neon_end();
for (i = 0; i < digest_size / 8; i++)
put_unaligned_le64(sctx->st[i], digest++);
if (digest_size & 4)
put_unaligned_le32(sctx->st[i], (__le32 *)digest);
memzero_explicit(sctx, sizeof(*sctx));
return 0;
}
static struct shash_alg algs[] = { {
.digestsize = SHA3_224_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-224",
.base.cra_driver_name = "sha3-224-ce",
.base.cra_blocksize = SHA3_224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_256_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-256",
.base.cra_driver_name = "sha3-256-ce",
.base.cra_blocksize = SHA3_256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_384_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-384",
.base.cra_driver_name = "sha3-384-ce",
.base.cra_blocksize = SHA3_384_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_512_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-512",
.base.cra_driver_name = "sha3-512-ce",
.base.cra_blocksize = SHA3_512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
} };
static int __init sha3_neon_mod_init(void)
{
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
static void __exit sha3_neon_mod_fini(void)
{
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_cpu_feature_match(SHA3, sha3_neon_mod_init);
module_exit(sha3_neon_mod_fini);


@@ -1,14 +1,4 @@
#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
@@ -21,6 +11,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.


@@ -1,206 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
.set .Lq\b, \b
.set .Lv\b\().2d, \b
.endr
.macro sha512h, rd, rn, rm
.inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sha512h2, rd, rn, rm
.inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sha512su0, rd, rn
.inst 0xcec08000 | .L\rd | (.L\rn << 5)
.endm
.macro sha512su1, rd, rn, rm
.inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
/*
* The SHA-512 round constants
*/
.section ".rodata", "a"
.align 4
.Lsha512_rcon:
.quad 0x428a2f98d728ae22, 0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538, 0x59f111f1b605d019
.quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242, 0x12835b0145706fbe
.quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235, 0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.quad 0x983e5152ee66dfab, 0xa831c66d2db43210
.quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.quad 0x06ca6351e003826f, 0x142929670a0e6e70
.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6, 0x92722c851482353b
.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
.quad 0xd192e819d6ef5218, 0xd69906245565a910
.quad 0xf40e35855771202a, 0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
.quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.quad 0x90befffa23631e28, 0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.quad 0xca273eceea26619c, 0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae, 0x1b710b35131c471b
.quad 0x28db77f523047d84, 0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
.ifnb \rc1
ld1 {v\rc1\().2d}, [x4], #16
.endif
add v5.2d, v\rc0\().2d, v\in0\().2d
ext v6.16b, v\i2\().16b, v\i3\().16b, #8
ext v5.16b, v5.16b, v5.16b, #8
ext v7.16b, v\i1\().16b, v\i2\().16b, #8
add v\i3\().2d, v\i3\().2d, v5.2d
.ifnb \in1
ext v5.16b, v\in3\().16b, v\in4\().16b, #8
sha512su0 v\in0\().2d, v\in1\().2d
.endif
sha512h q\i3, q6, v7.2d
.ifnb \in1
sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
.endif
add v\i4\().2d, v\i1\().2d, v\i3\().2d
sha512h2 q\i3, q\i1, v\i0\().2d
.endm
/*
* void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sha512_ce_transform)
/* load state */
ld1 {v8.2d-v11.2d}, [x0]
/* load first 4 round constants */
adr_l x3, .Lsha512_rcon
ld1 {v20.2d-v23.2d}, [x3], #64
/* load input */
0: ld1 {v12.2d-v15.2d}, [x1], #64
ld1 {v16.2d-v19.2d}, [x1], #64
sub w2, w2, #1
CPU_LE( rev64 v12.16b, v12.16b )
CPU_LE( rev64 v13.16b, v13.16b )
CPU_LE( rev64 v14.16b, v14.16b )
CPU_LE( rev64 v15.16b, v15.16b )
CPU_LE( rev64 v16.16b, v16.16b )
CPU_LE( rev64 v17.16b, v17.16b )
CPU_LE( rev64 v18.16b, v18.16b )
CPU_LE( rev64 v19.16b, v19.16b )
mov x4, x3 // rc pointer
mov v0.16b, v8.16b
mov v1.16b, v9.16b
mov v2.16b, v10.16b
mov v3.16b, v11.16b
// v0 ab cd -- ef gh ab
// v1 cd -- ef gh ab cd
// v2 ef gh ab cd -- ef
// v3 gh ab cd -- ef gh
// v4 -- ef gh ab cd --
dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
dround 2, 3, 1, 4, 0, 28, 24, 12
dround 4, 2, 0, 1, 3, 29, 25, 13
dround 1, 4, 3, 0, 2, 30, 26, 14
dround 0, 1, 2, 3, 4, 31, 27, 15
dround 3, 0, 4, 2, 1, 24, , 16
dround 2, 3, 1, 4, 0, 25, , 17
dround 4, 2, 0, 1, 3, 26, , 18
dround 1, 4, 3, 0, 2, 27, , 19
/* update state */
add v8.2d, v8.2d, v0.2d
add v9.2d, v9.2d, v1.2d
add v10.2d, v10.2d, v2.2d
add v11.2d, v11.2d, v3.2d
cond_yield 3f, x4
/* handled all input blocks? */
cbnz w2, 0b
/* store new state */
3: st1 {v8.2d-v11.2d}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha512_ce_transform)


@@ -1,121 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha512_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src,
int blocks);
asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
int blocks)
{
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha512_ce_transform(sst, src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA512_BLOCK_SIZE;
blocks = rem;
}
}
static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
int blocks)
{
sha512_block_data_order(sst->state, src, blocks);
}
static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_update(desc, data, len, fn);
return 0;
}
static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_update(desc, data, len, fn);
sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static int sha512_ce_final(struct shash_desc *desc, u8 *out)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static struct shash_alg algs[] = { {
.init = sha384_base_init,
.update = sha512_ce_update,
.final = sha512_ce_final,
.finup = sha512_ce_finup,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA384_DIGEST_SIZE,
.base.cra_name = "sha384",
.base.cra_driver_name = "sha384-ce",
.base.cra_priority = 200,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.init = sha512_base_init,
.update = sha512_ce_update,
.final = sha512_ce_final,
.finup = sha512_ce_finup,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA512_DIGEST_SIZE,
.base.cra_name = "sha512",
.base.cra_driver_name = "sha512-ce",
.base.cra_priority = 200,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
static int __init sha512_ce_mod_init(void)
{
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
static void __exit sha512_ce_mod_fini(void)
{
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_cpu_feature_match(SHA512, sha512_ce_mod_init);
module_exit(sha512_ce_mod_fini);


@@ -1,13 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
// This code is taken from the OpenSSL project but the author (Andy Polyakov)
// has relicensed it under the GPLv2. Therefore this program is free software;
// you can redistribute it and/or modify it under the terms of the GNU General
// Public License version 2 as published by the Free Software Foundation.
//
// The original headers, including the original license headers, are
// included below for completeness.
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
@@ -20,6 +10,8 @@
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.


@@ -25,21 +25,14 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
asmlinkage void sha512_block_data_order(u64 *digest, const void *data,
asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha512_block_data_order);
static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
int blocks)
{
sha512_block_data_order(sst->state, src, blocks);
}
static int sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha512_base_do_update(desc, data, len,
__sha512_block_data_order);
(sha512_block_fn *)sha512_block_data_order);
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
@@ -47,8 +40,9 @@ static int sha512_finup(struct shash_desc *desc, const u8 *data,
{
if (len)
sha512_base_do_update(desc, data, len,
__sha512_block_data_order);
sha512_base_do_finalize(desc, __sha512_block_data_order);
(sha512_block_fn *)sha512_block_data_order);
sha512_base_do_finalize(desc,
(sha512_block_fn *)sha512_block_data_order);
return sha512_base_finish(desc, out);
}
@@ -68,6 +62,7 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha512",
.base.cra_driver_name = "sha512-arm64",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -80,6 +75,7 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha384",
.base.cra_driver_name = "sha384-arm64",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA384_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };


@@ -1,141 +0,0 @@
/*
* sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
.set .Lv\b\().4s, \b
.endr
.macro sm3partw1, rd, rn, rm
.inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sm3partw2, rd, rn, rm
.inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sm3ss1, rd, rn, rm, ra
.inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro sm3tt1a, rd, rn, rm, imm2
.inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt1b, rd, rn, rm, imm2
.inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt2a, rd, rn, rm, imm2
.inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt2b, rd, rn, rm, imm2
.inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro round, ab, s0, t0, t1, i
sm3ss1 v5.4s, v8.4s, \t0\().4s, v9.4s
shl \t1\().4s, \t0\().4s, #1
sri \t1\().4s, \t0\().4s, #31
sm3tt1\ab v8.4s, v5.4s, v10.4s, \i
sm3tt2\ab v9.4s, v5.4s, \s0\().4s, \i
.endm
.macro qround, ab, s0, s1, s2, s3, s4
.ifnb \s4
ext \s4\().16b, \s1\().16b, \s2\().16b, #12
ext v6.16b, \s0\().16b, \s1\().16b, #12
ext v7.16b, \s2\().16b, \s3\().16b, #8
sm3partw1 \s4\().4s, \s0\().4s, \s3\().4s
.endif
eor v10.16b, \s0\().16b, \s1\().16b
round \ab, \s0, v11, v12, 0
round \ab, \s0, v12, v11, 1
round \ab, \s0, v11, v12, 2
round \ab, \s0, v12, v11, 3
.ifnb \s4
sm3partw2 \s4\().4s, v7.4s, v6.4s
.endif
.endm
/*
* void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sm3_ce_transform)
/* load state */
ld1 {v8.4s-v9.4s}, [x0]
rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
adr_l x8, .Lt
ldp s13, s14, [x8]
/* load input */
0: ld1 {v0.16b-v3.16b}, [x1], #64
sub w2, w2, #1
mov v15.16b, v8.16b
mov v16.16b, v9.16b
CPU_LE( rev32 v0.16b, v0.16b )
CPU_LE( rev32 v1.16b, v1.16b )
CPU_LE( rev32 v2.16b, v2.16b )
CPU_LE( rev32 v3.16b, v3.16b )
ext v11.16b, v13.16b, v13.16b, #4
qround a, v0, v1, v2, v3, v4
qround a, v1, v2, v3, v4, v0
qround a, v2, v3, v4, v0, v1
qround a, v3, v4, v0, v1, v2
ext v11.16b, v14.16b, v14.16b, #4
qround b, v4, v0, v1, v2, v3
qround b, v0, v1, v2, v3, v4
qround b, v1, v2, v3, v4, v0
qround b, v2, v3, v4, v0, v1
qround b, v3, v4, v0, v1, v2
qround b, v4, v0, v1, v2, v3
qround b, v0, v1, v2, v3, v4
qround b, v1, v2, v3, v4, v0
qround b, v2, v3, v4, v0, v1
qround b, v3, v4
qround b, v4, v0
qround b, v0, v1
eor v8.16b, v8.16b, v15.16b
eor v9.16b, v9.16b, v16.16b
/* handled all input blocks? */
cbnz w2, 0b
/* save state */
rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
st1 {v8.4s-v9.4s}, [x0]
ret
SYM_FUNC_END(sm3_ce_transform)
.section ".rodata", "a"
.align 3
.Lt: .word 0x79cc4519, 0x9d8a7a87

View File

@ -1,92 +0,0 @@
/*
* sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sm3.h>
#include <crypto/sm3_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
int blocks);
static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
if (!may_use_simd())
return crypto_sm3_update(desc, data, len);
kernel_neon_begin();
sm3_base_do_update(desc, data, len, sm3_ce_transform);
kernel_neon_end();
return 0;
}
static int sm3_ce_final(struct shash_desc *desc, u8 *out)
{
if (!may_use_simd())
return crypto_sm3_finup(desc, NULL, 0, out);
kernel_neon_begin();
sm3_base_do_finalize(desc, sm3_ce_transform);
kernel_neon_end();
return sm3_base_finish(desc, out);
}
static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (!may_use_simd())
return crypto_sm3_finup(desc, data, len, out);
kernel_neon_begin();
sm3_base_do_update(desc, data, len, sm3_ce_transform);
kernel_neon_end();
return sm3_ce_final(desc, out);
}
static struct shash_alg sm3_alg = {
.digestsize = SM3_DIGEST_SIZE,
.init = sm3_base_init,
.update = sm3_ce_update,
.final = sm3_ce_final,
.finup = sm3_ce_finup,
.descsize = sizeof(struct sm3_state),
.base.cra_name = "sm3",
.base.cra_driver_name = "sm3-ce",
.base.cra_blocksize = SM3_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
};
static int __init sm3_ce_mod_init(void)
{
return crypto_register_shash(&sm3_alg);
}
static void __exit sm3_ce_mod_fini(void)
{
crypto_unregister_shash(&sm3_alg);
}
module_cpu_feature_match(SM3, sm3_ce_mod_init);
module_exit(sm3_ce_mod_fini);
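The glue code above follows the usual pattern for CE/NEON hash drivers: if SIMD cannot be used in the current context, fall back to the generic C implementation, otherwise bracket the accelerated transform with kernel_neon_begin()/kernel_neon_end(). A minimal standalone sketch of that shape, with demo_* stand-ins for the kernel helpers (assumptions, not the real API):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch only: demo_* functions are hypothetical stand-ins. */
static bool demo_may_use_simd(void)
{
	/* in the kernel this depends on context (no NEON in hard IRQ, etc.) */
	return true;
}

static void demo_neon_begin(void) { puts("NEON state saved, SIMD enabled"); }
static void demo_neon_end(void)   { puts("NEON state restored"); }

static int demo_generic_update(const unsigned char *data, unsigned int len)
{
	(void)data;
	printf("generic C path: %u bytes\n", len);
	return 0;
}

static int demo_ce_update(const unsigned char *data, unsigned int len)
{
	if (!demo_may_use_simd())
		return demo_generic_update(data, len);	/* fallback path */

	demo_neon_begin();
	printf("CE/NEON path: %u bytes\n", len);	/* accelerated transform */
	demo_neon_end();
	return 0;
}

int main(void)
{
	unsigned char buf[64] = { 0 };

	return demo_ce_update(buf, sizeof(buf));
}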

View File

@ -1,36 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
.set .Lv\b\().4s, \b
.endr
.macro sm4e, rd, rn
.inst 0xcec08400 | .L\rd | (.L\rn << 5)
.endm
/*
* void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
*/
.text
SYM_FUNC_START(sm4_ce_do_crypt)
ld1 {v8.4s}, [x2]
ld1 {v0.4s-v3.4s}, [x0], #64
CPU_LE( rev32 v8.16b, v8.16b )
ld1 {v4.4s-v7.4s}, [x0]
sm4e v8.4s, v0.4s
sm4e v8.4s, v1.4s
sm4e v8.4s, v2.4s
sm4e v8.4s, v3.4s
sm4e v8.4s, v4.4s
sm4e v8.4s, v5.4s
sm4e v8.4s, v6.4s
sm4e v8.4s, v7.4s
rev64 v8.4s, v8.4s
ext v8.16b, v8.16b, v8.16b, #8
CPU_LE( rev32 v8.16b, v8.16b )
st1 {v8.4s}, [x1]
ret
SYM_FUNC_END(sm4_ce_do_crypt)

View File

@ -1,74 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/sm4.h>
#include <crypto/internal/simd.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/types.h>
MODULE_ALIAS_CRYPTO("sm4");
MODULE_ALIAS_CRYPTO("sm4-ce");
MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
crypto_sm4_encrypt(tfm, out, in);
} else {
kernel_neon_begin();
sm4_ce_do_crypt(ctx->rkey_enc, out, in);
kernel_neon_end();
}
}
static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
crypto_sm4_decrypt(tfm, out, in);
} else {
kernel_neon_begin();
sm4_ce_do_crypt(ctx->rkey_dec, out, in);
kernel_neon_end();
}
}
static struct crypto_alg sm4_ce_alg = {
.cra_name = "sm4",
.cra_driver_name = "sm4-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_sm4_ctx),
.cra_module = THIS_MODULE,
.cra_u.cipher = {
.cia_min_keysize = SM4_KEY_SIZE,
.cia_max_keysize = SM4_KEY_SIZE,
.cia_setkey = crypto_sm4_set_key,
.cia_encrypt = sm4_ce_encrypt,
.cia_decrypt = sm4_ce_decrypt
}
};
static int __init sm4_ce_mod_init(void)
{
return crypto_register_alg(&sm4_ce_alg);
}
static void __exit sm4_ce_mod_fini(void)
{
crypto_unregister_alg(&sm4_ce_alg);
}
module_cpu_feature_match(SM4, sm4_ce_mod_init);
module_exit(sm4_ce_mod_fini);

View File

@ -1,352 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
*
* Copyright (c) 2018 Google, Inc
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
// arguments
ROUND_KEYS .req x0 // const {u64,u32} *round_keys
NROUNDS .req w1 // int nrounds
NROUNDS_X .req x1
DST .req x2 // void *dst
SRC .req x3 // const void *src
NBYTES .req w4 // unsigned int nbytes
TWEAK .req x5 // void *tweak
// registers which hold the data being encrypted/decrypted
// (underscores avoid a naming collision with ARM64 registers x0-x3)
X_0 .req v0
Y_0 .req v1
X_1 .req v2
Y_1 .req v3
X_2 .req v4
Y_2 .req v5
X_3 .req v6
Y_3 .req v7
// the round key, duplicated in all lanes
ROUND_KEY .req v8
// index vector for tbl-based 8-bit rotates
ROTATE_TABLE .req v9
ROTATE_TABLE_Q .req q9
// temporary registers
TMP0 .req v10
TMP1 .req v11
TMP2 .req v12
TMP3 .req v13
// multiplication table for updating XTS tweaks
GFMUL_TABLE .req v14
GFMUL_TABLE_Q .req q14
// next XTS tweak value(s)
TWEAKV_NEXT .req v15
// XTS tweaks for the blocks currently being encrypted/decrypted
TWEAKV0 .req v16
TWEAKV1 .req v17
TWEAKV2 .req v18
TWEAKV3 .req v19
TWEAKV4 .req v20
TWEAKV5 .req v21
TWEAKV6 .req v22
TWEAKV7 .req v23
.align 4
.Lror64_8_table:
.octa 0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
.octa 0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
.octa 0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
.octa 0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
.octa 0x00000000000000870000000000000001
.Lgf64mul_table:
.octa 0x0000000000000000000000002d361b00
/*
* _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
*
* Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
* Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
* of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
* 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
*/
.macro _speck_round_128bytes n, lanes
// x = ror(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
// x += y
add X_0.\lanes, X_0.\lanes, Y_0.\lanes
add X_1.\lanes, X_1.\lanes, Y_1.\lanes
add X_2.\lanes, X_2.\lanes, Y_2.\lanes
add X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// y = rol(y, 3)
shl TMP0.\lanes, Y_0.\lanes, #3
shl TMP1.\lanes, Y_1.\lanes, #3
shl TMP2.\lanes, Y_2.\lanes, #3
shl TMP3.\lanes, Y_3.\lanes, #3
sri TMP0.\lanes, Y_0.\lanes, #(\n - 3)
sri TMP1.\lanes, Y_1.\lanes, #(\n - 3)
sri TMP2.\lanes, Y_2.\lanes, #(\n - 3)
sri TMP3.\lanes, Y_3.\lanes, #(\n - 3)
// y ^= x
eor Y_0.16b, TMP0.16b, X_0.16b
eor Y_1.16b, TMP1.16b, X_1.16b
eor Y_2.16b, TMP2.16b, X_2.16b
eor Y_3.16b, TMP3.16b, X_3.16b
.endm
/*
* _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
*
* This is the inverse of _speck_round_128bytes().
*/
.macro _speck_unround_128bytes n, lanes
// y ^= x
eor TMP0.16b, Y_0.16b, X_0.16b
eor TMP1.16b, Y_1.16b, X_1.16b
eor TMP2.16b, Y_2.16b, X_2.16b
eor TMP3.16b, Y_3.16b, X_3.16b
// y = ror(y, 3)
ushr Y_0.\lanes, TMP0.\lanes, #3
ushr Y_1.\lanes, TMP1.\lanes, #3
ushr Y_2.\lanes, TMP2.\lanes, #3
ushr Y_3.\lanes, TMP3.\lanes, #3
sli Y_0.\lanes, TMP0.\lanes, #(\n - 3)
sli Y_1.\lanes, TMP1.\lanes, #(\n - 3)
sli Y_2.\lanes, TMP2.\lanes, #(\n - 3)
sli Y_3.\lanes, TMP3.\lanes, #(\n - 3)
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// x -= y
sub X_0.\lanes, X_0.\lanes, Y_0.\lanes
sub X_1.\lanes, X_1.\lanes, Y_1.\lanes
sub X_2.\lanes, X_2.\lanes, Y_2.\lanes
sub X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x = rol(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm
.macro _next_xts_tweak next, cur, tmp, n
.if \n == 64
/*
* Calculate the next tweak by multiplying the current one by x,
* modulo p(x) = x^128 + x^7 + x^2 + x + 1.
*/
sshr \tmp\().2d, \cur\().2d, #63
and \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
shl \next\().2d, \cur\().2d, #1
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \next\().16b, \next\().16b, \tmp\().16b
.else
/*
* Calculate the next two tweaks by multiplying the current ones by x^2,
* modulo p(x) = x^64 + x^4 + x^3 + x + 1.
*/
ushr \tmp\().2d, \cur\().2d, #62
shl \next\().2d, \cur\().2d, #2
tbl \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
eor \next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
/*
* _speck_xts_crypt() - Speck-XTS encryption/decryption
*
* Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
* using Speck-XTS, specifically the variant with a block size of '2n' and round
* count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
* the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
* nonzero multiple of 128.
*/
.macro _speck_xts_crypt n, lanes, decrypting
/*
* If decrypting, modify the ROUND_KEYS parameter to point to the last
* round key rather than the first, since for decryption the round keys
* are used in reverse order.
*/
.if \decrypting
mov NROUNDS, NROUNDS /* zero the high 32 bits */
.if \n == 64
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
sub ROUND_KEYS, ROUND_KEYS, #8
.else
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
sub ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif
// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
ldr ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
ldr ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif
// One-time XTS preparation
.if \n == 64
// Load first tweak
ld1 {TWEAKV0.16b}, [TWEAK]
// Load GF(2^128) multiplication table
ldr GFMUL_TABLE_Q, .Lgf128mul_table
.else
// Load first tweak
ld1 {TWEAKV0.8b}, [TWEAK]
// Load GF(2^64) multiplication table
ldr GFMUL_TABLE_Q, .Lgf64mul_table
// Calculate second tweak, packing it together with the first
ushr TMP0.2d, TWEAKV0.2d, #63
shl TMP1.2d, TWEAKV0.2d, #1
tbl TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
eor TMP0.8b, TMP0.8b, TMP1.8b
mov TWEAKV0.d[1], TMP0.d[0]
.endif
.Lnext_128bytes_\@:
// Calculate XTS tweaks for next 128 bytes
_next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
_next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
_next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
_next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
_next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
_next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
_next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
_next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
// Load the next source blocks into {X,Y}[0-3]
ld1 {X_0.16b-Y_1.16b}, [SRC], #64
ld1 {X_2.16b-Y_3.16b}, [SRC], #64
// XOR the source blocks with their XTS tweaks
eor TMP0.16b, X_0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor TMP1.16b, X_1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor TMP2.16b, X_2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor TMP3.16b, X_3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
/*
* De-interleave the 'x' and 'y' elements of each block, i.e. make it so
* that the X[0-3] registers contain only the second halves of blocks,
* and the Y[0-3] registers contain only the first halves of blocks.
* (Speck uses the order (y, x) rather than the more intuitive (x, y).)
*/
uzp2 X_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp1 Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp2 X_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp1 Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp2 X_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp1 Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp2 X_3.\lanes, TMP3.\lanes, Y_3.\lanes
uzp1 Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
// Do the cipher rounds
mov x6, ROUND_KEYS
mov w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
ld1r {ROUND_KEY.\lanes}, [x6]
sub x6, x6, #( \n / 8 )
_speck_unround_128bytes \n, \lanes
.else
ld1r {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
_speck_round_128bytes \n, \lanes
.endif
subs w7, w7, #1
bne .Lnext_round_\@
// Re-interleave the 'x' and 'y' elements of each block
zip1 TMP0.\lanes, Y_0.\lanes, X_0.\lanes
zip2 Y_0.\lanes, Y_0.\lanes, X_0.\lanes
zip1 TMP1.\lanes, Y_1.\lanes, X_1.\lanes
zip2 Y_1.\lanes, Y_1.\lanes, X_1.\lanes
zip1 TMP2.\lanes, Y_2.\lanes, X_2.\lanes
zip2 Y_2.\lanes, Y_2.\lanes, X_2.\lanes
zip1 TMP3.\lanes, Y_3.\lanes, X_3.\lanes
zip2 Y_3.\lanes, Y_3.\lanes, X_3.\lanes
// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
eor X_0.16b, TMP0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor X_1.16b, TMP1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor X_2.16b, TMP2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor X_3.16b, TMP3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
mov TWEAKV0.16b, TWEAKV_NEXT.16b
// Store the ciphertext in the destination buffer
st1 {X_0.16b-Y_1.16b}, [DST], #64
st1 {X_2.16b-Y_3.16b}, [DST], #64
// Continue if there are more 128-byte chunks remaining
subs NBYTES, NBYTES, #128
bne .Lnext_128bytes_\@
// Store the next tweak and return
.if \n == 64
st1 {TWEAKV_NEXT.16b}, [TWEAK]
.else
st1 {TWEAKV_NEXT.8b}, [TWEAK]
.endif
ret
.endm
ENTRY(speck128_xts_encrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)
ENTRY(speck128_xts_decrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)
ENTRY(speck64_xts_encrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)
ENTRY(speck64_xts_decrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)
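The round macros above vectorize the scalar Speck round described in the comments: x = ror(x, 8); x += y; x ^= k; y = rol(y, 3); y ^= x. For reference, a standalone C sketch of one Speck128 encryption round on a single block (helper names and the round key below are illustrative, not a test vector):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch only: one scalar Speck128 round, matching the per-lane
 * operation _speck_round_128bytes performs on eight blocks at once with NEON.
 */
static inline uint64_t ror64(uint64_t v, unsigned int n)
{
	return (v >> n) | (v << (64 - n));
}

static inline uint64_t rol64(uint64_t v, unsigned int n)
{
	return (v << n) | (v >> (64 - n));
}

static void speck128_round(uint64_t *x, uint64_t *y, uint64_t k)
{
	*x = ror64(*x, 8);	/* x = ror(x, 8) */
	*x += *y;		/* x += y       */
	*x ^= k;		/* x ^= k       */
	*y = rol64(*y, 3);	/* y = rol(y, 3) */
	*y ^= *x;		/* y ^= x       */
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL, y = 0xfedcba9876543210ULL;

	speck128_round(&x, &y, 0x0f0e0d0c0b0a0908ULL);	/* arbitrary round key */
	printf("x=%016llx y=%016llx\n",
	       (unsigned long long)x, (unsigned long long)y);
	return 0;
}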

View File

@ -1,282 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
* (64-bit version; based on the 32-bit version)
*
* Copyright (c) 2018 Google, Inc
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/algapi.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/skcipher.h>
#include <crypto/speck.h>
#include <crypto/xts.h>
#include <linux/kernel.h>
#include <linux/module.h>
/* The assembly functions only handle multiples of 128 bytes */
#define SPECK_NEON_CHUNK_SIZE 128
/* Speck128 */
struct speck128_xts_tfm_ctx {
struct speck128_tfm_ctx main_key;
struct speck128_tfm_ctx tweak_key;
};
asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck128_xts_crypt(struct skcipher_request *req,
speck128_crypt_one_t crypt_one,
speck128_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
le128 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
le128_xor((le128 *)dst, (const le128 *)src, &tweak);
(*crypt_one)(&ctx->main_key, dst, dst);
le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
gf128mul_x_ble(&tweak, &tweak);
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck128_xts_encrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_encrypt,
speck128_xts_encrypt_neon);
}
static int speck128_xts_decrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_decrypt,
speck128_xts_decrypt_neon);
}
static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
}
/* Speck64 */
struct speck64_xts_tfm_ctx {
struct speck64_tfm_ctx main_key;
struct speck64_tfm_ctx tweak_key;
};
asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
speck64_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
__le64 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
*(__le64 *)dst = *(__le64 *)src ^ tweak;
(*crypt_one)(&ctx->main_key, dst, dst);
*(__le64 *)dst ^= tweak;
tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
((tweak & cpu_to_le64(1ULL << 63)) ?
0x1B : 0));
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck64_xts_encrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_encrypt,
speck64_xts_encrypt_neon);
}
static int speck64_xts_decrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_decrypt,
speck64_xts_decrypt_neon);
}
static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
}
static struct skcipher_alg speck_algs[] = {
{
.base.cra_name = "xts(speck128)",
.base.cra_driver_name = "xts-speck128-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK128_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK128_128_KEY_SIZE,
.max_keysize = 2 * SPECK128_256_KEY_SIZE,
.ivsize = SPECK128_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck128_xts_setkey,
.encrypt = speck128_xts_encrypt,
.decrypt = speck128_xts_decrypt,
}, {
.base.cra_name = "xts(speck64)",
.base.cra_driver_name = "xts-speck64-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK64_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK64_96_KEY_SIZE,
.max_keysize = 2 * SPECK64_128_KEY_SIZE,
.ivsize = SPECK64_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck64_xts_setkey,
.encrypt = speck64_xts_encrypt,
.decrypt = speck64_xts_decrypt,
}
};
static int __init speck_neon_module_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
static void __exit speck_neon_module_exit(void)
{
crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
module_init(speck_neon_module_init);
module_exit(speck_neon_module_exit);
MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("xts(speck128)");
MODULE_ALIAS_CRYPTO("xts-speck128-neon");
MODULE_ALIAS_CRYPTO("xts(speck64)");
MODULE_ALIAS_CRYPTO("xts-speck64-neon");

View File

@ -532,22 +532,6 @@ alternative_endif
and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
.endm
/*
* Check whether preempt-disabled code should yield as soon as it
* is able. This is the case if re-enabling preemption a single
* time results in a preempt count of zero, and the TIF_NEED_RESCHED
* flag is set. (Note that the latter is stored negated in the
* top word of the thread_info::preempt_count field)
*/
.macro cond_yield, lbl:req, tmp:req
#ifdef CONFIG_PREEMPTION
get_current_task \tmp
ldr \tmp, [\tmp, #TSK_TI_PREEMPT]
sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
cbz \tmp, \lbl
#endif
.endm
/*
* Check the MIDR_EL1 of the current CPU for a given model and a range of
* variant/revision. See asm/cputype.h for the macros used below.
@ -587,67 +571,4 @@ alternative_endif
.Ldone\@:
.endm
/*
* frame_push - Push @regcount callee saved registers to the stack,
* starting at x19, as well as x29/x30, and set x29 to
* the new value of sp. Add @extra bytes of stack space
* for locals.
*/
.macro frame_push, regcount:req, extra
__frame st, \regcount, \extra
.endm
/*
* frame_pop - Pop the callee saved registers from the stack that were
* pushed in the most recent call to frame_push, as well
* as x29/x30 and any extra stack space that may have been
* allocated.
*/
.macro frame_pop
__frame ld
.endm
.macro __frame_regs, reg1, reg2, op, num
.if .Lframe_regcount == \num
\op\()r \reg1, [sp, #(\num + 1) * 8]
.elseif .Lframe_regcount > \num
\op\()p \reg1, \reg2, [sp, #(\num + 1) * 8]
.endif
.endm
.macro __frame, op, regcount, extra=0
.ifc \op, st
.if (\regcount) < 0 || (\regcount) > 10
.error "regcount should be in the range [0 ... 10]"
.endif
.if ((\extra) % 16) != 0
.error "extra should be a multiple of 16 bytes"
.endif
.ifdef .Lframe_regcount
.if .Lframe_regcount != -1
.error "frame_push/frame_pop may not be nested"
.endif
.endif
.set .Lframe_regcount, \regcount
.set .Lframe_extra, \extra
.set .Lframe_local_offset, ((\regcount + 3) / 2) * 16
stp x29, x30, [sp, #-.Lframe_local_offset - .Lframe_extra]!
mov x29, sp
.endif
__frame_regs x19, x20, \op, 1
__frame_regs x21, x22, \op, 3
__frame_regs x23, x24, \op, 5
__frame_regs x25, x26, \op, 7
__frame_regs x27, x28, \op, 9
.ifc \op, ld
.if .Lframe_regcount == -1
.error "frame_push/frame_pop may not be nested"
.endif
ldp x29, x30, [sp], #.Lframe_local_offset + .Lframe_extra
.set .Lframe_regcount, -1
.endif
.endm
#endif /* __ASM_ASSEMBLER_H */
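For reference, the __frame macro above reserves ((regcount + 3) / 2) * 16 bytes before any 'extra' locals: x29/x30 plus regcount registers starting at x19, rounded up to whole stp/ldp pairs. A small C sketch that just evaluates that arithmetic over the allowed regcount range:

#include <stdio.h>

/*
 * Illustrative arithmetic only: frame size reserved by frame_push for each
 * allowed regcount (0..10), excluding the caller-supplied 'extra' bytes.
 */
int main(void)
{
	for (int regcount = 0; regcount <= 10; regcount++)
		printf("regcount=%2d -> frame (excl. extra) = %3d bytes\n",
		       regcount, ((regcount + 3) / 2) * 16);
	return 0;
}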

View File

@ -14,8 +14,15 @@
#include <asm/hwcap.h>
#include <asm/sysreg.h>
#define MAX_CPU_FEATURES 64
#define cpu_feature(x) KERNEL_HWCAP_ ## x
/*
* In the arm64 world (as in the ARM world), elf_hwcap is used both internally
* in the kernel and for user space to keep track of which optional features
* are supported by the current system. So let's map feature 'x' to HWCAP_x.
* Note that HWCAP_x constants are bit fields so we need to take the log.
*/
#define MAX_CPU_FEATURES (8 * sizeof(elf_hwcap))
#define cpu_feature(x) ilog2(HWCAP_ ## x)
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
@ -340,19 +347,10 @@ extern struct static_key_false arm64_const_caps_ready;
bool this_cpu_has_cap(unsigned int cap);
static inline void cpu_set_feature(unsigned int num)
{
WARN_ON(num >= MAX_CPU_FEATURES);
elf_hwcap |= BIT(num);
}
#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
static inline bool cpu_have_feature(unsigned int num)
{
WARN_ON(num >= MAX_CPU_FEATURES);
return elf_hwcap & BIT(num);
return elf_hwcap & (1UL << num);
}
#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
/* System capability check for constant caps */
static inline bool __cpus_have_const_cap(int num)
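The comment in the hunk above explains the mapping being restored: the uapi HWCAP_* constants are single-bit masks, so cpu_feature(x) takes their base-2 log to get the bit index that cpu_{set,have}_feature() operate on. A standalone C sketch of that mapping with hypothetical DEMO_* constants and a local ilog2 substitute:

#include <stdio.h>

/*
 * Illustrative sketch only: DEMO_* masks and demo_* helpers are hypothetical
 * stand-ins, not the arm64 definitions.
 */
#define DEMO_HWCAP_AES	(1UL << 3)	/* single-bit mask, uapi style */
#define DEMO_HWCAP_SHA2	(1UL << 6)

static unsigned int demo_ilog2(unsigned long mask)
{
	unsigned int n = 0;

	while (mask >>= 1)	/* mask -> bit index */
		n++;
	return n;
}

static unsigned long demo_elf_hwcap;

static void demo_set_feature(unsigned int num)
{
	demo_elf_hwcap |= 1UL << num;
}

static int demo_have_feature(unsigned int num)
{
	return (demo_elf_hwcap & (1UL << num)) != 0;
}

int main(void)
{
	/* equivalent to demo_elf_hwcap |= DEMO_HWCAP_SHA2 */
	demo_set_feature(demo_ilog2(DEMO_HWCAP_SHA2));

	printf("SHA2: %d, AES: %d\n",
	       demo_have_feature(demo_ilog2(DEMO_HWCAP_SHA2)),
	       demo_have_feature(demo_ilog2(DEMO_HWCAP_AES)));
	return 0;
}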

View File

@ -40,61 +40,11 @@
#define COMPAT_HWCAP2_CRC32 (1 << 4)
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
#include <linux/log2.h>
/*
* For userspace we represent hwcaps as a collection of HWCAP{,2}_x bitfields
* as described in uapi/asm/hwcap.h. For the kernel we represent hwcaps as
* natural numbers (in a single range of size MAX_CPU_FEATURES) defined here
* with prefix KERNEL_HWCAP_ mapped to their HWCAP{,2}_x counterpart.
*
* Hwcaps should be set and tested within the kernel via the
* cpu_{set,have}_named_feature(feature) where feature is the unique suffix
* of KERNEL_HWCAP_{feature}.
*/
#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x)
#define KERNEL_HWCAP_FP __khwcap_feature(FP)
#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD)
#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM)
#define KERNEL_HWCAP_AES __khwcap_feature(AES)
#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL)
#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1)
#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2)
#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32)
#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS)
#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP)
#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP)
#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID)
#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM)
#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT)
#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA)
#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC)
#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP)
#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3)
#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3)
#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4)
#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP)
#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512)
#define KERNEL_HWCAP_SVE __khwcap_feature(SVE)
#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM)
#define KERNEL_HWCAP_DIT __khwcap_feature(DIT)
#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT)
#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC)
#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM)
#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS)
#define KERNEL_HWCAP_SB __khwcap_feature(SB)
#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32)
/*
* This yields a mask that user programs can use to figure out what
* instruction set this cpu supports.
*/
#define ELF_HWCAP lower_32_bits(elf_hwcap)
#define ELF_HWCAP2 upper_32_bits(elf_hwcap)
#define ELF_HWCAP (elf_hwcap)
#ifdef CONFIG_COMPAT
#define COMPAT_ELF_HWCAP (compat_elf_hwcap)
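With the KERNEL_HWCAP_* numbering being reverted here, hwcaps live in a single 64-bit elf_hwcap and are split back into the two 32-bit auxv words, whereas after the revert ELF_HWCAP exposes elf_hwcap directly. A small standalone C sketch of that split (local lower/upper_32_bits stand-ins, example bit positions chosen arbitrarily):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only: local stand-ins for the kernel helpers. */
static uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }
static uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t elf_hwcap = (1ULL << 6) |	/* e.g. an AT_HWCAP bit        */
			     (1ULL << 35);	/* e.g. an AT_HWCAP2 bit + 32  */

	printf("AT_HWCAP  = 0x%08x\n", lower_32_bits(elf_hwcap));
	printf("AT_HWCAP2 = 0x%08x\n", upper_32_bits(elf_hwcap));
	return 0;
}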

View File

@ -18,7 +18,7 @@
#define _UAPI__ASM_HWCAP_H
/*
* HWCAP flags - for AT_HWCAP
* HWCAP flags - for elf_hwcap (in kernel) and AT_HWCAP
*/
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)

View File

@ -1258,32 +1258,32 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
}
static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT),
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FLAGM),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_DIT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ILRCPC),
HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_USCAT),
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS),
{},
};
@ -1329,7 +1329,7 @@ static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap)
{
switch (cap->hwcap_type) {
case CAP_HWCAP:
cpu_set_feature(cap->hwcap);
elf_hwcap |= cap->hwcap;
break;
#ifdef CONFIG_COMPAT
case CAP_COMPAT_HWCAP:
@ -1352,7 +1352,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
switch (cap->hwcap_type) {
case CAP_HWCAP:
rc = cpu_have_feature(cap->hwcap);
rc = (elf_hwcap & cap->hwcap) != 0;
break;
#ifdef CONFIG_COMPAT
case CAP_COMPAT_HWCAP:
@ -1373,7 +1373,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
{
/* We support emulation of accesses to CPU ID feature registers */
cpu_set_named_feature(CPUID);
elf_hwcap |= HWCAP_CPUID;
for (; hwcaps->matches; hwcaps++)
if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps)))
cap_set_elf_hwcap(hwcaps);

View File

@ -172,7 +172,7 @@ static int c_show(struct seq_file *m, void *v)
#endif /* CONFIG_COMPAT */
} else {
for (j = 0; hwcap_str[j]; j++)
if (cpu_have_feature(j))
if (elf_hwcap & (1 << j))
seq_printf(m, " %s", hwcap_str[j]);
}
seq_puts(m, "\n");

View File

@ -444,14 +444,14 @@ static inline void fpsimd_hotplug_init(void) { }
*/
static int __init fpsimd_init(void)
{
if (cpu_have_named_feature(FP)) {
if (elf_hwcap & HWCAP_FP) {
fpsimd_pm_init();
fpsimd_hotplug_init();
} else {
pr_notice("Floating-point is not implemented\n");
}
if (!cpu_have_named_feature(ASIMD))
if (!(elf_hwcap & HWCAP_ASIMD))
pr_notice("Advanced SIMD is not implemented\n");
return 0;

View File

@ -12,7 +12,7 @@
#include <asm/alternative.h>
#include <asm/assembler.h>
.cpu generic+crc
.arch armv8-a+crc
.macro __crc32, c
cmp x2, #16

View File

@ -182,6 +182,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "octeon-md5",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -215,6 +215,7 @@ static struct shash_alg octeon_sha1_alg = {
.cra_name = "sha1",
.cra_driver_name= "octeon-sha1",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -239,6 +239,7 @@ static struct shash_alg octeon_sha256_algs[2] = { {
.cra_name = "sha256",
.cra_driver_name= "octeon-sha256",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -251,6 +252,7 @@ static struct shash_alg octeon_sha256_algs[2] = { {
.base = {
.cra_name = "sha224",
.cra_driver_name= "octeon-sha224",
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -235,6 +235,7 @@ static struct shash_alg octeon_sha512_algs[2] = { {
.cra_name = "sha512",
.cra_driver_name= "octeon-sha512",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -248,6 +249,7 @@ static struct shash_alg octeon_sha512_algs[2] = { {
.cra_name = "sha384",
.cra_driver_name= "octeon-sha384",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -139,6 +139,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "md5-ppc",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -185,6 +185,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -132,6 +132,7 @@ static struct shash_alg alg = {
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-powerpc",
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -231,6 +231,7 @@ static struct shash_alg algs[2] = { {
.cra_name = "sha256",
.cra_driver_name= "sha256-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -247,6 +248,7 @@ static struct shash_alg algs[2] = { {
.cra_name = "sha224",
.cra_driver_name= "sha224-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -127,6 +127,7 @@ static struct shash_alg ghash_alg = {
.cra_name = "ghash",
.cra_driver_name = "ghash-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_ctx),
.cra_module = THIS_MODULE,

View File

@ -83,6 +83,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -76,6 +76,7 @@ static struct shash_alg sha256_alg = {
.cra_name = "sha256",
.cra_driver_name= "sha256-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -112,6 +113,7 @@ static struct shash_alg sha224_alg = {
.cra_name = "sha224",
.cra_driver_name= "sha224-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -81,6 +81,7 @@ static struct shash_alg sha512_alg = {
.cra_name = "sha512",
.cra_driver_name= "sha512-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -119,6 +120,7 @@ static struct shash_alg sha384_alg = {
.cra_name = "sha384",
.cra_driver_name= "sha384-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct s390_sha_ctx),
.cra_module = THIS_MODULE,

View File

@ -196,14 +196,14 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
return 0;
}
static void crypto_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
ctx->ops->encrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
}
static void crypto_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
@ -395,8 +395,8 @@ static struct crypto_alg algs[] = { {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = crypto_aes_encrypt,
.cia_decrypt = crypto_aes_decrypt
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
}, {

View File

@ -144,6 +144,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "md5-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -139,6 +139,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -169,6 +169,7 @@ static struct shash_alg sha256 = {
.cra_name = "sha256",
.cra_driver_name= "sha256-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -184,6 +185,7 @@ static struct shash_alg sha224 = {
.cra_name = "sha224",
.cra_driver_name= "sha224-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -154,6 +154,7 @@ static struct shash_alg sha512 = {
.cra_name = "sha512",
.cra_driver_name= "sha512-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -169,6 +170,7 @@ static struct shash_alg sha384 = {
.cra_name = "sha384",
.cra_driver_name= "sha384-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -328,7 +328,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
}
static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@ -341,7 +341,7 @@ static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
}
}
static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@ -973,8 +973,8 @@ static struct crypto_alg aesni_algs[] = { {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = aesni_encrypt,
.cia_decrypt = aesni_decrypt
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
}, {

View File

@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
.cra_name = "__ghash",
.cra_driver_name = "__ghash-pclmulqdqni",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_flags = CRYPTO_ALG_TYPE_SHASH |
CRYPTO_ALG_INTERNAL,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_ctx),
.cra_module = THIS_MODULE,

View File

@ -171,6 +171,7 @@ static struct shash_alg alg = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_alignmask = sizeof(u32) - 1,
.cra_blocksize = POLY1305_BLOCK_SIZE,
.cra_module = THIS_MODULE,

View File

@ -100,6 +100,7 @@ static struct shash_alg sha1_ssse3_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -150,6 +151,7 @@ static struct shash_alg sha1_avx_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -239,6 +241,7 @@ static struct shash_alg sha1_avx2_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -294,6 +297,7 @@ static struct shash_alg sha1_ni_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -109,6 +109,7 @@ static struct shash_alg sha256_ssse3_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -123,6 +124,7 @@ static struct shash_alg sha256_ssse3_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -175,6 +177,7 @@ static struct shash_alg sha256_avx_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -189,6 +192,7 @@ static struct shash_alg sha256_avx_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -257,6 +261,7 @@ static struct shash_alg sha256_avx2_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -271,6 +276,7 @@ static struct shash_alg sha256_avx2_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -337,6 +343,7 @@ static struct shash_alg sha256_ni_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -351,6 +358,7 @@ static struct shash_alg sha256_ni_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -108,6 +108,7 @@ static struct shash_alg sha512_ssse3_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -122,6 +123,7 @@ static struct shash_alg sha512_ssse3_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -185,6 +187,7 @@ static struct shash_alg sha512_avx_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -199,6 +202,7 @@ static struct shash_alg sha512_avx_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -256,6 +260,7 @@ static struct shash_alg sha512_avx2_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -270,6 +275,7 @@ static struct shash_alg sha512_avx2_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -195,7 +195,7 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
bool "Enable crypto API fallback for blk-crypto"
depends on BLK_INLINE_ENCRYPTION
select CRYPTO
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Enabling this lets the block layer handle inline encryption
by falling back to the kernel crypto API when inline

View File

@ -52,12 +52,12 @@ config CRYPTO_AEAD2
select CRYPTO_NULL2
select CRYPTO_RNG2
config CRYPTO_SKCIPHER
config CRYPTO_BLKCIPHER
tristate
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_ALGAPI
config CRYPTO_SKCIPHER2
config CRYPTO_BLKCIPHER2
tristate
select CRYPTO_ALGAPI2
select CRYPTO_RNG2
@ -146,7 +146,7 @@ config CRYPTO_MANAGER2
def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y)
select CRYPTO_AEAD2
select CRYPTO_HASH2
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_AKCIPHER2
select CRYPTO_KPP2
select CRYPTO_ACOMP2
@ -185,7 +185,7 @@ config CRYPTO_NULL
config CRYPTO_NULL2
tristate
select CRYPTO_ALGAPI2
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_HASH2
config CRYPTO_PCRYPT
@ -203,7 +203,7 @@ config CRYPTO_WORKQUEUE
config CRYPTO_CRYPTD
tristate "Software async crypto daemon"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_HASH
select CRYPTO_MANAGER
select CRYPTO_WORKQUEUE
@ -214,7 +214,7 @@ config CRYPTO_CRYPTD
config CRYPTO_MCRYPTD
tristate "Software async multi-buffer crypto daemon"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_HASH
select CRYPTO_MANAGER
select CRYPTO_WORKQUEUE
@ -229,7 +229,7 @@ config CRYPTO_MCRYPTD
config CRYPTO_AUTHENC
tristate "Authenc support"
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_HASH
select CRYPTO_NULL
@ -255,7 +255,7 @@ config CRYPTO_SIMD
config CRYPTO_GLUE_HELPER_X86
tristate
depends on X86
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
config CRYPTO_ENGINE
tristate
@ -295,7 +295,7 @@ config CRYPTO_CHACHA20POLY1305
config CRYPTO_SEQIV
tristate "Sequence Number IV Generator"
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_NULL
select CRYPTO_RNG_DEFAULT
help
@ -317,7 +317,7 @@ comment "Block modes"
config CRYPTO_CBC
tristate "CBC support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
CBC: Cipher Block Chaining mode
@ -325,7 +325,7 @@ config CRYPTO_CBC
config CRYPTO_CTR
tristate "CTR support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SEQIV
select CRYPTO_MANAGER
help
@ -334,7 +334,7 @@ config CRYPTO_CTR
config CRYPTO_CTS
tristate "CTS support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
CTS: Cipher Text Stealing
This is the Cipher Text Stealing mode as described by
@ -345,7 +345,7 @@ config CRYPTO_CTS
config CRYPTO_ECB
tristate "ECB support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
ECB: Electronic CodeBook mode
@ -354,7 +354,7 @@ config CRYPTO_ECB
config CRYPTO_LRW
tristate "LRW support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_GF128MUL
help
@ -366,7 +366,7 @@ config CRYPTO_LRW
config CRYPTO_PCBC
tristate "PCBC support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
PCBC: Propagating Cipher Block Chaining mode
@ -374,7 +374,7 @@ config CRYPTO_PCBC
config CRYPTO_XTS
tristate "XTS support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_ECB
help
@ -384,7 +384,7 @@ config CRYPTO_XTS
config CRYPTO_KEYWRAP
tristate "Key wrapping support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Support for key wrapping (NIST SP800-38F / RFC3394) without
padding.
@ -888,17 +888,6 @@ config CRYPTO_SHA3
References:
http://keccak.noekeon.org/
config CRYPTO_SM3
tristate "SM3 digest algorithm"
select CRYPTO_HASH
help
SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3).
It is part of the Chinese Commercial Cryptography suite.
References:
http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash
config CRYPTO_TGR192
tristate "Tiger digest algorithms"
select CRYPTO_HASH
@ -934,9 +923,6 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL
comment "Ciphers"
config CRYPTO_LIB_AES
tristate
config CRYPTO_AES
tristate "AES cipher algorithms"
select CRYPTO_ALGAPI
@ -960,7 +946,6 @@ config CRYPTO_AES
config CRYPTO_AES_TI
tristate "Fixed time AES cipher"
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
help
This is a generic implementation of AES that attempts to eliminate
data dependent latencies as much as possible without affecting
@ -1027,7 +1012,7 @@ config CRYPTO_AES_NI_INTEL
select CRYPTO_AES_X86_64 if 64BIT
select CRYPTO_AES_586 if !64BIT
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_GLUE_HELPER_X86 if 64BIT
select CRYPTO_SIMD
help
@ -1111,7 +1096,7 @@ config CRYPTO_ANUBIS
config CRYPTO_ARC4
tristate "ARC4 cipher algorithm"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
ARC4 cipher algorithm.
@ -1339,7 +1324,7 @@ config CRYPTO_DES3_EDE_X86_64
config CRYPTO_FCRYPT
tristate "FCrypt cipher algorithm"
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
FCrypt algorithm used by RxRPC.
@ -1358,7 +1343,7 @@ config CRYPTO_KHAZAD
config CRYPTO_SALSA20
tristate "Salsa20 stream cipher algorithm"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Salsa20 stream cipher algorithm.
@ -1370,7 +1355,7 @@ config CRYPTO_SALSA20
config CRYPTO_CHACHA20
tristate "ChaCha stream cipher algorithms"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
@ -1392,7 +1377,7 @@ config CRYPTO_CHACHA20
config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
help
ChaCha20 cipher algorithm, RFC7539.
@ -1431,31 +1416,6 @@ config CRYPTO_SERPENT
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
config CRYPTO_SM4
tristate "SM4 cipher algorithm"
select CRYPTO_ALGAPI
help
SM4 cipher algorithms (OSCCA GB/T 32907-2016).
SM4 (GBT.32907-2016) is a cryptographic standard issued by the
Organization of State Commercial Administration of China (OSCCA)
as an authorized cryptographic algorithms for the use within China.
SMS4 was originally created for use in protecting wireless
networks, and is mandated in the Chinese National Standard for
Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
(GB.15629.11-2003).
The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
standardized through TC 260 of the Standardization Administration
of the People's Republic of China (SAC).
The input, output, and key of SMS4 are each 128 bits.
See also: <https://eprint.iacr.org/2008/329.pdf>
If unsure, say N.
config CRYPTO_SERPENT_SSE2_X86_64
tristate "Serpent cipher algorithm (x86_64/SSE2)"
depends on X86 && 64BIT
@ -1795,7 +1755,7 @@ config CRYPTO_USER_API_HASH
config CRYPTO_USER_API_SKCIPHER
tristate "User-space interface for symmetric key cipher algorithms"
depends on NET
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_USER_API
help
This option enables the user-spaces interface for symmetric
@ -1814,7 +1774,7 @@ config CRYPTO_USER_API_AEAD
tristate "User-space interface for AEAD cipher algorithms"
depends on NET
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_NULL
select CRYPTO_USER_API
help

View File

@ -17,9 +17,10 @@ obj-$(CONFIG_CRYPTO_ALGAPI2) += crypto_algapi.o
obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
crypto_skcipher-y := ablkcipher.o blkcipher.o
crypto_skcipher-y += skcipher.o
obj-$(CONFIG_CRYPTO_SKCIPHER2) += crypto_skcipher.o
crypto_blkcipher-y := ablkcipher.o
crypto_blkcipher-y += blkcipher.o
crypto_blkcipher-y += skcipher.o
obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o
obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o
@ -70,7 +71,6 @@ obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o
obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
@ -101,7 +101,6 @@ obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
CFLAGS_serpent_generic.o := $(call cc-option,-fsched-pressure) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
CFLAGS_aes_generic.o := $(call cc-option,-fno-code-hoisting) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
obj-$(CONFIG_CRYPTO_SM4) += sm4_generic.o
obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o

Some files were not shown because too many files have changed in this diff.