[SQUASH] treewide: Revert backported crypto algos

* breaks VoWiFi; besides, I saw little to no improvement
  in Geekbench AES-XTS scores with these
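
For reference, one quick way to confirm which xts(aes) backend the crypto API ends up
selecting after this revert is to compare the driver priorities reported in /proc/crypto.
The helper below is only a minimal user-space sketch for that check (a hypothetical
example, not part of this change; it assumes /proc/crypto is enabled and the XTS
implementations are already registered):

    /* check_xts.c - hypothetical helper, not part of this change.
     * Lists the registered xts(aes) implementations and their priorities
     * from /proc/crypto; the highest-priority entry is the driver the
     * crypto API will normally hand out.
     */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        FILE *f = fopen("/proc/crypto", "r");
        char line[256], name[128] = "", driver[128] = "";

        if (!f) {
            perror("/proc/crypto");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            /* each algorithm entry is a block of "key : value" lines */
            sscanf(line, "name : %127s", name);
            sscanf(line, "driver : %127s", driver);
            /* print the driver and priority of every xts(aes) entry */
            if (!strcmp(name, "xts(aes)") && !strncmp(line, "priority", 8))
                printf("%-24s%s", driver, strchr(line, ':') + 1);
        }
        fclose(f);
        return 0;
    }

In this tree the CE glue registers at priority 300 and the NEON glue at 200 (see the
PRIO defines in the aes-glue diff further down), so the output makes it obvious which
implementation actually serves xts(aes).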

Revert "ARM64/configs: surya: Enable backported crypto algos"

This reverts commit 53cf765af430ea1ce8b99cfdebfaf1fbd05ce9e7.

Revert "arm64: crypto: aes-glue: always clear out defines before assuming"

This reverts commit 6bdf40bd12304f3c26c7278be8172dae4d355d23.

Revert "lib: crypto: move out aes generic library into arm64"

This reverts commit 6f53883b2cac7ec3818464e90738182c7da727b4.

Revert "crypto: bring back blkcipher"

This reverts commit 49ab3ad968fdf0f3b15a7014d6958cc46e50c4a3.

Revert "crypto: lib/aes - export sbox and inverse sbox"

This reverts commit 6d7509597228bb2824b229b6f92200621af17039.

Revert "crypto: lib/aes add aes generic"

This reverts commit cd956b7d9c2533f72a41eaa1421d7896c867137c.

Revert "HACK: include: crypto: aes.h: redirect aes function definitions to current ones"

This reverts commit 90035f7567d034bebd0672724d2ee341d9ef789a.

Revert "arm64: assembler: add utility macros to push/pop stack frames"

This reverts commit 5899671c86c750a23f68fce359ac4574f289b252.

Revert "crypto: skcipher - add the ability to abort a skcipher walk"

This reverts commit 6bfa9de12bea39c63115d573b292e6352a85ef69.

Revert "crypto: aes/fixed-time - align key schedule with other implementations"

This reverts commit 6b930b634c0bab896b127835ffba9a3c2d796760.

Revert "crypto: sm4 - export encrypt/decrypt routines to other drivers"

This reverts commit 306ce112895e48e8c67f5531dcc7794b8c9799d2.

Revert "crypto: sm4 - introduce SM4 symmetric cipher algorithm"

This reverts commit 9e677e8a2fe26fd4694dfff61279bd85237aacc8.

Revert "crypto: hash - introduce crypto_shash_tfm_digest()"

This reverts commit 2e972aa2655e7e50b4362d92e13b78723ab32c01.

Revert "crypto: ctr - add helper for performing a CTR encryption walk"

This reverts commit 20d0ffb036d9f88eb7b2c056202a9e6f4cece6bf.

Revert "crypto: sm3 - export crypto_sm3_final function"

This reverts commit d6983c76e3da30db111421795beda5c703955aa9.

Revert "crypto: sm3 - add OSCCA SM3 secure hash"

This reverts commit d6a29406704f117dfd2cc99b78078831d1fe287e.

Revert "crypto: don't optimize keccakf()"

This reverts commit 598d21bacff6319be4aa058025c632b107413f32.

Revert "crypto: sha3-generic - Use __optimize to support old compilers"

This reverts commit 734a8b2ae19a5e6e1ca1ada7ccb333a9655a0673.

Revert "crypto: sha3-generic - deal with oversize stack frames"

This reverts commit 83f16c21f6e058890a258a2a5b765046f6d1230c.

Revert "crypto: sha3-generic - export init/update/final routines"

This reverts commit 591f7afb9d0890c2ee9460360112aa4e448a3924.

Revert "crypto: sha3-generic - simplify code"

This reverts commit 6a9ea8f69fb0baa3d403fbbb82e052d684bc9fec.

Revert "crypto: sha3-generic - rewrite KECCAK transform to help the compiler optimize"

This reverts commit f7f038880d90b321a48788160beea14fa294f109.

Revert "arm64: crypto: fallback to may_use_yield"

This reverts commit 5dc3c1306cd51f5ca293e8b76f7f382ba68da2c2.

Revert "arm64: assembler: add cond_yield macro"

This reverts commit 10ec3894fbd69c75638c44925408b4a618a6afa8.

Revert "crypto: arm64/aes-ce - deal with oversight in new CTR carry code"

This reverts commit 769190a03e8e1373a4e6ebfffa409bd89a123e85.

Revert "crypto: arm64/crc-t10dif - move NEON yield to C code"

This reverts commit ae264b6f57eb729aaeb9f8c0cb64d6da7d2a61ae.

Revert "crypto: arm64/aes-ce-mac - simplify NEON yield"

This reverts commit f497267b2d962c1d1e3227b68ed47450235c0134.

Revert "crypto: arm64/aes-neonbs - remove NEON yield calls"

This reverts commit e3bf48533edde9d66e0de81df758b75e5daf7649.

Revert "crypto: arm64/sha512-ce - simplify NEON yield"

This reverts commit 223c7a38e2597442d1bfa9b8cf1d22d4e5bc975b.

Revert "crypto: arm64/sha3-ce - simplify NEON yield"

This reverts commit c4ab0f1013d7f75e77bbb190d2d0a45931777e06.

Revert "crypto: arm64/sha2-ce - simplify NEON yield"

This reverts commit c194acd3691aa17bb4bfff16124b2a3f476d0f4f.

Revert "crypto: arm64/sha1-ce - simplify NEON yield"

This reverts commit 2c13ddfe897ccda89cdbbb77d44522b7a1b52d71.

Revert "crypto: arm64/sha - add missing module aliases"

This reverts commit 0ccfa37388086d02858f9a829a8424ea6fe91369.

Revert "crypto: arm64/aes-ctr - improve tail handling"

This reverts commit 76050bf6b606b2a634d5d2615e3e37da30de1ccd.

Revert "crypto: arm64/aes-ce - really hide slower algos when faster ones are enabled"

This reverts commit 5648ba13a3c67b2f15544c73d75525f4b47af4f6.

Revert "crypto: arm64/gcm - move authentication tag check to SIMD domain"

This reverts commit 1efe38494e3a102b79f267e85359dad696483790.

Revert "crypto: arm64/chacha - simplify tail block handling"

This reverts commit c5f4710f95bbc3b530f22d6752cd1ff3d2006f67.

Revert "crypto: hash - Use memzero_explicit() for clearing state"

This reverts commit 1ec2a7f09f56c06773a754b1037d9463d44cf5b0.

Revert "crypto: arm64: Use x16 with indirect branch to bti_c"

This reverts commit a8c3cc4987341e820cf33589fb8be1e80d6a1455.

Revert "crypto: arm64/gcm - Fix endianness warnings"

This reverts commit 02adba8b71b749dbd0d4912e233e18b20f5a8a9d.

Revert "crypto: arm64/sha - Add declarations for assembly variables"

This reverts commit 435ec992f88cefba133194f1ee249805d482d8b2.

Revert "crypto: arm64/gcm - use inline helper to suppress indirect calls"

This reverts commit bc92b86fadeb48ae4bfef2b0d3c5b9ba078a4328.

Revert "crypto: arm64/gcm - use variably sized key struct"

This reverts commit 0aa3bc4ef6a0400c29fa82bf222996ed63a221f9.

Revert "crypto: arm64/gcm - disentangle ghash and gcm setkey() routines"

This reverts commit 4ff08ef5dd27b020b66fbe45f61282f20942c7cb.

Revert "crypto: arm64/ghash - drop PMULL based shash"

This reverts commit 313a347394fc9593669d088608fa327c21481d16.

Revert "crypto: arm64/aes-glue - use crypto_shash_tfm_digest()"

This reverts commit fb5c6d2fd18e01024fc8a509945545ec6925cdd7.

Revert "crypto: arm64 - Consistently enable extension"

This reverts commit 6eae8549471bc9b81b8d25859e6683e9be4cc67c.

Revert "crypto: arm/neon - memzero_explicit aes-cbc key"

This reverts commit fda9fcfafd679f7ed2ef75f8859ff91680b9dd6b.

Revert "arm64: crypto: Modernize names for AES function macros"

This reverts commit 2d88fdbd9de0f95ef5b8e135474a40004b0b63c3.

Revert "arm64: crypto: Modernize some extra assembly annotations"

This reverts commit d6b0bf996b79a03a37c407bfe93a8c80adbd7821.

Revert "crypto: arm64/sha-ce - implement export/import"

This reverts commit 62e0842054a83d169791e1fddb0699589b82079d.

Revert "crypto: arm64 - Use modern annotations for assembly functions"

This reverts commit cecb3c804506cae6693fa140435dc47a5ee9154c.

Revert "crypto: arm64/ghash-neon - bump priority to 150"

This reverts commit c1c96a11a6618724bd9f2c094d22118a6244439a.

Revert "crypto: arm64/sha - fix function types"

This reverts commit b2a6b1e16dc2abd35ec6cb0fee1bb5681fc27e95.

Revert "crypto: skcipher - rename the crypto_blkcipher module and kconfig option"

This reverts commit 515e10b92b2b7bf161703c0ffbdbb7273626de1d.

Revert "crypto: arm64/aes-neonbs - add return value of skcipher_walk_done() in __xts_crypt()"

This reverts commit 06cd75589703d1b6bce80320a182b595ed42ab8e.

Revert "crypto: arm64/gcm-ce - implement 4 way interleave"

This reverts commit c14ebb4667dd5e494867ee129a997c6d2198507d.

Revert "crypto: arm64/aes-neonbs - implement ciphertext stealing for XTS"

This reverts commit d5397cf5dc32382901d611cd724eec6c3817656c.

Revert "crypto: arm64/aes - implement support for XTS ciphertext stealing"

This reverts commit 192933ba203e238cae5b16ab599e4cfa6b698d51.

Revert "crypto: arm64/aes-cts-cbc - move request context data to the stack"

This reverts commit f71bf2534f6d610950b010790b8e5675ed5455c7.

Revert "crypto: arm64/aes-cts-cbc-ce - performance tweak"

This reverts commit 7e8f61db4e78db217a3364ee47cfca764cbc5f0d.

Revert "crypto: arm64/aes-neon - limit exposed routines if faster driver is enabled"

This reverts commit ecd648682b2dc6bed3214b2e43a0eca35b63094d.

Revert "crypto: arm64/aes-neonbs - replace tweak mask literal with composition"

This reverts commit 5be595c6ba8edae081cdb63dca0353115d2484d1.

Revert "crypto: arm64/aes - Use PTR_ERR_OR_ZERO rather than its implementation."

This reverts commit aab710faff1ec9001c805568e6b8831103e3ecb6.

Revert "crypto: arm64 - Rename functions to avoid conflict with crypto/sha256.h"

This reverts commit caf246ef54b23c9675585fa555d4bb541bd59c92.

Revert "crypto: arm64/aes - implement accelerated ESSIV/CBC mode"

This reverts commit f8779aee4500c50d6d94bd980a2842308abeb79c.

Revert "crypto: arm64/aes-cts-cbc - factor out CBC en/decryption of a walk"

This reverts commit e0a1f4f64a8d1f4edf497ff327e016fbffadf66f.

Revert "crypto: arm64/aes-cipher - switch to shared AES inverse Sbox"

This reverts commit 186ffd910b0d5b0489d89ba6ca191c0cd376f05c.

Revert "crypto: arm64/aes-neon - switch to shared AES Sboxes"

This reverts commit 59e5b082956da502c60ac5760a915b486d1b8e5a.

Revert "crypto: arm64/aes-ce-cipher - use AES library as fallback"

This reverts commit a700606c4170512da6f9b4c96aba4f6df8ccf8a7.

Revert "crypto: aes - move sync ctr(aes) to AES library and generic helper"

This reverts commit 48b821c53fc518c80c410047748577bea1983e71.

Revert "crypto: arm64/aes-ce - switch to library version of key expansion routine"

This reverts commit cdd168c7cd95008f372dfadc2ba3861fc906ca60.

Revert "crypto: arm64/aes-neonbs - switch to library version of key expansion routine"

This reverts commit e2eb9cb4f948c5541bf39bbe3c5b4298c8a92909.

Revert "crypto: arm64/aes-ccm - switch to AES library"

This reverts commit 9fd25afeced4df2f1da1ecdebab4833b2aa98c3a.

Revert "crypto: arm64/ghash - switch to AES library"

This reverts commit 8790d3e83a3f13e887eadd033719c08e1156ad52.

Revert "crypto: aes - rename local routines to prevent future clashes"

This reverts commit cef928c589372b9fc280695fcaecfb6c21b7ee83.

Revert "crypto: arm64/aes-ce - implement 5 way interleave for ECB, CBC and CTR"

This reverts commit fdce296ea17bff501601164bb25b783798b4a402.

Revert "crypto: arm64/aes-ce - add 5 way interleave routines"

This reverts commit 917c3db52e00f145613a362143cb01fa16e72676.

Revert "crypto: chacha - constify ctx and iv arguments"

This reverts commit c83b6ad8ff05e72d0142471ed848023d984da356.

Revert "arm64: HWCAP: add support for AT_HWCAP2"

This reverts commit 9bb745a4cea3de91bf69fe01af7532f2877f6ac3.

Revert "crypto: arm64/cbcmac - handle empty messages in same way as template"

This reverts commit 4e90675cbb0ff662a9027f73d1f13fd6ec3d694a.

Revert "crypto: arm64 - convert to use crypto_simd_usable()"

This reverts commit 3b5ca4665ab8281d8ccec34709aafa8e43dc2e6d.

Revert "crypto: arm64/gcm-aes-ce - fix no-NEON fallback code"

This reverts commit 924a1110fe35f605fb92804200bf21b4594479d3.

Revert "crypto: arm64/chacha - fix hchacha_block_neon() for big endian"

This reverts commit 6ecb0ec105fdd67f4f60736e166ae33bc96c3821.

Revert "crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian"

This reverts commit e0c9ed3235e0d9460acc931eff40d02244553204.

Revert "crypto: arm64/aes-blk - update IV after partial final CTR block"

This reverts commit 18b10273b19d1e5e9e29d27948f68fb9a9291a28.

Revert "crypto: arm64/aes-neonbs - fix returning final keystream block"

This reverts commit 9f297390e264ffa3f1a00ff46f21c656dde26df9.

Revert "crypto: arm64/crct10dif-ce - cleanup and optimizations"

This reverts commit 2aae1e3d62019544b8716d1b69ebf90614e8cdab.

Revert "crypto: arm64/crct10dif - register PMULL variants as separate algos"

This reverts commit bb84013e29730f4f68d227d2668a67a23eea8187.

Revert "crypto: arm64/crct10dif - remove dead code"

This reverts commit e9a91b8f781aa9b9ce9b8cba96b84f564aa5c9f4.

Revert "crypto: arm64/ghash - register PMULL variants as separate algos"

This reverts commit 7d747b051d69639da21c96e934909611c9f81549.

Revert "crypto: arm64/aes-ccm - don't use an atomic walk needlessly"

This reverts commit 25b54060800d593a4ffdb34579210fc804b5489a.

Revert "crypto: arm64/aes-ccm - fix logical bug in AAD MAC handling"

This reverts commit 8b995fb58cc5c8c6bf7489dab8fe8142301ac1da.

Revert "crypto: arm64/chacha - use combined SIMD/ALU routine for more speed"

This reverts commit fefef573014532d1deb1bdc428d3a71306d0e558.

Revert "crypto: arm64/chacha - optimize for arbitrary length inputs"

This reverts commit d7764657bd5578c796275ba5dbbac2ad713cfdca.

Revert "crypto: arm64/chacha - add XChaCha12 support"

This reverts commit e6c5e231ae1cd27d51526fafa4a7b081f332b019.

Revert "crypto: arm64/chacha20 - refactor to allow varying number of rounds"

This reverts commit fe2906d03233b94e1ddc91c505a6d2f385cc6679.

Revert "crypto: arm64/chacha20 - add XChaCha20 support"

This reverts commit b4e2ca051adf26b8cafad012b16b39d96cb77d3f.

Revert "crypto: arm64/nhpoly1305 - add NEON-accelerated NHPoly1305"

This reverts commit f5d169f37be30ca6360232e88357d715a578174f.

Revert "crypto: arm64/aes-blk - ensure XTS mask is always loaded"

This reverts commit a21d4e59e8d54c315ccbc196def1aa15ee6538c9.

Revert "crypto: arm64/aes - fix handling sub-block CTS-CBC inputs"

This reverts commit 2211e34cc97c80548ab831166373fefc7dc2f29d.

Revert "crypto: arm64/aes-blk - improve XTS mask handling"

This reverts commit 3fef20ad89773585e81febe97e7337664b488450.

Revert "crypto: arm64/aes-blk - add support for CTS-CBC mode"

This reverts commit 54f8af3c79f88da71c3173be25cbd790c8bd07ec.

Revert "crypto: arm64/aes-blk - revert NEON yield for skciphers"

This reverts commit 4007f6d321639b70146b5f3c1e4612c8f8158fee.

Revert "crypto: arm64/aes-blk - remove pointless (u8 *) casts"

This reverts commit 3e38de9d524ebc2d509f4af77a1ef53d71579f6b.

Revert "crypto: arm64/crct10dif - implement non-Crypto Extensions alternative"

This reverts commit 7546637992dddb0d5ed8ec161f5aa00b5e13d5e6.

Revert "crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version"

This reverts commit 4a0e214b37f461ea570b3e23982447bb2cad31a5.

Revert "crypto: arm64/crc32 - remove PMULL based CRC32 driver"

This reverts commit 7362148d02a38e69e11f2c5f9985ff7375d4bd29.

Revert "crypto: arm64/aes-modes - get rid of literal load of addend vector"

This reverts commit ca48e35e57323d4ce03694240616a9d967c8ebfd.

Revert "crypto: arm64/aes-gcm-ce - fix scatterwalk API violation"

This reverts commit 842ecc8548a5ba41706ce1943f3caabed69ac06a.

Revert "crypto: arm64/sm4-ce - check for the right CPU feature bit"

This reverts commit 0d9a3ccbc34f63956c254406faf28ba2ae22fe6f.

Revert "crypto: arm64/ghash-ce - implement 4-way aggregation"

This reverts commit c2b969ace9aade2625ae01f466289f05762192ff.

Revert "crypto: arm64/ghash-ce - replace NEON yield check with block limit"

This reverts commit 811b2c5d4d1ade3c71dfc62da2ed082490acae6a.

Revert "crypto: arm64/aes-ce-gcm - don't reload key schedule if avoidable"

This reverts commit e873ab3d42c7ad1906ec869900a0507fabc0d517.

Revert "crypto: arm64/aes-ce-gcm - implement 2-way aggregation"

This reverts commit 94ccba0f7c486355122d25da43c39097d81a02c7.

Revert "crypto: arm64/aes-ce-gcm - operate on two input blocks at a time"

This reverts commit f2105c4e7e500b80b28d8ebc2f17db32c1d8bfd6.

Revert "crypto: arm64 - revert NEON yield for fast AEAD implementations"

This reverts commit 39e662f0f9d3db97eccef09f36f2fefc9a618bbd.

Revert "crypto/arm64: aes-ce-gcm - add missing kernel_neon_begin/end pair"

This reverts commit 15d003b13075a16712dcbdb177c41b2bddbcbf0e.

Revert "crypto: arm64/sha256 - increase cra_priority of scalar implementations"

This reverts commit 2ef73cb76eb12bb45bf0f89a49b7dc7126fadd47.

Revert "crypto: shash - remove useless setting of type flags"

This reverts commit c1d4b72ca1321bfcbee7b86390b808428c3a27fa.

Revert "crypto: arm64/aes-blk - fix and move skcipher_walk_done out of kernel_neon_begin, _end"

This reverts commit ac65336b8bd522c96fd50f49ef9c4ceee3603487.

Revert "crypto: clarify licensing of OpenSSL asm code"

This reverts commit e989520a26841372c045c493e7e2c8f7bbda4f59.

Revert "crypto: arm64/sha512-ce - yield NEON after every block of input"

This reverts commit 3bdd591e9b4da8c93d9ef7ae6049cf28385098a0.

Revert "crypto: arm64/sha3-ce - yield NEON after every block of input"

This reverts commit e3d3203e0645587a72da93e482f38695f107c3a4.

Revert "crypto: arm64/crct10dif-ce - yield NEON after every block of input"

This reverts commit 859ffc1c5d38d109ca9c57f0703e792f3a9582c2.

Revert "crypto: arm64/crc32-ce - yield NEON after every block of input"

This reverts commit 5f1c71918b39f512e4c1f4daeeb3c9d5706351a8.

Revert "crypto: arm64/aes-ghash - yield NEON after every block of input"

This reverts commit 3125f780eb7ac53a4cdbeed97ae91e8ec5945ea1.

Revert "crypto: arm64/aes-bs - yield NEON after every block of input"

This reverts commit ad2d657e731bd4d462cedbcd6ef5662a265d75a0.

Revert "Revert "crypto: arm64/aes-neonbs - fix returning final keystream block""

This reverts commit 1525d077c18b78b2953cf0d1868c444c37d3c682.

Revert "crypto: arm64/aes-blk - yield NEON after every block of input"

This reverts commit 5accaa34eea96c527aa8ec741383d3f38042ff69.

Revert "crypto: arm64/aes-ccm - yield NEON after every block of input"

This reverts commit c8f4e1b6574b2d4fdc06cd352b6950a8da2bef6c.

Revert "Revert "crypto: arm64/aes-ccm - fix logical bug in AAD MAC handling""

This reverts commit 1629c18dda48bbac7154022dccdf9eb952ba8643.

Revert "crypto: arm64/sha2-ce - yield NEON after every block of input"

This reverts commit 343779e3c70586299ab67f3f9216562ab9bd6001.

Revert "crypto: arm64/sha1-ce - yield NEON after every block of input"

This reverts commit 68106b7893e6e78da5dc39601cba4d7a60071dea.

Revert "crypto: arm64 - add support for SM4 encryption using special instructions"

This reverts commit b9547b0195e0c89e59f41a631eadad77220d8993.

Revert "crypto: arm64/sha256-neon - play nice with CONFIG_PREEMPT kernels"

This reverts commit 33045c52eb89af1dca46aa2f31b9ace6848eb1fd.

Revert "crypto: arm64/aes-blk - add 4 way interleave to CBC-MAC encrypt path"

This reverts commit 2624ccfec2c746fd5c38e52be423d81d772ab785.

Revert "crypto: arm64/aes-blk - add 4 way interleave to CBC encrypt path"

This reverts commit 26acced8924afe42757e2f20420d2b412fd5122d.

Revert "crypto: arm64/aes-blk - remove configurable interleave"

This reverts commit aade51c37fda96baf6708504fae38dd7e8f3b525.

Revert "crypto: arm64/chacha20 - move kernel mode neon en/disable into loop"

This reverts commit 563fdb9d8718c999feff21a56e855d7d368bfe38.

Revert "crypto: arm64/aes-bs - move kernel mode neon en/disable into loop"

This reverts commit 7693fb31e5ec7a3a442abc82ce6dc223e0070c63.

Revert "crypto: arm64/aes-blk - move kernel mode neon en/disable into loop"

This reverts commit f3d47872ae322555f1933b89deaedf15f01a1630.

Revert "crypto: arm64/aes-ce-ccm - move kernel mode neon en/disable into loop"

This reverts commit 208213ffed71f368561cddafd96b8d726a96d628.

Revert "crypto: arm64/speck - add NEON-accelerated implementation of Speck-XTS"

This reverts commit a53ebc68fb69e48a0670b74f2107339937af236b.

Revert "crypto: arm64/sha512 - fix/improve new v8.2 Crypto Extensions code"

This reverts commit 3fe60009d5c1a9619412ff4e8f3a3963b90b03ee.

Revert "crypto: arm64/sm3 - new v8.2 Crypto Extensions implementation"

This reverts commit f12a399dc824600b5df583418a96759d7b76df3d.

Revert "crypto: arm64/sha3 - new v8.2 Crypto Extensions implementation"

This reverts commit 0224378dd15ddc1e5bf6999b26f6a722be125604.

Revert "crypto: arm64/sha1-ce - get rid of literal pool"

This reverts commit 537e00dd0c79e6c31c4b3b2adde9727aa9f2590f.

Revert "crypto: arm64/sha2-ce - move the round constant table to .rodata section"

This reverts commit 11f38eee4f30ae202563dce4742be2819cf5ffe7.

Revert "crypto: arm64/crct10dif - move literal data to .rodata section"

This reverts commit 89710355b705b55f2c410c698ee8cd510c769672.

Revert "crypto: arm64/crc32 - move literal data to .rodata section"

This reverts commit d3fada575171f56e3dbbc402b119db0a7e106b65.

Revert "crypto: arm64/aes-neon - move literal data to .rodata section"

This reverts commit b73eae3af478173d89be32546d3db209c241a2ac.

Revert "crypto: arm64/aes-cipher - move S-box to .rodata section"

This reverts commit fc79c79af1578456030b820c8855887f79460eec.

Revert "crypto: arm64 - implement SHA-512 using special instructions"

This reverts commit f85618f0d28e37abd7fbb83acaa3de5e32849c5e.

Revert "crypto: arm64/aes - do not call crypto_unregister_skcipher twice on error"

This reverts commit 095b634144846d00932b2d68b212373b203d8265.

Revert "[SQUASH] arm64: crypto: Revert old backports"

This reverts commit 62e858fc144999fd13942977e9de8f0ab4b17041.
Adithya R 2021-08-13 13:08:37 +05:30
parent 147f696539
commit c65c2e9bc6
153 changed files with 2935 additions and 7140 deletions


@ -81,7 +81,7 @@ config CRYPTO_AES_ARM
config CRYPTO_AES_ARM_BS
tristate "Bit sliced AES using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SIMD
select CRYPTO_AES
help
@ -97,7 +97,7 @@ config CRYPTO_AES_ARM_BS
config CRYPTO_AES_ARM_CE
tristate "Accelerated AES using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SIMD
help
Use an implementation of AES in CBC, CTR and XTS modes that uses
@ -127,7 +127,7 @@ config CRYPTO_CRC32_ARM_CE
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha stream cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_NHPOLY1305_NEON


@ -19,7 +19,7 @@ EXPORT_SYMBOL(__aes_arm_encrypt);
asmlinkage void __aes_arm_decrypt(u32 *rk, int rounds, const u8 *in, u8 *out);
EXPORT_SYMBOL(__aes_arm_decrypt);
static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -27,7 +27,7 @@ static void aes_arm_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
__aes_arm_encrypt(ctx->key_enc, rounds, in, out);
}
static void aes_arm_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -47,8 +47,8 @@ static struct crypto_alg aes_alg = {
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
.cra_cipher.cia_encrypt = aes_arm_encrypt,
.cra_cipher.cia_decrypt = aes_arm_decrypt,
.cra_cipher.cia_encrypt = aes_encrypt,
.cra_cipher.cia_decrypt = aes_decrypt,
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
.cra_alignmask = 3,


@ -132,7 +132,6 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
memzero_explicit(&rk, sizeof(rk));
return crypto_cipher_setkey(ctx->enc_tfm, in_key, key_len);
}


@ -62,7 +62,7 @@ static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
}
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
struct chacha_ctx *ctx, u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];


@ -152,7 +152,7 @@ static struct shash_alg ghash_alg = {
.cra_name = "__ghash",
.cra_driver_name = "__driver-ghash-ce",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_INTERNAL,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_key),
.cra_module = THIS_MODULE,

View File

@ -1,14 +1,4 @@
#define __ARM_ARCH__ __LINUX_ARM_ARCH__
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and


@ -75,6 +75,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -67,6 +67,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -83,6 +83,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -78,6 +78,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -92,6 +93,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -1,19 +1,12 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.


@ -1,18 +1,11 @@
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA256 block procedure for ARMv4. May 2007.


@ -71,6 +71,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -85,6 +86,7 @@ static struct shash_alg algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-asm",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -79,6 +79,7 @@ struct shash_alg sha256_neon_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -93,6 +94,7 @@ struct shash_alg sha256_neon_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-neon",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -1,19 +1,12 @@
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.


@ -1,18 +1,11 @@
@ SPDX-License-Identifier: GPL-2.0
@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.
@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Permission to use under GPL terms is granted.
@ ====================================================================
@ SHA512 block procedure for ARMv4. September 2007.


@ -63,6 +63,7 @@ static struct shash_alg sha512_arm_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-arm",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -77,6 +78,7 @@ static struct shash_alg sha512_arm_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-arm",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -75,6 +75,7 @@ struct shash_alg sha512_neon_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-neon",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
@ -90,6 +91,7 @@ struct shash_alg sha512_neon_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-neon",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@ -596,6 +596,7 @@ CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
CONFIG_CRYPTO_CRC32_ARM64_CE=m
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=m


@ -727,13 +727,7 @@ CONFIG_CRYPTO_DEV_QCOM_ICE=y
CONFIG_ARM64_CRYPTO=y
CONFIG_CRYPTO_SHA1_ARM64_CE=y
CONFIG_CRYPTO_SHA2_ARM64_CE=y
CONFIG_CRYPTO_SHA512_ARM64_CE=y
CONFIG_CRYPTO_SHA3_ARM64=y
CONFIG_CRYPTO_SM3_ARM64_CE=y
CONFIG_CRYPTO_SM4_ARM64_CE=y
CONFIG_CRYPTO_GHASH_ARM64_CE=y
CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
CONFIG_CRYPTO_CHACHA20_NEON=y
CONFIG_CRYPTO_NHPOLY1305_NEON=y
CONFIG_CRYPTO_AES_ARM64_BS=y
CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y


@ -29,42 +29,24 @@ config CRYPTO_SHA2_ARM64_CE
select CRYPTO_HASH
select CRYPTO_SHA256_ARM64
config CRYPTO_SHA512_ARM64_CE
tristate "SHA-384/SHA-512 digest algorithm (ARMv8 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA512_ARM64
config CRYPTO_SHA3_ARM64
tristate "SHA3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SHA3
config CRYPTO_SM3_ARM64_CE
tristate "SM3 digest algorithm (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_SM3
config CRYPTO_SM4_ARM64_CE
tristate "SM4 symmetric cipher (ARMv8.2 Crypto Extensions)"
depends on KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_SM4
config CRYPTO_GHASH_ARM64_CE
tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_HASH
select CRYPTO_GF128MUL
select CRYPTO_LIB_AES
select CRYPTO_AES
select CRYPTO_AES_ARM64
config CRYPTO_CRCT10DIF_ARM64_CE
tristate "CRCT10DIF digest algorithm using PMULL instructions"
depends on KERNEL_MODE_NEON && CRC_T10DIF
select CRYPTO_HASH
config CRYPTO_CRC32_ARM64_CE
tristate "CRC32 and CRC32C digest algorithms using ARMv8 extensions"
depends on CRC32
select CRYPTO_HASH
config CRYPTO_AES_ARM64
tristate "AES core cipher using scalar instructions"
select CRYPTO_AES
@ -73,20 +55,20 @@ config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
select CRYPTO_AES_ARM64
config CRYPTO_AES_ARM64_CE_CCM
tristate "AES in CCM mode using ARMv8 Crypto Extensions"
depends on ARM64 && KERNEL_MODE_NEON
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_AEAD
select CRYPTO_LIB_AES
config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_AES_ARM64
select CRYPTO_SIMD
@ -94,35 +76,23 @@ config CRYPTO_AES_ARM64_CE_BLK
config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_AES
select CRYPTO_SIMD
config CRYPTO_CHACHA20_NEON
tristate "ChaCha20, XChaCha20, and XChaCha12 stream ciphers using NEON instructions"
tristate "NEON accelerated ChaCha20 symmetric cipher"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
config CRYPTO_NHPOLY1305_NEON
tristate "NHPoly1305 hash function using NEON instructions (for Adiantum)"
depends on KERNEL_MODE_NEON
select CRYPTO_NHPOLY1305
config CRYPTO_AES_ARM64_BS
tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm"
depends on KERNEL_MODE_NEON
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_NEON_BLK
select CRYPTO_AES_ARM64
select CRYPTO_LIB_AES
select CRYPTO_SIMD
config CRYPTO_SPECK_NEON
tristate "NEON accelerated Speck cipher algorithms"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_SPECK
endif


@ -8,32 +8,21 @@
# published by the Free Software Foundation.
#
obj-y := aes-lib.o
obj-$(CONFIG_CRYPTO_SHA1_ARM64_CE) += sha1-ce.o
sha1-ce-y := sha1-ce-glue.o sha1-ce-core.o
obj-$(CONFIG_CRYPTO_SHA2_ARM64_CE) += sha2-ce.o
sha2-ce-y := sha2-ce-glue.o sha2-ce-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64_CE) += sha512-ce.o
sha512-ce-y := sha512-ce-glue.o sha512-ce-core.o
obj-$(CONFIG_CRYPTO_SHA3_ARM64) += sha3-ce.o
sha3-ce-y := sha3-ce-glue.o sha3-ce-core.o
obj-$(CONFIG_CRYPTO_SM3_ARM64_CE) += sm3-ce.o
sm3-ce-y := sm3-ce-glue.o sm3-ce-core.o
obj-$(CONFIG_CRYPTO_SM4_ARM64_CE) += sm4-ce.o
sm4-ce-y := sm4-ce-glue.o sm4-ce-core.o
obj-$(CONFIG_CRYPTO_GHASH_ARM64_CE) += ghash-ce.o
ghash-ce-y := ghash-ce-glue.o ghash-ce-core.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_ARM64_CE) += crct10dif-ce.o
crct10dif-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
@ -52,14 +41,8 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
chacha-neon-y := chacha-neon-core.o chacha-neon-glue.o
obj-$(CONFIG_CRYPTO_SPECK_NEON) += speck-neon.o
speck-neon-y := speck-neon-core.o speck-neon-glue.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
nhpoly1305-neon-y := nh-neon-core.o nhpoly1305-neon-glue.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
obj-$(CONFIG_CRYPTO_AES_ARM64) += aes-arm64.o
aes-arm64-y := aes-cipher-core.o aes-cipher-glue.o


@ -18,7 +18,7 @@
* void ce_aes_ccm_auth_data(u8 mac[], u8 const in[], u32 abytes,
* u32 *macp, u8 const rk[], u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_auth_data)
ENTRY(ce_aes_ccm_auth_data)
ldr w8, [x3] /* leftover from prev round? */
ld1 {v0.16b}, [x0] /* load mac */
cbz w8, 1f
@ -84,13 +84,13 @@ SYM_FUNC_START(ce_aes_ccm_auth_data)
st1 {v0.16b}, [x0]
10: str w8, [x3]
ret
SYM_FUNC_END(ce_aes_ccm_auth_data)
ENDPROC(ce_aes_ccm_auth_data)
/*
* void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u8 const rk[],
* u32 rounds);
*/
SYM_FUNC_START(ce_aes_ccm_final)
ENTRY(ce_aes_ccm_final)
ld1 {v3.4s}, [x2], #16 /* load first round key */
ld1 {v0.16b}, [x0] /* load mac */
cmp w3, #12 /* which key size? */
@ -124,7 +124,7 @@ SYM_FUNC_START(ce_aes_ccm_final)
eor v0.16b, v0.16b, v1.16b /* en-/decrypt the mac */
st1 {v0.16b}, [x0] /* store result */
ret
SYM_FUNC_END(ce_aes_ccm_final)
ENDPROC(ce_aes_ccm_final)
.macro aes_ccm_do_crypt,enc
ldr x8, [x6, #8] /* load lower ctr */
@ -215,10 +215,10 @@ CPU_LE( rev x8, x8 )
* u8 const rk[], u32 rounds, u8 mac[],
* u8 ctr[]);
*/
SYM_FUNC_START(ce_aes_ccm_encrypt)
ENTRY(ce_aes_ccm_encrypt)
aes_ccm_do_crypt 1
SYM_FUNC_END(ce_aes_ccm_encrypt)
ENDPROC(ce_aes_ccm_encrypt)
SYM_FUNC_START(ce_aes_ccm_decrypt)
ENTRY(ce_aes_ccm_decrypt)
aes_ccm_do_crypt 0
SYM_FUNC_END(ce_aes_ccm_decrypt)
ENDPROC(ce_aes_ccm_decrypt)


@ -14,7 +14,6 @@
#include <crypto/aes.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/module.h>
@ -46,6 +45,8 @@ asmlinkage void ce_aes_ccm_decrypt(u8 out[], u8 const in[], u32 cbytes,
asmlinkage void ce_aes_ccm_final(u8 mac[], u8 const ctr[], u32 const rk[],
u32 rounds);
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static int ccm_setkey(struct crypto_aead *tfm, const u8 *in_key,
unsigned int key_len)
{
@ -106,13 +107,11 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen)
}
static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
u32 abytes, u32 *macp)
u32 abytes, u32 *macp, bool use_neon)
{
if (may_use_simd()) {
kernel_neon_begin();
if (likely(use_neon)) {
ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc,
num_rounds(key));
kernel_neon_end();
} else {
if (*macp > 0 && *macp < AES_BLOCK_SIZE) {
int added = min(abytes, AES_BLOCK_SIZE - *macp);
@ -125,7 +124,8 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
}
while (abytes >= AES_BLOCK_SIZE) {
aes_encrypt(key, mac, mac);
__aes_arm64_encrypt(key->key_enc, mac, mac,
num_rounds(key));
crypto_xor(mac, in, AES_BLOCK_SIZE);
in += AES_BLOCK_SIZE;
@ -133,14 +133,16 @@ static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[],
}
if (abytes > 0) {
aes_encrypt(key, mac, mac);
__aes_arm64_encrypt(key->key_enc, mac, mac,
num_rounds(key));
crypto_xor(mac, in, abytes);
*macp = abytes;
}
}
}
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[],
bool use_neon)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_aead_ctx(aead);
@ -159,7 +161,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
ltag.len = 6;
}
ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp);
ccm_update_mac(ctx, mac, (u8 *)&ltag, ltag.len, &macp, use_neon);
scatterwalk_start(&walk, req->src);
do {
@ -171,7 +173,7 @@ static void ccm_calculate_auth_mac(struct aead_request *req, u8 mac[])
n = scatterwalk_clamp(&walk, len);
}
p = scatterwalk_map(&walk);
ccm_update_mac(ctx, mac, p, n, &macp);
ccm_update_mac(ctx, mac, p, n, &macp, use_neon);
len -= n;
scatterwalk_unmap(p);
@ -205,8 +207,10 @@ static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
bsize = nbytes;
crypto_inc(walk->iv, AES_BLOCK_SIZE);
aes_encrypt(ctx, buf, walk->iv);
aes_encrypt(ctx, mac, mac);
__aes_arm64_encrypt(ctx->key_enc, buf, walk->iv,
num_rounds(ctx));
__aes_arm64_encrypt(ctx->key_enc, mac, mac,
num_rounds(ctx));
if (enc)
crypto_xor(mac, src, bsize);
crypto_xor_cpy(dst, src, buf, bsize);
@ -221,8 +225,8 @@ static int ccm_crypt_fallback(struct skcipher_walk *walk, u8 mac[], u8 iv0[],
}
if (!err) {
aes_encrypt(ctx, buf, iv0);
aes_encrypt(ctx, mac, mac);
__aes_arm64_encrypt(ctx->key_enc, buf, iv0, num_rounds(ctx));
__aes_arm64_encrypt(ctx->key_enc, mac, mac, num_rounds(ctx));
crypto_xor(mac, buf, AES_BLOCK_SIZE);
}
return err;
@ -236,42 +240,43 @@ static int ccm_encrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen;
bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
if (likely(use_neon))
kernel_neon_begin();
if (req->assoclen)
ccm_calculate_auth_mac(req, mac);
ccm_calculate_auth_mac(req, mac, use_neon);
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
err = skcipher_walk_aead_encrypt(&walk, req, true);
if (may_use_simd()) {
if (likely(use_neon)) {
while (walk.nbytes) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
if (walk.nbytes == walk.total)
tail = 0;
kernel_neon_begin();
ce_aes_ccm_encrypt(walk.dst.virt.addr,
walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, tail);
}
if (!err) {
kernel_neon_begin();
if (!err)
ce_aes_ccm_final(mac, buf, ctx->key_enc,
num_rounds(ctx));
kernel_neon_end();
}
kernel_neon_end();
} else {
err = ccm_crypt_fallback(&walk, mac, buf, ctx, true);
}
@ -294,42 +299,43 @@ static int ccm_decrypt(struct aead_request *req)
u8 __aligned(8) mac[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u32 len = req->cryptlen - authsize;
bool use_neon = may_use_simd();
int err;
err = ccm_init_mac(req, mac, len);
if (err)
return err;
if (likely(use_neon))
kernel_neon_begin();
if (req->assoclen)
ccm_calculate_auth_mac(req, mac);
ccm_calculate_auth_mac(req, mac, use_neon);
/* preserve the original iv for the final round */
memcpy(buf, req->iv, AES_BLOCK_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, false);
err = skcipher_walk_aead_decrypt(&walk, req, true);
if (may_use_simd()) {
if (likely(use_neon)) {
while (walk.nbytes) {
u32 tail = walk.nbytes % AES_BLOCK_SIZE;
if (walk.nbytes == walk.total)
tail = 0;
kernel_neon_begin();
ce_aes_ccm_decrypt(walk.dst.virt.addr,
walk.src.virt.addr,
walk.nbytes - tail, ctx->key_enc,
num_rounds(ctx), mac, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, tail);
}
if (!err) {
kernel_neon_begin();
if (!err)
ce_aes_ccm_final(mac, buf, ctx->key_enc,
num_rounds(ctx));
kernel_neon_end();
}
kernel_neon_end();
} else {
err = ccm_crypt_fallback(&walk, mac, buf, ctx, false);
}
@ -367,7 +373,7 @@ static struct aead_alg ccm_aes_alg = {
static int __init aes_mod_init(void)
{
if (!cpu_have_named_feature(AES))
if (!(elf_hwcap & HWCAP_AES))
return -ENODEV;
return crypto_register_aead(&ccm_aes_alg);
}


@ -11,7 +11,7 @@
.arch armv8-a+crypto
SYM_FUNC_START(__aes_ce_encrypt)
ENTRY(__aes_ce_encrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@ -37,9 +37,9 @@ SYM_FUNC_START(__aes_ce_encrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(__aes_ce_encrypt)
ENDPROC(__aes_ce_encrypt)
SYM_FUNC_START(__aes_ce_decrypt)
ENTRY(__aes_ce_decrypt)
sub w3, w3, #2
ld1 {v0.16b}, [x2]
ld1 {v1.4s}, [x0], #16
@ -65,23 +65,23 @@ SYM_FUNC_START(__aes_ce_decrypt)
eor v0.16b, v0.16b, v3.16b
st1 {v0.16b}, [x1]
ret
SYM_FUNC_END(__aes_ce_decrypt)
ENDPROC(__aes_ce_decrypt)
/*
* __aes_ce_sub() - use the aese instruction to perform the AES sbox
* substitution on each byte in 'input'
*/
SYM_FUNC_START(__aes_ce_sub)
ENTRY(__aes_ce_sub)
dup v1.4s, w0
movi v0.16b, #0
aese v0.16b, v1.16b
umov w0, v0.s[0]
ret
SYM_FUNC_END(__aes_ce_sub)
ENDPROC(__aes_ce_sub)
SYM_FUNC_START(__aes_ce_invert)
ENTRY(__aes_ce_invert)
ld1 {v0.4s}, [x1]
aesimc v1.16b, v0.16b
st1 {v1.4s}, [x0]
ret
SYM_FUNC_END(__aes_ce_invert)
ENDPROC(__aes_ce_invert)


@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/aes.h>
#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
@ -23,6 +22,9 @@ MODULE_DESCRIPTION("Synchronous AES cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
struct aes_block {
u8 b[AES_BLOCK_SIZE];
};
@ -51,7 +53,7 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
aes_encrypt(ctx, dst, src);
__aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
return;
}
@ -65,7 +67,7 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
aes_decrypt(ctx, dst, src);
__aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
return;
}


@ -12,21 +12,11 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#define AES_FUNC_START(func) SYM_FUNC_START(ce_ ## func)
#define AES_FUNC_END(func) SYM_FUNC_END(ce_ ## func)
#define AES_ENTRY(func) ENTRY(ce_ ## func)
#define AES_ENDPROC(func) ENDPROC(ce_ ## func)
.arch armv8-a+crypto
xtsmask .req v16
cbciv .req v16
vctr .req v16
.macro xts_reload_mask, tmp
.endm
.macro xts_cts_skip_tw, reg, lbl
.endm
/* preload all round keys */
.macro load_round_keys, rounds, rk
cmp \rounds, #12
@ -40,24 +30,21 @@
.endm
/* prepare for encryption with key in rk[] */
.macro enc_prepare, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro enc_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for encryption (again) but with new key in rk[] */
.macro enc_switch_key, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro enc_switch_key, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
/* prepare for decryption with key in rk[] */
.macro dec_prepare, rounds, rk, temp
mov \temp, \rk
load_round_keys \rounds, \temp
.macro dec_prepare, rounds, rk, ignore
load_round_keys \rounds, \rk
.endm
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3, i4
.macro do_enc_Nx, de, mc, k, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
aes\mc \i0\().16b, \i0\().16b
.ifnb \i1
@ -68,34 +55,27 @@
aes\mc \i2\().16b, \i2\().16b
aes\de \i3\().16b, \k\().16b
aes\mc \i3\().16b, \i3\().16b
.ifnb \i4
aes\de \i4\().16b, \k\().16b
aes\mc \i4\().16b, \i4\().16b
.endif
.endif
.endif
.endm
/* up to 5 interleaved encryption rounds with the same round key */
.macro round_Nx, enc, k, i0, i1, i2, i3, i4
/* up to 4 interleaved encryption rounds with the same round key */
.macro round_Nx, enc, k, i0, i1, i2, i3
.ifc \enc, e
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3, \i4
do_enc_Nx e, mc, \k, \i0, \i1, \i2, \i3
.else
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3, \i4
do_enc_Nx d, imc, \k, \i0, \i1, \i2, \i3
.endif
.endm
/* up to 5 interleaved final rounds */
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3, i4
/* up to 4 interleaved final rounds */
.macro fin_round_Nx, de, k, k2, i0, i1, i2, i3
aes\de \i0\().16b, \k\().16b
.ifnb \i1
aes\de \i1\().16b, \k\().16b
.ifnb \i3
aes\de \i2\().16b, \k\().16b
aes\de \i3\().16b, \k\().16b
.ifnb \i4
aes\de \i4\().16b, \k\().16b
.endif
.endif
.endif
eor \i0\().16b, \i0\().16b, \k2\().16b
@ -104,52 +84,47 @@
.ifnb \i3
eor \i2\().16b, \i2\().16b, \k2\().16b
eor \i3\().16b, \i3\().16b, \k2\().16b
.ifnb \i4
eor \i4\().16b, \i4\().16b, \k2\().16b
.endif
.endif
.endif
.endm
/* up to 5 interleaved blocks */
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3, i4
/* up to 4 interleaved blocks */
.macro do_block_Nx, enc, rounds, i0, i1, i2, i3
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
round_Nx \enc, v17, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v18, \i0, \i1, \i2, \i3, \i4
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v20, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, v17, \i0, \i1, \i2, \i3
round_Nx \enc, v18, \i0, \i1, \i2, \i3
1111: round_Nx \enc, v19, \i0, \i1, \i2, \i3
round_Nx \enc, v20, \i0, \i1, \i2, \i3
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
round_Nx \enc, \key, \i0, \i1, \i2, \i3, \i4
round_Nx \enc, \key, \i0, \i1, \i2, \i3
.endr
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3, \i4
fin_round_Nx \enc, v30, v31, \i0, \i1, \i2, \i3
.endm
.macro encrypt_block, in, rounds, t0, t1, t2
do_block_Nx e, \rounds, \in
.endm
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.macro encrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1
.endm
.macro encrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3, \i4
.macro encrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx e, \rounds, \i0, \i1, \i2, \i3
.endm
.macro decrypt_block, in, rounds, t0, t1, t2
do_block_Nx d, \rounds, \in
.endm
.macro decrypt_block2x, i0, i1, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1
.endm
.macro decrypt_block4x, i0, i1, i2, i3, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3
.endm
.macro decrypt_block5x, i0, i1, i2, i3, i4, rounds, t0, t1, t2
do_block_Nx d, \rounds, \i0, \i1, \i2, \i3, \i4
.endm
#define MAX_STRIDE 5
#include "aes-modes.S"


@ -125,11 +125,48 @@ CPU_BE( rev w7, w7 )
ret
.endm
SYM_FUNC_START(__aes_arm64_encrypt)
.align L1_CACHE_SHIFT
.type __aes_arm64_inverse_sbox, %object
__aes_arm64_inverse_sbox:
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.size __aes_arm64_inverse_sbox, . - __aes_arm64_inverse_sbox
ENTRY(__aes_arm64_encrypt)
do_crypt fround, crypto_ft_tab, crypto_ft_tab + 1, 2
SYM_FUNC_END(__aes_arm64_encrypt)
ENDPROC(__aes_arm64_encrypt)
.align 5
SYM_FUNC_START(__aes_arm64_decrypt)
do_crypt iround, crypto_it_tab, crypto_aes_inv_sbox, 0
SYM_FUNC_END(__aes_arm64_decrypt)
ENTRY(__aes_arm64_decrypt)
do_crypt iround, crypto_it_tab, __aes_arm64_inverse_sbox, 0
ENDPROC(__aes_arm64_decrypt)


@ -13,9 +13,12 @@
#include <linux/module.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_encrypt);
static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
asmlinkage void __aes_arm64_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
EXPORT_SYMBOL(__aes_arm64_decrypt);
static void aes_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -23,7 +26,7 @@ static void aes_arm64_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
__aes_arm64_encrypt(ctx->key_enc, out, in, rounds);
}
static void aes_arm64_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
int rounds = 6 + ctx->key_length / 4;
@ -43,8 +46,8 @@ static struct crypto_alg aes_alg = {
.cra_cipher.cia_min_keysize = AES_MIN_KEY_SIZE,
.cra_cipher.cia_max_keysize = AES_MAX_KEY_SIZE,
.cra_cipher.cia_setkey = crypto_aes_set_key,
.cra_cipher.cia_encrypt = aes_arm64_encrypt,
.cra_cipher.cia_decrypt = aes_arm64_decrypt
.cra_cipher.cia_encrypt = aes_encrypt,
.cra_cipher.cia_decrypt = aes_decrypt
};
static int __init aes_init(void)


@ -0,0 +1,53 @@
/*
* Fallback for sync aes(ctr) in contexts where kernel mode NEON
* is not allowed
*
* Copyright (C) 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <crypto/aes.h>
#include <crypto/internal/skcipher.h>
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static inline int aes_ctr_encrypt_fallback(struct crypto_aes_ctx *ctx,
struct skcipher_request *req)
{
struct skcipher_walk walk;
u8 buf[AES_BLOCK_SIZE];
int err;
err = skcipher_walk_virt(&walk, req, true);
while (walk.nbytes > 0) {
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
int nbytes = walk.nbytes;
int tail = 0;
if (nbytes < walk.total) {
nbytes = round_down(nbytes, AES_BLOCK_SIZE);
tail = walk.nbytes % AES_BLOCK_SIZE;
}
do {
int bsize = min(nbytes, AES_BLOCK_SIZE);
__aes_arm64_encrypt(ctx->key_enc, buf, walk.iv,
6 + ctx->key_length / 4);
crypto_xor_cpy(dst, src, buf, bsize);
crypto_inc(walk.iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
nbytes -= AES_BLOCK_SIZE;
} while (nbytes > 0);
err = skcipher_walk_done(&walk, tail);
}
return err;
}


@ -12,45 +12,25 @@
#include <asm/hwcap.h>
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/sha.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <crypto/xts.h>
#include "aes-ce-setkey.h"
#undef aes_expandkey
#undef aes_ecb_encrypt
#undef aes_ecb_decrypt
#undef aes_cbc_encrypt
#undef aes_cbc_decrypt
#undef aes_cbc_cts_encrypt
#undef aes_cbc_cts_decrypt
#undef aes_essiv_cbc_encrypt
#undef aes_essiv_cbc_decrypt
#undef aes_ctr_encrypt
#undef aes_xts_encrypt
#undef aes_xts_decrypt
#undef aes_mac_update
#include "aes-ctr-fallback.h"
#ifdef USE_V8_CRYPTO_EXTENSIONS
#define MODE "ce"
#define PRIO 300
#define STRIDE 5
#define aes_setkey ce_aes_setkey
#define aes_expandkey ce_aes_expandkey
#define aes_ecb_encrypt ce_aes_ecb_encrypt
#define aes_ecb_decrypt ce_aes_ecb_decrypt
#define aes_cbc_encrypt ce_aes_cbc_encrypt
#define aes_cbc_decrypt ce_aes_cbc_decrypt
#define aes_cbc_cts_encrypt ce_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt ce_aes_cbc_cts_decrypt
#define aes_essiv_cbc_encrypt ce_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt ce_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt ce_aes_ctr_encrypt
#define aes_xts_encrypt ce_aes_xts_encrypt
#define aes_xts_decrypt ce_aes_xts_decrypt
@ -59,84 +39,59 @@ MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 Crypto Extensions");
#else
#define MODE "neon"
#define PRIO 200
#define STRIDE 4
#define aes_setkey crypto_aes_set_key
#define aes_expandkey crypto_aes_expand_key
#define aes_ecb_encrypt neon_aes_ecb_encrypt
#define aes_ecb_decrypt neon_aes_ecb_decrypt
#define aes_cbc_encrypt neon_aes_cbc_encrypt
#define aes_cbc_decrypt neon_aes_cbc_decrypt
#define aes_cbc_cts_encrypt neon_aes_cbc_cts_encrypt
#define aes_cbc_cts_decrypt neon_aes_cbc_cts_decrypt
#define aes_essiv_cbc_encrypt neon_aes_essiv_cbc_encrypt
#define aes_essiv_cbc_decrypt neon_aes_essiv_cbc_decrypt
#define aes_ctr_encrypt neon_aes_ctr_encrypt
#define aes_xts_encrypt neon_aes_xts_encrypt
#define aes_xts_decrypt neon_aes_xts_decrypt
#define aes_mac_update neon_aes_mac_update
MODULE_DESCRIPTION("AES-ECB/CBC/CTR/XTS using ARMv8 NEON");
#endif
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
MODULE_ALIAS_CRYPTO("ecb(aes)");
MODULE_ALIAS_CRYPTO("cbc(aes)");
MODULE_ALIAS_CRYPTO("ctr(aes)");
MODULE_ALIAS_CRYPTO("xts(aes)");
#endif
MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
MODULE_ALIAS_CRYPTO("essiv(cbc(aes),sha256)");
MODULE_ALIAS_CRYPTO("cmac(aes)");
MODULE_ALIAS_CRYPTO("xcbc(aes)");
MODULE_ALIAS_CRYPTO("cbcmac(aes)");
#endif
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* defined in aes-modes.S */
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks);
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks);
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[],
asmlinkage void aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 const iv[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 ctr[]);
asmlinkage void aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int bytes, u8 ctr[], u8 finalbuf[]);
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
asmlinkage void aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
int first);
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int bytes, u32 const rk2[], u8 iv[],
asmlinkage void aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[],
int rounds, int blocks, u8 const rk2[], u8 iv[],
int first);
asmlinkage void aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
int rounds, int blocks, u8 iv[],
u32 const rk2[]);
asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
int blocks, u8 dg[], int enc_before,
int enc_after);
struct crypto_aes_xts_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
};
struct crypto_aes_essiv_cbc_ctx {
struct crypto_aes_ctx key1;
struct crypto_aes_ctx __aligned(8) key2;
struct crypto_shash *hash;
};
struct mac_tfm_ctx {
struct crypto_aes_ctx key;
u8 __aligned(8) consts[];
@ -150,18 +105,11 @@ struct mac_desc_ctx {
static int skcipher_aes_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
ret = aes_expandkey(ctx, in_key, key_len);
if (ret)
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return ret;
return aes_setkey(crypto_skcipher_tfm(tfm), in_key, key_len);
}
static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
const u8 *in_key, unsigned int key_len)
static int xts_set_key(struct crypto_skcipher *tfm, const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int ret;
@ -181,31 +129,7 @@ static int __maybe_unused xts_set_key(struct crypto_skcipher *tfm,
return -EINVAL;
}
static int __maybe_unused essiv_cbc_set_key(struct crypto_skcipher *tfm,
const u8 *in_key,
unsigned int key_len)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
u8 digest[SHA256_DIGEST_SIZE];
int ret;
ret = aes_expandkey(&ctx->key1, in_key, key_len);
if (ret)
goto out;
crypto_shash_tfm_digest(ctx->hash, in_key, key_len, digest);
ret = aes_expandkey(&ctx->key2, digest, sizeof(digest));
if (ret)
goto out;
return 0;
out:
crypto_skcipher_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
static int ecb_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@ -218,14 +142,14 @@ static int __maybe_unused ecb_encrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, blocks);
(u8 *)ctx->key_enc, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused ecb_decrypt(struct skcipher_request *req)
static int ecb_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
@ -238,243 +162,51 @@ static int __maybe_unused ecb_decrypt(struct skcipher_request *req)
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ecb_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, blocks);
(u8 *)ctx->key_dec, rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int cbc_encrypt_walk(struct skcipher_request *req,
struct skcipher_walk *walk)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err = 0, rounds = 6 + ctx->key_length / 4;
unsigned int blocks;
while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk->dst.virt.addr, walk->src.virt.addr,
ctx->key_enc, rounds, blocks, walk->iv);
kernel_neon_end();
err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused cbc_encrypt(struct skcipher_request *req)
{
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
return cbc_encrypt_walk(req, &walk);
}
static int cbc_decrypt_walk(struct skcipher_request *req,
struct skcipher_walk *walk)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err = 0, rounds = 6 + ctx->key_length / 4;
unsigned int blocks;
while ((blocks = (walk->nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_decrypt(walk->dst.virt.addr, walk->src.virt.addr,
ctx->key_dec, rounds, blocks, walk->iv);
kernel_neon_end();
err = skcipher_walk_done(walk, walk->nbytes % AES_BLOCK_SIZE);
}
return err;
}
static int __maybe_unused cbc_decrypt(struct skcipher_request *req)
{
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
return cbc_decrypt_walk(req, &walk);
}
static int cts_cbc_encrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
unsigned int blocks;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
NULL, NULL);
err = skcipher_walk_virt(&walk, req, false);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (cbc_blocks > 0) {
skcipher_request_set_crypt(&subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false) ?:
cbc_encrypt_walk(&subreq, &walk);
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst,
subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_enc, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int cts_cbc_decrypt(struct skcipher_request *req)
static int cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
int cbc_blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
struct scatterlist *src = req->src, *dst = req->dst;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct skcipher_walk walk;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq, skcipher_request_flags(req),
NULL, NULL);
if (req->cryptlen <= AES_BLOCK_SIZE) {
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
cbc_blocks = 1;
}
if (cbc_blocks > 0) {
skcipher_request_set_crypt(&subreq, req->src, req->dst,
cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false) ?:
cbc_decrypt_walk(&subreq, &walk);
if (err)
return err;
if (req->cryptlen == AES_BLOCK_SIZE)
return 0;
dst = src = scatterwalk_ffwd(sg_src, req->src, subreq.cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst,
subreq.cryptlen);
}
/* handle ciphertext stealing */
skcipher_request_set_crypt(&subreq, src, dst,
req->cryptlen - cbc_blocks * AES_BLOCK_SIZE,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_cbc_cts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key_dec, rounds, walk.nbytes, walk.iv);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
}
static int __maybe_unused essiv_cbc_init_tfm(struct crypto_skcipher *tfm)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
ctx->hash = crypto_alloc_shash("sha256", 0, 0);
return PTR_ERR_OR_ZERO(ctx->hash);
}
static void __maybe_unused essiv_cbc_exit_tfm(struct crypto_skcipher *tfm)
{
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
crypto_free_shash(ctx->hash);
}
static int __maybe_unused essiv_cbc_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key1.key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
blocks = walk.nbytes / AES_BLOCK_SIZE;
if (blocks) {
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_essiv_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, blocks,
req->iv, ctx->key2.key_enc);
aes_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_dec, rounds, blocks, walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err ?: cbc_encrypt_walk(req, &walk);
}
static int __maybe_unused essiv_cbc_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_essiv_cbc_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key1.key_length / 4;
struct skcipher_walk walk;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
blocks = walk.nbytes / AES_BLOCK_SIZE;
if (blocks) {
kernel_neon_begin();
aes_essiv_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, blocks,
req->iv, ctx->key2.key_enc);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
return err ?: cbc_decrypt_walk(req, &walk);
return err;
}
static int ctr_encrypt(struct skcipher_request *req)
@ -483,211 +215,95 @@ static int ctr_encrypt(struct skcipher_request *req)
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, rounds = 6 + ctx->key_length / 4;
struct skcipher_walk walk;
int blocks;
err = skcipher_walk_virt(&walk, req, false);
while (walk.nbytes > 0) {
const u8 *src = walk.src.virt.addr;
while ((blocks = (walk.nbytes / AES_BLOCK_SIZE))) {
kernel_neon_begin();
aes_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
(u8 *)ctx->key_enc, rounds, blocks, walk.iv);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
kernel_neon_end();
}
if (walk.nbytes) {
u8 __aligned(8) tail[AES_BLOCK_SIZE];
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
u8 buf[AES_BLOCK_SIZE];
unsigned int tail;
u8 *tdst = walk.dst.virt.addr;
u8 *tsrc = walk.src.virt.addr;
if (unlikely(nbytes < AES_BLOCK_SIZE))
src = memcpy(buf, src, nbytes);
else if (nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
/*
* Tell aes_ctr_encrypt() to process a tail block.
*/
blocks = -1;
kernel_neon_begin();
aes_ctr_encrypt(dst, src, ctx->key_enc, rounds, nbytes,
walk.iv, buf);
aes_ctr_encrypt(tail, NULL, (u8 *)ctx->key_enc, rounds,
blocks, walk.iv);
kernel_neon_end();
tail = nbytes % (STRIDE * AES_BLOCK_SIZE);
if (tail > 0 && tail < AES_BLOCK_SIZE)
/*
* The final partial block could not be returned using
* an overlapping store, so it was passed via buf[]
* instead.
*/
memcpy(dst + nbytes - tail, buf, tail);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
crypto_xor_cpy(tdst, tsrc, tail, nbytes);
err = skcipher_walk_done(&walk, 0);
}
return err;
}
static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
static int ctr_encrypt_sync(struct skcipher_request *req)
{
const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
unsigned long flags;
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
/*
* Temporarily disable interrupts to avoid races where
* cachelines are evicted when the CPU is interrupted
* to do something else.
*/
local_irq_save(flags);
aes_encrypt(ctx, dst, src);
local_irq_restore(flags);
}
static int __maybe_unused ctr_encrypt_sync(struct skcipher_request *req)
{
if (!may_use_simd())
return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
return aes_ctr_encrypt_fallback(ctx, req);
return ctr_encrypt(req);
}
static int __maybe_unused xts_encrypt(struct skcipher_request *req)
static int xts_encrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
} else {
tail = 0;
}
for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
int nbytes = walk.nbytes;
if (walk.nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, nbytes,
ctx->key2.key_enc, walk.iv, first);
(u8 *)ctx->key1.key_enc, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_xts_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_enc, rounds, walk.nbytes,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int __maybe_unused xts_decrypt(struct skcipher_request *req)
static int xts_decrypt(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct crypto_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err, first, rounds = 6 + ctx->key1.key_length / 4;
int tail = req->cryptlen % AES_BLOCK_SIZE;
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
unsigned int blocks;
err = skcipher_walk_virt(&walk, req, false);
if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_walk_abort(&walk);
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
err = skcipher_walk_virt(&walk, req, false);
} else {
tail = 0;
}
for (first = 1; walk.nbytes >= AES_BLOCK_SIZE; first = 0) {
int nbytes = walk.nbytes;
if (walk.nbytes < walk.total)
nbytes &= ~(AES_BLOCK_SIZE - 1);
for (first = 1; (blocks = (walk.nbytes / AES_BLOCK_SIZE)); first = 0) {
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, nbytes,
ctx->key2.key_enc, walk.iv, first);
(u8 *)ctx->key1.key_dec, rounds, blocks,
(u8 *)ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, &subreq, false);
if (err)
return err;
kernel_neon_begin();
aes_xts_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key1.key_dec, rounds, walk.nbytes,
ctx->key2.key_enc, walk.iv, first);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static struct skcipher_alg aes_algs[] = { {
#if defined(USE_V8_CRYPTO_EXTENSIONS) || !IS_ENABLED(CONFIG_CRYPTO_AES_ARM64_BS)
.base = {
.cra_name = "__ecb(aes)",
.cra_driver_name = "__ecb-aes-" MODE,
@ -764,46 +380,9 @@ static struct skcipher_alg aes_algs[] = { {
.min_keysize = 2 * AES_MIN_KEY_SIZE,
.max_keysize = 2 * AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
.setkey = xts_set_key,
.encrypt = xts_encrypt,
.decrypt = xts_decrypt,
}, {
#endif
.base = {
.cra_name = "__cts(cbc(aes))",
.cra_driver_name = "__cts-cbc-aes-" MODE,
.cra_priority = PRIO,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.walksize = 2 * AES_BLOCK_SIZE,
.setkey = skcipher_aes_setkey,
.encrypt = cts_cbc_encrypt,
.decrypt = cts_cbc_decrypt,
}, {
.base = {
.cra_name = "__essiv(cbc(aes),sha256)",
.cra_driver_name = "__essiv-cbc-aes-sha256-" MODE,
.cra_priority = PRIO + 1,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_essiv_cbc_ctx),
.cra_module = THIS_MODULE,
},
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.ivsize = AES_BLOCK_SIZE,
.setkey = essiv_cbc_set_key,
.encrypt = essiv_cbc_encrypt,
.decrypt = essiv_cbc_decrypt,
.init = essiv_cbc_init_tfm,
.exit = essiv_cbc_exit_tfm,
} };
static int cbcmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
@ -833,6 +412,7 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
{
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
be128 *consts = (be128 *)ctx->consts;
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
int err;
@ -842,8 +422,7 @@ static int cmac_setkey(struct crypto_shash *tfm, const u8 *in_key,
/* encrypt the zero vector */
kernel_neon_begin();
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, ctx->key.key_enc,
rounds, 1);
aes_ecb_encrypt(ctx->consts, (u8[AES_BLOCK_SIZE]){}, rk, rounds, 1);
kernel_neon_end();
cmac_gf128_mul_by_x(consts, consts);
@ -862,6 +441,7 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
};
struct mac_tfm_ctx *ctx = crypto_shash_ctx(tfm);
u8 *rk = (u8 *)ctx->key.key_enc;
int rounds = 6 + key_len / 4;
u8 key[AES_BLOCK_SIZE];
int err;
@ -871,8 +451,8 @@ static int xcbc_setkey(struct crypto_shash *tfm, const u8 *in_key,
return err;
kernel_neon_begin();
aes_ecb_encrypt(key, ks[0], ctx->key.key_enc, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], ctx->key.key_enc, rounds, 2);
aes_ecb_encrypt(key, ks[0], rk, rounds, 1);
aes_ecb_encrypt(ctx->consts, ks[1], rk, rounds, 2);
kernel_neon_end();
return cbcmac_setkey(tfm, key, sizeof(key));
@ -894,27 +474,21 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
int rounds = 6 + ctx->key_length / 4;
if (may_use_simd()) {
int rem;
do {
kernel_neon_begin();
rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
dg, enc_before, enc_after);
kernel_neon_end();
in += (blocks - rem) * AES_BLOCK_SIZE;
blocks = rem;
enc_before = 0;
} while (blocks);
kernel_neon_begin();
aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before,
enc_after);
kernel_neon_end();
} else {
if (enc_before)
aes_encrypt(ctx, dg, dg);
__aes_arm64_encrypt(ctx->key_enc, dg, dg, rounds);
while (blocks--) {
crypto_xor(dg, in, AES_BLOCK_SIZE);
in += AES_BLOCK_SIZE;
if (blocks || enc_after)
aes_encrypt(ctx, dg, dg);
__aes_arm64_encrypt(ctx->key_enc, dg, dg,
rounds);
}
}
}
@ -964,7 +538,7 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out)
struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
struct mac_desc_ctx *ctx = shash_desc_ctx(desc);
mac_do_update(&tctx->key, NULL, 0, ctx->dg, (ctx->len != 0), 0);
mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0);
memcpy(out, ctx->dg, AES_BLOCK_SIZE);
@ -993,6 +567,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cmac(aes)",
.base.cra_driver_name = "cmac-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@ -1008,6 +583,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "xcbc(aes)",
.base.cra_driver_name = "xcbc-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx) +
2 * AES_BLOCK_SIZE,
@ -1023,6 +599,7 @@ static struct shash_alg mac_algs[] = { {
.base.cra_name = "cbcmac(aes)",
.base.cra_driver_name = "cbcmac-aes-" MODE,
.base.cra_priority = PRIO,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct mac_tfm_ctx),
.base.cra_module = THIS_MODULE,
@ -1085,7 +662,6 @@ static int __init aes_init(void)
unregister_simds:
aes_exit();
return err;
unregister_ciphers:
crypto_unregister_skciphers(aes_algs, ARRAY_SIZE(aes_algs));
return err;
@ -1097,7 +673,5 @@ module_cpu_feature_match(AES, aes_init);
module_init(aes_init);
EXPORT_SYMBOL(neon_aes_ecb_encrypt);
EXPORT_SYMBOL(neon_aes_cbc_encrypt);
EXPORT_SYMBOL(neon_aes_xts_encrypt);
EXPORT_SYMBOL(neon_aes_xts_decrypt);
#endif
module_exit(aes_exit);

View File

@ -1,356 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017-2019 Linaro Ltd <ard.biesheuvel@linaro.org>
*/
#include <crypto/aes.h>
#include <linux/crypto.h>
#include <linux/module.h>
#include <asm/unaligned.h>
/*
* Emit the sbox as volatile const to prevent the compiler from doing
* constant folding on sbox references involving fixed indexes.
*/
static volatile const u8 __cacheline_aligned aes_sbox[] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
};
static volatile const u8 __cacheline_aligned aes_inv_sbox[] = {
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
};
extern const u8 crypto_aes_sbox[256] __alias(aes_sbox);
extern const u8 crypto_aes_inv_sbox[256] __alias(aes_inv_sbox);
EXPORT_SYMBOL(crypto_aes_sbox);
EXPORT_SYMBOL(crypto_aes_inv_sbox);
static u32 mul_by_x(u32 w)
{
u32 x = w & 0x7f7f7f7f;
u32 y = w & 0x80808080;
/* multiply by polynomial 'x' (0b10) in GF(2^8) */
return (x << 1) ^ (y >> 7) * 0x1b;
}
static u32 mul_by_x2(u32 w)
{
u32 x = w & 0x3f3f3f3f;
u32 y = w & 0x80808080;
u32 z = w & 0x40404040;
/* multiply by polynomial 'x^2' (0b100) in GF(2^8) */
return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
}
static u32 mix_columns(u32 x)
{
/*
* Perform the following matrix multiplication in GF(2^8)
*
* | 0x2 0x3 0x1 0x1 | | x[0] |
* | 0x1 0x2 0x3 0x1 | | x[1] |
* | 0x1 0x1 0x2 0x3 | x | x[2] |
* | 0x3 0x1 0x1 0x2 | | x[3] |
*/
u32 y = mul_by_x(x) ^ ror32(x, 16);
return y ^ ror32(x ^ y, 8);
}
static u32 inv_mix_columns(u32 x)
{
/*
* Perform the following matrix multiplication in GF(2^8)
*
* | 0xe 0xb 0xd 0x9 | | x[0] |
* | 0x9 0xe 0xb 0xd | | x[1] |
* | 0xd 0x9 0xe 0xb | x | x[2] |
* | 0xb 0xd 0x9 0xe | | x[3] |
*
* which can conveniently be reduced to
*
* | 0x2 0x3 0x1 0x1 | | 0x5 0x0 0x4 0x0 | | x[0] |
* | 0x1 0x2 0x3 0x1 | | 0x0 0x5 0x0 0x4 | | x[1] |
* | 0x1 0x1 0x2 0x3 | x | 0x4 0x0 0x5 0x0 | x | x[2] |
* | 0x3 0x1 0x1 0x2 | | 0x0 0x4 0x0 0x5 | | x[3] |
*/
u32 y = mul_by_x2(x);
return mix_columns(x ^ y ^ ror32(y, 16));
}
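
For a quick sanity check of the GF(2^8) arithmetic described in the comments above, the two column transforms must be exact inverses of each other. The helpers can be lifted into a tiny standalone C program (re-declared here with ror32 spelled out so it compiles outside the kernel) that verifies inv_mix_columns(mix_columns(x)) == x for a few sample words:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Standalone copies of the helpers above. */
static uint32_t ror32(uint32_t w, unsigned int n)
{
	return (w >> n) | (w << (32 - n));
}

static uint32_t mul_by_x(uint32_t w)
{
	uint32_t x = w & 0x7f7f7f7f;
	uint32_t y = w & 0x80808080;

	/* multiply each byte by 'x' (0b10), reducing mod x^8+x^4+x^3+x+1 */
	return (x << 1) ^ (y >> 7) * 0x1b;
}

static uint32_t mul_by_x2(uint32_t w)
{
	uint32_t x = w & 0x3f3f3f3f;
	uint32_t y = w & 0x80808080;
	uint32_t z = w & 0x40404040;

	/* multiply each byte by 'x^2' (0b100) */
	return (x << 2) ^ (y >> 7) * 0x36 ^ (z >> 6) * 0x1b;
}

static uint32_t mix_columns(uint32_t x)
{
	uint32_t y = mul_by_x(x) ^ ror32(x, 16);

	return y ^ ror32(x ^ y, 8);
}

static uint32_t inv_mix_columns(uint32_t x)
{
	uint32_t y = mul_by_x2(x);

	return mix_columns(x ^ y ^ ror32(y, 16));
}

int main(void)
{
	/* The two matrices are inverses, so the transforms must round-trip. */
	uint32_t samples[] = { 0x00000000, 0xdb135345, 0xf20a225c, 0xdeadbeef };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		assert(inv_mix_columns(mix_columns(samples[i])) == samples[i]);

	/* 0x80 * x reduces to 0x1b, the classic xtime corner case */
	assert(mul_by_x(0x00000080) == 0x0000001b);
	puts("mix_columns / inv_mix_columns round-trip OK");
	return 0;
}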
static __always_inline u32 subshift(u32 in[], int pos)
{
return (aes_sbox[in[pos] & 0xff]) ^
(aes_sbox[(in[(pos + 1) % 4] >> 8) & 0xff] << 8) ^
(aes_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_sbox[(in[(pos + 3) % 4] >> 24) & 0xff] << 24);
}
static __always_inline u32 inv_subshift(u32 in[], int pos)
{
return (aes_inv_sbox[in[pos] & 0xff]) ^
(aes_inv_sbox[(in[(pos + 3) % 4] >> 8) & 0xff] << 8) ^
(aes_inv_sbox[(in[(pos + 2) % 4] >> 16) & 0xff] << 16) ^
(aes_inv_sbox[(in[(pos + 1) % 4] >> 24) & 0xff] << 24);
}
static u32 subw(u32 in)
{
return (aes_sbox[in & 0xff]) ^
(aes_sbox[(in >> 8) & 0xff] << 8) ^
(aes_sbox[(in >> 16) & 0xff] << 16) ^
(aes_sbox[(in >> 24) & 0xff] << 24);
}
/**
* aes_expandkey - Expands the AES key as described in FIPS-197
* @ctx: The location where the computed key will be stored.
* @in_key: The supplied key.
* @key_len: The length of the supplied key.
*
* Returns 0 on success. The function fails only if an invalid key size (or
* pointer) is supplied.
* The expanded key size is 240 bytes (max of 14 rounds with a unique 16 bytes
* key schedule plus a 16 bytes key which is used before the first round).
* The decryption key is prepared for the "Equivalent Inverse Cipher" as
* described in FIPS-197. The first slot (16 bytes) of each key (enc or dec) is
* for the initial combination, the second slot for the first round and so on.
*/
int aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len)
{
u32 kwords = key_len / sizeof(u32);
u32 rc, i, j;
if (key_len != AES_KEYSIZE_128 &&
key_len != AES_KEYSIZE_192 &&
key_len != AES_KEYSIZE_256)
return -EINVAL;
ctx->key_length = key_len;
for (i = 0; i < kwords; i++)
ctx->key_enc[i] = get_unaligned_le32(in_key + i * sizeof(u32));
for (i = 0, rc = 1; i < 10; i++, rc = mul_by_x(rc)) {
u32 *rki = ctx->key_enc + (i * kwords);
u32 *rko = rki + kwords;
rko[0] = ror32(subw(rki[kwords - 1]), 8) ^ rc ^ rki[0];
rko[1] = rko[0] ^ rki[1];
rko[2] = rko[1] ^ rki[2];
rko[3] = rko[2] ^ rki[3];
if (key_len == AES_KEYSIZE_192) {
if (i >= 7)
break;
rko[4] = rko[3] ^ rki[4];
rko[5] = rko[4] ^ rki[5];
} else if (key_len == AES_KEYSIZE_256) {
if (i >= 6)
break;
rko[4] = subw(rko[3]) ^ rki[4];
rko[5] = rko[4] ^ rki[5];
rko[6] = rko[5] ^ rki[6];
rko[7] = rko[6] ^ rki[7];
}
}
/*
* Generate the decryption keys for the Equivalent Inverse Cipher.
* This involves reversing the order of the round keys, and applying
* the Inverse Mix Columns transformation to all but the first and
* the last one.
*/
ctx->key_dec[0] = ctx->key_enc[key_len + 24];
ctx->key_dec[1] = ctx->key_enc[key_len + 25];
ctx->key_dec[2] = ctx->key_enc[key_len + 26];
ctx->key_dec[3] = ctx->key_enc[key_len + 27];
for (i = 4, j = key_len + 20; j > 0; i += 4, j -= 4) {
ctx->key_dec[i] = inv_mix_columns(ctx->key_enc[j]);
ctx->key_dec[i + 1] = inv_mix_columns(ctx->key_enc[j + 1]);
ctx->key_dec[i + 2] = inv_mix_columns(ctx->key_enc[j + 2]);
ctx->key_dec[i + 3] = inv_mix_columns(ctx->key_enc[j + 3]);
}
ctx->key_dec[i] = ctx->key_enc[0];
ctx->key_dec[i + 1] = ctx->key_enc[1];
ctx->key_dec[i + 2] = ctx->key_enc[2];
ctx->key_dec[i + 3] = ctx->key_enc[3];
return 0;
}
EXPORT_SYMBOL(aes_expandkey);
/**
* aes_encrypt - Encrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the ciphertext
* @in: Buffer containing the plaintext
*/
void aes_encrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
{
const u32 *rkp = ctx->key_enc + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
st0[0] = ctx->key_enc[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_enc[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_enc[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_enc[3] ^ get_unaligned_le32(in + 12);
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
*/
st0[0] ^= aes_sbox[ 0] ^ aes_sbox[ 64] ^ aes_sbox[134] ^ aes_sbox[195];
st0[1] ^= aes_sbox[16] ^ aes_sbox[ 82] ^ aes_sbox[158] ^ aes_sbox[221];
st0[2] ^= aes_sbox[32] ^ aes_sbox[ 96] ^ aes_sbox[160] ^ aes_sbox[234];
st0[3] ^= aes_sbox[48] ^ aes_sbox[112] ^ aes_sbox[186] ^ aes_sbox[241];
for (round = 0;; round += 2, rkp += 8) {
st1[0] = mix_columns(subshift(st0, 0)) ^ rkp[0];
st1[1] = mix_columns(subshift(st0, 1)) ^ rkp[1];
st1[2] = mix_columns(subshift(st0, 2)) ^ rkp[2];
st1[3] = mix_columns(subshift(st0, 3)) ^ rkp[3];
if (round == rounds - 2)
break;
st0[0] = mix_columns(subshift(st1, 0)) ^ rkp[4];
st0[1] = mix_columns(subshift(st1, 1)) ^ rkp[5];
st0[2] = mix_columns(subshift(st1, 2)) ^ rkp[6];
st0[3] = mix_columns(subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(subshift(st1, 3) ^ rkp[7], out + 12);
}
EXPORT_SYMBOL(aes_encrypt);
/**
* aes_decrypt - Decrypt a single AES block
* @ctx: Context struct containing the key schedule
* @out: Buffer to store the plaintext
* @in: Buffer containing the ciphertext
*/
void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in)
{
const u32 *rkp = ctx->key_dec + 4;
int rounds = 6 + ctx->key_length / 4;
u32 st0[4], st1[4];
int round;
st0[0] = ctx->key_dec[0] ^ get_unaligned_le32(in);
st0[1] = ctx->key_dec[1] ^ get_unaligned_le32(in + 4);
st0[2] = ctx->key_dec[2] ^ get_unaligned_le32(in + 8);
st0[3] = ctx->key_dec[3] ^ get_unaligned_le32(in + 12);
/*
* Force the compiler to emit data independent Sbox references,
* by xoring the input with Sbox values that are known to add up
* to zero. This pulls the entire Sbox into the D-cache before any
* data dependent lookups are done.
*/
st0[0] ^= aes_inv_sbox[ 0] ^ aes_inv_sbox[ 64] ^ aes_inv_sbox[129] ^ aes_inv_sbox[200];
st0[1] ^= aes_inv_sbox[16] ^ aes_inv_sbox[ 83] ^ aes_inv_sbox[150] ^ aes_inv_sbox[212];
st0[2] ^= aes_inv_sbox[32] ^ aes_inv_sbox[ 96] ^ aes_inv_sbox[160] ^ aes_inv_sbox[236];
st0[3] ^= aes_inv_sbox[48] ^ aes_inv_sbox[112] ^ aes_inv_sbox[187] ^ aes_inv_sbox[247];
for (round = 0;; round += 2, rkp += 8) {
st1[0] = inv_mix_columns(inv_subshift(st0, 0)) ^ rkp[0];
st1[1] = inv_mix_columns(inv_subshift(st0, 1)) ^ rkp[1];
st1[2] = inv_mix_columns(inv_subshift(st0, 2)) ^ rkp[2];
st1[3] = inv_mix_columns(inv_subshift(st0, 3)) ^ rkp[3];
if (round == rounds - 2)
break;
st0[0] = inv_mix_columns(inv_subshift(st1, 0)) ^ rkp[4];
st0[1] = inv_mix_columns(inv_subshift(st1, 1)) ^ rkp[5];
st0[2] = inv_mix_columns(inv_subshift(st1, 2)) ^ rkp[6];
st0[3] = inv_mix_columns(inv_subshift(st1, 3)) ^ rkp[7];
}
put_unaligned_le32(inv_subshift(st1, 0) ^ rkp[4], out);
put_unaligned_le32(inv_subshift(st1, 1) ^ rkp[5], out + 4);
put_unaligned_le32(inv_subshift(st1, 2) ^ rkp[6], out + 8);
put_unaligned_le32(inv_subshift(st1, 3) ^ rkp[7], out + 12);
}
EXPORT_SYMBOL(aes_decrypt);
MODULE_DESCRIPTION("Generic AES library");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
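
Taken together, aes_expandkey(), aes_encrypt() and aes_decrypt() are the whole public surface of this library. A hedged sketch of how a caller could exercise it against the FIPS-197 appendix C.1 known-answer vector, written as a throwaway module init; it assumes a tree where this backported library is still present (i.e. before this revert) and that the declarations are exposed via <crypto/aes.h> as in mainline.

#include <crypto/aes.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>

static int __init aes_lib_selftest(void)
{
	/* FIPS-197 appendix C.1: AES-128 known-answer test */
	static const u8 key[16] = {
		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	};
	static const u8 pt[16] = {
		0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
		0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
	};
	static const u8 ct[16] = {
		0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
		0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
	};
	struct crypto_aes_ctx ctx;
	u8 buf[AES_BLOCK_SIZE];
	int err;

	err = aes_expandkey(&ctx, key, sizeof(key));
	if (err)
		return err;

	aes_encrypt(&ctx, buf, pt);
	if (memcmp(buf, ct, sizeof(ct)))
		return -EINVAL;

	aes_decrypt(&ctx, buf, ct);
	if (memcmp(buf, pt, sizeof(pt)))
		return -EINVAL;

	pr_info("generic AES library: FIPS-197 C.1 vector OK\n");
	return 0;
}
module_init(aes_lib_selftest);

static void __exit aes_lib_selftest_exit(void) { }
module_exit(aes_lib_selftest_exit);
MODULE_LICENSE("GPL v2");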

View File

@ -13,39 +13,15 @@
.text
.align 4
#ifndef MAX_STRIDE
#define MAX_STRIDE 4
#endif
#if MAX_STRIDE == 4
#define ST4(x...) x
#define ST5(x...)
#else
#define ST4(x...)
#define ST5(x...) x
#endif
SYM_FUNC_START_LOCAL(aes_encrypt_block4x)
aes_encrypt_block4x:
encrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_encrypt_block4x)
ENDPROC(aes_encrypt_block4x)
SYM_FUNC_START_LOCAL(aes_decrypt_block4x)
aes_decrypt_block4x:
decrypt_block4x v0, v1, v2, v3, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_decrypt_block4x)
#if MAX_STRIDE == 5
SYM_FUNC_START_LOCAL(aes_encrypt_block5x)
encrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_encrypt_block5x)
SYM_FUNC_START_LOCAL(aes_decrypt_block5x)
decrypt_block5x v0, v1, v2, v3, v4, w3, x2, x8, w7
ret
SYM_FUNC_END(aes_decrypt_block5x)
#endif
ENDPROC(aes_decrypt_block4x)
/*
* aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@ -54,24 +30,21 @@ SYM_FUNC_END(aes_decrypt_block5x)
* int blocks)
*/
AES_FUNC_START(aes_ecb_encrypt)
AES_ENTRY(aes_ecb_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x5
.LecbencloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lecbenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
ST4( bl aes_encrypt_block4x )
ST5( ld1 {v4.16b}, [x1], #16 )
ST5( bl aes_encrypt_block5x )
bl aes_encrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LecbencloopNx
.Lecbenc1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lecbencout
.Lecbencloop:
ld1 {v0.16b}, [x1], #16 /* get next pt block */
@ -82,27 +55,24 @@ ST5( st1 {v4.16b}, [x0], #16 )
.Lecbencout:
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_ecb_encrypt)
AES_ENDPROC(aes_ecb_encrypt)
AES_FUNC_START(aes_ecb_decrypt)
AES_ENTRY(aes_ecb_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
dec_prepare w3, x2, x5
.LecbdecloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lecbdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
ST4( bl aes_decrypt_block4x )
ST5( ld1 {v4.16b}, [x1], #16 )
ST5( bl aes_decrypt_block5x )
bl aes_decrypt_block4x
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LecbdecloopNx
.Lecbdec1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lecbdecout
.Lecbdecloop:
ld1 {v0.16b}, [x1], #16 /* get next ct block */
@ -113,7 +83,7 @@ ST5( st1 {v4.16b}, [x0], #16 )
.Lecbdecout:
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_ecb_decrypt)
AES_ENDPROC(aes_ecb_decrypt)
/*
@ -121,24 +91,9 @@ AES_FUNC_END(aes_ecb_decrypt)
* int blocks, u8 iv[])
* aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aes_essiv_cbc_encrypt(u8 out[], u8 const in[], u32 const rk1[],
* int rounds, int blocks, u8 iv[],
* u32 const rk2[]);
* aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
* int rounds, int blocks, u8 iv[],
* u32 const rk2[]);
*/
AES_FUNC_START(aes_essiv_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
enc_prepare w8, x6, x7
encrypt_block v4, w8, x6, x7, w9
enc_switch_key w3, x2, x6
b .Lcbcencloop4x
AES_FUNC_START(aes_cbc_encrypt)
AES_ENTRY(aes_cbc_encrypt)
ld1 {v4.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
@ -170,360 +125,184 @@ AES_FUNC_START(aes_cbc_encrypt)
.Lcbcencout:
st1 {v4.16b}, [x5] /* return iv */
ret
AES_FUNC_END(aes_cbc_encrypt)
AES_FUNC_END(aes_essiv_cbc_encrypt)
AES_ENDPROC(aes_cbc_encrypt)
AES_FUNC_START(aes_essiv_cbc_decrypt)
AES_ENTRY(aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {cbciv.16b}, [x5] /* get iv */
mov w8, #14 /* AES-256: 14 rounds */
enc_prepare w8, x6, x7
encrypt_block cbciv, w8, x6, x7, w9
b .Lessivcbcdecstart
AES_FUNC_START(aes_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {cbciv.16b}, [x5] /* get iv */
.Lessivcbcdecstart:
ld1 {v7.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
.LcbcdecloopNx:
subs w4, w4, #MAX_STRIDE
subs w4, w4, #4
bmi .Lcbcdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
#if MAX_STRIDE == 5
ld1 {v4.16b}, [x1], #16 /* get 1 ct block */
mov v5.16b, v0.16b
mov v6.16b, v1.16b
mov v7.16b, v2.16b
bl aes_decrypt_block5x
sub x1, x1, #32
eor v0.16b, v0.16b, cbciv.16b
eor v1.16b, v1.16b, v5.16b
ld1 {v5.16b}, [x1], #16 /* reload 1 ct block */
ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v7.16b
eor v4.16b, v4.16b, v5.16b
#else
mov v4.16b, v0.16b
mov v5.16b, v1.16b
mov v6.16b, v2.16b
bl aes_decrypt_block4x
sub x1, x1, #16
eor v0.16b, v0.16b, cbciv.16b
eor v0.16b, v0.16b, v7.16b
eor v1.16b, v1.16b, v4.16b
ld1 {cbciv.16b}, [x1], #16 /* reload 1 ct block */
ld1 {v7.16b}, [x1], #16 /* reload 1 ct block */
eor v2.16b, v2.16b, v5.16b
eor v3.16b, v3.16b, v6.16b
#endif
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
b .LcbcdecloopNx
.Lcbcdec1x:
adds w4, w4, #MAX_STRIDE
adds w4, w4, #4
beq .Lcbcdecout
.Lcbcdecloop:
ld1 {v1.16b}, [x1], #16 /* get next ct block */
mov v0.16b, v1.16b /* ...and copy to v0 */
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, cbciv.16b /* xor with iv => pt */
mov cbciv.16b, v1.16b /* ct is next iv */
eor v0.16b, v0.16b, v7.16b /* xor with iv => pt */
mov v7.16b, v1.16b /* ct is next iv */
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
bne .Lcbcdecloop
.Lcbcdecout:
st1 {cbciv.16b}, [x5] /* return iv */
st1 {v7.16b}, [x5] /* return iv */
ldp x29, x30, [sp], #16
ret
AES_FUNC_END(aes_cbc_decrypt)
AES_FUNC_END(aes_essiv_cbc_decrypt)
/*
* aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
* aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
* int rounds, int bytes, u8 const iv[])
*/
AES_FUNC_START(aes_cbc_cts_encrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
enc_prepare w3, x2, x6
eor v0.16b, v0.16b, v5.16b /* xor with iv */
tbl v1.16b, {v1.16b}, v4.16b
encrypt_block v0, w3, x2, x6, w7
eor v1.16b, v1.16b, v0.16b
tbl v0.16b, {v0.16b}, v3.16b
encrypt_block v1, w3, x2, x6, w7
add x4, x0, x4
st1 {v0.16b}, [x4] /* overlapping stores */
st1 {v1.16b}, [x0]
ret
AES_FUNC_END(aes_cbc_cts_encrypt)
AES_FUNC_START(aes_cbc_cts_decrypt)
adr_l x8, .Lcts_permute_table
sub x4, x4, #16
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
ld1 {v3.16b}, [x8]
ld1 {v4.16b}, [x9]
ld1 {v0.16b}, [x1], x4 /* overlapping loads */
ld1 {v1.16b}, [x1]
ld1 {v5.16b}, [x5] /* get iv */
dec_prepare w3, x2, x6
decrypt_block v0, w3, x2, x6, w7
tbl v2.16b, {v0.16b}, v3.16b
eor v2.16b, v2.16b, v1.16b
tbx v0.16b, {v1.16b}, v4.16b
decrypt_block v0, w3, x2, x6, w7
eor v0.16b, v0.16b, v5.16b /* xor with iv */
add x4, x0, x4
st1 {v2.16b}, [x4] /* overlapping stores */
st1 {v0.16b}, [x0]
ret
AES_FUNC_END(aes_cbc_cts_decrypt)
.section ".rodata", "a"
.align 6
.Lcts_permute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.previous
AES_ENDPROC(aes_cbc_decrypt)
/*
* aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int bytes, u8 ctr[], u8 finalbuf[])
* int blocks, u8 ctr[])
*/
AES_FUNC_START(aes_ctr_encrypt)
AES_ENTRY(aes_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
enc_prepare w3, x2, x12
ld1 {vctr.16b}, [x5]
umov x12, vctr.d[1] /* keep swabbed ctr in reg */
rev x12, x12
enc_prepare w3, x2, x6
ld1 {v4.16b}, [x5]
umov x6, v4.d[1] /* keep swabbed ctr in reg */
rev x6, x6
cmn w6, w4 /* 32 bit overflow? */
bcs .Lctrloop
.LctrloopNx:
add w7, w4, #15
sub w4, w4, #MAX_STRIDE << 4
lsr w7, w7, #4
mov w8, #MAX_STRIDE
cmp w7, w8
csel w7, w7, w8, lt
adds x12, x12, x7
mov v0.16b, vctr.16b
mov v1.16b, vctr.16b
mov v2.16b, vctr.16b
mov v3.16b, vctr.16b
ST5( mov v4.16b, vctr.16b )
bcs 0f
.subsection 1
/* apply carry to outgoing counter */
0: umov x8, vctr.d[0]
rev x8, x8
add x8, x8, #1
rev x8, x8
ins vctr.d[0], x8
/* apply carry to N counter blocks for N := x12 */
cbz x12, 2f
adr x16, 1f
sub x16, x16, x12, lsl #3
br x16
hint 34 // bti c
mov v0.d[0], vctr.d[0]
hint 34 // bti c
mov v1.d[0], vctr.d[0]
hint 34 // bti c
mov v2.d[0], vctr.d[0]
hint 34 // bti c
mov v3.d[0], vctr.d[0]
ST5( hint 34 )
ST5( mov v4.d[0], vctr.d[0] )
1: b 2f
.previous
2: rev x7, x12
ins vctr.d[1], x7
sub x7, x12, #MAX_STRIDE - 1
sub x8, x12, #MAX_STRIDE - 2
sub x9, x12, #MAX_STRIDE - 3
rev x7, x7
rev x8, x8
mov v1.d[1], x7
rev x9, x9
ST5( sub x10, x12, #MAX_STRIDE - 4 )
mov v2.d[1], x8
ST5( rev x10, x10 )
mov v3.d[1], x9
ST5( mov v4.d[1], x10 )
tbnz w4, #31, .Lctrtail
ld1 {v5.16b-v7.16b}, [x1], #48
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
subs w4, w4, #4
bmi .Lctr1x
add w7, w6, #1
mov v0.16b, v4.16b
add w8, w6, #2
mov v1.16b, v4.16b
add w9, w6, #3
mov v2.16b, v4.16b
rev w7, w7
mov v3.16b, v4.16b
rev w8, w8
mov v1.s[3], w7
rev w9, w9
mov v2.s[3], w8
mov v3.s[3], w9
ld1 {v5.16b-v7.16b}, [x1], #48 /* get 3 input blocks */
bl aes_encrypt_block4x
eor v0.16b, v5.16b, v0.16b
ST4( ld1 {v5.16b}, [x1], #16 )
ld1 {v5.16b}, [x1], #16 /* get 1 input block */
eor v1.16b, v6.16b, v1.16b
ST5( ld1 {v5.16b-v6.16b}, [x1], #32 )
eor v2.16b, v7.16b, v2.16b
eor v3.16b, v5.16b, v3.16b
ST5( eor v4.16b, v6.16b, v4.16b )
st1 {v0.16b-v3.16b}, [x0], #64
ST5( st1 {v4.16b}, [x0], #16 )
add x6, x6, #4
rev x7, x6
ins v4.d[1], x7
cbz w4, .Lctrout
b .LctrloopNx
.Lctr1x:
adds w4, w4, #4
beq .Lctrout
.Lctrloop:
mov v0.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7
adds x6, x6, #1 /* increment BE ctr */
rev x7, x6
ins v4.d[1], x7
bcs .Lctrcarry /* overflow? */
.Lctrcarrydone:
subs w4, w4, #1
bmi .Lctrtailblock /* blocks <0 means tail block */
ld1 {v3.16b}, [x1], #16
eor v3.16b, v0.16b, v3.16b
st1 {v3.16b}, [x0], #16
bne .Lctrloop
.Lctrout:
st1 {vctr.16b}, [x5] /* return next CTR value */
st1 {v4.16b}, [x5] /* return next CTR value */
ldp x29, x30, [sp], #16
ret
.Lctrtail:
/* XOR up to MAX_STRIDE * 16 - 1 bytes of in/output with v0 ... v3/v4 */
mov x16, #16
ands x13, x4, #0xf
csel x13, x13, x16, ne
.Lctrtailblock:
st1 {v0.16b}, [x0]
ldp x29, x30, [sp], #16
ret
ST5( cmp w4, #64 - (MAX_STRIDE << 4) )
ST5( csel x14, x16, xzr, gt )
cmp w4, #48 - (MAX_STRIDE << 4)
csel x15, x16, xzr, gt
cmp w4, #32 - (MAX_STRIDE << 4)
csel x16, x16, xzr, gt
cmp w4, #16 - (MAX_STRIDE << 4)
ble .Lctrtail1x
adr_l x12, .Lcts_permute_table
add x12, x12, x13
ST5( ld1 {v5.16b}, [x1], x14 )
ld1 {v6.16b}, [x1], x15
ld1 {v7.16b}, [x1], x16
ST4( bl aes_encrypt_block4x )
ST5( bl aes_encrypt_block5x )
ld1 {v8.16b}, [x1], x13
ld1 {v9.16b}, [x1]
ld1 {v10.16b}, [x12]
ST4( eor v6.16b, v6.16b, v0.16b )
ST4( eor v7.16b, v7.16b, v1.16b )
ST4( tbl v3.16b, {v3.16b}, v10.16b )
ST4( eor v8.16b, v8.16b, v2.16b )
ST4( eor v9.16b, v9.16b, v3.16b )
ST5( eor v5.16b, v5.16b, v0.16b )
ST5( eor v6.16b, v6.16b, v1.16b )
ST5( tbl v4.16b, {v4.16b}, v10.16b )
ST5( eor v7.16b, v7.16b, v2.16b )
ST5( eor v8.16b, v8.16b, v3.16b )
ST5( eor v9.16b, v9.16b, v4.16b )
ST5( st1 {v5.16b}, [x0], x14 )
st1 {v6.16b}, [x0], x15
st1 {v7.16b}, [x0], x16
add x13, x13, x0
st1 {v9.16b}, [x13] // overlapping stores
st1 {v8.16b}, [x0]
b .Lctrout
.Lctrtail1x:
csel x0, x0, x6, eq // use finalbuf if less than a full block
ld1 {v5.16b}, [x1]
ST5( mov v3.16b, v4.16b )
encrypt_block v3, w3, x2, x8, w7
eor v5.16b, v5.16b, v3.16b
st1 {v5.16b}, [x0]
b .Lctrout
AES_FUNC_END(aes_ctr_encrypt)
.Lctrcarry:
umov x7, v4.d[0] /* load upper word of ctr */
rev x7, x7 /* ... to handle the carry */
add x7, x7, #1
rev x7, x7
ins v4.d[0], x7
b .Lctrcarrydone
AES_ENDPROC(aes_ctr_encrypt)
.ltorg
/*
* aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int bytes, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int bytes, u8 const rk2[], u8 iv[], int first)
* int blocks, u8 const rk2[], u8 iv[], int first)
* aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
* int blocks, u8 const rk2[], u8 iv[], int first)
*/
.macro next_tweak, out, in, tmp
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
and \tmp\().16b, \tmp\().16b, xtsmask.16b
and \tmp\().16b, \tmp\().16b, \const\().16b
add \out\().2d, \in\().2d, \in\().2d
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.macro xts_load_mask, tmp
movi xtsmask.2s, #0x1
movi \tmp\().2s, #0x87
uzp1 xtsmask.4s, xtsmask.4s, \tmp\().4s
.endm
.Lxts_mul_x:
CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 )
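
Both variants of next_tweak above compute the same thing: the 128-bit tweak is doubled in GF(2^128), and if the top bit falls off it is folded back in with the 0x87 constant held in .Lxts_mul_x (old code) or xtsmask (new code). A byte-wise C sketch of that single step, assuming the usual little-endian XTS tweak layout:

#include <stdint.h>

/*
 * Multiply the XTS tweak by x in GF(2^128): shift the 128-bit value
 * left by one bit and, if a carry comes out of bit 127, reduce with
 * x^128 + x^7 + x^2 + x + 1 by XORing 0x87 into the low byte.
 * Byte 0 holds the least significant bits (little-endian layout).
 */
void xts_next_tweak(uint8_t t[16])
{
	unsigned int carry = 0;

	for (int i = 0; i < 16; i++) {
		unsigned int msb = t[i] >> 7;

		t[i] = (uint8_t)((t[i] << 1) | carry);
		carry = msb;
	}
	if (carry)
		t[0] ^= 0x87;
}

Applying this repeatedly, starting from the encrypted IV, yields the per-block tweaks that the eor instructions below mix into each data block before and after the block cipher.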
AES_FUNC_START(aes_xts_encrypt)
AES_ENTRY(aes_xts_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v4.16b}, [x6]
xts_load_mask v8
cbz w7, .Lxtsencnotfirst
enc_prepare w3, x5, x8
xts_cts_skip_tw w7, .LxtsencNx
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
enc_switch_key w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsencNx
.Lxtsencnotfirst:
enc_prepare w3, x2, x8
.LxtsencloopNx:
next_tweak v4, v4, v8
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsencNx:
subs w4, w4, #64
subs w4, w4, #4
bmi .Lxtsenc1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 pt blocks */
next_tweak v5, v4, v8
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v8
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v8
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
bl aes_encrypt_block4x
eor v3.16b, v3.16b, v7.16b
@ -532,91 +311,56 @@ AES_FUNC_START(aes_xts_encrypt)
eor v2.16b, v2.16b, v6.16b
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsencret
xts_reload_mask v8
cbz w4, .Lxtsencout
b .LxtsencloopNx
.Lxtsenc1x:
adds w4, w4, #64
adds w4, w4, #4
beq .Lxtsencout
subs w4, w4, #16
bmi .LxtsencctsNx
.Lxtsencloop:
ld1 {v0.16b}, [x1], #16
.Lxtsencctsout:
eor v0.16b, v0.16b, v4.16b
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
encrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
cbz w4, .Lxtsencout
subs w4, w4, #16
next_tweak v4, v4, v8
bmi .Lxtsenccts
st1 {v0.16b}, [x0], #16
subs w4, w4, #1
beq .Lxtsencout
next_tweak v4, v4, v7, v8
b .Lxtsencloop
.Lxtsencout:
st1 {v0.16b}, [x0]
.Lxtsencret:
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
AES_ENDPROC(aes_xts_encrypt)
.LxtsencctsNx:
mov v0.16b, v3.16b
sub x0, x0, #16
.Lxtsenccts:
adr_l x8, .Lcts_permute_table
add x1, x1, w4, sxtw /* rewind input pointer */
add w4, w4, #16 /* # bytes in final block */
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
add x4, x0, x4 /* output address of final block */
ld1 {v1.16b}, [x1] /* load final block */
ld1 {v2.16b}, [x8]
ld1 {v3.16b}, [x9]
tbl v2.16b, {v0.16b}, v2.16b
tbx v0.16b, {v1.16b}, v3.16b
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsencctsout
AES_FUNC_END(aes_xts_encrypt)
AES_FUNC_START(aes_xts_decrypt)
AES_ENTRY(aes_xts_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
/* subtract 16 bytes if we are doing CTS */
sub w8, w4, #0x10
tst w4, #0xf
csel w4, w4, w8, eq
ld1 {v4.16b}, [x6]
xts_load_mask v8
xts_cts_skip_tw w7, .Lxtsdecskiptw
cbz w7, .Lxtsdecnotfirst
enc_prepare w3, x5, x8
encrypt_block v4, w3, x5, x8, w7 /* first tweak */
.Lxtsdecskiptw:
dec_prepare w3, x2, x8
ldr q7, .Lxts_mul_x
b .LxtsdecNx
.Lxtsdecnotfirst:
dec_prepare w3, x2, x8
.LxtsdecloopNx:
next_tweak v4, v4, v8
ldr q7, .Lxts_mul_x
next_tweak v4, v4, v7, v8
.LxtsdecNx:
subs w4, w4, #64
subs w4, w4, #4
bmi .Lxtsdec1x
ld1 {v0.16b-v3.16b}, [x1], #64 /* get 4 ct blocks */
next_tweak v5, v4, v8
next_tweak v5, v4, v7, v8
eor v0.16b, v0.16b, v4.16b
next_tweak v6, v5, v8
next_tweak v6, v5, v7, v8
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
next_tweak v7, v6, v8
next_tweak v7, v6, v7, v8
eor v3.16b, v3.16b, v7.16b
bl aes_decrypt_block4x
eor v3.16b, v3.16b, v7.16b
@ -626,62 +370,31 @@ AES_FUNC_START(aes_xts_decrypt)
st1 {v0.16b-v3.16b}, [x0], #64
mov v4.16b, v7.16b
cbz w4, .Lxtsdecout
xts_reload_mask v8
b .LxtsdecloopNx
.Lxtsdec1x:
adds w4, w4, #64
adds w4, w4, #4
beq .Lxtsdecout
subs w4, w4, #16
.Lxtsdecloop:
ld1 {v0.16b}, [x1], #16
bmi .Lxtsdeccts
.Lxtsdecctsout:
eor v0.16b, v0.16b, v4.16b
ld1 {v1.16b}, [x1], #16
eor v0.16b, v1.16b, v4.16b
decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v4.16b
st1 {v0.16b}, [x0], #16
cbz w4, .Lxtsdecout
subs w4, w4, #16
next_tweak v4, v4, v8
subs w4, w4, #1
beq .Lxtsdecout
next_tweak v4, v4, v7, v8
b .Lxtsdecloop
.Lxtsdecout:
st1 {v4.16b}, [x6]
ldp x29, x30, [sp], #16
ret
.Lxtsdeccts:
adr_l x8, .Lcts_permute_table
add x1, x1, w4, sxtw /* rewind input pointer */
add w4, w4, #16 /* # bytes in final block */
add x9, x8, #32
add x8, x8, x4
sub x9, x9, x4
add x4, x0, x4 /* output address of final block */
next_tweak v5, v4, v8
ld1 {v1.16b}, [x1] /* load final block */
ld1 {v2.16b}, [x8]
ld1 {v3.16b}, [x9]
eor v0.16b, v0.16b, v5.16b
decrypt_block v0, w3, x2, x8, w7
eor v0.16b, v0.16b, v5.16b
tbl v2.16b, {v0.16b}, v2.16b
tbx v0.16b, {v1.16b}, v3.16b
st1 {v2.16b}, [x4] /* overlapping stores */
mov w4, wzr
b .Lxtsdecctsout
AES_FUNC_END(aes_xts_decrypt)
AES_ENDPROC(aes_xts_decrypt)
/*
* aes_mac_update(u8 const in[], u32 const rk[], int rounds,
* int blocks, u8 dg[], int enc_before, int enc_after)
*/
AES_FUNC_START(aes_mac_update)
AES_ENTRY(aes_mac_update)
ld1 {v0.16b}, [x4] /* get dg */
enc_prepare w2, x1, x7
cbz w5, .Lmacloop4x
@ -703,8 +416,6 @@ AES_FUNC_START(aes_mac_update)
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
encrypt_block v0, w2, x1, x7, w8
st1 {v0.16b}, [x4] /* return dg */
cond_yield .Lmacout, x7
b .Lmacloop4x
.Lmac1x:
add w3, w3, #4
@ -717,12 +428,10 @@ AES_FUNC_START(aes_mac_update)
csinv x5, x6, xzr, eq
cbz w5, .Lmacout
.Lmacenc:
encrypt_block v0, w2, x1, x7, w8
b .Lmacloop
.Lmacout:
st1 {v0.16b}, [x4] /* return dg */
mov w0, w3
ret
AES_FUNC_END(aes_mac_update)
AES_ENDPROC(aes_mac_update)

View File

@ -11,21 +11,8 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
#define AES_FUNC_START(func) SYM_FUNC_START(neon_ ## func)
#define AES_FUNC_END(func) SYM_FUNC_END(neon_ ## func)
xtsmask .req v7
cbciv .req v7
vctr .req v4
.macro xts_reload_mask, tmp
xts_load_mask \tmp
.endm
/* special case for the neon-bs driver calling into this one for CTS */
.macro xts_cts_skip_tw, reg, lbl
tbnz \reg, #1, \lbl
.endm
#define AES_ENTRY(func) ENTRY(neon_ ## func)
#define AES_ENDPROC(func) ENDPROC(neon_ ## func)
/* multiply by polynomial 'x' in GF(2^8) */
.macro mul_by_x, out, in, temp, const
@ -45,10 +32,10 @@
/* preload the entire Sbox */
.macro prepare, sbox, shiftrows, temp
adr \temp, \sbox
movi v12.16b, #0x1b
ldr_l q13, \shiftrows, \temp
ldr_l q14, .Lror32by8, \temp
adr_l \temp, \sbox
ldr q13, \shiftrows
ldr q14, .Lror32by8
ld1 {v16.16b-v19.16b}, [\temp], #64
ld1 {v20.16b-v23.16b}, [\temp], #64
ld1 {v24.16b-v27.16b}, [\temp], #64
@ -57,7 +44,7 @@
/* do preload for encryption */
.macro enc_prepare, ignore0, ignore1, temp
prepare crypto_aes_sbox, .LForward_ShiftRows, \temp
prepare .LForward_Sbox, .LForward_ShiftRows, \temp
.endm
.macro enc_switch_key, ignore0, ignore1, temp
@ -66,7 +53,7 @@
/* do preload for decryption */
.macro dec_prepare, ignore0, ignore1, temp
prepare crypto_aes_inv_sbox, .LReverse_ShiftRows, \temp
prepare .LReverse_Sbox, .LReverse_ShiftRows, \temp
.endm
/* apply SubBytes transformation using the preloaded Sbox */
@ -124,9 +111,26 @@
/*
* Interleaved versions: functionally equivalent to the
* ones above, but applied to AES states in parallel.
* ones above, but applied to 2 or 4 AES states in parallel.
*/
.macro sub_bytes_2x, in0, in1
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
sub v9.16b, \in1\().16b, v15.16b
tbl \in1\().16b, {v16.16b-v19.16b}, \in1\().16b
sub v10.16b, v8.16b, v15.16b
tbx \in0\().16b, {v20.16b-v23.16b}, v8.16b
sub v11.16b, v9.16b, v15.16b
tbx \in1\().16b, {v20.16b-v23.16b}, v9.16b
sub v8.16b, v10.16b, v15.16b
tbx \in0\().16b, {v24.16b-v27.16b}, v10.16b
sub v9.16b, v11.16b, v15.16b
tbx \in1\().16b, {v24.16b-v27.16b}, v11.16b
tbx \in0\().16b, {v28.16b-v31.16b}, v8.16b
tbx \in1\().16b, {v28.16b-v31.16b}, v9.16b
.endm
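Stripped of the tbl/tbx splitting across four 64-entry table registers, the lookup above is byte-wise SubBytes applied to two states at once. A small C sketch of the same substitution, assuming the 256-byte forward S-box emitted further down at .LForward_Sbox is available as a plain array (the array name here is illustrative):

#include <stdint.h>

/* assumed: the 256-byte table defined at .LForward_Sbox below */
extern const uint8_t aes_forward_sbox[256];

/* SubBytes applied to two 16-byte AES states "in parallel" */
static void sub_bytes_2x_ref(uint8_t s0[16], uint8_t s1[16])
{
    for (int i = 0; i < 16; i++) {
        s0[i] = aes_forward_sbox[s0[i]];
        s1[i] = aes_forward_sbox[s1[i]];
    }
}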
.macro sub_bytes_4x, in0, in1, in2, in3
sub v8.16b, \in0\().16b, v15.16b
tbl \in0\().16b, {v16.16b-v19.16b}, \in0\().16b
@ -205,6 +209,25 @@
eor \in1\().16b, \in1\().16b, v11.16b
.endm
.macro do_block_2x, enc, in0, in1, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
mov \i, \rounds
1111: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
movi v15.16b, #0x40
tbl \in0\().16b, {\in0\().16b}, v13.16b /* ShiftRows */
tbl \in1\().16b, {\in1\().16b}, v13.16b /* ShiftRows */
sub_bytes_2x \in0, \in1
subs \i, \i, #1
ld1 {v15.4s}, [\rkp], #16
beq 2222f
mix_columns_2x \in0, \in1, \enc
b 1111b
2222: eor \in0\().16b, \in0\().16b, v15.16b /* ^round key */
eor \in1\().16b, \in1\().16b, v15.16b /* ^round key */
.endm
.macro do_block_4x, enc, in0, in1, in2, in3, rounds, rk, rkp, i
ld1 {v15.4s}, [\rk]
add \rkp, \rk, #16
@ -231,6 +254,14 @@
eor \in3\().16b, \in3\().16b, v15.16b /* ^round key */
.endm
.macro encrypt_block2x, in0, in1, rounds, rk, rkp, i
do_block_2x 1, \in0, \in1, \rounds, \rk, \rkp, \i
.endm
.macro decrypt_block2x, in0, in1, rounds, rk, rkp, i
do_block_2x 0, \in0, \in1, \rounds, \rk, \rkp, \i
.endm
.macro encrypt_block4x, in0, in1, in2, in3, rounds, rk, rkp, i
do_block_4x 1, \in0, \in1, \in2, \in3, \rounds, \rk, \rkp, \i
.endm
@ -241,8 +272,76 @@
#include "aes-modes.S"
.section ".rodata", "a"
.align 4
.text
.align 6
.LForward_Sbox:
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.LReverse_Sbox:
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.LForward_ShiftRows:
.octa 0x0b06010c07020d08030e09040f0a0500

View File

@ -383,7 +383,7 @@ ISRM0: .octa 0x0306090c00070a0d01040b0e0205080f
/*
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
SYM_FUNC_START(aesbs_convert_key)
ENTRY(aesbs_convert_key)
ld1 {v7.4s}, [x1], #16 // load round 0 key
ld1 {v17.4s}, [x1], #16 // load round 1 key
@ -428,10 +428,10 @@ SYM_FUNC_START(aesbs_convert_key)
eor v17.16b, v17.16b, v7.16b
str q17, [x0]
ret
SYM_FUNC_END(aesbs_convert_key)
ENDPROC(aesbs_convert_key)
.align 4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
aesbs_encrypt8:
ldr q9, [bskey], #16 // round 0 key
ldr q8, M0SR
ldr q24, SR
@ -491,10 +491,10 @@ SYM_FUNC_START_LOCAL(aesbs_encrypt8)
eor v2.16b, v2.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
SYM_FUNC_END(aesbs_encrypt8)
ENDPROC(aesbs_encrypt8)
.align 4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
aesbs_decrypt8:
lsl x9, rounds, #7
add bskey, bskey, x9
@ -556,7 +556,7 @@ SYM_FUNC_START_LOCAL(aesbs_decrypt8)
eor v3.16b, v3.16b, v12.16b
eor v5.16b, v5.16b, v12.16b
ret
SYM_FUNC_END(aesbs_decrypt8)
ENDPROC(aesbs_decrypt8)
/*
* aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
@ -565,122 +565,110 @@ SYM_FUNC_END(aesbs_decrypt8)
* int blocks)
*/
.macro __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
frame_push 5
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
stp x29, x30, [sp, #-16]!
mov x29, sp
99: mov x5, #1
lsl x5, x5, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x5, x5, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x5, x5, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
tbnz x5, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
tbnz x5, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
tbnz x5, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
tbnz x5, #4, 0f
ld1 {v4.16b}, [x20], #16
ld1 {v4.16b}, [x1], #16
tbnz x5, #5, 0f
ld1 {v5.16b}, [x20], #16
ld1 {v5.16b}, [x1], #16
tbnz x5, #6, 0f
ld1 {v6.16b}, [x20], #16
ld1 {v6.16b}, [x1], #16
tbnz x5, #7, 0f
ld1 {v7.16b}, [x20], #16
ld1 {v7.16b}, [x1], #16
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl \do8
st1 {\o0\().16b}, [x19], #16
st1 {\o0\().16b}, [x0], #16
tbnz x5, #1, 1f
st1 {\o1\().16b}, [x19], #16
st1 {\o1\().16b}, [x0], #16
tbnz x5, #2, 1f
st1 {\o2\().16b}, [x19], #16
st1 {\o2\().16b}, [x0], #16
tbnz x5, #3, 1f
st1 {\o3\().16b}, [x19], #16
st1 {\o3\().16b}, [x0], #16
tbnz x5, #4, 1f
st1 {\o4\().16b}, [x19], #16
st1 {\o4\().16b}, [x0], #16
tbnz x5, #5, 1f
st1 {\o5\().16b}, [x19], #16
st1 {\o5\().16b}, [x0], #16
tbnz x5, #6, 1f
st1 {\o6\().16b}, [x19], #16
st1 {\o6\().16b}, [x0], #16
tbnz x5, #7, 1f
st1 {\o7\().16b}, [x19], #16
st1 {\o7\().16b}, [x0], #16
cbz x23, 1f
b 99b
cbnz x4, 99b
1: frame_pop
1: ldp x29, x30, [sp], #16
ret
.endm
.align 4
SYM_FUNC_START(aesbs_ecb_encrypt)
ENTRY(aesbs_ecb_encrypt)
__ecb_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)
ENDPROC(aesbs_ecb_encrypt)
.align 4
SYM_FUNC_START(aesbs_ecb_decrypt)
ENTRY(aesbs_ecb_decrypt)
__ecb_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)
ENDPROC(aesbs_ecb_decrypt)
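Functionally, __ecb_crypt only feeds independent blocks through the 8-way bit-sliced core; without the interleaving, ECB reduces to the loop below. aes_encrypt_block() is an assumed stand-in for one invocation of the cipher, not a kernel function.

#include <stdint.h>
#include <stddef.h>

#define AES_BLOCK_SIZE 16

/* assumed helper: one AES block encryption with an expanded key */
void aes_encrypt_block(uint8_t out[AES_BLOCK_SIZE],
                       const uint8_t in[AES_BLOCK_SIZE],
                       const uint8_t *rk, int rounds);

static void ecb_encrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const uint8_t *rk, int rounds)
{
    /* every block is processed independently: no IV, no chaining */
    while (blocks--) {
        aes_encrypt_block(out, in, rk, rounds);
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }
}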
/*
* aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
.align 4
SYM_FUNC_START(aesbs_cbc_decrypt)
frame_push 6
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
ENTRY(aesbs_cbc_decrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
99: mov x6, #1
lsl x6, x6, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x6, x6, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x6, x6, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
mov v25.16b, v0.16b
tbnz x6, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
mov v26.16b, v1.16b
tbnz x6, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
mov v27.16b, v2.16b
tbnz x6, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
mov v28.16b, v3.16b
tbnz x6, #4, 0f
ld1 {v4.16b}, [x20], #16
ld1 {v4.16b}, [x1], #16
mov v29.16b, v4.16b
tbnz x6, #5, 0f
ld1 {v5.16b}, [x20], #16
ld1 {v5.16b}, [x1], #16
mov v30.16b, v5.16b
tbnz x6, #6, 0f
ld1 {v6.16b}, [x20], #16
ld1 {v6.16b}, [x1], #16
mov v31.16b, v6.16b
tbnz x6, #7, 0f
ld1 {v7.16b}, [x20]
ld1 {v7.16b}, [x1]
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl aesbs_decrypt8
ld1 {v24.16b}, [x24] // load IV
ld1 {v24.16b}, [x5] // load IV
eor v1.16b, v1.16b, v25.16b
eor v6.16b, v6.16b, v26.16b
@ -691,37 +679,36 @@ SYM_FUNC_START(aesbs_cbc_decrypt)
eor v3.16b, v3.16b, v30.16b
eor v5.16b, v5.16b, v31.16b
st1 {v0.16b}, [x19], #16
st1 {v0.16b}, [x0], #16
mov v24.16b, v25.16b
tbnz x6, #1, 1f
st1 {v1.16b}, [x19], #16
st1 {v1.16b}, [x0], #16
mov v24.16b, v26.16b
tbnz x6, #2, 1f
st1 {v6.16b}, [x19], #16
st1 {v6.16b}, [x0], #16
mov v24.16b, v27.16b
tbnz x6, #3, 1f
st1 {v4.16b}, [x19], #16
st1 {v4.16b}, [x0], #16
mov v24.16b, v28.16b
tbnz x6, #4, 1f
st1 {v2.16b}, [x19], #16
st1 {v2.16b}, [x0], #16
mov v24.16b, v29.16b
tbnz x6, #5, 1f
st1 {v7.16b}, [x19], #16
st1 {v7.16b}, [x0], #16
mov v24.16b, v30.16b
tbnz x6, #6, 1f
st1 {v3.16b}, [x19], #16
st1 {v3.16b}, [x0], #16
mov v24.16b, v31.16b
tbnz x6, #7, 1f
ld1 {v24.16b}, [x20], #16
st1 {v5.16b}, [x19], #16
1: st1 {v24.16b}, [x24] // store IV
ld1 {v24.16b}, [x1], #16
st1 {v5.16b}, [x0], #16
1: st1 {v24.16b}, [x5] // store IV
cbz x23, 2f
b 99b
cbnz x4, 99b
2: frame_pop
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(aesbs_cbc_decrypt)
ENDPROC(aesbs_cbc_decrypt)
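The interleaved loads, stores and the IV shuffling through v24-v31 above all serve one simple recurrence: CBC decryption computes P_i = D_K(C_i) ^ C_{i-1} with C_0 = IV, and the last ciphertext block becomes the next IV (the final "store IV"). A standalone sketch, with aes_decrypt_block() as an assumed helper:

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

/* assumed helper: one AES block decryption with an expanded key */
void aes_decrypt_block(uint8_t out[AES_BLOCK_SIZE],
                       const uint8_t in[AES_BLOCK_SIZE],
                       const uint8_t *rk, int rounds);

static void cbc_decrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const uint8_t *rk, int rounds,
                            uint8_t iv[AES_BLOCK_SIZE])
{
    uint8_t prev[AES_BLOCK_SIZE], cur[AES_BLOCK_SIZE];

    memcpy(prev, iv, AES_BLOCK_SIZE);

    while (blocks--) {
        /* keep the ciphertext around: it is the next chaining value */
        memcpy(cur, in, AES_BLOCK_SIZE);

        aes_decrypt_block(out, in, rk, rounds);
        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            out[i] ^= prev[i];              /* P_i = D_K(C_i) ^ C_{i-1} */

        memcpy(prev, cur, AES_BLOCK_SIZE);
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }

    memcpy(iv, prev, AES_BLOCK_SIZE);       /* updated IV, as the asm stores */
}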
.macro next_tweak, out, in, const, tmp
sshr \tmp\().2d, \in\().2d, #63
@ -731,103 +718,100 @@ SYM_FUNC_END(aesbs_cbc_decrypt)
eor \out\().16b, \out\().16b, \tmp\().16b
.endm
.align 4
.Lxts_mul_x:
CPU_LE( .quad 1, 0x87 )
CPU_BE( .quad 0x87, 1 )
/*
* aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
* aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
* int blocks, u8 iv[])
*/
SYM_FUNC_START_LOCAL(__xts_crypt8)
__xts_crypt8:
mov x6, #1
lsl x6, x6, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x6, x6, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x6, x6, xzr, mi
ld1 {v0.16b}, [x20], #16
ld1 {v0.16b}, [x1], #16
next_tweak v26, v25, v30, v31
eor v0.16b, v0.16b, v25.16b
tbnz x6, #1, 0f
ld1 {v1.16b}, [x20], #16
ld1 {v1.16b}, [x1], #16
next_tweak v27, v26, v30, v31
eor v1.16b, v1.16b, v26.16b
tbnz x6, #2, 0f
ld1 {v2.16b}, [x20], #16
ld1 {v2.16b}, [x1], #16
next_tweak v28, v27, v30, v31
eor v2.16b, v2.16b, v27.16b
tbnz x6, #3, 0f
ld1 {v3.16b}, [x20], #16
ld1 {v3.16b}, [x1], #16
next_tweak v29, v28, v30, v31
eor v3.16b, v3.16b, v28.16b
tbnz x6, #4, 0f
ld1 {v4.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset]
ld1 {v4.16b}, [x1], #16
str q29, [sp, #16]
eor v4.16b, v4.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #5, 0f
ld1 {v5.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 16]
ld1 {v5.16b}, [x1], #16
str q29, [sp, #32]
eor v5.16b, v5.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #6, 0f
ld1 {v6.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 32]
ld1 {v6.16b}, [x1], #16
str q29, [sp, #48]
eor v6.16b, v6.16b, v29.16b
next_tweak v29, v29, v30, v31
tbnz x6, #7, 0f
ld1 {v7.16b}, [x20], #16
str q29, [sp, #.Lframe_local_offset + 48]
ld1 {v7.16b}, [x1], #16
str q29, [sp, #64]
eor v7.16b, v7.16b, v29.16b
next_tweak v29, v29, v30, v31
0: mov bskey, x21
mov rounds, x22
br x16
SYM_FUNC_END(__xts_crypt8)
0: mov bskey, x2
mov rounds, x3
br x7
ENDPROC(__xts_crypt8)
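The next_tweak macro used throughout __xts_crypt8 multiplies the 128-bit tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, which is where the 0x87 in the .Lxts_mul_x constant comes from. In portable C, on a little-endian view of the tweak, that is:

#include <stdint.h>

/* tweak held as two little-endian 64-bit halves: t[0] = low, t[1] = high */
static void xts_next_tweak_ref(uint64_t t[2])
{
    /* if the top bit shifts out, fold the reduction constant 0x87 back in */
    uint64_t carry = t[1] >> 63;

    t[1] = (t[1] << 1) | (t[0] >> 63);
    t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
}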
.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
frame_push 6, 64
stp x29, x30, [sp, #-80]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
ldr q30, .Lxts_mul_x
ld1 {v25.16b}, [x5]
movi v30.2s, #0x1
movi v25.2s, #0x87
uzp1 v30.4s, v30.4s, v25.4s
ld1 {v25.16b}, [x24]
99: adr x16, \do8
99: adr x7, \do8
bl __xts_crypt8
ldp q16, q17, [sp, #.Lframe_local_offset]
ldp q18, q19, [sp, #.Lframe_local_offset + 32]
ldp q16, q17, [sp, #16]
ldp q18, q19, [sp, #48]
eor \o0\().16b, \o0\().16b, v25.16b
eor \o1\().16b, \o1\().16b, v26.16b
eor \o2\().16b, \o2\().16b, v27.16b
eor \o3\().16b, \o3\().16b, v28.16b
st1 {\o0\().16b}, [x19], #16
st1 {\o0\().16b}, [x0], #16
mov v25.16b, v26.16b
tbnz x6, #1, 1f
st1 {\o1\().16b}, [x19], #16
st1 {\o1\().16b}, [x0], #16
mov v25.16b, v27.16b
tbnz x6, #2, 1f
st1 {\o2\().16b}, [x19], #16
st1 {\o2\().16b}, [x0], #16
mov v25.16b, v28.16b
tbnz x6, #3, 1f
st1 {\o3\().16b}, [x19], #16
st1 {\o3\().16b}, [x0], #16
mov v25.16b, v29.16b
tbnz x6, #4, 1f
@ -836,31 +820,28 @@ SYM_FUNC_END(__xts_crypt8)
eor \o6\().16b, \o6\().16b, v18.16b
eor \o7\().16b, \o7\().16b, v19.16b
st1 {\o4\().16b}, [x19], #16
st1 {\o4\().16b}, [x0], #16
tbnz x6, #5, 1f
st1 {\o5\().16b}, [x19], #16
st1 {\o5\().16b}, [x0], #16
tbnz x6, #6, 1f
st1 {\o6\().16b}, [x19], #16
st1 {\o6\().16b}, [x0], #16
tbnz x6, #7, 1f
st1 {\o7\().16b}, [x19], #16
st1 {\o7\().16b}, [x0], #16
cbz x23, 1f
st1 {v25.16b}, [x24]
cbnz x4, 99b
b 99b
1: st1 {v25.16b}, [x24]
frame_pop
1: st1 {v25.16b}, [x5]
ldp x29, x30, [sp], #80
ret
.endm
SYM_FUNC_START(aesbs_xts_encrypt)
ENTRY(aesbs_xts_encrypt)
__xts_crypt aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)
ENDPROC(aesbs_xts_encrypt)
SYM_FUNC_START(aesbs_xts_decrypt)
ENTRY(aesbs_xts_decrypt)
__xts_crypt aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)
ENDPROC(aesbs_xts_decrypt)
.macro next_ctr, v
mov \v\().d[1], x8
@ -874,32 +855,25 @@ SYM_FUNC_END(aesbs_xts_decrypt)
* aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
* int rounds, int blocks, u8 iv[], u8 final[])
*/
SYM_FUNC_START(aesbs_ctr_encrypt)
frame_push 8
ENTRY(aesbs_ctr_encrypt)
stp x29, x30, [sp, #-16]!
mov x29, sp
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
mov x24, x5
mov x25, x6
cmp x6, #0
cset x10, ne
add x4, x4, x10 // do one extra block if final
cmp x25, #0
cset x26, ne
add x23, x23, x26 // do one extra block if final
ldp x7, x8, [x24]
ld1 {v0.16b}, [x24]
ldp x7, x8, [x5]
ld1 {v0.16b}, [x5]
CPU_LE( rev x7, x7 )
CPU_LE( rev x8, x8 )
adds x8, x8, #1
adc x7, x7, xzr
99: mov x9, #1
lsl x9, x9, x23
subs w23, w23, #8
csel x23, x23, xzr, pl
lsl x9, x9, x4
subs w4, w4, #8
csel x4, x4, xzr, pl
csel x9, x9, xzr, le
tbnz x9, #1, 0f
@ -917,88 +891,85 @@ CPU_LE( rev x8, x8 )
tbnz x9, #7, 0f
next_ctr v7
0: mov bskey, x21
mov rounds, x22
0: mov bskey, x2
mov rounds, x3
bl aesbs_encrypt8
lsr x9, x9, x26 // disregard the extra block
lsr x9, x9, x10 // disregard the extra block
tbnz x9, #0, 0f
ld1 {v8.16b}, [x20], #16
ld1 {v8.16b}, [x1], #16
eor v0.16b, v0.16b, v8.16b
st1 {v0.16b}, [x19], #16
st1 {v0.16b}, [x0], #16
tbnz x9, #1, 1f
ld1 {v9.16b}, [x20], #16
ld1 {v9.16b}, [x1], #16
eor v1.16b, v1.16b, v9.16b
st1 {v1.16b}, [x19], #16
st1 {v1.16b}, [x0], #16
tbnz x9, #2, 2f
ld1 {v10.16b}, [x20], #16
ld1 {v10.16b}, [x1], #16
eor v4.16b, v4.16b, v10.16b
st1 {v4.16b}, [x19], #16
st1 {v4.16b}, [x0], #16
tbnz x9, #3, 3f
ld1 {v11.16b}, [x20], #16
ld1 {v11.16b}, [x1], #16
eor v6.16b, v6.16b, v11.16b
st1 {v6.16b}, [x19], #16
st1 {v6.16b}, [x0], #16
tbnz x9, #4, 4f
ld1 {v12.16b}, [x20], #16
ld1 {v12.16b}, [x1], #16
eor v3.16b, v3.16b, v12.16b
st1 {v3.16b}, [x19], #16
st1 {v3.16b}, [x0], #16
tbnz x9, #5, 5f
ld1 {v13.16b}, [x20], #16
ld1 {v13.16b}, [x1], #16
eor v7.16b, v7.16b, v13.16b
st1 {v7.16b}, [x19], #16
st1 {v7.16b}, [x0], #16
tbnz x9, #6, 6f
ld1 {v14.16b}, [x20], #16
ld1 {v14.16b}, [x1], #16
eor v2.16b, v2.16b, v14.16b
st1 {v2.16b}, [x19], #16
st1 {v2.16b}, [x0], #16
tbnz x9, #7, 7f
ld1 {v15.16b}, [x20], #16
ld1 {v15.16b}, [x1], #16
eor v5.16b, v5.16b, v15.16b
st1 {v5.16b}, [x19], #16
st1 {v5.16b}, [x0], #16
8: next_ctr v0
st1 {v0.16b}, [x24]
cbz x23, .Lctr_done
cbnz x4, 99b
b 99b
.Lctr_done:
frame_pop
st1 {v0.16b}, [x5]
ldp x29, x30, [sp], #16
ret
/*
* If we are handling the tail of the input (x6 != NULL), return the
* final keystream block back to the caller.
*/
0: cbz x25, 8b
st1 {v0.16b}, [x25]
0: cbz x6, 8b
st1 {v0.16b}, [x6]
b 8b
1: cbz x25, 8b
st1 {v1.16b}, [x25]
1: cbz x6, 8b
st1 {v1.16b}, [x6]
b 8b
2: cbz x25, 8b
st1 {v4.16b}, [x25]
2: cbz x6, 8b
st1 {v4.16b}, [x6]
b 8b
3: cbz x25, 8b
st1 {v6.16b}, [x25]
3: cbz x6, 8b
st1 {v6.16b}, [x6]
b 8b
4: cbz x25, 8b
st1 {v3.16b}, [x25]
4: cbz x6, 8b
st1 {v3.16b}, [x6]
b 8b
5: cbz x25, 8b
st1 {v7.16b}, [x25]
5: cbz x6, 8b
st1 {v7.16b}, [x6]
b 8b
6: cbz x25, 8b
st1 {v2.16b}, [x25]
6: cbz x6, 8b
st1 {v2.16b}, [x6]
b 8b
7: cbz x25, 8b
st1 {v5.16b}, [x25]
7: cbz x6, 8b
st1 {v5.16b}, [x6]
b 8b
SYM_FUNC_END(aesbs_ctr_encrypt)
ENDPROC(aesbs_ctr_encrypt)
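The counter handling at the top of aesbs_ctr_encrypt (the ldp/rev/adds/adc sequence) is a 128-bit big-endian increment of the IV block. A byte-wise sketch of the same operation:

#include <stdint.h>

/* increment a 128-bit big-endian CTR block, as the ldp/rev/adds/adc
 * sequence at the top of aesbs_ctr_encrypt does on two 64-bit halves */
static void ctr128_increment(uint8_t ctr[16])
{
    for (int i = 15; i >= 0; i--) {
        if (++ctr[i] != 0)      /* stop once a byte did not wrap */
            break;
    }
}

Each keystream block E_K(ctr) is then XORed into the corresponding input block, with any partial final block routed through the separate `final` buffer seen in the tail-handling labels above.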

View File

@ -11,13 +11,13 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/aes.h>
#include <crypto/ctr.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <crypto/xts.h>
#include <linux/module.h>
#include "aes-ctr-fallback.h"
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
@ -49,12 +49,6 @@ asmlinkage void neon_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks);
asmlinkage void neon_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[],
int rounds, int blocks, u8 iv[]);
asmlinkage void neon_aes_xts_encrypt(u8 out[], u8 const in[],
u32 const rk1[], int rounds, int bytes,
u32 const rk2[], u8 iv[], int first);
asmlinkage void neon_aes_xts_decrypt(u8 out[], u8 const in[],
u32 const rk1[], int rounds, int bytes,
u32 const rk2[], u8 iv[], int first);
struct aesbs_ctx {
u8 rk[13 * (8 * AES_BLOCK_SIZE) + 32];
@ -74,7 +68,6 @@ struct aesbs_ctr_ctx {
struct aesbs_xts_ctx {
struct aesbs_ctx key;
u32 twkey[AES_MAX_KEYLENGTH_U32];
struct crypto_aes_ctx cts;
};
static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
@ -84,7 +77,7 @@ static int aesbs_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
struct crypto_aes_ctx rk;
int err;
err = aes_expandkey(&rk, in_key, key_len);
err = crypto_aes_expand_key(&rk, in_key, key_len);
if (err)
return err;
@ -106,8 +99,9 @@ static int __ecb_crypt(struct skcipher_request *req,
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@ -115,13 +109,12 @@ static int __ecb_crypt(struct skcipher_request *req,
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
kernel_neon_begin();
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->rk,
ctx->rounds, blocks);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
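Every entry point in this glue file follows the same skcipher_walk shape; the sketch below shows that shape in one place, since the diff interleaves the old and new placement of kernel_neon_begin()/kernel_neon_end(). It relies on the includes and asmlinkage declarations already present in this file; the concrete call into aesbs_ecb_encrypt() is only illustrative, and the exact atomic-walk flag and NEON bracketing differ between the two versions shown above.

static int walk_pattern(struct skcipher_request *req)
{
    struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
    struct aesbs_ctx *ctx = crypto_skcipher_ctx(tfm);
    struct skcipher_walk walk;
    int err;

    err = skcipher_walk_virt(&walk, req, true);

    kernel_neon_begin();
    while (walk.nbytes >= AES_BLOCK_SIZE) {
        unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;

        /* process whole walk.stride chunks except on the final step */
        if (walk.nbytes < walk.total)
            blocks = round_down(blocks, walk.stride / AES_BLOCK_SIZE);

        /* hand 'blocks' full blocks to the NEON core */
        aesbs_ecb_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
                          ctx->rk, ctx->rounds, blocks);

        err = skcipher_walk_done(&walk,
                                 walk.nbytes - blocks * AES_BLOCK_SIZE);
    }
    kernel_neon_end();

    return err;
}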
@ -143,7 +136,7 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
struct crypto_aes_ctx rk;
int err;
err = aes_expandkey(&rk, in_key, key_len);
err = crypto_aes_expand_key(&rk, in_key, key_len);
if (err)
return err;
@ -154,7 +147,6 @@ static int aesbs_cbc_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
kernel_neon_begin();
aesbs_convert_key(ctx->key.rk, rk.key_enc, ctx->key.rounds);
kernel_neon_end();
memzero_explicit(&rk, sizeof(rk));
return 0;
}
@ -166,19 +158,19 @@ static int cbc_encrypt(struct skcipher_request *req)
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
/* fall back to the non-bitsliced NEON implementation */
kernel_neon_begin();
neon_aes_cbc_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->enc, ctx->key.rounds, blocks,
walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -189,8 +181,9 @@ static int cbc_decrypt(struct skcipher_request *req)
struct skcipher_walk walk;
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
@ -198,14 +191,13 @@ static int cbc_decrypt(struct skcipher_request *req)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
kernel_neon_begin();
aesbs_cbc_decrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
kernel_neon_end();
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -216,7 +208,7 @@ static int aesbs_ctr_setkey_sync(struct crypto_skcipher *tfm, const u8 *in_key,
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = aes_expandkey(&ctx->fallback, in_key, key_len);
err = crypto_aes_expand_key(&ctx->fallback, in_key, key_len);
if (err)
return err;
@ -237,8 +229,9 @@ static int ctr_encrypt(struct skcipher_request *req)
u8 buf[AES_BLOCK_SIZE];
int err;
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *final = (walk.total % AES_BLOCK_SIZE) ? buf : NULL;
@ -249,10 +242,8 @@ static int ctr_encrypt(struct skcipher_request *req)
final = NULL;
}
kernel_neon_begin();
aesbs_ctr_encrypt(walk.dst.virt.addr, walk.src.virt.addr,
ctx->rk, ctx->rounds, blocks, walk.iv, final);
kernel_neon_end();
if (final) {
u8 *dst = walk.dst.virt.addr + blocks * AES_BLOCK_SIZE;
@ -267,6 +258,8 @@ static int ctr_encrypt(struct skcipher_request *req)
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
kernel_neon_end();
return err;
}
@ -282,11 +275,7 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return err;
key_len /= 2;
err = aes_expandkey(&ctx->cts, in_key, key_len);
if (err)
return err;
err = aes_expandkey(&rk, in_key + key_len, key_len);
err = crypto_aes_expand_key(&rk, in_key + key_len, key_len);
if (err)
return err;
@ -295,142 +284,60 @@ static int aesbs_xts_setkey(struct crypto_skcipher *tfm, const u8 *in_key,
return aesbs_setkey(tfm, in_key, key_len);
}
static void ctr_encrypt_one(struct crypto_skcipher *tfm, const u8 *src, u8 *dst)
{
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
unsigned long flags;
/*
* Temporarily disable interrupts to avoid races where
* cachelines are evicted when the CPU is interrupted
* to do something else.
*/
local_irq_save(flags);
aes_encrypt(&ctx->fallback, dst, src);
local_irq_restore(flags);
}
static int ctr_encrypt_sync(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm);
if (!may_use_simd())
return crypto_ctr_encrypt_walk(req, ctr_encrypt_one);
return aes_ctr_encrypt_fallback(&ctx->fallback, req);
return ctr_encrypt(req);
}
static int __xts_crypt(struct skcipher_request *req, bool encrypt,
static int __xts_crypt(struct skcipher_request *req,
void (*fn)(u8 out[], u8 const in[], u8 const rk[],
int rounds, int blocks, u8 iv[]))
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesbs_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int tail = req->cryptlen % (8 * AES_BLOCK_SIZE);
struct scatterlist sg_src[2], sg_dst[2];
struct skcipher_request subreq;
struct scatterlist *src, *dst;
struct skcipher_walk walk;
int nbytes, err;
int first = 1;
u8 *out, *in;
int err;
if (req->cryptlen < AES_BLOCK_SIZE)
return -EINVAL;
/* ensure that the cts tail is covered by a single step */
if (unlikely(tail > 0 && tail < AES_BLOCK_SIZE)) {
int xts_blocks = DIV_ROUND_UP(req->cryptlen,
AES_BLOCK_SIZE) - 2;
skcipher_request_set_tfm(&subreq, tfm);
skcipher_request_set_callback(&subreq,
skcipher_request_flags(req),
NULL, NULL);
skcipher_request_set_crypt(&subreq, req->src, req->dst,
xts_blocks * AES_BLOCK_SIZE,
req->iv);
req = &subreq;
} else {
tail = 0;
}
err = skcipher_walk_virt(&walk, req, false);
err = skcipher_walk_virt(&walk, req, true);
if (err)
return err;
kernel_neon_begin();
neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey,
ctx->key.rounds, 1);
while (walk.nbytes >= AES_BLOCK_SIZE) {
unsigned int blocks = walk.nbytes / AES_BLOCK_SIZE;
if (walk.nbytes < walk.total || walk.nbytes % AES_BLOCK_SIZE)
if (walk.nbytes < walk.total)
blocks = round_down(blocks,
walk.stride / AES_BLOCK_SIZE);
out = walk.dst.virt.addr;
in = walk.src.virt.addr;
nbytes = walk.nbytes;
kernel_neon_begin();
if (likely(blocks > 6)) { /* plain NEON is faster otherwise */
if (first)
neon_aes_ecb_encrypt(walk.iv, walk.iv,
ctx->twkey,
ctx->key.rounds, 1);
first = 0;
fn(out, in, ctx->key.rk, ctx->key.rounds, blocks,
walk.iv);
out += blocks * AES_BLOCK_SIZE;
in += blocks * AES_BLOCK_SIZE;
nbytes -= blocks * AES_BLOCK_SIZE;
}
if (walk.nbytes == walk.total && nbytes > 0)
goto xts_tail;
kernel_neon_end();
err = skcipher_walk_done(&walk, nbytes);
fn(walk.dst.virt.addr, walk.src.virt.addr, ctx->key.rk,
ctx->key.rounds, blocks, walk.iv);
err = skcipher_walk_done(&walk,
walk.nbytes - blocks * AES_BLOCK_SIZE);
}
if (err || likely(!tail))
return err;
/* handle ciphertext stealing */
dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
if (req->dst != req->src)
dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
req->iv);
err = skcipher_walk_virt(&walk, req, false);
if (err)
return err;
out = walk.dst.virt.addr;
in = walk.src.virt.addr;
nbytes = walk.nbytes;
kernel_neon_begin();
xts_tail:
if (encrypt)
neon_aes_xts_encrypt(out, in, ctx->cts.key_enc, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2);
else
neon_aes_xts_decrypt(out, in, ctx->cts.key_dec, ctx->key.rounds,
nbytes, ctx->twkey, walk.iv, first ?: 2);
kernel_neon_end();
return skcipher_walk_done(&walk, 0);
return err;
}
static int xts_encrypt(struct skcipher_request *req)
{
return __xts_crypt(req, true, aesbs_xts_encrypt);
return __xts_crypt(req, aesbs_xts_encrypt);
}
static int xts_decrypt(struct skcipher_request *req)
{
return __xts_crypt(req, false, aesbs_xts_decrypt);
return __xts_crypt(req, aesbs_xts_decrypt);
}
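For context, XTS uses two keys: the second half of the key material is expanded into twkey and used once to turn the IV into the initial tweak (the neon_aes_ecb_encrypt call on walk.iv above), after which every block is whitened with the tweak before and after the core cipher and the tweak is multiplied by x. A standalone sketch of the full-block case follows (the post-revert code does not carry the ciphertext-stealing tail handling that is being removed); encrypt_block(), encrypt_tweak() and gf128_mul_x() are assumed helpers, the last one being the doubling shown earlier for next_tweak.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define AES_BLOCK_SIZE 16

void encrypt_block(uint8_t blk[16], const void *k1);       /* data key  */
void encrypt_tweak(uint8_t blk[16], const void *k2);       /* tweak key */
void gf128_mul_x(uint8_t t[16]);                           /* T <- T * x */

static void xts_encrypt_ref(uint8_t *out, const uint8_t *in, size_t blocks,
                            const void *k1, const void *k2,
                            const uint8_t iv[16])
{
    uint8_t t[AES_BLOCK_SIZE];

    memcpy(t, iv, sizeof(t));
    encrypt_tweak(t, k2);                   /* T = E_K2(IV) */

    while (blocks--) {
        uint8_t b[AES_BLOCK_SIZE];

        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            b[i] = in[i] ^ t[i];            /* pre-whiten  */
        encrypt_block(b, k1);
        for (int i = 0; i < AES_BLOCK_SIZE; i++)
            out[i] = b[i] ^ t[i];           /* post-whiten */

        gf128_mul_x(t);                     /* advance the tweak */
        in += AES_BLOCK_SIZE;
        out += AES_BLOCK_SIZE;
    }
}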
static struct skcipher_alg aes_algs[] = { {
@ -537,7 +444,7 @@ static int __init aes_init(void)
int err;
int i;
if (!cpu_have_named_feature(ASIMD))
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs));

View File

@ -1,199 +0,0 @@
/*
* ARM NEON accelerated ChaCha and XChaCha stream ciphers,
* including ChaCha20 (RFC7539)
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha_block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(u32 *state, u8 *dst, const u8 *src,
int nrounds, int bytes);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
static void chacha_doneon(u32 *state, u8 *dst, const u8 *src,
int bytes, int nrounds)
{
while (bytes > 0) {
int l = min(bytes, CHACHA_BLOCK_SIZE * 5);
if (l <= CHACHA_BLOCK_SIZE) {
u8 buf[CHACHA_BLOCK_SIZE];
memcpy(buf, src, l);
chacha_block_xor_neon(state, buf, buf, nrounds);
memcpy(dst, buf, l);
state[12] += 1;
break;
}
chacha_4block_xor_neon(state, dst, src, nrounds, l);
bytes -= CHACHA_BLOCK_SIZE * 5;
src += CHACHA_BLOCK_SIZE * 5;
dst += CHACHA_BLOCK_SIZE * 5;
state[12] += 5;
}
}
static int chacha_neon_stream_xor(struct skcipher_request *req,
const struct chacha_ctx *ctx, const u8 *iv)
{
struct skcipher_walk walk;
u32 state[16];
int err;
err = skcipher_walk_virt(&walk, req, false);
crypto_chacha_init(state, ctx, iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = rounddown(nbytes, walk.stride);
kernel_neon_begin();
chacha_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes, ctx->nrounds);
kernel_neon_end();
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
return err;
}
static int chacha_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_chacha_crypt(req);
return chacha_neon_stream_xor(req, ctx, req->iv);
}
static int xchacha_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct chacha_ctx subctx;
u32 state[16];
u8 real_iv[16];
if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd())
return crypto_xchacha_crypt(req);
crypto_chacha_init(state, ctx, req->iv);
kernel_neon_begin();
hchacha_block_neon(state, subctx.key, ctx->nrounds);
kernel_neon_end();
subctx.nrounds = ctx->nrounds;
memcpy(&real_iv[0], req->iv + 24, 8);
memcpy(&real_iv[8], req->iv + 16, 8);
return chacha_neon_stream_xor(req, &subctx, real_iv);
}
static struct skcipher_alg algs[] = {
{
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha_neon,
.decrypt = chacha_neon,
}, {
.base.cra_name = "xchacha20",
.base.cra_driver_name = "xchacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}, {
.base.cra_name = "xchacha12",
.base.cra_driver_name = "xchacha12-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = XCHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 5 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha12_setkey,
.encrypt = xchacha_neon,
.decrypt = xchacha_neon,
}
};
static int __init chacha_simd_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
}
static void __exit chacha_simd_mod_fini(void)
{
crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
}
module_init(chacha_simd_mod_init);
module_exit(chacha_simd_mod_fini);
MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (NEON accelerated)");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");
MODULE_ALIAS_CRYPTO("chacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha20");
MODULE_ALIAS_CRYPTO("xchacha20-neon");
MODULE_ALIAS_CRYPTO("xchacha12");
MODULE_ALIAS_CRYPTO("xchacha12-neon");

View File

@ -1,13 +1,13 @@
/*
* ChaCha/XChaCha NEON helper functions
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Originally based on:
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
@ -19,27 +19,29 @@
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
.text
.align 6
/*
* chacha_permute - permute one block
*
* Permute one 64-byte block where the state matrix is stored in the four NEON
* registers v0-v3. It performs matrix operations on four words in parallel,
* but requires shuffling to rearrange the words after each round.
*
* The round count is given in w3.
*
* Clobbers: w3, x10, v4, v12
*/
SYM_FUNC_START_LOCAL(chacha_permute)
ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
adr_l x10, ROT8
ld1 {v12.4s}, [x10]
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// x0..3 = s0..3
adr x3, ROT8
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
ld1 {v12.4s}, [x3]
mov x3, #10
.Ldoubleround:
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
@ -100,27 +102,9 @@ SYM_FUNC_START_LOCAL(chacha_permute)
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
subs w3, w3, #2
subs x3, x3, #1
b.ne .Ldoubleround
ret
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
// w3: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0]
bl chacha_permute
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
@ -141,155 +125,71 @@ SYM_FUNC_START(chacha_block_xor_neon)
st1 {v0.16b-v3.16b}, [x1]
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(chacha_block_xor_neon)
SYM_FUNC_START(hchacha_block_neon)
// x0: Input state matrix, s
// x1: output (8 32-bit words)
// w2: nrounds
stp x29, x30, [sp, #-16]!
mov x29, sp
ld1 {v0.4s-v3.4s}, [x0]
mov w3, w2
bl chacha_permute
st1 {v0.4s}, [x1], #16
st1 {v3.4s}, [x1]
ldp x29, x30, [sp], #16
ret
SYM_FUNC_END(hchacha_block_neon)
a0 .req w12
a1 .req w13
a2 .req w14
a3 .req w15
a4 .req w16
a5 .req w17
a6 .req w19
a7 .req w20
a8 .req w21
a9 .req w22
a10 .req w23
a11 .req w24
a12 .req w25
a13 .req w26
a14 .req w27
a15 .req w28
ENDPROC(chacha20_block_xor_neon)
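The double round described in the comments above is four quarter-rounds applied to the columns of the 4x4 state, then four applied to its diagonals; the ext-based word shuffles only rotate the rows so the column code can be reused for the diagonals. The scalar reference for one quarter-round, with the 16/12/8/7 rotations visible in the vector code, is:

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* one ChaCha quarter-round on four words of the state */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}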
.align 6
SYM_FUNC_START(chacha_4block_xor_neon)
frame_push 10
ENTRY(chacha20_4block_xor_neon)
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
// w3: nrounds
// x4: byte count
adr_l x10, .Lpermute
and x5, x4, #63
add x10, x10, x5
//
// This function encrypts four consecutive ChaCha blocks by loading
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
// At the same time, a fifth block is encrypted in parallel using
// scalar registers
//
adr_l x9, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x9]
adr x3, CTRINC // ... and ROT8
ld1 {v30.4s-v31.4s}, [x3]
// x0..15[0-3] = s0..3[0..3]
add x8, x0, #16
ld4r { v0.4s- v3.4s}, [x0]
ld4r { v4.4s- v7.4s}, [x8], #16
ld4r { v8.4s-v11.4s}, [x8], #16
ld4r {v12.4s-v15.4s}, [x8]
mov x4, x0
ld4r { v0.4s- v3.4s}, [x4], #16
ld4r { v4.4s- v7.4s}, [x4], #16
ld4r { v8.4s-v11.4s}, [x4], #16
ld4r {v12.4s-v15.4s}, [x4]
mov a0, v0.s[0]
mov a1, v1.s[0]
mov a2, v2.s[0]
mov a3, v3.s[0]
mov a4, v4.s[0]
mov a5, v5.s[0]
mov a6, v6.s[0]
mov a7, v7.s[0]
mov a8, v8.s[0]
mov a9, v9.s[0]
mov a10, v10.s[0]
mov a11, v11.s[0]
mov a12, v12.s[0]
mov a13, v13.s[0]
mov a14, v14.s[0]
mov a15, v15.s[0]
// x12 += counter values 1-4
// x12 += counter values 0-3
add v12.4s, v12.4s, v30.4s
mov x3, #10
.Ldoubleround4:
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
rev32 v15.8h, v15.8h
ror a15, a15, #16
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #12
shl v5.4s, v17.4s, #12
@ -297,66 +197,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v7.4s, v19.4s, #12
sri v4.4s, v16.4s, #20
ror a4, a4, #20
sri v5.4s, v17.4s, #20
ror a5, a5, #20
sri v6.4s, v18.4s, #20
ror a6, a6, #20
sri v7.4s, v19.4s, #20
ror a7, a7, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
add a0, a0, a4
add v1.4s, v1.4s, v5.4s
add a1, a1, a5
add v2.4s, v2.4s, v6.4s
add a2, a2, a6
add v3.4s, v3.4s, v7.4s
add a3, a3, a7
eor v12.16b, v12.16b, v0.16b
eor a12, a12, a0
eor v13.16b, v13.16b, v1.16b
eor a13, a13, a1
eor v14.16b, v14.16b, v2.16b
eor a14, a14, a2
eor v15.16b, v15.16b, v3.16b
eor a15, a15, a3
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
add a8, a8, a12
add v9.4s, v9.4s, v13.4s
add a9, a9, a13
add v10.4s, v10.4s, v14.4s
add a10, a10, a14
add v11.4s, v11.4s, v15.4s
add a11, a11, a15
eor v16.16b, v4.16b, v8.16b
eor a4, a4, a8
eor v17.16b, v5.16b, v9.16b
eor a5, a5, a9
eor v18.16b, v6.16b, v10.16b
eor a6, a6, a10
eor v19.16b, v7.16b, v11.16b
eor a7, a7, a11
shl v4.4s, v16.4s, #7
shl v5.4s, v17.4s, #7
@ -364,66 +240,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v7.4s, v19.4s, #7
sri v4.4s, v16.4s, #25
ror a4, a4, #25
sri v5.4s, v17.4s, #25
ror a5, a5, #25
sri v6.4s, v18.4s, #25
ror a6, a6, #25
sri v7.4s, v19.4s, #25
ror a7, a7, #25
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
rev32 v15.8h, v15.8h
ror a15, a15, #16
rev32 v12.8h, v12.8h
ror a12, a12, #16
rev32 v13.8h, v13.8h
ror a13, a13, #16
rev32 v14.8h, v14.8h
ror a14, a14, #16
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #12
shl v6.4s, v17.4s, #12
@ -431,66 +283,42 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v4.4s, v19.4s, #12
sri v5.4s, v16.4s, #20
ror a5, a5, #20
sri v6.4s, v17.4s, #20
ror a6, a6, #20
sri v7.4s, v18.4s, #20
ror a7, a7, #20
sri v4.4s, v19.4s, #20
ror a4, a4, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
add a0, a0, a5
add v1.4s, v1.4s, v6.4s
add a1, a1, a6
add v2.4s, v2.4s, v7.4s
add a2, a2, a7
add v3.4s, v3.4s, v4.4s
add a3, a3, a4
eor v15.16b, v15.16b, v0.16b
eor a15, a15, a0
eor v12.16b, v12.16b, v1.16b
eor a12, a12, a1
eor v13.16b, v13.16b, v2.16b
eor a13, a13, a2
eor v14.16b, v14.16b, v3.16b
eor a14, a14, a3
tbl v15.16b, {v15.16b}, v31.16b
ror a15, a15, #24
tbl v12.16b, {v12.16b}, v31.16b
ror a12, a12, #24
tbl v13.16b, {v13.16b}, v31.16b
ror a13, a13, #24
tbl v14.16b, {v14.16b}, v31.16b
ror a14, a14, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
add a10, a10, a15
add v11.4s, v11.4s, v12.4s
add a11, a11, a12
add v8.4s, v8.4s, v13.4s
add a8, a8, a13
add v9.4s, v9.4s, v14.4s
add a9, a9, a14
eor v16.16b, v5.16b, v10.16b
eor a5, a5, a10
eor v17.16b, v6.16b, v11.16b
eor a6, a6, a11
eor v18.16b, v7.16b, v8.16b
eor a7, a7, a8
eor v19.16b, v4.16b, v9.16b
eor a4, a4, a9
shl v5.4s, v16.4s, #7
shl v6.4s, v17.4s, #7
@ -498,15 +326,11 @@ SYM_FUNC_START(chacha_4block_xor_neon)
shl v4.4s, v19.4s, #7
sri v5.4s, v16.4s, #25
ror a5, a5, #25
sri v6.4s, v17.4s, #25
ror a6, a6, #25
sri v7.4s, v18.4s, #25
ror a7, a7, #25
sri v4.4s, v19.4s, #25
ror a4, a4, #25
subs w3, w3, #2
subs x3, x3, #1
b.ne .Ldoubleround4
ld4r {v16.4s-v19.4s}, [x0], #16
@ -520,21 +344,9 @@ SYM_FUNC_START(chacha_4block_xor_neon)
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
add v0.4s, v0.4s, v16.4s
mov w6, v16.s[0]
mov w7, v17.s[0]
add v1.4s, v1.4s, v17.4s
mov w8, v18.s[0]
mov w9, v19.s[0]
add v2.4s, v2.4s, v18.4s
add a0, a0, w6
add a1, a1, w7
add v3.4s, v3.4s, v19.4s
add a2, a2, w8
add a3, a3, w9
CPU_BE( rev a0, a0 )
CPU_BE( rev a1, a1 )
CPU_BE( rev a2, a2 )
CPU_BE( rev a3, a3 )
ld4r {v24.4s-v27.4s}, [x0], #16
ld4r {v28.4s-v31.4s}, [x0]
@ -544,154 +356,69 @@ CPU_BE( rev a3, a3 )
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
add v4.4s, v4.4s, v20.4s
mov w6, v20.s[0]
mov w7, v21.s[0]
add v5.4s, v5.4s, v21.4s
mov w8, v22.s[0]
mov w9, v23.s[0]
add v6.4s, v6.4s, v22.4s
add a4, a4, w6
add a5, a5, w7
add v7.4s, v7.4s, v23.4s
add a6, a6, w8
add a7, a7, w9
CPU_BE( rev a4, a4 )
CPU_BE( rev a5, a5 )
CPU_BE( rev a6, a6 )
CPU_BE( rev a7, a7 )
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
add v8.4s, v8.4s, v24.4s
mov w6, v24.s[0]
mov w7, v25.s[0]
add v9.4s, v9.4s, v25.4s
mov w8, v26.s[0]
mov w9, v27.s[0]
add v10.4s, v10.4s, v26.4s
add a8, a8, w6
add a9, a9, w7
add v11.4s, v11.4s, v27.4s
add a10, a10, w8
add a11, a11, w9
CPU_BE( rev a8, a8 )
CPU_BE( rev a9, a9 )
CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
add v12.4s, v12.4s, v28.4s
mov w6, v28.s[0]
mov w7, v29.s[0]
add v13.4s, v13.4s, v29.4s
mov w8, v30.s[0]
mov w9, v31.s[0]
add v14.4s, v14.4s, v30.4s
add a12, a12, w6
add a13, a13, w7
add v15.4s, v15.4s, v31.4s
add a14, a14, w8
add a15, a15, w9
CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
// interleave 32-bit words in state n, n+1
ldp w6, w7, [x2], #64
zip1 v16.4s, v0.4s, v1.4s
ldp w8, w9, [x2, #-56]
eor a0, a0, w6
zip2 v17.4s, v0.4s, v1.4s
eor a1, a1, w7
zip1 v18.4s, v2.4s, v3.4s
eor a2, a2, w8
zip2 v19.4s, v2.4s, v3.4s
eor a3, a3, w9
ldp w6, w7, [x2, #-48]
zip1 v20.4s, v4.4s, v5.4s
ldp w8, w9, [x2, #-40]
eor a4, a4, w6
zip2 v21.4s, v4.4s, v5.4s
eor a5, a5, w7
zip1 v22.4s, v6.4s, v7.4s
eor a6, a6, w8
zip2 v23.4s, v6.4s, v7.4s
eor a7, a7, w9
ldp w6, w7, [x2, #-32]
zip1 v24.4s, v8.4s, v9.4s
ldp w8, w9, [x2, #-24]
eor a8, a8, w6
zip2 v25.4s, v8.4s, v9.4s
eor a9, a9, w7
zip1 v26.4s, v10.4s, v11.4s
eor a10, a10, w8
zip2 v27.4s, v10.4s, v11.4s
eor a11, a11, w9
ldp w6, w7, [x2, #-16]
zip1 v28.4s, v12.4s, v13.4s
ldp w8, w9, [x2, #-8]
eor a12, a12, w6
zip2 v29.4s, v12.4s, v13.4s
eor a13, a13, w7
zip1 v30.4s, v14.4s, v15.4s
eor a14, a14, w8
zip2 v31.4s, v14.4s, v15.4s
eor a15, a15, w9
add x3, x2, x4
sub x3, x3, #128 // start of last block
subs x5, x4, #128
csel x2, x2, x3, ge
// interleave 64-bit words in state n, n+2
zip1 v0.2d, v16.2d, v18.2d
zip2 v4.2d, v16.2d, v18.2d
stp a0, a1, [x1], #64
zip1 v8.2d, v17.2d, v19.2d
zip2 v12.2d, v17.2d, v19.2d
stp a2, a3, [x1, #-56]
subs x6, x4, #192
ld1 {v16.16b-v19.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v1.2d, v20.2d, v22.2d
zip2 v5.2d, v20.2d, v22.2d
stp a4, a5, [x1, #-48]
zip1 v9.2d, v21.2d, v23.2d
zip2 v13.2d, v21.2d, v23.2d
stp a6, a7, [x1, #-40]
subs x7, x4, #256
ld1 {v20.16b-v23.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v2.2d, v24.2d, v26.2d
zip2 v6.2d, v24.2d, v26.2d
stp a8, a9, [x1, #-32]
zip1 v10.2d, v25.2d, v27.2d
zip2 v14.2d, v25.2d, v27.2d
stp a10, a11, [x1, #-24]
subs x8, x4, #320
ld1 {v24.16b-v27.16b}, [x2], #64
csel x2, x2, x3, ge
zip1 v3.2d, v28.2d, v30.2d
zip2 v7.2d, v28.2d, v30.2d
stp a12, a13, [x1, #-16]
zip1 v11.2d, v29.2d, v31.2d
zip2 v15.2d, v29.2d, v31.2d
stp a14, a15, [x1, #-8]
tbnz x5, #63, .Lt128
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
@ -699,107 +426,25 @@ CPU_BE( rev a15, a15 )
eor v17.16b, v17.16b, v1.16b
eor v18.16b, v18.16b, v2.16b
eor v19.16b, v19.16b, v3.16b
tbnz x6, #63, .Lt192
eor v20.16b, v20.16b, v4.16b
eor v21.16b, v21.16b, v5.16b
st1 {v16.16b-v19.16b}, [x1], #64
eor v22.16b, v22.16b, v6.16b
eor v23.16b, v23.16b, v7.16b
st1 {v16.16b-v19.16b}, [x1], #64
tbnz x7, #63, .Lt256
eor v24.16b, v24.16b, v8.16b
eor v25.16b, v25.16b, v9.16b
st1 {v20.16b-v23.16b}, [x1], #64
eor v26.16b, v26.16b, v10.16b
eor v27.16b, v27.16b, v11.16b
st1 {v20.16b-v23.16b}, [x1], #64
tbnz x8, #63, .Lt320
eor v28.16b, v28.16b, v12.16b
st1 {v24.16b-v27.16b}, [x1], #64
eor v29.16b, v29.16b, v13.16b
eor v30.16b, v30.16b, v14.16b
eor v31.16b, v31.16b, v15.16b
st1 {v24.16b-v27.16b}, [x1], #64
st1 {v28.16b-v31.16b}, [x1]
.Lout: frame_pop
ret
ENDPROC(chacha20_4block_xor_neon)
// fewer than 192 bytes of in/output
.Lt192: cbz x5, 1f // exactly 128 bytes?
ld1 {v28.16b-v31.16b}, [x10]
add x5, x5, x1
tbl v28.16b, {v4.16b-v7.16b}, v28.16b
tbl v29.16b, {v4.16b-v7.16b}, v29.16b
tbl v30.16b, {v4.16b-v7.16b}, v30.16b
tbl v31.16b, {v4.16b-v7.16b}, v31.16b
0: eor v20.16b, v20.16b, v28.16b
eor v21.16b, v21.16b, v29.16b
eor v22.16b, v22.16b, v30.16b
eor v23.16b, v23.16b, v31.16b
st1 {v20.16b-v23.16b}, [x5] // overlapping stores
1: st1 {v16.16b-v19.16b}, [x1]
b .Lout
// fewer than 128 bytes of in/output
.Lt128: ld1 {v28.16b-v31.16b}, [x10]
add x5, x5, x1
sub x1, x1, #64
tbl v28.16b, {v0.16b-v3.16b}, v28.16b
tbl v29.16b, {v0.16b-v3.16b}, v29.16b
tbl v30.16b, {v0.16b-v3.16b}, v30.16b
tbl v31.16b, {v0.16b-v3.16b}, v31.16b
ld1 {v16.16b-v19.16b}, [x1] // reload first output block
b 0b
// fewer than 256 bytes of in/output
.Lt256: cbz x6, 2f // exactly 192 bytes?
ld1 {v4.16b-v7.16b}, [x10]
add x6, x6, x1
tbl v0.16b, {v8.16b-v11.16b}, v4.16b
tbl v1.16b, {v8.16b-v11.16b}, v5.16b
tbl v2.16b, {v8.16b-v11.16b}, v6.16b
tbl v3.16b, {v8.16b-v11.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x6] // overlapping stores
2: st1 {v20.16b-v23.16b}, [x1]
b .Lout
// fewer than 320 bytes of in/output
.Lt320: cbz x7, 3f // exactly 256 bytes?
ld1 {v4.16b-v7.16b}, [x10]
add x7, x7, x1
tbl v0.16b, {v12.16b-v15.16b}, v4.16b
tbl v1.16b, {v12.16b-v15.16b}, v5.16b
tbl v2.16b, {v12.16b-v15.16b}, v6.16b
tbl v3.16b, {v12.16b-v15.16b}, v7.16b
eor v28.16b, v28.16b, v0.16b
eor v29.16b, v29.16b, v1.16b
eor v30.16b, v30.16b, v2.16b
eor v31.16b, v31.16b, v3.16b
st1 {v28.16b-v31.16b}, [x7] // overlapping stores
3: st1 {v24.16b-v27.16b}, [x1]
b .Lout
SYM_FUNC_END(chacha_4block_xor_neon)
.section ".rodata", "a", %progbits
.align L1_CACHE_SHIFT
.Lpermute:
.set .Li, 0
.rept 128
.byte (.Li - 64)
.set .Li, .Li + 1
.endr
CTRINC: .word 1, 2, 3, 4
CTRINC: .word 0, 1, 2, 3
ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f

View File

@ -0,0 +1,127 @@
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 - 2017 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
static void chacha20_doneon(u32 *state, u8 *dst, const u8 *src,
unsigned int bytes)
{
u8 buf[CHACHA_BLOCK_SIZE];
while (bytes >= CHACHA_BLOCK_SIZE * 4) {
chacha20_4block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE * 4;
src += CHACHA_BLOCK_SIZE * 4;
dst += CHACHA_BLOCK_SIZE * 4;
state[12] += 4;
}
while (bytes >= CHACHA_BLOCK_SIZE) {
chacha20_block_xor_neon(state, dst, src);
bytes -= CHACHA_BLOCK_SIZE;
src += CHACHA_BLOCK_SIZE;
dst += CHACHA_BLOCK_SIZE;
state[12]++;
}
if (bytes) {
memcpy(buf, src, bytes);
chacha20_block_xor_neon(state, buf, buf);
memcpy(dst, buf, bytes);
}
}
static int chacha20_neon(struct skcipher_request *req)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
u32 state[16];
int err;
if (!may_use_simd() || req->cryptlen <= CHACHA_BLOCK_SIZE)
return crypto_chacha_crypt(req);
err = skcipher_walk_virt(&walk, req, true);
crypto_chacha_init(state, ctx, walk.iv);
kernel_neon_begin();
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
if (nbytes < walk.total)
nbytes = round_down(nbytes, walk.stride);
chacha20_doneon(state, walk.dst.virt.addr, walk.src.virt.addr,
nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
}
kernel_neon_end();
return err;
}
static struct skcipher_alg alg = {
.base.cra_name = "chacha20",
.base.cra_driver_name = "chacha20-neon",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct chacha_ctx),
.base.cra_module = THIS_MODULE,
.min_keysize = CHACHA_KEY_SIZE,
.max_keysize = CHACHA_KEY_SIZE,
.ivsize = CHACHA_IV_SIZE,
.chunksize = CHACHA_BLOCK_SIZE,
.walksize = 4 * CHACHA_BLOCK_SIZE,
.setkey = crypto_chacha20_setkey,
.encrypt = chacha20_neon,
.decrypt = chacha20_neon,
};
static int __init chacha20_simd_mod_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skcipher(&alg);
}
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_skcipher(&alg);
}
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("chacha20");

View File

@ -0,0 +1,266 @@
/*
* Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
/* GPL HEADER START
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 only,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is included
* in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this program; If not, see http://www.gnu.org/licenses
*
* Please visit http://www.xyratex.com/contact if you need additional
* information or have any questions.
*
* GPL HEADER END
*/
/*
* Copyright 2012 Xyratex Technology Limited
*
* Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
* calculation.
* CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
* PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
* at:
* http://www.intel.com/products/processor/manuals/
* Intel(R) 64 and IA-32 Architectures Software Developer's Manual
* Volume 2B: Instruction Set Reference, N-Z
*
* Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
* Alexander Boyko <Alexander_Boyko@xyratex.com>
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
.align 6
.cpu generic+crypto+crc
.Lcrc32_constants:
/*
* [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
* #define CONSTANT_R1 0x154442bd4LL
*
* [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
* #define CONSTANT_R2 0x1c6e41596LL
*/
.octa 0x00000001c6e415960000000154442bd4
/*
* [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
* #define CONSTANT_R3 0x1751997d0LL
*
* [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
* #define CONSTANT_R4 0x0ccaa009eLL
*/
.octa 0x00000000ccaa009e00000001751997d0
/*
* [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
* #define CONSTANT_R5 0x163cd6124LL
*/
.quad 0x0000000163cd6124
.quad 0x00000000FFFFFFFF
/*
* #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
*
* Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
* = 0x1F7011641LL
* #define CONSTANT_RU 0x1F7011641LL
*/
.octa 0x00000001F701164100000001DB710641
.Lcrc32c_constants:
.octa 0x000000009e4addf800000000740eef02
.octa 0x000000014cd00bd600000000f20c0dfe
.quad 0x00000000dd45aab8
.quad 0x00000000FFFFFFFF
.octa 0x00000000dea713f10000000105ec76f0
vCONSTANT .req v0
dCONSTANT .req d0
qCONSTANT .req q0
BUF .req x0
LEN .req x1
CRC .req x2
vzr .req v9
/**
* Calculate crc32
* BUF - buffer
* LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
* CRC - initial crc32
* returns the crc32 in w0
* uint crc32_pmull_le(unsigned char const *buffer,
* size_t len, uint crc32)
*/
ENTRY(crc32_pmull_le)
adr x3, .Lcrc32_constants
b 0f
ENTRY(crc32c_pmull_le)
adr x3, .Lcrc32c_constants
0: bic LEN, LEN, #15
ld1 {v1.16b-v4.16b}, [BUF], #0x40
movi vzr.16b, #0
fmov dCONSTANT, CRC
eor v1.16b, v1.16b, vCONSTANT.16b
sub LEN, LEN, #0x40
cmp LEN, #0x40
b.lt less_64
ldr qCONSTANT, [x3]
loop_64: /* 64 bytes Full cache line folding */
sub LEN, LEN, #0x40
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull2 v6.1q, v2.2d, vCONSTANT.2d
pmull2 v7.1q, v3.2d, vCONSTANT.2d
pmull2 v8.1q, v4.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
pmull v2.1q, v2.1d, vCONSTANT.1d
pmull v3.1q, v3.1d, vCONSTANT.1d
pmull v4.1q, v4.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
ld1 {v5.16b}, [BUF], #0x10
eor v2.16b, v2.16b, v6.16b
ld1 {v6.16b}, [BUF], #0x10
eor v3.16b, v3.16b, v7.16b
ld1 {v7.16b}, [BUF], #0x10
eor v4.16b, v4.16b, v8.16b
ld1 {v8.16b}, [BUF], #0x10
eor v1.16b, v1.16b, v5.16b
eor v2.16b, v2.16b, v6.16b
eor v3.16b, v3.16b, v7.16b
eor v4.16b, v4.16b, v8.16b
cmp LEN, #0x40
b.ge loop_64
less_64: /* Folding cache line into 128bit */
ldr qCONSTANT, [x3, #16]
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v3.16b
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v4.16b
cbz LEN, fold_64
loop_16: /* Folding rest buffer into 128bit */
subs LEN, LEN, #0x10
ld1 {v2.16b}, [BUF], #0x10
pmull2 v5.1q, v1.2d, vCONSTANT.2d
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v5.16b
eor v1.16b, v1.16b, v2.16b
b.ne loop_16
fold_64:
/* perform the last 64 bit fold, also adds 32 zeroes
* to the input stream */
ext v2.16b, v1.16b, v1.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
ext v1.16b, v1.16b, vzr.16b, #8
eor v1.16b, v1.16b, v2.16b
/* final 32-bit fold */
ldr dCONSTANT, [x3, #32]
ldr d3, [x3, #40]
ext v2.16b, v1.16b, vzr.16b, #4
and v1.16b, v1.16b, v3.16b
pmull v1.1q, v1.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
ldr qCONSTANT, [x3, #48]
and v2.16b, v1.16b, v3.16b
ext v2.16b, vzr.16b, v2.16b, #8
pmull2 v2.1q, v2.2d, vCONSTANT.2d
and v2.16b, v2.16b, v3.16b
pmull v2.1q, v2.1d, vCONSTANT.1d
eor v1.16b, v1.16b, v2.16b
mov w0, v1.s[1]
ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
.macro __crc32, c
0: subs x2, x2, #16
b.mi 8f
ldp x3, x4, [x1], #16
CPU_BE( rev x3, x3 )
CPU_BE( rev x4, x4 )
crc32\c\()x w0, w0, x3
crc32\c\()x w0, w0, x4
b.ne 0b
ret
8: tbz x2, #3, 4f
ldr x3, [x1], #8
CPU_BE( rev x3, x3 )
crc32\c\()x w0, w0, x3
4: tbz x2, #2, 2f
ldr w3, [x1], #4
CPU_BE( rev w3, w3 )
crc32\c\()w w0, w0, w3
2: tbz x2, #1, 1f
ldrh w3, [x1], #2
CPU_BE( rev16 w3, w3 )
crc32\c\()h w0, w0, w3
1: tbz x2, #0, 0f
ldrb w3, [x1]
crc32\c\()b w0, w0, w3
0: ret
.endm
.align 5
ENTRY(crc32_armv8_le)
__crc32
ENDPROC(crc32_armv8_le)
.align 5
ENTRY(crc32c_armv8_le)
__crc32 c
ENDPROC(crc32c_armv8_le)


@@ -0,0 +1,244 @@
/*
* Accelerated CRC32(C) using arm64 NEON and Crypto Extensions instructions
*
* Copyright (C) 2016 - 2017 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/cpufeature.h>
#include <linux/crc32.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/string.h>
#include <crypto/internal/hash.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#define PMULL_MIN_LEN 64L /* minimum size of buffer
* for crc32_pmull_le_16 */
#define SCALE_F 16L /* size of NEON register */
asmlinkage u32 crc32_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32_armv8_le(u32 init_crc, const u8 buf[], size_t len);
asmlinkage u32 crc32c_pmull_le(const u8 buf[], u64 len, u32 init_crc);
asmlinkage u32 crc32c_armv8_le(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32)(u32 init_crc, const u8 buf[], size_t len);
static u32 (*fallback_crc32c)(u32 init_crc, const u8 buf[], size_t len);
static int crc32_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = 0;
return 0;
}
static int crc32c_pmull_cra_init(struct crypto_tfm *tfm)
{
u32 *key = crypto_tfm_ctx(tfm);
*key = ~0;
return 0;
}
static int crc32_pmull_setkey(struct crypto_shash *hash, const u8 *key,
unsigned int keylen)
{
u32 *mctx = crypto_shash_ctx(hash);
if (keylen != sizeof(u32)) {
crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
return -EINVAL;
}
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
static int crc32_pmull_init(struct shash_desc *desc)
{
u32 *mctx = crypto_shash_ctx(desc->tfm);
u32 *crc = shash_desc_ctx(desc);
*crc = *mctx;
return 0;
}
static int crc32_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32_armv8_le(*crc, data, length);
return 0;
}
static int crc32c_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
*crc = crc32c_armv8_le(*crc, data, length);
return 0;
}
static int crc32_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0)
*crc = fallback_crc32(*crc, data, length);
return 0;
}
static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u32 *crc = shash_desc_ctx(desc);
unsigned int l;
if ((u64)data % SCALE_F) {
l = min_t(u32, length, SCALE_F - ((u64)data % SCALE_F));
*crc = fallback_crc32c(*crc, data, l);
data += l;
length -= l;
}
if (length >= PMULL_MIN_LEN && may_use_simd()) {
l = round_down(length, SCALE_F);
kernel_neon_begin();
*crc = crc32c_pmull_le(data, l, *crc);
kernel_neon_end();
data += l;
length -= l;
}
if (length > 0) {
*crc = fallback_crc32c(*crc, data, length);
}
return 0;
}
static int crc32_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(*crc, out);
return 0;
}
static int crc32c_pmull_final(struct shash_desc *desc, u8 *out)
{
u32 *crc = shash_desc_ctx(desc);
put_unaligned_le32(~*crc, out);
return 0;
}
static struct shash_alg crc32_pmull_algs[] = { {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32_update,
.final = crc32_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32_pmull_cra_init,
.base.cra_name = "crc32",
.base.cra_driver_name = "crc32-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
}, {
.setkey = crc32_pmull_setkey,
.init = crc32_pmull_init,
.update = crc32c_update,
.final = crc32c_pmull_final,
.descsize = sizeof(u32),
.digestsize = sizeof(u32),
.base.cra_ctxsize = sizeof(u32),
.base.cra_init = crc32c_pmull_cra_init,
.base.cra_name = "crc32c",
.base.cra_driver_name = "crc32c-arm64-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY,
.base.cra_blocksize = 1,
.base.cra_module = THIS_MODULE,
} };
static int __init crc32_pmull_mod_init(void)
{
if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) {
crc32_pmull_algs[0].update = crc32_pmull_update;
crc32_pmull_algs[1].update = crc32c_pmull_update;
if (elf_hwcap & HWCAP_CRC32) {
fallback_crc32 = crc32_armv8_le;
fallback_crc32c = crc32c_armv8_le;
} else {
fallback_crc32 = crc32_le;
fallback_crc32c = __crc32c_le;
}
} else if (!(elf_hwcap & HWCAP_CRC32)) {
return -ENODEV;
}
return crypto_register_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static void __exit crc32_pmull_mod_exit(void)
{
crypto_unregister_shashes(crc32_pmull_algs,
ARRAY_SIZE(crc32_pmull_algs));
}
static const struct cpu_feature crc32_cpu_feature[] = {
{ cpu_feature(CRC32) }, { cpu_feature(PMULL) }, { }
};
MODULE_DEVICE_TABLE(cpu, crc32_cpu_feature);
module_init(crc32_pmull_mod_init);
module_exit(crc32_pmull_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");


@@ -2,14 +2,12 @@
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
@@ -56,176 +54,109 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
// UINT16 crc_t10dif_pcl(
// UINT16 init_crc, //initial CRC value, 16 bits
// const unsigned char *buf, //buffer pointer to calculate CRC on
// UINT64 len //buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
//
#include <linux/linkage.h>
#include <asm/assembler.h>
.text
.arch armv8-a+crypto
.cpu generic+crypto
init_crc .req w0
buf .req x1
len .req x2
fold_consts_ptr .req x3
arg1_low32 .req w0
arg2 .req x1
arg3 .req x2
fold_consts .req v10
vzr .req v13
ad .req v14
ENTRY(crc_t10dif_pmull)
movi vzr.16b, #0 // init zero register
k00_16 .req v15
k32_48 .req v16
// adjust the 16-bit initial_crc value, scale it to 32 bits
lsl arg1_low32, arg1_low32, #16
t3 .req v17
t4 .req v18
t5 .req v19
t6 .req v20
t7 .req v21
t8 .req v22
t9 .req v23
// check if smaller than 256
cmp arg3, #256
perm1 .req v24
perm2 .req v25
perm3 .req v26
perm4 .req v27
// for sizes less than 128, we can't fold 64B at a time...
b.lt _less_than_128
bd1 .req v28
bd2 .req v29
bd3 .req v30
bd4 .req v31
// load the initial crc value
// crc value does not need to be byte-reflected, but it needs
// to be moved to the high part of the register.
// because data will be byte-reflected and will align with
// initial crc at correct place.
movi v10.16b, #0
mov v10.s[3], arg1_low32 // initial crc
.macro __pmull_init_p64
.endm
// receive the initial 64B data, xor the initial crc value
ldp q0, q1, [arg2]
ldp q2, q3, [arg2, #0x20]
ldp q4, q5, [arg2, #0x40]
ldp q6, q7, [arg2, #0x60]
add arg2, arg2, #0x80
.macro __pmull_pre_p64, bd
.endm
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
CPU_LE( rev64 v3.16b, v3.16b )
CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
.macro __pmull_init_p8
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
mov k32_48.h[2], k32_48.h[0]
ushr k00_16.2d, k32_48.2d, #32
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// prepare the permutation vectors
mov_q x5, 0x080f0e0d0c0b0a09
movi perm4.8b, #8
dup perm1.2d, x5
eor perm1.16b, perm1.16b, perm4.16b
ushr perm2.2d, perm1.2d, #8
ushr perm3.2d, perm1.2d, #16
ushr perm4.2d, perm1.2d, #24
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli perm4.2d, perm1.2d, #40
.endm
// XOR the initial_crc value
eor v0.16b, v0.16b, v10.16b
.macro __pmull_pre_p8, bd
tbl bd1.16b, {\bd\().16b}, perm1.16b
tbl bd2.16b, {\bd\().16b}, perm2.16b
tbl bd3.16b, {\bd\().16b}, perm3.16b
tbl bd4.16b, {\bd\().16b}, perm4.16b
.endm
ldr q10, rk3 // xmm10 has rk3 and rk4
// type of pmull instruction
// will determine which constant to use
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
ext t4.8b, ad.8b, ad.8b, #1 // A1
ext t5.8b, ad.8b, ad.8b, #2 // A2
ext t6.8b, ad.8b, ad.8b, #3 // A3
//
// we subtract 256 instead of 128 to save one instruction from the loop
//
sub arg3, arg3, #256
pmull t4.8h, t4.8b, fold_consts.8b // F = A1*B
pmull t8.8h, ad.8b, bd1.8b // E = A*B1
pmull t5.8h, t5.8b, fold_consts.8b // H = A2*B
pmull t7.8h, ad.8b, bd2.8b // G = A*B2
pmull t6.8h, t6.8b, fold_consts.8b // J = A3*B
pmull t9.8h, ad.8b, bd3.8b // I = A*B3
pmull t3.8h, ad.8b, bd4.8b // K = A*B4
b 0f
// at this section of the code, there is 64*x+y (0<=y<64) bytes of
// buffer. The _fold_64_B_loop will fold 64B at a time
// until we have 64+y Bytes of buffer
.L__pmull_p8_core2:
tbl t4.16b, {ad.16b}, perm1.16b // A1
tbl t5.16b, {ad.16b}, perm2.16b // A2
tbl t6.16b, {ad.16b}, perm3.16b // A3
pmull2 t4.8h, t4.16b, fold_consts.16b // F = A1*B
pmull2 t8.8h, ad.16b, bd1.16b // E = A*B1
pmull2 t5.8h, t5.16b, fold_consts.16b // H = A2*B
pmull2 t7.8h, ad.16b, bd2.16b // G = A*B2
pmull2 t6.8h, t6.16b, fold_consts.16b // J = A3*B
pmull2 t9.8h, ad.16b, bd3.16b // I = A*B3
pmull2 t3.8h, ad.16b, bd4.16b // K = A*B4
// fold 64B at a time. This section of the code folds 4 vector
// registers in parallel
_fold_64_B_loop:
0: eor t4.16b, t4.16b, t8.16b // L = E + F
eor t5.16b, t5.16b, t7.16b // M = G + H
eor t6.16b, t6.16b, t9.16b // N = I + J
.macro fold64, reg1, reg2
ldp q11, q12, [arg2], #0x20
uzp1 t8.2d, t4.2d, t5.2d
uzp2 t4.2d, t4.2d, t5.2d
uzp1 t7.2d, t6.2d, t3.2d
uzp2 t6.2d, t6.2d, t3.2d
// t4 = (L) (P0 + P1) << 8
// t5 = (M) (P2 + P3) << 16
eor t8.16b, t8.16b, t4.16b
and t4.16b, t4.16b, k32_48.16b
// t6 = (N) (P4 + P5) << 24
// t7 = (K) (P6 + P7) << 32
eor t7.16b, t7.16b, t6.16b
and t6.16b, t6.16b, k00_16.16b
eor t8.16b, t8.16b, t4.16b
eor t7.16b, t7.16b, t6.16b
zip2 t5.2d, t8.2d, t4.2d
zip1 t4.2d, t8.2d, t4.2d
zip2 t3.2d, t7.2d, t6.2d
zip1 t6.2d, t7.2d, t6.2d
ext t4.16b, t4.16b, t4.16b, #15
ext t5.16b, t5.16b, t5.16b, #14
ext t6.16b, t6.16b, t6.16b, #13
ext t3.16b, t3.16b, t3.16b, #12
eor t4.16b, t4.16b, t5.16b
eor t6.16b, t6.16b, t3.16b
ret
SYM_FUNC_END(__pmull_p8_core)
.macro __pmull_p8, rq, ad, bd, i
.ifnc \bd, fold_consts
.err
.endif
mov ad.16b, \ad\().16b
.ifb \i
pmull \rq\().8h, \ad\().8b, \bd\().8b // D = A*B
.else
pmull2 \rq\().8h, \ad\().16b, \bd\().16b // D = A*B
.endif
bl .L__pmull_p8_core\i
eor \rq\().16b, \rq\().16b, t4.16b
eor \rq\().16b, \rq\().16b, t6.16b
.endm
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
.macro fold_32_bytes, p, reg1, reg2
ldp q11, q12, [buf], #0x20
__pmull_\p v8, \reg1, fold_consts, 2
__pmull_\p \reg1, \reg1, fold_consts
pmull2 v8.1q, \reg1\().2d, v10.2d
pmull \reg1\().1q, \reg1\().1d, v10.1d
CPU_LE( rev64 v11.16b, v11.16b )
CPU_LE( rev64 v12.16b, v12.16b )
__pmull_\p v9, \reg2, fold_consts, 2
__pmull_\p \reg2, \reg2, fold_consts
pmull2 v9.1q, \reg2\().2d, v10.2d
pmull \reg2\().1q, \reg2\().1d, v10.1d
CPU_LE( ext v11.16b, v11.16b, v11.16b, #8 )
CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
@@ -236,279 +167,225 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
eor \reg2\().16b, \reg2\().16b, v12.16b
.endm
// Fold src_reg into dst_reg, optionally loading the next fold constants
.macro fold_16_bytes, p, src_reg, dst_reg, load_next_consts
__pmull_\p v8, \src_reg, fold_consts
__pmull_\p \src_reg, \src_reg, fold_consts, 2
.ifnb \load_next_consts
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
fold64 v0, v1
fold64 v2, v3
fold64 v4, v5
fold64 v6, v7
subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold
b.ge _fold_64_B_loop
// at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3
// fold the 8 vector registers to 1 vector register with different
// constants
ldr q10, rk9
.macro fold16, reg, rk
pmull v8.1q, \reg\().1d, v10.1d
pmull2 \reg\().1q, \reg\().2d, v10.2d
.ifnb \rk
ldr q10, \rk
.endif
eor \dst_reg\().16b, \dst_reg\().16b, v8.16b
eor \dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
.endm
.macro __pmull_p64, rd, rn, rm, n
.ifb \n
pmull \rd\().1q, \rn\().1d, \rm\().1d
.else
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endif
.endm
.macro crc_t10dif_pmull, p
__pmull_init_\p
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
cmp len, #256
b.lt .Lless_than_256_bytes_\@
adr_l fold_consts_ptr, .Lfold_across_128_bytes_consts
// Load the first 128 data bytes. Byte swapping is necessary to make
// the bit order match the polynomial coefficient order.
ldp q0, q1, [buf]
ldp q2, q3, [buf, #0x20]
ldp q4, q5, [buf, #0x40]
ldp q6, q7, [buf, #0x60]
add buf, buf, #0x80
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( rev64 v2.16b, v2.16b )
CPU_LE( rev64 v3.16b, v3.16b )
CPU_LE( rev64 v4.16b, v4.16b )
CPU_LE( rev64 v5.16b, v5.16b )
CPU_LE( rev64 v6.16b, v6.16b )
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
CPU_LE( ext v2.16b, v2.16b, v2.16b, #8 )
CPU_LE( ext v3.16b, v3.16b, v3.16b, #8 )
CPU_LE( ext v4.16b, v4.16b, v4.16b, #8 )
CPU_LE( ext v5.16b, v5.16b, v5.16b, #8 )
CPU_LE( ext v6.16b, v6.16b, v6.16b, #8 )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
// XOR the first 16 data *bits* with the initial CRC value.
movi v8.16b, #0
mov v8.h[7], init_crc
eor v0.16b, v0.16b, v8.16b
// Load the constants for folding across 128 bytes.
ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_pre_\p fold_consts
// Subtract 128 for the 128 data bytes just consumed. Subtract another
// 128 to simplify the termination condition of the following loop.
sub len, len, #256
// While >= 128 data bytes remain (not counting v0-v7), fold the 128
// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
fold_32_bytes \p, v0, v1
fold_32_bytes \p, v2, v3
fold_32_bytes \p, v4, v5
fold_32_bytes \p, v6, v7
subs len, len, #128
b.ge .Lfold_128_bytes_loop_\@
// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.
// Fold across 64 bytes.
add fold_consts_ptr, fold_consts_ptr, #16
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
fold_16_bytes \p, v0, v4
fold_16_bytes \p, v1, v5
fold_16_bytes \p, v2, v6
fold_16_bytes \p, v3, v7, 1
// Fold across 32 bytes.
fold_16_bytes \p, v4, v6
fold_16_bytes \p, v5, v7, 1
// Fold across 16 bytes.
fold_16_bytes \p, v6, v7
// Add 128 to get the correct number of data bytes remaining in 0...127
// (not counting v7), following the previous extra subtraction by 128.
// Then subtract 16 to simplify the termination condition of the
// following loop.
adds len, len, #(128-16)
// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
// into them, storing the result back into v7.
b.lt .Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
__pmull_\p v8, v7, fold_consts
__pmull_\p v7, v7, fold_consts, 2
eor v7.16b, v7.16b, v8.16b
ldr q0, [buf], #16
eor v7.16b, v7.16b, \reg\().16b
.endm
fold16 v0, rk11
fold16 v1, rk13
fold16 v2, rk15
fold16 v3, rk17
fold16 v4, rk19
fold16 v5, rk1
fold16 v6
// instead of 64, we add 48 to the loop counter to save 1 instruction
// from the loop instead of a cmp instruction, we use the negative
// flag with the jl instruction
adds arg3, arg3, #(128-16)
b.lt _final_reduction_for_128
// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
// and the rest is in memory. We can fold 16 bytes at a time if y>=16
// continue folding 16B at a time
_16B_reduction_loop:
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
eor v7.16b, v7.16b, v8.16b
ldr q0, [arg2], #16
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
eor v7.16b, v7.16b, v0.16b
subs len, len, #16
b.ge .Lfold_16_bytes_loop_\@
subs arg3, arg3, #16
.Lfold_16_bytes_loop_done_\@:
// Add 16 to get the correct number of data bytes remaining in 0...15
// (not counting v7), following the previous extra subtraction by 16.
adds len, len, #16
b.eq .Lreduce_final_16_bytes_\@
// instead of a cmp instruction, we utilize the flags with the
// jge instruction equivalent of: cmp arg3, 16-16
// check if there is any more 16B in the buffer to be able to fold
b.ge _16B_reduction_loop
.Lhandle_partial_segment_\@:
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
// 16 bytes are in v7 and the rest are the remaining data in 'buf'. To
// do this without needing a fold constant for each possible 'len',
// redivide the bytes into a first chunk of 'len' bytes and a second
// chunk of 16 bytes, then fold the first chunk into the second.
// now we have 16+z bytes left to reduce, where 0<= z < 16.
// first, we reduce the data in the xmm7 register
// v0 = last 16 original data bytes
add buf, buf, len
ldr q0, [buf, #-16]
CPU_LE( rev64 v0.16b, v0.16b )
CPU_LE( ext v0.16b, v0.16b, v0.16b, #8 )
_final_reduction_for_128:
// check if any more data to fold. If not, compute the CRC of
// the final 128 bits
adds arg3, arg3, #16
b.eq _128_done
// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
adr_l x4, .Lbyteshift_table + 16
sub x4, x4, len
ld1 {v2.16b}, [x4]
tbl v1.16b, {v7.16b}, v2.16b
// here we are getting data that is less than 16 bytes.
// since we know that there was data before the pointer, we can
// offset the input pointer before the actual point, to receive
// exactly 16 bytes. after that the registers need to be adjusted.
_get_last_two_regs:
add arg2, arg2, arg3
ldr q1, [arg2, #-16]
CPU_LE( rev64 v1.16b, v1.16b )
CPU_LE( ext v1.16b, v1.16b, v1.16b, #8 )
// v3 = first chunk: v7 right-shifted by '16-len' bytes.
movi v3.16b, #0x80
eor v2.16b, v2.16b, v3.16b
tbl v3.16b, {v7.16b}, v2.16b
// get rid of the extra data that was loaded before
// load the shift constant
adr x4, tbl_shf_table + 16
sub x4, x4, arg3
ld1 {v0.16b}, [x4]
// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
sshr v2.16b, v2.16b, #7
// shift v2 to the left by arg3 bytes
tbl v2.16b, {v7.16b}, v0.16b
// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
// then '16-len' bytes from v1 (high-order bytes).
bsl v2.16b, v1.16b, v0.16b
// shift v7 to the right by 16-arg3 bytes
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
// Fold the first chunk into the second chunk, storing the result in v7.
__pmull_\p v0, v3, fold_consts
__pmull_\p v7, v3, fold_consts, 2
// blend
sshr v0.16b, v0.16b, #7 // convert to 8-bit mask
bsl v0.16b, v2.16b, v1.16b
// fold 16 Bytes
pmull v8.1q, v7.1d, v10.1d
pmull2 v7.1q, v7.2d, v10.2d
eor v7.16b, v7.16b, v8.16b
eor v7.16b, v7.16b, v0.16b
eor v7.16b, v7.16b, v2.16b
.Lreduce_final_16_bytes_\@:
// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
_128_done:
// compute crc of a 128-bit value
ldr q10, rk5 // rk5 and rk6 in xmm10
movi v2.16b, #0 // init zero register
// 64b fold
ext v0.16b, vzr.16b, v7.16b, #8
mov v7.d[0], v7.d[1]
pmull v7.1q, v7.1d, v10.1d
eor v7.16b, v7.16b, v0.16b
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
// 32b fold
ext v0.16b, v7.16b, vzr.16b, #4
mov v7.s[3], vzr.s[0]
pmull2 v0.1q, v0.2d, v10.2d
eor v7.16b, v7.16b, v0.16b
// Fold the high 64 bits into the low 64 bits, while also multiplying by
// x^64. This produces a 128-bit value congruent to x^64 * M(x) and
// whose low 48 bits are 0.
ext v0.16b, v2.16b, v7.16b, #8
__pmull_\p v7, v7, fold_consts, 2 // high bits * x^48 * (x^80 mod G(x))
eor v0.16b, v0.16b, v7.16b // + low bits * x^64
// barrett reduction
_barrett:
ldr q10, rk7
mov v0.d[0], v7.d[1]
// Fold the high 32 bits into the low 96 bits. This produces a 96-bit
// value congruent to x^64 * M(x) and whose low 48 bits are 0.
ext v1.16b, v0.16b, v2.16b, #12 // extract high 32 bits
mov v0.s[3], v2.s[0] // zero high 32 bits
__pmull_\p v1, v1, fold_consts // high 32 bits * x^48 * (x^48 mod G(x))
eor v0.16b, v0.16b, v1.16b // + low bits
pmull v0.1q, v0.1d, v10.1d
ext v0.16b, vzr.16b, v0.16b, #12
pmull2 v0.1q, v0.2d, v10.2d
ext v0.16b, vzr.16b, v0.16b, #12
eor v7.16b, v7.16b, v0.16b
mov w0, v7.s[1]
// Load G(x) and floor(x^48 / G(x)).
ld1 {fold_consts.2d}, [fold_consts_ptr]
__pmull_pre_\p fold_consts
// Use Barrett reduction to compute the final CRC value.
__pmull_\p v1, v0, fold_consts, 2 // high 32 bits * floor(x^48 / G(x))
ushr v1.2d, v1.2d, #32 // /= x^32
__pmull_\p v1, v1, fold_consts // *= G(x)
ushr v0.2d, v0.2d, #48
eor v0.16b, v0.16b, v1.16b // + low 16 nonzero bits
// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
umov w0, v0.h[0]
.ifc \p, p8
ldp x29, x30, [sp], #16
.endif
_cleanup:
// scale the result back to 16 bits
lsr x0, x0, #16
ret
.Lless_than_256_bytes_\@:
// Checksumming a buffer of length 16...255 bytes
_less_than_128:
cbz arg3, _cleanup
adr_l fold_consts_ptr, .Lfold_across_16_bytes_consts
movi v0.16b, #0
mov v0.s[3], arg1_low32 // get the initial crc value
// Load the first 16 data bytes.
ldr q7, [buf], #0x10
ldr q7, [arg2], #0x10
CPU_LE( rev64 v7.16b, v7.16b )
CPU_LE( ext v7.16b, v7.16b, v7.16b, #8 )
eor v7.16b, v7.16b, v0.16b // xor the initial crc value
// XOR the first 16 data *bits* with the initial CRC value.
movi v0.16b, #0
mov v0.h[7], init_crc
eor v7.16b, v7.16b, v0.16b
cmp arg3, #16
b.eq _128_done // exactly 16 left
b.lt _less_than_16_left
// Load the fold-across-16-bytes constants.
ld1 {fold_consts.2d}, [fold_consts_ptr], #16
__pmull_pre_\p fold_consts
ldr q10, rk1 // rk1 and rk2 in xmm10
cmp len, #16
b.eq .Lreduce_final_16_bytes_\@ // len == 16
subs len, len, #32
b.ge .Lfold_16_bytes_loop_\@ // 32 <= len <= 255
add len, len, #16
b .Lhandle_partial_segment_\@ // 17 <= len <= 31
.endm
// update the counter. subtract 32 instead of 16 to save one
// instruction from the loop
subs arg3, arg3, #32
b.ge _16B_reduction_loop
//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
stp x29, x30, [sp, #-16]!
mov x29, sp
crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)
add arg3, arg3, #16
b _get_last_two_regs
.align 5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
crc_t10dif_pmull p64
SYM_FUNC_END(crc_t10dif_pmull_p64)
_less_than_16_left:
// shl r9, 4
adr x0, tbl_shf_table + 16
sub x0, x0, arg3
ld1 {v0.16b}, [x0]
movi v9.16b, #0x80
eor v0.16b, v0.16b, v9.16b
tbl v7.16b, {v7.16b}, v0.16b
b _128_done
ENDPROC(crc_t10dif_pmull)
.section ".rodata", "a"
// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
.align 4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
.quad 0x0000000000006123 // x^(8*128) mod G(x)
.quad 0x0000000000002295 // x^(8*128+64) mod G(x)
// .Lfold_across_64_bytes_consts:
.quad 0x0000000000001069 // x^(4*128) mod G(x)
.quad 0x000000000000dd31 // x^(4*128+64) mod G(x)
// .Lfold_across_32_bytes_consts:
.quad 0x000000000000857d // x^(2*128) mod G(x)
.quad 0x0000000000007acc // x^(2*128+64) mod G(x)
.Lfold_across_16_bytes_consts:
.quad 0x000000000000a010 // x^(1*128) mod G(x)
.quad 0x0000000000001faa // x^(1*128+64) mod G(x)
// .Lfinal_fold_consts:
.quad 0x1368000000000000 // x^48 * (x^48 mod G(x))
.quad 0x2d56000000000000 // x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
.quad 0x0000000000018bb7 // G(x)
.quad 0x00000001f65a57f8 // floor(x^48 / G(x))
rk1: .octa 0x06df0000000000002d56000000000000
rk3: .octa 0x7cf50000000000009d9d000000000000
rk5: .octa 0x13680000000000002d56000000000000
rk7: .octa 0x000000018bb7000000000001f65a57f8
rk9: .octa 0xbfd6000000000000ceae000000000000
rk11: .octa 0x713c0000000000001e16000000000000
rk13: .octa 0x80a6000000000000f7f9000000000000
rk15: .octa 0xe658000000000000044c000000000000
rk17: .octa 0xa497000000000000ad18000000000000
rk19: .octa 0xe7b50000000000006ee3000000000000
tbl_shf_table:
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
// DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
// DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
// DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
// DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
// DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
// DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
// DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
// DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
// DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
// DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
// DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
// DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
// DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
// DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
// DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
.byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
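Every fold constant above is derived from the T10-DIF generator polynomial G(x) = 0x18BB7. As a reference point (not code from this tree), a bit-serial CRC-T10DIF in plain C that the crc_t10dif_pmull* entry points are expected to agree with for the same initial CRC:

#include <stddef.h>
#include <stdint.h>

/* MSB-first (non-reflected) CRC16 over G(x) = 0x18BB7, initial value 0 */
static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf, size_t len)
{
	while (len--) {
		crc ^= (uint16_t)*buf++ << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (uint16_t)(crc << 1) ^ 0x8bb7
					     : (uint16_t)(crc << 1);
	}
	return crc;
}

/* Check value: crc_t10dif_ref(0, (const uint8_t *)"123456789", 9) == 0xd0db */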


@@ -16,15 +16,13 @@
#include <linux/string.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <asm/neon.h>
#include <asm/simd.h>
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -34,49 +32,15 @@ static int crct10dif_init(struct shash_desc *desc)
return 0;
}
static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
static int crct10dif_update(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p8(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
return 0;
}
static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crc = shash_desc_ctx(desc);
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) {
do {
unsigned int chunk = length;
if (chunk > SZ_4K + CRC_T10DIF_PMULL_CHUNK_SIZE)
chunk = SZ_4K;
kernel_neon_begin();
*crc = crc_t10dif_pmull_p64(*crc, data, chunk);
kernel_neon_end();
data += chunk;
length -= chunk;
} while (length);
kernel_neon_begin();
*crc = crc_t10dif_pmull(*crc, data, length);
kernel_neon_end();
} else {
*crc = crc_t10dif_generic(*crc, data, length);
}
@@ -92,22 +56,10 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
static struct shash_alg crc_t10dif_alg[] = {{
static struct shash_alg crc_t10dif_alg = {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_pmull_p8,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
.base.cra_name = "crct10dif",
.base.cra_driver_name = "crct10dif-arm64-neon",
.base.cra_priority = 100,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_pmull_p64,
.update = crct10dif_update,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
@@ -116,31 +68,20 @@ static struct shash_alg crc_t10dif_alg[] = {{
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}};
};
static int __init crc_t10dif_mod_init(void)
{
if (cpu_have_named_feature(PMULL))
return crypto_register_shashes(crc_t10dif_alg,
ARRAY_SIZE(crc_t10dif_alg));
else
/* only register the first array element */
return crypto_register_shash(crc_t10dif_alg);
return crypto_register_shash(&crc_t10dif_alg);
}
static void __exit crc_t10dif_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
crypto_unregister_shashes(crc_t10dif_alg,
ARRAY_SIZE(crc_t10dif_alg));
else
crypto_unregister_shash(crc_t10dif_alg);
crypto_unregister_shash(&crc_t10dif_alg);
}
module_cpu_feature_match(ASIMD, crc_t10dif_mod_init);
module_cpu_feature_match(PMULL, crc_t10dif_mod_init);
module_exit(crc_t10dif_mod_exit);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-arm64-ce");


@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -16,8 +16,8 @@
T1 .req v2
T2 .req v3
MASK .req v4
XM .req v5
XL .req v6
XL .req v5
XM .req v6
XH .req v7
IN1 .req v7
@@ -46,19 +46,6 @@
ss3 .req v26
ss4 .req v27
XL2 .req v8
XM2 .req v9
XH2 .req v10
XL3 .req v11
XM3 .req v12
XH3 .req v13
TT3 .req v14
TT4 .req v15
HH .req v16
HH3 .req v17
HH4 .req v18
HH34 .req v19
.text
.arch armv8-a+crypto
@@ -147,25 +134,11 @@
.endm
.macro __pmull_pre_p64
add x8, x3, #16
ld1 {HH.2d-HH4.2d}, [x8]
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
eor SHASH2.16b, SHASH2.16b, T1.16b
trn1 HH34.2d, HH3.2d, HH4.2d
trn2 T1.2d, HH3.2d, HH4.2d
eor HH34.16b, HH34.16b, T1.16b
movi MASK.16b, #0xe1
shl MASK.2d, MASK.2d, #57
.endm
.macro __pmull_pre_p8
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
@@ -242,86 +215,20 @@
.macro __pmull_ghash, pn
ld1 {SHASH.2d}, [x3]
ld1 {XL.2d}, [x1]
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
__pmull_pre_\pn
/* do the head block first, if supplied */
cbz x4, 0f
ld1 {T1.2d}, [x4]
mov x4, xzr
b 3f
b 1f
0: .ifc \pn, p64
tbnz w0, #0, 2f // skip until #blocks is a
tbnz w0, #1, 2f // round multiple of 4
1: ld1 {XM3.16b-TT4.16b}, [x2], #64
sub w0, w0, #4
rev64 T1.16b, XM3.16b
rev64 T2.16b, XH3.16b
rev64 TT4.16b, TT4.16b
rev64 TT3.16b, TT3.16b
ext IN1.16b, TT4.16b, TT4.16b, #8
ext XL3.16b, TT3.16b, TT3.16b, #8
eor TT4.16b, TT4.16b, IN1.16b
pmull2 XH2.1q, SHASH.2d, IN1.2d // a1 * b1
pmull XL2.1q, SHASH.1d, IN1.1d // a0 * b0
pmull XM2.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
eor TT3.16b, TT3.16b, XL3.16b
pmull2 XH3.1q, HH.2d, XL3.2d // a1 * b1
pmull XL3.1q, HH.1d, XL3.1d // a0 * b0
pmull2 XM3.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
ext IN1.16b, T2.16b, T2.16b, #8
eor XL2.16b, XL2.16b, XL3.16b
eor XH2.16b, XH2.16b, XH3.16b
eor XM2.16b, XM2.16b, XM3.16b
eor T2.16b, T2.16b, IN1.16b
pmull2 XH3.1q, HH3.2d, IN1.2d // a1 * b1
pmull XL3.1q, HH3.1d, IN1.1d // a0 * b0
pmull XM3.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
eor XL2.16b, XL2.16b, XL3.16b
eor XH2.16b, XH2.16b, XH3.16b
eor XM2.16b, XM2.16b, XM3.16b
ext IN1.16b, T1.16b, T1.16b, #8
ext TT3.16b, XL.16b, XL.16b, #8
eor XL.16b, XL.16b, IN1.16b
eor T1.16b, T1.16b, TT3.16b
pmull2 XH.1q, HH4.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
pmull XL.1q, HH4.1d, XL.1d // a0 * b0
pmull2 XM.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
eor XL.16b, XL.16b, XL2.16b
eor XH.16b, XH.16b, XH2.16b
eor XM.16b, XM.16b, XM2.16b
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
__pmull_reduce_p64
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
cbz w0, 5f
b 1b
.endif
2: ld1 {T1.2d}, [x2], #16
0: ld1 {T1.2d}, [x2], #16
sub w0, w0, #1
3: /* multiply XL by SHASH in GF(2^128) */
1: /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64 T1.16b, T1.16b )
ext T2.16b, XL.16b, XL.16b, #8
@@ -334,7 +241,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
__pmull_\pn XL, XL, SHASH // a0 * b0
__pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
4: eor T2.16b, XL.16b, XH.16b
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
@@ -345,7 +252,7 @@ CPU_LE( rev64 T1.16b, T1.16b )
cbnz w0, 0b
5: st1 {XL.2d}, [x1]
st1 {XL.2d}, [x1]
ret
.endm
@@ -353,45 +260,27 @@ CPU_LE( rev64 T1.16b, T1.16b )
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
* struct ghash_key const *k, const char *head)
*/
SYM_FUNC_START(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p64)
__pmull_ghash p64
SYM_FUNC_END(pmull_ghash_update_p64)
ENDPROC(pmull_ghash_update_p64)
SYM_FUNC_START(pmull_ghash_update_p8)
ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
SYM_FUNC_END(pmull_ghash_update_p8)
ENDPROC(pmull_ghash_update_p8)
KS0 .req v8
KS1 .req v9
KS2 .req v10
KS3 .req v11
KS .req v8
CTR .req v9
INP .req v10
INP0 .req v21
INP1 .req v22
INP2 .req v23
INP3 .req v24
K0 .req v25
K1 .req v26
K2 .req v27
K3 .req v28
K4 .req v12
K5 .req v13
K6 .req v4
K7 .req v5
K8 .req v14
K9 .req v15
KK .req v29
KL .req v30
KM .req v31
.macro load_round_keys, rounds, rk, tmp
add \tmp, \rk, #64
ld1 {K0.4s-K3.4s}, [\rk]
ld1 {K4.4s-K5.4s}, [\tmp]
add \tmp, \rk, \rounds, lsl #4
sub \tmp, \tmp, #32
ld1 {KK.4s-KM.4s}, [\tmp]
.macro load_round_keys, rounds, rk
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
ld1 {v17.4s-v18.4s}, [\rk], #32
1111: ld1 {v19.4s-v20.4s}, [\rk], #32
2222: ld1 {v21.4s-v24.4s}, [\rk], #64
ld1 {v25.4s-v28.4s}, [\rk], #64
ld1 {v29.4s-v31.4s}, [\rk]
.endm
.macro enc_round, state, key
@@ -399,382 +288,157 @@ SYM_FUNC_END(pmull_ghash_update_p8)
aesmc \state\().16b, \state\().16b
.endm
.macro enc_qround, s0, s1, s2, s3, key
enc_round \s0, \key
enc_round \s1, \key
enc_round \s2, \key
enc_round \s3, \key
.endm
.macro enc_block, state, rounds, rk, tmp
add \tmp, \rk, #96
ld1 {K6.4s-K7.4s}, [\tmp], #32
.irp key, K0, K1, K2, K3, K4, K5
.macro enc_block, state, rounds
cmp \rounds, #12
b.lo 2222f /* 128 bits */
b.eq 1111f /* 192 bits */
enc_round \state, v17
enc_round \state, v18
1111: enc_round \state, v19
enc_round \state, v20
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
enc_round \state, \key
.endr
tbnz \rounds, #2, .Lnot128_\@
.Lout256_\@:
enc_round \state, K6
enc_round \state, K7
.Lout192_\@:
enc_round \state, KK
aese \state\().16b, KL.16b
eor \state\().16b, \state\().16b, KM.16b
.subsection 1
.Lnot128_\@:
ld1 {K8.4s-K9.4s}, [\tmp], #32
enc_round \state, K6
enc_round \state, K7
ld1 {K6.4s-K7.4s}, [\tmp]
enc_round \state, K8
enc_round \state, K9
tbz \rounds, #1, .Lout192_\@
b .Lout256_\@
.previous
aese \state\().16b, v30.16b
eor \state\().16b, \state\().16b, v31.16b
.endm
.align 6
.macro pmull_gcm_do_crypt, enc
stp x29, x30, [sp, #-32]!
mov x29, sp
str x19, [sp, #24]
ld1 {SHASH.2d}, [x4]
ld1 {XL.2d}, [x1]
ldr x8, [x5, #8] // load lower counter
load_round_keys x7, x6, x8
ld1 {SHASH.2d}, [x3], #16
ld1 {HH.2d-HH4.2d}, [x3]
trn1 SHASH2.2d, SHASH.2d, HH.2d
trn2 T1.2d, SHASH.2d, HH.2d
eor SHASH2.16b, SHASH2.16b, T1.16b
trn1 HH34.2d, HH3.2d, HH4.2d
trn2 T1.2d, HH3.2d, HH4.2d
eor HH34.16b, HH34.16b, T1.16b
ld1 {XL.2d}, [x4]
cbz x0, 3f // tag only?
ldr w8, [x5, #12] // load lower counter
CPU_LE( rev w8, w8 )
0: mov w9, #4 // max blocks per round
add x10, x0, #0xf
lsr x10, x10, #4 // remaining blocks
subs x0, x0, #64
csel w9, w10, w9, mi
add w8, w8, w9
bmi 1f
ld1 {INP0.16b-INP3.16b}, [x2], #64
.subsection 1
/*
* Populate the four input registers right to left with up to 63 bytes
* of data, using overlapping loads to avoid branches.
*
* INP0 INP1 INP2 INP3
* 1 byte | | | |x |
* 16 bytes | | | |xxxxxxxx|
* 17 bytes | | |xxxxxxxx|x |
* 47 bytes | |xxxxxxxx|xxxxxxxx|xxxxxxx |
* etc etc
*
* Note that this code may read up to 15 bytes before the start of
* the input. It is up to the calling code to ensure this is safe if
* this happens in the first iteration of the loop (i.e., when the
* input size is < 16 bytes)
*/
1: mov x15, #16
ands x19, x0, #0xf
csel x19, x19, x15, ne
adr_l x17, .Lpermute_table + 16
sub x11, x15, x19
add x12, x17, x11
sub x17, x17, x11
ld1 {T1.16b}, [x12]
sub x10, x1, x11
sub x11, x2, x11
cmp x0, #-16
csel x14, x15, xzr, gt
cmp x0, #-32
csel x15, x15, xzr, gt
cmp x0, #-48
csel x16, x19, xzr, gt
csel x1, x1, x10, gt
csel x2, x2, x11, gt
ld1 {INP0.16b}, [x2], x14
ld1 {INP1.16b}, [x2], x15
ld1 {INP2.16b}, [x2], x16
ld1 {INP3.16b}, [x2]
tbl INP3.16b, {INP3.16b}, T1.16b
b 2f
.previous
2: .if \enc == 0
bl pmull_gcm_ghash_4x
.endif
bl pmull_gcm_enc_4x
tbnz x0, #63, 6f
st1 {INP0.16b-INP3.16b}, [x1], #64
.if \enc == 1
bl pmull_gcm_ghash_4x
.endif
bne 0b
3: ldp x19, x10, [sp, #24]
cbz x10, 5f // output tag?
ld1 {INP3.16b}, [x10] // load lengths[]
mov w9, #1
bl pmull_gcm_ghash_4x
mov w11, #(0x1 << 24) // BE '1U'
ld1 {KS0.16b}, [x5]
mov KS0.s[3], w11
enc_block KS0, x7, x6, x12
ext XL.16b, XL.16b, XL.16b, #8
rev64 XL.16b, XL.16b
eor XL.16b, XL.16b, KS0.16b
.if \enc == 1
st1 {XL.16b}, [x10] // store tag
.else
ldp x11, x12, [sp, #40] // load tag pointer and authsize
adr_l x17, .Lpermute_table
ld1 {KS0.16b}, [x11] // load supplied tag
add x17, x17, x12
ld1 {KS1.16b}, [x17] // load permute vector
cmeq XL.16b, XL.16b, KS0.16b // compare tags
mvn XL.16b, XL.16b // -1 for fail, 0 for pass
tbl XL.16b, {XL.16b}, KS1.16b // keep authsize bytes only
sminv b0, XL.16b // signed minimum across XL
smov w0, v0.b[0] // return b0
.endif
4: ldp x29, x30, [sp], #32
ret
5:
CPU_LE( rev w8, w8 )
str w8, [x5, #12] // store lower counter
st1 {XL.2d}, [x4]
b 4b
6: ld1 {T1.16b-T2.16b}, [x17], #32 // permute vectors
sub x17, x17, x19, lsl #1
cmp w9, #1
beq 7f
.subsection 1
7: ld1 {INP2.16b}, [x1]
tbx INP2.16b, {INP3.16b}, T1.16b
mov INP3.16b, INP2.16b
b 8f
.previous
st1 {INP0.16b}, [x1], x14
st1 {INP1.16b}, [x1], x15
st1 {INP2.16b}, [x1], x16
tbl INP3.16b, {INP3.16b}, T1.16b
tbx INP3.16b, {INP2.16b}, T2.16b
8: st1 {INP3.16b}, [x1]
.if \enc == 1
ld1 {T1.16b}, [x17]
tbl INP3.16b, {INP3.16b}, T1.16b // clear non-data bits
bl pmull_gcm_ghash_4x
.endif
b 3b
.endm
/*
* void pmull_gcm_encrypt(int blocks, u8 dst[], const u8 src[],
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
SYM_FUNC_START(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
SYM_FUNC_END(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u8 dst[], const u8 src[],
* struct ghash_key const *k, u64 dg[], u8 ctr[],
* int rounds, u8 tag)
*/
SYM_FUNC_START(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
SYM_FUNC_END(pmull_gcm_decrypt)
SYM_FUNC_START_LOCAL(pmull_gcm_ghash_4x)
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b
rev64 T1.16b, INP0.16b
rev64 T2.16b, INP1.16b
rev64 TT3.16b, INP2.16b
rev64 TT4.16b, INP3.16b
.if \enc == 1
ld1 {KS.16b}, [x7]
.endif
ext XL.16b, XL.16b, XL.16b, #8
0: ld1 {CTR.8b}, [x5] // load upper counter
ld1 {INP.16b}, [x3], #16
rev x9, x8
add x8, x8, #1
sub w0, w0, #1
ins CTR.d[1], x9 // set lower counter
tbz w9, #2, 0f // <4 blocks?
.subsection 1
0: movi XH2.16b, #0
movi XM2.16b, #0
movi XL2.16b, #0
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
st1 {INP.16b}, [x2], #16
.endif
tbz w9, #0, 1f // 2 blocks?
tbz w9, #1, 2f // 1 block?
rev64 T1.16b, INP.16b
eor T2.16b, T2.16b, XL.16b
ext T1.16b, T2.16b, T2.16b, #8
b .Lgh3
cmp w6, #12
b.ge 2f // AES-192/256?
1: eor TT3.16b, TT3.16b, XL.16b
ext T2.16b, TT3.16b, TT3.16b, #8
b .Lgh2
1: enc_round CTR, v21
2: eor TT4.16b, TT4.16b, XL.16b
ext IN1.16b, TT4.16b, TT4.16b, #8
b .Lgh1
.previous
eor T1.16b, T1.16b, XL.16b
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
pmull2 XH2.1q, HH4.2d, IN1.2d // a1 * b1
eor T1.16b, T1.16b, IN1.16b
pmull XL2.1q, HH4.1d, IN1.1d // a0 * b0
pmull2 XM2.1q, HH34.2d, T1.2d // (a1 + a0)(b1 + b0)
enc_round CTR, v22
ext T1.16b, T2.16b, T2.16b, #8
.Lgh3: eor T2.16b, T2.16b, T1.16b
pmull2 XH.1q, HH3.2d, T1.2d // a1 * b1
pmull XL.1q, HH3.1d, T1.1d // a0 * b0
pmull XM.1q, HH34.1d, T2.1d // (a1 + a0)(b1 + b0)
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
eor XH2.16b, XH2.16b, XH.16b
eor XL2.16b, XL2.16b, XL.16b
eor XM2.16b, XM2.16b, XM.16b
enc_round CTR, v23
ext T2.16b, TT3.16b, TT3.16b, #8
.Lgh2: eor TT3.16b, TT3.16b, T2.16b
pmull2 XH.1q, HH.2d, T2.2d // a1 * b1
pmull XL.1q, HH.1d, T2.1d // a0 * b0
pmull2 XM.1q, SHASH2.2d, TT3.2d // (a1 + a0)(b1 + b0)
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
eor XH2.16b, XH2.16b, XH.16b
eor XL2.16b, XL2.16b, XL.16b
eor XM2.16b, XM2.16b, XM.16b
enc_round CTR, v24
ext IN1.16b, TT4.16b, TT4.16b, #8
.Lgh1: eor TT4.16b, TT4.16b, IN1.16b
pmull XL.1q, SHASH.1d, IN1.1d // a0 * b0
pmull2 XH.1q, SHASH.2d, IN1.2d // a1 * b1
pmull XM.1q, SHASH2.1d, TT4.1d // (a1 + a0)(b1 + b0)
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
eor XH.16b, XH.16b, XH2.16b
eor XL.16b, XL.16b, XL2.16b
eor XM.16b, XM.16b, XM2.16b
enc_round CTR, v25
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
enc_round CTR, v26
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d
__pmull_reduce_p64
enc_round CTR, v27
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
enc_round CTR, v28
eor XL.16b, XM.16b, T2.16b
enc_round CTR, v29
ext T2.16b, XL.16b, XL.16b, #8
aese CTR.16b, v30.16b
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
eor KS.16b, CTR.16b, v31.16b
eor XL.16b, XL.16b, T2.16b
ret
SYM_FUNC_END(pmull_gcm_ghash_4x)
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
st1 {INP.16b}, [x2], #16
.endif
SYM_FUNC_START_LOCAL(pmull_gcm_enc_4x)
ld1 {KS0.16b}, [x5] // load upper counter
sub w10, w8, #4
sub w11, w8, #3
sub w12, w8, #2
sub w13, w8, #1
rev w10, w10
rev w11, w11
rev w12, w12
rev w13, w13
mov KS1.16b, KS0.16b
mov KS2.16b, KS0.16b
mov KS3.16b, KS0.16b
ins KS0.s[3], w10 // set lower counter
ins KS1.s[3], w11
ins KS2.s[3], w12
ins KS3.s[3], w13
cbnz w0, 0b
add x10, x6, #96 // round key pointer
ld1 {K6.4s-K7.4s}, [x10], #32
.irp key, K0, K1, K2, K3, K4, K5
enc_qround KS0, KS1, KS2, KS3, \key
.endr
CPU_LE( rev x8, x8 )
st1 {XL.2d}, [x1]
str x8, [x5, #8] // store lower counter
tbnz x7, #2, .Lnot128
.subsection 1
.Lnot128:
ld1 {K8.4s-K9.4s}, [x10], #32
.irp key, K6, K7
enc_qround KS0, KS1, KS2, KS3, \key
.endr
ld1 {K6.4s-K7.4s}, [x10]
.irp key, K8, K9
enc_qround KS0, KS1, KS2, KS3, \key
.endr
tbz x7, #1, .Lout192
b .Lout256
.previous
.Lout256:
.irp key, K6, K7
enc_qround KS0, KS1, KS2, KS3, \key
.endr
.Lout192:
enc_qround KS0, KS1, KS2, KS3, KK
aese KS0.16b, KL.16b
aese KS1.16b, KL.16b
aese KS2.16b, KL.16b
aese KS3.16b, KL.16b
eor KS0.16b, KS0.16b, KM.16b
eor KS1.16b, KS1.16b, KM.16b
eor KS2.16b, KS2.16b, KM.16b
eor KS3.16b, KS3.16b, KM.16b
eor INP0.16b, INP0.16b, KS0.16b
eor INP1.16b, INP1.16b, KS1.16b
eor INP2.16b, INP2.16b, KS2.16b
eor INP3.16b, INP3.16b, KS3.16b
.if \enc == 1
st1 {KS.16b}, [x7]
.endif
ret
SYM_FUNC_END(pmull_gcm_enc_4x)
.section ".rodata", "a"
.align 6
.Lpermute_table:
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
.previous
2: b.eq 3f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
3: enc_round CTR, v19
enc_round CTR, v20
b 1b
.endm
/*
* void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds, u8 ks[])
*/
ENTRY(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
ENDPROC(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds)
*/
ENTRY(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
ENDPROC(pmull_gcm_decrypt)
/*
* void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
*/
ENTRY(pmull_gcm_encrypt_block)
cbz x2, 0f
load_round_keys w3, x2
0: ld1 {v0.16b}, [x1]
enc_block v0, w3
st1 {v0.16b}, [x0]
ret
ENDPROC(pmull_gcm_encrypt_block)
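The three-multiply pattern behind the "(a1 + a0)(b1 + b0)" comments above is Karatsuba over GF(2): one 128x128-bit carryless product is assembled from three 64x64-bit PMULLs, and the modular reduction is done separately. A C sketch of just that decomposition, using a hypothetical clmul64() stand-in for PMULL/PMULL2; not code from this tree:

#include <stdint.h>

typedef struct { uint64_t lo, hi; } u128;	/* hypothetical 128-bit pair */

extern u128 clmul64(uint64_t a, uint64_t b);	/* assumed 64x64 -> 128 carryless multiply */

/* 128x128 -> 256 bit carryless multiply of a = a1:a0 and b = b1:b0 */
static void clmul128(uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0,
		     u128 *hi, u128 *lo)
{
	u128 xh = clmul64(a1, b1);		/* a1 * b1            */
	u128 xl = clmul64(a0, b0);		/* a0 * b0            */
	u128 xm = clmul64(a1 ^ a0, b1 ^ b0);	/* (a1 + a0)(b1 + b0) */

	/* middle term at bit offset 64 is xm + xh + xl (all additions are XOR) */
	xm.lo ^= xh.lo ^ xl.lo;
	xm.hi ^= xh.hi ^ xl.hi;

	lo->lo = xl.lo;
	lo->hi = xl.hi ^ xm.lo;
	hi->lo = xh.lo ^ xm.hi;
	hi->hi = xh.hi;
}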


@@ -1,7 +1,7 @@
/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
* Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
@@ -17,7 +17,6 @@
#include <crypto/gf128mul.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
@@ -34,8 +33,9 @@ MODULE_ALIAS_CRYPTO("ghash");
#define GCM_IV_SIZE 12
struct ghash_key {
be128 k;
u64 h[][2];
u64 a;
u64 b;
be128 k;
};
struct ghash_desc_ctx {
@@ -50,18 +50,29 @@ struct gcm_aes_ctx {
};
asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
u64 const h[][2], const char *head);
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
u64 const h[][2], const char *head);
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_gcm_encrypt(int bytes, u8 dst[], const u8 src[],
u64 const h[][2], u64 dg[], u8 ctr[],
u32 const rk[], int rounds, u8 tag[]);
asmlinkage int pmull_gcm_decrypt(int bytes, u8 dst[], const u8 src[],
u64 const h[][2], u64 dg[], u8 ctr[],
u32 const rk[], int rounds, const u8 l[],
const u8 tag[], u64 authsize);
static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
struct ghash_key const *k,
const char *head);
asmlinkage void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
u8 ctr[], int rounds, u8 ks[]);
asmlinkage void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[],
const u8 src[], struct ghash_key const *k,
u8 ctr[], int rounds);
asmlinkage void pmull_gcm_encrypt_block(u8 dst[], u8 const src[],
u32 const rk[], int rounds);
asmlinkage void __aes_arm64_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
static int ghash_init(struct shash_desc *desc)
{
@@ -73,48 +84,34 @@ static int ghash_init(struct shash_desc *desc)
static void ghash_do_update(int blocks, u64 dg[], const char *src,
struct ghash_key *key, const char *head)
{
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
do {
const u8 *in = src;
if (head) {
in = head;
blocks++;
head = NULL;
} else {
src += GHASH_BLOCK_SIZE;
}
crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
gf128mul_lle(&dst, &key->k);
} while (--blocks);
dg[0] = be64_to_cpu(dst.b);
dg[1] = be64_to_cpu(dst.a);
}
static __always_inline
void ghash_do_simd_update(int blocks, u64 dg[], const char *src,
struct ghash_key *key, const char *head,
void (*simd_update)(int blocks, u64 dg[],
const char *src,
u64 const h[][2],
const char *head))
{
if (likely(may_use_simd())) {
kernel_neon_begin();
simd_update(blocks, dg, src, key->h, head);
pmull_ghash_update(blocks, dg, src, key, head);
kernel_neon_end();
} else {
ghash_do_update(blocks, dg, src, key, head);
be128 dst = { cpu_to_be64(dg[1]), cpu_to_be64(dg[0]) };
do {
const u8 *in = src;
if (head) {
in = head;
blocks++;
head = NULL;
} else {
src += GHASH_BLOCK_SIZE;
}
crypto_xor((u8 *)&dst, in, GHASH_BLOCK_SIZE);
gf128mul_lle(&dst, &key->k);
} while (--blocks);
dg[0] = be64_to_cpu(dst.b);
dg[1] = be64_to_cpu(dst.a);
}
}
/* avoid hogging the CPU for too long */
#define MAX_BLOCKS (SZ_64K / GHASH_BLOCK_SIZE)
static int ghash_update(struct shash_desc *desc, const u8 *src,
unsigned int len)
{
@@ -138,17 +135,11 @@ static int ghash_update(struct shash_desc *desc, const u8 *src,
blocks = len / GHASH_BLOCK_SIZE;
len %= GHASH_BLOCK_SIZE;
do {
int chunk = min(blocks, MAX_BLOCKS);
ghash_do_update(blocks, ctx->digest, src, key,
partial ? ctx->buf : NULL);
ghash_do_simd_update(chunk, ctx->digest, src, key,
partial ? ctx->buf : NULL,
pmull_ghash_update_p8);
blocks -= chunk;
src += chunk * GHASH_BLOCK_SIZE;
partial = 0;
} while (unlikely(blocks > 0));
src += blocks * GHASH_BLOCK_SIZE;
partial = 0;
}
if (len)
memcpy(ctx->buf + partial, src, len);
@@ -165,25 +156,34 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
memset(ctx->buf + partial, 0, GHASH_BLOCK_SIZE - partial);
ghash_do_simd_update(1, ctx->digest, ctx->buf, key, NULL,
pmull_ghash_update_p8);
ghash_do_update(1, ctx->digest, ctx->buf, key, NULL);
}
put_unaligned_be64(ctx->digest[1], dst);
put_unaligned_be64(ctx->digest[0], dst + 8);
memzero_explicit(ctx, sizeof(*ctx));
*ctx = (struct ghash_desc_ctx){};
return 0;
}
static void ghash_reflect(u64 h[], const be128 *k)
static int __ghash_setkey(struct ghash_key *key,
const u8 *inkey, unsigned int keylen)
{
u64 carry = be64_to_cpu(k->a) & BIT(63) ? 1 : 0;
u64 a, b;
h[0] = (be64_to_cpu(k->b) << 1) | carry;
h[1] = (be64_to_cpu(k->a) << 1) | (be64_to_cpu(k->b) >> 63);
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
if (carry)
h[1] ^= 0xc200000000000000UL;
/* perform multiplication by 'x' in GF(2^128) */
b = get_unaligned_be64(inkey);
a = get_unaligned_be64(inkey + 8);
key->a = (a << 1) | (b >> 63);
key->b = (b << 1) | (a >> 63);
if (b >> 63)
key->b ^= 0xc200000000000000UL;
return 0;
}
static int ghash_setkey(struct crypto_shash *tfm,
@@ -196,19 +196,16 @@ static int ghash_setkey(struct crypto_shash *tfm,
return -EINVAL;
}
/* needed for the fallback */
memcpy(&key->k, inkey, GHASH_BLOCK_SIZE);
ghash_reflect(key->h[0], &key->k);
return 0;
return __ghash_setkey(key, inkey, keylen);
}
static struct shash_alg ghash_alg = {
.base.cra_name = "ghash",
.base.cra_driver_name = "ghash-neon",
.base.cra_priority = 150,
.base.cra_driver_name = "ghash-ce",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = GHASH_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct ghash_key) + sizeof(u64[2]),
.base.cra_ctxsize = sizeof(struct ghash_key),
.base.cra_module = THIS_MODULE,
.digestsize = GHASH_DIGEST_SIZE,
@@ -236,33 +233,18 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *inkey,
{
struct gcm_aes_ctx *ctx = crypto_aead_ctx(tfm);
u8 key[GHASH_BLOCK_SIZE];
be128 h;
int ret;
ret = aes_expandkey(&ctx->aes_key, inkey, keylen);
ret = crypto_aes_expand_key(&ctx->aes_key, inkey, keylen);
if (ret) {
tfm->base.crt_flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
return -EINVAL;
}
aes_encrypt(&ctx->aes_key, key, (u8[AES_BLOCK_SIZE]){});
__aes_arm64_encrypt(ctx->aes_key.key_enc, key, (u8[AES_BLOCK_SIZE]){},
num_rounds(&ctx->aes_key));
/* needed for the fallback */
memcpy(&ctx->ghash_key.k, key, GHASH_BLOCK_SIZE);
ghash_reflect(ctx->ghash_key.h[0], &ctx->ghash_key.k);
h = ctx->ghash_key.k;
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[1], &h);
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[2], &h);
gf128mul_lle(&h, &ctx->ghash_key.k);
ghash_reflect(ctx->ghash_key.h[3], &h);
return 0;
return __ghash_setkey(&ctx->ghash_key, key, sizeof(key));
}
static int gcm_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
@@ -294,9 +276,8 @@ static void gcm_update_mac(u64 dg[], const u8 *src, int count, u8 buf[],
if (count >= GHASH_BLOCK_SIZE || *buf_count == GHASH_BLOCK_SIZE) {
int blocks = count / GHASH_BLOCK_SIZE;
ghash_do_simd_update(blocks, dg, src, &ctx->ghash_key,
*buf_count ? buf : NULL,
pmull_ghash_update_p64);
ghash_do_update(blocks, dg, src, &ctx->ghash_key,
*buf_count ? buf : NULL);
src += blocks * GHASH_BLOCK_SIZE;
count %= GHASH_BLOCK_SIZE;
@@ -340,117 +321,121 @@ static void gcm_calculate_auth_mac(struct aead_request *req, u64 dg[])
if (buf_count) {
memset(&buf[buf_count], 0, GHASH_BLOCK_SIZE - buf_count);
ghash_do_simd_update(1, dg, buf, &ctx->ghash_key, NULL,
pmull_ghash_update_p64);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
}
}
static void gcm_final(struct aead_request *req, struct gcm_aes_ctx *ctx,
u64 dg[], u8 tag[], int cryptlen)
{
u8 mac[AES_BLOCK_SIZE];
u128 lengths;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64(cryptlen * 8);
ghash_do_update(1, dg, (void *)&lengths, &ctx->ghash_key, NULL);
put_unaligned_be64(dg[1], mac);
put_unaligned_be64(dg[0], mac + 8);
crypto_xor(tag, mac, AES_BLOCK_SIZE);
}
static int gcm_encrypt(struct aead_request *req)
{
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u8 ks[AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
u64 dg[2] = {};
be128 lengths;
u8 *tag;
int err;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64(req->cryptlen * 8);
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_encrypt(&walk, req, false);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
if (likely(may_use_simd())) {
do {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int nbytes = walk.nbytes;
kernel_neon_begin();
tag = (u8 *)&lengths;
pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
pmull_gcm_encrypt_block(ks, iv, NULL,
num_rounds(&ctx->aes_key));
put_unaligned_be32(3, iv + GCM_IV_SIZE);
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
src = dst = memcpy(buf + sizeof(buf) - nbytes,
src, nbytes);
} else if (nbytes < walk.total) {
nbytes &= ~(AES_BLOCK_SIZE - 1);
tag = NULL;
}
err = skcipher_walk_aead_encrypt(&walk, req, true);
kernel_neon_begin();
pmull_gcm_encrypt(nbytes, dst, src, ctx->ghash_key.h,
dg, iv, ctx->aes_key.key_enc, nrounds,
tag);
kernel_neon_end();
if (unlikely(!nbytes))
break;
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
memcpy(walk.dst.virt.addr,
buf + sizeof(buf) - nbytes, nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
} while (walk.nbytes);
} else {
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int remaining = blocks;
do {
aes_encrypt(&ctx->aes_key, buf, iv);
crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--remaining > 0);
ghash_do_update(blocks, dg, walk.dst.virt.addr,
&ctx->ghash_key, NULL);
pmull_gcm_encrypt(blocks, dg, walk.dst.virt.addr,
walk.src.virt.addr, &ctx->ghash_key,
iv, num_rounds(&ctx->aes_key), ks);
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
kernel_neon_end();
} else {
__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
/* handle the tail */
if (walk.nbytes) {
aes_encrypt(&ctx->aes_key, buf, iv);
err = skcipher_walk_aead_encrypt(&walk, req, true);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
buf, walk.nbytes);
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
memcpy(buf, walk.dst.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
do {
__aes_arm64_encrypt(ctx->aes_key.key_enc,
ks, iv,
num_rounds(&ctx->aes_key));
crypto_xor_cpy(dst, src, ks, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
dst += AES_BLOCK_SIZE;
src += AES_BLOCK_SIZE;
} while (--blocks > 0);
ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg,
walk.dst.virt.addr, &ctx->ghash_key,
NULL);
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
walk.nbytes ? buf : NULL);
if (walk.nbytes)
err = skcipher_walk_done(&walk, 0);
__aes_arm64_encrypt(ctx->aes_key.key_enc, ks, iv,
num_rounds(&ctx->aes_key));
}
put_unaligned_be64(dg[1], tag);
put_unaligned_be64(dg[0], tag + 8);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
aes_encrypt(&ctx->aes_key, iv, iv);
crypto_xor(tag, iv, AES_BLOCK_SIZE);
/* handle the tail */
if (walk.nbytes) {
u8 buf[GHASH_BLOCK_SIZE];
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, ks,
walk.nbytes);
memcpy(buf, walk.dst.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
gcm_final(req, ctx, dg, tag, req->cryptlen);
/* copy authtag to end of dst */
scatterwalk_map_and_copy(tag, req->dst, req->assoclen + req->cryptlen,
crypto_aead_authsize(aead), 1);
@@ -463,81 +448,62 @@ static int gcm_decrypt(struct aead_request *req)
struct crypto_aead *aead = crypto_aead_reqtfm(req);
struct gcm_aes_ctx *ctx = crypto_aead_ctx(aead);
unsigned int authsize = crypto_aead_authsize(aead);
int nrounds = num_rounds(&ctx->aes_key);
struct skcipher_walk walk;
u8 otag[AES_BLOCK_SIZE];
u8 buf[AES_BLOCK_SIZE];
u8 iv[AES_BLOCK_SIZE];
u8 tag[AES_BLOCK_SIZE];
u8 buf[GHASH_BLOCK_SIZE];
u64 dg[2] = {};
be128 lengths;
u8 *tag;
int err;
lengths.a = cpu_to_be64(req->assoclen * 8);
lengths.b = cpu_to_be64((req->cryptlen - authsize) * 8);
if (req->assoclen)
gcm_calculate_auth_mac(req, dg);
memcpy(iv, req->iv, GCM_IV_SIZE);
put_unaligned_be32(2, iv + GCM_IV_SIZE);
scatterwalk_map_and_copy(otag, req->src,
req->assoclen + req->cryptlen - authsize,
authsize, 0);
err = skcipher_walk_aead_decrypt(&walk, req, false);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
if (likely(may_use_simd())) {
int ret;
kernel_neon_begin();
do {
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
int nbytes = walk.nbytes;
pmull_gcm_encrypt_block(tag, iv, ctx->aes_key.key_enc,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
tag = (u8 *)&lengths;
err = skcipher_walk_aead_decrypt(&walk, req, true);
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
pmull_gcm_decrypt(blocks, dg, walk.dst.virt.addr,
walk.src.virt.addr, &ctx->ghash_key,
iv, num_rounds(&ctx->aes_key));
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
if (walk.nbytes)
pmull_gcm_encrypt_block(iv, iv, NULL,
num_rounds(&ctx->aes_key));
kernel_neon_end();
} else {
__aes_arm64_encrypt(ctx->aes_key.key_enc, tag, iv,
num_rounds(&ctx->aes_key));
put_unaligned_be32(2, iv + GCM_IV_SIZE);
err = skcipher_walk_aead_decrypt(&walk, req, true);
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE)) {
src = dst = memcpy(buf + sizeof(buf) - nbytes,
src, nbytes);
} else if (nbytes < walk.total) {
nbytes &= ~(AES_BLOCK_SIZE - 1);
tag = NULL;
}
kernel_neon_begin();
ret = pmull_gcm_decrypt(nbytes, dst, src,
ctx->ghash_key.h,
dg, iv, ctx->aes_key.key_enc,
nrounds, tag, otag, authsize);
kernel_neon_end();
if (unlikely(!nbytes))
break;
if (unlikely(nbytes > 0 && nbytes < AES_BLOCK_SIZE))
memcpy(walk.dst.virt.addr,
buf + sizeof(buf) - nbytes, nbytes);
err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
} while (walk.nbytes);
if (err)
return err;
if (ret)
return -EBADMSG;
} else {
while (walk.nbytes >= AES_BLOCK_SIZE) {
int blocks = walk.nbytes / AES_BLOCK_SIZE;
const u8 *src = walk.src.virt.addr;
u8 *dst = walk.dst.virt.addr;
u8 *src = walk.src.virt.addr;
ghash_do_update(blocks, dg, walk.src.virt.addr,
&ctx->ghash_key, NULL);
do {
aes_encrypt(&ctx->aes_key, buf, iv);
__aes_arm64_encrypt(ctx->aes_key.key_enc,
buf, iv,
num_rounds(&ctx->aes_key));
crypto_xor_cpy(dst, src, buf, AES_BLOCK_SIZE);
crypto_inc(iv, AES_BLOCK_SIZE);
@@ -548,40 +514,35 @@ static int gcm_decrypt(struct aead_request *req)
err = skcipher_walk_done(&walk,
walk.nbytes % AES_BLOCK_SIZE);
}
/* handle the tail */
if (walk.nbytes) {
memcpy(buf, walk.src.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, sizeof(buf) - walk.nbytes);
}
tag = (u8 *)&lengths;
ghash_do_update(1, dg, tag, &ctx->ghash_key,
walk.nbytes ? buf : NULL);
if (walk.nbytes) {
aes_encrypt(&ctx->aes_key, buf, iv);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr,
buf, walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
put_unaligned_be64(dg[1], tag);
put_unaligned_be64(dg[0], tag + 8);
put_unaligned_be32(1, iv + GCM_IV_SIZE);
aes_encrypt(&ctx->aes_key, iv, iv);
crypto_xor(tag, iv, AES_BLOCK_SIZE);
if (crypto_memneq(tag, otag, authsize)) {
memzero_explicit(tag, AES_BLOCK_SIZE);
return -EBADMSG;
}
if (walk.nbytes)
__aes_arm64_encrypt(ctx->aes_key.key_enc, iv, iv,
num_rounds(&ctx->aes_key));
}
/* handle the tail */
if (walk.nbytes) {
memcpy(buf, walk.src.virt.addr, walk.nbytes);
memset(buf + walk.nbytes, 0, GHASH_BLOCK_SIZE - walk.nbytes);
ghash_do_update(1, dg, buf, &ctx->ghash_key, NULL);
crypto_xor_cpy(walk.dst.virt.addr, walk.src.virt.addr, iv,
walk.nbytes);
err = skcipher_walk_done(&walk, 0);
}
if (err)
return err;
gcm_final(req, ctx, dg, tag, req->cryptlen - authsize);
/* compare calculated auth tag with the stored one */
scatterwalk_map_and_copy(buf, req->src,
req->assoclen + req->cryptlen - authsize,
authsize, 0);
if (crypto_memneq(tag, buf, authsize))
return -EBADMSG;
return 0;
}
@@ -598,28 +559,39 @@ static struct aead_alg gcm_aes_alg = {
.base.cra_driver_name = "gcm-aes-ce",
.base.cra_priority = 300,
.base.cra_blocksize = 1,
.base.cra_ctxsize = sizeof(struct gcm_aes_ctx) +
4 * sizeof(u64[2]),
.base.cra_ctxsize = sizeof(struct gcm_aes_ctx),
.base.cra_module = THIS_MODULE,
};
static int __init ghash_ce_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
int ret;
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
if (cpu_have_named_feature(PMULL))
return crypto_register_aead(&gcm_aes_alg);
if (elf_hwcap & HWCAP_PMULL)
pmull_ghash_update = pmull_ghash_update_p64;
return crypto_register_shash(&ghash_alg);
else
pmull_ghash_update = pmull_ghash_update_p8;
ret = crypto_register_shash(&ghash_alg);
if (ret)
return ret;
if (elf_hwcap & HWCAP_PMULL) {
ret = crypto_register_aead(&gcm_aes_alg);
if (ret)
crypto_unregister_shash(&ghash_alg);
}
return ret;
}
static void __exit ghash_ce_mod_exit(void)
{
if (cpu_have_named_feature(PMULL))
crypto_unregister_aead(&gcm_aes_alg);
else
crypto_unregister_shash(&ghash_alg);
crypto_unregister_shash(&ghash_alg);
crypto_unregister_aead(&gcm_aes_alg);
}
static const struct cpu_feature ghash_cpu_feature[] = {


@@ -1,103 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* NH - ε-almost-universal hash function, ARM64 NEON accelerated version
*
* Copyright 2018 Google LLC
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
KEY .req x0
MESSAGE .req x1
MESSAGE_LEN .req x2
HASH .req x3
PASS0_SUMS .req v0
PASS1_SUMS .req v1
PASS2_SUMS .req v2
PASS3_SUMS .req v3
K0 .req v4
K1 .req v5
K2 .req v6
K3 .req v7
T0 .req v8
T1 .req v9
T2 .req v10
T3 .req v11
T4 .req v12
T5 .req v13
T6 .req v14
T7 .req v15
.macro _nh_stride k0, k1, k2, k3
// Load next message stride
ld1 {T3.16b}, [MESSAGE], #16
// Load next key stride
ld1 {\k3\().4s}, [KEY], #16
// Add message words to key words
add T0.4s, T3.4s, \k0\().4s
add T1.4s, T3.4s, \k1\().4s
add T2.4s, T3.4s, \k2\().4s
add T3.4s, T3.4s, \k3\().4s
// Multiply 32x32 => 64 and accumulate
mov T4.d[0], T0.d[1]
mov T5.d[0], T1.d[1]
mov T6.d[0], T2.d[1]
mov T7.d[0], T3.d[1]
umlal PASS0_SUMS.2d, T0.2s, T4.2s
umlal PASS1_SUMS.2d, T1.2s, T5.2s
umlal PASS2_SUMS.2d, T2.2s, T6.2s
umlal PASS3_SUMS.2d, T3.2s, T7.2s
.endm
/*
* void nh_neon(const u32 *key, const u8 *message, size_t message_len,
* u8 hash[NH_HASH_BYTES])
*
* It's guaranteed that message_len % 16 == 0.
*/
SYM_FUNC_START(nh_neon)
ld1 {K0.4s,K1.4s}, [KEY], #32
movi PASS0_SUMS.2d, #0
movi PASS1_SUMS.2d, #0
ld1 {K2.4s}, [KEY], #16
movi PASS2_SUMS.2d, #0
movi PASS3_SUMS.2d, #0
subs MESSAGE_LEN, MESSAGE_LEN, #64
blt .Lloop4_done
.Lloop4:
_nh_stride K0, K1, K2, K3
_nh_stride K1, K2, K3, K0
_nh_stride K2, K3, K0, K1
_nh_stride K3, K0, K1, K2
subs MESSAGE_LEN, MESSAGE_LEN, #64
bge .Lloop4
.Lloop4_done:
ands MESSAGE_LEN, MESSAGE_LEN, #63
beq .Ldone
_nh_stride K0, K1, K2, K3
subs MESSAGE_LEN, MESSAGE_LEN, #16
beq .Ldone
_nh_stride K1, K2, K3, K0
subs MESSAGE_LEN, MESSAGE_LEN, #16
beq .Ldone
_nh_stride K2, K3, K0, K1
.Ldone:
// Sum the accumulators for each pass, then store the sums to 'hash'
addp T0.2d, PASS0_SUMS.2d, PASS1_SUMS.2d
addp T1.2d, PASS2_SUMS.2d, PASS3_SUMS.2d
st1 {T0.16b,T1.16b}, [HASH]
ret
SYM_FUNC_END(nh_neon)


@@ -1,78 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
* (ARM64 NEON accelerated version)
*
* Copyright 2018 Google LLC
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/nhpoly1305.h>
#include <linux/module.h>
asmlinkage void nh_neon(const u32 *key, const u8 *message, size_t message_len,
u8 hash[NH_HASH_BYTES]);
/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
static void _nh_neon(const u32 *key, const u8 *message, size_t message_len,
__le64 hash[NH_NUM_PASSES])
{
nh_neon(key, message, message_len, (u8 *)hash);
}
static int nhpoly1305_neon_update(struct shash_desc *desc,
const u8 *src, unsigned int srclen)
{
if (srclen < 64 || !may_use_simd())
return crypto_nhpoly1305_update(desc, src, srclen);
do {
unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
kernel_neon_begin();
crypto_nhpoly1305_update_helper(desc, src, n, _nh_neon);
kernel_neon_end();
src += n;
srclen -= n;
} while (srclen);
return 0;
}
static struct shash_alg nhpoly1305_alg = {
.base.cra_name = "nhpoly1305",
.base.cra_driver_name = "nhpoly1305-neon",
.base.cra_priority = 200,
.base.cra_ctxsize = sizeof(struct nhpoly1305_key),
.base.cra_module = THIS_MODULE,
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_nhpoly1305_init,
.update = nhpoly1305_neon_update,
.final = crypto_nhpoly1305_final,
.setkey = crypto_nhpoly1305_setkey,
.descsize = sizeof(struct nhpoly1305_state),
};
static int __init nhpoly1305_mod_init(void)
{
if (!cpu_have_named_feature(ASIMD))
return -ENODEV;
return crypto_register_shash(&nhpoly1305_alg);
}
static void __exit nhpoly1305_mod_exit(void)
{
crypto_unregister_shash(&nhpoly1305_alg);
}
module_init(nhpoly1305_mod_init);
module_exit(nhpoly1305_mod_exit);
MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (NEON-accelerated)");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("nhpoly1305");
MODULE_ALIAS_CRYPTO("nhpoly1305-neon");


@@ -58,22 +58,24 @@
sha1su1 v\s0\().4s, v\s3\().4s
.endm
.macro loadrc, k, val, tmp
movz \tmp, :abs_g0_nc:\val
movk \tmp, :abs_g1:\val
dup \k, \tmp
.endm
/*
* The SHA1 round constants
*/
.align 4
.Lsha1_rcon:
.word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6
/*
* int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
* void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
* int blocks)
*/
SYM_FUNC_START(sha1_ce_transform)
ENTRY(sha1_ce_transform)
/* load round constants */
loadrc k0.4s, 0x5a827999, w6
loadrc k1.4s, 0x6ed9eba1, w6
loadrc k2.4s, 0x8f1bbcdc, w6
loadrc k3.4s, 0xca62c1d6, w6
adr x6, .Lsha1_rcon
ld1r {k0.4s}, [x6], #4
ld1r {k1.4s}, [x6], #4
ld1r {k2.4s}, [x6], #4
ld1r {k3.4s}, [x6]
/* load state */
ld1 {dgav.4s}, [x0]
@@ -123,16 +125,14 @@ CPU_LE( rev32 v11.16b, v11.16b )
add dgbv.2s, dgbv.2s, dg1v.2s
add dgav.4s, dgav.4s, dg0v.4s
cbz w2, 2f
cond_yield 3f, x5
b 0b
cbnz w2, 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
2: cbz x4, 3f
cbz x4, 3f
ldr_l w4, sha1_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v9.2d, #0
@@ -148,6 +148,5 @@ CPU_LE( rev32 v11.16b, v11.16b )
/* store new state */
3: st1 {dgav.4s}, [x0]
str dgb, [x0, #16]
mov w0, w2
ret
SYM_FUNC_END(sha1_ce_transform)
ENDPROC(sha1_ce_transform)


@@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha1_base.h>
#include <linux/cpufeature.h>
@@ -22,18 +21,14 @@
MODULE_DESCRIPTION("SHA1 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha1");
struct sha1_ce_state {
struct sha1_state sst;
u32 finalize;
};
extern const u32 sha1_ce_offsetof_count;
extern const u32 sha1_ce_offsetof_finalize;
asmlinkage int sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
asmlinkage void sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
int blocks);
#ifdef CONFIG_CFI_CLANG
static inline void __cfi_sha1_ce_transform(struct sha1_state *sst,
u8 const *src, int blocks)
@@ -43,23 +38,6 @@ static inline void __cfi_sha1_ce_transform(struct sha1_state *sst,
#define sha1_ce_transform __cfi_sha1_ce_transform
#endif
static void __sha1_ce_transform(struct sha1_state *sst, u8 const *src,
int blocks)
{
sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
blocks);
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha1_ce_transform(container_of(sst, struct sha1_ce_state,
sst), src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA1_BLOCK_SIZE;
blocks = rem;
}
}
const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
const u32 sha1_ce_offsetof_finalize = offsetof(struct sha1_ce_state, finalize);
@@ -72,7 +50,10 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data,
return crypto_sha1_update(desc, data, len);
sctx->finalize = 0;
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return 0;
}
@@ -92,9 +73,12 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
sha1_base_do_update(desc, data, len, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_update(desc, data, len,
(sha1_block_fn *)sha1_ce_transform);
if (!finalize)
sha1_base_do_finalize(desc, __sha1_ce_transform);
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
}
@@ -106,41 +90,24 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out)
return crypto_sha1_finup(desc, NULL, 0, out);
sctx->finalize = 0;
sha1_base_do_finalize(desc, __sha1_ce_transform);
kernel_neon_begin();
sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_ce_transform);
kernel_neon_end();
return sha1_base_finish(desc, out);
}
static int sha1_ce_export(struct shash_desc *desc, void *out)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
memcpy(out, &sctx->sst, sizeof(struct sha1_state));
return 0;
}
static int sha1_ce_import(struct shash_desc *desc, const void *in)
{
struct sha1_ce_state *sctx = shash_desc_ctx(desc);
memcpy(&sctx->sst, in, sizeof(struct sha1_state));
sctx->finalize = 0;
return 0;
}
static struct shash_alg alg = {
.init = sha1_base_init,
.update = sha1_ce_update,
.final = sha1_ce_final,
.finup = sha1_ce_finup,
.import = sha1_ce_import,
.export = sha1_ce_export,
.descsize = sizeof(struct sha1_ce_state),
.statesize = sizeof(struct sha1_state),
.digestsize = SHA1_DIGEST_SIZE,
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@@ -53,7 +53,6 @@
/*
* The SHA-256 round constants
*/
.section ".rodata", "a"
.align 4
.Lsha2_rcon:
.word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -77,10 +76,9 @@
* void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sha2_ce_transform)
ENTRY(sha2_ce_transform)
/* load round constants */
adr_l x8, .Lsha2_rcon
adr x8, .Lsha2_rcon
ld1 { v0.4s- v3.4s}, [x8], #64
ld1 { v4.4s- v7.4s}, [x8], #64
ld1 { v8.4s-v11.4s}, [x8], #64
@@ -131,16 +129,14 @@ CPU_LE( rev32 v19.16b, v19.16b )
add dgbv.4s, dgbv.4s, dg1v.4s
/* handled all input blocks? */
cbz w2, 2f
cond_yield 3f, x5
b 0b
cbnz w2, 0b
/*
* Final block: add padding and total bit count.
* Skip if the input size was not a round multiple of the block size,
* the padding is handled by the C code in that case.
*/
2: cbz x4, 3f
cbz x4, 3f
ldr_l w4, sha256_ce_offsetof_count, x4
ldr x4, [x0, x4]
movi v17.2d, #0
@@ -155,6 +151,5 @@ CPU_LE( rev32 v19.16b, v19.16b )
/* store new state */
3: st1 {dgav.4s, dgbv.4s}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha2_ce_transform)
ENDPROC(sha2_ce_transform)


@@ -12,7 +12,6 @@
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <linux/cpufeature.h>
@@ -22,19 +21,14 @@
MODULE_DESCRIPTION("SHA-224/SHA-256 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha224");
MODULE_ALIAS_CRYPTO("sha256");
struct sha256_ce_state {
struct sha256_state sst;
u32 finalize;
};
extern const u32 sha256_ce_offsetof_count;
extern const u32 sha256_ce_offsetof_finalize;
asmlinkage int sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
asmlinkage void sha2_ce_transform(struct sha256_ce_state *sst, u8 const *src,
int blocks);
#ifdef CONFIG_CFI_CLANG
static inline void __cfi_sha2_ce_transform(struct sha256_state *sst,
u8 const *src, int blocks)
@@ -44,23 +38,6 @@ static inline void __cfi_sha2_ce_transform(struct sha256_state *sst,
#define sha2_ce_transform __cfi_sha2_ce_transform
#endif
static void __sha2_ce_transform(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha2_ce_transform(container_of(sst, struct sha256_ce_state, sst), src,
blocks);
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha2_ce_transform(container_of(sst, struct sha256_ce_state,
sst), src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA256_BLOCK_SIZE;
blocks = rem;
}
}
const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
sst.count);
const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
@@ -68,12 +45,6 @@ const u32 sha256_ce_offsetof_finalize = offsetof(struct sha256_ce_state,
asmlinkage void sha256_block_data_order(u32 *digest, u8 const *src, int blocks);
static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_data_order(sst->state, src, blocks);
}
static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
@@ -81,10 +52,13 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data,
if (!may_use_simd())
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sctx->finalize = 0;
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return 0;
}
@@ -98,8 +72,9 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
if (!may_use_simd()) {
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
@@ -109,9 +84,13 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data,
*/
sctx->finalize = finalize;
sha256_base_do_update(desc, data, len, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha2_ce_transform);
if (!finalize)
sha256_base_do_finalize(desc, __sha2_ce_transform);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return sha256_base_finish(desc, out);
}
@@ -120,46 +99,30 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out)
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
if (!may_use_simd()) {
sha256_base_do_finalize(desc, __sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
sctx->finalize = 0;
sha256_base_do_finalize(desc, __sha2_ce_transform);
kernel_neon_begin();
sha256_base_do_finalize(desc, (sha256_block_fn *)sha2_ce_transform);
kernel_neon_end();
return sha256_base_finish(desc, out);
}
static int sha256_ce_export(struct shash_desc *desc, void *out)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
memcpy(out, &sctx->sst, sizeof(struct sha256_state));
return 0;
}
static int sha256_ce_import(struct shash_desc *desc, const void *in)
{
struct sha256_ce_state *sctx = shash_desc_ctx(desc);
memcpy(&sctx->sst, in, sizeof(struct sha256_state));
sctx->finalize = 0;
return 0;
}
static struct shash_alg algs[] = { {
.init = sha224_base_init,
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
.export = sha256_ce_export,
.import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
.statesize = sizeof(struct sha256_state),
.digestsize = SHA224_DIGEST_SIZE,
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@@ -168,15 +131,13 @@ static struct shash_alg algs[] = { {
.update = sha256_ce_update,
.final = sha256_ce_final,
.finup = sha256_ce_finup,
.export = sha256_ce_export,
.import = sha256_ce_import,
.descsize = sizeof(struct sha256_ce_state),
.statesize = sizeof(struct sha256_state),
.digestsize = SHA256_DIGEST_SIZE,
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}


@@ -1,13 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
// This code is taken from the OpenSSL project but the author (Andy Polyakov)
// has relicensed it under the GPLv2. Therefore this program is free software;
// you can redistribute it and/or modify it under the terms of the GNU General
// Public License version 2 as published by the Free Software Foundation.
//
// The original headers, including the original license headers, are
// included below for completeness.
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
@@ -20,6 +10,8 @@
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.


@@ -14,7 +14,6 @@
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha256_base.h>
#include <linux/cryptohash.h>
@@ -32,66 +31,57 @@ asmlinkage void sha256_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha256_block_data_order);
static void __sha256_block_data_order(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_data_order(sst->state, src, blocks);
}
asmlinkage void sha256_block_neon(u32 *digest, const void *data,
unsigned int num_blks);
static void __sha256_block_neon(struct sha256_state *sst, u8 const *src,
int blocks)
{
sha256_block_neon(sst->state, src, blocks);
}
static int crypto_sha256_arm64_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
static int sha256_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
}
static int crypto_sha256_arm64_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
static int sha256_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
return sha256_base_finish(desc, out);
}
static int crypto_sha256_arm64_final(struct shash_desc *desc, u8 *out)
static int sha256_final(struct shash_desc *desc, u8 *out)
{
return crypto_sha256_arm64_finup(desc, NULL, 0, out);
return sha256_finup(desc, NULL, 0, out);
}
static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
.init = sha256_base_init,
.update = crypto_sha256_arm64_update,
.final = crypto_sha256_arm64_final,
.finup = crypto_sha256_arm64_finup,
.update = sha256_update,
.final = sha256_final,
.finup = sha256_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64",
.base.cra_priority = 125,
.base.cra_priority = 100,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = SHA224_DIGEST_SIZE,
.init = sha224_base_init,
.update = crypto_sha256_arm64_update,
.final = crypto_sha256_arm64_final,
.finup = crypto_sha256_arm64_finup,
.update = sha256_update,
.final = sha256_final,
.finup = sha256_finup,
.descsize = sizeof(struct sha256_state),
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64",
.base.cra_priority = 125,
.base.cra_priority = 100,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -99,31 +89,21 @@ static struct shash_alg algs[] = { {
static int sha256_update_neon(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
/*
* Stacking and unstacking a substantial slice of the NEON register
* file may significantly affect performance for small updates when
* executing in interrupt context, so fall back to the scalar code
* in that case.
*/
if (!may_use_simd())
return sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
while (len > 0) {
unsigned int chunk = len;
kernel_neon_begin();
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_block_neon);
kernel_neon_end();
/*
* Don't hog the CPU for the entire time it takes to process all
* input when running on a preemptible kernel, but process the
* data block by block instead.
*/
if (IS_ENABLED(CONFIG_PREEMPT) &&
chunk + sctx->count % SHA256_BLOCK_SIZE > SHA256_BLOCK_SIZE)
chunk = SHA256_BLOCK_SIZE -
sctx->count % SHA256_BLOCK_SIZE;
kernel_neon_begin();
sha256_base_do_update(desc, data, chunk, __sha256_block_neon);
kernel_neon_end();
data += chunk;
len -= chunk;
}
return 0;
}
@@ -133,13 +113,16 @@ static int sha256_finup_neon(struct shash_desc *desc, const u8 *data,
if (!may_use_simd()) {
if (len)
sha256_base_do_update(desc, data, len,
__sha256_block_data_order);
sha256_base_do_finalize(desc, __sha256_block_data_order);
(sha256_block_fn *)sha256_block_data_order);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_data_order);
} else {
if (len)
sha256_update_neon(desc, data, len);
kernel_neon_begin();
sha256_base_do_finalize(desc, __sha256_block_neon);
if (len)
sha256_base_do_update(desc, data, len,
(sha256_block_fn *)sha256_block_neon);
sha256_base_do_finalize(desc,
(sha256_block_fn *)sha256_block_neon);
kernel_neon_end();
}
return sha256_base_finish(desc, out);
@@ -160,6 +143,7 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha256",
.base.cra_driver_name = "sha256-arm64-neon",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -172,6 +156,7 @@ static struct shash_alg neon_algs[] = { {
.base.cra_name = "sha224",
.base.cra_driver_name = "sha224-arm64-neon",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
@@ -182,7 +167,7 @@ static int __init sha256_mod_init(void)
if (ret)
return ret;
if (cpu_have_named_feature(ASIMD)) {
if (elf_hwcap & HWCAP_ASIMD) {
ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs));
if (ret)
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
@@ -192,7 +177,7 @@ static int __init sha256_mod_init(void)
static void __exit sha256_mod_fini(void)
{
if (cpu_have_named_feature(ASIMD))
if (elf_hwcap & HWCAP_ASIMD)
crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs));
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}


@@ -1,212 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
.set .Lv\b\().2d, \b
.set .Lv\b\().16b, \b
.endr
/*
* ARMv8.2 Crypto Extensions instructions
*/
.macro eor3, rd, rn, rm, ra
.inst 0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro rax1, rd, rn, rm
.inst 0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro bcax, rd, rn, rm, ra
.inst 0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro xar, rd, rn, rm, imm6
.inst 0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
.endm
/*
* int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
*/
.text
SYM_FUNC_START(sha3_ce_transform)
/* load state */
add x8, x0, #32
ld1 { v0.1d- v3.1d}, [x0]
ld1 { v4.1d- v7.1d}, [x8], #32
ld1 { v8.1d-v11.1d}, [x8], #32
ld1 {v12.1d-v15.1d}, [x8], #32
ld1 {v16.1d-v19.1d}, [x8], #32
ld1 {v20.1d-v23.1d}, [x8], #32
ld1 {v24.1d}, [x8]
0: sub w2, w2, #1
mov w8, #24
adr_l x9, .Lsha3_rcon
/* load input */
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b-v31.8b}, [x1], #24
eor v0.8b, v0.8b, v25.8b
eor v1.8b, v1.8b, v26.8b
eor v2.8b, v2.8b, v27.8b
eor v3.8b, v3.8b, v28.8b
eor v4.8b, v4.8b, v29.8b
eor v5.8b, v5.8b, v30.8b
eor v6.8b, v6.8b, v31.8b
tbnz x3, #6, 2f // SHA3-512
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b-v30.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
eor v9.8b, v9.8b, v27.8b
eor v10.8b, v10.8b, v28.8b
eor v11.8b, v11.8b, v29.8b
eor v12.8b, v12.8b, v30.8b
tbnz x3, #4, 1f // SHA3-384 or SHA3-224
// SHA3-256
ld1 {v25.8b-v28.8b}, [x1], #32
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
b 3f
1: tbz x3, #2, 3f // bit 2 cleared? SHA-384
// SHA3-224
ld1 {v25.8b-v28.8b}, [x1], #32
ld1 {v29.8b}, [x1], #8
eor v13.8b, v13.8b, v25.8b
eor v14.8b, v14.8b, v26.8b
eor v15.8b, v15.8b, v27.8b
eor v16.8b, v16.8b, v28.8b
eor v17.8b, v17.8b, v29.8b
b 3f
// SHA3-512
2: ld1 {v25.8b-v26.8b}, [x1], #16
eor v7.8b, v7.8b, v25.8b
eor v8.8b, v8.8b, v26.8b
3: sub w8, w8, #1
eor3 v29.16b, v4.16b, v9.16b, v14.16b
eor3 v26.16b, v1.16b, v6.16b, v11.16b
eor3 v28.16b, v3.16b, v8.16b, v13.16b
eor3 v25.16b, v0.16b, v5.16b, v10.16b
eor3 v27.16b, v2.16b, v7.16b, v12.16b
eor3 v29.16b, v29.16b, v19.16b, v24.16b
eor3 v26.16b, v26.16b, v16.16b, v21.16b
eor3 v28.16b, v28.16b, v18.16b, v23.16b
eor3 v25.16b, v25.16b, v15.16b, v20.16b
eor3 v27.16b, v27.16b, v17.16b, v22.16b
rax1 v30.2d, v29.2d, v26.2d // bc[0]
rax1 v26.2d, v26.2d, v28.2d // bc[2]
rax1 v28.2d, v28.2d, v25.2d // bc[4]
rax1 v25.2d, v25.2d, v27.2d // bc[1]
rax1 v27.2d, v27.2d, v29.2d // bc[3]
eor v0.16b, v0.16b, v30.16b
xar v29.2d, v1.2d, v25.2d, (64 - 1)
xar v1.2d, v6.2d, v25.2d, (64 - 44)
xar v6.2d, v9.2d, v28.2d, (64 - 20)
xar v9.2d, v22.2d, v26.2d, (64 - 61)
xar v22.2d, v14.2d, v28.2d, (64 - 39)
xar v14.2d, v20.2d, v30.2d, (64 - 18)
xar v31.2d, v2.2d, v26.2d, (64 - 62)
xar v2.2d, v12.2d, v26.2d, (64 - 43)
xar v12.2d, v13.2d, v27.2d, (64 - 25)
xar v13.2d, v19.2d, v28.2d, (64 - 8)
xar v19.2d, v23.2d, v27.2d, (64 - 56)
xar v23.2d, v15.2d, v30.2d, (64 - 41)
xar v15.2d, v4.2d, v28.2d, (64 - 27)
xar v28.2d, v24.2d, v28.2d, (64 - 14)
xar v24.2d, v21.2d, v25.2d, (64 - 2)
xar v8.2d, v8.2d, v27.2d, (64 - 55)
xar v4.2d, v16.2d, v25.2d, (64 - 45)
xar v16.2d, v5.2d, v30.2d, (64 - 36)
xar v5.2d, v3.2d, v27.2d, (64 - 28)
xar v27.2d, v18.2d, v27.2d, (64 - 21)
xar v3.2d, v17.2d, v26.2d, (64 - 15)
xar v25.2d, v11.2d, v25.2d, (64 - 10)
xar v26.2d, v7.2d, v26.2d, (64 - 6)
xar v30.2d, v10.2d, v30.2d, (64 - 3)
bcax v20.16b, v31.16b, v22.16b, v8.16b
bcax v21.16b, v8.16b, v23.16b, v22.16b
bcax v22.16b, v22.16b, v24.16b, v23.16b
bcax v23.16b, v23.16b, v31.16b, v24.16b
bcax v24.16b, v24.16b, v8.16b, v31.16b
ld1r {v31.2d}, [x9], #8
bcax v17.16b, v25.16b, v19.16b, v3.16b
bcax v18.16b, v3.16b, v15.16b, v19.16b
bcax v19.16b, v19.16b, v16.16b, v15.16b
bcax v15.16b, v15.16b, v25.16b, v16.16b
bcax v16.16b, v16.16b, v3.16b, v25.16b
bcax v10.16b, v29.16b, v12.16b, v26.16b
bcax v11.16b, v26.16b, v13.16b, v12.16b
bcax v12.16b, v12.16b, v14.16b, v13.16b
bcax v13.16b, v13.16b, v29.16b, v14.16b
bcax v14.16b, v14.16b, v26.16b, v29.16b
bcax v7.16b, v30.16b, v9.16b, v4.16b
bcax v8.16b, v4.16b, v5.16b, v9.16b
bcax v9.16b, v9.16b, v6.16b, v5.16b
bcax v5.16b, v5.16b, v30.16b, v6.16b
bcax v6.16b, v6.16b, v4.16b, v30.16b
bcax v3.16b, v27.16b, v0.16b, v28.16b
bcax v4.16b, v28.16b, v1.16b, v0.16b
bcax v0.16b, v0.16b, v2.16b, v1.16b
bcax v1.16b, v1.16b, v27.16b, v2.16b
bcax v2.16b, v2.16b, v28.16b, v27.16b
eor v0.16b, v0.16b, v31.16b
cbnz w8, 3b
cond_yield 3f, x8
cbnz w2, 0b
/* save state */
3: st1 { v0.1d- v3.1d}, [x0], #32
st1 { v4.1d- v7.1d}, [x0], #32
st1 { v8.1d-v11.1d}, [x0], #32
st1 {v12.1d-v15.1d}, [x0], #32
st1 {v16.1d-v19.1d}, [x0], #32
st1 {v20.1d-v23.1d}, [x0], #32
st1 {v24.1d}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha3_ce_transform)
.section ".rodata", "a"
.align 8
.Lsha3_rcon:
.quad 0x0000000000000001, 0x0000000000008082, 0x800000000000808a
.quad 0x8000000080008000, 0x000000000000808b, 0x0000000080000001
.quad 0x8000000080008081, 0x8000000000008009, 0x000000000000008a
.quad 0x0000000000000088, 0x0000000080008009, 0x000000008000000a
.quad 0x000000008000808b, 0x800000000000008b, 0x8000000000008089
.quad 0x8000000000008003, 0x8000000000008002, 0x8000000000000080
.quad 0x000000000000800a, 0x800000008000000a, 0x8000000080008081
.quad 0x8000000000008080, 0x0000000080000001, 0x8000000080008008


@@ -1,166 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha3-ce-glue.c - core SHA-3 transform using v8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha3.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SHA3 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha3-224");
MODULE_ALIAS_CRYPTO("sha3-256");
MODULE_ALIAS_CRYPTO("sha3-384");
MODULE_ALIAS_CRYPTO("sha3-512");
asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
int md_len);
static int sha3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha3_state *sctx = shash_desc_ctx(desc);
unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
if (!may_use_simd())
return crypto_sha3_update(desc, data, len);
if ((sctx->partial + len) >= sctx->rsiz) {
int blocks;
if (sctx->partial) {
int p = sctx->rsiz - sctx->partial;
memcpy(sctx->buf + sctx->partial, data, p);
kernel_neon_begin();
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
kernel_neon_end();
data += p;
len -= p;
sctx->partial = 0;
}
blocks = len / sctx->rsiz;
len %= sctx->rsiz;
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha3_ce_transform(sctx->st, data, blocks,
digest_size);
kernel_neon_end();
data += (blocks - rem) * sctx->rsiz;
blocks = rem;
}
}
if (len) {
memcpy(sctx->buf + sctx->partial, data, len);
sctx->partial += len;
}
return 0;
}
static int sha3_final(struct shash_desc *desc, u8 *out)
{
struct sha3_state *sctx = shash_desc_ctx(desc);
unsigned int digest_size = crypto_shash_digestsize(desc->tfm);
__le64 *digest = (__le64 *)out;
int i;
if (!may_use_simd())
return crypto_sha3_final(desc, out);
sctx->buf[sctx->partial++] = 0x06;
memset(sctx->buf + sctx->partial, 0, sctx->rsiz - sctx->partial);
sctx->buf[sctx->rsiz - 1] |= 0x80;
kernel_neon_begin();
sha3_ce_transform(sctx->st, sctx->buf, 1, digest_size);
kernel_neon_end();
for (i = 0; i < digest_size / 8; i++)
put_unaligned_le64(sctx->st[i], digest++);
if (digest_size & 4)
put_unaligned_le32(sctx->st[i], (__le32 *)digest);
memzero_explicit(sctx, sizeof(*sctx));
return 0;
}
static struct shash_alg algs[] = { {
.digestsize = SHA3_224_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-224",
.base.cra_driver_name = "sha3-224-ce",
.base.cra_blocksize = SHA3_224_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_256_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-256",
.base.cra_driver_name = "sha3-256-ce",
.base.cra_blocksize = SHA3_256_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_384_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-384",
.base.cra_driver_name = "sha3-384-ce",
.base.cra_blocksize = SHA3_384_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
}, {
.digestsize = SHA3_512_DIGEST_SIZE,
.init = crypto_sha3_init,
.update = sha3_update,
.final = sha3_final,
.descsize = sizeof(struct sha3_state),
.base.cra_name = "sha3-512",
.base.cra_driver_name = "sha3-512-ce",
.base.cra_blocksize = SHA3_512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
} };
static int __init sha3_neon_mod_init(void)
{
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
static void __exit sha3_neon_mod_fini(void)
{
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_cpu_feature_match(SHA3, sha3_neon_mod_init);
module_exit(sha3_neon_mod_fini);


@@ -1,14 +1,4 @@
#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
@@ -21,6 +11,8 @@
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPLv2 terms is granted.
# ====================================================================
#
# SHA256/512 for ARMv8.


@@ -1,206 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha512-ce-core.S - core SHA-384/SHA-512 transform using v8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
.set .Lq\b, \b
.set .Lv\b\().2d, \b
.endr
.macro sha512h, rd, rn, rm
.inst 0xce608000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sha512h2, rd, rn, rm
.inst 0xce608400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sha512su0, rd, rn
.inst 0xcec08000 | .L\rd | (.L\rn << 5)
.endm
.macro sha512su1, rd, rn, rm
.inst 0xce608800 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
/*
* The SHA-512 round constants
*/
.section ".rodata", "a"
.align 4
.Lsha512_rcon:
.quad 0x428a2f98d728ae22, 0x7137449123ef65cd
.quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
.quad 0x3956c25bf348b538, 0x59f111f1b605d019
.quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
.quad 0xd807aa98a3030242, 0x12835b0145706fbe
.quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
.quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
.quad 0x9bdc06a725c71235, 0xc19bf174cf692694
.quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
.quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
.quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
.quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
.quad 0x983e5152ee66dfab, 0xa831c66d2db43210
.quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
.quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
.quad 0x06ca6351e003826f, 0x142929670a0e6e70
.quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
.quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
.quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
.quad 0x81c2c92e47edaee6, 0x92722c851482353b
.quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
.quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
.quad 0xd192e819d6ef5218, 0xd69906245565a910
.quad 0xf40e35855771202a, 0x106aa07032bbd1b8
.quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
.quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
.quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
.quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
.quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
.quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
.quad 0x90befffa23631e28, 0xa4506cebde82bde9
.quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
.quad 0xca273eceea26619c, 0xd186b8c721c0c207
.quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
.quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
.quad 0x113f9804bef90dae, 0x1b710b35131c471b
.quad 0x28db77f523047d84, 0x32caab7b40c72493
.quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
.quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
.quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
.macro dround, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4
.ifnb \rc1
ld1 {v\rc1\().2d}, [x4], #16
.endif
add v5.2d, v\rc0\().2d, v\in0\().2d
ext v6.16b, v\i2\().16b, v\i3\().16b, #8
ext v5.16b, v5.16b, v5.16b, #8
ext v7.16b, v\i1\().16b, v\i2\().16b, #8
add v\i3\().2d, v\i3\().2d, v5.2d
.ifnb \in1
ext v5.16b, v\in3\().16b, v\in4\().16b, #8
sha512su0 v\in0\().2d, v\in1\().2d
.endif
sha512h q\i3, q6, v7.2d
.ifnb \in1
sha512su1 v\in0\().2d, v\in2\().2d, v5.2d
.endif
add v\i4\().2d, v\i1\().2d, v\i3\().2d
sha512h2 q\i3, q\i1, v\i0\().2d
.endm
/*
* void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sha512_ce_transform)
/* load state */
ld1 {v8.2d-v11.2d}, [x0]
/* load first 4 round constants */
adr_l x3, .Lsha512_rcon
ld1 {v20.2d-v23.2d}, [x3], #64
/* load input */
0: ld1 {v12.2d-v15.2d}, [x1], #64
ld1 {v16.2d-v19.2d}, [x1], #64
sub w2, w2, #1
CPU_LE( rev64 v12.16b, v12.16b )
CPU_LE( rev64 v13.16b, v13.16b )
CPU_LE( rev64 v14.16b, v14.16b )
CPU_LE( rev64 v15.16b, v15.16b )
CPU_LE( rev64 v16.16b, v16.16b )
CPU_LE( rev64 v17.16b, v17.16b )
CPU_LE( rev64 v18.16b, v18.16b )
CPU_LE( rev64 v19.16b, v19.16b )
mov x4, x3 // rc pointer
mov v0.16b, v8.16b
mov v1.16b, v9.16b
mov v2.16b, v10.16b
mov v3.16b, v11.16b
// v0 ab cd -- ef gh ab
// v1 cd -- ef gh ab cd
// v2 ef gh ab cd -- ef
// v3 gh ab cd -- ef gh
// v4 -- ef gh ab cd --
dround 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17
dround 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18
dround 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19
dround 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12
dround 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13
dround 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14
dround 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15
dround 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16
dround 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17
dround 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18
dround 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19
dround 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12
dround 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13
dround 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14
dround 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15
dround 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16
dround 3, 0, 4, 2, 1, 28, 24, 12, 13, 19, 16, 17
dround 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18
dround 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19
dround 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12
dround 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13
dround 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14
dround 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15
dround 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16
dround 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17
dround 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18
dround 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19
dround 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12
dround 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13
dround 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14
dround 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15
dround 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16
dround 2, 3, 1, 4, 0, 28, 24, 12
dround 4, 2, 0, 1, 3, 29, 25, 13
dround 1, 4, 3, 0, 2, 30, 26, 14
dround 0, 1, 2, 3, 4, 31, 27, 15
dround 3, 0, 4, 2, 1, 24, , 16
dround 2, 3, 1, 4, 0, 25, , 17
dround 4, 2, 0, 1, 3, 26, , 18
dround 1, 4, 3, 0, 2, 27, , 19
/* update state */
add v8.2d, v8.2d, v0.2d
add v9.2d, v9.2d, v1.2d
add v10.2d, v10.2d, v2.2d
add v11.2d, v11.2d, v3.2d
cond_yield 3f, x4
/* handled all input blocks? */
cbnz w2, 0b
/* store new state */
3: st1 {v8.2d-v11.2d}, [x0]
mov w0, w2
ret
SYM_FUNC_END(sha512_ce_transform)


@@ -1,121 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* sha512-ce-glue.c - SHA-384/SHA-512 using ARMv8 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sha.h>
#include <crypto/sha512_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SHA-384/SHA-512 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
asmlinkage int sha512_ce_transform(struct sha512_state *sst, u8 const *src,
int blocks);
asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
static void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
int blocks)
{
while (blocks) {
int rem;
kernel_neon_begin();
rem = sha512_ce_transform(sst, src, blocks);
kernel_neon_end();
src += (blocks - rem) * SHA512_BLOCK_SIZE;
blocks = rem;
}
}
static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
int blocks)
{
sha512_block_data_order(sst->state, src, blocks);
}
static int sha512_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_update(desc, data, len, fn);
return 0;
}
static int sha512_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_update(desc, data, len, fn);
sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static int sha512_ce_final(struct shash_desc *desc, u8 *out)
{
sha512_block_fn *fn = may_use_simd() ? __sha512_ce_transform
: __sha512_block_data_order;
sha512_base_do_finalize(desc, fn);
return sha512_base_finish(desc, out);
}
static struct shash_alg algs[] = { {
.init = sha384_base_init,
.update = sha512_ce_update,
.final = sha512_ce_final,
.finup = sha512_ce_finup,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA384_DIGEST_SIZE,
.base.cra_name = "sha384",
.base.cra_driver_name = "sha384-ce",
.base.cra_priority = 200,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.init = sha512_base_init,
.update = sha512_ce_update,
.final = sha512_ce_final,
.finup = sha512_ce_finup,
.descsize = sizeof(struct sha512_state),
.digestsize = SHA512_DIGEST_SIZE,
.base.cra_name = "sha512",
.base.cra_driver_name = "sha512-ce",
.base.cra_priority = 200,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };
static int __init sha512_ce_mod_init(void)
{
return crypto_register_shashes(algs, ARRAY_SIZE(algs));
}
static void __exit sha512_ce_mod_fini(void)
{
crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
}
module_cpu_feature_match(SHA512, sha512_ce_mod_init);
module_exit(sha512_ce_mod_fini);


@@ -1,13 +1,3 @@
// SPDX-License-Identifier: GPL-2.0
// This code is taken from the OpenSSL project but the author (Andy Polyakov)
// has relicensed it under the GPLv2. Therefore this program is free software;
// you can redistribute it and/or modify it under the terms of the GNU General
// Public License version 2 as published by the Free Software Foundation.
//
// The original headers, including the original license headers, are
// included below for completeness.
// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
@@ -20,6 +10,8 @@
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.


@@ -25,21 +25,14 @@ MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("sha384");
MODULE_ALIAS_CRYPTO("sha512");
asmlinkage void sha512_block_data_order(u64 *digest, const void *data,
asmlinkage void sha512_block_data_order(u32 *digest, const void *data,
unsigned int num_blks);
EXPORT_SYMBOL(sha512_block_data_order);
static void __sha512_block_data_order(struct sha512_state *sst, u8 const *src,
int blocks)
{
sha512_block_data_order(sst->state, src, blocks);
}
static int sha512_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
return sha512_base_do_update(desc, data, len,
__sha512_block_data_order);
(sha512_block_fn *)sha512_block_data_order);
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
@@ -47,8 +40,9 @@ static int sha512_finup(struct shash_desc *desc, const u8 *data,
{
if (len)
sha512_base_do_update(desc, data, len,
__sha512_block_data_order);
sha512_base_do_finalize(desc, __sha512_block_data_order);
(sha512_block_fn *)sha512_block_data_order);
sha512_base_do_finalize(desc,
(sha512_block_fn *)sha512_block_data_order);
return sha512_base_finish(desc, out);
}
@@ -68,6 +62,7 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha512",
.base.cra_driver_name = "sha512-arm64",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA512_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
@@ -80,6 +75,7 @@ static struct shash_alg algs[] = { {
.base.cra_name = "sha384",
.base.cra_driver_name = "sha384-arm64",
.base.cra_priority = 150,
.base.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.base.cra_blocksize = SHA384_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
} };


@@ -1,141 +0,0 @@
/*
* sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
.set .Lv\b\().4s, \b
.endr
.macro sm3partw1, rd, rn, rm
.inst 0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sm3partw2, rd, rn, rm
.inst 0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
.endm
.macro sm3ss1, rd, rn, rm, ra
.inst 0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
.endm
.macro sm3tt1a, rd, rn, rm, imm2
.inst 0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt1b, rd, rn, rm, imm2
.inst 0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt2a, rd, rn, rm, imm2
.inst 0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro sm3tt2b, rd, rn, rm, imm2
.inst 0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
.endm
.macro round, ab, s0, t0, t1, i
sm3ss1 v5.4s, v8.4s, \t0\().4s, v9.4s
shl \t1\().4s, \t0\().4s, #1
sri \t1\().4s, \t0\().4s, #31
sm3tt1\ab v8.4s, v5.4s, v10.4s, \i
sm3tt2\ab v9.4s, v5.4s, \s0\().4s, \i
.endm
.macro qround, ab, s0, s1, s2, s3, s4
.ifnb \s4
ext \s4\().16b, \s1\().16b, \s2\().16b, #12
ext v6.16b, \s0\().16b, \s1\().16b, #12
ext v7.16b, \s2\().16b, \s3\().16b, #8
sm3partw1 \s4\().4s, \s0\().4s, \s3\().4s
.endif
eor v10.16b, \s0\().16b, \s1\().16b
round \ab, \s0, v11, v12, 0
round \ab, \s0, v12, v11, 1
round \ab, \s0, v11, v12, 2
round \ab, \s0, v12, v11, 3
.ifnb \s4
sm3partw2 \s4\().4s, v7.4s, v6.4s
.endif
.endm
/*
* void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
* int blocks)
*/
.text
SYM_FUNC_START(sm3_ce_transform)
/* load state */
ld1 {v8.4s-v9.4s}, [x0]
rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
adr_l x8, .Lt
ldp s13, s14, [x8]
/* load input */
0: ld1 {v0.16b-v3.16b}, [x1], #64
sub w2, w2, #1
mov v15.16b, v8.16b
mov v16.16b, v9.16b
CPU_LE( rev32 v0.16b, v0.16b )
CPU_LE( rev32 v1.16b, v1.16b )
CPU_LE( rev32 v2.16b, v2.16b )
CPU_LE( rev32 v3.16b, v3.16b )
ext v11.16b, v13.16b, v13.16b, #4
qround a, v0, v1, v2, v3, v4
qround a, v1, v2, v3, v4, v0
qround a, v2, v3, v4, v0, v1
qround a, v3, v4, v0, v1, v2
ext v11.16b, v14.16b, v14.16b, #4
qround b, v4, v0, v1, v2, v3
qround b, v0, v1, v2, v3, v4
qround b, v1, v2, v3, v4, v0
qround b, v2, v3, v4, v0, v1
qround b, v3, v4, v0, v1, v2
qround b, v4, v0, v1, v2, v3
qround b, v0, v1, v2, v3, v4
qround b, v1, v2, v3, v4, v0
qround b, v2, v3, v4, v0, v1
qround b, v3, v4
qround b, v4, v0
qround b, v0, v1
eor v8.16b, v8.16b, v15.16b
eor v9.16b, v9.16b, v16.16b
/* handled all input blocks? */
cbnz w2, 0b
/* save state */
rev64 v8.4s, v8.4s
rev64 v9.4s, v9.4s
ext v8.16b, v8.16b, v8.16b, #8
ext v9.16b, v9.16b, v9.16b, #8
st1 {v8.4s-v9.4s}, [x0]
ret
SYM_FUNC_END(sm3_ce_transform)
.section ".rodata", "a"
.align 3
.Lt: .word 0x79cc4519, 0x9d8a7a87

View File

@ -1,92 +0,0 @@
/*
* sm3-ce-glue.c - SM3 secure hash using ARMv8.2 Crypto Extensions
*
* Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <crypto/sm3.h>
#include <crypto/sm3_base.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
MODULE_DESCRIPTION("SM3 secure hash using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
int blocks);
static int sm3_ce_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
if (!may_use_simd())
return crypto_sm3_update(desc, data, len);
kernel_neon_begin();
sm3_base_do_update(desc, data, len, sm3_ce_transform);
kernel_neon_end();
return 0;
}
static int sm3_ce_final(struct shash_desc *desc, u8 *out)
{
if (!may_use_simd())
return crypto_sm3_finup(desc, NULL, 0, out);
kernel_neon_begin();
sm3_base_do_finalize(desc, sm3_ce_transform);
kernel_neon_end();
return sm3_base_finish(desc, out);
}
static int sm3_ce_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
if (!may_use_simd())
return crypto_sm3_finup(desc, data, len, out);
kernel_neon_begin();
sm3_base_do_update(desc, data, len, sm3_ce_transform);
kernel_neon_end();
return sm3_ce_final(desc, out);
}
static struct shash_alg sm3_alg = {
.digestsize = SM3_DIGEST_SIZE,
.init = sm3_base_init,
.update = sm3_ce_update,
.final = sm3_ce_final,
.finup = sm3_ce_finup,
.descsize = sizeof(struct sm3_state),
.base.cra_name = "sm3",
.base.cra_driver_name = "sm3-ce",
.base.cra_blocksize = SM3_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
.base.cra_priority = 200,
};
static int __init sm3_ce_mod_init(void)
{
return crypto_register_shash(&sm3_alg);
}
static void __exit sm3_ce_mod_fini(void)
{
crypto_unregister_shash(&sm3_alg);
}
module_cpu_feature_match(SM3, sm3_ce_mod_init);
module_exit(sm3_ce_mod_fini);
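The glue code above follows the usual pattern for CE/NEON hash drivers: if SIMD cannot be used in the current context, fall back to the generic C implementation, otherwise bracket the accelerated transform with kernel_neon_begin()/kernel_neon_end(). A minimal standalone sketch of that shape, with demo_* stand-ins for the kernel helpers (assumptions, not the real API):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative sketch only: demo_* functions are hypothetical stand-ins. */
static bool demo_may_use_simd(void)
{
	/* in the kernel this depends on context (no NEON in hard IRQ, etc.) */
	return true;
}

static void demo_neon_begin(void) { puts("NEON state saved, SIMD enabled"); }
static void demo_neon_end(void)   { puts("NEON state restored"); }

static int demo_generic_update(const unsigned char *data, unsigned int len)
{
	(void)data;
	printf("generic C path: %u bytes\n", len);
	return 0;
}

static int demo_ce_update(const unsigned char *data, unsigned int len)
{
	if (!demo_may_use_simd())
		return demo_generic_update(data, len);	/* fallback path */

	demo_neon_begin();
	printf("CE/NEON path: %u bytes\n", len);	/* accelerated transform */
	demo_neon_end();
	return 0;
}

int main(void)
{
	unsigned char buf[64] = { 0 };

	return demo_ce_update(buf, sizeof(buf));
}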

View File

@ -1,36 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <asm/assembler.h>
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8
.set .Lv\b\().4s, \b
.endr
.macro sm4e, rd, rn
.inst 0xcec08400 | .L\rd | (.L\rn << 5)
.endm
/*
* void sm4_ce_do_crypt(const u32 *rk, u32 *out, const u32 *in);
*/
.text
SYM_FUNC_START(sm4_ce_do_crypt)
ld1 {v8.4s}, [x2]
ld1 {v0.4s-v3.4s}, [x0], #64
CPU_LE( rev32 v8.16b, v8.16b )
ld1 {v4.4s-v7.4s}, [x0]
sm4e v8.4s, v0.4s
sm4e v8.4s, v1.4s
sm4e v8.4s, v2.4s
sm4e v8.4s, v3.4s
sm4e v8.4s, v4.4s
sm4e v8.4s, v5.4s
sm4e v8.4s, v6.4s
sm4e v8.4s, v7.4s
rev64 v8.4s, v8.4s
ext v8.16b, v8.16b, v8.16b, #8
CPU_LE( rev32 v8.16b, v8.16b )
st1 {v8.4s}, [x1]
ret
SYM_FUNC_END(sm4_ce_do_crypt)

View File

@ -1,74 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/sm4.h>
#include <crypto/internal/simd.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/types.h>
MODULE_ALIAS_CRYPTO("sm4");
MODULE_ALIAS_CRYPTO("sm4-ce");
MODULE_DESCRIPTION("SM4 symmetric cipher using ARMv8 Crypto Extensions");
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
asmlinkage void sm4_ce_do_crypt(const u32 *rk, void *out, const void *in);
static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
crypto_sm4_encrypt(tfm, out, in);
} else {
kernel_neon_begin();
sm4_ce_do_crypt(ctx->rkey_enc, out, in);
kernel_neon_end();
}
}
static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in)
{
const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm);
if (!may_use_simd()) {
crypto_sm4_decrypt(tfm, out, in);
} else {
kernel_neon_begin();
sm4_ce_do_crypt(ctx->rkey_dec, out, in);
kernel_neon_end();
}
}
static struct crypto_alg sm4_ce_alg = {
.cra_name = "sm4",
.cra_driver_name = "sm4-ce",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = SM4_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_sm4_ctx),
.cra_module = THIS_MODULE,
.cra_u.cipher = {
.cia_min_keysize = SM4_KEY_SIZE,
.cia_max_keysize = SM4_KEY_SIZE,
.cia_setkey = crypto_sm4_set_key,
.cia_encrypt = sm4_ce_encrypt,
.cia_decrypt = sm4_ce_decrypt
}
};
static int __init sm4_ce_mod_init(void)
{
return crypto_register_alg(&sm4_ce_alg);
}
static void __exit sm4_ce_mod_fini(void)
{
crypto_unregister_alg(&sm4_ce_alg);
}
module_cpu_feature_match(SM4, sm4_ce_mod_init);
module_exit(sm4_ce_mod_fini);

View File

@ -1,352 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
*
* Copyright (c) 2018 Google, Inc
*
* Author: Eric Biggers <ebiggers@google.com>
*/
#include <linux/linkage.h>
.text
// arguments
ROUND_KEYS .req x0 // const {u64,u32} *round_keys
NROUNDS .req w1 // int nrounds
NROUNDS_X .req x1
DST .req x2 // void *dst
SRC .req x3 // const void *src
NBYTES .req w4 // unsigned int nbytes
TWEAK .req x5 // void *tweak
// registers which hold the data being encrypted/decrypted
// (underscores avoid a naming collision with ARM64 registers x0-x3)
X_0 .req v0
Y_0 .req v1
X_1 .req v2
Y_1 .req v3
X_2 .req v4
Y_2 .req v5
X_3 .req v6
Y_3 .req v7
// the round key, duplicated in all lanes
ROUND_KEY .req v8
// index vector for tbl-based 8-bit rotates
ROTATE_TABLE .req v9
ROTATE_TABLE_Q .req q9
// temporary registers
TMP0 .req v10
TMP1 .req v11
TMP2 .req v12
TMP3 .req v13
// multiplication table for updating XTS tweaks
GFMUL_TABLE .req v14
GFMUL_TABLE_Q .req q14
// next XTS tweak value(s)
TWEAKV_NEXT .req v15
// XTS tweaks for the blocks currently being encrypted/decrypted
TWEAKV0 .req v16
TWEAKV1 .req v17
TWEAKV2 .req v18
TWEAKV3 .req v19
TWEAKV4 .req v20
TWEAKV5 .req v21
TWEAKV6 .req v22
TWEAKV7 .req v23
.align 4
.Lror64_8_table:
.octa 0x080f0e0d0c0b0a090007060504030201
.Lror32_8_table:
.octa 0x0c0f0e0d080b0a090407060500030201
.Lrol64_8_table:
.octa 0x0e0d0c0b0a09080f0605040302010007
.Lrol32_8_table:
.octa 0x0e0d0c0f0a09080b0605040702010003
.Lgf128mul_table:
.octa 0x00000000000000870000000000000001
.Lgf64mul_table:
.octa 0x0000000000000000000000002d361b00
/*
* _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
*
* Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
* Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
* of ROUND_KEY. 'n' is the lane size: 64 for Speck128, or 32 for Speck64.
* 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
*/
.macro _speck_round_128bytes n, lanes
// x = ror(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
// x += y
add X_0.\lanes, X_0.\lanes, Y_0.\lanes
add X_1.\lanes, X_1.\lanes, Y_1.\lanes
add X_2.\lanes, X_2.\lanes, Y_2.\lanes
add X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// y = rol(y, 3)
shl TMP0.\lanes, Y_0.\lanes, #3
shl TMP1.\lanes, Y_1.\lanes, #3
shl TMP2.\lanes, Y_2.\lanes, #3
shl TMP3.\lanes, Y_3.\lanes, #3
sri TMP0.\lanes, Y_0.\lanes, #(\n - 3)
sri TMP1.\lanes, Y_1.\lanes, #(\n - 3)
sri TMP2.\lanes, Y_2.\lanes, #(\n - 3)
sri TMP3.\lanes, Y_3.\lanes, #(\n - 3)
// y ^= x
eor Y_0.16b, TMP0.16b, X_0.16b
eor Y_1.16b, TMP1.16b, X_1.16b
eor Y_2.16b, TMP2.16b, X_2.16b
eor Y_3.16b, TMP3.16b, X_3.16b
.endm
/*
* _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
*
* This is the inverse of _speck_round_128bytes().
*/
.macro _speck_unround_128bytes n, lanes
// y ^= x
eor TMP0.16b, Y_0.16b, X_0.16b
eor TMP1.16b, Y_1.16b, X_1.16b
eor TMP2.16b, Y_2.16b, X_2.16b
eor TMP3.16b, Y_3.16b, X_3.16b
// y = ror(y, 3)
ushr Y_0.\lanes, TMP0.\lanes, #3
ushr Y_1.\lanes, TMP1.\lanes, #3
ushr Y_2.\lanes, TMP2.\lanes, #3
ushr Y_3.\lanes, TMP3.\lanes, #3
sli Y_0.\lanes, TMP0.\lanes, #(\n - 3)
sli Y_1.\lanes, TMP1.\lanes, #(\n - 3)
sli Y_2.\lanes, TMP2.\lanes, #(\n - 3)
sli Y_3.\lanes, TMP3.\lanes, #(\n - 3)
// x ^= k
eor X_0.16b, X_0.16b, ROUND_KEY.16b
eor X_1.16b, X_1.16b, ROUND_KEY.16b
eor X_2.16b, X_2.16b, ROUND_KEY.16b
eor X_3.16b, X_3.16b, ROUND_KEY.16b
// x -= y
sub X_0.\lanes, X_0.\lanes, Y_0.\lanes
sub X_1.\lanes, X_1.\lanes, Y_1.\lanes
sub X_2.\lanes, X_2.\lanes, Y_2.\lanes
sub X_3.\lanes, X_3.\lanes, Y_3.\lanes
// x = rol(x, 8)
tbl X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
tbl X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
tbl X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
tbl X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm
.macro _next_xts_tweak next, cur, tmp, n
.if \n == 64
/*
* Calculate the next tweak by multiplying the current one by x,
* modulo p(x) = x^128 + x^7 + x^2 + x + 1.
*/
sshr \tmp\().2d, \cur\().2d, #63
and \tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
shl \next\().2d, \cur\().2d, #1
ext \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
eor \next\().16b, \next\().16b, \tmp\().16b
.else
/*
* Calculate the next two tweaks by multiplying the current ones by x^2,
* modulo p(x) = x^64 + x^4 + x^3 + x + 1.
*/
ushr \tmp\().2d, \cur\().2d, #62
shl \next\().2d, \cur\().2d, #2
tbl \tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
eor \next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
/*
* _speck_xts_crypt() - Speck-XTS encryption/decryption
*
* Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
* using Speck-XTS, specifically the variant with a block size of '2n' and round
* count given by NROUNDS. The expanded round keys are given in ROUND_KEYS, and
* the current XTS tweak value is given in TWEAK. It's assumed that NBYTES is a
* nonzero multiple of 128.
*/
.macro _speck_xts_crypt n, lanes, decrypting
/*
* If decrypting, modify the ROUND_KEYS parameter to point to the last
* round key rather than the first, since for decryption the round keys
* are used in reverse order.
*/
.if \decrypting
mov NROUNDS, NROUNDS /* zero the high 32 bits */
.if \n == 64
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
sub ROUND_KEYS, ROUND_KEYS, #8
.else
add ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
sub ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif
// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
ldr ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
ldr ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif
// One-time XTS preparation
.if \n == 64
// Load first tweak
ld1 {TWEAKV0.16b}, [TWEAK]
// Load GF(2^128) multiplication table
ldr GFMUL_TABLE_Q, .Lgf128mul_table
.else
// Load first tweak
ld1 {TWEAKV0.8b}, [TWEAK]
// Load GF(2^64) multiplication table
ldr GFMUL_TABLE_Q, .Lgf64mul_table
// Calculate second tweak, packing it together with the first
ushr TMP0.2d, TWEAKV0.2d, #63
shl TMP1.2d, TWEAKV0.2d, #1
tbl TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
eor TMP0.8b, TMP0.8b, TMP1.8b
mov TWEAKV0.d[1], TMP0.d[0]
.endif
.Lnext_128bytes_\@:
// Calculate XTS tweaks for next 128 bytes
_next_xts_tweak TWEAKV1, TWEAKV0, TMP0, \n
_next_xts_tweak TWEAKV2, TWEAKV1, TMP0, \n
_next_xts_tweak TWEAKV3, TWEAKV2, TMP0, \n
_next_xts_tweak TWEAKV4, TWEAKV3, TMP0, \n
_next_xts_tweak TWEAKV5, TWEAKV4, TMP0, \n
_next_xts_tweak TWEAKV6, TWEAKV5, TMP0, \n
_next_xts_tweak TWEAKV7, TWEAKV6, TMP0, \n
_next_xts_tweak TWEAKV_NEXT, TWEAKV7, TMP0, \n
// Load the next source blocks into {X,Y}[0-3]
ld1 {X_0.16b-Y_1.16b}, [SRC], #64
ld1 {X_2.16b-Y_3.16b}, [SRC], #64
// XOR the source blocks with their XTS tweaks
eor TMP0.16b, X_0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor TMP1.16b, X_1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor TMP2.16b, X_2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor TMP3.16b, X_3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
/*
* De-interleave the 'x' and 'y' elements of each block, i.e. make it so
* that the X[0-3] registers contain only the second halves of blocks,
* and the Y[0-3] registers contain only the first halves of blocks.
* (Speck uses the order (y, x) rather than the more intuitive (x, y).)
*/
uzp2 X_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp1 Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
uzp2 X_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp1 Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
uzp2 X_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp1 Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
uzp2 X_3.\lanes, TMP3.\lanes, Y_3.\lanes
uzp1 Y_3.\lanes, TMP3.\lanes, Y_3.\lanes
// Do the cipher rounds
mov x6, ROUND_KEYS
mov w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
ld1r {ROUND_KEY.\lanes}, [x6]
sub x6, x6, #( \n / 8 )
_speck_unround_128bytes \n, \lanes
.else
ld1r {ROUND_KEY.\lanes}, [x6], #( \n / 8 )
_speck_round_128bytes \n, \lanes
.endif
subs w7, w7, #1
bne .Lnext_round_\@
// Re-interleave the 'x' and 'y' elements of each block
zip1 TMP0.\lanes, Y_0.\lanes, X_0.\lanes
zip2 Y_0.\lanes, Y_0.\lanes, X_0.\lanes
zip1 TMP1.\lanes, Y_1.\lanes, X_1.\lanes
zip2 Y_1.\lanes, Y_1.\lanes, X_1.\lanes
zip1 TMP2.\lanes, Y_2.\lanes, X_2.\lanes
zip2 Y_2.\lanes, Y_2.\lanes, X_2.\lanes
zip1 TMP3.\lanes, Y_3.\lanes, X_3.\lanes
zip2 Y_3.\lanes, Y_3.\lanes, X_3.\lanes
// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
eor X_0.16b, TMP0.16b, TWEAKV0.16b
eor Y_0.16b, Y_0.16b, TWEAKV1.16b
eor X_1.16b, TMP1.16b, TWEAKV2.16b
eor Y_1.16b, Y_1.16b, TWEAKV3.16b
eor X_2.16b, TMP2.16b, TWEAKV4.16b
eor Y_2.16b, Y_2.16b, TWEAKV5.16b
eor X_3.16b, TMP3.16b, TWEAKV6.16b
eor Y_3.16b, Y_3.16b, TWEAKV7.16b
mov TWEAKV0.16b, TWEAKV_NEXT.16b
// Store the ciphertext in the destination buffer
st1 {X_0.16b-Y_1.16b}, [DST], #64
st1 {X_2.16b-Y_3.16b}, [DST], #64
// Continue if there are more 128-byte chunks remaining
subs NBYTES, NBYTES, #128
bne .Lnext_128bytes_\@
// Store the next tweak and return
.if \n == 64
st1 {TWEAKV_NEXT.16b}, [TWEAK]
.else
st1 {TWEAKV_NEXT.8b}, [TWEAK]
.endif
ret
.endm
ENTRY(speck128_xts_encrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)
ENTRY(speck128_xts_decrypt_neon)
_speck_xts_crypt n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)
ENTRY(speck64_xts_encrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)
ENTRY(speck64_xts_decrypt_neon)
_speck_xts_crypt n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)
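The round macros above vectorize the scalar Speck round described in the comments: x = ror(x, 8); x += y; x ^= k; y = rol(y, 3); y ^= x. For reference, a standalone C sketch of one Speck128 encryption round on a single block (helper names and the round key below are illustrative, not a test vector):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch only: one scalar Speck128 round, matching the per-lane
 * operation _speck_round_128bytes performs on eight blocks at once with NEON.
 */
static inline uint64_t ror64(uint64_t v, unsigned int n)
{
	return (v >> n) | (v << (64 - n));
}

static inline uint64_t rol64(uint64_t v, unsigned int n)
{
	return (v << n) | (v >> (64 - n));
}

static void speck128_round(uint64_t *x, uint64_t *y, uint64_t k)
{
	*x = ror64(*x, 8);	/* x = ror(x, 8) */
	*x += *y;		/* x += y       */
	*x ^= k;		/* x ^= k       */
	*y = rol64(*y, 3);	/* y = rol(y, 3) */
	*y ^= *x;		/* y ^= x       */
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL, y = 0xfedcba9876543210ULL;

	speck128_round(&x, &y, 0x0f0e0d0c0b0a0908ULL);	/* arbitrary round key */
	printf("x=%016llx y=%016llx\n",
	       (unsigned long long)x, (unsigned long long)y);
	return 0;
}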

View File

@ -1,282 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
* (64-bit version; based on the 32-bit version)
*
* Copyright (c) 2018 Google, Inc
*/
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <crypto/algapi.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/skcipher.h>
#include <crypto/speck.h>
#include <crypto/xts.h>
#include <linux/kernel.h>
#include <linux/module.h>
/* The assembly functions only handle multiples of 128 bytes */
#define SPECK_NEON_CHUNK_SIZE 128
/* Speck128 */
struct speck128_xts_tfm_ctx {
struct speck128_tfm_ctx main_key;
struct speck128_tfm_ctx tweak_key;
};
asmlinkage void speck128_xts_encrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck128_xts_decrypt_neon(const u64 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck128_crypt_one_t)(const struct speck128_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck128_xts_crypt_many_t)(const u64 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck128_xts_crypt(struct skcipher_request *req,
speck128_crypt_one_t crypt_one,
speck128_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
le128 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck128_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
le128_xor((le128 *)dst, (const le128 *)src, &tweak);
(*crypt_one)(&ctx->main_key, dst, dst);
le128_xor((le128 *)dst, (const le128 *)dst, &tweak);
gf128mul_x_ble(&tweak, &tweak);
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck128_xts_encrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_encrypt,
speck128_xts_encrypt_neon);
}
static int speck128_xts_decrypt(struct skcipher_request *req)
{
return __speck128_xts_crypt(req, crypto_speck128_decrypt,
speck128_xts_decrypt_neon);
}
static int speck128_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck128_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck128_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck128_setkey(&ctx->tweak_key, key + keylen, keylen);
}
/* Speck64 */
struct speck64_xts_tfm_ctx {
struct speck64_tfm_ctx main_key;
struct speck64_tfm_ctx tweak_key;
};
asmlinkage void speck64_xts_encrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
asmlinkage void speck64_xts_decrypt_neon(const u32 *round_keys, int nrounds,
void *dst, const void *src,
unsigned int nbytes, void *tweak);
typedef void (*speck64_crypt_one_t)(const struct speck64_tfm_ctx *,
u8 *, const u8 *);
typedef void (*speck64_xts_crypt_many_t)(const u32 *, int, void *,
const void *, unsigned int, void *);
static __always_inline int
__speck64_xts_crypt(struct skcipher_request *req, speck64_crypt_one_t crypt_one,
speck64_xts_crypt_many_t crypt_many)
{
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
const struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
struct skcipher_walk walk;
__le64 tweak;
int err;
err = skcipher_walk_virt(&walk, req, true);
crypto_speck64_encrypt(&ctx->tweak_key, (u8 *)&tweak, walk.iv);
while (walk.nbytes > 0) {
unsigned int nbytes = walk.nbytes;
u8 *dst = walk.dst.virt.addr;
const u8 *src = walk.src.virt.addr;
if (nbytes >= SPECK_NEON_CHUNK_SIZE && may_use_simd()) {
unsigned int count;
count = round_down(nbytes, SPECK_NEON_CHUNK_SIZE);
kernel_neon_begin();
(*crypt_many)(ctx->main_key.round_keys,
ctx->main_key.nrounds,
dst, src, count, &tweak);
kernel_neon_end();
dst += count;
src += count;
nbytes -= count;
}
/* Handle any remainder with generic code */
while (nbytes >= sizeof(tweak)) {
*(__le64 *)dst = *(__le64 *)src ^ tweak;
(*crypt_one)(&ctx->main_key, dst, dst);
*(__le64 *)dst ^= tweak;
tweak = cpu_to_le64((le64_to_cpu(tweak) << 1) ^
((tweak & cpu_to_le64(1ULL << 63)) ?
0x1B : 0));
dst += sizeof(tweak);
src += sizeof(tweak);
nbytes -= sizeof(tweak);
}
err = skcipher_walk_done(&walk, nbytes);
}
return err;
}
static int speck64_xts_encrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_encrypt,
speck64_xts_encrypt_neon);
}
static int speck64_xts_decrypt(struct skcipher_request *req)
{
return __speck64_xts_crypt(req, crypto_speck64_decrypt,
speck64_xts_decrypt_neon);
}
static int speck64_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct speck64_xts_tfm_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
keylen /= 2;
err = crypto_speck64_setkey(&ctx->main_key, key, keylen);
if (err)
return err;
return crypto_speck64_setkey(&ctx->tweak_key, key + keylen, keylen);
}
static struct skcipher_alg speck_algs[] = {
{
.base.cra_name = "xts(speck128)",
.base.cra_driver_name = "xts-speck128-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK128_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck128_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK128_128_KEY_SIZE,
.max_keysize = 2 * SPECK128_256_KEY_SIZE,
.ivsize = SPECK128_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck128_xts_setkey,
.encrypt = speck128_xts_encrypt,
.decrypt = speck128_xts_decrypt,
}, {
.base.cra_name = "xts(speck64)",
.base.cra_driver_name = "xts-speck64-neon",
.base.cra_priority = 300,
.base.cra_blocksize = SPECK64_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct speck64_xts_tfm_ctx),
.base.cra_alignmask = 7,
.base.cra_module = THIS_MODULE,
.min_keysize = 2 * SPECK64_96_KEY_SIZE,
.max_keysize = 2 * SPECK64_128_KEY_SIZE,
.ivsize = SPECK64_BLOCK_SIZE,
.walksize = SPECK_NEON_CHUNK_SIZE,
.setkey = speck64_xts_setkey,
.encrypt = speck64_xts_encrypt,
.decrypt = speck64_xts_decrypt,
}
};
static int __init speck_neon_module_init(void)
{
if (!(elf_hwcap & HWCAP_ASIMD))
return -ENODEV;
return crypto_register_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
static void __exit speck_neon_module_exit(void)
{
crypto_unregister_skciphers(speck_algs, ARRAY_SIZE(speck_algs));
}
module_init(speck_neon_module_init);
module_exit(speck_neon_module_exit);
MODULE_DESCRIPTION("Speck block cipher (NEON-accelerated)");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
MODULE_ALIAS_CRYPTO("xts(speck128)");
MODULE_ALIAS_CRYPTO("xts-speck128-neon");
MODULE_ALIAS_CRYPTO("xts(speck64)");
MODULE_ALIAS_CRYPTO("xts-speck64-neon");

View File

@ -532,22 +532,6 @@ alternative_endif
and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
.endm
/*
* Check whether preempt-disabled code should yield as soon as it
* is able. This is the case if re-enabling preemption a single
* time results in a preempt count of zero, and the TIF_NEED_RESCHED
* flag is set. (Note that the latter is stored negated in the
* top word of the thread_info::preempt_count field)
*/
.macro cond_yield, lbl:req, tmp:req
#ifdef CONFIG_PREEMPTION
get_current_task \tmp
ldr \tmp, [\tmp, #TSK_TI_PREEMPT]
sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
cbz \tmp, \lbl
#endif
.endm
/*
* Check the MIDR_EL1 of the current CPU for a given model and a range of
* variant/revision. See asm/cputype.h for the macros used below.
@ -587,67 +571,4 @@ alternative_endif
.Ldone\@:
.endm
/*
* frame_push - Push @regcount callee saved registers to the stack,
* starting at x19, as well as x29/x30, and set x29 to
* the new value of sp. Add @extra bytes of stack space
* for locals.
*/
.macro frame_push, regcount:req, extra
__frame st, \regcount, \extra
.endm
/*
* frame_pop - Pop the callee saved registers from the stack that were
* pushed in the most recent call to frame_push, as well
* as x29/x30 and any extra stack space that may have been
* allocated.
*/
.macro frame_pop
__frame ld
.endm
.macro __frame_regs, reg1, reg2, op, num
.if .Lframe_regcount == \num
\op\()r \reg1, [sp, #(\num + 1) * 8]
.elseif .Lframe_regcount > \num
\op\()p \reg1, \reg2, [sp, #(\num + 1) * 8]
.endif
.endm
.macro __frame, op, regcount, extra=0
.ifc \op, st
.if (\regcount) < 0 || (\regcount) > 10
.error "regcount should be in the range [0 ... 10]"
.endif
.if ((\extra) % 16) != 0
.error "extra should be a multiple of 16 bytes"
.endif
.ifdef .Lframe_regcount
.if .Lframe_regcount != -1
.error "frame_push/frame_pop may not be nested"
.endif
.endif
.set .Lframe_regcount, \regcount
.set .Lframe_extra, \extra
.set .Lframe_local_offset, ((\regcount + 3) / 2) * 16
stp x29, x30, [sp, #-.Lframe_local_offset - .Lframe_extra]!
mov x29, sp
.endif
__frame_regs x19, x20, \op, 1
__frame_regs x21, x22, \op, 3
__frame_regs x23, x24, \op, 5
__frame_regs x25, x26, \op, 7
__frame_regs x27, x28, \op, 9
.ifc \op, ld
.if .Lframe_regcount == -1
.error "frame_push/frame_pop may not be nested"
.endif
ldp x29, x30, [sp], #.Lframe_local_offset + .Lframe_extra
.set .Lframe_regcount, -1
.endif
.endm
#endif /* __ASM_ASSEMBLER_H */
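For reference, the __frame macro above reserves ((regcount + 3) / 2) * 16 bytes before any 'extra' locals: x29/x30 plus regcount registers starting at x19, rounded up to whole stp/ldp pairs. A small C sketch that just evaluates that arithmetic over the allowed regcount range:

#include <stdio.h>

/*
 * Illustrative arithmetic only: frame size reserved by frame_push for each
 * allowed regcount (0..10), excluding the caller-supplied 'extra' bytes.
 */
int main(void)
{
	for (int regcount = 0; regcount <= 10; regcount++)
		printf("regcount=%2d -> frame (excl. extra) = %3d bytes\n",
		       regcount, ((regcount + 3) / 2) * 16);
	return 0;
}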

View File

@ -14,8 +14,15 @@
#include <asm/hwcap.h>
#include <asm/sysreg.h>
#define MAX_CPU_FEATURES 64
#define cpu_feature(x) KERNEL_HWCAP_ ## x
/*
* In the arm64 world (as in the ARM world), elf_hwcap is used both internally
* in the kernel and for user space to keep track of which optional features
* are supported by the current system. So let's map feature 'x' to HWCAP_x.
* Note that HWCAP_x constants are bit fields so we need to take the log.
*/
#define MAX_CPU_FEATURES (8 * sizeof(elf_hwcap))
#define cpu_feature(x) ilog2(HWCAP_ ## x)
#define ARM64_SSBD_UNKNOWN -1
#define ARM64_SSBD_FORCE_DISABLE 0
@ -340,19 +347,10 @@ extern struct static_key_false arm64_const_caps_ready;
bool this_cpu_has_cap(unsigned int cap);
static inline void cpu_set_feature(unsigned int num)
{
WARN_ON(num >= MAX_CPU_FEATURES);
elf_hwcap |= BIT(num);
}
#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
static inline bool cpu_have_feature(unsigned int num)
{
WARN_ON(num >= MAX_CPU_FEATURES);
return elf_hwcap & BIT(num);
return elf_hwcap & (1UL << num);
}
#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
/* System capability check for constant caps */
static inline bool __cpus_have_const_cap(int num)
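The comment in the hunk above explains the mapping being restored: the uapi HWCAP_* constants are single-bit masks, so cpu_feature(x) takes their base-2 log to get the bit index that cpu_{set,have}_feature() operate on. A standalone C sketch of that mapping with hypothetical DEMO_* constants and a local ilog2 substitute:

#include <stdio.h>

/*
 * Illustrative sketch only: DEMO_* masks and demo_* helpers are hypothetical
 * stand-ins, not the arm64 definitions.
 */
#define DEMO_HWCAP_AES	(1UL << 3)	/* single-bit mask, uapi style */
#define DEMO_HWCAP_SHA2	(1UL << 6)

static unsigned int demo_ilog2(unsigned long mask)
{
	unsigned int n = 0;

	while (mask >>= 1)	/* mask -> bit index */
		n++;
	return n;
}

static unsigned long demo_elf_hwcap;

static void demo_set_feature(unsigned int num)
{
	demo_elf_hwcap |= 1UL << num;
}

static int demo_have_feature(unsigned int num)
{
	return (demo_elf_hwcap & (1UL << num)) != 0;
}

int main(void)
{
	/* equivalent to demo_elf_hwcap |= DEMO_HWCAP_SHA2 */
	demo_set_feature(demo_ilog2(DEMO_HWCAP_SHA2));

	printf("SHA2: %d, AES: %d\n",
	       demo_have_feature(demo_ilog2(DEMO_HWCAP_SHA2)),
	       demo_have_feature(demo_ilog2(DEMO_HWCAP_AES)));
	return 0;
}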

View File

@ -40,61 +40,11 @@
#define COMPAT_HWCAP2_CRC32 (1 << 4)
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
#include <linux/log2.h>
/*
* For userspace we represent hwcaps as a collection of HWCAP{,2}_x bitfields
* as described in uapi/asm/hwcap.h. For the kernel we represent hwcaps as
* natural numbers (in a single range of size MAX_CPU_FEATURES) defined here
* with prefix KERNEL_HWCAP_ mapped to their HWCAP{,2}_x counterpart.
*
* Hwcaps should be set and tested within the kernel via the
* cpu_{set,have}_named_feature(feature) where feature is the unique suffix
* of KERNEL_HWCAP_{feature}.
*/
#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x)
#define KERNEL_HWCAP_FP __khwcap_feature(FP)
#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD)
#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM)
#define KERNEL_HWCAP_AES __khwcap_feature(AES)
#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL)
#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1)
#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2)
#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32)
#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS)
#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP)
#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP)
#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID)
#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM)
#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT)
#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA)
#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC)
#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP)
#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3)
#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3)
#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4)
#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP)
#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512)
#define KERNEL_HWCAP_SVE __khwcap_feature(SVE)
#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM)
#define KERNEL_HWCAP_DIT __khwcap_feature(DIT)
#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT)
#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC)
#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM)
#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS)
#define KERNEL_HWCAP_SB __khwcap_feature(SB)
#define KERNEL_HWCAP_PACA __khwcap_feature(PACA)
#define KERNEL_HWCAP_PACG __khwcap_feature(PACG)
#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32)
/*
* This yields a mask that user programs can use to figure out what
* instruction set this cpu supports.
*/
#define ELF_HWCAP lower_32_bits(elf_hwcap)
#define ELF_HWCAP2 upper_32_bits(elf_hwcap)
#define ELF_HWCAP (elf_hwcap)
#ifdef CONFIG_COMPAT
#define COMPAT_ELF_HWCAP (compat_elf_hwcap)
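With the KERNEL_HWCAP_* numbering being reverted here, hwcaps live in a single 64-bit elf_hwcap and are split back into the two 32-bit auxv words, whereas after the revert ELF_HWCAP exposes elf_hwcap directly. A small standalone C sketch of that split (local lower/upper_32_bits stand-ins, example bit positions chosen arbitrarily):

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only: local stand-ins for the kernel helpers. */
static uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }
static uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }

int main(void)
{
	uint64_t elf_hwcap = (1ULL << 6) |	/* e.g. an AT_HWCAP bit        */
			     (1ULL << 35);	/* e.g. an AT_HWCAP2 bit + 32  */

	printf("AT_HWCAP  = 0x%08x\n", lower_32_bits(elf_hwcap));
	printf("AT_HWCAP2 = 0x%08x\n", upper_32_bits(elf_hwcap));
	return 0;
}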

View File

@ -18,7 +18,7 @@
#define _UAPI__ASM_HWCAP_H
/*
* HWCAP flags - for AT_HWCAP
* HWCAP flags - for elf_hwcap (in kernel) and AT_HWCAP
*/
#define HWCAP_FP (1 << 0)
#define HWCAP_ASIMD (1 << 1)

View File

@ -1258,32 +1258,32 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
}
static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_FP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT),
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM),
HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FLAGM),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP),
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_DIT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC),
HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ILRCPC),
HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_USCAT),
HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS),
{},
};
@ -1329,7 +1329,7 @@ static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap)
{
switch (cap->hwcap_type) {
case CAP_HWCAP:
cpu_set_feature(cap->hwcap);
elf_hwcap |= cap->hwcap;
break;
#ifdef CONFIG_COMPAT
case CAP_COMPAT_HWCAP:
@ -1352,7 +1352,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
switch (cap->hwcap_type) {
case CAP_HWCAP:
rc = cpu_have_feature(cap->hwcap);
rc = (elf_hwcap & cap->hwcap) != 0;
break;
#ifdef CONFIG_COMPAT
case CAP_COMPAT_HWCAP:
@ -1373,7 +1373,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
{
/* We support emulation of accesses to CPU ID feature registers */
cpu_set_named_feature(CPUID);
elf_hwcap |= HWCAP_CPUID;
for (; hwcaps->matches; hwcaps++)
if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps)))
cap_set_elf_hwcap(hwcaps);

View File

@ -172,7 +172,7 @@ static int c_show(struct seq_file *m, void *v)
#endif /* CONFIG_COMPAT */
} else {
for (j = 0; hwcap_str[j]; j++)
if (cpu_have_feature(j))
if (elf_hwcap & (1 << j))
seq_printf(m, " %s", hwcap_str[j]);
}
seq_puts(m, "\n");

View File

@ -444,14 +444,14 @@ static inline void fpsimd_hotplug_init(void) { }
*/
static int __init fpsimd_init(void)
{
if (cpu_have_named_feature(FP)) {
if (elf_hwcap & HWCAP_FP) {
fpsimd_pm_init();
fpsimd_hotplug_init();
} else {
pr_notice("Floating-point is not implemented\n");
}
if (!cpu_have_named_feature(ASIMD))
if (!(elf_hwcap & HWCAP_ASIMD))
pr_notice("Advanced SIMD is not implemented\n");
return 0;

View File

@ -12,7 +12,7 @@
#include <asm/alternative.h>
#include <asm/assembler.h>
.cpu generic+crc
.arch armv8-a+crc
.macro __crc32, c
cmp x2, #16

View File

@ -182,6 +182,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "octeon-md5",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -215,6 +215,7 @@ static struct shash_alg octeon_sha1_alg = {
.cra_name = "sha1",
.cra_driver_name= "octeon-sha1",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -239,6 +239,7 @@ static struct shash_alg octeon_sha256_algs[2] = { {
.cra_name = "sha256",
.cra_driver_name= "octeon-sha256",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -251,6 +252,7 @@ static struct shash_alg octeon_sha256_algs[2] = { {
.base = {
.cra_name = "sha224",
.cra_driver_name= "octeon-sha224",
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -235,6 +235,7 @@ static struct shash_alg octeon_sha512_algs[2] = { {
.cra_name = "sha512",
.cra_driver_name= "octeon-sha512",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -248,6 +249,7 @@ static struct shash_alg octeon_sha512_algs[2] = { {
.cra_name = "sha384",
.cra_driver_name= "octeon-sha384",
.cra_priority = OCTEON_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -139,6 +139,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "md5-ppc",
.cra_priority = 200,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -185,6 +185,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -132,6 +132,7 @@ static struct shash_alg alg = {
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-powerpc",
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -231,6 +231,7 @@ static struct shash_alg algs[2] = { {
.cra_name = "sha256",
.cra_driver_name= "sha256-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -247,6 +248,7 @@ static struct shash_alg algs[2] = { {
.cra_name = "sha224",
.cra_driver_name= "sha224-ppc-spe",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -127,6 +127,7 @@ static struct shash_alg ghash_alg = {
.cra_name = "ghash",
.cra_driver_name = "ghash-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_ctx),
.cra_module = THIS_MODULE,

View File

@ -83,6 +83,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -76,6 +76,7 @@ static struct shash_alg sha256_alg = {
.cra_name = "sha256",
.cra_driver_name= "sha256-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -112,6 +113,7 @@ static struct shash_alg sha224_alg = {
.cra_name = "sha224",
.cra_driver_name= "sha224-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -81,6 +81,7 @@ static struct shash_alg sha512_alg = {
.cra_name = "sha512",
.cra_driver_name= "sha512-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -119,6 +120,7 @@ static struct shash_alg sha384_alg = {
.cra_name = "sha384",
.cra_driver_name= "sha384-s390",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct s390_sha_ctx),
.cra_module = THIS_MODULE,

View File

@ -196,14 +196,14 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
return 0;
}
static void crypto_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
ctx->ops->encrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
}
static void crypto_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
@ -395,8 +395,8 @@ static struct crypto_alg algs[] = { {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = crypto_aes_encrypt,
.cia_decrypt = crypto_aes_decrypt
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
}, {

View File

@ -144,6 +144,7 @@ static struct shash_alg alg = {
.cra_name = "md5",
.cra_driver_name= "md5-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = MD5_HMAC_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -139,6 +139,7 @@ static struct shash_alg alg = {
.cra_name = "sha1",
.cra_driver_name= "sha1-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -169,6 +169,7 @@ static struct shash_alg sha256 = {
.cra_name = "sha256",
.cra_driver_name= "sha256-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -184,6 +185,7 @@ static struct shash_alg sha224 = {
.cra_name = "sha224",
.cra_driver_name= "sha224-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -154,6 +154,7 @@ static struct shash_alg sha512 = {
.cra_name = "sha512",
.cra_driver_name= "sha512-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -169,6 +170,7 @@ static struct shash_alg sha384 = {
.cra_name = "sha384",
.cra_driver_name= "sha384-sparc64",
.cra_priority = SPARC_CR_OPCODE_PRIORITY,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -328,7 +328,7 @@ static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
return aes_set_key_common(tfm, crypto_tfm_ctx(tfm), in_key, key_len);
}
static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@ -341,7 +341,7 @@ static void aesni_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
}
}
static void aesni_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
{
struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm));
@ -973,8 +973,8 @@ static struct crypto_alg aesni_algs[] = { {
.cia_min_keysize = AES_MIN_KEY_SIZE,
.cia_max_keysize = AES_MAX_KEY_SIZE,
.cia_setkey = aes_set_key,
.cia_encrypt = aesni_encrypt,
.cia_decrypt = aesni_decrypt
.cia_encrypt = aes_encrypt,
.cia_decrypt = aes_decrypt
}
}
}, {

View File

@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
.cra_name = "__ghash",
.cra_driver_name = "__ghash-pclmulqdqni",
.cra_priority = 0,
.cra_flags = CRYPTO_ALG_INTERNAL,
.cra_flags = CRYPTO_ALG_TYPE_SHASH |
CRYPTO_ALG_INTERNAL,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_ctx),
.cra_module = THIS_MODULE,

View File

@ -171,6 +171,7 @@ static struct shash_alg alg = {
.cra_name = "poly1305",
.cra_driver_name = "poly1305-simd",
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_alignmask = sizeof(u32) - 1,
.cra_blocksize = POLY1305_BLOCK_SIZE,
.cra_module = THIS_MODULE,

View File

@ -100,6 +100,7 @@ static struct shash_alg sha1_ssse3_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -150,6 +151,7 @@ static struct shash_alg sha1_avx_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -239,6 +241,7 @@ static struct shash_alg sha1_avx2_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -294,6 +297,7 @@ static struct shash_alg sha1_ni_alg = {
.cra_name = "sha1",
.cra_driver_name = "sha1-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -109,6 +109,7 @@ static struct shash_alg sha256_ssse3_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -123,6 +124,7 @@ static struct shash_alg sha256_ssse3_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -175,6 +177,7 @@ static struct shash_alg sha256_avx_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -189,6 +192,7 @@ static struct shash_alg sha256_avx_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -257,6 +261,7 @@ static struct shash_alg sha256_avx2_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -271,6 +276,7 @@ static struct shash_alg sha256_avx2_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -337,6 +343,7 @@ static struct shash_alg sha256_ni_algs[] = { {
.cra_name = "sha256",
.cra_driver_name = "sha256-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA256_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -351,6 +358,7 @@ static struct shash_alg sha256_ni_algs[] = { {
.cra_name = "sha224",
.cra_driver_name = "sha224-ni",
.cra_priority = 250,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA224_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -108,6 +108,7 @@ static struct shash_alg sha512_ssse3_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -122,6 +123,7 @@ static struct shash_alg sha512_ssse3_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-ssse3",
.cra_priority = 150,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -185,6 +187,7 @@ static struct shash_alg sha512_avx_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -199,6 +202,7 @@ static struct shash_alg sha512_avx_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx",
.cra_priority = 160,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -256,6 +260,7 @@ static struct shash_alg sha512_avx2_algs[] = { {
.cra_name = "sha512",
.cra_driver_name = "sha512-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA512_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}
@ -270,6 +275,7 @@ static struct shash_alg sha512_avx2_algs[] = { {
.cra_name = "sha384",
.cra_driver_name = "sha384-avx2",
.cra_priority = 170,
.cra_flags = CRYPTO_ALG_TYPE_SHASH,
.cra_blocksize = SHA384_BLOCK_SIZE,
.cra_module = THIS_MODULE,
}

View File

@ -195,7 +195,7 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
bool "Enable crypto API fallback for blk-crypto"
depends on BLK_INLINE_ENCRYPTION
select CRYPTO
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Enabling this lets the block layer handle inline encryption
by falling back to the kernel crypto API when inline

View File

@ -52,12 +52,12 @@ config CRYPTO_AEAD2
select CRYPTO_NULL2
select CRYPTO_RNG2
config CRYPTO_SKCIPHER
config CRYPTO_BLKCIPHER
tristate
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_ALGAPI
config CRYPTO_SKCIPHER2
config CRYPTO_BLKCIPHER2
tristate
select CRYPTO_ALGAPI2
select CRYPTO_RNG2
@ -146,7 +146,7 @@ config CRYPTO_MANAGER2
def_tristate CRYPTO_MANAGER || (CRYPTO_MANAGER!=n && CRYPTO_ALGAPI=y)
select CRYPTO_AEAD2
select CRYPTO_HASH2
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_AKCIPHER2
select CRYPTO_KPP2
select CRYPTO_ACOMP2
@ -185,7 +185,7 @@ config CRYPTO_NULL
config CRYPTO_NULL2
tristate
select CRYPTO_ALGAPI2
select CRYPTO_SKCIPHER2
select CRYPTO_BLKCIPHER2
select CRYPTO_HASH2
config CRYPTO_PCRYPT
@ -203,7 +203,7 @@ config CRYPTO_WORKQUEUE
config CRYPTO_CRYPTD
tristate "Software async crypto daemon"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_HASH
select CRYPTO_MANAGER
select CRYPTO_WORKQUEUE
@ -214,7 +214,7 @@ config CRYPTO_CRYPTD
config CRYPTO_MCRYPTD
tristate "Software async multi-buffer crypto daemon"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_HASH
select CRYPTO_MANAGER
select CRYPTO_WORKQUEUE
@ -229,7 +229,7 @@ config CRYPTO_MCRYPTD
config CRYPTO_AUTHENC
tristate "Authenc support"
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_HASH
select CRYPTO_NULL
@ -255,7 +255,7 @@ config CRYPTO_SIMD
config CRYPTO_GLUE_HELPER_X86
tristate
depends on X86
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
config CRYPTO_ENGINE
tristate
@ -295,7 +295,7 @@ config CRYPTO_CHACHA20POLY1305
config CRYPTO_SEQIV
tristate "Sequence Number IV Generator"
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_NULL
select CRYPTO_RNG_DEFAULT
help
@ -317,7 +317,7 @@ comment "Block modes"
config CRYPTO_CBC
tristate "CBC support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
CBC: Cipher Block Chaining mode
@ -325,7 +325,7 @@ config CRYPTO_CBC
config CRYPTO_CTR
tristate "CTR support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_SEQIV
select CRYPTO_MANAGER
help
@ -334,7 +334,7 @@ config CRYPTO_CTR
config CRYPTO_CTS
tristate "CTS support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
CTS: Cipher Text Stealing
This is the Cipher Text Stealing mode as described by
@ -345,7 +345,7 @@ config CRYPTO_CTS
config CRYPTO_ECB
tristate "ECB support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
ECB: Electronic CodeBook mode
@ -354,7 +354,7 @@ config CRYPTO_ECB
config CRYPTO_LRW
tristate "LRW support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_GF128MUL
help
@ -366,7 +366,7 @@ config CRYPTO_LRW
config CRYPTO_PCBC
tristate "PCBC support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
help
PCBC: Propagating Cipher Block Chaining mode
@ -374,7 +374,7 @@ config CRYPTO_PCBC
config CRYPTO_XTS
tristate "XTS support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_MANAGER
select CRYPTO_ECB
help
@ -384,7 +384,7 @@ config CRYPTO_XTS
config CRYPTO_KEYWRAP
tristate "Key wrapping support"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Support for key wrapping (NIST SP800-38F / RFC3394) without
padding.
@ -888,17 +888,6 @@ config CRYPTO_SHA3
References:
http://keccak.noekeon.org/
config CRYPTO_SM3
tristate "SM3 digest algorithm"
select CRYPTO_HASH
help
SM3 secure hash function as defined by OSCCA GM/T 0004-2012 SM3).
It is part of the Chinese Commercial Cryptography suite.
References:
http://www.oscca.gov.cn/UpFile/20101222141857786.pdf
https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash
config CRYPTO_TGR192
tristate "Tiger digest algorithms"
select CRYPTO_HASH
@ -934,9 +923,6 @@ config CRYPTO_GHASH_CLMUL_NI_INTEL
comment "Ciphers"
config CRYPTO_LIB_AES
tristate
config CRYPTO_AES
tristate "AES cipher algorithms"
select CRYPTO_ALGAPI
@ -960,7 +946,6 @@ config CRYPTO_AES
config CRYPTO_AES_TI
tristate "Fixed time AES cipher"
select CRYPTO_ALGAPI
select CRYPTO_LIB_AES
help
This is a generic implementation of AES that attempts to eliminate
data dependent latencies as much as possible without affecting
@ -1027,7 +1012,7 @@ config CRYPTO_AES_NI_INTEL
select CRYPTO_AES_X86_64 if 64BIT
select CRYPTO_AES_586 if !64BIT
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_GLUE_HELPER_X86 if 64BIT
select CRYPTO_SIMD
help
@ -1111,7 +1096,7 @@ config CRYPTO_ANUBIS
config CRYPTO_ARC4
tristate "ARC4 cipher algorithm"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
ARC4 cipher algorithm.
@ -1339,7 +1324,7 @@ config CRYPTO_DES3_EDE_X86_64
config CRYPTO_FCRYPT
tristate "FCrypt cipher algorithm"
select CRYPTO_ALGAPI
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
FCrypt algorithm used by RxRPC.
@ -1358,7 +1343,7 @@ config CRYPTO_KHAZAD
config CRYPTO_SALSA20
tristate "Salsa20 stream cipher algorithm"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
Salsa20 stream cipher algorithm.
@ -1370,7 +1355,7 @@ config CRYPTO_SALSA20
config CRYPTO_CHACHA20
tristate "ChaCha stream cipher algorithms"
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
help
The ChaCha20, XChaCha20, and XChaCha12 stream cipher algorithms.
@ -1392,7 +1377,7 @@ config CRYPTO_CHACHA20
config CRYPTO_CHACHA20_X86_64
tristate "ChaCha20 cipher algorithm (x86_64/SSSE3/AVX2)"
depends on X86 && 64BIT
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
help
ChaCha20 cipher algorithm, RFC7539.
@ -1431,31 +1416,6 @@ config CRYPTO_SERPENT
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
config CRYPTO_SM4
tristate "SM4 cipher algorithm"
select CRYPTO_ALGAPI
help
SM4 cipher algorithms (OSCCA GB/T 32907-2016).
SM4 (GBT.32907-2016) is a cryptographic standard issued by the
Organization of State Commercial Administration of China (OSCCA)
as an authorized cryptographic algorithms for the use within China.
SMS4 was originally created for use in protecting wireless
networks, and is mandated in the Chinese National Standard for
Wireless LAN WAPI (Wired Authentication and Privacy Infrastructure)
(GB.15629.11-2003).
The latest SM4 standard (GBT.32907-2016) was proposed by OSCCA and
standardized through TC 260 of the Standardization Administration
of the People's Republic of China (SAC).
The input, output, and key of SMS4 are each 128 bits.
See also: <https://eprint.iacr.org/2008/329.pdf>
If unsure, say N.
config CRYPTO_SERPENT_SSE2_X86_64
tristate "Serpent cipher algorithm (x86_64/SSE2)"
depends on X86 && 64BIT
@ -1795,7 +1755,7 @@ config CRYPTO_USER_API_HASH
config CRYPTO_USER_API_SKCIPHER
tristate "User-space interface for symmetric key cipher algorithms"
depends on NET
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_USER_API
help
This option enables the user-spaces interface for symmetric
@ -1814,7 +1774,7 @@ config CRYPTO_USER_API_AEAD
tristate "User-space interface for AEAD cipher algorithms"
depends on NET
select CRYPTO_AEAD
select CRYPTO_SKCIPHER
select CRYPTO_BLKCIPHER
select CRYPTO_NULL
select CRYPTO_USER_API
help

View File

@ -17,9 +17,10 @@ obj-$(CONFIG_CRYPTO_ALGAPI2) += crypto_algapi.o
obj-$(CONFIG_CRYPTO_AEAD2) += aead.o
crypto_skcipher-y := ablkcipher.o blkcipher.o
crypto_skcipher-y += skcipher.o
obj-$(CONFIG_CRYPTO_SKCIPHER2) += crypto_skcipher.o
crypto_blkcipher-y := ablkcipher.o
crypto_blkcipher-y += blkcipher.o
crypto_blkcipher-y += skcipher.o
obj-$(CONFIG_CRYPTO_BLKCIPHER2) += crypto_blkcipher.o
obj-$(CONFIG_CRYPTO_SEQIV) += seqiv.o
obj-$(CONFIG_CRYPTO_ECHAINIV) += echainiv.o
@ -70,7 +71,6 @@ obj-$(CONFIG_CRYPTO_SHA1) += sha1_generic.o
obj-$(CONFIG_CRYPTO_SHA256) += sha256_generic.o
obj-$(CONFIG_CRYPTO_SHA512) += sha512_generic.o
obj-$(CONFIG_CRYPTO_SHA3) += sha3_generic.o
obj-$(CONFIG_CRYPTO_SM3) += sm3_generic.o
obj-$(CONFIG_CRYPTO_WP512) += wp512.o
CFLAGS_wp512.o := $(call cc-option,-fno-schedule-insns) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_TGR192) += tgr192.o
@ -101,7 +101,6 @@ obj-$(CONFIG_CRYPTO_SERPENT) += serpent_generic.o
CFLAGS_serpent_generic.o := $(call cc-option,-fsched-pressure) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=79149
obj-$(CONFIG_CRYPTO_AES) += aes_generic.o
CFLAGS_aes_generic.o := $(call cc-option,-fno-code-hoisting) # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83356
obj-$(CONFIG_CRYPTO_SM4) += sm4_generic.o
obj-$(CONFIG_CRYPTO_AES_TI) += aes_ti.o
obj-$(CONFIG_CRYPTO_CAMELLIA) += camellia_generic.o
obj-$(CONFIG_CRYPTO_CAST_COMMON) += cast_common.o

Some files were not shown because too many files have changed in this diff.