Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
sha512.c
00001 /* sha512.c 00002 * 00003 * Copyright (C) 2006-2017 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 */ 00021 00022 00023 #ifdef HAVE_CONFIG_H 00024 #include <config.h> 00025 #endif 00026 00027 #include <wolfcrypt/settings.h> 00028 00029 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384) 00030 00031 #if defined(HAVE_FIPS) && \ 00032 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 00033 00034 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ 00035 #define FIPS_NO_WRAPPERS 00036 00037 #ifdef USE_WINDOWS_API 00038 #pragma code_seg(".fipsA$k") 00039 #pragma const_seg(".fipsB$k") 00040 #endif 00041 #endif 00042 00043 #include <wolfcrypt/sha512.h> 00044 #include <wolfcrypt/error-crypt.h> 00045 #include <wolfcrypt/cpuid.h> 00046 00047 /* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */ 00048 #if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512) 00049 #define USE_SLOW_SHA512 00050 #endif 00051 00052 /* fips wrapper calls, user can call direct */ 00053 #if defined(HAVE_FIPS) && \ 00054 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2)) 00055 00056 #ifdef WOLFSSL_SHA512 00057 00058 int wc_InitSha512(wc_Sha512* sha) 00059 { 00060 if 
(sha == NULL) { 00061 return BAD_FUNC_ARG; 00062 } 00063 00064 return InitSha512_fips(sha); 00065 } 00066 int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId) 00067 { 00068 (void)heap; 00069 (void)devId; 00070 if (sha == NULL) { 00071 return BAD_FUNC_ARG; 00072 } 00073 return InitSha512_fips(sha); 00074 } 00075 int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len) 00076 { 00077 if (sha == NULL || (data == NULL && len > 0)) { 00078 return BAD_FUNC_ARG; 00079 } 00080 00081 return Sha512Update_fips(sha, data, len); 00082 } 00083 int wc_Sha512Final(wc_Sha512* sha, byte* out) 00084 { 00085 if (sha == NULL || out == NULL) { 00086 return BAD_FUNC_ARG; 00087 } 00088 00089 return Sha512Final_fips(sha, out); 00090 } 00091 void wc_Sha512Free(wc_Sha512* sha) 00092 { 00093 (void)sha; 00094 /* Not supported in FIPS */ 00095 } 00096 #endif 00097 00098 #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM) 00099 int wc_InitSha384(wc_Sha384* sha) 00100 { 00101 if (sha == NULL) { 00102 return BAD_FUNC_ARG; 00103 } 00104 return InitSha384_fips(sha); 00105 } 00106 int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId) 00107 { 00108 (void)heap; 00109 (void)devId; 00110 if (sha == NULL) { 00111 return BAD_FUNC_ARG; 00112 } 00113 return InitSha384_fips(sha); 00114 } 00115 int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len) 00116 { 00117 if (sha == NULL || (data == NULL && len > 0)) { 00118 return BAD_FUNC_ARG; 00119 } 00120 return Sha384Update_fips(sha, data, len); 00121 } 00122 int wc_Sha384Final(wc_Sha384* sha, byte* out) 00123 { 00124 if (sha == NULL || out == NULL) { 00125 return BAD_FUNC_ARG; 00126 } 00127 return Sha384Final_fips(sha, out); 00128 } 00129 void wc_Sha384Free(wc_Sha384* sha) 00130 { 00131 (void)sha; 00132 /* Not supported in FIPS */ 00133 } 00134 #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */ 00135 00136 #else /* else build without fips, or for FIPS v2 */ 00137 00138 #include <wolfcrypt/logging.h> 00139 00140 #ifdef NO_INLINE 00141 
#include <wolfcrypt/misc.h> 00142 #else 00143 #define WOLFSSL_MISC_INCLUDED 00144 #include <wolfcrypt/src/misc.c> 00145 #endif 00146 00147 00148 #if defined(USE_INTEL_SPEEDUP) 00149 #define HAVE_INTEL_AVX1 00150 00151 #if defined(__GNUC__) && ((__GNUC__ < 4) || \ 00152 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) 00153 #define NO_AVX2_SUPPORT 00154 #endif 00155 #if defined(__clang__) && ((__clang_major__ < 3) || \ 00156 (__clang_major__ == 3 && __clang_minor__ <= 5)) 00157 #define NO_AVX2_SUPPORT 00158 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) 00159 #undef NO_AVX2_SUPPORT 00160 #endif 00161 00162 #define HAVE_INTEL_AVX1 00163 #ifndef NO_AVX2_SUPPORT 00164 #define HAVE_INTEL_AVX2 00165 #endif 00166 #endif 00167 00168 #if defined(HAVE_INTEL_AVX1) 00169 /* #define DEBUG_XMM */ 00170 #endif 00171 00172 #if defined(HAVE_INTEL_AVX2) 00173 #define HAVE_INTEL_RORX 00174 /* #define DEBUG_YMM */ 00175 #endif 00176 00177 #if defined(HAVE_BYTEREVERSE64) && \ 00178 !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2) 00179 #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size) 00180 #define ByteReverseWords64_1(buf, size) \ 00181 { unsigned int i ;\ 00182 for(i=0; i< size/sizeof(word64); i++){\ 00183 __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\ 00184 }\ 00185 } 00186 #endif 00187 00188 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 00189 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */ 00190 #else 00191 00192 #ifdef WOLFSSL_SHA512 00193 00194 static int InitSha512(wc_Sha512* sha512) 00195 { 00196 if (sha512 == NULL) 00197 return BAD_FUNC_ARG; 00198 00199 sha512->digest[0] = W64LIT(0x6a09e667f3bcc908); 00200 sha512->digest[1] = W64LIT(0xbb67ae8584caa73b); 00201 sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b); 00202 sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1); 00203 sha512->digest[4] = W64LIT(0x510e527fade682d1); 00204 sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f); 00205 sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b); 
00206 sha512->digest[7] = W64LIT(0x5be0cd19137e2179); 00207 00208 sha512->buffLen = 0; 00209 sha512->loLen = 0; 00210 sha512->hiLen = 0; 00211 00212 return 0; 00213 } 00214 00215 #endif /* WOLFSSL_SHA512 */ 00216 00217 /* Hardware Acceleration */ 00218 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00219 00220 #ifdef WOLFSSL_SHA512 00221 00222 /***** 00223 Intel AVX1/AVX2 Macro Control Structure 00224 00225 #if defined(HAVE_INteL_SPEEDUP) 00226 #define HAVE_INTEL_AVX1 00227 #define HAVE_INTEL_AVX2 00228 #endif 00229 00230 int InitSha512(wc_Sha512* sha512) { 00231 Save/Recover XMM, YMM 00232 ... 00233 00234 Check Intel AVX cpuid flags 00235 } 00236 00237 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 00238 Transform_Sha512_AVX1(); # Function prototype 00239 Transform_Sha512_AVX2(); # 00240 #endif 00241 00242 _Transform_Sha512() { # Native Transform Function body 00243 00244 } 00245 00246 int Sha512Update() { 00247 Save/Recover XMM, YMM 00248 ... 00249 } 00250 00251 int Sha512Final() { 00252 Save/Recover XMM, YMM 00253 ... 00254 } 00255 00256 00257 #if defined(HAVE_INTEL_AVX1) 00258 00259 XMM Instructions/INLINE asm Definitions 00260 00261 #endif 00262 00263 #if defined(HAVE_INTEL_AVX2) 00264 00265 YMM Instructions/INLINE asm Definitions 00266 00267 #endif 00268 00269 #if defnied(HAVE_INTEL_AVX1) 00270 00271 int Transform_Sha512_AVX1() { 00272 Stitched Message Sched/Round 00273 } 00274 00275 #endif 00276 00277 #if defnied(HAVE_INTEL_AVX2) 00278 00279 int Transform_Sha512_AVX2() { 00280 Stitched Message Sched/Round 00281 } 00282 #endif 00283 00284 */ 00285 00286 00287 /* Each platform needs to query info type 1 from cpuid to see if aesni is 00288 * supported. 
Also, let's setup a macro for proper linkage w/o ABI conflicts 00289 */ 00290 00291 #if defined(HAVE_INTEL_AVX1) 00292 static int Transform_Sha512_AVX1(wc_Sha512 *sha512); 00293 static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len); 00294 #endif 00295 #if defined(HAVE_INTEL_AVX2) 00296 static int Transform_Sha512_AVX2(wc_Sha512 *sha512); 00297 static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len); 00298 #if defined(HAVE_INTEL_RORX) 00299 static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512); 00300 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512, 00301 word32 len); 00302 static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512); 00303 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512, 00304 word32 len); 00305 #endif 00306 #endif 00307 static int _Transform_Sha512(wc_Sha512 *sha512); 00308 static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512; 00309 static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL; 00310 static int transform_check = 0; 00311 static int intel_flags; 00312 #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512) 00313 #define Transform_Sha512_Len(sha512, len) \ 00314 (*Transform_Sha512_Len_p)(sha512, len) 00315 00316 static void Sha512_SetTransform() 00317 { 00318 if (transform_check) 00319 return; 00320 00321 intel_flags = cpuid_get_flags(); 00322 00323 #if defined(HAVE_INTEL_AVX2) 00324 if (IS_INTEL_AVX2(intel_flags)) { 00325 #ifdef HAVE_INTEL_RORX 00326 if (IS_INTEL_BMI2(intel_flags)) { 00327 Transform_Sha512_p = Transform_Sha512_AVX2_RORX; 00328 Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len; 00329 } 00330 else 00331 #endif 00332 if (1) { 00333 Transform_Sha512_p = Transform_Sha512_AVX2; 00334 Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len; 00335 } 00336 #ifdef HAVE_INTEL_RORX 00337 else { 00338 Transform_Sha512_p = Transform_Sha512_AVX1_RORX; 00339 Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len; 00340 } 00341 #endif 00342 } 
00343 else 00344 #endif 00345 #if defined(HAVE_INTEL_AVX1) 00346 if (IS_INTEL_AVX1(intel_flags)) { 00347 Transform_Sha512_p = Transform_Sha512_AVX1; 00348 Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len; 00349 } 00350 else 00351 #endif 00352 Transform_Sha512_p = _Transform_Sha512; 00353 00354 transform_check = 1; 00355 } 00356 00357 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) 00358 { 00359 int ret = InitSha512(sha512); 00360 00361 (void)heap; 00362 (void)devId; 00363 00364 Sha512_SetTransform(); 00365 00366 return ret; 00367 } 00368 00369 #endif /* WOLFSSL_SHA512 */ 00370 00371 #else 00372 #define Transform_Sha512(sha512) _Transform_Sha512(sha512) 00373 00374 #ifdef WOLFSSL_SHA512 00375 00376 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId) 00377 { 00378 int ret = 0; 00379 00380 if (sha512 == NULL) 00381 return BAD_FUNC_ARG; 00382 00383 sha512->heap = heap; 00384 00385 ret = InitSha512(sha512); 00386 if (ret != 0) 00387 return ret; 00388 00389 #ifdef WOLFSSL_SMALL_STACK_CACHE 00390 sha512->W = NULL; 00391 #endif 00392 00393 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) 00394 ret = wolfAsync_DevCtxInit(&sha512->asyncDev, 00395 WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId); 00396 #else 00397 (void)devId; 00398 #endif /* WOLFSSL_ASYNC_CRYPT */ 00399 00400 return ret; 00401 } 00402 00403 #endif /* WOLFSSL_SHA512 */ 00404 00405 #endif /* Hardware Acceleration */ 00406 00407 static const word64 K512[80] = { 00408 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd), 00409 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc), 00410 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019), 00411 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118), 00412 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe), 00413 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2), 00414 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1), 00415 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694), 00416 
W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3), 00417 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65), 00418 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483), 00419 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5), 00420 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210), 00421 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4), 00422 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725), 00423 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70), 00424 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926), 00425 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df), 00426 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8), 00427 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b), 00428 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001), 00429 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30), 00430 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910), 00431 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8), 00432 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53), 00433 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8), 00434 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb), 00435 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3), 00436 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60), 00437 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec), 00438 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9), 00439 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b), 00440 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207), 00441 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178), 00442 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6), 00443 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b), 00444 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493), 00445 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c), 00446 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a), 00447 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817) 00448 }; 00449 00450 
#define blk0(i) (W[i] = sha512->buffer[i]) 00451 00452 #define blk2(i) (\ 00453 W[ i & 15] += \ 00454 s1(W[(i-2) & 15])+ \ 00455 W[(i-7) & 15] + \ 00456 s0(W[(i-15) & 15]) \ 00457 ) 00458 00459 #define Ch(x,y,z) (z ^ (x & (y ^ z))) 00460 #define Maj(x,y,z) ((x & y) | (z & (x | y))) 00461 00462 #define a(i) T[(0-i) & 7] 00463 #define b(i) T[(1-i) & 7] 00464 #define c(i) T[(2-i) & 7] 00465 #define d(i) T[(3-i) & 7] 00466 #define e(i) T[(4-i) & 7] 00467 #define f(i) T[(5-i) & 7] 00468 #define g(i) T[(6-i) & 7] 00469 #define h(i) T[(7-i) & 7] 00470 00471 #define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39)) 00472 #define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41)) 00473 #define s0(x) (rotrFixed64(x,1) ^ rotrFixed64(x,8) ^ (x>>7)) 00474 #define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6)) 00475 00476 #define R(i) \ 00477 h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? blk2(i) : blk0(i)); \ 00478 d(i) += h(i); \ 00479 h(i) += S0(a(i)) + Maj(a(i),b(i),c(i)) 00480 00481 static int _Transform_Sha512(wc_Sha512* sha512) 00482 { 00483 const word64* K = K512; 00484 word32 j; 00485 word64 T[8]; 00486 00487 #ifdef WOLFSSL_SMALL_STACK_CACHE 00488 word64* W = sha512->W; 00489 if (W == NULL) { 00490 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, 00491 DYNAMIC_TYPE_TMP_BUFFER); 00492 if (W == NULL) 00493 return MEMORY_E; 00494 sha512->W = W; 00495 } 00496 #elif defined(WOLFSSL_SMALL_STACK) 00497 word64* W; 00498 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00499 if (W == NULL) 00500 return MEMORY_E; 00501 #else 00502 word64 W[16]; 00503 #endif 00504 00505 /* Copy digest to working vars */ 00506 XMEMCPY(T, sha512->digest, sizeof(T)); 00507 00508 #ifdef USE_SLOW_SHA512 00509 /* over twice as small, but 50% slower */ 00510 /* 80 operations, not unrolled */ 00511 for (j = 0; j < 80; j += 16) { 00512 int m; 00513 for (m = 0; m < 16; m++) { /* braces needed here for macros {} */ 00514 R(m); 00515 } 
00516 } 00517 #else 00518 /* 80 operations, partially loop unrolled */ 00519 for (j = 0; j < 80; j += 16) { 00520 R( 0); R( 1); R( 2); R( 3); 00521 R( 4); R( 5); R( 6); R( 7); 00522 R( 8); R( 9); R(10); R(11); 00523 R(12); R(13); R(14); R(15); 00524 } 00525 #endif /* USE_SLOW_SHA512 */ 00526 00527 /* Add the working vars back into digest */ 00528 sha512->digest[0] += a(0); 00529 sha512->digest[1] += b(0); 00530 sha512->digest[2] += c(0); 00531 sha512->digest[3] += d(0); 00532 sha512->digest[4] += e(0); 00533 sha512->digest[5] += f(0); 00534 sha512->digest[6] += g(0); 00535 sha512->digest[7] += h(0); 00536 00537 /* Wipe variables */ 00538 ForceZero(W, sizeof(word64) * 16); 00539 ForceZero(T, sizeof(T)); 00540 00541 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) 00542 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00543 #endif 00544 00545 return 0; 00546 } 00547 00548 00549 static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len) 00550 { 00551 word64 tmp = sha512->loLen; 00552 if ( (sha512->loLen += len) < tmp) 00553 sha512->hiLen++; /* carry low to high */ 00554 } 00555 00556 static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) 00557 { 00558 int ret = 0; 00559 /* do block size increments */ 00560 byte* local = (byte*)sha512->buffer; 00561 00562 /* check that internal buffLen is valid */ 00563 if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE) 00564 return BUFFER_E; 00565 00566 if (sha512->buffLen > 0) { 00567 word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen); 00568 if (add > 0) { 00569 XMEMCPY(&local[sha512->buffLen], data, add); 00570 00571 sha512->buffLen += add; 00572 data += add; 00573 len -= add; 00574 } 00575 00576 if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) { 00577 #if defined(LITTLE_ENDIAN_ORDER) 00578 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00579 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00580 #endif 00581 { 00582 ByteReverseWords64(sha512->buffer, 
sha512->buffer, 00583 WC_SHA512_BLOCK_SIZE); 00584 } 00585 #endif 00586 ret = Transform_Sha512(sha512); 00587 if (ret == 0) { 00588 AddLength(sha512, WC_SHA512_BLOCK_SIZE); 00589 sha512->buffLen = 0; 00590 } 00591 else 00592 len = 0; 00593 } 00594 } 00595 00596 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00597 if (Transform_Sha512_Len_p != NULL) { 00598 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 00599 00600 if (blocksLen > 0) { 00601 AddLength(sha512, blocksLen); 00602 sha512->data = data; 00603 /* Byte reversal performed in function if required. */ 00604 Transform_Sha512_Len(sha512, blocksLen); 00605 data += blocksLen; 00606 len -= blocksLen; 00607 } 00608 } 00609 else 00610 #endif 00611 #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \ 00612 defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00613 { 00614 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 00615 00616 AddLength(sha512, blocksLen); 00617 while (len >= WC_SHA512_BLOCK_SIZE) { 00618 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); 00619 00620 data += WC_SHA512_BLOCK_SIZE; 00621 len -= WC_SHA512_BLOCK_SIZE; 00622 00623 /* Byte reversal performed in function if required. 
*/ 00624 ret = Transform_Sha512(sha512); 00625 if (ret != 0) 00626 break; 00627 } 00628 } 00629 #else 00630 { 00631 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1); 00632 00633 AddLength(sha512, blocksLen); 00634 while (len >= WC_SHA512_BLOCK_SIZE) { 00635 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE); 00636 00637 data += WC_SHA512_BLOCK_SIZE; 00638 len -= WC_SHA512_BLOCK_SIZE; 00639 00640 ByteReverseWords64(sha512->buffer, sha512->buffer, 00641 WC_SHA512_BLOCK_SIZE); 00642 ret = Transform_Sha512(sha512); 00643 if (ret != 0) 00644 break; 00645 } 00646 } 00647 #endif 00648 00649 if (len > 0) { 00650 XMEMCPY(local, data, len); 00651 sha512->buffLen = len; 00652 } 00653 00654 return ret; 00655 } 00656 00657 #ifdef WOLFSSL_SHA512 00658 00659 int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len) 00660 { 00661 if (sha512 == NULL || (data == NULL && len > 0)) { 00662 return BAD_FUNC_ARG; 00663 } 00664 00665 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) 00666 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) { 00667 #if defined(HAVE_INTEL_QA) 00668 return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len); 00669 #endif 00670 } 00671 #endif /* WOLFSSL_ASYNC_CRYPT */ 00672 00673 return Sha512Update(sha512, data, len); 00674 } 00675 00676 #endif /* WOLFSSL_SHA512 */ 00677 00678 #endif /* WOLFSSL_IMX6_CAAM */ 00679 00680 static WC_INLINE int Sha512Final(wc_Sha512* sha512) 00681 { 00682 byte* local = (byte*)sha512->buffer; 00683 int ret; 00684 00685 if (sha512 == NULL) { 00686 return BAD_FUNC_ARG; 00687 } 00688 00689 AddLength(sha512, sha512->buffLen); /* before adding pads */ 00690 00691 local[sha512->buffLen++] = 0x80; /* add 1 */ 00692 00693 /* pad with zeros */ 00694 if (sha512->buffLen > WC_SHA512_PAD_SIZE) { 00695 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen); 00696 sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen; 00697 #if defined(LITTLE_ENDIAN_ORDER) 00698 #if 
defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00699 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00700 #endif 00701 { 00702 ByteReverseWords64(sha512->buffer,sha512->buffer, 00703 WC_SHA512_BLOCK_SIZE); 00704 } 00705 #endif /* LITTLE_ENDIAN_ORDER */ 00706 ret = Transform_Sha512(sha512); 00707 if (ret != 0) 00708 return ret; 00709 00710 sha512->buffLen = 0; 00711 } 00712 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen); 00713 00714 /* put lengths in bits */ 00715 sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) + 00716 (sha512->hiLen << 3); 00717 sha512->loLen = sha512->loLen << 3; 00718 00719 /* store lengths */ 00720 #if defined(LITTLE_ENDIAN_ORDER) 00721 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00722 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00723 #endif 00724 ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_PAD_SIZE); 00725 #endif 00726 /* ! length ordering dependent on digest endian type ! 
*/ 00727 00728 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen; 00729 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen; 00730 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00731 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags)) 00732 ByteReverseWords64(&(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), 00733 &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]), 00734 WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE); 00735 #endif 00736 ret = Transform_Sha512(sha512); 00737 if (ret != 0) 00738 return ret; 00739 00740 #ifdef LITTLE_ENDIAN_ORDER 00741 ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE); 00742 #endif 00743 00744 return 0; 00745 } 00746 00747 #ifdef WOLFSSL_SHA512 00748 00749 int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash) 00750 { 00751 #ifdef LITTLE_ENDIAN_ORDER 00752 word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)]; 00753 #endif 00754 00755 if (sha512 == NULL || hash == NULL) { 00756 return BAD_FUNC_ARG; 00757 } 00758 00759 #ifdef LITTLE_ENDIAN_ORDER 00760 ByteReverseWords64((word64*)digest, (word64*)sha512->digest, 00761 WC_SHA512_DIGEST_SIZE); 00762 XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE); 00763 #else 00764 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE); 00765 #endif 00766 00767 return 0; 00768 } 00769 00770 int wc_Sha512Final(wc_Sha512* sha512, byte* hash) 00771 { 00772 int ret; 00773 00774 if (sha512 == NULL || hash == NULL) { 00775 return BAD_FUNC_ARG; 00776 } 00777 00778 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) 00779 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) { 00780 #if defined(HAVE_INTEL_QA) 00781 return IntelQaSymSha512(&sha512->asyncDev, hash, NULL, 00782 WC_SHA512_DIGEST_SIZE); 00783 #endif 00784 } 00785 #endif /* WOLFSSL_ASYNC_CRYPT */ 00786 00787 ret = Sha512Final(sha512); 00788 if (ret != 0) 00789 return ret; 00790 00791 XMEMCPY(hash, sha512->digest, 
WC_SHA512_DIGEST_SIZE); 00792 00793 return InitSha512(sha512); /* reset state */ 00794 } 00795 00796 00797 int wc_InitSha512(wc_Sha512* sha512) 00798 { 00799 return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID); 00800 } 00801 00802 void wc_Sha512Free(wc_Sha512* sha512) 00803 { 00804 if (sha512 == NULL) 00805 return; 00806 00807 #ifdef WOLFSSL_SMALL_STACK_CACHE 00808 if (sha512->W != NULL) { 00809 XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00810 sha512->W = NULL; 00811 } 00812 #endif 00813 00814 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512) 00815 wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512); 00816 #endif /* WOLFSSL_ASYNC_CRYPT */ 00817 } 00818 00819 00820 #if defined(HAVE_INTEL_AVX1) 00821 00822 static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f }; 00823 00824 #define W_0 xmm0 00825 #define W_2 xmm1 00826 #define W_4 xmm2 00827 #define W_6 xmm3 00828 #define W_8 xmm4 00829 #define W_10 xmm5 00830 #define W_12 xmm6 00831 #define W_14 xmm7 00832 00833 #define W_M15 xmm12 00834 #define W_M7 xmm13 00835 #define MASK xmm14 00836 00837 #define XTMP1 xmm8 00838 #define XTMP2 xmm9 00839 #define XTMP3 xmm10 00840 #define XTMP4 xmm11 00841 00842 #define XMM_REGS \ 00843 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \ 00844 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" 00845 00846 #define _VPALIGNR(dest, src1, src2, bits) \ 00847 "vpalignr $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 00848 #define VPALIGNR(dest, src1, src2, bits) \ 00849 _VPALIGNR(dest, src1, src2, bits) 00850 00851 #define _V_SHIFT_R(dest, src, bits) \ 00852 "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t" 00853 #define V_SHIFT_R(dest, src, bits) \ 00854 _V_SHIFT_R(dest, src, bits) 00855 00856 #define _V_SHIFT_L(dest, src, bits) \ 00857 "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t" 00858 #define V_SHIFT_L(dest, src, bits) \ 00859 _V_SHIFT_L(dest, src, bits) 00860 00861 #define 
_V_ADD(dest, src1, src2) \ 00862 "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 00863 #define V_ADD(dest, src1, src2) \ 00864 _V_ADD(dest, src1, src2) 00865 00866 #define _V_XOR(dest, src1, src2) \ 00867 "vpxor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 00868 #define V_XOR(dest, src1, src2) \ 00869 _V_XOR(dest, src1, src2) 00870 00871 #define _V_OR(dest, src1, src2) \ 00872 "vpor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t" 00873 #define V_OR(dest, src1, src2) \ 00874 _V_OR(dest, src1, src2) 00875 00876 #define RA %%r8 00877 #define RB %%r9 00878 #define RC %%r10 00879 #define RD %%r11 00880 #define RE %%r12 00881 #define RF %%r13 00882 #define RG %%r14 00883 #define RH %%r15 00884 00885 #define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 00886 00887 #define L1 "%%rax" 00888 #define L2 "%%rcx" 00889 #define L3 "%%rdx" 00890 #define L4 "%%rbx" 00891 #define WX "%%rsp" 00892 00893 #define WORK_REGS "rax", "rbx", "rcx", "rdx" 00894 00895 #define RND_0_1(a,b,c,d,e,f,g,h,i) \ 00896 /* L1 = e >>> 23 */ \ 00897 "rorq $23, " L1 "\n\t" \ 00898 00899 #define RND_0_2(a,b,c,d,e,f,g,h,i) \ 00900 /* L3 = a */ \ 00901 "movq "#a", " L3 "\n\t" \ 00902 /* L2 = f */ \ 00903 "movq "#f", " L2 "\n\t" \ 00904 /* h += W_X[i] */ \ 00905 "addq ("#i")*8(" WX "), "#h"\n\t" \ 00906 /* L2 = f ^ g */ \ 00907 "xorq "#g", " L2 "\n\t" \ 00908 00909 #define RND_0_2_A(a,b,c,d,e,f,g,h,i) \ 00910 /* L3 = a */ \ 00911 "movq "#a", " L3 "\n\t" \ 00912 /* L2 = f */ \ 00913 "movq "#f", " L2 "\n\t" \ 00914 00915 #define RND_0_2_B(a,b,c,d,e,f,g,h,i) \ 00916 /* h += W_X[i] */ \ 00917 "addq ("#i")*8(" WX "), "#h"\n\t" \ 00918 /* L2 = f ^ g */ \ 00919 "xorq "#g", " L2 "\n\t" \ 00920 00921 #define RND_0_3(a,b,c,d,e,f,g,h,i) \ 00922 /* L1 = (e >>> 23) ^ e */ \ 00923 "xorq "#e", " L1 "\n\t" \ 00924 /* L2 = (f ^ g) & e */ \ 00925 "andq "#e", " L2 "\n\t" \ 00926 00927 #define RND_0_4(a,b,c,d,e,f,g,h,i) \ 00928 /* L1 = ((e >>> 23) ^ e) >>> 4 */ \ 00929 "rorq $4, " L1 "\n\t" \ 00930 /* L2 = ((f ^ 
g) & e) ^ g */ \ 00931 "xorq "#g", " L2 "\n\t" \ 00932 00933 #define RND_0_5(a,b,c,d,e,f,g,h,i) \ 00934 /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \ 00935 "xorq "#e", " L1 "\n\t" \ 00936 /* h += Ch(e,f,g) */ \ 00937 "addq " L2 ", "#h"\n\t" \ 00938 00939 #define RND_0_6(a,b,c,d,e,f,g,h,i) \ 00940 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ 00941 "rorq $14, " L1 "\n\t" \ 00942 /* L3 = a ^ b */ \ 00943 "xorq "#b", " L3 "\n\t" \ 00944 00945 #define RND_0_7(a,b,c,d,e,f,g,h,i) \ 00946 /* h += Sigma1(e) */ \ 00947 "addq " L1 ", "#h"\n\t" \ 00948 /* L2 = a */ \ 00949 "movq "#a", " L2 "\n\t" \ 00950 00951 #define RND_0_8(a,b,c,d,e,f,g,h,i) \ 00952 /* L4 = (a ^ b) & (b ^ c) */ \ 00953 "andq " L3 ", " L4 "\n\t" \ 00954 /* L2 = a >>> 5 */ \ 00955 "rorq $5, " L2 "\n\t" \ 00956 00957 #define RND_0_9(a,b,c,d,e,f,g,h,i) \ 00958 /* L2 = (a >>> 5) ^ a */ \ 00959 "xorq "#a", " L2 "\n\t" \ 00960 /* L4 = ((a ^ b) & (b ^ c) ^ b */ \ 00961 "xorq "#b", " L4 "\n\t" \ 00962 00963 #define RND_0_10(a,b,c,d,e,f,g,h,i) \ 00964 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ 00965 "rorq $6, " L2 "\n\t" \ 00966 /* d += h */ \ 00967 "addq "#h", "#d"\n\t" \ 00968 00969 #define RND_0_11(a,b,c,d,e,f,g,h,i) \ 00970 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ 00971 "xorq "#a", " L2 "\n\t" \ 00972 /* h += Sigma0(a) */ \ 00973 "addq " L4 ", "#h"\n\t" \ 00974 00975 #define RND_0_12(a,b,c,d,e,f,g,h,i) \ 00976 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ 00977 "rorq $28, " L2 "\n\t" \ 00978 /* d (= e next RND) */ \ 00979 "movq "#d", " L1 "\n\t" \ 00980 /* h += Maj(a,b,c) */ \ 00981 "addq " L2 ", "#h"\n\t" \ 00982 00983 #define RND_1_1(a,b,c,d,e,f,g,h,i) \ 00984 /* L1 = e >>> 23 */ \ 00985 "rorq $23, " L1 "\n\t" \ 00986 00987 #define RND_1_2(a,b,c,d,e,f,g,h,i) \ 00988 /* L4 = a */ \ 00989 "movq "#a", " L4 "\n\t" \ 00990 /* L2 = f */ \ 00991 "movq "#f", " L2 "\n\t" \ 00992 /* h += W_X[i] */ \ 00993 "addq ("#i")*8(" WX "), "#h"\n\t" \ 00994 /* L2 = f ^ g */ \ 00995 "xorq "#g", " L2 "\n\t" \ 00996 00997 #define 
RND_1_2_A(a,b,c,d,e,f,g,h,i) \ 00998 /* L4 = a */ \ 00999 "movq "#a", " L4 "\n\t" \ 01000 /* L2 = f */ \ 01001 "movq "#f", " L2 "\n\t" \ 01002 01003 #define RND_1_2_B(a,b,c,d,e,f,g,h,i) \ 01004 /* h += W_X[i] */ \ 01005 "addq ("#i")*8(" WX "), "#h"\n\t" \ 01006 /* L2 = f ^ g */ \ 01007 "xorq "#g", " L2 "\n\t" \ 01008 01009 #define RND_1_3(a,b,c,d,e,f,g,h,i) \ 01010 /* L1 = (e >>> 23) ^ e */ \ 01011 "xorq "#e", " L1 "\n\t" \ 01012 /* L2 = (f ^ g) & e */ \ 01013 "andq "#e", " L2 "\n\t" \ 01014 01015 #define RND_1_4(a,b,c,d,e,f,g,h,i) \ 01016 /* ((e >>> 23) ^ e) >>> 4 */ \ 01017 "rorq $4, " L1 "\n\t" \ 01018 /* ((f ^ g) & e) ^ g */ \ 01019 "xorq "#g", " L2 "\n\t" \ 01020 01021 #define RND_1_5(a,b,c,d,e,f,g,h,i) \ 01022 /* (((e >>> 23) ^ e) >>> 4) ^ e */ \ 01023 "xorq "#e", " L1 "\n\t" \ 01024 /* h += Ch(e,f,g) */ \ 01025 "addq " L2 ", "#h"\n\t" \ 01026 01027 #define RND_1_6(a,b,c,d,e,f,g,h,i) \ 01028 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \ 01029 "rorq $14, " L1 "\n\t" \ 01030 /* L4 = a ^ b */ \ 01031 "xorq "#b", " L4 "\n\t" \ 01032 01033 #define RND_1_7(a,b,c,d,e,f,g,h,i) \ 01034 /* h += Sigma1(e) */ \ 01035 "addq " L1 ", "#h"\n\t" \ 01036 /* L2 = a */ \ 01037 "movq "#a", " L2 "\n\t" \ 01038 01039 #define RND_1_8(a,b,c,d,e,f,g,h,i) \ 01040 /* L3 = (a ^ b) & (b ^ c) */ \ 01041 "andq " L4 ", " L3 "\n\t" \ 01042 /* L2 = a >>> 5 */ \ 01043 "rorq $5, " L2 "\n\t" \ 01044 01045 #define RND_1_9(a,b,c,d,e,f,g,h,i) \ 01046 /* L2 = (a >>> 5) ^ a */ \ 01047 "xorq "#a", " L2 "\n\t" \ 01048 /* L3 = ((a ^ b) & (b ^ c) ^ b */ \ 01049 "xorq "#b", " L3 "\n\t" \ 01050 01051 #define RND_1_10(a,b,c,d,e,f,g,h,i) \ 01052 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \ 01053 "rorq $6, " L2 "\n\t" \ 01054 /* d += h */ \ 01055 "addq "#h", "#d"\n\t" \ 01056 01057 #define RND_1_11(a,b,c,d,e,f,g,h,i) \ 01058 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \ 01059 "xorq "#a", " L2 "\n\t" \ 01060 /* h += Sigma0(a) */ \ 01061 "addq " L3 ", "#h"\n\t" \ 01062 01063 #define RND_1_12(a,b,c,d,e,f,g,h,i) \ 01064 /* 
L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */ \ 01065 "rorq $28, " L2 "\n\t" \ 01066 /* d (= e next RND) */ \ 01067 "movq "#d", " L1 "\n\t" \ 01068 /* h += Maj(a,b,c) */ \ 01069 "addq " L2 ", "#h"\n\t" \ 01070 01071 01072 #define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ 01073 RND_0_1(a,b,c,d,e,f,g,h,i) \ 01074 VPALIGNR(W_M15, W_2, W_0, 8) \ 01075 VPALIGNR(W_M7, W_10, W_8, 8) \ 01076 RND_0_2(a,b,c,d,e,f,g,h,i) \ 01077 V_SHIFT_R(XTMP1, W_M15, 1) \ 01078 V_SHIFT_L(XTMP2, W_M15, 63) \ 01079 RND_0_3(a,b,c,d,e,f,g,h,i) \ 01080 RND_0_4(a,b,c,d,e,f,g,h,i) \ 01081 V_SHIFT_R(XTMP3, W_M15, 8) \ 01082 V_SHIFT_L(XTMP4, W_M15, 56) \ 01083 RND_0_5(a,b,c,d,e,f,g,h,i) \ 01084 RND_0_6(a,b,c,d,e,f,g,h,i) \ 01085 V_OR(XTMP1, XTMP2, XTMP1) \ 01086 V_OR(XTMP3, XTMP4, XTMP3) \ 01087 RND_0_7(a,b,c,d,e,f,g,h,i) \ 01088 RND_0_8(a,b,c,d,e,f,g,h,i) \ 01089 V_SHIFT_R(XTMP4, W_M15, 7) \ 01090 V_XOR(XTMP1, XTMP3, XTMP1) \ 01091 RND_0_9(a,b,c,d,e,f,g,h,i) \ 01092 RND_0_10(a,b,c,d,e,f,g,h,i) \ 01093 V_XOR(XTMP1, XTMP4, XTMP1) \ 01094 V_ADD(W_0, W_0, W_M7) \ 01095 RND_0_11(a,b,c,d,e,f,g,h,i) \ 01096 RND_0_12(a,b,c,d,e,f,g,h,i) \ 01097 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 01098 V_ADD(W_0, W_0, XTMP1) \ 01099 RND_1_2(h,a,b,c,d,e,f,g,i+1) \ 01100 V_SHIFT_R(XTMP1, W_14, 19) \ 01101 V_SHIFT_L(XTMP2, W_14, 45) \ 01102 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 01103 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 01104 V_SHIFT_R(XTMP3, W_14, 61) \ 01105 V_SHIFT_L(XTMP4, W_14, 3) \ 01106 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 01107 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 01108 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 01109 V_OR(XTMP1, XTMP2, XTMP1) \ 01110 V_OR(XTMP3, XTMP4, XTMP3) \ 01111 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 01112 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 01113 V_XOR(XTMP1, XTMP3, XTMP1) \ 01114 V_SHIFT_R(XTMP4, W_14, 6) \ 01115 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 01116 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 01117 V_XOR(XTMP1, XTMP4, XTMP1) \ 01118 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 01119 V_ADD(W_0, W_0, XTMP1) \ 01120 01121 #define RND_ALL_2(a, b, c, d, 
e, f, g, h, i) \ 01122 RND_0_1 (a, b, c, d, e, f, g, h, i ) \ 01123 RND_0_2 (a, b, c, d, e, f, g, h, i ) \ 01124 RND_0_3 (a, b, c, d, e, f, g, h, i ) \ 01125 RND_0_4 (a, b, c, d, e, f, g, h, i ) \ 01126 RND_0_5 (a, b, c, d, e, f, g, h, i ) \ 01127 RND_0_6 (a, b, c, d, e, f, g, h, i ) \ 01128 RND_0_7 (a, b, c, d, e, f, g, h, i ) \ 01129 RND_0_8 (a, b, c, d, e, f, g, h, i ) \ 01130 RND_0_9 (a, b, c, d, e, f, g, h, i ) \ 01131 RND_0_10(a, b, c, d, e, f, g, h, i ) \ 01132 RND_0_11(a, b, c, d, e, f, g, h, i ) \ 01133 RND_0_12(a, b, c, d, e, f, g, h, i ) \ 01134 RND_1_1 (h, a, b, c, d, e, f, g, i+1) \ 01135 RND_1_2 (h, a, b, c, d, e, f, g, i+1) \ 01136 RND_1_3 (h, a, b, c, d, e, f, g, i+1) \ 01137 RND_1_4 (h, a, b, c, d, e, f, g, i+1) \ 01138 RND_1_5 (h, a, b, c, d, e, f, g, i+1) \ 01139 RND_1_6 (h, a, b, c, d, e, f, g, i+1) \ 01140 RND_1_7 (h, a, b, c, d, e, f, g, i+1) \ 01141 RND_1_8 (h, a, b, c, d, e, f, g, i+1) \ 01142 RND_1_9 (h, a, b, c, d, e, f, g, i+1) \ 01143 RND_1_10(h, a, b, c, d, e, f, g, i+1) \ 01144 RND_1_11(h, a, b, c, d, e, f, g, i+1) \ 01145 RND_1_12(h, a, b, c, d, e, f, g, i+1) 01146 01147 01148 #if defined(HAVE_INTEL_RORX) 01149 01150 #define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \ 01151 /* L1 = e>>>14 */ \ 01152 "rorxq $14, "#e", " L1 "\n\t" \ 01153 /* L2 = e>>>18 */ \ 01154 "rorxq $18, "#e", " L2 "\n\t" \ 01155 /* Prev RND: h += Maj(a,b,c) */ \ 01156 "addq " L3 ", "#a"\n\t" \ 01157 01158 #define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \ 01159 /* h += w_k */ \ 01160 "addq ("#i")*8(" WX "), "#h"\n\t" \ 01161 /* L3 = f */ \ 01162 "movq "#f", " L3 "\n\t" \ 01163 /* L2 = (e>>>14) ^ (e>>>18) */ \ 01164 "xorq " L1 ", " L2 "\n\t" \ 01165 01166 #define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \ 01167 /* L3 = f ^ g */ \ 01168 "xorq "#g", " L3 "\n\t" \ 01169 /* L1 = e>>>41 */ \ 01170 "rorxq $41, "#e", " L1 "\n\t" \ 01171 /* L1 = Sigma1(e) */ \ 01172 "xorq " L2 ", " L1 "\n\t" \ 01173 01174 #define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \ 01175 /* L3 = (f ^ g) 
& e */ \ 01176 "andq "#e", " L3 "\n\t" \ 01177 /* h += Sigma1(e) */ \ 01178 "addq " L1 ", "#h"\n\t" \ 01179 /* L1 = a>>>28 */ \ 01180 "rorxq $28, "#a", " L1 "\n\t" \ 01181 01182 #define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \ 01183 /* L2 = a>>>34 */ \ 01184 "rorxq $34, "#a", " L2 "\n\t" \ 01185 /* L3 = Ch(e,f,g) */ \ 01186 "xorq "#g", " L3 "\n\t" \ 01187 /* L2 = (a>>>28) ^ (a>>>34) */ \ 01188 "xorq " L1 ", " L2 "\n\t" \ 01189 01190 #define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \ 01191 /* L1 = a>>>39 */ \ 01192 "rorxq $39, "#a", " L1 "\n\t" \ 01193 /* h += Ch(e,f,g) */ \ 01194 "addq " L3 ", "#h"\n\t" \ 01195 /* L1 = Sigma0(a) */ \ 01196 "xorq " L2 ", " L1 "\n\t" \ 01197 01198 #define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \ 01199 /* L3 = b */ \ 01200 "movq "#b", " L3 "\n\t" \ 01201 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01202 "addq "#h", "#d"\n\t" \ 01203 /* L3 = a ^ b */ \ 01204 "xorq "#a", " L3 "\n\t" \ 01205 01206 #define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \ 01207 /* L4 = (a ^ b) & (b ^ c) */ \ 01208 "andq " L3 ", " L4 "\n\t" \ 01209 /* h += Sigma0(a) */ \ 01210 "addq " L1 ", "#h"\n\t" \ 01211 /* L4 = Maj(a,b,c) */ \ 01212 "xorq "#b", " L4 "\n\t" \ 01213 01214 #define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \ 01215 /* L1 = e>>>14 */ \ 01216 "rorxq $14, "#e", " L1 "\n\t" \ 01217 /* L2 = e>>>18 */ \ 01218 "rorxq $18, "#e", " L2 "\n\t" \ 01219 /* Prev RND: h += Maj(a,b,c) */ \ 01220 "addq " L4 ", "#a"\n\t" \ 01221 01222 #define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \ 01223 /* h += w_k */ \ 01224 "addq ("#i")*8(" WX "), "#h"\n\t" \ 01225 /* L4 = f */ \ 01226 "movq "#f", " L4 "\n\t" \ 01227 /* L2 = (e>>>14) ^ (e>>>18) */ \ 01228 "xorq " L1 ", " L2 "\n\t" \ 01229 01230 #define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \ 01231 /* L4 = f ^ g */ \ 01232 "xorq "#g", " L4 "\n\t" \ 01233 /* L1 = e>>>41 */ \ 01234 "rorxq $41, "#e", " L1 "\n\t" \ 01235 /* L1 = Sigma1(e) */ \ 01236 "xorq " L2 ", " L1 "\n\t" \ 01237 01238 #define RND_RORX_1_4(a, b, c, d, e, f, g, 
h, i) \ 01239 /* L4 = (f ^ g) & e */ \ 01240 "andq "#e", " L4 "\n\t" \ 01241 /* h += Sigma1(e) */ \ 01242 "addq " L1 ", "#h"\n\t" \ 01243 /* L1 = a>>>28 */ \ 01244 "rorxq $28, "#a", " L1 "\n\t" \ 01245 01246 #define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \ 01247 /* L2 = a>>>34 */ \ 01248 "rorxq $34, "#a", " L2 "\n\t" \ 01249 /* L4 = Ch(e,f,g) */ \ 01250 "xorq "#g", " L4 "\n\t" \ 01251 /* L2 = (a>>>28) ^ (a>>>34) */ \ 01252 "xorq " L1 ", " L2 "\n\t" \ 01253 01254 #define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \ 01255 /* L1 = a>>>39 */ \ 01256 "rorxq $39, "#a", " L1 "\n\t" \ 01257 /* h += Ch(e,f,g) */ \ 01258 "addq " L4 ", "#h"\n\t" \ 01259 /* L1 = Sigma0(a) */ \ 01260 "xorq " L2 ", " L1 "\n\t" \ 01261 01262 #define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \ 01263 /* L4 = b */ \ 01264 "movq "#b", " L4 "\n\t" \ 01265 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01266 "addq "#h", "#d"\n\t" \ 01267 /* L4 = a ^ b */ \ 01268 "xorq "#a", " L4 "\n\t" \ 01269 01270 #define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \ 01271 /* L2 = (a ^ b) & (b ^ c) */ \ 01272 "andq " L4 ", " L3 "\n\t" \ 01273 /* h += Sigma0(a) */ \ 01274 "addq " L1 ", "#h"\n\t" \ 01275 /* L3 = Maj(a,b,c) */ \ 01276 "xorq "#b", " L3 "\n\t" \ 01277 01278 #define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \ 01279 RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \ 01280 RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) \ 01281 RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \ 01282 RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \ 01283 RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \ 01284 RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \ 01285 RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \ 01286 RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \ 01287 RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \ 01288 RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \ 01289 RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \ 01290 RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \ 01291 RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \ 01292 RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \ 01293 
RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \
    RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \

/* Four SHA-512 rounds (i..i+3) using RORX: two RND_RORX_ALL_2 pairs with
 * the working variables rotated two positions between the pairs. */
#define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \
    RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \
    RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2)

/* Two RORX-based rounds interleaved with the AVX message schedule that
 * computes the next W[i], W[i+1] pair (s0/s1 built from shift+shift+or
 * since AVX1 has no vector rotate).  Instruction order is tuned for
 * scalar/vector port overlap -- do not reorder. */
#define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
    RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
    VPALIGNR(W_M15, W_2, W_0, 8) \
    VPALIGNR(W_M7, W_10, W_8, 8) \
    RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(XTMP1, W_M15, 1) \
    V_SHIFT_L(XTMP2, W_M15, 63) \
    RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(XTMP3, W_M15, 8) \
    V_SHIFT_L(XTMP4, W_M15, 56) \
    RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
    V_OR(XTMP1, XTMP2, XTMP1) \
    V_OR(XTMP3, XTMP4, XTMP3) \
    RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
    V_SHIFT_R(XTMP4, W_M15, 7) \
    V_XOR(XTMP1, XTMP3, XTMP1) \
    RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
    V_XOR(XTMP1, XTMP4, XTMP1) \
    V_ADD(W_0, W_0, W_M7) \
    RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
    RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
    V_ADD(W_0, W_0, XTMP1) \
    RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(XTMP1, W_14, 19) \
    V_SHIFT_L(XTMP2, W_14, 45) \
    RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
    V_SHIFT_R(XTMP3, W_14, 61) \
    V_SHIFT_L(XTMP4, W_14, 3) \
    RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
    V_OR(XTMP1, XTMP2, XTMP1) \
    V_OR(XTMP3, XTMP4, XTMP3) \
    RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(XTMP1, XTMP3, XTMP1) \
    V_SHIFT_R(XTMP4, W_14, 6) \
    RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
    V_XOR(XTMP1, XTMP4, XTMP1) \
    RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
    V_ADD(W_0, W_0, XTMP1) \

#endif /* HAVE_INTEL_RORX */

/* Load the byte-flip (endian swap) mask from the %[mask] operand into the
 * given vector register. */
#define _INIT_MASK(mask) \
    "vmovdqu %[mask], %%" #mask "\n\t"
#define INIT_MASK(mask) \
    _INIT_MASK(mask)

/* Load two 16-byte words of the message block from (reg) and byte-swap
 * them (SHA-512 is big-endian) via the mask register. */
#define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
    "vmovdqu " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t" \
    "vmovdqu " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t" \
    "vpshufb %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \
    "vpshufb %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t"
#define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
    _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)

/* Load the full 128-byte message block into W_0..W_14 (byte-swapped). */
#define LOAD_W(mask, reg) \
    /* X0..3(xmm4..7), W[0..15] = buffer[0..15]; */ \
    LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \
    LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \
    LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \
    LOAD_W_2(6, 7, W_12, W_14, mask, reg)

/* Compute W + K for two word pairs and store at offset i of the on-stack
 * W_X area.  Clobbers xmm8/xmm9 as scratch. */
#define _SET_W_X_2(xmm0, xmm1, reg, i) \
    "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \
    "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \
    "vmovdqu %%xmm8, " #i "+ 0(" WX ")\n\t" \
    "vmovdqu %%xmm9, " #i "+16(" WX ")\n\t" \

#define SET_W_X_2(xmm0, xmm1, reg, i) \
    _SET_W_X_2(xmm0, xmm1, reg, i)

/* Store all 16 W+K round inputs for the next 16 rounds into W_X. */
#define SET_W_X(reg) \
    SET_W_X_2(W_0 , W_2 , reg, 0) \
    SET_W_X_2(W_4 , W_6 , reg, 32) \
    SET_W_X_2(W_8 , W_10, reg, 64) \
    SET_W_X_2(W_12, W_14, reg, 96)

/* Load the eight 64-bit hash state words a..h (offsets 0..56 of the
 * context) into r8..r15. */
#define LOAD_DIGEST() \
    "movq (%[sha512]), %%r8 \n\t" \
    "movq 8(%[sha512]), %%r9 \n\t" \
    "movq 16(%[sha512]), %%r10\n\t" \
    "movq 24(%[sha512]), %%r11\n\t" \
    "movq 32(%[sha512]), %%r12\n\t" \
    "movq 40(%[sha512]), %%r13\n\t" \
    "movq 48(%[sha512]), %%r14\n\t" \
    "movq 56(%[sha512]), %%r15\n\t"

/* Add the working registers into the state words in memory
 * (state[i] += reg) -- single-block variant's final step. */
#define STORE_ADD_DIGEST() \
    "addq %%r8, (%[sha512])\n\t" \
    "addq %%r9, 8(%[sha512])\n\t" \
    "addq %%r10, 16(%[sha512])\n\t" \
    "addq %%r11, 24(%[sha512])\n\t" \
    "addq %%r12, 32(%[sha512])\n\t" \
    "addq %%r13, 40(%[sha512])\n\t" \
    "addq %%r14, 48(%[sha512])\n\t" \
    "addq %%r15, 56(%[sha512])\n\t"

/* Add the state words in memory into the working registers
 * (reg += state[i]) -- multi-block variant keeps the state in registers. */
#define ADD_DIGEST() \
    "addq (%[sha512]), %%r8 \n\t" \
    "addq 8(%[sha512]), %%r9
\n\t" \ 01401 "addq 16(%[sha512]), %%r10\n\t" \ 01402 "addq 24(%[sha512]), %%r11\n\t" \ 01403 "addq 32(%[sha512]), %%r12\n\t" \ 01404 "addq 40(%[sha512]), %%r13\n\t" \ 01405 "addq 48(%[sha512]), %%r14\n\t" \ 01406 "addq 56(%[sha512]), %%r15\n\t" 01407 01408 #define STORE_DIGEST() \ 01409 "movq %%r8, (%[sha512])\n\t" \ 01410 "movq %%r9, 8(%[sha512])\n\t" \ 01411 "movq %%r10, 16(%[sha512])\n\t" \ 01412 "movq %%r11, 24(%[sha512])\n\t" \ 01413 "movq %%r12, 32(%[sha512])\n\t" \ 01414 "movq %%r13, 40(%[sha512])\n\t" \ 01415 "movq %%r14, 48(%[sha512])\n\t" \ 01416 "movq %%r15, 56(%[sha512])\n\t" 01417 01418 #endif /* HAVE_INTEL_AVX1 */ 01419 01420 01421 /*** Transform Body ***/ 01422 #if defined(HAVE_INTEL_AVX1) 01423 static int Transform_Sha512_AVX1(wc_Sha512* sha512) 01424 { 01425 __asm__ __volatile__ ( 01426 01427 /* 16 Ws plus loop counter. */ 01428 "subq $136, %%rsp\n\t" 01429 "leaq 64(%[sha512]), %%rax\n\t" 01430 01431 INIT_MASK(MASK) 01432 LOAD_DIGEST() 01433 01434 LOAD_W(MASK, rax) 01435 01436 "movl $4, 16*8(" WX ")\n\t" 01437 "leaq %[K512], %%rsi\n\t" 01438 /* b */ 01439 "movq %%r9, " L4 "\n\t" 01440 /* e */ 01441 "movq %%r12, " L1 "\n\t" 01442 /* b ^ c */ 01443 "xorq %%r10, " L4 "\n\t" 01444 01445 "# Start of 16 rounds\n" 01446 "1:\n\t" 01447 01448 SET_W_X(rsi) 01449 01450 "addq $128, %%rsi\n\t" 01451 01452 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 01453 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 01454 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 01455 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 01456 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 01457 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 01458 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 01459 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 01460 01461 "subl $1, 16*8(" WX ")\n\t" 
01462 "jne 1b\n\t" 01463 01464 SET_W_X(rsi) 01465 01466 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 01467 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 01468 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 01469 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 01470 01471 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 01472 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 01473 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 01474 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 01475 01476 STORE_ADD_DIGEST() 01477 01478 "addq $136, %%rsp\n\t" 01479 01480 : 01481 : [mask] "m" (mBYTE_FLIP_MASK), 01482 [sha512] "r" (sha512), 01483 [K512] "m" (K512) 01484 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 01485 ); 01486 01487 return 0; 01488 } 01489 01490 static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len) 01491 { 01492 __asm__ __volatile__ ( 01493 01494 "movq 224(%[sha512]), %%rsi\n\t" 01495 "leaq %[K512], %%rdx\n\t" 01496 01497 INIT_MASK(MASK) 01498 LOAD_DIGEST() 01499 01500 "# Start of processing a block\n" 01501 "2:\n\t" 01502 01503 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). 01504 * Debug needs more stack space. 
*/ 01505 "subq $256, %%rsp\n\t" 01506 01507 LOAD_W(MASK, rsi) 01508 01509 "movl $4, 16*8(" WX ")\n\t" 01510 /* b */ 01511 "movq %%r9, " L4 "\n\t" 01512 /* e */ 01513 "movq %%r12, " L1 "\n\t" 01514 /* b ^ c */ 01515 "xorq %%r10, " L4 "\n\t" 01516 01517 SET_W_X(rdx) 01518 01519 "# Start of 16 rounds\n" 01520 "1:\n\t" 01521 01522 "addq $128, %%rdx\n\t" 01523 "movq %%rdx, 17*8(%%rsp)\n\t" 01524 01525 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 01526 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 01527 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 01528 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 01529 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 01530 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 01531 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 01532 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 01533 01534 "movq 17*8(%%rsp), %%rdx\n\t" 01535 01536 SET_W_X(rdx) 01537 01538 "subl $1, 16*8(" WX ")\n\t" 01539 "jne 1b\n\t" 01540 01541 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 01542 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 01543 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 01544 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 01545 01546 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 01547 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 01548 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 01549 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 01550 01551 ADD_DIGEST() 01552 01553 "addq $256, %%rsp\n\t" 01554 "leaq %[K512], %%rdx\n\t" 01555 "addq $128, %%rsi\n\t" 01556 "subl $128, %[len]\n\t" 01557 01558 STORE_DIGEST() 01559 01560 "jnz 2b\n\t" 01561 01562 : 01563 : [mask] "m" (mBYTE_FLIP_MASK), 01564 [len] "m" (len), 01565 [sha512] "r" (sha512), 01566 [K512] "m" (K512) 01567 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 01568 ); 01569 01570 return 0; 01571 } 01572 #endif /* HAVE_INTEL_AVX1 */ 01573 01574 #if 
defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) 01575 static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512) 01576 { 01577 __asm__ __volatile__ ( 01578 01579 /* 16 Ws plus loop counter and K512. */ 01580 "subq $144, %%rsp\n\t" 01581 "leaq 64(%[sha512]), %%rax\n\t" 01582 01583 INIT_MASK(MASK) 01584 LOAD_DIGEST() 01585 01586 LOAD_W(MASK, rax) 01587 01588 "movl $4, 16*8(" WX ")\n\t" 01589 "leaq %[K512], %%rsi\n\t" 01590 /* L4 = b */ 01591 "movq %%r9, " L4 "\n\t" 01592 /* L3 = 0 (add to prev h) */ 01593 "xorq " L3 ", " L3 "\n\t" 01594 /* L4 = b ^ c */ 01595 "xorq %%r10, " L4 "\n\t" 01596 01597 SET_W_X(rsi) 01598 01599 "# Start of 16 rounds\n" 01600 "1:\n\t" 01601 01602 "addq $128, %%rsi\n\t" 01603 01604 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 01605 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 01606 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 01607 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 01608 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 01609 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 01610 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 01611 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 01612 01613 SET_W_X(rsi) 01614 01615 "subl $1, 16*8(" WX ")\n\t" 01616 "jne 1b\n\t" 01617 01618 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 01619 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 01620 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 01621 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 01622 01623 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 01624 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 01625 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 01626 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 01627 01628 /* Prev RND: h += Maj(a,b,c) */ 01629 "addq " L3 ", %%r8\n\t" 01630 "addq $144, %%rsp\n\t" 01631 01632 STORE_ADD_DIGEST() 
01633 01634 : 01635 : [mask] "m" (mBYTE_FLIP_MASK), 01636 [sha512] "r" (sha512), 01637 [K512] "m" (K512) 01638 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 01639 ); 01640 01641 return 0; 01642 } 01643 01644 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len) 01645 { 01646 __asm__ __volatile__ ( 01647 01648 "movq 224(%[sha512]), %%rsi\n\t" 01649 "leaq %[K512], %%rcx\n\t" 01650 01651 INIT_MASK(MASK) 01652 LOAD_DIGEST() 01653 01654 "# Start of processing a block\n" 01655 "2:\n\t" 01656 01657 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp). 01658 * Debug needs more stack space. */ 01659 "subq $256, %%rsp\n\t" 01660 01661 LOAD_W(MASK, rsi) 01662 01663 "movl $4, 16*8(" WX ")\n\t" 01664 /* L4 = b */ 01665 "movq %%r9, " L4 "\n\t" 01666 /* L3 = 0 (add to prev h) */ 01667 "xorq " L3 ", " L3 "\n\t" 01668 /* L4 = b ^ c */ 01669 "xorq %%r10, " L4 "\n\t" 01670 01671 SET_W_X(rcx) 01672 01673 "# Start of 16 rounds\n" 01674 "1:\n\t" 01675 01676 "addq $128, %%rcx\n\t" 01677 "movq %%rcx, 17*8(%%rsp)\n\t" 01678 01679 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0) 01680 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2) 01681 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4) 01682 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6) 01683 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8) 01684 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10) 01685 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12) 01686 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14) 01687 01688 "movq 17*8(%%rsp), %%rcx\n\t" 01689 01690 SET_W_X(rcx) 01691 01692 "subl $1, 16*8(" WX ")\n\t" 01693 "jne 1b\n\t" 01694 01695 SET_W_X(rcx) 01696 01697 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0) 01698 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2) 01699 
RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4) 01700 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6) 01701 01702 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8) 01703 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10) 01704 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12) 01705 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14) 01706 01707 /* Prev RND: h += Maj(a,b,c) */ 01708 "addq " L3 ", %%r8\n\t" 01709 "addq $256, %%rsp\n\t" 01710 01711 ADD_DIGEST() 01712 01713 "leaq %[K512], %%rcx\n\t" 01714 "addq $128, %%rsi\n\t" 01715 "subl $128, %[len]\n\t" 01716 01717 STORE_DIGEST() 01718 01719 "jnz 2b\n\t" 01720 01721 : 01722 : [mask] "m" (mBYTE_FLIP_MASK), 01723 [len] "m" (len), 01724 [sha512] "r" (sha512), 01725 [K512] "m" (K512) 01726 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi" 01727 ); 01728 01729 return 0; 01730 } 01731 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */ 01732 01733 #if defined(HAVE_INTEL_AVX2) 01734 static const unsigned long mBYTE_FLIP_MASK_Y[] = 01735 { 0x0001020304050607, 0x08090a0b0c0d0e0f, 01736 0x0001020304050607, 0x08090a0b0c0d0e0f }; 01737 01738 #define W_Y_0 ymm0 01739 #define W_Y_4 ymm1 01740 #define W_Y_8 ymm2 01741 #define W_Y_12 ymm3 01742 01743 #define X0 xmm0 01744 #define X1 xmm1 01745 #define X2 xmm2 01746 #define X3 xmm3 01747 #define X4 xmm4 01748 #define X5 xmm5 01749 #define X6 xmm6 01750 #define X7 xmm7 01751 #define X8 xmm8 01752 #define X9 xmm9 01753 #define Y0 ymm0 01754 #define Y1 ymm1 01755 #define Y2 ymm2 01756 #define Y3 ymm3 01757 #define Y4 ymm4 01758 #define Y5 ymm5 01759 #define Y6 ymm6 01760 #define Y7 ymm7 01761 01762 #define W_Y_M15 ymm12 01763 #define W_Y_M7 ymm13 01764 #define W_Y_M2 ymm14 01765 #define MASK_Y ymm15 01766 01767 #define YTMP1 ymm8 01768 #define YTMP2 ymm9 01769 #define YTMP3 ymm10 01770 #define YTMP4 ymm11 01771 01772 #define YMM_REGS \ 01773 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \ 01774 "xmm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15" 01775 01776 #define _VPERM2I128(dest, src1, 
src2, sel) \ 01777 "vperm2I128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 01778 #define VPERM2I128(dest, src1, src2, sel) \ 01779 _VPERM2I128(dest, src1, src2, sel) 01780 01781 #define _VPERMQ(dest, src, sel) \ 01782 "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t" 01783 #define VPERMQ(dest, src, sel) \ 01784 _VPERMQ(dest, src, sel) 01785 01786 #define _VPBLENDD(dest, src1, src2, sel) \ 01787 "vpblendd $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t" 01788 #define VPBLENDD(dest, src1, src2, sel) \ 01789 _VPBLENDD(dest, src1, src2, sel) 01790 01791 #define _V_ADD_I(dest, src1, addr, i) \ 01792 "vpaddq "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t" 01793 #define V_ADD_I(dest, src1, addr, i) \ 01794 _V_ADD_I(dest, src1, addr, i) 01795 01796 #define _VMOVDQU_I(addr, i, src) \ 01797 "vmovdqu %%" #src ", " #i "*8(%%" #addr ")\n\t" 01798 #define VMOVDQU_I(addr, i, src) \ 01799 _VMOVDQU_I(addr, i, src) 01800 01801 #define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ 01802 RND_0_1(a,b,c,d,e,f,g,h,i) \ 01803 /* W[-13]..W[-15], W[-12] */ \ 01804 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ 01805 /* W[-5]..W[-7], W[-4] */ \ 01806 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \ 01807 RND_0_2(a,b,c,d,e,f,g,h,i) \ 01808 RND_0_3(a,b,c,d,e,f,g,h,i) \ 01809 /* W_Y_M15 = W[-12]..W[-15] */ \ 01810 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ 01811 RND_0_4(a,b,c,d,e,f,g,h,i) \ 01812 /* W_Y_M7 = W[-4]..W[-7] */ \ 01813 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ 01814 RND_0_5(a,b,c,d,e,f,g,h,i) \ 01815 RND_0_6(a,b,c,d,e,f,g,h,i) \ 01816 /* W[-15] >> 1 */ \ 01817 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 01818 RND_0_7(a,b,c,d,e,f,g,h,i) \ 01819 /* W[-15] << 63 */ \ 01820 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 01821 RND_0_8(a,b,c,d,e,f,g,h,i) \ 01822 /* W[-15] >> 8 */ \ 01823 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 01824 RND_0_9(a,b,c,d,e,f,g,h,i) \ 01825 /* W[-15] << 56 */ \ 01826 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 01827 RND_0_10(a,b,c,d,e,f,g,h,i) \ 01828 /* W[-15] >>> 1 */ \ 01829 V_OR(YTMP1, YTMP2, YTMP1) \ 01830 
RND_0_11(a,b,c,d,e,f,g,h,i) \ 01831 /* W[-15] >>> 8 */ \ 01832 V_OR(YTMP3, YTMP4, YTMP3) \ 01833 RND_0_12(a,b,c,d,e,f,g,h,i) \ 01834 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 01835 /* W[-15] >> 7 */ \ 01836 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 01837 RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \ 01838 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ 01839 V_XOR(YTMP1, YTMP3, YTMP1) \ 01840 RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \ 01841 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ 01842 V_XOR(YTMP1, YTMP4, YTMP1) \ 01843 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 01844 /* W[0] = W[-16] + W[-7] */ \ 01845 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ 01846 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 01847 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ 01848 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 01849 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 01850 /* 0, 0, W[-1], W[-2] */ \ 01851 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ 01852 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 01853 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 01854 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 01855 /* W[-2] >> 19 */ \ 01856 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 01857 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 01858 /* W[-2] << 45 */ \ 01859 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 01860 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 01861 /* W[-2] >> 61 */ \ 01862 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 01863 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 01864 /* W[-2] << 3 */ \ 01865 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 01866 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 01867 RND_0_1(g,h,a,b,c,d,e,f,i+2) \ 01868 /* W[-2] >>> 19 */ \ 01869 V_OR(YTMP1, YTMP2, YTMP1) \ 01870 RND_0_2(g,h,a,b,c,d,e,f,i+2) \ 01871 /* W[-2] >>> 61 */ \ 01872 V_OR(YTMP3, YTMP4, YTMP3) \ 01873 RND_0_3(g,h,a,b,c,d,e,f,i+2) \ 01874 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 01875 V_XOR(YTMP1, YTMP3, YTMP1) \ 01876 RND_0_4(g,h,a,b,c,d,e,f,i+2) \ 01877 /* W[-2] >> 6 */ \ 01878 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 01879 RND_0_5(g,h,a,b,c,d,e,f,i+2) \ 01880 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 01881 V_XOR(YTMP1, YTMP4, YTMP1) \ 01882 RND_0_6(g,h,a,b,c,d,e,f,i+2) \ 01883 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 01884 
V_ADD(W_Y_0, W_Y_0, YTMP1) \ 01885 RND_0_7(g,h,a,b,c,d,e,f,i+2) \ 01886 RND_0_8(g,h,a,b,c,d,e,f,i+2) \ 01887 /* W[1], W[0], 0, 0 */ \ 01888 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ 01889 RND_0_9(g,h,a,b,c,d,e,f,i+2) \ 01890 RND_0_10(g,h,a,b,c,d,e,f,i+2) \ 01891 /* W[-2] >> 19 */ \ 01892 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 01893 RND_0_11(g,h,a,b,c,d,e,f,i+2) \ 01894 /* W[-2] << 45 */ \ 01895 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 01896 RND_0_12(g,h,a,b,c,d,e,f,i+2) \ 01897 RND_1_1(f,g,h,a,b,c,d,e,i+3) \ 01898 /* W[-2] >> 61 */ \ 01899 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 01900 RND_1_2(f,g,h,a,b,c,d,e,i+3) \ 01901 /* W[-2] << 3 */ \ 01902 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 01903 RND_1_3(f,g,h,a,b,c,d,e,i+3) \ 01904 /* W[-2] >>> 19 */ \ 01905 V_OR(YTMP1, YTMP2, YTMP1) \ 01906 RND_1_4(f,g,h,a,b,c,d,e,i+3) \ 01907 /* W[-2] >>> 61 */ \ 01908 V_OR(YTMP3, YTMP4, YTMP3) \ 01909 RND_1_5(f,g,h,a,b,c,d,e,i+3) \ 01910 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 01911 V_XOR(YTMP1, YTMP3, YTMP1) \ 01912 RND_1_6(f,g,h,a,b,c,d,e,i+3) \ 01913 /* W[-2] >> 6 */ \ 01914 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 01915 RND_1_7(f,g,h,a,b,c,d,e,i+3) \ 01916 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 01917 V_XOR(YTMP1, YTMP4, YTMP1) \ 01918 RND_1_8(f,g,h,a,b,c,d,e,i+3) \ 01919 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 01920 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 01921 RND_1_9(f,g,h,a,b,c,d,e,i+3) \ 01922 RND_1_10(f,g,h,a,b,c,d,e,i+3) \ 01923 RND_1_11(f,g,h,a,b,c,d,e,i+3) \ 01924 RND_1_12(f,g,h,a,b,c,d,e,i+3) \ 01925 01926 #define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \ 01927 RND_0_1(a,b,c,d,e,f,g,h,i) \ 01928 VPALIGNR(W_Y_M15, W_2, W_0, 8) \ 01929 VPALIGNR(W_Y_M7, W_10, W_8, 8) \ 01930 RND_0_2(a,b,c,d,e,f,g,h,i) \ 01931 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 01932 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 01933 RND_0_3(a,b,c,d,e,f,g,h,i) \ 01934 RND_0_4(a,b,c,d,e,f,g,h,i) \ 01935 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 01936 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 01937 RND_0_5(a,b,c,d,e,f,g,h,i) \ 01938 
RND_0_6(a,b,c,d,e,f,g,h,i) \ 01939 V_OR(YTMP1, YTMP2, YTMP1) \ 01940 V_OR(YTMP3, YTMP4, YTMP3) \ 01941 RND_0_7(a,b,c,d,e,f,g,h,i) \ 01942 RND_0_8(a,b,c,d,e,f,g,h,i) \ 01943 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 01944 V_XOR(YTMP1, YTMP3, YTMP1) \ 01945 RND_0_9(a,b,c,d,e,f,g,h,i) \ 01946 RND_0_10(a,b,c,d,e,f,g,h,i) \ 01947 V_XOR(YTMP1, YTMP4, YTMP1) \ 01948 V_ADD(W_0, W_0, W_Y_M7) \ 01949 RND_0_11(a,b,c,d,e,f,g,h,i) \ 01950 RND_0_12(a,b,c,d,e,f,g,h,i) \ 01951 RND_1_1(h,a,b,c,d,e,f,g,i+1) \ 01952 V_ADD(W_0, W_0, YTMP1) \ 01953 RND_1_2(h,a,b,c,d,e,f,g,i+1) \ 01954 V_SHIFT_R(YTMP1, W_14, 19) \ 01955 V_SHIFT_L(YTMP2, W_14, 45) \ 01956 RND_1_3(h,a,b,c,d,e,f,g,i+1) \ 01957 RND_1_4(h,a,b,c,d,e,f,g,i+1) \ 01958 V_SHIFT_R(YTMP3, W_14, 61) \ 01959 V_SHIFT_L(YTMP4, W_14, 3) \ 01960 RND_1_5(h,a,b,c,d,e,f,g,i+1) \ 01961 RND_1_6(h,a,b,c,d,e,f,g,i+1) \ 01962 RND_1_7(h,a,b,c,d,e,f,g,i+1) \ 01963 V_OR(YTMP1, YTMP2, YTMP1) \ 01964 V_OR(YTMP3, YTMP4, YTMP3) \ 01965 RND_1_8(h,a,b,c,d,e,f,g,i+1) \ 01966 RND_1_9(h,a,b,c,d,e,f,g,i+1) \ 01967 V_XOR(YTMP1, YTMP3, YTMP1) \ 01968 V_SHIFT_R(YTMP4, W_14, 6) \ 01969 RND_1_10(h,a,b,c,d,e,f,g,i+1) \ 01970 RND_1_11(h,a,b,c,d,e,f,g,i+1) \ 01971 V_XOR(YTMP1, YTMP4, YTMP1) \ 01972 RND_1_12(h,a,b,c,d,e,f,g,i+1) \ 01973 V_ADD(W_0, W_0, YTMP1) \ 01974 01975 #define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \ 01976 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ 01977 /* W[-13]..W[-15], W[-12] */ \ 01978 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \ 01979 /* W[-5]..W[-7], W[-4] */ \ 01980 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \ 01981 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ 01982 /* W_Y_M15 = W[-12]..W[-15] */ \ 01983 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \ 01984 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ 01985 /* W_Y_M7 = W[-4]..W[-7] */ \ 01986 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \ 01987 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ 01988 /* W[-15] >> 1 */ \ 01989 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 01990 /* W[-15] << 63 */ \ 01991 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 01992 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) 
\ 01993 /* W[-15] >> 8 */ \ 01994 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 01995 /* W[-15] << 56 */ \ 01996 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 01997 /* W[-15] >>> 1 */ \ 01998 V_OR(YTMP1, YTMP2, YTMP1) \ 01999 /* W[-15] >>> 8 */ \ 02000 V_OR(YTMP3, YTMP4, YTMP3) \ 02001 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \ 02002 /* W[-15] >> 7 */ \ 02003 V_SHIFT_R(YTMP4, W_Y_M15, 7) \ 02004 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \ 02005 /* 0, 0, W[-1], W[-2] */ \ 02006 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \ 02007 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \ 02008 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \ 02009 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \ 02010 V_XOR(YTMP1, YTMP3, YTMP1) \ 02011 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \ 02012 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \ 02013 V_XOR(YTMP1, YTMP4, YTMP1) \ 02014 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \ 02015 /* W[0] = W[-16] + W[-7] */ \ 02016 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \ 02017 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \ 02018 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 02019 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \ 02020 /* W[-2] >> 19 */ \ 02021 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 02022 /* W[-2] << 45 */ \ 02023 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 02024 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \ 02025 /* W[-2] >> 61 */ \ 02026 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 02027 /* W[-2] << 3 */ \ 02028 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 02029 /* W[-2] >>> 19 */ \ 02030 V_OR(YTMP1, YTMP2, YTMP1) \ 02031 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \ 02032 /* W[-2] >>> 61 */ \ 02033 V_OR(YTMP3, YTMP4, YTMP3) \ 02034 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \ 02035 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 02036 V_XOR(YTMP1, YTMP3, YTMP1) \ 02037 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \ 02038 /* W[-2] >> 6 */ \ 02039 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 02040 RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \ 02041 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 02042 V_XOR(YTMP1, YTMP4, YTMP1) \ 02043 RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \ 02044 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 02045 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 02046 
RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \ 02047 /* W[1], W[0], 0, 0 */ \ 02048 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \ 02049 RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \ 02050 RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \ 02051 /* W[-2] >> 19 */ \ 02052 V_SHIFT_R(YTMP1, W_Y_M2, 19) \ 02053 /* W[-2] << 45 */ \ 02054 V_SHIFT_L(YTMP2, W_Y_M2, 45) \ 02055 RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \ 02056 /* W[-2] >> 61 */ \ 02057 V_SHIFT_R(YTMP3, W_Y_M2, 61) \ 02058 /* W[-2] << 3 */ \ 02059 V_SHIFT_L(YTMP4, W_Y_M2, 3) \ 02060 /* W[-2] >>> 19 */ \ 02061 V_OR(YTMP1, YTMP2, YTMP1) \ 02062 RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \ 02063 /* W[-2] >>> 61 */ \ 02064 V_OR(YTMP3, YTMP4, YTMP3) \ 02065 RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \ 02066 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \ 02067 V_XOR(YTMP1, YTMP3, YTMP1) \ 02068 RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \ 02069 /* W[-2] >> 6 */ \ 02070 V_SHIFT_R(YTMP4, W_Y_M2, 6) \ 02071 RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \ 02072 RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \ 02073 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \ 02074 V_XOR(YTMP1, YTMP4, YTMP1) \ 02075 RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \ 02076 RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \ 02077 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \ 02078 V_ADD(W_Y_0, W_Y_0, YTMP1) \ 02079 RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \ 02080 V_ADD_I(YTMP1, W_Y_0, rsi, i) \ 02081 RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \ 02082 RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \ 02083 VMOVDQU_I(rsp, i, YTMP1) \ 02084 02085 #define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \ 02086 f,g,h,i) \ 02087 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \ 02088 VPALIGNR(W_Y_M15, W_2, W_0, 8) \ 02089 VPALIGNR(W_Y_M7, W_10, W_8, 8) \ 02090 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \ 02091 V_SHIFT_R(YTMP1, W_Y_M15, 1) \ 02092 V_SHIFT_L(YTMP2, W_Y_M15, 63) \ 02093 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \ 02094 V_SHIFT_R(YTMP3, W_Y_M15, 8) \ 02095 V_SHIFT_L(YTMP4, W_Y_M15, 56) \ 02096 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \ 02097 V_OR(YTMP1, YTMP2, YTMP1) \ 02098 V_OR(YTMP3, YTMP4, YTMP3) \ 
    RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
    /* W[-15] >> 7 */ \
    V_SHIFT_R(YTMP4, W_Y_M15, 7) \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
    /* s0 = (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    /* W[0] = W[-16] + W[-7] */ \
    V_ADD(W_0, W_0, W_Y_M7) \
    RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
    RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
    /* W[0] += s0 */ \
    V_ADD(W_0, W_0, YTMP1) \
    RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 19 (W_14 presumably holds W[-2..-1]; mirrors the
     * commented MsgSched4_AVX2_RORX_SET variant -- confirm) */ \
    V_SHIFT_R(YTMP1, W_14, 19) \
    /* W[-2] << 45 */ \
    V_SHIFT_L(YTMP2, W_14, 45) \
    RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >> 61 */ \
    V_SHIFT_R(YTMP3, W_14, 61) \
    /* W[-2] << 3 */ \
    V_SHIFT_L(YTMP4, W_14, 3) \
    RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
    /* W[-2] >>> 19 */ \
    V_OR(YTMP1, YTMP2, YTMP1) \
    /* W[-2] >>> 61 */ \
    V_OR(YTMP3, YTMP4, YTMP3) \
    RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
    V_XOR(YTMP1, YTMP3, YTMP1) \
    /* W[-2] >> 6 */ \
    V_SHIFT_R(YTMP4, W_14, 6) \
    RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
    RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
    /* s1 = (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
    V_XOR(YTMP1, YTMP4, YTMP1) \
    RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
    /* W[0] += s1 */ \
    V_ADD(W_0, W_0, YTMP1) \


/* Copy the byte-flip (endian swap) mask from memory into YMM register
 * 'mask'.  The extra underscore level forces full expansion of the
 * macro argument before # stringizing. */
#define _INIT_MASK_Y(mask) \
    "vmovdqu %[mask], %%"#mask"\n\t"
#define INIT_MASK_Y(mask) \
    _INIT_MASK_Y(mask)

/* Load into YMM registers and swap endian.
 */
/* Load 64 bytes of the input block at 'reg'+i into two YMM registers
 * and byte-swap each 64-bit word with the flip mask (SHA-512 message
 * words are big-endian in the input stream).  Underscore indirection
 * expands arguments before # stringizing. */
#define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \
    /* buffer[0..15] => ymm0..ymm3; */ \
    "vmovdqu " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t" \
    "vmovdqu " #i "+32(%%" #reg "), %%" #ymm1 "\n\t" \
    "vpshufb %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t" \
    "vpshufb %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t"

#define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \
    _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i)

/* Load all 16 message words W[0..15] (one 128-byte block) into
 * W_Y_0/W_Y_4/W_Y_8/W_Y_12. */
#define LOAD_BLOCK_W_Y(mask, reg) \
    LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \
    LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64)

/* Add the round constants at 'reg'+i to the schedule words ymm0/ymm1
 * (results in scratch regs ymm2/ymm3) and store the W+K sums into the
 * workspace addressed by WX. */
#define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
    "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t" \
    "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t" \
    "vmovdqu %%" #ymm2 ", " #i "+ 0(" WX ")\n\t" \
    "vmovdqu %%" #ymm3 ", " #i "+32(" WX ")\n\t"

#define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
    _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)

/* Compute W+K for all 16 schedule words into the WX workspace. */
#define SET_BLOCK_W_Y(reg) \
    SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \
    SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64)

/* Load into YMM registers and swap endian.
 */
/* Load 16 bytes each from two 128-byte blocks (offsets i and i+128 from
 * 'reg') and interleave: X8/X9 are inserted into the high 128-bit lane
 * of Y0/Y1, then both are byte-swapped.  The vmovdqu into X0/X1
 * supplies Y0/Y1's low lane -- presumably Xn aliases the low half of
 * Yn (xmm/ymm register pairing); confirm against the register macro
 * definitions. */
#define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
    "vmovdqu " #i "+ 0(%%" #reg "), %%" #X0 "\n\t" \
    "vmovdqu " #i "+ 16(%%" #reg "), %%" #X1 "\n\t" \
    "vmovdqu " #i "+128(%%" #reg "), %%" #X8 "\n\t" \
    "vmovdqu " #i "+144(%%" #reg "), %%" #X9 "\n\t" \
    "vinserti128 $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t" \
    "vinserti128 $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t" \
    "vpshufb %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t" \
    "vpshufb %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t"

#define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
    _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)

/* Load two full 128-byte blocks into Y0..Y7, lane-interleaved. */
#define LOAD_BLOCK2_W_Y(mask, reg) \
    LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \
    LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \
    LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \
    LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \

/* Compute W+K for both interleaved blocks' schedule words into the WX
 * workspace. */
#define SET_BLOCK2_W_Y(reg) \
    SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \
    SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \
    SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \
    SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192)

/* SHA-512 round constants K[0..79] (FIPS 180-4) with each constant
 * stored twice in adjacent slots, so a 32-byte vpaddq operand supplies
 * the same K[t] to both 128-bit lanes (i.e. to both interleaved
 * blocks).  160 entries = 80 constants x 2. */
static const word64 K512_AVX2[160] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x243185be4ee4b28c),
                                W64LIT(0x550c7dc3d5ffb4e2),
    /* (K512_AVX2 continued -- duplicated-pair layout) */
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xc24b8b70d0f89791),
                                W64LIT(0xc76c51a30654be30),
    /* (K512_AVX2 continued -- duplicated-pair layout) */
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x3c9ebe0a15c9bebc),
                                W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
/* The two-block message-schedule loops advance %rsi by 256 bytes
 * (32 entries) per iteration and stop here at entry 128; the final 16
 * rounds then consume the remaining 32 entries (16 duplicated Ks). */
static const word64* K512_AVX2_END = &K512_AVX2[128];

/* Compress the single 128-byte block at sha512 + 64 bytes (presumably
 * sha512->buffer -- offset not verifiable here) into the running
 * digest using AVX2.  The W+K workspace plus a loop counter live in
 * 136 bytes of stack scratch.  Always returns 0. */
static int Transform_Sha512_AVX2(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter and K512. */
        /* NOTE(review): the asm adjusts %rsp itself; assumes no
         * red-zone/frame conflicts -- confirm build flags. */
        "subq $136, %%rsp\n\t"
        "leaq 64(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        LOAD_BLOCK_W_Y(MASK_Y, rax)

        /* 4 iterations of the 16-round loop; counter kept just past
         * the 16 W slots in the workspace. */
        "movl $4, 16*8(" WX ")\n\t"
        /* binds the scalar K512 table (not K512_AVX2). */
        "leaq %[K512], %%rsi\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rsi\n\t"

        MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

        SET_BLOCK_W_Y(rsi)

        "subl $1, 16*8(" WX ")\n\t"
        "jne 1b\n\t"

        /* Final 16 rounds -- no more message scheduling needed. */
        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        STORE_ADD_DIGEST()

        "addq $136, %%rsp\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

/* Process 'len' bytes (a whole number of 128-byte blocks) from
 * sha512->data, two blocks per outer-loop iteration.  Advances
 * sha512->data as blocks are consumed.  Always returns 0. */
static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len)
{
    /* WC_SHA512_BLOCK_SIZE is a power of two (128), so this bit test
     * means "odd number of blocks": hash the first block alone so the
     * remainder is a multiple of 256 bytes. */
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        /* rcx = sha512->data (pointer stored at struct offset 224). */
        "movq 224(%[sha512]), %%rcx\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        "# Start of processing two blocks\n"
        "2:\n\t"

        "subq $1344, %%rsp\n\t"
        "leaq %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"

        LOAD_BLOCK2_W_Y(MASK_Y, rcx)

        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
        SET_BLOCK2_W_Y(rsi)
        MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
        MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
        MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
        MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
        MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
        MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
        MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq $256, %%rsi\n\t"
        "addq $256, %%rsp\n\t"
        /* loop until all scheduled K512_AVX2 entries (128) consumed */
        "cmpq %[K512_END], %%rsi\n\t"
        "jne 1b\n\t"

        SET_BLOCK2_W_Y(rsi)
        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        "subq $1024, %%rsp\n\t"

        /* First block's digest done; fold it in and store. */
        ADD_DIGEST()
        STORE_DIGEST()

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* e */
        "movq %%r12, " L1 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        /* Replay all 80 rounds against the second block's W+K values
         * already on the stack: 5 iterations x 16 rounds. */
        "movq $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
        RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
        RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
        RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq $256, %%rsp\n\t"
        "subq $1, %%rsi\n\t"
        "jnz 3b\n\t"

        ADD_DIGEST()

        /* Advance data pointer by the two blocks just processed. */
        "movq 224(%[sha512]), %%rcx\n\t"
        "addq $64, %%rsp\n\t"
        "addq $256, %%rcx\n\t"
        /* NOTE(review): ZF from this subl steers 'jnz 2b' below --
         * assumes the intervening moves do not touch EFLAGS. */
        "subl $256, %[len]\n\t"
        "movq %%rcx, 224(%[sha512])\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512_AVX2),
          [K512_END] "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

#ifdef HAVE_INTEL_RORX
/* Single-block AVX2 transform variant for CPUs with BMI2 (presumably
 * the round macros use the flag-free RORX rotate).  Same contract as
 * Transform_Sha512_AVX2; always returns 0. */
static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter.
 */
        "subq $136, %%rsp\n\t"
        "leaq 64(%[sha512]), " L2 "\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        LOAD_BLOCK_W_Y(MASK_Y, rcx)

        /* 4 iterations of the 16-round message-schedule loop. */
        "movl $4, 16*8(" WX ")\n\t"
        "leaq %[K512], %%rsi\n\t"
        /* b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* b ^ c */
        "xorq %%r10, " L4 "\n\t"

        SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq $128, %%rsi\n\t"

        MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
        MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
        MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

        "subl $1, 16*8(%%rsp)\n\t"
        "jnz 1b\n\t"

        /* Final 16 rounds. */
        RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4)
        RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8)
        RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12)
        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"
        "addq $136, %%rsp\n\t"

        STORE_ADD_DIGEST()

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512] "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}

/* RORX variant of Transform_Sha512_AVX2_Len: 'len' whole blocks from
 * sha512->data, two per iteration, odd leading block handled by the
 * single-block transform.  Always returns 0. */
static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len)
{
    /* Odd number of 128-byte blocks? Do the first one separately. */
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2_RORX(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        /* rax = sha512->data (pointer stored at struct offset 224). */
        "movq 224(%[sha512]), %%rax\n\t"

        INIT_MASK(MASK_Y)
        LOAD_DIGEST()

        "# Start of processing two blocks\n"
        /* outer loop: two 128-byte blocks per iteration */
        "2:\n\t"

        "subq $1344, %%rsp\n\t"
        "leaq %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"

        LOAD_BLOCK2_W_Y(MASK_Y, rax)

        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
        SET_BLOCK2_W_Y(rsi)
        MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
        MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
        MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
        MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
        MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
        MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
        MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
        MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq $256, %%rsi\n\t"
        "addq $256, %%rsp\n\t"
        "cmpq %[K512_END], %%rsi\n\t"
        "jne 1b\n\t"

        SET_BLOCK2_W_Y(rsi)
        /* Final 16 rounds of the first block. */
        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"
        "subq $1024, %%rsp\n\t"

        ADD_DIGEST()
        STORE_DIGEST()

        /* L4 = b */
        "movq %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq %%r10, " L4 "\n\t"

        /* Replay 80 rounds on the second block's stored W+K values:
         * 5 iterations x 16 rounds. */
        "movq $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
        RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
        RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
        RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq $256, %%rsp\n\t"
        "subq $1, %%rsi\n\t"
        "jnz 3b\n\t"

        /* Prev RND: h += Maj(a,b,c) */
        "addq " L3 ", %%r8\n\t"

        ADD_DIGEST()

        /* Advance data pointer past the two processed blocks. */
        "movq 224(%[sha512]), %%rax\n\t"
        "addq $64, %%rsp\n\t"
        "addq $256, %%rax\n\t"
        /* ZF from this subl steers the 'jnz 2b' below. */
        "subl $256, %[len]\n\t"
        "movq %%rax, 224(%[sha512])\n\t"

        STORE_DIGEST()

        "jnz 2b\n\t"

        :
        : [mask] "m" (mBYTE_FLIP_MASK_Y),
          [len] "m" (len),
          [sha512] "r" (sha512),
          [K512] "m" (K512_AVX2),
          [K512_END] "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
#endif /* HAVE_INTEL_RORX */
#endif /* HAVE_INTEL_AVX2 */

#endif /* WOLFSSL_SHA512 */


/* -------------------------------------------------------------------------- */
/* SHA384 */
/* -------------------------------------------------------------------------- */
#ifdef WOLFSSL_SHA384

#if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
    /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
#else

/* Reset the state to the SHA-384 initial hash values (FIPS 180-4,
 * section 5.3.4) and clear the buffered-byte and message-length
 * counters.  Returns 0, or BAD_FUNC_ARG when sha384 is NULL. */
static int InitSha384(wc_Sha384* sha384)
{
    if (sha384 == NULL) {
        return BAD_FUNC_ARG;
    }

    sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
    sha384->digest[1] = W64LIT(0x629a292a367cd507);
    sha384->digest[2] = W64LIT(0x9159015a3070dd17);
    sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
    sha384->digest[4] = W64LIT(0x67332667ffc00b31);
    sha384->digest[5] = W64LIT(0x8eb44a8768581511);
    sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
    sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);

    /* nothing buffered; loLen/hiLen presumably form the split 128-bit
     * message length -- matches their use as counters elsewhere. */
    sha384->buffLen = 0;
    sha384->loLen = 0;
    sha384->hiLen = 0;

    return 0;
}

/* Hash 'len' bytes of 'data' into the running SHA-384 state.  SHA-384
 * shares the SHA-512 compression function, so the state is forwarded
 * to Sha512Update.  Returns 0 on success or BAD_FUNC_ARG when sha384
 * is NULL or data is NULL with len > 0 (NULL data with len == 0 is a
 * valid no-op). */
int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
{
    if (sha384 == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    return Sha512Update((wc_Sha512*)sha384, data, len);
}


/* Copy the current internal digest words to 'hash' without applying
 * final padding/length processing ("raw" digest), converting to
 * big-endian byte order on little-endian targets.  Returns 0 or
 * BAD_FUNC_ARG on NULL arguments. */
int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
{
#ifdef LITTLE_ENDIAN_ORDER
    word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
#endif

    if (sha384 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#ifdef LITTLE_ENDIAN_ORDER
    ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
                       WC_SHA384_DIGEST_SIZE);
    XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
#else
    XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
#endif

    return 0;
}

/* Finalize the hash (padding/length handled by the shared Sha512Final,
 * which leaves the result in sha384->digest), copy
 * WC_SHA384_DIGEST_SIZE bytes to 'hash', then re-initialize the state
 * so the context can be reused.  Returns 0 on success, BAD_FUNC_ARG on
 * NULL arguments, or the Sha512Final error code. */
int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
{
    int ret;

    if (sha384 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
                                WC_SHA384_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final((wc_Sha512*)sha384);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);

    return InitSha384(sha384);  /* reset state */
}


/* Hardware
   Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
/* AVX builds: initialize the state and select the SHA-512/384
 * transform implementation (Sha512_SetTransform, presumably based on
 * detected CPU features).  NULL checking is delegated to InitSha384;
 * heap/devId are unused on this path. */
int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
{
    int ret = InitSha384(sha384);

    (void)heap;
    (void)devId;

    Sha512_SetTransform();

    return ret;
}
#else
/* Generic builds: record the heap hint, initialize the state, and set
 * up the async device context when async crypto is enabled.  Returns 0
 * on success or BAD_FUNC_ARG/init error code. */
int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
{
    int ret;

    if (sha384 == NULL) {
        return BAD_FUNC_ARG;
    }

    sha384->heap = heap;
    ret = InitSha384(sha384);
    if (ret != 0)
        return ret;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    /* no W schedule buffer cached yet */
    sha384->W = NULL;
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
                               sha384->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}
#endif
#endif /* WOLFSSL_IMX6_CAAM */

/* Convenience wrapper: init with no heap hint and no device. */
int wc_InitSha384(wc_Sha384* sha384)
{
    return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
}

/* Release resources held by the context: the cached W buffer (small
 * stack cache builds) and the async device context.  Safe on NULL. */
void wc_Sha384Free(wc_Sha384* sha384)
{
    if (sha384 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha384->W != NULL) {
        XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        sha384->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* WOLFSSL_SHA384 */

#endif /* HAVE_FIPS */

#ifdef WOLFSSL_SHA512

/* Peek at the current digest: finalize a temporary copy of the state
 * so the running hash is left undisturbed.  Returns 0 on success or
 * BAD_FUNC_ARG on NULL arguments. */
int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
{
    int ret;
    wc_Sha512 tmpSha512;

    if (sha512 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha512Copy(sha512, &tmpSha512);
    if (ret == 0) {
        ret =
              wc_Sha512Final(&tmpSha512, hash);
        wc_Sha512Free(&tmpSha512);
    }
    return ret;
}

/* Duplicate a SHA-512 context with a shallow struct copy, then detach
 * the cached W buffer pointer so the copy does not alias the source's
 * buffer (presumably to avoid a double free on wc_Sha512Free), and
 * copy the async device state.  Returns 0 or BAD_FUNC_ARG on NULL. */
int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha512));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}

#endif /* WOLFSSL_SHA512 */

#ifdef WOLFSSL_SHA384

/* Peek at the current SHA-384 digest via a finalized temporary copy;
 * the running state is left undisturbed.  Returns 0 or BAD_FUNC_ARG on
 * NULL arguments. */
int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
{
    int ret;
    wc_Sha384 tmpSha384;

    if (sha384 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha384Copy(sha384, &tmpSha384);
    if (ret == 0) {
        ret = wc_Sha384Final(&tmpSha384, hash);
        wc_Sha384Free(&tmpSha384);
    }
    return ret;
}

/* Duplicate a SHA-384 context; see wc_Sha512Copy for the W-cache and
 * async handling rationale.  Returns 0 or BAD_FUNC_ARG on NULL. */
int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha384));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}

#endif /* WOLFSSL_SHA384 */

#endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
Generated on Tue Jul 12 2022 16:58:07 by doxygen 1.7.2