Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
sha256.c
00001 /* sha256.c 00002 * 00003 * Copyright (C) 2006-2017 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 */ 00021 00022 00023 /* code submitted by raphael.huck@efixo.com */ 00024 00025 #ifdef HAVE_CONFIG_H 00026 #include <config.h> 00027 #endif 00028 00029 #include <wolfcrypt/settings.h> 00030 00031 #if !defined(NO_SHA256) && !defined(WOLFSSL_ARMASM) 00032 00033 #if defined(HAVE_FIPS) && \ 00034 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2) 00035 00036 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */ 00037 #define FIPS_NO_WRAPPERS 00038 00039 #ifdef USE_WINDOWS_API 00040 #pragma code_seg(".fipsA$d") 00041 #pragma const_seg(".fipsB$d") 00042 #endif 00043 #endif 00044 00045 #include <wolfcrypt/sha256.h> 00046 #include <wolfcrypt/error-crypt.h> 00047 #include <wolfcrypt/cpuid.h> 00048 00049 /* fips wrapper calls, user can call direct */ 00050 #if defined(HAVE_FIPS) && \ 00051 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2)) 00052 00053 int wc_InitSha256(wc_Sha256* sha) 00054 { 00055 if (sha == NULL) { 00056 return BAD_FUNC_ARG; 00057 } 00058 return InitSha256_fips(sha); 00059 } 00060 int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId) 00061 { 00062 (void)heap; 00063 (void)devId; 00064 if (sha == NULL) { 00065 return BAD_FUNC_ARG; 00066 } 00067 return InitSha256_fips(sha); 00068 } 00069 int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len) 00070 { 00071 if (sha == NULL || (data == NULL && len > 0)) { 00072 return BAD_FUNC_ARG; 00073 } 00074 00075 if (data == NULL && len == 0) { 00076 /* valid, but do nothing */ 00077 return 0; 00078 } 00079 00080 return Sha256Update_fips(sha, data, len); 00081 } 00082 int wc_Sha256Final(wc_Sha256* sha, byte* out) 00083 { 00084 if (sha == NULL || out == NULL) { 00085 return BAD_FUNC_ARG; 00086 } 00087 return Sha256Final_fips(sha, out); 00088 } 00089 void wc_Sha256Free(wc_Sha256* sha) 00090 { 00091 (void)sha; 00092 /* Not supported in FIPS */ 00093 } 00094 00095 #else /* else build without fips, or for FIPS v2 */ 00096 00097 00098 #if defined(WOLFSSL_TI_HASH) 00099 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */ 00100 #else 00101 00102 #include <wolfcrypt/logging.h> 00103 00104 #ifdef NO_INLINE 00105 #include <wolfcrypt/misc.h> 00106 #else 00107 #define WOLFSSL_MISC_INCLUDED 00108 #include <wolfcrypt/src/misc.c> 00109 #endif 00110 00111 00112 #if defined(USE_INTEL_SPEEDUP) 00113 #define HAVE_INTEL_AVX1 00114 00115 #if defined(__GNUC__) && ((__GNUC__ < 4) || \ 00116 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) 00117 #define NO_AVX2_SUPPORT 00118 #endif 00119 #if defined(__clang__) && ((__clang_major__ < 3) || \ 00120 (__clang_major__ == 3 && __clang_minor__ <= 5)) 00121 #define NO_AVX2_SUPPORT 00122 
#elif defined(__clang__) && defined(NO_AVX2_SUPPORT) 00123 #undef NO_AVX2_SUPPORT 00124 #endif 00125 00126 #define HAVE_INTEL_AVX1 00127 #ifndef NO_AVX2_SUPPORT 00128 #define HAVE_INTEL_AVX2 00129 #endif 00130 #endif /* USE_INTEL_SPEEDUP */ 00131 00132 #if defined(HAVE_INTEL_AVX2) 00133 #define HAVE_INTEL_RORX 00134 #endif 00135 00136 00137 #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \ 00138 (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH)) 00139 static int InitSha256(wc_Sha256* sha256) 00140 { 00141 int ret = 0; 00142 00143 if (sha256 == NULL) 00144 return BAD_FUNC_ARG; 00145 00146 XMEMSET(sha256->digest, 0, sizeof(sha256->digest)); 00147 sha256->digest[0] = 0x6A09E667L; 00148 sha256->digest[1] = 0xBB67AE85L; 00149 sha256->digest[2] = 0x3C6EF372L; 00150 sha256->digest[3] = 0xA54FF53AL; 00151 sha256->digest[4] = 0x510E527FL; 00152 sha256->digest[5] = 0x9B05688CL; 00153 sha256->digest[6] = 0x1F83D9ABL; 00154 sha256->digest[7] = 0x5BE0CD19L; 00155 00156 sha256->buffLen = 0; 00157 sha256->loLen = 0; 00158 sha256->hiLen = 0; 00159 00160 return ret; 00161 } 00162 #endif 00163 00164 00165 /* Hardware Acceleration */ 00166 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00167 00168 /* in case intel instructions aren't available, plus we need the K[] global */ 00169 #define NEED_SOFT_SHA256 00170 00171 /***** 00172 Intel AVX1/AVX2 Macro Control Structure 00173 00174 #define HAVE_INTEL_AVX1 00175 #define HAVE_INTEL_AVX2 00176 00177 #define HAVE_INTEL_RORX 00178 00179 00180 int InitSha256(wc_Sha256* sha256) { 00181 Save/Recover XMM, YMM 00182 ... 00183 } 00184 00185 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 00186 Transform_Sha256(); Function prototype 00187 #else 00188 Transform_Sha256() { } 00189 int Sha256Final() { 00190 Save/Recover XMM, YMM 00191 ... 00192 } 00193 #endif 00194 00195 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 00196 #if defined(HAVE_INTEL_RORX 00197 #define RND with rorx instuction 00198 #else 00199 #define RND 00200 #endif 00201 #endif 00202 00203 #if defined(HAVE_INTEL_AVX1) 00204 00205 #define XMM Instructions/inline asm 00206 00207 int Transform_Sha256() { 00208 Stitched Message Sched/Round 00209 } 00210 00211 #elif defined(HAVE_INTEL_AVX2) 00212 00213 #define YMM Instructions/inline asm 00214 00215 int Transform_Sha256() { 00216 More granural Stitched Message Sched/Round 00217 } 00218 00219 #endif 00220 00221 */ 00222 00223 /* Each platform needs to query info type 1 from cpuid to see if aesni is 00224 * supported. 
Also, let's setup a macro for proper linkage w/o ABI conflicts 00225 */ 00226 00227 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */ 00228 static int Transform_Sha256(wc_Sha256* sha256); 00229 #if defined(HAVE_INTEL_AVX1) 00230 static int Transform_Sha256_AVX1(wc_Sha256 *sha256); 00231 static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, word32 len); 00232 #endif 00233 #if defined(HAVE_INTEL_AVX2) 00234 static int Transform_Sha256_AVX2(wc_Sha256 *sha256); 00235 static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, word32 len); 00236 #ifdef HAVE_INTEL_RORX 00237 static int Transform_Sha256_AVX1_RORX(wc_Sha256 *sha256); 00238 static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, word32 len); 00239 static int Transform_Sha256_AVX2_RORX(wc_Sha256 *sha256); 00240 static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, word32 len); 00241 #endif 00242 #endif 00243 static int (*Transform_Sha256_p)(wc_Sha256* sha256); 00244 /* = _Transform_Sha256 */ 00245 static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, word32 len); 00246 /* = NULL */ 00247 static int transform_check = 0; 00248 static word32 intel_flags; 00249 #define XTRANSFORM(S) (*Transform_Sha256_p)((S)) 00250 #define XTRANSFORM_LEN(S, L) (*Transform_Sha256_Len_p)((S),(L)) 00251 00252 static void Sha256_SetTransform(void) 00253 { 00254 00255 if (transform_check) 00256 return; 00257 00258 intel_flags = cpuid_get_flags(); 00259 00260 #ifdef HAVE_INTEL_AVX2 00261 if (IS_INTEL_AVX2(intel_flags)) { 00262 #ifdef HAVE_INTEL_RORX 00263 if (IS_INTEL_BMI2(intel_flags)) { 00264 Transform_Sha256_p = Transform_Sha256_AVX2_RORX; 00265 Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len; 00266 } 00267 else 00268 #endif 00269 if (1) 00270 { 00271 Transform_Sha256_p = Transform_Sha256_AVX2; 00272 Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len; 00273 } 00274 #ifdef HAVE_INTEL_RORX 00275 else { 00276 Transform_Sha256_p = Transform_Sha256_AVX1_RORX; 00277 Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len; 00278 } 00279 #endif 00280 } 00281 else 00282 #endif 00283 #ifdef HAVE_INTEL_AVX1 00284 if (IS_INTEL_AVX1(intel_flags)) { 00285 Transform_Sha256_p = Transform_Sha256_AVX1; 00286 Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len; 00287 } 00288 else 00289 #endif 00290 { 00291 Transform_Sha256_p = Transform_Sha256; 00292 Transform_Sha256_Len_p = NULL; 00293 } 00294 00295 transform_check = 1; 00296 } 00297 00298 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) 00299 { 00300 int ret = 0; 00301 if (sha256 == NULL) 00302 return BAD_FUNC_ARG; 00303 00304 sha256->heap = heap; 00305 00306 ret = InitSha256(sha256); 00307 if (ret != 0) 00308 return ret; 00309 00310 /* choose best Transform function under this runtime environment */ 00311 Sha256_SetTransform(); 00312 00313 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00314 ret = wolfAsync_DevCtxInit(&sha256->asyncDev, 00315 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId); 00316 #else 00317 (void)devId; 00318 #endif /* WOLFSSL_ASYNC_CRYPT */ 00319 00320 return ret; 00321 } 00322 00323 #elif defined(FREESCALE_LTC_SHA) 00324 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) 00325 { 00326 (void)heap; 00327 (void)devId; 00328 00329 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0); 00330 00331 return 0; 00332 } 00333 00334 #elif defined(FREESCALE_MMCAU_SHA) 00335 00336 #ifdef FREESCALE_MMCAU_CLASSIC_SHA 00337 #include "cau_api.h" 00338 #else 00339 #include "fsl_mmcau.h" 00340 #endif 00341 00342 #define XTRANSFORM(S) 
Transform_Sha256((S)) 00343 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) 00344 00345 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) 00346 { 00347 int ret = 0; 00348 00349 (void)heap; 00350 (void)devId; 00351 00352 ret = wolfSSL_CryptHwMutexLock(); 00353 if (ret != 0) { 00354 return ret; 00355 } 00356 #ifdef FREESCALE_MMCAU_CLASSIC_SHA 00357 cau_sha256_initialize_output(sha256->digest); 00358 #else 00359 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest); 00360 #endif 00361 wolfSSL_CryptHwMutexUnLock(); 00362 00363 sha256->buffLen = 0; 00364 sha256->loLen = 0; 00365 sha256->hiLen = 0; 00366 00367 return ret; 00368 } 00369 00370 static int Transform_Sha256(wc_Sha256* sha256) 00371 { 00372 int ret = wolfSSL_CryptHwMutexLock(); 00373 if (ret == 0) { 00374 #ifdef FREESCALE_MMCAU_CLASSIC_SHA 00375 cau_sha256_hash_n((byte*)sha256->buffer, 1, sha256->digest); 00376 #else 00377 MMCAU_SHA256_HashN((byte*)sha256->buffer, 1, sha256->digest); 00378 #endif 00379 wolfSSL_CryptHwMutexUnLock(); 00380 } 00381 return ret; 00382 } 00383 00384 #elif defined(WOLFSSL_PIC32MZ_HASH) 00385 #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h> 00386 00387 #elif defined(STM32_HASH_SHA2) 00388 00389 /* Supports CubeMX HAL or Standard Peripheral Library */ 00390 00391 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) 00392 { 00393 if (sha256 == NULL) 00394 return BAD_FUNC_ARG; 00395 00396 (void)devId; 00397 (void)heap; 00398 00399 wc_Stm32_Hash_Init(&sha256->stmCtx); 00400 return 0; 00401 } 00402 00403 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) 00404 { 00405 int ret = 0; 00406 00407 if (sha256 == NULL || (data == NULL && len > 0)) { 00408 return BAD_FUNC_ARG; 00409 } 00410 00411 ret = wolfSSL_CryptHwMutexLock(); 00412 if (ret == 0) { 00413 ret = wc_Stm32_Hash_Update(&sha256->stmCtx, 00414 HASH_AlgoSelection_SHA256, data, len); 00415 wolfSSL_CryptHwMutexUnLock(); 00416 } 00417 return ret; 00418 } 00419 00420 int wc_Sha256Final(wc_Sha256* sha256, byte* hash) 00421 { 00422 int ret = 0; 00423 00424 if (sha256 == NULL || hash == NULL) { 00425 return BAD_FUNC_ARG; 00426 } 00427 00428 ret = wolfSSL_CryptHwMutexLock(); 00429 if (ret == 0) { 00430 ret = wc_Stm32_Hash_Final(&sha256->stmCtx, 00431 HASH_AlgoSelection_SHA256, hash, WC_SHA256_DIGEST_SIZE); 00432 wolfSSL_CryptHwMutexUnLock(); 00433 } 00434 00435 (void)wc_InitSha256(sha256); /* reset state */ 00436 00437 return ret; 00438 } 00439 00440 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 00441 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */ 00442 #else 00443 #define NEED_SOFT_SHA256 00444 00445 int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId) 00446 { 00447 int ret = 0; 00448 if (sha256 == NULL) 00449 return BAD_FUNC_ARG; 00450 00451 sha256->heap = heap; 00452 00453 ret = InitSha256(sha256); 00454 if (ret != 0) 00455 return ret; 00456 00457 #ifdef WOLFSSL_SMALL_STACK_CACHE 00458 sha256->W = NULL; 00459 #endif 00460 00461 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00462 ret = wolfAsync_DevCtxInit(&sha256->asyncDev, 00463 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId); 00464 #else 00465 (void)devId; 00466 #endif /* WOLFSSL_ASYNC_CRYPT */ 00467 00468 return ret; 00469 } 00470 #endif /* End Hardware Acceleration */ 00471 00472 #ifdef NEED_SOFT_SHA256 00473 00474 static const ALIGN32 word32 K[64] = { 00475 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 00476 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 
0x12835B01L, 00477 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 00478 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 00479 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, 00480 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, 00481 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 00482 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 00483 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, 00484 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, 00485 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 00486 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 00487 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L 00488 }; 00489 00490 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) 00491 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) 00492 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n)) 00493 00494 #define S(x, n) rotrFixed(x, n) 00495 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) 00496 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) 00497 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) 00498 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) 00499 00500 #define a(i) S[(0-i) & 7] 00501 #define b(i) S[(1-i) & 7] 00502 #define c(i) S[(2-i) & 7] 00503 #define d(i) S[(3-i) & 7] 00504 #define e(i) S[(4-i) & 7] 00505 #define f(i) S[(5-i) & 7] 00506 #define g(i) S[(6-i) & 7] 00507 #define h(i) S[(7-i) & 7] 00508 00509 #define RND(j) \ 00510 t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + W[i+j]; \ 00511 t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \ 00512 d(j) += t0; \ 00513 h(j) = t0 + t1 00514 00515 #ifndef XTRANSFORM 00516 #define XTRANSFORM(S) Transform_Sha256((S)) 00517 #define XTRANSFORM_LEN(S,L) Transform_Sha256_Len((S),(L)) 00518 #endif 00519 00520 static int Transform_Sha256(wc_Sha256* sha256) 00521 { 00522 word32 S[8], t0, t1; 00523 int i; 00524 00525 #ifdef WOLFSSL_SMALL_STACK_CACHE 00526 word32* W = sha256->W; 00527 if (W == NULL) { 00528 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, 00529 DYNAMIC_TYPE_RNG); 00530 if (W == NULL) 00531 return MEMORY_E; 00532 sha256->W = W; 00533 } 00534 #elif defined(WOLFSSL_SMALL_STACK) 00535 word32* W; 00536 W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL, 00537 DYNAMIC_TYPE_TMP_BUFFER); 00538 if (W == NULL) 00539 return MEMORY_E; 00540 #else 00541 word32 W[WC_SHA256_BLOCK_SIZE]; 00542 #endif 00543 00544 /* Copy context->state[] to working vars */ 00545 for (i = 0; i < 8; i++) 00546 S[i] = sha256->digest[i]; 00547 00548 for (i = 0; i < 16; i++) 00549 W[i] = sha256->buffer[i]; 00550 00551 for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++) 00552 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; 00553 00554 #ifdef USE_SLOW_SHA256 00555 /* not unrolled - ~2k smaller and ~25% slower */ 00556 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { 00557 int j; 00558 for (j = 0; j < 8; j++) { /* braces needed here for macros {} */ 00559 RND(j); 00560 } 00561 } 00562 #else 00563 /* partially loop unrolled */ 00564 for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) { 00565 RND(0); RND(1); RND(2); RND(3); 00566 RND(4); RND(5); RND(6); RND(7); 00567 } 00568 #endif /* USE_SLOW_SHA256 */ 00569 00570 /* Add the working vars back into digest state[] */ 00571 for (i = 0; i < 8; i++) { 00572 sha256->digest[i] += S[i]; 00573 } 00574 00575 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE) 00576 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00577 #endif 
00578 return 0; 00579 } 00580 #endif 00581 /* End wc_ software implementation */ 00582 00583 00584 #ifdef XTRANSFORM 00585 00586 static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len) 00587 { 00588 word32 tmp = sha256->loLen; 00589 if ((sha256->loLen += len) < tmp) 00590 sha256->hiLen++; /* carry low to high */ 00591 } 00592 00593 static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) 00594 { 00595 int ret = 0; 00596 byte* local; 00597 00598 if (sha256 == NULL || (data == NULL && len > 0)) { 00599 return BAD_FUNC_ARG; 00600 } 00601 00602 if (data == NULL && len == 0) { 00603 /* valid, but do nothing */ 00604 return 0; 00605 } 00606 00607 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00608 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) { 00609 #if defined(HAVE_INTEL_QA) 00610 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len); 00611 #endif 00612 } 00613 #endif /* WOLFSSL_ASYNC_CRYPT */ 00614 00615 /* do block size increments */ 00616 local = (byte*)sha256->buffer; 00617 00618 /* check that internal buffLen is valid */ 00619 if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE) 00620 return BUFFER_E; 00621 00622 if (sha256->buffLen > 0) { 00623 word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen); 00624 XMEMCPY(&local[sha256->buffLen], data, add); 00625 00626 sha256->buffLen += add; 00627 data += add; 00628 len -= add; 00629 00630 if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) { 00631 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00632 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00633 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00634 #endif 00635 { 00636 ByteReverseWords(sha256->buffer, sha256->buffer, 00637 WC_SHA256_BLOCK_SIZE); 00638 } 00639 #endif 00640 ret = XTRANSFORM(sha256); 00641 if (ret == 0) { 00642 AddLength(sha256, WC_SHA256_BLOCK_SIZE); 00643 sha256->buffLen = 0; 00644 } 00645 else 00646 len = 0; 00647 } 00648 } 00649 00650 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00651 if (Transform_Sha256_Len_p != NULL) { 00652 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 00653 00654 if (blocksLen > 0) { 00655 AddLength(sha256, blocksLen); 00656 sha256->data = data; 00657 /* Byte reversal performed in function if required. */ 00658 XTRANSFORM_LEN(sha256, blocksLen); 00659 data += blocksLen; 00660 len -= blocksLen; 00661 } 00662 } 00663 else 00664 #endif 00665 #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \ 00666 defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00667 { 00668 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 00669 00670 AddLength(sha256, blocksLen); 00671 while (len >= WC_SHA256_BLOCK_SIZE) { 00672 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); 00673 00674 data += WC_SHA256_BLOCK_SIZE; 00675 len -= WC_SHA256_BLOCK_SIZE; 00676 00677 /* Byte reversal performed in function if required. 
*/ 00678 ret = XTRANSFORM(sha256); 00679 if (ret != 0) 00680 break; 00681 } 00682 } 00683 #else 00684 { 00685 word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1); 00686 00687 AddLength(sha256, blocksLen); 00688 while (len >= WC_SHA256_BLOCK_SIZE) { 00689 XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE); 00690 00691 data += WC_SHA256_BLOCK_SIZE; 00692 len -= WC_SHA256_BLOCK_SIZE; 00693 00694 ByteReverseWords(sha256->buffer, sha256->buffer, 00695 WC_SHA256_BLOCK_SIZE); 00696 ret = XTRANSFORM(sha256); 00697 if (ret != 0) 00698 break; 00699 } 00700 } 00701 #endif 00702 00703 if (len > 0) { 00704 XMEMCPY(local, data, len); 00705 sha256->buffLen = len; 00706 } 00707 00708 return ret; 00709 } 00710 00711 int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len) 00712 { 00713 return Sha256Update(sha256, data, len); 00714 } 00715 00716 static WC_INLINE int Sha256Final(wc_Sha256* sha256) 00717 { 00718 00719 int ret; 00720 byte* local = (byte*)sha256->buffer; 00721 00722 if (sha256 == NULL) { 00723 return BAD_FUNC_ARG; 00724 } 00725 00726 AddLength(sha256, sha256->buffLen); /* before adding pads */ 00727 local[sha256->buffLen++] = 0x80; /* add 1 */ 00728 00729 /* pad with zeros */ 00730 if (sha256->buffLen > WC_SHA256_PAD_SIZE) { 00731 XMEMSET(&local[sha256->buffLen], 0, 00732 WC_SHA256_BLOCK_SIZE - sha256->buffLen); 00733 sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen; 00734 00735 { 00736 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00737 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00738 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00739 #endif 00740 { 00741 ByteReverseWords(sha256->buffer, sha256->buffer, 00742 WC_SHA256_BLOCK_SIZE); 00743 } 00744 #endif 00745 } 00746 00747 ret = XTRANSFORM(sha256); 00748 if (ret != 0) 00749 return ret; 00750 00751 sha256->buffLen = 0; 00752 } 00753 XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen); 00754 00755 /* put lengths in bits */ 00756 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) + 00757 (sha256->hiLen << 3); 00758 sha256->loLen = sha256->loLen << 3; 00759 00760 /* store lengths */ 00761 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00762 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00763 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags)) 00764 #endif 00765 { 00766 ByteReverseWords(sha256->buffer, sha256->buffer, 00767 WC_SHA256_BLOCK_SIZE); 00768 } 00769 #endif 00770 /* ! length ordering dependent on digest endian type ! 
*/ 00771 XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); 00772 XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, 00773 sizeof(word32)); 00774 00775 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \ 00776 defined(HAVE_INTEL_AVX2) 00777 /* Kinetis requires only these bytes reversed */ 00778 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00779 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags)) 00780 #endif 00781 { 00782 ByteReverseWords( 00783 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)], 00784 &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)], 00785 2 * sizeof(word32)); 00786 } 00787 #endif 00788 00789 return XTRANSFORM(sha256); 00790 } 00791 00792 int wc_Sha256FinalRaw(wc_Sha256* sha256, byte* hash) 00793 { 00794 #ifdef LITTLE_ENDIAN_ORDER 00795 word32 digest[WC_SHA256_DIGEST_SIZE / sizeof(word32)]; 00796 #endif 00797 00798 if (sha256 == NULL || hash == NULL) { 00799 return BAD_FUNC_ARG; 00800 } 00801 00802 #ifdef LITTLE_ENDIAN_ORDER 00803 ByteReverseWords((word32*)digest, (word32*)sha256->digest, 00804 WC_SHA256_DIGEST_SIZE); 00805 XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE); 00806 #else 00807 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); 00808 #endif 00809 00810 return 0; 00811 } 00812 00813 int wc_Sha256Final(wc_Sha256* sha256, byte* hash) 00814 { 00815 int ret; 00816 00817 if (sha256 == NULL || hash == NULL) { 00818 return BAD_FUNC_ARG; 00819 } 00820 00821 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00822 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) { 00823 #if defined(HAVE_INTEL_QA) 00824 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL, 00825 WC_SHA256_DIGEST_SIZE); 00826 #endif 00827 } 00828 #endif /* WOLFSSL_ASYNC_CRYPT */ 00829 00830 ret = Sha256Final(sha256); 00831 if (ret != 0) 00832 return ret; 00833 00834 #if defined(LITTLE_ENDIAN_ORDER) 00835 ByteReverseWords(sha256->digest, sha256->digest, WC_SHA256_DIGEST_SIZE); 00836 #endif 00837 XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE); 00838 00839 return InitSha256(sha256); /* reset state */ 00840 } 00841 00842 #endif /* XTRANSFORM */ 00843 00844 00845 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00846 00847 #define _LOAD_DIGEST() \ 00848 "movl (%[sha256]), %%r8d \n\t" \ 00849 "movl 4(%[sha256]), %%r9d \n\t" \ 00850 "movl 8(%[sha256]), %%r10d\n\t" \ 00851 "movl 12(%[sha256]), %%r11d\n\t" \ 00852 "movl 16(%[sha256]), %%r12d\n\t" \ 00853 "movl 20(%[sha256]), %%r13d\n\t" \ 00854 "movl 24(%[sha256]), %%r14d\n\t" \ 00855 "movl 28(%[sha256]), %%r15d\n\t" 00856 00857 #define _STORE_ADD_DIGEST() \ 00858 "addl %%r8d , (%[sha256])\n\t" \ 00859 "addl %%r9d , 4(%[sha256])\n\t" \ 00860 "addl %%r10d, 8(%[sha256])\n\t" \ 00861 "addl %%r11d, 12(%[sha256])\n\t" \ 00862 "addl %%r12d, 16(%[sha256])\n\t" \ 00863 "addl %%r13d, 20(%[sha256])\n\t" \ 00864 "addl %%r14d, 24(%[sha256])\n\t" \ 00865 "addl %%r15d, 28(%[sha256])\n\t" 00866 00867 #define _ADD_DIGEST() \ 00868 "addl (%[sha256]), %%r8d \n\t" \ 00869 "addl 4(%[sha256]), %%r9d \n\t" \ 00870 "addl 8(%[sha256]), %%r10d\n\t" \ 00871 "addl 12(%[sha256]), %%r11d\n\t" \ 00872 "addl 16(%[sha256]), %%r12d\n\t" \ 00873 "addl 20(%[sha256]), %%r13d\n\t" \ 00874 "addl 24(%[sha256]), %%r14d\n\t" \ 00875 "addl 28(%[sha256]), %%r15d\n\t" 00876 00877 #define _STORE_DIGEST() \ 00878 "movl %%r8d , (%[sha256])\n\t" \ 00879 "movl %%r9d , 4(%[sha256])\n\t" \ 00880 "movl %%r10d, 8(%[sha256])\n\t" \ 00881 "movl %%r11d, 12(%[sha256])\n\t" \ 00882 "movl %%r12d, 
16(%[sha256])\n\t" \ 00883 "movl %%r13d, 20(%[sha256])\n\t" \ 00884 "movl %%r14d, 24(%[sha256])\n\t" \ 00885 "movl %%r15d, 28(%[sha256])\n\t" 00886 00887 #define LOAD_DIGEST() \ 00888 _LOAD_DIGEST() 00889 00890 #define STORE_ADD_DIGEST() \ 00891 _STORE_ADD_DIGEST() 00892 00893 #define ADD_DIGEST() \ 00894 _ADD_DIGEST() 00895 00896 #define STORE_DIGEST() \ 00897 _STORE_DIGEST() 00898 00899 00900 #define S_0 %r8d 00901 #define S_1 %r9d 00902 #define S_2 %r10d 00903 #define S_3 %r11d 00904 #define S_4 %r12d 00905 #define S_5 %r13d 00906 #define S_6 %r14d 00907 #define S_7 %r15d 00908 00909 #define L1 "%%edx" 00910 #define L2 "%%ecx" 00911 #define L3 "%%eax" 00912 #define L4 "%%ebx" 00913 #define WK "%%rsp" 00914 00915 #define WORK_REGS "eax", "ebx", "ecx", "edx" 00916 #define STATE_REGS "r8","r9","r10","r11","r12","r13","r14","r15" 00917 #define XMM_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", \ 00918 "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13" 00919 00920 #if defined(HAVE_INTEL_RORX) 00921 #define RND_STEP_RORX_0_1(a, b, c, d, e, f, g, h, i) \ 00922 /* L3 = f */ \ 00923 "movl %" #f ", " L3 "\n\t" \ 00924 /* L2 = e>>>11 */ \ 00925 "rorx $11, %" #e ", " L2 "\n\t" \ 00926 /* h += w_k */ \ 00927 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 00928 00929 #define RND_STEP_RORX_0_2(a, b, c, d, e, f, g, h, i) \ 00930 /* L2 = (e>>>6) ^ (e>>>11) */ \ 00931 "xorl " L1 ", " L2 "\n\t" \ 00932 /* L3 = f ^ g */ \ 00933 "xorl %" #g ", " L3 "\n\t" \ 00934 /* L1 = e>>>25 */ \ 00935 "rorx $25, %" #e ", " L1 "\n\t" \ 00936 00937 #define RND_STEP_RORX_0_3(a, b, c, d, e, f, g, h, i) \ 00938 /* L3 = (f ^ g) & e */ \ 00939 "andl %" #e ", " L3 "\n\t" \ 00940 /* L1 = Sigma1(e) */ \ 00941 "xorl " L2 ", " L1 "\n\t" \ 00942 /* L2 = a>>>13 */ \ 00943 "rorx $13, %" #a ", " L2 "\n\t" \ 00944 00945 #define RND_STEP_RORX_0_4(a, b, c, d, e, f, g, h, i) \ 00946 /* h += Sigma1(e) */ \ 00947 "addl " L1 ", %" #h "\n\t" \ 00948 /* L1 = a>>>2 */ \ 00949 "rorx $2, %" #a ", " L1 "\n\t" \ 00950 /* L3 = Ch(e,f,g) */ \ 00951 "xorl %" #g ", " L3 "\n\t" \ 00952 00953 #define RND_STEP_RORX_0_5(a, b, c, d, e, f, g, h, i) \ 00954 /* L2 = (a>>>2) ^ (a>>>13) */ \ 00955 "xorl " L1 ", " L2 "\n\t" \ 00956 /* L1 = a>>>22 */ \ 00957 "rorx $22, %" #a ", " L1 "\n\t" \ 00958 /* h += Ch(e,f,g) */ \ 00959 "addl " L3 ", %" #h "\n\t" \ 00960 00961 #define RND_STEP_RORX_0_6(a, b, c, d, e, f, g, h, i) \ 00962 /* L1 = Sigma0(a) */ \ 00963 "xorl " L2 ", " L1 "\n\t" \ 00964 /* L3 = b */ \ 00965 "movl %" #b ", " L3 "\n\t" \ 00966 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 00967 "addl %" #h ", %" #d "\n\t" \ 00968 00969 #define RND_STEP_RORX_0_7(a, b, c, d, e, f, g, h, i) \ 00970 /* L3 = a ^ b */ \ 00971 "xorl %" #a ", " L3 "\n\t" \ 00972 /* h += Sigma0(a) */ \ 00973 "addl " L1 ", %" #h "\n\t" \ 00974 /* L4 = (a ^ b) & (b ^ c) */ \ 00975 "andl " L3 ", " L4 "\n\t" \ 00976 00977 #define RND_STEP_RORX_0_8(a, b, c, d, e, f, g, h, i) \ 00978 /* L4 = Maj(a,b,c) */ \ 00979 "xorl %" #b ", " L4 "\n\t" \ 00980 /* L1 = d>>>6 (= e>>>6 next RND) */ \ 00981 "rorx $6, %" #d ", " L1 "\n\t" \ 00982 /* h += Maj(a,b,c) */ \ 00983 "addl " L4 ", %" #h "\n\t" \ 00984 00985 #define RND_STEP_RORX_1_1(a, b, c, d, e, f, g, h, i) \ 00986 /* L4 = f */ \ 00987 "movl %" #f ", " L4 "\n\t" \ 00988 /* L2 = e>>>11 */ \ 00989 "rorx $11, %" #e ", " L2 "\n\t" \ 00990 /* h += w_k */ \ 00991 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 00992 00993 #define RND_STEP_RORX_1_2(a, b, c, d, e, f, g, h, i) \ 00994 /* L2 = (e>>>6) ^ (e>>>11) */ \ 00995 "xorl " L1 ", " L2 "\n\t" \ 00996 
/* L4 = f ^ g */ \ 00997 "xorl %" #g ", " L4 "\n\t" \ 00998 /* L1 = e>>>25 */ \ 00999 "rorx $25, %" #e ", " L1 "\n\t" \ 01000 01001 #define RND_STEP_RORX_1_3(a, b, c, d, e, f, g, h, i) \ 01002 /* L4 = (f ^ g) & e */ \ 01003 "andl %" #e ", " L4 "\n\t" \ 01004 /* L1 = Sigma1(e) */ \ 01005 "xorl " L2 ", " L1 "\n\t" \ 01006 /* L2 = a>>>13 */ \ 01007 "rorx $13, %" #a ", " L2 "\n\t" \ 01008 01009 #define RND_STEP_RORX_1_4(a, b, c, d, e, f, g, h, i) \ 01010 /* h += Sigma1(e) */ \ 01011 "addl " L1 ", %" #h "\n\t" \ 01012 /* L1 = a>>>2 */ \ 01013 "rorx $2, %" #a ", " L1 "\n\t" \ 01014 /* L4 = Ch(e,f,g) */ \ 01015 "xorl %" #g ", " L4 "\n\t" \ 01016 01017 #define RND_STEP_RORX_1_5(a, b, c, d, e, f, g, h, i) \ 01018 /* L2 = (a>>>2) ^ (a>>>13) */ \ 01019 "xorl " L1 ", " L2 "\n\t" \ 01020 /* L1 = a>>>22 */ \ 01021 "rorx $22, %" #a ", " L1 "\n\t" \ 01022 /* h += Ch(e,f,g) */ \ 01023 "addl " L4 ", %" #h "\n\t" \ 01024 01025 #define RND_STEP_RORX_1_6(a, b, c, d, e, f, g, h, i) \ 01026 /* L1 = Sigma0(a) */ \ 01027 "xorl " L2 ", " L1 "\n\t" \ 01028 /* L4 = b */ \ 01029 "movl %" #b ", " L4 "\n\t" \ 01030 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01031 "addl %" #h ", %" #d "\n\t" \ 01032 01033 #define RND_STEP_RORX_1_7(a, b, c, d, e, f, g, h, i) \ 01034 /* L4 = a ^ b */ \ 01035 "xorl %" #a ", " L4 "\n\t" \ 01036 /* h += Sigma0(a) */ \ 01037 "addl " L1 ", %" #h "\n\t" \ 01038 /* L3 = (a ^ b) & (b ^ c) */ \ 01039 "andl " L4 ", " L3 "\n\t" \ 01040 01041 #define RND_STEP_RORX_1_8(a, b, c, d, e, f, g, h, i) \ 01042 /* L3 = Maj(a,b,c) */ \ 01043 "xorl %" #b ", " L3 "\n\t" \ 01044 /* L1 = d>>>6 (= e>>>6 next RND) */ \ 01045 "rorx $6, %" #d ", " L1 "\n\t" \ 01046 /* h += Maj(a,b,c) */ \ 01047 "addl " L3 ", %" #h "\n\t" \ 01048 01049 #define _RND_RORX_X_0(a, b, c, d, e, f, g, h, i) \ 01050 /* L1 = e>>>6 */ \ 01051 "rorx $6, %" #e ", " L1 "\n\t" \ 01052 /* L2 = e>>>11 */ \ 01053 "rorx $11, %" #e ", " L2 "\n\t" \ 01054 /* Prev RND: h += Maj(a,b,c) */ \ 01055 "addl " L3 ", %" #a "\n\t" \ 01056 /* h += w_k */ \ 01057 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01058 /* L3 = f */ \ 01059 "movl %" #f ", " L3 "\n\t" \ 01060 /* L2 = (e>>>6) ^ (e>>>11) */ \ 01061 "xorl " L1 ", " L2 "\n\t" \ 01062 /* L3 = f ^ g */ \ 01063 "xorl %" #g ", " L3 "\n\t" \ 01064 /* L1 = e>>>25 */ \ 01065 "rorx $25, %" #e ", " L1 "\n\t" \ 01066 /* L1 = Sigma1(e) */ \ 01067 "xorl " L2 ", " L1 "\n\t" \ 01068 /* L3 = (f ^ g) & e */ \ 01069 "andl %" #e ", " L3 "\n\t" \ 01070 /* h += Sigma1(e) */ \ 01071 "addl " L1 ", %" #h "\n\t" \ 01072 /* L1 = a>>>2 */ \ 01073 "rorx $2, %" #a ", " L1 "\n\t" \ 01074 /* L2 = a>>>13 */ \ 01075 "rorx $13, %" #a ", " L2 "\n\t" \ 01076 /* L3 = Ch(e,f,g) */ \ 01077 "xorl %" #g ", " L3 "\n\t" \ 01078 /* L2 = (a>>>2) ^ (a>>>13) */ \ 01079 "xorl " L1 ", " L2 "\n\t" \ 01080 /* L1 = a>>>22 */ \ 01081 "rorx $22, %" #a ", " L1 "\n\t" \ 01082 /* h += Ch(e,f,g) */ \ 01083 "addl " L3 ", %" #h "\n\t" \ 01084 /* L1 = Sigma0(a) */ \ 01085 "xorl " L2 ", " L1 "\n\t" \ 01086 /* L3 = b */ \ 01087 "movl %" #b ", " L3 "\n\t" \ 01088 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01089 "addl %" #h ", %" #d "\n\t" \ 01090 /* L3 = a ^ b */ \ 01091 "xorl %" #a ", " L3 "\n\t" \ 01092 /* L4 = (a ^ b) & (b ^ c) */ \ 01093 "andl " L3 ", " L4 "\n\t" \ 01094 /* h += Sigma0(a) */ \ 01095 "addl " L1 ", %" #h "\n\t" \ 01096 /* L4 = Maj(a,b,c) */ \ 01097 "xorl %" #b ", " L4 "\n\t" \ 01098 01099 #define _RND_RORX_X_1(a, b, c, d, e, f, g, h, i) \ 01100 /* L1 = e>>>6 */ \ 01101 "rorx $6, %" #e ", " L1 "\n\t" \ 01102 /* L2 = e>>>11 */ \ 01103 "rorx $11, %" #e ", " 
L2 "\n\t" \ 01104 /* Prev RND: h += Maj(a,b,c) */ \ 01105 "addl " L4 ", %" #a "\n\t" \ 01106 /* h += w_k */ \ 01107 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01108 /* L4 = f */ \ 01109 "movl %" #f ", " L4 "\n\t" \ 01110 /* L2 = (e>>>6) ^ (e>>>11) */ \ 01111 "xorl " L1 ", " L2 "\n\t" \ 01112 /* L4 = f ^ g */ \ 01113 "xorl %" #g ", " L4 "\n\t" \ 01114 /* L1 = e>>>25 */ \ 01115 "rorx $25, %" #e ", " L1 "\n\t" \ 01116 /* L1 = Sigma1(e) */ \ 01117 "xorl " L2 ", " L1 "\n\t" \ 01118 /* L4 = (f ^ g) & e */ \ 01119 "andl %" #e ", " L4 "\n\t" \ 01120 /* h += Sigma1(e) */ \ 01121 "addl " L1 ", %" #h "\n\t" \ 01122 /* L1 = a>>>2 */ \ 01123 "rorx $2, %" #a ", " L1 "\n\t" \ 01124 /* L2 = a>>>13 */ \ 01125 "rorx $13, %" #a ", " L2 "\n\t" \ 01126 /* L4 = Ch(e,f,g) */ \ 01127 "xorl %" #g ", " L4 "\n\t" \ 01128 /* L2 = (a>>>2) ^ (a>>>13) */ \ 01129 "xorl " L1 ", " L2 "\n\t" \ 01130 /* L1 = a>>>22 */ \ 01131 "rorx $22, %" #a ", " L1 "\n\t" \ 01132 /* h += Ch(e,f,g) */ \ 01133 "addl " L4 ", %" #h "\n\t" \ 01134 /* L1 = Sigma0(a) */ \ 01135 "xorl " L2 ", " L1 "\n\t" \ 01136 /* L4 = b */ \ 01137 "movl %" #b ", " L4 "\n\t" \ 01138 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01139 "addl %" #h ", %" #d "\n\t" \ 01140 /* L4 = a ^ b */ \ 01141 "xorl %" #a ", " L4 "\n\t" \ 01142 /* L2 = (a ^ b) & (b ^ c) */ \ 01143 "andl " L4 ", " L3 "\n\t" \ 01144 /* h += Sigma0(a) */ \ 01145 "addl " L1 ", %" #h "\n\t" \ 01146 /* L3 = Maj(a,b,c) */ \ 01147 "xorl %" #b ", " L3 "\n\t" \ 01148 01149 01150 #define RND_RORX_X_0(a,b,c,d,e,f,g,h,i) \ 01151 _RND_RORX_X_0(a,b,c,d,e,f,g,h,i) 01152 #define RND_RORX_X_1(a,b,c,d,e,f,g,h,i) \ 01153 _RND_RORX_X_1(a,b,c,d,e,f,g,h,i) 01154 01155 #define RND_RORX_X4(a,b,c,d,e,f,g,h,i) \ 01156 RND_RORX_X_0(a,b,c,d,e,f,g,h,i+0) \ 01157 RND_RORX_X_1(h,a,b,c,d,e,f,g,i+1) \ 01158 RND_RORX_X_0(g,h,a,b,c,d,e,f,i+2) \ 01159 RND_RORX_X_1(f,g,h,a,b,c,d,e,i+3) 01160 01161 #endif /* HAVE_INTEL_RORX */ 01162 01163 #define RND_STEP_0_1(a,b,c,d,e,f,g,h,i) \ 01164 /* L1 = e>>>14 */ \ 01165 "rorl $14, " L1 "\n\t" \ 01166 01167 #define RND_STEP_0_2(a,b,c,d,e,f,g,h,i) \ 01168 /* L3 = b */ \ 01169 "movl %" #b ", " L3 "\n\t" \ 01170 /* L2 = f */ \ 01171 "movl %" #f ", " L2 "\n\t" \ 01172 /* h += w_k */ \ 01173 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01174 /* L2 = f ^ g */ \ 01175 "xorl %" #g ", " L2 "\n\t" \ 01176 01177 #define RND_STEP_0_3(a,b,c,d,e,f,g,h,i) \ 01178 /* L1 = (e>>>14) ^ e */ \ 01179 "xorl %" #e ", " L1 "\n\t" \ 01180 /* L2 = (f ^ g) & e */ \ 01181 "andl %" #e ", " L2 "\n\t" \ 01182 01183 #define RND_STEP_0_4(a,b,c,d,e,f,g,h,i) \ 01184 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 01185 "rorl $5, " L1 "\n\t" \ 01186 /* L2 = Ch(e,f,g) */ \ 01187 "xorl %" #g ", " L2 "\n\t" \ 01188 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 01189 "xorl %" #e ", " L1 "\n\t" \ 01190 /* h += Ch(e,f,g) */ \ 01191 "addl " L2 ", %" #h "\n\t" \ 01192 01193 #define RND_STEP_0_5(a,b,c,d,e,f,g,h,i) \ 01194 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 01195 "rorl $6, " L1 "\n\t" \ 01196 /* L3 = a ^ b (= b ^ c of next RND) */ \ 01197 "xorl %" #a ", " L3 "\n\t" \ 01198 /* h = h + w_k + Sigma1(e) */ \ 01199 "addl " L1 ", %" #h "\n\t" \ 01200 /* L2 = a */ \ 01201 "movl %" #a ", " L2 "\n\t" \ 01202 01203 #define RND_STEP_0_6(a,b,c,d,e,f,g,h,i) \ 01204 /* L3 = (a ^ b) & (b ^ c) */ \ 01205 "andl " L3 ", " L4 "\n\t" \ 01206 /* L2 = a>>>9 */ \ 01207 "rorl $9, " L2 "\n\t" \ 01208 /* L2 = (a>>>9) ^ a */ \ 01209 "xorl %" #a ", " L2 "\n\t" \ 01210 /* L1 = Maj(a,b,c) */ \ 01211 "xorl %" #b ", " L4 "\n\t" \ 01212 01213 #define RND_STEP_0_7(a,b,c,d,e,f,g,h,i) \ 
01214 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 01215 "rorl $11, " L2 "\n\t" \ 01216 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01217 "addl %" #h ", %" #d "\n\t" \ 01218 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 01219 "xorl %" #a ", " L2 "\n\t" \ 01220 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 01221 "addl " L4 ", %" #h "\n\t" \ 01222 01223 #define RND_STEP_0_8(a,b,c,d,e,f,g,h,i) \ 01224 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 01225 "rorl $2, " L2 "\n\t" \ 01226 /* L1 = d (e of next RND) */ \ 01227 "movl %" #d ", " L1 "\n\t" \ 01228 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 01229 "addl " L2 ", %" #h "\n\t" \ 01230 01231 #define RND_STEP_1_1(a,b,c,d,e,f,g,h,i) \ 01232 /* L1 = e>>>14 */ \ 01233 "rorl $14, " L1 "\n\t" \ 01234 01235 #define RND_STEP_1_2(a,b,c,d,e,f,g,h,i) \ 01236 /* L3 = b */ \ 01237 "movl %" #b ", " L4 "\n\t" \ 01238 /* L2 = f */ \ 01239 "movl %" #f ", " L2 "\n\t" \ 01240 /* h += w_k */ \ 01241 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01242 /* L2 = f ^ g */ \ 01243 "xorl %" #g ", " L2 "\n\t" \ 01244 01245 #define RND_STEP_1_3(a,b,c,d,e,f,g,h,i) \ 01246 /* L1 = (e>>>14) ^ e */ \ 01247 "xorl %" #e ", " L1 "\n\t" \ 01248 /* L2 = (f ^ g) & e */ \ 01249 "andl %" #e ", " L2 "\n\t" \ 01250 01251 #define RND_STEP_1_4(a,b,c,d,e,f,g,h,i) \ 01252 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 01253 "rorl $5, " L1 "\n\t" \ 01254 /* L2 = Ch(e,f,g) */ \ 01255 "xorl %" #g ", " L2 "\n\t" \ 01256 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 01257 "xorl %" #e ", " L1 "\n\t" \ 01258 /* h += Ch(e,f,g) */ \ 01259 "addl " L2 ", %" #h "\n\t" \ 01260 01261 #define RND_STEP_1_5(a,b,c,d,e,f,g,h,i) \ 01262 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 01263 "rorl $6, " L1 "\n\t" \ 01264 /* L4 = a ^ b (= b ^ c of next RND) */ \ 01265 "xorl %" #a ", " L4 "\n\t" \ 01266 /* h = h + w_k + Sigma1(e) */ \ 01267 "addl " L1 ", %" #h "\n\t" \ 01268 /* L2 = a */ \ 01269 "movl %" #a ", " L2 "\n\t" \ 01270 01271 #define RND_STEP_1_6(a,b,c,d,e,f,g,h,i) \ 01272 /* L3 = (a ^ b) & (b ^ c) */ \ 01273 "andl " L4 ", " L3 "\n\t" \ 01274 /* L2 = a>>>9 */ \ 01275 "rorl $9, " L2 "\n\t" \ 01276 /* L2 = (a>>>9) ^ a */ \ 01277 "xorl %" #a ", " L2 "\n\t" \ 01278 /* L1 = Maj(a,b,c) */ \ 01279 "xorl %" #b ", " L3 "\n\t" \ 01280 01281 #define RND_STEP_1_7(a,b,c,d,e,f,g,h,i) \ 01282 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 01283 "rorl $11, " L2 "\n\t" \ 01284 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01285 "addl %" #h ", %" #d "\n\t" \ 01286 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 01287 "xorl %" #a ", " L2 "\n\t" \ 01288 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 01289 "addl " L3 ", %" #h "\n\t" \ 01290 01291 #define RND_STEP_1_8(a,b,c,d,e,f,g,h,i) \ 01292 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 01293 "rorl $2, " L2 "\n\t" \ 01294 /* L1 = d (e of next RND) */ \ 01295 "movl %" #d ", " L1 "\n\t" \ 01296 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 01297 "addl " L2 ", %" #h "\n\t" \ 01298 01299 #define _RND_ALL_0(a,b,c,d,e,f,g,h,i) \ 01300 /* h += w_k */ \ 01301 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01302 /* L2 = f */ \ 01303 "movl %" #f ", " L2 "\n\t" \ 01304 /* L3 = b */ \ 01305 "movl %" #b ", " L3 "\n\t" \ 01306 /* L2 = f ^ g */ \ 01307 "xorl %" #g ", " L2 "\n\t" \ 01308 /* L1 = e>>>14 */ \ 01309 "rorl $14, " L1 "\n\t" \ 01310 /* L2 = (f ^ g) & e */ \ 01311 "andl %" #e ", " L2 "\n\t" \ 01312 /* L1 = (e>>>14) ^ e */ \ 01313 "xorl %" #e ", " L1 "\n\t" \ 01314 /* L2 = Ch(e,f,g) */ \ 01315 "xorl %" #g ", " L2 "\n\t" \ 01316 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 01317 "rorl $5, 
" L1 "\n\t" \ 01318 /* h += Ch(e,f,g) */ \ 01319 "addl " L2 ", %" #h "\n\t" \ 01320 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 01321 "xorl %" #e ", " L1 "\n\t" \ 01322 /* L3 = a ^ b */ \ 01323 "xorl %" #a ", " L3 "\n\t" \ 01324 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 01325 "rorl $6, " L1 "\n\t" \ 01326 /* L2 = a */ \ 01327 "movl %" #a ", " L2 "\n\t" \ 01328 /* h = h + w_k + Sigma1(e) */ \ 01329 "addl " L1 ", %" #h "\n\t" \ 01330 /* L2 = a>>>9 */ \ 01331 "rorl $9, " L2 "\n\t" \ 01332 /* L3 = (a ^ b) & (b ^ c) */ \ 01333 "andl " L3 ", " L4 "\n\t" \ 01334 /* L2 = (a>>>9) ^ a */ \ 01335 "xorl %" #a ", " L2 "\n\t" \ 01336 /* L1 = Maj(a,b,c) */ \ 01337 "xorl %" #b ", " L4 "\n\t" \ 01338 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 01339 "rorl $11, " L2 "\n\t" \ 01340 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01341 "addl %" #h ", %" #d "\n\t" \ 01342 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 01343 "xorl %" #a ", " L2 "\n\t" \ 01344 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 01345 "addl " L4 ", %" #h "\n\t" \ 01346 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 01347 "rorl $2, " L2 "\n\t" \ 01348 /* L1 = d (e of next RND) */ \ 01349 "movl %" #d ", " L1 "\n\t" \ 01350 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 01351 "addl " L2 ", %" #h "\n\t" \ 01352 01353 #define _RND_ALL_1(a,b,c,d,e,f,g,h,i) \ 01354 /* h += w_k */ \ 01355 "addl (" #i ")*4(" WK "), %" #h "\n\t" \ 01356 /* L2 = f */ \ 01357 "movl %" #f ", " L2 "\n\t" \ 01358 /* L3 = b */ \ 01359 "movl %" #b ", " L4 "\n\t" \ 01360 /* L2 = f ^ g */ \ 01361 "xorl %" #g ", " L2 "\n\t" \ 01362 /* L1 = e>>>14 */ \ 01363 "rorl $14, " L1 "\n\t" \ 01364 /* L2 = (f ^ g) & e */ \ 01365 "andl %" #e ", " L2 "\n\t" \ 01366 /* L1 = (e>>>14) ^ e */ \ 01367 "xorl %" #e ", " L1 "\n\t" \ 01368 /* L2 = Ch(e,f,g) */ \ 01369 "xorl %" #g ", " L2 "\n\t" \ 01370 /* L1 = ((e>>>14) ^ e) >>> 5 */ \ 01371 "rorl $5, " L1 "\n\t" \ 01372 /* h += Ch(e,f,g) */ \ 01373 "addl " L2 ", %" #h "\n\t" \ 01374 /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */ \ 01375 "xorl %" #e ", " L1 "\n\t" \ 01376 /* L3 = a ^ b */ \ 01377 "xorl %" #a ", " L4 "\n\t" \ 01378 /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */ \ 01379 "rorl $6, " L1 "\n\t" \ 01380 /* L2 = a */ \ 01381 "movl %" #a ", " L2 "\n\t" \ 01382 /* h = h + w_k + Sigma1(e) */ \ 01383 "addl " L1 ", %" #h "\n\t" \ 01384 /* L2 = a>>>9 */ \ 01385 "rorl $9, " L2 "\n\t" \ 01386 /* L3 = (a ^ b) & (b ^ c) */ \ 01387 "andl " L4 ", " L3 "\n\t" \ 01388 /* L2 = (a>>>9) ^ a */ \ 01389 "xorl %" #a", " L2 "\n\t" \ 01390 /* L1 = Maj(a,b,c) */ \ 01391 "xorl %" #b ", " L3 "\n\t" \ 01392 /* L2 = ((a>>>9) ^ a) >>> 11 */ \ 01393 "rorl $11, " L2 "\n\t" \ 01394 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \ 01395 "addl %" #h ", %" #d "\n\t" \ 01396 /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */ \ 01397 "xorl %" #a ", " L2 "\n\t" \ 01398 /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */ \ 01399 "addl " L3 ", %" #h "\n\t" \ 01400 /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */ \ 01401 "rorl $2, " L2 "\n\t" \ 01402 /* L1 = d (e of next RND) */ \ 01403 "movl %" #d ", " L1 "\n\t" \ 01404 /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 01405 "addl " L2 ", %" #h "\n\t" \ 01406 01407 01408 #define RND_ALL_0(a, b, c, d, e, f, g, h, i) \ 01409 _RND_ALL_0(a, b, c, d, e, f, g, h, i) 01410 #define RND_ALL_1(a, b, c, d, e, f, g, h, i) \ 01411 _RND_ALL_1(a, b, c, d, e, f, g, h, i) 01412 01413 #define RND_ALL_4(a, b, c, d, e, f, g, h, i) \ 01414 RND_ALL_0(a, b, c, d, e, f, g, h, i+0) \ 01415 RND_ALL_1(h, a, b, c, d, e, f, g, i+1) \ 01416 
RND_ALL_0(g, h, a, b, c, d, e, f, i+2) \ 01417 RND_ALL_1(f, g, h, a, b, c, d, e, i+3) 01418 01419 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ 01420 01421 #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ 01422 01423 #define _VPALIGNR(op1, op2, op3, op4) \ 01424 "vpalignr $" #op4", %" #op3", %" #op2", %" #op1"\n\t" 01425 #define VPALIGNR(op1, op2, op3, op4) \ 01426 _VPALIGNR(op1, op2, op3, op4) 01427 #define _VPADDD(op1, op2, op3) \ 01428 "vpaddd %" #op3", %" #op2", %" #op1"\n\t" 01429 #define VPADDD(op1, op2, op3) \ 01430 _VPADDD(op1, op2, op3) 01431 #define _VPSRLD(op1, op2, op3) \ 01432 "vpsrld $" #op3", %" #op2", %" #op1"\n\t" 01433 #define VPSRLD(op1, op2, op3) \ 01434 _VPSRLD(op1, op2, op3) 01435 #define _VPSRLQ(op1, op2, op3) \ 01436 "vpsrlq $" #op3", %" #op2", %" #op1"\n\t" 01437 #define VPSRLQ(op1,op2,op3) \ 01438 _VPSRLQ(op1,op2,op3) 01439 #define _VPSLLD(op1,op2,op3) \ 01440 "vpslld $" #op3", %" #op2", %" #op1"\n\t" 01441 #define VPSLLD(op1,op2,op3) \ 01442 _VPSLLD(op1,op2,op3) 01443 #define _VPOR(op1,op2,op3) \ 01444 "vpor %" #op3", %" #op2", %" #op1"\n\t" 01445 #define VPOR(op1,op2,op3) \ 01446 _VPOR(op1,op2,op3) 01447 #define _VPXOR(op1,op2,op3) \ 01448 "vpxor %" #op3", %" #op2", %" #op1"\n\t" 01449 #define VPXOR(op1,op2,op3) \ 01450 _VPXOR(op1,op2,op3) 01451 #define _VPSHUFD(op1,op2,op3) \ 01452 "vpshufd $" #op3", %" #op2", %" #op1"\n\t" 01453 #define VPSHUFD(op1,op2,op3) \ 01454 _VPSHUFD(op1,op2,op3) 01455 #define _VPSHUFB(op1,op2,op3) \ 01456 "vpshufb %" #op3", %" #op2", %" #op1"\n\t" 01457 #define VPSHUFB(op1,op2,op3) \ 01458 _VPSHUFB(op1,op2,op3) 01459 #define _VPSLLDQ(op1,op2,op3) \ 01460 "vpslldq $" #op3", %" #op2", %" #op1"\n\t" 01461 #define VPSLLDQ(op1,op2,op3) \ 01462 _VPSLLDQ(op1,op2,op3) 01463 01464 #define MsgSched(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ 01465 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \ 01466 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */ \ 01467 VPALIGNR (XTMP0, X3, X2, 4) /* XTMP0 = W[-7] */ \ 01468 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \ 01469 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \ 01470 VPSRLD (XTMP2, XTMP1, 7) /* XTMP2 = W[-15] >> 7 */ \ 01471 VPSLLD (XTMP3, XTMP1, 25) /* XTEMP3 = W[-15] << (32-7) */ \ 01472 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ 01473 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ 01474 VPSRLD (XTMP4, XTMP1, 18) /* XTEMP4 = W[-15] >> 18 */ \ 01475 VPSLLD (XTMP5, XTMP1, 14) /* XTEMP5 = W[-15] << (32-18) */ \ 01476 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ 01477 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ 01478 VPOR (XTMP2, XTMP3, XTMP2) /* XTMP2 = W[-15] >>> 7 */ \ 01479 VPOR (XTMP4, XTMP5, XTMP4) /* XTMP4 = W[-15] >>> 18 */ \ 01480 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ 01481 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ 01482 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ 01483 VPSRLD (XTMP5, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \ 01484 VPXOR (XTMP2, XTMP4, XTMP2) \ 01485 /* XTMP2 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ 01486 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ 01487 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ 01488 VPXOR (XTMP1, XTMP5, XTMP2) /* XTMP1 = s0 */ \ 01489 VPSHUFD (XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/ \ 01490 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ 01491 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ 01492 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */ \ 01493 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 01494 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ 01495 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ 01496 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ 01497 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 01498 
VPADDD (XTMP0, XTMP0, X0) \ 01499 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ 01500 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ 01501 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ 01502 VPXOR (XTMP2, XTMP3, XTMP2) \ 01503 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ 01504 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ 01505 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */ \ 01506 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ 01507 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */ \ 01508 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ 01509 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */ \ 01510 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ 01511 RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ 01512 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */ \ 01513 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ 01514 VPSRLQ (XTMP4, XTMP2, 17) /* XTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 01515 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 01516 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ 01517 RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ 01518 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \ 01519 VPXOR (XTMP4, XTMP3, XTMP4) \ 01520 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ 01521 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ 01522 VPXOR (XTMP5, XTMP4, XTMP5) /* XTMP5 = s1 {xDxC} */ \ 01523 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ 01524 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */ \ 01525 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ 01526 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */ 01527 01528 #if defined(HAVE_INTEL_RORX) 01529 01530 #define MsgSched_RORX(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i) \ 01531 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ 01532 VPALIGNR (XTMP0, X3, X2, 4) \ 01533 VPALIGNR (XTMP1, X1, X0, 4) /* XTMP1 = W[-15] */ \ 01534 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ 01535 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ 01536 VPSRLD (XTMP2, XTMP1, 7) \ 01537 VPSLLD (XTMP3, XTMP1, 25) /* VPSLLD (XTMP3, XTMP1, (32-7)) */ \ 01538 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ 01539 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ 01540 VPSRLD (XTMP4, XTMP1, 3) /* XTMP4 = W[-15] >> 3 */ \ 01541 VPOR (XTMP3, XTMP3, XTMP2) /* XTMP1 = W[-15] MY_ROR 7 */ \ 01542 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ 01543 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ 01544 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ 01545 \ 01546 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ 01547 VPSRLD (XTMP2, XTMP1,18) \ 01548 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ 01549 VPSLLD (XTMP1, XTMP1, 14) /* VPSLLD (XTMP1, XTMP1, (32-18)) */ \ 01550 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ 01551 VPXOR (XTMP3, XTMP3, XTMP1) \ 01552 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ 01553 VPXOR (XTMP3, XTMP3, XTMP2) \ 01554 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \ 01555 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ 01556 VPSHUFD (XTMP2, X3, 0b11111010) /* XTMP2 = W[-2] {BBAA}*/ \ 01557 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ 01558 VPXOR (XTMP1, XTMP3, XTMP4) /* XTMP1 = s0 */ \ 01559 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ 01560 VPSRLD (XTMP4, XTMP2, 10) /* XTMP4 = W[-2] >> 10 {BBAA} */ \ 01561 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ 01562 \ 01563 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ 01564 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 01565 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ 01566 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 01567 VPADDD (XTMP0, XTMP0, X0) \ 01568 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ 01569 VPADDD (XTMP0, XTMP0, XTMP1) /* XTMP0 = W[-16] + W[-7] + s0 */ \ 01570 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ 01571 
VPXOR (XTMP2, XTMP2, XTMP3) \ 01572 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ 01573 VPXOR (XTMP4, XTMP4, XTMP2) /* XTMP4 = s1 {xBxA} */ \ 01574 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ 01575 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) /* XTMP4 = s1 {00BA} */ \ 01576 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ 01577 VPADDD (XTMP0, XTMP0, XTMP4) /* XTMP0 = {..., ..., W[1], W[0]} */ \ 01578 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ 01579 \ 01580 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ 01581 VPSHUFD (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */ \ 01582 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ 01583 VPSRLD (XTMP5, XTMP2, 10) /* XTMP5 = W[-2] >> 10 {DDCC} */ \ 01584 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ 01585 VPSRLQ (XTMP3, XTMP2, 19) /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 01586 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ 01587 VPSRLQ (XTMP2, XTMP2, 17) /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */ \ 01588 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ 01589 VPXOR (XTMP2, XTMP2, XTMP3) \ 01590 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ 01591 VPXOR (XTMP5, XTMP5, XTMP2) /* XTMP5 = s1 {xDxC} */ \ 01592 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ 01593 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */ \ 01594 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ 01595 VPADDD (X0, XTMP5, XTMP0) /* X0 = {W[3], W[2], W[1], W[0]} */ 01596 01597 #endif /* HAVE_INTEL_RORX */ 01598 01599 01600 #define _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ 01601 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 01602 "vmovdqu (%%rax), %" #X0 "\n\t" \ 01603 "vmovdqu 16(%%rax), %" #X1 "\n\t" \ 01604 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ 01605 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ 01606 "vmovdqu 32(%%rax), %" #X2 "\n\t" \ 01607 "vmovdqu 48(%%rax), %" #X3 "\n\t" \ 01608 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ 01609 VPSHUFB(X3, X3, BYTE_FLIP_MASK) 01610 01611 #define W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \ 01612 _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 01613 01614 01615 #define _SET_W_K_XFER_4(i) \ 01616 "vpaddd (" #i "*4)+ 0+%[K], %%xmm0, %%xmm4\n\t" \ 01617 "vpaddd (" #i "*4)+16+%[K], %%xmm1, %%xmm5\n\t" \ 01618 "vmovdqu %%xmm4, (" WK ")\n\t" \ 01619 "vmovdqu %%xmm5, 16(" WK ")\n\t" \ 01620 "vpaddd (" #i "*4)+32+%[K], %%xmm2, %%xmm6\n\t" \ 01621 "vpaddd (" #i "*4)+48+%[K], %%xmm3, %%xmm7\n\t" \ 01622 "vmovdqu %%xmm6, 32(" WK ")\n\t" \ 01623 "vmovdqu %%xmm7, 48(" WK ")\n\t" 01624 01625 #define SET_W_K_XFER_4(i) \ 01626 _SET_W_K_XFER_4(i) 01627 01628 01629 static const ALIGN32 word64 mSHUF_00BA[] = 01630 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 01631 static const ALIGN32 word64 mSHUF_DC00[] = 01632 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 01633 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = 01634 { 0x0405060700010203, 0x0c0d0e0f08090a0b }; 01635 01636 #define _Init_Masks(mask1, mask2, mask3) \ 01637 "vmovdqa %[FLIP], %" #mask1 "\n\t" \ 01638 "vmovdqa %[SHUF00BA], %" #mask2 "\n\t" \ 01639 "vmovdqa %[SHUFDC00], %" #mask3 "\n\t" 01640 01641 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ 01642 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01643 01644 #define X0 %xmm0 01645 #define X1 %xmm1 01646 #define X2 %xmm2 01647 #define X3 %xmm3 01648 01649 #define XTMP0 %xmm4 01650 #define XTMP1 %xmm5 01651 #define XTMP2 %xmm6 01652 #define XTMP3 %xmm7 01653 #define XTMP4 %xmm8 01654 #define XTMP5 %xmm9 01655 #define XFER %xmm10 01656 01657 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ 01658 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ 01659 #define BYTE_FLIP_MASK %xmm13 
01660 01661 01662 SHA256_NOINLINE static int Transform_Sha256_AVX1(wc_Sha256* sha256) 01663 { 01664 __asm__ __volatile__ ( 01665 01666 "subq $64, %%rsp\n\t" 01667 01668 "leaq 32(%[sha256]), %%rax\n\t" 01669 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01670 LOAD_DIGEST() 01671 01672 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 01673 01674 "movl %%r9d, " L4 "\n\t" 01675 "movl %%r12d, " L1 "\n\t" 01676 "xorl %%r10d, " L4 "\n\t" 01677 01678 SET_W_K_XFER_4(0) 01679 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01680 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01681 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01682 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01683 01684 SET_W_K_XFER_4(16) 01685 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01686 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01687 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01688 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01689 01690 SET_W_K_XFER_4(32) 01691 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01692 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01693 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01694 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01695 01696 SET_W_K_XFER_4(48) 01697 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01698 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01699 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01700 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01701 01702 STORE_ADD_DIGEST() 01703 01704 "addq $64, %%rsp\n\t" 01705 01706 : 01707 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 01708 [SHUF00BA] "m" (mSHUF_00BA[0]), 01709 [SHUFDC00] "m" (mSHUF_DC00[0]), 01710 [sha256] "r" (sha256), 01711 [K] "m" (K) 01712 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 01713 ); 01714 01715 return 0; 01716 } 01717 01718 SHA256_NOINLINE static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, 01719 word32 len) 01720 { 01721 __asm__ __volatile__ ( 01722 01723 "subq $64, %%rsp\n\t" 01724 "movq 120(%[sha256]), %%rax\n\t" 01725 01726 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01727 LOAD_DIGEST() 01728 01729 "# Start of loop processing a block\n" 01730 "1:\n\t" 01731 01732 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 01733 01734 "movl %%r9d, " L4 "\n\t" 01735 "movl %%r12d, " L1 "\n\t" 01736 "xorl %%r10d, " L4 "\n\t" 01737 01738 SET_W_K_XFER_4(0) 01739 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01740 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01741 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01742 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01743 01744 SET_W_K_XFER_4(16) 01745 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01746 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01747 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01748 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01749 01750 SET_W_K_XFER_4(32) 01751 MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01752 MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01753 MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01754 MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01755 01756 SET_W_K_XFER_4(48) 
01757 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01758 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01759 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01760 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01761 "movq 120(%[sha256]), %%rax\n\t" 01762 01763 ADD_DIGEST() 01764 01765 "addq $64, %%rax\n\t" 01766 "subl $64, %[len]\n\t" 01767 01768 STORE_DIGEST() 01769 01770 "movq %%rax, 120(%[sha256])\n\t" 01771 "jnz 1b\n\t" 01772 01773 "addq $64, %%rsp\n\t" 01774 01775 : 01776 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 01777 [SHUF00BA] "m" (mSHUF_00BA[0]), 01778 [SHUFDC00] "m" (mSHUF_DC00[0]), 01779 [sha256] "r" (sha256), 01780 [len] "r" (len), 01781 [K] "m" (K) 01782 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 01783 ); 01784 01785 return 0; 01786 } 01787 #endif /* HAVE_INTEL_AVX1 */ 01788 01789 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX) 01790 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX(wc_Sha256* sha256) 01791 { 01792 __asm__ __volatile__ ( 01793 01794 "subq $64, %%rsp\n\t" 01795 01796 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01797 "leaq 32(%[sha256]), %%rax\n\t" 01798 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 01799 01800 LOAD_DIGEST() 01801 01802 SET_W_K_XFER_4(0) 01803 "movl %%r9d, " L4 "\n\t" 01804 "rorx $6, %%r12d, " L1 "\n\t" 01805 "xorl %%r10d, " L4 "\n\t" 01806 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01807 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01808 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01809 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01810 01811 SET_W_K_XFER_4(16) 01812 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01813 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01814 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01815 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01816 01817 SET_W_K_XFER_4(32) 01818 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01819 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01820 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01821 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01822 01823 SET_W_K_XFER_4(48) 01824 "xorl " L3 ", " L3 "\n\t" 01825 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01826 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01827 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01828 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01829 /* Prev RND: h += Maj(a,b,c) */ 01830 "addl " L3 ", %%r8d\n\t" 01831 01832 STORE_ADD_DIGEST() 01833 01834 "addq $64, %%rsp\n\t" 01835 01836 : 01837 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 01838 [SHUF00BA] "m" (mSHUF_00BA[0]), 01839 [SHUFDC00] "m" (mSHUF_DC00[0]), 01840 [sha256] "r" (sha256), 01841 [K] "m" (K) 01842 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 01843 ); 01844 01845 return 0; 01846 } 01847 01848 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, 01849 word32 len) 01850 { 01851 __asm__ __volatile__ ( 01852 01853 "subq $64, %%rsp\n\t" 01854 "movq 120(%[sha256]), %%rax\n\t" 01855 01856 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01857 LOAD_DIGEST() 01858 01859 "# Start of loop processing a block\n" 01860 "1:\n\t" 01861 01862 W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) 01863 01864 SET_W_K_XFER_4(0) 01865 "movl %%r9d, " L4 "\n\t" 01866 
"rorx $6, %%r12d, " L1 "\n\t" 01867 "xorl %%r10d, " L4 "\n\t" 01868 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01869 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01870 MsgSched_RORX(X2, X3, X0, X1, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 01871 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01872 01873 SET_W_K_XFER_4(16) 01874 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01875 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01876 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01877 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01878 01879 SET_W_K_XFER_4(32) 01880 MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01881 MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01882 MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01883 MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01884 01885 SET_W_K_XFER_4(48) 01886 "xorl " L3 ", " L3 "\n\t" 01887 "xorl " L2 ", " L2 "\n\t" 01888 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 01889 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 4) 01890 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 8) 01891 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 01892 /* Prev RND: h += Maj(a,b,c) */ 01893 "addl " L3 ", %%r8d\n\t" 01894 "movq 120(%[sha256]), %%rax\n\t" 01895 01896 ADD_DIGEST() 01897 01898 "addq $64, %%rax\n\t" 01899 "subl $64, %[len]\n\t" 01900 01901 STORE_DIGEST() 01902 01903 "movq %%rax, 120(%[sha256])\n\t" 01904 "jnz 1b\n\t" 01905 01906 "addq $64, %%rsp\n\t" 01907 01908 : 01909 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 01910 [SHUF00BA] "m" (mSHUF_00BA[0]), 01911 [SHUFDC00] "m" (mSHUF_DC00[0]), 01912 [sha256] "r" (sha256), 01913 [len] "r" (len), 01914 [K] "m" (K) 01915 : WORK_REGS, STATE_REGS, XMM_REGS, "memory" 01916 ); 01917 01918 return 0; 01919 } 01920 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */ 01921 01922 01923 #if defined(HAVE_INTEL_AVX2) 01924 #define Y0 %ymm0 01925 #define Y1 %ymm1 01926 #define Y2 %ymm2 01927 #define Y3 %ymm3 01928 01929 #define YTMP0 %ymm4 01930 #define YTMP1 %ymm5 01931 #define YTMP2 %ymm6 01932 #define YTMP3 %ymm7 01933 #define YTMP4 %ymm8 01934 #define YTMP5 %ymm9 01935 #define YXFER %ymm10 01936 01937 #define SHUF_Y_00BA %ymm11 /* shuffle xBxA -> 00BA */ 01938 #define SHUF_Y_DC00 %ymm12 /* shuffle xDxC -> DC00 */ 01939 #define BYTE_FLIP_Y_MASK %ymm13 01940 01941 #define YMM_REGS "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", \ 01942 "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13" 01943 01944 #define MsgSched_Y(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \ 01945 RND_STEP_0_1(a,b,c,d,e,f,g,h,_i) \ 01946 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \ 01947 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \ 01948 RND_STEP_0_2(a,b,c,d,e,f,g,h,_i) \ 01949 RND_STEP_0_3(a,b,c,d,e,f,g,h,_i) \ 01950 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ 01951 VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ 01952 RND_STEP_0_4(a,b,c,d,e,f,g,h,_i) \ 01953 RND_STEP_0_5(a,b,c,d,e,f,g,h,_i) \ 01954 VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ 01955 VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ 01956 RND_STEP_0_6(a,b,c,d,e,f,g,h,_i) \ 01957 RND_STEP_0_7(a,b,c,d,e,f,g,h,_i) \ 01958 VPOR (YTMP2, YTMP3, YTMP2) /* YTMP2 = W[-15] >>> 7 */ \ 01959 VPOR (YTMP4, YTMP5, YTMP4) /* YTMP4 = 
W[-15] >>> 18 */ \ 01960 RND_STEP_0_8(a,b,c,d,e,f,g,h,_i) \ 01961 RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1) \ 01962 RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1) \ 01963 VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ 01964 VPXOR (YTMP2, YTMP4, YTMP2) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ 01965 RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1) \ 01966 RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1) \ 01967 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ 01968 VPSHUFD (YTMP2, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ 01969 RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1) \ 01970 RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1) \ 01971 VPSRLD (YTMP4, YTMP2, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ 01972 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 01973 RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1) \ 01974 RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1) \ 01975 RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2) \ 01976 VPSRLQ (YTMP2, YTMP2, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 01977 VPADDD (YTMP0, YTMP0, Y0) \ 01978 RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2) \ 01979 RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2) \ 01980 RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2) \ 01981 VPXOR (YTMP2, YTMP3, YTMP2) \ 01982 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ 01983 RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2) \ 01984 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ 01985 RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2) \ 01986 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ 01987 RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2) \ 01988 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ 01989 RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2) \ 01990 RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3) \ 01991 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ 01992 RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3) \ 01993 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 01994 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 01995 RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3) \ 01996 RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3) \ 01997 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ 01998 VPXOR (YTMP4, YTMP3, YTMP4) \ 01999 RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3) \ 02000 RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3) \ 02001 VPXOR (YTMP5, YTMP4, YTMP5) /* YTMP5 = s1 {xDxC} */ \ 02002 RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3) \ 02003 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ 02004 RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3) \ 02005 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ 02006 02007 #if defined(HAVE_INTEL_RORX) 02008 02009 #define MsgSched_Y_RORX(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i) \ 02010 RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i) \ 02011 VPALIGNR (YTMP1, Y1, Y0, 4) /* YTMP1 = W[-15] */ \ 02012 RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i) \ 02013 VPALIGNR (YTMP0, Y3, Y2, 4) /* YTMP0 = W[-7] */ \ 02014 RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i) \ 02015 VPSRLD (YTMP2, YTMP1, 7) /* YTMP2 = W[-15] >> 7 */ \ 02016 RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i) \ 02017 VPSLLD (YTMP3, YTMP1, 25) /* YTEMP3 = W[-15] << (32-7) */ \ 02018 RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i) \ 02019 VPSRLD (YTMP4, YTMP1, 18) /* YTEMP4 = W[-15] >> 18 */ \ 02020 RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i) \ 02021 VPSLLD (YTMP5, YTMP1, 14) /* YTEMP5 = W[-15] << (32-18) */ \ 02022 RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i) \ 02023 VPOR (YTMP2, YTMP2, YTMP3) /* YTMP2 = W[-15] >>> 7 */ \ 02024 RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i) \ 02025 VPOR (YTMP4, YTMP4, YTMP5) /* YTMP4 = W[-15] >>> 18 */ \ 02026 RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1) \ 02027 VPSRLD (YTMP5, YTMP1, 3) /* YTMP4 = W[-15] >> 3 */ \ 02028 RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1) \ 02029 VPXOR 
(YTMP2, YTMP2, YTMP4) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \ 02030 RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1) \ 02031 VPSHUFD (YTMP3, Y3, 0b11111010) /* YTMP2 = W[-2] {BBAA}*/ \ 02032 RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1) \ 02033 VPXOR (YTMP1, YTMP5, YTMP2) /* YTMP1 = s0 */ \ 02034 RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1) \ 02035 VPSRLD (YTMP4, YTMP3, 10) /* YTMP4 = W[-2] >> 10 {BBAA} */ \ 02036 RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1) \ 02037 VPSRLQ (YTMP2, YTMP3, 19) /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */ \ 02038 RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1) \ 02039 VPSRLQ (YTMP3, YTMP3, 17) /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */ \ 02040 RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1) \ 02041 VPADDD (YTMP0, YTMP0, Y0) \ 02042 RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2) \ 02043 VPXOR (YTMP2, YTMP2, YTMP3) \ 02044 RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2) \ 02045 VPXOR (YTMP4, YTMP4, YTMP2) /* YTMP4 = s1 {xBxA} */ \ 02046 RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2) \ 02047 VPADDD (YTMP0, YTMP0, YTMP1) /* YTMP0 = W[-16] + W[-7] + s0 */ \ 02048 RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2) \ 02049 VPSHUFB (YTMP4, YTMP4, SHUF_Y_00BA) /* YTMP4 = s1 {00BA} */ \ 02050 RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2) \ 02051 VPADDD (YTMP0, YTMP0, YTMP4) /* YTMP0 = {..., ..., W[1], W[0]} */ \ 02052 RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2) \ 02053 VPSHUFD (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */ \ 02054 RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2) \ 02055 RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2) \ 02056 VPSRLQ (YTMP4, YTMP2, 17) /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */ \ 02057 RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3) \ 02058 VPSRLQ (YTMP3, YTMP2, 19) /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */ \ 02059 RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3) \ 02060 VPSRLD (YTMP5, YTMP2, 10) /* YTMP5 = W[-2] >> 10 {DDCC} */ \ 02061 RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3) \ 02062 VPXOR (YTMP4, YTMP4, YTMP3) \ 02063 RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3) \ 02064 VPXOR (YTMP5, YTMP5, YTMP4) /* YTMP5 = s1 {xDxC} */ \ 02065 RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3) \ 02066 RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3) \ 02067 VPSHUFB (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */ \ 02068 RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3) \ 02069 RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3) \ 02070 VPADDD (Y0, YTMP5, YTMP0) /* Y0 = {W[3], W[2], W[1], W[0]} */ \ 02071 02072 #endif /* HAVE_INTEL_RORX */ 02073 02074 #define _VINSERTI128(op1,op2,op3,op4) \ 02075 "vinserti128 $" #op4 ", %" #op3 ", %" #op2 ", %" #op1 "\n\t" 02076 #define VINSERTI128(op1,op2,op3,op4) \ 02077 _VINSERTI128(op1,op2,op3,op4) 02078 02079 02080 #define _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ 02081 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 02082 "vmovdqu (%%" #reg "), %%xmm0\n\t" \ 02083 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \ 02084 VPSHUFB(X0, X0, BYTE_FLIP_MASK) \ 02085 VPSHUFB(X1, X1, BYTE_FLIP_MASK) \ 02086 "vmovdqu 32(%%" #reg "), %%xmm2\n\t" \ 02087 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \ 02088 VPSHUFB(X2, X2, BYTE_FLIP_MASK) \ 02089 VPSHUFB(X3, X3, BYTE_FLIP_MASK) 02090 02091 #define LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \ 02092 _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) 02093 02094 02095 #define _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ 02096 "# X0, X1, X2, X3 = W[0..15]\n\t" \ 02097 "vmovdqu (%%" #reg "), %%xmm0\n\t" \ 02098 "vmovdqu 16(%%" #reg "), %%xmm1\n\t" \ 02099 "vmovdqu 64(%%" #reg "), %%xmm4\n\t" \ 02100 "vmovdqu 80(%%" #reg "), %%xmm5\n\t" \ 02101 VINSERTI128(Y0, Y0, XTMP0, 1) \ 02102 VINSERTI128(Y1, Y1, XTMP1, 1) \ 02103 VPSHUFB(Y0, Y0, BYTE_FLIP_Y_MASK) \ 02104 VPSHUFB(Y1, Y1, BYTE_FLIP_Y_MASK) \ 02105 "vmovdqu 32(%%" #reg 
"), %%xmm2\n\t" \ 02106 "vmovdqu 48(%%" #reg "), %%xmm3\n\t" \ 02107 "vmovdqu 96(%%" #reg "), %%xmm6\n\t" \ 02108 "vmovdqu 112(%%" #reg "), %%xmm7\n\t" \ 02109 VINSERTI128(Y2, Y2, XTMP2, 1) \ 02110 VINSERTI128(Y3, Y3, XTMP3, 1) \ 02111 VPSHUFB(Y2, Y2, BYTE_FLIP_Y_MASK) \ 02112 VPSHUFB(Y3, Y3, BYTE_FLIP_Y_MASK) 02113 02114 #define LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \ 02115 _LOAD_W_K(BYTE_FLIP_Y_MASK, reg) 02116 02117 02118 #define _SET_W_Y_4(i) \ 02119 "vpaddd (" #i "*8)+ 0+%[K], %%ymm0, %%ymm4\n\t" \ 02120 "vpaddd (" #i "*8)+32+%[K], %%ymm1, %%ymm5\n\t" \ 02121 "vmovdqu %%ymm4, (" #i "*8)+ 0(" WK ")\n\t" \ 02122 "vmovdqu %%ymm5, (" #i "*8)+32(" WK ")\n\t" \ 02123 "vpaddd (" #i "*8)+64+%[K], %%ymm2, %%ymm4\n\t" \ 02124 "vpaddd (" #i "*8)+96+%[K], %%ymm3, %%ymm5\n\t" \ 02125 "vmovdqu %%ymm4, (" #i "*8)+64(" WK ")\n\t" \ 02126 "vmovdqu %%ymm5, (" #i "*8)+96(" WK ")\n\t" 02127 02128 #define SET_W_Y_4(i) \ 02129 _SET_W_Y_4(i) 02130 02131 02132 static const ALIGN32 word64 mSHUF_Y_00BA[] = 02133 { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF, 02134 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 02135 static const ALIGN32 word64 mSHUF_Y_DC00[] = 02136 { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100, 02137 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 02138 static const ALIGN32 word64 mBYTE_FLIP_Y_MASK[] = 02139 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 02140 0x0405060700010203, 0x0c0d0e0f08090a0b }; 02141 02142 #define _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ 02143 "vmovdqa %[FLIP], %" #BYTE_FLIP_MASK "\n\t" \ 02144 "vmovdqa %[SHUF00BA], %" #SHUF_00BA "\n\t" \ 02145 "vmovdqa %[SHUFDC00], %" #SHUF_DC00 "\n\t" 02146 02147 #define INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \ 02148 _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 02149 02150 static const ALIGN32 word32 K256[128] = { 02151 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 02152 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 02153 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 02154 0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 02155 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 02156 0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L, 02157 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, 02158 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L, 02159 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 02160 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 02161 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 02162 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 02163 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 02164 0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 02165 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, 02166 0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L, 02167 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, 02168 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L, 02169 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 02170 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 02171 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 02172 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 02173 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, 02174 0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L, 02175 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, 02176 0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L, 02177 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, 02178 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L, 02179 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 02180 
0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 02181 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L, 02182 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L 02183 }; 02184 02185 SHA256_NOINLINE static int Transform_Sha256_AVX2(wc_Sha256* sha256) 02186 { 02187 __asm__ __volatile__ ( 02188 02189 "subq $512, %%rsp\n\t" 02190 "leaq 32(%[sha256]), %%rax\n\t" 02191 02192 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 02193 LOAD_DIGEST() 02194 02195 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) 02196 02197 "movl %%r9d, " L4 "\n\t" 02198 "movl %%r12d, " L1 "\n\t" 02199 "xorl %%r10d, " L4 "\n\t" 02200 02201 SET_W_Y_4(0) 02202 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 02203 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 02204 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 02205 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 02206 02207 SET_W_Y_4(16) 02208 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 02209 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 02210 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 02211 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 02212 02213 SET_W_Y_4(32) 02214 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 02215 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 02216 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 02217 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 02218 02219 SET_W_Y_4(48) 02220 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 02221 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 02222 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 02223 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 02224 02225 STORE_ADD_DIGEST() 02226 02227 "addq $512, %%rsp\n\t" 02228 02229 : 02230 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 02231 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 02232 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 02233 [sha256] "r" (sha256), 02234 [K] "m" (K256) 02235 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 02236 ); 02237 02238 return 0; 02239 } 02240 02241 SHA256_NOINLINE static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, 02242 word32 len) 02243 { 02244 if ((len & WC_SHA256_BLOCK_SIZE) != 0) { 02245 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE); 02246 Transform_Sha256_AVX2(sha256); 02247 sha256->data += WC_SHA256_BLOCK_SIZE; 02248 len -= WC_SHA256_BLOCK_SIZE; 02249 if (len == 0) 02250 return 0; 02251 } 02252 02253 __asm__ __volatile__ ( 02254 02255 "subq $512, %%rsp\n\t" 02256 "movq 120(%[sha256]), %%rax\n\t" 02257 02258 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 02259 LOAD_DIGEST() 02260 02261 "# Start of loop processing two blocks\n" 02262 "1:\n\t" 02263 02264 LOAD_W_K(BYTE_FLIP_Y_MASK, rax) 02265 02266 "movl %%r9d, " L4 "\n\t" 02267 "movl %%r12d, " L1 "\n\t" 02268 "xorl %%r10d, " L4 "\n\t" 02269 02270 SET_W_Y_4(0) 02271 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 02272 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 02273 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 02274 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 02275 02276 SET_W_Y_4(16) 02277 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 02278 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 02279 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, 
S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 02280 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 02281 02282 SET_W_Y_4(32) 02283 MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 02284 MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 02285 MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 02286 MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 02287 02288 SET_W_Y_4(48) 02289 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 02290 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 02291 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 02292 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 02293 02294 ADD_DIGEST() 02295 STORE_DIGEST() 02296 02297 "movl %%r9d, " L4 "\n\t" 02298 "movl %%r12d, " L1 "\n\t" 02299 "xorl %%r10d, " L4 "\n\t" 02300 02301 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) 02302 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 02303 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) 02304 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) 02305 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) 02306 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) 02307 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) 02308 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) 02309 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) 02310 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) 02311 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) 02312 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) 02313 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) 02314 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) 02315 RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) 02316 RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) 02317 02318 ADD_DIGEST() 02319 02320 "movq 120(%[sha256]), %%rax\n\t" 02321 "addq $128, %%rax\n\t" 02322 "subl $128, %[len]\n\t" 02323 02324 STORE_DIGEST() 02325 02326 "movq %%rax, 120(%[sha256])\n\t" 02327 "jnz 1b\n\t" 02328 02329 "addq $512, %%rsp\n\t" 02330 02331 : 02332 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), 02333 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 02334 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 02335 [sha256] "r" (sha256), 02336 [len] "r" (len), 02337 [K] "m" (K256) 02338 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 02339 ); 02340 02341 return 0; 02342 } 02343 02344 #if defined(HAVE_INTEL_RORX) 02345 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX(wc_Sha256* sha256) 02346 { 02347 __asm__ __volatile__ ( 02348 02349 "subq $512, %%rsp\n\t" 02350 "leaq 32(%[sha256]), %%rax\n\t" 02351 02352 INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 02353 LOAD_W_K_LOW(BYTE_FLIP_MASK, rax) 02354 02355 LOAD_DIGEST() 02356 02357 "movl %%r9d, " L4 "\n\t" 02358 "rorx $6, %%r12d, " L1 "\n\t" 02359 "xorl %%r10d, " L4 "\n\t" 02360 02361 SET_W_Y_4(0) 02362 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 02363 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 02364 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 02365 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 02366 02367 SET_W_Y_4(16) 02368 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 02369 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 02370 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 02371 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, 
S_3, 56) 02372 02373 SET_W_Y_4(32) 02374 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 02375 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 02376 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 02377 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 02378 02379 SET_W_Y_4(48) 02380 "xorl " L3 ", " L3 "\n\t" 02381 "xorl " L2 ", " L2 "\n\t" 02382 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 02383 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 02384 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 02385 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 02386 /* Prev RND: h += Maj(a,b,c) */ 02387 "addl " L3 ", %%r8d\n\t" 02388 02389 STORE_ADD_DIGEST() 02390 02391 "addq $512, %%rsp\n\t" 02392 02393 : 02394 : [FLIP] "m" (mBYTE_FLIP_MASK[0]), 02395 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 02396 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 02397 [sha256] "r" (sha256), 02398 [K] "m" (K256) 02399 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 02400 ); 02401 02402 return 0; 02403 } 02404 02405 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, 02406 word32 len) 02407 { 02408 if ((len & WC_SHA256_BLOCK_SIZE) != 0) { 02409 XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE); 02410 Transform_Sha256_AVX2_RORX(sha256); 02411 sha256->data += WC_SHA256_BLOCK_SIZE; 02412 len -= WC_SHA256_BLOCK_SIZE; 02413 if (len == 0) 02414 return 0; 02415 } 02416 02417 __asm__ __volatile__ ( 02418 02419 "subq $512, %%rsp\n\t" 02420 "movq 120(%[sha256]), %%rax\n\t" 02421 02422 INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00) 02423 LOAD_DIGEST() 02424 02425 "# Start of loop processing two blocks\n" 02426 "1:\n\t" 02427 02428 LOAD_W_K(BYTE_FLIP_Y_MASK, rax) 02429 02430 "movl %%r9d, " L4 "\n\t" 02431 "rorx $6, %%r12d, " L1 "\n\t" 02432 "xorl %%r10d, " L4 "\n\t" 02433 02434 SET_W_Y_4(0) 02435 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 0) 02436 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 8) 02437 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16) 02438 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24) 02439 02440 SET_W_Y_4(16) 02441 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32) 02442 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40) 02443 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48) 02444 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56) 02445 02446 SET_W_Y_4(32) 02447 MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64) 02448 MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72) 02449 MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80) 02450 MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88) 02451 02452 SET_W_Y_4(48) 02453 "xorl " L3 ", " L3 "\n\t" 02454 "xorl " L2 ", " L2 "\n\t" 02455 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 96) 02456 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104) 02457 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112) 02458 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120) 02459 /* Prev RND: h += Maj(a,b,c) */ 02460 "addl " L3 ", %%r8d\n\t" 02461 "xorl " L2 ", " L2 "\n\t" 02462 02463 ADD_DIGEST() 02464 STORE_DIGEST() 02465 02466 "movl %%r9d, " L4 "\n\t" 02467 "xorl " L3 ", " L3 "\n\t" 02468 "xorl 
%%r10d, " L4 "\n\t" 02469 02470 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 4) 02471 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12) 02472 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 20) 02473 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 28) 02474 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 36) 02475 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 44) 02476 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 52) 02477 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 60) 02478 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 68) 02479 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 76) 02480 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 84) 02481 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 92) 02482 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100) 02483 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108) 02484 RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116) 02485 RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124) 02486 /* Prev RND: h += Maj(a,b,c) */ 02487 "addl " L3 ", %%r8d\n\t" 02488 "movq 120(%[sha256]), %%rax\n\t" 02489 02490 ADD_DIGEST() 02491 02492 "addq $128, %%rax\n\t" 02493 "subl $128, %[len]\n\t" 02494 02495 STORE_DIGEST() 02496 02497 "movq %%rax, 120(%[sha256])\n\t" 02498 "jnz 1b\n\t" 02499 02500 "addq $512, %%rsp\n\t" 02501 02502 : 02503 : [FLIP] "m" (mBYTE_FLIP_Y_MASK[0]), 02504 [SHUF00BA] "m" (mSHUF_Y_00BA[0]), 02505 [SHUFDC00] "m" (mSHUF_Y_DC00[0]), 02506 [sha256] "r" (sha256), 02507 [len] "r" (len), 02508 [K] "m" (K256) 02509 : WORK_REGS, STATE_REGS, YMM_REGS, "memory" 02510 ); 02511 02512 return 0; 02513 } 02514 #endif /* HAVE_INTEL_RORX */ 02515 #endif /* HAVE_INTEL_AVX2 */ 02516 02517 02518 #ifdef WOLFSSL_SHA224 02519 02520 #ifdef STM32_HASH_SHA2 02521 02522 /* Supports CubeMX HAL or Standard Peripheral Library */ 02523 02524 int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId) 02525 { 02526 if (sha224 == NULL) 02527 return BAD_FUNC_ARG; 02528 02529 (void)devId; 02530 (void)heap; 02531 02532 wc_Stm32_Hash_Init(&sha224->stmCtx); 02533 return 0; 02534 } 02535 02536 int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len) 02537 { 02538 int ret = 0; 02539 02540 if (sha224 == NULL || (data == NULL && len > 0)) { 02541 return BAD_FUNC_ARG; 02542 } 02543 02544 ret = wolfSSL_CryptHwMutexLock(); 02545 if (ret == 0) { 02546 ret = wc_Stm32_Hash_Update(&sha224->stmCtx, 02547 HASH_AlgoSelection_SHA224, data, len); 02548 wolfSSL_CryptHwMutexUnLock(); 02549 } 02550 return ret; 02551 } 02552 02553 int wc_Sha224Final(wc_Sha224* sha224, byte* hash) 02554 { 02555 int ret = 0; 02556 02557 if (sha224 == NULL || hash == NULL) { 02558 return BAD_FUNC_ARG; 02559 } 02560 02561 ret = wolfSSL_CryptHwMutexLock(); 02562 if (ret == 0) { 02563 ret = wc_Stm32_Hash_Final(&sha224->stmCtx, 02564 HASH_AlgoSelection_SHA224, hash, WC_SHA224_DIGEST_SIZE); 02565 wolfSSL_CryptHwMutexUnLock(); 02566 } 02567 02568 (void)wc_InitSha224(sha224); /* reset state */ 02569 02570 return ret; 02571 } 02572 02573 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH) 02574 /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */ 02575 #else 02576 02577 #define NEED_SOFT_SHA224 02578 02579 02580 static int InitSha224(wc_Sha224* sha224) 02581 { 02582 int ret = 0; 02583 02584 if (sha224 == NULL) { 02585 return BAD_FUNC_ARG; 02586 } 02587 02588 sha224->digest[0] = 0xc1059ed8; 02589 sha224->digest[1] = 0x367cd507; 02590 sha224->digest[2] = 0x3070dd17; 02591 sha224->digest[3] = 0xf70e5939; 02592 
    sha224->digest[4] = 0xffc00b31;
    sha224->digest[5] = 0x68581511;
    sha224->digest[6] = 0x64f98fa7;
    sha224->digest[7] = 0xbefa4fa4;

    sha224->buffLen = 0;
    sha224->loLen   = 0;
    sha224->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    /* choose best Transform function under this runtime environment */
    Sha256_SetTransform();
#endif

    return ret;
}

#endif

#ifdef NEED_SOFT_SHA224
int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
{
    int ret = 0;

    if (sha224 == NULL)
        return BAD_FUNC_ARG;

    sha224->heap = heap;

    ret = InitSha224(sha224);
    if (ret != 0)
        return ret;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    sha224->W = NULL;
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
                        WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}

int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
{
    int ret;

    if (sha224 == NULL || (data == NULL && len > 0)) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha256Update((wc_Sha256*)sha224, data, len);

    return ret;
}

int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
{
    int ret;

    if (sha224 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
                                WC_SHA224_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha256Final((wc_Sha256*)sha224);
    if (ret != 0)
        return ret;

#if defined(LITTLE_ENDIAN_ORDER)
    ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE);
#endif
    XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE);

    return InitSha224(sha224);  /* reset state */
}
#endif /* end of SHA224 software implementation */

int wc_InitSha224(wc_Sha224* sha224)
{
    return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
}

void wc_Sha224Free(wc_Sha224* sha224)
{
    if (sha224 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha224->W != NULL) {
        XFREE(sha224->W, NULL, DYNAMIC_TYPE_RNG);
        sha224->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
#endif /* WOLFSSL_ASYNC_CRYPT */
}
#endif /* WOLFSSL_SHA224 */


int wc_InitSha256(wc_Sha256* sha256)
{
    return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
}

void wc_Sha256Free(wc_Sha256* sha256)
{
    if (sha256 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha256->W != NULL) {
        XFREE(sha256->W, NULL, DYNAMIC_TYPE_RNG);
        sha256->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
    wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* !WOLFSSL_TI_HASH */
#endif /* HAVE_FIPS */


#ifndef WOLFSSL_TI_HASH
#ifdef WOLFSSL_SHA224
int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
{
    int ret;
    wc_Sha224 tmpSha224;

    if (sha224 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha224Copy(sha224, &tmpSha224);
    if (ret == 0) {
        ret = wc_Sha224Final(&tmpSha224, hash);
        wc_Sha224Free(&tmpSha224);
    }
    return ret;
}
int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha224));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
#endif /* WOLFSSL_SHA224 */

int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
{
    int ret;
    wc_Sha256 tmpSha256;

    if (sha256 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha256Copy(sha256, &tmpSha256);
    if (ret == 0) {
        ret = wc_Sha256Final(&tmpSha256, hash);
        wc_Sha256Free(&tmpSha256);
    }
    return ret;
}
int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(wc_Sha256));
#ifdef WOLFSSL_SMALL_STACK_CACHE
    dst->W = NULL;
#endif

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif
#ifdef WOLFSSL_PIC32MZ_HASH
    ret = wc_Pic32HashCopy(&src->cache, &dst->cache);
#endif

    return ret;
}
#endif /* !WOLFSSL_TI_HASH */

#endif /* NO_SHA256 */
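/* Editor's usage sketch (not part of the original source): how the wc_Sha256
 * streaming API implemented in this file is typically driven.  It assumes the
 * same <wolfcrypt/...> include layout used by this port; hash_buffer_sha256 is
 * a hypothetical caller name, and error handling is reduced to returning the
 * first failing code. */
#if 0   /* illustrative example, not compiled with this file */
#include <wolfcrypt/settings.h>
#include <wolfcrypt/sha256.h>

static int hash_buffer_sha256(const byte* msg, word32 msgLen,
                              byte digest[WC_SHA256_DIGEST_SIZE])
{
    wc_Sha256 sha;
    int ret;

    ret = wc_InitSha256(&sha);                      /* set initial hash state */
    if (ret == 0)
        ret = wc_Sha256Update(&sha, msg, msgLen);   /* absorb the message     */
    if (ret == 0)
        ret = wc_Sha256Final(&sha, digest);         /* pad and output digest  */

    wc_Sha256Free(&sha);                            /* release cached/async state */
    return ret;
}
#endif
/* For an intermediate digest without disturbing a running hash, wc_Sha256GetHash()
 * above copies the state with wc_Sha256Copy() and finalizes the copy instead. */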