Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of wolfSSL by
sha256.c
00001 /* sha256.c 00002 * 00003 * Copyright (C) 2006-2016 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 */ 00021 00022 00023 /* code submitted by raphael.huck@efixo.com */ 00024 00025 #ifdef HAVE_CONFIG_H 00026 #include <config.h> 00027 #endif 00028 00029 #include <wolfssl/wolfcrypt/settings.h> 00030 00031 #if !defined(NO_SHA256) 00032 00033 #include <wolfssl/wolfcrypt/sha256.h> 00034 #include <wolfssl/wolfcrypt/error-crypt.h> 00035 00036 /* fips wrapper calls, user can call direct */ 00037 #ifdef HAVE_FIPS 00038 00039 int wc_InitSha256(Sha256* sha) 00040 { 00041 return InitSha256_fips(sha); 00042 } 00043 int wc_InitSha256_ex(Sha256* sha, void* heap, int devId) 00044 { 00045 (void)heap; 00046 (void)devId; 00047 return InitSha256_fips(sha); 00048 } 00049 int wc_Sha256Update(Sha256* sha, const byte* data, word32 len) 00050 { 00051 return Sha256Update_fips(sha, data, len); 00052 } 00053 int wc_Sha256Final(Sha256* sha, byte* out) 00054 { 00055 return Sha256Final_fips(sha, out); 00056 } 00057 void wc_Sha256Free(Sha256* sha) 00058 { 00059 (void)sha; 00060 /* Not supported in FIPS */ 00061 } 00062 00063 #else /* else build without fips */ 00064 00065 00066 #if defined(WOLFSSL_TI_HASH) 00067 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */ 00068 #else 00069 00070 #include <wolfssl/wolfcrypt/logging.h> 00071 00072 #ifdef NO_INLINE 00073 #include <wolfssl/wolfcrypt/misc.h> 00074 #else 00075 #define WOLFSSL_MISC_INCLUDED 00076 #include <wolfcrypt/src/misc.c> 00077 #endif 00078 00079 00080 #if defined(USE_INTEL_SPEEDUP) 00081 #define HAVE_INTEL_AVX1 00082 #define HAVE_INTEL_AVX2 00083 #endif /* USE_INTEL_SPEEDUP */ 00084 00085 #if defined(HAVE_INTEL_AVX2) 00086 #define HAVE_INTEL_RORX 00087 #endif 00088 00089 00090 static int InitSha256(Sha256* sha256) 00091 { 00092 int ret = 0; 00093 00094 if (sha256 == NULL) 00095 return BAD_FUNC_ARG; 00096 00097 sha256->digest[0] = 0x6A09E667L; 00098 sha256->digest[1] = 0xBB67AE85L; 00099 sha256->digest[2] = 0x3C6EF372L; 00100 sha256->digest[3] = 0xA54FF53AL; 00101 sha256->digest[4] = 0x510E527FL; 00102 sha256->digest[5] = 0x9B05688CL; 00103 sha256->digest[6] = 0x1F83D9ABL; 00104 sha256->digest[7] = 0x5BE0CD19L; 00105 00106 sha256->buffLen = 0; 00107 sha256->loLen = 0; 00108 sha256->hiLen = 0; 00109 00110 return ret; 00111 } 00112 00113 00114 /* Hardware Acceleration */ 00115 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00116 00117 /* in case intel instructions aren't available, plus we need the K[] global */ 00118 #define NEED_SOFT_SHA256 00119 00120 /***** 00121 Intel AVX1/AVX2 Macro Control Structure 00122 00123 #define HAVE_INTEL_AVX1 00124 #define HAVE_INTEL_AVX2 00125 00126 #define HAVE_INTEL_RORX 00127 00128 00129 int InitSha256(Sha256* 
sha256) { 00130 Save/Recover XMM, YMM 00131 ... 00132 } 00133 00134 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 00135 Transform(); Function prototype 00136 #else 00137 Transform() { } 00138 int Sha256Final() { 00139 Save/Recover XMM, YMM 00140 ... 00141 } 00142 #endif 00143 00144 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) 00145 #if defined(HAVE_INTEL_RORX 00146 #define RND with rorx instuction 00147 #else 00148 #define RND 00149 #endif 00150 #endif 00151 00152 #if defined(HAVE_INTEL_AVX1) 00153 00154 #define XMM Instructions/inline asm 00155 00156 int Transform() { 00157 Stitched Message Sched/Round 00158 } 00159 00160 #elif defined(HAVE_INTEL_AVX2) 00161 00162 #define YMM Instructions/inline asm 00163 00164 int Transform() { 00165 More granural Stitched Message Sched/Round 00166 } 00167 00168 */ 00169 00170 /* Each platform needs to query info type 1 from cpuid to see if aesni is 00171 * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts 00172 */ 00173 00174 #ifndef _MSC_VER 00175 #define cpuid(reg, leaf, sub)\ 00176 __asm__ __volatile__ ("cpuid":\ 00177 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ 00178 "a" (leaf), "c"(sub)); 00179 00180 #define XASM_LINK(f) asm(f) 00181 #else 00182 #include <intrin.h> 00183 #define cpuid(a,b) __cpuid((int*)a,b) 00184 00185 #define XASM_LINK(f) 00186 #endif /* _MSC_VER */ 00187 00188 #define EAX 0 00189 #define EBX 1 00190 #define ECX 2 00191 #define EDX 3 00192 00193 #define CPUID_AVX1 0x1 00194 #define CPUID_AVX2 0x2 00195 #define CPUID_RDRAND 0x4 00196 #define CPUID_RDSEED 0x8 00197 #define CPUID_BMI2 0x10 /* MULX, RORX */ 00198 00199 #define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1) 00200 #define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2) 00201 #define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2) 00202 #define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND) 00203 #define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED) 00204 00205 static word32 cpuid_check = 0; 00206 static word32 cpuid_flags = 0; 00207 00208 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { 00209 int got_intel_cpu=0; 00210 unsigned int reg[5]; 00211 00212 reg[4] = '\0'; 00213 cpuid(reg, 0, 0); 00214 if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 && 00215 XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 && 00216 XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) { 00217 got_intel_cpu = 1; 00218 } 00219 if (got_intel_cpu) { 00220 cpuid(reg, leaf, sub); 00221 return ((reg[num] >> bit) & 0x1); 00222 } 00223 return 0; 00224 } 00225 00226 static int set_cpuid_flags(void) { 00227 if (cpuid_check==0) { 00228 if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; } 00229 if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; } 00230 if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; } 00231 if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; } 00232 if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; } 00233 cpuid_check = 1; 00234 return 0; 00235 } 00236 return 1; 00237 } 00238 00239 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */ 00240 static int Transform(Sha256* sha256); 00241 #if defined(HAVE_INTEL_AVX1) 00242 static int Transform_AVX1(Sha256 *sha256); 00243 #endif 00244 #if defined(HAVE_INTEL_AVX2) 00245 static int Transform_AVX2(Sha256 *sha256); 00246 static int Transform_AVX1_RORX(Sha256 *sha256); 00247 #endif 00248 static int (*Transform_p)(Sha256* sha256) /* = _Transform */; 00249 #define XTRANSFORM(sha256, B) (*Transform_p)(sha256) 00250 00251 static void 
set_Transform(void) { 00252 if (set_cpuid_flags()) return; 00253 00254 #if defined(HAVE_INTEL_AVX2) 00255 if (IS_INTEL_AVX2 && IS_INTEL_BMI2) { 00256 Transform_p = Transform_AVX1_RORX; return; 00257 Transform_p = Transform_AVX2; 00258 /* for avoiding warning,"not used" */ 00259 } 00260 #endif 00261 #if defined(HAVE_INTEL_AVX1) 00262 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return; 00263 #endif 00264 Transform_p = Transform; return; 00265 } 00266 00267 /* Dummy for saving MM_REGs on behalf of Transform */ 00268 #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1) 00269 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ 00270 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15") 00271 #elif defined(HAVE_INTEL_AVX1) 00272 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\ 00273 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\ 00274 "xmm11","xmm12","xmm13","xmm14","xmm15") 00275 #endif 00276 00277 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId) 00278 { 00279 int ret = 0; 00280 if (sha256 == NULL) 00281 return BAD_FUNC_ARG; 00282 00283 sha256->heap = heap; 00284 00285 ret = InitSha256(sha256); 00286 if (ret != 0) 00287 return ret; 00288 00289 /* choose best Transform function under this runtime environment */ 00290 set_Transform(); 00291 00292 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00293 ret = wolfAsync_DevCtxInit(&sha256->asyncDev, 00294 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId); 00295 #else 00296 (void)devId; 00297 #endif /* WOLFSSL_ASYNC_CRYPT */ 00298 00299 return ret; 00300 } 00301 00302 #elif defined(FREESCALE_LTC_SHA) 00303 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId) 00304 { 00305 (void)heap; 00306 (void)devId; 00307 00308 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0); 00309 00310 return 0; 00311 } 00312 00313 #elif defined(FREESCALE_MMCAU_SHA) 00314 #include "fsl_mmcau.h" 00315 #define XTRANSFORM(sha256, B) Transform(sha256, B) 00316 00317 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId) 00318 { 00319 int ret = 0; 00320 00321 (void)heap; 00322 (void)devId; 00323 00324 ret = wolfSSL_CryptHwMutexLock(); 00325 if (ret != 0) { 00326 return ret; 00327 } 00328 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest); 00329 wolfSSL_CryptHwMutexUnLock(); 00330 00331 sha256->buffLen = 0; 00332 sha256->loLen = 0; 00333 sha256->hiLen = 0; 00334 00335 return ret; 00336 } 00337 00338 static int Transform(Sha256* sha256, byte* buf) 00339 { 00340 int ret = wolfSSL_CryptHwMutexLock(); 00341 if (ret == 0) { 00342 MMCAU_SHA256_HashN(buf, 1, sha256->digest); 00343 wolfSSL_CryptHwMutexUnLock(); 00344 } 00345 return ret; 00346 } 00347 00348 #elif defined(WOLFSSL_PIC32MZ_HASH) 00349 #define NEED_SOFT_SHA256 00350 00351 #define wc_InitSha256 wc_InitSha256_sw 00352 #define wc_Sha256Update wc_Sha256Update_sw 00353 #define wc_Sha256Final wc_Sha256Final_sw 00354 00355 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId) 00356 { 00357 if (sha256 == NULL) 00358 return BAD_FUNC_ARG; 00359 00360 sha256->heap = heap; 00361 00362 return InitSha256(sha256); 00363 } 00364 00365 #else 00366 #define NEED_SOFT_SHA256 00367 00368 #define XTRANSFORM(sha256, B) Transform(sha256) 00369 00370 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId) 00371 { 00372 int ret = 0; 00373 if (sha256 == NULL) 00374 return BAD_FUNC_ARG; 00375 00376 sha256->heap = heap; 00377 00378 ret = InitSha256(sha256); 00379 if 
(ret != 0) 00380 return ret; 00381 00382 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00383 ret = wolfAsync_DevCtxInit(&sha256->asyncDev, 00384 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId); 00385 #else 00386 (void)devId; 00387 #endif /* WOLFSSL_ASYNC_CRYPT */ 00388 00389 return ret; 00390 } 00391 #endif /* End Hardware Acceleration */ 00392 00393 #ifndef SAVE_XMM_YMM 00394 #define SAVE_XMM_YMM 00395 #endif 00396 00397 #ifdef NEED_SOFT_SHA256 00398 00399 static const ALIGN32 word32 K[64] = { 00400 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL, 00401 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L, 00402 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 00403 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL, 00404 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L, 00405 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L, 00406 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 00407 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L, 00408 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L, 00409 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L, 00410 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 00411 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 00412 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L 00413 }; 00414 00415 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) 00416 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) 00417 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n)) 00418 00419 #define S(x, n) rotrFixed(x, n) 00420 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) 00421 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) 00422 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) 00423 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) 00424 00425 #define RND(a,b,c,d,e,f,g,h,i) \ 00426 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ 00427 t1 = Sigma0((a)) + Maj((a), (b), (c)); \ 00428 (d) += t0; \ 00429 (h) = t0 + t1; 00430 00431 static int Transform(Sha256* sha256) 00432 { 00433 word32 S[8], t0, t1; 00434 int i; 00435 00436 #ifdef WOLFSSL_SMALL_STACK 00437 word32* W; 00438 00439 W = (word32*)XMALLOC(sizeof(word32) * SHA256_BLOCK_SIZE, NULL, 00440 DYNAMIC_TYPE_TMP_BUFFER); 00441 if (W == NULL) 00442 return MEMORY_E; 00443 #else 00444 word32 W[SHA256_BLOCK_SIZE]; 00445 #endif 00446 00447 /* Copy context->state[] to working vars */ 00448 for (i = 0; i < 8; i++) 00449 S[i] = sha256->digest[i]; 00450 00451 for (i = 0; i < 16; i++) 00452 W[i] = sha256->buffer[i]; 00453 00454 for (i = 16; i < SHA256_BLOCK_SIZE; i++) 00455 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; 00456 00457 for (i = 0; i < SHA256_BLOCK_SIZE; i += 8) { 00458 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0); 00459 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1); 00460 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2); 00461 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3); 00462 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4); 00463 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5); 00464 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6); 00465 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7); 00466 } 00467 00468 /* Add the working vars back into digest state[] */ 00469 for (i = 0; i < 8; i++) { 00470 sha256->digest[i] += S[i]; 00471 } 00472 00473 #ifdef WOLFSSL_SMALL_STACK 00474 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00475 #endif 00476 00477 return 0; 00478 } 00479 #endif 00480 /* 
End wc_ software implementation */ 00481 00482 00483 #ifdef XTRANSFORM 00484 00485 static INLINE void AddLength(Sha256* sha256, word32 len) 00486 { 00487 word32 tmp = sha256->loLen; 00488 if ( (sha256->loLen += len) < tmp) 00489 sha256->hiLen++; /* carry low to high */ 00490 } 00491 00492 static INLINE int Sha256Update(Sha256* sha256, const byte* data, word32 len) 00493 { 00494 int ret = 0; 00495 byte* local; 00496 00497 if (sha256 == NULL || (data == NULL && len > 0)) { 00498 return BAD_FUNC_ARG; 00499 } 00500 00501 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00502 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) { 00503 #if defined(HAVE_INTEL_QA) 00504 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len); 00505 #endif 00506 } 00507 #endif /* WOLFSSL_ASYNC_CRYPT */ 00508 00509 /* do block size increments */ 00510 local = (byte*)sha256->buffer; 00511 00512 /* check that internal buffLen is valid */ 00513 if (sha256->buffLen >= SHA256_BLOCK_SIZE) 00514 return BUFFER_E; 00515 00516 SAVE_XMM_YMM; /* for Intel AVX */ 00517 00518 while (len) { 00519 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); 00520 XMEMCPY(&local[sha256->buffLen], data, add); 00521 00522 sha256->buffLen += add; 00523 data += add; 00524 len -= add; 00525 00526 if (sha256->buffLen == SHA256_BLOCK_SIZE) { 00527 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00528 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00529 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00530 #endif 00531 { 00532 ByteReverseWords(sha256->buffer, sha256->buffer, 00533 SHA256_BLOCK_SIZE); 00534 } 00535 #endif 00536 ret = XTRANSFORM(sha256, local); 00537 if (ret != 0) { 00538 break; 00539 } 00540 00541 AddLength(sha256, SHA256_BLOCK_SIZE); 00542 sha256->buffLen = 0; 00543 } 00544 } 00545 00546 return ret; 00547 } 00548 00549 int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) 00550 { 00551 return Sha256Update(sha256, data, len); 00552 } 00553 00554 static INLINE int Sha256Final(Sha256* sha256) 00555 { 00556 int ret; 00557 byte* local = (byte*)sha256->buffer; 00558 00559 SAVE_XMM_YMM; /* for Intel AVX */ 00560 00561 AddLength(sha256, sha256->buffLen); /* before adding pads */ 00562 local[sha256->buffLen++] = 0x80; /* add 1 */ 00563 00564 /* pad with zeros */ 00565 if (sha256->buffLen > SHA256_PAD_SIZE) { 00566 XMEMSET(&local[sha256->buffLen], 0, 00567 SHA256_BLOCK_SIZE - sha256->buffLen); 00568 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; 00569 00570 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00571 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00572 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00573 #endif 00574 { 00575 ByteReverseWords(sha256->buffer, sha256->buffer, 00576 SHA256_BLOCK_SIZE); 00577 } 00578 #endif 00579 00580 ret = XTRANSFORM(sha256, local); 00581 if (ret != 0) 00582 return ret; 00583 00584 sha256->buffLen = 0; 00585 } 00586 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen); 00587 00588 /* put lengths in bits */ 00589 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) + 00590 (sha256->hiLen << 3); 00591 sha256->loLen = sha256->loLen << 3; 00592 00593 /* store lengths */ 00594 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) 00595 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00596 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00597 #endif 00598 { 00599 ByteReverseWords(sha256->buffer, sha256->buffer, 00600 SHA256_BLOCK_SIZE); 00601 } 00602 #endif 00603 /* ! 
length ordering dependent on digest endian type ! */ 00604 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); 00605 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, 00606 sizeof(word32)); 00607 00608 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \ 00609 defined(HAVE_INTEL_AVX2) 00610 /* Kinetis requires only these bytes reversed */ 00611 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00612 if (IS_INTEL_AVX1 || IS_INTEL_AVX2) 00613 #endif 00614 { 00615 ByteReverseWords( 00616 &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)], 00617 &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)], 00618 2 * sizeof(word32)); 00619 } 00620 #endif 00621 00622 return XTRANSFORM(sha256, local); 00623 } 00624 00625 int wc_Sha256Final(Sha256* sha256, byte* hash) 00626 { 00627 int ret; 00628 00629 if (sha256 == NULL || hash == NULL) { 00630 return BAD_FUNC_ARG; 00631 } 00632 00633 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256) 00634 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) { 00635 #if defined(HAVE_INTEL_QA) 00636 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL, 00637 SHA256_DIGEST_SIZE); 00638 #endif 00639 } 00640 #endif /* WOLFSSL_ASYNC_CRYPT */ 00641 00642 ret = Sha256Final(sha256); 00643 if (ret != 0) 00644 return ret; 00645 00646 #if defined(LITTLE_ENDIAN_ORDER) 00647 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE); 00648 #endif 00649 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE); 00650 00651 return InitSha256(sha256); /* reset state */ 00652 } 00653 00654 #endif /* XTRANSFORM */ 00655 00656 00657 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00658 00659 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00660 { word32 d;\ 00661 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs);\ 00662 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs);\ 00663 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs);\ 00664 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs);\ 00665 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs);\ 00666 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs);\ 00667 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs);\ 00668 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs);\ 00669 } 00670 00671 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00672 { word32 d; \ 00673 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs); sha256->digest[0] += d;\ 00674 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs); sha256->digest[1] += d;\ 00675 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs); sha256->digest[2] += d;\ 00676 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs); sha256->digest[3] += d;\ 00677 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs); sha256->digest[4] += d;\ 00678 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs); sha256->digest[5] += d;\ 00679 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs); sha256->digest[6] += d;\ 00680 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs); sha256->digest[7] += d;\ 00681 } 00682 00683 00684 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00685 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 00686 00687 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00688 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 00689 00690 00691 #define 
S_0 %r15d 00692 #define S_1 %r10d 00693 #define S_2 %r11d 00694 #define S_3 %r12d 00695 #define S_4 %r13d 00696 #define S_5 %r14d 00697 #define S_6 %ebx 00698 #define S_7 %r9d 00699 00700 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" 00701 00702 #if defined(HAVE_INTEL_RORX) 00703 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ 00704 __asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ 00705 00706 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\ 00707 __asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ 00708 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ 00709 __asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ 00710 00711 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\ 00712 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ 00713 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ 00714 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ 00715 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ 00716 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ 00717 00718 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\ 00719 /*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ 00720 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ 00721 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ 00722 __asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ 00723 __asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\ 00724 00725 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\ 00726 __asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ 00727 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\ 00728 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\ 00729 00730 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\ 00731 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ 00732 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ 00733 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\ 00734 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ 00735 00736 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\ 00737 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ 00738 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ 00739 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ 00740 00741 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\ 00742 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ 00743 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ 00744 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \ 00745 __asm__ volatile("movl %r8d, "#h"\n\t"); 00746 #endif /* HAVE_INTEL_RORX */ 00747 00748 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\ 00749 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\ 00750 __asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ 00751 __asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\ 00752 00753 #define 
RND_STEP_2(a,b,c,d,e,f,g,h,i)\ 00754 __asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ 00755 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ 00756 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\ 00757 __asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ 00758 00759 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\ 00760 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ 00761 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ 00762 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ 00763 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ 00764 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ 00765 00766 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\ 00767 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ 00768 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ 00769 __asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\ 00770 __asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ 00771 __asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\ 00772 __asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\ 00773 __asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\ 00774 00775 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\ 00776 __asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ 00777 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\ 00778 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\ 00779 00780 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\ 00781 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ 00782 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ 00783 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\ 00784 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ 00785 00786 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\ 00787 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ 00788 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ 00789 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ 00790 00791 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\ 00792 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ 00793 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ 00794 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ 00795 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ 00796 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ 00797 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \ 00798 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 00799 00800 #define RND_X(a,b,c,d,e,f,g,h,i) \ 00801 RND_STEP_1(a,b,c,d,e,f,g,h,i); \ 00802 RND_STEP_2(a,b,c,d,e,f,g,h,i); \ 00803 RND_STEP_3(a,b,c,d,e,f,g,h,i); \ 00804 RND_STEP_4(a,b,c,d,e,f,g,h,i); \ 00805 RND_STEP_5(a,b,c,d,e,f,g,h,i); \ 00806 RND_STEP_6(a,b,c,d,e,f,g,h,i); \ 00807 RND_STEP_7(a,b,c,d,e,f,g,h,i); \ 00808 RND_STEP_8(a,b,c,d,e,f,g,h,i); 00809 00810 #define 
RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00811 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00812 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00813 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00814 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00815 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00816 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00817 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00818 00819 00820 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\ 00821 RND_STEP_1(a,b,c,d,e,f,g,h,i); \ 00822 RND_STEP_2(a,b,c,d,e,f,g,h,i); \ 00823 RND_STEP_3(a,b,c,d,e,f,g,h,i); \ 00824 } 00825 00826 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\ 00827 RND_STEP_4(a,b,c,d,e,f,g,h,i); \ 00828 RND_STEP_5(a,b,c,d,e,f,g,h,i); \ 00829 RND_STEP_6(a,b,c,d,e,f,g,h,i); \ 00830 } 00831 00832 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\ 00833 RND_STEP_7(a,b,c,d,e,f,g,h,i); \ 00834 RND_STEP_8(a,b,c,d,e,f,g,h,i); \ 00835 } 00836 00837 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00838 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00839 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00840 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00841 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00842 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00843 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00844 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00845 00846 00847 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00848 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00849 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00850 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00851 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00852 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00853 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00854 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00855 00856 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00857 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00858 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00859 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00860 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00861 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00862 #define 
RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00863 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00864 00865 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00866 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00867 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00868 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00869 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00870 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00871 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00872 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00873 00874 #define FOR(cnt, init, max, inc, loop) \ 00875 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) 00876 #define END(cnt, init, max, inc, loop) \ 00877 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::); 00878 00879 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ 00880 00881 #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ 00882 00883 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs) 00884 #define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs) 00885 #define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs) 00886 #define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs) 00887 #define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs) 00888 #define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs) 00889 #define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs) 00890 #define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs) 00891 #define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs) 00892 00893 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ 00894 a,b,c,d,e,f,g,h,_i)\ 00895 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\ 00896 VPALIGNR (XTMP0, X3, X2, 4);\ 00897 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\ 00898 VPADDD (XTMP0, XTMP0, X0);\ 00899 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\ 00900 VPALIGNR (XTMP1, X1, X0, 4); /* XTMP1 = W[-15] */\ 00901 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\ 00902 VPSRLD (XTMP2, XTMP1, 7);\ 00903 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\ 00904 VPSLLD (XTMP3, XTMP1, 25); /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 00905 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\ 00906 VPOR (XTMP3, XTMP3, XTMP2); /* XTMP1 = W[-15] MY_ROR 7 */\ 00907 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\ 00908 VPSRLD (XTMP2, XTMP1,18);\ 00909 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\ 00910 \ 00911 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\ 00912 VPSRLD (XTMP4, XTMP1, 3); /* XTMP4 = W[-15] >> 3 */\ 00913 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\ 00914 VPSLLD (XTMP1, XTMP1, 14); /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 00915 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\ 00916 VPXOR (XTMP3, XTMP3, XTMP1);\ 00917 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\ 00918 VPXOR (XTMP3, XTMP3, XTMP2); /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 00919 
RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\ 00920 VPXOR (XTMP1, XTMP3, XTMP4); /* XTMP1 = s0 */\ 00921 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\ 00922 VPSHUFD(XTMP2, X3, 0b11111010); /* XTMP2 = W[-2] {BBAA}*/\ 00923 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\ 00924 VPADDD (XTMP0, XTMP0, XTMP1); /* XTMP0 = W[-16] + W[-7] + s0 */\ 00925 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\ 00926 \ 00927 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\ 00928 VPSRLD (XTMP4, XTMP2, 10); /* XTMP4 = W[-2] >> 10 {BBAA} */\ 00929 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\ 00930 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 00931 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\ 00932 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 00933 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\ 00934 VPXOR (XTMP2, XTMP2, XTMP3);\ 00935 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\ 00936 VPXOR (XTMP4, XTMP4, XTMP2); /* XTMP4 = s1 {xBxA} */\ 00937 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\ 00938 VPSHUFB (XTMP4, XTMP4, SHUF_00BA); /* XTMP4 = s1 {00BA} */\ 00939 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\ 00940 VPADDD (XTMP0, XTMP0, XTMP4); /* XTMP0 = {..., ..., W[1], W[0]} */\ 00941 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\ 00942 \ 00943 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\ 00944 VPSHUFD (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\ 00945 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\ 00946 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ 00947 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\ 00948 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 00949 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\ 00950 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 00951 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\ 00952 VPXOR (XTMP2, XTMP2, XTMP3);\ 00953 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\ 00954 VPXOR (XTMP5, XTMP5, XTMP2); /* XTMP5 = s1 {xDxC} */\ 00955 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\ 00956 VPSHUFB (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\ 00957 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\ 00958 VPADDD (X0, XTMP5, XTMP0); /* X0 = {W[3], W[2], W[1], W[0]} */\ 00959 00960 #if defined(HAVE_INTEL_RORX) 00961 00962 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ 00963 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ 00964 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\ 00965 VPALIGNR (XTMP0, X3, X2, 4);\ 00966 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\ 00967 VPADDD (XTMP0, XTMP0, X0);\ 00968 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\ 00969 VPALIGNR (XTMP1, X1, X0, 4); /* XTMP1 = W[-15] */\ 00970 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\ 00971 VPSRLD (XTMP2, XTMP1, 7);\ 00972 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\ 00973 VPSLLD (XTMP3, XTMP1, 25); /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 00974 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\ 00975 VPOR (XTMP3, XTMP3, XTMP2); /* XTMP1 = W[-15] MY_ROR 7 */\ 00976 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\ 00977 VPSRLD (XTMP2, XTMP1,18);\ 00978 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\ 00979 \ 00980 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\ 00981 VPSRLD (XTMP4, XTMP1, 3); /* XTMP4 = W[-15] >> 3 */\ 00982 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\ 00983 VPSLLD (XTMP1, XTMP1, 14); /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 00984 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\ 00985 VPXOR (XTMP3, XTMP3, XTMP1);\ 00986 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\ 00987 VPXOR (XTMP3, XTMP3, XTMP2); /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 00988 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\ 00989 VPXOR (XTMP1, XTMP3, XTMP4); /* XTMP1 = s0 */\ 00990 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\ 00991 VPSHUFD(XTMP2, X3, 0b11111010); /* XTMP2 = W[-2] {BBAA}*/\ 00992 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\ 00993 
VPADDD (XTMP0, XTMP0, XTMP1); /* XTMP0 = W[-16] + W[-7] + s0 */\ 00994 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\ 00995 \ 00996 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\ 00997 VPSRLD (XTMP4, XTMP2, 10); /* XTMP4 = W[-2] >> 10 {BBAA} */\ 00998 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\ 00999 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 01000 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\ 01001 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 01002 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\ 01003 VPXOR (XTMP2, XTMP2, XTMP3);\ 01004 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\ 01005 VPXOR (XTMP4, XTMP4, XTMP2); /* XTMP4 = s1 {xBxA} */\ 01006 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\ 01007 VPSHUFB (XTMP4, XTMP4, SHUF_00BA); /* XTMP4 = s1 {00BA} */\ 01008 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\ 01009 VPADDD (XTMP0, XTMP0, XTMP4); /* XTMP0 = {..., ..., W[1], W[0]} */\ 01010 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\ 01011 \ 01012 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\ 01013 VPSHUFD (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\ 01014 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\ 01015 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ 01016 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\ 01017 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 01018 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\ 01019 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 01020 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\ 01021 VPXOR (XTMP2, XTMP2, XTMP3);\ 01022 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\ 01023 VPXOR (XTMP5, XTMP5, XTMP2); /* XTMP5 = s1 {xDxC} */\ 01024 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\ 01025 VPSHUFB (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\ 01026 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\ 01027 VPADDD (X0, XTMP5, XTMP0); /* X0 = {W[3], W[2], W[1], W[0]} */\ 01028 01029 #endif /* HAVE_INTEL_RORX */ 01030 01031 01032 #define W_K_from_buff\ 01033 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\ 01034 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\ 01035 :: "m"(sha256->buffer[0]):"%xmm4");\ 01036 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\ 01037 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\ 01038 ::"m"(sha256->buffer[4]):"%xmm5");\ 01039 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\ 01040 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\ 01041 ::"m"(sha256->buffer[8]):"%xmm6");\ 01042 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\ 01043 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\ 01044 ::"m"(sha256->buffer[12]):"%xmm7");\ 01045 01046 #define _SET_W_K_XFER(reg, i)\ 01047 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs);\ 01048 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs); 01049 01050 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) 01051 01052 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */ 01053 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */ 01054 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b }; 01055 01056 01057 #define _Init_Masks(mask1, mask2, mask3)\ 01058 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0]));\ 01059 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0]));\ 01060 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])); 01061 01062 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ 01063 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 01064 01065 #define X0 %xmm4 01066 #define X1 %xmm5 01067 #define X2 %xmm6 01068 #define X3 %xmm7 01069 #define X_ 
X0 01070 01071 #define XTMP0 %xmm0 01072 #define XTMP1 %xmm1 01073 #define XTMP2 %xmm2 01074 #define XTMP3 %xmm3 01075 #define XTMP4 %xmm8 01076 #define XTMP5 %xmm9 01077 #define XFER %xmm10 01078 01079 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ 01080 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ 01081 #define BYTE_FLIP_MASK %xmm13 01082 01083 #define XMM_REGs /* Registers are saved in Sha256Update/Finel */ 01084 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */ 01085 01086 static int Transform_AVX1(Sha256* sha256) 01087 { 01088 ALIGN32 word32 W_K[64]; /* temp for W+K */ 01089 01090 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00); 01091 W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */ 01092 01093 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7); 01094 01095 SET_W_K_XFER(X0, 0); 01096 01097 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01098 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0); 01099 SET_W_K_XFER(X1, 4); 01100 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01101 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4); 01102 SET_W_K_XFER(X2, 8); 01103 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01104 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8); 01105 SET_W_K_XFER(X3, 12); 01106 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01107 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12); 01108 SET_W_K_XFER(X0, 16); 01109 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01110 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16); 01111 SET_W_K_XFER(X1, 20); 01112 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01113 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20); 01114 SET_W_K_XFER(X2, 24); 01115 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01116 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24); 01117 SET_W_K_XFER(X3, 28); 01118 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01119 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28); 01120 SET_W_K_XFER(X0, 32); 01121 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01122 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32); 01123 SET_W_K_XFER(X1, 36); 01124 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01125 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36); 01126 SET_W_K_XFER(X2, 40); 01127 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01128 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40); 01129 SET_W_K_XFER(X3, 44); 01130 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01131 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44); 01132 01133 SET_W_K_XFER(X0, 48); 01134 SET_W_K_XFER(X1, 52); 01135 SET_W_K_XFER(X2, 56); 01136 SET_W_K_XFER(X3, 60); 01137 01138 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48); 01139 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49); 01140 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50); 01141 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51); 01142 01143 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52); 01144 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53); 01145 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54); 01146 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55); 01147 01148 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56); 01149 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57); 01150 
RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58); 01151 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59); 01152 01153 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60); 01154 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61); 01155 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62); 01156 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63); 01157 01158 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7); 01159 01160 return 0; 01161 } 01162 01163 #if defined(HAVE_INTEL_RORX) 01164 static int Transform_AVX1_RORX(Sha256* sha256) 01165 { 01166 ALIGN32 word32 W_K[64]; /* temp for W+K */ 01167 01168 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00); 01169 W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */ 01170 01171 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7); 01172 SET_W_K_XFER(X0, 0); 01173 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01174 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0); 01175 SET_W_K_XFER(X1, 4); 01176 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01177 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4); 01178 SET_W_K_XFER(X2, 8); 01179 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01180 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8); 01181 SET_W_K_XFER(X3, 12); 01182 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01183 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12); 01184 SET_W_K_XFER(X0, 16); 01185 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01186 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16); 01187 SET_W_K_XFER(X1, 20); 01188 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01189 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20); 01190 SET_W_K_XFER(X2, 24); 01191 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01192 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24); 01193 SET_W_K_XFER(X3, 28); 01194 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01195 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28); 01196 SET_W_K_XFER(X0, 32); 01197 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01198 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32); 01199 SET_W_K_XFER(X1, 36); 01200 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01201 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36); 01202 SET_W_K_XFER(X2, 40); 01203 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01204 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40); 01205 SET_W_K_XFER(X3, 44); 01206 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01207 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44); 01208 01209 SET_W_K_XFER(X0, 48); 01210 SET_W_K_XFER(X1, 52); 01211 SET_W_K_XFER(X2, 56); 01212 SET_W_K_XFER(X3, 60); 01213 01214 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48); 01215 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49); 01216 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50); 01217 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51); 01218 01219 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52); 01220 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53); 01221 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54); 01222 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55); 01223 01224 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56); 01225 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57); 01226 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58); 01227 
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);

    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);

    return 0;
}
#endif   /* HAVE_INTEL_RORX */
#endif   /* HAVE_INTEL_AVX1 */


#if defined(HAVE_INTEL_AVX2)

#define _MOVE_to_REG(ymm, mem)   __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs);
#define _MOVE_to_MEM(mem, ymm)   __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs);
#define _BYTE_SWAP(ymm, map)     __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
                                     :: "m"(map):YMM_REGs);
#define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
                                     #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
#define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
                                     #ymm0"\n\t":: "m"(map):YMM_REGs);
#define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\
        #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
        #temp",%%"#dest", %%"#dest" ":::YMM_REGs);
#define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\
        #src", %%"#dest" ":::YMM_REGs);
#define _XOR(dest, src1, src2)   __asm__ volatile("vpxor %%"#src1", %%"\
        #src2", %%"#dest" ":::YMM_REGs);
#define _OR(dest, src1, src2)    __asm__ volatile("vpor %%"#src1", %%"\
        #src2", %%"#dest" ":::YMM_REGs);
#define _ADD(dest, src1, src2)   __asm__ volatile("vpaddd %%"#src1", %%"\
        #src2", %%"#dest" ":::YMM_REGs);
#define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
        #dest" "::"m"(mem):YMM_REGs);
#define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
        #src1", %%"#src2", %%"#dest" ":::YMM_REGs);

#define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_4(ymm, xmm, mem)\
    __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);\
    __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
#define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);

#define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);
#define SWAP_YMM_HL(ymm)  _SWAP_YMM_HL(ymm)

#define MOVE_to_REG(ymm, mem)        _MOVE_to_REG(ymm, mem)
#define MOVE_to_MEM(mem, ymm)        _MOVE_to_MEM(mem, ymm)
#define BYTE_SWAP(ymm, map)          _BYTE_SWAP(ymm, map)
#define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
#define MOVE_BYTE(ymm0, ymm1, map)   _MOVE_BYTE(ymm0, ymm1, map)
#define XOR(dest, src1, src2)        _XOR(dest, src1, src2)
#define OR(dest, src1, src2)         _OR(dest, src1, src2)
#define ADD(dest, src1, src2)        _ADD(dest, src1, src2)
#define ADD_MEM(dest, src1, mem)     _ADD_MEM(dest, src1, mem)
#define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)

#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
#define AVX2_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
#define AVX2_R(dest, src, bits)      _AVX2_R(dest, src, bits)

#define GAMMA0(dest, src)   AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \
    XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest);
#define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18);
#define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3); \
    XOR(dest, G_TEMP, dest);

#define GAMMA1(dest, src)   AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
    XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest);
#define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
#define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); \
    XOR(dest, G_TEMP, dest);

#define FEEDBACK1_to_W_I_2  MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]); \
    BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2);
#define FEEDBACK2_to_W_I_2  MOVE_128(YMM_TEMP0, W_I, W_I, 0x08); \
    MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]); BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2);
#define FEEDBACK3_to_W_I_2  MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]); \
    BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2);

#define FEEDBACK_to_W_I_7   MOVE_128(YMM_TEMP0, W_I, W_I, 0x08);\
    MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]); BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7);

#undef voitle

#define W_I_16     ymm8
#define W_I_15     ymm9
#define W_I_7      ymm10
#define W_I_2      ymm11
#define W_I        ymm12
#define G_TEMP     ymm13
#define S_TEMP     ymm14
#define YMM_TEMP0  ymm15
#define YMM_TEMP0x xmm15
#define W_I_TEMP   ymm7
#define W_K_TEMP   ymm15
#define W_K_TEMPx  xmm15

#define YMM_REGs /* Registers are saved in Sha256Update/Final */
/* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/


#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
    __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
    __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
    __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\

#define MOVE_7_to_15(w_i_15, w_i_7)\
    __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\

#define MOVE_I_to_7(w_i_7, w_i)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
    __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
    __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\

#define MOVE_I_to_2(w_i_2, w_i)\
    __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
    __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\

#define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
    MOVE_15_to_16(w_i_16, w_i_15, w_i_7); \
    MOVE_7_to_15(w_i_15, w_i_7); \
    MOVE_I_to_7(w_i_7, w_i); \
    MOVE_I_to_2(w_i_2, w_i);\

#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    { word32 d;\
    __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[0] += d;\
    __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[1] += d;\
    __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[2] += d;\
    __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[3] += d;\
    __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[4] += d;\
    __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[5] += d;\
    __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[6] += d;\
    __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs);\
    sha256->digest[7] += d;\
}

#define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    { word32 d[8];\
    __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\
    __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\
    __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\
    __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\
    __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\
    __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\
    __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\
    __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\
    printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n",\
           d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
    __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\
    __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\
}


#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )

#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )

#define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
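
/* For reference: the scalar sigma functions that the GAMMA0/GAMMA1 macros
 * above evaluate lane-wise in a YMM register (standard SHA-256 definitions).
 * Minimal illustrative sketch only; Gamma0_ref()/Gamma1_ref() are not part of
 * wolfSSL and are never called by the AVX2 path.  rotrFixed() is the
 * rotate-right helper from wolfcrypt misc, which is included near the top of
 * this file.
 */
static word32 Gamma0_ref(word32 x)
{
    return rotrFixed(x, 7) ^ rotrFixed(x, 18) ^ (x >> 3);
}

static word32 Gamma1_ref(word32 x)
{
    return rotrFixed(x, 17) ^ rotrFixed(x, 19) ^ (x >> 10);
}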

/* Byte swap masks to ensure that the rest of the words are filled with zeros. */
static const unsigned long mBYTE_FLIP_MASK_16[] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
static const unsigned long mBYTE_FLIP_MASK_15[] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
static const unsigned long mBYTE_FLIP_MASK_7 [] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b };
static const unsigned long mBYTE_FLIP_MASK_2 [] =
    { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 };

static const unsigned long mMAPtoW_I_7[] =
    { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 };
static const unsigned long mMAP1toW_I_2[] =
    { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 };
static const unsigned long mMAP2toW_I_2[] =
    { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 };
static const unsigned long mMAP3toW_I_2[] =
    { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 };

static int Transform_AVX2(Sha256* sha256)
{
#ifdef WOLFSSL_SMALL_STACK
    word32* W_K;
    W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W_K == NULL)
        return MEMORY_E;
#else
    word32 W_K[64];
#endif

    MOVE_to_REG(W_I_16, sha256->buffer[0]);     BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]);
    MOVE_to_REG(W_I_15, sha256->buffer[1]);     BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]);
    MOVE_to_REG(W_I,    sha256->buffer[8]);     BYTE_SWAP(W_I,    mBYTE_FLIP_MASK_16[0]);
    MOVE_to_REG(W_I_7,  sha256->buffer[16-7]);  BYTE_SWAP(W_I_7,  mBYTE_FLIP_MASK_7[0]);
    MOVE_to_REG(W_I_2,  sha256->buffer[16-2]);  BYTE_SWAP(W_I_2,  mBYTE_FLIP_MASK_2[0]);

    DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);

    ADD_MEM(W_K_TEMP, W_I_16, K[0]);
    MOVE_to_MEM(W_K[0], W_K_TEMP);

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1);
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2);
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3);
    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4);
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5);
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6);
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7);

    ADD_MEM(YMM_TEMP0, W_I, K[8]);
    MOVE_to_MEM(W_K[8], YMM_TEMP0);

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
    GAMMA1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);

    MOVE_to_REG(YMM_TEMP0, K[16]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[16], YMM_TEMP0);

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
    GAMMA1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);

    MOVE_to_REG(YMM_TEMP0, K[24]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[24], YMM_TEMP0);

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
    GAMMA1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);

    MOVE_to_REG(YMM_TEMP0, K[32]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[32], YMM_TEMP0);


    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);

    MOVE_to_REG(YMM_TEMP0, K[40]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[40], YMM_TEMP0);

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);

    MOVE_to_REG(YMM_TEMP0, K[48]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[48], YMM_TEMP0);

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
    GAMMA0_1(W_I_TEMP, W_I_15);
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
    GAMMA0_2(W_I_TEMP, W_I_15);
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
    ADD(W_I_TEMP, W_I_16, W_I_TEMP); /* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
    ADD(W_I, W_I, YMM_TEMP0); /* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
    FEEDBACK1_to_W_I_2;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
    FEEDBACK_to_W_I_7;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
    FEEDBACK2_to_W_I_2;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
    FEEDBACK3_to_W_I_2;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
    GAMMA1_1(YMM_TEMP0, W_I_2);
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
    GAMMA1_2(YMM_TEMP0, W_I_2);
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
    ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);

    MOVE_to_REG(YMM_TEMP0, K[56]);
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
    ADD(YMM_TEMP0, YMM_TEMP0, W_I);
    MOVE_to_MEM(W_K[56], YMM_TEMP0);

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);

    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}

#endif   /* HAVE_INTEL_AVX2 */

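
/* Scalar form of the message-schedule expansion that the unrolled AVX2
 * blocks above compute eight words at a time, shown for reference.  Minimal
 * illustrative sketch only: ExpandW_ref() is not part of the wolfSSL API.
 * W[0..15] holds the byte-swapped input block, Kt points at the 64-entry
 * round-constant table, and W_K[i] ends up holding W[i] + K[i], matching
 * what the MOVE_to_MEM(W_K[...]) stores spill for the round macros.
 */
static void ExpandW_ref(word32* W_K, word32* W, const word32* Kt)
{
    int i;

    for (i = 0; i < 16; i++)
        W_K[i] = W[i] + Kt[i];

    for (i = 16; i < 64; i++) {
        W[i] = (rotrFixed(W[i-2], 17) ^ rotrFixed(W[i-2], 19) ^ (W[i-2] >> 10))
             + W[i-7]
             + (rotrFixed(W[i-15], 7) ^ rotrFixed(W[i-15], 18) ^ (W[i-15] >> 3))
             + W[i-16];
        W_K[i] = W[i] + Kt[i];
    }
}
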
#ifdef WOLFSSL_SHA224
static int InitSha224(Sha224* sha224)
{
    int ret = 0;

    sha224->digest[0] = 0xc1059ed8;
    sha224->digest[1] = 0x367cd507;
    sha224->digest[2] = 0x3070dd17;
    sha224->digest[3] = 0xf70e5939;
    sha224->digest[4] = 0xffc00b31;
    sha224->digest[5] = 0x68581511;
    sha224->digest[6] = 0x64f98fa7;
    sha224->digest[7] = 0xbefa4fa4;

    sha224->buffLen = 0;
    sha224->loLen   = 0;
    sha224->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    /* choose best Transform function under this runtime environment */
    set_Transform();
#endif

    return ret;
}

int wc_InitSha224_ex(Sha224* sha224, void* heap, int devId)
{
    int ret = 0;

    if (sha224 == NULL)
        return BAD_FUNC_ARG;

    sha224->heap = heap;

    ret = InitSha224(sha224);
    if (ret != 0)
        return ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
                        WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}

int wc_InitSha224(Sha224* sha224)
{
    return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
}

int wc_Sha224Update(Sha224* sha224, const byte* data, word32 len)
{
    int ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha256Update((Sha256 *)sha224, data, len);

    return ret;
}

int wc_Sha224Final(Sha224* sha224, byte* hash)
{
    int ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
                                SHA224_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha256Final((Sha256*)sha224);
    if (ret != 0)
        return ret;

#if defined(LITTLE_ENDIAN_ORDER)
    ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE);
#endif
    XMEMCPY(hash, sha224->digest, SHA224_DIGEST_SIZE);

    return InitSha224(sha224);  /* reset state */
}

void wc_Sha224Free(Sha224* sha224)
{
    if (sha224 == NULL)
        return;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
    wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* WOLFSSL_SHA224 */
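
/* Typical one-shot use of the SHA-224 API above (illustrative sketch only;
 * data/dataLen stand in for the caller's message buffer and error handling
 * is abbreviated).  The resulting digest is SHA224_DIGEST_SIZE (28) bytes,
 * i.e. the truncated output that wc_Sha224Final() copies out above:
 *
 *     Sha224 sha224;
 *     byte   hash[SHA224_DIGEST_SIZE];
 *
 *     if (wc_InitSha224(&sha224) == 0) {
 *         wc_Sha224Update(&sha224, data, dataLen);
 *         wc_Sha224Final(&sha224, hash);
 *         wc_Sha224Free(&sha224);
 *     }
 */
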
int wc_InitSha256(Sha256* sha256)
{
    return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
}

void wc_Sha256Free(Sha256* sha256)
{
    if (sha256 == NULL)
        return;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
    wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* !WOLFSSL_TI_HASH */
#endif /* HAVE_FIPS */


#ifndef WOLFSSL_TI_HASH
#ifdef WOLFSSL_SHA224
int wc_Sha224GetHash(Sha224* sha224, byte* hash)
{
    int ret;
    Sha224 tmpSha224;

    if (sha224 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha224Copy(sha224, &tmpSha224);
    if (ret == 0) {
        ret = wc_Sha224Final(&tmpSha224, hash);
    }
    return ret;
}

int wc_Sha224Copy(Sha224* src, Sha224* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(Sha224));

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
#endif /* WOLFSSL_SHA224 */

int wc_Sha256GetHash(Sha256* sha256, byte* hash)
{
    int ret;
    Sha256 tmpSha256;

    if (sha256 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha256Copy(sha256, &tmpSha256);
    if (ret == 0) {
        ret = wc_Sha256Final(&tmpSha256, hash);
    }
    return ret;
}

int wc_Sha256Copy(Sha256* src, Sha256* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(Sha256));

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
#endif /* !WOLFSSL_TI_HASH */

#endif /* NO_SHA256 */
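
/* Typical streaming use of the SHA-256 API in this module (illustrative
 * sketch only; data/dataLen and data2/data2Len stand in for the caller's
 * buffers and error handling is abbreviated).  Because wc_Sha256GetHash()
 * finalizes a copy made with wc_Sha256Copy(), an intermediate digest can be
 * read without disturbing the running state:
 *
 *     Sha256 sha256;
 *     byte   partial[SHA256_DIGEST_SIZE];
 *     byte   digest[SHA256_DIGEST_SIZE];
 *
 *     if (wc_InitSha256(&sha256) == 0) {
 *         wc_Sha256Update(&sha256, data, dataLen);
 *         wc_Sha256GetHash(&sha256, partial);    (snapshot, state unchanged)
 *         wc_Sha256Update(&sha256, data2, data2Len);
 *         wc_Sha256Final(&sha256, digest);       (final digest, state reset)
 *         wc_Sha256Free(&sha256);
 *     }
 */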