sha256.c
/* sha256.c
 *
 * Copyright (C) 2006-2016 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


/* code submitted by raphael.huck@efixo.com */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/sha256.h>

#if !defined(NO_SHA256)
#ifdef HAVE_FIPS

int wc_InitSha256(Sha256* sha)
{
    return InitSha256_fips(sha);
}


int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
{
    return Sha256Update_fips(sha, data, len);
}


int wc_Sha256Final(Sha256* sha, byte* out)
{
    return Sha256Final_fips(sha, out);
}


#else /* else build without fips */

#if !defined(NO_SHA256) && defined(WOLFSSL_TI_HASH)
    /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
#else

#if !defined(ALIGN32)
    #if defined(__GNUC__)
        #define ALIGN32 __attribute__ ((aligned(32)))
    #elif defined(_MSC_VER)
        /* disable align warning, we want alignment ! */
        #pragma warning(disable: 4324)
        #define ALIGN32 __declspec(align(32))
    #else
        #define ALIGN32
    #endif
#endif

#ifdef WOLFSSL_PIC32MZ_HASH
#define wc_InitSha256   wc_InitSha256_sw
#define wc_Sha256Update wc_Sha256Update_sw
#define wc_Sha256Final  wc_Sha256Final_sw
#endif

#ifdef HAVE_FIPS
    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
    #define FIPS_NO_WRAPPERS
#endif

#if defined(USE_INTEL_SPEEDUP)
#define HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX2

#if defined(DEBUG_XMM)
#include "stdio.h"
#endif

#endif

#if defined(HAVE_INTEL_AVX2)
#define HAVE_INTEL_RORX
#endif


/*****
Intel AVX1/AVX2 Macro Control Structure

#define HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX2

#define HAVE_INTEL_RORX


int InitSha256(Sha256* sha256) {
     Save/Recover XMM, YMM
     ...
}

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
  Transform() ; Function prototype
#else
  Transform() { }
  int Sha256Final() {
     Save/Recover XMM, YMM
     ...
  }
#endif

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
    #if defined(HAVE_INTEL_RORX)
        #define RND with rorx instruction
    #else
        #define RND
    #endif
#endif

#if defined(HAVE_INTEL_AVX1)

#define XMM Instructions/inline asm

int Transform() {
    Stitched Message Sched/Round
}

#elif defined(HAVE_INTEL_AVX2)

#define YMM Instructions/inline asm

int Transform() {
    More granular Stitched Message Sched/Round
}

*/


#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

/* Each platform needs to query info type 1 from cpuid to see if the needed
 * extensions are supported. Also, let's setup a macro for proper linkage
 * w/o ABI conflicts
 */

#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
        __asm__ __volatile__ ("cpuid":\
            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
            "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    #define cpuid(a,b) __cpuid((int*)a,b)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */

#define IS_INTEL_AVX1   (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2   (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2   (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)

static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;

static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0' ;
    cpuid(reg, 0, 0);
    if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
       memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
       memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return((reg[num]>>bit)&0x1) ;
    }
    return 0 ;
}

static int set_cpuid_flags(void) {
    if(cpuid_check==0) {
        if(cpuid_flag(1, 0, ECX, 28)) { cpuid_flags |= CPUID_AVX1 ; }
        if(cpuid_flag(7, 0, EBX, 5))  { cpuid_flags |= CPUID_AVX2 ; }
        if(cpuid_flag(7, 0, EBX, 8))  { cpuid_flags |= CPUID_BMI2 ; }
        if(cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND ; }
        if(cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED ; }
        cpuid_check = 1 ;
        return 0 ;
    }
    return 1 ;
}


/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */
static int Transform(Sha256* sha256);

#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha256 *sha256) ;
#endif
#if defined(HAVE_INTEL_AVX2)
static int Transform_AVX2(Sha256 *sha256) ;
static int Transform_AVX1_RORX(Sha256 *sha256) ;
#endif

static int (*Transform_p)(Sha256* sha256) /* = _Transform */;

#define XTRANSFORM(sha256, B) (*Transform_p)(sha256)

static void set_Transform(void) {
    if(set_cpuid_flags()) return ;

#if defined(HAVE_INTEL_AVX2)
    if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
        Transform_p = Transform_AVX1_RORX; return ;
        Transform_p = Transform_AVX2 ;
                      /* for avoiding warning,"not used" */
    }
#endif
#if defined(HAVE_INTEL_AVX1)
    Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
#endif
    Transform_p = Transform ; return ;
}

#else
    #if defined(FREESCALE_MMCAU)
        #define XTRANSFORM(sha256, B) Transform(sha256, B)
    #else
        #define XTRANSFORM(sha256, B) Transform(sha256)
    #endif
#endif

/* Dummy for saving MM_REGs on behalf of Transform */
#if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
    "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
    "%ymm12","%ymm13","%ymm14","%ymm15")
#elif defined(HAVE_INTEL_AVX1)
#define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
    "xmm11","xmm12","xmm13","xmm14","xmm15")
#else
#define SAVE_XMM_YMM
#endif

#ifdef WOLFSSL_PIC32MZ_HASH
#define InitSha256   InitSha256_sw
#define Sha256Update Sha256Update_sw
#define Sha256Final  Sha256Final_sw
#endif

#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #include <wolfcrypt/src/misc.c>
#endif

#ifdef FREESCALE_MMCAU
    #include "cau_api.h"
#endif

#ifndef WOLFSSL_HAVE_MIN
#define WOLFSSL_HAVE_MIN

    static INLINE word32 min(word32 a, word32 b)
    {
        return a > b ? b : a;
    }

#endif /* WOLFSSL_HAVE_MIN */


int wc_InitSha256(Sha256* sha256)
{
    int ret = 0;
#ifdef FREESCALE_MMCAU
    ret = wolfSSL_CryptHwMutexLock();
    if(ret != 0) {
        return ret;
    }
    cau_sha256_initialize_output(sha256->digest);
    wolfSSL_CryptHwMutexUnLock();
#else
    sha256->digest[0] = 0x6A09E667L;
    sha256->digest[1] = 0xBB67AE85L;
    sha256->digest[2] = 0x3C6EF372L;
    sha256->digest[3] = 0xA54FF53AL;
    sha256->digest[4] = 0x510E527FL;
    sha256->digest[5] = 0x9B05688CL;
    sha256->digest[6] = 0x1F83D9ABL;
    sha256->digest[7] = 0x5BE0CD19L;
#endif

    sha256->buffLen = 0;
    sha256->loLen   = 0;
    sha256->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
    set_Transform() ; /* choose best Transform function under this runtime environment */
#endif

    return ret;
}


#if !defined(FREESCALE_MMCAU)
static const ALIGN32 word32 K[64] = {
    0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
    0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
    0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
    0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
    0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
    0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
    0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
    0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
    0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
    0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
    0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L,
0x4ED8AA4AL, 0x5B9CCA4FL, 00349 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L, 00350 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L 00351 }; 00352 00353 #endif 00354 00355 #if defined(FREESCALE_MMCAU) 00356 00357 static int Transform(Sha256* sha256, byte* buf) 00358 { 00359 int ret = wolfSSL_CryptHwMutexLock(); 00360 if(ret == 0) { 00361 cau_sha256_hash_n(buf, 1, sha256->digest); 00362 wolfSSL_CryptHwMutexUnLock(); 00363 } 00364 return ret; 00365 } 00366 00367 #endif /* FREESCALE_MMCAU */ 00368 00369 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) 00370 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y))) 00371 #define R(x, n) (((x)&0xFFFFFFFFU)>>(n)) 00372 00373 #define S(x, n) rotrFixed(x, n) 00374 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22)) 00375 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25)) 00376 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3)) 00377 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10)) 00378 00379 #define RND(a,b,c,d,e,f,g,h,i) \ 00380 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \ 00381 t1 = Sigma0((a)) + Maj((a), (b), (c)); \ 00382 (d) += t0; \ 00383 (h) = t0 + t1; 00384 00385 #if !defined(FREESCALE_MMCAU) 00386 static int Transform(Sha256* sha256) 00387 { 00388 word32 S[8], t0, t1; 00389 int i; 00390 00391 #ifdef WOLFSSL_SMALL_STACK 00392 word32* W; 00393 00394 W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00395 if (W == NULL) 00396 return MEMORY_E; 00397 #else 00398 word32 W[64]; 00399 #endif 00400 00401 /* Copy context->state[] to working vars */ 00402 for (i = 0; i < 8; i++) 00403 S[i] = sha256->digest[i]; 00404 00405 for (i = 0; i < 16; i++) 00406 W[i] = sha256->buffer[i]; 00407 00408 for (i = 16; i < 64; i++) 00409 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16]; 00410 00411 for (i = 0; i < 64; i += 8) { 00412 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0); 00413 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1); 00414 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2); 00415 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3); 00416 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4); 00417 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5); 00418 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6); 00419 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7); 00420 } 00421 00422 /* Add the working vars back into digest state[] */ 00423 for (i = 0; i < 8; i++) { 00424 sha256->digest[i] += S[i]; 00425 } 00426 00427 #ifdef WOLFSSL_SMALL_STACK 00428 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER); 00429 #endif 00430 00431 return 0; 00432 } 00433 00434 #endif /* #if !defined(FREESCALE_MMCAU) */ 00435 00436 static INLINE void AddLength(Sha256* sha256, word32 len) 00437 { 00438 word32 tmp = sha256->loLen; 00439 if ( (sha256->loLen += len) < tmp) 00440 sha256->hiLen++; /* carry low to high */ 00441 } 00442 00443 int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len) 00444 { 00445 00446 /* do block size increments */ 00447 byte* local = (byte*)sha256->buffer; 00448 00449 SAVE_XMM_YMM ; /* for Intel AVX */ 00450 00451 while (len) { 00452 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen); 00453 XMEMCPY(&local[sha256->buffLen], data, add); 00454 00455 sha256->buffLen += add; 00456 data += add; 00457 len -= add; 00458 00459 if (sha256->buffLen == SHA256_BLOCK_SIZE) { 00460 int ret; 00461 00462 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) 00463 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00464 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00465 #endif 00466 
ByteReverseWords(sha256->buffer, sha256->buffer, 00467 SHA256_BLOCK_SIZE); 00468 #endif 00469 ret = XTRANSFORM(sha256, local); 00470 if (ret != 0) 00471 return ret; 00472 00473 AddLength(sha256, SHA256_BLOCK_SIZE); 00474 sha256->buffLen = 0; 00475 } 00476 } 00477 00478 return 0; 00479 } 00480 00481 int wc_Sha256Final(Sha256* sha256, byte* hash) 00482 { 00483 byte* local = (byte*)sha256->buffer; 00484 int ret; 00485 00486 SAVE_XMM_YMM ; /* for Intel AVX */ 00487 00488 AddLength(sha256, sha256->buffLen); /* before adding pads */ 00489 00490 local[sha256->buffLen++] = 0x80; /* add 1 */ 00491 00492 /* pad with zeros */ 00493 if (sha256->buffLen > SHA256_PAD_SIZE) { 00494 XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen); 00495 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen; 00496 00497 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) 00498 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00499 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00500 #endif 00501 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); 00502 #endif 00503 00504 ret = XTRANSFORM(sha256, local); 00505 if (ret != 0) 00506 return ret; 00507 00508 sha256->buffLen = 0; 00509 } 00510 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen); 00511 00512 /* put lengths in bits */ 00513 sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) + 00514 (sha256->hiLen << 3); 00515 sha256->loLen = sha256->loLen << 3; 00516 00517 /* store lengths */ 00518 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU) 00519 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00520 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 00521 #endif 00522 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE); 00523 #endif 00524 /* ! length ordering dependent on digest endian type ! 
*/ 00525 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32)); 00526 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen, 00527 sizeof(word32)); 00528 00529 #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00530 /* Kinetis requires only these bytes reversed */ 00531 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00532 if(IS_INTEL_AVX1 || IS_INTEL_AVX2) 00533 #endif 00534 ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], 00535 &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)], 00536 2 * sizeof(word32)); 00537 #endif 00538 00539 ret = XTRANSFORM(sha256, local); 00540 if (ret != 0) 00541 return ret; 00542 00543 #if defined(LITTLE_ENDIAN_ORDER) 00544 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE); 00545 #endif 00546 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE); 00547 00548 return wc_InitSha256(sha256); /* reset state */ 00549 } 00550 00551 00552 00553 00554 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00555 00556 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00557 { word32 d ;\ 00558 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\ 00559 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\ 00560 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\ 00561 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\ 00562 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\ 00563 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\ 00564 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\ 00565 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\ 00566 } 00567 00568 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00569 { word32 d ; \ 00570 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\ 00571 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\ 00572 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\ 00573 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\ 00574 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\ 00575 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\ 00576 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\ 00577 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\ 00578 } 00579 00580 00581 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00582 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 00583 00584 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 00585 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 00586 00587 00588 00589 00590 #define S_0 %r15d 00591 #define S_1 %r10d 00592 #define S_2 %r11d 00593 #define S_3 %r12d 00594 #define S_4 %r13d 00595 #define S_5 %r14d 00596 #define S_6 %ebx 00597 #define S_7 %r9d 00598 00599 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15" 00600 00601 #if defined(HAVE_INTEL_RORX) 00602 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\ 00603 __asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ 00604 00605 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\ 00606 __asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ 
00607 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ 00608 __asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ 00609 00610 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\ 00611 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ 00612 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ 00613 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ 00614 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ 00615 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ 00616 00617 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\ 00618 /*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\ 00619 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ 00620 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ 00621 __asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ 00622 __asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\ 00623 00624 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\ 00625 __asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ 00626 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\ 00627 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\ 00628 00629 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\ 00630 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ 00631 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ 00632 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\ 00633 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ 00634 00635 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\ 00636 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ 00637 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ 00638 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ 00639 00640 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\ 00641 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ 00642 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ 00643 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \ 00644 __asm__ volatile("movl %r8d, "#h"\n\t"); 00645 00646 #endif 00647 00648 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\ 00649 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\ 00650 __asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\ 00651 __asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\ 00652 00653 #define RND_STEP_2(a,b,c,d,e,f,g,h,i)\ 00654 __asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\ 00655 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\ 00656 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\ 00657 __asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\ 00658 00659 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\ 00660 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\ 00661 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\ 00662 __asm__ volatile("xorl %%edi, 
%%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\ 00663 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\ 00664 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\ 00665 00666 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\ 00667 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\ 00668 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\ 00669 __asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\ 00670 __asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\ 00671 __asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\ 00672 __asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\ 00673 __asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\ 00674 00675 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\ 00676 __asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\ 00677 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\ 00678 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\ 00679 00680 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\ 00681 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\ 00682 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\ 00683 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\ 00684 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\ 00685 00686 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\ 00687 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\ 00688 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\ 00689 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\ 00690 00691 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\ 00692 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\ 00693 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \ 00694 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\ 00695 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\ 00696 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\ 00697 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \ 00698 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \ 00699 00700 #define RND_X(a,b,c,d,e,f,g,h,i) \ 00701 RND_STEP_1(a,b,c,d,e,f,g,h,i); \ 00702 RND_STEP_2(a,b,c,d,e,f,g,h,i); \ 00703 RND_STEP_3(a,b,c,d,e,f,g,h,i); \ 00704 RND_STEP_4(a,b,c,d,e,f,g,h,i); \ 00705 RND_STEP_5(a,b,c,d,e,f,g,h,i); \ 00706 RND_STEP_6(a,b,c,d,e,f,g,h,i); \ 00707 RND_STEP_7(a,b,c,d,e,f,g,h,i); \ 00708 RND_STEP_8(a,b,c,d,e,f,g,h,i); 00709 00710 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00711 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00712 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00713 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00714 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00715 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00716 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 
00717 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00718 00719 00720 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\ 00721 RND_STEP_1(a,b,c,d,e,f,g,h,i); \ 00722 RND_STEP_2(a,b,c,d,e,f,g,h,i); \ 00723 RND_STEP_3(a,b,c,d,e,f,g,h,i); \ 00724 } 00725 00726 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\ 00727 RND_STEP_4(a,b,c,d,e,f,g,h,i); \ 00728 RND_STEP_5(a,b,c,d,e,f,g,h,i); \ 00729 RND_STEP_6(a,b,c,d,e,f,g,h,i); \ 00730 } 00731 00732 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\ 00733 RND_STEP_7(a,b,c,d,e,f,g,h,i); \ 00734 RND_STEP_8(a,b,c,d,e,f,g,h,i); \ 00735 } 00736 00737 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00738 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00739 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00740 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00741 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00742 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00743 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00744 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00745 00746 00747 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00748 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00749 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00750 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00751 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00752 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00753 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00754 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00755 00756 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00757 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00758 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00759 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00760 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00761 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00762 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00763 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00764 00765 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i); 00766 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i); 00767 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i); 00768 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i); 00769 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) 
RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i); 00770 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i); 00771 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i); 00772 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i); 00773 00774 #define FOR(cnt, init, max, inc, loop) \ 00775 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):) 00776 #define END(cnt, init, max, inc, loop) \ 00777 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ; 00778 00779 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */ 00780 00781 #if defined(HAVE_INTEL_AVX1) /* inline Assember for Intel AVX1 instructions */ 00782 00783 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs) 00784 #define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs) 00785 #define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs) 00786 #define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs) 00787 #define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs) 00788 #define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs) 00789 #define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs) 00790 #define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs) 00791 #define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs) 00792 00793 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\ 00794 a,b,c,d,e,f,g,h,_i)\ 00795 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\ 00796 VPALIGNR (XTMP0, X3, X2, 4) ;\ 00797 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\ 00798 VPADDD (XTMP0, XTMP0, X0) ;\ 00799 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\ 00800 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ 00801 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\ 00802 VPSRLD (XTMP2, XTMP1, 7) ;\ 00803 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\ 00804 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 00805 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\ 00806 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ 00807 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\ 00808 VPSRLD (XTMP2, XTMP1,18) ;\ 00809 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\ 00810 \ 00811 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\ 00812 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ 00813 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\ 00814 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 00815 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\ 00816 VPXOR (XTMP3, XTMP3, XTMP1) ;\ 00817 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\ 00818 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 00819 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\ 00820 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ 00821 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\ 00822 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ 00823 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\ 00824 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ 00825 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\ 00826 \ 00827 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\ 00828 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ 00829 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\ 00830 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 00831 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\ 00832 VPSRLQ (XTMP2, XTMP2, 17) ; 
/* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 00833 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\ 00834 VPXOR (XTMP2, XTMP2, XTMP3) ;\ 00835 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\ 00836 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\ 00837 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\ 00838 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ 00839 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\ 00840 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ 00841 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\ 00842 \ 00843 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\ 00844 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ 00845 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\ 00846 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ 00847 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\ 00848 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 00849 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\ 00850 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 00851 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\ 00852 VPXOR (XTMP2, XTMP2, XTMP3) ;\ 00853 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\ 00854 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ 00855 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\ 00856 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ 00857 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\ 00858 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ 00859 00860 #if defined(HAVE_INTEL_RORX) 00861 00862 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \ 00863 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\ 00864 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\ 00865 VPALIGNR (XTMP0, X3, X2, 4) ;\ 00866 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\ 00867 VPADDD (XTMP0, XTMP0, X0) ;\ 00868 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\ 00869 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\ 00870 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\ 00871 VPSRLD (XTMP2, XTMP1, 7) ;\ 00872 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\ 00873 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\ 00874 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\ 00875 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\ 00876 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\ 00877 VPSRLD (XTMP2, XTMP1,18) ;\ 00878 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\ 00879 \ 00880 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\ 00881 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\ 00882 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\ 00883 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\ 00884 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\ 00885 VPXOR (XTMP3, XTMP3, XTMP1) ;\ 00886 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\ 00887 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\ 00888 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\ 00889 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\ 00890 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\ 00891 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\ 00892 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\ 00893 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\ 00894 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\ 00895 \ 00896 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\ 00897 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\ 00898 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\ 00899 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\ 00900 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\ 00901 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\ 00902 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\ 00903 VPXOR (XTMP2, XTMP2, XTMP3) ;\ 00904 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\ 00905 VPXOR (XTMP4, XTMP4, 
XTMP2) ; /* XTMP4 = s1 {xBxA} */\ 00906 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\ 00907 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\ 00908 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\ 00909 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\ 00910 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\ 00911 \ 00912 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\ 00913 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\ 00914 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\ 00915 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\ 00916 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\ 00917 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\ 00918 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\ 00919 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\ 00920 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\ 00921 VPXOR (XTMP2, XTMP2, XTMP3) ;\ 00922 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\ 00923 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\ 00924 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\ 00925 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\ 00926 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\ 00927 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\ 00928 00929 #endif 00930 00931 00932 #define W_K_from_buff\ 00933 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\ 00934 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\ 00935 :: "m"(sha256->buffer[0]):"%xmm4") ;\ 00936 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\ 00937 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\ 00938 ::"m"(sha256->buffer[4]):"%xmm5") ;\ 00939 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\ 00940 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\ 00941 ::"m"(sha256->buffer[8]):"%xmm6") ;\ 00942 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\ 00943 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\ 00944 ::"m"(sha256->buffer[12]):"%xmm7") ;\ 00945 00946 #define _SET_W_K_XFER(reg, i)\ 00947 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\ 00948 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ; 00949 00950 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i) 00951 00952 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */ 00953 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */ 00954 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ; 00955 00956 00957 #define _Init_Masks(mask1, mask2, mask3)\ 00958 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\ 00959 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\ 00960 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ; 00961 00962 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\ 00963 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) 00964 00965 #define X0 %xmm4 00966 #define X1 %xmm5 00967 #define X2 %xmm6 00968 #define X3 %xmm7 00969 #define X_ X0 00970 00971 #define XTMP0 %xmm0 00972 #define XTMP1 %xmm1 00973 #define XTMP2 %xmm2 00974 #define XTMP3 %xmm3 00975 #define XTMP4 %xmm8 00976 #define XTMP5 %xmm9 00977 #define XFER %xmm10 00978 00979 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */ 00980 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */ 00981 #define BYTE_FLIP_MASK %xmm13 00982 00983 #define XMM_REGs /* Registers are saved in Sha256Update/Finel */ 00984 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */ 00985 00986 static int Transform_AVX1(Sha256* sha256) 00987 { 00988 00989 word32 W_K[64] ; /* temp for W+K */ 
00990 00991 #if defined(DEBUG_XMM) 00992 int i, j ; 00993 word32 xmm[29][4*15] ; 00994 #endif 00995 00996 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; 00997 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ 00998 00999 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; 01000 01001 SET_W_K_XFER(X0, 0) ; 01002 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01003 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; 01004 SET_W_K_XFER(X1, 4) ; 01005 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01006 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; 01007 SET_W_K_XFER(X2, 8) ; 01008 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01009 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; 01010 SET_W_K_XFER(X3, 12) ; 01011 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01012 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; 01013 SET_W_K_XFER(X0, 16) ; 01014 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01015 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; 01016 SET_W_K_XFER(X1, 20) ; 01017 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01018 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; 01019 SET_W_K_XFER(X2, 24) ; 01020 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01021 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; 01022 SET_W_K_XFER(X3, 28) ; 01023 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01024 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; 01025 SET_W_K_XFER(X0, 32) ; 01026 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01027 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; 01028 SET_W_K_XFER(X1, 36) ; 01029 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01030 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; 01031 SET_W_K_XFER(X2, 40) ; 01032 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01033 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; 01034 SET_W_K_XFER(X3, 44) ; 01035 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, 01036 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; 01037 01038 SET_W_K_XFER(X0, 48) ; 01039 SET_W_K_XFER(X1, 52) ; 01040 SET_W_K_XFER(X2, 56) ; 01041 SET_W_K_XFER(X3, 60) ; 01042 01043 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; 01044 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; 01045 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; 01046 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; 01047 01048 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; 01049 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; 01050 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; 01051 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; 01052 01053 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; 01054 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; 01055 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; 01056 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; 01057 01058 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; 01059 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; 01060 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; 01061 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; 01062 01063 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; 01064 01065 #if defined(DEBUG_XMM) 01066 for(i=0; i<29; i++) { 01067 for(j=0; j<4*14; j+=4) 01068 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", 
j/4, i, 01069 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; 01070 printf("\n") ; 01071 } 01072 01073 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; 01074 #endif 01075 01076 return 0; 01077 } 01078 01079 #if defined(HAVE_INTEL_RORX) 01080 static int Transform_AVX1_RORX(Sha256* sha256) 01081 { 01082 01083 word32 W_K[64] ; /* temp for W+K */ 01084 01085 #if defined(DEBUG_XMM) 01086 int i, j ; 01087 word32 xmm[29][4*15] ; 01088 #endif 01089 01090 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ; 01091 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */ 01092 01093 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; 01094 SET_W_K_XFER(X0, 0) ; 01095 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01096 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ; 01097 SET_W_K_XFER(X1, 4) ; 01098 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01099 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ; 01100 SET_W_K_XFER(X2, 8) ; 01101 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01102 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ; 01103 SET_W_K_XFER(X3, 12) ; 01104 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01105 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ; 01106 SET_W_K_XFER(X0, 16) ; 01107 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01108 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ; 01109 SET_W_K_XFER(X1, 20) ; 01110 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01111 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ; 01112 SET_W_K_XFER(X2, 24) ; 01113 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01114 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ; 01115 SET_W_K_XFER(X3, 28) ; 01116 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01117 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ; 01118 SET_W_K_XFER(X0, 32) ; 01119 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01120 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ; 01121 SET_W_K_XFER(X1, 36) ; 01122 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01123 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ; 01124 SET_W_K_XFER(X2, 40) ; 01125 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01126 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ; 01127 SET_W_K_XFER(X3, 44) ; 01128 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, 01129 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ; 01130 01131 SET_W_K_XFER(X0, 48) ; 01132 SET_W_K_XFER(X1, 52) ; 01133 SET_W_K_XFER(X2, 56) ; 01134 SET_W_K_XFER(X3, 60) ; 01135 01136 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ; 01137 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ; 01138 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ; 01139 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ; 01140 01141 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ; 01142 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ; 01143 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ; 01144 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ; 01145 01146 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ; 01147 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ; 01148 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ; 01149 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ; 
01150 01151 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ; 01152 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ; 01153 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ; 01154 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ; 01155 01156 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ; 01157 01158 #if defined(DEBUG_XMM) 01159 for(i=0; i<29; i++) { 01160 for(j=0; j<4*14; j+=4) 01161 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i, 01162 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ; 01163 printf("\n") ; 01164 } 01165 01166 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ; 01167 #endif 01168 01169 return 0; 01170 } 01171 #endif /* HAVE_INTEL_RORX */ 01172 01173 #endif /* HAVE_INTEL_AVX1 */ 01174 01175 01176 #if defined(HAVE_INTEL_AVX2) 01177 01178 #define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ; 01179 #define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ; 01180 #define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\ 01181 :: "m"(map):YMM_REGs) ; 01182 #define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\ 01183 #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ; 01184 #define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\ 01185 #ymm0"\n\t":: "m"(map):YMM_REGs) ; 01186 #define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\ 01187 #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\ 01188 #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ; 01189 #define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\ 01190 #src", %%"#dest" ":::YMM_REGs) ; 01191 #define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\ 01192 #src2", %%"#dest" ":::YMM_REGs) ; 01193 #define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\ 01194 #src2", %%"#dest" ":::YMM_REGs) ; 01195 #define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\ 01196 #src2", %%"#dest" ":::YMM_REGs) ; 01197 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\ 01198 #dest" "::"m"(mem):YMM_REGs) ; 01199 #define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\ 01200 #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ; 01201 01202 #define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01203 #define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01204 #define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01205 #define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01206 #define _EXTRACT_XMM_4(ymm, xmm, mem)\ 01207 __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\ 01208 __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01209 #define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01210 #define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01211 #define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ; 01212 01213 #define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ; 01214 #define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm) 01215 01216 #define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem) 01217 #define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm) 01218 #define BYTE_SWAP(ymm, 
map) _BYTE_SWAP(ymm, map) 01219 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) 01220 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map) 01221 #define XOR(dest, src1, src2) _XOR(dest, src1, src2) 01222 #define OR(dest, src1, src2) _OR(dest, src1, src2) 01223 #define ADD(dest, src1, src2) _ADD(dest, src1, src2) 01224 #define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem) 01225 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2) 01226 01227 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); 01228 #define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP) 01229 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits) 01230 01231 #define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \ 01232 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ; 01233 #define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); 01234 #define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \ 01235 XOR(dest, G_TEMP, dest) ; 01236 01237 #define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \ 01238 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ; 01239 #define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); 01240 #define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \ 01241 XOR(dest, G_TEMP, dest) ; 01242 01243 #define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \ 01244 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ; 01245 #define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \ 01246 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ; 01247 #define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \ 01248 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ; 01249 01250 #define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\ 01251 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ; 01252 01253 #undef voitle 01254 01255 #define W_I_16 ymm8 01256 #define W_I_15 ymm9 01257 #define W_I_7 ymm10 01258 #define W_I_2 ymm11 01259 #define W_I ymm12 01260 #define G_TEMP ymm13 01261 #define S_TEMP ymm14 01262 #define YMM_TEMP0 ymm15 01263 #define YMM_TEMP0x xmm15 01264 #define W_I_TEMP ymm7 01265 #define W_K_TEMP ymm15 01266 #define W_K_TEMPx xmm15 01267 01268 #define YMM_REGs /* Registers are saved in Sha256Update/Finel */ 01269 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/ 01270 01271 01272 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\ 01273 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\ 01274 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\ 01275 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ 01276 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ 01277 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\ 01278 01279 #define MOVE_7_to_15(w_i_15, w_i_7)\ 01280 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\ 01281 01282 #define MOVE_I_to_7(w_i_7, w_i)\ 01283 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ 01284 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\ 01285 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\ 01286 01287 #define 
MOVE_I_to_2(w_i_2, w_i)\ 01288 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\ 01289 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\ 01290 01291 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\ 01292 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \ 01293 MOVE_7_to_15(w_i_15, w_i_7) ; \ 01294 MOVE_I_to_7(w_i_7, w_i) ; \ 01295 MOVE_I_to_2(w_i_2, w_i) ;\ 01296 01297 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 01298 { word32 d ;\ 01299 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\ 01300 sha256->digest[0] += d;\ 01301 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\ 01302 sha256->digest[1] += d;\ 01303 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\ 01304 sha256->digest[2] += d;\ 01305 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\ 01306 sha256->digest[3] += d;\ 01307 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\ 01308 sha256->digest[4] += d;\ 01309 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\ 01310 sha256->digest[5] += d;\ 01311 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\ 01312 sha256->digest[6] += d;\ 01313 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\ 01314 sha256->digest[7] += d;\ 01315 } 01316 01317 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 01318 { word32 d[8] ;\ 01319 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\ 01320 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\ 01321 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\ 01322 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\ 01323 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\ 01324 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\ 01325 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\ 01326 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\ 01327 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\ 01328 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\ 01329 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\ 01330 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\ 01331 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\ 01332 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\ 01333 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\ 01334 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\ 01335 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\ 01336 } 01337 01338 01339 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 01340 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 01341 01342 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 01343 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 01344 01345 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\ 01346 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 ) 01347 01348 01349 /* Byte swap Masks to ensure that rest of the words are filled with zero's. 
/* Byte swap masks, arranged so that the rest of the words are filled with zeros. */
static const unsigned long mBYTE_FLIP_MASK_16[] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
static const unsigned long mBYTE_FLIP_MASK_15[] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
static const unsigned long mBYTE_FLIP_MASK_7 [] =
    { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ;
static const unsigned long mBYTE_FLIP_MASK_2 [] =
    { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ;

static const unsigned long mMAPtoW_I_7[] =
    { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ;
static const unsigned long mMAP1toW_I_2[] =
    { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ;
static const unsigned long mMAP2toW_I_2[] =
    { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ;
static const unsigned long mMAP3toW_I_2[] =
    { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ;

static int Transform_AVX2(Sha256* sha256)
{

#ifdef WOLFSSL_SMALL_STACK
    word32* W_K;
    W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W_K == NULL)
        return MEMORY_E;
#else
    word32 W_K[64] ;
#endif

    MOVE_to_REG(W_I_16, sha256->buffer[0]);    BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
    MOVE_to_REG(W_I_15, sha256->buffer[1]);    BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ;
    MOVE_to_REG(W_I,    sha256->buffer[8]) ;   BYTE_SWAP(W_I,    mBYTE_FLIP_MASK_16[0]) ;
    MOVE_to_REG(W_I_7,  sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7,  mBYTE_FLIP_MASK_7[0]) ;
    MOVE_to_REG(W_I_2,  sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2,  mBYTE_FLIP_MASK_2[0]) ;

    DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;

    ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;
    MOVE_to_MEM(W_K[0], W_K_TEMP) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ;
    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ;

    ADD_MEM(YMM_TEMP0, W_I, K[8]) ;
    MOVE_to_MEM(W_K[8], YMM_TEMP0) ;

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
    GAMMA1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;

    MOVE_to_REG(YMM_TEMP0, K[16]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[16], YMM_TEMP0) ;

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[24..25] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
    GAMMA1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[24..27] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..29] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..31] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;

    MOVE_to_REG(YMM_TEMP0, K[24]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[24], YMM_TEMP0) ;

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[32..33] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[32..35] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..37] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
    GAMMA1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..39] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;

    MOVE_to_REG(YMM_TEMP0, K[32]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[32], YMM_TEMP0) ;


    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[40..41] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[40..43] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..45] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..47] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;

    MOVE_to_REG(YMM_TEMP0, K[40]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[40], YMM_TEMP0) ;

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[48..49] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[48..51] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..53] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..55] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;

    MOVE_to_REG(YMM_TEMP0, K[48]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[48], YMM_TEMP0) ;

    /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
    GAMMA0_1(W_I_TEMP, W_I_15) ;
    RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
    GAMMA0_2(W_I_TEMP, W_I_15) ;
    RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
    ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
    RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
    ADD(W_I, W_I_7, W_I_TEMP);
    RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
    ADD(W_I, W_I, YMM_TEMP0) ;/* now W[56..57] are completed */
    RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
    FEEDBACK1_to_W_I_2 ;
    RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
    FEEDBACK_to_W_I_7 ;
    RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
    ADD(W_I_TEMP, W_I_7, W_I_TEMP);
    RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[56..59] are completed */
    RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
    FEEDBACK2_to_W_I_2 ;
    RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..61] are completed */
    RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
    FEEDBACK3_to_W_I_2 ;
    RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
    GAMMA1_1(YMM_TEMP0, W_I_2) ;
    RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
    GAMMA1_2(YMM_TEMP0, W_I_2) ;
    RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
    ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..63] are completed */
    RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;

    MOVE_to_REG(YMM_TEMP0, K[56]) ;
    RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
    ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
    RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
    ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
    MOVE_to_MEM(W_K[56], YMM_TEMP0) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;

    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}

#endif /* HAVE_INTEL_AVX2 */

#endif /* HAVE_FIPS */

#endif /* WOLFSSL_TI_HASH */

#endif /* NO_SHA256 */

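/* For reference, the AVX2 transform above is equivalent in effect to the
 * textbook one-block SHA-256 compression sketched below.  This is a minimal
 * scalar sketch only: the helper names are illustrative and not part of the
 * original file, it assumes the file-scope K[64] round-constant table used
 * above, and it assumes sha256->buffer[] already holds the sixteen
 * big-endian message words.  It is kept inside #if 0 so it has no effect on
 * the build. */
#if 0
#define ROTR_SK(x, n)   (((x) >> (n)) | ((x) << (32 - (n))))
#define CH_SK(x, y, z)  (((x) & (y)) ^ (~(x) & (z)))
#define MAJ_SK(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define SIGMA0_SK(x)    (ROTR_SK(x, 2) ^ ROTR_SK(x, 13) ^ ROTR_SK(x, 22))
#define SIGMA1_SK(x)    (ROTR_SK(x, 6) ^ ROTR_SK(x, 11) ^ ROTR_SK(x, 25))
#define GAMMA0_SK(x)    (ROTR_SK(x, 7) ^ ROTR_SK(x, 18) ^ ((x) >> 3))
#define GAMMA1_SK(x)    (ROTR_SK(x, 17) ^ ROTR_SK(x, 19) ^ ((x) >> 10))

static int Transform_Scalar_Sketch(Sha256* sha256)
{
    word32 S[8], W[64], t0, t1;
    int i;

    /* load the working variables a..h from the running digest */
    for (i = 0; i < 8; i++)
        S[i] = sha256->digest[i];

    /* message schedule: W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    for (i = 0; i < 16; i++)
        W[i] = sha256->buffer[i];
    for (i = 16; i < 64; i++)
        W[i] = GAMMA1_SK(W[i-2]) + W[i-7] + GAMMA0_SK(W[i-15]) + W[i-16];

    /* 64 compression rounds */
    for (i = 0; i < 64; i++) {
        t0 = S[7] + SIGMA1_SK(S[4]) + CH_SK(S[4], S[5], S[6]) + K[i] + W[i];
        t1 = SIGMA0_SK(S[0]) + MAJ_SK(S[0], S[1], S[2]);
        S[7] = S[6]; S[6] = S[5]; S[5] = S[4];
        S[4] = S[3] + t0;
        S[3] = S[2]; S[2] = S[1]; S[1] = S[0];
        S[0] = t0 + t1;
    }

    /* feed the working variables back into the digest, as RegToDigest does above */
    for (i = 0; i < 8; i++)
        sha256->digest[i] += S[i];

    return 0;
}
#endif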