
sha256.c

/* sha256.c
 *
 * Copyright (C) 2006-2016 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


/* code submitted by raphael.huck@efixo.com */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/sha256.h>

#if !defined(NO_SHA256)
#ifdef HAVE_FIPS

int wc_InitSha256(Sha256* sha)
{
    return InitSha256_fips(sha);
}


int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
{
    return Sha256Update_fips(sha, data, len);
}


int wc_Sha256Final(Sha256* sha, byte* out)
{
    return Sha256Final_fips(sha, out);
}

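/* Illustrative usage (added note, not part of the original file): the same
 * three-call sequence works with or without FIPS, since the wrappers above
 * simply forward to the *_fips variants:
 *
 *     byte   digest[SHA256_DIGEST_SIZE];
 *     Sha256 sha;
 *
 *     if (wc_InitSha256(&sha) == 0) {
 *         wc_Sha256Update(&sha, (const byte*)"message", 7);
 *         wc_Sha256Final(&sha, digest);
 *     }
 *
 * On success digest[] holds SHA-256("message"); in the software build below,
 * wc_Sha256Final() also re-initializes the state for the next hash.
 */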

#else /* else build without fips */

#if !defined(NO_SHA256) && defined(WOLFSSL_TI_HASH)
    /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
#else

#if !defined (ALIGN32)
    #if defined (__GNUC__)
        #define ALIGN32 __attribute__ ( (aligned (32)))
    #elif defined(_MSC_VER)
        /* disable align warning, we want alignment ! */
        #pragma warning(disable: 4324)
        #define ALIGN32 __declspec (align (32))
    #else
        #define ALIGN32
    #endif
#endif

#ifdef WOLFSSL_PIC32MZ_HASH
#define wc_InitSha256   wc_InitSha256_sw
#define wc_Sha256Update wc_Sha256Update_sw
#define wc_Sha256Final  wc_Sha256Final_sw
#endif

#ifdef HAVE_FIPS
    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
    #define FIPS_NO_WRAPPERS
#endif

#if defined(USE_INTEL_SPEEDUP)
#define HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX2

#if defined(DEBUG_XMM)
#include "stdio.h"
#endif

#endif

#if defined(HAVE_INTEL_AVX2)
#define HAVE_INTEL_RORX
#endif


/*****
Intel AVX1/AVX2 Macro Control Structure

#define HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX2

#define HAVE_INTEL_RORX


int InitSha256(Sha256* sha256) {
     Save/Recover XMM, YMM
     ...
}

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
  Transform() ; Function prototype
#else
  Transform() {   }
  int Sha256Final() {
     Save/Recover XMM, YMM
     ...
  }
#endif

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
    #if defined(HAVE_INTEL_RORX)
         #define RND with the RORX instruction
    #else
        #define RND
    #endif
#endif

#if defined(HAVE_INTEL_AVX1)

   #define XMM Instructions/inline asm

   int Transform() {
       Stitched Message Sched/Round
    }

#elif defined(HAVE_INTEL_AVX2)

  #define YMM Instructions/inline asm

  int Transform() {
      More granular Stitched Message Sched/Round
  }

*/


#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

/* Each platform needs to query cpuid to see which instruction-set extensions
 * (AVX, AVX2, BMI2, ...) are supported. Also, let's setup a macro for proper
 * linkage w/o ABI conflicts
 */

#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
             "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
             "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    /* cpuid is used below with three arguments (reg, leaf, sub-leaf), so map
     * it to __cpuidex, which takes the sub-leaf in ECX */
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10   /* MULX, RORX */

#define IS_INTEL_AVX1       (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2       (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2       (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_RDRAND     (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED     (cpuid_flags&CPUID_RDSEED)

static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;

static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu=0;
    unsigned int reg[5];

    reg[4] = '\0' ;
    cpuid(reg, 0, 0);
    if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
                memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
                memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }
    if (got_intel_cpu) {
        cpuid(reg, leaf, sub);
        return((reg[num]>>bit)&0x1) ;
    }
    return 0 ;
}

static int set_cpuid_flags(void) {
    if(cpuid_check==0) {
        if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
        if(cpuid_flag(7, 0, EBX, 5)){  cpuid_flags |= CPUID_AVX2 ; }
        if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
        if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;  }
        if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;  }
        cpuid_check = 1 ;
        return 0 ;
    }
    return 1 ;
}
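
/* Added note (not in the original): the (leaf, register, bit) positions
 * queried above follow Intel's CPUID layout -- leaf 1 ECX bit 28 = AVX,
 * leaf 7 EBX bit 5 = AVX2, leaf 7 EBX bit 8 = BMI2 (RORX/MULX),
 * leaf 1 ECX bit 30 = RDRAND, leaf 7 EBX bit 18 = RDSEED.
 * A standalone feature probe can reuse cpuid_flag() directly, e.g.:
 *
 *     if (cpuid_flag(7, 0, EBX, 5))
 *         printf("AVX2 supported\n");
 */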


/* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */
static int Transform(Sha256* sha256);

#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha256 *sha256) ;
#endif
#if defined(HAVE_INTEL_AVX2)
static int Transform_AVX2(Sha256 *sha256) ;
static int Transform_AVX1_RORX(Sha256 *sha256) ;
#endif

static int (*Transform_p)(Sha256* sha256) /* = _Transform */;

#define XTRANSFORM(sha256, B)  (*Transform_p)(sha256)

static void set_Transform(void) {
     if(set_cpuid_flags())return ;

#if defined(HAVE_INTEL_AVX2)
     if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
         Transform_p = Transform_AVX1_RORX; return ;
         Transform_p = Transform_AVX2      ;
                  /* for avoiding warning,"not used" */
     }
#endif
#if defined(HAVE_INTEL_AVX1)
     Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
#endif
     Transform_p = Transform ; return ;
}
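
/* Added note (not in the original): this is one-time runtime dispatch.
 * wc_InitSha256() calls set_Transform(); the first call probes CPUID and
 * points Transform_p at the fastest usable routine (RORX variant when
 * AVX2 + BMI2 are present, then AVX1, then the portable C Transform), and
 * set_cpuid_flags() short-circuits every call after that, so each 64-byte
 * block pays only one indirect call through XTRANSFORM(). */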

#else
   #if defined(FREESCALE_MMCAU)
      #define XTRANSFORM(sha256, B) Transform(sha256, B)
   #else
      #define XTRANSFORM(sha256, B) Transform(sha256)
   #endif
#endif

/* Dummy for saving MM_REGs on behalf of Transform */
#if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
#define  SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
  "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
#elif defined(HAVE_INTEL_AVX1)
#define  SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
    "xmm11","xmm12","xmm13","xmm14","xmm15")
#else
#define  SAVE_XMM_YMM
#endif

#ifdef WOLFSSL_PIC32MZ_HASH
#define InitSha256   InitSha256_sw
#define Sha256Update Sha256Update_sw
#define Sha256Final  Sha256Final_sw
#endif

#include <wolfssl/wolfcrypt/logging.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #include <wolfcrypt/src/misc.c>
#endif

#ifdef FREESCALE_MMCAU
    #include "cau_api.h"
#endif

#ifndef WOLFSSL_HAVE_MIN
#define WOLFSSL_HAVE_MIN

    static INLINE word32 min(word32 a, word32 b)
    {
        return a > b ? b : a;
    }

#endif /* WOLFSSL_HAVE_MIN */


int wc_InitSha256(Sha256* sha256)
{
    int ret = 0;
    #ifdef FREESCALE_MMCAU
        ret = wolfSSL_CryptHwMutexLock();
        if(ret != 0) {
            return ret;
        }
        cau_sha256_initialize_output(sha256->digest);
        wolfSSL_CryptHwMutexUnLock();
    #else
        sha256->digest[0] = 0x6A09E667L;
        sha256->digest[1] = 0xBB67AE85L;
        sha256->digest[2] = 0x3C6EF372L;
        sha256->digest[3] = 0xA54FF53AL;
        sha256->digest[4] = 0x510E527FL;
        sha256->digest[5] = 0x9B05688CL;
        sha256->digest[6] = 0x1F83D9ABL;
        sha256->digest[7] = 0x5BE0CD19L;
    #endif

    sha256->buffLen = 0;
    sha256->loLen   = 0;
    sha256->hiLen   = 0;

#if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
    set_Transform() ; /* choose best Transform function under this runtime environment */
#endif

    return ret;
}


#if !defined(FREESCALE_MMCAU)
static const ALIGN32 word32 K[64] = {
    0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
    0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
    0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
    0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
    0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
    0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
    0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
    0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
    0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
    0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
    0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
    0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
    0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
};

#endif
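
/* Added note (not in the original): per FIPS 180-4 sec. 4.2.2, K[i] is the
 * first 32 bits of the fractional part of the cube root of the i-th prime.
 * A quick sanity check for K[0] (assumes <math.h> is available):
 *
 *     double c  = cbrt(2.0);                               first prime is 2
 *     word32 k0 = (word32)((c - floor(c)) * 4294967296.0);
 *     k0 == 0x428A2F98                                     matches K[0] above
 */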

#if defined(FREESCALE_MMCAU)

static int Transform(Sha256* sha256, byte* buf)
{
    int ret = wolfSSL_CryptHwMutexLock();
    if(ret == 0) {
        cau_sha256_hash_n(buf, 1, sha256->digest);
        wolfSSL_CryptHwMutexUnLock();
    }
    return ret;
}

#endif /* FREESCALE_MMCAU */

#define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z)      ((((x) | (y)) & (z)) | ((x) & (y)))
#define R(x, n)         (((x)&0xFFFFFFFFU)>>(n))

#define S(x, n)         rotrFixed(x, n)
#define Sigma0(x)       (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x)       (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x)       (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))

#define RND(a,b,c,d,e,f,g,h,i) \
     t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
     t1 = Sigma0((a)) + Maj((a), (b), (c)); \
     (d) += t0; \
     (h)  = t0 + t1;

#if !defined(FREESCALE_MMCAU)
static int Transform(Sha256* sha256)
{
    word32 S[8], t0, t1;
    int i;

#ifdef WOLFSSL_SMALL_STACK
    word32* W;

    W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word32 W[64];
#endif

    /* Copy context->state[] to working vars */
    for (i = 0; i < 8; i++)
        S[i] = sha256->digest[i];

    for (i = 0; i < 16; i++)
        W[i] = sha256->buffer[i];

    for (i = 16; i < 64; i++)
        W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];

    for (i = 0; i < 64; i += 8) {
        RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
        RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
        RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
        RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
        RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
        RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
        RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
        RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
    }

    /* Add the working vars back into digest state[] */
    for (i = 0; i < 8; i++) {
        sha256->digest[i] += S[i];
    }

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}

#endif /* #if !defined(FREESCALE_MMCAU) */
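
/* Added note (not in the original): the round loop above never shuffles the
 * eight working variables. Each RND invocation instead rotates the argument
 * order (a..h), so after eight rounds the variables are back in their
 * starting positions -- hence the loop advances by 8. */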

/* add len to the total message length, kept as two 32-bit words;
 * wraparound of loLen signals a carry into hiLen */
static INLINE void AddLength(Sha256* sha256, word32 len)
{
    word32 tmp = sha256->loLen;
    if ( (sha256->loLen += len) < tmp)
        sha256->hiLen++;                       /* carry low to high */
}

int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
{

    /* do block size increments */
    byte* local = (byte*)sha256->buffer;

    SAVE_XMM_YMM ; /* for Intel AVX */

    while (len) {
        word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
        XMEMCPY(&local[sha256->buffLen], data, add);

        sha256->buffLen += add;
        data            += add;
        len             -= add;

        if (sha256->buffLen == SHA256_BLOCK_SIZE) {
            int ret;

            #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
                #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
                if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
                #endif
                ByteReverseWords(sha256->buffer, sha256->buffer,
                                 SHA256_BLOCK_SIZE);
            #endif
            ret = XTRANSFORM(sha256, local);
            if (ret != 0)
                return ret;

            AddLength(sha256, SHA256_BLOCK_SIZE);
            sha256->buffLen = 0;
        }
    }

    return 0;
}

int wc_Sha256Final(Sha256* sha256, byte* hash)
{
    byte* local = (byte*)sha256->buffer;
    int ret;

    SAVE_XMM_YMM ; /* for Intel AVX */

    AddLength(sha256, sha256->buffLen);  /* before adding pads */

    local[sha256->buffLen++] = 0x80;     /* add 1 */

    /* pad with zeros */
    if (sha256->buffLen > SHA256_PAD_SIZE) {
        XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
        sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;

        #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
            #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
            #endif
            ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
        #endif

        ret = XTRANSFORM(sha256, local);
        if (ret != 0)
            return ret;

        sha256->buffLen = 0;
    }
    XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);

    /* put lengths in bits */
    sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
                 (sha256->hiLen << 3);
    sha256->loLen = sha256->loLen << 3;

    /* store lengths */
    #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
        #endif
            ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
    #endif
    /* ! length ordering dependent on digest endian type ! */
    XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
    XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
            sizeof(word32));

    #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        /* Kinetis requires only these bytes reversed */
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
        #endif
        ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
                         &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
                         2 * sizeof(word32));
    #endif

    ret = XTRANSFORM(sha256, local);
    if (ret != 0)
        return ret;

    #if defined(LITTLE_ENDIAN_ORDER)
        ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
    #endif
    XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);

    return wc_InitSha256(sha256);  /* reset state */
}
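
/* Illustrative known-answer test (added; not part of the original file).
 * FIPS 180-4 gives SHA-256("abc") =
 *   ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c b410ff61 f20015ad
 * Guarded out so it does not affect the build. */
#if 0
static int Sha256SelfTest(void)
{
    static const byte expect[SHA256_DIGEST_SIZE] = {
        0xba,0x78,0x16,0xbf, 0x8f,0x01,0xcf,0xea,
        0x41,0x41,0x40,0xde, 0x5d,0xae,0x22,0x23,
        0xb0,0x03,0x61,0xa3, 0x96,0x17,0x7a,0x9c,
        0xb4,0x10,0xff,0x61, 0xf2,0x00,0x15,0xad
    };
    byte digest[SHA256_DIGEST_SIZE];
    Sha256 sha;

    if (wc_InitSha256(&sha) != 0)
        return -1;
    if (wc_Sha256Update(&sha, (const byte*)"abc", 3) != 0)
        return -1;
    if (wc_Sha256Final(&sha, digest) != 0)
        return -1;

    return XMEMCMP(digest, expect, SHA256_DIGEST_SIZE) == 0 ? 0 : -1;
}
#endif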




#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

#define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    { word32 d ;\
    d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\
    d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\
    d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\
    d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\
    d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\
    d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\
    d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\
    d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\
}

#define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    { word32 d ; \
    __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\
    __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\
    __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\
    __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\
    __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\
    __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\
    __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\
    __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\
}


#define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )

#define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
    _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )




#define S_0 %r15d
#define S_1 %r10d
#define S_2 %r11d
#define S_3 %r12d
#define S_4 %r13d
#define S_5 %r14d
#define S_6 %ebx
#define S_7 %r9d

#define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"

#if defined(HAVE_INTEL_RORX)
#define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("rorx  $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs);  /* edx = e>>6 */\

#define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("rorx  $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11  */\
__asm__ volatile("xorl  %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6)  */\
__asm__ volatile("rorx  $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs);   /* edx = e>>25             */\

#define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("movl  %"#f", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f   */\
__asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f ^ g  */\
__asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);  /* edx = Sigma1(e)  */\
__asm__ volatile("andl  %"#e", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = (f ^ g) & e       */\
__asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = Ch(e,f,g)         */\

#define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
/*__asm__ volatile("movl    %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
__asm__ volatile("addl  %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs);    /* h += w_k  */\
__asm__ volatile("addl  %%edx, %"#h"\n\t":::"%edx",SSE_REGs);     /* h = h + w_k + Sigma1(e) */\
__asm__ volatile("rorx  $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = a>>2   */\
__asm__ volatile("rorx  $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13  */\

#define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("rorx  $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
__asm__ volatile("xorl  %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13)  */\
__asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);  /* edx = Sigma0(a)      */\

#define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("movl  %"#b", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = b          */\
__asm__ volatile("orl   %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a | b      */\
__asm__ volatile("andl  %"#c", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = (a | b) & c*/\
__asm__ volatile("movl  %"#b", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b           */\

#define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("addl  %%esi, %"#h"\n\t":::"%esi",SSE_REGs);  /* h += Ch(e,f,g)   */\
__asm__ volatile("andl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b & a       */\
__asm__ volatile("orl   %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\

#define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("addl  "#h", "#d"\n\t");  /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
__asm__ volatile("addl  %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
__asm__ volatile("addl  %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \
__asm__ volatile("movl  %r8d, "#h"\n\t");

#endif

#define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("movl  %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
__asm__ volatile("roll  $26, %%edx\n\t":::"%edx",SSE_REGs);  /* edx = e>>6     */\
__asm__ volatile("movl  %"#e", %%edi\n\t":::"%edi",SSE_REGs);\

#define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("roll  $21, %%edi\n\t":::"%edi",SSE_REGs);         /* edi = e>>11 */\
__asm__ volatile("xorl  %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6)  */\
__asm__ volatile("movl  %"#e", %%edx\n\t":::"%edx",SSE_REGs);   /* edx = e      */\
__asm__ volatile("roll  $7, %%edx\n\t":::"%edx",SSE_REGs);      /* edx = e>>25  */\

#define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("movl  %"#f", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f       */\
__asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f ^ g   */\
__asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
__asm__ volatile("andl  %"#e", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = (f ^ g) & e  */\
__asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = Ch(e,f,g)    */\

#define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("addl  %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k  */\
__asm__ volatile("addl  %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
__asm__ volatile("movl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = a    */\
__asm__ volatile("roll  $30, %%r8d\n\t":::"%r8",SSE_REGs);    /* r8d = a>>2 */\
__asm__ volatile("movl  %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a   */\
__asm__ volatile("roll  $19, %%edi\n\t":::"%edi",SSE_REGs);    /* edi = a>>13 */\
__asm__ volatile("movl  %"#a", %%edx\n\t":::"%edx",SSE_REGs);  /* edx = a     */\

#define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("roll  $10, %%edx\n\t":::"%edx",SSE_REGs);    /* edx = a>>22 */\
__asm__ volatile("xorl  %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13)  */\
__asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a)         */\

#define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("movl  %"#b", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = b      */\
__asm__ volatile("orl   %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a | b  */\
__asm__ volatile("andl  %"#c", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = (a | b) & c */\
__asm__ volatile("movl  %"#b", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b       */\

#define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("addl  %%esi, %"#h"\n\t":::"%esi",SSE_REGs);  /* h += Ch(e,f,g)        */\
__asm__ volatile("andl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b & a            */\
__asm__ volatile("orl   %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\

#define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
__asm__ volatile("addl  "#h", "#d"\n\t");  /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
__asm__ volatile("addl  %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
                 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
__asm__ volatile("addl  %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
                 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)     */\
__asm__ volatile("movl  %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
                 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \

#define RND_X(a,b,c,d,e,f,g,h,i) \
       RND_STEP_1(a,b,c,d,e,f,g,h,i); \
       RND_STEP_2(a,b,c,d,e,f,g,h,i); \
       RND_STEP_3(a,b,c,d,e,f,g,h,i); \
       RND_STEP_4(a,b,c,d,e,f,g,h,i); \
       RND_STEP_5(a,b,c,d,e,f,g,h,i); \
       RND_STEP_6(a,b,c,d,e,f,g,h,i); \
       RND_STEP_7(a,b,c,d,e,f,g,h,i); \
       RND_STEP_8(a,b,c,d,e,f,g,h,i);

#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);


#define RND_1_3(a,b,c,d,e,f,g,h,i) {\
       RND_STEP_1(a,b,c,d,e,f,g,h,i); \
       RND_STEP_2(a,b,c,d,e,f,g,h,i); \
       RND_STEP_3(a,b,c,d,e,f,g,h,i); \
}

#define RND_4_6(a,b,c,d,e,f,g,h,i) {\
       RND_STEP_4(a,b,c,d,e,f,g,h,i); \
       RND_STEP_5(a,b,c,d,e,f,g,h,i); \
       RND_STEP_6(a,b,c,d,e,f,g,h,i); \
}

#define RND_7_8(a,b,c,d,e,f,g,h,i) {\
       RND_STEP_7(a,b,c,d,e,f,g,h,i); \
       RND_STEP_8(a,b,c,d,e,f,g,h,i); \
}

#define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
#define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
#define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
#define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
#define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
#define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
#define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
#define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);


#define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
#define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
#define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
#define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
#define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
#define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
#define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
#define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);

#define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
#define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
#define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
#define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
#define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
#define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
#define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
#define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);

#define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
#define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
#define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
#define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
#define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
#define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
#define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
#define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);

#define FOR(cnt, init, max, inc, loop)  \
    __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
#define END(cnt, init, max, inc, loop)  \
    __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::) ;

#endif  /* defined(HAVE_INTEL_AVX1) ||  defined(HAVE_INTEL_AVX2) */

#if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */

#define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPADDD(op1,op2,op3)       __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPSRLD(op1,op2,op3)       __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPSRLQ(op1,op2,op3)       __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPSLLD(op1,op2,op3)       __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPOR(op1,op2,op3)         __asm__ volatile("vpor   %"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPXOR(op1,op2,op3)        __asm__ volatile("vpxor  %"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPSHUFD(op1,op2,op3)      __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
#define VPSHUFB(op1,op2,op3)      __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)

#define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
     a,b,c,d,e,f,g,h,_i)\
            RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
    VPALIGNR (XTMP0, X3, X2, 4) ;\
            RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
    VPADDD   (XTMP0, XTMP0, X0) ;\
            RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
    VPALIGNR (XTMP1, X1, X0, 4) ;   /* XTMP1 = W[-15] */\
            RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
    VPSRLD   (XTMP2, XTMP1, 7) ;\
            RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
    VPSLLD   (XTMP3, XTMP1, 25) ; /* VPSLLD   (XTMP3, XTMP1, (32-7)) */\
            RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
    VPOR     (XTMP3, XTMP3, XTMP2)  ;  /* XTMP1 = W[-15] MY_ROR 7 */\
            RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
    VPSRLD   (XTMP2, XTMP1,18) ;\
            RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
\
            RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
    VPSRLD   (XTMP4, XTMP1, 3)      ;  /* XTMP4 = W[-15] >> 3 */\
            RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
    VPSLLD   (XTMP1, XTMP1, 14) ; /* VPSLLD   (XTMP1, XTMP1, (32-18)) */\
            RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP3, XTMP3, XTMP1)  ;\
            RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP3, XTMP3, XTMP2)  ;  /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
            RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP1, XTMP3, XTMP4)  ;  /* XTMP1 = s0 */\
            RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
    VPSHUFD(XTMP2, X3, 0b11111010)  ;  /* XTMP2 = W[-2] {BBAA}*/\
            RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
    VPADDD   (XTMP0, XTMP0, XTMP1)  ;  /* XTMP0 = W[-16] + W[-7] + s0 */\
            RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
\
            RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLD   (XTMP4, XTMP2, 10) ;      /* XTMP4 = W[-2] >> 10 {BBAA} */\
            RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLQ   (XTMP3, XTMP2, 19) ;      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
            RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLQ   (XTMP2, XTMP2, 17) ;      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
            RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
    VPXOR    (XTMP2, XTMP2, XTMP3) ;\
            RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
    VPXOR    (XTMP4, XTMP4, XTMP2) ;   /* XTMP4 = s1 {xBxA} */\
            RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
    VPSHUFB  (XTMP4, XTMP4, SHUF_00BA)  ;  /* XTMP4 = s1 {00BA} */\
            RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
    VPADDD   (XTMP0, XTMP0, XTMP4)  ;  /* XTMP0 = {..., ..., W[1], W[0]} */\
            RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
\
            RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
    VPSHUFD  (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
            RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLD   (XTMP5, XTMP2, 10);       /* XTMP5 = W[-2] >> 10 {DDCC} */\
            RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLQ   (XTMP3, XTMP2, 19);       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
            RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLQ   (XTMP2, XTMP2, 17) ;      /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
            RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
    VPXOR    (XTMP2, XTMP2, XTMP3) ;\
            RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
    VPXOR    (XTMP5, XTMP5, XTMP2) ;   /* XTMP5 = s1 {xDxC} */\
            RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
    VPSHUFB  (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
            RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
    VPADDD   (X0, XTMP5, XTMP0) ;      /* X0 = {W[3], W[2], W[1], W[0]} */\

#if defined(HAVE_INTEL_RORX)

#define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
                          XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
            RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
    VPALIGNR (XTMP0, X3, X2, 4) ;\
            RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
    VPADDD   (XTMP0, XTMP0, X0) ;\
            RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
    VPALIGNR (XTMP1, X1, X0, 4) ;   /* XTMP1 = W[-15] */\
            RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
    VPSRLD   (XTMP2, XTMP1, 7) ;\
            RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
    VPSLLD   (XTMP3, XTMP1, 25) ; /* VPSLLD   (XTMP3, XTMP1, (32-7)) */\
            RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
    VPOR     (XTMP3, XTMP3, XTMP2)  ;  /* XTMP1 = W[-15] MY_ROR 7 */\
            RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
    VPSRLD   (XTMP2, XTMP1,18) ;\
            RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
\
            RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
    VPSRLD   (XTMP4, XTMP1, 3)      ;  /* XTMP4 = W[-15] >> 3 */\
            RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
    VPSLLD   (XTMP1, XTMP1, 14) ; /* VPSLLD   (XTMP1, XTMP1, (32-18)) */\
            RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP3, XTMP3, XTMP1)  ;\
            RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP3, XTMP3, XTMP2)  ;  /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
            RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
    VPXOR    (XTMP1, XTMP3, XTMP4)  ;  /* XTMP1 = s0 */\
            RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
    VPSHUFD(XTMP2, X3, 0b11111010)  ;  /* XTMP2 = W[-2] {BBAA}*/\
            RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
    VPADDD   (XTMP0, XTMP0, XTMP1)  ;  /* XTMP0 = W[-16] + W[-7] + s0 */\
            RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
\
            RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLD   (XTMP4, XTMP2, 10) ;      /* XTMP4 = W[-2] >> 10 {BBAA} */\
            RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLQ   (XTMP3, XTMP2, 19) ;      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
            RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
    VPSRLQ   (XTMP2, XTMP2, 17) ;      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
            RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
    VPXOR    (XTMP2, XTMP2, XTMP3) ;\
            RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
    VPXOR    (XTMP4, XTMP4, XTMP2) ;   /* XTMP4 = s1 {xBxA} */\
            RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
    VPSHUFB  (XTMP4, XTMP4, SHUF_00BA)  ;  /* XTMP4 = s1 {00BA} */\
            RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
    VPADDD   (XTMP0, XTMP0, XTMP4)  ;  /* XTMP0 = {..., ..., W[1], W[0]} */\
            RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
\
            RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
    VPSHUFD  (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
            RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLD   (XTMP5, XTMP2, 10);       /* XTMP5 = W[-2] >> 10 {DDCC} */\
            RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLQ   (XTMP3, XTMP2, 19);       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
            RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
    VPSRLQ   (XTMP2, XTMP2, 17) ;      /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
            RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
    VPXOR    (XTMP2, XTMP2, XTMP3) ;\
            RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
    VPXOR    (XTMP5, XTMP5, XTMP2) ;   /* XTMP5 = s1 {xDxC} */\
            RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
    VPSHUFB  (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
            RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
    VPADDD   (X0, XTMP5, XTMP0) ;      /* X0 = {W[3], W[2], W[1], W[0]} */\

#endif


#define W_K_from_buff\
         __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
                          "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
                          :: "m"(sha256->buffer[0]):"%xmm4") ;\
         __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
                          "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
                          ::"m"(sha256->buffer[4]):"%xmm5") ;\
         __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
                          "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
                          ::"m"(sha256->buffer[8]):"%xmm6") ;\
         __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
                          "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
                          ::"m"(sha256->buffer[12]):"%xmm7") ;\

#define _SET_W_K_XFER(reg, i)\
    __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
    __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ;

#define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)

static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
static const ALIGN32 word64 mBYTE_FLIP_MASK[] =  { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;


#define _Init_Masks(mask1, mask2, mask3)\
__asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\
__asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\
__asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ;

#define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
    _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
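
/* Added note (not in the original): mBYTE_FLIP_MASK is a vpshufb control that
 * reverses the bytes within each 32-bit lane, performing the big-endian word
 * loads SHA-256 requires; mSHUF_00BA/mSHUF_DC00 compact the sigma1 results
 * from the {xBxA}/{xDxC} lane layouts into {00BA}/{DC00} (0xFF control bytes
 * zero the destination byte). */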

#define X0 %xmm4
#define X1 %xmm5
#define X2 %xmm6
#define X3 %xmm7
#define X_ X0

#define XTMP0 %xmm0
#define XTMP1 %xmm1
#define XTMP2 %xmm2
#define XTMP3 %xmm3
#define XTMP4 %xmm8
#define XTMP5 %xmm9
#define XFER  %xmm10

#define SHUF_00BA   %xmm11 /* shuffle xBxA -> 00BA */
#define SHUF_DC00   %xmm12 /* shuffle xDxC -> DC00 */
#define BYTE_FLIP_MASK  %xmm13

#define XMM_REGs   /* Registers are saved in Sha256Update/Final */
                   /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */

static int Transform_AVX1(Sha256* sha256)
{

    word32 W_K[64] ;  /* temp for W+K */

    #if defined(DEBUG_XMM)
    int i, j ;
    word32 xmm[29][4*15] ;
    #endif

    Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
    W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */

    DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;

    SET_W_K_XFER(X0, 0) ;
    MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
    SET_W_K_XFER(X1, 4) ;
    MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
    SET_W_K_XFER(X2, 8) ;
    MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
    SET_W_K_XFER(X3, 12) ;
    MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
    SET_W_K_XFER(X0, 16) ;
    MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
    SET_W_K_XFER(X1, 20) ;
    MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
    SET_W_K_XFER(X2, 24) ;
    MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
    SET_W_K_XFER(X3, 28) ;
    MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
    SET_W_K_XFER(X0, 32) ;
    MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
    SET_W_K_XFER(X1, 36) ;
    MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
    SET_W_K_XFER(X2, 40) ;
    MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
    SET_W_K_XFER(X3, 44) ;
    MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
            SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;

    SET_W_K_XFER(X0, 48) ;
    SET_W_K_XFER(X1, 52) ;
    SET_W_K_XFER(X2, 56) ;
    SET_W_K_XFER(X3, 60) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;

    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;

    #if defined(DEBUG_XMM)
    for(i=0; i<29; i++) {
        for(j=0; j<4*14; j+=4)
            printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
                   xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
        printf("\n") ;
    }

    for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
    #endif

    return 0;
}

#if defined(HAVE_INTEL_RORX)
static int Transform_AVX1_RORX(Sha256* sha256)
{

    word32 W_K[64] ;  /* temp for W+K */

    #if defined(DEBUG_XMM)
    int i, j ;
    word32 xmm[29][4*15] ;
    #endif

    Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
    W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */

    DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
    SET_W_K_XFER(X0, 0) ;
    MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
    SET_W_K_XFER(X1, 4) ;
    MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
    SET_W_K_XFER(X2, 8) ;
    MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
    SET_W_K_XFER(X3, 12) ;
    MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
    SET_W_K_XFER(X0, 16) ;
    MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
    SET_W_K_XFER(X1, 20) ;
    MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
    SET_W_K_XFER(X2, 24) ;
    MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
    SET_W_K_XFER(X3, 28) ;
    MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
    SET_W_K_XFER(X0, 32) ;
    MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
    SET_W_K_XFER(X1, 36) ;
    MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
    SET_W_K_XFER(X2, 40) ;
    MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
    SET_W_K_XFER(X3, 44) ;
    MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
            XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;

    SET_W_K_XFER(X0, 48) ;
    SET_W_K_XFER(X1, 52) ;
    SET_W_K_XFER(X2, 56) ;
    SET_W_K_XFER(X3, 60) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;

    RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
    RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
    RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
    RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;

    RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
    RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
    RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
    RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;

    RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;

    #if defined(DEBUG_XMM)
    for(i=0; i<29; i++) {
        for(j=0; j<4*14; j+=4)
            printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
                    xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
        printf("\n") ;
    }

    for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
    #endif

    return 0;
}
#endif  /* HAVE_INTEL_RORX */

#endif  /* HAVE_INTEL_AVX1 */

01175 
01176 #if defined(HAVE_INTEL_AVX2)
01177 
01178 #define _MOVE_to_REG(ymm, mem)       __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ;
01179 #define _MOVE_to_MEM(mem, ymm)       __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ;
01180 #define _BYTE_SWAP(ymm, map)              __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
01181                                                        :: "m"(map):YMM_REGs) ;
01182 #define _MOVE_128(ymm0, ymm1, ymm2, map)   __asm__ volatile("vperm2i128  $"#map", %%"\
01183                                   #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
01184 #define _MOVE_BYTE(ymm0, ymm1, map)  __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
01185                                   #ymm0"\n\t":: "m"(map):YMM_REGs) ;
01186 #define _S_TEMP(dest, src, bits, temp)    __asm__ volatile("vpsrld  $"#bits", %%"\
01187          #src", %%"#dest"\n\tvpslld  $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
01188          #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
01189 #define _AVX2_R(dest, src, bits)          __asm__ volatile("vpsrld  $"#bits", %%"\
01190                                   #src", %%"#dest" ":::YMM_REGs) ;
01191 #define _XOR(dest, src1, src2)       __asm__ volatile("vpxor   %%"#src1", %%"\
01192          #src2", %%"#dest" ":::YMM_REGs) ;
01193 #define _OR(dest, src1, src2)       __asm__ volatile("vpor    %%"#src1", %%"\
01194          #src2", %%"#dest" ":::YMM_REGs) ;
01195 #define _ADD(dest, src1, src2)       __asm__ volatile("vpaddd   %%"#src1", %%"\
01196          #src2", %%"#dest" ":::YMM_REGs) ;
01197 #define _ADD_MEM(dest, src1, mem)    __asm__ volatile("vpaddd   %0, %%"#src1", %%"\
01198          #dest" "::"m"(mem):YMM_REGs) ;
01199 #define _BLEND(map, dest, src1, src2)    __asm__ volatile("vpblendd    $"#map", %%"\
01200          #src1",   %%"#src2", %%"#dest" ":::YMM_REGs) ;
01201 
01202 #define    _EXTRACT_XMM_0(xmm, mem)  __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01203 #define    _EXTRACT_XMM_1(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01204 #define    _EXTRACT_XMM_2(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01205 #define    _EXTRACT_XMM_3(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01206 #define    _EXTRACT_XMM_4(ymm, xmm, mem)\
01207       __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\
01208       __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01209 #define    _EXTRACT_XMM_5(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01210 #define    _EXTRACT_XMM_6(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01211 #define    _EXTRACT_XMM_7(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
01212 
01213 #define    _SWAP_YMM_HL(ymm)   __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;
01214 #define     SWAP_YMM_HL(ymm)   _SWAP_YMM_HL(ymm) 
01215 
01216 #define MOVE_to_REG(ymm, mem)      _MOVE_to_REG(ymm, mem)
01217 #define MOVE_to_MEM(mem, ymm)      _MOVE_to_MEM(mem, ymm)
01218 #define BYTE_SWAP(ymm, map)        _BYTE_SWAP(ymm, map)
01219 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map) 
01220 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
01221 #define XOR(dest, src1, src2)      _XOR(dest, src1, src2)
01222 #define OR(dest, src1, src2)       _OR(dest, src1, src2)
01223 #define ADD(dest, src1, src2)      _ADD(dest, src1, src2)
01224 #define ADD_MEM(dest, src1, mem)  _ADD_MEM(dest, src1, mem)
01225 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
01226 
01227 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp); 
01228 #define AVX2_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
01229 #define AVX2_R(dest, src, bits)      _AVX2_R(dest, src, bits)
01230 
01231 #define GAMMA0(dest, src)      AVX2_S(dest, src, 7);  AVX2_S(G_TEMP, src, 18); \
01232     XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3);  XOR(dest, G_TEMP, dest) ;
01233 #define GAMMA0_1(dest, src)    AVX2_S(dest, src, 7);  AVX2_S(G_TEMP, src, 18); 
01234 #define GAMMA0_2(dest, src)    XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3);  \
01235     XOR(dest, G_TEMP, dest) ;
01236 
01237 #define GAMMA1(dest, src)      AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
01238     XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ;
01239 #define GAMMA1_1(dest, src)    AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); 
01240 #define GAMMA1_2(dest, src)    XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \
01241     XOR(dest, G_TEMP, dest) ;
01242 
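/*
 * Editor's sketch: scalar C equivalents of the GAMMA0/GAMMA1 macros above,
 * for checking the vector code against FIPS 180-4 (sigma0/sigma1).  The
 * _1/_2 split in the macros only exists so the two halves can be
 * interleaved with round code.  Guarded by a hypothetical
 * SHA256_EXPOSITION_SKETCHES macro (not a wolfSSL option), so it never
 * builds by default.
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static word32 rotr32(word32 x, int n) { return (x >> n) | (x << (32 - n)); }
/* GAMMA0 per 32-bit lane: sigma0(x) = ROTR7(x) ^ ROTR18(x) ^ (x >> 3) */
static word32 sigma0(word32 x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
/* GAMMA1 per 32-bit lane: sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ (x >> 10) */
static word32 sigma1(word32 x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }
#endif /* SHA256_EXPOSITION_SKETCHES */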
01243 #define    FEEDBACK1_to_W_I_2    MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \
01244     BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ;
01245 #define    FEEDBACK2_to_W_I_2    MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;  \
01246     MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ; 
01247 #define    FEEDBACK3_to_W_I_2    MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \
01248     BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ; 
01249 
01250 #define    FEEDBACK_to_W_I_7     MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\
01251     MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ;
01252 
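/*
 * Editor's note: the FEEDBACK macros exist because eight schedule words are
 * expanded per pass, while W[i-2] (and, for the last lane, W[i-7]) of the
 * later lanes fall inside the very batch being computed, so finished lanes
 * must be spliced back into W_I_2 and W_I_7.  A scalar sketch of the same
 * dependency (illustrative only; uses the sigma helpers sketched above):
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static void expand8(word32 W[64], int i)   /* i = 16, 24, 32, ... */
{
    int j;
    for (j = i; j < i + 8; j++)            /* for j >= i+2, W[j-2] was just */
        W[j] = sigma1(W[j-2]) + W[j-7]     /* produced in this batch: the   */
             + sigma0(W[j-15]) + W[j-16];  /* "feedback" the SIMD code adds */
}
#endif /* SHA256_EXPOSITION_SKETCHES */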
01253 #undef volatile
01254 
01255 #define W_I_16  ymm8
01256 #define W_I_15  ymm9
01257 #define W_I_7  ymm10
01258 #define W_I_2  ymm11
01259 #define W_I    ymm12
01260 #define G_TEMP     ymm13
01261 #define S_TEMP     ymm14
01262 #define YMM_TEMP0  ymm15
01263 #define YMM_TEMP0x xmm15
01264 #define W_I_TEMP   ymm7
01265 #define W_K_TEMP   ymm15
01266 #define W_K_TEMPx  xmm15
01267 
01268 #define YMM_REGs /* Registers are saved in Sha256Update/Final */
01269  /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
01270 
01271 
01272 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
01273     __asm__ volatile("vperm2i128  $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
01274     __asm__ volatile("vpblendd    $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\
01275     __asm__ volatile("vperm2i128 $0x01,  %%"#w_i_7",  %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
01276     __asm__ volatile("vpblendd    $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
01277     __asm__ volatile("vpshufd    $0x93,  %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
01278 
01279 #define MOVE_7_to_15(w_i_15, w_i_7)\
01280     __asm__ volatile("vmovdqu                 %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs) ;\
01281 
01282 #define MOVE_I_to_7(w_i_7, w_i)\
01283     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i",   %%"#w_i",   %%"#w_i_7" ":::YMM_REGs) ;\
01284     __asm__ volatile("vpblendd    $0x01,       %%"#w_i_7",   %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
01285     __asm__ volatile("vpshufd    $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
01286 
01287 #define MOVE_I_to_2(w_i_2, w_i)\
01288     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
01289     __asm__ volatile("vpshufd    $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
01290 
01291 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
01292     MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \
01293     MOVE_7_to_15(w_i_15, w_i_7) ; \
01294     MOVE_I_to_7(w_i_7, w_i) ; \
01295     MOVE_I_to_2(w_i_2, w_i) ;\
01296 
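/*
 * Editor's note: ROTATE_W renames the YMM working set so the eight words
 * just produced become the history for the next batch.  A scalar analogue
 * over a flat window (names illustrative):
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static void rotate_window(word32 win[16], const word32 newW[8])
{
    int j;
    for (j = 0; j < 8; j++) win[j] = win[j + 8];  /* slide history down */
    for (j = 0; j < 8; j++) win[j + 8] = newW[j]; /* append new words   */
}
#endif /* SHA256_EXPOSITION_SKETCHES */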
01297 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01298     { word32 d ;\
01299     __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\
01300     sha256->digest[0] += d;\
01301     __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\
01302     sha256->digest[1] += d;\
01303     __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\
01304     sha256->digest[2] += d;\
01305     __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\
01306     sha256->digest[3] += d;\
01307     __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\
01308     sha256->digest[4] += d;\
01309     __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\
01310     sha256->digest[5] += d;\
01311     __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\
01312     sha256->digest[6] += d;\
01313     __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\
01314     sha256->digest[7] += d;\
01315 }
01316 
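/*
 * Editor's note: RegToDigest is the Davies-Meyer feedforward, read back
 * from the eight general-purpose working registers.  Scalar view
 * (illustrative):
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static void reg_to_digest(Sha256* sha256, const word32 s[8])
{
    int i;
    for (i = 0; i < 8; i++)
        sha256->digest[i] += s[i];   /* H[i] += working variable i */
}
#endif /* SHA256_EXPOSITION_SKETCHES */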
01317 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01318   { word32 d[8] ;\
01319     __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\
01320     __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\
01321     __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\
01322     __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\
01323     __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\
01324     __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\
01325     __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\
01326     __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\
01327         printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
01328     __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\
01329     __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\
01330     __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\
01331     __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\
01332     __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\
01333     __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\
01334     __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\
01335     __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\
01336 }
01337 
01338 
01339 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01340     _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01341 
01342 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01343     _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01344 
01345 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01346     _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01347 
01348         
01349     /* Byte-swap masks; 0x80 entries make vpshufb fill the unused words with zeros. */
01350     static const unsigned long mBYTE_FLIP_MASK_16[] =  
01351         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
01352     static const unsigned long mBYTE_FLIP_MASK_15[] =  
01353         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
01354     static const unsigned long mBYTE_FLIP_MASK_7 [] =  
01355         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b } ;
01356     static const unsigned long mBYTE_FLIP_MASK_2 [] =  
01357         { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 } ;
01358 
01359     static const unsigned long mMAPtoW_I_7[] =  
01360         { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 } ;
01361     static const unsigned long mMAP1toW_I_2[] = 
01362         { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 } ;
01363     static const unsigned long mMAP2toW_I_2[] = 
01364         { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 } ;
01365     static const unsigned long mMAP3toW_I_2[] = 
01366         { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 } ;
01367  
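/*
 * Editor's note: in a vpshufb control byte, a set high bit (0x80) writes
 * zero instead of selecting a source byte; the masks above use that both to
 * byte-swap the live words to big-endian and to clear the unused lanes.
 * Per-byte semantics of one 16-byte lane (illustrative):
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static void pshufb_lane(byte dst[16], const byte src[16], const byte ctl[16])
{
    int i;
    for (i = 0; i < 16; i++)
        dst[i] = (byte)((ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0f]);
}
#endif /* SHA256_EXPOSITION_SKETCHES */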
01368 static int Transform_AVX2(Sha256* sha256)
01369 {
01370 
01371     #ifdef WOLFSSL_SMALL_STACK
01372         word32* W_K;
01373         W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01374         if (W_K == NULL)
01375             return MEMORY_E;
01376     #else
01377         word32 W_K[64]  ;
01378     #endif
01379 
01380     MOVE_to_REG(W_I_16, sha256->buffer[0]);     BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
01381     MOVE_to_REG(W_I_15, sha256->buffer[1]);     BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ;
01382     MOVE_to_REG(W_I,    sha256->buffer[8]) ;    BYTE_SWAP(W_I,    mBYTE_FLIP_MASK_16[0]) ;
01383     MOVE_to_REG(W_I_7,  sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7,  mBYTE_FLIP_MASK_7[0])  ;
01384     MOVE_to_REG(W_I_2,  sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2,  mBYTE_FLIP_MASK_2[0])  ;
01385 
01386     DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
01387 
01388     ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;
01389     MOVE_to_MEM(W_K[0], W_K_TEMP) ; 
01390 
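    /* Editor's note: W_K[i] caches W[i] + K[i], so each scalar round slice
     * below needs only one memory operand while the YMM unit runs ahead on
     * the next eight schedule words. */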
01391     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
01392     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ;
01393     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ;
01394     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ;  
01395     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ;
01396     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ;
01397     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ;
01398     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ;
01399 
01400     ADD_MEM(YMM_TEMP0, W_I, K[8]) ;
01401     MOVE_to_MEM(W_K[8], YMM_TEMP0) ; 
01402 
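        /* Editor's note: from here on each scalar round is split into three
         * slices (RND_x_0 / RND_x_1 / RND_x_2) and interleaved with the
         * vector message expansion, so the integer and SIMD pipelines
         * overlap; read each indented pair of lines as one logical step. */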
01403         /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01404                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
01405         GAMMA0_1(W_I_TEMP, W_I_15) ;
01406                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
01407         GAMMA0_2(W_I_TEMP, W_I_15) ;
01408                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
01409         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01410                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
01411         ADD(W_I, W_I_7, W_I_TEMP);
01412                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
01413         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01414                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
01415         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01416                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
01417         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
01418                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
01419         FEEDBACK1_to_W_I_2 ;
01420                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
01421         FEEDBACK_to_W_I_7 ; 
01422                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
01423         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01424                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
01425         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01426                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
01427         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01428                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
01429         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
01430                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
01431         FEEDBACK2_to_W_I_2 ;
01432                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
01433         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01434                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
01435         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01436                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
01437         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
01438                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
01439         FEEDBACK3_to_W_I_2 ;
01440                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
01441         GAMMA1(YMM_TEMP0, W_I_2) ;
01442                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
01443                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
01444         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
01445                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
01446 
01447         MOVE_to_REG(YMM_TEMP0, K[16]) ;    
01448                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
01449         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01450                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
01451         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01452         MOVE_to_MEM(W_K[16], YMM_TEMP0) ;
01453 
01454         /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01455                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
01456         GAMMA0_1(W_I_TEMP, W_I_15) ;
01457                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
01458         GAMMA0_2(W_I_TEMP, W_I_15) ;
01459                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
01460         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01461                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
01462         ADD(W_I, W_I_7, W_I_TEMP);
01463                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
01464         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01465                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
01466         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01467                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
01468         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[24..25] are completed */
01469                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
01470         FEEDBACK1_to_W_I_2 ;
01471                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
01472         FEEDBACK_to_W_I_7 ; 
01473                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
01474         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01475                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
01476         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01477                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
01478         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01479                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
01480         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[24..27] are completed */
01481                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
01482         FEEDBACK2_to_W_I_2 ;
01483                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
01484         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01485                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
01486         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01487                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
01488         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..29] are completed */
01489                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
01490         FEEDBACK3_to_W_I_2 ;
01491                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
01492         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01493                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
01494         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01495                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
01496         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[24..31] are completed */
01497                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
01498 
01499         MOVE_to_REG(YMM_TEMP0, K[24]) ;    
01500                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
01501         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01502                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
01503         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01504         MOVE_to_MEM(W_K[24], YMM_TEMP0) ;
01505 
01506                 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01507                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
01508         GAMMA0_1(W_I_TEMP, W_I_15) ;
01509                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
01510         GAMMA0_2(W_I_TEMP, W_I_15) ;
01511                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
01512         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01513                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
01514         ADD(W_I, W_I_7, W_I_TEMP);
01515                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
01516         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01517                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
01518         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01519                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
01520         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[32..33] are completed */
01521                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
01522         FEEDBACK1_to_W_I_2 ;
01523                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
01524         FEEDBACK_to_W_I_7 ; 
01525                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
01526         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01527                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
01528         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01529                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
01530         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01531                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
01532         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[32..35] are completed */
01533                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
01534         FEEDBACK2_to_W_I_2 ;
01535                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
01536         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01537                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
01538         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01539                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
01540         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..37] are completed */
01541                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
01542         FEEDBACK3_to_W_I_2 ;
01543                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
01544         GAMMA1(YMM_TEMP0, W_I_2) ;
01545                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
01546                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
01547         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[32..39] are completed */
01548                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
01549 
01550         MOVE_to_REG(YMM_TEMP0, K[32]) ;    
01551                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
01552         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01553                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
01554         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01555         MOVE_to_MEM(W_K[32], YMM_TEMP0) ;
01556 
01557         
01558                 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01559                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
01560         GAMMA0_1(W_I_TEMP, W_I_15) ;
01561                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
01562         GAMMA0_2(W_I_TEMP, W_I_15) ;
01563                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
01564         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01565                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
01566         ADD(W_I, W_I_7, W_I_TEMP);
01567                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
01568         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01569                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
01570         GAMMA1_2(YMM_TEMP0, W_I_2) ; 
01571                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
01572         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[40..41] are completed */
01573                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
01574         FEEDBACK1_to_W_I_2 ;
01575                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
01576         FEEDBACK_to_W_I_7 ; 
01577                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
01578         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01579                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
01580         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01581                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
01582         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01583                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
01584         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[40..43] are completed */
01585                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
01586         FEEDBACK2_to_W_I_2 ;
01587                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
01588         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01589                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
01590         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01591                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
01592         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..45] are completed */
01593                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
01594         FEEDBACK3_to_W_I_2 ;
01595                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
01596         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01597                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
01598         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01599                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
01600         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[40..47] are completed */
01601                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
01602 
01603         MOVE_to_REG(YMM_TEMP0, K[40]) ;    
01604                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
01605         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01606                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
01607         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01608         MOVE_to_MEM(W_K[40], YMM_TEMP0) ;
01609 
01610                 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01611                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
01612         GAMMA0_1(W_I_TEMP, W_I_15) ;
01613                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
01614         GAMMA0_2(W_I_TEMP, W_I_15) ;
01615                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
01616         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01617                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
01618         ADD(W_I, W_I_7, W_I_TEMP);
01619                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
01620         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01621                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
01622         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01623                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
01624         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[48..49] are completed */
01625                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
01626         FEEDBACK1_to_W_I_2 ;
01627                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
01628         FEEDBACK_to_W_I_7 ; 
01629                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
01630         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01631                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
01632         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01633                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
01634         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01635                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
01636         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[48..51] are completed */
01637                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
01638         FEEDBACK2_to_W_I_2 ;
01639                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
01640         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01641                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
01642         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01643                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
01644         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..53] are completed */
01645                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
01646         FEEDBACK3_to_W_I_2 ;
01647                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
01648         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01649                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
01650         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01651                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
01652         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[48..55] are completed */
01653                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
01654 
01655         MOVE_to_REG(YMM_TEMP0, K[48]) ;    
01656                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
01657         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01658                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
01659         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01660         MOVE_to_MEM(W_K[48], YMM_TEMP0) ;
01661         
01662                 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01663                 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
01664         GAMMA0_1(W_I_TEMP, W_I_15) ;
01665                 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
01666         GAMMA0_2(W_I_TEMP, W_I_15) ;
01667                 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
01668         ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* keep the partial sum; W_I_7 is not yet complete */
01669                 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
01670         ADD(W_I, W_I_7, W_I_TEMP);
01671                 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
01672         GAMMA1_1(YMM_TEMP0, W_I_2) ; 
01673                 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
01674         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01675                 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
01676         ADD(W_I, W_I, YMM_TEMP0) ;/* now W[56..57] are completed */
01677                 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
01678         FEEDBACK1_to_W_I_2 ;
01679                 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
01680         FEEDBACK_to_W_I_7 ; 
01681                 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
01682         ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01683                 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
01684         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01685                 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
01686         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01687                 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
01688         ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[56..59] are completed */
01689                 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
01690         FEEDBACK2_to_W_I_2 ;
01691                 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
01692         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01693                 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
01694         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01695                 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
01696         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..61] are completed */
01697                 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
01698         FEEDBACK3_to_W_I_2 ;
01699                 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
01700         GAMMA1_1(YMM_TEMP0, W_I_2) ;
01701                 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
01702         GAMMA1_2(YMM_TEMP0, W_I_2) ;
01703                 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
01704         ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[56..63] are completed */
01705                 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
01706 
01707         MOVE_to_REG(YMM_TEMP0, K[56]) ;    
01708                 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
01709         ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
01710                 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
01711         ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
01712         MOVE_to_MEM(W_K[56], YMM_TEMP0) ;        
01713         
01714         RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
01715         RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
01716         RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
01717         RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
01718 
01719         RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
01720         RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
01721         RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
01722         RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
01723 
01724     RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;  
01725 
01726     #ifdef WOLFSSL_SMALL_STACK
01727         XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01728     #endif
01729     
01730     return 0;
01731 }
01732 
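/*
 * Editor's sketch: an equivalent scalar expansion one could use to
 * cross-check the W_K[] values produced above (illustrative; sigma0/sigma1
 * as sketched earlier, M[] the block's 16 big-endian words).  Note the code
 * above stores W[i] + K[i] into W_K[i], not W[i] itself.
 */
#ifdef SHA256_EXPOSITION_SKETCHES
static void schedule_ref(word32 W[64], const word32 M[16])
{
    int i;
    for (i = 0; i < 16; i++)
        W[i] = M[i];
    for (i = 16; i < 64; i++)
        W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
}
#endif /* SHA256_EXPOSITION_SKETCHES */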
01733 #endif   /* HAVE_INTEL_AVX2 */
01734 
01735 #endif   /* HAVE_FIPS */
01736 
01737 #endif   /* WOLFSSL_TI_HASH */
01738 
01739 #endif /* NO_SHA256 */
01740 
01741