
sha256.c

00001 /* sha256.c
00002  *
00003  * Copyright (C) 2006-2016 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 
00023 /* code submitted by raphael.huck@efixo.com */
00024 
00025 #ifdef HAVE_CONFIG_H
00026     #include <config.h>
00027 #endif
00028 
00029 #include <wolfssl/wolfcrypt/settings.h>
00030 
00031 #if !defined(NO_SHA256)
00032 
00033 #include <wolfssl/wolfcrypt/sha256.h>
00034 #include <wolfssl/wolfcrypt/error-crypt.h>
00035 
00036 /* fips wrapper calls, users can call these directly */
00037 #ifdef HAVE_FIPS
00038 
00039     int wc_InitSha256(Sha256* sha)
00040     {
00041         return InitSha256_fips(sha);
00042     }
00043     int wc_InitSha256_ex(Sha256* sha, void* heap, int devId)
00044     {
00045         (void)heap;
00046         (void)devId;
00047         return InitSha256_fips(sha);
00048     }
00049     int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
00050     {
00051         return Sha256Update_fips(sha, data, len);
00052     }
00053     int wc_Sha256Final(Sha256* sha, byte* out)
00054     {
00055         return Sha256Final_fips(sha, out);
00056     }
00057     void wc_Sha256Free(Sha256* sha)
00058     {
00059         (void)sha;
00060         /* Not supported in FIPS */
00061     }
00062 
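    /* Editor's illustrative sketch (not part of the original source): whether
     * the FIPS wrappers above or the native implementation below is built,
     * callers drive the same init/update/final sequence.  Error handling is
     * abbreviated here. */
    #if 0   /* illustration only */
    static int HashMessage_example(const byte* msg, word32 msgLen,
                                   byte digest[SHA256_DIGEST_SIZE])
    {
        Sha256 sha256;
        int    ret = wc_InitSha256(&sha256);
        if (ret == 0)
            ret = wc_Sha256Update(&sha256, msg, msgLen);
        if (ret == 0)
            ret = wc_Sha256Final(&sha256, digest);  /* also resets the state */
        return ret;
    }
    #endif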
00063 #else /* else build without fips */
00064 
00065 
00066 #if defined(WOLFSSL_TI_HASH)
00067     /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
00068 #else
00069 
00070 #include <wolfssl/wolfcrypt/logging.h>
00071 
00072 #ifdef NO_INLINE
00073     #include <wolfssl/wolfcrypt/misc.h>
00074 #else
00075     #define WOLFSSL_MISC_INCLUDED
00076     #include <wolfcrypt/src/misc.c>
00077 #endif
00078 
00079 
00080 #if defined(USE_INTEL_SPEEDUP)
00081     #define HAVE_INTEL_AVX1
00082     #define HAVE_INTEL_AVX2
00083 #endif /* USE_INTEL_SPEEDUP */
00084 
00085 #if defined(HAVE_INTEL_AVX2)
00086     #define HAVE_INTEL_RORX
00087 #endif
00088 
00089 
00090 static int InitSha256(Sha256* sha256)
00091 {
00092     int ret = 0;
00093 
00094     if (sha256 == NULL)
00095         return BAD_FUNC_ARG;
00096 
00097     sha256->digest[0] = 0x6A09E667L;
00098     sha256->digest[1] = 0xBB67AE85L;
00099     sha256->digest[2] = 0x3C6EF372L;
00100     sha256->digest[3] = 0xA54FF53AL;
00101     sha256->digest[4] = 0x510E527FL;
00102     sha256->digest[5] = 0x9B05688CL;
00103     sha256->digest[6] = 0x1F83D9ABL;
00104     sha256->digest[7] = 0x5BE0CD19L;
00105 
00106     sha256->buffLen = 0;
00107     sha256->loLen   = 0;
00108     sha256->hiLen   = 0;
00109 
00110     return ret;
00111 }
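    /* Editor's note (not part of the original source): the eight initial
     * digest words above are the first 32 bits of the fractional parts of the
     * square roots of the first eight primes (FIPS 180-4, section 5.3.3).
     * Assuming <math.h> is available, each constant can be reproduced in
     * double precision, e.g. prime 2 -> 0x6A09E667: */
    #if 0   /* illustration only */
    #include <math.h>
    static word32 Sha256InitWord_example(int prime)
    {
        double r = sqrt((double)prime);
        return (word32)((r - floor(r)) * 4294967296.0); /* frac(sqrt(p)) * 2^32 */
    }
    #endif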
00112 
00113 
00114 /* Hardware Acceleration */
00115 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00116 
00117     /* in case intel instructions aren't available, plus we need the K[] global */
00118     #define NEED_SOFT_SHA256
00119 
00120     /*****
00121     Intel AVX1/AVX2 Macro Control Structure
00122 
00123     #define HAVE_INTEL_AVX1
00124     #define HAVE_INTEL_AVX2
00125 
00126     #define HAVE_INTEL_RORX
00127 
00128 
00129     int InitSha256(Sha256* sha256) {
00130          Save/Recover XMM, YMM
00131          ...
00132     }
00133 
00134     #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00135       Transform(); Function prototype
00136     #else
00137       Transform() {   }
00138       int Sha256Final() {
00139          Save/Recover XMM, YMM
00140          ...
00141       }
00142     #endif
00143 
00144     #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00145         #if defined(HAVE_INTEL_RORX)
00146              #define RND with rorx instruction
00147         #else
00148             #define RND
00149         #endif
00150     #endif
00151 
00152     #if defined(HAVE_INTEL_AVX1)
00153 
00154        #define XMM Instructions/inline asm
00155 
00156        int Transform() {
00157            Stitched Message Sched/Round
00158         }
00159 
00160     #elif defined(HAVE_INTEL_AVX2)
00161 
00162       #define YMM Instructions/inline asm
00163 
00164       int Transform() {
00165           More granular Stitched Message Sched/Round
00166       }
00167 
00168     */
00169 
00170     /* Each platform needs to query cpuid to see which of the extensions used
00171      * below are supported. Also, set up a macro for proper linkage w/o ABI
00172      * conflicts. */
00173 
00174     #ifndef _MSC_VER
00175         #define cpuid(reg, leaf, sub)\
00176                 __asm__ __volatile__ ("cpuid":\
00177                  "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
00178                  "a" (leaf), "c"(sub));
00179 
00180         #define XASM_LINK(f) asm(f)
00181     #else
00182         #include <intrin.h>
00183         #define cpuid(a,b) __cpuid((int*)a,b)
00184 
00185         #define XASM_LINK(f)
00186     #endif /* _MSC_VER */
00187 
00188     #define EAX 0
00189     #define EBX 1
00190     #define ECX 2
00191     #define EDX 3
00192 
00193     #define CPUID_AVX1   0x1
00194     #define CPUID_AVX2   0x2
00195     #define CPUID_RDRAND 0x4
00196     #define CPUID_RDSEED 0x8
00197     #define CPUID_BMI2   0x10   /* MULX, RORX */
00198 
00199     #define IS_INTEL_AVX1       (cpuid_flags & CPUID_AVX1)
00200     #define IS_INTEL_AVX2       (cpuid_flags & CPUID_AVX2)
00201     #define IS_INTEL_BMI2       (cpuid_flags & CPUID_BMI2)
00202     #define IS_INTEL_RDRAND     (cpuid_flags & CPUID_RDRAND)
00203     #define IS_INTEL_RDSEED     (cpuid_flags & CPUID_RDSEED)
00204 
00205     static word32 cpuid_check = 0;
00206     static word32 cpuid_flags = 0;
00207 
00208     static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
00209         int got_intel_cpu=0;
00210         unsigned int reg[5];
00211 
00212         reg[4] = '\0';
00213         cpuid(reg, 0, 0);
00214         if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
00215             XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
00216             XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
00217             got_intel_cpu = 1;
00218         }
00219         if (got_intel_cpu) {
00220             cpuid(reg, leaf, sub);
00221             return ((reg[num] >> bit) & 0x1);
00222         }
00223         return 0;
00224     }
00225 
00226     static int set_cpuid_flags(void) {
00227         if (cpuid_check==0) {
00228             if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; }
00229             if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; }
00230             if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; }
00231             if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; }
00232             if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; }
00233             cpuid_check = 1;
00234             return 0;
00235         }
00236         return 1;
00237     }
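    /* Editor's illustrative sketch (not part of the original source): the
     * flags gathered above are normally consumed through the IS_INTEL_*
     * macros, but a single feature bit can also be read directly, e.g. the
     * AVX2 bit (CPUID leaf 7, sub-leaf 0, EBX bit 5), just as
     * set_cpuid_flags() does: */
    #if 0   /* illustration only */
    static int HaveAVX2_example(void)
    {
        return cpuid_flag(7, 0, EBX, 5) != 0;
    }
    #endif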
00238 
00239     /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
00240     static int Transform(Sha256* sha256);
00241     #if defined(HAVE_INTEL_AVX1)
00242         static int Transform_AVX1(Sha256 *sha256);
00243     #endif
00244     #if defined(HAVE_INTEL_AVX2)
00245         static int Transform_AVX2(Sha256 *sha256);
00246         static int Transform_AVX1_RORX(Sha256 *sha256);
00247     #endif
00248     static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
00249     #define XTRANSFORM(sha256, B)  (*Transform_p)(sha256)
00250 
00251     static void set_Transform(void) {
00252          if (set_cpuid_flags()) return;
00253 
00254     #if defined(HAVE_INTEL_AVX2)
00255          if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
00256              Transform_p = Transform_AVX1_RORX; return;
00257              Transform_p = Transform_AVX2;
00258                       /* unreachable; keeps Transform_AVX2 referenced to avoid a "not used" warning */
00259          }
00260     #endif
00261     #if defined(HAVE_INTEL_AVX1)
00262          Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return;
00263     #endif
00264          Transform_p = Transform; return;
00265     }
00266 
00267     /* Dummy for saving MM_REGs on behalf of Transform */
00268     #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
00269         #define SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
00270           "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
00271     #elif defined(HAVE_INTEL_AVX1)
00272         #define SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
00273             "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
00274             "xmm11","xmm12","xmm13","xmm14","xmm15")
00275     #endif
00276 
00277     int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
00278     {
00279         int ret = 0;
00280         if (sha256 == NULL)
00281             return BAD_FUNC_ARG;
00282 
00283         sha256->heap = heap;
00284 
00285         ret = InitSha256(sha256);
00286         if (ret != 0)
00287             return ret;
00288 
00289         /* choose best Transform function under this runtime environment */
00290         set_Transform();
00291 
00292     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
00293         ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
00294                             WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
00295     #else
00296         (void)devId;
00297     #endif /* WOLFSSL_ASYNC_CRYPT */
00298 
00299         return ret;
00300     }
00301 
00302 #elif defined(FREESCALE_LTC_SHA)
00303     int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
00304     {
00305         (void)heap;
00306         (void)devId;
00307 
00308         LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);
00309 
00310         return 0;
00311     }
00312 
00313 #elif defined(FREESCALE_MMCAU_SHA)
00314     #include "fsl_mmcau.h"
00315     #define XTRANSFORM(sha256, B) Transform(sha256, B)
00316 
00317     int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
00318     {
00319         int ret = 0;
00320 
00321         (void)heap;
00322         (void)devId;
00323 
00324         ret = wolfSSL_CryptHwMutexLock();
00325         if (ret != 0) {
00326             return ret;
00327         }
00328         MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
00329         wolfSSL_CryptHwMutexUnLock();
00330 
00331         sha256->buffLen = 0;
00332         sha256->loLen   = 0;
00333         sha256->hiLen   = 0;
00334 
00335         return ret;
00336     }
00337 
00338     static int Transform(Sha256* sha256, byte* buf)
00339     {
00340         int ret = wolfSSL_CryptHwMutexLock();
00341         if (ret == 0) {
00342             MMCAU_SHA256_HashN(buf, 1, sha256->digest);
00343             wolfSSL_CryptHwMutexUnLock();
00344         }
00345         return ret;
00346     }
00347 
00348 #elif defined(WOLFSSL_PIC32MZ_HASH)
00349     #define NEED_SOFT_SHA256
00350 
00351     #define wc_InitSha256   wc_InitSha256_sw
00352     #define wc_Sha256Update wc_Sha256Update_sw
00353     #define wc_Sha256Final  wc_Sha256Final_sw
00354 
00355     int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
00356     {
00357         if (sha256 == NULL)
00358             return BAD_FUNC_ARG;
00359 
00360         sha256->heap = heap;
00361 
00362         return InitSha256(sha256);
00363     }
00364 
00365 #else
00366     #define NEED_SOFT_SHA256
00367 
00368     #define XTRANSFORM(sha256, B) Transform(sha256)
00369 
00370     int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
00371     {
00372         int ret = 0;
00373         if (sha256 == NULL)
00374             return BAD_FUNC_ARG;
00375 
00376         sha256->heap = heap;
00377 
00378         ret = InitSha256(sha256);
00379         if (ret != 0)
00380             return ret;
00381 
00382     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
00383         ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
00384                             WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
00385     #else
00386         (void)devId;
00387     #endif /* WOLFSSL_ASYNC_CRYPT */
00388 
00389         return ret;
00390     }
00391 #endif /* End Hardware Acceleration */
00392 
00393 #ifndef SAVE_XMM_YMM
00394     #define SAVE_XMM_YMM
00395 #endif
00396 
00397 #ifdef NEED_SOFT_SHA256
00398 
00399     static const ALIGN32 word32 K[64] = {
00400         0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
00401         0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
00402         0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
00403         0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
00404         0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
00405         0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
00406         0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
00407         0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
00408         0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
00409         0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
00410         0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
00411         0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
00412         0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
00413     };
00414 
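    /* Editor's note (not part of the original source): analogously to the
     * initial digest words, the round constants above are the first 32 bits
     * of the fractional parts of the cube roots of the first 64 primes
     * (FIPS 180-4, section 4.2.2).  Assuming <math.h> is available, e.g.
     * prime 2 reproduces K[0] = 0x428A2F98: */
    #if 0   /* illustration only */
    #include <math.h>
    static word32 Sha256RoundConst_example(int prime)
    {
        double r = cbrt((double)prime);
        return (word32)((r - floor(r)) * 4294967296.0); /* frac(cbrt(p)) * 2^32 */
    }
    #endif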
00415     #define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
00416     #define Maj(x,y,z)      ((((x) | (y)) & (z)) | ((x) & (y)))
00417     #define R(x, n)         (((x) & 0xFFFFFFFFU) >> (n))
00418 
00419     #define S(x, n)         rotrFixed(x, n)
00420     #define Sigma0(x)       (S(x, 2) ^ S(x, 13) ^ S(x, 22))
00421     #define Sigma1(x)       (S(x, 6) ^ S(x, 11) ^ S(x, 25))
00422     #define Gamma0(x)       (S(x, 7) ^ S(x, 18) ^ R(x, 3))
00423     #define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))
00424 
00425     #define RND(a,b,c,d,e,f,g,h,i) \
00426          t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
00427          t1 = Sigma0((a)) + Maj((a), (b), (c)); \
00428          (d) += t0; \
00429          (h)  = t0 + t1;
00430 
00431     static int Transform(Sha256* sha256)
00432     {
00433         word32 S[8], t0, t1;
00434         int i;
00435 
00436     #ifdef WOLFSSL_SMALL_STACK
00437         word32* W;
00438 
00439         W = (word32*)XMALLOC(sizeof(word32) * SHA256_BLOCK_SIZE, NULL,
00440             DYNAMIC_TYPE_TMP_BUFFER);
00441         if (W == NULL)
00442             return MEMORY_E;
00443     #else
00444         word32 W[SHA256_BLOCK_SIZE];
00445     #endif
00446 
00447         /* Copy context->state[] to working vars */
00448         for (i = 0; i < 8; i++)
00449             S[i] = sha256->digest[i];
00450 
00451         for (i = 0; i < 16; i++)
00452             W[i] = sha256->buffer[i];
00453 
00454         for (i = 16; i < SHA256_BLOCK_SIZE; i++)
00455             W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
00456 
00457         for (i = 0; i < SHA256_BLOCK_SIZE; i += 8) {
00458             RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
00459             RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
00460             RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
00461             RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
00462             RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
00463             RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
00464             RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
00465             RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
00466         }
00467 
00468         /* Add the working vars back into digest state[] */
00469         for (i = 0; i < 8; i++) {
00470             sha256->digest[i] += S[i];
00471         }
00472 
00473     #ifdef WOLFSSL_SMALL_STACK
00474         XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
00475     #endif
00476 
00477         return 0;
00478     }
00479 #endif
00480 /* End wc_ software implementation */
00481 
00482 
00483 #ifdef XTRANSFORM
00484 
00485     static INLINE void AddLength(Sha256* sha256, word32 len)
00486     {
00487         word32 tmp = sha256->loLen;
00488         if ( (sha256->loLen += len) < tmp)
00489             sha256->hiLen++;                       /* carry low to high */
00490     }
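    /* Editor's note (not part of the original source): the total message
     * length is tracked as a 64-bit byte count split across loLen/hiLen
     * (converted to a bit count in Sha256Final below).  The comparison above
     * detects 32-bit wrap-around: e.g. with loLen at 0xFFFFFFFF, adding
     * len == 1 wraps loLen to 0, which is smaller than the saved value, so
     * the carry is propagated into hiLen. */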
00491 
00492     static INLINE int Sha256Update(Sha256* sha256, const byte* data, word32 len)
00493     {
00494         int ret = 0;
00495         byte* local;
00496 
00497         if (sha256 == NULL || (data == NULL && len > 0)) {
00498             return BAD_FUNC_ARG;
00499         }
00500 
00501     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
00502         if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
00503         #if defined(HAVE_INTEL_QA)
00504             return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
00505         #endif
00506         }
00507     #endif /* WOLFSSL_ASYNC_CRYPT */
00508 
00509         /* do block size increments */
00510         local = (byte*)sha256->buffer;
00511 
00512         /* check that internal buffLen is valid */
00513         if (sha256->buffLen >= SHA256_BLOCK_SIZE)
00514             return BUFFER_E;
00515 
00516         SAVE_XMM_YMM; /* for Intel AVX */
00517 
00518         while (len) {
00519             word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
00520             XMEMCPY(&local[sha256->buffLen], data, add);
00521 
00522             sha256->buffLen += add;
00523             data            += add;
00524             len             -= add;
00525 
00526             if (sha256->buffLen == SHA256_BLOCK_SIZE) {
00527         #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
00528             #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00529                 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
00530             #endif
00531                 {
00532                     ByteReverseWords(sha256->buffer, sha256->buffer,
00533                                                              SHA256_BLOCK_SIZE);
00534                 }
00535         #endif
00536                 ret = XTRANSFORM(sha256, local);
00537                 if (ret != 0) {
00538                     break;
00539                 }
00540 
00541                 AddLength(sha256, SHA256_BLOCK_SIZE);
00542                 sha256->buffLen = 0;
00543             }
00544         }
00545 
00546         return ret;
00547     }
00548 
00549     int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
00550     {
00551         return Sha256Update(sha256, data, len);
00552     }
00553 
00554     static INLINE int Sha256Final(Sha256* sha256)
00555     {
00556         int ret;
00557         byte* local = (byte*)sha256->buffer;
00558 
00559         SAVE_XMM_YMM; /* for Intel AVX */
00560 
00561         AddLength(sha256, sha256->buffLen);  /* before adding pads */
00562         local[sha256->buffLen++] = 0x80;     /* add 1 */
00563 
00564         /* pad with zeros */
00565         if (sha256->buffLen > SHA256_PAD_SIZE) {
00566             XMEMSET(&local[sha256->buffLen], 0,
00567                 SHA256_BLOCK_SIZE - sha256->buffLen);
00568             sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
00569 
00570     #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
00571         #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00572             if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
00573         #endif
00574             {
00575                 ByteReverseWords(sha256->buffer, sha256->buffer,
00576                     SHA256_BLOCK_SIZE);
00577             }
00578     #endif
00579 
00580             ret = XTRANSFORM(sha256, local);
00581             if (ret != 0)
00582                 return ret;
00583 
00584             sha256->buffLen = 0;
00585         }
00586         XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
00587 
00588         /* put lengths in bits */
00589         sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
00590                                                          (sha256->hiLen << 3);
00591         sha256->loLen = sha256->loLen << 3;
00592 
00593         /* store lengths */
00594     #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
00595         #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00596             if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
00597         #endif
00598             {
00599                 ByteReverseWords(sha256->buffer, sha256->buffer,
00600                     SHA256_BLOCK_SIZE);
00601             }
00602     #endif
00603         /* ! length ordering dependent on digest endian type ! */
00604         XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
00605         XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
00606                 sizeof(word32));
00607 
00608     #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
00609             defined(HAVE_INTEL_AVX2)
00610         /* Kinetis requires only these bytes reversed */
00611         #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00612             if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
00613         #endif
00614             {
00615                 ByteReverseWords(
00616                     &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)],
00617                     &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)],
00618                     2 * sizeof(word32));
00619             }
00620     #endif
00621 
00622         return XTRANSFORM(sha256, local);
00623     }
00624 
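    /* Editor's illustrative sketch (not part of the original source): the
     * padding above follows FIPS 180-4 -- a single 0x80 byte, zero fill, then
     * the 64-bit message length in bits.  The total padded length is the
     * smallest multiple of SHA256_BLOCK_SIZE (64) with room for those nine
     * trailing bytes: */
    #if 0   /* illustration only */
    static word32 Sha256PaddedLen_example(word32 msgLen)
    {
        word32 withTrailer = msgLen + 1 + 2 * sizeof(word32); /* 0x80 + length */
        return ((withTrailer + SHA256_BLOCK_SIZE - 1) / SHA256_BLOCK_SIZE)
                   * SHA256_BLOCK_SIZE;
    }
    /* e.g. msgLen 0 or 55 -> 64, msgLen 56 -> 128 */
    #endif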
00625     int wc_Sha256Final(Sha256* sha256, byte* hash)
00626     {
00627         int ret;
00628 
00629         if (sha256 == NULL || hash == NULL) {
00630             return BAD_FUNC_ARG;
00631         }
00632 
00633     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
00634         if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
00635         #if defined(HAVE_INTEL_QA)
00636             return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
00637                                             SHA256_DIGEST_SIZE);
00638         #endif
00639         }
00640     #endif /* WOLFSSL_ASYNC_CRYPT */
00641 
00642         ret = Sha256Final(sha256);
00643         if (ret != 0)
00644             return ret;
00645 
00646     #if defined(LITTLE_ENDIAN_ORDER)
00647         ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
00648     #endif
00649         XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
00650 
00651         return InitSha256(sha256);  /* reset state */
00652     }
00653 
00654 #endif /* XTRANSFORM */
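    /* Editor's illustrative sketch (not part of the original source): a quick
     * self-test of the routines above against the FIPS 180-4 "abc" vector,
     * whose digest is ba7816bf 8f01cfea 414140de 5dae2223 b00361a3 96177a9c
     * b410ff61 f20015ad: */
    #if 0   /* illustration only */
    static int Sha256SelfTest_example(void)
    {
        static const byte expected[SHA256_DIGEST_SIZE] = {
            0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,
            0x41,0x41,0x40,0xde,0x5d,0xae,0x22,0x23,
            0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,
            0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad
        };
        Sha256 sha256;
        byte   digest[SHA256_DIGEST_SIZE];
        int    ret = wc_InitSha256(&sha256);
        if (ret == 0)
            ret = wc_Sha256Update(&sha256, (const byte*)"abc", 3);
        if (ret == 0)
            ret = wc_Sha256Final(&sha256, digest);
        if (ret == 0 && XMEMCMP(digest, expected, SHA256_DIGEST_SIZE) != 0)
            ret = -1;
        return ret;
    }
    #endif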
00655 
00656 
00657 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00658 
00659 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
00660 { word32 d;\
00661     d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs);\
00662     d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs);\
00663     d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs);\
00664     d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs);\
00665     d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs);\
00666     d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs);\
00667     d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs);\
00668     d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs);\
00669 }
00670 
00671 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
00672 { word32 d; \
00673     __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs); sha256->digest[0] += d;\
00674     __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs); sha256->digest[1] += d;\
00675     __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs); sha256->digest[2] += d;\
00676     __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs); sha256->digest[3] += d;\
00677     __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs); sha256->digest[4] += d;\
00678     __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs); sha256->digest[5] += d;\
00679     __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs); sha256->digest[6] += d;\
00680     __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs); sha256->digest[7] += d;\
00681 }
00682 
00683 
00684 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
00685     _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
00686 
00687 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
00688     _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
00689 
00690 
00691 #define S_0 %r15d
00692 #define S_1 %r10d
00693 #define S_2 %r11d
00694 #define S_3 %r12d
00695 #define S_4 %r13d
00696 #define S_5 %r14d
00697 #define S_6 %ebx
00698 #define S_7 %r9d
00699 
00700 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
00701 
00702 #if defined(HAVE_INTEL_RORX)
00703 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
00704 __asm__ volatile("rorx  $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs);  /* edx = e>>6 */\
00705 
00706 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
00707 __asm__ volatile("rorx  $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11  */\
00708 __asm__ volatile("xorl  %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6)  */\
00709 __asm__ volatile("rorx  $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs);   /* edx = e>>25             */\
00710 
00711 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
00712 __asm__ volatile("movl  %"#f", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f   */\
00713 __asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f ^ g  */\
00714 __asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);  /* edx = Sigma1(e)  */\
00715 __asm__ volatile("andl  %"#e", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = (f ^ g) & e       */\
00716 __asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = Ch(e,f,g)         */\
00717 
00718 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
00719 /*__asm__ volatile("movl    %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
00720 __asm__ volatile("addl  %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs);    /* h += w_k  */\
00721 __asm__ volatile("addl  %%edx, %"#h"\n\t":::"%edx",SSE_REGs);     /* h = h + w_k + Sigma1(e) */\
00722 __asm__ volatile("rorx  $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = a>>2   */\
00723 __asm__ volatile("rorx  $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13  */\
00724 
00725 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
00726 __asm__ volatile("rorx  $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
00727 __asm__ volatile("xorl  %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13)  */\
00728 __asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);  /* edx = Sigma0(a)      */\
00729 
00730 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
00731 __asm__ volatile("movl  %"#b", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = b          */\
00732 __asm__ volatile("orl   %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a | b      */\
00733 __asm__ volatile("andl  %"#c", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = (a | b) & c*/\
00734 __asm__ volatile("movl  %"#b", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b           */\
00735 
00736 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
00737 __asm__ volatile("addl  %%esi, %"#h"\n\t":::"%esi",SSE_REGs);  /* h += Ch(e,f,g)   */\
00738 __asm__ volatile("andl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b & a       */\
00739 __asm__ volatile("orl   %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
00740 
00741 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
00742 __asm__ volatile("addl  "#h", "#d"\n\t");  /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
00743 __asm__ volatile("addl  %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
00744 __asm__ volatile("addl  %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \
00745 __asm__ volatile("movl  %r8d, "#h"\n\t");
00746 #endif /* HAVE_INTEL_RORX */
00747 
00748 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
00749 __asm__ volatile("movl  %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
00750 __asm__ volatile("roll  $26, %%edx\n\t":::"%edx",SSE_REGs);  /* edx = e>>6     */\
00751 __asm__ volatile("movl  %"#e", %%edi\n\t":::"%edi",SSE_REGs);\
00752 
00753 #define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
00754 __asm__ volatile("roll  $21, %%edi\n\t":::"%edi",SSE_REGs);         /* edi = e>>11 */\
00755 __asm__ volatile("xorl  %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6)  */\
00756 __asm__ volatile("movl  %"#e", %%edx\n\t":::"%edx",SSE_REGs);   /* edx = e      */\
00757 __asm__ volatile("roll  $7, %%edx\n\t":::"%edx",SSE_REGs);      /* edx = e>>25  */\
00758 
00759 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
00760 __asm__ volatile("movl  %"#f", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f       */\
00761 __asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = f ^ g   */\
00762 __asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
00763 __asm__ volatile("andl  %"#e", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = (f ^ g) & e  */\
00764 __asm__ volatile("xorl  %"#g", %%esi\n\t":::"%esi",SSE_REGs);  /* esi = Ch(e,f,g)    */\
00765 
00766 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
00767 __asm__ volatile("addl  %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k  */\
00768 __asm__ volatile("addl  %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
00769 __asm__ volatile("movl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = a    */\
00770 __asm__ volatile("roll  $30, %%r8d\n\t":::"%r8",SSE_REGs);    /* r8d = a>>2 */\
00771 __asm__ volatile("movl  %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a   */\
00772 __asm__ volatile("roll  $19, %%edi\n\t":::"%edi",SSE_REGs);    /* edi = a>>13 */\
00773 __asm__ volatile("movl  %"#a", %%edx\n\t":::"%edx",SSE_REGs);  /* edx = a     */\
00774 
00775 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
00776 __asm__ volatile("roll  $10, %%edx\n\t":::"%edx",SSE_REGs);    /* edx = a>>22 */\
00777 __asm__ volatile("xorl  %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13)  */\
00778 __asm__ volatile("xorl  %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a)         */\
00779 
00780 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
00781 __asm__ volatile("movl  %"#b", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = b      */\
00782 __asm__ volatile("orl   %"#a", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = a | b  */\
00783 __asm__ volatile("andl  %"#c", %%edi\n\t":::"%edi",SSE_REGs);  /* edi = (a | b) & c */\
00784 __asm__ volatile("movl  %"#b", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b       */\
00785 
00786 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
00787 __asm__ volatile("addl  %%esi, %"#h"\n\t":::"%esi",SSE_REGs);  /* h += Ch(e,f,g)        */\
00788 __asm__ volatile("andl  %"#a", %%r8d\n\t":::"%r8",SSE_REGs);  /* r8d = b & a            */\
00789 __asm__ volatile("orl   %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
00790 
00791 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
00792 __asm__ volatile("addl  "#h", "#d"\n\t");  /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
00793 __asm__ volatile("addl  %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
00794                  /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
00795 __asm__ volatile("addl  %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
00796                  /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c)     */\
00797 __asm__ volatile("movl  %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
00798                  /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
00799 
00800 #define RND_X(a,b,c,d,e,f,g,h,i) \
00801        RND_STEP_1(a,b,c,d,e,f,g,h,i); \
00802        RND_STEP_2(a,b,c,d,e,f,g,h,i); \
00803        RND_STEP_3(a,b,c,d,e,f,g,h,i); \
00804        RND_STEP_4(a,b,c,d,e,f,g,h,i); \
00805        RND_STEP_5(a,b,c,d,e,f,g,h,i); \
00806        RND_STEP_6(a,b,c,d,e,f,g,h,i); \
00807        RND_STEP_7(a,b,c,d,e,f,g,h,i); \
00808        RND_STEP_8(a,b,c,d,e,f,g,h,i);
00809 
00810 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
00811 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
00812 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
00813 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
00814 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
00815 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
00816 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
00817 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
00818 
00819 
00820 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\
00821        RND_STEP_1(a,b,c,d,e,f,g,h,i); \
00822        RND_STEP_2(a,b,c,d,e,f,g,h,i); \
00823        RND_STEP_3(a,b,c,d,e,f,g,h,i); \
00824 }
00825 
00826 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\
00827        RND_STEP_4(a,b,c,d,e,f,g,h,i); \
00828        RND_STEP_5(a,b,c,d,e,f,g,h,i); \
00829        RND_STEP_6(a,b,c,d,e,f,g,h,i); \
00830 }
00831 
00832 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\
00833        RND_STEP_7(a,b,c,d,e,f,g,h,i); \
00834        RND_STEP_8(a,b,c,d,e,f,g,h,i); \
00835 }
00836 
00837 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
00838 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
00839 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
00840 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
00841 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
00842 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
00843 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
00844 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
00845 
00846 
00847 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
00848 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
00849 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
00850 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
00851 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
00852 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
00853 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
00854 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
00855 
00856 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
00857 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
00858 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
00859 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
00860 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
00861 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
00862 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
00863 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
00864 
00865 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
00866 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
00867 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
00868 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
00869 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
00870 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
00871 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
00872 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
00873 
00874 #define FOR(cnt, init, max, inc, loop)  \
00875     __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
00876 #define END(cnt, init, max, inc, loop)  \
00877     __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::);
00878 
00879 #endif  /* defined(HAVE_INTEL_AVX1) ||  defined(HAVE_INTEL_AVX2) */
00880 
00881 #if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */
00882 
00883 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
00884 #define VPADDD(op1,op2,op3)       __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
00885 #define VPSRLD(op1,op2,op3)       __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
00886 #define VPSRLQ(op1,op2,op3)       __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
00887 #define VPSLLD(op1,op2,op3)       __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
00888 #define VPOR(op1,op2,op3)         __asm__ volatile("vpor   %"#op3", %"#op2", %"#op1:::XMM_REGs)
00889 #define VPXOR(op1,op2,op3)        __asm__ volatile("vpxor  %"#op3", %"#op2", %"#op1:::XMM_REGs)
00890 #define VPSHUFD(op1,op2,op3)      __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
00891 #define VPSHUFB(op1,op2,op3)      __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
00892 
00893 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
00894      a,b,c,d,e,f,g,h,_i)\
00895             RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
00896     VPALIGNR (XTMP0, X3, X2, 4);\
00897             RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
00898     VPADDD   (XTMP0, XTMP0, X0);\
00899             RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
00900     VPALIGNR (XTMP1, X1, X0, 4);   /* XTMP1 = W[-15] */\
00901             RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
00902     VPSRLD   (XTMP2, XTMP1, 7);\
00903             RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
00904     VPSLLD   (XTMP3, XTMP1, 25); /* VPSLLD   (XTMP3, XTMP1, (32-7)) */\
00905             RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
00906     VPOR     (XTMP3, XTMP3, XTMP2);  /* XTMP1 = W[-15] MY_ROR 7 */\
00907             RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
00908     VPSRLD   (XTMP2, XTMP1,18);\
00909             RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
00910 \
00911             RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
00912     VPSRLD   (XTMP4, XTMP1, 3);  /* XTMP4 = W[-15] >> 3 */\
00913             RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
00914     VPSLLD   (XTMP1, XTMP1, 14); /* VPSLLD   (XTMP1, XTMP1, (32-18)) */\
00915             RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
00916     VPXOR    (XTMP3, XTMP3, XTMP1);\
00917             RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
00918     VPXOR    (XTMP3, XTMP3, XTMP2);  /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
00919             RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
00920     VPXOR    (XTMP1, XTMP3, XTMP4);  /* XTMP1 = s0 */\
00921             RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
00922     VPSHUFD(XTMP2, X3, 0b11111010);  /* XTMP2 = W[-2] {BBAA}*/\
00923             RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
00924     VPADDD   (XTMP0, XTMP0, XTMP1);  /* XTMP0 = W[-16] + W[-7] + s0 */\
00925             RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
00926 \
00927             RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
00928     VPSRLD   (XTMP4, XTMP2, 10);      /* XTMP4 = W[-2] >> 10 {BBAA} */\
00929             RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
00930     VPSRLQ   (XTMP3, XTMP2, 19);      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
00931             RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
00932     VPSRLQ   (XTMP2, XTMP2, 17);      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
00933             RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
00934     VPXOR    (XTMP2, XTMP2, XTMP3);\
00935             RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
00936     VPXOR    (XTMP4, XTMP4, XTMP2);   /* XTMP4 = s1 {xBxA} */\
00937             RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
00938     VPSHUFB  (XTMP4, XTMP4, SHUF_00BA);  /* XTMP4 = s1 {00BA} */\
00939             RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
00940     VPADDD   (XTMP0, XTMP0, XTMP4);  /* XTMP0 = {..., ..., W[1], W[0]} */\
00941             RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
00942 \
00943             RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
00944     VPSHUFD  (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\
00945             RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
00946     VPSRLD   (XTMP5, XTMP2, 10);       /* XTMP5 = W[-2] >> 10 {DDCC} */\
00947             RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
00948     VPSRLQ   (XTMP3, XTMP2, 19);       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
00949             RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
00950     VPSRLQ   (XTMP2, XTMP2, 17);      /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
00951             RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
00952     VPXOR    (XTMP2, XTMP2, XTMP3);\
00953             RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
00954     VPXOR    (XTMP5, XTMP5, XTMP2);   /* XTMP5 = s1 {xDxC} */\
00955             RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
00956     VPSHUFB  (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\
00957             RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
00958     VPADDD   (X0, XTMP5, XTMP0);      /* X0 = {W[3], W[2], W[1], W[0]} */\
00959 
00960 #if defined(HAVE_INTEL_RORX)
00961 
00962 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
00963                           XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
00964             RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
00965     VPALIGNR (XTMP0, X3, X2, 4);\
00966             RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
00967     VPADDD   (XTMP0, XTMP0, X0);\
00968             RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
00969     VPALIGNR (XTMP1, X1, X0, 4);   /* XTMP1 = W[-15] */\
00970             RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
00971     VPSRLD   (XTMP2, XTMP1, 7);\
00972             RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
00973     VPSLLD   (XTMP3, XTMP1, 25); /* VPSLLD   (XTMP3, XTMP1, (32-7)) */\
00974             RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
00975     VPOR     (XTMP3, XTMP3, XTMP2);  /* XTMP1 = W[-15] MY_ROR 7 */\
00976             RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
00977     VPSRLD   (XTMP2, XTMP1,18);\
00978             RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
00979 \
00980             RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
00981     VPSRLD   (XTMP4, XTMP1, 3);  /* XTMP4 = W[-15] >> 3 */\
00982             RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
00983     VPSLLD   (XTMP1, XTMP1, 14); /* VPSLLD   (XTMP1, XTMP1, (32-18)) */\
00984             RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
00985     VPXOR    (XTMP3, XTMP3, XTMP1);\
00986             RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
00987     VPXOR    (XTMP3, XTMP3, XTMP2);  /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
00988             RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
00989     VPXOR    (XTMP1, XTMP3, XTMP4);  /* XTMP1 = s0 */\
00990             RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
00991     VPSHUFD(XTMP2, X3, 0b11111010);  /* XTMP2 = W[-2] {BBAA}*/\
00992             RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
00993     VPADDD   (XTMP0, XTMP0, XTMP1);  /* XTMP0 = W[-16] + W[-7] + s0 */\
00994             RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
00995 \
00996             RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
00997     VPSRLD   (XTMP4, XTMP2, 10);      /* XTMP4 = W[-2] >> 10 {BBAA} */\
00998             RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
00999     VPSRLQ   (XTMP3, XTMP2, 19);      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
01000             RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
01001     VPSRLQ   (XTMP2, XTMP2, 17);      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
01002             RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
01003     VPXOR    (XTMP2, XTMP2, XTMP3);\
01004             RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
01005     VPXOR    (XTMP4, XTMP4, XTMP2);   /* XTMP4 = s1 {xBxA} */\
01006             RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
01007     VPSHUFB  (XTMP4, XTMP4, SHUF_00BA);  /* XTMP4 = s1 {00BA} */\
01008             RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
01009     VPADDD   (XTMP0, XTMP0, XTMP4);  /* XTMP0 = {..., ..., W[1], W[0]} */\
01010             RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
01011 \
01012             RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
01013     VPSHUFD  (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\
01014             RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
01015     VPSRLD   (XTMP5, XTMP2, 10);       /* XTMP5 = W[-2] >> 10 {DDCC} */\
01016             RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
01017     VPSRLQ   (XTMP3, XTMP2, 19);       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
01018             RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
01019     VPSRLQ   (XTMP2, XTMP2, 17);      /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
01020             RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
01021     VPXOR    (XTMP2, XTMP2, XTMP3);\
01022             RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
01023     VPXOR    (XTMP5, XTMP5, XTMP2);   /* XTMP5 = s1 {xDxC} */\
01024             RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
01025     VPSHUFB  (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\
01026             RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
01027     VPADDD   (X0, XTMP5, XTMP0);      /* X0 = {W[3], W[2], W[1], W[0]} */\
01028 
01029 #endif /* HAVE_INTEL_RORX */
01030 
01031 
01032 #define W_K_from_buff\
01033          __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
01034                           "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
01035                           :: "m"(sha256->buffer[0]):"%xmm4");\
01036          __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
01037                           "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
01038                           ::"m"(sha256->buffer[4]):"%xmm5");\
01039          __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
01040                           "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
01041                           ::"m"(sha256->buffer[8]):"%xmm6");\
01042          __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
01043                           "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
01044                           ::"m"(sha256->buffer[12]):"%xmm7");\
01045 
01046 #define _SET_W_K_XFER(reg, i)\
01047     __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs);\
01048     __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs);
01049 
01050 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
01051 
01052 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
01053 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
01054 static const ALIGN32 word64 mBYTE_FLIP_MASK[] =  { 0x0405060700010203, 0x0c0d0e0f08090a0b };
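/* Editor's note (not part of the original source): with vpshufb,
 * mBYTE_FLIP_MASK above selects bytes 3,2,1,0 / 7,6,5,4 / ... so it reverses
 * the byte order within each 32-bit word, performing the ByteReverseWords()
 * conversion in-register when W_K_from_buff loads the message block. */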
01055 
01056 
01057 #define _Init_Masks(mask1, mask2, mask3)\
01058 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0]));\
01059 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0]));\
01060 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0]));
01061 
01062 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
01063     _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01064 
01065 #define X0 %xmm4
01066 #define X1 %xmm5
01067 #define X2 %xmm6
01068 #define X3 %xmm7
01069 #define X_ X0
01070 
01071 #define XTMP0 %xmm0
01072 #define XTMP1 %xmm1
01073 #define XTMP2 %xmm2
01074 #define XTMP3 %xmm3
01075 #define XTMP4 %xmm8
01076 #define XTMP5 %xmm9
01077 #define XFER  %xmm10
01078 
01079 #define SHUF_00BA   %xmm11 /* shuffle xBxA -> 00BA */
01080 #define SHUF_DC00   %xmm12 /* shuffle xDxC -> DC00 */
01081 #define BYTE_FLIP_MASK  %xmm13
01082 
01083 #define XMM_REGs   /* Registers are saved in Sha256Update/Final */
01084                    /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */
01085 
01086 static int Transform_AVX1(Sha256* sha256)
01087 {
01088     ALIGN32 word32 W_K[64];  /* temp for W+K */
01089 
01090     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00);
01091     W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */
01092 
01093     DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01094 
01095     SET_W_K_XFER(X0, 0);
01096 
01097     MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01098             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
01099     SET_W_K_XFER(X1, 4);
01100     MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01101             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4);
01102     SET_W_K_XFER(X2, 8);
01103     MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01104             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
01105     SET_W_K_XFER(X3, 12);
01106     MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01107             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12);
01108     SET_W_K_XFER(X0, 16);
01109     MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01110             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
01111     SET_W_K_XFER(X1, 20);
01112     MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01113             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20);
01114     SET_W_K_XFER(X2, 24);
01115     MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01116             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
01117     SET_W_K_XFER(X3, 28);
01118     MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01119             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28);
01120     SET_W_K_XFER(X0, 32);
01121     MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01122             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
01123     SET_W_K_XFER(X1, 36);
01124     MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01125             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36);
01126     SET_W_K_XFER(X2, 40);
01127     MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01128             SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
01129     SET_W_K_XFER(X3, 44);
01130     MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
01131             SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44);
01132 
01133     SET_W_K_XFER(X0, 48);
01134     SET_W_K_XFER(X1, 52);
01135     SET_W_K_XFER(X2, 56);
01136     SET_W_K_XFER(X3, 60);
01137 
01138     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
01139     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
01140     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
01141     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
01142 
01143     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
01144     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
01145     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
01146     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
01147 
01148     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
01149     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
01150     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
01151     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
01152 
01153     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
01154     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
01155     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
01156     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
01157 
01158     RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01159 
01160     return 0;
01161 }
01162 
01163 #if defined(HAVE_INTEL_RORX)
01164 static int Transform_AVX1_RORX(Sha256* sha256)
01165 {
01166     ALIGN32 word32 W_K[64];  /* temp for W+K */
01167 
01168     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00);
01169     W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */
01170 
01171     DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01172     SET_W_K_XFER(X0, 0);
01173     MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01174             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
01175     SET_W_K_XFER(X1, 4);
01176     MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01177             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4);
01178     SET_W_K_XFER(X2, 8);
01179     MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01180             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
01181     SET_W_K_XFER(X3, 12);
01182     MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01183             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12);
01184     SET_W_K_XFER(X0, 16);
01185     MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01186             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
01187     SET_W_K_XFER(X1, 20);
01188     MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01189             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20);
01190     SET_W_K_XFER(X2, 24);
01191     MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01192             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
01193     SET_W_K_XFER(X3, 28);
01194     MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01195             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28);
01196     SET_W_K_XFER(X0, 32);
01197     MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01198             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
01199     SET_W_K_XFER(X1, 36);
01200     MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01201             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36);
01202     SET_W_K_XFER(X2, 40);
01203     MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01204             XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
01205     SET_W_K_XFER(X3, 44);
01206     MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
01207             XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44);
01208 
01209     SET_W_K_XFER(X0, 48);
01210     SET_W_K_XFER(X1, 52);
01211     SET_W_K_XFER(X2, 56);
01212     SET_W_K_XFER(X3, 60);
01213 
01214     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
01215     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
01216     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
01217     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
01218 
01219     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
01220     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
01221     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
01222     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
01223 
01224     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
01225     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
01226     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
01227     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
01228 
01229     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
01230     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
01231     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
01232     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
01233 
01234     RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01235 
01236     return 0;
01237 }
01238 #endif  /* HAVE_INTEL_RORX */
01239 #endif  /* HAVE_INTEL_AVX1 */
01240 
01241 
01242 #if defined(HAVE_INTEL_AVX2)
01243 
01244 #define _MOVE_to_REG(ymm, mem)       __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs);
01245 #define _MOVE_to_MEM(mem, ymm)       __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs);
01246 #define _BYTE_SWAP(ymm, map)              __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
01247                                                        :: "m"(map):YMM_REGs);
01248 #define _MOVE_128(ymm0, ymm1, ymm2, map)   __asm__ volatile("vperm2i128  $"#map", %%"\
01249                                   #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
01250 #define _MOVE_BYTE(ymm0, ymm1, map)  __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
01251                                   #ymm0"\n\t":: "m"(map):YMM_REGs);
01252 #define _S_TEMP(dest, src, bits, temp)    __asm__ volatile("vpsrld  $"#bits", %%"\
01253          #src", %%"#dest"\n\tvpslld  $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
01254          #temp",%%"#dest", %%"#dest" ":::YMM_REGs);
01255 #define _AVX2_R(dest, src, bits)          __asm__ volatile("vpsrld  $"#bits", %%"\
01256                                   #src", %%"#dest" ":::YMM_REGs);
01257 #define _XOR(dest, src1, src2)       __asm__ volatile("vpxor   %%"#src1", %%"\
01258          #src2", %%"#dest" ":::YMM_REGs);
01259 #define _OR(dest, src1, src2)       __asm__ volatile("vpor    %%"#src1", %%"\
01260          #src2", %%"#dest" ":::YMM_REGs);
01261 #define _ADD(dest, src1, src2)       __asm__ volatile("vpaddd   %%"#src1", %%"\
01262          #src2", %%"#dest" ":::YMM_REGs);
01263 #define _ADD_MEM(dest, src1, mem)    __asm__ volatile("vpaddd   %0, %%"#src1", %%"\
01264          #dest" "::"m"(mem):YMM_REGs);
01265 #define _BLEND(map, dest, src1, src2)    __asm__ volatile("vpblendd    $"#map", %%"\
01266          #src1",   %%"#src2", %%"#dest" ":::YMM_REGs);
01267 
01268 #define    _EXTRACT_XMM_0(xmm, mem)  __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01269 #define    _EXTRACT_XMM_1(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01270 #define    _EXTRACT_XMM_2(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01271 #define    _EXTRACT_XMM_3(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01272 #define    _EXTRACT_XMM_4(ymm, xmm, mem)\
01273       __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);\
01274       __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01275 #define    _EXTRACT_XMM_5(xmm, mem)  __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01276 #define    _EXTRACT_XMM_6(xmm, mem)  __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01277 #define    _EXTRACT_XMM_7(xmm, mem)  __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
01278 
01279 #define    _SWAP_YMM_HL(ymm)   __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);
01280 #define     SWAP_YMM_HL(ymm)   _SWAP_YMM_HL(ymm)
01281 
01282 #define MOVE_to_REG(ymm, mem)      _MOVE_to_REG(ymm, mem)
01283 #define MOVE_to_MEM(mem, ymm)      _MOVE_to_MEM(mem, ymm)
01284 #define BYTE_SWAP(ymm, map)        _BYTE_SWAP(ymm, map)
01285 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
01286 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
01287 #define XOR(dest, src1, src2)      _XOR(dest, src1, src2)
01288 #define OR(dest, src1, src2)       _OR(dest, src1, src2)
01289 #define ADD(dest, src1, src2)      _ADD(dest, src1, src2)
01290 #define ADD_MEM(dest, src1, mem)  _ADD_MEM(dest, src1, mem)
01291 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
01292 
01293 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
01294 #define AVX2_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
01295 #define AVX2_R(dest, src, bits)      _AVX2_R(dest, src, bits)
01296 
01297 #define GAMMA0(dest, src)      AVX2_S(dest, src, 7);  AVX2_S(G_TEMP, src, 18); \
01298     XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3);  XOR(dest, G_TEMP, dest);
01299 #define GAMMA0_1(dest, src)    AVX2_S(dest, src, 7);  AVX2_S(G_TEMP, src, 18);
01300 #define GAMMA0_2(dest, src)    XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3);  \
01301     XOR(dest, G_TEMP, dest);
01302 
01303 #define GAMMA1(dest, src)      AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
01304     XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest);
01305 #define GAMMA1_1(dest, src)    AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
01306 #define GAMMA1_2(dest, src)    XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); \
01307     XOR(dest, G_TEMP, dest);
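/* For reference, the GAMMA0/GAMMA1 macro pairs above compute the SHA-256
 * sigma0/sigma1 message-schedule functions on a whole YMM register of words
 * at once.  A minimal scalar sketch of the same math follows; rotr32_ref,
 * Gamma0_ref and Gamma1_ref are illustrative names added for this listing,
 * not part of the wolfCrypt API. */
static word32 rotr32_ref(word32 x, int n)
{
    return (x >> n) | (x << (32 - n));
}
static word32 Gamma0_ref(word32 x)  /* scalar equivalent of GAMMA0 */
{
    return rotr32_ref(x, 7) ^ rotr32_ref(x, 18) ^ (x >> 3);
}
static word32 Gamma1_ref(word32 x)  /* scalar equivalent of GAMMA1 */
{
    return rotr32_ref(x, 17) ^ rotr32_ref(x, 19) ^ (x >> 10);
}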
01308 
01309 #define    FEEDBACK1_to_W_I_2    MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]); \
01310     BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2);
01311 #define    FEEDBACK2_to_W_I_2    MOVE_128(YMM_TEMP0, W_I, W_I, 0x08);  \
01312     MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]); BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2);
01313 #define    FEEDBACK3_to_W_I_2    MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]); \
01314     BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2);
01315 
01316 #define    FEEDBACK_to_W_I_7     MOVE_128(YMM_TEMP0, W_I, W_I, 0x08);\
01317     MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]); BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7);
01318 
01319 #undef voitle
01320 
01321 #define W_I_16  ymm8
01322 #define W_I_15  ymm9
01323 #define W_I_7  ymm10
01324 #define W_I_2  ymm11
01325 #define W_I    ymm12
01326 #define G_TEMP     ymm13
01327 #define S_TEMP     ymm14
01328 #define YMM_TEMP0  ymm15
01329 #define YMM_TEMP0x xmm15
01330 #define W_I_TEMP   ymm7
01331 #define W_K_TEMP   ymm15
01332 #define W_K_TEMPx  xmm15
01333 
01334 #define YMM_REGs /* Registers are saved in Sha256Update/Final */
01335  /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
01336 
01337 
01338 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
01339     __asm__ volatile("vperm2i128  $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
01340     __asm__ volatile("vpblendd    $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
01341     __asm__ volatile("vperm2i128 $0x01,  %%"#w_i_7",  %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
01342     __asm__ volatile("vpblendd    $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
01343     __asm__ volatile("vpshufd    $0x93,  %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
01344 
01345 #define MOVE_7_to_15(w_i_15, w_i_7)\
01346     __asm__ volatile("vmovdqu                 %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs);\
01347 
01348 #define MOVE_I_to_7(w_i_7, w_i)\
01349     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i",   %%"#w_i",   %%"#w_i_7" ":::YMM_REGs);\
01350     __asm__ volatile("vpblendd    $0x01,       %%"#w_i_7",   %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
01351     __asm__ volatile("vpshufd    $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\
01352 
01353 #define MOVE_I_to_2(w_i_2, w_i)\
01354     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
01355     __asm__ volatile("vpshufd    $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\
01356 
01357 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
01358     MOVE_15_to_16(w_i_16, w_i_15, w_i_7); \
01359     MOVE_7_to_15(w_i_15, w_i_7); \
01360     MOVE_I_to_7(w_i_7, w_i); \
01361     MOVE_I_to_2(w_i_2, w_i);\
01362 
01363 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01364     { word32 d;\
01365     __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs);\
01366     sha256->digest[0] += d;\
01367     __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs);\
01368     sha256->digest[1] += d;\
01369     __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs);\
01370     sha256->digest[2] += d;\
01371     __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs);\
01372     sha256->digest[3] += d;\
01373     __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs);\
01374     sha256->digest[4] += d;\
01375     __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs);\
01376     sha256->digest[5] += d;\
01377     __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs);\
01378     sha256->digest[6] += d;\
01379     __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs);\
01380     sha256->digest[7] += d;\
01381 }
01382 
01383 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01384   { word32 d[8];\
01385     __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\
01386     __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\
01387     __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\
01388     __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\
01389     __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\
01390     __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\
01391     __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\
01392     __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\
01393         printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
01394     __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\
01395     __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\
01396     __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\
01397     __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\
01398     __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\
01399     __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\
01400     __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\
01401     __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\
01402 }
01403 
01404 
01405 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01406     _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01407 
01408 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01409     _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01410 
01411 #define DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
01412     _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
01413 
01414 
01415     /* Byte swap masks to ensure that the rest of the words are filled with zeros. */
01416     static const unsigned long mBYTE_FLIP_MASK_16[] =
01417         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
01418     static const unsigned long mBYTE_FLIP_MASK_15[] =
01419         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
01420     static const unsigned long mBYTE_FLIP_MASK_7 [] =
01421         { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b };
01422     static const unsigned long mBYTE_FLIP_MASK_2 [] =
01423         { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 };
01424 
01425     static const unsigned long mMAPtoW_I_7[] =
01426         { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 };
01427     static const unsigned long mMAP1toW_I_2[] =
01428         { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 };
01429     static const unsigned long mMAP2toW_I_2[] =
01430         { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 };
01431     static const unsigned long mMAP3toW_I_2[] =
01432         { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 };
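    /* Sketch of the byte-select rule the masks above rely on: within each
     * 16-byte lane, vpshufb copies src[mask & 0x0f] into dst and writes zero
     * wherever the mask byte has its top bit (0x80) set, which is how the
     * unused words end up cleared.  shuffle_lane_ref is an illustrative
     * helper added for this listing, not part of the wolfCrypt API. */
    static void shuffle_lane_ref(byte* dst, const byte* src, const byte* mask)
    {
        int i;
        for (i = 0; i < 16; i++)
            dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
    }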
01433 
01434 static int Transform_AVX2(Sha256* sha256)
01435 {
01436 #ifdef WOLFSSL_SMALL_STACK
01437     word32* W_K;
01438     W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01439     if (W_K == NULL)
01440         return MEMORY_E;
01441 #else
01442     word32 W_K[64];
01443 #endif
01444 
01445     MOVE_to_REG(W_I_16, sha256->buffer[0]);     BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]);
01446     MOVE_to_REG(W_I_15, sha256->buffer[1]);     BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]);
01447     MOVE_to_REG(W_I,    sha256->buffer[8]);    BYTE_SWAP(W_I,    mBYTE_FLIP_MASK_16[0]);
01448     MOVE_to_REG(W_I_7,  sha256->buffer[16-7]); BYTE_SWAP(W_I_7,  mBYTE_FLIP_MASK_7[0]);
01449     MOVE_to_REG(W_I_2,  sha256->buffer[16-2]); BYTE_SWAP(W_I_2,  mBYTE_FLIP_MASK_2[0]);
01450 
01451     DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01452 
01453     ADD_MEM(W_K_TEMP, W_I_16, K[0]);
01454     MOVE_to_MEM(W_K[0], W_K_TEMP);
01455 
01456     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
01457     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1);
01458     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2);
01459     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3);
01460     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4);
01461     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5);
01462     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6);
01463     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7);
01464 
01465     ADD_MEM(YMM_TEMP0, W_I, K[8]);
01466     MOVE_to_MEM(W_K[8], YMM_TEMP0);
01467 
01468     /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01469             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
01470     GAMMA0_1(W_I_TEMP, W_I_15);
01471             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
01472     GAMMA0_2(W_I_TEMP, W_I_15);
01473             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
01474     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01475             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
01476     ADD(W_I, W_I_7, W_I_TEMP);
01477             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
01478     GAMMA1_1(YMM_TEMP0, W_I_2);
01479             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
01480     GAMMA1_2(YMM_TEMP0, W_I_2);
01481             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
01482     ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
01483             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
01484     FEEDBACK1_to_W_I_2;
01485             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
01486     FEEDBACK_to_W_I_7;
01487             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
01488     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01489             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
01490     GAMMA1_1(YMM_TEMP0, W_I_2);
01491             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
01492     GAMMA1_2(YMM_TEMP0, W_I_2);
01493             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
01494     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
01495             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
01496     FEEDBACK2_to_W_I_2;
01497             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
01498     GAMMA1_1(YMM_TEMP0, W_I_2);
01499             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
01500     GAMMA1_2(YMM_TEMP0, W_I_2);
01501             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
01502     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
01503             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
01504     FEEDBACK3_to_W_I_2;
01505             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
01506     GAMMA1(YMM_TEMP0, W_I_2);
01507             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
01508             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
01509     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
01510             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
01511 
01512     MOVE_to_REG(YMM_TEMP0, K[16]);
01513             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
01514     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01515             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
01516     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01517     MOVE_to_MEM(W_K[16], YMM_TEMP0);
01518 
01519     /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01520             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
01521     GAMMA0_1(W_I_TEMP, W_I_15);
01522             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
01523     GAMMA0_2(W_I_TEMP, W_I_15);
01524             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
01525     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01526             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
01527     ADD(W_I, W_I_7, W_I_TEMP);
01528             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
01529     GAMMA1_1(YMM_TEMP0, W_I_2);
01530             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
01531     GAMMA1_2(YMM_TEMP0, W_I_2);
01532             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
01533     ADD(W_I, W_I, YMM_TEMP0);/* now W[24..25] are completed */
01534             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
01535     FEEDBACK1_to_W_I_2;
01536             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
01537     FEEDBACK_to_W_I_7;
01538             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
01539     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01540             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
01541     GAMMA1(YMM_TEMP0, W_I_2);
01542             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
01543     GAMMA1_2(YMM_TEMP0, W_I_2);
01544             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
01545     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[24..27] are completed */
01546             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
01547     FEEDBACK2_to_W_I_2;
01548             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
01549     GAMMA1_1(YMM_TEMP0, W_I_2);
01550             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
01551     GAMMA1_2(YMM_TEMP0, W_I_2);
01552             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
01553     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[24..29] are completed */
01554             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
01555     FEEDBACK3_to_W_I_2;
01556             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
01557     GAMMA1_1(YMM_TEMP0, W_I_2);
01558             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
01559     GAMMA1_2(YMM_TEMP0, W_I_2);
01560             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
01561     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[24..31] are completed */
01562             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
01563 
01564     MOVE_to_REG(YMM_TEMP0, K[24]);
01565             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
01566     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01567             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
01568     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01569     MOVE_to_MEM(W_K[24], YMM_TEMP0);
01570 
01571             /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01572             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
01573     GAMMA0_1(W_I_TEMP, W_I_15);
01574             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
01575     GAMMA0_2(W_I_TEMP, W_I_15);
01576             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
01577     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01578             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
01579     ADD(W_I, W_I_7, W_I_TEMP);
01580             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
01581     GAMMA1_1(YMM_TEMP0, W_I_2);
01582             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
01583     GAMMA1_2(YMM_TEMP0, W_I_2);
01584             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
01585     ADD(W_I, W_I, YMM_TEMP0);/* now W[32..33] are completed */
01586             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
01587     FEEDBACK1_to_W_I_2;
01588             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
01589     FEEDBACK_to_W_I_7;
01590             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
01591     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01592             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
01593     GAMMA1_1(YMM_TEMP0, W_I_2);
01594             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
01595     GAMMA1_2(YMM_TEMP0, W_I_2);
01596             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
01597     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[32..35] are completed */
01598             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
01599     FEEDBACK2_to_W_I_2;
01600             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
01601     GAMMA1_1(YMM_TEMP0, W_I_2);
01602             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
01603     GAMMA1_2(YMM_TEMP0, W_I_2);
01604             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
01605     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[32..37] are completed */
01606             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
01607     FEEDBACK3_to_W_I_2;
01608             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
01609     GAMMA1(YMM_TEMP0, W_I_2);
01610             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
01611             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
01612     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[32..39] are completed */
01613             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
01614 
01615     MOVE_to_REG(YMM_TEMP0, K[32]);
01616             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
01617     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01618             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
01619     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01620     MOVE_to_MEM(W_K[32], YMM_TEMP0);
01621 
01622 
01623             /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01624             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
01625     GAMMA0_1(W_I_TEMP, W_I_15);
01626             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
01627     GAMMA0_2(W_I_TEMP, W_I_15);
01628             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
01629     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01630             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
01631     ADD(W_I, W_I_7, W_I_TEMP);
01632             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
01633     GAMMA1_1(YMM_TEMP0, W_I_2);
01634             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
01635     GAMMA1_2(YMM_TEMP0, W_I_2);
01636             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
01637     ADD(W_I, W_I, YMM_TEMP0);/* now W[40..41] are completed */
01638             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
01639     FEEDBACK1_to_W_I_2;
01640             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
01641     FEEDBACK_to_W_I_7;
01642             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
01643     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01644             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
01645     GAMMA1_1(YMM_TEMP0, W_I_2);
01646             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
01647     GAMMA1_2(YMM_TEMP0, W_I_2);
01648             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
01649     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[40..43] are completed */
01650             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
01651     FEEDBACK2_to_W_I_2;
01652             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
01653     GAMMA1_1(YMM_TEMP0, W_I_2);
01654             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
01655     GAMMA1_2(YMM_TEMP0, W_I_2);
01656             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
01657     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[40..45] are completed */
01658             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
01659     FEEDBACK3_to_W_I_2;
01660             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
01661     GAMMA1_1(YMM_TEMP0, W_I_2);
01662             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
01663     GAMMA1_2(YMM_TEMP0, W_I_2);
01664             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
01665     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[40..47] are completed */
01666             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
01667 
01668     MOVE_to_REG(YMM_TEMP0, K[40]);
01669             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
01670     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01671             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
01672     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01673     MOVE_to_MEM(W_K[40], YMM_TEMP0);
01674 
01675             /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01676             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
01677     GAMMA0_1(W_I_TEMP, W_I_15);
01678             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
01679     GAMMA0_2(W_I_TEMP, W_I_15);
01680             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
01681     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01682             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
01683     ADD(W_I, W_I_7, W_I_TEMP);
01684             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
01685     GAMMA1_1(YMM_TEMP0, W_I_2);
01686             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
01687     GAMMA1_2(YMM_TEMP0, W_I_2);
01688             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
01689     ADD(W_I, W_I, YMM_TEMP0);/* now W[48..49] are completed */
01690             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
01691     FEEDBACK1_to_W_I_2;
01692             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
01693     FEEDBACK_to_W_I_7;
01694             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
01695     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01696             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
01697     GAMMA1_1(YMM_TEMP0, W_I_2);
01698             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
01699     GAMMA1_2(YMM_TEMP0, W_I_2);
01700             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
01701     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[48..51] are completed */
01702             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
01703     FEEDBACK2_to_W_I_2;
01704             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
01705     GAMMA1_1(YMM_TEMP0, W_I_2);
01706             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
01707     GAMMA1_2(YMM_TEMP0, W_I_2);
01708             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
01709     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[48..53] are completed */
01710             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
01711     FEEDBACK3_to_W_I_2;
01712             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
01713     GAMMA1_1(YMM_TEMP0, W_I_2);
01714             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
01715     GAMMA1_2(YMM_TEMP0, W_I_2);
01716             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
01717     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[48..55] are completed */
01718             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
01719 
01720     MOVE_to_REG(YMM_TEMP0, K[48]);
01721             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
01722     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01723             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
01724     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01725     MOVE_to_MEM(W_K[48], YMM_TEMP0);
01726 
01727             /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
01728             RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
01729     GAMMA0_1(W_I_TEMP, W_I_15);
01730             RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
01731     GAMMA0_2(W_I_TEMP, W_I_15);
01732             RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
01733     ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
01734             RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
01735     ADD(W_I, W_I_7, W_I_TEMP);
01736             RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
01737     GAMMA1_1(YMM_TEMP0, W_I_2);
01738             RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
01739     GAMMA1_2(YMM_TEMP0, W_I_2);
01740             RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
01741     ADD(W_I, W_I, YMM_TEMP0);/* now W[56..57] are completed */
01742             RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
01743     FEEDBACK1_to_W_I_2;
01744             RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
01745     FEEDBACK_to_W_I_7;
01746             RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
01747     ADD(W_I_TEMP, W_I_7, W_I_TEMP);
01748             RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
01749     GAMMA1_1(YMM_TEMP0, W_I_2);
01750             RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
01751     GAMMA1_2(YMM_TEMP0, W_I_2);
01752             RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
01753     ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[56..59] are completed */
01754             RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
01755     FEEDBACK2_to_W_I_2;
01756             RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
01757     GAMMA1_1(YMM_TEMP0, W_I_2);
01758             RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
01759     GAMMA1_2(YMM_TEMP0, W_I_2);
01760             RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
01761     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[56..61] are completed */
01762             RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
01763     FEEDBACK3_to_W_I_2;
01764             RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
01765     GAMMA1_1(YMM_TEMP0, W_I_2);
01766             RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
01767     GAMMA1_2(YMM_TEMP0, W_I_2);
01768             RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
01769     ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[56..63] are completed */
01770             RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
01771 
01772     MOVE_to_REG(YMM_TEMP0, K[56]);
01773             RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
01774     ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
01775             RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
01776     ADD(YMM_TEMP0, YMM_TEMP0, W_I);
01777     MOVE_to_MEM(W_K[56], YMM_TEMP0);
01778 
01779     RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
01780     RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
01781     RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
01782     RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
01783 
01784     RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
01785     RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
01786     RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
01787     RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
01788 
01789     RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
01790 
01791 #ifdef WOLFSSL_SMALL_STACK
01792     XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01793 #endif
01794 
01795     return 0;
01796 }
01797 
01798 #endif   /* HAVE_INTEL_AVX2 */
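/* A compact scalar sketch of the 64-round compression that the vectorized
 * Transform functions above implement.  It assumes sha256->buffer already
 * holds the sixteen message words with any little-endian byte swapping done,
 * and it reuses the K[64] constant table defined earlier in this file.
 * rotrw_ref and Transform_Scalar_ref are illustrative names added for this
 * listing, not part of the wolfCrypt API. */
static word32 rotrw_ref(word32 x, int n)
{
    return (x >> n) | (x << (32 - n));
}

static int Transform_Scalar_ref(Sha256* sha256)
{
    word32 W[64], S[8];
    word32 t0, t1, s0, s1, ch, maj, g0, g1;
    int i;

    /* message schedule: W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
    for (i = 0; i < 16; i++)
        W[i] = sha256->buffer[i];
    for (i = 16; i < 64; i++) {
        g1 = rotrw_ref(W[i-2], 17) ^ rotrw_ref(W[i-2], 19) ^ (W[i-2] >> 10);
        g0 = rotrw_ref(W[i-15], 7) ^ rotrw_ref(W[i-15], 18) ^ (W[i-15] >> 3);
        W[i] = g1 + W[i-7] + g0 + W[i-16];
    }

    /* working variables a..h live in S[0..7], as in the RND_* macros */
    for (i = 0; i < 8; i++)
        S[i] = sha256->digest[i];

    for (i = 0; i < 64; i++) {
        s1  = rotrw_ref(S[4], 6) ^ rotrw_ref(S[4], 11) ^ rotrw_ref(S[4], 25);
        ch  = (S[4] & S[5]) ^ (~S[4] & S[6]);
        s0  = rotrw_ref(S[0], 2) ^ rotrw_ref(S[0], 13) ^ rotrw_ref(S[0], 22);
        maj = (S[0] & S[1]) ^ (S[0] & S[2]) ^ (S[1] & S[2]);

        t0 = S[7] + s1 + ch + K[i] + W[i];
        t1 = s0 + maj;
        S[7] = S[6]; S[6] = S[5]; S[5] = S[4]; S[4] = S[3] + t0;
        S[3] = S[2]; S[2] = S[1]; S[1] = S[0]; S[0] = t0 + t1;
    }

    /* fold the working variables back into the running digest */
    for (i = 0; i < 8; i++)
        sha256->digest[i] += S[i];

    return 0;
}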
01799 
01800 
01801 #ifdef WOLFSSL_SHA224
01802     static int InitSha224(Sha224* sha224)
01803     {
01804         int ret = 0;
01805 
01806         sha224->digest[0] = 0xc1059ed8;
01807         sha224->digest[1] = 0x367cd507;
01808         sha224->digest[2] = 0x3070dd17;
01809         sha224->digest[3] = 0xf70e5939;
01810         sha224->digest[4] = 0xffc00b31;
01811         sha224->digest[5] = 0x68581511;
01812         sha224->digest[6] = 0x64f98fa7;
01813         sha224->digest[7] = 0xbefa4fa4;
01814 
01815         sha224->buffLen = 0;
01816         sha224->loLen   = 0;
01817         sha224->hiLen   = 0;
01818 
01819     #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
01820         /* choose best Transform function under this runtime environment */
01821         set_Transform();
01822     #endif
01823 
01824         return ret;
01825     }
01826 
01827     int wc_InitSha224_ex(Sha224* sha224, void* heap, int devId)
01828     {
01829         int ret = 0;
01830 
01831         if (sha224 == NULL)
01832             return BAD_FUNC_ARG;
01833 
01834         sha224->heap = heap;
01835 
01836         ret = InitSha224(sha224);
01837         if (ret != 0)
01838             return ret;
01839 
01840     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
01841         ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
01842                             WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
01843     #else
01844         (void)devId;
01845     #endif /* WOLFSSL_ASYNC_CRYPT */
01846 
01847         return ret;
01848     }
01849 
01850     int wc_InitSha224(Sha224* sha224)
01851     {
01852         return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
01853     }
01854 
01855     int wc_Sha224Update(Sha224* sha224, const byte* data, word32 len)
01856     {
01857         int ret;
01858 
01859     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
01860         if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
01861         #if defined(HAVE_INTEL_QA)
01862             return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
01863         #endif
01864         }
01865     #endif /* WOLFSSL_ASYNC_CRYPT */
01866 
01867         ret = Sha256Update((Sha256 *)sha224, data, len);
01868 
01869         return ret;
01870     }
01871 
01872     int wc_Sha224Final(Sha224* sha224, byte* hash)
01873     {
01874         int ret;
01875 
01876     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
01877         if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
01878         #if defined(HAVE_INTEL_QA)
01879             return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
01880                                             SHA224_DIGEST_SIZE);
01881         #endif
01882         }
01883     #endif /* WOLFSSL_ASYNC_CRYPT */
01884 
01885         ret = Sha256Final((Sha256*)sha224);
01886         if (ret != 0)
01887             return ret;
01888 
01889     #if defined(LITTLE_ENDIAN_ORDER)
01890         ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE);
01891     #endif
01892         XMEMCPY(hash, sha224->digest, SHA224_DIGEST_SIZE);
01893 
01894         return InitSha224(sha224);  /* reset state */
01895     }
01896 
01897     void wc_Sha224Free(Sha224* sha224)
01898     {
01899         if (sha224 == NULL)
01900             return;
01901 
01902     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
01903         wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
01904     #endif /* WOLFSSL_ASYNC_CRYPT */
01905     }
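    /* Usage sketch for the SHA-224 API defined above: hash a buffer in one
     * pass into a SHA224_DIGEST_SIZE (28 byte) output.  hash_sha224_once is
     * an illustrative helper name, not part of the wolfCrypt API. */
    static int hash_sha224_once(const byte* data, word32 len,
                                byte* out /* SHA224_DIGEST_SIZE bytes */)
    {
        Sha224 sha224;
        int ret;

        ret = wc_InitSha224(&sha224);
        if (ret == 0)
            ret = wc_Sha224Update(&sha224, data, len);
        if (ret == 0)
            ret = wc_Sha224Final(&sha224, out);
        wc_Sha224Free(&sha224);

        return ret;
    }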
01906 
01907 #endif /* WOLFSSL_SHA224 */
01908 
01909 
01910 int wc_InitSha256(Sha256* sha256)
01911 {
01912     return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
01913 }
01914 
01915 void wc_Sha256Free(Sha256* sha256)
01916 {
01917     if (sha256 == NULL)
01918         return;
01919 
01920 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
01921     wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
01922 #endif /* WOLFSSL_ASYNC_CRYPT */
01923 }
01924 
01925 #endif /* !WOLFSSL_TI_HASH */
01926 #endif /* HAVE_FIPS */
01927 
01928 
01929 #ifndef WOLFSSL_TI_HASH
01930 #ifdef WOLFSSL_SHA224
01931     int wc_Sha224GetHash(Sha224* sha224, byte* hash)
01932     {
01933         int ret;
01934         Sha224 tmpSha224;
01935 
01936         if (sha224 == NULL || hash == NULL)
01937             return BAD_FUNC_ARG;
01938 
01939         ret = wc_Sha224Copy(sha224, &tmpSha224);
01940         if (ret == 0) {
01941             ret = wc_Sha224Final(&tmpSha224, hash);
01942         }
01943         return ret;
01944     }
01945     int wc_Sha224Copy(Sha224* src, Sha224* dst)
01946     {
01947         int ret = 0;
01948 
01949         if (src == NULL || dst == NULL)
01950             return BAD_FUNC_ARG;
01951 
01952         XMEMCPY(dst, src, sizeof(Sha224));
01953 
01954     #ifdef WOLFSSL_ASYNC_CRYPT
01955         ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
01956     #endif
01957 
01958         return ret;
01959     }
01960 #endif /* WOLFSSL_SHA224 */
01961 
01962 int wc_Sha256GetHash(Sha256* sha256, byte* hash)
01963 {
01964     int ret;
01965     Sha256 tmpSha256;
01966 
01967     if (sha256 == NULL || hash == NULL)
01968         return BAD_FUNC_ARG;
01969 
01970     ret = wc_Sha256Copy(sha256, &tmpSha256);
01971     if (ret == 0) {
01972         ret = wc_Sha256Final(&tmpSha256, hash);
01973     }
01974     return ret;
01975 }
01976 int wc_Sha256Copy(Sha256* src, Sha256* dst)
01977 {
01978     int ret = 0;
01979 
01980     if (src == NULL || dst == NULL)
01981         return BAD_FUNC_ARG;
01982 
01983     XMEMCPY(dst, src, sizeof(Sha256));
01984 
01985 #ifdef WOLFSSL_ASYNC_CRYPT
01986     ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
01987 #endif
01988 
01989     return ret;
01990 }
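/* Usage sketch for wc_Sha256GetHash above: because it finalizes a copy of the
 * state, a caller can read an intermediate digest and keep hashing afterwards.
 * peek_running_sha256 is an illustrative helper name, not part of the
 * wolfCrypt API. */
static int peek_running_sha256(Sha256* sha256, const byte* more, word32 moreLen,
                               byte* midHash /* SHA256_DIGEST_SIZE bytes */)
{
    int ret;

    ret = wc_Sha256GetHash(sha256, midHash);  /* digest of the data so far */
    if (ret == 0 && more != NULL)
        ret = wc_Sha256Update(sha256, more, moreLen);  /* state keeps running */

    return ret;
}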
01991 #endif /* !WOLFSSL_TI_HASH */
01992 
01993 #endif /* NO_SHA256 */
01994