sha256.c

/* sha256.c
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


/* code submitted by raphael.huck@efixo.com */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfcrypt/settings.h>

#if !defined(NO_SHA256) && !defined(WOLFSSL_ARMASM)

#if defined(HAVE_FIPS) && \
    defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)

    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
    #define FIPS_NO_WRAPPERS

    #ifdef USE_WINDOWS_API
        #pragma code_seg(".fipsA$d")
        #pragma const_seg(".fipsB$d")
    #endif
#endif

#include <wolfcrypt/sha256.h>
#include <wolfcrypt/error-crypt.h>
#include <wolfcrypt/cpuid.h>

/* fips wrapper calls, user can call direct */
#if defined(HAVE_FIPS) && \
    (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))

    int wc_InitSha256(wc_Sha256* sha)
    {
        if (sha == NULL) {
            return BAD_FUNC_ARG;
        }
        return InitSha256_fips(sha);
    }
    int wc_InitSha256_ex(wc_Sha256* sha, void* heap, int devId)
    {
        (void)heap;
        (void)devId;
        if (sha == NULL) {
            return BAD_FUNC_ARG;
        }
        return InitSha256_fips(sha);
    }
    int wc_Sha256Update(wc_Sha256* sha, const byte* data, word32 len)
    {
        if (sha == NULL || (data == NULL && len > 0)) {
            return BAD_FUNC_ARG;
        }

        if (data == NULL && len == 0) {
            /* valid, but do nothing */
            return 0;
        }

        return Sha256Update_fips(sha, data, len);
    }
    int wc_Sha256Final(wc_Sha256* sha, byte* out)
    {
        if (sha == NULL || out == NULL) {
            return BAD_FUNC_ARG;
        }
        return Sha256Final_fips(sha, out);
    }
    void wc_Sha256Free(wc_Sha256* sha)
    {
        (void)sha;
        /* Not supported in FIPS */
    }
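
    /* Editor's usage sketch (illustrative only, not part of the original
     * file): a minimal one-shot hash through the public API above.
     * Error handling is shortened for brevity.
     *
     *     wc_Sha256 sha;
     *     byte digest[WC_SHA256_DIGEST_SIZE];
     *     const byte msg[] = { 'a', 'b', 'c' };
     *
     *     if (wc_InitSha256(&sha) == 0 &&
     *         wc_Sha256Update(&sha, msg, sizeof(msg)) == 0 &&
     *         wc_Sha256Final(&sha, digest) == 0) {
     *         ... digest now holds SHA-256("abc") ...
     *     }
     *     wc_Sha256Free(&sha);
     */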

#else /* else build without fips, or for FIPS v2 */


#if defined(WOLFSSL_TI_HASH)
    /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
#else

#include <wolfcrypt/logging.h>

#ifdef NO_INLINE
    #include <wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif


#if defined(USE_INTEL_SPEEDUP)
    #define HAVE_INTEL_AVX1

    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #define NO_AVX2_SUPPORT
    #endif
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        #undef NO_AVX2_SUPPORT
    #endif

    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif
#endif /* USE_INTEL_SPEEDUP */

#if defined(HAVE_INTEL_AVX2)
    #define HAVE_INTEL_RORX
#endif


#if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \
    (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH))
static int InitSha256(wc_Sha256* sha256)
{
    int ret = 0;

    if (sha256 == NULL)
        return BAD_FUNC_ARG;

    XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
    sha256->digest[0] = 0x6A09E667L;
    sha256->digest[1] = 0xBB67AE85L;
    sha256->digest[2] = 0x3C6EF372L;
    sha256->digest[3] = 0xA54FF53AL;
    sha256->digest[4] = 0x510E527FL;
    sha256->digest[5] = 0x9B05688CL;
    sha256->digest[6] = 0x1F83D9ABL;
    sha256->digest[7] = 0x5BE0CD19L;

    sha256->buffLen = 0;
    sha256->loLen   = 0;
    sha256->hiLen   = 0;

    return ret;
}
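
/* Editor's note: the eight initial digest words above are the first 32 bits
 * of the fractional parts of the square roots of the first eight primes
 * (2, 3, 5, 7, 11, 13, 17, 19), as specified in FIPS 180-4. For example,
 * frac(sqrt(2)) * 2^32 = 0x6A09E667. */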
#endif


/* Hardware Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

    /* in case intel instructions aren't available, plus we need the K[] global */
    #define NEED_SOFT_SHA256

    /*****
    Intel AVX1/AVX2 Macro Control Structure

    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2

    #define HAVE_INTEL_RORX


    int InitSha256(wc_Sha256* sha256) {
         Save/Recover XMM, YMM
         ...
    }

    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
      Transform_Sha256(): function prototype
    #else
      Transform_Sha256() {   }
      int Sha256Final() {
         Save/Recover XMM, YMM
         ...
      }
    #endif

    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        #if defined(HAVE_INTEL_RORX)
             #define RND with rorx instruction
        #else
            #define RND
        #endif
    #endif

    #if defined(HAVE_INTEL_AVX1)

       #define XMM Instructions/inline asm

       int Transform_Sha256() {
           Stitched Message Sched/Round
        }

    #elif defined(HAVE_INTEL_AVX2)

      #define YMM Instructions/inline asm

      int Transform_Sha256() {
          More granular Stitched Message Sched/Round
      }

    #endif

    */

    /* Each platform needs to query info type 1 from cpuid to see if AVX is
     * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
     */

    /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
    static int Transform_Sha256(wc_Sha256* sha256);
    #if defined(HAVE_INTEL_AVX1)
        static int Transform_Sha256_AVX1(wc_Sha256 *sha256);
        static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256, word32 len);
    #endif
    #if defined(HAVE_INTEL_AVX2)
        static int Transform_Sha256_AVX2(wc_Sha256 *sha256);
        static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256, word32 len);
        #ifdef HAVE_INTEL_RORX
        static int Transform_Sha256_AVX1_RORX(wc_Sha256 *sha256);
        static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256, word32 len);
        static int Transform_Sha256_AVX2_RORX(wc_Sha256 *sha256);
        static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256, word32 len);
        #endif
    #endif
    static int (*Transform_Sha256_p)(wc_Sha256* sha256);
                                                        /* = Transform_Sha256 */
    static int (*Transform_Sha256_Len_p)(wc_Sha256* sha256, word32 len);
                                                                    /* = NULL */
    static int transform_check = 0;
    static word32 intel_flags;
    #define XTRANSFORM(S)         (*Transform_Sha256_p)((S))
    #define XTRANSFORM_LEN(S, L)  (*Transform_Sha256_Len_p)((S),(L))

    static void Sha256_SetTransform(void)
    {

        if (transform_check)
            return;

        intel_flags = cpuid_get_flags();

    #ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_AVX2(intel_flags)) {
        #ifdef HAVE_INTEL_RORX
            if (IS_INTEL_BMI2(intel_flags)) {
                Transform_Sha256_p = Transform_Sha256_AVX2_RORX;
                Transform_Sha256_Len_p = Transform_Sha256_AVX2_RORX_Len;
            }
            else
        #endif
            if (1)
            {
                Transform_Sha256_p = Transform_Sha256_AVX2;
                Transform_Sha256_Len_p = Transform_Sha256_AVX2_Len;
            }
        #ifdef HAVE_INTEL_RORX
            else {
                Transform_Sha256_p = Transform_Sha256_AVX1_RORX;
                Transform_Sha256_Len_p = Transform_Sha256_AVX1_RORX_Len;
            }
        #endif
        }
        else
    #endif
    #ifdef HAVE_INTEL_AVX1
        if (IS_INTEL_AVX1(intel_flags)) {
            Transform_Sha256_p = Transform_Sha256_AVX1;
            Transform_Sha256_Len_p = Transform_Sha256_AVX1_Len;
        }
        else
    #endif
        {
            Transform_Sha256_p = Transform_Sha256;
            Transform_Sha256_Len_p = NULL;
        }

        transform_check = 1;
    }
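
    /* Editor's note: the function-pointer dispatch above is a one-time
     * runtime selection. The first wc_InitSha256_ex() call probes CPUID,
     * latches the best implementation, and every later XTRANSFORM() /
     * XTRANSFORM_LEN() is a plain indirect call. Shape of the pattern
     * (pick_best is a hypothetical placeholder, not a function in this file):
     *
     *     if (!transform_check) {
     *         Transform_Sha256_p = pick_best(cpuid_get_flags());
     *         transform_check = 1;
     *     }
     *     XTRANSFORM(sha256);    ... hot path: no feature probing ...
     */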

    int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
    {
        int ret = 0;
        if (sha256 == NULL)
            return BAD_FUNC_ARG;

        sha256->heap = heap;

        ret = InitSha256(sha256);
        if (ret != 0)
            return ret;

        /* choose best Transform function under this runtime environment */
        Sha256_SetTransform();

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
        ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
                            WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
    #else
        (void)devId;
    #endif /* WOLFSSL_ASYNC_CRYPT */

        return ret;
    }

#elif defined(FREESCALE_LTC_SHA)
    int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
    {
        (void)heap;
        (void)devId;

        LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);

        return 0;
    }

#elif defined(FREESCALE_MMCAU_SHA)

    #ifdef FREESCALE_MMCAU_CLASSIC_SHA
        #include "cau_api.h"
    #else
        #include "fsl_mmcau.h"
    #endif

    #define XTRANSFORM(S)        Transform_Sha256((S))
    #define XTRANSFORM_LEN(S,L)  Transform_Sha256_Len((S),(L))

    int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
    {
        int ret = 0;

        (void)heap;
        (void)devId;

        ret = wolfSSL_CryptHwMutexLock();
        if (ret != 0) {
            return ret;
        }
    #ifdef FREESCALE_MMCAU_CLASSIC_SHA
        cau_sha256_initialize_output(sha256->digest);
    #else
        MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
    #endif
        wolfSSL_CryptHwMutexUnLock();

        sha256->buffLen = 0;
        sha256->loLen   = 0;
        sha256->hiLen   = 0;

        return ret;
    }

    static int Transform_Sha256(wc_Sha256* sha256)
    {
        int ret = wolfSSL_CryptHwMutexLock();
        if (ret == 0) {
    #ifdef FREESCALE_MMCAU_CLASSIC_SHA
            cau_sha256_hash_n((byte*)sha256->buffer, 1, sha256->digest);
    #else
            MMCAU_SHA256_HashN((byte*)sha256->buffer, 1, sha256->digest);
    #endif
            wolfSSL_CryptHwMutexUnLock();
        }
        return ret;
    }

#elif defined(WOLFSSL_PIC32MZ_HASH)
    #include <wolfssl/wolfcrypt/port/pic32/pic32mz-crypt.h>

#elif defined(STM32_HASH_SHA2)

    /* Supports CubeMX HAL or Standard Peripheral Library */

    int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
    {
        if (sha256 == NULL)
            return BAD_FUNC_ARG;

        (void)devId;
        (void)heap;

        wc_Stm32_Hash_Init(&sha256->stmCtx);
        return 0;
    }

    int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
    {
        int ret = 0;

        if (sha256 == NULL || (data == NULL && len > 0)) {
            return BAD_FUNC_ARG;
        }

        ret = wolfSSL_CryptHwMutexLock();
        if (ret == 0) {
            ret = wc_Stm32_Hash_Update(&sha256->stmCtx,
                HASH_AlgoSelection_SHA256, data, len);
            wolfSSL_CryptHwMutexUnLock();
        }
        return ret;
    }

    int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
    {
        int ret = 0;

        if (sha256 == NULL || hash == NULL) {
            return BAD_FUNC_ARG;
        }

        ret = wolfSSL_CryptHwMutexLock();
        if (ret == 0) {
            ret = wc_Stm32_Hash_Final(&sha256->stmCtx,
                HASH_AlgoSelection_SHA256, hash, WC_SHA256_DIGEST_SIZE);
            wolfSSL_CryptHwMutexUnLock();
        }

        (void)wc_InitSha256(sha256); /* reset state */

        return ret;
    }

#elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
    /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */
#else
    #define NEED_SOFT_SHA256

    int wc_InitSha256_ex(wc_Sha256* sha256, void* heap, int devId)
    {
        int ret = 0;
        if (sha256 == NULL)
            return BAD_FUNC_ARG;

        sha256->heap = heap;

        ret = InitSha256(sha256);
        if (ret != 0)
            return ret;

    #ifdef WOLFSSL_SMALL_STACK_CACHE
        sha256->W = NULL;
    #endif

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
        ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
                            WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
    #else
        (void)devId;
    #endif /* WOLFSSL_ASYNC_CRYPT */

        return ret;
    }
#endif /* End Hardware Acceleration */

#ifdef NEED_SOFT_SHA256

    static const ALIGN32 word32 K[64] = {
        0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
        0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
        0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
        0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
        0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
        0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
        0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
        0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
        0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
        0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
        0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
        0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
        0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
    };
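
    /* Editor's note: K[0..63] are the first 32 bits of the fractional parts
     * of the cube roots of the first 64 primes, per FIPS 180-4.
     * For example, frac(cbrt(2)) * 2^32 = 0x428A2F98. */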

    #define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
    #define Maj(x,y,z)      ((((x) | (y)) & (z)) | ((x) & (y)))
    #define R(x, n)         (((x) & 0xFFFFFFFFU) >> (n))

    #define S(x, n)         rotrFixed(x, n)
    #define Sigma0(x)       (S(x, 2)  ^ S(x, 13) ^ S(x, 22))
    #define Sigma1(x)       (S(x, 6)  ^ S(x, 11) ^ S(x, 25))
    #define Gamma0(x)       (S(x, 7)  ^ S(x, 18) ^ R(x, 3))
    #define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))

    #define a(i) S[(0-i) & 7]
    #define b(i) S[(1-i) & 7]
    #define c(i) S[(2-i) & 7]
    #define d(i) S[(3-i) & 7]
    #define e(i) S[(4-i) & 7]
    #define f(i) S[(5-i) & 7]
    #define g(i) S[(6-i) & 7]
    #define h(i) S[(7-i) & 7]

    #define RND(j) \
         t0 = h(j) + Sigma1(e(j)) + Ch(e(j), f(j), g(j)) + K[i+j] + W[i+j]; \
         t1 = Sigma0(a(j)) + Maj(a(j), b(j), c(j)); \
         d(j) += t0; \
         h(j)  = t0 + t1
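
    /* Editor's note: the a(i)..h(i) macros index the working state S[] so
     * that each round's variables rotate by renaming rather than copying.
     * For example, a(0) is S[0] but a(1) is S[(0-1) & 7] = S[7]: after
     * RND(0) writes the new chaining value into h(0) = S[7], that same
     * slot is already the next round's a, so no shuffling is needed. */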

    #ifndef XTRANSFORM
         #define XTRANSFORM(S)        Transform_Sha256((S))
         #define XTRANSFORM_LEN(S,L)  Transform_Sha256_Len((S),(L))
    #endif

    static int Transform_Sha256(wc_Sha256* sha256)
    {
        word32 S[8], t0, t1;
        int i;

    #ifdef WOLFSSL_SMALL_STACK_CACHE
        word32* W = sha256->W;
        if (W == NULL) {
            W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
                                                              DYNAMIC_TYPE_RNG);
            if (W == NULL)
                return MEMORY_E;
            sha256->W = W;
        }
    #elif defined(WOLFSSL_SMALL_STACK)
        word32* W;
        W = (word32*)XMALLOC(sizeof(word32) * WC_SHA256_BLOCK_SIZE, NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
        if (W == NULL)
            return MEMORY_E;
    #else
        word32 W[WC_SHA256_BLOCK_SIZE];
    #endif

        /* Copy digest state to working vars */
        for (i = 0; i < 8; i++)
            S[i] = sha256->digest[i];

        for (i = 0; i < 16; i++)
            W[i] = sha256->buffer[i];

        for (i = 16; i < WC_SHA256_BLOCK_SIZE; i++)
            W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];

    #ifdef USE_SLOW_SHA256
        /* not unrolled - ~2k smaller and ~25% slower */
        for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
            int j;
            for (j = 0; j < 8; j++) { /* braces needed: RND is a multi-statement macro */
                RND(j);
            }
        }
    #else
        /* partially loop unrolled */
        for (i = 0; i < WC_SHA256_BLOCK_SIZE; i += 8) {
            RND(0); RND(1); RND(2); RND(3);
            RND(4); RND(5); RND(6); RND(7);
        }
    #endif /* USE_SLOW_SHA256 */

        /* Add the working vars back into digest state[] */
        for (i = 0; i < 8; i++) {
            sha256->digest[i] += S[i];
        }

    #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
        XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    #endif
        return 0;
    }
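
    /* Editor's note (illustrative): with the message schedule W[] expanded,
     * each of the 64 rounds above computes
     *     t0 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t];  d += t0;
     *     t1 = Sigma0(a) + Maj(a,b,c);                   h  = t0 + t1;
     * A quick sanity check of the whole transform: hashing the 3-byte
     * message "abc" must produce the FIPS 180-4 test vector
     * ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad. */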
#endif
/* End software implementation */


#ifdef XTRANSFORM

    static WC_INLINE void AddLength(wc_Sha256* sha256, word32 len)
    {
        word32 tmp = sha256->loLen;
        if ((sha256->loLen += len) < tmp)
            sha256->hiLen++;                       /* carry low to high */
    }
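
    /* Editor's note: loLen/hiLen form a 64-bit byte counter. The wrap test
     * works because unsigned overflow is modular: e.g. with
     * loLen = 0xFFFFFFF0 and len = 0x20, loLen becomes 0x10, which is less
     * than the saved tmp, so hiLen is incremented to record the carry. */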

    static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
    {
        int ret = 0;
        byte* local;

        if (sha256 == NULL || (data == NULL && len > 0)) {
            return BAD_FUNC_ARG;
        }

        if (data == NULL && len == 0) {
            /* valid, but do nothing */
            return 0;
        }

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
        if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
        #if defined(HAVE_INTEL_QA)
            return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
        #endif
        }
    #endif /* WOLFSSL_ASYNC_CRYPT */

        /* do block size increments */
        local = (byte*)sha256->buffer;

        /* check that internal buffLen is valid */
        if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE)
            return BUFFER_E;

        if (sha256->buffLen > 0) {
            word32 add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
            XMEMCPY(&local[sha256->buffLen], data, add);

            sha256->buffLen += add;
            data            += add;
            len             -= add;

            if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
        #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
            #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
                if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
            #endif
                {
                    ByteReverseWords(sha256->buffer, sha256->buffer,
                                                          WC_SHA256_BLOCK_SIZE);
                }
        #endif
                ret = XTRANSFORM(sha256);
                if (ret == 0) {
                    AddLength(sha256, WC_SHA256_BLOCK_SIZE);
                    sha256->buffLen = 0;
                }
                else
                    len = 0;
            }
        }

    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (Transform_Sha256_Len_p != NULL) {
            word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);

            if (blocksLen > 0) {
                AddLength(sha256, blocksLen);
                sha256->data = data;
                /* Byte reversal performed in function if required. */
                XTRANSFORM_LEN(sha256, blocksLen);
                data += blocksLen;
                len  -= blocksLen;
            }
        }
        else
    #endif
    #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \
                            defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        {
            word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);

            AddLength(sha256, blocksLen);
            while (len >= WC_SHA256_BLOCK_SIZE) {
                XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE);

                data += WC_SHA256_BLOCK_SIZE;
                len  -= WC_SHA256_BLOCK_SIZE;

                /* Byte reversal performed in function if required. */
                ret = XTRANSFORM(sha256);
                if (ret != 0)
                    break;
            }
        }
    #else
        {
            word32 blocksLen = len & ~(WC_SHA256_BLOCK_SIZE-1);

            AddLength(sha256, blocksLen);
            while (len >= WC_SHA256_BLOCK_SIZE) {
                XMEMCPY(local, data, WC_SHA256_BLOCK_SIZE);

                data += WC_SHA256_BLOCK_SIZE;
                len  -= WC_SHA256_BLOCK_SIZE;

                ByteReverseWords(sha256->buffer, sha256->buffer,
                                                          WC_SHA256_BLOCK_SIZE);
                ret = XTRANSFORM(sha256);
                if (ret != 0)
                    break;
            }
        }
    #endif

        if (len > 0) {
            XMEMCPY(local, data, len);
            sha256->buffLen = len;
        }

        return ret;
    }

    int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
    {
        return Sha256Update(sha256, data, len);
    }
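
    /* Editor's note (illustrative): Sha256Update buffers partial blocks, so
     * update sizes need not be multiples of 64 bytes. For example, updating
     * an empty context with 100 bytes transforms one 64-byte block and
     * leaves buffLen == 36; a later 28-byte update completes that block and
     * triggers the next transform. */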

    static WC_INLINE int Sha256Final(wc_Sha256* sha256)
    {
        int ret;
        byte* local;

        if (sha256 == NULL) {
            return BAD_FUNC_ARG;
        }
        local = (byte*)sha256->buffer;

        AddLength(sha256, sha256->buffLen);  /* before adding pads */
        local[sha256->buffLen++] = 0x80;     /* append the '1' padding bit */

        /* pad with zeros */
        if (sha256->buffLen > WC_SHA256_PAD_SIZE) {
            XMEMSET(&local[sha256->buffLen], 0,
                WC_SHA256_BLOCK_SIZE - sha256->buffLen);
            sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;

            {
        #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
            #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
                if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
            #endif
                {
                    ByteReverseWords(sha256->buffer, sha256->buffer,
                                                          WC_SHA256_BLOCK_SIZE);
                }
        #endif
            }

            ret = XTRANSFORM(sha256);
            if (ret != 0)
                return ret;

            sha256->buffLen = 0;
        }
        XMEMSET(&local[sha256->buffLen], 0, WC_SHA256_PAD_SIZE - sha256->buffLen);

        /* put lengths in bits */
        sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
                                                         (sha256->hiLen << 3);
        sha256->loLen = sha256->loLen << 3;

        /* store lengths */
    #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
        #endif
            {
                ByteReverseWords(sha256->buffer, sha256->buffer,
                    WC_SHA256_BLOCK_SIZE);
            }
    #endif
        /* ! length ordering dependent on digest endian type ! */
        XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
        XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
                sizeof(word32));

    #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
        defined(HAVE_INTEL_AVX2)
        /* Kinetis requires only these bytes reversed */
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
        #endif
            {
                ByteReverseWords(
                    &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
                    &sha256->buffer[WC_SHA256_PAD_SIZE / sizeof(word32)],
                    2 * sizeof(word32));
            }
    #endif

        return XTRANSFORM(sha256);
    }
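
    /* Editor's note: WC_SHA256_PAD_SIZE is 56, so the padded final block is
     * laid out as FIPS 180-4 requires:
     *
     *     [ message tail | 0x80 | zeros ... | 8-byte bit length ]
     *       bytes 0..55                       bytes 56..63
     *
     * If the 0x80 marker lands past byte 55 (buffLen > 56), an extra
     * all-padding block is transformed first, as the branch above does. */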

    int wc_Sha256FinalRaw(wc_Sha256* sha256, byte* hash)
    {
    #ifdef LITTLE_ENDIAN_ORDER
        word32 digest[WC_SHA256_DIGEST_SIZE / sizeof(word32)];
    #endif

        if (sha256 == NULL || hash == NULL) {
            return BAD_FUNC_ARG;
        }

    #ifdef LITTLE_ENDIAN_ORDER
        ByteReverseWords((word32*)digest, (word32*)sha256->digest,
                                                         WC_SHA256_DIGEST_SIZE);
        XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE);
    #else
        XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
    #endif

        return 0;
    }

    int wc_Sha256Final(wc_Sha256* sha256, byte* hash)
    {
        int ret;

        if (sha256 == NULL || hash == NULL) {
            return BAD_FUNC_ARG;
        }

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
        if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
        #if defined(HAVE_INTEL_QA)
            return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
                                            WC_SHA256_DIGEST_SIZE);
        #endif
        }
    #endif /* WOLFSSL_ASYNC_CRYPT */

        ret = Sha256Final(sha256);
        if (ret != 0)
            return ret;

    #if defined(LITTLE_ENDIAN_ORDER)
        ByteReverseWords(sha256->digest, sha256->digest, WC_SHA256_DIGEST_SIZE);
    #endif
        XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);

        return InitSha256(sha256);  /* reset state */
    }
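
    /* Editor's note: wc_Sha256Final() re-initializes the state on success,
     * so one context can hash several messages back to back. Illustrative
     * sketch (msg1/msg2 are placeholders, not names from this file; error
     * handling omitted):
     *
     *     wc_Sha256 sha;
     *     byte d1[WC_SHA256_DIGEST_SIZE], d2[WC_SHA256_DIGEST_SIZE];
     *     wc_InitSha256(&sha);
     *     wc_Sha256Update(&sha, msg1, msg1Len);
     *     wc_Sha256Final(&sha, d1);            ... state reset here ...
     *     wc_Sha256Update(&sha, msg2, msg2Len);
     *     wc_Sha256Final(&sha, d2);
     */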

#endif /* XTRANSFORM */


#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

#define _LOAD_DIGEST()                     \
    "movl     (%[sha256]), %%r8d \n\t" \
    "movl    4(%[sha256]), %%r9d \n\t" \
    "movl    8(%[sha256]), %%r10d\n\t" \
    "movl   12(%[sha256]), %%r11d\n\t" \
    "movl   16(%[sha256]), %%r12d\n\t" \
    "movl   20(%[sha256]), %%r13d\n\t" \
    "movl   24(%[sha256]), %%r14d\n\t" \
    "movl   28(%[sha256]), %%r15d\n\t"

#define _STORE_ADD_DIGEST()                \
    "addl   %%r8d ,   (%[sha256])\n\t" \
    "addl   %%r9d ,  4(%[sha256])\n\t" \
    "addl   %%r10d,  8(%[sha256])\n\t" \
    "addl   %%r11d, 12(%[sha256])\n\t" \
    "addl   %%r12d, 16(%[sha256])\n\t" \
    "addl   %%r13d, 20(%[sha256])\n\t" \
    "addl   %%r14d, 24(%[sha256])\n\t" \
    "addl   %%r15d, 28(%[sha256])\n\t"

#define _ADD_DIGEST()                      \
    "addl     (%[sha256]), %%r8d \n\t" \
    "addl    4(%[sha256]), %%r9d \n\t" \
    "addl    8(%[sha256]), %%r10d\n\t" \
    "addl   12(%[sha256]), %%r11d\n\t" \
    "addl   16(%[sha256]), %%r12d\n\t" \
    "addl   20(%[sha256]), %%r13d\n\t" \
    "addl   24(%[sha256]), %%r14d\n\t" \
    "addl   28(%[sha256]), %%r15d\n\t"

#define _STORE_DIGEST()                    \
    "movl   %%r8d ,   (%[sha256])\n\t" \
    "movl   %%r9d ,  4(%[sha256])\n\t" \
    "movl   %%r10d,  8(%[sha256])\n\t" \
    "movl   %%r11d, 12(%[sha256])\n\t" \
    "movl   %%r12d, 16(%[sha256])\n\t" \
    "movl   %%r13d, 20(%[sha256])\n\t" \
    "movl   %%r14d, 24(%[sha256])\n\t" \
    "movl   %%r15d, 28(%[sha256])\n\t"

#define LOAD_DIGEST() \
    _LOAD_DIGEST()

#define STORE_ADD_DIGEST() \
    _STORE_ADD_DIGEST()

#define ADD_DIGEST() \
    _ADD_DIGEST()

#define STORE_DIGEST() \
    _STORE_DIGEST()


#define S_0 %r8d
#define S_1 %r9d
#define S_2 %r10d
#define S_3 %r11d
#define S_4 %r12d
#define S_5 %r13d
#define S_6 %r14d
#define S_7 %r15d

#define L1  "%%edx"
#define L2  "%%ecx"
#define L3  "%%eax"
#define L4  "%%ebx"
#define WK  "%%rsp"

#define WORK_REGS  "eax", "ebx", "ecx", "edx"
#define STATE_REGS "r8","r9","r10","r11","r12","r13","r14","r15"
#define XMM_REGS   "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",    \
                   "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13"
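
/* Editor's note: WORK_REGS, STATE_REGS and XMM_REGS are intended for the
 * clobber lists of the inline asm statements that use these round macros,
 * telling the compiler which registers the hand-written SHA-256 rounds
 * overwrite so it will not keep live values in them across the asm block. */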

#if defined(HAVE_INTEL_RORX)
#define RND_STEP_RORX_0_1(a, b, c, d, e, f, g, h, i) \
    /* L3 = f */                                     \
    "movl   %" #f ", " L3 "\n\t"                 \
    /* L2 = e>>>11 */                                \
    "rorx   $11, %" #e ", " L2 "\n\t"            \
    /* h += w_k */                                   \
    "addl   (" #i ")*4(" WK "), %" #h "\n\t"     \

#define RND_STEP_RORX_0_2(a, b, c, d, e, f, g, h, i) \
    /* L2 = (e>>>6) ^ (e>>>11) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L3 = f ^ g */                                 \
    "xorl   %" #g ", " L3 "\n\t"                 \
    /* L1 = e>>>25 */                                \
    "rorx   $25, %" #e ", " L1 "\n\t"            \

#define RND_STEP_RORX_0_3(a, b, c, d, e, f, g, h, i) \
    /* L3 = (f ^ g) & e */                           \
    "andl   %" #e ", " L3 "\n\t"                 \
    /* L1 = Sigma1(e) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L2 = a>>>13 */                                \
    "rorx   $13, %" #a ", " L2 "\n\t"            \

#define RND_STEP_RORX_0_4(a, b, c, d, e, f, g, h, i) \
    /* h += Sigma1(e) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L1 = a>>>2 */                                 \
    "rorx   $2, %" #a ", " L1 "\n\t"             \
    /* L3 = Ch(e,f,g) */                             \
    "xorl   %" #g ", " L3 "\n\t"                 \

#define RND_STEP_RORX_0_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = (a>>>2) ^ (a>>>13) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L1 = a>>>22 */                                \
    "rorx   $22, %" #a ", " L1 "\n\t"            \
    /* h += Ch(e,f,g) */                             \
    "addl   " L3 ", %" #h "\n\t"                 \

#define RND_STEP_RORX_0_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = Sigma0(a) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L3 = b */                                     \
    "movl   %" #b ", " L3 "\n\t"                 \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */       \
    "addl   %" #h ", %" #d "\n\t"                \

#define RND_STEP_RORX_0_7(a, b, c, d, e, f, g, h, i) \
    /* L3 = a ^ b */                                 \
    "xorl   %" #a ", " L3 "\n\t"                 \
    /* h += Sigma0(a) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L4 = (a ^ b) & (b ^ c) */                     \
    "andl   " L3 ", " L4 "\n\t"                  \

#define RND_STEP_RORX_0_8(a, b, c, d, e, f, g, h, i) \
    /* L4 = Maj(a,b,c) */                            \
    "xorl   %" #b ", " L4 "\n\t"                 \
    /* L1 = d>>>6 (= e>>>6 next RND) */              \
    "rorx   $6, %" #d ", " L1 "\n\t"             \
    /* h += Maj(a,b,c) */                            \
    "addl   " L4 ", %" #h "\n\t"                 \

#define RND_STEP_RORX_1_1(a, b, c, d, e, f, g, h, i) \
    /* L4 = f */                                     \
    "movl   %" #f ", " L4 "\n\t"                 \
    /* L2 = e>>>11 */                                \
    "rorx   $11, %" #e ", " L2 "\n\t"            \
    /* h += w_k */                                   \
    "addl   (" #i ")*4(" WK "), %" #h "\n\t"     \

#define RND_STEP_RORX_1_2(a, b, c, d, e, f, g, h, i) \
    /* L2 = (e>>>6) ^ (e>>>11) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L4 = f ^ g */                                 \
    "xorl   %" #g ", " L4 "\n\t"                 \
    /* L1 = e>>>25 */                                \
    "rorx   $25, %" #e ", " L1 "\n\t"            \

#define RND_STEP_RORX_1_3(a, b, c, d, e, f, g, h, i) \
    /* L4 = (f ^ g) & e */                           \
    "andl   %" #e ", " L4 "\n\t"                 \
    /* L1 = Sigma1(e) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L2 = a>>>13 */                                \
    "rorx   $13, %" #a ", " L2 "\n\t"            \

#define RND_STEP_RORX_1_4(a, b, c, d, e, f, g, h, i) \
    /* h += Sigma1(e) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L1 = a>>>2 */                                 \
    "rorx   $2, %" #a ", " L1 "\n\t"             \
    /* L4 = Ch(e,f,g) */                             \
    "xorl   %" #g ", " L4 "\n\t"                 \

#define RND_STEP_RORX_1_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = (a>>>2) ^ (a>>>13) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L1 = a>>>22 */                                \
    "rorx   $22, %" #a ", " L1 "\n\t"            \
    /* h += Ch(e,f,g) */                             \
    "addl   " L4 ", %" #h "\n\t"                 \

#define RND_STEP_RORX_1_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = Sigma0(a) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L4 = b */                                     \
    "movl   %" #b ", " L4 "\n\t"                 \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */       \
    "addl   %" #h ", %" #d "\n\t"                \

#define RND_STEP_RORX_1_7(a, b, c, d, e, f, g, h, i) \
    /* L4 = a ^ b */                                 \
    "xorl   %" #a ", " L4 "\n\t"                 \
    /* h += Sigma0(a) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L3 = (a ^ b) & (b ^ c) */                     \
    "andl   " L4 ", " L3 "\n\t"                  \

#define RND_STEP_RORX_1_8(a, b, c, d, e, f, g, h, i) \
    /* L3 = Maj(a,b,c) */                            \
    "xorl   %" #b ", " L3 "\n\t"                 \
    /* L1 = d>>>6 (= e>>>6 next RND) */              \
    "rorx   $6, %" #d ", " L1 "\n\t"             \
    /* h += Maj(a,b,c) */                            \
    "addl   " L3 ", %" #h "\n\t"                 \

#define _RND_RORX_X_0(a, b, c, d, e, f, g, h, i)     \
    /* L1 = e>>>6 */                                 \
    "rorx   $6, %" #e ", " L1 "\n\t"             \
    /* L2 = e>>>11 */                                \
    "rorx   $11, %" #e ", " L2 "\n\t"            \
    /* Prev RND: h += Maj(a,b,c) */                  \
    "addl   " L3 ", %" #a "\n\t"                 \
    /* h += w_k */                                   \
    "addl   (" #i ")*4(" WK "), %" #h "\n\t"     \
    /* L3 = f */                                     \
    "movl   %" #f ", " L3 "\n\t"                 \
    /* L2 = (e>>>6) ^ (e>>>11) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L3 = f ^ g */                                 \
    "xorl   %" #g ", " L3 "\n\t"                 \
    /* L1 = e>>>25 */                                \
    "rorx   $25, %" #e ", " L1 "\n\t"            \
    /* L1 = Sigma1(e) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L3 = (f ^ g) & e */                           \
    "andl   %" #e ", " L3 "\n\t"                 \
    /* h += Sigma1(e) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L1 = a>>>2 */                                 \
    "rorx   $2, %" #a ", " L1 "\n\t"             \
    /* L2 = a>>>13 */                                \
    "rorx   $13, %" #a ", " L2 "\n\t"            \
    /* L3 = Ch(e,f,g) */                             \
    "xorl   %" #g ", " L3 "\n\t"                 \
    /* L2 = (a>>>2) ^ (a>>>13) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L1 = a>>>22 */                                \
    "rorx   $22, %" #a ", " L1 "\n\t"            \
    /* h += Ch(e,f,g) */                             \
    "addl   " L3 ", %" #h "\n\t"                 \
    /* L1 = Sigma0(a) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L3 = b */                                     \
    "movl   %" #b ", " L3 "\n\t"                 \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */       \
    "addl   %" #h ", %" #d "\n\t"                \
    /* L3 = a ^ b */                                 \
    "xorl   %" #a ", " L3 "\n\t"                 \
    /* L4 = (a ^ b) & (b ^ c) */                     \
    "andl   " L3 ", " L4 "\n\t"                  \
    /* h += Sigma0(a) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L4 = Maj(a,b,c) */                            \
    "xorl   %" #b ", " L4 "\n\t"                 \

#define _RND_RORX_X_1(a, b, c, d, e, f, g, h, i)     \
    /* L1 = e>>>6 */                                 \
    "rorx   $6, %" #e ", " L1 "\n\t"             \
    /* L2 = e>>>11 */                                \
    "rorx   $11, %" #e ", " L2 "\n\t"            \
    /* Prev RND: h += Maj(a,b,c) */                  \
    "addl   " L4 ", %" #a "\n\t"                 \
    /* h += w_k */                                   \
    "addl   (" #i ")*4(" WK "), %" #h "\n\t"     \
    /* L4 = f */                                     \
    "movl   %" #f ", " L4 "\n\t"                 \
    /* L2 = (e>>>6) ^ (e>>>11) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L4 = f ^ g */                                 \
    "xorl   %" #g ", " L4 "\n\t"                 \
    /* L1 = e>>>25 */                                \
    "rorx   $25, %" #e ", " L1 "\n\t"            \
    /* L1 = Sigma1(e) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L4 = (f ^ g) & e */                           \
    "andl   %" #e ", " L4 "\n\t"                 \
    /* h += Sigma1(e) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L1 = a>>>2 */                                 \
    "rorx   $2, %" #a ", " L1 "\n\t"             \
    /* L2 = a>>>13 */                                \
    "rorx   $13, %" #a ", " L2 "\n\t"            \
    /* L4 = Ch(e,f,g) */                             \
    "xorl   %" #g ", " L4 "\n\t"                 \
    /* L2 = (a>>>2) ^ (a>>>13) */                    \
    "xorl   " L1 ", " L2 "\n\t"                  \
    /* L1 = a>>>22 */                                \
    "rorx   $22, %" #a ", " L1 "\n\t"            \
    /* h += Ch(e,f,g) */                             \
    "addl   " L4 ", %" #h "\n\t"                 \
    /* L1 = Sigma0(a) */                             \
    "xorl   " L2 ", " L1 "\n\t"                  \
    /* L4 = b */                                     \
    "movl   %" #b ", " L4 "\n\t"                 \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */       \
    "addl   %" #h ", %" #d "\n\t"                \
    /* L4 = a ^ b */                                 \
    "xorl   %" #a ", " L4 "\n\t"                 \
    /* L3 = (a ^ b) & (b ^ c) */                     \
    "andl   " L4 ", " L3 "\n\t"                  \
    /* h += Sigma0(a) */                             \
    "addl   " L1 ", %" #h "\n\t"                 \
    /* L3 = Maj(a,b,c) */                            \
    "xorl   %" #b ", " L3 "\n\t"                 \


#define RND_RORX_X_0(a,b,c,d,e,f,g,h,i) \
       _RND_RORX_X_0(a,b,c,d,e,f,g,h,i)
#define RND_RORX_X_1(a,b,c,d,e,f,g,h,i) \
       _RND_RORX_X_1(a,b,c,d,e,f,g,h,i)

#define RND_RORX_X4(a,b,c,d,e,f,g,h,i)    \
        RND_RORX_X_0(a,b,c,d,e,f,g,h,i+0) \
        RND_RORX_X_1(h,a,b,c,d,e,f,g,i+1) \
        RND_RORX_X_0(g,h,a,b,c,d,e,f,i+2) \
        RND_RORX_X_1(f,g,h,a,b,c,d,e,i+3)

#endif /* HAVE_INTEL_RORX */
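
/* Editor's note: the RORX variants rely on the BMI2 rorx instruction, which
 * rotates into a separate destination register without touching the flags,
 * so the rotate/xor/add chains above can be interleaved more freely than
 * with plain rorl. RND_RORX_X4 expands four rounds, rotating the roles of
 * the eight state registers by shifting the macro arguments one position
 * per round. */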
01162 
01163 #define RND_STEP_0_1(a,b,c,d,e,f,g,h,i)                               \
01164     /* L1 = e>>>14 */                                                 \
01165     "rorl   $14, " L1 "\n\t"                                      \
01166 
01167 #define RND_STEP_0_2(a,b,c,d,e,f,g,h,i)                               \
01168     /* L3 = b */                                                      \
01169     "movl   %" #b ", " L3 "\n\t"                                  \
01170     /* L2 = f */                                                      \
01171     "movl   %" #f ", " L2 "\n\t"                                  \
01172     /* h += w_k */                                                    \
01173     "addl   (" #i ")*4(" WK "), %" #h "\n\t"                      \
01174     /* L2 = f ^ g */                                                  \
01175     "xorl   %" #g ", " L2 "\n\t"                                  \
01176 
01177 #define RND_STEP_0_3(a,b,c,d,e,f,g,h,i)                               \
01178     /* L1 = (e>>>14) ^ e */                                           \
01179     "xorl   %" #e ", " L1 "\n\t"                                  \
01180     /* L2 = (f ^ g) & e */                                            \
01181     "andl   %" #e ", " L2 "\n\t"                                  \
01182  
01183 #define RND_STEP_0_4(a,b,c,d,e,f,g,h,i)                               \
01184     /* L1 = ((e>>>14) ^ e) >>> 5 */                                   \
01185     "rorl   $5, " L1 "\n\t"                                       \
01186     /* L2 = Ch(e,f,g) */                                              \
01187     "xorl   %" #g ", " L2 "\n\t"                                  \
01188     /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */                             \
01189     "xorl   %" #e ", " L1 "\n\t"                                  \
01190     /* h += Ch(e,f,g) */                                              \
01191     "addl   " L2 ", %" #h "\n\t"                                  \
01192 
01193 #define RND_STEP_0_5(a,b,c,d,e,f,g,h,i)                               \
01194     /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */                     \
01195     "rorl   $6, " L1 "\n\t"                                       \
01196     /* L3 = a ^ b (= b ^ c of next RND) */                            \
01197     "xorl   %" #a ", " L3 "\n\t"                                  \
01198     /* h = h + w_k + Sigma1(e) */                                     \
01199     "addl   " L1 ", %" #h "\n\t"                                  \
01200     /* L2 = a */                                                      \
01201     "movl   %" #a ", " L2 "\n\t"                                  \
01202 
01203 #define RND_STEP_0_6(a,b,c,d,e,f,g,h,i)                               \
01204     /* L3 = (a ^ b) & (b ^ c) */                                      \
01205     "andl   " L3 ", " L4 "\n\t"                                   \
01206     /* L2 = a>>>9 */                                                  \
01207     "rorl   $9, " L2 "\n\t"                                       \
01208     /* L2 = (a>>>9) ^ a */                                            \
01209     "xorl   %" #a ", " L2 "\n\t"                                  \
01210     /* L1 = Maj(a,b,c) */                                             \
01211     "xorl   %" #b ", " L4 "\n\t"                                  \
01212 
01213 #define RND_STEP_0_7(a,b,c,d,e,f,g,h,i)                               \
01214     /* L2 = ((a>>>9) ^ a) >>> 11 */                                   \
01215     "rorl   $11, " L2 "\n\t"                                      \
01216     /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */                        \
01217     "addl   %" #h ", %" #d "\n\t"                                 \
01218     /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */                             \
01219     "xorl   %" #a ", " L2 "\n\t"                                  \
01220     /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */            \
01221     "addl   " L4 ", %" #h "\n\t"                                  \
01222 
01223 #define RND_STEP_0_8(a,b,c,d,e,f,g,h,i)                               \
01224     /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */                     \
01225     "rorl   $2, " L2 "\n\t"                                       \
01226     /* L1 = d (e of next RND) */                                      \
01227     "movl   %" #d ", " L1 "\n\t"                                  \
01228     /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */  \
01229     "addl   " L2 ", %" #h "\n\t"                                  \
01230 
01231 #define RND_STEP_1_1(a,b,c,d,e,f,g,h,i)                               \
01232     /* L1 = e>>>14 */                                                 \
01233     "rorl   $14, " L1 "\n\t"                                      \
01234  
01235 #define RND_STEP_1_2(a,b,c,d,e,f,g,h,i)                               \
01236     /* L3 = b */                                                      \
01237     "movl   %" #b ", " L4 "\n\t"                                  \
01238     /* L2 = f */                                                      \
01239     "movl   %" #f ", " L2 "\n\t"                                  \
01240     /* h += w_k */                                                    \
01241     "addl   (" #i ")*4(" WK "), %" #h "\n\t"                      \
01242     /* L2 = f ^ g */                                                  \
01243     "xorl   %" #g ", " L2 "\n\t"                                  \
01244  
01245 #define RND_STEP_1_3(a,b,c,d,e,f,g,h,i)                               \
01246     /* L1 = (e>>>14) ^ e */                                           \
01247     "xorl   %" #e ", " L1 "\n\t"                                  \
01248     /* L2 = (f ^ g) & e */                                            \
01249     "andl   %" #e ", " L2 "\n\t"                                  \
01250  
01251 #define RND_STEP_1_4(a,b,c,d,e,f,g,h,i)                               \
01252     /* L1 = ((e>>>14) ^ e) >>> 5 */                                   \
01253     "rorl   $5, " L1 "\n\t"                                       \
01254     /* L2 = Ch(e,f,g) */                                              \
01255     "xorl   %" #g ", " L2 "\n\t"                                  \
01256     /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */                             \
01257     "xorl   %" #e ", " L1 "\n\t"                                  \
01258     /* h += Ch(e,f,g) */                                              \
01259     "addl   " L2 ", %" #h "\n\t"                                  \
01260 
01261 #define RND_STEP_1_5(a,b,c,d,e,f,g,h,i)                               \
01262     /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */                     \
01263     "rorl   $6, " L1 "\n\t"                                       \
01264     /* L4 = a ^ b (= b ^ c of next RND) */                            \
01265     "xorl   %" #a ", " L4 "\n\t"                                  \
01266     /* h = h + w_k + Sigma1(e) */                                     \
01267     "addl   " L1 ", %" #h "\n\t"                                  \
01268     /* L2 = a */                                                      \
01269     "movl   %" #a ", " L2 "\n\t"                                  \
01270 
01271 #define RND_STEP_1_6(a,b,c,d,e,f,g,h,i)                               \
01272     /* L3 = (a ^ b) & (b ^ c)  */                                     \
01273     "andl   " L4 ", " L3 "\n\t"                                   \
01274     /* L2 = a>>>9 */                                                  \
01275     "rorl   $9, " L2 "\n\t"                                       \
01276     /* L2 = (a>>>9) ^ a */                                            \
01277     "xorl   %" #a ", " L2 "\n\t"                                  \
01278     /* L1 = Maj(a,b,c) */                                             \
01279     "xorl   %" #b ", " L3 "\n\t"                                  \
01280 
01281 #define RND_STEP_1_7(a,b,c,d,e,f,g,h,i)                               \
01282     /* L2 = ((a>>>9) ^ a) >>> 11 */                                   \
01283     "rorl   $11, " L2 "\n\t"                                      \
01284     /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */                        \
01285     "addl   %" #h ", %" #d "\n\t"                                 \
01286     /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */                             \
01287     "xorl   %" #a ", " L2 "\n\t"                                  \
01288     /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */            \
01289     "addl   " L3 ", %" #h "\n\t"                                  \
01290 
01291 #define RND_STEP_1_8(a,b,c,d,e,f,g,h,i)                               \
01292     /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */                     \
01293     "rorl   $2, " L2 "\n\t"                                       \
01294     /* L1 = d (e of next RND) */                                      \
01295     "movl   %" #d ", " L1 "\n\t"                                  \
01296     /* h = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */  \
01297     "addl   " L2 ", %" #h "\n\t"                                  \
01298 
01299 #define _RND_ALL_0(a,b,c,d,e,f,g,h,i)                                 \
01300     /* h += w_k */                                                    \
01301     "addl   (" #i ")*4(" WK "), %" #h "\n\t"                      \
01302     /* L2 = f */                                                      \
01303     "movl   %" #f ", " L2 "\n\t"                                  \
01304     /* L3 = b */                                                      \
01305     "movl   %" #b ", " L3 "\n\t"                                  \
01306     /* L2 = f ^ g */                                                  \
01307     "xorl   %" #g ", " L2 "\n\t"                                  \
01308     /* L1 = e>>>14 */                                                 \
01309     "rorl   $14, " L1 "\n\t"                                      \
01310     /* L2 = (f ^ g) & e */                                            \
01311     "andl   %" #e ", " L2 "\n\t"                                  \
01312     /* L1 = (e>>>14) ^ e */                                           \
01313     "xorl   %" #e ", " L1 "\n\t"                                  \
01314     /* L2 = Ch(e,f,g) */                                              \
01315     "xorl   %" #g ", " L2 "\n\t"                                  \
01316     /* L1 = ((e>>>14) ^ e) >>> 5 */                                   \
01317     "rorl   $5, " L1 "\n\t"                                       \
01318     /* h += Ch(e,f,g) */                                              \
01319     "addl   " L2 ", %" #h "\n\t"                                  \
01320     /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */                             \
01321     "xorl   %" #e ", " L1 "\n\t"                                  \
01322     /* L3 = a ^ b */                                                  \
01323     "xorl   %" #a ", " L3 "\n\t"                                  \
01324     /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */                     \
01325     "rorl   $6, " L1 "\n\t"                                       \
01326     /* L2 = a */                                                      \
01327     "movl   %" #a ", " L2 "\n\t"                                  \
01328     /* h = h + w_k + Sigma1(e) */                                     \
01329     "addl   " L1 ", %" #h "\n\t"                                  \
01330     /* L2 = a>>>9 */                                                  \
01331     "rorl   $9, " L2 "\n\t"                                       \
01332     /* L4 = (a ^ b) & (b ^ c) */                                      \
01333     "andl   " L3 ", " L4 "\n\t"                                   \
01334     /* L2 = (a>>>9) ^ a */                                            \
01335     "xorl   %" #a ", " L2 "\n\t"                                  \
01336     /* L4 = Maj(a,b,c) */                                             \
01337     "xorl   %" #b ", " L4 "\n\t"                                  \
01338     /* L2 = ((a>>>9) ^ a) >>> 11 */                                   \
01339     "rorl   $11, " L2 "\n\t"                                      \
01340     /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */                        \
01341     "addl   %" #h ", %" #d "\n\t"                                 \
01342     /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */                             \
01343     "xorl   %" #a ", " L2 "\n\t"                                  \
01344     /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */            \
01345     "addl   " L4 ", %" #h "\n\t"                                  \
01346     /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */                     \
01347     "rorl   $2, " L2 "\n\t"                                       \
01348     /* L1 = d (e of next RND) */                                      \
01349     "movl   %" #d ", " L1 "\n\t"                                  \
01350     /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
01351     "addl   " L2 ", %" #h "\n\t"                                  \
01352 
01353 #define _RND_ALL_1(a,b,c,d,e,f,g,h,i)                                 \
01354     /* h += w_k */                                                    \
01355     "addl   (" #i ")*4(" WK "), %" #h "\n\t"                      \
01356     /* L2 = f */                                                      \
01357     "movl   %" #f ", " L2 "\n\t"                                  \
01358     /* L4 = b */                                                      \
01359     "movl   %" #b ", " L4 "\n\t"                                  \
01360     /* L2 = f ^ g */                                                  \
01361     "xorl   %" #g ", " L2 "\n\t"                                  \
01362     /* L1 = e>>>14 */                                                 \
01363     "rorl   $14, " L1 "\n\t"                                      \
01364     /* L2 = (f ^ g) & e */                                            \
01365     "andl   %" #e ", " L2 "\n\t"                                  \
01366     /* L1 = (e>>>14) ^ e */                                           \
01367     "xorl   %" #e ", " L1 "\n\t"                                  \
01368     /* L2 = Ch(e,f,g) */                                              \
01369     "xorl   %" #g ", " L2 "\n\t"                                  \
01370     /* L1 = ((e>>>14) ^ e) >>> 5 */                                   \
01371     "rorl   $5, " L1 "\n\t"                                       \
01372     /* h += Ch(e,f,g) */                                              \
01373     "addl   " L2 ", %" #h "\n\t"                                  \
01374     /* L1 = (((e>>>14) ^ e) >>> 5) ^ e */                             \
01375     "xorl   %" #e ", " L1 "\n\t"                                  \
01376     /* L4 = a ^ b */                                                  \
01377     "xorl   %" #a ", " L4 "\n\t"                                  \
01378     /* L1 = ((((e>>>14) ^ e) >>> 5) ^ e) >>> 6 */                     \
01379     "rorl   $6, " L1 "\n\t"                                       \
01380     /* L2 = a */                                                      \
01381     "movl   %" #a ", " L2 "\n\t"                                  \
01382     /* h = h + w_k + Sigma1(e) */                                     \
01383     "addl   " L1 ", %" #h "\n\t"                                  \
01384     /* L2 = a>>>9 */                                                  \
01385     "rorl   $9, " L2 "\n\t"                                       \
01386     /* L3 = (a ^ b) & (b ^ c)  */                                     \
01387     "andl   " L4 ", " L3 "\n\t"                                   \
01388     /* L2 = (a>>>9) ^ a */                                            \
01389     "xorl   %" #a", " L2 "\n\t"                                   \
01390     /* L1 = Maj(a,b,c) */                                             \
01391     "xorl   %" #b ", " L3 "\n\t"                                  \
01392     /* L2 = ((a>>>9) ^ a) >>> 11 */                                   \
01393     "rorl   $11, " L2 "\n\t"                                      \
01394     /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */                        \
01395     "addl   %" #h ", %" #d "\n\t"                                 \
01396     /* L2 = (((a>>>9) ^ a) >>> 11) ^ a */                             \
01397     "xorl   %" #a ", " L2 "\n\t"                                  \
01398     /* h = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */            \
01399     "addl   " L3 ", %" #h "\n\t"                                  \
01400     /* L2 = ((((a>>>9) ^ a) >>> 11) ^ a) >>> 2 */                     \
01401     "rorl   $2, " L2 "\n\t"                                       \
01402     /* L1 = d (e of next RND) */                                      \
01403     "movl   %" #d ", " L1 "\n\t"                                  \
01404     /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
01405     "addl   " L2 ", %" #h "\n\t"                                  \
01406 
01407 
01408 #define RND_ALL_0(a, b, c, d, e, f, g, h, i) \
01409        _RND_ALL_0(a, b, c, d, e, f, g, h, i)
01410 #define RND_ALL_1(a, b, c, d, e, f, g, h, i) \
01411        _RND_ALL_1(a, b, c, d, e, f, g, h, i)
01412 
01413 #define RND_ALL_4(a, b, c, d, e, f, g, h, i)   \
01414         RND_ALL_0(a, b, c, d, e, f, g, h, i+0) \
01415         RND_ALL_1(h, a, b, c, d, e, f, g, i+1) \
01416         RND_ALL_0(g, h, a, b, c, d, e, f, i+2) \
01417         RND_ALL_1(f, g, h, a, b, c, d, e, i+3)
01418 
01419 #endif  /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
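/* Reference: a minimal scalar sketch of the round the RND_* macro chains
 * above implement (illustration only, not compiled; assumes this file's
 * word32 type).  wk is W[i] + K[i], pre-added into the WK scratch area by
 * SET_W_K_XFER_4.  The (a ^ b) value left in a scratch register is carried
 * into the next round as its (b ^ c), which is why _RND_ALL_0 and
 * _RND_ALL_1 alternate L3 and L4. */
#if 0
static word32 rotr32(word32 x, unsigned n) { return (x >> n) | (x << (32 - n)); }

static void sha256_round(word32 s[8], word32 wk)
{
    word32 a = s[0], b = s[1], c = s[2], d = s[3];
    word32 e = s[4], f = s[5], g = s[6], h = s[7];
    word32 ch   = ((f ^ g) & e) ^ g;                            /* Ch(e,f,g)  */
    word32 maj  = ((a ^ b) & (b ^ c)) ^ b;                      /* Maj(a,b,c) */
    word32 sig1 = rotr32(rotr32(rotr32(e, 14) ^ e,  5) ^ e, 6); /* Sigma1(e)  */
    word32 sig0 = rotr32(rotr32(rotr32(a,  9) ^ a, 11) ^ a, 2); /* Sigma0(a)  */
    word32 t1   = h + sig1 + ch + wk;
    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + sig0 + maj;
}
#endif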
01420 
01421 #if defined(HAVE_INTEL_AVX1) /* inline assembler for Intel AVX1 instructions */
01422 
01423 #define _VPALIGNR(op1, op2, op3, op4)                    \
01424     "vpalignr   $" #op4", %" #op3", %" #op2", %" #op1"\n\t"
01425 #define VPALIGNR(op1, op2, op3, op4)                     \
01426         _VPALIGNR(op1, op2, op3, op4)
01427 #define _VPADDD(op1, op2, op3)                           \
01428     "vpaddd %" #op3", %" #op2", %" #op1"\n\t"
01429 #define VPADDD(op1, op2, op3)                            \
01430        _VPADDD(op1, op2, op3)
01431 #define _VPSRLD(op1, op2, op3)                           \
01432     "vpsrld $" #op3", %" #op2", %" #op1"\n\t"
01433 #define VPSRLD(op1, op2, op3)        \
01434        _VPSRLD(op1, op2, op3)
01435 #define _VPSRLQ(op1, op2, op3)                           \
01436     "vpsrlq $" #op3", %" #op2", %" #op1"\n\t"
01437 #define VPSRLQ(op1,op2,op3)        \
01438        _VPSRLQ(op1,op2,op3)
01439 #define _VPSLLD(op1,op2,op3)                             \
01440     "vpslld $" #op3", %" #op2", %" #op1"\n\t"
01441 #define VPSLLD(op1,op2,op3)        \
01442        _VPSLLD(op1,op2,op3)
01443 #define _VPOR(op1,op2,op3)                               \
01444     "vpor   %" #op3", %" #op2", %" #op1"\n\t"
01445 #define VPOR(op1,op2,op3)          \
01446        _VPOR(op1,op2,op3)
01447 #define _VPXOR(op1,op2,op3)                              \
01448     "vpxor  %" #op3", %" #op2", %" #op1"\n\t"
01449 #define VPXOR(op1,op2,op3)         \
01450        _VPXOR(op1,op2,op3)
01451 #define _VPSHUFD(op1,op2,op3)                            \
01452     "vpshufd    $" #op3", %" #op2", %" #op1"\n\t"
01453 #define VPSHUFD(op1,op2,op3)       \
01454        _VPSHUFD(op1,op2,op3)
01455 #define _VPSHUFB(op1,op2,op3)                            \
01456     "vpshufb    %" #op3", %" #op2", %" #op1"\n\t"
01457 #define VPSHUFB(op1,op2,op3)       \
01458        _VPSHUFB(op1,op2,op3)
01459 #define _VPSLLDQ(op1,op2,op3)                            \
01460     "vpslldq    $" #op3", %" #op2", %" #op1"\n\t"
01461 #define VPSLLDQ(op1,op2,op3)       \
01462        _VPSLLDQ(op1,op2,op3)
01463 
01464 #define MsgSched(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i)                           \
01465             RND_STEP_0_1(a,b,c,d,e,f,g,h,_i)                               \
01466     VPALIGNR (XTMP1, X1, X0, 4)    /* XTMP1 = W[-15] */                    \
01467     VPALIGNR (XTMP0, X3, X2, 4)    /* XTMP0 = W[-7] */                     \
01468             RND_STEP_0_2(a,b,c,d,e,f,g,h,_i)                               \
01469             RND_STEP_0_3(a,b,c,d,e,f,g,h,_i)                               \
01470     VPSRLD   (XTMP2, XTMP1, 7)     /* XTMP2 = W[-15] >> 7 */               \
01471     VPSLLD   (XTMP3, XTMP1, 25)    /* XTMP3 = W[-15] << (32-7) */          \
01472             RND_STEP_0_4(a,b,c,d,e,f,g,h,_i)                               \
01473             RND_STEP_0_5(a,b,c,d,e,f,g,h,_i)                               \
01474     VPSRLD   (XTMP4, XTMP1, 18)    /* XTMP4 = W[-15] >> 18 */              \
01475     VPSLLD   (XTMP5, XTMP1, 14)    /* XTMP5 = W[-15] << (32-18) */         \
01476             RND_STEP_0_6(a,b,c,d,e,f,g,h,_i)                               \
01477             RND_STEP_0_7(a,b,c,d,e,f,g,h,_i)                               \
01478     VPOR     (XTMP2, XTMP3, XTMP2) /* XTMP2 = W[-15] >>> 7 */              \
01479     VPOR     (XTMP4, XTMP5, XTMP4) /* XTMP4 = W[-15] >>> 18 */             \
01480             RND_STEP_0_8(a,b,c,d,e,f,g,h,_i)                               \
01481             RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1)                             \
01482             RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1)                             \
01483     VPSRLD   (XTMP5, XTMP1, 3)     /* XTMP5 = W[-15] >> 3 */               \
01484     VPXOR    (XTMP2, XTMP4, XTMP2)                                         \
01485                           /* XTMP2 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \
01486             RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1)                             \
01487             RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1)                             \
01488     VPXOR    (XTMP1, XTMP5, XTMP2)  /* XTMP1 = s0 */                       \
01489     VPSHUFD  (XTMP2, X3, 0b11111010)  /* XTMP2 = W[-2] {BBAA} */           \
01490             RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1)                             \
01491             RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1)                             \
01492     VPSRLD   (XTMP4, XTMP2, 10)      /* XTMP4 = W[-2] >> 10 {BBAA} */      \
01493     VPSRLQ   (XTMP3, XTMP2, 19)      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */  \
01494             RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1)                             \
01495             RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1)                             \
01496             RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2)                             \
01497     VPSRLQ   (XTMP2, XTMP2, 17)      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */  \
01498     VPADDD   (XTMP0, XTMP0, X0)                                            \
01499             RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2)                             \
01500             RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2)                             \
01501             RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2)                             \
01502     VPXOR    (XTMP2, XTMP3, XTMP2)                                         \
01503     VPADDD   (XTMP0, XTMP0, XTMP1)  /* XTMP0 = W[-16] + W[-7] + s0 */      \
01504             RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2)                             \
01505     VPXOR    (XTMP4, XTMP4, XTMP2)   /* XTMP4 = s1 {xBxA} */               \
01506             RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2)                             \
01507     VPSHUFB  (XTMP4, XTMP4, SHUF_00BA)  /* XTMP4 = s1 {00BA} */            \
01508             RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2)                             \
01509     VPADDD   (XTMP0, XTMP0, XTMP4)  /* XTMP0 = {..., ..., W[1], W[0]} */   \
01510             RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2)                             \
01511             RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3)                             \
01512     VPSHUFD  (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */         \
01513             RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3)                             \
01514     VPSRLQ   (XTMP4, XTMP2, 17)      /* XTMP4 = W[-2] MY_ROR 17 {xDxC} */  \
01515     VPSRLQ   (XTMP3, XTMP2, 19)       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \
01516             RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3)                             \
01517             RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3)                             \
01518     VPSRLD   (XTMP5, XTMP2, 10)       /* XTMP5 = W[-2] >> 10 {DDCC} */     \
01519     VPXOR    (XTMP4, XTMP3, XTMP4)                                         \
01520             RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3)                             \
01521             RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3)                             \
01522     VPXOR    (XTMP5, XTMP4, XTMP5)   /* XTMP5 = s1 {xDxC} */               \
01523             RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3)                             \
01524     VPSHUFB  (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */             \
01525             RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3)                             \
01526     VPADDD   (X0, XTMP5, XTMP0)      /* X0 = {W[3], W[2], W[1], W[0]} */
01527 
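/* Reference: MsgSched above interleaves four rounds with the expansion of
 * four schedule words.  The sigma1 work is done in {BBAA} and {DDCC} halves
 * because the 17/19-bit rotations are emulated with 64-bit vpsrlq shifts,
 * which leave only two valid dwords per pass.  Scalar sketch (illustration
 * only, not compiled; rotr32 as in the sketch above): */
#if 0
static void sha256_expand(word32 W[64])
{
    int t;
    for (t = 16; t < 64; t++) {
        word32 s0 = rotr32(W[t-15],  7) ^ rotr32(W[t-15], 18) ^ (W[t-15] >>  3);
        word32 s1 = rotr32(W[t- 2], 17) ^ rotr32(W[t- 2], 19) ^ (W[t- 2] >> 10);
        W[t] = W[t-16] + s0 + W[t-7] + s1;
    }
}
#endif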
01528 #if defined(HAVE_INTEL_RORX)
01529 
01530 #define MsgSched_RORX(X0,X1,X2,X3,a,b,c,d,e,f,g,h,_i)                      \
01531             RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i)                          \
01532     VPALIGNR (XTMP0, X3, X2, 4)                                            \
01533     VPALIGNR (XTMP1, X1, X0, 4)   /* XTMP1 = W[-15] */                     \
01534             RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i)                          \
01535             RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i)                          \
01536     VPSRLD   (XTMP2, XTMP1, 7)                                             \
01537     VPSLLD   (XTMP3, XTMP1, 25) /* VPSLLD   (XTMP3, XTMP1, (32-7)) */      \
01538             RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i)                          \
01539             RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i)                          \
01540     VPSRLD   (XTMP4, XTMP1, 3)  /* XTMP4 = W[-15] >> 3 */                  \
01541     VPOR     (XTMP3, XTMP3, XTMP2)  /* XTMP3 = W[-15] MY_ROR 7 */          \
01542             RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i)                          \
01543             RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i)                          \
01544             RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i)                          \
01545                                                                            \
01546             RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1)                        \
01547     VPSRLD   (XTMP2, XTMP1, 18)                                            \
01548             RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1)                        \
01549     VPSLLD   (XTMP1, XTMP1, 14) /* VPSLLD   (XTMP1, XTMP1, (32-18)) */     \
01550             RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1)                        \
01551     VPXOR    (XTMP3, XTMP3, XTMP1)                                         \
01552             RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1)                        \
01553     VPXOR    (XTMP3, XTMP3, XTMP2)                                         \
01554                           /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */ \
01555             RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1)                        \
01556     VPSHUFD  (XTMP2, X3, 0b11111010)  /* XTMP2 = W[-2] {BBAA} */           \
01557             RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1)                        \
01558     VPXOR    (XTMP1, XTMP3, XTMP4)  /* XTMP1 = s0 */                       \
01559             RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1)                        \
01560     VPSRLD   (XTMP4, XTMP2, 10)      /* XTMP4 = W[-2] >> 10 {BBAA} */      \
01561             RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1)                        \
01562                                                                            \
01563             RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2)                        \
01564     VPSRLQ   (XTMP3, XTMP2, 19)      /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */  \
01565             RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2)                        \
01566     VPSRLQ   (XTMP2, XTMP2, 17)      /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */  \
01567     VPADDD   (XTMP0, XTMP0, X0)                                            \
01568             RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2)                        \
01569     VPADDD   (XTMP0, XTMP0, XTMP1)  /* XTMP0 = W[-16] + W[-7] + s0 */      \
01570             RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2)                        \
01571     VPXOR    (XTMP2, XTMP2, XTMP3)                                         \
01572             RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2)                        \
01573     VPXOR    (XTMP4, XTMP4, XTMP2)   /* XTMP4 = s1 {xBxA} */               \
01574             RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2)                        \
01575     VPSHUFB  (XTMP4, XTMP4, SHUF_00BA)  /* XTMP4 = s1 {00BA} */            \
01576             RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2)                        \
01577     VPADDD   (XTMP0, XTMP0, XTMP4)  /* XTMP0 = {..., ..., W[1], W[0]} */   \
01578             RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2)                        \
01579                                                                            \
01580             RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3)                        \
01581     VPSHUFD  (XTMP2, XTMP0, 0b01010000) /* XTMP2 = W[-2] {DDCC} */         \
01582             RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3)                        \
01583     VPSRLD   (XTMP5, XTMP2, 10)       /* XTMP5 = W[-2] >> 10 {DDCC} */     \
01584             RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3)                        \
01585     VPSRLQ   (XTMP3, XTMP2, 19)       /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */ \
01586             RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3)                        \
01587     VPSRLQ   (XTMP2, XTMP2, 17)      /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */  \
01588             RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3)                        \
01589     VPXOR    (XTMP2, XTMP2, XTMP3)                                         \
01590             RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3)                        \
01591     VPXOR    (XTMP5, XTMP5, XTMP2)   /* XTMP5 = s1 {xDxC} */               \
01592             RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3)                        \
01593     VPSHUFB  (XTMP5, XTMP5, SHUF_DC00) /* XTMP5 = s1 {DC00} */             \
01594             RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3)                        \
01595     VPADDD   (X0, XTMP5, XTMP0)      /* X0 = {W[3], W[2], W[1], W[0]} */
01596 
01597 #endif /* HAVE_INTEL_RORX */
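/* Note: rorx (BMI2) is a three-operand rotate that neither reads nor writes
 * EFLAGS, so the RND_STEP_RORX_* chains can use the direct three-rotate
 * Sigma forms while the flag-setting addl/subl of the rounds and the block
 * loop run in between.  The value computed, as a sketch (hypothetical helper,
 * not wolfSSL API; rotr32 as above): */
#if 0
static word32 Sigma1_direct(word32 e)
{
    /* same value as the nested-rotate form used by the non-RORX macros */
    return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
}
#endif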
01598 
01599 
01600 #define _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \
01601     "# X0, X1, X2, X3 = W[0..15]\n\t"                  \
01602     "vmovdqu      (%%rax), %" #X0 "\n\t"               \
01603     "vmovdqu    16(%%rax), %" #X1 "\n\t"               \
01604     VPSHUFB(X0, X0, BYTE_FLIP_MASK)                    \
01605     VPSHUFB(X1, X1, BYTE_FLIP_MASK)                    \
01606     "vmovdqu    32(%%rax), %" #X2 "\n\t"               \
01607     "vmovdqu    48(%%rax), %" #X3 "\n\t"               \
01608     VPSHUFB(X2, X2, BYTE_FLIP_MASK)                    \
01609     VPSHUFB(X3, X3, BYTE_FLIP_MASK)
01610 
01611 #define W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK) \
01612        _W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
01613 
01614 
01615 #define _SET_W_K_XFER_4(i) \
01616     "vpaddd (" #i "*4)+ 0+%[K], %%xmm0, %%xmm4\n\t"  \
01617     "vpaddd (" #i "*4)+16+%[K], %%xmm1, %%xmm5\n\t"  \
01618     "vmovdqu    %%xmm4,   (" WK ")\n\t"                  \
01619     "vmovdqu    %%xmm5, 16(" WK ")\n\t"                  \
01620     "vpaddd (" #i "*4)+32+%[K], %%xmm2, %%xmm6\n\t"  \
01621     "vpaddd (" #i "*4)+48+%[K], %%xmm3, %%xmm7\n\t"  \
01622     "vmovdqu    %%xmm6, 32(" WK ")\n\t"                  \
01623     "vmovdqu    %%xmm7, 48(" WK ")\n\t"
01624 
01625 #define SET_W_K_XFER_4(i) \
01626        _SET_W_K_XFER_4(i)
01627 
01628 
01629 static const ALIGN32 word64 mSHUF_00BA[] =
01630     { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
01631 static const ALIGN32 word64 mSHUF_DC00[] =
01632     { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
01633 static const ALIGN32 word64 mBYTE_FLIP_MASK[] =
01634     { 0x0405060700010203, 0x0c0d0e0f08090a0b };
01635 
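/* Note: vpshufb with mBYTE_FLIP_MASK reverses the bytes inside each 32-bit
 * word, i.e. it is the vector form of the big-endian message-word load.
 * Per-word scalar equivalent (illustration only, not compiled): */
#if 0
static word32 load_be32(const byte* p)
{
    return ((word32)p[0] << 24) | ((word32)p[1] << 16) |
           ((word32)p[2] <<  8) |  (word32)p[3];
}
#endif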
01636 #define _Init_Masks(mask1, mask2, mask3)       \
01637     "vmovdqa    %[FLIP], %" #mask1 "\n\t"      \
01638     "vmovdqa    %[SHUF00BA], %" #mask2 "\n\t"  \
01639     "vmovdqa    %[SHUFDC00], %" #mask3 "\n\t"
01640 
01641 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \
01642        _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01643 
01644 #define X0 %xmm0
01645 #define X1 %xmm1
01646 #define X2 %xmm2
01647 #define X3 %xmm3
01648 
01649 #define XTMP0 %xmm4
01650 #define XTMP1 %xmm5
01651 #define XTMP2 %xmm6
01652 #define XTMP3 %xmm7
01653 #define XTMP4 %xmm8
01654 #define XTMP5 %xmm9
01655 #define XFER  %xmm10
01656 
01657 #define SHUF_00BA   %xmm11 /* shuffle xBxA -> 00BA */
01658 #define SHUF_DC00   %xmm12 /* shuffle xDxC -> DC00 */
01659 #define BYTE_FLIP_MASK  %xmm13
01660 
01661 
01662 SHA256_NOINLINE static int Transform_Sha256_AVX1(wc_Sha256* sha256)
01663 {
01664     __asm__ __volatile__ (
01665 
01666         "subq   $64, %%rsp\n\t"
01667 
01668         "leaq   32(%[sha256]), %%rax\n\t"
01669     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01670     LOAD_DIGEST()
01671 
01672     W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
01673 
01674         "movl   %%r9d, " L4 "\n\t"
01675         "movl   %%r12d, " L1 "\n\t"
01676         "xorl   %%r10d, " L4 "\n\t"
01677 
01678     SET_W_K_XFER_4(0)
01679     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01680     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01681     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01682     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01683 
01684     SET_W_K_XFER_4(16)
01685     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01686     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01687     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01688     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01689 
01690     SET_W_K_XFER_4(32)
01691     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01692     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01693     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01694     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01695 
01696     SET_W_K_XFER_4(48)
01697     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01698     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01699     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01700     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01701 
01702     STORE_ADD_DIGEST()
01703 
01704         "addq   $64, %%rsp\n\t"
01705 
01706         :
01707         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
01708           [SHUF00BA] "m" (mSHUF_00BA[0]),
01709           [SHUFDC00] "m" (mSHUF_DC00[0]),
01710           [sha256]   "r" (sha256),
01711           [K]        "m" (K)
01712         : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
01713     );
01714 
01715     return 0;
01716 }
01717 
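/* Note: the _Len variants consume whole 64-byte blocks from the pointer
 * stored at byte offset 120 of wc_Sha256 ("movq 120(%[sha256]), %%rax").
 * Loop control relies on flags: "subl $64, %[len]" sets ZF and none of the
 * stores between it and "jnz 1b" modify flags.  Equivalent control flow, as
 * a sketch (hypothetical helper name, not compiled): */
#if 0
while (len != 0) {
    transform_one_block(sha256, data);
    data += 64;
    len  -= 64;
}
#endif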
01718 SHA256_NOINLINE static int Transform_Sha256_AVX1_Len(wc_Sha256* sha256,
01719                                                      word32 len)
01720 {
01721     __asm__ __volatile__ (
01722 
01723         "subq   $64, %%rsp\n\t"
01724         "movq   120(%[sha256]), %%rax\n\t"
01725 
01726     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01727     LOAD_DIGEST()
01728 
01729         "# Start of loop processing a block\n"
01730         "1:\n\t"
01731 
01732     W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
01733 
01734         "movl   %%r9d, " L4 "\n\t"
01735         "movl   %%r12d, " L1 "\n\t"
01736         "xorl   %%r10d, " L4 "\n\t"
01737 
01738     SET_W_K_XFER_4(0)
01739     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01740     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01741     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01742     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01743 
01744     SET_W_K_XFER_4(16)
01745     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01746     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01747     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01748     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01749 
01750     SET_W_K_XFER_4(32)
01751     MsgSched(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01752     MsgSched(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01753     MsgSched(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01754     MsgSched(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01755 
01756     SET_W_K_XFER_4(48)
01757     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01758     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01759     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01760     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01761         "movq   120(%[sha256]), %%rax\n\t"
01762 
01763     ADD_DIGEST()
01764 
01765         "addq   $64, %%rax\n\t"
01766         "subl   $64, %[len]\n\t"
01767 
01768     STORE_DIGEST()
01769 
01770         "movq   %%rax, 120(%[sha256])\n\t"
01771         "jnz    1b\n\t"
01772 
01773         "addq   $64, %%rsp\n\t"
01774 
01775         :
01776         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
01777           [SHUF00BA] "m" (mSHUF_00BA[0]),
01778           [SHUFDC00] "m" (mSHUF_DC00[0]),
01779           [sha256]   "r" (sha256),
01780           [len]      "r" (len),
01781           [K]        "m" (K)
01782         : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
01783     );
01784 
01785     return 0;
01786 }
01787 #endif  /* HAVE_INTEL_AVX1 */
01788 
01789 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
01790 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX(wc_Sha256* sha256)
01791 {
01792     __asm__ __volatile__ (
01793 
01794         "subq   $64, %%rsp\n\t"
01795 
01796     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01797         "leaq   32(%[sha256]), %%rax\n\t"
01798     W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
01799 
01800     LOAD_DIGEST()
01801 
01802     SET_W_K_XFER_4(0)
01803         "movl   %%r9d, " L4 "\n\t"
01804         "rorx   $6, %%r12d, " L1 "\n\t"
01805         "xorl   %%r10d, " L4 "\n\t"
01806     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01807     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01808     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01809     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01810 
01811     SET_W_K_XFER_4(16)
01812     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01813     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01814     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01815     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01816 
01817     SET_W_K_XFER_4(32)
01818     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01819     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01820     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01821     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01822 
01823     SET_W_K_XFER_4(48)
01824         "xorl   " L3 ", " L3 "\n\t"
01825     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01826     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01827     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01828     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01829         /* Prev RND: h += Maj(a,b,c) */
01830         "addl   " L3 ", %%r8d\n\t"
01831 
01832     STORE_ADD_DIGEST()
01833 
01834         "addq   $64, %%rsp\n\t"
01835 
01836         :
01837         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
01838           [SHUF00BA] "m" (mSHUF_00BA[0]),
01839           [SHUFDC00] "m" (mSHUF_DC00[0]),
01840           [sha256]   "r" (sha256),
01841           [K]        "m" (K)
01842         : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
01843     );
01844 
01845     return 0;
01846 }
01847 
01848 SHA256_NOINLINE static int Transform_Sha256_AVX1_RORX_Len(wc_Sha256* sha256,
01849                                                           word32 len)
01850 {
01851     __asm__ __volatile__ (
01852 
01853         "subq   $64, %%rsp\n\t"
01854         "movq   120(%[sha256]), %%rax\n\t"
01855 
01856     Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
01857     LOAD_DIGEST()
01858 
01859         "# Start of loop processing a block\n"
01860         "1:\n\t"
01861 
01862     W_K_from_buff(X0, X1, X2, X3, BYTE_FLIP_MASK)
01863 
01864     SET_W_K_XFER_4(0)
01865         "movl   %%r9d, " L4 "\n\t"
01866         "rorx   $6, %%r12d, " L1 "\n\t"
01867         "xorl   %%r10d, " L4 "\n\t"
01868     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01869     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01870     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01871     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01872 
01873     SET_W_K_XFER_4(16)
01874     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01875     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01876     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01877     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01878 
01879     SET_W_K_XFER_4(32)
01880     MsgSched_RORX(X0, X1, X2, X3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01881     MsgSched_RORX(X1, X2, X3, X0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01882     MsgSched_RORX(X2, X3, X0, X1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01883     MsgSched_RORX(X3, X0, X1, X2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01884 
01885     SET_W_K_XFER_4(48)
01886         "xorl   " L3 ", " L3 "\n\t"
01887         "xorl   " L2 ", " L2 "\n\t"
01888     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
01889     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  4)
01890     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  8)
01891     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 12)
01892         /* Prev RND: h += Maj(a,b,c) */
01893         "addl   " L3 ", %%r8d\n\t"
01894         "movq   120(%[sha256]), %%rax\n\t"
01895 
01896     ADD_DIGEST()
01897 
01898         "addq   $64, %%rax\n\t"
01899         "subl   $64, %[len]\n\t"
01900 
01901     STORE_DIGEST()
01902 
01903         "movq   %%rax, 120(%[sha256])\n\t"
01904         "jnz    1b\n\t"
01905 
01906         "addq   $64, %%rsp\n\t"
01907 
01908         :
01909         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
01910           [SHUF00BA] "m" (mSHUF_00BA[0]),
01911           [SHUFDC00] "m" (mSHUF_DC00[0]),
01912           [sha256]   "r" (sha256),
01913           [len]      "r" (len),
01914           [K]        "m" (K)
01915         : WORK_REGS, STATE_REGS, XMM_REGS, "memory"
01916     );
01917 
01918     return 0;
01919 }
01920 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
01921 
01922 
01923 #if defined(HAVE_INTEL_AVX2)
01924 #define Y0 %ymm0
01925 #define Y1 %ymm1
01926 #define Y2 %ymm2
01927 #define Y3 %ymm3
01928 
01929 #define YTMP0 %ymm4
01930 #define YTMP1 %ymm5
01931 #define YTMP2 %ymm6
01932 #define YTMP3 %ymm7
01933 #define YTMP4 %ymm8
01934 #define YTMP5 %ymm9
01935 #define YXFER %ymm10
01936 
01937 #define SHUF_Y_00BA       %ymm11 /* shuffle xBxA -> 00BA */
01938 #define SHUF_Y_DC00       %ymm12 /* shuffle xDxC -> DC00 */
01939 #define BYTE_FLIP_Y_MASK  %ymm13
01940 
01941 #define YMM_REGS "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", \
01942                  "ymm7", "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13"
01943 
01944 #define MsgSched_Y(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i)                            \
01945             RND_STEP_0_1(a,b,c,d,e,f,g,h,_i)                                  \
01946     VPALIGNR (YTMP1, Y1, Y0, 4)    /* YTMP1 = W[-15] */                       \
01947     VPALIGNR (YTMP0, Y3, Y2, 4)    /* YTMP0 = W[-7] */                        \
01948             RND_STEP_0_2(a,b,c,d,e,f,g,h,_i)                                  \
01949             RND_STEP_0_3(a,b,c,d,e,f,g,h,_i)                                  \
01950     VPSRLD   (YTMP2, YTMP1, 7)     /* YTMP2 = W[-15] >> 7 */                  \
01951     VPSLLD   (YTMP3, YTMP1, 25)    /* YTMP3 = W[-15] << (32-7) */             \
01952             RND_STEP_0_4(a,b,c,d,e,f,g,h,_i)                                  \
01953             RND_STEP_0_5(a,b,c,d,e,f,g,h,_i)                                  \
01954     VPSRLD   (YTMP4, YTMP1, 18)    /* YTMP4 = W[-15] >> 18 */                 \
01955     VPSLLD   (YTMP5, YTMP1, 14)    /* YTMP5 = W[-15] << (32-18) */            \
01956             RND_STEP_0_6(a,b,c,d,e,f,g,h,_i)                                  \
01957             RND_STEP_0_7(a,b,c,d,e,f,g,h,_i)                                  \
01958     VPOR     (YTMP2, YTMP3, YTMP2) /* YTMP2 = W[-15] >>> 7 */                 \
01959     VPOR     (YTMP4, YTMP5, YTMP4) /* YTMP4 = W[-15] >>> 18 */                \
01960             RND_STEP_0_8(a,b,c,d,e,f,g,h,_i)                                  \
01961             RND_STEP_1_1(h,a,b,c,d,e,f,g,_i+1)                                \
01962             RND_STEP_1_2(h,a,b,c,d,e,f,g,_i+1)                                \
01963     VPSRLD   (YTMP5, YTMP1, 3)     /* YTMP5 = W[-15] >> 3 */                  \
01964     VPXOR    (YTMP2, YTMP4, YTMP2) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \
01965             RND_STEP_1_3(h,a,b,c,d,e,f,g,_i+1)                                \
01966             RND_STEP_1_4(h,a,b,c,d,e,f,g,_i+1)                                \
01967     VPXOR    (YTMP1, YTMP5, YTMP2)  /* YTMP1 = s0 */                          \
01968     VPSHUFD  (YTMP2, Y3, 0b11111010)  /* YTMP2 = W[-2] {BBAA} */              \
01969             RND_STEP_1_5(h,a,b,c,d,e,f,g,_i+1)                                \
01970             RND_STEP_1_6(h,a,b,c,d,e,f,g,_i+1)                                \
01971     VPSRLD   (YTMP4, YTMP2, 10)      /* YTMP4 = W[-2] >> 10 {BBAA} */         \
01972     VPSRLQ   (YTMP3, YTMP2, 19)      /* YTMP3 = W[-2] MY_ROR 19 {xBxA} */     \
01973             RND_STEP_1_7(h,a,b,c,d,e,f,g,_i+1)                                \
01974             RND_STEP_1_8(h,a,b,c,d,e,f,g,_i+1)                                \
01975             RND_STEP_0_1(g,h,a,b,c,d,e,f,_i+2)                                \
01976     VPSRLQ   (YTMP2, YTMP2, 17)      /* YTMP2 = W[-2] MY_ROR 17 {xBxA} */     \
01977     VPADDD   (YTMP0, YTMP0, Y0)                                               \
01978             RND_STEP_0_2(g,h,a,b,c,d,e,f,_i+2)                                \
01979             RND_STEP_0_3(g,h,a,b,c,d,e,f,_i+2)                                \
01980             RND_STEP_0_4(g,h,a,b,c,d,e,f,_i+2)                                \
01981     VPXOR    (YTMP2, YTMP3, YTMP2)                                            \
01982     VPADDD   (YTMP0, YTMP0, YTMP1)  /* YTMP0 = W[-16] + W[-7] + s0 */         \
01983             RND_STEP_0_5(g,h,a,b,c,d,e,f,_i+2)                                \
01984     VPXOR    (YTMP4, YTMP4, YTMP2)   /* YTMP4 = s1 {xBxA} */                  \
01985             RND_STEP_0_6(g,h,a,b,c,d,e,f,_i+2)                                \
01986     VPSHUFB  (YTMP4, YTMP4, SHUF_Y_00BA)  /* YTMP4 = s1 {00BA} */             \
01987             RND_STEP_0_7(g,h,a,b,c,d,e,f,_i+2)                                \
01988     VPADDD   (YTMP0, YTMP0, YTMP4)  /* YTMP0 = {..., ..., W[1], W[0]} */      \
01989             RND_STEP_0_8(g,h,a,b,c,d,e,f,_i+2)                                \
01990             RND_STEP_1_1(f,g,h,a,b,c,d,e,_i+3)                                \
01991     VPSHUFD  (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */            \
01992             RND_STEP_1_2(f,g,h,a,b,c,d,e,_i+3)                                \
01993     VPSRLQ   (YTMP4, YTMP2, 17)      /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */     \
01994     VPSRLQ   (YTMP3, YTMP2, 19)       /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */    \
01995             RND_STEP_1_3(f,g,h,a,b,c,d,e,_i+3)                                \
01996             RND_STEP_1_4(f,g,h,a,b,c,d,e,_i+3)                                \
01997     VPSRLD   (YTMP5, YTMP2, 10)       /* YTMP5 = W[-2] >> 10 {DDCC} */        \
01998     VPXOR    (YTMP4, YTMP3, YTMP4)                                            \
01999             RND_STEP_1_5(f,g,h,a,b,c,d,e,_i+3)                                \
02000             RND_STEP_1_6(f,g,h,a,b,c,d,e,_i+3)                                \
02001     VPXOR    (YTMP5, YTMP4, YTMP5)   /* YTMP5 = s1 {xDxC} */                  \
02002             RND_STEP_1_7(f,g,h,a,b,c,d,e,_i+3)                                \
02003     VPSHUFB  (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */              \
02004             RND_STEP_1_8(f,g,h,a,b,c,d,e,_i+3)                                \
02005     VPADDD   (Y0, YTMP5, YTMP0)      /* Y0 = {W[3], W[2], W[1], W[0]} */
02006 
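/* Note: in the AVX2 transforms each ymm register carries the schedule of two
 * independent blocks -- block 0 in the low 128-bit lane, block 1 in the high
 * lane (assembled by LOAD_W_K/VINSERTI128 below) -- so one MsgSched_Y
 * advances both schedules at once.  SET_W_Y_4 stores the two blocks' w+k
 * values interleaved four words apart, which is why the round indices in the
 * callers step by 8: block 0 reads WK word offsets 0, 8, 16, ... and block 1
 * reads 4, 12, 20, ... */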
02007 #if defined(HAVE_INTEL_RORX)
02008 
02009 #define MsgSched_Y_RORX(Y0,Y1,Y2,Y3,a,b,c,d,e,f,g,h,_i)                       \
02010             RND_STEP_RORX_0_1(a,b,c,d,e,f,g,h,_i)                             \
02011     VPALIGNR (YTMP1, Y1, Y0, 4)    /* YTMP1 = W[-15] */                       \
02012             RND_STEP_RORX_0_2(a,b,c,d,e,f,g,h,_i)                             \
02013     VPALIGNR (YTMP0, Y3, Y2, 4)    /* YTMP0 = W[-7] */                        \
02014             RND_STEP_RORX_0_3(a,b,c,d,e,f,g,h,_i)                             \
02015     VPSRLD   (YTMP2, YTMP1, 7)     /* YTMP2 = W[-15] >> 7 */                  \
02016             RND_STEP_RORX_0_4(a,b,c,d,e,f,g,h,_i)                             \
02017     VPSLLD   (YTMP3, YTMP1, 25)    /* YTMP3 = W[-15] << (32-7) */            \
02018             RND_STEP_RORX_0_5(a,b,c,d,e,f,g,h,_i)                             \
02019     VPSRLD   (YTMP4, YTMP1, 18)    /* YTMP4 = W[-15] >> 18 */                 \
02020             RND_STEP_RORX_0_6(a,b,c,d,e,f,g,h,_i)                             \
02021     VPSLLD   (YTMP5, YTMP1, 14)    /* YTMP5 = W[-15] << (32-18) */            \
02022             RND_STEP_RORX_0_7(a,b,c,d,e,f,g,h,_i)                             \
02023     VPOR     (YTMP2, YTMP2, YTMP3) /* YTMP2 = W[-15] >>> 7 */                 \
02024             RND_STEP_RORX_0_8(a,b,c,d,e,f,g,h,_i)                             \
02025     VPOR     (YTMP4, YTMP4, YTMP5) /* YTMP4 = W[-15] >>> 18 */                \
02026             RND_STEP_RORX_1_1(h,a,b,c,d,e,f,g,_i+1)                           \
02027     VPSRLD   (YTMP5, YTMP1, 3)     /* YTMP5 = W[-15] >> 3 */                  \
02028             RND_STEP_RORX_1_2(h,a,b,c,d,e,f,g,_i+1)                           \
02029     VPXOR    (YTMP2, YTMP2, YTMP4) /* YTMP2 = W[-15] >>> 7 ^ W[-15] >>> 18 */ \
02030             RND_STEP_RORX_1_3(h,a,b,c,d,e,f,g,_i+1)                           \
02031     VPSHUFD  (YTMP3, Y3, 0b11111010)  /* YTMP3 = W[-2] {BBAA} */              \
02032             RND_STEP_RORX_1_4(h,a,b,c,d,e,f,g,_i+1)                           \
02033     VPXOR    (YTMP1, YTMP5, YTMP2)  /* YTMP1 = s0 */                          \
02034             RND_STEP_RORX_1_5(h,a,b,c,d,e,f,g,_i+1)                           \
02035     VPSRLD   (YTMP4, YTMP3, 10)      /* YTMP4 = W[-2] >> 10 {BBAA} */         \
02036             RND_STEP_RORX_1_6(h,a,b,c,d,e,f,g,_i+1)                           \
02037     VPSRLQ   (YTMP2, YTMP3, 19)      /* YTMP2 = W[-2] MY_ROR 19 {xBxA} */     \
02038             RND_STEP_RORX_1_7(h,a,b,c,d,e,f,g,_i+1)                           \
02039     VPSRLQ   (YTMP3, YTMP3, 17)      /* YTMP3 = W[-2] MY_ROR 17 {xBxA} */     \
02040             RND_STEP_RORX_1_8(h,a,b,c,d,e,f,g,_i+1)                           \
02041     VPADDD   (YTMP0, YTMP0, Y0)                                               \
02042             RND_STEP_RORX_0_1(g,h,a,b,c,d,e,f,_i+2)                           \
02043     VPXOR    (YTMP2, YTMP2, YTMP3)                                            \
02044             RND_STEP_RORX_0_2(g,h,a,b,c,d,e,f,_i+2)                           \
02045     VPXOR    (YTMP4, YTMP4, YTMP2)   /* YTMP4 = s1 {xBxA} */                  \
02046             RND_STEP_RORX_0_3(g,h,a,b,c,d,e,f,_i+2)                           \
02047     VPADDD   (YTMP0, YTMP0, YTMP1)  /* YTMP0 = W[-16] + W[-7] + s0 */         \
02048             RND_STEP_RORX_0_4(g,h,a,b,c,d,e,f,_i+2)                           \
02049     VPSHUFB  (YTMP4, YTMP4, SHUF_Y_00BA)  /* YTMP4 = s1 {00BA} */             \
02050             RND_STEP_RORX_0_5(g,h,a,b,c,d,e,f,_i+2)                           \
02051     VPADDD   (YTMP0, YTMP0, YTMP4)  /* YTMP0 = {..., ..., W[1], W[0]} */      \
02052             RND_STEP_RORX_0_6(g,h,a,b,c,d,e,f,_i+2)                           \
02053     VPSHUFD  (YTMP2, YTMP0, 0b01010000) /* YTMP2 = W[-2] {DDCC} */            \
02054             RND_STEP_RORX_0_7(g,h,a,b,c,d,e,f,_i+2)                           \
02055             RND_STEP_RORX_0_8(g,h,a,b,c,d,e,f,_i+2)                           \
02056     VPSRLQ   (YTMP4, YTMP2, 17)      /* YTMP4 = W[-2] MY_ROR 17 {xDxC} */     \
02057             RND_STEP_RORX_1_1(f,g,h,a,b,c,d,e,_i+3)                           \
02058     VPSRLQ   (YTMP3, YTMP2, 19)       /* YTMP3 = W[-2] MY_ROR 19 {xDxC} */    \
02059             RND_STEP_RORX_1_2(f,g,h,a,b,c,d,e,_i+3)                           \
02060     VPSRLD   (YTMP5, YTMP2, 10)       /* YTMP5 = W[-2] >> 10 {DDCC} */        \
02061             RND_STEP_RORX_1_3(f,g,h,a,b,c,d,e,_i+3)                           \
02062     VPXOR    (YTMP4, YTMP4, YTMP3)                                            \
02063             RND_STEP_RORX_1_4(f,g,h,a,b,c,d,e,_i+3)                           \
02064     VPXOR    (YTMP5, YTMP5, YTMP4)   /* YTMP5 = s1 {xDxC} */                  \
02065             RND_STEP_RORX_1_5(f,g,h,a,b,c,d,e,_i+3)                           \
02066             RND_STEP_RORX_1_6(f,g,h,a,b,c,d,e,_i+3)                           \
02067     VPSHUFB  (YTMP5, YTMP5, SHUF_Y_DC00) /* YTMP5 = s1 {DC00} */              \
02068             RND_STEP_RORX_1_7(f,g,h,a,b,c,d,e,_i+3)                           \
02069             RND_STEP_RORX_1_8(f,g,h,a,b,c,d,e,_i+3)                           \
02070     VPADDD   (Y0, YTMP5, YTMP0)      /* Y0 = {W[3], W[2], W[1], W[0]} */
02071 
02072 #endif /* HAVE_INTEL_RORX */
02073 
02074 #define _VINSERTI128(op1,op2,op3,op4) \
02075     "vinserti128    $" #op4 ", %" #op3 ", %" #op2 ", %" #op1 "\n\t"
02076 #define VINSERTI128(op1,op2,op3,op4)  \
02077        _VINSERTI128(op1,op2,op3,op4)
02078 
02079 
02080 #define _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg)   \
02081     "# X0, X1, X2, X3 = W[0..15]\n\t"        \
02082     "vmovdqu      (%%" #reg "), %%xmm0\n\t"  \
02083     "vmovdqu    16(%%" #reg "), %%xmm1\n\t"  \
02084     VPSHUFB(X0, X0, BYTE_FLIP_MASK)          \
02085     VPSHUFB(X1, X1, BYTE_FLIP_MASK)          \
02086     "vmovdqu    32(%%" #reg "), %%xmm2\n\t"  \
02087     "vmovdqu    48(%%" #reg "), %%xmm3\n\t"  \
02088     VPSHUFB(X2, X2, BYTE_FLIP_MASK)          \
02089     VPSHUFB(X3, X3, BYTE_FLIP_MASK)
02090 
02091 #define LOAD_W_K_LOW(BYTE_FLIP_MASK, reg) \
02092        _LOAD_W_K_LOW(BYTE_FLIP_MASK, reg)
02093 
02094 
02095 #define _LOAD_W_K(BYTE_FLIP_Y_MASK, reg)      \
02096     "# X0, X1, X2, X3 = W[0..15]\n\t"         \
02097     "vmovdqu       (%%" #reg "), %%xmm0\n\t"  \
02098     "vmovdqu     16(%%" #reg "), %%xmm1\n\t"  \
02099     "vmovdqu     64(%%" #reg "), %%xmm4\n\t"  \
02100     "vmovdqu     80(%%" #reg "), %%xmm5\n\t"  \
02101     VINSERTI128(Y0, Y0, XTMP0, 1)             \
02102     VINSERTI128(Y1, Y1, XTMP1, 1)             \
02103     VPSHUFB(Y0, Y0, BYTE_FLIP_Y_MASK)         \
02104     VPSHUFB(Y1, Y1, BYTE_FLIP_Y_MASK)         \
02105     "vmovdqu     32(%%" #reg "), %%xmm2\n\t"  \
02106     "vmovdqu     48(%%" #reg "), %%xmm3\n\t"  \
02107     "vmovdqu     96(%%" #reg "), %%xmm6\n\t"  \
02108     "vmovdqu    112(%%" #reg "), %%xmm7\n\t"  \
02109     VINSERTI128(Y2, Y2, XTMP2, 1)             \
02110     VINSERTI128(Y3, Y3, XTMP3, 1)             \
02111     VPSHUFB(Y2, Y2, BYTE_FLIP_Y_MASK)         \
02112     VPSHUFB(Y3, Y3, BYTE_FLIP_Y_MASK)
02113 
02114 #define LOAD_W_K(BYTE_FLIP_Y_MASK, reg) \
02115        _LOAD_W_K(BYTE_FLIP_Y_MASK, reg)
02116 
02117 
02118 #define _SET_W_Y_4(i)  \
02119     "vpaddd (" #i "*8)+ 0+%[K], %%ymm0, %%ymm4\n\t" \
02120     "vpaddd (" #i "*8)+32+%[K], %%ymm1, %%ymm5\n\t" \
02121     "vmovdqu    %%ymm4, (" #i "*8)+ 0(" WK ")\n\t"      \
02122     "vmovdqu    %%ymm5, (" #i "*8)+32(" WK ")\n\t"      \
02123     "vpaddd (" #i "*8)+64+%[K], %%ymm2, %%ymm4\n\t" \
02124     "vpaddd (" #i "*8)+96+%[K], %%ymm3, %%ymm5\n\t" \
02125     "vmovdqu    %%ymm4, (" #i "*8)+64(" WK ")\n\t"      \
02126     "vmovdqu    %%ymm5, (" #i "*8)+96(" WK ")\n\t"
02127 
02128 #define SET_W_Y_4(i) \
02129        _SET_W_Y_4(i)
02130 
02131 
02132 static const ALIGN32 word64 mSHUF_Y_00BA[] =
02133     { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF,
02134       0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
02135 static const ALIGN32 word64 mSHUF_Y_DC00[] =
02136     { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100,
02137       0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
02138 static const ALIGN32 word64 mBYTE_FLIP_Y_MASK[] =
02139     { 0x0405060700010203, 0x0c0d0e0f08090a0b,
02140       0x0405060700010203, 0x0c0d0e0f08090a0b };
02141 
02142 #define _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \
02143     "vmovdqa    %[FLIP], %" #BYTE_FLIP_MASK "\n\t"          \
02144     "vmovdqa    %[SHUF00BA], %" #SHUF_00BA "\n\t"           \
02145     "vmovdqa    %[SHUFDC00], %" #SHUF_DC00 "\n\t"
02146 
02147 #define INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) \
02148        _INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
02149 
02150 static const ALIGN32 word32 K256[128] = {
02151     0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L,
02152     0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L,
02153     0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L,
02154     0x3956C25BL, 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L,
02155     0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L,
02156     0xD807AA98L, 0x12835B01L, 0x243185BEL, 0x550C7DC3L,
02157     0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L,
02158     0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L, 0xC19BF174L,
02159     0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
02160     0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
02161     0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL,
02162     0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL,
02163     0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L,
02164     0x983E5152L, 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L,
02165     0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L,
02166     0xC6E00BF3L, 0xD5A79147L, 0x06CA6351L, 0x14292967L,
02167     0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L,
02168     0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL, 0x53380D13L,
02169     0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
02170     0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
02171     0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L,
02172     0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L,
02173     0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L,
02174     0xD192E819L, 0xD6990624L, 0xF40E3585L, 0x106AA070L,
02175     0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L,
02176     0x19A4C116L, 0x1E376C08L, 0x2748774CL, 0x34B0BCB5L,
02177     0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L,
02178     0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL, 0x682E6FF3L,
02179     0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
02180     0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
02181     0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L,
02182     0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
02183 };
02184 
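/* Note: K256 lists the 64 standard round constants with each row of four
 * duplicated, so a single 256-bit vpaddd in SET_W_Y_4 adds the same
 * constants to the block-0 (low) and block-1 (high) lanes. */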
02185 SHA256_NOINLINE static int Transform_Sha256_AVX2(wc_Sha256* sha256)
02186 {
02187     __asm__ __volatile__ (
02188 
02189         "subq   $512, %%rsp\n\t"
02190         "leaq   32(%[sha256]), %%rax\n\t"
02191 
02192     INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
02193     LOAD_DIGEST()
02194 
02195     LOAD_W_K_LOW(BYTE_FLIP_MASK, rax)
02196 
02197         "movl   %%r9d, " L4 "\n\t"
02198         "movl   %%r12d, " L1 "\n\t"
02199         "xorl   %%r10d, " L4 "\n\t"
02200 
02201     SET_W_Y_4(0)
02202     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
02203     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  8)
02204     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
02205     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
02206 
02207     SET_W_Y_4(16)
02208     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
02209     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
02210     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
02211     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
02212 
02213     SET_W_Y_4(32)
02214     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
02215     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
02216     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
02217     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
02218 
02219     SET_W_Y_4(48)
02220     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  96)
02221     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
02222     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
02223     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
02224 
02225     STORE_ADD_DIGEST()
02226 
02227         "addq   $512, %%rsp\n\t"
02228 
02229         :
02230         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
02231           [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
02232           [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
02233           [sha256]   "r" (sha256),
02234           [K]        "m" (K256)
02235         : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
02236     );
02237 
02238     return 0;
02239 }
02240 
02241 SHA256_NOINLINE static int Transform_Sha256_AVX2_Len(wc_Sha256* sha256,
02242                                                      word32 len)
02243 {
02244     if ((len & WC_SHA256_BLOCK_SIZE) != 0) {
02245         XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE);
02246         Transform_Sha256_AVX2(sha256);
02247         sha256->data += WC_SHA256_BLOCK_SIZE;
02248         len -= WC_SHA256_BLOCK_SIZE;
02249         if (len == 0)
02250             return 0;
02251     }
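    /* Note: the loop below processes two blocks per iteration, so an odd
     * leading block (len & 64) is consumed first by the one-block transform
     * above; the data is copied into sha256->buffer because
     * Transform_Sha256_AVX2 reads its input from there
     * ("leaq 32(%[sha256]), %%rax"). */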
02252 
02253     __asm__ __volatile__ (
02254 
02255         "subq   $512, %%rsp\n\t"
02256         "movq   120(%[sha256]), %%rax\n\t"
02257 
02258     INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
02259     LOAD_DIGEST()
02260 
02261         "# Start of loop processing two blocks\n"
02262         "1:\n\t"
02263 
02264     LOAD_W_K(BYTE_FLIP_Y_MASK, rax)
02265 
02266         "movl   %%r9d, " L4 "\n\t"
02267         "movl   %%r12d, " L1 "\n\t"
02268         "xorl   %%r10d, " L4 "\n\t"
02269 
02270     SET_W_Y_4(0)
02271     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
02272     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  8)
02273     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
02274     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
02275 
02276     SET_W_Y_4(16)
02277     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
02278     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
02279     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
02280     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
02281 
02282     SET_W_Y_4(32)
02283     MsgSched_Y(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
02284     MsgSched_Y(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
02285     MsgSched_Y(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
02286     MsgSched_Y(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
02287 
02288     SET_W_Y_4(48)
02289     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  96)
02290     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
02291     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
02292     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
02293 
02294     ADD_DIGEST()
02295     STORE_DIGEST()
02296 
02297         "movl   %%r9d, " L4 "\n\t"
02298         "movl   %%r12d, " L1 "\n\t"
02299         "xorl   %%r10d, " L4 "\n\t"
02300 
02301     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,   4)
02302     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  12)
02303     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  20)
02304     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  28)
02305     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  36)
02306     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  44)
02307     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  52)
02308     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  60)
02309     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  68)
02310     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  76)
02311     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  84)
02312     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  92)
02313     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100)
02314     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108)
02315     RND_ALL_4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116)
02316     RND_ALL_4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124)
02317 
02318     ADD_DIGEST()
02319 
02320         "movq   120(%[sha256]), %%rax\n\t"
02321         "addq   $128, %%rax\n\t"
02322         "subl   $128, %[len]\n\t"
02323 
02324     STORE_DIGEST()
02325 
02326         "movq   %%rax, 120(%[sha256])\n\t"
02327         "jnz    1b\n\t"
02328 
02329         "addq   $512, %%rsp\n\t"
02330 
02331         :
02332         : [FLIP]     "m" (mBYTE_FLIP_Y_MASK[0]),
02333           [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
02334           [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
02335           [sha256]   "r" (sha256),
02336           [len]      "r" (len),
02337           [K]        "m" (K256)
02338         : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
02339     );
02340 
02341     return 0;
02342 }
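The macros above interleave message-schedule computation (MsgSched_Y) with compression rounds (RND_ALL_4), hashing two 64-byte blocks per loop iteration. For reference, a plain, non-interleaved C version of what one block contributes — the standard FIPS 180-4 compression function — is sketched below. This is an editor-added illustration, not part of the original file.

static uint32_t rotr(uint32_t x, int n)
{
    return (x >> n) | (x << (32 - n));
}

/* One-block compression, FIPS 180-4 style: s[0..7] = a..h on entry,
 * w[0..15] = the big-endian message words, k = the K256 round constants. */
static void sha256_block_ref(uint32_t s[8], uint32_t w[64], const uint32_t k[64])
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
    int t;

    /* message schedule expansion, done 8 words at a time by MsgSched_Y */
    for (t = 16; t < 64; t++) {
        uint32_t sig0 = rotr(w[t-15], 7) ^ rotr(w[t-15], 18) ^ (w[t-15] >> 3);
        uint32_t sig1 = rotr(w[t-2], 17) ^ rotr(w[t-2], 19)  ^ (w[t-2] >> 10);
        w[t] = w[t-16] + sig0 + w[t-7] + sig1;
    }

    /* 64 compression rounds, done 4 at a time by RND_ALL_4 */
    for (t = 0; t < 64; t++) {
        uint32_t t1 = h + (rotr(e, 6) ^ rotr(e, 11) ^ rotr(e, 25))
                        + ((e & f) ^ (~e & g)) + k[t] + w[t];
        uint32_t t2 = (rotr(a, 2) ^ rotr(a, 13) ^ rotr(a, 22))
                        + ((a & b) ^ (a & c) ^ (b & c));
        h = g; g = f; f = e; e = d + t1;
        d = c; c = b; b = a; a = t1 + t2;
    }

    /* feedback into the digest, matching ADD_DIGEST()/STORE_DIGEST() above */
    s[0] += a; s[1] += b; s[2] += c; s[3] += d;
    s[4] += e; s[5] += f; s[6] += g; s[7] += h;
}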
02343 
02344 #if defined(HAVE_INTEL_RORX)
02345 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX(wc_Sha256* sha256)
02346 {
02347     __asm__ __volatile__ (
02348 
02349         "subq   $512, %%rsp\n\t"
02350         "leaq   32(%[sha256]), %%rax\n\t"
02351 
02352     INIT_MASKS_Y(BYTE_FLIP_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
02353     LOAD_W_K_LOW(BYTE_FLIP_MASK, rax)
02354 
02355     LOAD_DIGEST()
02356 
02357         "movl   %%r9d, " L4 "\n\t"
02358         "rorx   $6, %%r12d, " L1 "\n\t"
02359         "xorl   %%r10d, " L4 "\n\t"
02360 
02361     SET_W_Y_4(0)
02362     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
02363     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  8)
02364     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
02365     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
02366 
02367     SET_W_Y_4(16)
02368     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
02369     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
02370     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
02371     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
02372 
02373     SET_W_Y_4(32)
02374     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
02375     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
02376     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
02377     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
02378 
02379     SET_W_Y_4(48)
02380         "xorl   " L3 ", " L3 "\n\t"
02381         "xorl   " L2 ", " L2 "\n\t"
02382     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  96)
02383     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
02384     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
02385     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
02386         /* Prev RND: h += Maj(a,b,c) */
02387         "addl   " L3 ", %%r8d\n\t"
02388 
02389     STORE_ADD_DIGEST()
02390 
02391         "addq   $512, %%rsp\n\t"
02392 
02393         :
02394         : [FLIP]     "m" (mBYTE_FLIP_MASK[0]),
02395           [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
02396           [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
02397           [sha256]   "r" (sha256),
02398           [K]        "m" (K256)
02399         : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
02400     );
02401 
02402     return 0;
02403 }
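The RORX variant differs mainly in how rotates are done: the BMI2 rorx instruction writes its result to a separate destination register and neither reads nor writes EFLAGS, so the Sigma0/Sigma1 rotates can be interleaved with flag-setting adds without stalls. A minimal sketch using the compiler intrinsic, assuming a BMI2-capable target and GCC/Clang's <immintrin.h> (editor illustration, not part of the original file):

#include <stdint.h>
#include <immintrin.h> /* _rorx_u32; compile with -mbmi2 */

/* Sigma1(e) = ROTR6(e) ^ ROTR11(e) ^ ROTR25(e), as used in each SHA-256
 * round. Unlike ror, rorx leaves the flags register untouched. */
static uint32_t Sigma1_rorx(uint32_t e)
{
    return _rorx_u32(e, 6) ^ _rorx_u32(e, 11) ^ _rorx_u32(e, 25);
}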
02404 
02405 SHA256_NOINLINE static int Transform_Sha256_AVX2_RORX_Len(wc_Sha256* sha256,
02406                                                           word32 len)
02407 {
02408     if ((len & WC_SHA256_BLOCK_SIZE) != 0) { /* odd number of blocks? */
02409         XMEMCPY(sha256->buffer, sha256->data, WC_SHA256_BLOCK_SIZE);
02410         Transform_Sha256_AVX2_RORX(sha256);  /* hash one block from buffer */
02411         sha256->data += WC_SHA256_BLOCK_SIZE;
02412         len -= WC_SHA256_BLOCK_SIZE;
02413         if (len == 0)
02414             return 0;
02415     }
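    /* Editor note: WC_SHA256_BLOCK_SIZE is 64 and the asm loop below consumes
     * 128 bytes (two blocks) per iteration, so (len & 64) != 0 is true exactly
     * when the block count is odd. Example: len = 192 (three blocks) takes the
     * branch above, hashes one block via the single-block transform, and
     * leaves 128 bytes for the two-block loop. */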
02416 
02417     __asm__ __volatile__ (
02418 
02419         "subq   $512, %%rsp\n\t"
02420         "movq   120(%[sha256]), %%rax\n\t"
02421 
02422     INIT_MASKS_Y(BYTE_FLIP_Y_MASK, SHUF_Y_00BA, SHUF_Y_DC00)
02423     LOAD_DIGEST()
02424 
02425         "# Start of loop processing two blocks\n"
02426         "1:\n\t"
02427 
02428     LOAD_W_K(BYTE_FLIP_Y_MASK, rax)
02429 
02430         "movl   %%r9d, " L4 "\n\t"
02431         "rorx   $6, %%r12d, " L1 "\n\t"
02432         "xorl   %%r10d, " L4 "\n\t"
02433 
02434     SET_W_Y_4(0)
02435     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  0)
02436     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  8)
02437     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 16)
02438     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 24)
02439 
02440     SET_W_Y_4(16)
02441     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 32)
02442     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 40)
02443     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 48)
02444     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 56)
02445 
02446     SET_W_Y_4(32)
02447     MsgSched_Y_RORX(Y0, Y1, Y2, Y3, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 64)
02448     MsgSched_Y_RORX(Y1, Y2, Y3, Y0, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 72)
02449     MsgSched_Y_RORX(Y2, Y3, Y0, Y1, S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 80)
02450     MsgSched_Y_RORX(Y3, Y0, Y1, Y2, S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 88)
02451 
02452     SET_W_Y_4(48)
02453         "xorl   " L3 ", " L3 "\n\t"
02454         "xorl   " L2 ", " L2 "\n\t"
02455     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  96)
02456     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 104)
02457     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 112)
02458     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 120)
02459         /* Prev RND: h += Maj(a,b,c) */
02460         "addl   " L3 ", %%r8d\n\t"
02461         "xorl   " L2 ", " L2 "\n\t"
02462 
02463     ADD_DIGEST()
02464     STORE_DIGEST()
02465 
02466         "movl   %%r9d, " L4 "\n\t"
02467         "xorl   " L3 ", " L3 "\n\t"
02468         "xorl   %%r10d, " L4 "\n\t"
02469 
02470     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,   4)
02471     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  12)
02472     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  20)
02473     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  28)
02474     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  36)
02475     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  44)
02476     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  52)
02477     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  60)
02478     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  68)
02479     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  76)
02480     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7,  84)
02481     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3,  92)
02482     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 100)
02483     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 108)
02484     RND_RORX_X4(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7, 116)
02485     RND_RORX_X4(S_4, S_5, S_6, S_7, S_0, S_1, S_2, S_3, 124)
02486         /* Prev RND: h += Maj(a,b,c) */
02487         "addl   " L3 ", %%r8d\n\t"
02488         "movq   120(%[sha256]), %%rax\n\t"
02489 
02490     ADD_DIGEST()
02491 
02492         "addq   $128, %%rax\n\t"
02493         "subl   $128, %[len]\n\t"
02494 
02495     STORE_DIGEST()
02496 
02497         "movq   %%rax, 120(%[sha256])\n\t"
02498         "jnz    1b\n\t"
02499 
02500         "addq   $512, %%rsp\n\t"
02501 
02502         :
02503         : [FLIP]     "m" (mBYTE_FLIP_Y_MASK[0]),
02504           [SHUF00BA] "m" (mSHUF_Y_00BA[0]),
02505           [SHUFDC00] "m" (mSHUF_Y_DC00[0]),
02506           [sha256]   "r" (sha256),
02507           [len]      "r" (len),
02508           [K]        "m" (K256)
02509         : WORK_REGS, STATE_REGS, YMM_REGS, "memory"
02510     );
02511 
02512     return 0;
02513 }
02514 #endif  /* HAVE_INTEL_RORX */
02515 #endif  /* HAVE_INTEL_AVX2 */
02516 
02517 
02518 #ifdef WOLFSSL_SHA224
02519 
02520 #ifdef STM32_HASH_SHA2
02521 
02522     /* Supports CubeMX HAL or Standard Peripheral Library */
02523 
02524     int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
02525     {
02526         if (sha224 == NULL)
02527             return BAD_FUNC_ARG;
02528 
02529         (void)devId;
02530         (void)heap;
02531 
02532         wc_Stm32_Hash_Init(&sha224->stmCtx);
02533         return 0;
02534     }
02535 
02536     int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
02537     {
02538         int ret = 0;
02539 
02540         if (sha224 == NULL || (data == NULL && len > 0)) {
02541             return BAD_FUNC_ARG;
02542         }
02543 
02544         ret = wolfSSL_CryptHwMutexLock();
02545         if (ret == 0) {
02546             ret = wc_Stm32_Hash_Update(&sha224->stmCtx,
02547                 HASH_AlgoSelection_SHA224, data, len);
02548             wolfSSL_CryptHwMutexUnLock();
02549         }
02550         return ret;
02551     }
02552 
02553     int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
02554     {
02555         int ret = 0;
02556 
02557         if (sha224 == NULL || hash == NULL) {
02558             return BAD_FUNC_ARG;
02559         }
02560 
02561         ret = wolfSSL_CryptHwMutexLock();
02562         if (ret == 0) {
02563             ret = wc_Stm32_Hash_Final(&sha224->stmCtx,
02564                 HASH_AlgoSelection_SHA224, hash, WC_SHA224_DIGEST_SIZE);
02565             wolfSSL_CryptHwMutexUnLock();
02566         }
02567 
02568         (void)wc_InitSha224(sha224); /* reset state */
02569 
02570         return ret;
02571     }
02572 
02573 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
02574     /* functions defined in wolfcrypt/src/port/caam/caam_sha256.c */
02575 #else
02576 
02577     #define NEED_SOFT_SHA224
02578 
02579 
02580     static int InitSha224(wc_Sha224* sha224)
02581     {
02582         int ret = 0;
02583 
02584         if (sha224 == NULL) {
02585             return BAD_FUNC_ARG;
02586         }
02587 
02588         sha224->digest[0] = 0xc1059ed8;
02589         sha224->digest[1] = 0x367cd507;
02590         sha224->digest[2] = 0x3070dd17;
02591         sha224->digest[3] = 0xf70e5939;
02592         sha224->digest[4] = 0xffc00b31;
02593         sha224->digest[5] = 0x68581511;
02594         sha224->digest[6] = 0x64f98fa7;
02595         sha224->digest[7] = 0xbefa4fa4;
02596 
02597         sha224->buffLen = 0;
02598         sha224->loLen   = 0;
02599         sha224->hiLen   = 0;
02600 
02601     #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
02602         /* choose the best Transform function for this runtime environment */
02603         Sha256_SetTransform();
02604     #endif
02605 
02606         return ret;
02607     }
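Aside (editor addition, not in the original file): the eight constants loaded by InitSha224() are the SHA-224 initial hash values from FIPS 180-4 section 5.3.2, defined as the second 32 bits of the fractional parts of the square roots of the 9th through 16th primes. Equivalently, each one is the low half of the corresponding SHA-384 initial value:

#include <stdint.h>

/* SHA-384 initial hash values (FIPS 180-4, section 5.3.4) */
static const uint64_t sha384_iv[8] = {
    0xcbbb9d5dc1059ed8ULL, 0x629a292a367cd507ULL,
    0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL,
    0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL,
    0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL,
};
/* For each i, (uint32_t)sha384_iv[i] equals sha224->digest[i] above,
 * e.g. (uint32_t)0xcbbb9d5dc1059ed8ULL == 0xc1059ed8. */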
02608 
02609 #endif
02610 
02611 #ifdef NEED_SOFT_SHA224
02612     int wc_InitSha224_ex(wc_Sha224* sha224, void* heap, int devId)
02613     {
02614         int ret = 0;
02615 
02616         if (sha224 == NULL)
02617             return BAD_FUNC_ARG;
02618 
02619         sha224->heap = heap;
02620 
02621         ret = InitSha224(sha224);
02622         if (ret != 0)
02623             return ret;
02624 
02625     #ifdef WOLFSSL_SMALL_STACK_CACHE
02626         sha224->W = NULL;
02627     #endif
02628 
02629     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
02630         ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
02631                             WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
02632     #else
02633         (void)devId;
02634     #endif /* WOLFSSL_ASYNC_CRYPT */
02635 
02636         return ret;
02637     }
02638 
02639     int wc_Sha224Update(wc_Sha224* sha224, const byte* data, word32 len)
02640     {
02641         int ret;
02642 
02643         if (sha224 == NULL || (data == NULL && len > 0)) {
02644             return BAD_FUNC_ARG;
02645         }
02646 
02647     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
02648         if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
02649         #if defined(HAVE_INTEL_QA)
02650             return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
02651         #endif
02652         }
02653     #endif /* WOLFSSL_ASYNC_CRYPT */
02654 
02655         ret = Sha256Update((wc_Sha256*)sha224, data, len);
02656 
02657         return ret;
02658     }
02659 
02660     int wc_Sha224Final(wc_Sha224* sha224, byte* hash)
02661     {
02662         int ret;
02663 
02664         if (sha224 == NULL || hash == NULL) {
02665             return BAD_FUNC_ARG;
02666         }
02667 
02668     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
02669         if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
02670         #if defined(HAVE_INTEL_QA)
02671             return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
02672                                             WC_SHA224_DIGEST_SIZE);
02673         #endif
02674         }
02675     #endif /* WOLFSSL_ASYNC_CRYPT */
02676 
02677         ret = Sha256Final((wc_Sha256*)sha224);
02678         if (ret != 0)
02679             return ret;
02680 
02681     #if defined(LITTLE_ENDIAN_ORDER)
02682         ByteReverseWords(sha224->digest, sha224->digest, WC_SHA224_DIGEST_SIZE);
02683     #endif
02684         XMEMCPY(hash, sha224->digest, WC_SHA224_DIGEST_SIZE); /* truncate to 224 bits */
02685 
02686         return InitSha224(sha224);  /* reset state */
02687     }
02688 #endif /* end of SHA224 software implementation */
02689 
02690     int wc_InitSha224(wc_Sha224* sha224)
02691     {
02692         return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
02693     }
02694 
02695     void wc_Sha224Free(wc_Sha224* sha224)
02696     {
02697         if (sha224 == NULL)
02698             return;
02699 
02700     #ifdef WOLFSSL_SMALL_STACK_CACHE
02701         if (sha224->W != NULL) {
02702             XFREE(sha224->W, NULL, DYNAMIC_TYPE_RNG);
02703             sha224->W = NULL;
02704         }
02705     #endif
02706 
02707     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
02708         wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
02709     #endif /* WOLFSSL_ASYNC_CRYPT */
02710     }
02711 #endif /* WOLFSSL_SHA224 */
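Usage sketch (editor addition, not part of the original file): a one-shot SHA-224 built from the wrappers above, assuming the usual wolfSSL typedefs (byte, word32) from the headers this file includes; sha224_oneshot is a hypothetical helper name.

#include <wolfcrypt/sha256.h> /* the wc_Sha224 API lives alongside SHA-256 */

static int sha224_oneshot(const byte* data, word32 len,
                          byte hash[WC_SHA224_DIGEST_SIZE])
{
    wc_Sha224 sha224;
    int ret = wc_InitSha224(&sha224);
    if (ret == 0) {
        ret = wc_Sha224Update(&sha224, data, len);
        if (ret == 0)
            ret = wc_Sha224Final(&sha224, hash); /* also resets the state */
        wc_Sha224Free(&sha224);
    }
    return ret;
}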
02712 
02713 
02714 int wc_InitSha256(wc_Sha256* sha256)
02715 {
02716     return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
02717 }
02718 
02719 void wc_Sha256Free(wc_Sha256* sha256)
02720 {
02721     if (sha256 == NULL)
02722         return;
02723 
02724 #ifdef WOLFSSL_SMALL_STACK_CACHE
02725     if (sha256->W != NULL) {
02726         XFREE(sha256->W, NULL, DYNAMIC_TYPE_RNG);
02727         sha256->W = NULL;
02728     }
02729 #endif
02730 
02731 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
02732     wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
02733 #endif /* WOLFSSL_ASYNC_CRYPT */
02734 }
02735 
02736 #endif /* !WOLFSSL_TI_HASH */
02737 #endif /* HAVE_FIPS */
02738 
02739 
02740 #ifndef WOLFSSL_TI_HASH
02741 #ifdef WOLFSSL_SHA224
02742     int wc_Sha224GetHash(wc_Sha224* sha224, byte* hash)
02743     {
02744         int ret;
02745         wc_Sha224 tmpSha224;
02746 
02747         if (sha224 == NULL || hash == NULL)
02748             return BAD_FUNC_ARG;
02749 
02750         ret = wc_Sha224Copy(sha224, &tmpSha224);
02751         if (ret == 0) {
02752             ret = wc_Sha224Final(&tmpSha224, hash);
02753             wc_Sha224Free(&tmpSha224);
02754         }
02755         return ret;
02756     }
02757     int wc_Sha224Copy(wc_Sha224* src, wc_Sha224* dst)
02758     {
02759         int ret = 0;
02760 
02761         if (src == NULL || dst == NULL)
02762             return BAD_FUNC_ARG;
02763 
02764         XMEMCPY(dst, src, sizeof(wc_Sha224));
02765     #ifdef WOLFSSL_SMALL_STACK_CACHE
02766         dst->W = NULL;
02767     #endif
02768 
02769     #ifdef WOLFSSL_ASYNC_CRYPT
02770         ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
02771     #endif
02772 
02773         return ret;
02774     }
02775 #endif /* WOLFSSL_SHA224 */
02776 
02777 int wc_Sha256GetHash(wc_Sha256* sha256, byte* hash)
02778 {
02779     int ret;
02780     wc_Sha256 tmpSha256;
02781 
02782     if (sha256 == NULL || hash == NULL)
02783         return BAD_FUNC_ARG;
02784 
02785     ret = wc_Sha256Copy(sha256, &tmpSha256);
02786     if (ret == 0) {
02787         ret = wc_Sha256Final(&tmpSha256, hash);
02788         wc_Sha256Free(&tmpSha256);
02789     }
02790     return ret;
02791 }
02792 int wc_Sha256Copy(wc_Sha256* src, wc_Sha256* dst)
02793 {
02794     int ret = 0;
02795 
02796     if (src == NULL || dst == NULL)
02797         return BAD_FUNC_ARG;
02798 
02799     XMEMCPY(dst, src, sizeof(wc_Sha256));
02800 #ifdef WOLFSSL_SMALL_STACK_CACHE
02801     dst->W = NULL;
02802 #endif
02803 
02804 #ifdef WOLFSSL_ASYNC_CRYPT
02805     ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
02806 #endif
02807 #ifdef WOLFSSL_PIC32MZ_HASH
02808     ret = wc_Pic32HashCopy(&src->cache, &dst->cache);
02809 #endif
02810 
02811     return ret;
02812 }
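As the implementations above show, wc_Sha256GetHash() copies the running state and finalizes only the copy, so an intermediate digest can be taken without disturbing the stream. A sketch (editor addition; hash_in_two_parts is a hypothetical helper name):

static int hash_in_two_parts(const byte* p1, word32 len1,
                             const byte* p2, word32 len2,
                             byte mid[WC_SHA256_DIGEST_SIZE],
                             byte full[WC_SHA256_DIGEST_SIZE])
{
    wc_Sha256 sha256;
    int ret = wc_InitSha256(&sha256);
    if (ret == 0)
        ret = wc_Sha256Update(&sha256, p1, len1);
    if (ret == 0)
        ret = wc_Sha256GetHash(&sha256, mid);  /* digest of p1 only */
    if (ret == 0)
        ret = wc_Sha256Update(&sha256, p2, len2);
    if (ret == 0)
        ret = wc_Sha256Final(&sha256, full);   /* digest of p1 || p2 */
    wc_Sha256Free(&sha256);
    return ret;
}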
02813 #endif /* !WOLFSSL_TI_HASH */
02814 
02815 #endif /* NO_SHA256 */
02816