Xuyi Wang / wolfcrypt

Dependents:   OS

Embed: (wiki syntax)

« Back to documentation index

sha512.c — wolfCrypt SHA-512/SHA-384 implementation, source file listing.

00001 /* sha512.c
00002  *
00003  * Copyright (C) 2006-2017 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 
00023 #ifdef HAVE_CONFIG_H
00024     #include <config.h>
00025 #endif
00026 
00027 #include <wolfcrypt/settings.h>
00028 
00029 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
00030 
00031 #if defined(HAVE_FIPS) && \
00032     defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)
00033 
00034     /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
00035     #define FIPS_NO_WRAPPERS
00036 
00037     #ifdef USE_WINDOWS_API
00038         #pragma code_seg(".fipsA$k")
00039         #pragma const_seg(".fipsB$k")
00040     #endif
00041 #endif
00042 
00043 #include <wolfcrypt/sha512.h>
00044 #include <wolfcrypt/error-crypt.h>
00045 #include <wolfcrypt/cpuid.h>
00046 
00047 /* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */
00048 #if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512)
00049     #define USE_SLOW_SHA512
00050 #endif
00051 
00052 /* fips wrapper calls, user can call direct */
00053 #if defined(HAVE_FIPS) && \
00054     (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
00055 
00056     #ifdef WOLFSSL_SHA512
00057 
00058         int wc_InitSha512(wc_Sha512* sha)
00059         {
00060             if (sha == NULL) {
00061                 return BAD_FUNC_ARG;
00062             }
00063 
00064             return InitSha512_fips(sha);
00065         }
00066         int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId)
00067         {
00068             (void)heap;
00069             (void)devId;
00070             if (sha == NULL) {
00071                 return BAD_FUNC_ARG;
00072             }
00073             return InitSha512_fips(sha);
00074         }
00075         int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len)
00076         {
00077             if (sha == NULL || (data == NULL && len > 0)) {
00078                 return BAD_FUNC_ARG;
00079             }
00080 
00081             return Sha512Update_fips(sha, data, len);
00082         }
00083         int wc_Sha512Final(wc_Sha512* sha, byte* out)
00084         {
00085             if (sha == NULL || out == NULL) {
00086                 return BAD_FUNC_ARG;
00087             }
00088 
00089             return Sha512Final_fips(sha, out);
00090         }
        /* Free is not supported by the FIPS module; kept for API symmetry. */
        void wc_Sha512Free(wc_Sha512* sha)
        {
            (void)sha;
            /* Not supported in FIPS */
        }
00096     #endif
00097 
00098     #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
00099         int wc_InitSha384(wc_Sha384* sha)
00100         {
00101             if (sha == NULL) {
00102                 return BAD_FUNC_ARG;
00103             }
00104             return InitSha384_fips(sha);
00105         }
00106         int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId)
00107         {
00108             (void)heap;
00109             (void)devId;
00110             if (sha == NULL) {
00111                 return BAD_FUNC_ARG;
00112             }
00113             return InitSha384_fips(sha);
00114         }
00115         int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len)
00116         {
00117             if (sha == NULL || (data == NULL && len > 0)) {
00118                 return BAD_FUNC_ARG;
00119             }
00120             return Sha384Update_fips(sha, data, len);
00121         }
00122         int wc_Sha384Final(wc_Sha384* sha, byte* out)
00123         {
00124             if (sha == NULL || out == NULL) {
00125                 return BAD_FUNC_ARG;
00126             }
00127             return Sha384Final_fips(sha, out);
00128         }
        /* Free is not supported by the FIPS module; kept for API symmetry. */
        void wc_Sha384Free(wc_Sha384* sha)
        {
            (void)sha;
            /* Not supported in FIPS */
        }
00134     #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */
00135 
00136 #else /* else build without fips, or for FIPS v2 */
00137 
00138 #include <wolfcrypt/logging.h>
00139 
00140 #ifdef NO_INLINE
00141     #include <wolfcrypt/misc.h>
00142 #else
00143     #define WOLFSSL_MISC_INCLUDED
00144     #include <wolfcrypt/src/misc.c>
00145 #endif
00146 
00147 
00148 #if defined(USE_INTEL_SPEEDUP)
00149     #define HAVE_INTEL_AVX1
00150 
00151     #if defined(__GNUC__) && ((__GNUC__ < 4) || \
00152                               (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
00153         #define NO_AVX2_SUPPORT
00154     #endif
00155     #if defined(__clang__) && ((__clang_major__ < 3) || \
00156                                (__clang_major__ == 3 && __clang_minor__ <= 5))
00157         #define NO_AVX2_SUPPORT
00158     #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
00159         #undef NO_AVX2_SUPPORT
00160     #endif
00161 
00162     #define HAVE_INTEL_AVX1
00163     #ifndef NO_AVX2_SUPPORT
00164         #define HAVE_INTEL_AVX2
00165     #endif
00166 #endif
00167 
00168 #if defined(HAVE_INTEL_AVX1)
00169     /* #define DEBUG_XMM  */
00170 #endif
00171 
00172 #if defined(HAVE_INTEL_AVX2)
00173     #define HAVE_INTEL_RORX
00174     /* #define DEBUG_YMM  */
00175 #endif
00176 
00177 #if defined(HAVE_BYTEREVERSE64) && \
00178         !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
00179     #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
00180     #define ByteReverseWords64_1(buf, size) \
00181         { unsigned int i ;\
00182             for(i=0; i< size/sizeof(word64); i++){\
00183                 __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
00184             }\
00185         }
00186 #endif
00187 
00188 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
00189     /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
00190 #else
00191 
00192 #ifdef WOLFSSL_SHA512
00193 
/* Reset a SHA-512 context to the initial hash state.
 * Returns 0 on success, BAD_FUNC_ARG when sha512 is NULL.
 */
static int InitSha512(wc_Sha512* sha512)
{
    if (sha512 == NULL)
        return BAD_FUNC_ARG;

    /* Initial hash values H(0), FIPS 180-4 section 5.3.5. */
    sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
    sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
    sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
    sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
    sha512->digest[4] = W64LIT(0x510e527fade682d1);
    sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
    sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
    sha512->digest[7] = W64LIT(0x5be0cd19137e2179);

    /* no buffered input yet; 128-bit message length starts at zero */
    sha512->buffLen = 0;
    sha512->loLen   = 0;
    sha512->hiLen   = 0;

    return 0;
}
00214 
00215 #endif /* WOLFSSL_SHA512 */
00216 
00217 /* Hardware Acceleration */
00218 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00219 
00220 #ifdef WOLFSSL_SHA512
00221 
00222     /*****
00223     Intel AVX1/AVX2 Macro Control Structure
00224 
00225     #if defined(HAVE_INteL_SPEEDUP)
00226         #define HAVE_INTEL_AVX1
00227         #define HAVE_INTEL_AVX2
00228     #endif
00229 
00230     int InitSha512(wc_Sha512* sha512) {
00231          Save/Recover XMM, YMM
00232          ...
00233 
00234          Check Intel AVX cpuid flags
00235     }
00236 
00237     #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00238       Transform_Sha512_AVX1(); # Function prototype
00239       Transform_Sha512_AVX2(); #
00240     #endif
00241 
00242       _Transform_Sha512() {     # Native Transform Function body
00243 
00244       }
00245 
00246       int Sha512Update() {
00247          Save/Recover XMM, YMM
00248          ...
00249       }
00250 
00251       int Sha512Final() {
00252          Save/Recover XMM, YMM
00253          ...
00254       }
00255 
00256 
00257     #if defined(HAVE_INTEL_AVX1)
00258 
00259        XMM Instructions/INLINE asm Definitions
00260 
00261     #endif
00262 
00263     #if defined(HAVE_INTEL_AVX2)
00264 
00265        YMM Instructions/INLINE asm Definitions
00266 
00267     #endif
00268 
00269     #if defnied(HAVE_INTEL_AVX1)
00270 
00271       int Transform_Sha512_AVX1() {
00272           Stitched Message Sched/Round
00273       }
00274 
00275     #endif
00276 
00277     #if defnied(HAVE_INTEL_AVX2)
00278 
00279       int Transform_Sha512_AVX2() {
00280           Stitched Message Sched/Round
00281       }
00282     #endif
00283 
00284     */
00285 
00286 
00287     /* Each platform needs to query info type 1 from cpuid to see if aesni is
00288      * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
00289      */
00290 
    /* Forward declarations for the CPU-specific block transforms. */
    #if defined(HAVE_INTEL_AVX1)
        static int Transform_Sha512_AVX1(wc_Sha512 *sha512);
        static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len);
    #endif
    #if defined(HAVE_INTEL_AVX2)
        static int Transform_Sha512_AVX2(wc_Sha512 *sha512);
        static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len);
        #if defined(HAVE_INTEL_RORX)
            static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512);
            static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512,
                                                      word32 len);
            static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512);
            static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512,
                                                      word32 len);
        #endif
    #endif
    static int _Transform_Sha512(wc_Sha512 *sha512);
    /* Dispatch pointers, installed once by Sha512_SetTransform(). */
    static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512;
    static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
    static int transform_check = 0;  /* 1 once a transform has been selected */
    static int intel_flags;          /* cached CPUID feature flags */
    #define Transform_Sha512(sha512)     (*Transform_Sha512_p)(sha512)
    #define Transform_Sha512_Len(sha512, len) \
        (*Transform_Sha512_Len_p)(sha512, len)
00315 
00316     static void Sha512_SetTransform()
00317     {
00318         if (transform_check)
00319             return;
00320 
00321         intel_flags = cpuid_get_flags();
00322 
00323     #if defined(HAVE_INTEL_AVX2)
00324         if (IS_INTEL_AVX2(intel_flags)) {
00325         #ifdef HAVE_INTEL_RORX
00326             if (IS_INTEL_BMI2(intel_flags)) {
00327                 Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
00328                 Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
00329             }
00330             else
00331         #endif
00332             if (1) {
00333                 Transform_Sha512_p = Transform_Sha512_AVX2;
00334                 Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
00335             }
00336         #ifdef HAVE_INTEL_RORX
00337             else {
00338                 Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
00339                 Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
00340             }
00341         #endif
00342         }
00343         else
00344     #endif
00345     #if defined(HAVE_INTEL_AVX1)
00346         if (IS_INTEL_AVX1(intel_flags)) {
00347             Transform_Sha512_p = Transform_Sha512_AVX1;
00348             Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
00349         }
00350         else
00351     #endif
00352             Transform_Sha512_p = _Transform_Sha512;
00353 
00354         transform_check = 1;
00355     }
00356 
00357     int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
00358     {
00359         int ret = InitSha512(sha512);
00360 
00361         (void)heap;
00362         (void)devId;
00363 
00364         Sha512_SetTransform();
00365 
00366         return ret;
00367     }
00368 
00369 #endif /* WOLFSSL_SHA512 */
00370 
00371 #else
00372     #define Transform_Sha512(sha512) _Transform_Sha512(sha512)
00373 
00374     #ifdef WOLFSSL_SHA512
00375 
    /* Initialize a SHA-512 context (generic, non-accelerated build).
     * heap  - allocator hint stored on the context
     * devId - async device id (unused unless async crypt is enabled)
     * Returns 0 on success, BAD_FUNC_ARG when sha512 is NULL, or an
     * async-init error code.
     */
    int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
    {
        int ret = 0;

        if (sha512 == NULL)
            return BAD_FUNC_ARG;

        sha512->heap = heap;

        ret = InitSha512(sha512);
        if (ret != 0)
            return ret;

    #ifdef WOLFSSL_SMALL_STACK_CACHE
        /* W is allocated lazily by the transform on first use */
        sha512->W = NULL;
    #endif

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
        ret = wolfAsync_DevCtxInit(&sha512->asyncDev,
                            WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId);
    #else
        (void)devId;
    #endif /* WOLFSSL_ASYNC_CRYPT */

        return ret;
    }
00402 
00403     #endif /* WOLFSSL_SHA512 */
00404 
00405 #endif /* Hardware Acceleration */
00406 
/* SHA-512 round constants K, FIPS 180-4 section 4.2.3: the first 64 bits of
 * the fractional parts of the cube roots of the first 80 prime numbers. */
static const word64 K512[80] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
00449 
/* blk0: load message word i from the current block. */
#define blk0(i) (W[i] = sha512->buffer[i])

/* blk2: message-schedule expansion over a 16-word circular window:
 * W[i] += s1(W[i-2]) + W[i-7] + s0(W[i-15])  (FIPS 180-4 section 6.4.2). */
#define blk2(i) (\
               W[ i     & 15] += \
            s1(W[(i-2)  & 15])+ \
               W[(i-7)  & 15] + \
            s0(W[(i-15) & 15])  \
        )

/* Logical functions Ch and Maj, FIPS 180-4 section 4.1.3. */
#define Ch(x,y,z)  (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & y) | (z & (x | y)))

/* Rotating aliases into the working-variable array T for round i. */
#define a(i) T[(0-i) & 7]
#define b(i) T[(1-i) & 7]
#define c(i) T[(2-i) & 7]
#define d(i) T[(3-i) & 7]
#define e(i) T[(4-i) & 7]
#define f(i) T[(5-i) & 7]
#define g(i) T[(6-i) & 7]
#define h(i) T[(7-i) & 7]

/* Big-sigma and small-sigma functions, FIPS 180-4 section 4.1.3. */
#define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39))
#define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41))
#define s0(x) (rotrFixed64(x,1)  ^ rotrFixed64(x,8)  ^ (x>>7))
#define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6))

/* R: one SHA-512 round; the first 16 rounds (j == 0) load the block via
 * blk0, later rounds expand the schedule via blk2. */
#define R(i) \
    h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? blk2(i) : blk0(i)); \
    d(i) += h(i); \
    h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))
00480 
/* Portable C compression function: process the single 128-byte block in
 * sha512->buffer (already big-endian 64-bit words) into sha512->digest.
 * Returns 0 on success or MEMORY_E when the schedule buffer cannot be
 * allocated (small-stack builds only).
 */
static int _Transform_Sha512(wc_Sha512* sha512)
{
    const word64* K = K512;
    word32 j;
    word64 T[8];

#ifdef WOLFSSL_SMALL_STACK_CACHE
    /* W is allocated once and cached on the context for later calls */
    word64* W = sha512->W;
    if (W == NULL) {
        W = (word64*) XMALLOC(sizeof(word64) * 16, NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
        if (W == NULL)
            return MEMORY_E;
        sha512->W = W;
    }
#elif defined(WOLFSSL_SMALL_STACK)
    /* W is heap-allocated per call to keep the stack frame small */
    word64* W;
    W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word64 W[16];
#endif

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

#ifdef USE_SLOW_SHA512
    /* over twice as small, but 50% slower */
    /* 80 operations, not unrolled */
    for (j = 0; j < 80; j += 16) {
        int m;
        for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
            R(m);
        }
    }
#else
    /* 80 operations, partially loop unrolled */
    for (j = 0; j < 80; j += 16) {
        R( 0); R( 1); R( 2); R( 3);
        R( 4); R( 5); R( 6); R( 7);
        R( 8); R( 9); R(10); R(11);
        R(12); R(13); R(14); R(15);
    }
#endif /* USE_SLOW_SHA512 */

    /* Add the working vars back into digest */
    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    ForceZero(W, sizeof(word64) * 16);
    ForceZero(T, sizeof(T));

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}
00547 
00548 
00549 static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
00550 {
00551     word64 tmp = sha512->loLen;
00552     if ( (sha512->loLen += len) < tmp)
00553         sha512->hiLen++;                       /* carry low to high */
00554 }
00555 
/* Feed len bytes of input into the hash state.
 * Partial blocks are buffered in sha512->buffer; each complete 128-byte
 * block is run through the (possibly CPU-accelerated) transform.
 * Returns 0 on success, BUFFER_E if the internal buffer index is corrupt,
 * or the transform's error code.
 */
static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
{
    int ret = 0;
    /* do block size increments */
    byte* local = (byte*)sha512->buffer;

    /* check that internal buffLen is valid */
    if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE)
        return BUFFER_E;

    /* top up any partially filled block first */
    if (sha512->buffLen > 0) {
        word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
        if (add > 0) {
            XMEMCPY(&local[sha512->buffLen], data, add);

            sha512->buffLen += add;
            data            += add;
            len             -= add;
        }

        if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) {
    #if defined(LITTLE_ENDIAN_ORDER)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            /* the AVX transforms byte-swap internally */
            if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
        #endif
            {
                ByteReverseWords64(sha512->buffer, sha512->buffer,
                                                          WC_SHA512_BLOCK_SIZE);
            }
    #endif
            ret = Transform_Sha512(sha512);
            if (ret == 0) {
                AddLength(sha512, WC_SHA512_BLOCK_SIZE);
                sha512->buffLen = 0;
            }
            else
                len = 0; /* abort further processing on transform failure */
        }
    }

#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    /* multi-block assembly path hashes straight from 'data' */
    if (Transform_Sha512_Len_p != NULL) {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        if (blocksLen > 0) {
            AddLength(sha512, blocksLen);
            sha512->data = data;
            /* Byte reversal performed in function if required. */
            Transform_Sha512_Len(sha512, blocksLen);
            data += blocksLen;
            len  -= blocksLen;
        }
    }
    else
#endif
#if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \
                            defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        AddLength(sha512, blocksLen);
        while (len >= WC_SHA512_BLOCK_SIZE) {
            XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);

            data += WC_SHA512_BLOCK_SIZE;
            len  -= WC_SHA512_BLOCK_SIZE;

            /* Byte reversal performed in function if required. */
            ret = Transform_Sha512(sha512);
            if (ret != 0)
                break;
        }
    }
#else
    {
        word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);

        AddLength(sha512, blocksLen);
        while (len >= WC_SHA512_BLOCK_SIZE) {
            XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);

            data += WC_SHA512_BLOCK_SIZE;
            len  -= WC_SHA512_BLOCK_SIZE;

            /* little-endian host: swap to big-endian words before transform */
            ByteReverseWords64(sha512->buffer, sha512->buffer,
                                                          WC_SHA512_BLOCK_SIZE);
            ret = Transform_Sha512(sha512);
            if (ret != 0)
                break;
        }
    }
#endif

    /* stash the remaining tail for the next call */
    if (len > 0) {
        XMEMCPY(local, data, len);
        sha512->buffLen = len;
    }

    return ret;
}
00656 
00657 #ifdef WOLFSSL_SHA512
00658 
00659 int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
00660 {
00661     if (sha512 == NULL || (data == NULL && len > 0)) {
00662         return BAD_FUNC_ARG;
00663     }
00664 
00665 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
00666     if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
00667     #if defined(HAVE_INTEL_QA)
00668         return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len);
00669     #endif
00670     }
00671 #endif /* WOLFSSL_ASYNC_CRYPT */
00672 
00673     return Sha512Update(sha512, data, len);
00674 }
00675 
00676 #endif /* WOLFSSL_SHA512 */
00677 
00678 #endif /* WOLFSSL_IMX6_CAAM */
00679 
/* Apply FIPS 180-4 padding, append the 128-bit bit length, and run the
 * final transform(s).  On return the digest words are in sha512->digest
 * (byte-swapped to host order on little-endian builds).
 * Returns 0 on success, BAD_FUNC_ARG on NULL, or a transform error.
 */
static WC_INLINE int Sha512Final(wc_Sha512* sha512)
{
    byte* local = (byte*)sha512->buffer;
    int ret;

    if (sha512 == NULL) {
        return BAD_FUNC_ARG;
    }

    AddLength(sha512, sha512->buffLen);               /* before adding pads */

    local[sha512->buffLen++] = 0x80;  /* add 1 */

    /* pad with zeros */
    if (sha512->buffLen > WC_SHA512_PAD_SIZE) {
        /* no room for the length words: flush one fully padded block first */
        XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
        sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen;
#if defined(LITTLE_ENDIAN_ORDER)
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
    #endif
        {
            ByteReverseWords64(sha512->buffer,sha512->buffer,
                                                             WC_SHA512_BLOCK_SIZE);
        }
#endif /* LITTLE_ENDIAN_ORDER */
        ret = Transform_Sha512(sha512);
        if (ret != 0)
            return ret;

        sha512->buffLen = 0;
    }
    XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);

    /* put lengths in bits */
    sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
                                                         (sha512->hiLen << 3);
    sha512->loLen = sha512->loLen << 3;

    /* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
    #endif
            ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_PAD_SIZE);
#endif
    /* ! length ordering dependent on digest endian type ! */

    sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
    sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    /* AVX transforms expect the whole block big-endian: swap the two
     * length words that were just stored in host order */
    if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
        ByteReverseWords64(&(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE);
#endif
    ret = Transform_Sha512(sha512);
    if (ret != 0)
        return ret;

    #ifdef LITTLE_ENDIAN_ORDER
        ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE);
    #endif

    return 0;
}
00746 
00747 #ifdef WOLFSSL_SHA512
00748 
/* Copy the current intermediate digest to 'hash' in big-endian order
 * without padding or finalizing; the hashing state is left untouched.
 * Returns 0 on success, BAD_FUNC_ARG on NULL arguments.
 */
int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash)
{
#ifdef LITTLE_ENDIAN_ORDER
    word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)];
#endif

    if (sha512 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#ifdef LITTLE_ENDIAN_ORDER
    /* swap into a temporary so the live state keeps its byte order */
    ByteReverseWords64((word64*)digest, (word64*)sha512->digest,
                                                         WC_SHA512_DIGEST_SIZE);
    XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE);
#else
    XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
#endif

    return 0;
}
00769 
/* Finalize the hash: write WC_SHA512_DIGEST_SIZE bytes to 'hash' and
 * re-initialize the context so it can be reused.
 * Returns 0 on success, BAD_FUNC_ARG on NULL arguments, or an error from
 * finalization / re-initialization.
 */
int wc_Sha512Final(wc_Sha512* sha512, byte* hash)
{
    int ret;

    if (sha512 == NULL || hash == NULL) {
        return BAD_FUNC_ARG;
    }

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha512(&sha512->asyncDev, hash, NULL,
                                            WC_SHA512_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final(sha512);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);

    return InitSha512(sha512);  /* reset state */
}
00795 
00796 
/* Convenience initializer: default heap, no async device. */
int wc_InitSha512(wc_Sha512* sha512)
{
    return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
}
00801 
/* Release resources held by the context (cached schedule buffer and/or
 * async device context).  Safe to call with NULL.
 */
void wc_Sha512Free(wc_Sha512* sha512)
{
    if (sha512 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    /* free the W buffer lazily allocated by _Transform_Sha512 */
    if (sha512->W != NULL) {
        XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        sha512->W = NULL;
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512);
#endif /* WOLFSSL_ASYNC_CRYPT */
}
00818 
00819 
00820 #if defined(HAVE_INTEL_AVX1)
00821 
00822 static word64 mBYTE_FLIP_MASK[] =  { 0x0001020304050607, 0x08090a0b0c0d0e0f };
00823 
/* xmm register assignments for the sliding 16-word message schedule */
#define W_0     xmm0
#define W_2     xmm1
#define W_4     xmm2
#define W_6     xmm3
#define W_8     xmm4
#define W_10    xmm5
#define W_12    xmm6
#define W_14    xmm7

#define W_M15   xmm12
#define W_M7    xmm13
#define MASK    xmm14

/* scratch vector registers */
#define XTMP1   xmm8
#define XTMP2   xmm9
#define XTMP3   xmm10
#define XTMP4   xmm11

/* clobber list for the inline-asm blocks */
#define XMM_REGS \
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",       \
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"

/* Helpers that stringize their operands into AVX instruction text for the
 * extended-asm templates below (two-level expansion so macro arguments are
 * expanded before # stringization). */
#define _VPALIGNR(dest, src1, src2, bits)                               \
    "vpalignr   $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPALIGNR(dest, src1, src2, bits) \
       _VPALIGNR(dest, src1, src2, bits)

#define _V_SHIFT_R(dest, src, bits)                             \
    "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t"
#define V_SHIFT_R(dest, src, bits) \
       _V_SHIFT_R(dest, src, bits)

#define _V_SHIFT_L(dest, src, bits)                             \
    "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t"
#define V_SHIFT_L(dest, src, bits) \
       _V_SHIFT_L(dest, src, bits)

#define _V_ADD(dest, src1, src2)                                \
    "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_ADD(dest, src1, src2) \
       _V_ADD(dest, src1, src2)

#define _V_XOR(dest, src1, src2)                                \
    "vpxor  %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_XOR(dest, src1, src2) \
       _V_XOR(dest, src1, src2)

#define _V_OR(dest, src1, src2)                                 \
    "vpor   %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
#define V_OR(dest, src1, src2) \
       _V_OR(dest, src1, src2)

/* general-purpose registers holding the eight working variables */
#define RA  %%r8
#define RB  %%r9
#define RC  %%r10
#define RD  %%r11
#define RE  %%r12
#define RF  %%r13
#define RG  %%r14
#define RH  %%r15

#define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"

/* scratch GPRs; WX addresses the W schedule kept on the stack */
#define L1  "%%rax"
#define L2  "%%rcx"
#define L3  "%%rdx"
#define L4  "%%rbx"
#define WX  "%%rsp"

#define WORK_REGS "rax", "rbx", "rcx", "rdx"
00894 
/* Stitched round fragments: each RND_0_x macro emits part of one SHA-512
 * round as inline-asm text, so scalar round computation can be interleaved
 * with vector message-schedule work by the transform bodies below. */
#define RND_0_1(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = e >>> 23 */                              \
    "rorq    $23, " L1 "\n\t"                    \

#define RND_0_2(a,b,c,d,e,f,g,h,i)                   \
    /* L3 = a */                                     \
    "movq   "#a", " L3 "\n\t"                    \
    /* L2 = f */                                     \
    "movq   "#f", " L2 "\n\t"                    \
    /* h += W_X[i] */                                \
    "addq   ("#i")*8(" WX "), "#h"\n\t"          \
    /* L2 = f ^ g */                                 \
    "xorq   "#g", " L2 "\n\t"                    \

#define RND_0_2_A(a,b,c,d,e,f,g,h,i)                 \
    /* L3 = a */                                     \
    "movq   "#a", " L3 "\n\t"                    \
    /* L2 = f */                                     \
    "movq   "#f", " L2 "\n\t"                    \

#define RND_0_2_B(a,b,c,d,e,f,g,h,i)                 \
    /* h += W_X[i] */                                \
    "addq   ("#i")*8(" WX "), "#h"\n\t"          \
    /* L2 = f ^ g */                                 \
    "xorq   "#g", " L2 "\n\t"                    \

#define RND_0_3(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = (e >>> 23) ^ e */                        \
    "xorq   "#e", " L1 "\n\t"                    \
    /* L2 = (f ^ g) & e */                           \
    "andq   "#e", " L2 "\n\t"                    \

#define RND_0_4(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = ((e >>> 23) ^ e) >>> 4 */                \
    "rorq    $4, " L1 "\n\t"                     \
    /* L2 = ((f ^ g) & e) ^ g */                     \
    "xorq   "#g", " L2 "\n\t"                    \
00932 
00933 #define RND_0_5(a,b,c,d,e,f,g,h,i)                   \
00934     /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */          \
00935     "xorq   "#e", " L1 "\n\t"                    \
00936     /* h += Ch(e,f,g) */                             \
00937     "addq   " L2 ", "#h"\n\t"                    \
00938 
00939 #define RND_0_6(a,b,c,d,e,f,g,h,i)                   \
00940     /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
00941     "rorq   $14, " L1 "\n\t"                     \
00942     /* L3 = a ^ b */                                 \
00943     "xorq   "#b", " L3 "\n\t"                    \
00944 
00945 #define RND_0_7(a,b,c,d,e,f,g,h,i)                   \
00946     /* h += Sigma1(e) */                             \
00947     "addq   " L1 ", "#h"\n\t"                    \
00948     /* L2 = a */                                     \
00949     "movq   "#a", " L2 "\n\t"                    \
00950 
00951 #define RND_0_8(a,b,c,d,e,f,g,h,i)                   \
00952     /* L4 = (a ^ b) & (b ^ c) */                     \
00953     "andq   " L3 ", " L4 "\n\t"                  \
00954     /* L2 = a >>> 5 */                               \
00955     "rorq   $5, " L2 "\n\t"                      \
00956 
00957 #define RND_0_9(a,b,c,d,e,f,g,h,i)                   \
00958     /* L2 = (a >>> 5) ^ a */                         \
00959     "xorq   "#a", " L2 "\n\t"                    \
00960     /* L4 = ((a ^ b) & (b ^ c) ^ b */                \
00961     "xorq   "#b", " L4 "\n\t"                    \
00962 
00963 #define RND_0_10(a,b,c,d,e,f,g,h,i)                  \
00964     /* L2 = ((a >>> 5) ^ a) >>> 6 */                 \
00965     "rorq    $6, " L2 "\n\t"                     \
00966     /* d += h */                                     \
00967     "addq   "#h", "#d"\n\t"                      \
00968 
00969 #define RND_0_11(a,b,c,d,e,f,g,h,i)                  \
00970     /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */           \
00971     "xorq   "#a", " L2 "\n\t"                    \
00972     /* h += Sigma0(a) */                             \
00973     "addq   " L4 ", "#h"\n\t"                    \
00974 
00975 #define RND_0_12(a,b,c,d,e,f,g,h,i)                  \
00976     /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 */  \
00977     "rorq   $28, " L2 "\n\t"                     \
00978     /* d (= e next RND) */                           \
00979     "movq   "#d", " L1 "\n\t"                    \
00980     /* h += Maj(a,b,c) */                            \
00981     "addq   " L2 ", "#h"\n\t"                    \
00982 
/* Scalar round pieces for odd rounds (RND_1_x).  Identical to RND_0_x
 * except the roles of L3 and L4 are swapped: here L3 carries b ^ c in
 * (it was the previous even round's a ^ b) and L4 carries a ^ b out. */
#define RND_1_1(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = e >>> 23 */                              \
    "rorq    $23, " L1 "\n\t"                    \

#define RND_1_2(a,b,c,d,e,f,g,h,i)                   \
    /* L4 = a */                                     \
    "movq   "#a", " L4 "\n\t"                    \
    /* L2 = f */                                     \
    "movq   "#f", " L2 "\n\t"                    \
    /* h += W_X[i] */                                \
    "addq   ("#i")*8(" WX "), "#h"\n\t"          \
    /* L2 = f ^ g */                                 \
    "xorq   "#g", " L2 "\n\t"                    \

/* RND_1_2 split in two halves for finer interleaving. */
#define RND_1_2_A(a,b,c,d,e,f,g,h,i)                 \
    /* L4 = a */                                     \
    "movq   "#a", " L4 "\n\t"                    \
    /* L2 = f */                                     \
    "movq   "#f", " L2 "\n\t"                    \

#define RND_1_2_B(a,b,c,d,e,f,g,h,i)                 \
    /* h += W_X[i] */                                \
    "addq   ("#i")*8(" WX "), "#h"\n\t"          \
    /* L2 = f ^ g */                                 \
    "xorq   "#g", " L2 "\n\t"                    \

#define RND_1_3(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = (e >>> 23) ^ e */                        \
    "xorq   "#e", " L1 "\n\t"                    \
    /* L2 = (f ^ g) & e */                           \
    "andq   "#e", " L2 "\n\t"                    \

#define RND_1_4(a,b,c,d,e,f,g,h,i)                   \
    /* ((e >>> 23) ^ e) >>> 4 */                     \
    "rorq    $4, " L1 "\n\t"                     \
    /* ((f ^ g) & e) ^ g */                          \
    "xorq   "#g", " L2 "\n\t"                    \

#define RND_1_5(a,b,c,d,e,f,g,h,i)                   \
    /* (((e >>> 23) ^ e) >>> 4) ^ e */               \
    "xorq   "#e", " L1 "\n\t"                    \
    /* h += Ch(e,f,g) */                             \
    "addq   " L2 ", "#h"\n\t"                    \

#define RND_1_6(a,b,c,d,e,f,g,h,i)                   \
    /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 */ \
    "rorq   $14, " L1 "\n\t"                     \
    /* L4 = a ^ b */                                 \
    "xorq   "#b", " L4 "\n\t"                    \

#define RND_1_7(a,b,c,d,e,f,g,h,i)                   \
    /* h += Sigma1(e) */                             \
    "addq   " L1 ", "#h"\n\t"                    \
    /* L2 = a */                                     \
    "movq   "#a", " L2 "\n\t"                    \

#define RND_1_8(a,b,c,d,e,f,g,h,i)                   \
    /* L3 = (a ^ b) & (b ^ c) */                     \
    "andq   " L4 ", " L3 "\n\t"                  \
    /* L2 = a >>> 5 */                               \
    "rorq   $5, " L2 "\n\t"                      \

#define RND_1_9(a,b,c,d,e,f,g,h,i)                   \
    /* L2 = (a >>> 5) ^ a */                         \
    "xorq   "#a", " L2 "\n\t"                    \
    /* L3 = ((a ^ b) & (b ^ c)) ^ b = Maj(a,b,c) */  \
    "xorq   "#b", " L3 "\n\t"                    \

#define RND_1_10(a,b,c,d,e,f,g,h,i)                  \
    /* L2 = ((a >>> 5) ^ a) >>> 6 */                 \
    "rorq    $6, " L2 "\n\t"                     \
    /* d += h */                                     \
    "addq   "#h", "#d"\n\t"                      \

#define RND_1_11(a,b,c,d,e,f,g,h,i)                  \
    /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */           \
    "xorq   "#a", " L2 "\n\t"                    \
    /* h += Maj(a,b,c) (in L3 since RND_1_9) */      \
    "addq   " L3 ", "#h"\n\t"                    \

#define RND_1_12(a,b,c,d,e,f,g,h,i)                  \
    /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 = Sigma0(a) */ \
    "rorq   $28, " L2 "\n\t"                     \
    /* d (= e next RND) */                           \
    "movq   "#d", " L1 "\n\t"                    \
    /* h += Sigma0(a) */                             \
    "addq   " L2 ", "#h"\n\t"                    \

01071 
/* MsgSched2: perform rounds i and i+1 while computing the next two
 * message-schedule words into W_0 (W[t] = W[t-16] + sigma0(W[t-15]) +
 * W[t-7] + sigma1(W[t-2])).  The scalar RND_0_x / RND_1_x pieces are
 * interleaved with the AVX shift/or/xor sequence that builds the sigma
 * rotations from plain shifts.  W_M15, W_M7 and XTMP1..4 are xmm
 * temporaries (defined earlier in this file).  Do not reorder: the
 * interleaving is tuned and the vector ops have data dependencies. */
#define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
            RND_0_1(a,b,c,d,e,f,g,h,i)                                  \
    VPALIGNR(W_M15, W_2, W_0, 8)                                        \
    VPALIGNR(W_M7, W_10, W_8, 8)                                        \
            RND_0_2(a,b,c,d,e,f,g,h,i)                                  \
    V_SHIFT_R(XTMP1, W_M15, 1)                                          \
    V_SHIFT_L(XTMP2, W_M15, 63)                                         \
            RND_0_3(a,b,c,d,e,f,g,h,i)                                  \
            RND_0_4(a,b,c,d,e,f,g,h,i)                                  \
    V_SHIFT_R(XTMP3, W_M15, 8)                                          \
    V_SHIFT_L(XTMP4, W_M15, 56)                                         \
            RND_0_5(a,b,c,d,e,f,g,h,i)                                  \
            RND_0_6(a,b,c,d,e,f,g,h,i)                                  \
    V_OR(XTMP1, XTMP2, XTMP1)                                           \
    V_OR(XTMP3, XTMP4, XTMP3)                                           \
            RND_0_7(a,b,c,d,e,f,g,h,i)                                  \
            RND_0_8(a,b,c,d,e,f,g,h,i)                                  \
    V_SHIFT_R(XTMP4, W_M15, 7)                                          \
    V_XOR(XTMP1, XTMP3, XTMP1)                                          \
            RND_0_9(a,b,c,d,e,f,g,h,i)                                  \
            RND_0_10(a,b,c,d,e,f,g,h,i)                                 \
    V_XOR(XTMP1, XTMP4, XTMP1)                                          \
    V_ADD(W_0, W_0, W_M7)                                               \
            RND_0_11(a,b,c,d,e,f,g,h,i)                                 \
            RND_0_12(a,b,c,d,e,f,g,h,i)                                 \
            RND_1_1(h,a,b,c,d,e,f,g,i+1)                                \
    V_ADD(W_0, W_0, XTMP1)                                              \
            RND_1_2(h,a,b,c,d,e,f,g,i+1)                                \
    V_SHIFT_R(XTMP1, W_14, 19)                                          \
    V_SHIFT_L(XTMP2, W_14, 45)                                          \
            RND_1_3(h,a,b,c,d,e,f,g,i+1)                                \
            RND_1_4(h,a,b,c,d,e,f,g,i+1)                                \
    V_SHIFT_R(XTMP3, W_14, 61)                                          \
    V_SHIFT_L(XTMP4, W_14, 3)                                           \
            RND_1_5(h,a,b,c,d,e,f,g,i+1)                                \
            RND_1_6(h,a,b,c,d,e,f,g,i+1)                                \
            RND_1_7(h,a,b,c,d,e,f,g,i+1)                                \
    V_OR(XTMP1, XTMP2, XTMP1)                                           \
    V_OR(XTMP3, XTMP4, XTMP3)                                           \
            RND_1_8(h,a,b,c,d,e,f,g,i+1)                                \
            RND_1_9(h,a,b,c,d,e,f,g,i+1)                                \
    V_XOR(XTMP1, XTMP3, XTMP1)                                          \
    V_SHIFT_R(XTMP4, W_14, 6)                                           \
            RND_1_10(h,a,b,c,d,e,f,g,i+1)                               \
            RND_1_11(h,a,b,c,d,e,f,g,i+1)                               \
    V_XOR(XTMP1, XTMP4, XTMP1)                                          \
            RND_1_12(h,a,b,c,d,e,f,g,i+1)                               \
    V_ADD(W_0, W_0, XTMP1)                                              \

/* RND_ALL_2: two plain compression rounds (i and i+1) with no message
 * scheduling — used for the final 16 rounds where W_X[] is complete.
 * Note the state rotation: round i+1 is the RND_1 variant with the
 * registers shifted by one (h becomes a, etc.). */
#define RND_ALL_2(a, b, c, d, e, f, g, h, i) \
    RND_0_1 (a, b, c, d, e, f, g, h, i )     \
    RND_0_2 (a, b, c, d, e, f, g, h, i )     \
    RND_0_3 (a, b, c, d, e, f, g, h, i )     \
    RND_0_4 (a, b, c, d, e, f, g, h, i )     \
    RND_0_5 (a, b, c, d, e, f, g, h, i )     \
    RND_0_6 (a, b, c, d, e, f, g, h, i )     \
    RND_0_7 (a, b, c, d, e, f, g, h, i )     \
    RND_0_8 (a, b, c, d, e, f, g, h, i )     \
    RND_0_9 (a, b, c, d, e, f, g, h, i )     \
    RND_0_10(a, b, c, d, e, f, g, h, i )     \
    RND_0_11(a, b, c, d, e, f, g, h, i )     \
    RND_0_12(a, b, c, d, e, f, g, h, i )     \
    RND_1_1 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_2 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_3 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_4 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_5 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_6 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_7 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_8 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_9 (h, a, b, c, d, e, f, g, i+1)    \
    RND_1_10(h, a, b, c, d, e, f, g, i+1)    \
    RND_1_11(h, a, b, c, d, e, f, g, i+1)    \
    RND_1_12(h, a, b, c, d, e, f, g, i+1)
01146 
01147 
01148 #if defined(HAVE_INTEL_RORX)
01149 
/* BMI2 round pieces.  rorxq computes each rotation directly into a
 * separate destination (no flags, no dependency on the accumulator),
 * so Sigma1(e) = (e>>>14)^(e>>>18)^(e>>>41) and
 * Sigma0(a) = (a>>>28)^(a>>>34)^(a>>>39) are formed without the nested
 * rotate chains used by RND_0_x/RND_1_x.  The "h += Maj" of a round is
 * deferred into the *next* round's _1 macro to shorten the critical
 * path; even rounds leave Maj in L4, odd rounds in L3. */
#define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \
    /* L1 = e>>>14 */                           \
    "rorxq  $14, "#e", " L1 "\n\t"          \
    /* L2 = e>>>18 */                           \
    "rorxq  $18, "#e", " L2 "\n\t"          \
    /* Prev RND: h += Maj(a,b,c) */             \
    "addq   " L3 ", "#a"\n\t"               \

#define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \
    /* h += w_k */                              \
    "addq   ("#i")*8(" WX "), "#h"\n\t"     \
    /* L3 = f */                                \
    "movq   "#f", " L3 "\n\t"               \
    /* L2 = (e>>>14) ^ (e>>>18) */              \
    "xorq   " L1 ", " L2 "\n\t"             \

#define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \
    /* L3 = f ^ g */                            \
    "xorq   "#g", " L3 "\n\t"               \
    /* L1 = e>>>41 */                           \
    "rorxq  $41, "#e", " L1 "\n\t"          \
    /* L1 = Sigma1(e) */                        \
    "xorq   " L2 ", " L1 "\n\t"             \

#define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \
    /* L3 = (f ^ g) & e */                      \
    "andq   "#e", " L3 "\n\t"               \
    /* h += Sigma1(e) */                        \
    "addq   " L1 ", "#h"\n\t"               \
    /* L1 = a>>>28 */                           \
    "rorxq  $28, "#a", " L1 "\n\t"          \

#define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = a>>>34 */                           \
    "rorxq  $34, "#a", " L2 "\n\t"          \
    /* L3 = Ch(e,f,g) */                        \
    "xorq   "#g", " L3 "\n\t"               \
    /* L2 = (a>>>28) ^ (a>>>34) */              \
    "xorq   " L1 ", " L2 "\n\t"             \

#define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = a>>>39 */                           \
    "rorxq  $39, "#a", " L1 "\n\t"          \
    /* h += Ch(e,f,g) */                        \
    "addq   " L3 ", "#h"\n\t"               \
    /* L1 = Sigma0(a) */                        \
    "xorq   " L2 ", " L1 "\n\t"             \

#define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \
    /* L3 = b */                                \
    "movq   "#b", " L3 "\n\t"               \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */  \
    "addq   "#h", "#d"\n\t"                 \
    /* L3 = a ^ b */                            \
    "xorq   "#a", " L3 "\n\t"               \

#define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \
    /* L4 = (a ^ b) & (b ^ c) */                \
    "andq   " L3 ", " L4 "\n\t"             \
    /* h += Sigma0(a) */                        \
    "addq   " L1 ", "#h"\n\t"               \
    /* L4 = Maj(a,b,c) */                       \
    "xorq   "#b", " L4 "\n\t"               \

#define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \
    /* L1 = e>>>14 */                           \
    "rorxq  $14, "#e", " L1 "\n\t"          \
    /* L2 = e>>>18 */                           \
    "rorxq  $18, "#e", " L2 "\n\t"          \
    /* Prev RND: h += Maj(a,b,c) */             \
    "addq   " L4 ", "#a"\n\t"               \

#define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \
    /* h += w_k */                              \
    "addq   ("#i")*8(" WX "), "#h"\n\t"     \
    /* L4 = f */                                \
    "movq   "#f", " L4 "\n\t"               \
    /* L2 = (e>>>14) ^ (e>>>18) */              \
    "xorq   " L1 ", " L2 "\n\t"             \

#define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \
    /* L4 = f ^ g */                            \
    "xorq   "#g", " L4 "\n\t"               \
    /* L1 = e>>>41 */                           \
    "rorxq  $41, "#e", " L1 "\n\t"          \
    /* L1 = Sigma1(e) */                        \
    "xorq   " L2 ", " L1 "\n\t"             \

#define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \
    /* L4 = (f ^ g) & e */                      \
    "andq   "#e", " L4 "\n\t"               \
    /* h += Sigma1(e) */                        \
    "addq   " L1 ", "#h"\n\t"               \
    /* L1 = a>>>28 */                           \
    "rorxq  $28, "#a", " L1 "\n\t"          \

#define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \
    /* L2 = a>>>34 */                           \
    "rorxq  $34, "#a", " L2 "\n\t"          \
    /* L4 = Ch(e,f,g) */                        \
    "xorq   "#g", " L4 "\n\t"               \
    /* L2 = (a>>>28) ^ (a>>>34) */              \
    "xorq   " L1 ", " L2 "\n\t"             \

#define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \
    /* L1 = a>>>39 */                           \
    "rorxq  $39, "#a", " L1 "\n\t"          \
    /* h += Ch(e,f,g) */                        \
    "addq   " L4 ", "#h"\n\t"               \
    /* L1 = Sigma0(a) */                        \
    "xorq   " L2 ", " L1 "\n\t"             \

#define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \
    /* L4 = b */                                \
    "movq   "#b", " L4 "\n\t"               \
    /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */  \
    "addq   "#h", "#d"\n\t"                 \
    /* L4 = a ^ b */                            \
    "xorq   "#a", " L4 "\n\t"               \

#define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \
    /* L3 = (a ^ b) & (b ^ c) */                \
    "andq   " L4 ", " L3 "\n\t"             \
    /* h += Sigma0(a) */                        \
    "addq   " L1 ", "#h"\n\t"               \
    /* L3 = Maj(a,b,c) */                       \
    "xorq   "#b", " L3 "\n\t"               \

/* Two BMI2 rounds (i, i+1) without message scheduling. */
#define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \
    RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0)     \
    RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1)     \
    RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1)     \

/* Four BMI2 rounds (i..i+3); state registers rotate by two per pair. */
#define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \
    RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0)   \
    RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2)
01299 
/* MsgSched_RORX: same message schedule + two rounds as MsgSched2, but
 * interleaved with the shorter BMI2 round sequence (8 pieces instead of
 * 12 per round).  Do not reorder the interleaving. */
#define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
            RND_RORX_0_1(a,b,c,d,e,f,g,h,i)                                 \
    VPALIGNR(W_M15, W_2, W_0, 8)                                            \
    VPALIGNR(W_M7, W_10, W_8, 8)                                            \
            RND_RORX_0_2(a,b,c,d,e,f,g,h,i)                                 \
    V_SHIFT_R(XTMP1, W_M15, 1)                                              \
    V_SHIFT_L(XTMP2, W_M15, 63)                                             \
            RND_RORX_0_3(a,b,c,d,e,f,g,h,i)                                 \
    V_SHIFT_R(XTMP3, W_M15, 8)                                              \
    V_SHIFT_L(XTMP4, W_M15, 56)                                             \
            RND_RORX_0_4(a,b,c,d,e,f,g,h,i)                                 \
    V_OR(XTMP1, XTMP2, XTMP1)                                               \
    V_OR(XTMP3, XTMP4, XTMP3)                                               \
            RND_RORX_0_5(a,b,c,d,e,f,g,h,i)                                 \
    V_SHIFT_R(XTMP4, W_M15, 7)                                              \
    V_XOR(XTMP1, XTMP3, XTMP1)                                              \
            RND_RORX_0_6(a,b,c,d,e,f,g,h,i)                                 \
    V_XOR(XTMP1, XTMP4, XTMP1)                                              \
    V_ADD(W_0, W_0, W_M7)                                                   \
            RND_RORX_0_7(a,b,c,d,e,f,g,h,i)                                 \
            RND_RORX_0_8(a,b,c,d,e,f,g,h,i)                                 \
    V_ADD(W_0, W_0, XTMP1)                                                  \
            RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1)                               \
    V_SHIFT_R(XTMP1, W_14, 19)                                              \
    V_SHIFT_L(XTMP2, W_14, 45)                                              \
            RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1)                               \
    V_SHIFT_R(XTMP3, W_14, 61)                                              \
    V_SHIFT_L(XTMP4, W_14, 3)                                               \
            RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1)                               \
    V_OR(XTMP1, XTMP2, XTMP1)                                               \
    V_OR(XTMP3, XTMP4, XTMP3)                                               \
            RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1)                               \
            RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1)                               \
    V_XOR(XTMP1, XTMP3, XTMP1)                                              \
    V_SHIFT_R(XTMP4, W_14, 6)                                               \
            RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1)                               \
            RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1)                               \
    V_XOR(XTMP1, XTMP4, XTMP1)                                              \
            RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1)                               \
    V_ADD(W_0, W_0, XTMP1)                                                  \

01341 #endif
01342 
/* Load the big-endian byte-flip mask operand into an xmm register. */
#define _INIT_MASK(mask) \
    "vmovdqu %[mask], %%" #mask "\n\t"
#define INIT_MASK(mask) \
       _INIT_MASK(mask)

/* Load two 16-byte chunks of the message block and byte-swap each
 * 64-bit word with the flip mask. */
#define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)           \
    "vmovdqu    " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t"   \
    "vmovdqu    " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t"   \
    "vpshufb    %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \
    "vpshufb    %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t"
#define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
       _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)

#define LOAD_W(mask, reg)                           \
    /* X0..3(xmm4..7), W[0..15] = buffer[0..15]; */ \
    LOAD_W_2(0, 1, W_0 , W_2 , mask, reg)           \
    LOAD_W_2(2, 3, W_4 , W_6 , mask, reg)           \
    LOAD_W_2(4, 5, W_8 , W_10, mask, reg)           \
    LOAD_W_2(6, 7, W_12, W_14, mask, reg)

/* W_X[i..i+3] = W[i..i+3] + K512[i..i+3]: pre-add the round constants
 * (pointed to by reg) into the stack area read by the round macros.
 * xmm8/xmm9 are used as scratch. */
#define _SET_W_X_2(xmm0, xmm1, reg, i)                          \
    "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \
    "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \
    "vmovdqu    %%xmm8, " #i "+ 0(" WX ")\n\t"                  \
    "vmovdqu    %%xmm9, " #i "+16(" WX ")\n\t"                  \

#define SET_W_X_2(xmm0, xmm1, reg, i) \
       _SET_W_X_2(xmm0, xmm1, reg, i)

#define SET_W_X(reg)                \
    SET_W_X_2(W_0 , W_2 , reg,  0)  \
    SET_W_X_2(W_4 , W_6 , reg, 32)  \
    SET_W_X_2(W_8 , W_10, reg, 64)  \
    SET_W_X_2(W_12, W_14, reg, 96)

/* Read the eight 64-bit digest words (offsets 0..56 of *sha512) into
 * r8..r15. */
#define LOAD_DIGEST()                     \
    "movq     (%[sha512]), %%r8 \n\t" \
    "movq    8(%[sha512]), %%r9 \n\t" \
    "movq   16(%[sha512]), %%r10\n\t" \
    "movq   24(%[sha512]), %%r11\n\t" \
    "movq   32(%[sha512]), %%r12\n\t" \
    "movq   40(%[sha512]), %%r13\n\t" \
    "movq   48(%[sha512]), %%r14\n\t" \
    "movq   56(%[sha512]), %%r15\n\t"

/* digest[0..7] += working state (combined add + store). */
#define STORE_ADD_DIGEST()                \
    "addq    %%r8,   (%[sha512])\n\t" \
    "addq    %%r9,  8(%[sha512])\n\t" \
    "addq   %%r10, 16(%[sha512])\n\t" \
    "addq   %%r11, 24(%[sha512])\n\t" \
    "addq   %%r12, 32(%[sha512])\n\t" \
    "addq   %%r13, 40(%[sha512])\n\t" \
    "addq   %%r14, 48(%[sha512])\n\t" \
    "addq   %%r15, 56(%[sha512])\n\t"

/* working state += digest[0..7] (digest memory unchanged). */
#define ADD_DIGEST()                      \
    "addq     (%[sha512]), %%r8 \n\t" \
    "addq    8(%[sha512]), %%r9 \n\t" \
    "addq   16(%[sha512]), %%r10\n\t" \
    "addq   24(%[sha512]), %%r11\n\t" \
    "addq   32(%[sha512]), %%r12\n\t" \
    "addq   40(%[sha512]), %%r13\n\t" \
    "addq   48(%[sha512]), %%r14\n\t" \
    "addq   56(%[sha512]), %%r15\n\t"

/* digest[0..7] = working state. */
#define STORE_DIGEST()                    \
    "movq    %%r8,   (%[sha512])\n\t" \
    "movq    %%r9,  8(%[sha512])\n\t" \
    "movq   %%r10, 16(%[sha512])\n\t" \
    "movq   %%r11, 24(%[sha512])\n\t" \
    "movq   %%r12, 32(%[sha512])\n\t" \
    "movq   %%r13, 40(%[sha512])\n\t" \
    "movq   %%r14, 48(%[sha512])\n\t" \
    "movq   %%r15, 56(%[sha512])\n\t"
01417 
01418 #endif /* HAVE_INTEL_AVX1 */
01419 
01420 
01421 /***  Transform Body ***/
01422 #if defined(HAVE_INTEL_AVX1)
/* Transform_Sha512_AVX1
 * Compress the single 128-byte block held at byte offset 64 of *sha512
 * (immediately after the eight digest words read by LOAD_DIGEST —
 * presumably sha512->buffer; confirm against the wc_Sha512 layout)
 * using AVX1 instructions, adding the result into the digest.
 * Always returns 0.
 * NOTE(review): the asm lowers %rsp by 136 bytes to hold W_X[0..15]
 * plus a loop counter, and restores it before the asm ends; %rsp is
 * not a declarable clobber, so this relies on the compiler not doing
 * stack-relative accesses inside the asm. */
static int Transform_Sha512_AVX1(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter. */
        "subq   $136, %%rsp\n\t"
        /* rax = message block (offset 64 into the struct) */
        "leaq   64(%[sha512]), %%rax\n\t"

    INIT_MASK(MASK)
    LOAD_DIGEST()

    LOAD_W(MASK, rax)

        /* loop counter: 4 iterations of 16 scheduled rounds (the final
         * 16 of the 80 rounds run after the loop without scheduling) */
        "movl   $4, 16*8(" WX ")\n\t"
        "leaq   %[K512], %%rsi\n\t"
        /* b */
        "movq   %%r9, " L4 "\n\t"
        /* e */
        "movq   %%r12, " L1 "\n\t"
        /* b ^ c  (L4 seeds the Maj() partial for round 0) */
        "xorq   %%r10, " L4 "\n\t"

        "# Start of 16 rounds\n"
        "1:\n\t"

    SET_W_X(rsi)

        "addq   $128, %%rsi\n\t"

    MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
    MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
    MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
    MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
    MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        "subl   $1, 16*8(" WX ")\n\t"
        "jne    1b\n\t"

    SET_W_X(rsi)

    /* Final 16 rounds: W_X[] already complete, no scheduling needed. */
    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

    STORE_ADD_DIGEST()

        "addq   $136, %%rsp\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
01489 
/* SHA-512 block transform: AVX1, multi-block version.
 *
 * Consumes len bytes of message data and folds them into the digest held
 * in the wc_Sha512 context.  The outer loop ("2:") processes one 128-byte
 * block per iteration: 64 rounds with message scheduling in a 4-pass inner
 * loop ("1:", 16 rounds per pass), then the final 16 rounds without
 * scheduling (80 rounds total).
 *
 * The source pointer is loaded from offset 224 of the context
 * (NOTE(review): presumably the wc_Sha512 data pointer field -- confirm
 * against the struct layout in sha512.h).
 *
 * sha512  SHA-512 state; digest words are loaded, updated and stored back.
 * len     byte count; the loop subtracts 128 per block and exits on zero,
 *         so callers must pass a non-zero multiple of 128.
 * Returns 0 always.
 */
static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len)
{
    __asm__ __volatile__ (

        /* rsi = message data pointer (context offset 224). */
        "movq   224(%[sha512]), %%rsi\n\t"
        /* rdx = address of the K512 round-constant table. */
        "leaq   %[K512], %%rdx\n\t"

    INIT_MASK(MASK)
    LOAD_DIGEST()

        "# Start of processing a block\n"
        "2:\n\t"

        /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
         * Debug needs more stack space. */
        "subq   $256, %%rsp\n\t"

    LOAD_W(MASK, rsi)

        /* Inner-loop counter: 4 passes of 16 scheduled rounds. */
        "movl   $4, 16*8(" WX ")\n\t"
        /* b */
        "movq   %%r9, " L4 "\n\t"
        /* e */
        "movq   %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq   %%r10, " L4 "\n\t"

    SET_W_X(rdx)

        "# Start of 16 rounds\n"
        "1:\n\t"

        /* Advance the K512 pointer to the next 16 constants and spill it;
         * the macros below may use rdx internally. */
        "addq   $128, %%rdx\n\t"
        "movq   %%rdx, 17*8(%%rsp)\n\t"

    MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
    MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
    MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
    MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
    MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Reload the saved K512 pointer. */
        "movq   17*8(%%rsp), %%rdx\n\t"

    SET_W_X(rdx)

        "subl   $1, 16*8(" WX ")\n\t"
        "jne    1b\n\t"

        /* Final 16 rounds of this block: no message scheduling needed. */
    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

    ADD_DIGEST()

        "addq   $256, %%rsp\n\t"
        /* Reset the K512 pointer and advance to the next 128-byte block. */
        "leaq   %[K512], %%rdx\n\t"
        "addq   $128, %%rsi\n\t"
        /* NOTE(review): %[len] is declared as an input ("m") operand but is
         * modified here; this leans on the "memory" clobber -- a "+m"
         * output constraint would express the write more strictly. */
        "subl   $128, %[len]\n\t"

    STORE_DIGEST()

        /* Loop while bytes remain (ZF from the subl above). */
        "jnz    2b\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK),
          [len]    "m" (len),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
01572 #endif /* HAVE_INTEL_AVX1 */
01573 
01574 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
/* SHA-512 transform of one 128-byte block using AVX1 vector instructions
 * together with RORX rotates in the scalar rounds (hence the
 * HAVE_INTEL_AVX2 && HAVE_INTEL_RORX guard on this section).
 *
 * Message data is read from offset 64 of the context (NOTE(review):
 * presumably the wc_Sha512 buffer field -- confirm struct layout).
 * The RORX round macros defer the previous round's "h += Maj(a,b,c)"
 * into L3, which is why L3 is zeroed up front and flushed into r8 after
 * the last round.
 *
 * sha512  SHA-512 state; digest is updated in place.
 * Returns 0 always.
 */
static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter and K512. */
        "subq   $144, %%rsp\n\t"
        "leaq   64(%[sha512]), %%rax\n\t"

    INIT_MASK(MASK)
    LOAD_DIGEST()

    LOAD_W(MASK, rax)

        /* Loop counter: 4 passes of 16 scheduled rounds (64 of 80). */
        "movl   $4, 16*8(" WX ")\n\t"
        "leaq   %[K512], %%rsi\n\t"
        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq   " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"

    SET_W_X(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        /* Advance to the next 16 round constants. */
        "addq   $128, %%rsi\n\t"

    MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
    MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
    MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
    MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
    MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

    SET_W_X(rsi)

        "subl   $1, 16*8(" WX ")\n\t"
        "jne    1b\n\t"

        /* Final 16 rounds: no message scheduling needed. */
    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Prev RND: h += Maj(a,b,c) */
        "addq   " L3 ", %%r8\n\t"
        "addq   $144, %%rsp\n\t"

    STORE_ADD_DIGEST()

        :
        : [mask]   "m" (mBYTE_FLIP_MASK),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
01643 
/* SHA-512 block transform: AVX1 + RORX, multi-block version.
 *
 * Same structure as Transform_Sha512_AVX1_Len but using the RORX round
 * macros; the K512 pointer lives in rcx here instead of rdx.  Processes
 * len bytes (a non-zero multiple of 128 -- the loop subtracts 128 per
 * block and exits on zero) from the pointer stored at offset 224 of the
 * context (NOTE(review): presumably the wc_Sha512 data pointer field --
 * confirm against the struct layout in sha512.h).
 *
 * sha512  SHA-512 state; digest words are loaded, updated and stored back.
 * len     byte count to consume.
 * Returns 0 always.
 */
static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len)
{
    __asm__ __volatile__ (

        /* rsi = message data pointer (context offset 224). */
        "movq   224(%[sha512]), %%rsi\n\t"
        /* rcx = address of the K512 round-constant table. */
        "leaq   %[K512], %%rcx\n\t"

    INIT_MASK(MASK)
    LOAD_DIGEST()

        "# Start of processing a block\n"
        "2:\n\t"

        /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
         * Debug needs more stack space. */
        "subq   $256, %%rsp\n\t"

    LOAD_W(MASK, rsi)

        /* Inner-loop counter: 4 passes of 16 scheduled rounds. */
        "movl   $4, 16*8(" WX ")\n\t"
        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq   " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"

    SET_W_X(rcx)

        "# Start of 16 rounds\n"
        "1:\n\t"

        /* Advance the K512 pointer to the next 16 constants and spill it;
         * the macros below may use rcx internally. */
        "addq   $128, %%rcx\n\t"
        "movq   %%rcx, 17*8(%%rsp)\n\t"

    MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
    MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
    MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
    MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
    MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Reload the saved K512 pointer. */
        "movq   17*8(%%rsp), %%rcx\n\t"

    SET_W_X(rcx)

        "subl   $1, 16*8(" WX ")\n\t"
        "jne    1b\n\t"

        /* NOTE(review): this second SET_W_X looks redundant -- the same
         * macro was just emitted before the branch with rcx unchanged.
         * Harmless if SET_W_X only loads W+K values; confirm with upstream
         * before removing. */
    SET_W_X(rcx)

        /* Final 16 rounds of this block: no message scheduling needed. */
    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

        /* Prev RND: h += Maj(a,b,c) */
        "addq   " L3 ", %%r8\n\t"
        "addq   $256, %%rsp\n\t"

    ADD_DIGEST()

        /* Reset the K512 pointer and advance to the next 128-byte block. */
        "leaq   %[K512], %%rcx\n\t"
        "addq   $128, %%rsi\n\t"
        /* NOTE(review): %[len] is declared as an input ("m") operand but is
         * modified here; this leans on the "memory" clobber -- a "+m"
         * output constraint would express the write more strictly. */
        "subl   $128, %[len]\n\t"

    STORE_DIGEST()

        /* Loop while bytes remain (ZF from the subl above). */
        "jnz    2b\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK),
          [len]    "m" (len),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
    );

    return 0;
}
01731 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
01732 
01733 #if defined(HAVE_INTEL_AVX2)
/* Byte-reversal shuffle pattern, one 64-bit lane repeated across 256 bits
 * (presumably consumed by the AVX2 INIT_MASK/byte-flip of big-endian
 * message words -- confirm against the INIT_MASK_Y macro).
 * Fix: entries must be exactly 64 bits wide, so use unsigned long long;
 * plain "unsigned long" is only 32 bits on LLP64 targets (e.g. MinGW-w64),
 * which would silently halve each entry. */
static const unsigned long long mBYTE_FLIP_MASK_Y[] =
   { 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL,
     0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL };
01737 
/* YMM register aliases for the AVX2 SHA-512 code.  The W_Y_n names hold
 * message-schedule words (four 64-bit lanes per YMM register; see the
 * W[-16]/W[-7]/W[-2] usage in the MsgSched*_AVX2 macros). */
#define W_Y_0       ymm0
#define W_Y_4       ymm1
#define W_Y_8       ymm2
#define W_Y_12      ymm3

/* Plain XMM/YMM aliases; X0/Y0 etc. view the same physical registers as
 * the W_Y_n names above. */
#define X0       xmm0
#define X1       xmm1
#define X2       xmm2
#define X3       xmm3
#define X4       xmm4
#define X5       xmm5
#define X6       xmm6
#define X7       xmm7
#define X8       xmm8
#define X9       xmm9
#define Y0       ymm0
#define Y1       ymm1
#define Y2       ymm2
#define Y3       ymm3
#define Y4       ymm4
#define Y5       ymm5
#define Y6       ymm6
#define Y7       ymm7

/* Schedule inputs W[-15], W[-7], W[-2], plus the byte-flip mask register. */
#define W_Y_M15     ymm12
#define W_Y_M7      ymm13
#define W_Y_M2      ymm14
#define MASK_Y      ymm15

/* Scratch registers for the shift/or rotate construction. */
#define YTMP1       ymm8
#define YTMP2       ymm9
#define YTMP3       ymm10
#define YTMP4       ymm11

/* Clobber list for the AVX2 transforms.
 * Fix: entry 8 previously read "xmm8" while every other entry used its
 * ymm name; xmm8/ymm8 alias the same physical register so behavior is
 * unchanged, but the list is now consistent. */
#define YMM_REGS \
    "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",       \
    "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
01775 
/* Helpers that stringify register names and immediates into AT&T-syntax
 * AVX2 instruction text for the inline-asm templates.  Each uses the
 * two-level (_X / X) pattern so macro arguments are expanded before
 * stringification. */

/* vperm2i128: select 128-bit lanes of src1/src2 into dest by imm sel.
 * Fix: mnemonic normalized to lower case -- the previous "vperm2I128"
 * only assembled because GAS mnemonics are case-insensitive. */
#define _VPERM2I128(dest, src1, src2, sel)                             \
    "vperm2i128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPERM2I128(dest, src1, src2, sel) \
       _VPERM2I128(dest, src1, src2, sel)

/* vpermq: permute the four 64-bit lanes of src by imm sel. */
#define _VPERMQ(dest, src, sel)                                        \
    "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t"
#define VPERMQ(dest, src, sel) \
       _VPERMQ(dest, src, sel)

/* vpblendd: blend 32-bit lanes of src1/src2 under imm mask sel. */
#define _VPBLENDD(dest, src1, src2, sel)                               \
    "vpblendd   $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
#define VPBLENDD(dest, src1, src2, sel) \
       _VPBLENDD(dest, src1, src2, sel)

/* vpaddq: dest = src1 + 64-bit words loaded from i*8(addr). */
#define _V_ADD_I(dest, src1, addr, i)                                  \
    "vpaddq  "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t"
#define V_ADD_I(dest, src1, addr, i) \
       _V_ADD_I(dest, src1, addr, i)

/* vmovdqu: unaligned store of src to i*8(addr). */
#define _VMOVDQU_I(addr, i, src)                                       \
    "vmovdqu     %%" #src ", " #i "*8(%%" #addr ")\n\t"
#define VMOVDQU_I(addr, i, src) \
       _VMOVDQU_I(addr, i, src)
01800 
/* Interleave four SHA-512 compression rounds (indices i..i+3; the working
 * variables a..h are rotated by the caller between invocations, matching
 * the rotated argument order of the nested RND_* calls) with the AVX2
 * message schedule that computes four new W words:
 *     W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2])
 * where s0(x) = (x >>> 1) ^ (x >>> 8) ^ (x >> 7) and
 *       s1(x) = (x >>> 19) ^ (x >>> 61) ^ (x >> 6)
 * (rotates are built from shift-left/shift-right pairs OR'd together,
 * since AVX2 lacks a 64-bit vector rotate).  The scalar RND_* steps and
 * the vector schedule ops are interleaved so they can execute in
 * parallel.  Result accumulates in-place into W_Y_0. */
#define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
            RND_0_1(a,b,c,d,e,f,g,h,i)                             \
    /* W[-13]..W[-15], W[-12] */                                   \
    VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03)                          \
    /* W[-5]..W[-7], W[-4] */                                      \
    VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03)                          \
            RND_0_2(a,b,c,d,e,f,g,h,i)                             \
            RND_0_3(a,b,c,d,e,f,g,h,i)                             \
    /* W_Y_M15 = W[-12]..W[-15] */                                 \
    VPERMQ(W_Y_M15, W_Y_M15, 0x39)                                 \
            RND_0_4(a,b,c,d,e,f,g,h,i)                             \
    /* W_Y_M7 = W[-4]..W[-7] */                                    \
    VPERMQ(W_Y_M7, W_Y_M7, 0x39)                                   \
            RND_0_5(a,b,c,d,e,f,g,h,i)                             \
            RND_0_6(a,b,c,d,e,f,g,h,i)                             \
    /* W[-15] >>  1 */                                             \
    V_SHIFT_R(YTMP1, W_Y_M15, 1)                                   \
            RND_0_7(a,b,c,d,e,f,g,h,i)                             \
    /* W[-15] << 63 */                                             \
    V_SHIFT_L(YTMP2, W_Y_M15, 63)                                  \
            RND_0_8(a,b,c,d,e,f,g,h,i)                             \
    /* W[-15] >>  8 */                                             \
    V_SHIFT_R(YTMP3, W_Y_M15, 8)                                   \
            RND_0_9(a,b,c,d,e,f,g,h,i)                             \
    /* W[-15] << 56 */                                             \
    V_SHIFT_L(YTMP4, W_Y_M15, 56)                                  \
            RND_0_10(a,b,c,d,e,f,g,h,i)                            \
    /* W[-15] >>> 1 */                                             \
    V_OR(YTMP1, YTMP2, YTMP1)                                      \
            RND_0_11(a,b,c,d,e,f,g,h,i)                            \
    /* W[-15] >>> 8 */                                             \
    V_OR(YTMP3, YTMP4, YTMP3)                                      \
            RND_0_12(a,b,c,d,e,f,g,h,i)                            \
            RND_1_1(h,a,b,c,d,e,f,g,i+1)                           \
    /* W[-15] >> 7 */                                              \
    V_SHIFT_R(YTMP4, W_Y_M15, 7)                                   \
            RND_1_2_A(h,a,b,c,d,e,f,g,i+1)                         \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */                          \
    V_XOR(YTMP1, YTMP3, YTMP1)                                     \
            RND_1_2_B(h,a,b,c,d,e,f,g,i+1)                         \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */          \
    V_XOR(YTMP1, YTMP4, YTMP1)                                     \
            RND_1_3(h,a,b,c,d,e,f,g,i+1)                           \
    /* W[0] = W[-16] + W[-7] */                                    \
    V_ADD(W_Y_0, W_Y_0, W_Y_M7)                                    \
            RND_1_4(h,a,b,c,d,e,f,g,i+1)                           \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) */                       \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                     \
            RND_1_5(h,a,b,c,d,e,f,g,i+1)                           \
    /* 0, 0, W[-1], W[-2] */                                       \
    VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81)                       \
            RND_1_6(h,a,b,c,d,e,f,g,i+1)                           \
            RND_1_7(h,a,b,c,d,e,f,g,i+1)                           \
            RND_1_8(h,a,b,c,d,e,f,g,i+1)                           \
    /* W[-2] >> 19 */                                              \
    V_SHIFT_R(YTMP1, W_Y_M2, 19)                                   \
            RND_1_9(h,a,b,c,d,e,f,g,i+1)                           \
    /* W[-2] << 45 */                                              \
    V_SHIFT_L(YTMP2, W_Y_M2, 45)                                   \
            RND_1_10(h,a,b,c,d,e,f,g,i+1)                          \
    /* W[-2] >> 61 */                                              \
    V_SHIFT_R(YTMP3, W_Y_M2, 61)                                   \
            RND_1_11(h,a,b,c,d,e,f,g,i+1)                          \
    /* W[-2] <<  3 */                                              \
    V_SHIFT_L(YTMP4, W_Y_M2, 3)                                    \
            RND_1_12(h,a,b,c,d,e,f,g,i+1)                          \
            RND_0_1(g,h,a,b,c,d,e,f,i+2)                           \
    /* W[-2] >>> 19 */                                             \
    V_OR(YTMP1, YTMP2, YTMP1)                                      \
            RND_0_2(g,h,a,b,c,d,e,f,i+2)                           \
    /* W[-2] >>> 61 */                                             \
    V_OR(YTMP3, YTMP4, YTMP3)                                      \
            RND_0_3(g,h,a,b,c,d,e,f,i+2)                           \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */                          \
    V_XOR(YTMP1, YTMP3, YTMP1)                                     \
            RND_0_4(g,h,a,b,c,d,e,f,i+2)                           \
    /* W[-2] >>  6 */                                              \
    V_SHIFT_R(YTMP4, W_Y_M2, 6)                                    \
            RND_0_5(g,h,a,b,c,d,e,f,i+2)                           \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */           \
    V_XOR(YTMP1, YTMP4, YTMP1)                                     \
            RND_0_6(g,h,a,b,c,d,e,f,i+2)                           \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */           \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                     \
            RND_0_7(g,h,a,b,c,d,e,f,i+2)                           \
            RND_0_8(g,h,a,b,c,d,e,f,i+2)                           \
    /* W[1], W[0], 0, 0 */                                         \
    VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08)                         \
            RND_0_9(g,h,a,b,c,d,e,f,i+2)                           \
            RND_0_10(g,h,a,b,c,d,e,f,i+2)                          \
    /* W[-2] >> 19 */                                              \
    V_SHIFT_R(YTMP1, W_Y_M2, 19)                                   \
            RND_0_11(g,h,a,b,c,d,e,f,i+2)                          \
    /* W[-2] << 45 */                                              \
    V_SHIFT_L(YTMP2, W_Y_M2, 45)                                   \
            RND_0_12(g,h,a,b,c,d,e,f,i+2)                          \
            RND_1_1(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[-2] >> 61 */                                              \
    V_SHIFT_R(YTMP3, W_Y_M2, 61)                                   \
            RND_1_2(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[-2] <<  3 */                                              \
    V_SHIFT_L(YTMP4, W_Y_M2, 3)                                    \
            RND_1_3(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[-2] >>> 19 */                                             \
    V_OR(YTMP1, YTMP2, YTMP1)                                      \
            RND_1_4(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[-2] >>> 61 */                                             \
    V_OR(YTMP3, YTMP4, YTMP3)                                      \
            RND_1_5(f,g,h,a,b,c,d,e,i+3)                           \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */                          \
    V_XOR(YTMP1, YTMP3, YTMP1)                                     \
            RND_1_6(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[-2] >>  6 */                                              \
    V_SHIFT_R(YTMP4, W_Y_M2, 6)                                    \
            RND_1_7(f,g,h,a,b,c,d,e,i+3)                           \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */           \
    V_XOR(YTMP1, YTMP4, YTMP1)                                     \
            RND_1_8(f,g,h,a,b,c,d,e,i+3)                           \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */           \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                     \
            RND_1_9(f,g,h,a,b,c,d,e,i+3)                           \
            RND_1_10(f,g,h,a,b,c,d,e,i+3)                          \
            RND_1_11(f,g,h,a,b,c,d,e,i+3)                          \
            RND_1_12(f,g,h,a,b,c,d,e,i+3)                          \

/* Two-round variant of the AVX2 round/schedule interleave: runs rounds i
 * and i+1 while advancing the message schedule, forming W[-15] and W[-7]
 * with VPALIGNR from adjacent W register pairs and taking W[-2] from
 * W_14.  Uses the same s0/s1 shift-pair rotate construction as
 * MsgSched4_AVX2; the new words accumulate in-place into W_0. */
#define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
            RND_0_1(a,b,c,d,e,f,g,h,i)                                       \
    VPALIGNR(W_Y_M15, W_2, W_0, 8)                                           \
    VPALIGNR(W_Y_M7, W_10, W_8, 8)                                           \
            RND_0_2(a,b,c,d,e,f,g,h,i)                                       \
    V_SHIFT_R(YTMP1, W_Y_M15, 1)                                             \
    V_SHIFT_L(YTMP2, W_Y_M15, 63)                                            \
            RND_0_3(a,b,c,d,e,f,g,h,i)                                       \
            RND_0_4(a,b,c,d,e,f,g,h,i)                                       \
    V_SHIFT_R(YTMP3, W_Y_M15, 8)                                             \
    V_SHIFT_L(YTMP4, W_Y_M15, 56)                                            \
            RND_0_5(a,b,c,d,e,f,g,h,i)                                       \
            RND_0_6(a,b,c,d,e,f,g,h,i)                                       \
    V_OR(YTMP1, YTMP2, YTMP1)                                                \
    V_OR(YTMP3, YTMP4, YTMP3)                                                \
            RND_0_7(a,b,c,d,e,f,g,h,i)                                       \
            RND_0_8(a,b,c,d,e,f,g,h,i)                                       \
    V_SHIFT_R(YTMP4, W_Y_M15, 7)                                             \
    V_XOR(YTMP1, YTMP3, YTMP1)                                               \
            RND_0_9(a,b,c,d,e,f,g,h,i)                                       \
            RND_0_10(a,b,c,d,e,f,g,h,i)                                      \
    V_XOR(YTMP1, YTMP4, YTMP1)                                               \
    V_ADD(W_0, W_0, W_Y_M7)                                                  \
            RND_0_11(a,b,c,d,e,f,g,h,i)                                      \
            RND_0_12(a,b,c,d,e,f,g,h,i)                                      \
            RND_1_1(h,a,b,c,d,e,f,g,i+1)                                     \
    V_ADD(W_0, W_0, YTMP1)                                                   \
            RND_1_2(h,a,b,c,d,e,f,g,i+1)                                     \
    V_SHIFT_R(YTMP1, W_14, 19)                                               \
    V_SHIFT_L(YTMP2, W_14, 45)                                               \
            RND_1_3(h,a,b,c,d,e,f,g,i+1)                                     \
            RND_1_4(h,a,b,c,d,e,f,g,i+1)                                     \
    V_SHIFT_R(YTMP3, W_14, 61)                                               \
    V_SHIFT_L(YTMP4, W_14, 3)                                                \
            RND_1_5(h,a,b,c,d,e,f,g,i+1)                                     \
            RND_1_6(h,a,b,c,d,e,f,g,i+1)                                     \
            RND_1_7(h,a,b,c,d,e,f,g,i+1)                                     \
    V_OR(YTMP1, YTMP2, YTMP1)                                                \
    V_OR(YTMP3, YTMP4, YTMP3)                                                \
            RND_1_8(h,a,b,c,d,e,f,g,i+1)                                     \
            RND_1_9(h,a,b,c,d,e,f,g,i+1)                                     \
    V_XOR(YTMP1, YTMP3, YTMP1)                                               \
    V_SHIFT_R(YTMP4, W_14, 6)                                                \
            RND_1_10(h,a,b,c,d,e,f,g,i+1)                                    \
            RND_1_11(h,a,b,c,d,e,f,g,i+1)                                    \
    V_XOR(YTMP1, YTMP4, YTMP1)                                               \
            RND_1_12(h,a,b,c,d,e,f,g,i+1)                                    \
    V_ADD(W_0, W_0, YTMP1)                                                   \

/* RORX variant of MsgSched4_AVX2: four rounds (i..i+3) using the
 * RND_RORX_* round steps interleaved with the same AVX2 message schedule.
 * In addition to computing the four new W words in W_Y_0, this "SET" form
 * also adds the round constants (loaded relative to rsi, see V_ADD_I) and
 * stores the W+K sums to the stack via VMOVDQU_I (presumably consumed by
 * later round macros reading i*8(%rsp) -- confirm against RND_RORX_*). */
#define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
            RND_RORX_0_1(a,b,c,d,e,f,g,h,i)                                 \
    /* W[-13]..W[-15], W[-12] */                                            \
    VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03)                                   \
    /* W[-5]..W[-7], W[-4] */                                               \
    VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03)                                   \
            RND_RORX_0_2(a,b,c,d,e,f,g,h,i)                                 \
    /* W_Y_M15 = W[-12]..W[-15] */                                          \
    VPERMQ(W_Y_M15, W_Y_M15, 0x39)                                          \
            RND_RORX_0_3(a,b,c,d,e,f,g,h,i)                                 \
    /* W_Y_M7 = W[-4]..W[-7] */                                             \
    VPERMQ(W_Y_M7, W_Y_M7, 0x39)                                            \
            RND_RORX_0_4(a,b,c,d,e,f,g,h,i)                                 \
    /* W[-15] >>  1 */                                                      \
    V_SHIFT_R(YTMP1, W_Y_M15, 1)                                            \
    /* W[-15] << 63 */                                                      \
    V_SHIFT_L(YTMP2, W_Y_M15, 63)                                           \
            RND_RORX_0_5(a,b,c,d,e,f,g,h,i)                                 \
    /* W[-15] >>  8 */                                                      \
    V_SHIFT_R(YTMP3, W_Y_M15, 8)                                            \
    /* W[-15] << 56 */                                                      \
    V_SHIFT_L(YTMP4, W_Y_M15, 56)                                           \
    /* W[-15] >>> 1 */                                                      \
    V_OR(YTMP1, YTMP2, YTMP1)                                               \
    /* W[-15] >>> 8 */                                                      \
    V_OR(YTMP3, YTMP4, YTMP3)                                               \
            RND_RORX_0_6(a,b,c,d,e,f,g,h,i)                                 \
    /* W[-15] >> 7 */                                                       \
    V_SHIFT_R(YTMP4, W_Y_M15, 7)                                            \
            RND_RORX_0_7(a,b,c,d,e,f,g,h,i)                                 \
    /* 0, 0, W[-1], W[-2] */                                                \
    VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81)                                \
            RND_RORX_0_8(a,b,c,d,e,f,g,h,i)                                 \
            RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1)                               \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */                                   \
    V_XOR(YTMP1, YTMP3, YTMP1)                                              \
            RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1)                               \
    /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */                   \
    V_XOR(YTMP1, YTMP4, YTMP1)                                              \
            RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1)                               \
    /* W[0] = W[-16] + W[-7] */                                             \
    V_ADD(W_Y_0, W_Y_0, W_Y_M7)                                             \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) */                                \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                              \
            RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1)                               \
    /* W[-2] >> 19 */                                                       \
    V_SHIFT_R(YTMP1, W_Y_M2, 19)                                            \
    /* W[-2] << 45 */                                                       \
    V_SHIFT_L(YTMP2, W_Y_M2, 45)                                            \
            RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1)                               \
    /* W[-2] >> 61 */                                                       \
    V_SHIFT_R(YTMP3, W_Y_M2, 61)                                            \
    /* W[-2] <<  3 */                                                       \
    V_SHIFT_L(YTMP4, W_Y_M2, 3)                                             \
    /* W[-2] >>> 19 */                                                      \
    V_OR(YTMP1, YTMP2, YTMP1)                                               \
            RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1)                               \
    /* W[-2] >>> 61 */                                                      \
    V_OR(YTMP3, YTMP4, YTMP3)                                               \
            RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1)                               \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */                                   \
    V_XOR(YTMP1, YTMP3, YTMP1)                                              \
            RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1)                               \
    /* W[-2] >>  6 */                                                       \
    V_SHIFT_R(YTMP4, W_Y_M2, 6)                                             \
            RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2)                               \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */                    \
    V_XOR(YTMP1, YTMP4, YTMP1)                                              \
            RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2)                               \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */                    \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                              \
            RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2)                               \
    /* W[1], W[0], 0, 0 */                                                  \
    VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08)                                  \
            RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2)                               \
            RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2)                               \
    /* W[-2] >> 19 */                                                       \
    V_SHIFT_R(YTMP1, W_Y_M2, 19)                                            \
    /* W[-2] << 45 */                                                       \
    V_SHIFT_L(YTMP2, W_Y_M2, 45)                                            \
            RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2)                               \
    /* W[-2] >> 61 */                                                       \
    V_SHIFT_R(YTMP3, W_Y_M2, 61)                                            \
    /* W[-2] <<  3 */                                                       \
    V_SHIFT_L(YTMP4, W_Y_M2, 3)                                             \
    /* W[-2] >>> 19 */                                                      \
    V_OR(YTMP1, YTMP2, YTMP1)                                               \
            RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2)                               \
    /* W[-2] >>> 61 */                                                      \
    V_OR(YTMP3, YTMP4, YTMP3)                                               \
            RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2)                               \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */                                   \
    V_XOR(YTMP1, YTMP3, YTMP1)                                              \
            RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3)                               \
    /* W[-2] >>  6 */                                                       \
    V_SHIFT_R(YTMP4, W_Y_M2, 6)                                             \
            RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3)                               \
            RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3)                               \
    /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */                    \
    V_XOR(YTMP1, YTMP4, YTMP1)                                              \
            RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3)                               \
            RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3)                               \
    /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */                    \
    V_ADD(W_Y_0, W_Y_0, YTMP1)                                              \
            RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3)                               \
    V_ADD_I(YTMP1, W_Y_0, rsi, i)                                           \
            RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3)                               \
            RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3)                               \
    VMOVDQU_I(rsp, i, YTMP1)                                                \

/* Two-block (interleaved) message schedule step, RORX round variant.
 * Each YMM register holds one 64-bit schedule word from each of the two
 * blocks being hashed in parallel. Computes
 *   W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2])
 * where s0(x) = (x >>> 1) ^ (x >>> 8) ^ (x >> 7) and
 *       s1(x) = (x >>> 19) ^ (x >>> 61) ^ (x >> 6);
 * rotates are built from paired V_SHIFT_R/V_SHIFT_L + V_OR since AVX2 has
 * no 64-bit vector rotate. Vector work is interleaved with integer
 * compression rounds i (RND_RORX_0_*) and i+1 (RND_RORX_1_*) to overlap
 * execution ports. The vector/scalar interleaving order is deliberate --
 * do not reorder lines. */
#define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,  \
                            f,g,h,i)                                       \
            RND_RORX_0_1(a,b,c,d,e,f,g,h,i)                                \
    VPALIGNR(W_Y_M15, W_2, W_0, 8)                                         \
    VPALIGNR(W_Y_M7, W_10, W_8, 8)                                         \
            RND_RORX_0_2(a,b,c,d,e,f,g,h,i)                                \
    V_SHIFT_R(YTMP1, W_Y_M15, 1)                                           \
    V_SHIFT_L(YTMP2, W_Y_M15, 63)                                          \
            RND_RORX_0_3(a,b,c,d,e,f,g,h,i)                                \
    V_SHIFT_R(YTMP3, W_Y_M15, 8)                                           \
    V_SHIFT_L(YTMP4, W_Y_M15, 56)                                          \
            RND_RORX_0_4(a,b,c,d,e,f,g,h,i)                                \
    V_OR(YTMP1, YTMP2, YTMP1)                                              \
    V_OR(YTMP3, YTMP4, YTMP3)                                              \
            RND_RORX_0_5(a,b,c,d,e,f,g,h,i)                                \
    V_SHIFT_R(YTMP4, W_Y_M15, 7)                                           \
    V_XOR(YTMP1, YTMP3, YTMP1)                                             \
            RND_RORX_0_6(a,b,c,d,e,f,g,h,i)                                \
    V_XOR(YTMP1, YTMP4, YTMP1)                                             \
    V_ADD(W_0, W_0, W_Y_M7)                                                \
            RND_RORX_0_7(a,b,c,d,e,f,g,h,i)                                \
            RND_RORX_0_8(a,b,c,d,e,f,g,h,i)                                \
    V_ADD(W_0, W_0, YTMP1)                                                 \
            RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1)                              \
    V_SHIFT_R(YTMP1, W_14, 19)                                             \
    V_SHIFT_L(YTMP2, W_14, 45)                                             \
            RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1)                              \
    V_SHIFT_R(YTMP3, W_14, 61)                                             \
    V_SHIFT_L(YTMP4, W_14, 3)                                              \
            RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1)                              \
    V_OR(YTMP1, YTMP2, YTMP1)                                              \
    V_OR(YTMP3, YTMP4, YTMP3)                                              \
            RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1)                              \
            RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1)                              \
    V_XOR(YTMP1, YTMP3, YTMP1)                                             \
    V_SHIFT_R(YTMP4, W_14, 6)                                              \
            RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1)                              \
            RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1)                              \
    V_XOR(YTMP1, YTMP4, YTMP1)                                             \
            RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1)                              \
    V_ADD(W_0, W_0, YTMP1)                                                 \

02127 
/* Load the byte-flip (endian-swap) shuffle mask from the [mask] asm operand
 * into the named YMM register. Two-level macro so the register-name
 * argument is fully expanded before stringification. */
#define _INIT_MASK_Y(mask)            \
    "vmovdqu %[mask], %%"#mask"\n\t"
#define INIT_MASK_Y(mask) \
       _INIT_MASK_Y(mask)
02132 
/* Load into YMM registers and swap endian. */
/* Load two 32-byte chunks at byte offsets i and i+32 from 'reg' into
 * ymm0/ymm1, then byte-swap each 64-bit lane to big-endian with vpshufb
 * and the flip mask. */
#define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i)           \
    /* buffer[0..15] => ymm0..ymm3;  */                       \
    "vmovdqu    " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t"       \
    "vmovdqu    " #i "+32(%%" #reg "), %%" #ymm1 "\n\t"       \
    "vpshufb    %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t"    \
    "vpshufb    %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t"

#define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \
       _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i)

/* Load all 16 message words (128 bytes) of one block into
 * W_Y_0/W_Y_4/W_Y_8/W_Y_12. */
#define LOAD_BLOCK_W_Y(mask, reg)                  \
    LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg,  0) \
    LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64)
02147 
/* Add the 64-bit round constants at byte offsets i and i+32 in 'reg' to
 * the schedule words in ymm0/ymm1 (results to scratch ymm2/ymm3) and spill
 * the K-added words to the stack work area at WX for the round macros. */
#define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)                    \
    "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t"  \
    "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t"  \
    "vmovdqu    %%" #ymm2 ", " #i "+ 0(" WX ")\n\t"                   \
    "vmovdqu    %%" #ymm3 ", " #i "+32(" WX ")\n\t"

#define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
       _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)

/* Spill the K-added schedule for all 16 words of one block. */
#define SET_BLOCK_W_Y(reg)                          \
    SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg,  0) \
    SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64)
02160 
/* Load into YMM registers and swap endian. */
/* Two-block variant: build Y0/Y1 with the low 128 bits taken from the
 * first block (offsets i, i+16) and the high 128 bits from the second
 * block (offsets i+128, i+144), then byte-swap every 64-bit lane. */
#define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)   \
    "vmovdqu    " #i "+  0(%%" #reg "), %%" #X0 "\n\t"                   \
    "vmovdqu    " #i "+ 16(%%" #reg "), %%" #X1 "\n\t"                   \
    "vmovdqu    " #i "+128(%%" #reg "), %%" #X8 "\n\t"                   \
    "vmovdqu    " #i "+144(%%" #reg "), %%" #X9 "\n\t"                   \
    "vinserti128    $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t"         \
    "vinserti128    $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t"         \
    "vpshufb    %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t"                   \
    "vpshufb    %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t"

#define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
       _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)

/* Load both 128-byte blocks into Y0..Y7 (one schedule word per block in
 * each YMM register). */
#define LOAD_BLOCK2_W_Y(mask, reg)                           \
    LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg,  0) \
    LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \
    LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \
    LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \

/* Spill the K-added interleaved schedule for all 8 registers. */
#define SET_BLOCK2_W_Y(reg)                   \
    SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg,   0) \
    SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg,  64) \
    SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \
    SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192)
02186 
/* SHA-512 round constants (FIPS 180-4), each duplicated so a 256-bit
 * vpaddq adds the same constant to the corresponding schedule word of the
 * two interleaved blocks: 80 constants x 2 = 160 entries. */
static const word64 K512_AVX2[160] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};
/* Loop-end sentinel for the two-block transforms: entry 128 = first
 * constant of round 64, i.e. the schedule loop stops after rounds 0..63
 * (the last 16 rounds are done outside the loop). */
static const word64* K512_AVX2_END = &K512_AVX2[128];
02270 
/* Compress one 128-byte block with AVX2, updating the digest in place.
 * The block is read from byte offset 64 of the context (presumably
 * sha512->buffer -- confirm against the wc_Sha512 struct layout).
 * Always returns 0. */
static int Transform_Sha512_AVX2(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter and K512. */
        "subq   $136, %%rsp\n\t"
        "leaq   64(%[sha512]), %%rax\n\t"

    INIT_MASK(MASK_Y)
    LOAD_DIGEST()

    LOAD_BLOCK_W_Y(MASK_Y, rax)

        /* Loop counter at WX+128: 4 iterations x 16 rounds = rounds 0..63;
         * the final 16 rounds run after the loop. */
        "movl   $4, 16*8(" WX ")\n\t"
        "leaq   %[K512], %%rsi\n\t"
        /* b */
        "movq   %%r9, " L4 "\n\t"
        /* e */
        "movq   %%r12, " L1 "\n\t"
        /* b ^ c */
        "xorq   %%r10, " L4 "\n\t"

    SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        /* Advance to the next 16 round constants (16 * 8 bytes). */
        "addq   $128, %%rsi\n\t"

    MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

    SET_BLOCK_W_Y(rsi)

        "subl   $1, 16*8(" WX ")\n\t"
        "jne    1b\n\t"

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

    STORE_ADD_DIGEST()

        "addq   $136, %%rsp\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
02333 
/* Compress 'len' bytes (a whole number of 128-byte blocks) from
 * sha512->data with AVX2, two blocks per pass. If the block count is odd
 * (len & 128), the first block is copied into the context buffer and
 * handled by the single-block transform. The data pointer is loaded from
 * byte offset 224 of the context -- NOTE(review): confirm this matches the
 * wc_Sha512 struct layout for this build. Always returns 0. */
static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len)
{
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        /* Odd number of blocks: consume one so the remainder is even. */
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        "movq   224(%[sha512]), %%rcx\n\t"

    INIT_MASK(MASK_Y)
    LOAD_DIGEST()

        "# Start of processing two blocks\n"
        "2:\n\t"

        /* Stack work area for the expanded, K-added schedule of both
         * blocks; rsp is walked forward 256 bytes per schedule pass. */
        "subq   $1344, %%rsp\n\t"
        "leaq   %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* e */
        "movq   %%r12, " L1 "\n\t"

    LOAD_BLOCK2_W_Y(MASK_Y, rcx)

        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
    SET_BLOCK2_W_Y(rsi)
    MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
    MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
    MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
    MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
    MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
    MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
    MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq   $256, %%rsi\n\t"
        "addq   $256, %%rsp\n\t"
        /* Loop until rounds 0..63 of the first block are scheduled. */
        "cmpq   %[K512_END], %%rsi\n\t"
        "jne    1b\n\t"

    SET_BLOCK2_W_Y(rsi)
    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        /* Rewind to the start of the spilled schedule for block two. */
        "subq   $1024, %%rsp\n\t"

    ADD_DIGEST()
    STORE_DIGEST()

        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* e */
        "movq   %%r12, " L1 "\n\t"
        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"

        /* Second block: replay all 80 rounds (5 x 16) from the spilled,
         * K-added schedule -- no recomputation needed. */
        "movq   $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

    RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
    RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
    RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
    RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq   $256, %%rsp\n\t"
        "subq   $1, %%rsi\n\t"
        "jnz    3b\n\t"

    ADD_DIGEST()

        "movq   224(%[sha512]), %%rcx\n\t"
        "addq   $64, %%rsp\n\t"
        "addq   $256, %%rcx\n\t"
        /* NOTE(review): [len] is declared as an input-only "m" operand but
         * is decremented here; a "+m" output operand would be safer. */
        "subl   $256, %[len]\n\t"
        "movq   %%rcx, 224(%[sha512])\n\t"

    STORE_DIGEST()

        "jnz    2b\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK_Y),
          [len]    "m" (len),
          [sha512] "r" (sha512),
          [K512]   "m" (K512_AVX2),
          [K512_END]   "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
02444 
02445 #ifdef HAVE_INTEL_RORX
/* One-block AVX2 transform, RORX (BMI2) round variant: rounds use rorx so
 * rotations do not touch the flags. The final round's Maj() result is kept
 * in L3 and folded into h (r8) after the rounds ("prev h" pattern).
 * Always returns 0. */
static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512)
{
    __asm__ __volatile__ (

        /* 16 Ws plus loop counter. */
        "subq   $136, %%rsp\n\t"
        "leaq   64(%[sha512]), " L2 "\n\t"

    INIT_MASK(MASK_Y)
    LOAD_DIGEST()

    LOAD_BLOCK_W_Y(MASK_Y, rcx)

        /* 4 iterations x 16 rounds = rounds 0..63. */
        "movl   $4, 16*8(" WX ")\n\t"
        "leaq   %[K512], %%rsi\n\t"
        /* b */
        "movq   %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq   " L3 ", " L3 "\n\t"
        /* b ^ c */
        "xorq   %%r10, " L4 "\n\t"

    SET_BLOCK_W_Y(rsi)

        "# Start of 16 rounds\n"
        "1:\n\t"

        "addq   $128, %%rsi\n\t"

    MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
    MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
    MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)

        "subl   $1, 16*8(%%rsp)\n\t"
        "jnz    1b\n\t"

    RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4)
    RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8)
    RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12)
        /* Prev RND: h += Maj(a,b,c) */
        "addq   " L3 ", %%r8\n\t"
        "addq   $136, %%rsp\n\t"

    STORE_ADD_DIGEST()

        :
        : [mask]   "m" (mBYTE_FLIP_MASK_Y),
          [sha512] "r" (sha512),
          [K512]   "m" (K512)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
02502 
/* Multi-block AVX2+RORX transform: processes 'len' bytes (whole blocks)
 * from sha512->data two blocks at a time; an odd leading block is handled
 * by the one-block RORX transform. Mirrors Transform_Sha512_AVX2_Len with
 * RORX rounds and the deferred-Maj (L3 -> r8) accumulation.
 * Always returns 0. */
static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len)
{
    if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
        /* Odd number of blocks: consume one so the remainder is even. */
        XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
        Transform_Sha512_AVX2_RORX(sha512);
        sha512->data += WC_SHA512_BLOCK_SIZE;
        len -= WC_SHA512_BLOCK_SIZE;
        if (len == 0)
            return 0;
    }

    __asm__ __volatile__ (

        "movq   224(%[sha512]), %%rax\n\t"

    INIT_MASK(MASK_Y)
    LOAD_DIGEST()

        "# Start of processing two blocks\n"
        "2:\n\t"

        /* Stack work area for the spilled, K-added schedule of both
         * blocks; rsp advances 256 bytes per schedule pass. */
        "subq   $1344, %%rsp\n\t"
        "leaq   %[K512], %%rsi\n\t"

        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq   " L3 ", " L3 "\n\t"

    LOAD_BLOCK2_W_Y(MASK_Y, rax)

        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"
        "\n"
        "1:\n\t"
    SET_BLOCK2_W_Y(rsi)
    MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
    MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
    MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
    MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
    MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
    MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
    MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
    MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
        "addq   $256, %%rsi\n\t"
        "addq   $256, %%rsp\n\t"
        /* Loop until rounds 0..63 of the first block are scheduled. */
        "cmpq   %[K512_END], %%rsi\n\t"
        "jne    1b\n\t"

    SET_BLOCK2_W_Y(rsi)
    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)

    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
        /* Fold the deferred Maj() of the last round into h. */
        "addq   " L3 ", %%r8\n\t"
        /* Rewind to the start of the spilled schedule for block two. */
        "subq   $1024, %%rsp\n\t"

    ADD_DIGEST()
    STORE_DIGEST()

        /* L4 = b */
        "movq   %%r9, " L4 "\n\t"
        /* L3 = 0 (add to prev h) */
        "xorq   " L3 ", " L3 "\n\t"
        /* L4 = b ^ c */
        "xorq   %%r10, " L4 "\n\t"

        /* Second block: replay all 80 rounds (5 x 16) from the spilled,
         * K-added schedule. */
        "movq   $5, %%rsi\n\t"
        "\n"
        "3:\n\t"
    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)

    RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
    RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
    RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
    RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
        "addq   $256, %%rsp\n\t"
        "subq   $1, %%rsi\n\t"
        "jnz    3b\n\t"

        "addq   " L3 ", %%r8\n\t"

    ADD_DIGEST()

        "movq   224(%[sha512]), %%rax\n\t"
        "addq   $64, %%rsp\n\t"
        "addq   $256, %%rax\n\t"
        /* NOTE(review): [len] is declared as an input-only "m" operand but
         * is decremented here; a "+m" output operand would be safer. */
        "subl   $256, %[len]\n\t"
        "movq   %%rax, 224(%[sha512])\n\t"

    STORE_DIGEST()

        "jnz    2b\n\t"

        :
        : [mask]   "m" (mBYTE_FLIP_MASK_Y),
          [len]    "m" (len),
          [sha512] "r" (sha512),
          [K512]   "m" (K512_AVX2),
          [K512_END]   "m" (K512_AVX2_END)
        : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
    );

    return 0;
}
02616 #endif /* HAVE_INTEL_RORX */
02617 #endif /* HAVE_INTEL_AVX2 */
02618 
02619 #endif /* WOLFSSL_SHA512 */
02620 
02621 
02622 /* -------------------------------------------------------------------------- */
02623 /* SHA384 */
02624 /* -------------------------------------------------------------------------- */
02625 #ifdef WOLFSSL_SHA384
02626 
02627 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
02628     /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
02629 #else
02630 
02631 static int InitSha384(wc_Sha384* sha384)
02632 {
02633     if (sha384 == NULL) {
02634         return BAD_FUNC_ARG;
02635     }
02636 
02637     sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
02638     sha384->digest[1] = W64LIT(0x629a292a367cd507);
02639     sha384->digest[2] = W64LIT(0x9159015a3070dd17);
02640     sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
02641     sha384->digest[4] = W64LIT(0x67332667ffc00b31);
02642     sha384->digest[5] = W64LIT(0x8eb44a8768581511);
02643     sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
02644     sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
02645 
02646     sha384->buffLen = 0;
02647     sha384->loLen   = 0;
02648     sha384->hiLen   = 0;
02649 
02650     return 0;
02651 }
02652 
02653 int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
02654 {
02655     if (sha384 == NULL || (data == NULL && len > 0)) {
02656         return BAD_FUNC_ARG;
02657     }
02658 
02659 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
02660     if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
02661     #if defined(HAVE_INTEL_QA)
02662         return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
02663     #endif
02664     }
02665 #endif /* WOLFSSL_ASYNC_CRYPT */
02666 
02667     return Sha512Update((wc_Sha512*)sha384, data, len);
02668 }
02669 
02670 
02671 int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
02672 {
02673 #ifdef LITTLE_ENDIAN_ORDER
02674     word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
02675 #endif
02676 
02677     if (sha384 == NULL || hash == NULL) {
02678         return BAD_FUNC_ARG;
02679     }
02680 
02681 #ifdef LITTLE_ENDIAN_ORDER
02682     ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
02683                                                          WC_SHA384_DIGEST_SIZE);
02684     XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
02685 #else
02686     XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
02687 #endif
02688 
02689     return 0;
02690 }
02691 
02692 int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
02693 {
02694     int ret;
02695 
02696     if (sha384 == NULL || hash == NULL) {
02697         return BAD_FUNC_ARG;
02698     }
02699 
02700 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
02701     if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
02702     #if defined(HAVE_INTEL_QA)
02703         return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
02704                                             WC_SHA384_DIGEST_SIZE);
02705     #endif
02706     }
02707 #endif /* WOLFSSL_ASYNC_CRYPT */
02708 
02709     ret = Sha512Final((wc_Sha512*)sha384);
02710     if (ret != 0)
02711         return ret;
02712 
02713     XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
02714 
02715     return InitSha384(sha384);  /* reset state */
02716 }
02717 
02718 
02719 /* Hardware Acceleration */
02720 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    /* Intel AVX1/AVX2 build: initialize the SHA-384 state and select the
     * assembly transform.
     * NOTE(review): 'heap' and 'devId' are discarded here, unlike the
     * generic variant which stores heap and may set up async support --
     * confirm this is intended for the accelerated path. */
    int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
    {
        int ret = InitSha384(sha384);  /* BAD_FUNC_ARG if sha384 is NULL */

        (void)heap;
        (void)devId;

        /* Presumably chooses the best SHA-512 transform for this CPU;
         * defined elsewhere in this file. */
        Sha512_SetTransform();

        return ret;
    }
02732 #else
/* Generic build: initialize the SHA-384 state, record the heap hint, and
 * (when async crypto is enabled) initialize the async device context.
 * Returns 0 on success, BAD_FUNC_ARG if sha384 is NULL, or an async
 * initialization error. */
int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
{
    int ret;

    if (sha384 == NULL) {
        return BAD_FUNC_ARG;
    }

    /* Store the heap hint before state init so async setup can use it. */
    sha384->heap = heap;
    ret = InitSha384(sha384);
    if (ret != 0)
        return ret;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    /* Lazily-allocated schedule cache; created on first use. */
    sha384->W = NULL;
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
                                                           sha384->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}
02759 #endif
02760 #endif /* WOLFSSL_IMX6_CAAM */
02761 
02762 int wc_InitSha384(wc_Sha384* sha384)
02763 {
02764     return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
02765 }
02766 
/* Release resources held by a SHA-384 context: the optional schedule
 * cache and, when async crypto is enabled, the async device context.
 * Safe to call with NULL. */
void wc_Sha384Free(wc_Sha384* sha384)
{
    if (sha384 == NULL)
        return;

#ifdef WOLFSSL_SMALL_STACK_CACHE
    if (sha384->W != NULL) {
        XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
        sha384->W = NULL;  /* guard against double free */
    }
#endif

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
#endif /* WOLFSSL_ASYNC_CRYPT */
}
02783 
02784 #endif /* WOLFSSL_SHA384 */
02785 
02786 #endif /* HAVE_FIPS */
02787 
02788 #ifdef WOLFSSL_SHA512
02789 
02790 int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
02791 {
02792     int ret;
02793     wc_Sha512 tmpSha512;
02794 
02795     if (sha512 == NULL || hash == NULL)
02796         return BAD_FUNC_ARG;
02797 
02798     ret = wc_Sha512Copy(sha512, &tmpSha512);
02799     if (ret == 0) {
02800         ret = wc_Sha512Final(&tmpSha512, hash);
02801         wc_Sha512Free(&tmpSha512);
02802     }
02803     return ret;
02804 }
02805 
02806 int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
02807 {
02808     int ret = 0;
02809 
02810     if (src == NULL || dst == NULL)
02811         return BAD_FUNC_ARG;
02812 
02813     XMEMCPY(dst, src, sizeof(wc_Sha512));
02814 #ifdef WOLFSSL_SMALL_STACK_CACHE
02815     dst->W = NULL;
02816 #endif
02817 
02818 #ifdef WOLFSSL_ASYNC_CRYPT
02819     ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
02820 #endif
02821 
02822     return ret;
02823 }
02824 
02825 #endif /* WOLFSSL_SHA512 */
02826 
02827 #ifdef WOLFSSL_SHA384
02828 
02829 int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
02830 {
02831     int ret;
02832     wc_Sha384 tmpSha384;
02833 
02834     if (sha384 == NULL || hash == NULL)
02835         return BAD_FUNC_ARG;
02836 
02837     ret = wc_Sha384Copy(sha384, &tmpSha384);
02838     if (ret == 0) {
02839         ret = wc_Sha384Final(&tmpSha384, hash);
02840         wc_Sha384Free(&tmpSha384);
02841     }
02842     return ret;
02843 }
02844 int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
02845 {
02846     int ret = 0;
02847 
02848     if (src == NULL || dst == NULL)
02849         return BAD_FUNC_ARG;
02850 
02851     XMEMCPY(dst, src, sizeof(wc_Sha384));
02852 #ifdef WOLFSSL_SMALL_STACK_CACHE
02853     dst->W = NULL;
02854 #endif
02855 
02856 #ifdef WOLFSSL_ASYNC_CRYPT
02857     ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
02858 #endif
02859 
02860     return ret;
02861 }
02862 
02863 #endif /* WOLFSSL_SHA384 */
02864 
02865 #endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
02866