wolf SSL / wolfSSL-TLS13-Beta

Fork of wolfSSL by wolf SSL

sha512.c Source File

/* sha512.c
 *
 * Copyright (C) 2006-2016 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */


#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#ifdef WOLFSSL_SHA512

#include <wolfssl/wolfcrypt/sha512.h>
#include <wolfssl/wolfcrypt/error-crypt.h>

/* fips wrapper calls, user can call direct */
#ifdef HAVE_FIPS
    int wc_InitSha512(Sha512* sha)
    {
        return InitSha512_fips(sha);
    }
    int wc_InitSha512_ex(Sha512* sha, void* heap, int devId)
    {
        (void)heap;
        (void)devId;
        return InitSha512_fips(sha);
    }
    int wc_Sha512Update(Sha512* sha, const byte* data, word32 len)
    {
        return Sha512Update_fips(sha, data, len);
    }
    int wc_Sha512Final(Sha512* sha, byte* out)
    {
        return Sha512Final_fips(sha, out);
    }
    void wc_Sha512Free(Sha512* sha)
    {
        (void)sha;
        /* Not supported in FIPS */
    }

    #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
        int wc_InitSha384(Sha384* sha)
        {
            return InitSha384_fips(sha);
        }
        int wc_InitSha384_ex(Sha384* sha, void* heap, int devId)
        {
            (void)heap;
            (void)devId;
            return InitSha384_fips(sha);
        }
        int wc_Sha384Update(Sha384* sha, const byte* data, word32 len)
        {
            return Sha384Update_fips(sha, data, len);
        }
        int wc_Sha384Final(Sha384* sha, byte* out)
        {
            return Sha384Final_fips(sha, out);
        }
        void wc_Sha384Free(Sha384* sha)
        {
            (void)sha;
            /* Not supported in FIPS */
        }
    #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */

#else /* else build without using fips */

#include <wolfssl/wolfcrypt/logging.h>

#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif


#if defined(USE_INTEL_SPEEDUP)
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2
#endif

#if defined(HAVE_INTEL_AVX1)
    /* #define DEBUG_XMM  */
#endif

#if defined(HAVE_INTEL_AVX2)
    #define HAVE_INTEL_RORX
    /* #define DEBUG_YMM  */
#endif


#if defined(HAVE_INTEL_RORX)
    #define ROTR(func, bits, x) \
    word64 func(word64 x) {  word64 ret ;\
        __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\
        return ret ;\
    }

    static INLINE ROTR(rotrFixed64_28, 28, x);
    static INLINE ROTR(rotrFixed64_34, 34, x);
    static INLINE ROTR(rotrFixed64_39, 39, x);
    static INLINE ROTR(rotrFixed64_14, 14, x);
    static INLINE ROTR(rotrFixed64_18, 18, x);
    static INLINE ROTR(rotrFixed64_41, 41, x);

    #define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x))
    #define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x))
#endif /* HAVE_INTEL_RORX */

#if defined(HAVE_BYTEREVERSE64) && \
        !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
    #define ByteReverseWords64_1(buf, size) \
        { unsigned int i ;\
            for(i=0; i< size/sizeof(word64); i++){\
                __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
            }\
        }
#endif

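/* Initial hash values from FIPS 180-4: the first 64 bits of the fractional
 * parts of the square roots of the first eight primes (2 through 19) */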
static int InitSha512(Sha512* sha512)
{
    if (sha512 == NULL)
        return BAD_FUNC_ARG;

    sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
    sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
    sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
    sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
    sha512->digest[4] = W64LIT(0x510e527fade682d1);
    sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
    sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
    sha512->digest[7] = W64LIT(0x5be0cd19137e2179);

    sha512->buffLen = 0;
    sha512->loLen   = 0;
    sha512->hiLen   = 0;

    return 0;
}


/* Hardware Acceleration */
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)

    /*****
    Intel AVX1/AVX2 Macro Control Structure

    #if defined(HAVE_INTEL_SPEEDUP)
        #define HAVE_INTEL_AVX1
        #define HAVE_INTEL_AVX2
    #endif

    int InitSha512(Sha512* sha512) {
         Save/Recover XMM, YMM
         ...

         Check Intel AVX cpuid flags
    }

    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
      Transform_AVX1(); # Function prototype
      Transform_AVX2(); #
    #endif

      _Transform() {     # Native Transform Function body

      }

      int Sha512Update() {
         Save/Recover XMM, YMM
         ...
      }

      int Sha512Final() {
         Save/Recover XMM, YMM
         ...
      }


    #if defined(HAVE_INTEL_AVX1)

       XMM Instructions/INLINE asm Definitions

    #endif

    #if defined(HAVE_INTEL_AVX2)

       YMM Instructions/INLINE asm Definitions

    #endif

    #if defined(HAVE_INTEL_AVX1)

      int Transform_AVX1() {
          Stitched Message Sched/Round
      }

    #endif

    #if defined(HAVE_INTEL_AVX2)

      int Transform_AVX2() {
          Stitched Message Sched/Round
      }
    #endif

    */


    /* Each platform needs to query info type 1 from cpuid to see if AVX is
     * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
     */

    #ifndef _MSC_VER
        #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
                "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
                "a" (leaf), "c"(sub));

        #define XASM_LINK(f) asm(f)
    #else

        #include <intrin.h>
        /* __cpuidex supplies the sub-leaf, matching the three-argument
         * cpuid(reg, leaf, sub) call sites below */
        #define cpuid(reg, leaf, sub) __cpuidex((int*)reg, leaf, sub)

        #define XASM_LINK(f)
    #endif /* _MSC_VER */

    #define EAX 0
    #define EBX 1
    #define ECX 2
    #define EDX 3

    #define CPUID_AVX1   0x1
    #define CPUID_AVX2   0x2
    #define CPUID_RDRAND 0x4
    #define CPUID_RDSEED 0x8
    #define CPUID_BMI2   0x10   /* MULX, RORX */

    #define IS_INTEL_AVX1       (cpuid_flags & CPUID_AVX1)
    #define IS_INTEL_AVX2       (cpuid_flags & CPUID_AVX2)
    #define IS_INTEL_BMI2       (cpuid_flags & CPUID_BMI2)
    #define IS_INTEL_RDRAND     (cpuid_flags & CPUID_RDRAND)
    #define IS_INTEL_RDSEED     (cpuid_flags & CPUID_RDSEED)

    static word32 cpuid_check = 0;
    static word32 cpuid_flags = 0;

    static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
        int got_intel_cpu = 0;
        unsigned int reg[5];

        reg[4] = '\0';
        cpuid(reg, 0, 0);
        if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
            XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
            XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
            got_intel_cpu = 1;
        }
        if (got_intel_cpu) {
            cpuid(reg, leaf, sub);
            return ((reg[num] >> bit) & 0x1);
        }
        return 0;
    }

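    /* Feature bits tested below, per the Intel SDM: leaf 1 ECX[28] = AVX,
     * leaf 7 EBX[5] = AVX2, leaf 7 EBX[8] = BMI2, leaf 1 ECX[30] = RDRAND,
     * leaf 7 EBX[18] = RDSEED */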
    static int set_cpuid_flags(void) {
        if (cpuid_check == 0) {
            if (cpuid_flag(1, 0, ECX, 28)) { cpuid_flags |= CPUID_AVX1; }
            if (cpuid_flag(7, 0, EBX, 5))  { cpuid_flags |= CPUID_AVX2; }
            if (cpuid_flag(7, 0, EBX, 8))  { cpuid_flags |= CPUID_BMI2; }
            if (cpuid_flag(1, 0, ECX, 30)) { cpuid_flags |= CPUID_RDRAND; }
            if (cpuid_flag(7, 0, EBX, 18)) { cpuid_flags |= CPUID_RDSEED; }
            cpuid_check = 1;
            return 0;
        }
        return 1;
    }


    #if defined(HAVE_INTEL_AVX1)
        static int Transform_AVX1(Sha512 *sha512);
    #endif
    #if defined(HAVE_INTEL_AVX2)
        static int Transform_AVX2(Sha512 *sha512);
        #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
            static int Transform_AVX1_RORX(Sha512 *sha512);
        #endif
    #endif
    static int _Transform(Sha512 *sha512);
    static int (*Transform_p)(Sha512* sha512) = _Transform;
    #define Transform(sha512) (*Transform_p)(sha512)

    /* Dummy for saving MM_REGs on behalf of Transform */
    /* #if defined(HAVE_INTEL_AVX2)
     #define SAVE_XMM_YMM   __asm__ volatile("orq %%r8, %%r8":::\
       "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
       "%ymm12","%ymm13","%ymm14","%ymm15")
    */
    #if defined(HAVE_INTEL_AVX1)
        #define SAVE_XMM_YMM   __asm__ volatile("orq %%r8, %%r8":::\
            "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
    #endif


    int wc_InitSha512_ex(Sha512* sha512, void* heap, int devId)
    {
        int ret = InitSha512(sha512);

        (void)heap;
        (void)devId;

        if (set_cpuid_flags())
            return ret;

    #if defined(HAVE_INTEL_AVX2)
        if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
            Transform_p = Transform_AVX1_RORX; return ret;
            Transform_p = Transform_AVX2;
                /* unreachable; kept so Transform_AVX2 is referenced and no
                 * "defined but not used" warning is emitted */
        }
    #endif
    #if defined(HAVE_INTEL_AVX1)
        Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform); return ret;
    #endif
        Transform_p = _Transform;

        return ret;
    }

#else
    #define Transform(sha512) _Transform(sha512)

    int wc_InitSha512_ex(Sha512* sha512, void* heap, int devId)
    {
        int ret = 0;

        if (sha512 == NULL)
            return BAD_FUNC_ARG;

        sha512->heap = heap;

        ret = InitSha512(sha512);
        if (ret != 0)
            return ret;

    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
        ret = wolfAsync_DevCtxInit(&sha512->asyncDev,
                            WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId);
    #else
        (void)devId;
    #endif /* WOLFSSL_ASYNC_CRYPT */

        return ret;
    }

#endif /* Hardware Acceleration */

#ifndef SAVE_XMM_YMM
    #define SAVE_XMM_YMM
#endif
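
/* Round constants from FIPS 180-4: the first 64 bits of the fractional
 * parts of the cube roots of the first eighty primes */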
static const word64 K512[80] = {
    W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
    W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
    W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
    W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
    W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
    W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
    W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
    W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
    W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
    W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
    W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
    W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
    W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
    W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
    W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
    W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
    W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
    W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
    W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
    W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
    W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
    W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
    W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
    W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
    W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
    W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
    W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
    W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
    W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
    W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
    W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
    W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
    W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
    W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
    W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
    W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
    W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
    W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
    W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
    W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
};

#define blk0(i) (W[i] = sha512->buffer[i])

#define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))

#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))

#define a(i) T[(0-i)&7]
#define b(i) T[(1-i)&7]
#define c(i) T[(2-i)&7]
#define d(i) T[(3-i)&7]
#define e(i) T[(4-i)&7]
#define f(i) T[(5-i)&7]
#define g(i) T[(6-i)&7]
#define h(i) T[(7-i)&7]

#define S0(x) (rotrFixed64(x,28)^rotrFixed64(x,34)^rotrFixed64(x,39))
#define S1(x) (rotrFixed64(x,14)^rotrFixed64(x,18)^rotrFixed64(x,41))
#define s0(x) (rotrFixed64(x,1)^rotrFixed64(x,8)^(x>>7))
#define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6))

#define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
    d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
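
/* The a(i)..h(i) macros above index T[] with a rotating offset, so each R(i)
 * shifts the roles of the eight working variables without copying them: the
 * value computed into h(0) == T[7] in round 0 is consumed as a(1) in round 1. */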

static int _Transform(Sha512* sha512)
{
    const word64* K = K512;

    word32 j;
    word64 T[8];


#ifdef WOLFSSL_SMALL_STACK
    word64* W;
    W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    if (W == NULL)
        return MEMORY_E;
#else
    word64 W[16];
#endif

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

#ifdef USE_SLOW_SHA2
    /* less than half the code size, but about 50% slower */
    /* 80 operations, not unrolled */
    for (j = 0; j < 80; j += 16) {
        int m;
        for (m = 0; m < 16; m++) { /* braces needed: R() expands to
                                    * multiple statements */
            R(m);
        }
    }
#else
    /* 80 operations, partially loop unrolled */
    for (j = 0; j < 80; j += 16) {
        R( 0); R( 1); R( 2); R( 3);
        R( 4); R( 5); R( 6); R( 7);
        R( 8); R( 9); R(10); R(11);
        R(12); R(13); R(14); R(15);
    }
#endif /* USE_SLOW_SHA2 */

    /* Add the working vars back into digest */

    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
    ForceZero(W, sizeof(word64) * 16);
    ForceZero(T, sizeof(T));

#ifdef WOLFSSL_SMALL_STACK
    XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return 0;
}


static INLINE void AddLength(Sha512* sha512, word32 len)
{
    word64 tmp = sha512->loLen;
    if ( (sha512->loLen += len) < tmp)
        sha512->hiLen++;                       /* carry low to high */
}
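
/* loLen/hiLen together form the message length counter required by FIPS
 * 180-4 (converted to a bit count in Sha512Final). The unsigned wrap test
 * above detects low-word overflow: e.g. loLen = 2^64 - 1 plus len = 2
 * leaves loLen = 1 and increments hiLen. */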

static INLINE int Sha512Update(Sha512* sha512, const byte* data, word32 len)
{
    int ret = 0;

    /* do block size increments */
    byte* local = (byte*)sha512->buffer;

    /* check that internal buffLen is valid */
    if (sha512->buffLen >= SHA512_BLOCK_SIZE)
        return BUFFER_E;

    SAVE_XMM_YMM; /* for Intel AVX */

    while (len) {
        word32 add = min(len, SHA512_BLOCK_SIZE - sha512->buffLen);
        XMEMCPY(&local[sha512->buffLen], data, add);

        sha512->buffLen += add;
        data         += add;
        len          -= add;

        if (sha512->buffLen == SHA512_BLOCK_SIZE) {
    #if defined(LITTLE_ENDIAN_ORDER)
        #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
            if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
        #endif
                ByteReverseWords64(sha512->buffer, sha512->buffer,
                               SHA512_BLOCK_SIZE);
    #endif
            ret = Transform(sha512);
            if (ret != 0)
                break;

            AddLength(sha512, SHA512_BLOCK_SIZE);
            sha512->buffLen = 0;
        }
    }

    return ret;
}

int wc_Sha512Update(Sha512* sha512, const byte* data, word32 len)
{
#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    return Sha512Update(sha512, data, len);
}

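/* Padding per FIPS 180-4: append the 0x80 byte (a single '1' bit), zero-fill
 * to SHA512_PAD_SIZE (112) bytes mod the 128-byte block, then store the
 * 128-bit message bit length big-endian in the last 16 bytes of the block. */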
static INLINE int Sha512Final(Sha512* sha512)
{
    byte* local = (byte*)sha512->buffer;
    int ret;

    SAVE_XMM_YMM; /* for Intel AVX */
    AddLength(sha512, sha512->buffLen);               /* before adding pads */

    local[sha512->buffLen++] = 0x80;  /* append the '1' padding bit */

    /* pad with zeros */
    if (sha512->buffLen > SHA512_PAD_SIZE) {
        XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen);
        sha512->buffLen += SHA512_BLOCK_SIZE - sha512->buffLen;
#if defined(LITTLE_ENDIAN_ORDER)
    #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
        if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
    #endif
            ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE);

#endif /* LITTLE_ENDIAN_ORDER */
        ret = Transform(sha512);
        if (ret != 0)
            return ret;

        sha512->buffLen = 0;
    }
    XMEMSET(&local[sha512->buffLen], 0, SHA512_PAD_SIZE - sha512->buffLen);

    /* put lengths in bits */
    sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
                                                         (sha512->hiLen << 3);
    sha512->loLen = sha512->loLen << 3;

    /* store lengths */
#if defined(LITTLE_ENDIAN_ORDER)
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
#endif
        ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
#endif
    /* ! length ordering dependent on digest endian type ! */

    sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
    sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
    if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
        ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           &(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
                           SHA512_BLOCK_SIZE - SHA512_PAD_SIZE);
#endif
    ret = Transform(sha512);
    if (ret != 0)
        return ret;

    #ifdef LITTLE_ENDIAN_ORDER
        ByteReverseWords64(sha512->digest, sha512->digest, SHA512_DIGEST_SIZE);
    #endif

    return 0;
}

int wc_Sha512Final(Sha512* sha512, byte* hash)
{
    int ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha512(&sha512->asyncDev, hash, NULL,
                                            SHA512_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final(sha512);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha512->digest, SHA512_DIGEST_SIZE);

    return InitSha512(sha512);  /* reset state */
}


int wc_InitSha512(Sha512* sha512)
{
    return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
}

void wc_Sha512Free(Sha512* sha512)
{
    if (sha512 == NULL)
        return;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
    wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512);
#endif /* WOLFSSL_ASYNC_CRYPT */
}
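
/* Usage sketch (illustration only, not part of the library): one-shot
 * hashing with the wc_* API above. The function name is arbitrary and
 * error handling is abbreviated. */
#if 0
static int sha512_hash_example(const byte* msg, word32 msgSz,
                               byte digest[SHA512_DIGEST_SIZE])
{
    Sha512 sha;
    int ret = wc_InitSha512(&sha);
    if (ret == 0)
        ret = wc_Sha512Update(&sha, msg, msgSz);  /* may be called repeatedly */
    if (ret == 0)
        ret = wc_Sha512Final(&sha, digest);       /* also re-inits the state */
    wc_Sha512Free(&sha);
    return ret;
}
#endif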


#if defined(HAVE_INTEL_AVX1)

#define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i];
#define Rx_2(i) d(i)+=h(i);
#define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));

#if defined(HAVE_INTEL_RORX)

    #define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i];
    #define Rx_RORX_2(i) d(i)+=h(i);
    #define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i));
#endif /* HAVE_INTEL_RORX */

#endif /* HAVE_INTEL_AVX1 */

#if defined(HAVE_INTEL_AVX2)
#define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w;
#define Ry_2(i, w) d(i)+=h(i);
#define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));
#endif /* HAVE_INTEL_AVX2 */

/* INLINE Assembler for Intel AVX1 instructions */
#if defined(HAVE_INTEL_AVX1)
#if defined(DEBUG_XMM)
    #define SAVE_REG(i)     __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0])::XMM_REGs);
    #define RECV_REG(i)     __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]):XMM_REGs);

    #define _DUMP_REG(REG, name)\
        { word64 buf[16];word64 reg[16][2];int k;\
          SAVE_REG(0); SAVE_REG(1); SAVE_REG(2);  SAVE_REG(3);  SAVE_REG(4);  \
          SAVE_REG(5);   SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\
           SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \
          __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::XMM_REGs);\
          printf(" "#name":\t"); for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n"); \
          RECV_REG(0); RECV_REG(1); RECV_REG(2);  RECV_REG(3);  RECV_REG(4);\
          RECV_REG(5);   RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\
          RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\
        }

    #define DUMP_REG(REG) _DUMP_REG(REG, #REG)
    #define PRINTF(fmt, ...)
#else
    #define DUMP_REG(REG)
    #define PRINTF(fmt, ...)
#endif /* DEBUG_XMM */

#define _MOVE_to_REG(xymm, mem)       __asm__ volatile("vmovdqu %0, %%"#xymm" "\
        :: "m"(mem):XMM_REGs);
#define _MOVE_to_MEM(mem,i, xymm)     __asm__ volatile("vmovdqu %%"#xymm", %0" :\
         "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::XMM_REGs);
#define _MOVE(dest, src)              __asm__ volatile("vmovdqu %%"#src",  %%"\
        #dest" ":::XMM_REGs);

#define _S_TEMP(dest, src, bits, temp)  __asm__ volatile("vpsrlq  $"#bits", %%"\
        #src", %%"#dest"\n\tvpsllq  $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
        #temp",%%"#dest", %%"#dest" ":::XMM_REGs);
#define _AVX1_R(dest, src, bits)      __asm__ volatile("vpsrlq  $"#bits", %%"\
        #src", %%"#dest" ":::XMM_REGs);
#define _XOR(dest, src1, src2)        __asm__ volatile("vpxor   %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs);
#define _OR(dest, src1, src2)         __asm__ volatile("vpor    %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs);
#define _ADD(dest, src1, src2)        __asm__ volatile("vpaddq   %%"#src1", %%"\
        #src2", %%"#dest" ":::XMM_REGs);
#define _ADD_MEM(dest, src1, mem)     __asm__ volatile("vpaddq   %0, %%"#src1", %%"\
        #dest" "::"m"(mem):XMM_REGs);

#define MOVE_to_REG(xymm, mem)      _MOVE_to_REG(xymm, mem)
#define MOVE_to_MEM(mem, i, xymm)   _MOVE_to_MEM(mem, i, xymm)
#define MOVE(dest, src)             _MOVE(dest, src)

#define XOR(dest, src1, src2)      _XOR(dest, src1, src2)
#define OR(dest, src1, src2)       _OR(dest, src1, src2)
#define ADD(dest, src1, src2)      _ADD(dest, src1, src2)

#define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
#define AVX1_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
#define AVX1_R(dest, src, bits)      _AVX1_R(dest, src, bits)

#define Init_Mask(mask) \
     __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1");

#define _W_from_buff1(w, buff, xmm) \
    /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0..15];  */\
     __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\
                      "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\
                      "vmovdqu %%"#xmm", %0"\
                      :"=m"(w): "m"(buff):"%xmm0");

#define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm)

#define W_from_buff(w, buff)\
     Init_Mask(mBYTE_FLIP_MASK[0]);\
     W_from_buff1(w[0], buff[0], W_0);\
     W_from_buff1(w[2], buff[2], W_2);\
     W_from_buff1(w[4], buff[4], W_4);\
     W_from_buff1(w[6], buff[6], W_6);\
     W_from_buff1(w[8], buff[8], W_8);\
     W_from_buff1(w[10],buff[10],W_10);\
     W_from_buff1(w[12],buff[12],W_12);\
     W_from_buff1(w[14],buff[14],W_14);

static word64 mBYTE_FLIP_MASK[] =  { 0x0001020304050607, 0x08090a0b0c0d0e0f };
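
/* The mask above is a vpshufb control that reverses the byte order within
 * each 64-bit lane, converting the little-endian input buffer to the
 * big-endian word order SHA-512 is defined over. */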

#define W_I_15  xmm14
#define W_I_7   xmm11
#define W_I_2   xmm13
#define W_I     xmm12
#define G_TEMP  xmm0
#define S_TEMP  xmm1
#define XMM_TEMP0  xmm2

#define W_0     xmm12
#define W_2     xmm3
#define W_4     xmm4
#define W_6     xmm5
#define W_8     xmm6
#define W_10    xmm7
#define W_12    xmm8
#define W_14    xmm9

#define XMM_REGs

#define s0_1(dest, src)      AVX1_S(dest, src, 1);
#define s0_2(dest, src)      AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest);
#define s0_3(dest, src)      AVX1_R(G_TEMP, src, 7);  XOR(dest, G_TEMP, dest);

#define s1_1(dest, src)      AVX1_S(dest, src, 19);
#define s1_2(dest, src)      AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest);
#define s1_3(dest, src)      AVX1_R(G_TEMP, src, 6); XOR(dest, G_TEMP, dest);

#define s0_(dest, src)       s0_1(dest, src); s0_2(dest, src); s0_3(dest, src)
#define s1_(dest, src)       s1_1(dest, src); s1_2(dest, src); s1_3(dest, src)

#define Block_xx_1(i) \
    MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\
    MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]);\

#define Block_xx_2(i) \
    MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]);\
    MOVE_to_REG(W_I,    W_X[(i)]);\

#define Block_xx_3(i) \
    s0_ (XMM_TEMP0, W_I_15);\

#define Block_xx_4(i) \
    ADD(W_I, W_I, XMM_TEMP0);\
    ADD(W_I, W_I, W_I_7);\

#define Block_xx_5(i) \
    s1_ (XMM_TEMP0, W_I_2);\

#define Block_xx_6(i) \
    ADD(W_I, W_I, XMM_TEMP0);\
    MOVE_to_MEM(W_X,i, W_I);\
    if (i==0)\
        MOVE_to_MEM(W_X,16, W_I);\

#define Block_xx_7(i) \
    MOVE_to_REG(W_I_15, W_X[(i-15)&15]);\
    MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]);\

#define Block_xx_8(i) \
    MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]);\
    MOVE_to_REG(W_I,    W_X[(i)]);\

#define Block_xx_9(i) \
    s0_ (XMM_TEMP0, W_I_15);\

#define Block_xx_10(i) \
    ADD(W_I, W_I, XMM_TEMP0);\
    ADD(W_I, W_I, W_I_7);\

#define Block_xx_11(i) \
    s1_ (XMM_TEMP0, W_I_2);\

#define Block_xx_12(i) \
    ADD(W_I, W_I, XMM_TEMP0);\
    MOVE_to_MEM(W_X,i, W_I);\
    if ((i)==0)\
        MOVE_to_MEM(W_X,16, W_I);\

static INLINE void Block_0_1(word64 *W_X) { Block_xx_1(0); }
static INLINE void Block_0_2(word64 *W_X) { Block_xx_2(0); }
static INLINE void Block_0_3(void) { Block_xx_3(0); }
static INLINE void Block_0_4(void) { Block_xx_4(0); }
static INLINE void Block_0_5(void) { Block_xx_5(0); }
static INLINE void Block_0_6(word64 *W_X) { Block_xx_6(0); }
static INLINE void Block_0_7(word64 *W_X) { Block_xx_7(2); }
static INLINE void Block_0_8(word64 *W_X) { Block_xx_8(2); }
static INLINE void Block_0_9(void) { Block_xx_9(2); }
static INLINE void Block_0_10(void){ Block_xx_10(2); }
static INLINE void Block_0_11(void){ Block_xx_11(2); }
static INLINE void Block_0_12(word64 *W_X){ Block_xx_12(2); }

static INLINE void Block_4_1(word64 *W_X) { Block_xx_1(4); }
static INLINE void Block_4_2(word64 *W_X) { Block_xx_2(4); }
static INLINE void Block_4_3(void) { Block_xx_3(4); }
static INLINE void Block_4_4(void) { Block_xx_4(4); }
static INLINE void Block_4_5(void) { Block_xx_5(4); }
static INLINE void Block_4_6(word64 *W_X) { Block_xx_6(4); }
static INLINE void Block_4_7(word64 *W_X) { Block_xx_7(6); }
static INLINE void Block_4_8(word64 *W_X) { Block_xx_8(6); }
static INLINE void Block_4_9(void) { Block_xx_9(6); }
static INLINE void Block_4_10(void){ Block_xx_10(6); }
static INLINE void Block_4_11(void){ Block_xx_11(6); }
static INLINE void Block_4_12(word64 *W_X){ Block_xx_12(6); }

static INLINE void Block_8_1(word64 *W_X) { Block_xx_1(8); }
static INLINE void Block_8_2(word64 *W_X) { Block_xx_2(8); }
static INLINE void Block_8_3(void) { Block_xx_3(8); }
static INLINE void Block_8_4(void) { Block_xx_4(8); }
static INLINE void Block_8_5(void) { Block_xx_5(8); }
static INLINE void Block_8_6(word64 *W_X) { Block_xx_6(8); }
static INLINE void Block_8_7(word64 *W_X) { Block_xx_7(10); }
static INLINE void Block_8_8(word64 *W_X) { Block_xx_8(10); }
static INLINE void Block_8_9(void) { Block_xx_9(10); }
static INLINE void Block_8_10(void){ Block_xx_10(10); }
static INLINE void Block_8_11(void){ Block_xx_11(10); }
static INLINE void Block_8_12(word64 *W_X){ Block_xx_12(10); }

static INLINE void Block_12_1(word64 *W_X) { Block_xx_1(12); }
static INLINE void Block_12_2(word64 *W_X) { Block_xx_2(12); }
static INLINE void Block_12_3(void) { Block_xx_3(12); }
static INLINE void Block_12_4(void) { Block_xx_4(12); }
static INLINE void Block_12_5(void) { Block_xx_5(12); }
static INLINE void Block_12_6(word64 *W_X) { Block_xx_6(12); }
static INLINE void Block_12_7(word64 *W_X) { Block_xx_7(14); }
static INLINE void Block_12_8(word64 *W_X) { Block_xx_8(14); }
static INLINE void Block_12_9(void) { Block_xx_9(14); }
static INLINE void Block_12_10(void){ Block_xx_10(14); }
static INLINE void Block_12_11(void){ Block_xx_11(14); }
static INLINE void Block_12_12(word64 *W_X){ Block_xx_12(14); }

#endif /* HAVE_INTEL_AVX1 */

#if defined(HAVE_INTEL_AVX2)
static const unsigned long mBYTE_FLIP_MASK_Y[] =
   { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f };

#define W_from_buff_Y(buff)\
    { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0..15];  */\
     __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]):YMM_REGs);\
     __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\
                      "vmovdqu %1, %%ymm4\n\t"\
                      "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\
                      "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\
                      :: "m"(buff[0]),  "m"(buff[4]):YMM_REGs);\
     __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\
                      "vmovdqu %1, %%ymm6\n\t"\
                      "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\
                      "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\
                      :: "m"(buff[8]),  "m"(buff[12]):YMM_REGs);\
    }

#if defined(DEBUG_YMM)
    #define SAVE_REG_Y(i) __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0])::YMM_REGs);
    #define RECV_REG_Y(i) __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]):YMM_REGs);

    #define _DUMP_REG_Y(REG, name)\
        { word64 buf[16];word64 reg[16][2];int k;\
          SAVE_REG_Y(4);  SAVE_REG_Y(5);   SAVE_REG_Y(6); SAVE_REG_Y(7); \
          SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\
          SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \
          __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\
          printf(" "#name":\t"); for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]); printf("\n"); \
          RECV_REG_Y(4);  RECV_REG_Y(5);   RECV_REG_Y(6); RECV_REG_Y(7); \
          RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \
          RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\
        }

    #define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG)
    #define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG)
    #define PRINTF_Y(fmt, ...)
#else
    #define DUMP_REG_Y(REG)
    #define DUMP_REG2_Y(REG)
    #define PRINTF_Y(fmt, ...)
#endif /* DEBUG_YMM */

#define _MOVE_to_REGy(ymm, mem)         __asm__ volatile("vmovdqu %0, %%"#ymm" "\
                                        :: "m"(mem):YMM_REGs);
#define _MOVE_to_MEMy(mem,i, ymm)       __asm__ volatile("vmovdqu %%"#ymm", %0" \
        : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::YMM_REGs);
#define _MOVE_128y(ymm0, ymm1, ymm2, map)  __asm__ volatile("vperm2i128  $"\
        #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
#define _S_TEMPy(dest, src, bits, temp) \
         __asm__ volatile("vpsrlq  $"#bits", %%"#src", %%"#dest"\n\tvpsllq  $64-"#bits\
        ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" ":::YMM_REGs);
#define _AVX2_R(dest, src, bits)        __asm__ volatile("vpsrlq  $"#bits", %%"\
         #src", %%"#dest" ":::YMM_REGs);
#define _XORy(dest, src1, src2)         __asm__ volatile("vpxor   %%"#src1", %%"\
         #src2", %%"#dest" ":::YMM_REGs);
#define _ADDy(dest, src1, src2)         __asm__ volatile("vpaddq   %%"#src1", %%"\
         #src2", %%"#dest" ":::YMM_REGs);
#define _BLENDy(map, dest, src1, src2)  __asm__ volatile("vpblendd    $"#map", %%"\
         #src1",   %%"#src2", %%"#dest" ":::YMM_REGs);
#define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd   $"#map", %%"\
         #src1",   %%"#src2", %%"#dest" ":::YMM_REGs);
#define _PERMQy(map, dest, src)         __asm__ volatile("vpermq  $"#map", %%"\
         #src", %%"#dest" ":::YMM_REGs);

#define MOVE_to_REGy(ymm, mem)      _MOVE_to_REGy(ymm, mem)
#define MOVE_to_MEMy(mem, i, ymm)   _MOVE_to_MEMy(mem, i, ymm)

#define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map)
#define XORy(dest, src1, src2)      _XORy(dest, src1, src2)
#define ADDy(dest, src1, src2)      _ADDy(dest, src1, src2)
#define BLENDy(map, dest, src1, src2) _BLENDy(map, dest, src1, src2)
#define BLENDQy(map, dest, src1, src2) _BLENDQy(map, dest, src1, src2)
#define PERMQy(map, dest, src)      _PERMQy(map, dest, src)


#define S_TMPy(dest, src, bits, temp) _S_TEMPy(dest, src, bits, temp);
#define AVX2_S(dest, src, bits)      S_TMPy(dest, src, bits, S_TEMPy)
#define AVX2_R(dest, src, bits)      _AVX2_R(dest, src, bits)


#define    FEEDBACK1_to_W_I_2(w_i_2, w_i)    MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08);\
                                       BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2);

#define    MOVE_W_to_W_I_15(w_i_15, w_0, w_4)  BLENDQy(0x1, w_i_15, w_4, w_0);\
                                       PERMQy(0x39, w_i_15, w_i_15);
#define    MOVE_W_to_W_I_7(w_i_7,  w_8, w_12)  BLENDQy(0x1, w_i_7, w_12, w_8);\
                                       PERMQy(0x39, w_i_7, w_i_7);
#define    MOVE_W_to_W_I_2(w_i_2,  w_12)       BLENDQy(0xc, w_i_2, w_12, w_i_2);\
                                       PERMQy(0x0e, w_i_2, w_i_2);


#define W_I_16y  ymm8
#define W_I_15y  ymm9
#define W_I_7y  ymm10
#define W_I_2y  ymm11
#define W_Iy    ymm12
#define G_TEMPy     ymm13
#define S_TEMPy     ymm14
#define YMM_TEMP0  ymm15
#define YMM_TEMP0x xmm15
#define W_I_TEMPy   ymm7
#define W_K_TEMPy   ymm15
#define W_K_TEMPx  xmm15
#define W_0y     ymm12
#define W_4y     ymm4
#define W_8y     ymm5
#define W_12y    ymm6

#define YMM_REGs
/* Registers are saved in Sha512Update/Final */
                 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/

#define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
    __asm__ volatile("vperm2i128  $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
    __asm__ volatile("vpblendd    $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
    __asm__ volatile("vperm2i128 $0x01,  %%"#w_i_7",  %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
    __asm__ volatile("vpblendd    $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
    __asm__ volatile("vpshufd    $0x93,  %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\

#define MOVE_7_to_15(w_i_15, w_i_7)\
    __asm__ volatile("vmovdqu                 %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs);\

#define MOVE_I_to_7(w_i_7, w_i)\
    __asm__ volatile("vperm2i128 $0x01,       %%"#w_i",   %%"#w_i",   %%"#w_i_7" ":::YMM_REGs);\
    __asm__ volatile("vpblendd    $0x01,       %%"#w_i_7",   %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
    __asm__ volatile("vpshufd    $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\

#define MOVE_I_to_2(w_i_2, w_i)\
    __asm__ volatile("vperm2i128 $0x01,       %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
    __asm__ volatile("vpshufd    $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\

#endif /* HAVE_INTEL_AVX2 */


/***  Transform Body ***/
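/* The transforms below "stitch" the scalar rounds (the Rx and Ry macros)
 * together with the vectorized message schedule (the Block macros),
 * interleaving integer and SIMD work within each 16-round group. */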
#if defined(HAVE_INTEL_AVX1)
static int Transform_AVX1(Sha512* sha512)
{
    const word64* K = K512;
    word64 W_X[16+4] = {0};
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff(W_X, sha512->buffer);
    for (j = 0; j < 80; j += 16) {
        Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3();
        Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X);
        Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
        Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);

        Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3();
        Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X);
        Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
        Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);

        Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3();
        Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X);
        Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
        Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);

        Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3();
        Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X);
        Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
        Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);
    }

    /* Add the working vars back into digest */
    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
#if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);
#endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}
#endif /* HAVE_INTEL_AVX1 */

#if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
static int Transform_AVX1_RORX(Sha512* sha512)
{
    const word64* K = K512;
    word64 W_X[16+4] = {0};
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff(W_X, sha512->buffer);
    for (j = 0; j < 80; j += 16) {
        Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X);
                                    Rx_RORX_3( 0); Block_0_3();
        Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5();
                                    Rx_RORX_3( 1); Block_0_6(W_X);
        Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X);
                                    Rx_RORX_3( 2); Block_0_9();
        Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11();
                                    Rx_RORX_3( 3); Block_0_12(W_X);

        Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X);
                                    Rx_RORX_3( 4); Block_4_3();
        Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5();
                                    Rx_RORX_3( 5); Block_4_6(W_X);
        Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X);
                                    Rx_RORX_3( 6); Block_4_9();
        Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11();
                                    Rx_RORX_3( 7); Block_4_12(W_X);

        Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X);
                                    Rx_RORX_3( 8); Block_8_3();
        Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5();
                                    Rx_RORX_3( 9); Block_8_6(W_X);
        Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X);
                                    Rx_RORX_3(10); Block_8_9();
        Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11();
                                    Rx_RORX_3(11); Block_8_12(W_X);

        Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X);
                                     Rx_RORX_3(12); Block_12_3();
        Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5();
                                     Rx_RORX_3(13); Block_12_6(W_X);
        Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X);
                                     Rx_RORX_3(14); Block_12_9();
        Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11();
                                     Rx_RORX_3(15); Block_12_12(W_X);
    }

    /* Add the working vars back into digest */
    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
#if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2)
    XMEMSET(W_X, 0, sizeof(word64) * 16);
#endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}
#endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_AVX1 && HAVE_INTEL_RORX */

#if defined(HAVE_INTEL_AVX2)

#define s0_1y(dest, src)      AVX2_S(dest, src, 1);
#define s0_2y(dest, src)      AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest);
#define s0_3y(dest, src)      AVX2_R(G_TEMPy, src, 7);  XORy(dest, G_TEMPy, dest);

#define s1_1y(dest, src)      AVX2_S(dest, src, 19);
#define s1_2y(dest, src)      AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest);
#define s1_3y(dest, src)      AVX2_R(G_TEMPy, src, 6); XORy(dest, G_TEMPy, dest);

#define s0_y(dest, src)       s0_1y(dest, src); s0_2y(dest, src); s0_3y(dest, src)
#define s1_y(dest, src)       s1_1y(dest, src); s1_2y(dest, src); s1_3y(dest, src)


#define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\
    MOVE_W_to_W_I_15(W_I_15y, w_0, w_4);\
    MOVE_W_to_W_I_7 (W_I_7y,  w_8, w_12);\
    MOVE_W_to_W_I_2 (W_I_2y,  w_12);\

#define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\
    s0_1y (YMM_TEMP0, W_I_15y);\

#define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\
    s0_2y (YMM_TEMP0, W_I_15y);\

#define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\
    s0_3y (YMM_TEMP0, W_I_15y);\

#define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\
    ADDy(W_I_TEMPy, w_0, YMM_TEMP0);\

#define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\
    ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y);\
    s1_1y (YMM_TEMP0, W_I_2y);\

#define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\
    s1_2y (YMM_TEMP0, W_I_2y);\

#define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\
    s1_3y (YMM_TEMP0, W_I_2y);\
    ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\

#define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\
    FEEDBACK1_to_W_I_2(W_I_2y, w_0);\

#define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \
    s1_1y (YMM_TEMP0, W_I_2y);\

#define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \
    s1_2y (YMM_TEMP0, W_I_2y);\

#define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\
    s1_3y (YMM_TEMP0, W_I_2y);\
    ADDy(w_0, W_I_TEMPy, YMM_TEMP0);\
    MOVE_to_MEMy(w,0, w_4);\


static INLINE void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y); }
static INLINE void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y); }

static INLINE void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y); }
static INLINE void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y); }

static INLINE void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y); }
static INLINE void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y); }

static INLINE void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y); }
static INLINE void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y); }


static int Transform_AVX2(Sha512* sha512)
{
    const word64* K = K512;
    word64 w[4];
    word32 j;
    word64 T[8];

    /* Copy digest to working vars */
    XMEMCPY(T, sha512->digest, sizeof(T));

    W_from_buff_Y(sha512->buffer);
    MOVE_to_MEMy(w,0, W_0y);
    for (j = 0; j < 80; j += 16) {
        Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2();
                                       Ry_3( 0, w[0]); Block_Y_0_3();
        Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5();
                                       Ry_3( 1, w[1]); Block_Y_0_6();
        Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8();
                                       Ry_3( 2, w[2]); Block_Y_0_9();
        Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11();
                                       Ry_3( 3, w[3]); Block_Y_0_12(w);

        Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2();
                                       Ry_3( 4, w[0]); Block_Y_4_3();
        Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5();
                                       Ry_3( 5, w[1]); Block_Y_4_6();
        Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8();
                                       Ry_3( 6, w[2]); Block_Y_4_9();
        Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11();
                                        Ry_3( 7, w[3]);Block_Y_4_12(w);

        Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2();
                                       Ry_3( 8, w[0]); Block_Y_8_3();
        Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5();
                                       Ry_3( 9, w[1]); Block_Y_8_6();
        Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8();
                                       Ry_3(10, w[2]); Block_Y_8_9();
        Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11();
                                       Ry_3(11, w[3]); Block_Y_8_12(w);

        Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2();
                                        Ry_3(12, w[0]); Block_Y_12_3();
        Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5();
                                        Ry_3(13, w[1]); Block_Y_12_6();
        Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8();
                                        Ry_3(14, w[2]); Block_Y_12_9();
        Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11();
                                        Ry_3(15, w[3]);Block_Y_12_12(w);
    }

    /* Add the working vars back into digest */
    sha512->digest[0] += a(0);
    sha512->digest[1] += b(0);
    sha512->digest[2] += c(0);
    sha512->digest[3] += d(0);
    sha512->digest[4] += e(0);
    sha512->digest[5] += f(0);
    sha512->digest[6] += g(0);
    sha512->digest[7] += h(0);

    /* Wipe variables */
#if !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
    XMEMSET(w, 0, sizeof(w));
#endif
    XMEMSET(T, 0, sizeof(T));

    return 0;
}
#endif /* HAVE_INTEL_AVX2 */



/* -------------------------------------------------------------------------- */
/* SHA384 */
/* -------------------------------------------------------------------------- */
#ifdef WOLFSSL_SHA384
static int InitSha384(Sha384* sha384)
{
    sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
    sha384->digest[1] = W64LIT(0x629a292a367cd507);
    sha384->digest[2] = W64LIT(0x9159015a3070dd17);
    sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
    sha384->digest[4] = W64LIT(0x67332667ffc00b31);
    sha384->digest[5] = W64LIT(0x8eb44a8768581511);
    sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
    sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);

    sha384->buffLen = 0;
    sha384->loLen   = 0;
    sha384->hiLen   = 0;

    return 0;
}
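
/* The initial values above come from FIPS 180-4 (square roots of the 9th
 * through 16th primes). SHA-384 otherwise reuses the SHA-512 compression
 * function and truncates the final digest to 384 bits. */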

int wc_Sha384Update(Sha384* sha384, const byte* data, word32 len)
{
#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    return Sha512Update((Sha512*)sha384, data, len);
}


int wc_Sha384Final(Sha384* sha384, byte* hash)
{
    int ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
    #if defined(HAVE_INTEL_QA)
        return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
                                            SHA384_DIGEST_SIZE);
    #endif
    }
#endif /* WOLFSSL_ASYNC_CRYPT */

    ret = Sha512Final((Sha512*)sha384);
    if (ret != 0)
        return ret;

    XMEMCPY(hash, sha384->digest, SHA384_DIGEST_SIZE);

    return InitSha384(sha384);  /* reset state */
}


int wc_InitSha384_ex(Sha384* sha384, void* heap, int devId)
{
    int ret;

    if (sha384 == NULL) {
        return BAD_FUNC_ARG;
    }

    sha384->heap = heap;
    ret = InitSha384(sha384);
    if (ret != 0)
        return ret;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
                                                           sha384->heap, devId);
#else
    (void)devId;
#endif /* WOLFSSL_ASYNC_CRYPT */

    return ret;
}

int wc_InitSha384(Sha384* sha384)
{
    return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
}

void wc_Sha384Free(Sha384* sha384)
{
    if (sha384 == NULL)
        return;

#if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
    wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
#endif /* WOLFSSL_ASYNC_CRYPT */
}

#endif /* WOLFSSL_SHA384 */

#endif /* HAVE_FIPS */


int wc_Sha512GetHash(Sha512* sha512, byte* hash)
{
    int ret;
    Sha512 tmpSha512;

    if (sha512 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha512Copy(sha512, &tmpSha512);
    if (ret == 0) {
        ret = wc_Sha512Final(&tmpSha512, hash);
    }
    return ret;
}

int wc_Sha512Copy(Sha512* src, Sha512* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(Sha512));

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
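
/* Usage sketch (illustration only): wc_Sha512GetHash reads an intermediate
 * digest by copying and finalizing the copy, so the original stream can keep
 * growing. The function and variable names below are arbitrary. */
#if 0
static int sha512_snapshot_example(Sha512* sha, const byte* chunk,
                                   word32 chunkSz,
                                   byte snapshot[SHA512_DIGEST_SIZE])
{
    int ret = wc_Sha512Update(sha, chunk, chunkSz);
    if (ret == 0)
        ret = wc_Sha512GetHash(sha, snapshot);  /* sha state is untouched */
    return ret;
}
#endif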

#ifdef WOLFSSL_SHA384
int wc_Sha384GetHash(Sha384* sha384, byte* hash)
{
    int ret;
    Sha384 tmpSha384;

    if (sha384 == NULL || hash == NULL)
        return BAD_FUNC_ARG;

    ret = wc_Sha384Copy(sha384, &tmpSha384);
    if (ret == 0) {
        ret = wc_Sha384Final(&tmpSha384, hash);
    }
    return ret;
}

int wc_Sha384Copy(Sha384* src, Sha384* dst)
{
    int ret = 0;

    if (src == NULL || dst == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(dst, src, sizeof(Sha384));

#ifdef WOLFSSL_ASYNC_CRYPT
    ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
#endif

    return ret;
}
#endif /* WOLFSSL_SHA384 */

#endif /* WOLFSSL_SHA512 */