sha512.c Source File

00001 /* sha512.c
00002  *
00003  * Copyright (C) 2006-2016 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 
00023 #ifdef HAVE_CONFIG_H
00024     #include <config.h>
00025 #endif
00026 
00027 #include <wolfssl/wolfcrypt/settings.h>
00028 #include <wolfssl/wolfcrypt/sha512.h>
00029 
00030 #ifdef WOLFSSL_SHA512
00031 
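/* A minimal usage sketch of the public API defined in this file (error
 * checks elided; assumes wolfSSL is built with WOLFSSL_SHA512). Each call
 * returns 0 on success:
 *
 *     Sha512 sha;
 *     byte   digest[SHA512_DIGEST_SIZE];
 *
 *     wc_InitSha512(&sha);
 *     wc_Sha512Update(&sha, (const byte*)"abc", 3);
 *     wc_Sha512Final(&sha, digest);    (digest now holds 64 bytes)
 */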
00032 #ifdef HAVE_FIPS
00033 int wc_InitSha512(Sha512* sha)
00034 {
00035     return InitSha512_fips(sha);
00036 }
00037 
00038 
00039 int wc_Sha512Update(Sha512* sha, const byte* data, word32 len)
00040 {
00041     return Sha512Update_fips(sha, data, len);
00042 }
00043 
00044 
00045 int wc_Sha512Final(Sha512* sha, byte* out)
00046 {
00047     return Sha512Final_fips(sha, out);
00048 }
00049 
00050 
00051 #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
00052 
00053 int wc_InitSha384(Sha384* sha)
00054 {
00055     return InitSha384_fips(sha);
00056 }
00057 
00058 
00059 int wc_Sha384Update(Sha384* sha, const byte* data, word32 len)
00060 {
00061     return Sha384Update_fips(sha, data, len);
00062 }
00063 
00064 
00065 int wc_Sha384Final(Sha384* sha, byte* out)
00066 {
00067     return Sha384Final_fips(sha, out);
00068 }
00069 
00070 
00071 #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */
00072 #else /* else build without using fips */
00073 #include <wolfssl/wolfcrypt/logging.h>
00074 #include <wolfssl/wolfcrypt/error-crypt.h>
00075 
00076 #ifdef NO_INLINE
00077     #include <wolfssl/wolfcrypt/misc.h>
00078 #else
00079     #include <wolfcrypt/src/misc.c>
00080 #endif
00081 
00082 
00083 #ifndef WOLFSSL_HAVE_MIN
00084 #define WOLFSSL_HAVE_MIN
00085 
00086     static INLINE word32 min(word32 a, word32 b)
00087     {
00088         return a > b ? b : a;
00089     }
00090 
00091 #endif /* WOLFSSL_HAVE_MIN */
00092 
00093 #if defined(USE_INTEL_SPEEDUP)
00094   #define HAVE_INTEL_AVX1
00095   #define HAVE_INTEL_AVX2
00096 #endif
00097 
00098 #if defined(HAVE_INTEL_AVX1)
00099 /* #define DEBUG_XMM  */
00100 #endif
00101 
00102 #if defined(HAVE_INTEL_AVX2)
00103 #define HAVE_INTEL_RORX
00104 /* #define DEBUG_YMM  */
00105 #endif
00106 
00107 /*****
00108 Intel AVX1/AVX2 Macro Control Structure
00109 
00110 #if defined(HAVE_INTEL_SPEEDUP)
00111     #define HAVE_INTEL_AVX1
00112     #define HAVE_INTEL_AVX2
00113 #endif
00114 
00115 int InitSha512(Sha512* sha512) { 
00116      Save/Recover XMM, YMM
00117      ...
00118 
00119      Check Intel AVX cpuid flags
00120 }
00121 
00122 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00123   Transform_AVX1() ; # Function prototype 
00124   Transform_AVX2() ; #
00125 #endif
00126 
00127   _Transform() {     # Native Transform Function body
00128   
00129   }
00130   
00131   int Sha512Update() { 
00132      Save/Recover XMM, YMM
00133      ...
00134   }
00135   
00136   int Sha512Final() { 
00137      Save/Recover XMM, YMM
00138      ...
00139   }
00140 
00141 
00142 #if defined(HAVE_INTEL_AVX1)
00143    
00144    XMM Instructions/INLINE asm Definitions
00145 
00146 #endif
00147 
00148 #if defined(HAVE_INTEL_AVX2)
00149 
00150    YMM Instructions/INLINE asm Definitions
00151 
00152 #endif
00153 
00154 #if defined(HAVE_INTEL_AVX1)
00155   
00156   int Transform_AVX1() {
00157       Stitched Message Sched/Round
00158   }
00159 
00160 #endif
00161 
00162 #if defined(HAVE_INTEL_AVX2)
00163   
00164   int Transform_AVX2() {
00165       Stitched Message Sched/Round
00166   }
00167 #endif
00168 
00169 
00170 */
00171 
00172 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00173 
00174 
00175 /* Each platform needs to query cpuid to see which extensions (AVX1/AVX2,
00176  * BMI2, RDRAND, RDSEED) are supported. Also, set up a macro for proper
00177  * linkage w/o ABI conflicts. */
00178 
00179 #ifndef _MSC_VER
00180     #define cpuid(reg, leaf, sub)\
00181             __asm__ __volatile__ ("cpuid":\
00182              "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
00183              "a" (leaf), "c"(sub));
00184 
00185     #define XASM_LINK(f) asm(f)
00186 #else
00187 
00188     #include <intrin.h>
00189     #define cpuid(reg, leaf, sub) __cpuidex((int*)(reg), (int)(leaf), (int)(sub))
00190 
00191     #define XASM_LINK(f)
00192 
00193 #endif /* _MSC_VER */
00194 
00195 #define EAX 0
00196 #define EBX 1
00197 #define ECX 2 
00198 #define EDX 3
00199     
00200 #define CPUID_AVX1   0x1
00201 #define CPUID_AVX2   0x2
00202 #define CPUID_RDRAND 0x4
00203 #define CPUID_RDSEED 0x8
00204 #define CPUID_BMI2   0x10   /* MULX, RORX */
00205 
00206 #define IS_INTEL_AVX1       (cpuid_flags&CPUID_AVX1)
00207 #define IS_INTEL_AVX2       (cpuid_flags&CPUID_AVX2)
00208 #define IS_INTEL_BMI2       (cpuid_flags&CPUID_BMI2)
00209 #define IS_INTEL_RDRAND     (cpuid_flags&CPUID_RDRAND)
00210 #define IS_INTEL_RDSEED     (cpuid_flags&CPUID_RDSEED)
00211 
00212 static word32 cpuid_check = 0 ;
00213 static word32 cpuid_flags = 0 ;
00214 
00215 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
00216     int got_intel_cpu=0;
00217     unsigned int reg[5]; 
00218     
00219     reg[4] = '\0' ;
00220     cpuid(reg, 0, 0);  
00221     if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&  
00222                 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&  
00223                 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {  
00224         got_intel_cpu = 1;  
00225     }    
00226     if (got_intel_cpu) {
00227         cpuid(reg, leaf, sub);
00228         return((reg[num]>>bit)&0x1) ;
00229     }
00230     return 0 ;
00231 }
00232 
00233 #define CHECK_SHA512 0x1
00234 #define CHECK_SHA384 0x2
00235 
00236 static int set_cpuid_flags(int sha) {  
00237     if((cpuid_check & sha) ==0) {
00238         if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
00239         if(cpuid_flag(7, 0, EBX, 5)){  cpuid_flags |= CPUID_AVX2 ; }
00240         if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
00241         if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ;  } 
00242         if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ;  }
00243         cpuid_check |= sha ;
00244         return 0 ;
00245     }
00246     return 1 ;
00247 }
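/* Feature-bit reference for set_cpuid_flags above (per the Intel SDM):
 *   leaf 1, ECX bit 28 -> AVX       leaf 7, EBX bit  5 -> AVX2
 *   leaf 7, EBX bit  8 -> BMI2      leaf 1, ECX bit 30 -> RDRAND
 *   leaf 7, EBX bit 18 -> RDSEED */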
00248 
00249 
00250 /* The Transform_AVX1/AVX2 bodies appear near the tail of this file */
00251 
00252 #if defined(HAVE_INTEL_AVX1)
00253 static int Transform_AVX1(Sha512 *sha512) ;
00254 #endif
00255 
00256 #if defined(HAVE_INTEL_AVX2)
00257 static int Transform_AVX2(Sha512 *sha512) ; 
00258 
00259 #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
00260 static int Transform_AVX1_RORX(Sha512 *sha512) ;
00261 #endif
00262 
00263 #endif
00264 
00265 static int _Transform(Sha512 *sha512) ; 
00266     
00267 static int (*Transform_p)(Sha512* sha512) = _Transform ;
00268 
00269 #define Transform(sha512) (*Transform_p)(sha512)
00270 
00271 static void set_Transform(void) {
00272      if(set_cpuid_flags(CHECK_SHA512)) return ;
00273 
00274 #if defined(HAVE_INTEL_AVX2)
00275      if(IS_INTEL_AVX2 && IS_INTEL_BMI2){ 
00276          Transform_p = Transform_AVX1_RORX; return ;
00277          Transform_p = Transform_AVX2 ; /* unreachable; kept only to
00278                   avoid an "unused function" warning for Transform_AVX2 */
00279      }
00280 #endif
00281 #if defined(HAVE_INTEL_AVX1)
00282      Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : _Transform) ; return ;
00283 #endif
00284      Transform_p = _Transform ; return ;
00285 }
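/* Transform is resolved once, from wc_InitSha512(), via set_Transform();
 * every later wc_Sha512Update/wc_Sha512Final call dispatches through the
 * Transform_p function pointer chosen there. */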
00286 
00287 #else
00288    #define Transform(sha512) _Transform(sha512)
00289 #endif
00290 
00291 /* Dummy asm with a full clobber list, forcing the compiler to save and restore the XMM/YMM registers around Transform */
00292 /* #if defined(HAVE_INTEL_AVX2)
00293  #define  SAVE_XMM_YMM   __asm__ volatile("orq %%r8, %%r8":::\
00294    "%ymm0","%ymm1","%ymm2","%ymm3","%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11",\
00295    "%ymm12","%ymm13","%ymm14","%ymm15")
00296 */
00297 #if defined(HAVE_INTEL_AVX1)
00298    #define  SAVE_XMM_YMM   __asm__ volatile("orq %%r8, %%r8":::\
00299     "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15")
00300 #else
00301 #define  SAVE_XMM_YMM
00302 #endif
00303 
00304 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00305 
00306 #include <string.h>
00307 
00308 #endif /* defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2) */
00309 
00310 
00311 #if defined(HAVE_INTEL_RORX)
00312 #define ROTR(func, bits, x) \
00313 word64 func(word64 x) {  word64 ret ;\
00314     __asm__ ("rorx $"#bits", %1, %0\n\t":"=r"(ret):"r"(x):) ;\
00315     return ret ;\
00316 }
00317 
00318 static INLINE ROTR(rotrFixed64_28, 28, x)
00319 static INLINE ROTR(rotrFixed64_34, 34, x)
00320 static INLINE ROTR(rotrFixed64_39, 39, x)
00321 static INLINE ROTR(rotrFixed64_14, 14, x)
00322 static INLINE ROTR(rotrFixed64_18, 18, x)
00323 static INLINE ROTR(rotrFixed64_41, 41, x)
00324 
00325 #define S0_RORX(x) (rotrFixed64_28(x)^rotrFixed64_34(x)^rotrFixed64_39(x))
00326 #define S1_RORX(x) (rotrFixed64_14(x)^rotrFixed64_18(x)^rotrFixed64_41(x))
00327 #endif
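/* RORX is a BMI2 rotate that leaves FLAGS untouched, so the three rotates
 * in S0_RORX/S1_RORX can issue back to back. A plain-C equivalent of, e.g.,
 * rotrFixed64_28 (for reference only, not part of the build):
 *
 *     word64 rotr64_28(word64 x) { return (x >> 28) | (x << 36); }
 */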
00328 
00329 #if defined(HAVE_BYTEREVERSE64) && !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
00330 #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
00331 #define ByteReverseWords64_1(buf, size)\
00332  { unsigned int i ;\
00333    for(i=0; i< size/sizeof(word64); i++){\
00334        __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
00335    }\
00336 }
00337 #endif
00338 
00339 
00340 int wc_InitSha512(Sha512* sha512)
00341 {
00342     sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
00343     sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
00344     sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
00345     sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
00346     sha512->digest[4] = W64LIT(0x510e527fade682d1);
00347     sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
00348     sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
00349     sha512->digest[7] = W64LIT(0x5be0cd19137e2179);
00350 
00351     sha512->buffLen = 0;
00352     sha512->loLen   = 0;
00353     sha512->hiLen   = 0;
00354     
00355 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
00356     set_Transform() ; /* choose the best Transform function for this runtime environment */
00357 #endif
00358     
00359     return 0 ;
00360 }
00361 
00362 
00363 static const word64 K512[80] = {
00364     W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
00365     W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
00366     W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
00367     W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
00368     W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
00369     W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
00370     W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
00371     W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
00372     W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
00373     W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
00374     W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
00375     W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
00376     W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
00377     W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
00378     W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
00379     W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
00380     W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
00381     W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
00382     W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
00383     W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
00384     W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
00385     W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
00386     W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
00387     W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
00388     W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
00389     W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
00390     W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
00391     W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
00392     W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
00393     W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
00394     W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
00395     W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
00396     W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
00397     W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
00398     W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
00399     W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
00400     W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
00401     W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
00402     W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
00403     W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
00404 };
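/* K512 holds the 80 round constants of FIPS 180-4: the first 64 bits of
 * the fractional parts of the cube roots of the first 80 primes. */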
00405 
00406 
00407 
00408 #define blk0(i) (W[i] = sha512->buffer[i])
00409 
00410 #define blk2(i) (W[i&15]+=s1(W[(i-2)&15])+W[(i-7)&15]+s0(W[(i-15)&15]))
00411 
00412 #define Ch(x,y,z) (z^(x&(y^z)))
00413 #define Maj(x,y,z) ((x&y)|(z&(x|y)))
00414 
00415 #define a(i) T[(0-i)&7]
00416 #define b(i) T[(1-i)&7]
00417 #define c(i) T[(2-i)&7]
00418 #define d(i) T[(3-i)&7]
00419 #define e(i) T[(4-i)&7]
00420 #define f(i) T[(5-i)&7]
00421 #define g(i) T[(6-i)&7]
00422 #define h(i) T[(7-i)&7]
00423 
00424 #define S0(x) (rotrFixed64(x,28)^rotrFixed64(x,34)^rotrFixed64(x,39))
00425 #define S1(x) (rotrFixed64(x,14)^rotrFixed64(x,18)^rotrFixed64(x,41))
00426 #define s0(x) (rotrFixed64(x,1)^rotrFixed64(x,8)^(x>>7))
00427 #define s1(x) (rotrFixed64(x,19)^rotrFixed64(x,61)^(x>>6))
00428 
00429 #define R(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk0(i));\
00430     d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
00431 
00432 #define blk384(i) (W[i] = sha384->buffer[i])
00433 
00434 #define R2(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j]+(j?blk2(i):blk384(i));\
00435     d(i)+=h(i);h(i)+=S0(a(i))+Maj(a(i),b(i),c(i))
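/* The a(i)..h(i) macros index the working variables T[0..7] with a rotating
 * offset, so rounds rename registers instead of copying eight values: the
 * slot that is a(i) in round i is read as b(i+1) in round i+1. blk0/blk2
 * realize the FIPS 180-4 message schedule in a 16-word circular buffer. */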
00436 
00437 static int _Transform(Sha512* sha512)
00438 {
00439     const word64* K = K512;
00440 
00441     word32 j;
00442     word64 T[8];
00443 
00444 
00445 #ifdef WOLFSSL_SMALL_STACK
00446     word64* W;
00447     W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
00448     if (W == NULL)
00449         return MEMORY_E;
00450 #else
00451     word64 W[16];
00452 #endif
00453 
00454     /* Copy digest to working vars */
00455     XMEMCPY(T, sha512->digest, sizeof(T));
00456 
00457 #ifdef USE_SLOW_SHA2
00458     /* less than half the code size, but about 50% slower */
00459     /* 80 operations, not unrolled */
00460     for (j = 0; j < 80; j += 16) {
00461         int m; 
00462         for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
00463             R(m);
00464         }
00465     }
00466 #else
00467     /* 80 operations, partially loop unrolled */
00468     for (j = 0; j < 80; j += 16) {
00469         R( 0); R( 1); R( 2); R( 3);
00470         R( 4); R( 5); R( 6); R( 7);
00471         R( 8); R( 9); R(10); R(11);
00472         R(12); R(13); R(14); R(15);
00473     }
00474 #endif /* USE_SLOW_SHA2 */
00475 
00476     /* Add the working vars back into digest */
00477 
00478     sha512->digest[0] += a(0);
00479     sha512->digest[1] += b(0);
00480     sha512->digest[2] += c(0);
00481     sha512->digest[3] += d(0);
00482     sha512->digest[4] += e(0);
00483     sha512->digest[5] += f(0);
00484     sha512->digest[6] += g(0);
00485     sha512->digest[7] += h(0);
00486 
00487     /* Wipe variables */
00488     ForceZero(W, sizeof(word64) * 16);
00489     ForceZero(T, sizeof(T));
00490 
00491 #ifdef WOLFSSL_SMALL_STACK
00492     XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
00493 #endif
00494 
00495     return 0;
00496 }
00497 
00498 
00499 static INLINE void AddLength(Sha512* sha512, word32 len)
00500 {
00501     word32 tmp = sha512->loLen;
00502     if ( (sha512->loLen += len) < tmp)
00503         sha512->hiLen++;                       /* carry low to high */
00504 }
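/* loLen/hiLen form one 64-bit running byte count; the carry out of the
 * 32-bit low word is propagated by hand. wc_Sha512Final converts this
 * count to bits before storing it in the final padding block. */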
00505 
00506 int wc_Sha512Update(Sha512* sha512, const byte* data, word32 len)
00507 {
00508     /* do block size increments */
00509     byte* local = (byte*)sha512->buffer;
00510     SAVE_XMM_YMM ; /* for Intel AVX */
00511 
00512     while (len) {
00513         word32 add = min(len, SHA512_BLOCK_SIZE - sha512->buffLen);
00514         XMEMCPY(&local[sha512->buffLen], data, add);
00515 
00516         sha512->buffLen += add;
00517         data         += add;
00518         len          -= add;
00519 
00520         if (sha512->buffLen == SHA512_BLOCK_SIZE) {
00521             int ret;
00522             #if defined(LITTLE_ENDIAN_ORDER)
00523                 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00524                 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 
00525                 #endif
00526                     ByteReverseWords64(sha512->buffer, sha512->buffer,
00527                                    SHA512_BLOCK_SIZE);
00528             #endif
00529             ret = Transform(sha512);
00530             if (ret != 0)
00531                 return ret;
00532 
00533             AddLength(sha512, SHA512_BLOCK_SIZE);
00534             sha512->buffLen = 0;
00535         }
00536     }
00537     return 0;
00538 }
00539 
00540 
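/* Padding per FIPS 180-4, as performed below: append the 0x80 byte, zero
 * up to SHA512_PAD_SIZE (112) bytes, then store the 128-bit bit-length in
 * the last 16 bytes of the block. If 0x80 lands past offset 112, an extra
 * all-padding block is hashed first. */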
00541 int wc_Sha512Final(Sha512* sha512, byte* hash)
00542 {
00543     byte* local = (byte*)sha512->buffer;
00544     int ret;
00545 
00546     SAVE_XMM_YMM ; /* for Intel AVX */
00547     AddLength(sha512, sha512->buffLen);               /* before adding pads */
00548 
00549     local[sha512->buffLen++] = 0x80;  /* append the single 1 bit (0x80 byte) */
00550 
00551     /* length field no longer fits: zero-fill and hash this block first */
00552     if (sha512->buffLen > SHA512_PAD_SIZE) {
00553         XMEMSET(&local[sha512->buffLen], 0, SHA512_BLOCK_SIZE - sha512->buffLen);
00554         sha512->buffLen = SHA512_BLOCK_SIZE;
00555         #if defined(LITTLE_ENDIAN_ORDER) 
00556             #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00557             if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
00558             #endif
00559             ByteReverseWords64(sha512->buffer,sha512->buffer,SHA512_BLOCK_SIZE);
00560         #endif
00561         ret = Transform(sha512);
00562         if (ret != 0)
00563             return ret;
00564 
00565         sha512->buffLen = 0;
00566     }
00567     XMEMSET(&local[sha512->buffLen], 0, SHA512_PAD_SIZE - sha512->buffLen);
00568    
00569     /* put lengths in bits */
00570     sha512->hiLen = (sha512->loLen >> (8*sizeof(sha512->loLen) - 3)) + 
00571                  (sha512->hiLen << 3);
00572     sha512->loLen = sha512->loLen << 3;
00573 
00574     /* store lengths */
00575     #if defined(LITTLE_ENDIAN_ORDER)
00576         #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00577         if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
00578         #endif
00579         ByteReverseWords64(sha512->buffer, sha512->buffer, SHA512_PAD_SIZE);
00580     #endif
00581     /* note: length word ordering follows the digest's endianness */
00582 
00583     sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
00584     sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
00585     #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
00586     if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
00587         ByteReverseWords64(&(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
00588                            &(sha512->buffer[SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
00589                            SHA512_BLOCK_SIZE - SHA512_PAD_SIZE);
00590     #endif
00591     ret = Transform(sha512);
00592     if (ret != 0)
00593         return ret;
00594 
00595     #ifdef LITTLE_ENDIAN_ORDER
00596         ByteReverseWords64(sha512->digest, sha512->digest, SHA512_DIGEST_SIZE);
00597     #endif
00598     XMEMCPY(hash, sha512->digest, SHA512_DIGEST_SIZE);
00599 
00600     return wc_InitSha512(sha512);  /* reset state */
00601 }
00602 
00603 
00604 
00605 #if defined(HAVE_INTEL_AVX1)
00606 
00607 #define Rx_1(i) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ;
00608 #define Rx_2(i) d(i)+=h(i);
00609 #define Rx_3(i) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));
00610 
00611 #if defined(HAVE_INTEL_RORX)
00612 #define Rx_RORX_1(i) h(i)+=S1_RORX(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + W_X[i] ;
00613 #define Rx_RORX_2(i) d(i)+=h(i);
00614 #define Rx_RORX_3(i) h(i)+=S0_RORX(a(i))+Maj(a(i),b(i),c(i));
00615 #endif
00616 
00617 #endif
00618 
00619 #if defined(HAVE_INTEL_AVX2) 
00620 #define Ry_1(i, w) h(i)+=S1(e(i))+Ch(e(i),f(i),g(i))+K[i+j] + w ; 
00621 #define Ry_2(i, w) d(i)+=h(i);
00622 #define Ry_3(i, w) h(i)+=S0(a(i))+Maj(a(i),b(i),c(i));
00623 #endif
00624 
00625 #if defined(HAVE_INTEL_AVX1) /* INLINE Assember for Intel AVX1 instructions */
00626 #if defined(DEBUG_XMM)
00627 
00628 #define SAVE_REG(i)     __asm__ volatile("vmovdqu %%xmm"#i", %0 \n\t":"=m"(reg[i][0])::XMM_REGs);
00629 #define RECV_REG(i)     __asm__ volatile("vmovdqu %0, %%xmm"#i" \n\t"::"m"(reg[i][0]):XMM_REGs);
00630 
00631 #define _DUMP_REG(REG, name)\
00632     { word64 buf[16] ;word64 reg[16][2];int k ;\
00633       SAVE_REG(0); SAVE_REG(1); SAVE_REG(2);  SAVE_REG(3);  SAVE_REG(4);  \
00634       SAVE_REG(5);   SAVE_REG(6); SAVE_REG(7);SAVE_REG(8); SAVE_REG(9); SAVE_REG(10);\
00635        SAVE_REG(11); SAVE_REG(12); SAVE_REG(13); SAVE_REG(14); SAVE_REG(15); \
00636       __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::XMM_REGs);\
00637       printf(" "#name":\t") ; for(k=0; k<2; k++) printf("%016lx.", (word64)(buf[k])); printf("\n") ; \
00638       RECV_REG(0); RECV_REG(1); RECV_REG(2);  RECV_REG(3);  RECV_REG(4);\
00639       RECV_REG(5);   RECV_REG(6); RECV_REG(7); RECV_REG(8); RECV_REG(9);\
00640       RECV_REG(10); RECV_REG(11); RECV_REG(12); RECV_REG(13); RECV_REG(14); RECV_REG(15);\
00641     }
00642 
00643 #define DUMP_REG(REG) _DUMP_REG(REG, #REG) 
00644 #define PRINTF(fmt, ...) 
00645 
00646 #else
00647 
00648 #define DUMP_REG(REG) 
00649 #define PRINTF(fmt, ...) 
00650 
00651 #endif
00652 
00653 #define _MOVE_to_REG(xymm, mem)       __asm__ volatile("vmovdqu %0, %%"#xymm" "\
00654         :: "m"(mem):XMM_REGs) ;
00655 #define _MOVE_to_MEM(mem,i, xymm)     __asm__ volatile("vmovdqu %%"#xymm", %0" :\
00656          "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::XMM_REGs) ;
00657 #define _MOVE(dest, src)              __asm__ volatile("vmovdqu %%"#src",  %%"\
00658         #dest" ":::XMM_REGs) ;
00659 
00660 #define _S_TEMP(dest, src, bits, temp)  __asm__ volatile("vpsrlq  $"#bits", %%"\
00661         #src", %%"#dest"\n\tvpsllq  $64-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
00662         #temp",%%"#dest", %%"#dest" ":::XMM_REGs) ;
00663 #define _AVX1_R(dest, src, bits)      __asm__ volatile("vpsrlq  $"#bits", %%"\
00664         #src", %%"#dest" ":::XMM_REGs) ;
00665 #define _XOR(dest, src1, src2)        __asm__ volatile("vpxor   %%"#src1", %%"\
00666         #src2", %%"#dest" ":::XMM_REGs) ;
00667 #define _OR(dest, src1, src2)         __asm__ volatile("vpor    %%"#src1", %%"\
00668         #src2", %%"#dest" ":::XMM_REGs) ;
00669 #define _ADD(dest, src1, src2)        __asm__ volatile("vpaddq   %%"#src1", %%"\
00670         #src2", %%"#dest" ":::XMM_REGs) ;
00671 #define _ADD_MEM(dest, src1, mem)     __asm__ volatile("vpaddq   %0, %%"#src1", %%"\
00672         #dest" "::"m"(mem):XMM_REGs) ;
00673 
00674 #define MOVE_to_REG(xymm, mem)      _MOVE_to_REG(xymm, mem)
00675 #define MOVE_to_MEM(mem, i, xymm)   _MOVE_to_MEM(mem, i, xymm)
00676 #define MOVE(dest, src)             _MOVE(dest, src)  
00677 
00678 #define XOR(dest, src1, src2)      _XOR(dest, src1, src2)
00679 #define OR(dest, src1, src2)       _OR(dest, src1, src2)
00680 #define ADD(dest, src1, src2)      _ADD(dest, src1, src2)
00681 
00682 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
00683 #define AVX1_S(dest, src, bits)      S_TMP(dest, src, bits, S_TEMP)
00684 #define AVX1_R(dest, src, bits)      _AVX1_R(dest, src, bits)
00685 
00686 #define Init_Mask(mask) \
00687      __asm__ volatile("vmovdqu %0, %%xmm1\n\t"::"m"(mask):"%xmm1") ;
00688      
00689 #define _W_from_buff1(w, buff, xmm) \
00690     /* X0..3(xmm4..7), W[0..15] = sha512->buffer[0..15];  */\
00691      __asm__ volatile("vmovdqu %1, %%"#xmm"\n\t"\
00692                       "vpshufb %%xmm1, %%"#xmm", %%"#xmm"\n\t"\
00693                       "vmovdqu %%"#xmm", %0"\
00694                       :"=m"(w): "m"(buff):"%xmm0") ;
00695 
00696 #define W_from_buff1(w, buff, xmm) _W_from_buff1(w, buff, xmm) 
00697 
00698 #define W_from_buff(w, buff)\
00699      Init_Mask(mBYTE_FLIP_MASK[0]) ;\
00700      W_from_buff1(w[0], buff[0], W_0);\
00701      W_from_buff1(w[2], buff[2], W_2);\
00702      W_from_buff1(w[4], buff[4], W_4);\
00703      W_from_buff1(w[6], buff[6], W_6);\
00704      W_from_buff1(w[8], buff[8], W_8);\
00705      W_from_buff1(w[10],buff[10],W_10);\
00706      W_from_buff1(w[12],buff[12],W_12);\
00707      W_from_buff1(w[14],buff[14],W_14);
00708                           
00709 static word64 mBYTE_FLIP_MASK[] =  { 0x0001020304050607, 0x08090a0b0c0d0e0f } ;
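/* vpshufb with this mask byte-reverses each 64-bit lane, turning the
 * little-endian input buffer into the big-endian words the SHA-512
 * schedule expects (this path replaces ByteReverseWords64). */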
00710 
00711 #define W_I_15  xmm14
00712 #define W_I_7   xmm11
00713 #define W_I_2   xmm13
00714 #define W_I     xmm12
00715 #define G_TEMP  xmm0
00716 #define S_TEMP  xmm1
00717 #define XMM_TEMP0  xmm2
00718 
00719 #define W_0     xmm12
00720 #define W_2     xmm3
00721 #define W_4     xmm4
00722 #define W_6     xmm5
00723 #define W_8     xmm6
00724 #define W_10    xmm7
00725 #define W_12    xmm8
00726 #define W_14    xmm9
00727 
00728 #define XMM_REGs
00729 
00730 #define s0_1(dest, src)      AVX1_S(dest, src, 1); 
00731 #define s0_2(dest, src)      AVX1_S(G_TEMP, src, 8); XOR(dest, G_TEMP, dest) ; 
00732 #define s0_3(dest, src)      AVX1_R(G_TEMP, src, 7);  XOR(dest, G_TEMP, dest) ;
00733 
00734 #define s1_1(dest, src)      AVX1_S(dest, src, 19);
00735 #define s1_2(dest, src)      AVX1_S(G_TEMP, src, 61); XOR(dest, G_TEMP, dest) ; 
00736 #define s1_3(dest, src)      AVX1_R(G_TEMP, src, 6); XOR(dest, G_TEMP, dest) ;
00737 
00738 #define s0_(dest, src)       s0_1(dest, src) ; s0_2(dest, src) ; s0_3(dest, src)
00739 #define s1_(dest, src)       s1_1(dest, src) ; s1_2(dest, src) ; s1_3(dest, src)
00740         
00741 #define Block_xx_1(i) \
00742     MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\
00743     MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]) ;\
00744         
00745 #define Block_xx_2(i) \
00746     MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]) ;\
00747     MOVE_to_REG(W_I,    W_X[(i)]) ;\
00748         
00749 #define Block_xx_3(i) \
00750     s0_ (XMM_TEMP0, W_I_15) ;\
00751         
00752 #define Block_xx_4(i) \
00753     ADD(W_I, W_I, XMM_TEMP0) ;\
00754     ADD(W_I, W_I, W_I_7) ;\
00755         
00756 #define Block_xx_5(i) \
00757     s1_ (XMM_TEMP0, W_I_2) ;\
00758     
00759 #define Block_xx_6(i) \
00760     ADD(W_I, W_I, XMM_TEMP0) ;\
00761     MOVE_to_MEM(W_X,i, W_I) ;\
00762     if(i==0)\
00763         MOVE_to_MEM(W_X,16, W_I) ;\
00764 
00765 #define Block_xx_7(i) \
00766     MOVE_to_REG(W_I_15, W_X[(i-15)&15]) ;\
00767     MOVE_to_REG(W_I_7,  W_X[(i- 7)&15]) ;\
00768             
00769 #define Block_xx_8(i) \
00770     MOVE_to_REG(W_I_2,  W_X[(i- 2)&15]) ;\
00771     MOVE_to_REG(W_I,    W_X[(i)]) ;\
00772 
00773 #define Block_xx_9(i) \
00774     s0_ (XMM_TEMP0, W_I_15) ;\
00775 
00776 #define Block_xx_10(i) \
00777     ADD(W_I, W_I, XMM_TEMP0) ;\
00778     ADD(W_I, W_I, W_I_7) ;\
00779 
00780 #define Block_xx_11(i) \
00781     s1_ (XMM_TEMP0, W_I_2) ;\
00782 
00783 #define Block_xx_12(i) \
00784     ADD(W_I, W_I, XMM_TEMP0) ;\
00785     MOVE_to_MEM(W_X,i, W_I) ;\
00786     if((i)==0)\
00787         MOVE_to_MEM(W_X,16, W_I) ;\
00788 
00789 static INLINE void Block_0_1(word64 *W_X) { Block_xx_1(0) ; }
00790 static INLINE void Block_0_2(word64 *W_X) { Block_xx_2(0) ; }
00791 static INLINE void Block_0_3(void) { Block_xx_3(0) ; }
00792 static INLINE void Block_0_4(void) { Block_xx_4(0) ; }
00793 static INLINE void Block_0_5(void) { Block_xx_5(0) ; }
00794 static INLINE void Block_0_6(word64 *W_X) { Block_xx_6(0) ; }
00795 static INLINE void Block_0_7(word64 *W_X) { Block_xx_7(2) ; }
00796 static INLINE void Block_0_8(word64 *W_X) { Block_xx_8(2) ; }
00797 static INLINE void Block_0_9(void) { Block_xx_9(2) ; }
00798 static INLINE void Block_0_10(void){ Block_xx_10(2) ; }
00799 static INLINE void Block_0_11(void){ Block_xx_11(2) ; }
00800 static INLINE void Block_0_12(word64 *W_X){ Block_xx_12(2) ; }
00801 
00802 static INLINE void Block_4_1(word64 *W_X) { Block_xx_1(4) ; }
00803 static INLINE void Block_4_2(word64 *W_X) { Block_xx_2(4) ; }
00804 static INLINE void Block_4_3(void) { Block_xx_3(4) ; }
00805 static INLINE void Block_4_4(void) { Block_xx_4(4) ; }
00806 static INLINE void Block_4_5(void) { Block_xx_5(4) ; }
00807 static INLINE void Block_4_6(word64 *W_X) { Block_xx_6(4) ; }
00808 static INLINE void Block_4_7(word64 *W_X) { Block_xx_7(6) ; }
00809 static INLINE void Block_4_8(word64 *W_X) { Block_xx_8(6) ; }
00810 static INLINE void Block_4_9(void) { Block_xx_9(6) ; }
00811 static INLINE void Block_4_10(void){ Block_xx_10(6) ; }
00812 static INLINE void Block_4_11(void){ Block_xx_11(6) ; }
00813 static INLINE void Block_4_12(word64 *W_X){ Block_xx_12(6) ; }
00814 
00815 static INLINE void Block_8_1(word64 *W_X) { Block_xx_1(8) ; }
00816 static INLINE void Block_8_2(word64 *W_X) { Block_xx_2(8) ; }
00817 static INLINE void Block_8_3(void) { Block_xx_3(8) ; }
00818 static INLINE void Block_8_4(void) { Block_xx_4(8) ; }
00819 static INLINE void Block_8_5(void) { Block_xx_5(8) ; }
00820 static INLINE void Block_8_6(word64 *W_X) { Block_xx_6(8) ; }
00821 static INLINE void Block_8_7(word64 *W_X) { Block_xx_7(10) ; }
00822 static INLINE void Block_8_8(word64 *W_X) { Block_xx_8(10) ; }
00823 static INLINE void Block_8_9(void) { Block_xx_9(10) ; }
00824 static INLINE void Block_8_10(void){ Block_xx_10(10) ; }
00825 static INLINE void Block_8_11(void){ Block_xx_11(10) ; }
00826 static INLINE void Block_8_12(word64 *W_X){ Block_xx_12(10) ; }
00827 
00828 static INLINE void Block_12_1(word64 *W_X) { Block_xx_1(12) ; }
00829 static INLINE void Block_12_2(word64 *W_X) { Block_xx_2(12) ; }
00830 static INLINE void Block_12_3(void) { Block_xx_3(12) ; }
00831 static INLINE void Block_12_4(void) { Block_xx_4(12) ; }
00832 static INLINE void Block_12_5(void) { Block_xx_5(12) ; }
00833 static INLINE void Block_12_6(word64 *W_X) { Block_xx_6(12) ; }
00834 static INLINE void Block_12_7(word64 *W_X) { Block_xx_7(14) ; }
00835 static INLINE void Block_12_8(word64 *W_X) { Block_xx_8(14) ; }
00836 static INLINE void Block_12_9(void) { Block_xx_9(14) ; }
00837 static INLINE void Block_12_10(void){ Block_xx_10(14) ; }
00838 static INLINE void Block_12_11(void){ Block_xx_11(14) ; }
00839 static INLINE void Block_12_12(word64 *W_X){ Block_xx_12(14) ; }
00840 
00841 #endif
00842 
00843 #if defined(HAVE_INTEL_AVX2)
00844 static const unsigned long mBYTE_FLIP_MASK_Y[] =
00845    { 0x0001020304050607, 0x08090a0b0c0d0e0f, 0x0001020304050607, 0x08090a0b0c0d0e0f } ;
00846 
00847 #define W_from_buff_Y(buff)\
00848     { /* X0..3(ymm9..12), W_X[0..15] = sha512->buffer[0..15];  */\
00849      __asm__ volatile("vmovdqu %0, %%ymm8\n\t"::"m"(mBYTE_FLIP_MASK_Y[0]):YMM_REGs) ;\
00850      __asm__ volatile("vmovdqu %0, %%ymm12\n\t"\
00851                       "vmovdqu %1, %%ymm4\n\t"\
00852                       "vpshufb %%ymm8, %%ymm12, %%ymm12\n\t"\
00853                       "vpshufb %%ymm8, %%ymm4, %%ymm4\n\t"\
00854                       :: "m"(buff[0]),  "m"(buff[4]):YMM_REGs) ;\
00855      __asm__ volatile("vmovdqu %0, %%ymm5\n\t"\
00856                       "vmovdqu %1, %%ymm6\n\t"\
00857                       "vpshufb %%ymm8, %%ymm5, %%ymm5\n\t"\
00858                       "vpshufb %%ymm8, %%ymm6, %%ymm6\n\t"\
00859                       :: "m"(buff[8]),  "m"(buff[12]):YMM_REGs) ;\
00860     }
00861 
00862 #if defined(DEBUG_YMM)
00863 
00864 #define SAVE_REG_Y(i) __asm__ volatile("vmovdqu %%ymm"#i", %0 \n\t":"=m"(reg[i-4][0])::YMM_REGs);
00865 #define RECV_REG_Y(i) __asm__ volatile("vmovdqu %0, %%ymm"#i" \n\t"::"m"(reg[i-4][0]):YMM_REGs);
00866 
00867 #define _DUMP_REG_Y(REG, name)\
00868     { word64 buf[16] ;word64 reg[16][2];int k ;\
00869       SAVE_REG_Y(4);  SAVE_REG_Y(5);   SAVE_REG_Y(6); SAVE_REG_Y(7); \
00870       SAVE_REG_Y(8); SAVE_REG_Y(9); SAVE_REG_Y(10); SAVE_REG_Y(11); SAVE_REG_Y(12);\
00871       SAVE_REG_Y(13); SAVE_REG_Y(14); SAVE_REG_Y(15); \
00872       __asm__ volatile("vmovdqu %%"#REG", %0 \n\t":"=m"(buf[0])::YMM_REGs);\
00873       printf(" "#name":\t") ; for(k=0; k<4; k++) printf("%016lx.", (word64)buf[k]) ; printf("\n") ; \
00874       RECV_REG_Y(4);  RECV_REG_Y(5);   RECV_REG_Y(6); RECV_REG_Y(7); \
00875       RECV_REG_Y(8); RECV_REG_Y(9); RECV_REG_Y(10); RECV_REG_Y(11); RECV_REG_Y(12); \
00876       RECV_REG_Y(13); RECV_REG_Y(14); RECV_REG_Y(15);\
00877     }
00878 
00879 #define DUMP_REG_Y(REG) _DUMP_REG_Y(REG, #REG) 
00880 #define DUMP_REG2_Y(REG) _DUMP_REG_Y(REG, #REG) 
00881 #define PRINTF_Y(fmt, ...) 
00882 
00883 #else
00884 
00885 #define DUMP_REG_Y(REG) 
00886 #define DUMP_REG2_Y(REG)
00887 #define PRINTF_Y(fmt, ...) 
00888 
00889 #endif
00890 
00891 #define _MOVE_to_REGy(ymm, mem)         __asm__ volatile("vmovdqu %0, %%"#ymm" "\
00892                                         :: "m"(mem):YMM_REGs) ;
00893 #define _MOVE_to_MEMy(mem,i, ymm)       __asm__ volatile("vmovdqu %%"#ymm", %0" \
00894         : "=m"(mem[i]),"=m"(mem[i+1]),"=m"(mem[i+2]),"=m"(mem[i+3])::YMM_REGs) ;
00895 #define _MOVE_128y(ymm0, ymm1, ymm2, map)  __asm__ volatile("vperm2i128  $"\
00896         #map", %%"#ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
00897 #define _S_TEMPy(dest, src, bits, temp) \
00898          __asm__ volatile("vpsrlq  $"#bits", %%"#src", %%"#dest"\n\tvpsllq  $64-"#bits\
00899         ", %%"#src", %%"#temp"\n\tvpor %%"#temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
00900 #define _AVX2_R(dest, src, bits)        __asm__ volatile("vpsrlq  $"#bits", %%"\
00901          #src", %%"#dest" ":::YMM_REGs) ;
00902 #define _XORy(dest, src1, src2)         __asm__ volatile("vpxor   %%"#src1", %%"\
00903          #src2", %%"#dest" ":::YMM_REGs) ;
00904 #define _ADDy(dest, src1, src2)         __asm__ volatile("vpaddq   %%"#src1", %%"\
00905          #src2", %%"#dest" ":::YMM_REGs) ;
00906 #define _BLENDy(map, dest, src1, src2)  __asm__ volatile("vpblendd    $"#map", %%"\
00907          #src1",   %%"#src2", %%"#dest" ":::YMM_REGs) ;
00908 #define _BLENDQy(map, dest, src1, src2) __asm__ volatile("vblendpd   $"#map", %%"\
00909          #src1",   %%"#src2", %%"#dest" ":::YMM_REGs) ;
00910 #define _PERMQy(map, dest, src)         __asm__ volatile("vpermq  $"#map", %%"\
00911          #src", %%"#dest" ":::YMM_REGs) ;
00912 
00913 #define MOVE_to_REGy(ymm, mem)      _MOVE_to_REGy(ymm, mem)
00914 #define MOVE_to_MEMy(mem, i, ymm)   _MOVE_to_MEMy(mem, i, ymm)
00915 
00916 #define MOVE_128y(ymm0, ymm1, ymm2, map) _MOVE_128y(ymm0, ymm1, ymm2, map) 
00917 #define XORy(dest, src1, src2)      _XORy(dest, src1, src2)
00918 #define ADDy(dest, src1, src2)      _ADDy(dest, src1, src2)
00919 #define BLENDy(map, dest, src1, src2) _BLENDy(map, dest, src1, src2)
00920 #define BLENDQy(map, dest, src1, src2) _BLENDQy(map, dest, src1, src2)
00921 #define PERMQy(map, dest, src)      _PERMQy(map, dest, src)
00922 
00923 
00924 #define S_TMPy(dest, src, bits, temp) _S_TEMPy(dest, src, bits, temp);
00925 #define AVX2_S(dest, src, bits)      S_TMPy(dest, src, bits, S_TEMPy)
00926 #define AVX2_R(dest, src, bits)      _AVX2_R(dest, src, bits)
00927 
00928 
00929 #define    FEEDBACK1_to_W_I_2(w_i_2, w_i)    MOVE_128y(YMM_TEMP0, w_i, w_i, 0x08) ;\
00930                                        BLENDy(0xf0, w_i_2, YMM_TEMP0, w_i_2) ; 
00931 
00932 #define    MOVE_W_to_W_I_15(w_i_15, w_0, w_4)  BLENDQy(0x1, w_i_15, w_4, w_0) ;\
00933                                        PERMQy(0x39, w_i_15, w_i_15) ;
00934 #define    MOVE_W_to_W_I_7(w_i_7,  w_8, w_12)  BLENDQy(0x1, w_i_7, w_12, w_8) ;\
00935                                        PERMQy(0x39, w_i_7, w_i_7) ; 
00936 #define    MOVE_W_to_W_I_2(w_i_2,  w_12)       BLENDQy(0xc, w_i_2, w_12, w_i_2) ;\
00937                                        PERMQy(0x0e, w_i_2, w_i_2) ;
00938 
00939 
00940 #define W_I_16y  ymm8
00941 #define W_I_15y  ymm9
00942 #define W_I_7y  ymm10
00943 #define W_I_2y  ymm11
00944 #define W_Iy    ymm12
00945 #define G_TEMPy     ymm13
00946 #define S_TEMPy     ymm14
00947 #define YMM_TEMP0  ymm15
00948 #define YMM_TEMP0x xmm15
00949 #define W_I_TEMPy   ymm7
00950 #define W_K_TEMPy   ymm15
00951 #define W_K_TEMPx  xmm15
00952 #define W_0y     ymm12
00953 #define W_4y     ymm4
00954 #define W_8y     ymm5
00955 #define W_12y    ymm6
00956 
00957 #define YMM_REGs
00958 /* Registers are saved in Sha512Update/Final */
00959                  /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
00960 
00961 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
00962     __asm__ volatile("vperm2i128  $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
00963     __asm__ volatile("vpblendd    $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\
00964     __asm__ volatile("vperm2i128 $0x01,  %%"#w_i_7",  %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
00965     __asm__ volatile("vpblendd    $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
00966     __asm__ volatile("vpshufd    $0x93,  %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
00967 
00968 #define MOVE_7_to_15(w_i_15, w_i_7)\
00969     __asm__ volatile("vmovdqu                 %%"#w_i_7",  %%"#w_i_15" ":::YMM_REGs) ;\
00970 
00971 #define MOVE_I_to_7(w_i_7, w_i)\
00972     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i",   %%"#w_i",   %%"#w_i_7" ":::YMM_REGs) ;\
00973     __asm__ volatile("vpblendd    $0x01,       %%"#w_i_7",   %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
00974     __asm__ volatile("vpshufd    $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
00975 
00976 #define MOVE_I_to_2(w_i_2, w_i)\
00977     __asm__ volatile("vperm2i128 $0x01,       %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
00978     __asm__ volatile("vpshufd    $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
00979 
00980 #endif
00981 
00982 
00983 /***  Transform Body ***/
00984 #if defined(HAVE_INTEL_AVX1)
00985 
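/* "Stitching": each scalar round step Rx_* below is interleaved with one
 * piece of the vectorized message schedule (Block_*), so integer round
 * work and XMM schedule work overlap instead of running back to back. */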
00986 static int Transform_AVX1(Sha512* sha512)
00987 {
00988     const word64* K = K512;
00989     word64 W_X[16+4];
00990     word32 j;
00991     word64 T[8];
00992     /* Copy digest to working vars */
00993     XMEMCPY(T, sha512->digest, sizeof(T));
00994 
00995     W_from_buff(W_X, sha512->buffer) ;
00996     for (j = 0; j < 80; j += 16) {
00997         Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3(); 
00998         Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X); 
00999         Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
01000         Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);   
01001         
01002         Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3(); 
01003         Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X); 
01004         Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
01005         Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);   
01006         
01007         Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3(); 
01008         Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X); 
01009         Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
01010         Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);   
01011         
01012         Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3(); 
01013         Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X); 
01014         Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
01015         Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);     
01016     }
01017 
01018     /* Add the working vars back into digest */
01019 
01020     sha512->digest[0] += a(0);
01021     sha512->digest[1] += b(0);
01022     sha512->digest[2] += c(0);
01023     sha512->digest[3] += d(0);
01024     sha512->digest[4] += e(0);
01025     sha512->digest[5] += f(0);
01026     sha512->digest[6] += g(0);
01027     sha512->digest[7] += h(0);
01028 
01029     /* Wipe variables */
01030     #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2)
01031     XMEMSET(W_X, 0, sizeof(word64) * 16);
01032     #endif
01033     XMEMSET(T, 0, sizeof(T));
01034 
01035     return 0;
01036 }
01037 
01038 #endif
01039 
01040 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
01041 
01042 static int Transform_AVX1_RORX(Sha512* sha512)
01043 {
01044     const word64* K = K512;
01045     word64 W_X[16+4];
01046     word32 j;
01047     word64 T[8];
01048     /* Copy digest to working vars */
01049     XMEMCPY(T, sha512->digest, sizeof(T));
01050 
01051     W_from_buff(W_X, sha512->buffer) ;
01052     for (j = 0; j < 80; j += 16) {
01053         Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); Block_0_2(W_X); 
01054                                     Rx_RORX_3( 0); Block_0_3(); 
01055         Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); Block_0_5(); 
01056                                     Rx_RORX_3( 1); Block_0_6(W_X); 
01057         Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); Block_0_8(W_X); 
01058                                     Rx_RORX_3( 2); Block_0_9();
01059         Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); Block_0_11();
01060                                     Rx_RORX_3( 3); Block_0_12(W_X);   
01061         
01062         Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); Block_4_2(W_X); 
01063                                     Rx_RORX_3( 4); Block_4_3(); 
01064         Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); Block_4_5(); 
01065                                     Rx_RORX_3( 5); Block_4_6(W_X); 
01066         Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); Block_4_8(W_X); 
01067                                     Rx_RORX_3( 6); Block_4_9();
01068         Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); Block_4_11();
01069                                     Rx_RORX_3( 7); Block_4_12(W_X);   
01070         
01071         Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); Block_8_2(W_X); 
01072                                     Rx_RORX_3( 8); Block_8_3(); 
01073         Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); Block_8_5(); 
01074                                     Rx_RORX_3( 9); Block_8_6(W_X); 
01075         Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); Block_8_8(W_X); 
01076                                     Rx_RORX_3(10); Block_8_9();
01077         Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); Block_8_11();
01078                                     Rx_RORX_3(11); Block_8_12(W_X);   
01079         
01080         Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12); Block_12_2(W_X); 
01081                                      Rx_RORX_3(12); Block_12_3(); 
01082         Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); Block_12_5(); 
01083                                      Rx_RORX_3(13); Block_12_6(W_X); 
01084         Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); Block_12_8(W_X); 
01085                                      Rx_RORX_3(14); Block_12_9();
01086         Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); Block_12_11();
01087                                      Rx_RORX_3(15); Block_12_12(W_X);     
01088     }
01089     /* Add the working vars back into digest */
01090 
01091     sha512->digest[0] += a(0);
01092     sha512->digest[1] += b(0);
01093     sha512->digest[2] += c(0);
01094     sha512->digest[3] += d(0);
01095     sha512->digest[4] += e(0);
01096     sha512->digest[5] += f(0);
01097     sha512->digest[6] += g(0);
01098     sha512->digest[7] += h(0);
01099 
01100     /* Wipe variables */
01101     #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2)
01102     XMEMSET(W_X, 0, sizeof(word64) * 16);
01103     #endif
01104     XMEMSET(T, 0, sizeof(T));
01105 
01106     return 0;
01107 }
01108 #endif
01109 
01110 #if defined(HAVE_INTEL_AVX2)
01111 
01112 #define s0_1y(dest, src)      AVX2_S(dest, src, 1); 
01113 #define s0_2y(dest, src)      AVX2_S(G_TEMPy, src, 8); XORy(dest, G_TEMPy, dest) ; 
01114 #define s0_3y(dest, src)      AVX2_R(G_TEMPy, src, 7);  XORy(dest, G_TEMPy, dest) ;
01115 
01116 #define s1_1y(dest, src)      AVX2_S(dest, src, 19);
01117 #define s1_2y(dest, src)      AVX2_S(G_TEMPy, src, 61); XORy(dest, G_TEMPy, dest) ; 
01118 #define s1_3y(dest, src)      AVX2_R(G_TEMPy, src, 6); XORy(dest, G_TEMPy, dest) ;
01119 
01120 #define s0_y(dest, src)       s0_1y(dest, src) ; s0_2y(dest, src) ; s0_3y(dest, src)
01121 #define s1_y(dest, src)       s1_1y(dest, src) ; s1_2y(dest, src) ; s1_3y(dest, src)
01122 
01123 #define blk384(i) (W[i] = sha384->buffer[i])
01124 
01125 
01126 #define Block_Y_xx_1(i, w_0, w_4, w_8, w_12)\
01127     MOVE_W_to_W_I_15(W_I_15y, w_0, w_4) ;\
01128     MOVE_W_to_W_I_7 (W_I_7y,  w_8, w_12) ;\
01129     MOVE_W_to_W_I_2 (W_I_2y,  w_12) ;\
01130 
01131 #define Block_Y_xx_2(i, w_0, w_4, w_8, w_12)\
01132     s0_1y (YMM_TEMP0, W_I_15y) ;\
01133 
01134 #define Block_Y_xx_3(i, w_0, w_4, w_8, w_12)\
01135     s0_2y (YMM_TEMP0, W_I_15y) ;\
01136 
01137 #define Block_Y_xx_4(i, w_0, w_4, w_8, w_12)\
01138     s0_3y (YMM_TEMP0, W_I_15y) ;\
01139 
01140 #define Block_Y_xx_5(i, w_0, w_4, w_8, w_12)\
01141     ADDy(W_I_TEMPy, w_0, YMM_TEMP0) ;\
01142 
01143 #define Block_Y_xx_6(i, w_0, w_4, w_8, w_12)\
01144     ADDy(W_I_TEMPy, W_I_TEMPy, W_I_7y) ;\
01145     s1_1y (YMM_TEMP0, W_I_2y) ;\
01146 
01147 #define Block_Y_xx_7(i, w_0, w_4, w_8, w_12)\
01148     s1_2y (YMM_TEMP0, W_I_2y) ;\
01149 
01150 #define Block_Y_xx_8(i, w_0, w_4, w_8, w_12)\
01151     s1_3y (YMM_TEMP0, W_I_2y) ;\
01152     ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\
01153 
01154 #define Block_Y_xx_9(i, w_0, w_4, w_8, w_12)\
01155     FEEDBACK1_to_W_I_2(W_I_2y, w_0) ;\
01156 
01157 #define Block_Y_xx_10(i, w_0, w_4, w_8, w_12) \
01158     s1_1y (YMM_TEMP0, W_I_2y) ;\
01159 
01160 #define Block_Y_xx_11(i, w_0, w_4, w_8, w_12) \
01161     s1_2y (YMM_TEMP0, W_I_2y) ;\
01162 
01163 #define Block_Y_xx_12(i, w_0, w_4, w_8, w_12)\
01164     s1_3y (YMM_TEMP0, W_I_2y) ;\
01165     ADDy(w_0, W_I_TEMPy, YMM_TEMP0) ;\
01166     MOVE_to_MEMy(w,0, w_4) ;\
01167 
01168 
01169 static INLINE void Block_Y_0_1(void) { Block_Y_xx_1(0, W_0y, W_4y, W_8y, W_12y) ; }
01170 static INLINE void Block_Y_0_2(void) { Block_Y_xx_2(0, W_0y, W_4y, W_8y, W_12y) ; }
01171 static INLINE void Block_Y_0_3(void) { Block_Y_xx_3(0, W_0y, W_4y, W_8y, W_12y) ; }
01172 static INLINE void Block_Y_0_4(void) { Block_Y_xx_4(0, W_0y, W_4y, W_8y, W_12y) ; }
01173 static INLINE void Block_Y_0_5(void) { Block_Y_xx_5(0, W_0y, W_4y, W_8y, W_12y) ; }
01174 static INLINE void Block_Y_0_6(void) { Block_Y_xx_6(0, W_0y, W_4y, W_8y, W_12y) ; }
01175 static INLINE void Block_Y_0_7(void) { Block_Y_xx_7(0, W_0y, W_4y, W_8y, W_12y) ; }
01176 static INLINE void Block_Y_0_8(void) { Block_Y_xx_8(0, W_0y, W_4y, W_8y, W_12y) ; }
01177 static INLINE void Block_Y_0_9(void) { Block_Y_xx_9(0, W_0y, W_4y, W_8y, W_12y) ; }
01178 static INLINE void Block_Y_0_10(void){ Block_Y_xx_10(0, W_0y, W_4y, W_8y, W_12y) ; }
01179 static INLINE void Block_Y_0_11(void){ Block_Y_xx_11(0, W_0y, W_4y, W_8y, W_12y) ; }
01180 static INLINE void Block_Y_0_12(word64 *w){ Block_Y_xx_12(0, W_0y, W_4y, W_8y, W_12y) ; }
01181 
01182 static INLINE void Block_Y_4_1(void) { Block_Y_xx_1(4, W_4y, W_8y, W_12y, W_0y) ; }
01183 static INLINE void Block_Y_4_2(void) { Block_Y_xx_2(4, W_4y, W_8y, W_12y, W_0y) ; }
01184 static INLINE void Block_Y_4_3(void) { Block_Y_xx_3(4, W_4y, W_8y, W_12y, W_0y) ; }
01185 static INLINE void Block_Y_4_4(void) { Block_Y_xx_4(4, W_4y, W_8y, W_12y, W_0y) ; }
01186 static INLINE void Block_Y_4_5(void) { Block_Y_xx_5(4, W_4y, W_8y, W_12y, W_0y) ; }
01187 static INLINE void Block_Y_4_6(void) { Block_Y_xx_6(4, W_4y, W_8y, W_12y, W_0y) ; }
01188 static INLINE void Block_Y_4_7(void) { Block_Y_xx_7(4, W_4y, W_8y, W_12y, W_0y) ; }
01189 static INLINE void Block_Y_4_8(void) { Block_Y_xx_8(4, W_4y, W_8y, W_12y, W_0y) ; }
01190 static INLINE void Block_Y_4_9(void) { Block_Y_xx_9(4, W_4y, W_8y, W_12y, W_0y) ; }
01191 static INLINE void Block_Y_4_10(void) { Block_Y_xx_10(4, W_4y, W_8y, W_12y, W_0y) ; }
01192 static INLINE void Block_Y_4_11(void) { Block_Y_xx_11(4, W_4y, W_8y, W_12y, W_0y) ; }
01193 static INLINE void Block_Y_4_12(word64 *w) { Block_Y_xx_12(4, W_4y, W_8y, W_12y, W_0y) ; }
01194 
01195 static INLINE void Block_Y_8_1(void) { Block_Y_xx_1(8, W_8y, W_12y, W_0y, W_4y) ; }
01196 static INLINE void Block_Y_8_2(void) { Block_Y_xx_2(8, W_8y, W_12y, W_0y, W_4y) ; }
01197 static INLINE void Block_Y_8_3(void) { Block_Y_xx_3(8, W_8y, W_12y, W_0y, W_4y) ; }
01198 static INLINE void Block_Y_8_4(void) { Block_Y_xx_4(8, W_8y, W_12y, W_0y, W_4y) ; }
01199 static INLINE void Block_Y_8_5(void) { Block_Y_xx_5(8, W_8y, W_12y, W_0y, W_4y) ; }
01200 static INLINE void Block_Y_8_6(void) { Block_Y_xx_6(8, W_8y, W_12y, W_0y, W_4y) ; }
01201 static INLINE void Block_Y_8_7(void) { Block_Y_xx_7(8, W_8y, W_12y, W_0y, W_4y) ; }
01202 static INLINE void Block_Y_8_8(void) { Block_Y_xx_8(8, W_8y, W_12y, W_0y, W_4y) ; }
01203 static INLINE void Block_Y_8_9(void) { Block_Y_xx_9(8, W_8y, W_12y, W_0y, W_4y) ; }
01204 static INLINE void Block_Y_8_10(void) { Block_Y_xx_10(8, W_8y, W_12y, W_0y, W_4y) ; }
01205 static INLINE void Block_Y_8_11(void) { Block_Y_xx_11(8, W_8y, W_12y, W_0y, W_4y) ; }
01206 static INLINE void Block_Y_8_12(word64 *w) { Block_Y_xx_12(8, W_8y, W_12y, W_0y, W_4y) ; }
01207 
01208 static INLINE void Block_Y_12_1(void) { Block_Y_xx_1(12, W_12y, W_0y, W_4y, W_8y) ; }
01209 static INLINE void Block_Y_12_2(void) { Block_Y_xx_2(12, W_12y, W_0y, W_4y, W_8y) ; }
01210 static INLINE void Block_Y_12_3(void) { Block_Y_xx_3(12, W_12y, W_0y, W_4y, W_8y) ; }
01211 static INLINE void Block_Y_12_4(void) { Block_Y_xx_4(12, W_12y, W_0y, W_4y, W_8y) ; }
01212 static INLINE void Block_Y_12_5(void) { Block_Y_xx_5(12, W_12y, W_0y, W_4y, W_8y) ; }
01213 static INLINE void Block_Y_12_6(void) { Block_Y_xx_6(12, W_12y, W_0y, W_4y, W_8y) ; }
01214 static INLINE void Block_Y_12_7(void) { Block_Y_xx_7(12, W_12y, W_0y, W_4y, W_8y) ; }
01215 static INLINE void Block_Y_12_8(void) { Block_Y_xx_8(12, W_12y, W_0y, W_4y, W_8y) ; }
01216 static INLINE void Block_Y_12_9(void) { Block_Y_xx_9(12, W_12y, W_0y, W_4y, W_8y) ; }
01217 static INLINE void Block_Y_12_10(void) { Block_Y_xx_10(12, W_12y, W_0y, W_4y, W_8y) ; }
01218 static INLINE void Block_Y_12_11(void) { Block_Y_xx_11(12, W_12y, W_0y, W_4y, W_8y) ; }
01219 static INLINE void Block_Y_12_12(word64 *w) { Block_Y_xx_12(12, W_12y, W_0y, W_4y, W_8y) ; }
01220 
01221 
01222 static int Transform_AVX2(Sha512* sha512)
01223 {
01224     const word64* K = K512;
01225     word64 w[4] ;
01226     word32 j /*, k*/;
01227     word64 T[8];
01228     /* Copy digest to working vars */
01229     XMEMCPY(T, sha512->digest, sizeof(T));
01230 
01231     W_from_buff_Y(sha512->buffer) ;
01232     MOVE_to_MEMy(w,0, W_0y) ; 
01233     for (j = 0; j < 80; j += 16) {
01234         Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); Block_Y_0_2(); 
01235                                        Ry_3( 0, w[0]); Block_Y_0_3(); 
01236         Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); Block_Y_0_5(); 
01237                                        Ry_3( 1, w[1]); Block_Y_0_6();  
01238         Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); Block_Y_0_8(); 
01239                                        Ry_3( 2, w[2]); Block_Y_0_9();
01240         Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); Block_Y_0_11();
01241                                        Ry_3( 3, w[3]); Block_Y_0_12(w);
01242         
01243         Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); Block_Y_4_2(); 
01244                                        Ry_3( 4, w[0]); Block_Y_4_3(); 
01245         Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]); Block_Y_4_5(); 
01246                                        Ry_3( 5, w[1]); Block_Y_4_6();
01247         Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); Block_Y_4_8(); 
01248                                        Ry_3( 6, w[2]); Block_Y_4_9();
01249         Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);Block_Y_4_11(); 
01250                                         Ry_3( 7, w[3]);Block_Y_4_12(w);  
01251         
01252         Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); Block_Y_8_2(); 
01253                                        Ry_3( 8, w[0]); Block_Y_8_3();
01254         Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]); Block_Y_8_5(); 
01255                                        Ry_3( 9, w[1]); Block_Y_8_6();
01256         Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); Block_Y_8_8(); 
01257                                        Ry_3(10, w[2]); Block_Y_8_9(); 
01258         Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); Block_Y_8_11();
01259                                        Ry_3(11, w[3]); Block_Y_8_12(w);
01260                  
01261         Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); Block_Y_12_2(); 
01262                                         Ry_3(12, w[0]); Block_Y_12_3();
01263         Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); Block_Y_12_5(); 
01264                                         Ry_3(13, w[1]); Block_Y_12_6(); 
01265         Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); Block_Y_12_8(); 
01266                                         Ry_3(14, w[2]); Block_Y_12_9();
01267         Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); Block_Y_12_11();
01268                                         Ry_3(15, w[3]);Block_Y_12_12(w);
01269     }
01270  
01271     /* Add the working vars back into digest */
01272 
01273     sha512->digest[0] += a(0);
01274     sha512->digest[1] += b(0);
01275     sha512->digest[2] += c(0);
01276     sha512->digest[3] += d(0);
01277     sha512->digest[4] += e(0);
01278     sha512->digest[5] += f(0);
01279     sha512->digest[6] += g(0);
01280     sha512->digest[7] += h(0);
01281 
01282     /* Wipe variables */
01283     #if !defined(HAVE_INTEL_AVX1)&&!defined(HAVE_INTEL_AVX2)
01284     XMEMSET(w, 0, sizeof(w));
01285     #endif
01286     XMEMSET(T, 0, sizeof(T));
01287 
01288     return 0;
01289 }
01290 
01291 #endif
01292 
01293 
01294 #ifdef WOLFSSL_SHA384
01295 
01296 #if defined(HAVE_INTEL_AVX1) ||  defined(HAVE_INTEL_AVX2) 
01297 
01298 #if defined(HAVE_INTEL_AVX1)
01299 static int Transform384_AVX1(Sha384 *sha384) ;
01300 #endif
01301 #if defined(HAVE_INTEL_AVX2)
01302 static int Transform384_AVX2(Sha384 *sha384) ; 
01303 #endif
01304 
01305 #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
01306 static int Transform384_AVX1_RORX(Sha384 *sha384) ; 
01307 #endif
01308 
01309 static int _Transform384(Sha384 *sha384) ; 
01310 static int (*Transform384_p)(Sha384* sha384) = _Transform384 ;
01311 
01312 #define Transform384(sha384) (*Transform384_p)(sha384)
01313 static void set_Transform384(void) {
01314      if(set_cpuid_flags(CHECK_SHA384))return ;
01315 
01316 #if defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
01317      Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
01318 #elif defined(HAVE_INTEL_AVX2)
01319      #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_RORX)
01320      if(IS_INTEL_AVX2 && IS_INTEL_BMI2) { Transform384_p = Transform384_AVX1_RORX ; return ; }
01321      #endif
01322      if(IS_INTEL_AVX2) { Transform384_p = Transform384_AVX2 ; return ; }
01323      #if defined(HAVE_INTEL_AVX1)
01324      Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
01325      #endif
01326 #else
01327      Transform384_p = ((IS_INTEL_AVX1) ? Transform384_AVX1 : _Transform384) ;
01328 #endif
01329 }
01330 
01331 #else
01332    #define Transform384(sha384) _Transform384(sha384)
01333 #endif
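/* The dispatch machinery above binds Transform384 to the fastest available
 * implementation once, at init time, so each processed block pays only a
 * single indirect call.  A minimal sketch of the same pattern, using
 * hypothetical names that are not part of wolfSSL: */
#ifdef SHA512_DOC_EXAMPLE   /* illustrative only; never defined by wolfSSL */
static int demo_transform_c(void)    { return 0; }  /* portable fallback */
static int demo_transform_simd(void) { return 0; }  /* SIMD fast path    */
static int (*demo_transform_p)(void) = demo_transform_c;   /* safe default */

static void demo_select(int cpu_has_simd)
{
    demo_transform_p = cpu_has_simd ? demo_transform_simd : demo_transform_c;
}
#define DEMO_TRANSFORM() (*demo_transform_p)()
#endif /* SHA512_DOC_EXAMPLE */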
01334 
01335 int wc_InitSha384(Sha384* sha384)
01336 {
01337     sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
01338     sha384->digest[1] = W64LIT(0x629a292a367cd507);
01339     sha384->digest[2] = W64LIT(0x9159015a3070dd17);
01340     sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
01341     sha384->digest[4] = W64LIT(0x67332667ffc00b31);
01342     sha384->digest[5] = W64LIT(0x8eb44a8768581511);
01343     sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
01344     sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
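    /* These eight words are the SHA-384 initial hash value from FIPS 180-4:
     * the first 64 bits of the fractional parts of the square roots of the
     * ninth through sixteenth primes (23 through 53). */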
01345 
01346     sha384->buffLen = 0;
01347     sha384->loLen   = 0;
01348     sha384->hiLen   = 0;
01349 
01350 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
01351     set_Transform384() ;
01352 #endif
01353     
01354     return 0;
01355 }
01356 
01357 static int _Transform384(Sha384* sha384)
01358 {
01359     const word64* K = K512;
01360 
01361     word32 j;
01362     word64 T[8];
01363 
01364 #ifdef WOLFSSL_SMALL_STACK
01365     word64* W;
01366 
01367     W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01368     if (W == NULL)
01369         return MEMORY_E;
01370 #else
01371     word64 W[16];
01372 #endif
01373 
01374     /* Copy digest to working vars */
01375     XMEMCPY(T, sha384->digest, sizeof(T));
01376 
01377 #ifdef USE_SLOW_SHA2
01378     /* less than half the code size, but roughly 50% slower */
01379     /* 80 operations, not unrolled */
01380     for (j = 0; j < 80; j += 16) {
01381         int m;
01382         for (m = 0; m < 16; m++) {  /* braces required: R2() expands to several statements */
01383             R2(m);
01384         }
01385     }
01386 #else
01387     /* 80 operations, partially loop unrolled */
01388     for (j = 0; j < 80; j += 16) {
01389         R2( 0); R2( 1); R2( 2); R2( 3);
01390         R2( 4); R2( 5); R2( 6); R2( 7);
01391         R2( 8); R2( 9); R2(10); R2(11);
01392         R2(12); R2(13); R2(14); R2(15);
01393     }
01394 #endif /* USE_SLOW_SHA2 */
01395 
01396     /* Add the working vars back into digest */
01397 
01398     sha384->digest[0] += a(0);
01399     sha384->digest[1] += b(0);
01400     sha384->digest[2] += c(0);
01401     sha384->digest[3] += d(0);
01402     sha384->digest[4] += e(0);
01403     sha384->digest[5] += f(0);
01404     sha384->digest[6] += g(0);
01405     sha384->digest[7] += h(0);
01406 
01407     /* Wipe variables */
01408     XMEMSET(W, 0, sizeof(word64) * 16);
01409     XMEMSET(T, 0, sizeof(T));
01410 
01411 #ifdef WOLFSSL_SMALL_STACK
01412     XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
01413 #endif
01414 
01415     return 0;
01416 }
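/* Each R2() above expands to one SHA-512/384 round.  For reference, a
 * plain-C sketch of the FIPS 180-4 round primitives; rotr64 and the macro
 * names below are illustrative, not wolfSSL identifiers: */
#ifdef SHA512_DOC_EXAMPLE
static word64 rotr64(word64 x, int n) { return (x >> n) | (x << (64 - n)); }
#define CH_(x,y,z)  (((x) & (y)) ^ (~(x) & (z)))                  /* Ch     */
#define MAJ_(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))     /* Maj    */
#define BSIG0_(x)   (rotr64(x,28) ^ rotr64(x,34) ^ rotr64(x,39))  /* Sigma0 */
#define BSIG1_(x)   (rotr64(x,14) ^ rotr64(x,18) ^ rotr64(x,41))  /* Sigma1 */
#endif /* SHA512_DOC_EXAMPLE */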
01417 
01418 static INLINE void AddLength384(Sha384* sha384, word32 len)
01419 {
01420     word32 tmp = sha384->loLen;
01421     if ( (sha384->loLen += len) < tmp)
01422         sha384->hiLen++;                       /* carry low to high */
01423 }
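/* The message length is a 64-bit byte count split across two 32-bit words.
 * Unsigned overflow is well defined in C, so "(loLen += len) < tmp" is true
 * exactly when the addition wrapped past 2^32; e.g. loLen = 0xFFFFFFFF,
 * len = 2 leaves loLen = 1 < tmp, and the carry is moved into hiLen. */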
01424 
01425 int wc_Sha384Update(Sha384* sha384, const byte* data, word32 len)
01426 {
01427     /* do block size increments */
01428     byte* local = (byte*)sha384->buffer;
01429     
01430     SAVE_XMM_YMM ; /* for Intel AVX */
01431     
01432     while (len) {
01433         word32 add = min(len, SHA384_BLOCK_SIZE - sha384->buffLen);
01434         XMEMCPY(&local[sha384->buffLen], data, add);
01435 
01436         sha384->buffLen += add;
01437         data         += add;
01438         len          -= add;
01439 
01440         if (sha384->buffLen == SHA384_BLOCK_SIZE) {
01441             int ret;
01442 
01443             #if defined(LITTLE_ENDIAN_ORDER)
01444                 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
01445                 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 
01446                 #endif
01447                     ByteReverseWords64(sha384->buffer, sha384->buffer,
01448                                    SHA384_BLOCK_SIZE);
01449             #endif
01450             ret = Transform384(sha384);
01451             if (ret != 0)
01452                 return ret;
01453 
01454             AddLength384(sha384, SHA384_BLOCK_SIZE);
01455             sha384->buffLen = 0;
01456         }
01457     }
01458     return 0;
01459 }
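/* wc_Sha384Update() may be called any number of times with arbitrarily
 * sized fragments: input is staged in sha384->buffer and the compression
 * function runs only on full SHA384_BLOCK_SIZE chunks.  A complete usage
 * sketch follows wc_Sha384Final() below. */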
01460 
01461 
01462 int wc_Sha384Final(Sha384* sha384, byte* hash)
01463 {
01464     byte* local = (byte*)sha384->buffer;
01465     int ret;
01466 
01467     SAVE_XMM_YMM ; /* for Intel AVX */
01468     AddLength384(sha384, sha384->buffLen);              /* before adding pads */
01469 
01470     local[sha384->buffLen++] = 0x80;  /* append the pad's leading 1 bit */
01471 
01472     /* pad with zeros */
01473     if (sha384->buffLen > SHA384_PAD_SIZE) {
01474         XMEMSET(&local[sha384->buffLen], 0, SHA384_BLOCK_SIZE -sha384->buffLen);
01475         sha384->buffLen = SHA384_BLOCK_SIZE;
01476 
01477         #if defined(LITTLE_ENDIAN_ORDER)
01478             #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
01479             if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 
01480             #endif
01481                  ByteReverseWords64(sha384->buffer, sha384->buffer,
01482                                SHA384_BLOCK_SIZE);
01483         #endif
01484         ret = Transform384(sha384);
01485         if (ret !=  0)
01486             return ret;
01487 
01488         sha384->buffLen = 0;
01489     }
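    /* The branch above handles the case where the 0x80 byte leaves fewer
     * than 16 bytes (the 128-bit length field) before the block boundary,
     * i.e. buffLen > SHA384_PAD_SIZE: the current block is padded out and
     * transformed, and the length goes into a second, zeroed block. */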
01490     XMEMSET(&local[sha384->buffLen], 0, SHA384_PAD_SIZE - sha384->buffLen);
01491    
01492     /* put lengths in bits */
01493     sha384->hiLen = (sha384->loLen >> (8*sizeof(sha384->loLen) - 3)) + 
01494                  (sha384->hiLen << 3);
01495     sha384->loLen = sha384->loLen << 3;
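    /* The two statements above multiply the 64-bit byte count by 8:
     * hiLen takes the top 3 bits of loLen (loLen >> 29, since
     * 8*sizeof(loLen) - 3 == 29 for a 32-bit word) merged with its own
     * value shifted left by 3, while loLen keeps its low 29 bits
     * shifted up by 3. */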
01496 
01497     /* store lengths */
01498     #if defined(LITTLE_ENDIAN_ORDER)
01499         #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
01500         if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2) 
01501         #endif
01502              ByteReverseWords64(sha384->buffer, sha384->buffer,
01503                            SHA384_BLOCK_SIZE);
01504     #endif
01505     /* ! length ordering dependent on digest endian type ! */
01506     sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2] = sha384->hiLen;
01507     sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 1] = sha384->loLen;
01508     #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
01509     if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
01510         ByteReverseWords64(&(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]),
01511                            &(sha384->buffer[SHA384_BLOCK_SIZE / sizeof(word64) - 2]),
01512                            SHA384_BLOCK_SIZE - SHA384_PAD_SIZE);
01513     #endif
01514     ret = Transform384(sha384);
01515     if (ret != 0)
01516         return ret;
01517 
01518     #ifdef LITTLE_ENDIAN_ORDER
01519         ByteReverseWords64(sha384->digest, sha384->digest, SHA384_DIGEST_SIZE);
01520     #endif
01521     XMEMCPY(hash, sha384->digest, SHA384_DIGEST_SIZE);
01522 
01523     return wc_InitSha384(sha384);  /* reset state */
01524 }
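/* A minimal usage sketch for the SHA-384 API implemented above.  The guard
 * keeps it out of real builds; error handling is abbreviated. */
#ifdef SHA512_DOC_EXAMPLE
#include <stdio.h>
static void sha384_demo(void)
{
    Sha384 sha;
    byte   digest[SHA384_DIGEST_SIZE];
    const byte msg[] = "abc";
    word32 i;

    if (wc_InitSha384(&sha) != 0)                          return;
    if (wc_Sha384Update(&sha, msg, sizeof(msg) - 1) != 0)  return;
    if (wc_Sha384Final(&sha, digest) != 0)                 return; /* re-inits */

    for (i = 0; i < SHA384_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");
}
#endif /* SHA512_DOC_EXAMPLE */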
01525 
01526 
01527 
01528 #if defined(HAVE_INTEL_AVX1)
01529  
01530 static int Transform384_AVX1(Sha384* sha384)
01531 {
01532     const word64* K = K512;
01533     word64 W_X[16+4];
01534     word32 j;
01535     word64 T[8];
01536 
01537     /* Copy digest to working vars */
01538     XMEMCPY(T, sha384->digest, sizeof(T));
01539     W_from_buff(W_X, sha384->buffer) ;
01540     for (j = 0; j < 80; j += 16) {
01541         Rx_1( 0); Block_0_1(W_X); Rx_2( 0); Block_0_2(W_X); Rx_3( 0); Block_0_3(); 
01542         Rx_1( 1); Block_0_4(); Rx_2( 1); Block_0_5(); Rx_3( 1); Block_0_6(W_X); 
01543         Rx_1( 2); Block_0_7(W_X); Rx_2( 2); Block_0_8(W_X); Rx_3( 2); Block_0_9();
01544         Rx_1( 3); Block_0_10();Rx_2( 3); Block_0_11();Rx_3( 3); Block_0_12(W_X);   
01545         
01546         Rx_1( 4); Block_4_1(W_X); Rx_2( 4); Block_4_2(W_X); Rx_3( 4); Block_4_3(); 
01547         Rx_1( 5); Block_4_4(); Rx_2( 5); Block_4_5(); Rx_3( 5); Block_4_6(W_X); 
01548         Rx_1( 6); Block_4_7(W_X); Rx_2( 6); Block_4_8(W_X); Rx_3( 6); Block_4_9();
01549         Rx_1( 7); Block_4_10();Rx_2( 7); Block_4_11();Rx_3( 7); Block_4_12(W_X);   
01550         
01551         Rx_1( 8); Block_8_1(W_X); Rx_2( 8); Block_8_2(W_X); Rx_3( 8); Block_8_3(); 
01552         Rx_1( 9); Block_8_4(); Rx_2( 9); Block_8_5(); Rx_3( 9); Block_8_6(W_X); 
01553         Rx_1(10); Block_8_7(W_X); Rx_2(10); Block_8_8(W_X); Rx_3(10); Block_8_9();
01554         Rx_1(11); Block_8_10();Rx_2(11); Block_8_11();Rx_3(11); Block_8_12(W_X);   
01555         
01556         Rx_1(12); Block_12_1(W_X); Rx_2(12); Block_12_2(W_X); Rx_3(12); Block_12_3(); 
01557         Rx_1(13); Block_12_4(); Rx_2(13); Block_12_5(); Rx_3(13); Block_12_6(W_X); 
01558         Rx_1(14); Block_12_7(W_X); Rx_2(14); Block_12_8(W_X); Rx_3(14); Block_12_9();
01559         Rx_1(15); Block_12_10();Rx_2(15); Block_12_11();Rx_3(15); Block_12_12(W_X);     
01560     }
01561 
01562     /* Add the working vars back into digest */
01563 
01564     sha384->digest[0] += a(0);
01565     sha384->digest[1] += b(0);
01566     sha384->digest[2] += c(0);
01567     sha384->digest[3] += d(0);
01568     sha384->digest[4] += e(0);
01569     sha384->digest[5] += f(0);
01570     sha384->digest[6] += g(0);
01571     sha384->digest[7] += h(0);
01572 
01573     /* Wipe variables */
01574     XMEMSET(W_X, 0, sizeof(W_X));   /* wipe message schedule */
01577     XMEMSET(T, 0, sizeof(T));
01578 
01579     return 0;
01580 }
01581 
01582 #endif
01583 
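/* The _RORX variant below additionally uses the BMI2 RORX instruction, a
 * rotate-right that does not write the flags register, which removes false
 * flag dependencies between rounds.  set_Transform384() selects it only
 * when cpuid reports both AVX2 and BMI2 (IS_INTEL_AVX2 && IS_INTEL_BMI2). */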
01584 #if defined(HAVE_INTEL_AVX1) && defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
01585 static int Transform384_AVX1_RORX(Sha384* sha384)
01586 {
01587     const word64* K = K512;
01588     word64 W_X[16+4];
01589     word32 j;
01590     word64 T[8];
01591 
01592     /* Copy digest to working vars */
01593     XMEMCPY(T, sha384->digest, sizeof(T));
01594 
01595     W_from_buff(W_X, sha384->buffer) ;
01596     for (j = 0; j < 80; j += 16) {
01597         Rx_RORX_1( 0); Block_0_1(W_X); Rx_RORX_2( 0); 
01598             Block_0_2(W_X); Rx_RORX_3( 0); Block_0_3(); 
01599         Rx_RORX_1( 1); Block_0_4(); Rx_RORX_2( 1); 
01600             Block_0_5(); Rx_RORX_3( 1); Block_0_6(W_X); 
01601         Rx_RORX_1( 2); Block_0_7(W_X); Rx_RORX_2( 2); 
01602             Block_0_8(W_X); Rx_RORX_3( 2); Block_0_9();
01603         Rx_RORX_1( 3); Block_0_10();Rx_RORX_2( 3); 
01604             Block_0_11();Rx_RORX_3( 3); Block_0_12(W_X);   
01605         
01606         Rx_RORX_1( 4); Block_4_1(W_X); Rx_RORX_2( 4); 
01607             Block_4_2(W_X); Rx_RORX_3( 4); Block_4_3(); 
01608         Rx_RORX_1( 5); Block_4_4(); Rx_RORX_2( 5); 
01609             Block_4_5(); Rx_RORX_3( 5); Block_4_6(W_X); 
01610         Rx_RORX_1( 6); Block_4_7(W_X); Rx_RORX_2( 6); 
01611             Block_4_8(W_X); Rx_RORX_3( 6); Block_4_9();
01612         Rx_RORX_1( 7); Block_4_10();Rx_RORX_2( 7); 
01613             Block_4_11();Rx_RORX_3( 7); Block_4_12(W_X);   
01614         
01615         Rx_RORX_1( 8); Block_8_1(W_X); Rx_RORX_2( 8); 
01616             Block_8_2(W_X); Rx_RORX_3( 8); Block_8_3(); 
01617         Rx_RORX_1( 9); Block_8_4(); Rx_RORX_2( 9); 
01618             Block_8_5(); Rx_RORX_3( 9); Block_8_6(W_X); 
01619         Rx_RORX_1(10); Block_8_7(W_X); Rx_RORX_2(10); 
01620             Block_8_8(W_X); Rx_RORX_3(10); Block_8_9();
01621         Rx_RORX_1(11); Block_8_10();Rx_RORX_2(11); 
01622             Block_8_11();Rx_RORX_3(11); Block_8_12(W_X);   
01623         
01624         Rx_RORX_1(12); Block_12_1(W_X); Rx_RORX_2(12);
01625             Block_12_2(W_X); Rx_RORX_3(12); Block_12_3(); 
01626         Rx_RORX_1(13); Block_12_4(); Rx_RORX_2(13); 
01627             Block_12_5(); Rx_RORX_3(13); Block_12_6(W_X); 
01628         Rx_RORX_1(14); Block_12_7(W_X); Rx_RORX_2(14); 
01629             Block_12_8(W_X); Rx_RORX_3(14); Block_12_9();
01630         Rx_RORX_1(15); Block_12_10();Rx_RORX_2(15); 
01631             Block_12_11();Rx_RORX_3(15); Block_12_12(W_X);     
01632     }
01633 
01634     /* Add the working vars back into digest */
01635 
01636     sha384->digest[0] += a(0);
01637     sha384->digest[1] += b(0);
01638     sha384->digest[2] += c(0);
01639     sha384->digest[3] += d(0);
01640     sha384->digest[4] += e(0);
01641     sha384->digest[5] += f(0);
01642     sha384->digest[6] += g(0);
01643     sha384->digest[7] += h(0);
01644 
01645     /* Wipe variables */
01646     XMEMSET(W_X, 0, sizeof(W_X));   /* wipe message schedule */
01649     XMEMSET(T, 0, sizeof(T));
01650 
01651     return 0;
01652 }
01653 #endif
01654 
01655 #if defined(HAVE_INTEL_AVX2)
01656 
01657 static int Transform384_AVX2(Sha384* sha384)
01658 {
01659     const word64* K = K512;
01660     word64 w[4] ;
01661     word32 j;
01662     word64 T[8];
01663 
01664     /* Copy digest to working vars */
01665     XMEMCPY(T, sha384->digest, sizeof(T));
01666 
01667     /* 80 operations, 16 rounds unrolled per pass of the j loop */
01668 
01669 
01670     W_from_buff_Y(sha384->buffer) ;
01671 
01672     MOVE_to_MEMy(w,0, W_0y) ;
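    /* The AVX2 path computes the message schedule four 64-bit words at a
     * time in ymm registers (the Block_Y_* macros) and spills each group
     * to w[0..3] so the scalar Ry_* round macros can consume it. */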
01673     for (j = 0; j < 80; j += 16) {
01674         Ry_1( 0, w[0]); Block_Y_0_1(); Ry_2( 0, w[0]); 
01675             Block_Y_0_2(); Ry_3( 0, w[0]); Block_Y_0_3(); 
01676         Ry_1( 1, w[1]); Block_Y_0_4(); Ry_2( 1, w[1]); 
01677             Block_Y_0_5(); Ry_3( 1, w[1]); Block_Y_0_6();  
01678         Ry_1( 2, w[2]); Block_Y_0_7(); Ry_2( 2, w[2]); 
01679             Block_Y_0_8(); Ry_3( 2, w[2]); Block_Y_0_9();
01680         Ry_1( 3, w[3]); Block_Y_0_10();Ry_2( 3, w[3]); 
01681             Block_Y_0_11();Ry_3( 3, w[3]); Block_Y_0_12(w);
01682         
01683         Ry_1( 4, w[0]); Block_Y_4_1(); Ry_2( 4, w[0]); 
01684             Block_Y_4_2(); Ry_3( 4, w[0]); Block_Y_4_3(); 
01685         Ry_1( 5, w[1]); Block_Y_4_4(); Ry_2( 5, w[1]);   
01686             Block_Y_4_5(); Ry_3( 5, w[1]); Block_Y_4_6();
01687         Ry_1( 6, w[2]); Block_Y_4_7(); Ry_2( 6, w[2]); 
01688             Block_Y_4_8(); Ry_3( 6, w[2]); Block_Y_4_9();
01689         Ry_1( 7, w[3]); Block_Y_4_10(); Ry_2( 7, w[3]);
01690             Block_Y_4_11(); Ry_3( 7, w[3]);Block_Y_4_12(w);  
01691         
01692         Ry_1( 8, w[0]); Block_Y_8_1(); Ry_2( 8, w[0]); 
01693             Block_Y_8_2(); Ry_3( 8, w[0]); Block_Y_8_3();
01694         Ry_1( 9, w[1]); Block_Y_8_4(); Ry_2( 9, w[1]);
01695             Block_Y_8_5(); Ry_3( 9, w[1]); Block_Y_8_6();
01696         Ry_1(10, w[2]); Block_Y_8_7(); Ry_2(10, w[2]); 
01697             Block_Y_8_8(); Ry_3(10, w[2]); Block_Y_8_9(); 
01698         Ry_1(11, w[3]); Block_Y_8_10();Ry_2(11, w[3]); 
01699            Block_Y_8_11();Ry_3(11, w[3]); Block_Y_8_12(w);
01700                  
01701         Ry_1(12, w[0]); Block_Y_12_1(); Ry_2(12, w[0]); 
01702             Block_Y_12_2(); Ry_3(12, w[0]); Block_Y_12_3();
01703         Ry_1(13, w[1]); Block_Y_12_4(); Ry_2(13, w[1]); 
01704             Block_Y_12_5(); Ry_3(13, w[1]); Block_Y_12_6(); 
01705         Ry_1(14, w[2]); Block_Y_12_7(); Ry_2(14, w[2]); 
01706             Block_Y_12_8(); Ry_3(14, w[2]); Block_Y_12_9();
01707         Ry_1(15, w[3]); Block_Y_12_10();Ry_2(15, w[3]); 
01708             Block_Y_12_11();Ry_3(15, w[3]); Block_Y_12_12(w);
01709     }
01710 
01711     /* Add the working vars back into digest */
01712 
01713     sha384->digest[0] += a(0);
01714     sha384->digest[1] += b(0);
01715     sha384->digest[2] += c(0);
01716     sha384->digest[3] += d(0);
01717     sha384->digest[4] += e(0);
01718     sha384->digest[5] += f(0);
01719     sha384->digest[6] += g(0);
01720     sha384->digest[7] += h(0);
01721 
01722     /* Wipe variables */
01723     XMEMSET(T, 0, sizeof(T));
01724 
01725     return 0;
01726 }
01727 
01728 #endif
01729 
01730 #endif /* WOLFSSL_SHA384 */
01731 
01732 #endif /* HAVE_FIPS */
01733 
01734 #endif /* WOLFSSL_SHA512 */
01735 
01736