wolfSSL SSL/TLS library, supports up to TLS 1.3

Dependents:   CyaSSL-Twitter-OAuth4Tw Example-client-tls-cert TwitterReader TweetTest ... more

Committer:
wolfSSL
Date:
Thu Apr 28 00:57:21 2016 +0000
Revision:
4:1b0d80432c79
wolfSSL 3.9.0

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 4:1b0d80432c79 1 /* sha256.c
wolfSSL 4:1b0d80432c79 2 *
wolfSSL 4:1b0d80432c79 3 * Copyright (C) 2006-2016 wolfSSL Inc.
wolfSSL 4:1b0d80432c79 4 *
wolfSSL 4:1b0d80432c79 5 * This file is part of wolfSSL.
wolfSSL 4:1b0d80432c79 6 *
wolfSSL 4:1b0d80432c79 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 4:1b0d80432c79 8 * it under the terms of the GNU General Public License as published by
wolfSSL 4:1b0d80432c79 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 4:1b0d80432c79 10 * (at your option) any later version.
wolfSSL 4:1b0d80432c79 11 *
wolfSSL 4:1b0d80432c79 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 4:1b0d80432c79 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 4:1b0d80432c79 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 4:1b0d80432c79 15 * GNU General Public License for more details.
wolfSSL 4:1b0d80432c79 16 *
wolfSSL 4:1b0d80432c79 17 * You should have received a copy of the GNU General Public License
wolfSSL 4:1b0d80432c79 18 * along with this program; if not, write to the Free Software
wolfSSL 4:1b0d80432c79 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 4:1b0d80432c79 20 */
wolfSSL 4:1b0d80432c79 21
wolfSSL 4:1b0d80432c79 22
wolfSSL 4:1b0d80432c79 23 /* code submitted by raphael.huck@efixo.com */
wolfSSL 4:1b0d80432c79 24
wolfSSL 4:1b0d80432c79 25 #ifdef HAVE_CONFIG_H
wolfSSL 4:1b0d80432c79 26 #include <config.h>
wolfSSL 4:1b0d80432c79 27 #endif
wolfSSL 4:1b0d80432c79 28
wolfSSL 4:1b0d80432c79 29 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 4:1b0d80432c79 30 #include <wolfssl/wolfcrypt/sha256.h>
wolfSSL 4:1b0d80432c79 31
wolfSSL 4:1b0d80432c79 32 #if !defined(NO_SHA256)
wolfSSL 4:1b0d80432c79 33 #ifdef HAVE_FIPS
wolfSSL 4:1b0d80432c79 34
wolfSSL 4:1b0d80432c79 35 int wc_InitSha256(Sha256* sha)
wolfSSL 4:1b0d80432c79 36 {
wolfSSL 4:1b0d80432c79 37 return InitSha256_fips(sha);
wolfSSL 4:1b0d80432c79 38 }
wolfSSL 4:1b0d80432c79 39
wolfSSL 4:1b0d80432c79 40
wolfSSL 4:1b0d80432c79 41 int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
wolfSSL 4:1b0d80432c79 42 {
wolfSSL 4:1b0d80432c79 43 return Sha256Update_fips(sha, data, len);
wolfSSL 4:1b0d80432c79 44 }
wolfSSL 4:1b0d80432c79 45
wolfSSL 4:1b0d80432c79 46
wolfSSL 4:1b0d80432c79 47 int wc_Sha256Final(Sha256* sha, byte* out)
wolfSSL 4:1b0d80432c79 48 {
wolfSSL 4:1b0d80432c79 49 return Sha256Final_fips(sha, out);
wolfSSL 4:1b0d80432c79 50 }
wolfSSL 4:1b0d80432c79 51
wolfSSL 4:1b0d80432c79 52
wolfSSL 4:1b0d80432c79 53 #else /* else build without fips */
wolfSSL 4:1b0d80432c79 54
wolfSSL 4:1b0d80432c79 55 #if !defined(NO_SHA256) && defined(WOLFSSL_TI_HASH)
wolfSSL 4:1b0d80432c79 56 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
wolfSSL 4:1b0d80432c79 57 #else
wolfSSL 4:1b0d80432c79 58
/* ALIGN32: declaration qualifier asking the compiler for 32-byte alignment.
 * Left untouched when the build already supplies a definition, and expands to
 * nothing on compilers that are neither GCC-compatible nor MSVC.
 */
#ifndef ALIGN32
    #ifdef __GNUC__
        #define ALIGN32 __attribute__ ( (aligned (32)))
    #elif defined(_MSC_VER)
        /* disable align warning, we want alignment ! */
        #pragma warning(disable: 4324)
        #define ALIGN32 __declspec (align (32))
    #else
        #define ALIGN32
    #endif
#endif
wolfSSL 4:1b0d80432c79 70
/* With WOLFSSL_PIC32MZ_HASH the public entry points defined in this file are
 * renamed with a _sw suffix.
 */
#if defined(WOLFSSL_PIC32MZ_HASH)
    #define wc_InitSha256   wc_InitSha256_sw
    #define wc_Sha256Update wc_Sha256Update_sw
    #define wc_Sha256Final  wc_Sha256Final_sw
#endif

#if defined(HAVE_FIPS)
    /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
    #define FIPS_NO_WRAPPERS
#endif

/* USE_INTEL_SPEEDUP switches on both the AVX1 and the AVX2 code paths */
#ifdef USE_INTEL_SPEEDUP
    #define HAVE_INTEL_AVX1
    #define HAVE_INTEL_AVX2

    #ifdef DEBUG_XMM
        #include "stdio.h"
    #endif

#endif

/* The AVX2 build also enables the RORX-based round macros */
#ifdef HAVE_INTEL_AVX2
    #define HAVE_INTEL_RORX
#endif
wolfSSL 4:1b0d80432c79 95
wolfSSL 4:1b0d80432c79 96
wolfSSL 4:1b0d80432c79 97 /*****
wolfSSL 4:1b0d80432c79 98 Intel AVX1/AVX2 Macro Control Structure
wolfSSL 4:1b0d80432c79 99
wolfSSL 4:1b0d80432c79 100 #define HAVE_INTEL_AVX1
wolfSSL 4:1b0d80432c79 101 #define HAVE_INTEL_AVX2
wolfSSL 4:1b0d80432c79 102
wolfSSL 4:1b0d80432c79 103 #define HAVE_INTEL_RORX
wolfSSL 4:1b0d80432c79 104
wolfSSL 4:1b0d80432c79 105
wolfSSL 4:1b0d80432c79 106 int InitSha256(Sha256* sha256) {
wolfSSL 4:1b0d80432c79 107 Save/Recover XMM, YMM
wolfSSL 4:1b0d80432c79 108 ...
wolfSSL 4:1b0d80432c79 109 }
wolfSSL 4:1b0d80432c79 110
wolfSSL 4:1b0d80432c79 111 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 112 Transform() ; Function prototype
wolfSSL 4:1b0d80432c79 113 #else
wolfSSL 4:1b0d80432c79 114 Transform() { }
wolfSSL 4:1b0d80432c79 115 int Sha256Final() {
wolfSSL 4:1b0d80432c79 116 Save/Recover XMM, YMM
wolfSSL 4:1b0d80432c79 117 ...
wolfSSL 4:1b0d80432c79 118 }
wolfSSL 4:1b0d80432c79 119 #endif
wolfSSL 4:1b0d80432c79 120
wolfSSL 4:1b0d80432c79 121 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 122 #if defined(HAVE_INTEL_RORX)
wolfSSL 4:1b0d80432c79 123 #define RND with rorx instruction
wolfSSL 4:1b0d80432c79 124 #else
wolfSSL 4:1b0d80432c79 125 #define RND
wolfSSL 4:1b0d80432c79 126 #endif
wolfSSL 4:1b0d80432c79 127 #endif
wolfSSL 4:1b0d80432c79 128
wolfSSL 4:1b0d80432c79 129 #if defined(HAVE_INTEL_AVX1)
wolfSSL 4:1b0d80432c79 130
wolfSSL 4:1b0d80432c79 131 #define XMM Instructions/inline asm
wolfSSL 4:1b0d80432c79 132
wolfSSL 4:1b0d80432c79 133 int Transform() {
wolfSSL 4:1b0d80432c79 134 Stitched Message Sched/Round
wolfSSL 4:1b0d80432c79 135 }
wolfSSL 4:1b0d80432c79 136
wolfSSL 4:1b0d80432c79 137 #elif defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 138
wolfSSL 4:1b0d80432c79 139 #define YMM Instructions/inline asm
wolfSSL 4:1b0d80432c79 140
wolfSSL 4:1b0d80432c79 141 int Transform() {
wolfSSL 4:1b0d80432c79 142 More granular Stitched Message Sched/Round
wolfSSL 4:1b0d80432c79 143 }
wolfSSL 4:1b0d80432c79 144
wolfSSL 4:1b0d80432c79 145 */
wolfSSL 4:1b0d80432c79 146
wolfSSL 4:1b0d80432c79 147
wolfSSL 4:1b0d80432c79 148 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 149
/* Each platform needs to query CPUID to see which of the AVX1/AVX2/BMI2
 * features used by this file are supported. Also, let's setup a macro for
 * proper linkage w/o ABI conflicts.
 *
 * cpuid(reg, leaf, sub)
 *   Executes CPUID with EAX = leaf and ECX = sub and stores the resulting
 *   EAX, EBX, ECX, EDX in reg[0..3]. It expands to a single expression or
 *   statement without a trailing semicolon, so the caller supplies the ';'.
 *
 * Both variants take three arguments. The MSVC one uses __cpuidex() because
 * __cpuid() cannot pass a sub-leaf.
 */
#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
        __asm__ __volatile__ ("cpuid":\
            "=a" ((reg)[0]), "=b" ((reg)[1]), "=c" ((reg)[2]), "=d" ((reg)[3]) :\
            "a" (leaf), "c"(sub))

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    #define cpuid(reg, leaf, sub) __cpuidex((int*)(reg), (int)(leaf), (int)(sub))

    #define XASM_LINK(f)

#endif /* _MSC_VER */
wolfSSL 4:1b0d80432c79 169
/* Positions of the CPUID output registers inside the result array */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

/* One bit per CPU capability, accumulated in cpuid_flags */
#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10 /* MULX, RORX */

/* Non-zero when the matching capability bit is set in cpuid_flags */
#define IS_INTEL_AVX1   (cpuid_flags & CPUID_AVX1)
#define IS_INTEL_AVX2   (cpuid_flags & CPUID_AVX2)
#define IS_INTEL_BMI2   (cpuid_flags & CPUID_BMI2)
#define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
wolfSSL 4:1b0d80432c79 186
wolfSSL 4:1b0d80432c79 187 static word32 cpuid_check = 0 ;
wolfSSL 4:1b0d80432c79 188 static word32 cpuid_flags = 0 ;
wolfSSL 4:1b0d80432c79 189
wolfSSL 4:1b0d80432c79 190 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
wolfSSL 4:1b0d80432c79 191 int got_intel_cpu=0;
wolfSSL 4:1b0d80432c79 192 unsigned int reg[5];
wolfSSL 4:1b0d80432c79 193
wolfSSL 4:1b0d80432c79 194 reg[4] = '\0' ;
wolfSSL 4:1b0d80432c79 195 cpuid(reg, 0, 0);
wolfSSL 4:1b0d80432c79 196 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 4:1b0d80432c79 197 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 4:1b0d80432c79 198 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 4:1b0d80432c79 199 got_intel_cpu = 1;
wolfSSL 4:1b0d80432c79 200 }
wolfSSL 4:1b0d80432c79 201 if (got_intel_cpu) {
wolfSSL 4:1b0d80432c79 202 cpuid(reg, leaf, sub);
wolfSSL 4:1b0d80432c79 203 return((reg[num]>>bit)&0x1) ;
wolfSSL 4:1b0d80432c79 204 }
wolfSSL 4:1b0d80432c79 205 return 0 ;
wolfSSL 4:1b0d80432c79 206 }
wolfSSL 4:1b0d80432c79 207
wolfSSL 4:1b0d80432c79 208 static int set_cpuid_flags(void) {
wolfSSL 4:1b0d80432c79 209 if(cpuid_check==0) {
wolfSSL 4:1b0d80432c79 210 if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
wolfSSL 4:1b0d80432c79 211 if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
wolfSSL 4:1b0d80432c79 212 if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
wolfSSL 4:1b0d80432c79 213 if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
wolfSSL 4:1b0d80432c79 214 if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
wolfSSL 4:1b0d80432c79 215 cpuid_check = 1 ;
wolfSSL 4:1b0d80432c79 216 return 0 ;
wolfSSL 4:1b0d80432c79 217 }
wolfSSL 4:1b0d80432c79 218 return 1 ;
wolfSSL 4:1b0d80432c79 219 }
wolfSSL 4:1b0d80432c79 220
wolfSSL 4:1b0d80432c79 221
wolfSSL 4:1b0d80432c79 222 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha512 */
wolfSSL 4:1b0d80432c79 223 static int Transform(Sha256* sha256);
wolfSSL 4:1b0d80432c79 224
wolfSSL 4:1b0d80432c79 225 #if defined(HAVE_INTEL_AVX1)
wolfSSL 4:1b0d80432c79 226 static int Transform_AVX1(Sha256 *sha256) ;
wolfSSL 4:1b0d80432c79 227 #endif
wolfSSL 4:1b0d80432c79 228 #if defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 229 static int Transform_AVX2(Sha256 *sha256) ;
wolfSSL 4:1b0d80432c79 230 static int Transform_AVX1_RORX(Sha256 *sha256) ;
wolfSSL 4:1b0d80432c79 231 #endif
wolfSSL 4:1b0d80432c79 232
wolfSSL 4:1b0d80432c79 233 static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
wolfSSL 4:1b0d80432c79 234
wolfSSL 4:1b0d80432c79 235 #define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
wolfSSL 4:1b0d80432c79 236
wolfSSL 4:1b0d80432c79 237 static void set_Transform(void) {
wolfSSL 4:1b0d80432c79 238 if(set_cpuid_flags())return ;
wolfSSL 4:1b0d80432c79 239
wolfSSL 4:1b0d80432c79 240 #if defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 241 if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
wolfSSL 4:1b0d80432c79 242 Transform_p = Transform_AVX1_RORX; return ;
wolfSSL 4:1b0d80432c79 243 Transform_p = Transform_AVX2 ;
wolfSSL 4:1b0d80432c79 244 /* for avoiding warning,"not used" */
wolfSSL 4:1b0d80432c79 245 }
wolfSSL 4:1b0d80432c79 246 #endif
wolfSSL 4:1b0d80432c79 247 #if defined(HAVE_INTEL_AVX1)
wolfSSL 4:1b0d80432c79 248 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
wolfSSL 4:1b0d80432c79 249 #endif
wolfSSL 4:1b0d80432c79 250 Transform_p = Transform ; return ;
wolfSSL 4:1b0d80432c79 251 }
wolfSSL 4:1b0d80432c79 252
wolfSSL 4:1b0d80432c79 253 #else
/* Without the Intel speedups XTRANSFORM calls Transform() directly; only the
 * FREESCALE_MMCAU variant takes the data buffer B as a second argument. */
#ifndef FREESCALE_MMCAU
    #define XTRANSFORM(sha256, B) Transform(sha256)
#else
    #define XTRANSFORM(sha256, B) Transform(sha256, B)
#endif
wolfSSL 4:1b0d80432c79 259 #endif
wolfSSL 4:1b0d80432c79 260
/* SAVE_XMM_YMM: dummy asm statement ("or r8d, r8d") whose clobber list names
 * the vector registers on behalf of Transform.
 * The xmm list is used whenever AVX1 is enabled, the ymm list when only AVX2
 * is enabled, and the macro is empty when neither is.
 */
#if defined(HAVE_INTEL_AVX1)
    #define SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
        "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
        "xmm11","xmm12","xmm13","xmm14","xmm15")
#elif defined(HAVE_INTEL_AVX2)
    #define SAVE_XMM_YMM   __asm__ volatile("or %%r8d, %%r8d":::\
        "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
#else
    #define SAVE_XMM_YMM
#endif
wolfSSL 4:1b0d80432c79 272
/* With WOLFSSL_PIC32MZ_HASH the unprefixed names also get the _sw suffix */
#if defined(WOLFSSL_PIC32MZ_HASH)
    #define InitSha256   InitSha256_sw
    #define Sha256Update Sha256Update_sw
    #define Sha256Final  Sha256Final_sw
#endif
wolfSSL 4:1b0d80432c79 278
wolfSSL 4:1b0d80432c79 279 #include <wolfssl/wolfcrypt/logging.h>
wolfSSL 4:1b0d80432c79 280 #include <wolfssl/wolfcrypt/error-crypt.h>
wolfSSL 4:1b0d80432c79 281
wolfSSL 4:1b0d80432c79 282 #ifdef NO_INLINE
wolfSSL 4:1b0d80432c79 283 #include <wolfssl/wolfcrypt/misc.h>
wolfSSL 4:1b0d80432c79 284 #else
wolfSSL 4:1b0d80432c79 285 #include <wolfcrypt/src/misc.c>
wolfSSL 4:1b0d80432c79 286 #endif
wolfSSL 4:1b0d80432c79 287
wolfSSL 4:1b0d80432c79 288 #ifdef FREESCALE_MMCAU
wolfSSL 4:1b0d80432c79 289 #include "cau_api.h"
wolfSSL 4:1b0d80432c79 290 #endif
wolfSSL 4:1b0d80432c79 291
wolfSSL 4:1b0d80432c79 292 #ifndef WOLFSSL_HAVE_MIN
wolfSSL 4:1b0d80432c79 293 #define WOLFSSL_HAVE_MIN
wolfSSL 4:1b0d80432c79 294
wolfSSL 4:1b0d80432c79 295 static INLINE word32 min(word32 a, word32 b)
wolfSSL 4:1b0d80432c79 296 {
wolfSSL 4:1b0d80432c79 297 return a > b ? b : a;
wolfSSL 4:1b0d80432c79 298 }
wolfSSL 4:1b0d80432c79 299
wolfSSL 4:1b0d80432c79 300 #endif /* WOLFSSL_HAVE_MIN */
wolfSSL 4:1b0d80432c79 301
wolfSSL 4:1b0d80432c79 302
wolfSSL 4:1b0d80432c79 303 int wc_InitSha256(Sha256* sha256)
wolfSSL 4:1b0d80432c79 304 {
wolfSSL 4:1b0d80432c79 305 int ret = 0;
wolfSSL 4:1b0d80432c79 306 #ifdef FREESCALE_MMCAU
wolfSSL 4:1b0d80432c79 307 ret = wolfSSL_CryptHwMutexLock();
wolfSSL 4:1b0d80432c79 308 if(ret != 0) {
wolfSSL 4:1b0d80432c79 309 return ret;
wolfSSL 4:1b0d80432c79 310 }
wolfSSL 4:1b0d80432c79 311 cau_sha256_initialize_output(sha256->digest);
wolfSSL 4:1b0d80432c79 312 wolfSSL_CryptHwMutexUnLock();
wolfSSL 4:1b0d80432c79 313 #else
wolfSSL 4:1b0d80432c79 314 sha256->digest[0] = 0x6A09E667L;
wolfSSL 4:1b0d80432c79 315 sha256->digest[1] = 0xBB67AE85L;
wolfSSL 4:1b0d80432c79 316 sha256->digest[2] = 0x3C6EF372L;
wolfSSL 4:1b0d80432c79 317 sha256->digest[3] = 0xA54FF53AL;
wolfSSL 4:1b0d80432c79 318 sha256->digest[4] = 0x510E527FL;
wolfSSL 4:1b0d80432c79 319 sha256->digest[5] = 0x9B05688CL;
wolfSSL 4:1b0d80432c79 320 sha256->digest[6] = 0x1F83D9ABL;
wolfSSL 4:1b0d80432c79 321 sha256->digest[7] = 0x5BE0CD19L;
wolfSSL 4:1b0d80432c79 322 #endif
wolfSSL 4:1b0d80432c79 323
wolfSSL 4:1b0d80432c79 324 sha256->buffLen = 0;
wolfSSL 4:1b0d80432c79 325 sha256->loLen = 0;
wolfSSL 4:1b0d80432c79 326 sha256->hiLen = 0;
wolfSSL 4:1b0d80432c79 327
wolfSSL 4:1b0d80432c79 328 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 329 set_Transform() ; /* choose best Transform function under this runtime environment */
wolfSSL 4:1b0d80432c79 330 #endif
wolfSSL 4:1b0d80432c79 331
wolfSSL 4:1b0d80432c79 332 return ret;
wolfSSL 4:1b0d80432c79 333 }
wolfSSL 4:1b0d80432c79 334
wolfSSL 4:1b0d80432c79 335
wolfSSL 4:1b0d80432c79 336 #if !defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 337 static const ALIGN32 word32 K[64] = {
wolfSSL 4:1b0d80432c79 338 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
wolfSSL 4:1b0d80432c79 339 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
wolfSSL 4:1b0d80432c79 340 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
wolfSSL 4:1b0d80432c79 341 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
wolfSSL 4:1b0d80432c79 342 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
wolfSSL 4:1b0d80432c79 343 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
wolfSSL 4:1b0d80432c79 344 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
wolfSSL 4:1b0d80432c79 345 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
wolfSSL 4:1b0d80432c79 346 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
wolfSSL 4:1b0d80432c79 347 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
wolfSSL 4:1b0d80432c79 348 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
wolfSSL 4:1b0d80432c79 349 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
wolfSSL 4:1b0d80432c79 350 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
wolfSSL 4:1b0d80432c79 351 };
wolfSSL 4:1b0d80432c79 352
wolfSSL 4:1b0d80432c79 353 #endif
wolfSSL 4:1b0d80432c79 354
wolfSSL 4:1b0d80432c79 355 #if defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 356
wolfSSL 4:1b0d80432c79 357 static int Transform(Sha256* sha256, byte* buf)
wolfSSL 4:1b0d80432c79 358 {
wolfSSL 4:1b0d80432c79 359 int ret = wolfSSL_CryptHwMutexLock();
wolfSSL 4:1b0d80432c79 360 if(ret == 0) {
wolfSSL 4:1b0d80432c79 361 cau_sha256_hash_n(buf, 1, sha256->digest);
wolfSSL 4:1b0d80432c79 362 wolfSSL_CryptHwMutexUnLock();
wolfSSL 4:1b0d80432c79 363 }
wolfSSL 4:1b0d80432c79 364 return ret;
wolfSSL 4:1b0d80432c79 365 }
wolfSSL 4:1b0d80432c79 366
wolfSSL 4:1b0d80432c79 367 #endif /* FREESCALE_MMCAU */
wolfSSL 4:1b0d80432c79 368
/* Bitwise choose: takes each bit from y where x is 1, from z where x is 0 */
#define Ch(x,y,z)       ((z) ^ ((x) & ((y) ^ (z))))
/* Bitwise majority of x, y and z */
#define Maj(x,y,z)      (((x) & (y)) | ((z) & ((x) | (y))))
/* Logical right shift of the low 32 bits of x */
#define R(x, n)         (((x)&0xFFFFFFFFU)>>(n))

/* Rotate right, via rotrFixed() */
#define S(x, n)         rotrFixed(x, n)
#define Sigma0(x)       (S(x, 2) ^ S(x, 13) ^ S(x, 22))
#define Sigma1(x)       (S(x, 6) ^ S(x, 11) ^ S(x, 25))
#define Gamma0(x)       (S(x, 7) ^ S(x, 18) ^ R(x, 3))
#define Gamma1(x)       (S(x, 17) ^ S(x, 19) ^ R(x, 10))

/* One compression round. Relies on t0, t1, K and W being in scope at the
 * point of use; updates d and h in place. */
#define RND(a,b,c,d,e,f,g,h,i) \
     t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
     t1 = Sigma0((a)) + Maj((a), (b), (c)); \
     (d) += t0; \
     (h)  = t0 + t1;
wolfSSL 4:1b0d80432c79 384
wolfSSL 4:1b0d80432c79 385 #if !defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 386 static int Transform(Sha256* sha256)
wolfSSL 4:1b0d80432c79 387 {
wolfSSL 4:1b0d80432c79 388 word32 S[8], t0, t1;
wolfSSL 4:1b0d80432c79 389 int i;
wolfSSL 4:1b0d80432c79 390
wolfSSL 4:1b0d80432c79 391 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 4:1b0d80432c79 392 word32* W;
wolfSSL 4:1b0d80432c79 393
wolfSSL 4:1b0d80432c79 394 W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 4:1b0d80432c79 395 if (W == NULL)
wolfSSL 4:1b0d80432c79 396 return MEMORY_E;
wolfSSL 4:1b0d80432c79 397 #else
wolfSSL 4:1b0d80432c79 398 word32 W[64];
wolfSSL 4:1b0d80432c79 399 #endif
wolfSSL 4:1b0d80432c79 400
wolfSSL 4:1b0d80432c79 401 /* Copy context->state[] to working vars */
wolfSSL 4:1b0d80432c79 402 for (i = 0; i < 8; i++)
wolfSSL 4:1b0d80432c79 403 S[i] = sha256->digest[i];
wolfSSL 4:1b0d80432c79 404
wolfSSL 4:1b0d80432c79 405 for (i = 0; i < 16; i++)
wolfSSL 4:1b0d80432c79 406 W[i] = sha256->buffer[i];
wolfSSL 4:1b0d80432c79 407
wolfSSL 4:1b0d80432c79 408 for (i = 16; i < 64; i++)
wolfSSL 4:1b0d80432c79 409 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
wolfSSL 4:1b0d80432c79 410
wolfSSL 4:1b0d80432c79 411 for (i = 0; i < 64; i += 8) {
wolfSSL 4:1b0d80432c79 412 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
wolfSSL 4:1b0d80432c79 413 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
wolfSSL 4:1b0d80432c79 414 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
wolfSSL 4:1b0d80432c79 415 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
wolfSSL 4:1b0d80432c79 416 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
wolfSSL 4:1b0d80432c79 417 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
wolfSSL 4:1b0d80432c79 418 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
wolfSSL 4:1b0d80432c79 419 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
wolfSSL 4:1b0d80432c79 420 }
wolfSSL 4:1b0d80432c79 421
wolfSSL 4:1b0d80432c79 422 /* Add the working vars back into digest state[] */
wolfSSL 4:1b0d80432c79 423 for (i = 0; i < 8; i++) {
wolfSSL 4:1b0d80432c79 424 sha256->digest[i] += S[i];
wolfSSL 4:1b0d80432c79 425 }
wolfSSL 4:1b0d80432c79 426
wolfSSL 4:1b0d80432c79 427 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 4:1b0d80432c79 428 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 4:1b0d80432c79 429 #endif
wolfSSL 4:1b0d80432c79 430
wolfSSL 4:1b0d80432c79 431 return 0;
wolfSSL 4:1b0d80432c79 432 }
wolfSSL 4:1b0d80432c79 433
wolfSSL 4:1b0d80432c79 434 #endif /* #if !defined(FREESCALE_MMCAU) */
wolfSSL 4:1b0d80432c79 435
wolfSSL 4:1b0d80432c79 436 static INLINE void AddLength(Sha256* sha256, word32 len)
wolfSSL 4:1b0d80432c79 437 {
wolfSSL 4:1b0d80432c79 438 word32 tmp = sha256->loLen;
wolfSSL 4:1b0d80432c79 439 if ( (sha256->loLen += len) < tmp)
wolfSSL 4:1b0d80432c79 440 sha256->hiLen++; /* carry low to high */
wolfSSL 4:1b0d80432c79 441 }
wolfSSL 4:1b0d80432c79 442
wolfSSL 4:1b0d80432c79 443 int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
wolfSSL 4:1b0d80432c79 444 {
wolfSSL 4:1b0d80432c79 445
wolfSSL 4:1b0d80432c79 446 /* do block size increments */
wolfSSL 4:1b0d80432c79 447 byte* local = (byte*)sha256->buffer;
wolfSSL 4:1b0d80432c79 448
wolfSSL 4:1b0d80432c79 449 SAVE_XMM_YMM ; /* for Intel AVX */
wolfSSL 4:1b0d80432c79 450
wolfSSL 4:1b0d80432c79 451 while (len) {
wolfSSL 4:1b0d80432c79 452 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 4:1b0d80432c79 453 XMEMCPY(&local[sha256->buffLen], data, add);
wolfSSL 4:1b0d80432c79 454
wolfSSL 4:1b0d80432c79 455 sha256->buffLen += add;
wolfSSL 4:1b0d80432c79 456 data += add;
wolfSSL 4:1b0d80432c79 457 len -= add;
wolfSSL 4:1b0d80432c79 458
wolfSSL 4:1b0d80432c79 459 if (sha256->buffLen == SHA256_BLOCK_SIZE) {
wolfSSL 4:1b0d80432c79 460 int ret;
wolfSSL 4:1b0d80432c79 461
wolfSSL 4:1b0d80432c79 462 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 463 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 464 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 465 #endif
wolfSSL 4:1b0d80432c79 466 ByteReverseWords(sha256->buffer, sha256->buffer,
wolfSSL 4:1b0d80432c79 467 SHA256_BLOCK_SIZE);
wolfSSL 4:1b0d80432c79 468 #endif
wolfSSL 4:1b0d80432c79 469 ret = XTRANSFORM(sha256, local);
wolfSSL 4:1b0d80432c79 470 if (ret != 0)
wolfSSL 4:1b0d80432c79 471 return ret;
wolfSSL 4:1b0d80432c79 472
wolfSSL 4:1b0d80432c79 473 AddLength(sha256, SHA256_BLOCK_SIZE);
wolfSSL 4:1b0d80432c79 474 sha256->buffLen = 0;
wolfSSL 4:1b0d80432c79 475 }
wolfSSL 4:1b0d80432c79 476 }
wolfSSL 4:1b0d80432c79 477
wolfSSL 4:1b0d80432c79 478 return 0;
wolfSSL 4:1b0d80432c79 479 }
wolfSSL 4:1b0d80432c79 480
wolfSSL 4:1b0d80432c79 481 int wc_Sha256Final(Sha256* sha256, byte* hash)
wolfSSL 4:1b0d80432c79 482 {
wolfSSL 4:1b0d80432c79 483 byte* local = (byte*)sha256->buffer;
wolfSSL 4:1b0d80432c79 484 int ret;
wolfSSL 4:1b0d80432c79 485
wolfSSL 4:1b0d80432c79 486 SAVE_XMM_YMM ; /* for Intel AVX */
wolfSSL 4:1b0d80432c79 487
wolfSSL 4:1b0d80432c79 488 AddLength(sha256, sha256->buffLen); /* before adding pads */
wolfSSL 4:1b0d80432c79 489
wolfSSL 4:1b0d80432c79 490 local[sha256->buffLen++] = 0x80; /* add 1 */
wolfSSL 4:1b0d80432c79 491
wolfSSL 4:1b0d80432c79 492 /* pad with zeros */
wolfSSL 4:1b0d80432c79 493 if (sha256->buffLen > SHA256_PAD_SIZE) {
wolfSSL 4:1b0d80432c79 494 XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 4:1b0d80432c79 495 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
wolfSSL 4:1b0d80432c79 496
wolfSSL 4:1b0d80432c79 497 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 498 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 499 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 500 #endif
wolfSSL 4:1b0d80432c79 501 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
wolfSSL 4:1b0d80432c79 502 #endif
wolfSSL 4:1b0d80432c79 503
wolfSSL 4:1b0d80432c79 504 ret = XTRANSFORM(sha256, local);
wolfSSL 4:1b0d80432c79 505 if (ret != 0)
wolfSSL 4:1b0d80432c79 506 return ret;
wolfSSL 4:1b0d80432c79 507
wolfSSL 4:1b0d80432c79 508 sha256->buffLen = 0;
wolfSSL 4:1b0d80432c79 509 }
wolfSSL 4:1b0d80432c79 510 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
wolfSSL 4:1b0d80432c79 511
wolfSSL 4:1b0d80432c79 512 /* put lengths in bits */
wolfSSL 4:1b0d80432c79 513 sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
wolfSSL 4:1b0d80432c79 514 (sha256->hiLen << 3);
wolfSSL 4:1b0d80432c79 515 sha256->loLen = sha256->loLen << 3;
wolfSSL 4:1b0d80432c79 516
wolfSSL 4:1b0d80432c79 517 /* store lengths */
wolfSSL 4:1b0d80432c79 518 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 4:1b0d80432c79 519 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 520 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 521 #endif
wolfSSL 4:1b0d80432c79 522 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
wolfSSL 4:1b0d80432c79 523 #endif
wolfSSL 4:1b0d80432c79 524 /* ! length ordering dependent on digest endian type ! */
wolfSSL 4:1b0d80432c79 525 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
wolfSSL 4:1b0d80432c79 526 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
wolfSSL 4:1b0d80432c79 527 sizeof(word32));
wolfSSL 4:1b0d80432c79 528
wolfSSL 4:1b0d80432c79 529 #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 530 /* Kinetis requires only these bytes reversed */
wolfSSL 4:1b0d80432c79 531 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 532 if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 533 #endif
wolfSSL 4:1b0d80432c79 534 ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
wolfSSL 4:1b0d80432c79 535 &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
wolfSSL 4:1b0d80432c79 536 2 * sizeof(word32));
wolfSSL 4:1b0d80432c79 537 #endif
wolfSSL 4:1b0d80432c79 538
wolfSSL 4:1b0d80432c79 539 ret = XTRANSFORM(sha256, local);
wolfSSL 4:1b0d80432c79 540 if (ret != 0)
wolfSSL 4:1b0d80432c79 541 return ret;
wolfSSL 4:1b0d80432c79 542
wolfSSL 4:1b0d80432c79 543 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 4:1b0d80432c79 544 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 4:1b0d80432c79 545 #endif
wolfSSL 4:1b0d80432c79 546 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 4:1b0d80432c79 547
wolfSSL 4:1b0d80432c79 548 return wc_InitSha256(sha256); /* reset state */
wolfSSL 4:1b0d80432c79 549 }
wolfSSL 4:1b0d80432c79 550
wolfSSL 4:1b0d80432c79 551
wolfSSL 4:1b0d80432c79 552
wolfSSL 4:1b0d80432c79 553
wolfSSL 4:1b0d80432c79 554 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 555
/* _DigestToReg: copy sha256->digest[0..7] into the eight registers named by
 * the macro arguments. Each word is read into the local d, which is handed to
 * a "movl" as a register input operand; the destination register name is
 * pasted into the instruction text with the # operator. Expects a variable
 * named sha256 to be in scope where the macro is used.
 */
wolfSSL 4:1b0d80432c79 556 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 557 { word32 d ;\
wolfSSL 4:1b0d80432c79 558 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 559 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 560 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 561 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 562 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 563 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 564 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 565 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 566 }
wolfSSL 4:1b0d80432c79 567
/* _RegToDigest: the reverse direction. Each named register is moved into the
 * local d (a register output operand) and d is then ADDED to the matching
 * sha256->digest[] word, so the digest is accumulated rather than overwritten.
 */
wolfSSL 4:1b0d80432c79 568 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 569 { word32 d ; \
wolfSSL 4:1b0d80432c79 570 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\
wolfSSL 4:1b0d80432c79 571 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\
wolfSSL 4:1b0d80432c79 572 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\
wolfSSL 4:1b0d80432c79 573 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\
wolfSSL 4:1b0d80432c79 574 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\
wolfSSL 4:1b0d80432c79 575 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\
wolfSSL 4:1b0d80432c79 576 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\
wolfSSL 4:1b0d80432c79 577 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\
wolfSSL 4:1b0d80432c79 578 }
wolfSSL 4:1b0d80432c79 579
wolfSSL 4:1b0d80432c79 580
/* DigestToReg / RegToDigest: one extra level of macro expansion. Operands of
 * the # operator are not macro-expanded, so arguments that are themselves
 * macros are expanded here before the inner macro stringifies them.
 */
wolfSSL 4:1b0d80432c79 581 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 582 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 4:1b0d80432c79 583
wolfSSL 4:1b0d80432c79 584 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 585 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 4:1b0d80432c79 586
wolfSSL 4:1b0d80432c79 587
wolfSSL 4:1b0d80432c79 588
wolfSSL 4:1b0d80432c79 589
/* Register names that S_0..S_7 stand for when they are passed to the asm
 * macros in this file.
 */
wolfSSL 4:1b0d80432c79 590 #define S_0 %r15d
wolfSSL 4:1b0d80432c79 591 #define S_1 %r10d
wolfSSL 4:1b0d80432c79 592 #define S_2 %r11d
wolfSSL 4:1b0d80432c79 593 #define S_3 %r12d
wolfSSL 4:1b0d80432c79 594 #define S_4 %r13d
wolfSSL 4:1b0d80432c79 595 #define S_5 %r14d
wolfSSL 4:1b0d80432c79 596 #define S_6 %ebx
wolfSSL 4:1b0d80432c79 597 #define S_7 %r9d
wolfSSL 4:1b0d80432c79 598
/* Clobber list appended to the asm statements of the macros in this file.
 * Despite the name it lists general-purpose registers, including every
 * register used for S_0..S_7 above.
 */
wolfSSL 4:1b0d80432c79 599 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
wolfSSL 4:1b0d80432c79 600
wolfSSL 4:1b0d80432c79 601 #if defined(HAVE_INTEL_RORX)
/* RND_STEP_RORX_1..4: slices of one round written with the RORX
 * (rotate right without touching flags) instruction. The slices pass
 * intermediate values to each other in edx, edi, esi and r8d, so they must
 * run in numeric order. The ">>" in the per-line comments denotes the
 * rotation done by rorx, not a shift.
 *
 * Step 1: edx = e rotated right by 6.
 */
wolfSSL 4:1b0d80432c79 602 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 603 __asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 4:1b0d80432c79 604
/* Step 2: edi = (e ror 11) ^ (e ror 6); edx is then reloaded with e ror 25. */
wolfSSL 4:1b0d80432c79 605 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 606 __asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 4:1b0d80432c79 607 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 4:1b0d80432c79 608 __asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 4:1b0d80432c79 609
/* Step 3: esi = Ch(e,f,g) computed as ((f ^ g) & e) ^ g, and edx = Sigma1(e). */
wolfSSL 4:1b0d80432c79 610 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 611 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 4:1b0d80432c79 612 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 4:1b0d80432c79 613 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 4:1b0d80432c79 614 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 4:1b0d80432c79 615 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 616
/* Step 4: h += W_K[i] (passed in as a register input operand), then
 * h += Sigma1(e) from edx; starts Sigma0(a) with r8d = a ror 2 and
 * edi = a ror 13. W_K must be in scope where the macro is used.
 */
wolfSSL 4:1b0d80432c79 617 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 618 /*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
wolfSSL 4:1b0d80432c79 619 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 4:1b0d80432c79 620 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 4:1b0d80432c79 621 __asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 4:1b0d80432c79 622 __asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\
wolfSSL 4:1b0d80432c79 623
wolfSSL 4:1b0d80432c79 624 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 625 __asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 4:1b0d80432c79 626 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\
wolfSSL 4:1b0d80432c79 627 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\
wolfSSL 4:1b0d80432c79 628
wolfSSL 4:1b0d80432c79 629 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 630 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 4:1b0d80432c79 631 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 4:1b0d80432c79 632 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\
wolfSSL 4:1b0d80432c79 633 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 4:1b0d80432c79 634
wolfSSL 4:1b0d80432c79 635 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 636 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 637 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 4:1b0d80432c79 638 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 4:1b0d80432c79 639
/* Final piece of a round (rorx variant): fold the accumulated sum into d and
 * produce the new value of h.
 * On entry h = h + w_k + Sigma1(e) + Ch(e,f,g), r8d = Maj(a,b,c) and
 * edx = Sigma0(a), as left by steps 1-7.
 * Every statement is extended asm carrying the SSE_REGs clobber list, so the
 * compiler is told about each register that gets written. */
wolfSSL 4:1b0d80432c79 640 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 641 __asm__ volatile("addl %"#h", %"#d"\n\t":::SSE_REGs); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 642 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = h-sum + Maj(a,b,c) */\
wolfSSL 4:1b0d80432c79 643 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); /* r8d += Sigma0(a) */\
wolfSSL 4:1b0d80432c79 644 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8",SSE_REGs); /* h = r8d */
wolfSSL 4:1b0d80432c79 645
wolfSSL 4:1b0d80432c79 646 #endif
wolfSSL 4:1b0d80432c79 647
/* RND_STEP_1 .. RND_STEP_7: the first seven of eight pieces of one SHA-256
 * round, using movl + roll for the rotations (a left rotate by 32-n stands in
 * for a right rotate by n).
 *   a..h  register names (already expanded, e.g. %r15d) of the working words
 *   i     index into the caller's W_K[] array (W[i] + K[i])
 * Intermediate values live in edx, edi, esi and r8d and are handed from one
 * step to the next, so the steps of a round must be issued in numeric order.
 * In the inline comments "x>>n" denotes a 32-bit rotate right by n. */
wolfSSL 4:1b0d80432c79 648 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 649 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
wolfSSL 4:1b0d80432c79 650 __asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 4:1b0d80432c79 651 __asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\
wolfSSL 4:1b0d80432c79 652
wolfSSL 4:1b0d80432c79 653 #define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 654 __asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 4:1b0d80432c79 655 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 4:1b0d80432c79 656 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\
wolfSSL 4:1b0d80432c79 657 __asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 4:1b0d80432c79 658
/* Ch(e,f,g) is computed as ((f ^ g) & e) ^ g. */
wolfSSL 4:1b0d80432c79 659 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 660 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 4:1b0d80432c79 661 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 4:1b0d80432c79 662 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 4:1b0d80432c79 663 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 4:1b0d80432c79 664 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 665
/* W_K[i] is passed as an "r" input, so the compiler loads it into a register
 * of its choosing before the addl. */
wolfSSL 4:1b0d80432c79 666 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 667 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 4:1b0d80432c79 668 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 4:1b0d80432c79 669 __asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\
wolfSSL 4:1b0d80432c79 670 __asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 4:1b0d80432c79 671 __asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\
wolfSSL 4:1b0d80432c79 672 __asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\
wolfSSL 4:1b0d80432c79 673 __asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\
wolfSSL 4:1b0d80432c79 674
wolfSSL 4:1b0d80432c79 675 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 676 __asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 4:1b0d80432c79 677 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\
wolfSSL 4:1b0d80432c79 678 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\
wolfSSL 4:1b0d80432c79 679
/* Maj(a,b,c) is computed as ((a | b) & c) | (a & b); this step produces the
 * first half and step 7 finishes it. */
wolfSSL 4:1b0d80432c79 680 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 681 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 4:1b0d80432c79 682 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 4:1b0d80432c79 683 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\
wolfSSL 4:1b0d80432c79 684 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 4:1b0d80432c79 685
wolfSSL 4:1b0d80432c79 686 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 687 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 688 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 4:1b0d80432c79 689 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 4:1b0d80432c79 690
/* Final piece of a round: fold the accumulated sum into d and produce the
 * new value of h.
 * On entry h = h + w_k + Sigma1(e) + Ch(e,f,g), r8d = Maj(a,b,c) and
 * edx = Sigma0(a), as left by steps 1-7.
 * Every statement is extended asm carrying the SSE_REGs clobber list, so the
 * compiler is told about each register that gets written. */
wolfSSL 4:1b0d80432c79 691 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 4:1b0d80432c79 692 __asm__ volatile("addl %"#h", %"#d"\n\t":::SSE_REGs); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 4:1b0d80432c79 693 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
wolfSSL 4:1b0d80432c79 694 /* r8d = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 4:1b0d80432c79 695 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
wolfSSL 4:1b0d80432c79 696 /* r8d = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 4:1b0d80432c79 697 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
wolfSSL 4:1b0d80432c79 698 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
wolfSSL 4:1b0d80432c79 699
/* One complete round: the eight RND_STEP_n pieces in order, with register
 * names a..h and W_K index i. */
wolfSSL 4:1b0d80432c79 700 #define RND_X(a,b,c,d,e,f,g,h,i) \
wolfSSL 4:1b0d80432c79 701 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 702 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 703 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 704 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 705 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 706 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 707 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 708 RND_STEP_8(a,b,c,d,e,f,g,h,i);
wolfSSL 4:1b0d80432c79 709
/* RND_n: one round with the register roles rotated. Each macro takes the
 * registers in the same fixed order and hands them to RND_X starting at S_n,
 * so RND_0 uses S_0 as 'a', RND_7 uses S_7 as 'a', and so on. This renames
 * the working words from round to round instead of moving them. */
wolfSSL 4:1b0d80432c79 710 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 4:1b0d80432c79 711 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 4:1b0d80432c79 712 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 4:1b0d80432c79 713 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 4:1b0d80432c79 714 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 4:1b0d80432c79 715 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 4:1b0d80432c79 716 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 4:1b0d80432c79 717 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 4:1b0d80432c79 718
wolfSSL 4:1b0d80432c79 719
/* A round cut into three braced groups (steps 1-3, 4-6, 7-8) so that other
 * statements can be placed between the groups. The groups of one round must
 * still run in this order because they pass values in scratch registers. */
wolfSSL 4:1b0d80432c79 720 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\
wolfSSL 4:1b0d80432c79 721 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 722 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 723 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 724 }
wolfSSL 4:1b0d80432c79 725
wolfSSL 4:1b0d80432c79 726 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\
wolfSSL 4:1b0d80432c79 727 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 728 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 729 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 730 }
wolfSSL 4:1b0d80432c79 731
wolfSSL 4:1b0d80432c79 732 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\
wolfSSL 4:1b0d80432c79 733 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 734 RND_STEP_8(a,b,c,d,e,f,g,h,i); \
wolfSSL 4:1b0d80432c79 735 }
wolfSSL 4:1b0d80432c79 736
/* NOTE(review): these eight definitions repeat, token for token, the
 * RND_0..RND_7 definitions that directly follow RND_X earlier in this
 * section. An identical macro redefinition is accepted by the preprocessor,
 * so this is harmless but redundant. */
wolfSSL 4:1b0d80432c79 737 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 4:1b0d80432c79 738 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 4:1b0d80432c79 739 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 4:1b0d80432c79 740 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 4:1b0d80432c79 741 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 4:1b0d80432c79 742 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 4:1b0d80432c79 743 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 4:1b0d80432c79 744 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 4:1b0d80432c79 745
wolfSSL 4:1b0d80432c79 746
/* RND_n_p: part p of a round whose register roles start at S_n.
 *   p = 0 -> steps 1-3 (RND_1_3)
 *   p = 1 -> steps 4-6 (RND_4_6)
 *   p = 2 -> steps 7-8 (RND_7_8)
 * The register rotation for each n matches the corresponding RND_n macro. */
wolfSSL 4:1b0d80432c79 747 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 4:1b0d80432c79 748 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 4:1b0d80432c79 749 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 4:1b0d80432c79 750 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 4:1b0d80432c79 751 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 4:1b0d80432c79 752 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 4:1b0d80432c79 753 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 4:1b0d80432c79 754 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 4:1b0d80432c79 755
wolfSSL 4:1b0d80432c79 756 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 4:1b0d80432c79 757 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 4:1b0d80432c79 758 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 4:1b0d80432c79 759 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 4:1b0d80432c79 760 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 4:1b0d80432c79 761 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 4:1b0d80432c79 762 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 4:1b0d80432c79 763 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 4:1b0d80432c79 764
wolfSSL 4:1b0d80432c79 765 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 4:1b0d80432c79 766 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 4:1b0d80432c79 767 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 4:1b0d80432c79 768 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 4:1b0d80432c79 769 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 4:1b0d80432c79 770 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 4:1b0d80432c79 771 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 4:1b0d80432c79 772 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 4:1b0d80432c79 773
/* FOR / END: bracket a counted loop written in assembly.
 *   cnt   int lvalue in memory used as the loop counter
 *   init  initial counter value (immediate)
 *   max   last counter value for which the body still runs (immediate)
 *   inc   increment applied by END (immediate)
 *   loop  label name; FOR defines it, END jumps back to it
 * FOR stores init into cnt and emits the label. END adds inc, compares with
 * max and jumps back while cnt <= max.
 * cnt is declared as an output ("=m") in FOR because the movl writes it, and
 * as read-write ("+m") in END because the addl both reads and writes it.
 * END also declares "cc" since addl/cmpl modify the flags.
 * NOTE(review): the backward jump targets a label inside a different asm
 * statement, which GCC's extended-asm rules do not support; the compiler is
 * unaware of the loop, so use with care. */
wolfSSL 4:1b0d80432c79 774 #define FOR(cnt, init, max, inc, loop) \
wolfSSL 4:1b0d80432c79 775 __asm__ volatile("movl $"#init", %0\n\t"#loop":" : "=m"(cnt))
wolfSSL 4:1b0d80432c79 776 #define END(cnt, init, max, inc, loop) \
wolfSSL 4:1b0d80432c79 777 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t" : "+m"(cnt) : : "cc") ;
wolfSSL 4:1b0d80432c79 778
wolfSSL 4:1b0d80432c79 779 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
wolfSSL 4:1b0d80432c79 780
wolfSSL 4:1b0d80432c79 781 #if defined(HAVE_INTEL_AVX1) /* inline Assembler for Intel AVX1 instructions */
wolfSSL 4:1b0d80432c79 782
/* One-instruction wrappers around AVX integer operations. Arguments are
 * register names including the leading '%' (e.g. %xmm4) and are stringified
 * into the template, so they must already be macro-expanded when they get
 * here. Operands are emitted in AT&T order: op1 is the destination, and a
 * numeric last argument becomes the immediate. XMM_REGs supplies the
 * clobber list. */
wolfSSL 4:1b0d80432c79 783 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 784 #define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 785 #define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 786 #define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 787 #define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 788 #define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 789 #define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 790 #define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 791 #define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 4:1b0d80432c79 792
/* Runs four rounds (W_K indices _i .. _i+3) on the scalar registers a..h
 * while computing the next four message-schedule words with AVX, one vector
 * instruction after each scalar step.
 *   X0..X3        vector registers holding the current 16 schedule words;
 *                 X0 is overwritten with the four new words
 *   XTMP0..XTMP5  vector scratch registers
 *   XFER          accepted but not referenced in the body
 *   SHUF_00BA, SHUF_DC00  registers holding the shuffle masks used to place
 *                 the s1 results in the low / high half
 *   a..h          scalar register names; their roles rotate by one position
 *                 for each of the four rounds
 * The RND_STEP_n pieces of each round must stay in numeric order. */
wolfSSL 4:1b0d80432c79 793 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
wolfSSL 4:1b0d80432c79 794 a,b,c,d,e,f,g,h,_i)\
wolfSSL 4:1b0d80432c79 795 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 796 VPALIGNR (XTMP0, X3, X2, 4) ;\
wolfSSL 4:1b0d80432c79 797 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 798 VPADDD (XTMP0, XTMP0, X0) ;\
wolfSSL 4:1b0d80432c79 799 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 800 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
wolfSSL 4:1b0d80432c79 801 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 802 VPSRLD (XTMP2, XTMP1, 7) ;\
wolfSSL 4:1b0d80432c79 803 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 804 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 4:1b0d80432c79 805 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 806 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP3 = W[-15] MY_ROR 7 */\
wolfSSL 4:1b0d80432c79 807 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 808 VPSRLD (XTMP2, XTMP1,18) ;\
wolfSSL 4:1b0d80432c79 809 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 810 \
wolfSSL 4:1b0d80432c79 811 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 812 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
wolfSSL 4:1b0d80432c79 813 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 814 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 4:1b0d80432c79 815 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 816 VPXOR (XTMP3, XTMP3, XTMP1) ;\
wolfSSL 4:1b0d80432c79 817 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 818 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 4:1b0d80432c79 819 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 820 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
wolfSSL 4:1b0d80432c79 821 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 822 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 4:1b0d80432c79 823 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 824 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 4:1b0d80432c79 825 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 826 \
wolfSSL 4:1b0d80432c79 827 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 828 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 4:1b0d80432c79 829 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 830 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 4:1b0d80432c79 831 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 832 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 4:1b0d80432c79 833 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 834 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 4:1b0d80432c79 835 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 836 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
wolfSSL 4:1b0d80432c79 837 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 838 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
wolfSSL 4:1b0d80432c79 839 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 840 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 4:1b0d80432c79 841 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 842 \
wolfSSL 4:1b0d80432c79 843 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 844 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 4:1b0d80432c79 845 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 846 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 4:1b0d80432c79 847 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 848 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 4:1b0d80432c79 849 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 850 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 4:1b0d80432c79 851 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 852 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 4:1b0d80432c79 853 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 854 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
wolfSSL 4:1b0d80432c79 855 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 856 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
wolfSSL 4:1b0d80432c79 857 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 858 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 4:1b0d80432c79 859
wolfSSL 4:1b0d80432c79 860 #if defined(HAVE_INTEL_RORX)
wolfSSL 4:1b0d80432c79 861
/* Same interleaving as MessageSched, but the scalar rounds use the
 * RND_STEP_RORX_n pieces: four rounds (W_K indices _i .. _i+3) on registers
 * a..h, with one AVX message-schedule instruction after each scalar step.
 *   X0..X3        vector registers holding the current 16 schedule words;
 *                 X0 is overwritten with the four new words
 *   XTMP0..XTMP5  vector scratch registers
 *   XFER          accepted but not referenced in the body
 *   SHUF_00BA, SHUF_DC00  registers holding the shuffle masks used to place
 *                 the s1 results in the low / high half
 *   a..h          scalar register names; their roles rotate by one position
 *                 for each of the four rounds */
wolfSSL 4:1b0d80432c79 862 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
wolfSSL 4:1b0d80432c79 863 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
wolfSSL 4:1b0d80432c79 864 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 865 VPALIGNR (XTMP0, X3, X2, 4) ;\
wolfSSL 4:1b0d80432c79 866 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 867 VPADDD (XTMP0, XTMP0, X0) ;\
wolfSSL 4:1b0d80432c79 868 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 869 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
wolfSSL 4:1b0d80432c79 870 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 871 VPSRLD (XTMP2, XTMP1, 7) ;\
wolfSSL 4:1b0d80432c79 872 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 873 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 4:1b0d80432c79 874 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 875 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP3 = W[-15] MY_ROR 7 */\
wolfSSL 4:1b0d80432c79 876 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 877 VPSRLD (XTMP2, XTMP1,18) ;\
wolfSSL 4:1b0d80432c79 878 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 4:1b0d80432c79 879 \
wolfSSL 4:1b0d80432c79 880 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 881 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
wolfSSL 4:1b0d80432c79 882 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 883 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 4:1b0d80432c79 884 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 885 VPXOR (XTMP3, XTMP3, XTMP1) ;\
wolfSSL 4:1b0d80432c79 886 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 887 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 4:1b0d80432c79 888 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 889 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
wolfSSL 4:1b0d80432c79 890 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 891 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 4:1b0d80432c79 892 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 893 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 4:1b0d80432c79 894 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 4:1b0d80432c79 895 \
wolfSSL 4:1b0d80432c79 896 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 897 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 4:1b0d80432c79 898 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 899 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 4:1b0d80432c79 900 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 901 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 4:1b0d80432c79 902 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 903 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 4:1b0d80432c79 904 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 905 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
wolfSSL 4:1b0d80432c79 906 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 907 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
wolfSSL 4:1b0d80432c79 908 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 909 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 4:1b0d80432c79 910 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 4:1b0d80432c79 911 \
wolfSSL 4:1b0d80432c79 912 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 913 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 4:1b0d80432c79 914 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 915 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 4:1b0d80432c79 916 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 917 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 4:1b0d80432c79 918 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 919 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 4:1b0d80432c79 920 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 921 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 4:1b0d80432c79 922 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 923 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
wolfSSL 4:1b0d80432c79 924 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 925 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
wolfSSL 4:1b0d80432c79 926 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 4:1b0d80432c79 927 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 4:1b0d80432c79 928
wolfSSL 4:1b0d80432c79 929 #endif
wolfSSL 4:1b0d80432c79 930
wolfSSL 4:1b0d80432c79 931
/* Loads the input block into xmm4..xmm7, 16 bytes at a time starting at
 * sha256->buffer[0], [4], [8] and [12], and runs each through vpshufb with
 * the mask in xmm13. The register numbers are written out literally here;
 * they correspond to the X0..X3 and BYTE_FLIP_MASK aliases defined below, so
 * xmm13 must have been loaded (Init_Masks) before this macro is used.
 * Expects a variable named sha256 in scope. */
wolfSSL 4:1b0d80432c79 932 #define W_K_from_buff\
wolfSSL 4:1b0d80432c79 933 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
wolfSSL 4:1b0d80432c79 934 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
wolfSSL 4:1b0d80432c79 935 :: "m"(sha256->buffer[0]):"%xmm4") ;\
wolfSSL 4:1b0d80432c79 936 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
wolfSSL 4:1b0d80432c79 937 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
wolfSSL 4:1b0d80432c79 938 ::"m"(sha256->buffer[4]):"%xmm5") ;\
wolfSSL 4:1b0d80432c79 939 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
wolfSSL 4:1b0d80432c79 940 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
wolfSSL 4:1b0d80432c79 941 ::"m"(sha256->buffer[8]):"%xmm6") ;\
wolfSSL 4:1b0d80432c79 942 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
wolfSSL 4:1b0d80432c79 943 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
wolfSSL 4:1b0d80432c79 944 ::"m"(sha256->buffer[12]):"%xmm7") ;\
wolfSSL 4:1b0d80432c79 945
/* Adds the four round constants starting at K[i] to the four schedule words
 * in 'reg' and stores the sums to W_K[i..i+3], going through xmm9 (the same
 * register as the XTMP5 alias).
 * NOTE(review): vmovdqa faults on a destination that is not 16-byte aligned;
 * confirm that the W_K array declared by the caller is aligned accordingly. */
wolfSSL 4:1b0d80432c79 946 #define _SET_W_K_XFER(reg, i)\
wolfSSL 4:1b0d80432c79 947 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
wolfSSL 4:1b0d80432c79 948 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ;
wolfSSL 4:1b0d80432c79 949
/* Extra macro level so that an alias such as X0 is expanded to its register
 * name before _SET_W_K_XFER stringifies it. */
wolfSSL 4:1b0d80432c79 950 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
wolfSSL 4:1b0d80432c79 951
/* vpshufb control masks, 16 bytes each. A control byte of 0xFF zeroes the
 * destination byte; other values select a source byte by index.
 *   mSHUF_00BA       copies source dwords 0 and 2 into the low half and
 *                    zeroes the high half
 *   mSHUF_DC00       zeroes the low half and copies source dwords 0 and 2
 *                    into the high half
 *   mBYTE_FLIP_MASK  reverses the byte order inside each 32-bit word */
wolfSSL 4:1b0d80432c79 952 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
wolfSSL 4:1b0d80432c79 953 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
wolfSSL 4:1b0d80432c79 954 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
wolfSSL 4:1b0d80432c79 955
wolfSSL 4:1b0d80432c79 956
/* Loads the three shuffle masks from memory into the given vector registers:
 * mask1 <- mBYTE_FLIP_MASK, mask2 <- mSHUF_00BA, mask3 <- mSHUF_DC00.
 * The statements declare no clobbers. */
wolfSSL 4:1b0d80432c79 957 #define _Init_Masks(mask1, mask2, mask3)\
wolfSSL 4:1b0d80432c79 958 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\
wolfSSL 4:1b0d80432c79 959 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\
wolfSSL 4:1b0d80432c79 960 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ;
wolfSSL 4:1b0d80432c79 961
/* Extra macro level so the register aliases are expanded before
 * _Init_Masks stringifies them. */
wolfSSL 4:1b0d80432c79 962 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
wolfSSL 4:1b0d80432c79 963 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
wolfSSL 4:1b0d80432c79 964
/* Vector register assignment for the AVX1 transform.
 * X0..X3 hold the 16 message-schedule words currently in use. */
wolfSSL 4:1b0d80432c79 965 #define X0 %xmm4
wolfSSL 4:1b0d80432c79 966 #define X1 %xmm5
wolfSSL 4:1b0d80432c79 967 #define X2 %xmm6
wolfSSL 4:1b0d80432c79 968 #define X3 %xmm7
wolfSSL 4:1b0d80432c79 969 #define X_ X0
wolfSSL 4:1b0d80432c79 970
/* Scratch registers for the message-schedule computation. */
wolfSSL 4:1b0d80432c79 971 #define XTMP0 %xmm0
wolfSSL 4:1b0d80432c79 972 #define XTMP1 %xmm1
wolfSSL 4:1b0d80432c79 973 #define XTMP2 %xmm2
wolfSSL 4:1b0d80432c79 974 #define XTMP3 %xmm3
wolfSSL 4:1b0d80432c79 975 #define XTMP4 %xmm8
wolfSSL 4:1b0d80432c79 976 #define XTMP5 %xmm9
wolfSSL 4:1b0d80432c79 977 #define XFER %xmm10
wolfSSL 4:1b0d80432c79 978
wolfSSL 4:1b0d80432c79 979 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
wolfSSL 4:1b0d80432c79 980 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
wolfSSL 4:1b0d80432c79 981 #define BYTE_FLIP_MASK %xmm13
wolfSSL 4:1b0d80432c79 982
/* Clobber list used by the vector asm statements; deliberately empty. */
wolfSSL 4:1b0d80432c79 983 #define XMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 4:1b0d80432c79 984 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */
wolfSSL 4:1b0d80432c79 985
wolfSSL 4:1b0d80432c79 986 static int Transform_AVX1(Sha256* sha256)
wolfSSL 4:1b0d80432c79 987 {
wolfSSL 4:1b0d80432c79 988
wolfSSL 4:1b0d80432c79 989 word32 W_K[64] ; /* temp for W+K */
wolfSSL 4:1b0d80432c79 990
wolfSSL 4:1b0d80432c79 991 #if defined(DEBUG_XMM)
wolfSSL 4:1b0d80432c79 992 int i, j ;
wolfSSL 4:1b0d80432c79 993 word32 xmm[29][4*15] ;
wolfSSL 4:1b0d80432c79 994 #endif
wolfSSL 4:1b0d80432c79 995
wolfSSL 4:1b0d80432c79 996 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
wolfSSL 4:1b0d80432c79 997 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
wolfSSL 4:1b0d80432c79 998
wolfSSL 4:1b0d80432c79 999 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 4:1b0d80432c79 1000
wolfSSL 4:1b0d80432c79 1001 SET_W_K_XFER(X0, 0) ;
wolfSSL 4:1b0d80432c79 1002 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1003 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 4:1b0d80432c79 1004 SET_W_K_XFER(X1, 4) ;
wolfSSL 4:1b0d80432c79 1005 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1006 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
wolfSSL 4:1b0d80432c79 1007 SET_W_K_XFER(X2, 8) ;
wolfSSL 4:1b0d80432c79 1008 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1009 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 4:1b0d80432c79 1010 SET_W_K_XFER(X3, 12) ;
wolfSSL 4:1b0d80432c79 1011 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1012 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
wolfSSL 4:1b0d80432c79 1013 SET_W_K_XFER(X0, 16) ;
wolfSSL 4:1b0d80432c79 1014 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1015 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 4:1b0d80432c79 1016 SET_W_K_XFER(X1, 20) ;
wolfSSL 4:1b0d80432c79 1017 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1018 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
wolfSSL 4:1b0d80432c79 1019 SET_W_K_XFER(X2, 24) ;
wolfSSL 4:1b0d80432c79 1020 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1021 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 4:1b0d80432c79 1022 SET_W_K_XFER(X3, 28) ;
wolfSSL 4:1b0d80432c79 1023 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1024 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
wolfSSL 4:1b0d80432c79 1025 SET_W_K_XFER(X0, 32) ;
wolfSSL 4:1b0d80432c79 1026 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1027 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 4:1b0d80432c79 1028 SET_W_K_XFER(X1, 36) ;
wolfSSL 4:1b0d80432c79 1029 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1030 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
wolfSSL 4:1b0d80432c79 1031 SET_W_K_XFER(X2, 40) ;
wolfSSL 4:1b0d80432c79 1032 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1033 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 4:1b0d80432c79 1034 SET_W_K_XFER(X3, 44) ;
wolfSSL 4:1b0d80432c79 1035 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 4:1b0d80432c79 1036 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
wolfSSL 4:1b0d80432c79 1037
wolfSSL 4:1b0d80432c79 1038 SET_W_K_XFER(X0, 48) ;
wolfSSL 4:1b0d80432c79 1039 SET_W_K_XFER(X1, 52) ;
wolfSSL 4:1b0d80432c79 1040 SET_W_K_XFER(X2, 56) ;
wolfSSL 4:1b0d80432c79 1041 SET_W_K_XFER(X3, 60) ;
wolfSSL 4:1b0d80432c79 1042
wolfSSL 4:1b0d80432c79 1043 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 4:1b0d80432c79 1044 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 4:1b0d80432c79 1045 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 4:1b0d80432c79 1046 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 4:1b0d80432c79 1047
wolfSSL 4:1b0d80432c79 1048 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 4:1b0d80432c79 1049 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 4:1b0d80432c79 1050 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 4:1b0d80432c79 1051 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 4:1b0d80432c79 1052
wolfSSL 4:1b0d80432c79 1053 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 4:1b0d80432c79 1054 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 4:1b0d80432c79 1055 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 4:1b0d80432c79 1056 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 4:1b0d80432c79 1057
wolfSSL 4:1b0d80432c79 1058 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 4:1b0d80432c79 1059 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 4:1b0d80432c79 1060 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 4:1b0d80432c79 1061 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 4:1b0d80432c79 1062
wolfSSL 4:1b0d80432c79 1063 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 4:1b0d80432c79 1064
wolfSSL 4:1b0d80432c79 1065 #if defined(DEBUG_XMM)
wolfSSL 4:1b0d80432c79 1066 for(i=0; i<29; i++) {
wolfSSL 4:1b0d80432c79 1067 for(j=0; j<4*14; j+=4)
wolfSSL 4:1b0d80432c79 1068 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
wolfSSL 4:1b0d80432c79 1069 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
wolfSSL 4:1b0d80432c79 1070 printf("\n") ;
wolfSSL 4:1b0d80432c79 1071 }
wolfSSL 4:1b0d80432c79 1072
wolfSSL 4:1b0d80432c79 1073 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
wolfSSL 4:1b0d80432c79 1074 #endif
wolfSSL 4:1b0d80432c79 1075
wolfSSL 4:1b0d80432c79 1076 return 0;
wolfSSL 4:1b0d80432c79 1077 }
wolfSSL 4:1b0d80432c79 1078
wolfSSL 4:1b0d80432c79 1079 #if defined(HAVE_INTEL_RORX)
/* Transform_AVX1_RORX: run one pass of the AVX1 transform over the state in
 * sha256, using the MessageSched_RORX schedule macro.
 *
 * The statement sequence is the same as the tail of the preceding AVX1
 * transform, with MessageSched_RORX in place of MessageSched.
 *
 * sha256   context passed to the macros below. All of Init_Masks,
 *          W_K_from_buff, DigestToReg, SET_W_K_XFER, MessageSched_RORX,
 *          RND_n and RegToDigest are defined outside this function; several
 *          of them reference the local names sha256 and W_K implicitly.
 *          NOTE(review): confirm exact operands against those definitions.
 *
 * Returns 0 unconditionally; there is no failure path in this function.
 *
 * The macros expand to inline assembly that keeps state in fixed registers
 * between statements, so the order of the statements must not be changed.
 */
wolfSSL 4:1b0d80432c79 1080 static int Transform_AVX1_RORX(Sha256* sha256)
wolfSSL 4:1b0d80432c79 1081 {
wolfSSL 4:1b0d80432c79 1082
wolfSSL 4:1b0d80432c79 1083 word32 W_K[64] ; /* temp for W+K */
wolfSSL 4:1b0d80432c79 1084
wolfSSL 4:1b0d80432c79 1085 #if defined(DEBUG_XMM)
/* Debug-only scratch: loop counters and a table printed at the end. */
wolfSSL 4:1b0d80432c79 1086 int i, j ;
wolfSSL 4:1b0d80432c79 1087 word32 xmm[29][4*15] ;
wolfSSL 4:1b0d80432c79 1088 #endif
wolfSSL 4:1b0d80432c79 1089
wolfSSL 4:1b0d80432c79 1090 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
wolfSSL 4:1b0d80432c79 1091 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
wolfSSL 4:1b0d80432c79 1092
wolfSSL 4:1b0d80432c79 1093 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
/* Offsets 0..44 in steps of 4: each step issues SET_W_K_XFER for one of
 * X0..X3 and then MessageSched_RORX with the X registers rotated by one.
 * The S_0..S_7 argument list alternates between starting at S_0 and
 * starting at S_4 from one step to the next.
 * (presumably each step covers four rounds - confirm in the macro) */
wolfSSL 4:1b0d80432c79 1094 SET_W_K_XFER(X0, 0) ;
wolfSSL 4:1b0d80432c79 1095 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1096 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 4:1b0d80432c79 1097 SET_W_K_XFER(X1, 4) ;
wolfSSL 4:1b0d80432c79 1098 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1099 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
wolfSSL 4:1b0d80432c79 1100 SET_W_K_XFER(X2, 8) ;
wolfSSL 4:1b0d80432c79 1101 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1102 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 4:1b0d80432c79 1103 SET_W_K_XFER(X3, 12) ;
wolfSSL 4:1b0d80432c79 1104 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1105 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
wolfSSL 4:1b0d80432c79 1106 SET_W_K_XFER(X0, 16) ;
wolfSSL 4:1b0d80432c79 1107 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1108 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 4:1b0d80432c79 1109 SET_W_K_XFER(X1, 20) ;
wolfSSL 4:1b0d80432c79 1110 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1111 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
wolfSSL 4:1b0d80432c79 1112 SET_W_K_XFER(X2, 24) ;
wolfSSL 4:1b0d80432c79 1113 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1114 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 4:1b0d80432c79 1115 SET_W_K_XFER(X3, 28) ;
wolfSSL 4:1b0d80432c79 1116 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1117 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
wolfSSL 4:1b0d80432c79 1118 SET_W_K_XFER(X0, 32) ;
wolfSSL 4:1b0d80432c79 1119 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1120 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 4:1b0d80432c79 1121 SET_W_K_XFER(X1, 36) ;
wolfSSL 4:1b0d80432c79 1122 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1123 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
wolfSSL 4:1b0d80432c79 1124 SET_W_K_XFER(X2, 40) ;
wolfSSL 4:1b0d80432c79 1125 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1126 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 4:1b0d80432c79 1127 SET_W_K_XFER(X3, 44) ;
wolfSSL 4:1b0d80432c79 1128 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 4:1b0d80432c79 1129 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
wolfSSL 4:1b0d80432c79 1130
/* Offsets 48..60: only SET_W_K_XFER is issued for X0..X3; no further
 * MessageSched_RORX call follows. */
wolfSSL 4:1b0d80432c79 1131 SET_W_K_XFER(X0, 48) ;
wolfSSL 4:1b0d80432c79 1132 SET_W_K_XFER(X1, 52) ;
wolfSSL 4:1b0d80432c79 1133 SET_W_K_XFER(X2, 56) ;
wolfSSL 4:1b0d80432c79 1134 SET_W_K_XFER(X3, 60) ;
wolfSSL 4:1b0d80432c79 1135
/* Rounds 48..63 are issued one macro per round; the RND_n suffix cycles
 * 0,7,6,5,4,3,2,1 while the S_0..S_7 argument list stays fixed. */
wolfSSL 4:1b0d80432c79 1136 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 4:1b0d80432c79 1137 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 4:1b0d80432c79 1138 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 4:1b0d80432c79 1139 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 4:1b0d80432c79 1140
wolfSSL 4:1b0d80432c79 1141 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 4:1b0d80432c79 1142 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 4:1b0d80432c79 1143 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 4:1b0d80432c79 1144 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 4:1b0d80432c79 1145
wolfSSL 4:1b0d80432c79 1146 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 4:1b0d80432c79 1147 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 4:1b0d80432c79 1148 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 4:1b0d80432c79 1149 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 4:1b0d80432c79 1150
wolfSSL 4:1b0d80432c79 1151 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 4:1b0d80432c79 1152 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 4:1b0d80432c79 1153 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 4:1b0d80432c79 1154 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 4:1b0d80432c79 1155
wolfSSL 4:1b0d80432c79 1156 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 4:1b0d80432c79 1157
wolfSSL 4:1b0d80432c79 1158 #if defined(DEBUG_XMM)
/* Debug dump of xmm[][] (14 groups of four words for each of 29 rows) and
 * of all 64 W_K entries.
 * NOTE(review): no statement in this function writes xmm[][] directly;
 * presumably the macros fill it when DEBUG_XMM is defined - confirm. */
wolfSSL 4:1b0d80432c79 1159 for(i=0; i<29; i++) {
wolfSSL 4:1b0d80432c79 1160 for(j=0; j<4*14; j+=4)
wolfSSL 4:1b0d80432c79 1161 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
wolfSSL 4:1b0d80432c79 1162 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
wolfSSL 4:1b0d80432c79 1163 printf("\n") ;
wolfSSL 4:1b0d80432c79 1164 }
wolfSSL 4:1b0d80432c79 1165
wolfSSL 4:1b0d80432c79 1166 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
wolfSSL 4:1b0d80432c79 1167 #endif
wolfSSL 4:1b0d80432c79 1168
wolfSSL 4:1b0d80432c79 1169 return 0;
wolfSSL 4:1b0d80432c79 1170 }
wolfSSL 4:1b0d80432c79 1171 #endif /* HAVE_INTEL_RORX */
wolfSSL 4:1b0d80432c79 1172
wolfSSL 4:1b0d80432c79 1173 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 4:1b0d80432c79 1174
wolfSSL 4:1b0d80432c79 1175
wolfSSL 4:1b0d80432c79 1176 #if defined(HAVE_INTEL_AVX2)
wolfSSL 4:1b0d80432c79 1177
/* AVX2 building blocks. Each macro emits one `__asm__ volatile` statement
 * (three instructions for _S_TEMP). Register arguments are bare names such
 * as ymm8 that are stringified with `#` and prefixed with "%%" in the
 * template, so they are fixed registers chosen by the caller rather than
 * compiler-allocated operands. The clobber list is the YMM_REGs macro,
 * which is defined further down as empty. */

/* vmovdqu between a memory operand and the named register. */
wolfSSL 4:1b0d80432c79 1178 #define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1179 #define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ;
/* vpshufb of ymm in place, with the shuffle control read from memory `map`. */
wolfSSL 4:1b0d80432c79 1180 #define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
wolfSSL 4:1b0d80432c79 1181 :: "m"(map):YMM_REGs) ;
/* vperm2i128 with immediate `map`; ymm0 is the destination register. */
wolfSSL 4:1b0d80432c79 1182 #define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
wolfSSL 4:1b0d80432c79 1183 #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
/* vpshufb of ymm1 into ymm0, with the shuffle control read from memory `map`. */
wolfSSL 4:1b0d80432c79 1184 #define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
wolfSSL 4:1b0d80432c79 1185 #ymm0"\n\t":: "m"(map):YMM_REGs) ;
/* Rotate each 32-bit element of src right by `bits` into dest:
 * dest = (src >> bits) | (src << (32 - bits)). `temp` is overwritten. */
wolfSSL 4:1b0d80432c79 1186 #define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 4:1b0d80432c79 1187 #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
wolfSSL 4:1b0d80432c79 1188 #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
/* Logical right shift of each 32-bit element of src by `bits` into dest. */
wolfSSL 4:1b0d80432c79 1189 #define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 4:1b0d80432c79 1190 #src", %%"#dest" ":::YMM_REGs) ;
/* Three-operand register forms: dest = src1 op src2, using vpxor, vpor and
 * vpaddd (32-bit element add) respectively. */
wolfSSL 4:1b0d80432c79 1191 #define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\
wolfSSL 4:1b0d80432c79 1192 #src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1193 #define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\
wolfSSL 4:1b0d80432c79 1194 #src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1195 #define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\
wolfSSL 4:1b0d80432c79 1196 #src2", %%"#dest" ":::YMM_REGs) ;
/* vpaddd with one addend taken from memory operand `mem`. */
wolfSSL 4:1b0d80432c79 1197 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
wolfSSL 4:1b0d80432c79 1198 #dest" "::"m"(mem):YMM_REGs) ;
/* vpblendd with immediate `map` choosing, per 32-bit element, between
 * src1 and src2. */
wolfSSL 4:1b0d80432c79 1199 #define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
wolfSSL 4:1b0d80432c79 1200 #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1201
/* Extract one 32-bit element from a vector register with vpextrd.
 * Despite its name, the `mem` argument is bound with an "=r" (general
 * register) output constraint.
 * _EXTRACT_XMM_0.._3 read elements 0..3 of the named xmm register. */
wolfSSL 4:1b0d80432c79 1202 #define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1203 #define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1204 #define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1205 #define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
/* _EXTRACT_XMM_4 first swaps the two 128-bit halves of `ymm` in place
 * (vperm2i128 $0x1) and then reads element 0 of `xmm`; the swap is not
 * undone. _EXTRACT_XMM_5.._7 read elements 1..3 with no swap of their own
 * (presumably meant to follow _EXTRACT_XMM_4 on the same register - confirm
 * at the call sites). */
wolfSSL 4:1b0d80432c79 1206 #define _EXTRACT_XMM_4(ymm, xmm, mem)\
wolfSSL 4:1b0d80432c79 1207 __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1208 __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1209 #define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1210 #define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1211 #define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1212
/* Swap the high and low 128-bit halves of `ymm` in place. SWAP_YMM_HL is
 * the wrapper that lets a register-alias macro argument expand before it
 * is stringified. */
wolfSSL 4:1b0d80432c79 1213 #define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;
wolfSSL 4:1b0d80432c79 1214 #define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
wolfSSL 4:1b0d80432c79 1215
/* Public spellings of the underscore-prefixed asm macros. The extra macro
 * level makes the preprocessor expand alias arguments (for example W_I_16,
 * defined below as ymm8) before the inner macro applies `#`, so the
 * assembly text contains the real register name. */
wolfSSL 4:1b0d80432c79 1216 #define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem)
wolfSSL 4:1b0d80432c79 1217 #define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm)
wolfSSL 4:1b0d80432c79 1218 #define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
wolfSSL 4:1b0d80432c79 1219 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
wolfSSL 4:1b0d80432c79 1220 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
wolfSSL 4:1b0d80432c79 1221 #define XOR(dest, src1, src2) _XOR(dest, src1, src2)
wolfSSL 4:1b0d80432c79 1222 #define OR(dest, src1, src2) _OR(dest, src1, src2)
wolfSSL 4:1b0d80432c79 1223 #define ADD(dest, src1, src2) _ADD(dest, src1, src2)
wolfSSL 4:1b0d80432c79 1224 #define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem)
wolfSSL 4:1b0d80432c79 1225 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
wolfSSL 4:1b0d80432c79 1226
/* AVX2_S: rotate right by `bits`, using the S_TEMP register alias as the
 * scratch register required by _S_TEMP. AVX2_R: plain logical right shift.
 * Note that S_TMP's expansion already ends in a semicolon. */
wolfSSL 4:1b0d80432c79 1227 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
wolfSSL 4:1b0d80432c79 1228 #define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
wolfSSL 4:1b0d80432c79 1229 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
wolfSSL 4:1b0d80432c79 1230
/* GAMMA0(dest, src): per 32-bit element,
 *   dest = ROTR(src, 7) ^ ROTR(src, 18) ^ (src >> 3).
 * G_TEMP is used as scratch and is overwritten.
 * GAMMA0_1 and GAMMA0_2 are the first and second halves of the same
 * statement sequence, so that a caller can place other statements between
 * them; GAMMA0_2 relies on dest and G_TEMP still holding what GAMMA0_1
 * left there. */
wolfSSL 4:1b0d80432c79 1231 #define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \
wolfSSL 4:1b0d80432c79 1232 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ;
wolfSSL 4:1b0d80432c79 1233 #define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18);
wolfSSL 4:1b0d80432c79 1234 #define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \
wolfSSL 4:1b0d80432c79 1235 XOR(dest, G_TEMP, dest) ;
wolfSSL 4:1b0d80432c79 1236
/* GAMMA1(dest, src): per 32-bit element,
 *   dest = ROTR(src, 17) ^ ROTR(src, 19) ^ (src >> 10).
 * Same scratch usage and the same _1/_2 split as GAMMA0. */
wolfSSL 4:1b0d80432c79 1237 #define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
wolfSSL 4:1b0d80432c79 1238 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ;
wolfSSL 4:1b0d80432c79 1239 #define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
wolfSSL 4:1b0d80432c79 1240 #define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \
wolfSSL 4:1b0d80432c79 1241 XOR(dest, G_TEMP, dest) ;
wolfSSL 4:1b0d80432c79 1242
/* FEEDBACKn_to_W_I_2: shuffle bytes of W_I into YMM_TEMP0 using the
 * mMAPntoW_I_2 table, then blend YMM_TEMP0 into W_I_2 under an immediate
 * mask (0x0c, 0x30, 0xc0 for n = 1, 2, 3). Variant 2 first passes W_I
 * through vperm2i128 $0x08 before the byte shuffle. YMM_TEMP0 is
 * overwritten by every variant.
 * (presumably this feeds newly completed words of W_I back into the
 * W[i-2] operand - confirm against the table contents and call sites) */
wolfSSL 4:1b0d80432c79 1243 #define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \
wolfSSL 4:1b0d80432c79 1244 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1245 #define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \
wolfSSL 4:1b0d80432c79 1246 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1247 #define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \
wolfSSL 4:1b0d80432c79 1248 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1249
/* FEEDBACK_to_W_I_7: same pattern as FEEDBACK2_to_W_I_2, but using the
 * mMAPtoW_I_7 table and blending into W_I_7 under mask 0x80. */
wolfSSL 4:1b0d80432c79 1250 #define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\
wolfSSL 4:1b0d80432c79 1251 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ;
wolfSSL 4:1b0d80432c79 1252
/* NOTE(review): no macro named `voitle` is defined in the visible source, so
 * this #undef has no effect; the name looks like a misspelling - confirm
 * and remove if unused. */
wolfSSL 4:1b0d80432c79 1253 #undef voitle
wolfSSL 4:1b0d80432c79 1254
/* Fixed register assignment for the AVX2 message schedule. The W_I_n names
 * hold the W[i-n] operand vectors, G_TEMP and S_TEMP are scratch for the
 * GAMMA and rotate macros. YMM_TEMP0 and W_K_TEMP both name ymm15 (xmm15 is
 * its low half), so the two aliases cannot be live at the same time. */
wolfSSL 4:1b0d80432c79 1255 #define W_I_16 ymm8
wolfSSL 4:1b0d80432c79 1256 #define W_I_15 ymm9
wolfSSL 4:1b0d80432c79 1257 #define W_I_7 ymm10
wolfSSL 4:1b0d80432c79 1258 #define W_I_2 ymm11
wolfSSL 4:1b0d80432c79 1259 #define W_I ymm12
wolfSSL 4:1b0d80432c79 1260 #define G_TEMP ymm13
wolfSSL 4:1b0d80432c79 1261 #define S_TEMP ymm14
wolfSSL 4:1b0d80432c79 1262 #define YMM_TEMP0 ymm15
wolfSSL 4:1b0d80432c79 1263 #define YMM_TEMP0x xmm15
wolfSSL 4:1b0d80432c79 1264 #define W_I_TEMP ymm7
wolfSSL 4:1b0d80432c79 1265 #define W_K_TEMP ymm15
wolfSSL 4:1b0d80432c79 1266 #define W_K_TEMPx xmm15
wolfSSL 4:1b0d80432c79 1267
/* YMM_REGs is the clobber list used by every asm macro above. It is
 * deliberately empty: the compiler is not told that ymm7..ymm15 are
 * modified, on the stated basis that the callers save them. */
wolfSSL 4:1b0d80432c79 1268 #define YMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 4:1b0d80432c79 1269 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
wolfSSL 4:1b0d80432c79 1270
wolfSSL 4:1b0d80432c79 1271
/* Register-shuffling steps used by ROTATE_W. Each macro is a fixed sequence
 * of vperm2i128 / vpblendd / vpshufd / vmovdqu statements on the named
 * registers.
 * (presumably they advance the W[i-16], W[i-15], W[i-7] and W[i-2] operand
 * vectors after eight new schedule words are available in w_i - confirm
 * lane by lane before changing anything)
 *
 * MOVE_15_to_16 builds w_i_16 from w_i_15 and w_i_7. It overwrites w_i_15
 * along the way (first with its own halves swapped, then with the
 * half-swapped w_i_7), so it must run before MOVE_7_to_15. */
wolfSSL 4:1b0d80432c79 1272 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
wolfSSL 4:1b0d80432c79 1273 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1274 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1275 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1276 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1277 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1278
/* MOVE_7_to_15: plain register copy, w_i_15 = w_i_7. */
wolfSSL 4:1b0d80432c79 1279 #define MOVE_7_to_15(w_i_15, w_i_7)\
wolfSSL 4:1b0d80432c79 1280 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1281
/* MOVE_I_to_7: w_i_7 is rebuilt from w_i only (half swap, blend with mask
 * 0x01, then vpshufd $0x39); the previous contents of w_i_7 are not used. */
wolfSSL 4:1b0d80432c79 1282 #define MOVE_I_to_7(w_i_7, w_i)\
wolfSSL 4:1b0d80432c79 1283 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1284 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1285 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1286
/* MOVE_I_to_2: w_i_2 is rebuilt from w_i only (half swap, then
 * vpshufd $0x0e). */
wolfSSL 4:1b0d80432c79 1287 #define MOVE_I_to_2(w_i_2, w_i)\
wolfSSL 4:1b0d80432c79 1288 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1289 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
wolfSSL 4:1b0d80432c79 1290
/* ROTATE_W: apply the four steps in dependency order (16 from 15 and 7,
 * then 15 from 7, then 7 and 2 from w_i). w_i itself is not modified. */
wolfSSL 4:1b0d80432c79 1291 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
wolfSSL 4:1b0d80432c79 1292 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \
wolfSSL 4:1b0d80432c79 1293 MOVE_7_to_15(w_i_15, w_i_7) ; \
wolfSSL 4:1b0d80432c79 1294 MOVE_I_to_7(w_i_7, w_i) ; \
wolfSSL 4:1b0d80432c79 1295 MOVE_I_to_2(w_i_2, w_i) ;\
wolfSSL 4:1b0d80432c79 1296
/* _RegToDigest: add the eight working values held in registers S_0..S_7
 * into sha256->digest[0..7].
 *
 * For each register the macro copies its 32-bit value into the local `d`
 * with movl and then does `sha256->digest[n] += d`. It expects a variable
 * named `sha256` to be in scope where it is expanded. The S_n arguments are
 * pasted after a single '%', and SSE_REGs is used as the clobber list; both
 * are defined outside this excerpt. */
wolfSSL 4:1b0d80432c79 1297 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 1298 { word32 d ;\
wolfSSL 4:1b0d80432c79 1299 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1300 sha256->digest[0] += d;\
wolfSSL 4:1b0d80432c79 1301 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1302 sha256->digest[1] += d;\
wolfSSL 4:1b0d80432c79 1303 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1304 sha256->digest[2] += d;\
wolfSSL 4:1b0d80432c79 1305 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1306 sha256->digest[3] += d;\
wolfSSL 4:1b0d80432c79 1307 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1308 sha256->digest[4] += d;\
wolfSSL 4:1b0d80432c79 1309 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1310 sha256->digest[5] += d;\
wolfSSL 4:1b0d80432c79 1311 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1312 sha256->digest[6] += d;\
wolfSSL 4:1b0d80432c79 1313 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1314 sha256->digest[7] += d;\
wolfSSL 4:1b0d80432c79 1315 }
wolfSSL 4:1b0d80432c79 1316
/* _DumpS: debugging aid that prints the eight working registers S_0..S_7.
 *
 * The values are first copied into d[0..7] with movl, printed on one line
 * with printf, and then written back from d[] into the same registers
 * (presumably because the printf call may overwrite them - confirm).
 * SSE_REGs, the clobber list, is defined outside this excerpt. */
wolfSSL 4:1b0d80432c79 1317 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 1318 { word32 d[8] ;\
wolfSSL 4:1b0d80432c79 1319 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1320 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1321 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1322 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1323 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1324 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1325 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1326 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1327 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
wolfSSL 4:1b0d80432c79 1328 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1329 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1330 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1331 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1332 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1333 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1334 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1335 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\
wolfSSL 4:1b0d80432c79 1336 }
wolfSSL 4:1b0d80432c79 1337
wolfSSL 4:1b0d80432c79 1338
/* Expansion wrappers: they forward to the underscore-prefixed macros so
 * that the S_0..S_7 arguments are macro-expanded before the inner macro
 * stringifies them. _DigestToReg is defined outside this excerpt. */
wolfSSL 4:1b0d80432c79 1339 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 1340 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 4:1b0d80432c79 1341
wolfSSL 4:1b0d80432c79 1342 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 1343 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 4:1b0d80432c79 1344
/* NOTE(review): the wrapper for _DumpS is spelled DumS; it is kept as-is
 * because callers may use this spelling - confirm. */
wolfSSL 4:1b0d80432c79 1345 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 4:1b0d80432c79 1346 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 4:1b0d80432c79 1347
wolfSSL 4:1b0d80432c79 1348
/* Shuffle-control tables for vpshufb, each exactly 32 bytes (one YMM
 * register). An index byte of 0x80 makes vpshufb write zero to that
 * destination byte.
 *
 * The element type is unsigned long long, not unsigned long: on targets
 * where long is 32 bits the 64-bit constants would be truncated and each
 * table would shrink to 16 bytes, while the instructions still read 32.
 *
 * mBYTE_FLIP_MASK_n reverse the byte order inside every 32-bit word.
 * The _16 and _15 tables are identical and keep all eight words; _7 zeroes
 * the last word; _2 keeps only the first two words and zeroes the rest.
 */
static const unsigned long long mBYTE_FLIP_MASK_16[] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL } ;
static const unsigned long long mBYTE_FLIP_MASK_15[] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL } ;
static const unsigned long long mBYTE_FLIP_MASK_7 [] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x8080808008090a0bULL } ;
static const unsigned long long mBYTE_FLIP_MASK_2 [] =
    { 0x0405060700010203ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x8080808080808080ULL } ;

/* mMAP* tables are used by the FEEDBACK*_to_W_I_* macros. Each one copies a
 * few source bytes, without reordering them, into one slot of the result
 * and zeroes every other byte:
 *   mMAPtoW_I_7   4 bytes into the last 32-bit word
 *   mMAP1toW_I_2  8 bytes into the second 64-bit quarter
 *   mMAP2toW_I_2  8 bytes into the third 64-bit quarter
 *   mMAP3toW_I_2  8 bytes into the fourth 64-bit quarter
 */
static const unsigned long long mMAPtoW_I_7[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x0302010080808080ULL } ;
static const unsigned long long mMAP1toW_I_2[] =
    { 0x8080808080808080ULL, 0x0706050403020100ULL,
      0x8080808080808080ULL, 0x8080808080808080ULL } ;
static const unsigned long long mMAP2toW_I_2[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x0f0e0d0c0b0a0908ULL, 0x8080808080808080ULL } ;
static const unsigned long long mMAP3toW_I_2[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x0706050403020100ULL } ;
wolfSSL 4:1b0d80432c79 1367
wolfSSL 4:1b0d80432c79 1368 static int Transform_AVX2(Sha256* sha256)
wolfSSL 4:1b0d80432c79 1369 {
wolfSSL 4:1b0d80432c79 1370
wolfSSL 4:1b0d80432c79 1371 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 4:1b0d80432c79 1372 word32* W_K;
wolfSSL 4:1b0d80432c79 1373 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 4:1b0d80432c79 1374 if (W_K == NULL)
wolfSSL 4:1b0d80432c79 1375 return MEMORY_E;
wolfSSL 4:1b0d80432c79 1376 #else
wolfSSL 4:1b0d80432c79 1377 word32 W_K[64] ;
wolfSSL 4:1b0d80432c79 1378 #endif
wolfSSL 4:1b0d80432c79 1379
wolfSSL 4:1b0d80432c79 1380 MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
wolfSSL 4:1b0d80432c79 1381 MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ;
wolfSSL 4:1b0d80432c79 1382 MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ;
wolfSSL 4:1b0d80432c79 1383 MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ;
wolfSSL 4:1b0d80432c79 1384 MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ;
wolfSSL 4:1b0d80432c79 1385
wolfSSL 4:1b0d80432c79 1386 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 4:1b0d80432c79 1387
wolfSSL 4:1b0d80432c79 1388 ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;
wolfSSL 4:1b0d80432c79 1389 MOVE_to_MEM(W_K[0], W_K_TEMP) ;
wolfSSL 4:1b0d80432c79 1390
wolfSSL 4:1b0d80432c79 1391 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 4:1b0d80432c79 1392 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ;
wolfSSL 4:1b0d80432c79 1393 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ;
wolfSSL 4:1b0d80432c79 1394 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ;
wolfSSL 4:1b0d80432c79 1395 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ;
wolfSSL 4:1b0d80432c79 1396 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ;
wolfSSL 4:1b0d80432c79 1397 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ;
wolfSSL 4:1b0d80432c79 1398 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ;
wolfSSL 4:1b0d80432c79 1399
wolfSSL 4:1b0d80432c79 1400 ADD_MEM(YMM_TEMP0, W_I, K[8]) ;
wolfSSL 4:1b0d80432c79 1401 MOVE_to_MEM(W_K[8], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1402
wolfSSL 4:1b0d80432c79 1403 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1404 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 4:1b0d80432c79 1405 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1406 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 4:1b0d80432c79 1407 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1408 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 4:1b0d80432c79 1409 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1410 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 4:1b0d80432c79 1411 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1412 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 4:1b0d80432c79 1413 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1414 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 4:1b0d80432c79 1415 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1416 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 4:1b0d80432c79 1417 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1418 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 4:1b0d80432c79 1419 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1420 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 4:1b0d80432c79 1421 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1422 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 4:1b0d80432c79 1423 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1424 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 4:1b0d80432c79 1425 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1426 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 4:1b0d80432c79 1427 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1428 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 4:1b0d80432c79 1429 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1430 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 4:1b0d80432c79 1431 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1432 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 4:1b0d80432c79 1433 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1434 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 4:1b0d80432c79 1435 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1436 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 4:1b0d80432c79 1437 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1438 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 4:1b0d80432c79 1439 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1440 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 4:1b0d80432c79 1441 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1442 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 4:1b0d80432c79 1443 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 4:1b0d80432c79 1444 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1445 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 4:1b0d80432c79 1446
wolfSSL 4:1b0d80432c79 1447 MOVE_to_REG(YMM_TEMP0, K[16]) ;
wolfSSL 4:1b0d80432c79 1448 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 4:1b0d80432c79 1449 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1450 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 4:1b0d80432c79 1451 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1452 MOVE_to_MEM(W_K[16], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1453
wolfSSL 4:1b0d80432c79 1454 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1455 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 4:1b0d80432c79 1456 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1457 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 4:1b0d80432c79 1458 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1459 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 4:1b0d80432c79 1460 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1461 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 4:1b0d80432c79 1462 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1463 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 4:1b0d80432c79 1464 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1465 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 4:1b0d80432c79 1466 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1467 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 4:1b0d80432c79 1468 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1469 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 4:1b0d80432c79 1470 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1471 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 4:1b0d80432c79 1472 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1473 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 4:1b0d80432c79 1474 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1475 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 4:1b0d80432c79 1476 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1477 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 4:1b0d80432c79 1478 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1479 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 4:1b0d80432c79 1480 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1481 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 4:1b0d80432c79 1482 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1483 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 4:1b0d80432c79 1484 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1485 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 4:1b0d80432c79 1486 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1487 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 4:1b0d80432c79 1488 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1489 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 4:1b0d80432c79 1490 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1491 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 4:1b0d80432c79 1492 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1493 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 4:1b0d80432c79 1494 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1495 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 4:1b0d80432c79 1496 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1497 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 4:1b0d80432c79 1498
wolfSSL 4:1b0d80432c79 1499 MOVE_to_REG(YMM_TEMP0, K[24]) ;
wolfSSL 4:1b0d80432c79 1500 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 4:1b0d80432c79 1501 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1502 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 4:1b0d80432c79 1503 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1504 MOVE_to_MEM(W_K[24], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1505
wolfSSL 4:1b0d80432c79 1506 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1507 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 4:1b0d80432c79 1508 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1509 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 4:1b0d80432c79 1510 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1511 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 4:1b0d80432c79 1512 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1513 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 4:1b0d80432c79 1514 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1515 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 4:1b0d80432c79 1516 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1517 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 4:1b0d80432c79 1518 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1519 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 4:1b0d80432c79 1520 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1521 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 4:1b0d80432c79 1522 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1523 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 4:1b0d80432c79 1524 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1525 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 4:1b0d80432c79 1526 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1527 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 4:1b0d80432c79 1528 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1529 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 4:1b0d80432c79 1530 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1531 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 4:1b0d80432c79 1532 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1533 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 4:1b0d80432c79 1534 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1535 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 4:1b0d80432c79 1536 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1537 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 4:1b0d80432c79 1538 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1539 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 4:1b0d80432c79 1540 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1541 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 4:1b0d80432c79 1542 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1543 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 4:1b0d80432c79 1544 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1545 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 4:1b0d80432c79 1546 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 4:1b0d80432c79 1547 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1548 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 4:1b0d80432c79 1549
wolfSSL 4:1b0d80432c79 1550 MOVE_to_REG(YMM_TEMP0, K[32]) ;
wolfSSL 4:1b0d80432c79 1551 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 4:1b0d80432c79 1552 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1553 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 4:1b0d80432c79 1554 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1555 MOVE_to_MEM(W_K[32], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1556
wolfSSL 4:1b0d80432c79 1557
wolfSSL 4:1b0d80432c79 1558 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1559 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 4:1b0d80432c79 1560 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1561 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 4:1b0d80432c79 1562 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1563 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 4:1b0d80432c79 1564 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1565 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 4:1b0d80432c79 1566 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1567 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 4:1b0d80432c79 1568 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1569 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 4:1b0d80432c79 1570 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1571 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 4:1b0d80432c79 1572 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1573 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 4:1b0d80432c79 1574 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1575 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 4:1b0d80432c79 1576 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1577 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 4:1b0d80432c79 1578 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1579 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 4:1b0d80432c79 1580 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1581 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 4:1b0d80432c79 1582 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1583 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 4:1b0d80432c79 1584 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1585 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 4:1b0d80432c79 1586 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1587 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 4:1b0d80432c79 1588 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1589 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 4:1b0d80432c79 1590 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1591 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 4:1b0d80432c79 1592 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1593 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 4:1b0d80432c79 1594 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1595 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 4:1b0d80432c79 1596 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1597 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 4:1b0d80432c79 1598 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1599 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 4:1b0d80432c79 1600 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1601 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 4:1b0d80432c79 1602
wolfSSL 4:1b0d80432c79 1603 MOVE_to_REG(YMM_TEMP0, K[40]) ;
wolfSSL 4:1b0d80432c79 1604 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 4:1b0d80432c79 1605 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1606 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 4:1b0d80432c79 1607 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1608 MOVE_to_MEM(W_K[40], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1609
wolfSSL 4:1b0d80432c79 1610 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1611 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 4:1b0d80432c79 1612 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1613 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 4:1b0d80432c79 1614 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1615 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 4:1b0d80432c79 1616 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1617 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 4:1b0d80432c79 1618 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1619 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 4:1b0d80432c79 1620 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1621 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 4:1b0d80432c79 1622 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1623 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 4:1b0d80432c79 1624 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1625 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 4:1b0d80432c79 1626 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1627 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 4:1b0d80432c79 1628 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1629 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 4:1b0d80432c79 1630 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1631 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 4:1b0d80432c79 1632 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1633 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 4:1b0d80432c79 1634 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1635 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 4:1b0d80432c79 1636 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1637 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 4:1b0d80432c79 1638 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1639 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 4:1b0d80432c79 1640 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1641 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 4:1b0d80432c79 1642 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1643 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 4:1b0d80432c79 1644 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1645 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 4:1b0d80432c79 1646 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1647 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 4:1b0d80432c79 1648 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1649 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 4:1b0d80432c79 1650 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1651 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 4:1b0d80432c79 1652 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1653 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 4:1b0d80432c79 1654
wolfSSL 4:1b0d80432c79 1655 MOVE_to_REG(YMM_TEMP0, K[48]) ;
wolfSSL 4:1b0d80432c79 1656 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 4:1b0d80432c79 1657 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1658 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 4:1b0d80432c79 1659 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1660 MOVE_to_MEM(W_K[48], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1661
wolfSSL 4:1b0d80432c79 1662 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 4:1b0d80432c79 1663 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 4:1b0d80432c79 1664 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1665 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 4:1b0d80432c79 1666 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 4:1b0d80432c79 1667 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 4:1b0d80432c79 1668 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 4:1b0d80432c79 1669 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 4:1b0d80432c79 1670 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1671 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 4:1b0d80432c79 1672 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1673 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 4:1b0d80432c79 1674 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1675 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 4:1b0d80432c79 1676 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 4:1b0d80432c79 1677 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 4:1b0d80432c79 1678 FEEDBACK1_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1679 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 4:1b0d80432c79 1680 FEEDBACK_to_W_I_7 ;
wolfSSL 4:1b0d80432c79 1681 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 4:1b0d80432c79 1682 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 4:1b0d80432c79 1683 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 4:1b0d80432c79 1684 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1685 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 4:1b0d80432c79 1686 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1687 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 4:1b0d80432c79 1688 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 4:1b0d80432c79 1689 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 4:1b0d80432c79 1690 FEEDBACK2_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1691 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 4:1b0d80432c79 1692 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1693 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 4:1b0d80432c79 1694 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1695 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 4:1b0d80432c79 1696 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 4:1b0d80432c79 1697 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 4:1b0d80432c79 1698 FEEDBACK3_to_W_I_2 ;
wolfSSL 4:1b0d80432c79 1699 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 4:1b0d80432c79 1700 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1701 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 4:1b0d80432c79 1702 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 4:1b0d80432c79 1703 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 4:1b0d80432c79 1704 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 4:1b0d80432c79 1705 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 4:1b0d80432c79 1706
wolfSSL 4:1b0d80432c79 1707 MOVE_to_REG(YMM_TEMP0, K[56]) ;
wolfSSL 4:1b0d80432c79 1708 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 4:1b0d80432c79 1709 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 4:1b0d80432c79 1710 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 4:1b0d80432c79 1711 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 4:1b0d80432c79 1712 MOVE_to_MEM(W_K[56], YMM_TEMP0) ;
wolfSSL 4:1b0d80432c79 1713
wolfSSL 4:1b0d80432c79 1714 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 4:1b0d80432c79 1715 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 4:1b0d80432c79 1716 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 4:1b0d80432c79 1717 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 4:1b0d80432c79 1718
wolfSSL 4:1b0d80432c79 1719 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 4:1b0d80432c79 1720 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 4:1b0d80432c79 1721 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 4:1b0d80432c79 1722 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 4:1b0d80432c79 1723
wolfSSL 4:1b0d80432c79 1724 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 4:1b0d80432c79 1725
wolfSSL 4:1b0d80432c79 1726 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 4:1b0d80432c79 1727 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 4:1b0d80432c79 1728 #endif
wolfSSL 4:1b0d80432c79 1729
wolfSSL 4:1b0d80432c79 1730 return 0;
wolfSSL 4:1b0d80432c79 1731 }
wolfSSL 4:1b0d80432c79 1732
wolfSSL 4:1b0d80432c79 1733 #endif /* HAVE_INTEL_AVX2 */
wolfSSL 4:1b0d80432c79 1734
wolfSSL 4:1b0d80432c79 1735 #endif /* HAVE_FIPS */
wolfSSL 4:1b0d80432c79 1736
wolfSSL 4:1b0d80432c79 1737 #endif /* WOLFSSL_TI_HASH */
wolfSSL 4:1b0d80432c79 1738
wolfSSL 4:1b0d80432c79 1739 #endif /* NO_SHA256 */
wolfSSL 4:1b0d80432c79 1740
wolfSSL 4:1b0d80432c79 1741