Xuyi Wang / wolfSSL

Dependents:   OS

Committer:
wolfSSL
Date:
Fri Jun 26 00:39:20 2015 +0000
Revision:
0:d92f9d21154c
wolfSSL 3.6.0

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 0:d92f9d21154c 1 /* sha256.c
wolfSSL 0:d92f9d21154c 2 *
wolfSSL 0:d92f9d21154c 3 * Copyright (C) 2006-2015 wolfSSL Inc.
wolfSSL 0:d92f9d21154c 4 *
wolfSSL 0:d92f9d21154c 5 * This file is part of wolfSSL. (formerly known as CyaSSL)
wolfSSL 0:d92f9d21154c 6 *
wolfSSL 0:d92f9d21154c 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 0:d92f9d21154c 8 * it under the terms of the GNU General Public License as published by
wolfSSL 0:d92f9d21154c 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 0:d92f9d21154c 10 * (at your option) any later version.
wolfSSL 0:d92f9d21154c 11 *
wolfSSL 0:d92f9d21154c 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 0:d92f9d21154c 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 0:d92f9d21154c 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 0:d92f9d21154c 15 * GNU General Public License for more details.
wolfSSL 0:d92f9d21154c 16 *
wolfSSL 0:d92f9d21154c 17 * You should have received a copy of the GNU General Public License
wolfSSL 0:d92f9d21154c 18 * along with this program; if not, write to the Free Software
wolfSSL 0:d92f9d21154c 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 0:d92f9d21154c 20 */
wolfSSL 0:d92f9d21154c 21
wolfSSL 0:d92f9d21154c 22 /* code submitted by raphael.huck@efixo.com */
wolfSSL 0:d92f9d21154c 23
wolfSSL 0:d92f9d21154c 24 #ifdef HAVE_CONFIG_H
wolfSSL 0:d92f9d21154c 25 #include <config.h>
wolfSSL 0:d92f9d21154c 26 #endif
wolfSSL 0:d92f9d21154c 27
wolfSSL 0:d92f9d21154c 28 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 0:d92f9d21154c 29 #include <wolfssl/wolfcrypt/sha256.h>
wolfSSL 0:d92f9d21154c 30
wolfSSL 0:d92f9d21154c 31 #if !defined(NO_SHA256)
wolfSSL 0:d92f9d21154c 32 #ifdef HAVE_FIPS
wolfSSL 0:d92f9d21154c 33
wolfSSL 0:d92f9d21154c 34 int wc_InitSha256(Sha256* sha)
wolfSSL 0:d92f9d21154c 35 {
wolfSSL 0:d92f9d21154c 36 return InitSha256_fips(sha);
wolfSSL 0:d92f9d21154c 37 }
wolfSSL 0:d92f9d21154c 38
wolfSSL 0:d92f9d21154c 39
wolfSSL 0:d92f9d21154c 40 int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
wolfSSL 0:d92f9d21154c 41 {
wolfSSL 0:d92f9d21154c 42 return Sha256Update_fips(sha, data, len);
wolfSSL 0:d92f9d21154c 43 }
wolfSSL 0:d92f9d21154c 44
wolfSSL 0:d92f9d21154c 45
wolfSSL 0:d92f9d21154c 46 int wc_Sha256Final(Sha256* sha, byte* out)
wolfSSL 0:d92f9d21154c 47 {
wolfSSL 0:d92f9d21154c 48 return Sha256Final_fips(sha, out);
wolfSSL 0:d92f9d21154c 49 }
wolfSSL 0:d92f9d21154c 50
wolfSSL 0:d92f9d21154c 51
wolfSSL 0:d92f9d21154c 52 int wc_Sha256Hash(const byte* data, word32 len, byte* out)
wolfSSL 0:d92f9d21154c 53 {
wolfSSL 0:d92f9d21154c 54 return Sha256Hash(data, len, out);
wolfSSL 0:d92f9d21154c 55 }
wolfSSL 0:d92f9d21154c 56
wolfSSL 0:d92f9d21154c 57 #else /* else build without fips */
wolfSSL 0:d92f9d21154c 58
wolfSSL 0:d92f9d21154c 59 #if !defined(NO_SHA256) && defined(WOLFSSL_TI_HASH)
wolfSSL 0:d92f9d21154c 60 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
wolfSSL 0:d92f9d21154c 61 #else
wolfSSL 0:d92f9d21154c 62
/* ALIGN32: compiler-specific qualifier requesting 32-byte alignment.
 * Defined only if the build has not already provided it; expands to nothing
 * on compilers that are neither GCC-compatible nor MSVC. */
wolfSSL 0:d92f9d21154c 63 #if !defined (ALIGN32)
wolfSSL 0:d92f9d21154c 64 #if defined (__GNUC__)
wolfSSL 0:d92f9d21154c 65 #define ALIGN32 __attribute__ ( (aligned (32)))
wolfSSL 0:d92f9d21154c 66 #elif defined(_MSC_VER)
wolfSSL 0:d92f9d21154c 67 /* disable align warning, we want alignment ! */
wolfSSL 0:d92f9d21154c 68 #pragma warning(disable: 4324)
wolfSSL 0:d92f9d21154c 69 #define ALIGN32 __declspec (align (32))
wolfSSL 0:d92f9d21154c 70 #else
wolfSSL 0:d92f9d21154c 71 #define ALIGN32
wolfSSL 0:d92f9d21154c 72 #endif
wolfSSL 0:d92f9d21154c 73 #endif
wolfSSL 0:d92f9d21154c 74
/* PIC32MZ hash build: the public entry points defined in this file are
 * renamed with a _sw suffix. */
wolfSSL 0:d92f9d21154c 75 #ifdef WOLFSSL_PIC32MZ_HASH
wolfSSL 0:d92f9d21154c 76 #define wc_InitSha256 wc_InitSha256_sw
wolfSSL 0:d92f9d21154c 77 #define wc_Sha256Update wc_Sha256Update_sw
wolfSSL 0:d92f9d21154c 78 #define wc_Sha256Final wc_Sha256Final_sw
wolfSSL 0:d92f9d21154c 79 #endif
wolfSSL 0:d92f9d21154c 80
/* NOTE(review): this check sits inside the #else branch of the earlier
 * "#ifdef HAVE_FIPS", so it looks unreachable here - confirm. */
wolfSSL 0:d92f9d21154c 81 #ifdef HAVE_FIPS
wolfSSL 0:d92f9d21154c 82 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
wolfSSL 0:d92f9d21154c 83 #define FIPS_NO_WRAPPERS
wolfSSL 0:d92f9d21154c 84 #endif
wolfSSL 0:d92f9d21154c 85
/* USE_INTEL_SPEEDUP turns on both the AVX1 and the AVX2 code paths. */
wolfSSL 0:d92f9d21154c 86 #if defined(USE_INTEL_SPEEDUP)
wolfSSL 0:d92f9d21154c 87 #define HAVE_INTEL_AVX1
wolfSSL 0:d92f9d21154c 88 #define HAVE_INTEL_AVX2
wolfSSL 0:d92f9d21154c 89
wolfSSL 0:d92f9d21154c 90 #if defined(DEBUG_XMM)
wolfSSL 0:d92f9d21154c 91 #include "stdio.h"
wolfSSL 0:d92f9d21154c 92 #endif
wolfSSL 0:d92f9d21154c 93
wolfSSL 0:d92f9d21154c 94 #endif
wolfSSL 0:d92f9d21154c 95
/* The AVX2 build also enables the RORX-based round macros. */
wolfSSL 0:d92f9d21154c 96 #if defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 97 #define HAVE_INTEL_RORX
wolfSSL 0:d92f9d21154c 98 #endif
wolfSSL 0:d92f9d21154c 99
wolfSSL 0:d92f9d21154c 100
wolfSSL 0:d92f9d21154c 101 /*****
wolfSSL 0:d92f9d21154c 102 Intel AVX1/AVX2 Macro Control Structure
wolfSSL 0:d92f9d21154c 103
wolfSSL 0:d92f9d21154c 104 #define HAVE_INTEL_AVX1
wolfSSL 0:d92f9d21154c 105 #define HAVE_INTEL_AVX2
wolfSSL 0:d92f9d21154c 106
wolfSSL 0:d92f9d21154c 107 #define HAVE_INTEL_RORX
wolfSSL 0:d92f9d21154c 108
wolfSSL 0:d92f9d21154c 109
wolfSSL 0:d92f9d21154c 110 int InitSha256(Sha256* sha256) {
wolfSSL 0:d92f9d21154c 111 Save/Recover XMM, YMM
wolfSSL 0:d92f9d21154c 112 ...
wolfSSL 0:d92f9d21154c 113 }
wolfSSL 0:d92f9d21154c 114
wolfSSL 0:d92f9d21154c 115 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 116 Transform() ; Function prototype
wolfSSL 0:d92f9d21154c 117 #else
wolfSSL 0:d92f9d21154c 118 Transform() { }
wolfSSL 0:d92f9d21154c 119 int Sha256Final() {
wolfSSL 0:d92f9d21154c 120 Save/Recover XMM, YMM
wolfSSL 0:d92f9d21154c 121 ...
wolfSSL 0:d92f9d21154c 122 }
wolfSSL 0:d92f9d21154c 123 #endif
wolfSSL 0:d92f9d21154c 124
wolfSSL 0:d92f9d21154c 125 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 126 #if defined(HAVE_INTEL_RORX)
wolfSSL 0:d92f9d21154c 127 #define RND with rorx instruction
wolfSSL 0:d92f9d21154c 128 #else
wolfSSL 0:d92f9d21154c 129 #define RND
wolfSSL 0:d92f9d21154c 130 #endif
wolfSSL 0:d92f9d21154c 131 #endif
wolfSSL 0:d92f9d21154c 132
wolfSSL 0:d92f9d21154c 133 #if defined(HAVE_INTEL_AVX1)
wolfSSL 0:d92f9d21154c 134
wolfSSL 0:d92f9d21154c 135 #define XMM Instructions/inline asm
wolfSSL 0:d92f9d21154c 136
wolfSSL 0:d92f9d21154c 137 int Transform() {
wolfSSL 0:d92f9d21154c 138 Stitched Message Sched/Round
wolfSSL 0:d92f9d21154c 139 }
wolfSSL 0:d92f9d21154c 140
wolfSSL 0:d92f9d21154c 141 #elif defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 142
wolfSSL 0:d92f9d21154c 143 #define YMM Instructions/inline asm
wolfSSL 0:d92f9d21154c 144
wolfSSL 0:d92f9d21154c 145 int Transform() {
wolfSSL 0:d92f9d21154c 146 More granular Stitched Message Sched/Round
wolfSSL 0:d92f9d21154c 147 }
wolfSSL 0:d92f9d21154c 148
wolfSSL 0:d92f9d21154c 149 */
wolfSSL 0:d92f9d21154c 150
wolfSSL 0:d92f9d21154c 151
wolfSSL 0:d92f9d21154c 152 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 153
wolfSSL 0:d92f9d21154c 154 /* Each platform needs to query info types 1 and 7 from cpuid to see if
wolfSSL 0:d92f9d21154c 155 * AVX1/AVX2 is supported. Also, let's setup a macro for proper linkage w/o ABI conflicts
wolfSSL 0:d92f9d21154c 156 */
wolfSSL 0:d92f9d21154c 157
/* cpuid(reg, leaf, sub): execute CPUID with EAX=leaf and ECX=sub and store
 * EAX, EBX, ECX, EDX into reg[0..3].
 * XASM_LINK(f): assembler-level symbol name (GCC style); empty on MSVC. */
#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
             "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
             "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    /* Every call site passes (reg, leaf, sub), so the MSVC variant must take
     * three arguments as well; __cpuidex() accepts the sub-leaf, which the
     * two-argument __cpuid() cannot express. */
    #define cpuid(reg, leaf, sub) \
            __cpuidex((int*)(reg), (int)(leaf), (int)(sub))

    #define XASM_LINK(f)

#endif /* _MSC_VER */
wolfSSL 0:d92f9d21154c 173
/* Indices into the reg[] array filled by cpuid(). */
wolfSSL 0:d92f9d21154c 174 #define EAX 0
wolfSSL 0:d92f9d21154c 175 #define EBX 1
wolfSSL 0:d92f9d21154c 176 #define ECX 2
wolfSSL 0:d92f9d21154c 177 #define EDX 3
wolfSSL 0:d92f9d21154c 178
/* Bit masks stored in cpuid_flags, one per detected CPU feature. */
wolfSSL 0:d92f9d21154c 179 #define CPUID_AVX1 0x1
wolfSSL 0:d92f9d21154c 180 #define CPUID_AVX2 0x2
wolfSSL 0:d92f9d21154c 181 #define CPUID_RDRAND 0x4
wolfSSL 0:d92f9d21154c 182 #define CPUID_RDSEED 0x8
wolfSSL 0:d92f9d21154c 183 #define CPUID_BMI2 0x10 /* MULX, RORX */
wolfSSL 0:d92f9d21154c 184
/* Feature tests: non-zero when the matching bit is set in cpuid_flags. */
wolfSSL 0:d92f9d21154c 185 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
wolfSSL 0:d92f9d21154c 186 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
wolfSSL 0:d92f9d21154c 187 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
wolfSSL 0:d92f9d21154c 188 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
wolfSSL 0:d92f9d21154c 189 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
wolfSSL 0:d92f9d21154c 190
/* cpuid_check becomes 1 once set_cpuid_flags() has probed the CPU;
 * cpuid_flags accumulates the CPUID_* bits found by that probe. */
wolfSSL 0:d92f9d21154c 191 static word32 cpuid_check = 0 ;
wolfSSL 0:d92f9d21154c 192 static word32 cpuid_flags = 0 ;
wolfSSL 0:d92f9d21154c 193
wolfSSL 0:d92f9d21154c 194 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
wolfSSL 0:d92f9d21154c 195 int got_intel_cpu=0;
wolfSSL 0:d92f9d21154c 196 unsigned int reg[5];
wolfSSL 0:d92f9d21154c 197
wolfSSL 0:d92f9d21154c 198 reg[4] = '\0' ;
wolfSSL 0:d92f9d21154c 199 cpuid(reg, 0, 0);
wolfSSL 0:d92f9d21154c 200 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 0:d92f9d21154c 201 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 0:d92f9d21154c 202 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 0:d92f9d21154c 203 got_intel_cpu = 1;
wolfSSL 0:d92f9d21154c 204 }
wolfSSL 0:d92f9d21154c 205 if (got_intel_cpu) {
wolfSSL 0:d92f9d21154c 206 cpuid(reg, leaf, sub);
wolfSSL 0:d92f9d21154c 207 return((reg[num]>>bit)&0x1) ;
wolfSSL 0:d92f9d21154c 208 }
wolfSSL 0:d92f9d21154c 209 return 0 ;
wolfSSL 0:d92f9d21154c 210 }
wolfSSL 0:d92f9d21154c 211
wolfSSL 0:d92f9d21154c 212 static int set_cpuid_flags(void) {
wolfSSL 0:d92f9d21154c 213 if(cpuid_check==0) {
wolfSSL 0:d92f9d21154c 214 if(cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1 ;}
wolfSSL 0:d92f9d21154c 215 if(cpuid_flag(7, 0, EBX, 5)){ cpuid_flags |= CPUID_AVX2 ; }
wolfSSL 0:d92f9d21154c 216 if(cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2 ; }
wolfSSL 0:d92f9d21154c 217 if(cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND ; }
wolfSSL 0:d92f9d21154c 218 if(cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED ; }
wolfSSL 0:d92f9d21154c 219 cpuid_check = 1 ;
wolfSSL 0:d92f9d21154c 220 return 0 ;
wolfSSL 0:d92f9d21154c 221 }
wolfSSL 0:d92f9d21154c 222 return 1 ;
wolfSSL 0:d92f9d21154c 223 }
wolfSSL 0:d92f9d21154c 224
wolfSSL 0:d92f9d21154c 225
wolfSSL 0:d92f9d21154c 226 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of this file (sha256.c) */
/* Forward declaration of the portable C block transform. */
wolfSSL 0:d92f9d21154c 227 static int Transform(Sha256* sha256);
wolfSSL 0:d92f9d21154c 228
/* Forward declarations of the AVX block transforms. */
wolfSSL 0:d92f9d21154c 229 #if defined(HAVE_INTEL_AVX1)
wolfSSL 0:d92f9d21154c 230 static int Transform_AVX1(Sha256 *sha256) ;
wolfSSL 0:d92f9d21154c 231 #endif
wolfSSL 0:d92f9d21154c 232 #if defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 233 static int Transform_AVX2(Sha256 *sha256) ;
wolfSSL 0:d92f9d21154c 234 static int Transform_AVX1_RORX(Sha256 *sha256) ;
wolfSSL 0:d92f9d21154c 235 #endif
wolfSSL 0:d92f9d21154c 236
/* Transform implementation chosen at run time by set_Transform(). */
wolfSSL 0:d92f9d21154c 237 static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
wolfSSL 0:d92f9d21154c 238
/* In this variant the second argument B is ignored and the call goes
 * through Transform_p. */
wolfSSL 0:d92f9d21154c 239 #define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
wolfSSL 0:d92f9d21154c 240
/* Point Transform_p at the transform matching the detected CPU features.
 * Only the call on which set_cpuid_flags() actually probes the CPU (return
 * value 0) selects a transform; later calls return immediately and leave
 * Transform_p unchanged. */
wolfSSL 0:d92f9d21154c 241 static void set_Transform(void) {
wolfSSL 0:d92f9d21154c 242 if(set_cpuid_flags())return ;
wolfSSL 0:d92f9d21154c 243
wolfSSL 0:d92f9d21154c 244 #if defined(HAVE_INTEL_AVX2)
/* AVX2 and BMI2 both present: use the RORX variant. The assignment of
 * Transform_AVX2 after the return is never executed; it only references
 * the symbol so the compiler does not report it as unused. */
wolfSSL 0:d92f9d21154c 245 if(IS_INTEL_AVX2 && IS_INTEL_BMI2){
wolfSSL 0:d92f9d21154c 246 Transform_p = Transform_AVX1_RORX; return ;
wolfSSL 0:d92f9d21154c 247 Transform_p = Transform_AVX2 ;
wolfSSL 0:d92f9d21154c 248 /* for avoiding warning,"not used" */
wolfSSL 0:d92f9d21154c 249 }
wolfSSL 0:d92f9d21154c 250 #endif
wolfSSL 0:d92f9d21154c 251 #if defined(HAVE_INTEL_AVX1)
/* AVX1 if the CPU reports it, otherwise the portable C transform. */
wolfSSL 0:d92f9d21154c 252 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform) ; return ;
wolfSSL 0:d92f9d21154c 253 #endif
/* Reached only when HAVE_INTEL_AVX1 is not defined. */
wolfSSL 0:d92f9d21154c 254 Transform_p = Transform ; return ;
wolfSSL 0:d92f9d21154c 255 }
wolfSSL 0:d92f9d21154c 256
/* Builds without the Intel AVX code paths: XTRANSFORM calls Transform()
 * directly. The buffer argument B is passed on only for FREESCALE_MMCAU. */
wolfSSL 0:d92f9d21154c 257 #else
wolfSSL 0:d92f9d21154c 258 #if defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 259 #define XTRANSFORM(sha256, B) Transform(sha256, B)
wolfSSL 0:d92f9d21154c 260 #else
wolfSSL 0:d92f9d21154c 261 #define XTRANSFORM(sha256, B) Transform(sha256)
wolfSSL 0:d92f9d21154c 262 #endif
wolfSSL 0:d92f9d21154c 263 #endif
wolfSSL 0:d92f9d21154c 264
wolfSSL 0:d92f9d21154c 265 /* Dummy for saving MM_REGs on behalf of Transform */
/* The asm statement ORs r8d with itself (value unchanged) and exists only to
 * carry a clobber list naming the ymm or xmm registers. It expands to
 * nothing when neither AVX variant is enabled. */
wolfSSL 0:d92f9d21154c 266 #if defined(HAVE_INTEL_AVX2)&& !defined(HAVE_INTEL_AVX1)
wolfSSL 0:d92f9d21154c 267 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
wolfSSL 0:d92f9d21154c 268 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
wolfSSL 0:d92f9d21154c 269 #elif defined(HAVE_INTEL_AVX1)
wolfSSL 0:d92f9d21154c 270 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
wolfSSL 0:d92f9d21154c 271 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
wolfSSL 0:d92f9d21154c 272 "xmm11","xmm12","xmm13","xmm14","xmm15")
wolfSSL 0:d92f9d21154c 273 #else
wolfSSL 0:d92f9d21154c 274 #define SAVE_XMM_YMM
wolfSSL 0:d92f9d21154c 275 #endif
wolfSSL 0:d92f9d21154c 276
/* PIC32MZ hash build: map the un-prefixed internal names to their _sw
 * counterparts as well. */
wolfSSL 0:d92f9d21154c 277 #ifdef WOLFSSL_PIC32MZ_HASH
wolfSSL 0:d92f9d21154c 278 #define InitSha256 InitSha256_sw
wolfSSL 0:d92f9d21154c 279 #define Sha256Update Sha256Update_sw
wolfSSL 0:d92f9d21154c 280 #define Sha256Final Sha256Final_sw
wolfSSL 0:d92f9d21154c 281 #endif
wolfSSL 0:d92f9d21154c 282
wolfSSL 0:d92f9d21154c 283 #include <wolfssl/wolfcrypt/logging.h>
wolfSSL 0:d92f9d21154c 284 #include <wolfssl/wolfcrypt/error-crypt.h>
wolfSSL 0:d92f9d21154c 285
wolfSSL 0:d92f9d21154c 286 #ifdef NO_INLINE
wolfSSL 0:d92f9d21154c 287 #include <wolfssl/wolfcrypt/misc.h>
wolfSSL 0:d92f9d21154c 288 #else
wolfSSL 0:d92f9d21154c 289 #include <wolfcrypt/src/misc.c>
wolfSSL 0:d92f9d21154c 290 #endif
wolfSSL 0:d92f9d21154c 291
wolfSSL 0:d92f9d21154c 292 #ifdef FREESCALE_MMCAU
wolfSSL 0:d92f9d21154c 293 #include "cau_api.h"
wolfSSL 0:d92f9d21154c 294 #endif
wolfSSL 0:d92f9d21154c 295
wolfSSL 0:d92f9d21154c 296 #ifndef WOLFSSL_HAVE_MIN
wolfSSL 0:d92f9d21154c 297 #define WOLFSSL_HAVE_MIN
wolfSSL 0:d92f9d21154c 298
wolfSSL 0:d92f9d21154c 299 static INLINE word32 min(word32 a, word32 b)
wolfSSL 0:d92f9d21154c 300 {
wolfSSL 0:d92f9d21154c 301 return a > b ? b : a;
wolfSSL 0:d92f9d21154c 302 }
wolfSSL 0:d92f9d21154c 303
wolfSSL 0:d92f9d21154c 304 #endif /* WOLFSSL_HAVE_MIN */
wolfSSL 0:d92f9d21154c 305
wolfSSL 0:d92f9d21154c 306
wolfSSL 0:d92f9d21154c 307 int wc_InitSha256(Sha256* sha256)
wolfSSL 0:d92f9d21154c 308 {
wolfSSL 0:d92f9d21154c 309 #ifdef FREESCALE_MMCAU
wolfSSL 0:d92f9d21154c 310 cau_sha256_initialize_output(sha256->digest);
wolfSSL 0:d92f9d21154c 311 #else
wolfSSL 0:d92f9d21154c 312 sha256->digest[0] = 0x6A09E667L;
wolfSSL 0:d92f9d21154c 313 sha256->digest[1] = 0xBB67AE85L;
wolfSSL 0:d92f9d21154c 314 sha256->digest[2] = 0x3C6EF372L;
wolfSSL 0:d92f9d21154c 315 sha256->digest[3] = 0xA54FF53AL;
wolfSSL 0:d92f9d21154c 316 sha256->digest[4] = 0x510E527FL;
wolfSSL 0:d92f9d21154c 317 sha256->digest[5] = 0x9B05688CL;
wolfSSL 0:d92f9d21154c 318 sha256->digest[6] = 0x1F83D9ABL;
wolfSSL 0:d92f9d21154c 319 sha256->digest[7] = 0x5BE0CD19L;
wolfSSL 0:d92f9d21154c 320 #endif
wolfSSL 0:d92f9d21154c 321
wolfSSL 0:d92f9d21154c 322 sha256->buffLen = 0;
wolfSSL 0:d92f9d21154c 323 sha256->loLen = 0;
wolfSSL 0:d92f9d21154c 324 sha256->hiLen = 0;
wolfSSL 0:d92f9d21154c 325
wolfSSL 0:d92f9d21154c 326 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 327 set_Transform() ; /* choose best Transform function under this runtime environment */
wolfSSL 0:d92f9d21154c 328 #endif
wolfSSL 0:d92f9d21154c 329
wolfSSL 0:d92f9d21154c 330 return 0;
wolfSSL 0:d92f9d21154c 331 }
wolfSSL 0:d92f9d21154c 332
wolfSSL 0:d92f9d21154c 333
wolfSSL 0:d92f9d21154c 334 #if !defined(FREESCALE_MMCAU)
/* The 64 per-round constants; RND() adds K[i] in round i. The table is
 * declared with ALIGN32. */
wolfSSL 0:d92f9d21154c 335 static const ALIGN32 word32 K[64] = {
wolfSSL 0:d92f9d21154c 336 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
wolfSSL 0:d92f9d21154c 337 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
wolfSSL 0:d92f9d21154c 338 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
wolfSSL 0:d92f9d21154c 339 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
wolfSSL 0:d92f9d21154c 340 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
wolfSSL 0:d92f9d21154c 341 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
wolfSSL 0:d92f9d21154c 342 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
wolfSSL 0:d92f9d21154c 343 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
wolfSSL 0:d92f9d21154c 344 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
wolfSSL 0:d92f9d21154c 345 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
wolfSSL 0:d92f9d21154c 346 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
wolfSSL 0:d92f9d21154c 347 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
wolfSSL 0:d92f9d21154c 348 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
wolfSSL 0:d92f9d21154c 349 };
wolfSSL 0:d92f9d21154c 350
wolfSSL 0:d92f9d21154c 351 #endif
wolfSSL 0:d92f9d21154c 352
wolfSSL 0:d92f9d21154c 353 #if defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 354
wolfSSL 0:d92f9d21154c 355 static int Transform(Sha256* sha256, byte* buf)
wolfSSL 0:d92f9d21154c 356 {
wolfSSL 0:d92f9d21154c 357 cau_sha256_hash_n(buf, 1, sha256->digest);
wolfSSL 0:d92f9d21154c 358
wolfSSL 0:d92f9d21154c 359 return 0;
wolfSSL 0:d92f9d21154c 360 }
wolfSSL 0:d92f9d21154c 361
wolfSSL 0:d92f9d21154c 362 #endif /* FREESCALE_MMCAU */
wolfSSL 0:d92f9d21154c 363
/* Bitwise building blocks of the round function:
 * Ch(x,y,z) takes each bit from y where x is 1 and from z where x is 0,
 * Maj(x,y,z) is the bitwise majority of the three inputs,
 * R(x,n) is a logical right shift of the low 32 bits of x. */
wolfSSL 0:d92f9d21154c 364 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
wolfSSL 0:d92f9d21154c 365 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
wolfSSL 0:d92f9d21154c 366 #define R(x, n) (((x)&0xFFFFFFFFU)>>(n))
wolfSSL 0:d92f9d21154c 367
/* S(x,n) forwards to rotrFixed(); Sigma0/Sigma1 are used by RND(),
 * Gamma0/Gamma1 by the message schedule in Transform(). */
wolfSSL 0:d92f9d21154c 368 #define S(x, n) rotrFixed(x, n)
wolfSSL 0:d92f9d21154c 369 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
wolfSSL 0:d92f9d21154c 370 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
wolfSSL 0:d92f9d21154c 371 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
wolfSSL 0:d92f9d21154c 372 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
wolfSSL 0:d92f9d21154c 373
/* One round: updates d and h in place. Expects t0, t1, K and W to be in
 * scope at the expansion site; i selects the constant and schedule word. */
wolfSSL 0:d92f9d21154c 374 #define RND(a,b,c,d,e,f,g,h,i) \
wolfSSL 0:d92f9d21154c 375 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
wolfSSL 0:d92f9d21154c 376 t1 = Sigma0((a)) + Maj((a), (b), (c)); \
wolfSSL 0:d92f9d21154c 377 (d) += t0; \
wolfSSL 0:d92f9d21154c 378 (h) = t0 + t1;
wolfSSL 0:d92f9d21154c 379
wolfSSL 0:d92f9d21154c 380 #if !defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 381 static int Transform(Sha256* sha256)
wolfSSL 0:d92f9d21154c 382 {
wolfSSL 0:d92f9d21154c 383 word32 S[8], t0, t1;
wolfSSL 0:d92f9d21154c 384 int i;
wolfSSL 0:d92f9d21154c 385
wolfSSL 0:d92f9d21154c 386 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 387 word32* W;
wolfSSL 0:d92f9d21154c 388
wolfSSL 0:d92f9d21154c 389 W = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 390 if (W == NULL)
wolfSSL 0:d92f9d21154c 391 return MEMORY_E;
wolfSSL 0:d92f9d21154c 392 #else
wolfSSL 0:d92f9d21154c 393 word32 W[64];
wolfSSL 0:d92f9d21154c 394 #endif
wolfSSL 0:d92f9d21154c 395
wolfSSL 0:d92f9d21154c 396 /* Copy context->state[] to working vars */
wolfSSL 0:d92f9d21154c 397 for (i = 0; i < 8; i++)
wolfSSL 0:d92f9d21154c 398 S[i] = sha256->digest[i];
wolfSSL 0:d92f9d21154c 399
wolfSSL 0:d92f9d21154c 400 for (i = 0; i < 16; i++)
wolfSSL 0:d92f9d21154c 401 W[i] = sha256->buffer[i];
wolfSSL 0:d92f9d21154c 402
wolfSSL 0:d92f9d21154c 403 for (i = 16; i < 64; i++)
wolfSSL 0:d92f9d21154c 404 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
wolfSSL 0:d92f9d21154c 405
wolfSSL 0:d92f9d21154c 406 for (i = 0; i < 64; i += 8) {
wolfSSL 0:d92f9d21154c 407 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
wolfSSL 0:d92f9d21154c 408 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
wolfSSL 0:d92f9d21154c 409 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
wolfSSL 0:d92f9d21154c 410 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
wolfSSL 0:d92f9d21154c 411 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
wolfSSL 0:d92f9d21154c 412 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
wolfSSL 0:d92f9d21154c 413 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
wolfSSL 0:d92f9d21154c 414 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
wolfSSL 0:d92f9d21154c 415 }
wolfSSL 0:d92f9d21154c 416
wolfSSL 0:d92f9d21154c 417 /* Add the working vars back into digest state[] */
wolfSSL 0:d92f9d21154c 418 for (i = 0; i < 8; i++) {
wolfSSL 0:d92f9d21154c 419 sha256->digest[i] += S[i];
wolfSSL 0:d92f9d21154c 420 }
wolfSSL 0:d92f9d21154c 421
wolfSSL 0:d92f9d21154c 422 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 423 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 424 #endif
wolfSSL 0:d92f9d21154c 425
wolfSSL 0:d92f9d21154c 426 return 0;
wolfSSL 0:d92f9d21154c 427 }
wolfSSL 0:d92f9d21154c 428
wolfSSL 0:d92f9d21154c 429 #endif /* #if !defined(FREESCALE_MMCAU) */
wolfSSL 0:d92f9d21154c 430
wolfSSL 0:d92f9d21154c 431 static INLINE void AddLength(Sha256* sha256, word32 len)
wolfSSL 0:d92f9d21154c 432 {
wolfSSL 0:d92f9d21154c 433 word32 tmp = sha256->loLen;
wolfSSL 0:d92f9d21154c 434 if ( (sha256->loLen += len) < tmp)
wolfSSL 0:d92f9d21154c 435 sha256->hiLen++; /* carry low to high */
wolfSSL 0:d92f9d21154c 436 }
wolfSSL 0:d92f9d21154c 437
wolfSSL 0:d92f9d21154c 438 int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
wolfSSL 0:d92f9d21154c 439 {
wolfSSL 0:d92f9d21154c 440
wolfSSL 0:d92f9d21154c 441 /* do block size increments */
wolfSSL 0:d92f9d21154c 442 byte* local = (byte*)sha256->buffer;
wolfSSL 0:d92f9d21154c 443
wolfSSL 0:d92f9d21154c 444 SAVE_XMM_YMM ; /* for Intel AVX */
wolfSSL 0:d92f9d21154c 445
wolfSSL 0:d92f9d21154c 446 while (len) {
wolfSSL 0:d92f9d21154c 447 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 0:d92f9d21154c 448 XMEMCPY(&local[sha256->buffLen], data, add);
wolfSSL 0:d92f9d21154c 449
wolfSSL 0:d92f9d21154c 450 sha256->buffLen += add;
wolfSSL 0:d92f9d21154c 451 data += add;
wolfSSL 0:d92f9d21154c 452 len -= add;
wolfSSL 0:d92f9d21154c 453
wolfSSL 0:d92f9d21154c 454 if (sha256->buffLen == SHA256_BLOCK_SIZE) {
wolfSSL 0:d92f9d21154c 455 int ret;
wolfSSL 0:d92f9d21154c 456
wolfSSL 0:d92f9d21154c 457 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 458 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 459 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 460 #endif
wolfSSL 0:d92f9d21154c 461 ByteReverseWords(sha256->buffer, sha256->buffer,
wolfSSL 0:d92f9d21154c 462 SHA256_BLOCK_SIZE);
wolfSSL 0:d92f9d21154c 463 #endif
wolfSSL 0:d92f9d21154c 464 ret = XTRANSFORM(sha256, local);
wolfSSL 0:d92f9d21154c 465 if (ret != 0)
wolfSSL 0:d92f9d21154c 466 return ret;
wolfSSL 0:d92f9d21154c 467
wolfSSL 0:d92f9d21154c 468 AddLength(sha256, SHA256_BLOCK_SIZE);
wolfSSL 0:d92f9d21154c 469 sha256->buffLen = 0;
wolfSSL 0:d92f9d21154c 470 }
wolfSSL 0:d92f9d21154c 471 }
wolfSSL 0:d92f9d21154c 472
wolfSSL 0:d92f9d21154c 473 return 0;
wolfSSL 0:d92f9d21154c 474 }
wolfSSL 0:d92f9d21154c 475
wolfSSL 0:d92f9d21154c 476 int wc_Sha256Final(Sha256* sha256, byte* hash)
wolfSSL 0:d92f9d21154c 477 {
wolfSSL 0:d92f9d21154c 478 byte* local = (byte*)sha256->buffer;
wolfSSL 0:d92f9d21154c 479 int ret;
wolfSSL 0:d92f9d21154c 480
wolfSSL 0:d92f9d21154c 481 SAVE_XMM_YMM ; /* for Intel AVX */
wolfSSL 0:d92f9d21154c 482
wolfSSL 0:d92f9d21154c 483 AddLength(sha256, sha256->buffLen); /* before adding pads */
wolfSSL 0:d92f9d21154c 484
wolfSSL 0:d92f9d21154c 485 local[sha256->buffLen++] = 0x80; /* add 1 */
wolfSSL 0:d92f9d21154c 486
wolfSSL 0:d92f9d21154c 487 /* pad with zeros */
wolfSSL 0:d92f9d21154c 488 if (sha256->buffLen > SHA256_PAD_SIZE) {
wolfSSL 0:d92f9d21154c 489 XMEMSET(&local[sha256->buffLen], 0, SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 0:d92f9d21154c 490 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
wolfSSL 0:d92f9d21154c 491
wolfSSL 0:d92f9d21154c 492 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 493 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 494 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 495 #endif
wolfSSL 0:d92f9d21154c 496 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
wolfSSL 0:d92f9d21154c 497 #endif
wolfSSL 0:d92f9d21154c 498
wolfSSL 0:d92f9d21154c 499 ret = XTRANSFORM(sha256, local);
wolfSSL 0:d92f9d21154c 500 if (ret != 0)
wolfSSL 0:d92f9d21154c 501 return ret;
wolfSSL 0:d92f9d21154c 502
wolfSSL 0:d92f9d21154c 503 sha256->buffLen = 0;
wolfSSL 0:d92f9d21154c 504 }
wolfSSL 0:d92f9d21154c 505 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
wolfSSL 0:d92f9d21154c 506
wolfSSL 0:d92f9d21154c 507 /* put lengths in bits */
wolfSSL 0:d92f9d21154c 508 sha256->hiLen = (sha256->loLen >> (8*sizeof(sha256->loLen) - 3)) +
wolfSSL 0:d92f9d21154c 509 (sha256->hiLen << 3);
wolfSSL 0:d92f9d21154c 510 sha256->loLen = sha256->loLen << 3;
wolfSSL 0:d92f9d21154c 511
wolfSSL 0:d92f9d21154c 512 /* store lengths */
wolfSSL 0:d92f9d21154c 513 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU)
wolfSSL 0:d92f9d21154c 514 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 515 if(!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 516 #endif
wolfSSL 0:d92f9d21154c 517 ByteReverseWords(sha256->buffer, sha256->buffer, SHA256_BLOCK_SIZE);
wolfSSL 0:d92f9d21154c 518 #endif
wolfSSL 0:d92f9d21154c 519 /* ! length ordering dependent on digest endian type ! */
wolfSSL 0:d92f9d21154c 520 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
wolfSSL 0:d92f9d21154c 521 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
wolfSSL 0:d92f9d21154c 522 sizeof(word32));
wolfSSL 0:d92f9d21154c 523
wolfSSL 0:d92f9d21154c 524 #if defined(FREESCALE_MMCAU) || defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 525 /* Kinetis requires only these bytes reversed */
wolfSSL 0:d92f9d21154c 526 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 527 if(IS_INTEL_AVX1 || IS_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 528 #endif
wolfSSL 0:d92f9d21154c 529 ByteReverseWords(&sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
wolfSSL 0:d92f9d21154c 530 &sha256->buffer[SHA256_PAD_SIZE/sizeof(word32)],
wolfSSL 0:d92f9d21154c 531 2 * sizeof(word32));
wolfSSL 0:d92f9d21154c 532 #endif
wolfSSL 0:d92f9d21154c 533
wolfSSL 0:d92f9d21154c 534 ret = XTRANSFORM(sha256, local);
wolfSSL 0:d92f9d21154c 535 if (ret != 0)
wolfSSL 0:d92f9d21154c 536 return ret;
wolfSSL 0:d92f9d21154c 537
wolfSSL 0:d92f9d21154c 538 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 0:d92f9d21154c 539 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 0:d92f9d21154c 540 #endif
wolfSSL 0:d92f9d21154c 541 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 0:d92f9d21154c 542
wolfSSL 0:d92f9d21154c 543 return wc_InitSha256(sha256); /* reset state */
wolfSSL 0:d92f9d21154c 544 }
wolfSSL 0:d92f9d21154c 545
wolfSSL 0:d92f9d21154c 546
wolfSSL 0:d92f9d21154c 547
wolfSSL 0:d92f9d21154c 548 int wc_Sha256Hash(const byte* data, word32 len, byte* hash)
wolfSSL 0:d92f9d21154c 549 {
wolfSSL 0:d92f9d21154c 550 int ret = 0;
wolfSSL 0:d92f9d21154c 551 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 552 Sha256* sha256;
wolfSSL 0:d92f9d21154c 553 #else
wolfSSL 0:d92f9d21154c 554 Sha256 sha256[1];
wolfSSL 0:d92f9d21154c 555 #endif
wolfSSL 0:d92f9d21154c 556
wolfSSL 0:d92f9d21154c 557 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 558 sha256 = (Sha256*)XMALLOC(sizeof(Sha256), NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 559 if (sha256 == NULL)
wolfSSL 0:d92f9d21154c 560 return MEMORY_E;
wolfSSL 0:d92f9d21154c 561 #endif
wolfSSL 0:d92f9d21154c 562
wolfSSL 0:d92f9d21154c 563 if ((ret = wc_InitSha256(sha256)) != 0) {
wolfSSL 0:d92f9d21154c 564 WOLFSSL_MSG("InitSha256 failed");
wolfSSL 0:d92f9d21154c 565 }
wolfSSL 0:d92f9d21154c 566 else if ((ret = wc_Sha256Update(sha256, data, len)) != 0) {
wolfSSL 0:d92f9d21154c 567 WOLFSSL_MSG("Sha256Update failed");
wolfSSL 0:d92f9d21154c 568 }
wolfSSL 0:d92f9d21154c 569 else if ((ret = wc_Sha256Final(sha256, hash)) != 0) {
wolfSSL 0:d92f9d21154c 570 WOLFSSL_MSG("Sha256Final failed");
wolfSSL 0:d92f9d21154c 571 }
wolfSSL 0:d92f9d21154c 572
wolfSSL 0:d92f9d21154c 573 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 574 XFREE(sha256, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 575 #endif
wolfSSL 0:d92f9d21154c 576
wolfSSL 0:d92f9d21154c 577 return ret;
wolfSSL 0:d92f9d21154c 578 }
wolfSSL 0:d92f9d21154c 579
wolfSSL 0:d92f9d21154c 580 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 581
/* _DigestToReg: copy the eight digest words, one movl each, into the
 * registers named by the arguments. Expects `sha256` to be in scope. */
wolfSSL 0:d92f9d21154c 582 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 583 { word32 d ;\
wolfSSL 0:d92f9d21154c 584 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 585 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 586 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 587 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 588 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 589 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 590 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 591 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 592 }
wolfSSL 0:d92f9d21154c 593
/* _RegToDigest: read each named register with movl and ADD its value to
 * the matching digest word (accumulate, not overwrite). */
wolfSSL 0:d92f9d21154c 594 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 595 { word32 d ; \
wolfSSL 0:d92f9d21154c 596 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ; sha256->digest[0] += d;\
wolfSSL 0:d92f9d21154c 597 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ; sha256->digest[1] += d;\
wolfSSL 0:d92f9d21154c 598 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ; sha256->digest[2] += d;\
wolfSSL 0:d92f9d21154c 599 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ; sha256->digest[3] += d;\
wolfSSL 0:d92f9d21154c 600 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ; sha256->digest[4] += d;\
wolfSSL 0:d92f9d21154c 601 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ; sha256->digest[5] += d;\
wolfSSL 0:d92f9d21154c 602 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ; sha256->digest[6] += d;\
wolfSSL 0:d92f9d21154c 603 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ; sha256->digest[7] += d;\
wolfSSL 0:d92f9d21154c 604 }
wolfSSL 0:d92f9d21154c 605
wolfSSL 0:d92f9d21154c 606
/* Extra macro layer so the S_n arguments are macro-expanded to register
 * names before the # operator in the underscore versions stringizes them. */
wolfSSL 0:d92f9d21154c 607 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 608 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 0:d92f9d21154c 609
wolfSSL 0:d92f9d21154c 610 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 611 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 0:d92f9d21154c 612
wolfSSL 0:d92f9d21154c 613
wolfSSL 0:d92f9d21154c 614
wolfSSL 0:d92f9d21154c 615
/*
 * Fixed 32-bit registers that hold the eight SHA-256 state words while the
 * rounds run.  The round macros receive them as a..h in rotating order.
 */
wolfSSL 0:d92f9d21154c 616 #define S_0 %r15d
wolfSSL 0:d92f9d21154c 617 #define S_1 %r10d
wolfSSL 0:d92f9d21154c 618 #define S_2 %r11d
wolfSSL 0:d92f9d21154c 619 #define S_3 %r12d
wolfSSL 0:d92f9d21154c 620 #define S_4 %r13d
wolfSSL 0:d92f9d21154c 621 #define S_5 %r14d
wolfSSL 0:d92f9d21154c 622 #define S_6 %ebx
wolfSSL 0:d92f9d21154c 623 #define S_7 %r9d
wolfSSL 0:d92f9d21154c 624
/*
 * Clobber list appended to the asm statements in this section.  Despite the
 * name these are general-purpose registers, not SSE registers: ebx and
 * r9..r15 carry S_0..S_7, while edi, esi, edx and r8 are the scratch
 * registers written by the RND_STEP_* macros.  ecx is listed as well but is
 * not written by any macro in this part of the file.
 */
wolfSSL 0:d92f9d21154c 625 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
wolfSSL 0:d92f9d21154c 626
/*
 * RND_STEP_RORX_1 .. RND_STEP_RORX_8: one SHA-256 round split into eight
 * pieces, using the rorx instruction for the rotations.
 *
 * Parameters a..h are register names (already expanded, e.g. %r15d) and i
 * indexes W_K[], which must be in scope at the expansion site.  In the
 * per-line comments "x>>n" stands for a 32-bit rotate right by n.
 *
 * Intermediate values are handed from one asm statement to the next in
 * fixed scratch registers (edx, edi, esi, r8d); the steps therefore have to
 * run in order 1..8 with nothing in between that touches those registers.
 */
wolfSSL 0:d92f9d21154c 627 #if defined(HAVE_INTEL_RORX)
/* Step 1: edx = e rotated right by 6. */
wolfSSL 0:d92f9d21154c 628 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 629 __asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 0:d92f9d21154c 630
/* Step 2: edi = rot(e,11) ^ rot(e,6); edx = rot(e,25). */
wolfSSL 0:d92f9d21154c 631 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 632 __asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 0:d92f9d21154c 633 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 0:d92f9d21154c 634 __asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 0:d92f9d21154c 635
/* Step 3: edx = Sigma1(e); esi = Ch(e,f,g) computed as ((f ^ g) & e) ^ g. */
wolfSSL 0:d92f9d21154c 636 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 637 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 0:d92f9d21154c 638 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 0:d92f9d21154c 639 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 0:d92f9d21154c 640 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 0:d92f9d21154c 641 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 642
/* Step 4: h += W_K[i] + Sigma1(e); start Sigma0: r8d = rot(a,2), edi = rot(a,13). */
wolfSSL 0:d92f9d21154c 643 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 644 /*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
wolfSSL 0:d92f9d21154c 645 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 0:d92f9d21154c 646 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 0:d92f9d21154c 647 __asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 0:d92f9d21154c 648 __asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\
wolfSSL 0:d92f9d21154c 649
/* Step 5: edx = Sigma0(a) = rot(a,2) ^ rot(a,13) ^ rot(a,22). */
wolfSSL 0:d92f9d21154c 650 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 651 __asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 0:d92f9d21154c 652 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\
wolfSSL 0:d92f9d21154c 653 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\
wolfSSL 0:d92f9d21154c 654
/* Step 6: first half of Maj: edi = (a | b) & c; r8d = b. */
wolfSSL 0:d92f9d21154c 655 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 656 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 0:d92f9d21154c 657 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 0:d92f9d21154c 658 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\
wolfSSL 0:d92f9d21154c 659 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 0:d92f9d21154c 660
/* Step 7: h += Ch(e,f,g); r8d = (a & b) | ((a | b) & c) = Maj(a,b,c). */
wolfSSL 0:d92f9d21154c 661 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 662 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 663 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 0:d92f9d21154c 664 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 0:d92f9d21154c 665
/*
 * Step 8: d += h; h = h + Maj(a,b,c) + Sigma0(a).
 * The first and last statements are basic asm (no colons): register names
 * are written with a single % and no clobbers are declared for them.
 * NOTE(review): RND_STEP_8 writes its final movl as extended asm with a
 * clobber list; consider making the two variants agree.
 */
wolfSSL 0:d92f9d21154c 666 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 667 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 668 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
wolfSSL 0:d92f9d21154c 669 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \
wolfSSL 0:d92f9d21154c 670 __asm__ volatile("movl %r8d, "#h"\n\t");
wolfSSL 0:d92f9d21154c 671
wolfSSL 0:d92f9d21154c 672 #endif
wolfSSL 0:d92f9d21154c 673
/*
 * RND_STEP_1 .. RND_STEP_8: one SHA-256 round split into eight pieces,
 * using movl + roll for the rotations.
 *
 * Parameters a..h are register names (already expanded, e.g. %r15d) and i
 * indexes W_K[], which must be in scope at the expansion site.  Rotations
 * are done as left rotates by (32 - n); in the per-line comments "x>>n"
 * stands for a 32-bit rotate right by n.
 *
 * Intermediate values are handed from one asm statement to the next in
 * fixed scratch registers (edx, edi, esi, r8d); the steps therefore have to
 * run in order 1..8 with nothing in between that touches those registers.
 */
/* Step 1: edx = rot(e,6); edi = e. */
wolfSSL 0:d92f9d21154c 674 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 675 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
wolfSSL 0:d92f9d21154c 676 __asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 0:d92f9d21154c 677 __asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\
wolfSSL 0:d92f9d21154c 678
/* Step 2: edi = rot(e,11) ^ rot(e,6); edx = rot(e,25). */
wolfSSL 0:d92f9d21154c 679 #define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 680 __asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 0:d92f9d21154c 681 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 0:d92f9d21154c 682 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\
wolfSSL 0:d92f9d21154c 683 __asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 0:d92f9d21154c 684
/* Step 3: edx = Sigma1(e); esi = Ch(e,f,g) computed as ((f ^ g) & e) ^ g. */
wolfSSL 0:d92f9d21154c 685 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 686 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 0:d92f9d21154c 687 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 0:d92f9d21154c 688 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 0:d92f9d21154c 689 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 0:d92f9d21154c 690 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 691
/* Step 4: h += W_K[i] + Sigma1(e); start Sigma0: r8d = rot(a,2), edi = rot(a,13), edx = a. */
wolfSSL 0:d92f9d21154c 692 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 693 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 0:d92f9d21154c 694 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 0:d92f9d21154c 695 __asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\
wolfSSL 0:d92f9d21154c 696 __asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 0:d92f9d21154c 697 __asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\
wolfSSL 0:d92f9d21154c 698 __asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\
wolfSSL 0:d92f9d21154c 699 __asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\
wolfSSL 0:d92f9d21154c 700
/* Step 5: edx = Sigma0(a) = rot(a,2) ^ rot(a,13) ^ rot(a,22). */
wolfSSL 0:d92f9d21154c 701 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 702 __asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 0:d92f9d21154c 703 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\
wolfSSL 0:d92f9d21154c 704 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\
wolfSSL 0:d92f9d21154c 705
/* Step 6: first half of Maj: edi = (a | b) & c; r8d = b. */
wolfSSL 0:d92f9d21154c 706 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 707 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 0:d92f9d21154c 708 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 0:d92f9d21154c 709 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\
wolfSSL 0:d92f9d21154c 710 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 0:d92f9d21154c 711
/* Step 7: h += Ch(e,f,g); r8d = (a & b) | ((a | b) & c) = Maj(a,b,c). */
wolfSSL 0:d92f9d21154c 712 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 713 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 714 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 0:d92f9d21154c 715 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 0:d92f9d21154c 716
/*
 * Step 8: d += h; h = h + Maj(a,b,c) + Sigma0(a).
 * The first statement is basic asm (no colons), so its register names carry
 * a single % and it declares no clobbers.
 */
wolfSSL 0:d92f9d21154c 717 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 0:d92f9d21154c 718 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 0:d92f9d21154c 719 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
wolfSSL 0:d92f9d21154c 720 /* r8b = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 0:d92f9d21154c 721 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
wolfSSL 0:d92f9d21154c 722 /* r8b = h + w_k + Sigma1(e) Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 0:d92f9d21154c 723 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
wolfSSL 0:d92f9d21154c 724 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
wolfSSL 0:d92f9d21154c 725
/*
 * RND_X: one complete SHA-256 round, i.e. RND_STEP_1 through RND_STEP_8 in
 * order with the same arguments.
 *
 * a..h are register names and i is the W_K[] index.  When the round is
 * finished the register passed as h holds the new "a" value and the register
 * passed as d holds the new "e" value; no values are moved between
 * registers, so the next round must be invoked with the roles rotated.
 */
wolfSSL 0:d92f9d21154c 726 #define RND_X(a,b,c,d,e,f,g,h,i) \
wolfSSL 0:d92f9d21154c 727 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 728 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 729 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 730 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 731 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 732 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 733 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 734 RND_STEP_8(a,b,c,d,e,f,g,h,i);
wolfSSL 0:d92f9d21154c 735
/*
 * RND_0 .. RND_7: a full round (RND_X) with the a..h roles rotated.
 *
 * All eight take the registers in the fixed order S_0..S_7 and differ only
 * in which register plays "a": RND_0 starts at S_0, RND_7 at S_7, RND_6 at
 * S_6, and so on.  Because a round leaves its new "a" in the register that
 * was "h", calling RND_0, RND_7, RND_6, ... RND_1 in that order chains
 * eight rounds without copying state between registers.
 */
wolfSSL 0:d92f9d21154c 736 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 0:d92f9d21154c 737 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 0:d92f9d21154c 738 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 0:d92f9d21154c 739 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 0:d92f9d21154c 740 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 0:d92f9d21154c 741 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 0:d92f9d21154c 742 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 0:d92f9d21154c 743 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 0:d92f9d21154c 744
wolfSSL 0:d92f9d21154c 745
/*
 * A round cut into three braced groups so a caller can place other
 * statements between them: RND_1_3 runs steps 1-3, RND_4_6 steps 4-6 and
 * RND_7_8 steps 7-8.  The three groups must be used in that order with the
 * same arguments, since later steps read scratch registers set by earlier
 * ones.
 */
wolfSSL 0:d92f9d21154c 746 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\
wolfSSL 0:d92f9d21154c 747 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 748 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 749 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 750 }
wolfSSL 0:d92f9d21154c 751
wolfSSL 0:d92f9d21154c 752 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\
wolfSSL 0:d92f9d21154c 753 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 754 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 755 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 756 }
wolfSSL 0:d92f9d21154c 757
wolfSSL 0:d92f9d21154c 758 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\
wolfSSL 0:d92f9d21154c 759 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 760 RND_STEP_8(a,b,c,d,e,f,g,h,i); \
wolfSSL 0:d92f9d21154c 761 }
wolfSSL 0:d92f9d21154c 762
/*
 * These eight definitions repeat, token for token, the RND_0..RND_7
 * definitions that appear earlier in this file.  An identical macro
 * redefinition is accepted by the preprocessor, so they have no effect.
 * NOTE(review): candidates for removal.
 */
wolfSSL 0:d92f9d21154c 763 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 0:d92f9d21154c 764 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 0:d92f9d21154c 765 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 0:d92f9d21154c 766 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 0:d92f9d21154c 767 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 0:d92f9d21154c 768 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 0:d92f9d21154c 769 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 0:d92f9d21154c 770 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 0:d92f9d21154c 771
wolfSSL 0:d92f9d21154c 772
/*
 * RND_<n>_<part>: partial rounds with rotated register roles.
 *
 * <n> selects which of S_0..S_7 plays "a", using the same rotation as
 * RND_<n>.  <part> selects the slice of the round: _0 expands to RND_1_3
 * (steps 1-3), _1 to RND_4_6 (steps 4-6) and _2 to RND_7_8 (steps 7-8).
 * A complete round n is RND_n_0, RND_n_1, RND_n_2 in that order.
 */
wolfSSL 0:d92f9d21154c 773 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 0:d92f9d21154c 774 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 0:d92f9d21154c 775 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 0:d92f9d21154c 776 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 0:d92f9d21154c 777 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 0:d92f9d21154c 778 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 0:d92f9d21154c 779 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 0:d92f9d21154c 780 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 0:d92f9d21154c 781
wolfSSL 0:d92f9d21154c 782 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 0:d92f9d21154c 783 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 0:d92f9d21154c 784 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 0:d92f9d21154c 785 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 0:d92f9d21154c 786 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 0:d92f9d21154c 787 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 0:d92f9d21154c 788 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 0:d92f9d21154c 789 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 0:d92f9d21154c 790
wolfSSL 0:d92f9d21154c 791 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 0:d92f9d21154c 792 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 0:d92f9d21154c 793 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 0:d92f9d21154c 794 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 0:d92f9d21154c 795 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 0:d92f9d21154c 796 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 0:d92f9d21154c 797 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 0:d92f9d21154c 798 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 0:d92f9d21154c 799
/*
 * FOR / END: a counted loop written directly in assembly.
 *
 *   FOR(cnt, init, max, inc, loop);
 *       ... body ...
 *   END(cnt, init, max, inc, loop)
 *
 * cnt   32-bit integer lvalue in memory used as the loop counter
 * init  literal start value, stored to cnt by FOR
 * max   literal upper bound; the body repeats while cnt <= max (signed)
 * inc   literal step added to cnt by END
 * loop  bare identifier used as the assembler label; must be unique within
 *       the translation unit
 *
 * FOR does not use max/inc and END does not use init; both take the full
 * argument list so the two invocations can be written identically.  END
 * keeps its trailing semicolon so existing call sites compile unchanged.
 *
 * The operand constraints describe what the instructions really do: FOR
 * only stores to cnt ("=m"), END reads and updates it ("+m") and changes
 * the flags ("cc").  Declaring cnt as a plain input in FOR, or as
 * write-only in END, would allow the compiler to cache or discard the
 * counter around the asm statements.
 *
 * NOTE(review): END jumps to a label defined in a different asm statement.
 * GCC documents jumps between asm statements as unsupported, so this only
 * holds together when the compiler keeps no state in registers across the
 * loop body; prefer a C loop for new code.
 */
#define FOR(cnt, init, max, inc, loop) \
__asm__ volatile("movl $"#init", %0\n\t"#loop":":"=m"(cnt))
#define END(cnt, init, max, inc, loop) \
__asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"+m"(cnt)::"cc") ;
wolfSSL 0:d92f9d21154c 804
wolfSSL 0:d92f9d21154c 805 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
wolfSSL 0:d92f9d21154c 806
wolfSSL 0:d92f9d21154c 807 #if defined(HAVE_INTEL_AVX1) /* inline Assembler for Intel AVX1 instructions */
wolfSSL 0:d92f9d21154c 808
/*
 * One-instruction wrappers around the AVX integer operations used by the
 * message schedule.
 *
 * op1 is the destination register and op2/op3 the sources; the macro emits
 * them in AT&T order (last source first, destination last).  For the shift
 * and shuffle-by-immediate forms (VPSRLD, VPSRLQ, VPSLLD, VPSHUFD) op3 is an
 * immediate, and VPALIGNR takes its immediate as op4.
 *
 * Register arguments are stringified with #, so they must already be
 * expanded register names (e.g. %xmm4) when they reach these macros.  The
 * clobber list is XMM_REGs, which this file defines as empty, so the
 * compiler is not told which vector registers are written.
 */
wolfSSL 0:d92f9d21154c 809 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 810 #define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 811 #define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 812 #define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 813 #define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 814 #define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 815 #define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 816 #define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 817 #define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 0:d92f9d21154c 818
/*
 * MessageSched: four SHA-256 rounds interleaved with the vector computation
 * of the next four message-schedule words.
 *
 * Scalar side: runs RND_STEP_1..8 four times, for W_K[_i] .. W_K[_i+3],
 * rotating the a..h roles by one register after each round
 * (a,b,..,h -> h,a,..,g -> g,h,..,f -> f,g,..,e).  After the macro the
 * roles have advanced by four, which is why successive invocations
 * alternate between passing S_0.. and S_4.. as the first state register.
 *
 * Vector side: between the scalar steps, one VP* instruction at a time
 * builds s0 from X1:X0, s1 from X3 (low two words) and then from the
 * partial result (high two words), and finally overwrites X0 with
 * XTMP5 + XTMP0.  X1..X3 are read only.
 *
 * X0..X3       vector registers holding the 16 most recent schedule words
 * XTMP0..XTMP5 vector scratch registers
 * XFER         accepted but not referenced in the body
 * SHUF_00BA,
 * SHUF_DC00    registers holding the vpshufb masks used to pack s1
 * a..h         state registers for the first of the four rounds
 * _i           index of the first of the four W_K[] entries consumed
 *
 * The W_K[] entries for these four rounds must have been stored before the
 * macro is invoked.
 */
wolfSSL 0:d92f9d21154c 819 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
wolfSSL 0:d92f9d21154c 820 a,b,c,d,e,f,g,h,_i)\
wolfSSL 0:d92f9d21154c 821 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 822 VPALIGNR (XTMP0, X3, X2, 4) ;\
wolfSSL 0:d92f9d21154c 823 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 824 VPADDD (XTMP0, XTMP0, X0) ;\
wolfSSL 0:d92f9d21154c 825 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 826 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
wolfSSL 0:d92f9d21154c 827 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 828 VPSRLD (XTMP2, XTMP1, 7) ;\
wolfSSL 0:d92f9d21154c 829 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 830 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 0:d92f9d21154c 831 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 832 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\
wolfSSL 0:d92f9d21154c 833 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 834 VPSRLD (XTMP2, XTMP1,18) ;\
wolfSSL 0:d92f9d21154c 835 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 836 \
wolfSSL 0:d92f9d21154c 837 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 838 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
wolfSSL 0:d92f9d21154c 839 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 840 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 0:d92f9d21154c 841 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 842 VPXOR (XTMP3, XTMP3, XTMP1) ;\
wolfSSL 0:d92f9d21154c 843 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 844 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 0:d92f9d21154c 845 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 846 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
wolfSSL 0:d92f9d21154c 847 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 848 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 0:d92f9d21154c 849 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 850 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 0:d92f9d21154c 851 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 852 \
wolfSSL 0:d92f9d21154c 853 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 854 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 0:d92f9d21154c 855 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 856 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 0:d92f9d21154c 857 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 858 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 0:d92f9d21154c 859 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 860 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 0:d92f9d21154c 861 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 862 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
wolfSSL 0:d92f9d21154c 863 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 864 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
wolfSSL 0:d92f9d21154c 865 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 866 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 0:d92f9d21154c 867 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 868 \
wolfSSL 0:d92f9d21154c 869 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 870 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 0:d92f9d21154c 871 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 872 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 0:d92f9d21154c 873 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 874 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 0:d92f9d21154c 875 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 876 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 0:d92f9d21154c 877 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 878 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 0:d92f9d21154c 879 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 880 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
wolfSSL 0:d92f9d21154c 881 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 882 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
wolfSSL 0:d92f9d21154c 883 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 884 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 0:d92f9d21154c 885
/*
 * MessageSched_RORX: same four-round / four-word schedule update as
 * MessageSched, with the scalar round steps taken from the
 * RND_STEP_RORX_* family instead of RND_STEP_*.  The vector instruction
 * sequence, the parameter list and the register-role rotation
 * (a.. -> h.. -> g.. -> f..) are identical; XFER is again accepted but not
 * referenced.  Only available when HAVE_INTEL_RORX is defined.
 */
wolfSSL 0:d92f9d21154c 886 #if defined(HAVE_INTEL_RORX)
wolfSSL 0:d92f9d21154c 887
wolfSSL 0:d92f9d21154c 888 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
wolfSSL 0:d92f9d21154c 889 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
wolfSSL 0:d92f9d21154c 890 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 891 VPALIGNR (XTMP0, X3, X2, 4) ;\
wolfSSL 0:d92f9d21154c 892 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 893 VPADDD (XTMP0, XTMP0, X0) ;\
wolfSSL 0:d92f9d21154c 894 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 895 VPALIGNR (XTMP1, X1, X0, 4) ; /* XTMP1 = W[-15] */\
wolfSSL 0:d92f9d21154c 896 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 897 VPSRLD (XTMP2, XTMP1, 7) ;\
wolfSSL 0:d92f9d21154c 898 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 899 VPSLLD (XTMP3, XTMP1, 25) ; /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 0:d92f9d21154c 900 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 901 VPOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 */\
wolfSSL 0:d92f9d21154c 902 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 903 VPSRLD (XTMP2, XTMP1,18) ;\
wolfSSL 0:d92f9d21154c 904 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 0:d92f9d21154c 905 \
wolfSSL 0:d92f9d21154c 906 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 907 VPSRLD (XTMP4, XTMP1, 3) ; /* XTMP4 = W[-15] >> 3 */\
wolfSSL 0:d92f9d21154c 908 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 909 VPSLLD (XTMP1, XTMP1, 14) ; /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 0:d92f9d21154c 910 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 911 VPXOR (XTMP3, XTMP3, XTMP1) ;\
wolfSSL 0:d92f9d21154c 912 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 913 VPXOR (XTMP3, XTMP3, XTMP2) ; /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 0:d92f9d21154c 914 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 915 VPXOR (XTMP1, XTMP3, XTMP4) ; /* XTMP1 = s0 */\
wolfSSL 0:d92f9d21154c 916 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 917 VPSHUFD(XTMP2, X3, 0b11111010) ; /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 0:d92f9d21154c 918 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 919 VPADDD (XTMP0, XTMP0, XTMP1) ; /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 0:d92f9d21154c 920 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 0:d92f9d21154c 921 \
wolfSSL 0:d92f9d21154c 922 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 923 VPSRLD (XTMP4, XTMP2, 10) ; /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 0:d92f9d21154c 924 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 925 VPSRLQ (XTMP3, XTMP2, 19) ; /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 0:d92f9d21154c 926 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 927 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 0:d92f9d21154c 928 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 929 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 0:d92f9d21154c 930 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 931 VPXOR (XTMP4, XTMP4, XTMP2) ; /* XTMP4 = s1 {xBxA} */\
wolfSSL 0:d92f9d21154c 932 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 933 VPSHUFB (XTMP4, XTMP4, SHUF_00BA) ; /* XTMP4 = s1 {00BA} */\
wolfSSL 0:d92f9d21154c 934 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 935 VPADDD (XTMP0, XTMP0, XTMP4) ; /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 0:d92f9d21154c 936 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 0:d92f9d21154c 937 \
wolfSSL 0:d92f9d21154c 938 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 939 VPSHUFD (XTMP2, XTMP0, 0b01010000) ; /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 0:d92f9d21154c 940 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 941 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 0:d92f9d21154c 942 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 943 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 0:d92f9d21154c 944 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 945 VPSRLQ (XTMP2, XTMP2, 17) ; /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 0:d92f9d21154c 946 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 947 VPXOR (XTMP2, XTMP2, XTMP3) ;\
wolfSSL 0:d92f9d21154c 948 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 949 VPXOR (XTMP5, XTMP5, XTMP2) ; /* XTMP5 = s1 {xDxC} */\
wolfSSL 0:d92f9d21154c 950 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 951 VPSHUFB (XTMP5, XTMP5, SHUF_DC00) ; /* XTMP5 = s1 {DC00} */\
wolfSSL 0:d92f9d21154c 952 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 0:d92f9d21154c 953 VPADDD (X0, XTMP5, XTMP0) ; /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 0:d92f9d21154c 954
wolfSSL 0:d92f9d21154c 955 #endif
wolfSSL 0:d92f9d21154c 956
wolfSSL 0:d92f9d21154c 957
/*
 * W_K_from_buff: load the input block into the four schedule registers.
 *
 * Four unaligned 16-byte loads starting at sha256->buffer[0], [4], [8] and
 * [12] fill xmm4..xmm7, and each register is then byte-shuffled in place
 * with the mask held in xmm13.
 *
 * The register numbers are hard-coded here rather than taken from the
 * X0..X3 / BYTE_FLIP_MASK aliases, so this macro only works with the
 * assignment X0..X3 = xmm4..xmm7 and BYTE_FLIP_MASK = xmm13, and xmm13 must
 * already contain the mask (see Init_Masks).  A variable named sha256 must
 * be in scope.
 */
wolfSSL 0:d92f9d21154c 958 #define W_K_from_buff\
wolfSSL 0:d92f9d21154c 959 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
wolfSSL 0:d92f9d21154c 960 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
wolfSSL 0:d92f9d21154c 961 :: "m"(sha256->buffer[0]):"%xmm4") ;\
wolfSSL 0:d92f9d21154c 962 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
wolfSSL 0:d92f9d21154c 963 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
wolfSSL 0:d92f9d21154c 964 ::"m"(sha256->buffer[4]):"%xmm5") ;\
wolfSSL 0:d92f9d21154c 965 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
wolfSSL 0:d92f9d21154c 966 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
wolfSSL 0:d92f9d21154c 967 ::"m"(sha256->buffer[8]):"%xmm6") ;\
wolfSSL 0:d92f9d21154c 968 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
wolfSSL 0:d92f9d21154c 969 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
wolfSSL 0:d92f9d21154c 970 ::"m"(sha256->buffer[12]):"%xmm7") ;\
wolfSSL 0:d92f9d21154c 971
/*
 * SET_W_K_XFER(reg, i): precompute W + K for four rounds.
 *
 * Adds the four constants starting at K[i] to the four schedule words in
 * vector register reg, leaving the sum in xmm9, then stores xmm9 to
 * W_K[i..i+3].  K and W_K must be in scope at the expansion site.
 *
 * xmm9 is hard-coded as the transfer register; it is the same register the
 * XTMP5 alias names, and the XFER alias is not used here.
 *
 * The store is vmovdqa, which requires a 16-byte aligned destination.
 * NOTE(review): confirm W_K is always 16-byte aligned where this is used.
 *
 * SET_W_K_XFER is the entry point; it forwards to _SET_W_K_XFER so that a
 * register alias passed as reg is expanded before being stringified.
 */
wolfSSL 0:d92f9d21154c 972 #define _SET_W_K_XFER(reg, i)\
wolfSSL 0:d92f9d21154c 973 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs) ;\
wolfSSL 0:d92f9d21154c 974 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs) ;
wolfSSL 0:d92f9d21154c 975
wolfSSL 0:d92f9d21154c 976 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
wolfSSL 0:d92f9d21154c 977
/*
 * 128-bit vpshufb control masks, each stored as two 64-bit halves (low half
 * first).  A control byte of 0xFF zeroes the corresponding result byte.
 *
 *  mSHUF_00BA       moves source bytes 0-3 and 8-11 into the low 8 bytes
 *                   and clears the high 8 bytes.
 *  mSHUF_DC00       clears the low 8 bytes and moves source bytes 0-3 and
 *                   8-11 into the high 8 bytes.
 *  mBYTE_FLIP_MASK  reverses the byte order inside each 32-bit word.
 */
wolfSSL 0:d92f9d21154c 978 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF } ; /* shuffle xBxA -> 00BA */
wolfSSL 0:d92f9d21154c 979 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 } ; /* shuffle xDxC -> DC00 */
wolfSSL 0:d92f9d21154c 980 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b } ;
wolfSSL 0:d92f9d21154c 981
wolfSSL 0:d92f9d21154c 982
/*
 * Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00): load the three shuffle
 * constants into vector registers.
 *
 * The first argument receives mBYTE_FLIP_MASK, the second mSHUF_00BA and
 * the third mSHUF_DC00, each with an unaligned 16-byte load.  The asm
 * statements declare no outputs and no clobbers, so the compiler is not
 * told that these registers change.
 *
 * Init_Masks forwards to _Init_Masks so that register aliases passed as
 * arguments are expanded before _Init_Masks stringifies them.
 */
wolfSSL 0:d92f9d21154c 983 #define _Init_Masks(mask1, mask2, mask3)\
wolfSSL 0:d92f9d21154c 984 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0])) ;\
wolfSSL 0:d92f9d21154c 985 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0])) ;\
wolfSSL 0:d92f9d21154c 986 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0])) ;
wolfSSL 0:d92f9d21154c 987
wolfSSL 0:d92f9d21154c 988 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
wolfSSL 0:d92f9d21154c 989 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
wolfSSL 0:d92f9d21154c 990
/*
 * Vector register assignment for the AVX1 transform.
 * X0..X3 hold the sixteen most recent message-schedule words; W_K_from_buff
 * hard-codes the same xmm4..xmm7 numbers, so the two must stay in sync.
 */
wolfSSL 0:d92f9d21154c 991 #define X0 %xmm4
wolfSSL 0:d92f9d21154c 992 #define X1 %xmm5
wolfSSL 0:d92f9d21154c 993 #define X2 %xmm6
wolfSSL 0:d92f9d21154c 994 #define X3 %xmm7
wolfSSL 0:d92f9d21154c 995 #define X_ X0
wolfSSL 0:d92f9d21154c 996
/* Scratch registers for the schedule computation.  XTMP5 (xmm9) is also the
 * register _SET_W_K_XFER writes its sum into. */
wolfSSL 0:d92f9d21154c 997 #define XTMP0 %xmm0
wolfSSL 0:d92f9d21154c 998 #define XTMP1 %xmm1
wolfSSL 0:d92f9d21154c 999 #define XTMP2 %xmm2
wolfSSL 0:d92f9d21154c 1000 #define XTMP3 %xmm3
wolfSSL 0:d92f9d21154c 1001 #define XTMP4 %xmm8
wolfSSL 0:d92f9d21154c 1002 #define XTMP5 %xmm9
wolfSSL 0:d92f9d21154c 1003 #define XFER %xmm10
wolfSSL 0:d92f9d21154c 1004
/* Registers that keep the vpshufb masks loaded by Init_Masks. */
wolfSSL 0:d92f9d21154c 1005 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
wolfSSL 0:d92f9d21154c 1006 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
wolfSSL 0:d92f9d21154c 1007 #define BYTE_FLIP_MASK %xmm13
wolfSSL 0:d92f9d21154c 1008
/*
 * XMM_REGs is used as the clobber list of the vector asm statements.  It is
 * deliberately defined as empty: no vector register is reported to the
 * compiler as modified.  The list that would otherwise apply is kept in the
 * comment below it.
 */
wolfSSL 0:d92f9d21154c 1009 #define XMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 0:d92f9d21154c 1010 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */
wolfSSL 0:d92f9d21154c 1011
/* Transform_AVX1
 *
 * Loads the working state with DigestToReg, runs round indices 0..63 and
 * stores the result with RegToDigest. For indices 0..47 every SET_W_K_XFER
 * is followed by a MessageSched call that takes the same base index
 * (0, 4, ... 44); indices 48..63 are plain RND_x calls issued after the
 * last four SET_W_K_XFER calls. The only return statement yields 0.
 *
 * NOTE(review): Init_Masks, W_K_from_buff, SET_W_K_XFER, MessageSched and
 * RND_x are defined outside this excerpt and appear to be macros that refer
 * to the locals `sha256` and `W_K` by name -- confirm before renaming either.
 */
wolfSSL 0:d92f9d21154c 1012 static int Transform_AVX1(Sha256* sha256)
wolfSSL 0:d92f9d21154c 1013 {
wolfSSL 0:d92f9d21154c 1014
wolfSSL 0:d92f9d21154c 1015 word32 W_K[64] ; /* temp for W+K */
wolfSSL 0:d92f9d21154c 1016
/* Debug-only locals: loop counters and a trace buffer that is printed at the
 * end of the function. NOTE(review): nothing visible here writes to xmm[];
 * presumably the macros fill it when DEBUG_XMM is defined -- confirm. */
wolfSSL 0:d92f9d21154c 1017 #if defined(DEBUG_XMM)
wolfSSL 0:d92f9d21154c 1018 int i, j ;
wolfSSL 0:d92f9d21154c 1019 word32 xmm[29][4*15] ;
wolfSSL 0:d92f9d21154c 1020 #endif
wolfSSL 0:d92f9d21154c 1021
wolfSSL 0:d92f9d21154c 1022 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
wolfSSL 0:d92f9d21154c 1023 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
wolfSSL 0:d92f9d21154c 1024
wolfSSL 0:d92f9d21154c 1025 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 0:d92f9d21154c 1026
/* Base indices 0..44: the X0..X3 arguments rotate by one position per step,
 * and the two halves of the S_0..S_7 argument list swap on every call. */
wolfSSL 0:d92f9d21154c 1027 SET_W_K_XFER(X0, 0) ;
wolfSSL 0:d92f9d21154c 1028 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1029 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 0:d92f9d21154c 1030 SET_W_K_XFER(X1, 4) ;
wolfSSL 0:d92f9d21154c 1031 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1032 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
wolfSSL 0:d92f9d21154c 1033 SET_W_K_XFER(X2, 8) ;
wolfSSL 0:d92f9d21154c 1034 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1035 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 0:d92f9d21154c 1036 SET_W_K_XFER(X3, 12) ;
wolfSSL 0:d92f9d21154c 1037 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1038 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
wolfSSL 0:d92f9d21154c 1039 SET_W_K_XFER(X0, 16) ;
wolfSSL 0:d92f9d21154c 1040 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1041 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 0:d92f9d21154c 1042 SET_W_K_XFER(X1, 20) ;
wolfSSL 0:d92f9d21154c 1043 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1044 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
wolfSSL 0:d92f9d21154c 1045 SET_W_K_XFER(X2, 24) ;
wolfSSL 0:d92f9d21154c 1046 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1047 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 0:d92f9d21154c 1048 SET_W_K_XFER(X3, 28) ;
wolfSSL 0:d92f9d21154c 1049 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1050 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
wolfSSL 0:d92f9d21154c 1051 SET_W_K_XFER(X0, 32) ;
wolfSSL 0:d92f9d21154c 1052 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1053 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 0:d92f9d21154c 1054 SET_W_K_XFER(X1, 36) ;
wolfSSL 0:d92f9d21154c 1055 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1056 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
wolfSSL 0:d92f9d21154c 1057 SET_W_K_XFER(X2, 40) ;
wolfSSL 0:d92f9d21154c 1058 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1059 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 0:d92f9d21154c 1060 SET_W_K_XFER(X3, 44) ;
wolfSSL 0:d92f9d21154c 1061 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 0:d92f9d21154c 1062 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
wolfSSL 0:d92f9d21154c 1063
/* Indices 48..63: no further MessageSched calls; the four remaining
 * SET_W_K_XFER calls are issued up front, then sixteen RND_x calls follow. */
wolfSSL 0:d92f9d21154c 1064 SET_W_K_XFER(X0, 48) ;
wolfSSL 0:d92f9d21154c 1065 SET_W_K_XFER(X1, 52) ;
wolfSSL 0:d92f9d21154c 1066 SET_W_K_XFER(X2, 56) ;
wolfSSL 0:d92f9d21154c 1067 SET_W_K_XFER(X3, 60) ;
wolfSSL 0:d92f9d21154c 1068
wolfSSL 0:d92f9d21154c 1069 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 0:d92f9d21154c 1070 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 0:d92f9d21154c 1071 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 0:d92f9d21154c 1072 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 0:d92f9d21154c 1073
wolfSSL 0:d92f9d21154c 1074 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 0:d92f9d21154c 1075 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 0:d92f9d21154c 1076 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 0:d92f9d21154c 1077 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 0:d92f9d21154c 1078
wolfSSL 0:d92f9d21154c 1079 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 0:d92f9d21154c 1080 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 0:d92f9d21154c 1081 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 0:d92f9d21154c 1082 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 0:d92f9d21154c 1083
wolfSSL 0:d92f9d21154c 1084 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 0:d92f9d21154c 1085 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 0:d92f9d21154c 1086 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 0:d92f9d21154c 1087 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 0:d92f9d21154c 1088
wolfSSL 0:d92f9d21154c 1089 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 0:d92f9d21154c 1090
/* Debug-only dump of the trace buffer and of W_K.
 * NOTE(review): each row of xmm[] has 4*15 words but the inner loop stops at
 * 4*14, so the last four-word group of every row is never printed -- confirm
 * this is intended. */
wolfSSL 0:d92f9d21154c 1091 #if defined(DEBUG_XMM)
wolfSSL 0:d92f9d21154c 1092 for(i=0; i<29; i++) {
wolfSSL 0:d92f9d21154c 1093 for(j=0; j<4*14; j+=4)
wolfSSL 0:d92f9d21154c 1094 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
wolfSSL 0:d92f9d21154c 1095 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
wolfSSL 0:d92f9d21154c 1096 printf("\n") ;
wolfSSL 0:d92f9d21154c 1097 }
wolfSSL 0:d92f9d21154c 1098
wolfSSL 0:d92f9d21154c 1099 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
wolfSSL 0:d92f9d21154c 1100 #endif
wolfSSL 0:d92f9d21154c 1101
wolfSSL 0:d92f9d21154c 1102 return 0;
wolfSSL 0:d92f9d21154c 1103 }
wolfSSL 0:d92f9d21154c 1104
wolfSSL 0:d92f9d21154c 1105 #if defined(HAVE_INTEL_RORX)
/* Transform_AVX1_RORX
 *
 * Same call sequence as Transform_AVX1, except that base indices 0..44 go
 * through MessageSched_RORX instead of MessageSched: DigestToReg, twelve
 * SET_W_K_XFER + MessageSched_RORX pairs, four trailing SET_W_K_XFER calls,
 * sixteen RND_x calls for indices 48..63, then RegToDigest. The only return
 * statement yields 0.
 *
 * NOTE(review): the macros used here are defined outside this excerpt and
 * appear to refer to the locals `sha256` and `W_K` by name -- confirm before
 * renaming either.
 */
wolfSSL 0:d92f9d21154c 1106 static int Transform_AVX1_RORX(Sha256* sha256)
wolfSSL 0:d92f9d21154c 1107 {
wolfSSL 0:d92f9d21154c 1108
wolfSSL 0:d92f9d21154c 1109 word32 W_K[64] ; /* temp for W+K */
wolfSSL 0:d92f9d21154c 1110
/* Debug-only locals: loop counters and a trace buffer printed at the end.
 * NOTE(review): nothing visible here writes to xmm[]; presumably the macros
 * fill it when DEBUG_XMM is defined -- confirm. */
wolfSSL 0:d92f9d21154c 1111 #if defined(DEBUG_XMM)
wolfSSL 0:d92f9d21154c 1112 int i, j ;
wolfSSL 0:d92f9d21154c 1113 word32 xmm[29][4*15] ;
wolfSSL 0:d92f9d21154c 1114 #endif
wolfSSL 0:d92f9d21154c 1115
wolfSSL 0:d92f9d21154c 1116 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00) ;
wolfSSL 0:d92f9d21154c 1117 W_K_from_buff ; /* X0, X1, X2, X3 = W[0..15] ; */
wolfSSL 0:d92f9d21154c 1118
wolfSSL 0:d92f9d21154c 1119 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
/* Base indices 0..44: the X0..X3 arguments rotate by one position per step,
 * and the two halves of the S_0..S_7 argument list swap on every call. */
wolfSSL 0:d92f9d21154c 1120 SET_W_K_XFER(X0, 0) ;
wolfSSL 0:d92f9d21154c 1121 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1122 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 0:d92f9d21154c 1123 SET_W_K_XFER(X1, 4) ;
wolfSSL 0:d92f9d21154c 1124 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1125 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4) ;
wolfSSL 0:d92f9d21154c 1126 SET_W_K_XFER(X2, 8) ;
wolfSSL 0:d92f9d21154c 1127 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1128 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 0:d92f9d21154c 1129 SET_W_K_XFER(X3, 12) ;
wolfSSL 0:d92f9d21154c 1130 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1131 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12) ;
wolfSSL 0:d92f9d21154c 1132 SET_W_K_XFER(X0, 16) ;
wolfSSL 0:d92f9d21154c 1133 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1134 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 0:d92f9d21154c 1135 SET_W_K_XFER(X1, 20) ;
wolfSSL 0:d92f9d21154c 1136 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1137 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20) ;
wolfSSL 0:d92f9d21154c 1138 SET_W_K_XFER(X2, 24) ;
wolfSSL 0:d92f9d21154c 1139 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1140 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 0:d92f9d21154c 1141 SET_W_K_XFER(X3, 28) ;
wolfSSL 0:d92f9d21154c 1142 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1143 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28) ;
wolfSSL 0:d92f9d21154c 1144 SET_W_K_XFER(X0, 32) ;
wolfSSL 0:d92f9d21154c 1145 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1146 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 0:d92f9d21154c 1147 SET_W_K_XFER(X1, 36) ;
wolfSSL 0:d92f9d21154c 1148 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1149 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36) ;
wolfSSL 0:d92f9d21154c 1150 SET_W_K_XFER(X2, 40) ;
wolfSSL 0:d92f9d21154c 1151 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1152 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 0:d92f9d21154c 1153 SET_W_K_XFER(X3, 44) ;
wolfSSL 0:d92f9d21154c 1154 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 0:d92f9d21154c 1155 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44) ;
wolfSSL 0:d92f9d21154c 1156
/* Indices 48..63: no further MessageSched_RORX calls; the four remaining
 * SET_W_K_XFER calls are issued up front, then sixteen RND_x calls follow. */
wolfSSL 0:d92f9d21154c 1157 SET_W_K_XFER(X0, 48) ;
wolfSSL 0:d92f9d21154c 1158 SET_W_K_XFER(X1, 52) ;
wolfSSL 0:d92f9d21154c 1159 SET_W_K_XFER(X2, 56) ;
wolfSSL 0:d92f9d21154c 1160 SET_W_K_XFER(X3, 60) ;
wolfSSL 0:d92f9d21154c 1161
wolfSSL 0:d92f9d21154c 1162 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 0:d92f9d21154c 1163 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 0:d92f9d21154c 1164 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 0:d92f9d21154c 1165 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 0:d92f9d21154c 1166
wolfSSL 0:d92f9d21154c 1167 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 0:d92f9d21154c 1168 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 0:d92f9d21154c 1169 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 0:d92f9d21154c 1170 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 0:d92f9d21154c 1171
wolfSSL 0:d92f9d21154c 1172 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 0:d92f9d21154c 1173 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 0:d92f9d21154c 1174 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 0:d92f9d21154c 1175 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 0:d92f9d21154c 1176
wolfSSL 0:d92f9d21154c 1177 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 0:d92f9d21154c 1178 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 0:d92f9d21154c 1179 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 0:d92f9d21154c 1180 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 0:d92f9d21154c 1181
wolfSSL 0:d92f9d21154c 1182 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 0:d92f9d21154c 1183
/* Debug-only dump of the trace buffer and of W_K.
 * NOTE(review): each row of xmm[] has 4*15 words but the inner loop stops at
 * 4*14, so the last four-word group of every row is never printed -- confirm
 * this is intended. */
wolfSSL 0:d92f9d21154c 1184 #if defined(DEBUG_XMM)
wolfSSL 0:d92f9d21154c 1185 for(i=0; i<29; i++) {
wolfSSL 0:d92f9d21154c 1186 for(j=0; j<4*14; j+=4)
wolfSSL 0:d92f9d21154c 1187 printf("xmm%d[%d]=%08x,%08x,%08x,%08x\n", j/4, i,
wolfSSL 0:d92f9d21154c 1188 xmm[i][j],xmm[i][j+1],xmm[i][j+2],xmm[i][j+3]) ;
wolfSSL 0:d92f9d21154c 1189 printf("\n") ;
wolfSSL 0:d92f9d21154c 1190 }
wolfSSL 0:d92f9d21154c 1191
wolfSSL 0:d92f9d21154c 1192 for(i=0; i<64; i++)printf("W_K[%d]%08x\n", i, W_K[i]) ;
wolfSSL 0:d92f9d21154c 1193 #endif
wolfSSL 0:d92f9d21154c 1194
wolfSSL 0:d92f9d21154c 1195 return 0;
wolfSSL 0:d92f9d21154c 1196 }
wolfSSL 0:d92f9d21154c 1197 #endif /* HAVE_INTEL_RORX */
wolfSSL 0:d92f9d21154c 1198
wolfSSL 0:d92f9d21154c 1199 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 0:d92f9d21154c 1200
wolfSSL 0:d92f9d21154c 1201
wolfSSL 0:d92f9d21154c 1202 #if defined(HAVE_INTEL_AVX2)
wolfSSL 0:d92f9d21154c 1203
wolfSSL 0:d92f9d21154c 1204 #define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1205 #define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1206 #define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
wolfSSL 0:d92f9d21154c 1207 :: "m"(map):YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1208 #define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
wolfSSL 0:d92f9d21154c 1209 #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1210 #define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
wolfSSL 0:d92f9d21154c 1211 #ymm0"\n\t":: "m"(map):YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1212 #define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 0:d92f9d21154c 1213 #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
wolfSSL 0:d92f9d21154c 1214 #temp",%%"#dest", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1215 #define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 0:d92f9d21154c 1216 #src", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1217 #define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\
wolfSSL 0:d92f9d21154c 1218 #src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1219 #define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\
wolfSSL 0:d92f9d21154c 1220 #src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1221 #define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\
wolfSSL 0:d92f9d21154c 1222 #src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1223 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
wolfSSL 0:d92f9d21154c 1224 #dest" "::"m"(mem):YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1225 #define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
wolfSSL 0:d92f9d21154c 1226 #src1", %%"#src2", %%"#dest" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1227
wolfSSL 0:d92f9d21154c 1228 #define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1229 #define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1230 #define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1231 #define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1232 #define _EXTRACT_XMM_4(ymm, xmm, mem)\
wolfSSL 0:d92f9d21154c 1233 __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1234 __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1235 #define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1236 #define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1237 #define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1238
wolfSSL 0:d92f9d21154c 1239 #define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs) ;
wolfSSL 0:d92f9d21154c 1240 #define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
wolfSSL 0:d92f9d21154c 1241
wolfSSL 0:d92f9d21154c 1242 #define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem)
wolfSSL 0:d92f9d21154c 1243 #define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm)
wolfSSL 0:d92f9d21154c 1244 #define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
wolfSSL 0:d92f9d21154c 1245 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
wolfSSL 0:d92f9d21154c 1246 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
wolfSSL 0:d92f9d21154c 1247 #define XOR(dest, src1, src2) _XOR(dest, src1, src2)
wolfSSL 0:d92f9d21154c 1248 #define OR(dest, src1, src2) _OR(dest, src1, src2)
wolfSSL 0:d92f9d21154c 1249 #define ADD(dest, src1, src2) _ADD(dest, src1, src2)
wolfSSL 0:d92f9d21154c 1250 #define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem)
wolfSSL 0:d92f9d21154c 1251 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
wolfSSL 0:d92f9d21154c 1252
wolfSSL 0:d92f9d21154c 1253 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
wolfSSL 0:d92f9d21154c 1254 #define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
wolfSSL 0:d92f9d21154c 1255 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
wolfSSL 0:d92f9d21154c 1256
wolfSSL 0:d92f9d21154c 1257 #define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \
wolfSSL 0:d92f9d21154c 1258 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest) ;
wolfSSL 0:d92f9d21154c 1259 #define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18);
wolfSSL 0:d92f9d21154c 1260 #define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 3); \
wolfSSL 0:d92f9d21154c 1261 XOR(dest, G_TEMP, dest) ;
wolfSSL 0:d92f9d21154c 1262
wolfSSL 0:d92f9d21154c 1263 #define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
wolfSSL 0:d92f9d21154c 1264 XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest) ;
wolfSSL 0:d92f9d21154c 1265 #define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
wolfSSL 0:d92f9d21154c 1266 #define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest) ; AVX2_R(G_TEMP, src, 10); \
wolfSSL 0:d92f9d21154c 1267 XOR(dest, G_TEMP, dest) ;
wolfSSL 0:d92f9d21154c 1268
wolfSSL 0:d92f9d21154c 1269 #define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]) ; \
wolfSSL 0:d92f9d21154c 1270 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1271 #define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ; \
wolfSSL 0:d92f9d21154c 1272 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]) ; BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1273 #define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]) ; \
wolfSSL 0:d92f9d21154c 1274 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1275
wolfSSL 0:d92f9d21154c 1276 #define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08) ;\
wolfSSL 0:d92f9d21154c 1277 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]) ; BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7) ;
wolfSSL 0:d92f9d21154c 1278
wolfSSL 0:d92f9d21154c 1279 #undef voitle
wolfSSL 0:d92f9d21154c 1280
wolfSSL 0:d92f9d21154c 1281 #define W_I_16 ymm8
wolfSSL 0:d92f9d21154c 1282 #define W_I_15 ymm9
wolfSSL 0:d92f9d21154c 1283 #define W_I_7 ymm10
wolfSSL 0:d92f9d21154c 1284 #define W_I_2 ymm11
wolfSSL 0:d92f9d21154c 1285 #define W_I ymm12
wolfSSL 0:d92f9d21154c 1286 #define G_TEMP ymm13
wolfSSL 0:d92f9d21154c 1287 #define S_TEMP ymm14
wolfSSL 0:d92f9d21154c 1288 #define YMM_TEMP0 ymm15
wolfSSL 0:d92f9d21154c 1289 #define YMM_TEMP0x xmm15
wolfSSL 0:d92f9d21154c 1290 #define W_I_TEMP ymm7
wolfSSL 0:d92f9d21154c 1291 #define W_K_TEMP ymm15
wolfSSL 0:d92f9d21154c 1292 #define W_K_TEMPx xmm15
wolfSSL 0:d92f9d21154c 1293
wolfSSL 0:d92f9d21154c 1294 #define YMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 0:d92f9d21154c 1295 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
wolfSSL 0:d92f9d21154c 1296
wolfSSL 0:d92f9d21154c 1297
wolfSSL 0:d92f9d21154c 1298 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
wolfSSL 0:d92f9d21154c 1299 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1300 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1301 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1302 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1303 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1304
wolfSSL 0:d92f9d21154c 1305 #define MOVE_7_to_15(w_i_15, w_i_7)\
wolfSSL 0:d92f9d21154c 1306 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1307
wolfSSL 0:d92f9d21154c 1308 #define MOVE_I_to_7(w_i_7, w_i)\
wolfSSL 0:d92f9d21154c 1309 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1310 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1311 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1312
wolfSSL 0:d92f9d21154c 1313 #define MOVE_I_to_2(w_i_2, w_i)\
wolfSSL 0:d92f9d21154c 1314 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1315 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs) ;\
wolfSSL 0:d92f9d21154c 1316
wolfSSL 0:d92f9d21154c 1317 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
wolfSSL 0:d92f9d21154c 1318 MOVE_15_to_16(w_i_16, w_i_15, w_i_7) ; \
wolfSSL 0:d92f9d21154c 1319 MOVE_7_to_15(w_i_15, w_i_7) ; \
wolfSSL 0:d92f9d21154c 1320 MOVE_I_to_7(w_i_7, w_i) ; \
wolfSSL 0:d92f9d21154c 1321 MOVE_I_to_2(w_i_2, w_i) ;\
wolfSSL 0:d92f9d21154c 1322
wolfSSL 0:d92f9d21154c 1323 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 1324 { word32 d ;\
wolfSSL 0:d92f9d21154c 1325 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1326 sha256->digest[0] += d;\
wolfSSL 0:d92f9d21154c 1327 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1328 sha256->digest[1] += d;\
wolfSSL 0:d92f9d21154c 1329 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1330 sha256->digest[2] += d;\
wolfSSL 0:d92f9d21154c 1331 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1332 sha256->digest[3] += d;\
wolfSSL 0:d92f9d21154c 1333 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1334 sha256->digest[4] += d;\
wolfSSL 0:d92f9d21154c 1335 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1336 sha256->digest[5] += d;\
wolfSSL 0:d92f9d21154c 1337 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1338 sha256->digest[6] += d;\
wolfSSL 0:d92f9d21154c 1339 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1340 sha256->digest[7] += d;\
wolfSSL 0:d92f9d21154c 1341 }
wolfSSL 0:d92f9d21154c 1342
wolfSSL 0:d92f9d21154c 1343 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 1344 { word32 d[8] ;\
wolfSSL 0:d92f9d21154c 1345 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1346 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1347 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1348 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1349 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1350 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1351 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1352 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1353 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
wolfSSL 0:d92f9d21154c 1354 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1355 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1356 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1357 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1358 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1359 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1360 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1361 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs) ;\
wolfSSL 0:d92f9d21154c 1362 }
wolfSSL 0:d92f9d21154c 1363
wolfSSL 0:d92f9d21154c 1364
wolfSSL 0:d92f9d21154c 1365 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 1366 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 0:d92f9d21154c 1367
wolfSSL 0:d92f9d21154c 1368 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 1369 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 0:d92f9d21154c 1370
wolfSSL 0:d92f9d21154c 1371 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 0:d92f9d21154c 1372 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 0:d92f9d21154c 1373
wolfSSL 0:d92f9d21154c 1374
/* vpshufb control masks used by Transform_AVX2 (via BYTE_SWAP / MOVE_BYTE).
 *
 * Every table is four 64-bit words, i.e. the 32 bytes of one ymm operand.
 * Index bytes 0x00..0x0f pick a source byte inside the 128-bit lane; a byte
 * with the top bit set (0x80) makes vpshufb write zero, which is how the
 * words that must stay empty end up zero-filled.
 *
 * The element type is unsigned long long, not unsigned long: on LLP64
 * toolchains (64-bit Windows targets) unsigned long is only 32 bits wide,
 * which would truncate every initializer and shrink each table to 16 bytes
 * while the instruction still reads 32.
 */

/* Byte-swap masks: flip each 32-bit word of the message buffer to big-endian. */
static const unsigned long long mBYTE_FLIP_MASK_16[] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL } ;
static const unsigned long long mBYTE_FLIP_MASK_15[] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL } ;
/* Same flip, but the top 32-bit word is zeroed. */
static const unsigned long long mBYTE_FLIP_MASK_7 [] =
    { 0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL,
      0x0405060700010203ULL, 0x8080808008090a0bULL } ;
/* Same flip for the lowest two words only; everything above is zeroed. */
static const unsigned long long mBYTE_FLIP_MASK_2 [] =
    { 0x0405060700010203ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x8080808080808080ULL } ;

/* Feedback maps: move freshly computed words into W_I_7 / W_I_2 positions
 * (see FEEDBACK_to_W_I_7 and FEEDBACKn_to_W_I_2). */
static const unsigned long long mMAPtoW_I_7[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x0302010080808080ULL } ;
static const unsigned long long mMAP1toW_I_2[] =
    { 0x8080808080808080ULL, 0x0706050403020100ULL,
      0x8080808080808080ULL, 0x8080808080808080ULL } ;
static const unsigned long long mMAP2toW_I_2[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x0f0e0d0c0b0a0908ULL, 0x8080808080808080ULL } ;
static const unsigned long long mMAP3toW_I_2[] =
    { 0x8080808080808080ULL, 0x8080808080808080ULL,
      0x8080808080808080ULL, 0x0706050403020100ULL } ;
wolfSSL 0:d92f9d21154c 1393
wolfSSL 0:d92f9d21154c 1394 static int Transform_AVX2(Sha256* sha256)
wolfSSL 0:d92f9d21154c 1395 {
wolfSSL 0:d92f9d21154c 1396
wolfSSL 0:d92f9d21154c 1397 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 1398 word32* W_K;
wolfSSL 0:d92f9d21154c 1399 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 1400 if (W_K == NULL)
wolfSSL 0:d92f9d21154c 1401 return MEMORY_E;
wolfSSL 0:d92f9d21154c 1402 #else
wolfSSL 0:d92f9d21154c 1403 word32 W_K[64] ;
wolfSSL 0:d92f9d21154c 1404 #endif
wolfSSL 0:d92f9d21154c 1405
wolfSSL 0:d92f9d21154c 1406 MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]) ;
wolfSSL 0:d92f9d21154c 1407 MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]) ;
wolfSSL 0:d92f9d21154c 1408 MOVE_to_REG(W_I, sha256->buffer[8]) ; BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]) ;
wolfSSL 0:d92f9d21154c 1409 MOVE_to_REG(W_I_7, sha256->buffer[16-7]) ; BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]) ;
wolfSSL 0:d92f9d21154c 1410 MOVE_to_REG(W_I_2, sha256->buffer[16-2]) ; BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]) ;
wolfSSL 0:d92f9d21154c 1411
wolfSSL 0:d92f9d21154c 1412 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 0:d92f9d21154c 1413
wolfSSL 0:d92f9d21154c 1414 ADD_MEM(W_K_TEMP, W_I_16, K[0]) ;
wolfSSL 0:d92f9d21154c 1415 MOVE_to_MEM(W_K[0], W_K_TEMP) ;
wolfSSL 0:d92f9d21154c 1416
wolfSSL 0:d92f9d21154c 1417 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0) ;
wolfSSL 0:d92f9d21154c 1418 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1) ;
wolfSSL 0:d92f9d21154c 1419 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2) ;
wolfSSL 0:d92f9d21154c 1420 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3) ;
wolfSSL 0:d92f9d21154c 1421 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4) ;
wolfSSL 0:d92f9d21154c 1422 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5) ;
wolfSSL 0:d92f9d21154c 1423 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6) ;
wolfSSL 0:d92f9d21154c 1424 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7) ;
wolfSSL 0:d92f9d21154c 1425
wolfSSL 0:d92f9d21154c 1426 ADD_MEM(YMM_TEMP0, W_I, K[8]) ;
wolfSSL 0:d92f9d21154c 1427 MOVE_to_MEM(W_K[8], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1428
wolfSSL 0:d92f9d21154c 1429 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1430 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 0:d92f9d21154c 1431 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1432 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 0:d92f9d21154c 1433 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1434 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8) ;
wolfSSL 0:d92f9d21154c 1435 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1436 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 0:d92f9d21154c 1437 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1438 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 0:d92f9d21154c 1439 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1440 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9) ;
wolfSSL 0:d92f9d21154c 1441 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1442 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 0:d92f9d21154c 1443 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1444 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 0:d92f9d21154c 1445 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1446 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10) ;
wolfSSL 0:d92f9d21154c 1447 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1448 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 0:d92f9d21154c 1449 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1450 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 0:d92f9d21154c 1451 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1452 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11) ;
wolfSSL 0:d92f9d21154c 1453 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1454 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 0:d92f9d21154c 1455 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1456 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 0:d92f9d21154c 1457 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1458 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12) ;
wolfSSL 0:d92f9d21154c 1459 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1460 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 0:d92f9d21154c 1461 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1462 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 0:d92f9d21154c 1463 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1464 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13) ;
wolfSSL 0:d92f9d21154c 1465 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1466 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 0:d92f9d21154c 1467 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1468 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 0:d92f9d21154c 1469 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14) ;
wolfSSL 0:d92f9d21154c 1470 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1471 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 0:d92f9d21154c 1472
wolfSSL 0:d92f9d21154c 1473 MOVE_to_REG(YMM_TEMP0, K[16]) ;
wolfSSL 0:d92f9d21154c 1474 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 0:d92f9d21154c 1475 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1476 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15) ;
wolfSSL 0:d92f9d21154c 1477 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1478 MOVE_to_MEM(W_K[16], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1479
wolfSSL 0:d92f9d21154c 1480 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1481 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 0:d92f9d21154c 1482 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1483 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 0:d92f9d21154c 1484 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1485 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16) ;
wolfSSL 0:d92f9d21154c 1486 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1487 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 0:d92f9d21154c 1488 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1489 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 0:d92f9d21154c 1490 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1491 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17) ;
wolfSSL 0:d92f9d21154c 1492 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1493 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 0:d92f9d21154c 1494 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1495 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 0:d92f9d21154c 1496 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1497 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18) ;
wolfSSL 0:d92f9d21154c 1498 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1499 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 0:d92f9d21154c 1500 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1501 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 0:d92f9d21154c 1502 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1503 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19) ;
wolfSSL 0:d92f9d21154c 1504 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1505 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 0:d92f9d21154c 1506 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1507 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 0:d92f9d21154c 1508 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1509 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20) ;
wolfSSL 0:d92f9d21154c 1510 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1511 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 0:d92f9d21154c 1512 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1513 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 0:d92f9d21154c 1514 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1515 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21) ;
wolfSSL 0:d92f9d21154c 1516 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1517 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 0:d92f9d21154c 1518 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1519 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 0:d92f9d21154c 1520 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1521 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22) ;
wolfSSL 0:d92f9d21154c 1522 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1523 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 0:d92f9d21154c 1524
wolfSSL 0:d92f9d21154c 1525 MOVE_to_REG(YMM_TEMP0, K[24]) ;
wolfSSL 0:d92f9d21154c 1526 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 0:d92f9d21154c 1527 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1528 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23) ;
wolfSSL 0:d92f9d21154c 1529 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1530 MOVE_to_MEM(W_K[24], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1531
wolfSSL 0:d92f9d21154c 1532 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1533 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 0:d92f9d21154c 1534 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1535 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 0:d92f9d21154c 1536 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1537 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24) ;
wolfSSL 0:d92f9d21154c 1538 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1539 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 0:d92f9d21154c 1540 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1541 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 0:d92f9d21154c 1542 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1543 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25) ;
wolfSSL 0:d92f9d21154c 1544 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1545 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 0:d92f9d21154c 1546 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1547 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 0:d92f9d21154c 1548 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1549 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26) ;
wolfSSL 0:d92f9d21154c 1550 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1551 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 0:d92f9d21154c 1552 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1553 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 0:d92f9d21154c 1554 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1555 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27) ;
wolfSSL 0:d92f9d21154c 1556 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1557 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 0:d92f9d21154c 1558 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1559 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 0:d92f9d21154c 1560 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1561 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28) ;
wolfSSL 0:d92f9d21154c 1562 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1563 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 0:d92f9d21154c 1564 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1565 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 0:d92f9d21154c 1566 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1567 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29) ;
wolfSSL 0:d92f9d21154c 1568 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1569 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 0:d92f9d21154c 1570 GAMMA1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1571 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 0:d92f9d21154c 1572 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30) ;
wolfSSL 0:d92f9d21154c 1573 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1574 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 0:d92f9d21154c 1575
wolfSSL 0:d92f9d21154c 1576 MOVE_to_REG(YMM_TEMP0, K[32]) ;
wolfSSL 0:d92f9d21154c 1577 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 0:d92f9d21154c 1578 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1579 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31) ;
wolfSSL 0:d92f9d21154c 1580 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1581 MOVE_to_MEM(W_K[32], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1582
wolfSSL 0:d92f9d21154c 1583
wolfSSL 0:d92f9d21154c 1584 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1585 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 0:d92f9d21154c 1586 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1587 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 0:d92f9d21154c 1588 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1589 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32) ;
wolfSSL 0:d92f9d21154c 1590 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1591 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 0:d92f9d21154c 1592 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1593 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 0:d92f9d21154c 1594 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1595 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33) ;
wolfSSL 0:d92f9d21154c 1596 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1597 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 0:d92f9d21154c 1598 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1599 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 0:d92f9d21154c 1600 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1601 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34) ;
wolfSSL 0:d92f9d21154c 1602 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1603 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 0:d92f9d21154c 1604 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1605 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 0:d92f9d21154c 1606 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1607 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35) ;
wolfSSL 0:d92f9d21154c 1608 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1609 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 0:d92f9d21154c 1610 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1611 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 0:d92f9d21154c 1612 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1613 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36) ;
wolfSSL 0:d92f9d21154c 1614 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1615 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 0:d92f9d21154c 1616 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1617 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 0:d92f9d21154c 1618 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1619 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37) ;
wolfSSL 0:d92f9d21154c 1620 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1621 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 0:d92f9d21154c 1622 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1623 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 0:d92f9d21154c 1624 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1625 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38) ;
wolfSSL 0:d92f9d21154c 1626 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1627 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 0:d92f9d21154c 1628
wolfSSL 0:d92f9d21154c 1629 MOVE_to_REG(YMM_TEMP0, K[40]) ;
wolfSSL 0:d92f9d21154c 1630 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 0:d92f9d21154c 1631 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1632 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39) ;
wolfSSL 0:d92f9d21154c 1633 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1634 MOVE_to_MEM(W_K[40], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1635
wolfSSL 0:d92f9d21154c 1636 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1637 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 0:d92f9d21154c 1638 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1639 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 0:d92f9d21154c 1640 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1641 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40) ;
wolfSSL 0:d92f9d21154c 1642 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1643 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 0:d92f9d21154c 1644 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1645 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 0:d92f9d21154c 1646 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1647 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41) ;
wolfSSL 0:d92f9d21154c 1648 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1649 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 0:d92f9d21154c 1650 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1651 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 0:d92f9d21154c 1652 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1653 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42) ;
wolfSSL 0:d92f9d21154c 1654 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1655 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 0:d92f9d21154c 1656 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1657 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 0:d92f9d21154c 1658 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1659 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43) ;
wolfSSL 0:d92f9d21154c 1660 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1661 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 0:d92f9d21154c 1662 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1663 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 0:d92f9d21154c 1664 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1665 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44) ;
wolfSSL 0:d92f9d21154c 1666 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1667 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 0:d92f9d21154c 1668 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1669 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 0:d92f9d21154c 1670 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1671 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45) ;
wolfSSL 0:d92f9d21154c 1672 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1673 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 0:d92f9d21154c 1674 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1675 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 0:d92f9d21154c 1676 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1677 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46) ;
wolfSSL 0:d92f9d21154c 1678 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1679 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 0:d92f9d21154c 1680
wolfSSL 0:d92f9d21154c 1681 MOVE_to_REG(YMM_TEMP0, K[48]) ;
wolfSSL 0:d92f9d21154c 1682 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 0:d92f9d21154c 1683 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1684 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47) ;
wolfSSL 0:d92f9d21154c 1685 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1686 MOVE_to_MEM(W_K[48], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1687
wolfSSL 0:d92f9d21154c 1688 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 0:d92f9d21154c 1689 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 0:d92f9d21154c 1690 GAMMA0_1(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1691 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 0:d92f9d21154c 1692 GAMMA0_2(W_I_TEMP, W_I_15) ;
wolfSSL 0:d92f9d21154c 1693 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48) ;
wolfSSL 0:d92f9d21154c 1694 ADD(W_I_TEMP, W_I_16, W_I_TEMP) ;/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 0:d92f9d21154c 1695 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 0:d92f9d21154c 1696 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1697 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 0:d92f9d21154c 1698 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1699 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49) ;
wolfSSL 0:d92f9d21154c 1700 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1701 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 0:d92f9d21154c 1702 ADD(W_I, W_I, YMM_TEMP0) ;/* now W[16..17] are completed */
wolfSSL 0:d92f9d21154c 1703 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 0:d92f9d21154c 1704 FEEDBACK1_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1705 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50) ;
wolfSSL 0:d92f9d21154c 1706 FEEDBACK_to_W_I_7 ;
wolfSSL 0:d92f9d21154c 1707 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 0:d92f9d21154c 1708 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 0:d92f9d21154c 1709 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 0:d92f9d21154c 1710 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1711 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51) ;
wolfSSL 0:d92f9d21154c 1712 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1713 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 0:d92f9d21154c 1714 ADD(W_I, W_I_TEMP, YMM_TEMP0) ;/* now W[16..19] are completed */
wolfSSL 0:d92f9d21154c 1715 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 0:d92f9d21154c 1716 FEEDBACK2_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1717 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52) ;
wolfSSL 0:d92f9d21154c 1718 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1719 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 0:d92f9d21154c 1720 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1721 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 0:d92f9d21154c 1722 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..21] are completed */
wolfSSL 0:d92f9d21154c 1723 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53) ;
wolfSSL 0:d92f9d21154c 1724 FEEDBACK3_to_W_I_2 ;
wolfSSL 0:d92f9d21154c 1725 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 0:d92f9d21154c 1726 GAMMA1_1(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1727 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 0:d92f9d21154c 1728 GAMMA1_2(YMM_TEMP0, W_I_2) ;
wolfSSL 0:d92f9d21154c 1729 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54) ;
wolfSSL 0:d92f9d21154c 1730 ADD(W_I, W_I_TEMP, YMM_TEMP0) ; /* now W[16..23] are completed */
wolfSSL 0:d92f9d21154c 1731 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 0:d92f9d21154c 1732
wolfSSL 0:d92f9d21154c 1733 MOVE_to_REG(YMM_TEMP0, K[56]) ;
wolfSSL 0:d92f9d21154c 1734 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 0:d92f9d21154c 1735 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I) ;
wolfSSL 0:d92f9d21154c 1736 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55) ;
wolfSSL 0:d92f9d21154c 1737 ADD(YMM_TEMP0, YMM_TEMP0, W_I) ;
wolfSSL 0:d92f9d21154c 1738 MOVE_to_MEM(W_K[56], YMM_TEMP0) ;
wolfSSL 0:d92f9d21154c 1739
wolfSSL 0:d92f9d21154c 1740 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56) ;
wolfSSL 0:d92f9d21154c 1741 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57) ;
wolfSSL 0:d92f9d21154c 1742 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58) ;
wolfSSL 0:d92f9d21154c 1743 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59) ;
wolfSSL 0:d92f9d21154c 1744
wolfSSL 0:d92f9d21154c 1745 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60) ;
wolfSSL 0:d92f9d21154c 1746 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61) ;
wolfSSL 0:d92f9d21154c 1747 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62) ;
wolfSSL 0:d92f9d21154c 1748 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63) ;
wolfSSL 0:d92f9d21154c 1749
wolfSSL 0:d92f9d21154c 1750 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7) ;
wolfSSL 0:d92f9d21154c 1751
wolfSSL 0:d92f9d21154c 1752 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 0:d92f9d21154c 1753 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 0:d92f9d21154c 1754 #endif
wolfSSL 0:d92f9d21154c 1755
wolfSSL 0:d92f9d21154c 1756 return 0;
wolfSSL 0:d92f9d21154c 1757 }
wolfSSL 0:d92f9d21154c 1758
wolfSSL 0:d92f9d21154c 1759 #endif /* HAVE_INTEL_AVX2 */
wolfSSL 0:d92f9d21154c 1760
wolfSSL 0:d92f9d21154c 1761 #endif /* HAVE_FIPS */
wolfSSL 0:d92f9d21154c 1762
wolfSSL 0:d92f9d21154c 1763 #endif /* WOLFSSL_TI_HASH */
wolfSSL 0:d92f9d21154c 1764
wolfSSL 0:d92f9d21154c 1765 #endif /* NO_SHA256 */
wolfSSL 0:d92f9d21154c 1766
wolfSSL 0:d92f9d21154c 1767