wolfSSL 3.11.1 for TLS1.3 beta

Fork of wolfSSL by wolf SSL

Committer:
wolfSSL
Date:
Tue May 30 01:44:10 2017 +0000
Revision:
11:cee25a834751
wolfSSL 3.11.0

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 11:cee25a834751 1 /* sha256.c
wolfSSL 11:cee25a834751 2 *
wolfSSL 11:cee25a834751 3 * Copyright (C) 2006-2016 wolfSSL Inc.
wolfSSL 11:cee25a834751 4 *
wolfSSL 11:cee25a834751 5 * This file is part of wolfSSL.
wolfSSL 11:cee25a834751 6 *
wolfSSL 11:cee25a834751 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 11:cee25a834751 8 * it under the terms of the GNU General Public License as published by
wolfSSL 11:cee25a834751 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 11:cee25a834751 10 * (at your option) any later version.
wolfSSL 11:cee25a834751 11 *
wolfSSL 11:cee25a834751 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 11:cee25a834751 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 11:cee25a834751 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 11:cee25a834751 15 * GNU General Public License for more details.
wolfSSL 11:cee25a834751 16 *
wolfSSL 11:cee25a834751 17 * You should have received a copy of the GNU General Public License
wolfSSL 11:cee25a834751 18 * along with this program; if not, write to the Free Software
wolfSSL 11:cee25a834751 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
wolfSSL 11:cee25a834751 20 */
wolfSSL 11:cee25a834751 21
wolfSSL 11:cee25a834751 22
wolfSSL 11:cee25a834751 23 /* code submitted by raphael.huck@efixo.com */
wolfSSL 11:cee25a834751 24
wolfSSL 11:cee25a834751 25 #ifdef HAVE_CONFIG_H
wolfSSL 11:cee25a834751 26 #include <config.h>
wolfSSL 11:cee25a834751 27 #endif
wolfSSL 11:cee25a834751 28
wolfSSL 11:cee25a834751 29 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 11:cee25a834751 30
wolfSSL 11:cee25a834751 31 #if !defined(NO_SHA256)
wolfSSL 11:cee25a834751 32
wolfSSL 11:cee25a834751 33 #include <wolfssl/wolfcrypt/sha256.h>
wolfSSL 11:cee25a834751 34 #include <wolfssl/wolfcrypt/error-crypt.h>
wolfSSL 11:cee25a834751 35
wolfSSL 11:cee25a834751 36 /* fips wrapper calls, user can call direct */
wolfSSL 11:cee25a834751 37 #ifdef HAVE_FIPS
wolfSSL 11:cee25a834751 38
wolfSSL 11:cee25a834751 39 int wc_InitSha256(Sha256* sha)
wolfSSL 11:cee25a834751 40 {
wolfSSL 11:cee25a834751 41 return InitSha256_fips(sha);
wolfSSL 11:cee25a834751 42 }
wolfSSL 11:cee25a834751 43 int wc_InitSha256_ex(Sha256* sha, void* heap, int devId)
wolfSSL 11:cee25a834751 44 {
wolfSSL 11:cee25a834751 45 (void)heap;
wolfSSL 11:cee25a834751 46 (void)devId;
wolfSSL 11:cee25a834751 47 return InitSha256_fips(sha);
wolfSSL 11:cee25a834751 48 }
wolfSSL 11:cee25a834751 49 int wc_Sha256Update(Sha256* sha, const byte* data, word32 len)
wolfSSL 11:cee25a834751 50 {
wolfSSL 11:cee25a834751 51 return Sha256Update_fips(sha, data, len);
wolfSSL 11:cee25a834751 52 }
wolfSSL 11:cee25a834751 53 int wc_Sha256Final(Sha256* sha, byte* out)
wolfSSL 11:cee25a834751 54 {
wolfSSL 11:cee25a834751 55 return Sha256Final_fips(sha, out);
wolfSSL 11:cee25a834751 56 }
wolfSSL 11:cee25a834751 57 void wc_Sha256Free(Sha256* sha)
wolfSSL 11:cee25a834751 58 {
wolfSSL 11:cee25a834751 59 (void)sha;
wolfSSL 11:cee25a834751 60 /* Not supported in FIPS */
wolfSSL 11:cee25a834751 61 }
wolfSSL 11:cee25a834751 62
wolfSSL 11:cee25a834751 63 #else /* else build without fips */
wolfSSL 11:cee25a834751 64
wolfSSL 11:cee25a834751 65
wolfSSL 11:cee25a834751 66 #if defined(WOLFSSL_TI_HASH)
wolfSSL 11:cee25a834751 67 /* #include <wolfcrypt/src/port/ti/ti-hash.c> included by wc_port.c */
wolfSSL 11:cee25a834751 68 #else
wolfSSL 11:cee25a834751 69
wolfSSL 11:cee25a834751 70 #include <wolfssl/wolfcrypt/logging.h>
wolfSSL 11:cee25a834751 71
wolfSSL 11:cee25a834751 72 #ifdef NO_INLINE
wolfSSL 11:cee25a834751 73 #include <wolfssl/wolfcrypt/misc.h>
wolfSSL 11:cee25a834751 74 #else
wolfSSL 11:cee25a834751 75 #define WOLFSSL_MISC_INCLUDED
wolfSSL 11:cee25a834751 76 #include <wolfcrypt/src/misc.c>
wolfSSL 11:cee25a834751 77 #endif
wolfSSL 11:cee25a834751 78
wolfSSL 11:cee25a834751 79
wolfSSL 11:cee25a834751 80 #if defined(USE_INTEL_SPEEDUP)
wolfSSL 11:cee25a834751 81 #define HAVE_INTEL_AVX1
wolfSSL 11:cee25a834751 82 #define HAVE_INTEL_AVX2
wolfSSL 11:cee25a834751 83 #endif /* USE_INTEL_SPEEDUP */
wolfSSL 11:cee25a834751 84
wolfSSL 11:cee25a834751 85 #if defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 86 #define HAVE_INTEL_RORX
wolfSSL 11:cee25a834751 87 #endif
wolfSSL 11:cee25a834751 88
wolfSSL 11:cee25a834751 89
wolfSSL 11:cee25a834751 90 static int InitSha256(Sha256* sha256)
wolfSSL 11:cee25a834751 91 {
wolfSSL 11:cee25a834751 92 int ret = 0;
wolfSSL 11:cee25a834751 93
wolfSSL 11:cee25a834751 94 if (sha256 == NULL)
wolfSSL 11:cee25a834751 95 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 96
wolfSSL 11:cee25a834751 97 sha256->digest[0] = 0x6A09E667L;
wolfSSL 11:cee25a834751 98 sha256->digest[1] = 0xBB67AE85L;
wolfSSL 11:cee25a834751 99 sha256->digest[2] = 0x3C6EF372L;
wolfSSL 11:cee25a834751 100 sha256->digest[3] = 0xA54FF53AL;
wolfSSL 11:cee25a834751 101 sha256->digest[4] = 0x510E527FL;
wolfSSL 11:cee25a834751 102 sha256->digest[5] = 0x9B05688CL;
wolfSSL 11:cee25a834751 103 sha256->digest[6] = 0x1F83D9ABL;
wolfSSL 11:cee25a834751 104 sha256->digest[7] = 0x5BE0CD19L;
wolfSSL 11:cee25a834751 105
wolfSSL 11:cee25a834751 106 sha256->buffLen = 0;
wolfSSL 11:cee25a834751 107 sha256->loLen = 0;
wolfSSL 11:cee25a834751 108 sha256->hiLen = 0;
wolfSSL 11:cee25a834751 109
wolfSSL 11:cee25a834751 110 return ret;
wolfSSL 11:cee25a834751 111 }
wolfSSL 11:cee25a834751 112
wolfSSL 11:cee25a834751 113
wolfSSL 11:cee25a834751 114 /* Hardware Acceleration */
wolfSSL 11:cee25a834751 115 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 116
wolfSSL 11:cee25a834751 117 /* in case intel instructions aren't available, plus we need the K[] global */
wolfSSL 11:cee25a834751 118 #define NEED_SOFT_SHA256
wolfSSL 11:cee25a834751 119
wolfSSL 11:cee25a834751 120 /*****
wolfSSL 11:cee25a834751 121 Intel AVX1/AVX2 Macro Control Structure
wolfSSL 11:cee25a834751 122
wolfSSL 11:cee25a834751 123 #define HAVE_INTEL_AVX1
wolfSSL 11:cee25a834751 124 #define HAVE_INTEL_AVX2
wolfSSL 11:cee25a834751 125
wolfSSL 11:cee25a834751 126 #define HAVE_INTEL_RORX
wolfSSL 11:cee25a834751 127
wolfSSL 11:cee25a834751 128
wolfSSL 11:cee25a834751 129 int InitSha256(Sha256* sha256) {
wolfSSL 11:cee25a834751 130 Save/Recover XMM, YMM
wolfSSL 11:cee25a834751 131 ...
wolfSSL 11:cee25a834751 132 }
wolfSSL 11:cee25a834751 133
wolfSSL 11:cee25a834751 134 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 135 Transform(); Function prototype
wolfSSL 11:cee25a834751 136 #else
wolfSSL 11:cee25a834751 137 Transform() { }
wolfSSL 11:cee25a834751 138 int Sha256Final() {
wolfSSL 11:cee25a834751 139 Save/Recover XMM, YMM
wolfSSL 11:cee25a834751 140 ...
wolfSSL 11:cee25a834751 141 }
wolfSSL 11:cee25a834751 142 #endif
wolfSSL 11:cee25a834751 143
wolfSSL 11:cee25a834751 144 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 145 #if defined(HAVE_INTEL_RORX)
wolfSSL 11:cee25a834751 146 #define RND with rorx instruction
wolfSSL 11:cee25a834751 147 #else
wolfSSL 11:cee25a834751 148 #define RND
wolfSSL 11:cee25a834751 149 #endif
wolfSSL 11:cee25a834751 150 #endif
wolfSSL 11:cee25a834751 151
wolfSSL 11:cee25a834751 152 #if defined(HAVE_INTEL_AVX1)
wolfSSL 11:cee25a834751 153
wolfSSL 11:cee25a834751 154 #define XMM Instructions/inline asm
wolfSSL 11:cee25a834751 155
wolfSSL 11:cee25a834751 156 int Transform() {
wolfSSL 11:cee25a834751 157 Stitched Message Sched/Round
wolfSSL 11:cee25a834751 158 }
wolfSSL 11:cee25a834751 159
wolfSSL 11:cee25a834751 160 #elif defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 161
wolfSSL 11:cee25a834751 162 #define YMM Instructions/inline asm
wolfSSL 11:cee25a834751 163
wolfSSL 11:cee25a834751 164 int Transform() {
wolfSSL 11:cee25a834751 165 More granular Stitched Message Sched/Round
wolfSSL 11:cee25a834751 166 }
wolfSSL 11:cee25a834751 167
wolfSSL 11:cee25a834751 168 */
wolfSSL 11:cee25a834751 169
/* Each platform needs to query CPUID to see which of the AVX1 / AVX2 / BMI2
 * (and RDRAND / RDSEED) extensions are supported. Also, let's set up a macro
 * for proper linkage w/o ABI conflicts.
 *
 * cpuid(reg, leaf, sub) runs CPUID with EAX = leaf and ECX = sub and stores
 * the resulting EAX, EBX, ECX, EDX values in reg[0..3].
 */

#ifndef _MSC_VER
    #define cpuid(reg, leaf, sub)\
        __asm__ __volatile__ ("cpuid":\
            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
            "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else
    #include <intrin.h>
    /* Every call site passes (reg, leaf, sub); __cpuidex is the intrinsic
     * that accepts the sub-leaf, so the macro takes three arguments too. */
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)
#endif /* _MSC_VER */
wolfSSL 11:cee25a834751 187
wolfSSL 11:cee25a834751 188 #define EAX 0
wolfSSL 11:cee25a834751 189 #define EBX 1
wolfSSL 11:cee25a834751 190 #define ECX 2
wolfSSL 11:cee25a834751 191 #define EDX 3
wolfSSL 11:cee25a834751 192
wolfSSL 11:cee25a834751 193 #define CPUID_AVX1 0x1
wolfSSL 11:cee25a834751 194 #define CPUID_AVX2 0x2
wolfSSL 11:cee25a834751 195 #define CPUID_RDRAND 0x4
wolfSSL 11:cee25a834751 196 #define CPUID_RDSEED 0x8
wolfSSL 11:cee25a834751 197 #define CPUID_BMI2 0x10 /* MULX, RORX */
wolfSSL 11:cee25a834751 198
wolfSSL 11:cee25a834751 199 #define IS_INTEL_AVX1 (cpuid_flags & CPUID_AVX1)
wolfSSL 11:cee25a834751 200 #define IS_INTEL_AVX2 (cpuid_flags & CPUID_AVX2)
wolfSSL 11:cee25a834751 201 #define IS_INTEL_BMI2 (cpuid_flags & CPUID_BMI2)
wolfSSL 11:cee25a834751 202 #define IS_INTEL_RDRAND (cpuid_flags & CPUID_RDRAND)
wolfSSL 11:cee25a834751 203 #define IS_INTEL_RDSEED (cpuid_flags & CPUID_RDSEED)
wolfSSL 11:cee25a834751 204
wolfSSL 11:cee25a834751 205 static word32 cpuid_check = 0;
wolfSSL 11:cee25a834751 206 static word32 cpuid_flags = 0;
wolfSSL 11:cee25a834751 207
wolfSSL 11:cee25a834751 208 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
wolfSSL 11:cee25a834751 209 int got_intel_cpu=0;
wolfSSL 11:cee25a834751 210 unsigned int reg[5];
wolfSSL 11:cee25a834751 211
wolfSSL 11:cee25a834751 212 reg[4] = '\0';
wolfSSL 11:cee25a834751 213 cpuid(reg, 0, 0);
wolfSSL 11:cee25a834751 214 if (XMEMCMP((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 11:cee25a834751 215 XMEMCMP((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 11:cee25a834751 216 XMEMCMP((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 11:cee25a834751 217 got_intel_cpu = 1;
wolfSSL 11:cee25a834751 218 }
wolfSSL 11:cee25a834751 219 if (got_intel_cpu) {
wolfSSL 11:cee25a834751 220 cpuid(reg, leaf, sub);
wolfSSL 11:cee25a834751 221 return ((reg[num] >> bit) & 0x1);
wolfSSL 11:cee25a834751 222 }
wolfSSL 11:cee25a834751 223 return 0;
wolfSSL 11:cee25a834751 224 }
wolfSSL 11:cee25a834751 225
wolfSSL 11:cee25a834751 226 static int set_cpuid_flags(void) {
wolfSSL 11:cee25a834751 227 if (cpuid_check==0) {
wolfSSL 11:cee25a834751 228 if (cpuid_flag(1, 0, ECX, 28)){ cpuid_flags |= CPUID_AVX1; }
wolfSSL 11:cee25a834751 229 if (cpuid_flag(7, 0, EBX, 5)) { cpuid_flags |= CPUID_AVX2; }
wolfSSL 11:cee25a834751 230 if (cpuid_flag(7, 0, EBX, 8)) { cpuid_flags |= CPUID_BMI2; }
wolfSSL 11:cee25a834751 231 if (cpuid_flag(1, 0, ECX, 30)){ cpuid_flags |= CPUID_RDRAND; }
wolfSSL 11:cee25a834751 232 if (cpuid_flag(7, 0, EBX, 18)){ cpuid_flags |= CPUID_RDSEED; }
wolfSSL 11:cee25a834751 233 cpuid_check = 1;
wolfSSL 11:cee25a834751 234 return 0;
wolfSSL 11:cee25a834751 235 }
wolfSSL 11:cee25a834751 236 return 1;
wolfSSL 11:cee25a834751 237 }
wolfSSL 11:cee25a834751 238
wolfSSL 11:cee25a834751 239 /* #if defined(HAVE_INTEL_AVX1/2) at the tail of sha256 */
wolfSSL 11:cee25a834751 240 static int Transform(Sha256* sha256);
wolfSSL 11:cee25a834751 241 #if defined(HAVE_INTEL_AVX1)
wolfSSL 11:cee25a834751 242 static int Transform_AVX1(Sha256 *sha256);
wolfSSL 11:cee25a834751 243 #endif
wolfSSL 11:cee25a834751 244 #if defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 245 static int Transform_AVX2(Sha256 *sha256);
wolfSSL 11:cee25a834751 246 static int Transform_AVX1_RORX(Sha256 *sha256);
wolfSSL 11:cee25a834751 247 #endif
wolfSSL 11:cee25a834751 248 static int (*Transform_p)(Sha256* sha256) /* = _Transform */;
wolfSSL 11:cee25a834751 249 #define XTRANSFORM(sha256, B) (*Transform_p)(sha256)
wolfSSL 11:cee25a834751 250
wolfSSL 11:cee25a834751 251 static void set_Transform(void) {
wolfSSL 11:cee25a834751 252 if (set_cpuid_flags()) return;
wolfSSL 11:cee25a834751 253
wolfSSL 11:cee25a834751 254 #if defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 255 if (IS_INTEL_AVX2 && IS_INTEL_BMI2) {
wolfSSL 11:cee25a834751 256 Transform_p = Transform_AVX1_RORX; return;
wolfSSL 11:cee25a834751 257 Transform_p = Transform_AVX2;
wolfSSL 11:cee25a834751 258 /* for avoiding warning,"not used" */
wolfSSL 11:cee25a834751 259 }
wolfSSL 11:cee25a834751 260 #endif
wolfSSL 11:cee25a834751 261 #if defined(HAVE_INTEL_AVX1)
wolfSSL 11:cee25a834751 262 Transform_p = ((IS_INTEL_AVX1) ? Transform_AVX1 : Transform); return;
wolfSSL 11:cee25a834751 263 #endif
wolfSSL 11:cee25a834751 264 Transform_p = Transform; return;
wolfSSL 11:cee25a834751 265 }
wolfSSL 11:cee25a834751 266
wolfSSL 11:cee25a834751 267 /* Dummy for saving MM_REGs on behalf of Transform */
wolfSSL 11:cee25a834751 268 #if defined(HAVE_INTEL_AVX2) && !defined(HAVE_INTEL_AVX1)
wolfSSL 11:cee25a834751 269 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
wolfSSL 11:cee25a834751 270 "%ymm4","%ymm5","%ymm6","%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15")
wolfSSL 11:cee25a834751 271 #elif defined(HAVE_INTEL_AVX1)
wolfSSL 11:cee25a834751 272 #define SAVE_XMM_YMM __asm__ volatile("or %%r8d, %%r8d":::\
wolfSSL 11:cee25a834751 273 "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10",\
wolfSSL 11:cee25a834751 274 "xmm11","xmm12","xmm13","xmm14","xmm15")
wolfSSL 11:cee25a834751 275 #endif
wolfSSL 11:cee25a834751 276
wolfSSL 11:cee25a834751 277 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
wolfSSL 11:cee25a834751 278 {
wolfSSL 11:cee25a834751 279 int ret = 0;
wolfSSL 11:cee25a834751 280 if (sha256 == NULL)
wolfSSL 11:cee25a834751 281 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 282
wolfSSL 11:cee25a834751 283 sha256->heap = heap;
wolfSSL 11:cee25a834751 284
wolfSSL 11:cee25a834751 285 ret = InitSha256(sha256);
wolfSSL 11:cee25a834751 286 if (ret != 0)
wolfSSL 11:cee25a834751 287 return ret;
wolfSSL 11:cee25a834751 288
wolfSSL 11:cee25a834751 289 /* choose best Transform function under this runtime environment */
wolfSSL 11:cee25a834751 290 set_Transform();
wolfSSL 11:cee25a834751 291
wolfSSL 11:cee25a834751 292 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
wolfSSL 11:cee25a834751 293 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
wolfSSL 11:cee25a834751 294 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
wolfSSL 11:cee25a834751 295 #else
wolfSSL 11:cee25a834751 296 (void)devId;
wolfSSL 11:cee25a834751 297 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 298
wolfSSL 11:cee25a834751 299 return ret;
wolfSSL 11:cee25a834751 300 }
wolfSSL 11:cee25a834751 301
wolfSSL 11:cee25a834751 302 #elif defined(FREESCALE_LTC_SHA)
wolfSSL 11:cee25a834751 303 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
wolfSSL 11:cee25a834751 304 {
wolfSSL 11:cee25a834751 305 (void)heap;
wolfSSL 11:cee25a834751 306 (void)devId;
wolfSSL 11:cee25a834751 307
wolfSSL 11:cee25a834751 308 LTC_HASH_Init(LTC_BASE, &sha256->ctx, kLTC_Sha256, NULL, 0);
wolfSSL 11:cee25a834751 309
wolfSSL 11:cee25a834751 310 return 0;
wolfSSL 11:cee25a834751 311 }
wolfSSL 11:cee25a834751 312
wolfSSL 11:cee25a834751 313 #elif defined(FREESCALE_MMCAU_SHA)
wolfSSL 11:cee25a834751 314 #include "fsl_mmcau.h"
wolfSSL 11:cee25a834751 315 #define XTRANSFORM(sha256, B) Transform(sha256, B)
wolfSSL 11:cee25a834751 316
wolfSSL 11:cee25a834751 317 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
wolfSSL 11:cee25a834751 318 {
wolfSSL 11:cee25a834751 319 int ret = 0;
wolfSSL 11:cee25a834751 320
wolfSSL 11:cee25a834751 321 (void)heap;
wolfSSL 11:cee25a834751 322 (void)devId;
wolfSSL 11:cee25a834751 323
wolfSSL 11:cee25a834751 324 ret = wolfSSL_CryptHwMutexLock();
wolfSSL 11:cee25a834751 325 if (ret != 0) {
wolfSSL 11:cee25a834751 326 return ret;
wolfSSL 11:cee25a834751 327 }
wolfSSL 11:cee25a834751 328 MMCAU_SHA256_InitializeOutput((uint32_t*)sha256->digest);
wolfSSL 11:cee25a834751 329 wolfSSL_CryptHwMutexUnLock();
wolfSSL 11:cee25a834751 330
wolfSSL 11:cee25a834751 331 sha256->buffLen = 0;
wolfSSL 11:cee25a834751 332 sha256->loLen = 0;
wolfSSL 11:cee25a834751 333 sha256->hiLen = 0;
wolfSSL 11:cee25a834751 334
wolfSSL 11:cee25a834751 335 return ret;
wolfSSL 11:cee25a834751 336 }
wolfSSL 11:cee25a834751 337
wolfSSL 11:cee25a834751 338 static int Transform(Sha256* sha256, byte* buf)
wolfSSL 11:cee25a834751 339 {
wolfSSL 11:cee25a834751 340 int ret = wolfSSL_CryptHwMutexLock();
wolfSSL 11:cee25a834751 341 if (ret == 0) {
wolfSSL 11:cee25a834751 342 MMCAU_SHA256_HashN(buf, 1, sha256->digest);
wolfSSL 11:cee25a834751 343 wolfSSL_CryptHwMutexUnLock();
wolfSSL 11:cee25a834751 344 }
wolfSSL 11:cee25a834751 345 return ret;
wolfSSL 11:cee25a834751 346 }
wolfSSL 11:cee25a834751 347
wolfSSL 11:cee25a834751 348 #elif defined(WOLFSSL_PIC32MZ_HASH)
wolfSSL 11:cee25a834751 349 #define NEED_SOFT_SHA256
wolfSSL 11:cee25a834751 350
wolfSSL 11:cee25a834751 351 #define wc_InitSha256 wc_InitSha256_sw
wolfSSL 11:cee25a834751 352 #define wc_Sha256Update wc_Sha256Update_sw
wolfSSL 11:cee25a834751 353 #define wc_Sha256Final wc_Sha256Final_sw
wolfSSL 11:cee25a834751 354
wolfSSL 11:cee25a834751 355 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
wolfSSL 11:cee25a834751 356 {
wolfSSL 11:cee25a834751 357 if (sha256 == NULL)
wolfSSL 11:cee25a834751 358 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 359
wolfSSL 11:cee25a834751 360 sha256->heap = heap;
wolfSSL 11:cee25a834751 361
wolfSSL 11:cee25a834751 362 return InitSha256(sha256);
wolfSSL 11:cee25a834751 363 }
wolfSSL 11:cee25a834751 364
wolfSSL 11:cee25a834751 365 #else
wolfSSL 11:cee25a834751 366 #define NEED_SOFT_SHA256
wolfSSL 11:cee25a834751 367
wolfSSL 11:cee25a834751 368 #define XTRANSFORM(sha256, B) Transform(sha256)
wolfSSL 11:cee25a834751 369
wolfSSL 11:cee25a834751 370 int wc_InitSha256_ex(Sha256* sha256, void* heap, int devId)
wolfSSL 11:cee25a834751 371 {
wolfSSL 11:cee25a834751 372 int ret = 0;
wolfSSL 11:cee25a834751 373 if (sha256 == NULL)
wolfSSL 11:cee25a834751 374 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 375
wolfSSL 11:cee25a834751 376 sha256->heap = heap;
wolfSSL 11:cee25a834751 377
wolfSSL 11:cee25a834751 378 ret = InitSha256(sha256);
wolfSSL 11:cee25a834751 379 if (ret != 0)
wolfSSL 11:cee25a834751 380 return ret;
wolfSSL 11:cee25a834751 381
wolfSSL 11:cee25a834751 382 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
wolfSSL 11:cee25a834751 383 ret = wolfAsync_DevCtxInit(&sha256->asyncDev,
wolfSSL 11:cee25a834751 384 WOLFSSL_ASYNC_MARKER_SHA256, sha256->heap, devId);
wolfSSL 11:cee25a834751 385 #else
wolfSSL 11:cee25a834751 386 (void)devId;
wolfSSL 11:cee25a834751 387 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 388
wolfSSL 11:cee25a834751 389 return ret;
wolfSSL 11:cee25a834751 390 }
wolfSSL 11:cee25a834751 391 #endif /* End Hardware Acceleration */
wolfSSL 11:cee25a834751 392
wolfSSL 11:cee25a834751 393 #ifndef SAVE_XMM_YMM
wolfSSL 11:cee25a834751 394 #define SAVE_XMM_YMM
wolfSSL 11:cee25a834751 395 #endif
wolfSSL 11:cee25a834751 396
wolfSSL 11:cee25a834751 397 #ifdef NEED_SOFT_SHA256
wolfSSL 11:cee25a834751 398
wolfSSL 11:cee25a834751 399 static const ALIGN32 word32 K[64] = {
wolfSSL 11:cee25a834751 400 0x428A2F98L, 0x71374491L, 0xB5C0FBCFL, 0xE9B5DBA5L, 0x3956C25BL,
wolfSSL 11:cee25a834751 401 0x59F111F1L, 0x923F82A4L, 0xAB1C5ED5L, 0xD807AA98L, 0x12835B01L,
wolfSSL 11:cee25a834751 402 0x243185BEL, 0x550C7DC3L, 0x72BE5D74L, 0x80DEB1FEL, 0x9BDC06A7L,
wolfSSL 11:cee25a834751 403 0xC19BF174L, 0xE49B69C1L, 0xEFBE4786L, 0x0FC19DC6L, 0x240CA1CCL,
wolfSSL 11:cee25a834751 404 0x2DE92C6FL, 0x4A7484AAL, 0x5CB0A9DCL, 0x76F988DAL, 0x983E5152L,
wolfSSL 11:cee25a834751 405 0xA831C66DL, 0xB00327C8L, 0xBF597FC7L, 0xC6E00BF3L, 0xD5A79147L,
wolfSSL 11:cee25a834751 406 0x06CA6351L, 0x14292967L, 0x27B70A85L, 0x2E1B2138L, 0x4D2C6DFCL,
wolfSSL 11:cee25a834751 407 0x53380D13L, 0x650A7354L, 0x766A0ABBL, 0x81C2C92EL, 0x92722C85L,
wolfSSL 11:cee25a834751 408 0xA2BFE8A1L, 0xA81A664BL, 0xC24B8B70L, 0xC76C51A3L, 0xD192E819L,
wolfSSL 11:cee25a834751 409 0xD6990624L, 0xF40E3585L, 0x106AA070L, 0x19A4C116L, 0x1E376C08L,
wolfSSL 11:cee25a834751 410 0x2748774CL, 0x34B0BCB5L, 0x391C0CB3L, 0x4ED8AA4AL, 0x5B9CCA4FL,
wolfSSL 11:cee25a834751 411 0x682E6FF3L, 0x748F82EEL, 0x78A5636FL, 0x84C87814L, 0x8CC70208L,
wolfSSL 11:cee25a834751 412 0x90BEFFFAL, 0xA4506CEBL, 0xBEF9A3F7L, 0xC67178F2L
wolfSSL 11:cee25a834751 413 };
wolfSSL 11:cee25a834751 414
wolfSSL 11:cee25a834751 415 #define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
wolfSSL 11:cee25a834751 416 #define Maj(x,y,z) ((((x) | (y)) & (z)) | ((x) & (y)))
wolfSSL 11:cee25a834751 417 #define R(x, n) (((x) & 0xFFFFFFFFU) >> (n))
wolfSSL 11:cee25a834751 418
wolfSSL 11:cee25a834751 419 #define S(x, n) rotrFixed(x, n)
wolfSSL 11:cee25a834751 420 #define Sigma0(x) (S(x, 2) ^ S(x, 13) ^ S(x, 22))
wolfSSL 11:cee25a834751 421 #define Sigma1(x) (S(x, 6) ^ S(x, 11) ^ S(x, 25))
wolfSSL 11:cee25a834751 422 #define Gamma0(x) (S(x, 7) ^ S(x, 18) ^ R(x, 3))
wolfSSL 11:cee25a834751 423 #define Gamma1(x) (S(x, 17) ^ S(x, 19) ^ R(x, 10))
wolfSSL 11:cee25a834751 424
wolfSSL 11:cee25a834751 425 #define RND(a,b,c,d,e,f,g,h,i) \
wolfSSL 11:cee25a834751 426 t0 = (h) + Sigma1((e)) + Ch((e), (f), (g)) + K[(i)] + W[(i)]; \
wolfSSL 11:cee25a834751 427 t1 = Sigma0((a)) + Maj((a), (b), (c)); \
wolfSSL 11:cee25a834751 428 (d) += t0; \
wolfSSL 11:cee25a834751 429 (h) = t0 + t1;
wolfSSL 11:cee25a834751 430
wolfSSL 11:cee25a834751 431 static int Transform(Sha256* sha256)
wolfSSL 11:cee25a834751 432 {
wolfSSL 11:cee25a834751 433 word32 S[8], t0, t1;
wolfSSL 11:cee25a834751 434 int i;
wolfSSL 11:cee25a834751 435
wolfSSL 11:cee25a834751 436 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 11:cee25a834751 437 word32* W;
wolfSSL 11:cee25a834751 438
wolfSSL 11:cee25a834751 439 W = (word32*)XMALLOC(sizeof(word32) * SHA256_BLOCK_SIZE, NULL,
wolfSSL 11:cee25a834751 440 DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 11:cee25a834751 441 if (W == NULL)
wolfSSL 11:cee25a834751 442 return MEMORY_E;
wolfSSL 11:cee25a834751 443 #else
wolfSSL 11:cee25a834751 444 word32 W[SHA256_BLOCK_SIZE];
wolfSSL 11:cee25a834751 445 #endif
wolfSSL 11:cee25a834751 446
wolfSSL 11:cee25a834751 447 /* Copy context->state[] to working vars */
wolfSSL 11:cee25a834751 448 for (i = 0; i < 8; i++)
wolfSSL 11:cee25a834751 449 S[i] = sha256->digest[i];
wolfSSL 11:cee25a834751 450
wolfSSL 11:cee25a834751 451 for (i = 0; i < 16; i++)
wolfSSL 11:cee25a834751 452 W[i] = sha256->buffer[i];
wolfSSL 11:cee25a834751 453
wolfSSL 11:cee25a834751 454 for (i = 16; i < SHA256_BLOCK_SIZE; i++)
wolfSSL 11:cee25a834751 455 W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16];
wolfSSL 11:cee25a834751 456
wolfSSL 11:cee25a834751 457 for (i = 0; i < SHA256_BLOCK_SIZE; i += 8) {
wolfSSL 11:cee25a834751 458 RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
wolfSSL 11:cee25a834751 459 RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
wolfSSL 11:cee25a834751 460 RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
wolfSSL 11:cee25a834751 461 RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
wolfSSL 11:cee25a834751 462 RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
wolfSSL 11:cee25a834751 463 RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
wolfSSL 11:cee25a834751 464 RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
wolfSSL 11:cee25a834751 465 RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
wolfSSL 11:cee25a834751 466 }
wolfSSL 11:cee25a834751 467
wolfSSL 11:cee25a834751 468 /* Add the working vars back into digest state[] */
wolfSSL 11:cee25a834751 469 for (i = 0; i < 8; i++) {
wolfSSL 11:cee25a834751 470 sha256->digest[i] += S[i];
wolfSSL 11:cee25a834751 471 }
wolfSSL 11:cee25a834751 472
wolfSSL 11:cee25a834751 473 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 11:cee25a834751 474 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 11:cee25a834751 475 #endif
wolfSSL 11:cee25a834751 476
wolfSSL 11:cee25a834751 477 return 0;
wolfSSL 11:cee25a834751 478 }
wolfSSL 11:cee25a834751 479 #endif
wolfSSL 11:cee25a834751 480 /* End wc_ software implementation */
wolfSSL 11:cee25a834751 481
wolfSSL 11:cee25a834751 482
wolfSSL 11:cee25a834751 483 #ifdef XTRANSFORM
wolfSSL 11:cee25a834751 484
wolfSSL 11:cee25a834751 485 static INLINE void AddLength(Sha256* sha256, word32 len)
wolfSSL 11:cee25a834751 486 {
wolfSSL 11:cee25a834751 487 word32 tmp = sha256->loLen;
wolfSSL 11:cee25a834751 488 if ( (sha256->loLen += len) < tmp)
wolfSSL 11:cee25a834751 489 sha256->hiLen++; /* carry low to high */
wolfSSL 11:cee25a834751 490 }
wolfSSL 11:cee25a834751 491
wolfSSL 11:cee25a834751 492 static INLINE int Sha256Update(Sha256* sha256, const byte* data, word32 len)
wolfSSL 11:cee25a834751 493 {
wolfSSL 11:cee25a834751 494 int ret = 0;
wolfSSL 11:cee25a834751 495 byte* local;
wolfSSL 11:cee25a834751 496
wolfSSL 11:cee25a834751 497 if (sha256 == NULL || (data == NULL && len > 0)) {
wolfSSL 11:cee25a834751 498 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 499 }
wolfSSL 11:cee25a834751 500
wolfSSL 11:cee25a834751 501 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
wolfSSL 11:cee25a834751 502 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
wolfSSL 11:cee25a834751 503 #if defined(HAVE_INTEL_QA)
wolfSSL 11:cee25a834751 504 return IntelQaSymSha256(&sha256->asyncDev, NULL, data, len);
wolfSSL 11:cee25a834751 505 #endif
wolfSSL 11:cee25a834751 506 }
wolfSSL 11:cee25a834751 507 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 508
wolfSSL 11:cee25a834751 509 /* do block size increments */
wolfSSL 11:cee25a834751 510 local = (byte*)sha256->buffer;
wolfSSL 11:cee25a834751 511
wolfSSL 11:cee25a834751 512 /* check that internal buffLen is valid */
wolfSSL 11:cee25a834751 513 if (sha256->buffLen >= SHA256_BLOCK_SIZE)
wolfSSL 11:cee25a834751 514 return BUFFER_E;
wolfSSL 11:cee25a834751 515
wolfSSL 11:cee25a834751 516 SAVE_XMM_YMM; /* for Intel AVX */
wolfSSL 11:cee25a834751 517
wolfSSL 11:cee25a834751 518 while (len) {
wolfSSL 11:cee25a834751 519 word32 add = min(len, SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 11:cee25a834751 520 XMEMCPY(&local[sha256->buffLen], data, add);
wolfSSL 11:cee25a834751 521
wolfSSL 11:cee25a834751 522 sha256->buffLen += add;
wolfSSL 11:cee25a834751 523 data += add;
wolfSSL 11:cee25a834751 524 len -= add;
wolfSSL 11:cee25a834751 525
wolfSSL 11:cee25a834751 526 if (sha256->buffLen == SHA256_BLOCK_SIZE) {
wolfSSL 11:cee25a834751 527 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
wolfSSL 11:cee25a834751 528 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 529 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 11:cee25a834751 530 #endif
wolfSSL 11:cee25a834751 531 {
wolfSSL 11:cee25a834751 532 ByteReverseWords(sha256->buffer, sha256->buffer,
wolfSSL 11:cee25a834751 533 SHA256_BLOCK_SIZE);
wolfSSL 11:cee25a834751 534 }
wolfSSL 11:cee25a834751 535 #endif
wolfSSL 11:cee25a834751 536 ret = XTRANSFORM(sha256, local);
wolfSSL 11:cee25a834751 537 if (ret != 0) {
wolfSSL 11:cee25a834751 538 break;
wolfSSL 11:cee25a834751 539 }
wolfSSL 11:cee25a834751 540
wolfSSL 11:cee25a834751 541 AddLength(sha256, SHA256_BLOCK_SIZE);
wolfSSL 11:cee25a834751 542 sha256->buffLen = 0;
wolfSSL 11:cee25a834751 543 }
wolfSSL 11:cee25a834751 544 }
wolfSSL 11:cee25a834751 545
wolfSSL 11:cee25a834751 546 return ret;
wolfSSL 11:cee25a834751 547 }
wolfSSL 11:cee25a834751 548
wolfSSL 11:cee25a834751 549 int wc_Sha256Update(Sha256* sha256, const byte* data, word32 len)
wolfSSL 11:cee25a834751 550 {
wolfSSL 11:cee25a834751 551 return Sha256Update(sha256, data, len);
wolfSSL 11:cee25a834751 552 }
wolfSSL 11:cee25a834751 553
wolfSSL 11:cee25a834751 554 static INLINE int Sha256Final(Sha256* sha256)
wolfSSL 11:cee25a834751 555 {
wolfSSL 11:cee25a834751 556 int ret;
wolfSSL 11:cee25a834751 557 byte* local = (byte*)sha256->buffer;
wolfSSL 11:cee25a834751 558
wolfSSL 11:cee25a834751 559 SAVE_XMM_YMM; /* for Intel AVX */
wolfSSL 11:cee25a834751 560
wolfSSL 11:cee25a834751 561 AddLength(sha256, sha256->buffLen); /* before adding pads */
wolfSSL 11:cee25a834751 562 local[sha256->buffLen++] = 0x80; /* add 1 */
wolfSSL 11:cee25a834751 563
wolfSSL 11:cee25a834751 564 /* pad with zeros */
wolfSSL 11:cee25a834751 565 if (sha256->buffLen > SHA256_PAD_SIZE) {
wolfSSL 11:cee25a834751 566 XMEMSET(&local[sha256->buffLen], 0,
wolfSSL 11:cee25a834751 567 SHA256_BLOCK_SIZE - sha256->buffLen);
wolfSSL 11:cee25a834751 568 sha256->buffLen += SHA256_BLOCK_SIZE - sha256->buffLen;
wolfSSL 11:cee25a834751 569
wolfSSL 11:cee25a834751 570 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
wolfSSL 11:cee25a834751 571 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 572 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 11:cee25a834751 573 #endif
wolfSSL 11:cee25a834751 574 {
wolfSSL 11:cee25a834751 575 ByteReverseWords(sha256->buffer, sha256->buffer,
wolfSSL 11:cee25a834751 576 SHA256_BLOCK_SIZE);
wolfSSL 11:cee25a834751 577 }
wolfSSL 11:cee25a834751 578 #endif
wolfSSL 11:cee25a834751 579
wolfSSL 11:cee25a834751 580 ret = XTRANSFORM(sha256, local);
wolfSSL 11:cee25a834751 581 if (ret != 0)
wolfSSL 11:cee25a834751 582 return ret;
wolfSSL 11:cee25a834751 583
wolfSSL 11:cee25a834751 584 sha256->buffLen = 0;
wolfSSL 11:cee25a834751 585 }
wolfSSL 11:cee25a834751 586 XMEMSET(&local[sha256->buffLen], 0, SHA256_PAD_SIZE - sha256->buffLen);
wolfSSL 11:cee25a834751 587
wolfSSL 11:cee25a834751 588 /* put lengths in bits */
wolfSSL 11:cee25a834751 589 sha256->hiLen = (sha256->loLen >> (8 * sizeof(sha256->loLen) - 3)) +
wolfSSL 11:cee25a834751 590 (sha256->hiLen << 3);
wolfSSL 11:cee25a834751 591 sha256->loLen = sha256->loLen << 3;
wolfSSL 11:cee25a834751 592
wolfSSL 11:cee25a834751 593 /* store lengths */
wolfSSL 11:cee25a834751 594 #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
wolfSSL 11:cee25a834751 595 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 596 if (!IS_INTEL_AVX1 && !IS_INTEL_AVX2)
wolfSSL 11:cee25a834751 597 #endif
wolfSSL 11:cee25a834751 598 {
wolfSSL 11:cee25a834751 599 ByteReverseWords(sha256->buffer, sha256->buffer,
wolfSSL 11:cee25a834751 600 SHA256_BLOCK_SIZE);
wolfSSL 11:cee25a834751 601 }
wolfSSL 11:cee25a834751 602 #endif
wolfSSL 11:cee25a834751 603 /* ! length ordering dependent on digest endian type ! */
wolfSSL 11:cee25a834751 604 XMEMCPY(&local[SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
wolfSSL 11:cee25a834751 605 XMEMCPY(&local[SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
wolfSSL 11:cee25a834751 606 sizeof(word32));
wolfSSL 11:cee25a834751 607
wolfSSL 11:cee25a834751 608 #if defined(FREESCALE_MMCAU_SHA) || defined(HAVE_INTEL_AVX1) || \
wolfSSL 11:cee25a834751 609 defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 610 /* Kinetis requires only these bytes reversed */
wolfSSL 11:cee25a834751 611 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 612 if (IS_INTEL_AVX1 || IS_INTEL_AVX2)
wolfSSL 11:cee25a834751 613 #endif
wolfSSL 11:cee25a834751 614 {
wolfSSL 11:cee25a834751 615 ByteReverseWords(
wolfSSL 11:cee25a834751 616 &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)],
wolfSSL 11:cee25a834751 617 &sha256->buffer[SHA256_PAD_SIZE / sizeof(word32)],
wolfSSL 11:cee25a834751 618 2 * sizeof(word32));
wolfSSL 11:cee25a834751 619 }
wolfSSL 11:cee25a834751 620 #endif
wolfSSL 11:cee25a834751 621
wolfSSL 11:cee25a834751 622 return XTRANSFORM(sha256, local);
wolfSSL 11:cee25a834751 623 }
wolfSSL 11:cee25a834751 624
/* Finalize the SHA-256 computation and hand the digest to the caller.
 *
 * sha256  hash context; must not be NULL
 * hash    destination buffer; SHA256_DIGEST_SIZE bytes are copied into it
 *
 * Returns BAD_FUNC_ARG if either pointer is NULL, the non-zero code from
 * Sha256Final() if that step fails, and otherwise whatever InitSha256()
 * returns when the context is reset at the end.
 */
wolfSSL 11:cee25a834751 625 int wc_Sha256Final(Sha256* sha256, byte* hash)
wolfSSL 11:cee25a834751 626 {
wolfSSL 11:cee25a834751 627 int ret;
wolfSSL 11:cee25a834751 628
wolfSSL 11:cee25a834751 629 if (sha256 == NULL || hash == NULL) {
wolfSSL 11:cee25a834751 630 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 631 }
wolfSSL 11:cee25a834751 632
/* Async builds: when the context carries the SHA-256 async marker and Intel
 * QuickAssist support is compiled in, the accelerator result is returned
 * directly. Without HAVE_INTEL_QA the if-body is empty and execution falls
 * through to the code below. */
wolfSSL 11:cee25a834751 633 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
wolfSSL 11:cee25a834751 634 if (sha256->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA256) {
wolfSSL 11:cee25a834751 635 #if defined(HAVE_INTEL_QA)
wolfSSL 11:cee25a834751 636 return IntelQaSymSha256(&sha256->asyncDev, hash, NULL,
wolfSSL 11:cee25a834751 637 SHA256_DIGEST_SIZE);
wolfSSL 11:cee25a834751 638 #endif
wolfSSL 11:cee25a834751 639 }
wolfSSL 11:cee25a834751 640 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 641
/* any failure from the internal finalization is passed straight back */
wolfSSL 11:cee25a834751 642 ret = Sha256Final(sha256);
wolfSSL 11:cee25a834751 643 if (ret != 0)
wolfSSL 11:cee25a834751 644 return ret;
wolfSSL 11:cee25a834751 645
/* on little-endian builds the digest words are byte-reversed in place before
 * being copied out as a byte string */
wolfSSL 11:cee25a834751 646 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 11:cee25a834751 647 ByteReverseWords(sha256->digest, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 11:cee25a834751 648 #endif
wolfSSL 11:cee25a834751 649 XMEMCPY(hash, sha256->digest, SHA256_DIGEST_SIZE);
wolfSSL 11:cee25a834751 650
wolfSSL 11:cee25a834751 651 return InitSha256(sha256); /* reset state */
wolfSSL 11:cee25a834751 652 }
wolfSSL 11:cee25a834751 653
wolfSSL 11:cee25a834751 654 #endif /* XTRANSFORM */
wolfSSL 11:cee25a834751 655
wolfSSL 11:cee25a834751 656
wolfSSL 11:cee25a834751 657 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 658
/* Load eight registers from the running digest.
 *
 * For each position 0..7 the digest word is read into the local d and then
 * moved, by its own asm statement, into the register whose name is the
 * stringified macro argument in that position. Expands to a braced block and
 * references a variable named sha256 that must be in scope where it is used.
 * NOTE(review): the target registers appear only in the clobber list, so
 * nothing here tells the compiler their contents must survive until later asm
 * statements - confirm how callers guarantee that.
 */
wolfSSL 11:cee25a834751 659 #define _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 660 { word32 d;\
wolfSSL 11:cee25a834751 661 d = sha256->digest[0]; __asm__ volatile("movl %0, %"#S_0::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 662 d = sha256->digest[1]; __asm__ volatile("movl %0, %"#S_1::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 663 d = sha256->digest[2]; __asm__ volatile("movl %0, %"#S_2::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 664 d = sha256->digest[3]; __asm__ volatile("movl %0, %"#S_3::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 665 d = sha256->digest[4]; __asm__ volatile("movl %0, %"#S_4::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 666 d = sha256->digest[5]; __asm__ volatile("movl %0, %"#S_5::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 667 d = sha256->digest[6]; __asm__ volatile("movl %0, %"#S_6::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 668 d = sha256->digest[7]; __asm__ volatile("movl %0, %"#S_7::"r"(d):SSE_REGs);\
wolfSSL 11:cee25a834751 669 }
wolfSSL 11:cee25a834751 670
/* Fold eight registers back into the running digest.
 *
 * For each position 0..7 the register named by the stringified argument is
 * copied into the local d by its own asm statement and d is then added to the
 * matching digest word (digest[i] += d). Expands to a braced block and
 * references a variable named sha256 that must be in scope where it is used.
 */
wolfSSL 11:cee25a834751 671 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 672 { word32 d; \
wolfSSL 11:cee25a834751 673 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs); sha256->digest[0] += d;\
wolfSSL 11:cee25a834751 674 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs); sha256->digest[1] += d;\
wolfSSL 11:cee25a834751 675 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs); sha256->digest[2] += d;\
wolfSSL 11:cee25a834751 676 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs); sha256->digest[3] += d;\
wolfSSL 11:cee25a834751 677 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs); sha256->digest[4] += d;\
wolfSSL 11:cee25a834751 678 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs); sha256->digest[5] += d;\
wolfSSL 11:cee25a834751 679 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs); sha256->digest[6] += d;\
wolfSSL 11:cee25a834751 680 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs); sha256->digest[7] += d;\
wolfSSL 11:cee25a834751 681 }
wolfSSL 11:cee25a834751 682
wolfSSL 11:cee25a834751 683
/* Forwarding wrappers for _DigestToReg / _RegToDigest.
 * The arguments are not operands of # at this level, so an argument that is
 * itself a macro (such as a register-name macro) is expanded before the inner
 * macro stringifies it.
 */
wolfSSL 11:cee25a834751 684 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 685 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 11:cee25a834751 686
wolfSSL 11:cee25a834751 687 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 688 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 11:cee25a834751 689
wolfSSL 11:cee25a834751 690
/* 32-bit register names bound to the eight state slots S_0..S_7. Each value
 * already carries one '%', so after stringification and concatenation with a
 * preceding "%" it becomes "%%reg" inside an extended asm template. */
wolfSSL 11:cee25a834751 691 #define S_0 %r15d
wolfSSL 11:cee25a834751 692 #define S_1 %r10d
wolfSSL 11:cee25a834751 693 #define S_2 %r11d
wolfSSL 11:cee25a834751 694 #define S_3 %r12d
wolfSSL 11:cee25a834751 695 #define S_4 %r13d
wolfSSL 11:cee25a834751 696 #define S_5 %r14d
wolfSSL 11:cee25a834751 697 #define S_6 %ebx
wolfSSL 11:cee25a834751 698 #define S_7 %r9d
wolfSSL 11:cee25a834751 699
/* Clobber list shared by the asm statements below. Despite the name it lists
 * general-purpose registers only: the scratch registers edi/ecx/esi/edx/r8
 * plus every register used for S_0..S_7. */
wolfSSL 11:cee25a834751 700 #define SSE_REGs "%edi", "%ecx", "%esi", "%edx", "%ebx","%r8","%r9","%r10","%r11","%r12","%r13","%r14","%r15"
wolfSSL 11:cee25a834751 701
wolfSSL 11:cee25a834751 702 #if defined(HAVE_INTEL_RORX)
/* One SHA-256 round split into eight sub-steps, using rorx for the rotations.
 *
 * a..h are register names that get stringified into the templates; i indexes
 * W_K, which must be visible where the macros are expanded. edx, edi, esi and
 * r8d are used as scratch and carry intermediate values from one sub-step to
 * the next, so the sub-steps of a round must run in order 1..8 with those
 * registers left untouched in between. In the per-line comments "x>>n" stands
 * for a 32-bit rotate right by n, not a logical shift.
 */
wolfSSL 11:cee25a834751 703 #define RND_STEP_RORX_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 704 __asm__ volatile("rorx $6, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 11:cee25a834751 705
wolfSSL 11:cee25a834751 706 #define RND_STEP_RORX_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 707 __asm__ volatile("rorx $11, %"#e",%%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 11:cee25a834751 708 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 11:cee25a834751 709 __asm__ volatile("rorx $25, %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 11:cee25a834751 710
wolfSSL 11:cee25a834751 711 #define RND_STEP_RORX_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 712 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 11:cee25a834751 713 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 11:cee25a834751 714 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 11:cee25a834751 715 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 11:cee25a834751 716 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 11:cee25a834751 717
wolfSSL 11:cee25a834751 718 #define RND_STEP_RORX_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 719 /*__asm__ volatile("movl %0, %%edx\n\t"::"m"(w_k):"%edx");*/\
wolfSSL 11:cee25a834751 720 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 11:cee25a834751 721 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 11:cee25a834751 722 __asm__ volatile("rorx $2, %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 11:cee25a834751 723 __asm__ volatile("rorx $13, %"#a", %%edi\n\t":::"%edi",SSE_REGs);/* edi = a>>13 */\
wolfSSL 11:cee25a834751 724
wolfSSL 11:cee25a834751 725 #define RND_STEP_RORX_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 726 __asm__ volatile("rorx $22, %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 11:cee25a834751 727 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs);/* edi = (a>>2) ^ (a>>13) */\
wolfSSL 11:cee25a834751 728 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma0(a) */\
wolfSSL 11:cee25a834751 729
wolfSSL 11:cee25a834751 730 #define RND_STEP_RORX_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 731 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 11:cee25a834751 732 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 11:cee25a834751 733 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c*/\
wolfSSL 11:cee25a834751 734 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 11:cee25a834751 735
wolfSSL 11:cee25a834751 736 #define RND_STEP_RORX_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 737 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 11:cee25a834751 738 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 11:cee25a834751 739 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 11:cee25a834751 740
/* The first and last statements of step 8 have no operand or clobber section,
 * so the register names are written with a single '%' and the compiler is
 * given no clobber information for them. The last statement stores the sum
 * accumulated in r8d (h + Maj(a,b,c) + Sigma0(a)) back into h. */
wolfSSL 11:cee25a834751 741 #define RND_STEP_RORX_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 742 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 11:cee25a834751 743 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
wolfSSL 11:cee25a834751 744 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs); \
wolfSSL 11:cee25a834751 745 __asm__ volatile("movl %r8d, "#h"\n\t");
wolfSSL 11:cee25a834751 746 #endif /* HAVE_INTEL_RORX */
wolfSSL 11:cee25a834751 747
/* One SHA-256 round split into eight sub-steps, using plain rotate
 * instructions.
 *
 * Each right rotation by n is written as roll by 32-n (26/21/7 for Sigma1,
 * 30/19/10 for Sigma0); in the per-line comments "x>>n" stands for that
 * rotation, not a logical shift. a..h are register names that get stringified
 * into the templates and i indexes W_K, which must be visible where the
 * macros are expanded. edx, edi, esi and r8d carry intermediate values from
 * one sub-step to the next, so the sub-steps of a round must run in order
 * 1..8 with those registers left untouched in between. The first statement of
 * step 8 has no operand or clobber section.
 */
wolfSSL 11:cee25a834751 748 #define RND_STEP_1(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 749 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs);\
wolfSSL 11:cee25a834751 750 __asm__ volatile("roll $26, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>6 */\
wolfSSL 11:cee25a834751 751 __asm__ volatile("movl %"#e", %%edi\n\t":::"%edi",SSE_REGs);\
wolfSSL 11:cee25a834751 752
wolfSSL 11:cee25a834751 753 #define RND_STEP_2(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 754 __asm__ volatile("roll $21, %%edi\n\t":::"%edi",SSE_REGs); /* edi = e>>11 */\
wolfSSL 11:cee25a834751 755 __asm__ volatile("xorl %%edx, %%edi\n\t":::"%edx","%edi",SSE_REGs); /* edi = (e>>11) ^ (e>>6) */\
wolfSSL 11:cee25a834751 756 __asm__ volatile("movl %"#e", %%edx\n\t":::"%edx",SSE_REGs); /* edx = e */\
wolfSSL 11:cee25a834751 757 __asm__ volatile("roll $7, %%edx\n\t":::"%edx",SSE_REGs); /* edx = e>>25 */\
wolfSSL 11:cee25a834751 758
wolfSSL 11:cee25a834751 759 #define RND_STEP_3(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 760 __asm__ volatile("movl %"#f", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f */\
wolfSSL 11:cee25a834751 761 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = f ^ g */\
wolfSSL 11:cee25a834751 762 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs); /* edx = Sigma1(e) */\
wolfSSL 11:cee25a834751 763 __asm__ volatile("andl %"#e", %%esi\n\t":::"%esi",SSE_REGs); /* esi = (f ^ g) & e */\
wolfSSL 11:cee25a834751 764 __asm__ volatile("xorl %"#g", %%esi\n\t":::"%esi",SSE_REGs); /* esi = Ch(e,f,g) */\
wolfSSL 11:cee25a834751 765
wolfSSL 11:cee25a834751 766 #define RND_STEP_4(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 767 __asm__ volatile("addl %0, %"#h"\n\t"::"r"(W_K[i]):SSE_REGs); /* h += w_k */\
wolfSSL 11:cee25a834751 768 __asm__ volatile("addl %%edx, %"#h"\n\t":::"%edx",SSE_REGs); /* h = h + w_k + Sigma1(e) */\
wolfSSL 11:cee25a834751 769 __asm__ volatile("movl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a */\
wolfSSL 11:cee25a834751 770 __asm__ volatile("roll $30, %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = a>>2 */\
wolfSSL 11:cee25a834751 771 __asm__ volatile("movl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a */\
wolfSSL 11:cee25a834751 772 __asm__ volatile("roll $19, %%edi\n\t":::"%edi",SSE_REGs); /* edi = a>>13 */\
wolfSSL 11:cee25a834751 773 __asm__ volatile("movl %"#a", %%edx\n\t":::"%edx",SSE_REGs); /* edx = a */\
wolfSSL 11:cee25a834751 774
wolfSSL 11:cee25a834751 775 #define RND_STEP_5(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 776 __asm__ volatile("roll $10, %%edx\n\t":::"%edx",SSE_REGs); /* edx = a>>22 */\
wolfSSL 11:cee25a834751 777 __asm__ volatile("xorl %%r8d, %%edi\n\t":::"%edi","%r8",SSE_REGs); /* edi = (a>>2) ^ (a>>13) */\
wolfSSL 11:cee25a834751 778 __asm__ volatile("xorl %%edi, %%edx\n\t":::"%edi","%edx",SSE_REGs);/* edx = Sigma0(a) */\
wolfSSL 11:cee25a834751 779
wolfSSL 11:cee25a834751 780 #define RND_STEP_6(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 781 __asm__ volatile("movl %"#b", %%edi\n\t":::"%edi",SSE_REGs); /* edi = b */\
wolfSSL 11:cee25a834751 782 __asm__ volatile("orl %"#a", %%edi\n\t":::"%edi",SSE_REGs); /* edi = a | b */\
wolfSSL 11:cee25a834751 783 __asm__ volatile("andl %"#c", %%edi\n\t":::"%edi",SSE_REGs); /* edi = (a | b) & c */\
wolfSSL 11:cee25a834751 784 __asm__ volatile("movl %"#b", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b */\
wolfSSL 11:cee25a834751 785
wolfSSL 11:cee25a834751 786 #define RND_STEP_7(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 787 __asm__ volatile("addl %%esi, %"#h"\n\t":::"%esi",SSE_REGs); /* h += Ch(e,f,g) */\
wolfSSL 11:cee25a834751 788 __asm__ volatile("andl %"#a", %%r8d\n\t":::"%r8",SSE_REGs); /* r8d = b & a */\
wolfSSL 11:cee25a834751 789 __asm__ volatile("orl %%edi, %%r8d\n\t":::"%edi","%r8",SSE_REGs); /* r8d = Maj(a,b,c) */\
wolfSSL 11:cee25a834751 790
wolfSSL 11:cee25a834751 791 #define RND_STEP_8(a,b,c,d,e,f,g,h,i)\
wolfSSL 11:cee25a834751 792 __asm__ volatile("addl "#h", "#d"\n\t"); /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */\
wolfSSL 11:cee25a834751 793 __asm__ volatile("addl %"#h", %%r8d\n\t":::"%r8",SSE_REGs); \
wolfSSL 11:cee25a834751 794 /* r8d = h + w_k + Sigma1(e) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 11:cee25a834751 795 __asm__ volatile("addl %%edx, %%r8d\n\t":::"%edx","%r8",SSE_REGs);\
wolfSSL 11:cee25a834751 796 /* r8d = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */\
wolfSSL 11:cee25a834751 797 __asm__ volatile("movl %%r8d, %"#h"\n\t":::"%r8", SSE_REGs); \
wolfSSL 11:cee25a834751 798 /* h = h + w_k + Sigma1(e) + Sigma0(a) + Ch(e,f,g) + Maj(a,b,c) */ \
wolfSSL 11:cee25a834751 799
/* A complete round: the eight sub-steps in order, all with the same register
 * assignment a..h and W_K index i. */
wolfSSL 11:cee25a834751 800 #define RND_X(a,b,c,d,e,f,g,h,i) \
wolfSSL 11:cee25a834751 801 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 802 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 803 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 804 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 805 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 806 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 807 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 808 RND_STEP_8(a,b,c,d,e,f,g,h,i);
wolfSSL 11:cee25a834751 809
/* RND_k runs RND_X with the eight names rotated so that the k-th argument
 * (S_k) sits in the first (a) position. Each expansion already ends with
 * ';'. */
wolfSSL 11:cee25a834751 810 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 11:cee25a834751 811 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 11:cee25a834751 812 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 11:cee25a834751 813 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 11:cee25a834751 814 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 11:cee25a834751 815 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 11:cee25a834751 816 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 11:cee25a834751 817 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 11:cee25a834751 818
wolfSSL 11:cee25a834751 819
/* A round cut into three braced groups of sub-steps (1-3, 4-6 and 7-8) so
 * that other code can be placed between the groups. The groups of one round
 * still have to run in this order. */
wolfSSL 11:cee25a834751 820 #define RND_1_3(a,b,c,d,e,f,g,h,i) {\
wolfSSL 11:cee25a834751 821 RND_STEP_1(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 822 RND_STEP_2(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 823 RND_STEP_3(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 824 }
wolfSSL 11:cee25a834751 825
wolfSSL 11:cee25a834751 826 #define RND_4_6(a,b,c,d,e,f,g,h,i) {\
wolfSSL 11:cee25a834751 827 RND_STEP_4(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 828 RND_STEP_5(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 829 RND_STEP_6(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 830 }
wolfSSL 11:cee25a834751 831
wolfSSL 11:cee25a834751 832 #define RND_7_8(a,b,c,d,e,f,g,h,i) {\
wolfSSL 11:cee25a834751 833 RND_STEP_7(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 834 RND_STEP_8(a,b,c,d,e,f,g,h,i); \
wolfSSL 11:cee25a834751 835 }
wolfSSL 11:cee25a834751 836
/* RND_0..RND_7 again: these eight definitions repeat, token for token, the
 * ones that follow RND_X earlier in this file, so the preprocessor treats
 * them as compatible redefinitions. RND_k runs RND_X with S_k in the first
 * (a) position. */
wolfSSL 11:cee25a834751 837 #define RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 11:cee25a834751 838 #define RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 11:cee25a834751 839 #define RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 11:cee25a834751 840 #define RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 11:cee25a834751 841 #define RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 11:cee25a834751 842 #define RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 11:cee25a834751 843 #define RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 11:cee25a834751 844 #define RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_X(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 11:cee25a834751 845
wolfSSL 11:cee25a834751 846
/* Partial-round variants of RND_k. The first digit k selects the register
 * rotation (S_k in the first position); the second digit selects the group
 * of sub-steps: _0 runs RND_1_3, _1 runs RND_4_6 and _2 runs RND_7_8. */
wolfSSL 11:cee25a834751 847 #define RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 11:cee25a834751 848 #define RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 11:cee25a834751 849 #define RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 11:cee25a834751 850 #define RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 11:cee25a834751 851 #define RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 11:cee25a834751 852 #define RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 11:cee25a834751 853 #define RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 11:cee25a834751 854 #define RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_1_3(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 11:cee25a834751 855
wolfSSL 11:cee25a834751 856 #define RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 11:cee25a834751 857 #define RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 11:cee25a834751 858 #define RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 11:cee25a834751 859 #define RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 11:cee25a834751 860 #define RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 11:cee25a834751 861 #define RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 11:cee25a834751 862 #define RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 11:cee25a834751 863 #define RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_4_6(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 11:cee25a834751 864
wolfSSL 11:cee25a834751 865 #define RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i);
wolfSSL 11:cee25a834751 866 #define RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_7,S_0,S_1,S_2,S_3,S_4,S_5,S_6,_i);
wolfSSL 11:cee25a834751 867 #define RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_6,S_7,S_0,S_1,S_2,S_3,S_4,S_5,_i);
wolfSSL 11:cee25a834751 868 #define RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_5,S_6,S_7,S_0,S_1,S_2,S_3,S_4,_i);
wolfSSL 11:cee25a834751 869 #define RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,_i);
wolfSSL 11:cee25a834751 870 #define RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_3,S_4,S_5,S_6,S_7,S_0,S_1,S_2,_i);
wolfSSL 11:cee25a834751 871 #define RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_2,S_3,S_4,S_5,S_6,S_7,S_0,S_1,_i);
wolfSSL 11:cee25a834751 872 #define RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,_i) RND_7_8(S_1,S_2,S_3,S_4,S_5,S_6,S_7,S_0,_i);
wolfSSL 11:cee25a834751 873
/* Loop helpers written as raw asm.
 *
 * FOR stores the literal init into cnt and emits the label named by loop.
 * END adds the literal inc to cnt, compares cnt with the literal max and
 * jumps back to the label while cnt <= max (signed compare, jle). FOR ignores
 * its max and inc arguments and END ignores init.
 * NOTE(review): FOR writes cnt through an input-only "m" operand and END
 * reads cnt through a write-only "=m" operand, so the constraints do not
 * describe the actual accesses. END's expansion also ends with ';' while
 * FOR's does not. Confirm before relying on these macros.
 */
wolfSSL 11:cee25a834751 874 #define FOR(cnt, init, max, inc, loop) \
wolfSSL 11:cee25a834751 875 __asm__ volatile("movl $"#init", %0\n\t"#loop":"::"m"(cnt):)
wolfSSL 11:cee25a834751 876 #define END(cnt, init, max, inc, loop) \
wolfSSL 11:cee25a834751 877 __asm__ volatile("addl $"#inc", %0\n\tcmpl $"#max", %0\n\tjle "#loop"\n\t":"=m"(cnt)::);
wolfSSL 11:cee25a834751 878
wolfSSL 11:cee25a834751 879 #endif /* defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) */
wolfSSL 11:cee25a834751 880
wolfSSL 11:cee25a834751 881 #if defined(HAVE_INTEL_AVX1) /* inline Assembler for Intel AVX1 instructions */
wolfSSL 11:cee25a834751 882
/* One-instruction wrappers around AVX vector operations.
 *
 * op1 is the destination; the operands are emitted in reverse order because
 * the templates use AT&T syntax (sources first, destination last). Register
 * arguments are stringified after a literal '%', and the last argument of
 * VPALIGNR, VPSRLD, VPSRLQ, VPSLLD and VPSHUFD is emitted as a '$' immediate.
 * No input or output operands are declared; every statement lists XMM_REGs,
 * which is defined outside this section, as clobbered.
 */
wolfSSL 11:cee25a834751 883 #define VPALIGNR(op1,op2,op3,op4) __asm__ volatile("vpalignr $"#op4", %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 884 #define VPADDD(op1,op2,op3) __asm__ volatile("vpaddd %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 885 #define VPSRLD(op1,op2,op3) __asm__ volatile("vpsrld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 886 #define VPSRLQ(op1,op2,op3) __asm__ volatile("vpsrlq $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 887 #define VPSLLD(op1,op2,op3) __asm__ volatile("vpslld $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 888 #define VPOR(op1,op2,op3) __asm__ volatile("vpor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 889 #define VPXOR(op1,op2,op3) __asm__ volatile("vpxor %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 890 #define VPSHUFD(op1,op2,op3) __asm__ volatile("vpshufd $"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 891 #define VPSHUFB(op1,op2,op3) __asm__ volatile("vpshufb %"#op3", %"#op2", %"#op1:::XMM_REGs)
wolfSSL 11:cee25a834751 892
/* Four rounds interleaved with the message-schedule update.
 *
 * The scalar sub-steps of four consecutive rounds (W_K indices _i, _i+1, _i+2
 * and _i+3, with the a..h names rotated by one position per round) alternate
 * with vector instructions that build the next four schedule words from
 * X0..X3 and leave them in X0. XTMP0..XTMP5 are vector scratch registers and
 * SHUF_00BA / SHUF_DC00 name the registers holding the two shuffle controls.
 * XFER is accepted but not referenced in the body.
 */
wolfSSL 11:cee25a834751 893 #define MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER, SHUF_00BA, SHUF_DC00,\
wolfSSL 11:cee25a834751 894 a,b,c,d,e,f,g,h,_i)\
wolfSSL 11:cee25a834751 895 RND_STEP_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 896 VPALIGNR (XTMP0, X3, X2, 4);\
wolfSSL 11:cee25a834751 897 RND_STEP_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 898 VPADDD (XTMP0, XTMP0, X0);\
wolfSSL 11:cee25a834751 899 RND_STEP_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 900 VPALIGNR (XTMP1, X1, X0, 4); /* XTMP1 = W[-15] */\
wolfSSL 11:cee25a834751 901 RND_STEP_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 902 VPSRLD (XTMP2, XTMP1, 7);\
wolfSSL 11:cee25a834751 903 RND_STEP_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 904 VPSLLD (XTMP3, XTMP1, 25); /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 11:cee25a834751 905 RND_STEP_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 906 VPOR (XTMP3, XTMP3, XTMP2); /* XTMP3 = W[-15] MY_ROR 7 */\
wolfSSL 11:cee25a834751 907 RND_STEP_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 908 VPSRLD (XTMP2, XTMP1,18);\
wolfSSL 11:cee25a834751 909 RND_STEP_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 910 \
wolfSSL 11:cee25a834751 911 RND_STEP_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 912 VPSRLD (XTMP4, XTMP1, 3); /* XTMP4 = W[-15] >> 3 */\
wolfSSL 11:cee25a834751 913 RND_STEP_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 914 VPSLLD (XTMP1, XTMP1, 14); /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 11:cee25a834751 915 RND_STEP_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 916 VPXOR (XTMP3, XTMP3, XTMP1);\
wolfSSL 11:cee25a834751 917 RND_STEP_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 918 VPXOR (XTMP3, XTMP3, XTMP2); /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 11:cee25a834751 919 RND_STEP_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 920 VPXOR (XTMP1, XTMP3, XTMP4); /* XTMP1 = s0 */\
wolfSSL 11:cee25a834751 921 RND_STEP_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 922 VPSHUFD(XTMP2, X3, 0b11111010); /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 11:cee25a834751 923 RND_STEP_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 924 VPADDD (XTMP0, XTMP0, XTMP1); /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 11:cee25a834751 925 RND_STEP_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 926 \
wolfSSL 11:cee25a834751 927 RND_STEP_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 928 VPSRLD (XTMP4, XTMP2, 10); /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 11:cee25a834751 929 RND_STEP_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 930 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 11:cee25a834751 931 RND_STEP_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 932 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 11:cee25a834751 933 RND_STEP_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 934 VPXOR (XTMP2, XTMP2, XTMP3);\
wolfSSL 11:cee25a834751 935 RND_STEP_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 936 VPXOR (XTMP4, XTMP4, XTMP2); /* XTMP4 = s1 {xBxA} */\
wolfSSL 11:cee25a834751 937 RND_STEP_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 938 VPSHUFB (XTMP4, XTMP4, SHUF_00BA); /* XTMP4 = s1 {00BA} */\
wolfSSL 11:cee25a834751 939 RND_STEP_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 940 VPADDD (XTMP0, XTMP0, XTMP4); /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 11:cee25a834751 941 RND_STEP_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 942 \
wolfSSL 11:cee25a834751 943 RND_STEP_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 944 VPSHUFD (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 11:cee25a834751 945 RND_STEP_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 946 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 11:cee25a834751 947 RND_STEP_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 948 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 11:cee25a834751 949 RND_STEP_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 950 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 11:cee25a834751 951 RND_STEP_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 952 VPXOR (XTMP2, XTMP2, XTMP3);\
wolfSSL 11:cee25a834751 953 RND_STEP_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 954 VPXOR (XTMP5, XTMP5, XTMP2); /* XTMP5 = s1 {xDxC} */\
wolfSSL 11:cee25a834751 955 RND_STEP_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 956 VPSHUFB (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\
wolfSSL 11:cee25a834751 957 RND_STEP_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 958 VPADDD (X0, XTMP5, XTMP0); /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 11:cee25a834751 959
wolfSSL 11:cee25a834751 960 #if defined(HAVE_INTEL_RORX)
wolfSSL 11:cee25a834751 961
/* Four rounds interleaved with the message-schedule update, RORX variant.
 *
 * Same layout as the non-RORX scheduler but built from the RND_STEP_RORX_n
 * sub-steps: the scalar sub-steps of four consecutive rounds (W_K indices _i
 * to _i+3, with the a..h names rotated by one position per round) alternate
 * with vector instructions that build the next four schedule words from
 * X0..X3 and leave them in X0. XTMP0..XTMP5 are vector scratch registers and
 * SHUF_00BA / SHUF_DC00 name the registers holding the two shuffle controls.
 * XFER is accepted but not referenced in the body.
 */
wolfSSL 11:cee25a834751 962 #define MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, \
wolfSSL 11:cee25a834751 963 XFER, SHUF_00BA, SHUF_DC00,a,b,c,d,e,f,g,h,_i)\
wolfSSL 11:cee25a834751 964 RND_STEP_RORX_1(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 965 VPALIGNR (XTMP0, X3, X2, 4);\
wolfSSL 11:cee25a834751 966 RND_STEP_RORX_2(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 967 VPADDD (XTMP0, XTMP0, X0);\
wolfSSL 11:cee25a834751 968 RND_STEP_RORX_3(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 969 VPALIGNR (XTMP1, X1, X0, 4); /* XTMP1 = W[-15] */\
wolfSSL 11:cee25a834751 970 RND_STEP_RORX_4(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 971 VPSRLD (XTMP2, XTMP1, 7);\
wolfSSL 11:cee25a834751 972 RND_STEP_RORX_5(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 973 VPSLLD (XTMP3, XTMP1, 25); /* VPSLLD (XTMP3, XTMP1, (32-7)) */\
wolfSSL 11:cee25a834751 974 RND_STEP_RORX_6(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 975 VPOR (XTMP3, XTMP3, XTMP2); /* XTMP3 = W[-15] MY_ROR 7 */\
wolfSSL 11:cee25a834751 976 RND_STEP_RORX_7(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 977 VPSRLD (XTMP2, XTMP1,18);\
wolfSSL 11:cee25a834751 978 RND_STEP_RORX_8(a,b,c,d,e,f,g,h,_i);\
wolfSSL 11:cee25a834751 979 \
wolfSSL 11:cee25a834751 980 RND_STEP_RORX_1(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 981 VPSRLD (XTMP4, XTMP1, 3); /* XTMP4 = W[-15] >> 3 */\
wolfSSL 11:cee25a834751 982 RND_STEP_RORX_2(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 983 VPSLLD (XTMP1, XTMP1, 14); /* VPSLLD (XTMP1, XTMP1, (32-18)) */\
wolfSSL 11:cee25a834751 984 RND_STEP_RORX_3(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 985 VPXOR (XTMP3, XTMP3, XTMP1);\
wolfSSL 11:cee25a834751 986 RND_STEP_RORX_4(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 987 VPXOR (XTMP3, XTMP3, XTMP2); /* XTMP3 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */\
wolfSSL 11:cee25a834751 988 RND_STEP_RORX_5(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 989 VPXOR (XTMP1, XTMP3, XTMP4); /* XTMP1 = s0 */\
wolfSSL 11:cee25a834751 990 RND_STEP_RORX_6(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 991 VPSHUFD(XTMP2, X3, 0b11111010); /* XTMP2 = W[-2] {BBAA}*/\
wolfSSL 11:cee25a834751 992 RND_STEP_RORX_7(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 993 VPADDD (XTMP0, XTMP0, XTMP1); /* XTMP0 = W[-16] + W[-7] + s0 */\
wolfSSL 11:cee25a834751 994 RND_STEP_RORX_8(h,a,b,c,d,e,f,g,_i+1);\
wolfSSL 11:cee25a834751 995 \
wolfSSL 11:cee25a834751 996 RND_STEP_RORX_1(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 997 VPSRLD (XTMP4, XTMP2, 10); /* XTMP4 = W[-2] >> 10 {BBAA} */\
wolfSSL 11:cee25a834751 998 RND_STEP_RORX_2(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 999 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */\
wolfSSL 11:cee25a834751 1000 RND_STEP_RORX_3(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1001 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */\
wolfSSL 11:cee25a834751 1002 RND_STEP_RORX_4(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1003 VPXOR (XTMP2, XTMP2, XTMP3);\
wolfSSL 11:cee25a834751 1004 RND_STEP_RORX_5(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1005 VPXOR (XTMP4, XTMP4, XTMP2); /* XTMP4 = s1 {xBxA} */\
wolfSSL 11:cee25a834751 1006 RND_STEP_RORX_6(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1007 VPSHUFB (XTMP4, XTMP4, SHUF_00BA); /* XTMP4 = s1 {00BA} */\
wolfSSL 11:cee25a834751 1008 RND_STEP_RORX_7(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1009 VPADDD (XTMP0, XTMP0, XTMP4); /* XTMP0 = {..., ..., W[1], W[0]} */\
wolfSSL 11:cee25a834751 1010 RND_STEP_RORX_8(g,h,a,b,c,d,e,f,_i+2);\
wolfSSL 11:cee25a834751 1011 \
wolfSSL 11:cee25a834751 1012 RND_STEP_RORX_1(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1013 VPSHUFD (XTMP2, XTMP0, 0b01010000); /* XTMP2 = W[-2] {DDCC} */\
wolfSSL 11:cee25a834751 1014 RND_STEP_RORX_2(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1015 VPSRLD (XTMP5, XTMP2, 10); /* XTMP5 = W[-2] >> 10 {DDCC} */\
wolfSSL 11:cee25a834751 1016 RND_STEP_RORX_3(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1017 VPSRLQ (XTMP3, XTMP2, 19); /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */\
wolfSSL 11:cee25a834751 1018 RND_STEP_RORX_4(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1019 VPSRLQ (XTMP2, XTMP2, 17); /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */\
wolfSSL 11:cee25a834751 1020 RND_STEP_RORX_5(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1021 VPXOR (XTMP2, XTMP2, XTMP3);\
wolfSSL 11:cee25a834751 1022 RND_STEP_RORX_6(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1023 VPXOR (XTMP5, XTMP5, XTMP2); /* XTMP5 = s1 {xDxC} */\
wolfSSL 11:cee25a834751 1024 RND_STEP_RORX_7(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1025 VPSHUFB (XTMP5, XTMP5, SHUF_DC00); /* XTMP5 = s1 {DC00} */\
wolfSSL 11:cee25a834751 1026 RND_STEP_RORX_8(f,g,h,a,b,c,d,e,_i+3);\
wolfSSL 11:cee25a834751 1027 VPADDD (X0, XTMP5, XTMP0); /* X0 = {W[3], W[2], W[1], W[0]} */\
wolfSSL 11:cee25a834751 1028
wolfSSL 11:cee25a834751 1029 #endif /* HAVE_INTEL_RORX */
wolfSSL 11:cee25a834751 1030
wolfSSL 11:cee25a834751 1031
/* Load the buffered block into xmm4..xmm7.
 *
 * Four unaligned vector loads starting at sha256->buffer[0], [4], [8] and
 * [12] fill xmm4, xmm5, xmm6 and xmm7; each register is then run through
 * vpshufb with the control held in xmm13. References a variable named sha256
 * that must be in scope where the macro is used. Each statement lists only
 * its own destination register as clobbered.
 * NOTE(review): presumably xmm13 holds the byte-flip mask at this point -
 * confirm at the call site. Despite the name, nothing is written to W_K here.
 */
wolfSSL 11:cee25a834751 1032 #define W_K_from_buff\
wolfSSL 11:cee25a834751 1033 __asm__ volatile("vmovdqu %0, %%xmm4\n\t"\
wolfSSL 11:cee25a834751 1034 "vpshufb %%xmm13, %%xmm4, %%xmm4\n\t"\
wolfSSL 11:cee25a834751 1035 :: "m"(sha256->buffer[0]):"%xmm4");\
wolfSSL 11:cee25a834751 1036 __asm__ volatile("vmovdqu %0, %%xmm5\n\t"\
wolfSSL 11:cee25a834751 1037 "vpshufb %%xmm13, %%xmm5, %%xmm5\n\t"\
wolfSSL 11:cee25a834751 1038 ::"m"(sha256->buffer[4]):"%xmm5");\
wolfSSL 11:cee25a834751 1039 __asm__ volatile("vmovdqu %0, %%xmm6\n\t"\
wolfSSL 11:cee25a834751 1040 "vpshufb %%xmm13, %%xmm6, %%xmm6\n\t"\
wolfSSL 11:cee25a834751 1041 ::"m"(sha256->buffer[8]):"%xmm6");\
wolfSSL 11:cee25a834751 1042 __asm__ volatile("vmovdqu %0, %%xmm7\n\t"\
wolfSSL 11:cee25a834751 1043 "vpshufb %%xmm13, %%xmm7, %%xmm7\n\t"\
wolfSSL 11:cee25a834751 1044 ::"m"(sha256->buffer[12]):"%xmm7");\
wolfSSL 11:cee25a834751 1045
/* _SET_W_K_XFER(reg, i): add the four round constants K[i..i+3] to the four
 * 32-bit words held in `reg` (vpaddd) and store the sums to W_K[i..i+3].
 *   - xmm9 is the scratch destination of the add and the source of the store;
 *   - the store uses the aligned form vmovdqa, so &W_K[i] has to be 16-byte
 *     aligned;
 *   - `K` and `W_K` must be visible at the expansion site.
 * `reg` is pasted into the asm template with #reg, so it must already be a
 * register name when this macro is expanded. */
wolfSSL 11:cee25a834751 1046 #define _SET_W_K_XFER(reg, i)\
wolfSSL 11:cee25a834751 1047 __asm__ volatile("vpaddd %0, %"#reg", %%xmm9"::"m"(K[i]):XMM_REGs);\
wolfSSL 11:cee25a834751 1048 __asm__ volatile("vmovdqa %%xmm9, %0":"=m"(W_K[i])::XMM_REGs);
wolfSSL 11:cee25a834751 1049
/* Extra expansion level: lets a register alias passed as `reg` be replaced by
 * its register name before _SET_W_K_XFER stringifies it. */
wolfSSL 11:cee25a834751 1050 #define SET_W_K_XFER(reg, i) _SET_W_K_XFER(reg, i)
wolfSSL 11:cee25a834751 1051
/* vpshufb control tables for the AVX1 message schedule (128 bits each).
 * A control byte of 0xFF has its top bit set, which makes vpshufb write zero
 * to that byte position. In the two SHUF tables the non-0xFF half selects
 * source bytes 0..3 and 8..11, i.e. 32-bit elements 0 and 2. */
wolfSSL 11:cee25a834751 1052 static const ALIGN32 word64 mSHUF_00BA[] = { 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF }; /* shuffle xBxA -> 00BA */
wolfSSL 11:cee25a834751 1053 static const ALIGN32 word64 mSHUF_DC00[] = { 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100 }; /* shuffle xDxC -> DC00 */
/* Reverses the byte order inside each of the four 32-bit words. */
wolfSSL 11:cee25a834751 1054 static const ALIGN32 word64 mBYTE_FLIP_MASK[] = { 0x0405060700010203, 0x0c0d0e0f08090a0b };
wolfSSL 11:cee25a834751 1055
wolfSSL 11:cee25a834751 1056
/* _Init_Masks(mask1, mask2, mask3): load the three constant tables into the
 * named registers with unaligned moves:
 *   mask1 <- mBYTE_FLIP_MASK, mask2 <- mSHUF_00BA, mask3 <- mSHUF_DC00.
 * The arguments are stringified into the asm template, so they must already
 * be register names here. No clobber list is given on these statements. */
wolfSSL 11:cee25a834751 1057 #define _Init_Masks(mask1, mask2, mask3)\
wolfSSL 11:cee25a834751 1058 __asm__ volatile("vmovdqu %0, %"#mask1 ::"m"(mBYTE_FLIP_MASK[0]));\
wolfSSL 11:cee25a834751 1059 __asm__ volatile("vmovdqu %0, %"#mask2 ::"m"(mSHUF_00BA[0]));\
wolfSSL 11:cee25a834751 1060 __asm__ volatile("vmovdqu %0, %"#mask3 ::"m"(mSHUF_DC00[0]));
wolfSSL 11:cee25a834751 1061
/* Extra expansion level so that register aliases passed as arguments are
 * replaced by their register names before _Init_Masks stringifies them. */
wolfSSL 11:cee25a834751 1062 #define Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)\
wolfSSL 11:cee25a834751 1063 _Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00)
wolfSSL 11:cee25a834751 1064
/* Register aliases for the AVX1 code path. Each alias includes the leading
 * '%' so that, once stringified behind a single '%' in an asm template, it
 * yields the escaped form "%%xmmN". */

/* X0..X3: the four registers that hold the message-schedule words. */
wolfSSL 11:cee25a834751 1065 #define X0 %xmm4
wolfSSL 11:cee25a834751 1066 #define X1 %xmm5
wolfSSL 11:cee25a834751 1067 #define X2 %xmm6
wolfSSL 11:cee25a834751 1068 #define X3 %xmm7
wolfSSL 11:cee25a834751 1069 #define X_ X0
wolfSSL 11:cee25a834751 1070
/* Temporaries. XTMP5 is xmm9, the same register _SET_W_K_XFER writes as its
 * scratch destination. */
wolfSSL 11:cee25a834751 1071 #define XTMP0 %xmm0
wolfSSL 11:cee25a834751 1072 #define XTMP1 %xmm1
wolfSSL 11:cee25a834751 1073 #define XTMP2 %xmm2
wolfSSL 11:cee25a834751 1074 #define XTMP3 %xmm3
wolfSSL 11:cee25a834751 1075 #define XTMP4 %xmm8
wolfSSL 11:cee25a834751 1076 #define XTMP5 %xmm9
wolfSSL 11:cee25a834751 1077 #define XFER %xmm10
wolfSSL 11:cee25a834751 1078
/* Registers that hold the constant shuffle controls loaded by Init_Masks.
 * xmm13 is referenced by number inside W_K_from_buff. */
wolfSSL 11:cee25a834751 1079 #define SHUF_00BA %xmm11 /* shuffle xBxA -> 00BA */
wolfSSL 11:cee25a834751 1080 #define SHUF_DC00 %xmm12 /* shuffle xDxC -> DC00 */
wolfSSL 11:cee25a834751 1081 #define BYTE_FLIP_MASK %xmm13
wolfSSL 11:cee25a834751 1082
/* XMM_REGs expands to nothing, so every asm statement that uses it as its
 * clobber list declares no clobbers; the register list is kept only as a
 * comment on the following line. */
wolfSSL 11:cee25a834751 1083 #define XMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 11:cee25a834751 1084 /*"xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13" */
wolfSSL 11:cee25a834751 1085
/* Transform_AVX1: process the block held in sha256->buffer with the AVX1
 * message-schedule macros and fold the result into the hash state.
 *
 * Sequence, as written below:
 *   1. load the shuffle constants and the 16 message words into X0..X3;
 *   2. DigestToReg moves the working state into the S_0..S_7 registers;
 *   3. twelve SET_W_K_XFER + MessageSched pairs are issued for indices
 *      0, 4, ..., 44; the X0..X3 arguments rotate by one register per call
 *      and the S_* arguments rotate by four positions per call;
 *   4. the W+K vectors for indices 48..63 are stored and rounds 48..63 are
 *      issued through RND_0..RND_7;
 *   5. RegToDigest writes the registers back.
 *
 * MessageSched, RND_n, DigestToReg and RegToDigest are defined outside this
 * part of the file. Presumably each MessageSched call runs four rounds while
 * producing the next four schedule words, and RegToDigest adds the registers
 * into sha256->digest - confirm against their definitions.
 *
 * sha256: hash context; its buffer supplies the 16 input words.
 * Returns 0 unconditionally. */
wolfSSL 11:cee25a834751 1086 static int Transform_AVX1(Sha256* sha256)
wolfSSL 11:cee25a834751 1087 {
/* SET_W_K_XFER stores into W_K with the aligned move vmovdqa. */
wolfSSL 11:cee25a834751 1088 ALIGN32 word32 W_K[64]; /* temp for W+K */
wolfSSL 11:cee25a834751 1089
wolfSSL 11:cee25a834751 1090 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00);
wolfSSL 11:cee25a834751 1091 W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */
wolfSSL 11:cee25a834751 1092
wolfSSL 11:cee25a834751 1093 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
wolfSSL 11:cee25a834751 1094
/* Each SET_W_K_XFER publishes W+K for four indices immediately before the
 * MessageSched call that carries the same starting index. */
wolfSSL 11:cee25a834751 1095 SET_W_K_XFER(X0, 0);
wolfSSL 11:cee25a834751 1096
wolfSSL 11:cee25a834751 1097 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1098 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
wolfSSL 11:cee25a834751 1099 SET_W_K_XFER(X1, 4);
wolfSSL 11:cee25a834751 1100 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1101 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4);
wolfSSL 11:cee25a834751 1102 SET_W_K_XFER(X2, 8);
wolfSSL 11:cee25a834751 1103 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1104 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
wolfSSL 11:cee25a834751 1105 SET_W_K_XFER(X3, 12);
wolfSSL 11:cee25a834751 1106 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1107 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12);
wolfSSL 11:cee25a834751 1108 SET_W_K_XFER(X0, 16);
wolfSSL 11:cee25a834751 1109 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1110 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
wolfSSL 11:cee25a834751 1111 SET_W_K_XFER(X1, 20);
wolfSSL 11:cee25a834751 1112 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1113 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20);
wolfSSL 11:cee25a834751 1114 SET_W_K_XFER(X2, 24);
wolfSSL 11:cee25a834751 1115 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1116 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
wolfSSL 11:cee25a834751 1117 SET_W_K_XFER(X3, 28);
wolfSSL 11:cee25a834751 1118 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1119 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28);
wolfSSL 11:cee25a834751 1120 SET_W_K_XFER(X0, 32);
wolfSSL 11:cee25a834751 1121 MessageSched(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1122 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
wolfSSL 11:cee25a834751 1123 SET_W_K_XFER(X1, 36);
wolfSSL 11:cee25a834751 1124 MessageSched(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1125 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36);
wolfSSL 11:cee25a834751 1126 SET_W_K_XFER(X2, 40);
wolfSSL 11:cee25a834751 1127 MessageSched(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1128 SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
wolfSSL 11:cee25a834751 1129 SET_W_K_XFER(X3, 44);
wolfSSL 11:cee25a834751 1130 MessageSched(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5, XFER,
wolfSSL 11:cee25a834751 1131 SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44);
wolfSSL 11:cee25a834751 1132
/* No further schedule words are generated: store W+K for 48..63 and issue
 * the remaining sixteen rounds. */
wolfSSL 11:cee25a834751 1133 SET_W_K_XFER(X0, 48);
wolfSSL 11:cee25a834751 1134 SET_W_K_XFER(X1, 52);
wolfSSL 11:cee25a834751 1135 SET_W_K_XFER(X2, 56);
wolfSSL 11:cee25a834751 1136 SET_W_K_XFER(X3, 60);
wolfSSL 11:cee25a834751 1137
wolfSSL 11:cee25a834751 1138 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
wolfSSL 11:cee25a834751 1139 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
wolfSSL 11:cee25a834751 1140 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
wolfSSL 11:cee25a834751 1141 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
wolfSSL 11:cee25a834751 1142
wolfSSL 11:cee25a834751 1143 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
wolfSSL 11:cee25a834751 1144 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
wolfSSL 11:cee25a834751 1145 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
wolfSSL 11:cee25a834751 1146 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
wolfSSL 11:cee25a834751 1147
wolfSSL 11:cee25a834751 1148 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
wolfSSL 11:cee25a834751 1149 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
wolfSSL 11:cee25a834751 1150 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
wolfSSL 11:cee25a834751 1151 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
wolfSSL 11:cee25a834751 1152
wolfSSL 11:cee25a834751 1153 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
wolfSSL 11:cee25a834751 1154 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
wolfSSL 11:cee25a834751 1155 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
wolfSSL 11:cee25a834751 1156 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
wolfSSL 11:cee25a834751 1157
wolfSSL 11:cee25a834751 1158 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
wolfSSL 11:cee25a834751 1159
wolfSSL 11:cee25a834751 1160 return 0;
wolfSSL 11:cee25a834751 1161 }
wolfSSL 11:cee25a834751 1162
wolfSSL 11:cee25a834751 1163 #if defined(HAVE_INTEL_RORX)
/* Transform_AVX1_RORX: same call sequence as Transform_AVX1, except that the
 * twelve schedule steps for indices 0, 4, ..., 44 go through
 * MessageSched_RORX instead of MessageSched. Rounds 48..63 use the same
 * RND_0..RND_7 macros as the non-RORX function.
 *
 * Only compiled when HAVE_INTEL_RORX is defined.
 *
 * DigestToReg, RND_n and RegToDigest are defined outside this part of the
 * file; see their definitions for the exact register usage.
 *
 * sha256: hash context; its buffer supplies the 16 input words.
 * Returns 0 unconditionally. */
wolfSSL 11:cee25a834751 1164 static int Transform_AVX1_RORX(Sha256* sha256)
wolfSSL 11:cee25a834751 1165 {
/* SET_W_K_XFER stores into W_K with the aligned move vmovdqa. */
wolfSSL 11:cee25a834751 1166 ALIGN32 word32 W_K[64]; /* temp for W+K */
wolfSSL 11:cee25a834751 1167
wolfSSL 11:cee25a834751 1168 Init_Masks(BYTE_FLIP_MASK, SHUF_00BA, SHUF_DC00);
wolfSSL 11:cee25a834751 1169 W_K_from_buff; /* X0, X1, X2, X3 = W[0..15]; */
wolfSSL 11:cee25a834751 1170
wolfSSL 11:cee25a834751 1171 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
/* X0..X3 rotate by one register and the S_* arguments by four positions on
 * every call. */
wolfSSL 11:cee25a834751 1172 SET_W_K_XFER(X0, 0);
wolfSSL 11:cee25a834751 1173 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1174 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
wolfSSL 11:cee25a834751 1175 SET_W_K_XFER(X1, 4);
wolfSSL 11:cee25a834751 1176 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1177 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,4);
wolfSSL 11:cee25a834751 1178 SET_W_K_XFER(X2, 8);
wolfSSL 11:cee25a834751 1179 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1180 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
wolfSSL 11:cee25a834751 1181 SET_W_K_XFER(X3, 12);
wolfSSL 11:cee25a834751 1182 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1183 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,12);
wolfSSL 11:cee25a834751 1184 SET_W_K_XFER(X0, 16);
wolfSSL 11:cee25a834751 1185 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1186 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
wolfSSL 11:cee25a834751 1187 SET_W_K_XFER(X1, 20);
wolfSSL 11:cee25a834751 1188 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1189 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,20);
wolfSSL 11:cee25a834751 1190 SET_W_K_XFER(X2, 24);
wolfSSL 11:cee25a834751 1191 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1192 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
wolfSSL 11:cee25a834751 1193 SET_W_K_XFER(X3, 28);
wolfSSL 11:cee25a834751 1194 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1195 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,28);
wolfSSL 11:cee25a834751 1196 SET_W_K_XFER(X0, 32);
wolfSSL 11:cee25a834751 1197 MessageSched_RORX(X0, X1, X2, X3, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1198 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
wolfSSL 11:cee25a834751 1199 SET_W_K_XFER(X1, 36);
wolfSSL 11:cee25a834751 1200 MessageSched_RORX(X1, X2, X3, X0, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1201 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,36);
wolfSSL 11:cee25a834751 1202 SET_W_K_XFER(X2, 40);
wolfSSL 11:cee25a834751 1203 MessageSched_RORX(X2, X3, X0, X1, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1204 XFER, SHUF_00BA, SHUF_DC00, S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
wolfSSL 11:cee25a834751 1205 SET_W_K_XFER(X3, 44);
wolfSSL 11:cee25a834751 1206 MessageSched_RORX(X3, X0, X1, X2, XTMP0, XTMP1, XTMP2, XTMP3, XTMP4, XTMP5,
wolfSSL 11:cee25a834751 1207 XFER, SHUF_00BA, SHUF_DC00, S_4,S_5,S_6,S_7,S_0,S_1,S_2,S_3,44);
wolfSSL 11:cee25a834751 1208
/* Store W+K for indices 48..63, then issue the remaining sixteen rounds. */
wolfSSL 11:cee25a834751 1209 SET_W_K_XFER(X0, 48);
wolfSSL 11:cee25a834751 1210 SET_W_K_XFER(X1, 52);
wolfSSL 11:cee25a834751 1211 SET_W_K_XFER(X2, 56);
wolfSSL 11:cee25a834751 1212 SET_W_K_XFER(X3, 60);
wolfSSL 11:cee25a834751 1213
wolfSSL 11:cee25a834751 1214 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
wolfSSL 11:cee25a834751 1215 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
wolfSSL 11:cee25a834751 1216 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
wolfSSL 11:cee25a834751 1217 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
wolfSSL 11:cee25a834751 1218
wolfSSL 11:cee25a834751 1219 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
wolfSSL 11:cee25a834751 1220 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
wolfSSL 11:cee25a834751 1221 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
wolfSSL 11:cee25a834751 1222 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
wolfSSL 11:cee25a834751 1223
wolfSSL 11:cee25a834751 1224 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
wolfSSL 11:cee25a834751 1225 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
wolfSSL 11:cee25a834751 1226 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
wolfSSL 11:cee25a834751 1227 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
wolfSSL 11:cee25a834751 1228
wolfSSL 11:cee25a834751 1229 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
wolfSSL 11:cee25a834751 1230 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
wolfSSL 11:cee25a834751 1231 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
wolfSSL 11:cee25a834751 1232 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
wolfSSL 11:cee25a834751 1233
wolfSSL 11:cee25a834751 1234 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
wolfSSL 11:cee25a834751 1235
wolfSSL 11:cee25a834751 1236 return 0;
wolfSSL 11:cee25a834751 1237 }
wolfSSL 11:cee25a834751 1238 #endif /* HAVE_INTEL_RORX */
wolfSSL 11:cee25a834751 1239 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 11:cee25a834751 1240
wolfSSL 11:cee25a834751 1241
wolfSSL 11:cee25a834751 1242 #if defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 1243
/* Low-level AVX2 helpers. Each macro expands to `__asm__ volatile`
 * statements; register arguments are pasted into the template with the #
 * operator behind "%%", so they must be bare register names (no '%') by the
 * time these macros are expanded. YMM_REGs is used as the clobber list
 * throughout.
 *
 *   _MOVE_to_REG(ymm, mem)      unaligned load  mem -> ymm   (vmovdqu)
 *   _MOVE_to_MEM(mem, ymm)      unaligned store ymm -> mem   (vmovdqu)
 *   _BYTE_SWAP(ymm, map)        ymm = vpshufb(ymm, map), map read from memory
 *   _MOVE_128(d, s1, s2, map)   vperm2i128 with immediate `map`
 *   _MOVE_BYTE(d, s, map)       d = vpshufb(s, map), map read from memory
 *   _S_TEMP(d, s, bits, tmp)    d = (s >> bits) | (s << (32 - bits)) in every
 *                               32-bit element, i.e. a rotate right by
 *                               `bits`; tmp is overwritten as scratch
 *   _AVX2_R(d, s, bits)         d = s >> bits (logical) in every 32-bit
 *                               element
 *   _XOR / _OR / _ADD           d = s2 op s1  (vpxor / vpor / vpaddd)
 *   _ADD_MEM(d, s, mem)         d = s + mem   (vpaddd, memory operand)
 *   _BLEND(map, d, s1, s2)      vpblendd with immediate `map`
 */
wolfSSL 11:cee25a834751 1244 #define _MOVE_to_REG(ymm, mem) __asm__ volatile("vmovdqu %0, %%"#ymm" ":: "m"(mem):YMM_REGs);
wolfSSL 11:cee25a834751 1245 #define _MOVE_to_MEM(mem, ymm) __asm__ volatile("vmovdqu %%"#ymm", %0" : "=m"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1246 #define _BYTE_SWAP(ymm, map) __asm__ volatile("vpshufb %0, %%"#ymm", %%"#ymm"\n\t"\
wolfSSL 11:cee25a834751 1247 :: "m"(map):YMM_REGs);
wolfSSL 11:cee25a834751 1248 #define _MOVE_128(ymm0, ymm1, ymm2, map) __asm__ volatile("vperm2i128 $"#map", %%"\
wolfSSL 11:cee25a834751 1249 #ymm2", %%"#ymm1", %%"#ymm0" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1250 #define _MOVE_BYTE(ymm0, ymm1, map) __asm__ volatile("vpshufb %0, %%"#ymm1", %%"\
wolfSSL 11:cee25a834751 1251 #ymm0"\n\t":: "m"(map):YMM_REGs);
wolfSSL 11:cee25a834751 1252 #define _S_TEMP(dest, src, bits, temp) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 11:cee25a834751 1253 #src", %%"#dest"\n\tvpslld $32-"#bits", %%"#src", %%"#temp"\n\tvpor %%"\
wolfSSL 11:cee25a834751 1254 #temp",%%"#dest", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1255 #define _AVX2_R(dest, src, bits) __asm__ volatile("vpsrld $"#bits", %%"\
wolfSSL 11:cee25a834751 1256 #src", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1257 #define _XOR(dest, src1, src2) __asm__ volatile("vpxor %%"#src1", %%"\
wolfSSL 11:cee25a834751 1258 #src2", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1259 #define _OR(dest, src1, src2) __asm__ volatile("vpor %%"#src1", %%"\
wolfSSL 11:cee25a834751 1260 #src2", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1261 #define _ADD(dest, src1, src2) __asm__ volatile("vpaddd %%"#src1", %%"\
wolfSSL 11:cee25a834751 1262 #src2", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1263 #define _ADD_MEM(dest, src1, mem) __asm__ volatile("vpaddd %0, %%"#src1", %%"\
wolfSSL 11:cee25a834751 1264 #dest" "::"m"(mem):YMM_REGs);
wolfSSL 11:cee25a834751 1265 #define _BLEND(map, dest, src1, src2) __asm__ volatile("vpblendd $"#map", %%"\
wolfSSL 11:cee25a834751 1266 #src1", %%"#src2", %%"#dest" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1267
/* _EXTRACT_XMM_n(xmm, mem): vpextrd one 32-bit element of `xmm` into `mem`.
 * Despite the parameter name, `mem` is bound with an "=r" constraint, so the
 * value is produced in a general-purpose register.
 * _EXTRACT_XMM_4 first swaps the two 128-bit halves of `ymm` in place
 * (vperm2i128 $0x1) and then reads element 0; _5.._7 read elements 1..3 and
 * presumably rely on that swap having been performed already - confirm at
 * the call sites. */
wolfSSL 11:cee25a834751 1268 #define _EXTRACT_XMM_0(xmm, mem) __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1269 #define _EXTRACT_XMM_1(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1270 #define _EXTRACT_XMM_2(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1271 #define _EXTRACT_XMM_3(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1272 #define _EXTRACT_XMM_4(ymm, xmm, mem)\
wolfSSL 11:cee25a834751 1273 __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1274 __asm__ volatile("vpextrd $0, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1275 #define _EXTRACT_XMM_5(xmm, mem) __asm__ volatile("vpextrd $1, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1276 #define _EXTRACT_XMM_6(xmm, mem) __asm__ volatile("vpextrd $2, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1277 #define _EXTRACT_XMM_7(xmm, mem) __asm__ volatile("vpextrd $3, %%"#xmm", %0 ":"=r"(mem)::YMM_REGs);
wolfSSL 11:cee25a834751 1278
/* _SWAP_YMM_HL(ymm): exchange the low and high 128-bit halves of `ymm` in
 * place (vperm2i128 $0x1 with the register as both sources). */
wolfSSL 11:cee25a834751 1279 #define _SWAP_YMM_HL(ymm) __asm__ volatile("vperm2i128 $0x1, %%"#ymm", %%"#ymm", %%"#ymm" ":::YMM_REGs);
wolfSSL 11:cee25a834751 1280 #define SWAP_YMM_HL(ymm) _SWAP_YMM_HL(ymm)
wolfSSL 11:cee25a834751 1281
/* Public spellings of the underscore-prefixed helpers. The extra macro level
 * exists so that register aliases passed as arguments are expanded to real
 * register names before the inner macro stringifies them. */
wolfSSL 11:cee25a834751 1282 #define MOVE_to_REG(ymm, mem) _MOVE_to_REG(ymm, mem)
wolfSSL 11:cee25a834751 1283 #define MOVE_to_MEM(mem, ymm) _MOVE_to_MEM(mem, ymm)
wolfSSL 11:cee25a834751 1284 #define BYTE_SWAP(ymm, map) _BYTE_SWAP(ymm, map)
wolfSSL 11:cee25a834751 1285 #define MOVE_128(ymm0, ymm1, ymm2, map) _MOVE_128(ymm0, ymm1, ymm2, map)
wolfSSL 11:cee25a834751 1286 #define MOVE_BYTE(ymm0, ymm1, map) _MOVE_BYTE(ymm0, ymm1, map)
wolfSSL 11:cee25a834751 1287 #define XOR(dest, src1, src2) _XOR(dest, src1, src2)
wolfSSL 11:cee25a834751 1288 #define OR(dest, src1, src2) _OR(dest, src1, src2)
wolfSSL 11:cee25a834751 1289 #define ADD(dest, src1, src2) _ADD(dest, src1, src2)
wolfSSL 11:cee25a834751 1290 #define ADD_MEM(dest, src1, mem) _ADD_MEM(dest, src1, mem)
wolfSSL 11:cee25a834751 1291 #define BLEND(map, dest, src1, src2) _BLEND(map, dest, src1, src2)
wolfSSL 11:cee25a834751 1292
/* AVX2_S: rotate right by `bits` in every 32-bit element, using the register
 * aliased as S_TEMP as scratch. Note that S_TMP already ends in ';'.
 * AVX2_R: logical shift right by `bits` in every 32-bit element. */
wolfSSL 11:cee25a834751 1293 #define S_TMP(dest, src, bits, temp) _S_TEMP(dest, src, bits, temp);
wolfSSL 11:cee25a834751 1294 #define AVX2_S(dest, src, bits) S_TMP(dest, src, bits, S_TEMP)
wolfSSL 11:cee25a834751 1295 #define AVX2_R(dest, src, bits) _AVX2_R(dest, src, bits)
wolfSSL 11:cee25a834751 1296
/* GAMMA0(dest, src): dest = ror(src,7) ^ ror(src,18) ^ (src >> 3) in every
 * 32-bit element. G_TEMP is overwritten as scratch (and S_TEMP via AVX2_S).
 * GAMMA0_1 / GAMMA0_2 are the same sequence cut in two, so that other
 * statements can be placed between the halves; G_TEMP must survive from
 * GAMMA0_1 to GAMMA0_2. */
wolfSSL 11:cee25a834751 1297 #define GAMMA0(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18); \
wolfSSL 11:cee25a834751 1298 XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3); XOR(dest, G_TEMP, dest);
wolfSSL 11:cee25a834751 1299 #define GAMMA0_1(dest, src) AVX2_S(dest, src, 7); AVX2_S(G_TEMP, src, 18);
wolfSSL 11:cee25a834751 1300 #define GAMMA0_2(dest, src) XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 3); \
wolfSSL 11:cee25a834751 1301 XOR(dest, G_TEMP, dest);
wolfSSL 11:cee25a834751 1302
/* GAMMA1(dest, src): dest = ror(src,17) ^ ror(src,19) ^ (src >> 10) in every
 * 32-bit element, with the same scratch usage and the same two-part split as
 * GAMMA0. */
wolfSSL 11:cee25a834751 1303 #define GAMMA1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19); \
wolfSSL 11:cee25a834751 1304 XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); XOR(dest, G_TEMP, dest);
wolfSSL 11:cee25a834751 1305 #define GAMMA1_1(dest, src) AVX2_S(dest, src, 17); AVX2_S(G_TEMP, src, 19);
wolfSSL 11:cee25a834751 1306 #define GAMMA1_2(dest, src) XOR(dest, G_TEMP, dest); AVX2_R(G_TEMP, src, 10); \
wolfSSL 11:cee25a834751 1307 XOR(dest, G_TEMP, dest);
wolfSSL 11:cee25a834751 1308
/* FEEDBACKn_to_W_I_2: copy a pair of freshly computed words out of W_I into
 * W_I_2 so that they can serve as the W[i-2] input of the next step.
 * Each variant shuffles W_I into YMM_TEMP0 with a per-variant vpshufb map and
 * then blends two 32-bit elements of YMM_TEMP0 into W_I_2; the blend
 * immediates 0x0c, 0x30 and 0xc0 select elements 2-3, 4-5 and 6-7.
 * Variant 2 first moves the low 128-bit half of W_I into the high half
 * (vperm2i128 $0x08), because vpshufb cannot move bytes across halves.
 * YMM_TEMP0 is overwritten. */
wolfSSL 11:cee25a834751 1309 #define FEEDBACK1_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP1toW_I_2[0]); \
wolfSSL 11:cee25a834751 1310 BLEND(0x0c, W_I_2, YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1311 #define FEEDBACK2_to_W_I_2 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08); \
wolfSSL 11:cee25a834751 1312 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAP2toW_I_2[0]); BLEND(0x30, W_I_2, YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1313 #define FEEDBACK3_to_W_I_2 MOVE_BYTE(YMM_TEMP0, W_I, mMAP3toW_I_2[0]); \
wolfSSL 11:cee25a834751 1314 BLEND(0xc0, W_I_2, YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1315
/* FEEDBACK_to_W_I_7: same technique, patching only the top 32-bit element of
 * W_I_7 (blend immediate 0x80) with a word taken from the low half of W_I. */
wolfSSL 11:cee25a834751 1316 #define FEEDBACK_to_W_I_7 MOVE_128(YMM_TEMP0, W_I, W_I, 0x08);\
wolfSSL 11:cee25a834751 1317 MOVE_BYTE(YMM_TEMP0, YMM_TEMP0, mMAPtoW_I_7[0]); BLEND(0x80, W_I_7, YMM_TEMP0, W_I_7);
wolfSSL 11:cee25a834751 1318
/* NOTE(review): no macro named `voitle` is defined in the visible part of the
 * file, so this #undef has no effect there; it looks like a leftover
 * (possibly a misspelling of "volatile") - confirm before removing. */
wolfSSL 11:cee25a834751 1319 #undef voitle
wolfSSL 11:cee25a834751 1320
/* Register aliases for the AVX2 code path. They are bare names (no '%')
 * because the AVX2 helper macros add the "%%" prefix themselves.
 * W_I_16, W_I_15, W_I_7, W_I_2 and W_I are named after the W[i-16], W[i-15],
 * W[i-7], W[i-2] and W[i] terms of the message-schedule recurrence. */
wolfSSL 11:cee25a834751 1321 #define W_I_16 ymm8
wolfSSL 11:cee25a834751 1322 #define W_I_15 ymm9
wolfSSL 11:cee25a834751 1323 #define W_I_7 ymm10
wolfSSL 11:cee25a834751 1324 #define W_I_2 ymm11
wolfSSL 11:cee25a834751 1325 #define W_I ymm12
/* Scratch registers: G_TEMP for the GAMMA macros, S_TEMP for AVX2_S. */
wolfSSL 11:cee25a834751 1326 #define G_TEMP ymm13
wolfSSL 11:cee25a834751 1327 #define S_TEMP ymm14
/* YMM_TEMP0 and W_K_TEMP are two names for the same register (ymm15 and its
 * xmm15 low half), so a value held under one name is lost when the other is
 * written. */
wolfSSL 11:cee25a834751 1328 #define YMM_TEMP0 ymm15
wolfSSL 11:cee25a834751 1329 #define YMM_TEMP0x xmm15
wolfSSL 11:cee25a834751 1330 #define W_I_TEMP ymm7
wolfSSL 11:cee25a834751 1331 #define W_K_TEMP ymm15
wolfSSL 11:cee25a834751 1332 #define W_K_TEMPx xmm15
wolfSSL 11:cee25a834751 1333
/* YMM_REGs expands to nothing, so every asm statement that uses it as its
 * clobber list declares no clobbers; the register list is kept only as a
 * comment on the following line. */
wolfSSL 11:cee25a834751 1334 #define YMM_REGs /* Registers are saved in Sha256Update/Final */
wolfSSL 11:cee25a834751 1335 /* "%ymm7","%ymm8","%ymm9","%ymm10","%ymm11","%ymm12","%ymm13","%ymm14","%ymm15"*/
wolfSSL 11:cee25a834751 1336
wolfSSL 11:cee25a834751 1337
/* MOVE_15_to_16(w_i_16, w_i_15, w_i_7): rebuild w_i_16 from w_i_7 with two
 * 32-bit elements patched in from w_i_15 (blend immediates 0x08 and 0x80),
 * then rotate the elements of each 128-bit half by one position
 * (vpshufd $0x93). w_i_15 is overwritten along the way: first with its own
 * halves swapped, then with the halves of w_i_7 swapped. */
wolfSSL 11:cee25a834751 1338 #define MOVE_15_to_16(w_i_16, w_i_15, w_i_7)\
wolfSSL 11:cee25a834751 1339 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_15", %%"#w_i_15", %%"#w_i_15" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1340 __asm__ volatile("vpblendd $0x08, %%"#w_i_15", %%"#w_i_7", %%"#w_i_16" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1341 __asm__ volatile("vperm2i128 $0x01, %%"#w_i_7", %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1342 __asm__ volatile("vpblendd $0x80, %%"#w_i_15", %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1343 __asm__ volatile("vpshufd $0x93, %%"#w_i_16", %%"#w_i_16" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1344
/* MOVE_7_to_15(w_i_15, w_i_7): plain register copy w_i_7 -> w_i_15. */
wolfSSL 11:cee25a834751 1345 #define MOVE_7_to_15(w_i_15, w_i_7)\
wolfSSL 11:cee25a834751 1346 __asm__ volatile("vmovdqu %%"#w_i_7", %%"#w_i_15" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1347
/* MOVE_I_to_7(w_i_7, w_i): w_i_7 = w_i with element 0 replaced by element 0
 * of the half-swapped w_i, followed by a one-position rotation of the
 * elements inside each 128-bit half (vpshufd $0x39). w_i is left unchanged. */
wolfSSL 11:cee25a834751 1348 #define MOVE_I_to_7(w_i_7, w_i)\
wolfSSL 11:cee25a834751 1349 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1350 __asm__ volatile("vpblendd $0x01, %%"#w_i_7", %%"#w_i", %%"#w_i_7" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1351 __asm__ volatile("vpshufd $0x39, %%"#w_i_7", %%"#w_i_7" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1352
/* MOVE_I_to_2(w_i_2, w_i): w_i_2 = half-swapped w_i, then vpshufd $0x0e moves
 * elements 2 and 3 of each half down to positions 0 and 1. w_i is left
 * unchanged. */
wolfSSL 11:cee25a834751 1353 #define MOVE_I_to_2(w_i_2, w_i)\
wolfSSL 11:cee25a834751 1354 __asm__ volatile("vperm2i128 $0x01, %%"#w_i", %%"#w_i", %%"#w_i_2" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1355 __asm__ volatile("vpshufd $0x0e, %%"#w_i_2", %%"#w_i_2" ":::YMM_REGs);\
wolfSSL 11:cee25a834751 1356
/* ROTATE_W: advance all schedule registers by running the four moves above.
 * The order is significant: MOVE_15_to_16 uses w_i_15 as scratch, so
 * MOVE_7_to_15 must reload it afterwards and before MOVE_I_to_7 overwrites
 * w_i_7. */
wolfSSL 11:cee25a834751 1357 #define ROTATE_W(w_i_16, w_i_15, w_i_7, w_i_2, w_i)\
wolfSSL 11:cee25a834751 1358 MOVE_15_to_16(w_i_16, w_i_15, w_i_7); \
wolfSSL 11:cee25a834751 1359 MOVE_7_to_15(w_i_15, w_i_7); \
wolfSSL 11:cee25a834751 1360 MOVE_I_to_7(w_i_7, w_i); \
wolfSSL 11:cee25a834751 1361 MOVE_I_to_2(w_i_2, w_i);\
wolfSSL 11:cee25a834751 1362
/* _RegToDigest(S_0..S_7): add the eight working-state registers into
 * sha256->digest[0..7]. Each register is copied with movl into the local
 * temporary `d`, which is then added to the matching digest word in C.
 * Requires a variable named `sha256` at the expansion site; SSE_REGs (the
 * clobber list) is defined outside this part of the file. The S_n arguments
 * are stringified into the asm template after a single '%'. */
wolfSSL 11:cee25a834751 1363 #define _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 1364 { word32 d;\
wolfSSL 11:cee25a834751 1365 __asm__ volatile("movl %"#S_0", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1366 sha256->digest[0] += d;\
wolfSSL 11:cee25a834751 1367 __asm__ volatile("movl %"#S_1", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1368 sha256->digest[1] += d;\
wolfSSL 11:cee25a834751 1369 __asm__ volatile("movl %"#S_2", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1370 sha256->digest[2] += d;\
wolfSSL 11:cee25a834751 1371 __asm__ volatile("movl %"#S_3", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1372 sha256->digest[3] += d;\
wolfSSL 11:cee25a834751 1373 __asm__ volatile("movl %"#S_4", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1374 sha256->digest[4] += d;\
wolfSSL 11:cee25a834751 1375 __asm__ volatile("movl %"#S_5", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1376 sha256->digest[5] += d;\
wolfSSL 11:cee25a834751 1377 __asm__ volatile("movl %"#S_6", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1378 sha256->digest[6] += d;\
wolfSSL 11:cee25a834751 1379 __asm__ volatile("movl %"#S_7", %0":"=r"(d)::SSE_REGs);\
wolfSSL 11:cee25a834751 1380 sha256->digest[7] += d;\
wolfSSL 11:cee25a834751 1381 }
wolfSSL 11:cee25a834751 1382
/* _DumpS(S_0..S_7): debugging aid. Copies the eight state registers into a
 * local array, prints them with printf as eight zero-padded hex words, and
 * then writes the saved values back into the same registers - presumably
 * because the printf call does not preserve them. */
wolfSSL 11:cee25a834751 1383 #define _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 1384 { word32 d[8];\
wolfSSL 11:cee25a834751 1385 __asm__ volatile("movl %"#S_0", %0":"=r"(d[0])::SSE_REGs);\
wolfSSL 11:cee25a834751 1386 __asm__ volatile("movl %"#S_1", %0":"=r"(d[1])::SSE_REGs);\
wolfSSL 11:cee25a834751 1387 __asm__ volatile("movl %"#S_2", %0":"=r"(d[2])::SSE_REGs);\
wolfSSL 11:cee25a834751 1388 __asm__ volatile("movl %"#S_3", %0":"=r"(d[3])::SSE_REGs);\
wolfSSL 11:cee25a834751 1389 __asm__ volatile("movl %"#S_4", %0":"=r"(d[4])::SSE_REGs);\
wolfSSL 11:cee25a834751 1390 __asm__ volatile("movl %"#S_5", %0":"=r"(d[5])::SSE_REGs);\
wolfSSL 11:cee25a834751 1391 __asm__ volatile("movl %"#S_6", %0":"=r"(d[6])::SSE_REGs);\
wolfSSL 11:cee25a834751 1392 __asm__ volatile("movl %"#S_7", %0":"=r"(d[7])::SSE_REGs);\
wolfSSL 11:cee25a834751 1393 printf("S[0..7]=%08x,%08x,%08x,%08x,%08x,%08x,%08x,%08x\n", d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7]);\
wolfSSL 11:cee25a834751 1394 __asm__ volatile("movl %0, %"#S_0::"r"(d[0]):SSE_REGs);\
wolfSSL 11:cee25a834751 1395 __asm__ volatile("movl %0, %"#S_1::"r"(d[1]):SSE_REGs);\
wolfSSL 11:cee25a834751 1396 __asm__ volatile("movl %0, %"#S_2::"r"(d[2]):SSE_REGs);\
wolfSSL 11:cee25a834751 1397 __asm__ volatile("movl %0, %"#S_3::"r"(d[3]):SSE_REGs);\
wolfSSL 11:cee25a834751 1398 __asm__ volatile("movl %0, %"#S_4::"r"(d[4]):SSE_REGs);\
wolfSSL 11:cee25a834751 1399 __asm__ volatile("movl %0, %"#S_5::"r"(d[5]):SSE_REGs);\
wolfSSL 11:cee25a834751 1400 __asm__ volatile("movl %0, %"#S_6::"r"(d[6]):SSE_REGs);\
wolfSSL 11:cee25a834751 1401 __asm__ volatile("movl %0, %"#S_7::"r"(d[7]):SSE_REGs);\
wolfSSL 11:cee25a834751 1402 }
wolfSSL 11:cee25a834751 1403
wolfSSL 11:cee25a834751 1404
/* Expansion wrappers: the extra macro level lets the S_n arguments be
 * replaced by their register names before the underscore-prefixed macros
 * stringify them. _DigestToReg is defined outside this part of the file. */
wolfSSL 11:cee25a834751 1405 #define DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 1406 _DigestToReg(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 11:cee25a834751 1407
wolfSSL 11:cee25a834751 1408 #define RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 1409 _RegToDigest(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 11:cee25a834751 1410
/* Debug wrapper for _DumpS; note that the public name is spelled DumS. */
wolfSSL 11:cee25a834751 1411 #define DumS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )\
wolfSSL 11:cee25a834751 1412 _DumpS(S_0, S_1, S_2, S_3, S_4, S_5, S_6, S_7 )
wolfSSL 11:cee25a834751 1413
wolfSSL 11:cee25a834751 1414
/* vpshufb control tables for the AVX2 code path, 256 bits (four 64-bit words)
 * each. A control byte of 0x80 makes vpshufb write zero to that position.
 *
 * The element type is word64, as for the AVX1 tables, so that every
 * initializer is stored as a full 64-bit value and each table is 32 bytes
 * long regardless of how wide `unsigned long` is on the target ABI.
 *
 *   mBYTE_FLIP_MASK_16 / _15  reverse the bytes of all eight 32-bit words
 *   mBYTE_FLIP_MASK_7         same, but the last word is forced to zero
 *   mBYTE_FLIP_MASK_2         keeps (byte-reversed) only the first two words
 */
wolfSSL 11:cee25a834751 1415 /* Byte-swap masks; 0x80 control bytes zero the words that are not valid yet. */
wolfSSL 11:cee25a834751 1416 static const word64 mBYTE_FLIP_MASK_16[] =
wolfSSL 11:cee25a834751 1417 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
wolfSSL 11:cee25a834751 1418 static const word64 mBYTE_FLIP_MASK_15[] =
wolfSSL 11:cee25a834751 1419 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x0c0d0e0f08090a0b };
wolfSSL 11:cee25a834751 1420 static const word64 mBYTE_FLIP_MASK_7 [] =
wolfSSL 11:cee25a834751 1421 { 0x0405060700010203, 0x0c0d0e0f08090a0b, 0x0405060700010203, 0x8080808008090a0b };
wolfSSL 11:cee25a834751 1422 static const word64 mBYTE_FLIP_MASK_2 [] =
wolfSSL 11:cee25a834751 1423 { 0x0405060700010203, 0x8080808080808080, 0x8080808080808080, 0x8080808080808080 };
wolfSSL 11:cee25a834751 1424
/* Maps used by the FEEDBACK macros: each one routes source bytes into a
 * single 64-bit slot (only the upper four bytes of it for mMAPtoW_I_7) and
 * zeroes everything else, so the result can be blended into W_I_7 / W_I_2. */
wolfSSL 11:cee25a834751 1425 static const word64 mMAPtoW_I_7[] =
wolfSSL 11:cee25a834751 1426 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0302010080808080 };
wolfSSL 11:cee25a834751 1427 static const word64 mMAP1toW_I_2[] =
wolfSSL 11:cee25a834751 1428 { 0x8080808080808080, 0x0706050403020100, 0x8080808080808080, 0x8080808080808080 };
wolfSSL 11:cee25a834751 1429 static const word64 mMAP2toW_I_2[] =
wolfSSL 11:cee25a834751 1430 { 0x8080808080808080, 0x8080808080808080, 0x0f0e0d0c0b0a0908, 0x8080808080808080 };
wolfSSL 11:cee25a834751 1431 static const word64 mMAP3toW_I_2[] =
wolfSSL 11:cee25a834751 1432 { 0x8080808080808080, 0x8080808080808080, 0x8080808080808080, 0x0706050403020100 };
wolfSSL 11:cee25a834751 1433
wolfSSL 11:cee25a834751 1434 static int Transform_AVX2(Sha256* sha256)
wolfSSL 11:cee25a834751 1435 {
wolfSSL 11:cee25a834751 1436 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 11:cee25a834751 1437 word32* W_K;
wolfSSL 11:cee25a834751 1438 W_K = (word32*) XMALLOC(sizeof(word32) * 64, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 11:cee25a834751 1439 if (W_K == NULL)
wolfSSL 11:cee25a834751 1440 return MEMORY_E;
wolfSSL 11:cee25a834751 1441 #else
wolfSSL 11:cee25a834751 1442 word32 W_K[64];
wolfSSL 11:cee25a834751 1443 #endif
wolfSSL 11:cee25a834751 1444
wolfSSL 11:cee25a834751 1445 MOVE_to_REG(W_I_16, sha256->buffer[0]); BYTE_SWAP(W_I_16, mBYTE_FLIP_MASK_16[0]);
wolfSSL 11:cee25a834751 1446 MOVE_to_REG(W_I_15, sha256->buffer[1]); BYTE_SWAP(W_I_15, mBYTE_FLIP_MASK_15[0]);
wolfSSL 11:cee25a834751 1447 MOVE_to_REG(W_I, sha256->buffer[8]); BYTE_SWAP(W_I, mBYTE_FLIP_MASK_16[0]);
wolfSSL 11:cee25a834751 1448 MOVE_to_REG(W_I_7, sha256->buffer[16-7]); BYTE_SWAP(W_I_7, mBYTE_FLIP_MASK_7[0]);
wolfSSL 11:cee25a834751 1449 MOVE_to_REG(W_I_2, sha256->buffer[16-2]); BYTE_SWAP(W_I_2, mBYTE_FLIP_MASK_2[0]);
wolfSSL 11:cee25a834751 1450
wolfSSL 11:cee25a834751 1451 DigestToReg(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
wolfSSL 11:cee25a834751 1452
wolfSSL 11:cee25a834751 1453 ADD_MEM(W_K_TEMP, W_I_16, K[0]);
wolfSSL 11:cee25a834751 1454 MOVE_to_MEM(W_K[0], W_K_TEMP);
wolfSSL 11:cee25a834751 1455
wolfSSL 11:cee25a834751 1456 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,0);
wolfSSL 11:cee25a834751 1457 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,1);
wolfSSL 11:cee25a834751 1458 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,2);
wolfSSL 11:cee25a834751 1459 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,3);
wolfSSL 11:cee25a834751 1460 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,4);
wolfSSL 11:cee25a834751 1461 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,5);
wolfSSL 11:cee25a834751 1462 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,6);
wolfSSL 11:cee25a834751 1463 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,7);
wolfSSL 11:cee25a834751 1464
wolfSSL 11:cee25a834751 1465 ADD_MEM(YMM_TEMP0, W_I, K[8]);
wolfSSL 11:cee25a834751 1466 MOVE_to_MEM(W_K[8], YMM_TEMP0);
wolfSSL 11:cee25a834751 1467
wolfSSL 11:cee25a834751 1468 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15] + W[i-16]) */
wolfSSL 11:cee25a834751 1469 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
wolfSSL 11:cee25a834751 1470 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1471 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
wolfSSL 11:cee25a834751 1472 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1473 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,8);
wolfSSL 11:cee25a834751 1474 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1475 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
wolfSSL 11:cee25a834751 1476 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1477 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
wolfSSL 11:cee25a834751 1478 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1479 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,9);
wolfSSL 11:cee25a834751 1480 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1481 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
wolfSSL 11:cee25a834751 1482 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1483 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
wolfSSL 11:cee25a834751 1484 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1485 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,10);
wolfSSL 11:cee25a834751 1486 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1487 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
wolfSSL 11:cee25a834751 1488 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1489 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
wolfSSL 11:cee25a834751 1490 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1491 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,11);
wolfSSL 11:cee25a834751 1492 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1493 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
wolfSSL 11:cee25a834751 1494 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1495 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
wolfSSL 11:cee25a834751 1496 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1497 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,12);
wolfSSL 11:cee25a834751 1498 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1499 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
wolfSSL 11:cee25a834751 1500 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1501 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
wolfSSL 11:cee25a834751 1502 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1503 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,13);
wolfSSL 11:cee25a834751 1504 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1505 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
wolfSSL 11:cee25a834751 1506 GAMMA1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1507 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
wolfSSL 11:cee25a834751 1508 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,14);
wolfSSL 11:cee25a834751 1509 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1510 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
wolfSSL 11:cee25a834751 1511
wolfSSL 11:cee25a834751 1512 MOVE_to_REG(YMM_TEMP0, K[16]);
wolfSSL 11:cee25a834751 1513 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
wolfSSL 11:cee25a834751 1514 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1515 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,15);
wolfSSL 11:cee25a834751 1516 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1517 MOVE_to_MEM(W_K[16], YMM_TEMP0);
wolfSSL 11:cee25a834751 1518
wolfSSL 11:cee25a834751 1519 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 11:cee25a834751 1520 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
wolfSSL 11:cee25a834751 1521 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1522 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
wolfSSL 11:cee25a834751 1523 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1524 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,16);
wolfSSL 11:cee25a834751 1525 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1526 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
wolfSSL 11:cee25a834751 1527 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1528 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
wolfSSL 11:cee25a834751 1529 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1530 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,17);
wolfSSL 11:cee25a834751 1531 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1532 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
wolfSSL 11:cee25a834751 1533 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1534 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
wolfSSL 11:cee25a834751 1535 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1536 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,18);
wolfSSL 11:cee25a834751 1537 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1538 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
wolfSSL 11:cee25a834751 1539 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1540 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
wolfSSL 11:cee25a834751 1541 GAMMA1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1542 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,19);
wolfSSL 11:cee25a834751 1543 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1544 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
wolfSSL 11:cee25a834751 1545 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1546 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
wolfSSL 11:cee25a834751 1547 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1548 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,20);
wolfSSL 11:cee25a834751 1549 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1550 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
wolfSSL 11:cee25a834751 1551 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1552 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
wolfSSL 11:cee25a834751 1553 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1554 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,21);
wolfSSL 11:cee25a834751 1555 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1556 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
wolfSSL 11:cee25a834751 1557 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1558 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
wolfSSL 11:cee25a834751 1559 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1560 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,22);
wolfSSL 11:cee25a834751 1561 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1562 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
wolfSSL 11:cee25a834751 1563
wolfSSL 11:cee25a834751 1564 MOVE_to_REG(YMM_TEMP0, K[24]);
wolfSSL 11:cee25a834751 1565 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
wolfSSL 11:cee25a834751 1566 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1567 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,23);
wolfSSL 11:cee25a834751 1568 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1569 MOVE_to_MEM(W_K[24], YMM_TEMP0);
wolfSSL 11:cee25a834751 1570
wolfSSL 11:cee25a834751 1571 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 11:cee25a834751 1572 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
wolfSSL 11:cee25a834751 1573 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1574 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
wolfSSL 11:cee25a834751 1575 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1576 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,24);
wolfSSL 11:cee25a834751 1577 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1578 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
wolfSSL 11:cee25a834751 1579 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1580 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
wolfSSL 11:cee25a834751 1581 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1582 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,25);
wolfSSL 11:cee25a834751 1583 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1584 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
wolfSSL 11:cee25a834751 1585 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1586 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
wolfSSL 11:cee25a834751 1587 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1588 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,26);
wolfSSL 11:cee25a834751 1589 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1590 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
wolfSSL 11:cee25a834751 1591 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1592 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
wolfSSL 11:cee25a834751 1593 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1594 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,27);
wolfSSL 11:cee25a834751 1595 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1596 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
wolfSSL 11:cee25a834751 1597 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1598 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
wolfSSL 11:cee25a834751 1599 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1600 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,28);
wolfSSL 11:cee25a834751 1601 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1602 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
wolfSSL 11:cee25a834751 1603 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1604 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
wolfSSL 11:cee25a834751 1605 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1606 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,29);
wolfSSL 11:cee25a834751 1607 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1608 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
wolfSSL 11:cee25a834751 1609 GAMMA1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1610 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
wolfSSL 11:cee25a834751 1611 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,30);
wolfSSL 11:cee25a834751 1612 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1613 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
wolfSSL 11:cee25a834751 1614
wolfSSL 11:cee25a834751 1615 MOVE_to_REG(YMM_TEMP0, K[32]);
wolfSSL 11:cee25a834751 1616 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
wolfSSL 11:cee25a834751 1617 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1618 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,31);
wolfSSL 11:cee25a834751 1619 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1620 MOVE_to_MEM(W_K[32], YMM_TEMP0);
wolfSSL 11:cee25a834751 1621
wolfSSL 11:cee25a834751 1622
wolfSSL 11:cee25a834751 1623 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 11:cee25a834751 1624 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
wolfSSL 11:cee25a834751 1625 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1626 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
wolfSSL 11:cee25a834751 1627 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1628 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,32);
wolfSSL 11:cee25a834751 1629 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1630 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
wolfSSL 11:cee25a834751 1631 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1632 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
wolfSSL 11:cee25a834751 1633 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1634 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,33);
wolfSSL 11:cee25a834751 1635 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1636 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
wolfSSL 11:cee25a834751 1637 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1638 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
wolfSSL 11:cee25a834751 1639 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1640 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,34);
wolfSSL 11:cee25a834751 1641 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1642 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
wolfSSL 11:cee25a834751 1643 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1644 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
wolfSSL 11:cee25a834751 1645 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1646 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,35);
wolfSSL 11:cee25a834751 1647 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1648 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
wolfSSL 11:cee25a834751 1649 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1650 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
wolfSSL 11:cee25a834751 1651 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1652 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,36);
wolfSSL 11:cee25a834751 1653 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1654 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
wolfSSL 11:cee25a834751 1655 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1656 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
wolfSSL 11:cee25a834751 1657 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1658 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,37);
wolfSSL 11:cee25a834751 1659 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1660 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
wolfSSL 11:cee25a834751 1661 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1662 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
wolfSSL 11:cee25a834751 1663 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1664 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,38);
wolfSSL 11:cee25a834751 1665 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1666 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
wolfSSL 11:cee25a834751 1667
wolfSSL 11:cee25a834751 1668 MOVE_to_REG(YMM_TEMP0, K[40]);
wolfSSL 11:cee25a834751 1669 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
wolfSSL 11:cee25a834751 1670 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1671 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,39);
wolfSSL 11:cee25a834751 1672 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1673 MOVE_to_MEM(W_K[40], YMM_TEMP0);
wolfSSL 11:cee25a834751 1674
wolfSSL 11:cee25a834751 1675 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 11:cee25a834751 1676 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
wolfSSL 11:cee25a834751 1677 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1678 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
wolfSSL 11:cee25a834751 1679 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1680 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,40);
wolfSSL 11:cee25a834751 1681 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1682 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
wolfSSL 11:cee25a834751 1683 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1684 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
wolfSSL 11:cee25a834751 1685 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1686 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,41);
wolfSSL 11:cee25a834751 1687 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1688 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
wolfSSL 11:cee25a834751 1689 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1690 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
wolfSSL 11:cee25a834751 1691 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1692 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,42);
wolfSSL 11:cee25a834751 1693 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1694 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
wolfSSL 11:cee25a834751 1695 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1696 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
wolfSSL 11:cee25a834751 1697 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1698 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,43);
wolfSSL 11:cee25a834751 1699 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1700 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
wolfSSL 11:cee25a834751 1701 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1702 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
wolfSSL 11:cee25a834751 1703 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1704 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,44);
wolfSSL 11:cee25a834751 1705 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1706 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
wolfSSL 11:cee25a834751 1707 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1708 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
wolfSSL 11:cee25a834751 1709 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1710 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,45);
wolfSSL 11:cee25a834751 1711 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1712 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
wolfSSL 11:cee25a834751 1713 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1714 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
wolfSSL 11:cee25a834751 1715 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1716 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,46);
wolfSSL 11:cee25a834751 1717 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1718 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
wolfSSL 11:cee25a834751 1719
wolfSSL 11:cee25a834751 1720 MOVE_to_REG(YMM_TEMP0, K[48]);
wolfSSL 11:cee25a834751 1721 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
wolfSSL 11:cee25a834751 1722 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1723 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,47);
wolfSSL 11:cee25a834751 1724 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1725 MOVE_to_MEM(W_K[48], YMM_TEMP0);
wolfSSL 11:cee25a834751 1726
wolfSSL 11:cee25a834751 1727 /* W[i] = Gamma1(W[i-2]) + W[i-7] + Gamma0(W[i-15]) + W[i-16] */
wolfSSL 11:cee25a834751 1728 RND_0_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
wolfSSL 11:cee25a834751 1729 GAMMA0_1(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1730 RND_0_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
wolfSSL 11:cee25a834751 1731 GAMMA0_2(W_I_TEMP, W_I_15);
wolfSSL 11:cee25a834751 1732 RND_0_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,48);
wolfSSL 11:cee25a834751 1733 ADD(W_I_TEMP, W_I_16, W_I_TEMP);/* for saving W_I before adding incomplete W_I_7 */
wolfSSL 11:cee25a834751 1734 RND_7_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
wolfSSL 11:cee25a834751 1735 ADD(W_I, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1736 RND_7_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
wolfSSL 11:cee25a834751 1737 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1738 RND_7_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,49);
wolfSSL 11:cee25a834751 1739 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1740 RND_6_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
wolfSSL 11:cee25a834751 1741 ADD(W_I, W_I, YMM_TEMP0);/* now W[16..17] are completed */
wolfSSL 11:cee25a834751 1742 RND_6_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
wolfSSL 11:cee25a834751 1743 FEEDBACK1_to_W_I_2;
wolfSSL 11:cee25a834751 1744 RND_6_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,50);
wolfSSL 11:cee25a834751 1745 FEEDBACK_to_W_I_7;
wolfSSL 11:cee25a834751 1746 RND_5_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
wolfSSL 11:cee25a834751 1747 ADD(W_I_TEMP, W_I_7, W_I_TEMP);
wolfSSL 11:cee25a834751 1748 RND_5_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
wolfSSL 11:cee25a834751 1749 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1750 RND_5_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,51);
wolfSSL 11:cee25a834751 1751 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1752 RND_4_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
wolfSSL 11:cee25a834751 1753 ADD(W_I, W_I_TEMP, YMM_TEMP0);/* now W[16..19] are completed */
wolfSSL 11:cee25a834751 1754 RND_4_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
wolfSSL 11:cee25a834751 1755 FEEDBACK2_to_W_I_2;
wolfSSL 11:cee25a834751 1756 RND_4_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,52);
wolfSSL 11:cee25a834751 1757 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1758 RND_3_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
wolfSSL 11:cee25a834751 1759 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1760 RND_3_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
wolfSSL 11:cee25a834751 1761 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..21] are completed */
wolfSSL 11:cee25a834751 1762 RND_3_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,53);
wolfSSL 11:cee25a834751 1763 FEEDBACK3_to_W_I_2;
wolfSSL 11:cee25a834751 1764 RND_2_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
wolfSSL 11:cee25a834751 1765 GAMMA1_1(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1766 RND_2_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
wolfSSL 11:cee25a834751 1767 GAMMA1_2(YMM_TEMP0, W_I_2);
wolfSSL 11:cee25a834751 1768 RND_2_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,54);
wolfSSL 11:cee25a834751 1769 ADD(W_I, W_I_TEMP, YMM_TEMP0); /* now W[16..23] are completed */
wolfSSL 11:cee25a834751 1770 RND_1_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
wolfSSL 11:cee25a834751 1771
wolfSSL 11:cee25a834751 1772 MOVE_to_REG(YMM_TEMP0, K[56]);
wolfSSL 11:cee25a834751 1773 RND_1_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
wolfSSL 11:cee25a834751 1774 ROTATE_W(W_I_16, W_I_15, W_I_7, W_I_2, W_I);
wolfSSL 11:cee25a834751 1775 RND_1_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,55);
wolfSSL 11:cee25a834751 1776 ADD(YMM_TEMP0, YMM_TEMP0, W_I);
wolfSSL 11:cee25a834751 1777 MOVE_to_MEM(W_K[56], YMM_TEMP0);
wolfSSL 11:cee25a834751 1778
wolfSSL 11:cee25a834751 1779 RND_0(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,56);
wolfSSL 11:cee25a834751 1780 RND_7(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,57);
wolfSSL 11:cee25a834751 1781 RND_6(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,58);
wolfSSL 11:cee25a834751 1782 RND_5(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,59);
wolfSSL 11:cee25a834751 1783
wolfSSL 11:cee25a834751 1784 RND_4(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,60);
wolfSSL 11:cee25a834751 1785 RND_3(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,61);
wolfSSL 11:cee25a834751 1786 RND_2(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,62);
wolfSSL 11:cee25a834751 1787 RND_1(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7,63);
wolfSSL 11:cee25a834751 1788
wolfSSL 11:cee25a834751 1789 RegToDigest(S_0,S_1,S_2,S_3,S_4,S_5,S_6,S_7);
wolfSSL 11:cee25a834751 1790
wolfSSL 11:cee25a834751 1791 #ifdef WOLFSSL_SMALL_STACK
wolfSSL 11:cee25a834751 1792 XFREE(W_K, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 11:cee25a834751 1793 #endif
wolfSSL 11:cee25a834751 1794
wolfSSL 11:cee25a834751 1795 return 0;
wolfSSL 11:cee25a834751 1796 }
wolfSSL 11:cee25a834751 1797
wolfSSL 11:cee25a834751 1798 #endif /* HAVE_INTEL_AVX2 */
wolfSSL 11:cee25a834751 1799
wolfSSL 11:cee25a834751 1800
wolfSSL 11:cee25a834751 1801 #ifdef WOLFSSL_SHA224
wolfSSL 11:cee25a834751 1802 static int InitSha224(Sha224* sha224)
wolfSSL 11:cee25a834751 1803 {
wolfSSL 11:cee25a834751 1804 int ret = 0;
wolfSSL 11:cee25a834751 1805
wolfSSL 11:cee25a834751 1806 sha224->digest[0] = 0xc1059ed8;
wolfSSL 11:cee25a834751 1807 sha224->digest[1] = 0x367cd507;
wolfSSL 11:cee25a834751 1808 sha224->digest[2] = 0x3070dd17;
wolfSSL 11:cee25a834751 1809 sha224->digest[3] = 0xf70e5939;
wolfSSL 11:cee25a834751 1810 sha224->digest[4] = 0xffc00b31;
wolfSSL 11:cee25a834751 1811 sha224->digest[5] = 0x68581511;
wolfSSL 11:cee25a834751 1812 sha224->digest[6] = 0x64f98fa7;
wolfSSL 11:cee25a834751 1813 sha224->digest[7] = 0xbefa4fa4;
wolfSSL 11:cee25a834751 1814
wolfSSL 11:cee25a834751 1815 sha224->buffLen = 0;
wolfSSL 11:cee25a834751 1816 sha224->loLen = 0;
wolfSSL 11:cee25a834751 1817 sha224->hiLen = 0;
wolfSSL 11:cee25a834751 1818
wolfSSL 11:cee25a834751 1819 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 11:cee25a834751 1820 /* choose best Transform function under this runtime environment */
wolfSSL 11:cee25a834751 1821 set_Transform();
wolfSSL 11:cee25a834751 1822 #endif
wolfSSL 11:cee25a834751 1823
wolfSSL 11:cee25a834751 1824 return ret;
wolfSSL 11:cee25a834751 1825 }
wolfSSL 11:cee25a834751 1826
wolfSSL 11:cee25a834751 1827 int wc_InitSha224_ex(Sha224* sha224, void* heap, int devId)
wolfSSL 11:cee25a834751 1828 {
wolfSSL 11:cee25a834751 1829 int ret = 0;
wolfSSL 11:cee25a834751 1830
wolfSSL 11:cee25a834751 1831 if (sha224 == NULL)
wolfSSL 11:cee25a834751 1832 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 1833
wolfSSL 11:cee25a834751 1834 sha224->heap = heap;
wolfSSL 11:cee25a834751 1835
wolfSSL 11:cee25a834751 1836 ret = InitSha224(sha224);
wolfSSL 11:cee25a834751 1837 if (ret != 0)
wolfSSL 11:cee25a834751 1838 return ret;
wolfSSL 11:cee25a834751 1839
wolfSSL 11:cee25a834751 1840 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
wolfSSL 11:cee25a834751 1841 ret = wolfAsync_DevCtxInit(&sha224->asyncDev,
wolfSSL 11:cee25a834751 1842 WOLFSSL_ASYNC_MARKER_SHA224, sha224->heap, devId);
wolfSSL 11:cee25a834751 1843 #else
wolfSSL 11:cee25a834751 1844 (void)devId;
wolfSSL 11:cee25a834751 1845 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 1846
wolfSSL 11:cee25a834751 1847 return ret;
wolfSSL 11:cee25a834751 1848 }
wolfSSL 11:cee25a834751 1849
wolfSSL 11:cee25a834751 1850 int wc_InitSha224(Sha224* sha224)
wolfSSL 11:cee25a834751 1851 {
wolfSSL 11:cee25a834751 1852 return wc_InitSha224_ex(sha224, NULL, INVALID_DEVID);
wolfSSL 11:cee25a834751 1853 }
wolfSSL 11:cee25a834751 1854
wolfSSL 11:cee25a834751 1855 int wc_Sha224Update(Sha224* sha224, const byte* data, word32 len)
wolfSSL 11:cee25a834751 1856 {
wolfSSL 11:cee25a834751 1857 int ret;
wolfSSL 11:cee25a834751 1858
wolfSSL 11:cee25a834751 1859 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
wolfSSL 11:cee25a834751 1860 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
wolfSSL 11:cee25a834751 1861 #if defined(HAVE_INTEL_QA)
wolfSSL 11:cee25a834751 1862 return IntelQaSymSha224(&sha224->asyncDev, NULL, data, len);
wolfSSL 11:cee25a834751 1863 #endif
wolfSSL 11:cee25a834751 1864 }
wolfSSL 11:cee25a834751 1865 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 1866
wolfSSL 11:cee25a834751 1867 ret = Sha256Update((Sha256 *)sha224, data, len);
wolfSSL 11:cee25a834751 1868
wolfSSL 11:cee25a834751 1869 return ret;
wolfSSL 11:cee25a834751 1870 }
wolfSSL 11:cee25a834751 1871
wolfSSL 11:cee25a834751 1872 int wc_Sha224Final(Sha224* sha224, byte* hash)
wolfSSL 11:cee25a834751 1873 {
wolfSSL 11:cee25a834751 1874 int ret;
wolfSSL 11:cee25a834751 1875
wolfSSL 11:cee25a834751 1876 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
wolfSSL 11:cee25a834751 1877 if (sha224->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA224) {
wolfSSL 11:cee25a834751 1878 #if defined(HAVE_INTEL_QA)
wolfSSL 11:cee25a834751 1879 return IntelQaSymSha224(&sha224->asyncDev, hash, NULL,
wolfSSL 11:cee25a834751 1880 SHA224_DIGEST_SIZE);
wolfSSL 11:cee25a834751 1881 #endif
wolfSSL 11:cee25a834751 1882 }
wolfSSL 11:cee25a834751 1883 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 1884
wolfSSL 11:cee25a834751 1885 ret = Sha256Final((Sha256*)sha224);
wolfSSL 11:cee25a834751 1886 if (ret != 0)
wolfSSL 11:cee25a834751 1887 return ret;
wolfSSL 11:cee25a834751 1888
wolfSSL 11:cee25a834751 1889 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 11:cee25a834751 1890 ByteReverseWords(sha224->digest, sha224->digest, SHA224_DIGEST_SIZE);
wolfSSL 11:cee25a834751 1891 #endif
wolfSSL 11:cee25a834751 1892 XMEMCPY(hash, sha224->digest, SHA224_DIGEST_SIZE);
wolfSSL 11:cee25a834751 1893
wolfSSL 11:cee25a834751 1894 return InitSha224(sha224); /* reset state */
wolfSSL 11:cee25a834751 1895 }
wolfSSL 11:cee25a834751 1896
wolfSSL 11:cee25a834751 1897 void wc_Sha224Free(Sha224* sha224)
wolfSSL 11:cee25a834751 1898 {
wolfSSL 11:cee25a834751 1899 if (sha224 == NULL)
wolfSSL 11:cee25a834751 1900 return;
wolfSSL 11:cee25a834751 1901
wolfSSL 11:cee25a834751 1902 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA224)
wolfSSL 11:cee25a834751 1903 wolfAsync_DevCtxFree(&sha224->asyncDev, WOLFSSL_ASYNC_MARKER_SHA224);
wolfSSL 11:cee25a834751 1904 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 1905 }
wolfSSL 11:cee25a834751 1906
wolfSSL 11:cee25a834751 1907 #endif /* WOLFSSL_SHA224 */
wolfSSL 11:cee25a834751 1908
wolfSSL 11:cee25a834751 1909
wolfSSL 11:cee25a834751 1910 int wc_InitSha256(Sha256* sha256)
wolfSSL 11:cee25a834751 1911 {
wolfSSL 11:cee25a834751 1912 return wc_InitSha256_ex(sha256, NULL, INVALID_DEVID);
wolfSSL 11:cee25a834751 1913 }
wolfSSL 11:cee25a834751 1914
wolfSSL 11:cee25a834751 1915 void wc_Sha256Free(Sha256* sha256)
wolfSSL 11:cee25a834751 1916 {
wolfSSL 11:cee25a834751 1917 if (sha256 == NULL)
wolfSSL 11:cee25a834751 1918 return;
wolfSSL 11:cee25a834751 1919
wolfSSL 11:cee25a834751 1920 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA256)
wolfSSL 11:cee25a834751 1921 wolfAsync_DevCtxFree(&sha256->asyncDev, WOLFSSL_ASYNC_MARKER_SHA256);
wolfSSL 11:cee25a834751 1922 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 11:cee25a834751 1923 }
wolfSSL 11:cee25a834751 1924
wolfSSL 11:cee25a834751 1925 #endif /* !WOLFSSL_TI_HASH */
wolfSSL 11:cee25a834751 1926 #endif /* HAVE_FIPS */
wolfSSL 11:cee25a834751 1927
wolfSSL 11:cee25a834751 1928
wolfSSL 11:cee25a834751 1929 #ifndef WOLFSSL_TI_HASH
wolfSSL 11:cee25a834751 1930 #ifdef WOLFSSL_SHA224
wolfSSL 11:cee25a834751 1931 int wc_Sha224GetHash(Sha224* sha224, byte* hash)
wolfSSL 11:cee25a834751 1932 {
wolfSSL 11:cee25a834751 1933 int ret;
wolfSSL 11:cee25a834751 1934 Sha224 tmpSha224;
wolfSSL 11:cee25a834751 1935
wolfSSL 11:cee25a834751 1936 if (sha224 == NULL || hash == NULL)
wolfSSL 11:cee25a834751 1937 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 1938
wolfSSL 11:cee25a834751 1939 ret = wc_Sha224Copy(sha224, &tmpSha224);
wolfSSL 11:cee25a834751 1940 if (ret == 0) {
wolfSSL 11:cee25a834751 1941 ret = wc_Sha224Final(&tmpSha224, hash);
wolfSSL 11:cee25a834751 1942 }
wolfSSL 11:cee25a834751 1943 return ret;
wolfSSL 11:cee25a834751 1944 }
wolfSSL 11:cee25a834751 1945 int wc_Sha224Copy(Sha224* src, Sha224* dst)
wolfSSL 11:cee25a834751 1946 {
wolfSSL 11:cee25a834751 1947 int ret = 0;
wolfSSL 11:cee25a834751 1948
wolfSSL 11:cee25a834751 1949 if (src == NULL || dst == NULL)
wolfSSL 11:cee25a834751 1950 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 1951
wolfSSL 11:cee25a834751 1952 XMEMCPY(dst, src, sizeof(Sha224));
wolfSSL 11:cee25a834751 1953
wolfSSL 11:cee25a834751 1954 #ifdef WOLFSSL_ASYNC_CRYPT
wolfSSL 11:cee25a834751 1955 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
wolfSSL 11:cee25a834751 1956 #endif
wolfSSL 11:cee25a834751 1957
wolfSSL 11:cee25a834751 1958 return ret;
wolfSSL 11:cee25a834751 1959 }
wolfSSL 11:cee25a834751 1960 #endif /* WOLFSSL_SHA224 */
wolfSSL 11:cee25a834751 1961
wolfSSL 11:cee25a834751 1962 int wc_Sha256GetHash(Sha256* sha256, byte* hash)
wolfSSL 11:cee25a834751 1963 {
wolfSSL 11:cee25a834751 1964 int ret;
wolfSSL 11:cee25a834751 1965 Sha256 tmpSha256;
wolfSSL 11:cee25a834751 1966
wolfSSL 11:cee25a834751 1967 if (sha256 == NULL || hash == NULL)
wolfSSL 11:cee25a834751 1968 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 1969
wolfSSL 11:cee25a834751 1970 ret = wc_Sha256Copy(sha256, &tmpSha256);
wolfSSL 11:cee25a834751 1971 if (ret == 0) {
wolfSSL 11:cee25a834751 1972 ret = wc_Sha256Final(&tmpSha256, hash);
wolfSSL 11:cee25a834751 1973 }
wolfSSL 11:cee25a834751 1974 return ret;
wolfSSL 11:cee25a834751 1975 }
wolfSSL 11:cee25a834751 1976 int wc_Sha256Copy(Sha256* src, Sha256* dst)
wolfSSL 11:cee25a834751 1977 {
wolfSSL 11:cee25a834751 1978 int ret = 0;
wolfSSL 11:cee25a834751 1979
wolfSSL 11:cee25a834751 1980 if (src == NULL || dst == NULL)
wolfSSL 11:cee25a834751 1981 return BAD_FUNC_ARG;
wolfSSL 11:cee25a834751 1982
wolfSSL 11:cee25a834751 1983 XMEMCPY(dst, src, sizeof(Sha256));
wolfSSL 11:cee25a834751 1984
wolfSSL 11:cee25a834751 1985 #ifdef WOLFSSL_ASYNC_CRYPT
wolfSSL 11:cee25a834751 1986 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
wolfSSL 11:cee25a834751 1987 #endif
wolfSSL 11:cee25a834751 1988
wolfSSL 11:cee25a834751 1989 return ret;
wolfSSL 11:cee25a834751 1990 }
wolfSSL 11:cee25a834751 1991 #endif /* !WOLFSSL_TI_HASH */
wolfSSL 11:cee25a834751 1992
wolfSSL 11:cee25a834751 1993 #endif /* NO_SHA256 */
wolfSSL 11:cee25a834751 1994