Xuyi Wang / wolfSSL

Dependents:   OS

Committer:
wolfSSL
Date:
Sat Aug 18 22:20:43 2018 +0000
Revision:
15:117db924cf7c
wolfSSL 3.15.3

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 15:117db924cf7c 1 /* sha512.c
wolfSSL 15:117db924cf7c 2 *
wolfSSL 15:117db924cf7c 3 * Copyright (C) 2006-2017 wolfSSL Inc.
wolfSSL 15:117db924cf7c 4 *
wolfSSL 15:117db924cf7c 5 * This file is part of wolfSSL.
wolfSSL 15:117db924cf7c 6 *
wolfSSL 15:117db924cf7c 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 15:117db924cf7c 8 * it under the terms of the GNU General Public License as published by
wolfSSL 15:117db924cf7c 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 15:117db924cf7c 10 * (at your option) any later version.
wolfSSL 15:117db924cf7c 11 *
wolfSSL 15:117db924cf7c 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 15:117db924cf7c 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 15:117db924cf7c 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 15:117db924cf7c 15 * GNU General Public License for more details.
wolfSSL 15:117db924cf7c 16 *
wolfSSL 15:117db924cf7c 17 * You should have received a copy of the GNU General Public License
wolfSSL 15:117db924cf7c 18 * along with this program; if not, write to the Free Software
wolfSSL 15:117db924cf7c 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
wolfSSL 15:117db924cf7c 20 */
wolfSSL 15:117db924cf7c 21
wolfSSL 15:117db924cf7c 22
wolfSSL 15:117db924cf7c 23 #ifdef HAVE_CONFIG_H
wolfSSL 15:117db924cf7c 24 #include <config.h>
wolfSSL 15:117db924cf7c 25 #endif
wolfSSL 15:117db924cf7c 26
wolfSSL 15:117db924cf7c 27 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 15:117db924cf7c 28
wolfSSL 15:117db924cf7c 29 #if defined(WOLFSSL_SHA512) || defined(WOLFSSL_SHA384)
wolfSSL 15:117db924cf7c 30
wolfSSL 15:117db924cf7c 31 #if defined(HAVE_FIPS) && \
wolfSSL 15:117db924cf7c 32 defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)
wolfSSL 15:117db924cf7c 33
wolfSSL 15:117db924cf7c 34 /* set NO_WRAPPERS before headers, use direct internal f()s not wrappers */
wolfSSL 15:117db924cf7c 35 #define FIPS_NO_WRAPPERS
wolfSSL 15:117db924cf7c 36
wolfSSL 15:117db924cf7c 37 #ifdef USE_WINDOWS_API
wolfSSL 15:117db924cf7c 38 #pragma code_seg(".fipsA$k")
wolfSSL 15:117db924cf7c 39 #pragma const_seg(".fipsB$k")
wolfSSL 15:117db924cf7c 40 #endif
wolfSSL 15:117db924cf7c 41 #endif
wolfSSL 15:117db924cf7c 42
wolfSSL 15:117db924cf7c 43 #include <wolfssl/wolfcrypt/sha512.h>
wolfSSL 15:117db924cf7c 44 #include <wolfssl/wolfcrypt/error-crypt.h>
wolfSSL 15:117db924cf7c 45 #include <wolfssl/wolfcrypt/cpuid.h>
wolfSSL 15:117db924cf7c 46
wolfSSL 15:117db924cf7c 47 /* deprecated USE_SLOW_SHA2 (replaced with USE_SLOW_SHA512) */
wolfSSL 15:117db924cf7c 48 #if defined(USE_SLOW_SHA2) && !defined(USE_SLOW_SHA512)
wolfSSL 15:117db924cf7c 49 #define USE_SLOW_SHA512
wolfSSL 15:117db924cf7c 50 #endif
wolfSSL 15:117db924cf7c 51
wolfSSL 15:117db924cf7c 52 /* fips wrapper calls, user can call direct */
wolfSSL 15:117db924cf7c 53 #if defined(HAVE_FIPS) && \
wolfSSL 15:117db924cf7c 54 (!defined(HAVE_FIPS_VERSION) || (HAVE_FIPS_VERSION < 2))
wolfSSL 15:117db924cf7c 55
wolfSSL 15:117db924cf7c 56 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 57
wolfSSL 15:117db924cf7c 58 int wc_InitSha512(wc_Sha512* sha)
wolfSSL 15:117db924cf7c 59 {
wolfSSL 15:117db924cf7c 60 if (sha == NULL) {
wolfSSL 15:117db924cf7c 61 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 62 }
wolfSSL 15:117db924cf7c 63
wolfSSL 15:117db924cf7c 64 return InitSha512_fips(sha);
wolfSSL 15:117db924cf7c 65 }
wolfSSL 15:117db924cf7c 66 int wc_InitSha512_ex(wc_Sha512* sha, void* heap, int devId)
wolfSSL 15:117db924cf7c 67 {
wolfSSL 15:117db924cf7c 68 (void)heap;
wolfSSL 15:117db924cf7c 69 (void)devId;
wolfSSL 15:117db924cf7c 70 if (sha == NULL) {
wolfSSL 15:117db924cf7c 71 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 72 }
wolfSSL 15:117db924cf7c 73 return InitSha512_fips(sha);
wolfSSL 15:117db924cf7c 74 }
wolfSSL 15:117db924cf7c 75 int wc_Sha512Update(wc_Sha512* sha, const byte* data, word32 len)
wolfSSL 15:117db924cf7c 76 {
wolfSSL 15:117db924cf7c 77 if (sha == NULL || (data == NULL && len > 0)) {
wolfSSL 15:117db924cf7c 78 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 79 }
wolfSSL 15:117db924cf7c 80
wolfSSL 15:117db924cf7c 81 return Sha512Update_fips(sha, data, len);
wolfSSL 15:117db924cf7c 82 }
wolfSSL 15:117db924cf7c 83 int wc_Sha512Final(wc_Sha512* sha, byte* out)
wolfSSL 15:117db924cf7c 84 {
wolfSSL 15:117db924cf7c 85 if (sha == NULL || out == NULL) {
wolfSSL 15:117db924cf7c 86 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 87 }
wolfSSL 15:117db924cf7c 88
wolfSSL 15:117db924cf7c 89 return Sha512Final_fips(sha, out);
wolfSSL 15:117db924cf7c 90 }
wolfSSL 15:117db924cf7c 91 void wc_Sha512Free(wc_Sha512* sha)
wolfSSL 15:117db924cf7c 92 {
wolfSSL 15:117db924cf7c 93 (void)sha;
wolfSSL 15:117db924cf7c 94 /* Not supported in FIPS */
wolfSSL 15:117db924cf7c 95 }
wolfSSL 15:117db924cf7c 96 #endif
wolfSSL 15:117db924cf7c 97
wolfSSL 15:117db924cf7c 98 #if defined(WOLFSSL_SHA384) || defined(HAVE_AESGCM)
wolfSSL 15:117db924cf7c 99 int wc_InitSha384(wc_Sha384* sha)
wolfSSL 15:117db924cf7c 100 {
wolfSSL 15:117db924cf7c 101 if (sha == NULL) {
wolfSSL 15:117db924cf7c 102 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 103 }
wolfSSL 15:117db924cf7c 104 return InitSha384_fips(sha);
wolfSSL 15:117db924cf7c 105 }
wolfSSL 15:117db924cf7c 106 int wc_InitSha384_ex(wc_Sha384* sha, void* heap, int devId)
wolfSSL 15:117db924cf7c 107 {
wolfSSL 15:117db924cf7c 108 (void)heap;
wolfSSL 15:117db924cf7c 109 (void)devId;
wolfSSL 15:117db924cf7c 110 if (sha == NULL) {
wolfSSL 15:117db924cf7c 111 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 112 }
wolfSSL 15:117db924cf7c 113 return InitSha384_fips(sha);
wolfSSL 15:117db924cf7c 114 }
wolfSSL 15:117db924cf7c 115 int wc_Sha384Update(wc_Sha384* sha, const byte* data, word32 len)
wolfSSL 15:117db924cf7c 116 {
wolfSSL 15:117db924cf7c 117 if (sha == NULL || (data == NULL && len > 0)) {
wolfSSL 15:117db924cf7c 118 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 119 }
wolfSSL 15:117db924cf7c 120 return Sha384Update_fips(sha, data, len);
wolfSSL 15:117db924cf7c 121 }
wolfSSL 15:117db924cf7c 122 int wc_Sha384Final(wc_Sha384* sha, byte* out)
wolfSSL 15:117db924cf7c 123 {
wolfSSL 15:117db924cf7c 124 if (sha == NULL || out == NULL) {
wolfSSL 15:117db924cf7c 125 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 126 }
wolfSSL 15:117db924cf7c 127 return Sha384Final_fips(sha, out);
wolfSSL 15:117db924cf7c 128 }
wolfSSL 15:117db924cf7c 129 void wc_Sha384Free(wc_Sha384* sha)
wolfSSL 15:117db924cf7c 130 {
wolfSSL 15:117db924cf7c 131 (void)sha;
wolfSSL 15:117db924cf7c 132 /* Not supported in FIPS */
wolfSSL 15:117db924cf7c 133 }
wolfSSL 15:117db924cf7c 134 #endif /* WOLFSSL_SHA384 || HAVE_AESGCM */
wolfSSL 15:117db924cf7c 135
wolfSSL 15:117db924cf7c 136 #else /* else build without fips, or for FIPS v2 */
wolfSSL 15:117db924cf7c 137
wolfSSL 15:117db924cf7c 138 #include <wolfssl/wolfcrypt/logging.h>
wolfSSL 15:117db924cf7c 139
wolfSSL 15:117db924cf7c 140 #ifdef NO_INLINE
wolfSSL 15:117db924cf7c 141 #include <wolfssl/wolfcrypt/misc.h>
wolfSSL 15:117db924cf7c 142 #else
wolfSSL 15:117db924cf7c 143 #define WOLFSSL_MISC_INCLUDED
wolfSSL 15:117db924cf7c 144 #include <wolfcrypt/src/misc.c>
wolfSSL 15:117db924cf7c 145 #endif
wolfSSL 15:117db924cf7c 146
wolfSSL 15:117db924cf7c 147
wolfSSL 15:117db924cf7c 148 #if defined(USE_INTEL_SPEEDUP)
wolfSSL 15:117db924cf7c 149 #define HAVE_INTEL_AVX1
wolfSSL 15:117db924cf7c 150
wolfSSL 15:117db924cf7c 151 #if defined(__GNUC__) && ((__GNUC__ < 4) || \
wolfSSL 15:117db924cf7c 152 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
wolfSSL 15:117db924cf7c 153 #define NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 154 #endif
wolfSSL 15:117db924cf7c 155 #if defined(__clang__) && ((__clang_major__ < 3) || \
wolfSSL 15:117db924cf7c 156 (__clang_major__ == 3 && __clang_minor__ <= 5))
wolfSSL 15:117db924cf7c 157 #define NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 158 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
wolfSSL 15:117db924cf7c 159 #undef NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 160 #endif
wolfSSL 15:117db924cf7c 161
wolfSSL 15:117db924cf7c 162 #define HAVE_INTEL_AVX1
wolfSSL 15:117db924cf7c 163 #ifndef NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 164 #define HAVE_INTEL_AVX2
wolfSSL 15:117db924cf7c 165 #endif
wolfSSL 15:117db924cf7c 166 #endif
wolfSSL 15:117db924cf7c 167
wolfSSL 15:117db924cf7c 168 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 169 /* #define DEBUG_XMM */
wolfSSL 15:117db924cf7c 170 #endif
wolfSSL 15:117db924cf7c 171
wolfSSL 15:117db924cf7c 172 #if defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 173 #define HAVE_INTEL_RORX
wolfSSL 15:117db924cf7c 174 /* #define DEBUG_YMM */
wolfSSL 15:117db924cf7c 175 #endif
wolfSSL 15:117db924cf7c 176
/* Inline-asm replacement for ByteReverseWords64, used only when
 * HAVE_BYTEREVERSE64 is set and neither AVX1 nor AVX2 support is compiled in.
 *
 * ByteReverseWords64(out, in, size) forwards only `out` and `size`: the `in`
 * argument is ignored, so the reversal is always done in place on `out`.
 * ByteReverseWords64_1(buf, size) applies the x86-64 `bswapq` instruction to
 * each of the size/sizeof(word64) words of buf. It expands to a bare { }
 * block that declares its own loop counter `i`.
 */
wolfSSL 15:117db924cf7c 177 #if defined(HAVE_BYTEREVERSE64) && \
wolfSSL 15:117db924cf7c 178 !defined(HAVE_INTEL_AVX1) && !defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 179 #define ByteReverseWords64(out, in, size) ByteReverseWords64_1(out, size)
wolfSSL 15:117db924cf7c 180 #define ByteReverseWords64_1(buf, size) \
wolfSSL 15:117db924cf7c 181 { unsigned int i ;\
wolfSSL 15:117db924cf7c 182 for(i=0; i< size/sizeof(word64); i++){\
wolfSSL 15:117db924cf7c 183 __asm__ volatile("bswapq %0":"+r"(buf[i])::) ;\
wolfSSL 15:117db924cf7c 184 }\
wolfSSL 15:117db924cf7c 185 }
wolfSSL 15:117db924cf7c 186 #endif
wolfSSL 15:117db924cf7c 187
wolfSSL 15:117db924cf7c 188 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
wolfSSL 15:117db924cf7c 189 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
wolfSSL 15:117db924cf7c 190 #else
wolfSSL 15:117db924cf7c 191
wolfSSL 15:117db924cf7c 192 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 193
wolfSSL 15:117db924cf7c 194 static int InitSha512(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 195 {
wolfSSL 15:117db924cf7c 196 if (sha512 == NULL)
wolfSSL 15:117db924cf7c 197 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 198
wolfSSL 15:117db924cf7c 199 sha512->digest[0] = W64LIT(0x6a09e667f3bcc908);
wolfSSL 15:117db924cf7c 200 sha512->digest[1] = W64LIT(0xbb67ae8584caa73b);
wolfSSL 15:117db924cf7c 201 sha512->digest[2] = W64LIT(0x3c6ef372fe94f82b);
wolfSSL 15:117db924cf7c 202 sha512->digest[3] = W64LIT(0xa54ff53a5f1d36f1);
wolfSSL 15:117db924cf7c 203 sha512->digest[4] = W64LIT(0x510e527fade682d1);
wolfSSL 15:117db924cf7c 204 sha512->digest[5] = W64LIT(0x9b05688c2b3e6c1f);
wolfSSL 15:117db924cf7c 205 sha512->digest[6] = W64LIT(0x1f83d9abfb41bd6b);
wolfSSL 15:117db924cf7c 206 sha512->digest[7] = W64LIT(0x5be0cd19137e2179);
wolfSSL 15:117db924cf7c 207
wolfSSL 15:117db924cf7c 208 sha512->buffLen = 0;
wolfSSL 15:117db924cf7c 209 sha512->loLen = 0;
wolfSSL 15:117db924cf7c 210 sha512->hiLen = 0;
wolfSSL 15:117db924cf7c 211
wolfSSL 15:117db924cf7c 212 return 0;
wolfSSL 15:117db924cf7c 213 }
wolfSSL 15:117db924cf7c 214
wolfSSL 15:117db924cf7c 215 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 216
wolfSSL 15:117db924cf7c 217 /* Hardware Acceleration */
wolfSSL 15:117db924cf7c 218 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 219
wolfSSL 15:117db924cf7c 220 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 221
wolfSSL 15:117db924cf7c 222 /*****
wolfSSL 15:117db924cf7c 223 Intel AVX1/AVX2 Macro Control Structure
wolfSSL 15:117db924cf7c 224
wolfSSL 15:117db924cf7c 225 #if defined(USE_INTEL_SPEEDUP)
wolfSSL 15:117db924cf7c 226 #define HAVE_INTEL_AVX1
wolfSSL 15:117db924cf7c 227 #define HAVE_INTEL_AVX2
wolfSSL 15:117db924cf7c 228 #endif
wolfSSL 15:117db924cf7c 229
wolfSSL 15:117db924cf7c 230 int InitSha512(wc_Sha512* sha512) {
wolfSSL 15:117db924cf7c 231 Save/Recover XMM, YMM
wolfSSL 15:117db924cf7c 232 ...
wolfSSL 15:117db924cf7c 233
wolfSSL 15:117db924cf7c 234 Check Intel AVX cpuid flags
wolfSSL 15:117db924cf7c 235 }
wolfSSL 15:117db924cf7c 236
wolfSSL 15:117db924cf7c 237 #if defined(HAVE_INTEL_AVX1)|| defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 238 Transform_Sha512_AVX1(); # Function prototype
wolfSSL 15:117db924cf7c 239 Transform_Sha512_AVX2(); #
wolfSSL 15:117db924cf7c 240 #endif
wolfSSL 15:117db924cf7c 241
wolfSSL 15:117db924cf7c 242 _Transform_Sha512() { # Native Transform Function body
wolfSSL 15:117db924cf7c 243
wolfSSL 15:117db924cf7c 244 }
wolfSSL 15:117db924cf7c 245
wolfSSL 15:117db924cf7c 246 int Sha512Update() {
wolfSSL 15:117db924cf7c 247 Save/Recover XMM, YMM
wolfSSL 15:117db924cf7c 248 ...
wolfSSL 15:117db924cf7c 249 }
wolfSSL 15:117db924cf7c 250
wolfSSL 15:117db924cf7c 251 int Sha512Final() {
wolfSSL 15:117db924cf7c 252 Save/Recover XMM, YMM
wolfSSL 15:117db924cf7c 253 ...
wolfSSL 15:117db924cf7c 254 }
wolfSSL 15:117db924cf7c 255
wolfSSL 15:117db924cf7c 256
wolfSSL 15:117db924cf7c 257 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 258
wolfSSL 15:117db924cf7c 259 XMM Instructions/INLINE asm Definitions
wolfSSL 15:117db924cf7c 260
wolfSSL 15:117db924cf7c 261 #endif
wolfSSL 15:117db924cf7c 262
wolfSSL 15:117db924cf7c 263 #if defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 264
wolfSSL 15:117db924cf7c 265 YMM Instructions/INLINE asm Definitions
wolfSSL 15:117db924cf7c 266
wolfSSL 15:117db924cf7c 267 #endif
wolfSSL 15:117db924cf7c 268
wolfSSL 15:117db924cf7c 269 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 270
wolfSSL 15:117db924cf7c 271 int Transform_Sha512_AVX1() {
wolfSSL 15:117db924cf7c 272 Stitched Message Sched/Round
wolfSSL 15:117db924cf7c 273 }
wolfSSL 15:117db924cf7c 274
wolfSSL 15:117db924cf7c 275 #endif
wolfSSL 15:117db924cf7c 276
wolfSSL 15:117db924cf7c 277 #if defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 278
wolfSSL 15:117db924cf7c 279 int Transform_Sha512_AVX2() {
wolfSSL 15:117db924cf7c 280 Stitched Message Sched/Round
wolfSSL 15:117db924cf7c 281 }
wolfSSL 15:117db924cf7c 282 #endif
wolfSSL 15:117db924cf7c 283
wolfSSL 15:117db924cf7c 284 */
wolfSSL 15:117db924cf7c 285
wolfSSL 15:117db924cf7c 286
wolfSSL 15:117db924cf7c 287 /* Each platform needs to query info type 1 from cpuid to see if AVX is
wolfSSL 15:117db924cf7c 288 * supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
wolfSSL 15:117db924cf7c 289 */
wolfSSL 15:117db924cf7c 290
/* Forward declarations of the accelerated transforms. Each variant comes as
 * a pair: a single-block routine and a "_Len" routine that also takes a
 * byte count.
 */
wolfSSL 15:117db924cf7c 291 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 292 static int Transform_Sha512_AVX1(wc_Sha512 *sha512);
wolfSSL 15:117db924cf7c 293 static int Transform_Sha512_AVX1_Len(wc_Sha512 *sha512, word32 len);
wolfSSL 15:117db924cf7c 294 #endif
wolfSSL 15:117db924cf7c 295 #if defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 296 static int Transform_Sha512_AVX2(wc_Sha512 *sha512);
wolfSSL 15:117db924cf7c 297 static int Transform_Sha512_AVX2_Len(wc_Sha512 *sha512, word32 len);
wolfSSL 15:117db924cf7c 298 #if defined(HAVE_INTEL_RORX)
wolfSSL 15:117db924cf7c 299 static int Transform_Sha512_AVX1_RORX(wc_Sha512 *sha512);
wolfSSL 15:117db924cf7c 300 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512 *sha512,
wolfSSL 15:117db924cf7c 301 word32 len);
wolfSSL 15:117db924cf7c 302 static int Transform_Sha512_AVX2_RORX(wc_Sha512 *sha512);
wolfSSL 15:117db924cf7c 303 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512 *sha512,
wolfSSL 15:117db924cf7c 304 word32 len);
wolfSSL 15:117db924cf7c 305 #endif
wolfSSL 15:117db924cf7c 306 #endif
/* Portable C transform; it is the initial target of the single-block
 * dispatch pointer below.
 */
wolfSSL 15:117db924cf7c 307 static int _Transform_Sha512(wc_Sha512 *sha512);
/* Run-time dispatch state, filled in by Sha512_SetTransform():
 *   Transform_Sha512_p      single-block transform, starts as the C version
 *   Transform_Sha512_Len_p  multi-block transform, NULL until an accelerated
 *                           routine is installed (Sha512Update tests for NULL
 *                           before calling through it)
 *   transform_check         set to 1 once the selection has been made
 *   intel_flags             value returned by cpuid_get_flags()
 */
wolfSSL 15:117db924cf7c 308 static int (*Transform_Sha512_p)(wc_Sha512* sha512) = _Transform_Sha512;
wolfSSL 15:117db924cf7c 309 static int (*Transform_Sha512_Len_p)(wc_Sha512* sha512, word32 len) = NULL;
wolfSSL 15:117db924cf7c 310 static int transform_check = 0;
wolfSSL 15:117db924cf7c 311 static int intel_flags;
/* Call-through helpers so the rest of the file can use function-call syntax
 * regardless of which implementation was selected.
 */
wolfSSL 15:117db924cf7c 312 #define Transform_Sha512(sha512) (*Transform_Sha512_p)(sha512)
wolfSSL 15:117db924cf7c 313 #define Transform_Sha512_Len(sha512, len) \
wolfSSL 15:117db924cf7c 314 (*Transform_Sha512_Len_p)(sha512, len)
wolfSSL 15:117db924cf7c 315
wolfSSL 15:117db924cf7c 316 static void Sha512_SetTransform()
wolfSSL 15:117db924cf7c 317 {
wolfSSL 15:117db924cf7c 318 if (transform_check)
wolfSSL 15:117db924cf7c 319 return;
wolfSSL 15:117db924cf7c 320
wolfSSL 15:117db924cf7c 321 intel_flags = cpuid_get_flags();
wolfSSL 15:117db924cf7c 322
wolfSSL 15:117db924cf7c 323 #if defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 324 if (IS_INTEL_AVX2(intel_flags)) {
wolfSSL 15:117db924cf7c 325 #ifdef HAVE_INTEL_RORX
wolfSSL 15:117db924cf7c 326 if (IS_INTEL_BMI2(intel_flags)) {
wolfSSL 15:117db924cf7c 327 Transform_Sha512_p = Transform_Sha512_AVX2_RORX;
wolfSSL 15:117db924cf7c 328 Transform_Sha512_Len_p = Transform_Sha512_AVX2_RORX_Len;
wolfSSL 15:117db924cf7c 329 }
wolfSSL 15:117db924cf7c 330 else
wolfSSL 15:117db924cf7c 331 #endif
wolfSSL 15:117db924cf7c 332 if (1) {
wolfSSL 15:117db924cf7c 333 Transform_Sha512_p = Transform_Sha512_AVX2;
wolfSSL 15:117db924cf7c 334 Transform_Sha512_Len_p = Transform_Sha512_AVX2_Len;
wolfSSL 15:117db924cf7c 335 }
wolfSSL 15:117db924cf7c 336 #ifdef HAVE_INTEL_RORX
wolfSSL 15:117db924cf7c 337 else {
wolfSSL 15:117db924cf7c 338 Transform_Sha512_p = Transform_Sha512_AVX1_RORX;
wolfSSL 15:117db924cf7c 339 Transform_Sha512_Len_p = Transform_Sha512_AVX1_RORX_Len;
wolfSSL 15:117db924cf7c 340 }
wolfSSL 15:117db924cf7c 341 #endif
wolfSSL 15:117db924cf7c 342 }
wolfSSL 15:117db924cf7c 343 else
wolfSSL 15:117db924cf7c 344 #endif
wolfSSL 15:117db924cf7c 345 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 346 if (IS_INTEL_AVX1(intel_flags)) {
wolfSSL 15:117db924cf7c 347 Transform_Sha512_p = Transform_Sha512_AVX1;
wolfSSL 15:117db924cf7c 348 Transform_Sha512_Len_p = Transform_Sha512_AVX1_Len;
wolfSSL 15:117db924cf7c 349 }
wolfSSL 15:117db924cf7c 350 else
wolfSSL 15:117db924cf7c 351 #endif
wolfSSL 15:117db924cf7c 352 Transform_Sha512_p = _Transform_Sha512;
wolfSSL 15:117db924cf7c 353
wolfSSL 15:117db924cf7c 354 transform_check = 1;
wolfSSL 15:117db924cf7c 355 }
wolfSSL 15:117db924cf7c 356
wolfSSL 15:117db924cf7c 357 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
wolfSSL 15:117db924cf7c 358 {
wolfSSL 15:117db924cf7c 359 int ret = InitSha512(sha512);
wolfSSL 15:117db924cf7c 360
wolfSSL 15:117db924cf7c 361 (void)heap;
wolfSSL 15:117db924cf7c 362 (void)devId;
wolfSSL 15:117db924cf7c 363
wolfSSL 15:117db924cf7c 364 Sha512_SetTransform();
wolfSSL 15:117db924cf7c 365
wolfSSL 15:117db924cf7c 366 return ret;
wolfSSL 15:117db924cf7c 367 }
wolfSSL 15:117db924cf7c 368
wolfSSL 15:117db924cf7c 369 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 370
wolfSSL 15:117db924cf7c 371 #else
wolfSSL 15:117db924cf7c 372 #define Transform_Sha512(sha512) _Transform_Sha512(sha512)
wolfSSL 15:117db924cf7c 373
wolfSSL 15:117db924cf7c 374 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 375
wolfSSL 15:117db924cf7c 376 int wc_InitSha512_ex(wc_Sha512* sha512, void* heap, int devId)
wolfSSL 15:117db924cf7c 377 {
wolfSSL 15:117db924cf7c 378 int ret = 0;
wolfSSL 15:117db924cf7c 379
wolfSSL 15:117db924cf7c 380 if (sha512 == NULL)
wolfSSL 15:117db924cf7c 381 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 382
wolfSSL 15:117db924cf7c 383 sha512->heap = heap;
wolfSSL 15:117db924cf7c 384
wolfSSL 15:117db924cf7c 385 ret = InitSha512(sha512);
wolfSSL 15:117db924cf7c 386 if (ret != 0)
wolfSSL 15:117db924cf7c 387 return ret;
wolfSSL 15:117db924cf7c 388
wolfSSL 15:117db924cf7c 389 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 390 sha512->W = NULL;
wolfSSL 15:117db924cf7c 391 #endif
wolfSSL 15:117db924cf7c 392
wolfSSL 15:117db924cf7c 393 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
wolfSSL 15:117db924cf7c 394 ret = wolfAsync_DevCtxInit(&sha512->asyncDev,
wolfSSL 15:117db924cf7c 395 WOLFSSL_ASYNC_MARKER_SHA512, sha512->heap, devId);
wolfSSL 15:117db924cf7c 396 #else
wolfSSL 15:117db924cf7c 397 (void)devId;
wolfSSL 15:117db924cf7c 398 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 399
wolfSSL 15:117db924cf7c 400 return ret;
wolfSSL 15:117db924cf7c 401 }
wolfSSL 15:117db924cf7c 402
wolfSSL 15:117db924cf7c 403 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 404
wolfSSL 15:117db924cf7c 405 #endif /* Hardware Acceleration */
wolfSSL 15:117db924cf7c 406
/* The 80 64-bit round constants used by the portable transform: round
 * (i + j) of _Transform_Sha512() adds K512[i + j] through the R() macro.
 * These appear to be the SHA-512 constants from FIPS 180-4 -- verify against
 * the standard before editing any entry.
 */
wolfSSL 15:117db924cf7c 407 static const word64 K512[80] = {
wolfSSL 15:117db924cf7c 408 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
wolfSSL 15:117db924cf7c 409 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
wolfSSL 15:117db924cf7c 410 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
wolfSSL 15:117db924cf7c 411 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
wolfSSL 15:117db924cf7c 412 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
wolfSSL 15:117db924cf7c 413 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
wolfSSL 15:117db924cf7c 414 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
wolfSSL 15:117db924cf7c 415 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
wolfSSL 15:117db924cf7c 416 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
wolfSSL 15:117db924cf7c 417 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
wolfSSL 15:117db924cf7c 418 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
wolfSSL 15:117db924cf7c 419 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
wolfSSL 15:117db924cf7c 420 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
wolfSSL 15:117db924cf7c 421 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
wolfSSL 15:117db924cf7c 422 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
wolfSSL 15:117db924cf7c 423 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
wolfSSL 15:117db924cf7c 424 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
wolfSSL 15:117db924cf7c 425 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
wolfSSL 15:117db924cf7c 426 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
wolfSSL 15:117db924cf7c 427 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
wolfSSL 15:117db924cf7c 428 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
wolfSSL 15:117db924cf7c 429 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
wolfSSL 15:117db924cf7c 430 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
wolfSSL 15:117db924cf7c 431 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
wolfSSL 15:117db924cf7c 432 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
wolfSSL 15:117db924cf7c 433 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
wolfSSL 15:117db924cf7c 434 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
wolfSSL 15:117db924cf7c 435 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
wolfSSL 15:117db924cf7c 436 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
wolfSSL 15:117db924cf7c 437 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
wolfSSL 15:117db924cf7c 438 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
wolfSSL 15:117db924cf7c 439 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
wolfSSL 15:117db924cf7c 440 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
wolfSSL 15:117db924cf7c 441 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
wolfSSL 15:117db924cf7c 442 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
wolfSSL 15:117db924cf7c 443 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
wolfSSL 15:117db924cf7c 444 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
wolfSSL 15:117db924cf7c 445 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
wolfSSL 15:117db924cf7c 446 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
wolfSSL 15:117db924cf7c 447 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
wolfSSL 15:117db924cf7c 448 };
wolfSSL 15:117db924cf7c 449
/* Helper macros for _Transform_Sha512(). They refer to that function's
 * locals by name (W, T, K, j and the sha512 pointer), so they only work
 * inside a scope that declares those identifiers.
 */
/* First 16 rounds (j == 0): copy message word i from the context buffer
 * into the schedule array W and yield it. */
wolfSSL 15:117db924cf7c 450 #define blk0(i) (W[i] = sha512->buffer[i])
wolfSSL 15:117db924cf7c 451
/* Later rounds: W is reused as a 16-entry circular buffer. Entry i is
 * updated in place from the entries 2, 7 and 15 positions behind it
 * (all indices taken mod 16) and the new value is yielded. */
wolfSSL 15:117db924cf7c 452 #define blk2(i) (\
wolfSSL 15:117db924cf7c 453 W[ i & 15] += \
wolfSSL 15:117db924cf7c 454 s1(W[(i-2) & 15])+ \
wolfSSL 15:117db924cf7c 455 W[(i-7) & 15] + \
wolfSSL 15:117db924cf7c 456 s0(W[(i-15) & 15]) \
wolfSSL 15:117db924cf7c 457 )
wolfSSL 15:117db924cf7c 458
/* Ch: per bit, pick y where x is 1 and z where x is 0.
 * Maj: per bit, the majority value of x, y and z. */
wolfSSL 15:117db924cf7c 459 #define Ch(x,y,z) (z ^ (x & (y ^ z)))
wolfSSL 15:117db924cf7c 460 #define Maj(x,y,z) ((x & y) | (z & (x | y)))
wolfSSL 15:117db924cf7c 461
/* The eight working variables live in T[0..7]. Which slot plays which role
 * depends on the round number i: the index is shifted by -i mod 8, so the
 * roles rotate from round to round without moving any values. */
wolfSSL 15:117db924cf7c 462 #define a(i) T[(0-i) & 7]
wolfSSL 15:117db924cf7c 463 #define b(i) T[(1-i) & 7]
wolfSSL 15:117db924cf7c 464 #define c(i) T[(2-i) & 7]
wolfSSL 15:117db924cf7c 465 #define d(i) T[(3-i) & 7]
wolfSSL 15:117db924cf7c 466 #define e(i) T[(4-i) & 7]
wolfSSL 15:117db924cf7c 467 #define f(i) T[(5-i) & 7]
wolfSSL 15:117db924cf7c 468 #define g(i) T[(6-i) & 7]
wolfSSL 15:117db924cf7c 469 #define h(i) T[(7-i) & 7]
wolfSSL 15:117db924cf7c 470
/* 64-bit mixing functions: S0/S1 XOR three right-rotations of x (used in
 * the round itself); s0/s1 XOR two right-rotations and one right shift
 * (used by the schedule update in blk2). */
wolfSSL 15:117db924cf7c 471 #define S0(x) (rotrFixed64(x,28) ^ rotrFixed64(x,34) ^ rotrFixed64(x,39))
wolfSSL 15:117db924cf7c 472 #define S1(x) (rotrFixed64(x,14) ^ rotrFixed64(x,18) ^ rotrFixed64(x,41))
wolfSSL 15:117db924cf7c 473 #define s0(x) (rotrFixed64(x,1) ^ rotrFixed64(x,8) ^ (x>>7))
wolfSSL 15:117db924cf7c 474 #define s1(x) (rotrFixed64(x,19) ^ rotrFixed64(x,61) ^ (x>>6))
wolfSSL 15:117db924cf7c 475
/* One round, number i + j. Expands to THREE statements with no do/while
 * wrapper, so an R(...) used as a loop or if body must be enclosed in
 * braces. The message word comes from blk0 while j == 0 and from blk2
 * afterwards. */
wolfSSL 15:117db924cf7c 476 #define R(i) \
wolfSSL 15:117db924cf7c 477 h(i) += S1(e(i)) + Ch(e(i),f(i),g(i)) + K[i+j] + (j ? blk2(i) : blk0(i)); \
wolfSSL 15:117db924cf7c 478 d(i) += h(i); \
wolfSSL 15:117db924cf7c 479 h(i) += S0(a(i)) + Maj(a(i),b(i),c(i))
wolfSSL 15:117db924cf7c 480
/* Portable C compression step: runs 80 rounds over the 16 words currently in
 * sha512->buffer and adds the result into sha512->digest.
 *
 * The locals K, j, T and W (and the parameter name sha512) are referenced by
 * name from the R()/blk0()/blk2()/a()..h() macros and must not be renamed.
 *
 * sha512  context holding the block to process and the running digest
 * returns 0 on success, MEMORY_E if the schedule buffer cannot be allocated
 *         (small-stack builds only)
 */
wolfSSL 15:117db924cf7c 481 static int _Transform_Sha512(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 482 {
wolfSSL 15:117db924cf7c 483 const word64* K = K512;
wolfSSL 15:117db924cf7c 484 word32 j;
wolfSSL 15:117db924cf7c 485 word64 T[8];
wolfSSL 15:117db924cf7c 486
/* Storage for the 16-word message schedule W depends on the build:
 *  - WOLFSSL_SMALL_STACK_CACHE: allocated on first use and kept in
 *    sha512->W for later calls (not freed in this function)
 *  - WOLFSSL_SMALL_STACK: allocated here and freed before returning
 *  - otherwise: a plain stack array
 */
wolfSSL 15:117db924cf7c 487 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 488 word64* W = sha512->W;
wolfSSL 15:117db924cf7c 489 if (W == NULL) {
wolfSSL 15:117db924cf7c 490 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL,
wolfSSL 15:117db924cf7c 491 DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 15:117db924cf7c 492 if (W == NULL)
wolfSSL 15:117db924cf7c 493 return MEMORY_E;
wolfSSL 15:117db924cf7c 494 sha512->W = W;
wolfSSL 15:117db924cf7c 495 }
wolfSSL 15:117db924cf7c 496 #elif defined(WOLFSSL_SMALL_STACK)
wolfSSL 15:117db924cf7c 497 word64* W;
wolfSSL 15:117db924cf7c 498 W = (word64*) XMALLOC(sizeof(word64) * 16, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 15:117db924cf7c 499 if (W == NULL)
wolfSSL 15:117db924cf7c 500 return MEMORY_E;
wolfSSL 15:117db924cf7c 501 #else
wolfSSL 15:117db924cf7c 502 word64 W[16];
wolfSSL 15:117db924cf7c 503 #endif
wolfSSL 15:117db924cf7c 504
wolfSSL 15:117db924cf7c 505 /* Copy digest to working vars */
wolfSSL 15:117db924cf7c 506 XMEMCPY(T, sha512->digest, sizeof(T));
wolfSSL 15:117db924cf7c 507
/* Both variants below run the same 5 x 16 rounds; j is the base round
 * number of the current pass and selects blk0 (j == 0) or blk2 inside R(). */
wolfSSL 15:117db924cf7c 508 #ifdef USE_SLOW_SHA512
wolfSSL 15:117db924cf7c 509 /* over twice as small, but 50% slower */
wolfSSL 15:117db924cf7c 510 /* 80 operations, not unrolled */
wolfSSL 15:117db924cf7c 511 for (j = 0; j < 80; j += 16) {
wolfSSL 15:117db924cf7c 512 int m;
wolfSSL 15:117db924cf7c 513 for (m = 0; m < 16; m++) { /* braces needed here for macros {} */
wolfSSL 15:117db924cf7c 514 R(m);
wolfSSL 15:117db924cf7c 515 }
wolfSSL 15:117db924cf7c 516 }
wolfSSL 15:117db924cf7c 517 #else
wolfSSL 15:117db924cf7c 518 /* 80 operations, partially loop unrolled */
wolfSSL 15:117db924cf7c 519 for (j = 0; j < 80; j += 16) {
wolfSSL 15:117db924cf7c 520 R( 0); R( 1); R( 2); R( 3);
wolfSSL 15:117db924cf7c 521 R( 4); R( 5); R( 6); R( 7);
wolfSSL 15:117db924cf7c 522 R( 8); R( 9); R(10); R(11);
wolfSSL 15:117db924cf7c 523 R(12); R(13); R(14); R(15);
wolfSSL 15:117db924cf7c 524 }
wolfSSL 15:117db924cf7c 525 #endif /* USE_SLOW_SHA512 */
wolfSSL 15:117db924cf7c 526
wolfSSL 15:117db924cf7c 527 /* Add the working vars back into digest */
wolfSSL 15:117db924cf7c 528 sha512->digest[0] += a(0);
wolfSSL 15:117db924cf7c 529 sha512->digest[1] += b(0);
wolfSSL 15:117db924cf7c 530 sha512->digest[2] += c(0);
wolfSSL 15:117db924cf7c 531 sha512->digest[3] += d(0);
wolfSSL 15:117db924cf7c 532 sha512->digest[4] += e(0);
wolfSSL 15:117db924cf7c 533 sha512->digest[5] += f(0);
wolfSSL 15:117db924cf7c 534 sha512->digest[6] += g(0);
wolfSSL 15:117db924cf7c 535 sha512->digest[7] += h(0);
wolfSSL 15:117db924cf7c 536
/* W and T held message-derived values; clear them before they go out of
 * scope or (in the cached case) sit in the context until the next block. */
wolfSSL 15:117db924cf7c 537 /* Wipe variables */
wolfSSL 15:117db924cf7c 538 ForceZero(W, sizeof(word64) * 16);
wolfSSL 15:117db924cf7c 539 ForceZero(T, sizeof(T));
wolfSSL 15:117db924cf7c 540
wolfSSL 15:117db924cf7c 541 #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SMALL_STACK_CACHE)
wolfSSL 15:117db924cf7c 542 XFREE(W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 15:117db924cf7c 543 #endif
wolfSSL 15:117db924cf7c 544
wolfSSL 15:117db924cf7c 545 return 0;
wolfSSL 15:117db924cf7c 546 }
wolfSSL 15:117db924cf7c 547
wolfSSL 15:117db924cf7c 548
wolfSSL 15:117db924cf7c 549 static WC_INLINE void AddLength(wc_Sha512* sha512, word32 len)
wolfSSL 15:117db924cf7c 550 {
wolfSSL 15:117db924cf7c 551 word64 tmp = sha512->loLen;
wolfSSL 15:117db924cf7c 552 if ( (sha512->loLen += len) < tmp)
wolfSSL 15:117db924cf7c 553 sha512->hiLen++; /* carry low to high */
wolfSSL 15:117db924cf7c 554 }
wolfSSL 15:117db924cf7c 555
wolfSSL 15:117db924cf7c 556 static WC_INLINE int Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
wolfSSL 15:117db924cf7c 557 {
wolfSSL 15:117db924cf7c 558 int ret = 0;
wolfSSL 15:117db924cf7c 559 /* do block size increments */
wolfSSL 15:117db924cf7c 560 byte* local = (byte*)sha512->buffer;
wolfSSL 15:117db924cf7c 561
wolfSSL 15:117db924cf7c 562 /* check that internal buffLen is valid */
wolfSSL 15:117db924cf7c 563 if (sha512->buffLen >= WC_SHA512_BLOCK_SIZE)
wolfSSL 15:117db924cf7c 564 return BUFFER_E;
wolfSSL 15:117db924cf7c 565
wolfSSL 15:117db924cf7c 566 if (sha512->buffLen > 0) {
wolfSSL 15:117db924cf7c 567 word32 add = min(len, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
wolfSSL 15:117db924cf7c 568 if (add > 0) {
wolfSSL 15:117db924cf7c 569 XMEMCPY(&local[sha512->buffLen], data, add);
wolfSSL 15:117db924cf7c 570
wolfSSL 15:117db924cf7c 571 sha512->buffLen += add;
wolfSSL 15:117db924cf7c 572 data += add;
wolfSSL 15:117db924cf7c 573 len -= add;
wolfSSL 15:117db924cf7c 574 }
wolfSSL 15:117db924cf7c 575
wolfSSL 15:117db924cf7c 576 if (sha512->buffLen == WC_SHA512_BLOCK_SIZE) {
wolfSSL 15:117db924cf7c 577 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 15:117db924cf7c 578 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 579 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
wolfSSL 15:117db924cf7c 580 #endif
wolfSSL 15:117db924cf7c 581 {
wolfSSL 15:117db924cf7c 582 ByteReverseWords64(sha512->buffer, sha512->buffer,
wolfSSL 15:117db924cf7c 583 WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 584 }
wolfSSL 15:117db924cf7c 585 #endif
wolfSSL 15:117db924cf7c 586 ret = Transform_Sha512(sha512);
wolfSSL 15:117db924cf7c 587 if (ret == 0) {
wolfSSL 15:117db924cf7c 588 AddLength(sha512, WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 589 sha512->buffLen = 0;
wolfSSL 15:117db924cf7c 590 }
wolfSSL 15:117db924cf7c 591 else
wolfSSL 15:117db924cf7c 592 len = 0;
wolfSSL 15:117db924cf7c 593 }
wolfSSL 15:117db924cf7c 594 }
wolfSSL 15:117db924cf7c 595
wolfSSL 15:117db924cf7c 596 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 597 if (Transform_Sha512_Len_p != NULL) {
wolfSSL 15:117db924cf7c 598 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
wolfSSL 15:117db924cf7c 599
wolfSSL 15:117db924cf7c 600 if (blocksLen > 0) {
wolfSSL 15:117db924cf7c 601 AddLength(sha512, blocksLen);
wolfSSL 15:117db924cf7c 602 sha512->data = data;
wolfSSL 15:117db924cf7c 603 /* Byte reversal performed in function if required. */
wolfSSL 15:117db924cf7c 604 Transform_Sha512_Len(sha512, blocksLen);
wolfSSL 15:117db924cf7c 605 data += blocksLen;
wolfSSL 15:117db924cf7c 606 len -= blocksLen;
wolfSSL 15:117db924cf7c 607 }
wolfSSL 15:117db924cf7c 608 }
wolfSSL 15:117db924cf7c 609 else
wolfSSL 15:117db924cf7c 610 #endif
wolfSSL 15:117db924cf7c 611 #if !defined(LITTLE_ENDIAN_ORDER) || defined(FREESCALE_MMCAU_SHA) || \
wolfSSL 15:117db924cf7c 612 defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 613 {
wolfSSL 15:117db924cf7c 614 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
wolfSSL 15:117db924cf7c 615
wolfSSL 15:117db924cf7c 616 AddLength(sha512, blocksLen);
wolfSSL 15:117db924cf7c 617 while (len >= WC_SHA512_BLOCK_SIZE) {
wolfSSL 15:117db924cf7c 618 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 619
wolfSSL 15:117db924cf7c 620 data += WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 621 len -= WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 622
wolfSSL 15:117db924cf7c 623 /* Byte reversal performed in function if required. */
wolfSSL 15:117db924cf7c 624 ret = Transform_Sha512(sha512);
wolfSSL 15:117db924cf7c 625 if (ret != 0)
wolfSSL 15:117db924cf7c 626 break;
wolfSSL 15:117db924cf7c 627 }
wolfSSL 15:117db924cf7c 628 }
wolfSSL 15:117db924cf7c 629 #else
wolfSSL 15:117db924cf7c 630 {
wolfSSL 15:117db924cf7c 631 word32 blocksLen = len & ~(WC_SHA512_BLOCK_SIZE-1);
wolfSSL 15:117db924cf7c 632
wolfSSL 15:117db924cf7c 633 AddLength(sha512, blocksLen);
wolfSSL 15:117db924cf7c 634 while (len >= WC_SHA512_BLOCK_SIZE) {
wolfSSL 15:117db924cf7c 635 XMEMCPY(local, data, WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 636
wolfSSL 15:117db924cf7c 637 data += WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 638 len -= WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 639
wolfSSL 15:117db924cf7c 640 ByteReverseWords64(sha512->buffer, sha512->buffer,
wolfSSL 15:117db924cf7c 641 WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 642 ret = Transform_Sha512(sha512);
wolfSSL 15:117db924cf7c 643 if (ret != 0)
wolfSSL 15:117db924cf7c 644 break;
wolfSSL 15:117db924cf7c 645 }
wolfSSL 15:117db924cf7c 646 }
wolfSSL 15:117db924cf7c 647 #endif
wolfSSL 15:117db924cf7c 648
wolfSSL 15:117db924cf7c 649 if (len > 0) {
wolfSSL 15:117db924cf7c 650 XMEMCPY(local, data, len);
wolfSSL 15:117db924cf7c 651 sha512->buffLen = len;
wolfSSL 15:117db924cf7c 652 }
wolfSSL 15:117db924cf7c 653
wolfSSL 15:117db924cf7c 654 return ret;
wolfSSL 15:117db924cf7c 655 }
wolfSSL 15:117db924cf7c 656
wolfSSL 15:117db924cf7c 657 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 658
wolfSSL 15:117db924cf7c 659 int wc_Sha512Update(wc_Sha512* sha512, const byte* data, word32 len)
wolfSSL 15:117db924cf7c 660 {
wolfSSL 15:117db924cf7c 661 if (sha512 == NULL || (data == NULL && len > 0)) {
wolfSSL 15:117db924cf7c 662 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 663 }
wolfSSL 15:117db924cf7c 664
wolfSSL 15:117db924cf7c 665 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
wolfSSL 15:117db924cf7c 666 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
wolfSSL 15:117db924cf7c 667 #if defined(HAVE_INTEL_QA)
wolfSSL 15:117db924cf7c 668 return IntelQaSymSha512(&sha512->asyncDev, NULL, data, len);
wolfSSL 15:117db924cf7c 669 #endif
wolfSSL 15:117db924cf7c 670 }
wolfSSL 15:117db924cf7c 671 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 672
wolfSSL 15:117db924cf7c 673 return Sha512Update(sha512, data, len);
wolfSSL 15:117db924cf7c 674 }
wolfSSL 15:117db924cf7c 675
wolfSSL 15:117db924cf7c 676 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 677
wolfSSL 15:117db924cf7c 678 #endif /* WOLFSSL_IMX6_CAAM */
wolfSSL 15:117db924cf7c 679
wolfSSL 15:117db924cf7c 680 static WC_INLINE int Sha512Final(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 681 {
wolfSSL 15:117db924cf7c 682 byte* local = (byte*)sha512->buffer;
wolfSSL 15:117db924cf7c 683 int ret;
wolfSSL 15:117db924cf7c 684
wolfSSL 15:117db924cf7c 685 if (sha512 == NULL) {
wolfSSL 15:117db924cf7c 686 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 687 }
wolfSSL 15:117db924cf7c 688
wolfSSL 15:117db924cf7c 689 AddLength(sha512, sha512->buffLen); /* before adding pads */
wolfSSL 15:117db924cf7c 690
wolfSSL 15:117db924cf7c 691 local[sha512->buffLen++] = 0x80; /* add 1 */
wolfSSL 15:117db924cf7c 692
wolfSSL 15:117db924cf7c 693 /* pad with zeros */
wolfSSL 15:117db924cf7c 694 if (sha512->buffLen > WC_SHA512_PAD_SIZE) {
wolfSSL 15:117db924cf7c 695 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_BLOCK_SIZE - sha512->buffLen);
wolfSSL 15:117db924cf7c 696 sha512->buffLen += WC_SHA512_BLOCK_SIZE - sha512->buffLen;
wolfSSL 15:117db924cf7c 697 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 15:117db924cf7c 698 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 699 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
wolfSSL 15:117db924cf7c 700 #endif
wolfSSL 15:117db924cf7c 701 {
wolfSSL 15:117db924cf7c 702 ByteReverseWords64(sha512->buffer,sha512->buffer,
wolfSSL 15:117db924cf7c 703 WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 704 }
wolfSSL 15:117db924cf7c 705 #endif /* LITTLE_ENDIAN_ORDER */
wolfSSL 15:117db924cf7c 706 ret = Transform_Sha512(sha512);
wolfSSL 15:117db924cf7c 707 if (ret != 0)
wolfSSL 15:117db924cf7c 708 return ret;
wolfSSL 15:117db924cf7c 709
wolfSSL 15:117db924cf7c 710 sha512->buffLen = 0;
wolfSSL 15:117db924cf7c 711 }
wolfSSL 15:117db924cf7c 712 XMEMSET(&local[sha512->buffLen], 0, WC_SHA512_PAD_SIZE - sha512->buffLen);
wolfSSL 15:117db924cf7c 713
wolfSSL 15:117db924cf7c 714 /* put lengths in bits */
wolfSSL 15:117db924cf7c 715 sha512->hiLen = (sha512->loLen >> (8 * sizeof(sha512->loLen) - 3)) +
wolfSSL 15:117db924cf7c 716 (sha512->hiLen << 3);
wolfSSL 15:117db924cf7c 717 sha512->loLen = sha512->loLen << 3;
wolfSSL 15:117db924cf7c 718
wolfSSL 15:117db924cf7c 719 /* store lengths */
wolfSSL 15:117db924cf7c 720 #if defined(LITTLE_ENDIAN_ORDER)
wolfSSL 15:117db924cf7c 721 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 722 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
wolfSSL 15:117db924cf7c 723 #endif
wolfSSL 15:117db924cf7c 724 ByteReverseWords64(sha512->buffer, sha512->buffer, WC_SHA512_PAD_SIZE);
wolfSSL 15:117db924cf7c 725 #endif
wolfSSL 15:117db924cf7c 726 /* ! length ordering dependent on digest endian type ! */
wolfSSL 15:117db924cf7c 727
wolfSSL 15:117db924cf7c 728 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2] = sha512->hiLen;
wolfSSL 15:117db924cf7c 729 sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 1] = sha512->loLen;
wolfSSL 15:117db924cf7c 730 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 731 if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
wolfSSL 15:117db924cf7c 732 ByteReverseWords64(&(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
wolfSSL 15:117db924cf7c 733 &(sha512->buffer[WC_SHA512_BLOCK_SIZE / sizeof(word64) - 2]),
wolfSSL 15:117db924cf7c 734 WC_SHA512_BLOCK_SIZE - WC_SHA512_PAD_SIZE);
wolfSSL 15:117db924cf7c 735 #endif
wolfSSL 15:117db924cf7c 736 ret = Transform_Sha512(sha512);
wolfSSL 15:117db924cf7c 737 if (ret != 0)
wolfSSL 15:117db924cf7c 738 return ret;
wolfSSL 15:117db924cf7c 739
wolfSSL 15:117db924cf7c 740 #ifdef LITTLE_ENDIAN_ORDER
wolfSSL 15:117db924cf7c 741 ByteReverseWords64(sha512->digest, sha512->digest, WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 742 #endif
wolfSSL 15:117db924cf7c 743
wolfSSL 15:117db924cf7c 744 return 0;
wolfSSL 15:117db924cf7c 745 }
wolfSSL 15:117db924cf7c 746
wolfSSL 15:117db924cf7c 747 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 748
wolfSSL 15:117db924cf7c 749 int wc_Sha512FinalRaw(wc_Sha512* sha512, byte* hash)
wolfSSL 15:117db924cf7c 750 {
wolfSSL 15:117db924cf7c 751 #ifdef LITTLE_ENDIAN_ORDER
wolfSSL 15:117db924cf7c 752 word64 digest[WC_SHA512_DIGEST_SIZE / sizeof(word64)];
wolfSSL 15:117db924cf7c 753 #endif
wolfSSL 15:117db924cf7c 754
wolfSSL 15:117db924cf7c 755 if (sha512 == NULL || hash == NULL) {
wolfSSL 15:117db924cf7c 756 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 757 }
wolfSSL 15:117db924cf7c 758
wolfSSL 15:117db924cf7c 759 #ifdef LITTLE_ENDIAN_ORDER
wolfSSL 15:117db924cf7c 760 ByteReverseWords64((word64*)digest, (word64*)sha512->digest,
wolfSSL 15:117db924cf7c 761 WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 762 XMEMCPY(hash, digest, WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 763 #else
wolfSSL 15:117db924cf7c 764 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 765 #endif
wolfSSL 15:117db924cf7c 766
wolfSSL 15:117db924cf7c 767 return 0;
wolfSSL 15:117db924cf7c 768 }
wolfSSL 15:117db924cf7c 769
wolfSSL 15:117db924cf7c 770 int wc_Sha512Final(wc_Sha512* sha512, byte* hash)
wolfSSL 15:117db924cf7c 771 {
wolfSSL 15:117db924cf7c 772 int ret;
wolfSSL 15:117db924cf7c 773
wolfSSL 15:117db924cf7c 774 if (sha512 == NULL || hash == NULL) {
wolfSSL 15:117db924cf7c 775 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 776 }
wolfSSL 15:117db924cf7c 777
wolfSSL 15:117db924cf7c 778 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
wolfSSL 15:117db924cf7c 779 if (sha512->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA512) {
wolfSSL 15:117db924cf7c 780 #if defined(HAVE_INTEL_QA)
wolfSSL 15:117db924cf7c 781 return IntelQaSymSha512(&sha512->asyncDev, hash, NULL,
wolfSSL 15:117db924cf7c 782 WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 783 #endif
wolfSSL 15:117db924cf7c 784 }
wolfSSL 15:117db924cf7c 785 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 786
wolfSSL 15:117db924cf7c 787 ret = Sha512Final(sha512);
wolfSSL 15:117db924cf7c 788 if (ret != 0)
wolfSSL 15:117db924cf7c 789 return ret;
wolfSSL 15:117db924cf7c 790
wolfSSL 15:117db924cf7c 791 XMEMCPY(hash, sha512->digest, WC_SHA512_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 792
wolfSSL 15:117db924cf7c 793 return InitSha512(sha512); /* reset state */
wolfSSL 15:117db924cf7c 794 }
wolfSSL 15:117db924cf7c 795
wolfSSL 15:117db924cf7c 796
wolfSSL 15:117db924cf7c 797 int wc_InitSha512(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 798 {
wolfSSL 15:117db924cf7c 799 return wc_InitSha512_ex(sha512, NULL, INVALID_DEVID);
wolfSSL 15:117db924cf7c 800 }
wolfSSL 15:117db924cf7c 801
wolfSSL 15:117db924cf7c 802 void wc_Sha512Free(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 803 {
wolfSSL 15:117db924cf7c 804 if (sha512 == NULL)
wolfSSL 15:117db924cf7c 805 return;
wolfSSL 15:117db924cf7c 806
wolfSSL 15:117db924cf7c 807 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 808 if (sha512->W != NULL) {
wolfSSL 15:117db924cf7c 809 XFREE(sha512->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 15:117db924cf7c 810 sha512->W = NULL;
wolfSSL 15:117db924cf7c 811 }
wolfSSL 15:117db924cf7c 812 #endif
wolfSSL 15:117db924cf7c 813
wolfSSL 15:117db924cf7c 814 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA512)
wolfSSL 15:117db924cf7c 815 wolfAsync_DevCtxFree(&sha512->asyncDev, WOLFSSL_ASYNC_MARKER_SHA512);
wolfSSL 15:117db924cf7c 816 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 817 }
wolfSSL 15:117db924cf7c 818
wolfSSL 15:117db924cf7c 819
wolfSSL 15:117db924cf7c 820 #if defined(HAVE_INTEL_AVX1)
wolfSSL 15:117db924cf7c 821
/* NOTE(review): by its name and byte pattern this looks like a shuffle-control
 * mask for reversing the bytes of each 64-bit word; the use site is outside
 * this section - confirm. */
wolfSSL 15:117db924cf7c 822 static word64 mBYTE_FLIP_MASK[] = { 0x0001020304050607, 0x08090a0b0c0d0e0f };
wolfSSL 15:117db924cf7c 823
/* XMM register assigned to each message-schedule slot. The names step by two,
 * presumably because one 128-bit register holds two 64-bit words - confirm
 * against the code that loads them. */
wolfSSL 15:117db924cf7c 824 #define W_0 xmm0
wolfSSL 15:117db924cf7c 825 #define W_2 xmm1
wolfSSL 15:117db924cf7c 826 #define W_4 xmm2
wolfSSL 15:117db924cf7c 827 #define W_6 xmm3
wolfSSL 15:117db924cf7c 828 #define W_8 xmm4
wolfSSL 15:117db924cf7c 829 #define W_10 xmm5
wolfSSL 15:117db924cf7c 830 #define W_12 xmm6
wolfSSL 15:117db924cf7c 831 #define W_14 xmm7
wolfSSL 15:117db924cf7c 832
/* W_M15 / W_M7: temporaries that MsgSched2 fills with vpalignr from two
 * neighbouring W registers. MASK is not used by the macros in this section. */
wolfSSL 15:117db924cf7c 833 #define W_M15 xmm12
wolfSSL 15:117db924cf7c 834 #define W_M7 xmm13
wolfSSL 15:117db924cf7c 835 #define MASK xmm14
wolfSSL 15:117db924cf7c 836
/* Scratch vector registers for the shift/or/xor steps of MsgSched2. */
wolfSSL 15:117db924cf7c 837 #define XTMP1 xmm8
wolfSSL 15:117db924cf7c 838 #define XTMP2 xmm9
wolfSSL 15:117db924cf7c 839 #define XTMP3 xmm10
wolfSSL 15:117db924cf7c 840 #define XTMP4 xmm11
wolfSSL 15:117db924cf7c 841
/* All sixteen XMM register names as a comma-separated string list -
 * presumably for an inline-asm clobber list; the use is outside this section. */
wolfSSL 15:117db924cf7c 842 #define XMM_REGS \
wolfSSL 15:117db924cf7c 843 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
wolfSSL 15:117db924cf7c 844 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
wolfSSL 15:117db924cf7c 845
/* Builders for AT&T-syntax AVX instruction strings (destination operand
 * last). Each builder is a pair of macros: the outer one only forwards its
 * arguments, so that register aliases such as W_M15 or XTMP1 are
 * macro-expanded before the inner one turns them into text with '#'.
 */
/* vpalignr: dest = (src1:src2 concatenated, src1 in the upper half) shifted
 * right; the immediate of vpalignr counts bytes, despite the parameter being
 * named `bits`. */
wolfSSL 15:117db924cf7c 846 #define _VPALIGNR(dest, src1, src2, bits) \
wolfSSL 15:117db924cf7c 847 "vpalignr $" #bits ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 848 #define VPALIGNR(dest, src1, src2, bits) \
wolfSSL 15:117db924cf7c 849 _VPALIGNR(dest, src1, src2, bits)
wolfSSL 15:117db924cf7c 850
/* vpsrlq: dest = each 64-bit lane of src shifted right (logical) by `bits` */
wolfSSL 15:117db924cf7c 851 #define _V_SHIFT_R(dest, src, bits) \
wolfSSL 15:117db924cf7c 852 "vpsrlq $" #bits ", %%" #src ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 853 #define V_SHIFT_R(dest, src, bits) \
wolfSSL 15:117db924cf7c 854 _V_SHIFT_R(dest, src, bits)
wolfSSL 15:117db924cf7c 855
/* vpsllq: dest = each 64-bit lane of src shifted left by `bits` */
wolfSSL 15:117db924cf7c 856 #define _V_SHIFT_L(dest, src, bits) \
wolfSSL 15:117db924cf7c 857 "vpsllq $" #bits ", %%" #src ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 858 #define V_SHIFT_L(dest, src, bits) \
wolfSSL 15:117db924cf7c 859 _V_SHIFT_L(dest, src, bits)
wolfSSL 15:117db924cf7c 860
/* vpaddq: dest = src1 + src2, per 64-bit lane */
wolfSSL 15:117db924cf7c 861 #define _V_ADD(dest, src1, src2) \
wolfSSL 15:117db924cf7c 862 "vpaddq %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 863 #define V_ADD(dest, src1, src2) \
wolfSSL 15:117db924cf7c 864 _V_ADD(dest, src1, src2)
wolfSSL 15:117db924cf7c 865
/* vpxor: dest = src1 ^ src2 */
wolfSSL 15:117db924cf7c 866 #define _V_XOR(dest, src1, src2) \
wolfSSL 15:117db924cf7c 867 "vpxor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 868 #define V_XOR(dest, src1, src2) \
wolfSSL 15:117db924cf7c 869 _V_XOR(dest, src1, src2)
wolfSSL 15:117db924cf7c 870
/* vpor: dest = src1 | src2 */
wolfSSL 15:117db924cf7c 871 #define _V_OR(dest, src1, src2) \
wolfSSL 15:117db924cf7c 872 "vpor %%" #src1 ", %%" #src2 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 873 #define V_OR(dest, src1, src2) \
wolfSSL 15:117db924cf7c 874 _V_OR(dest, src1, src2)
wolfSSL 15:117db924cf7c 875
/* General-purpose registers named after the eight working-state words a..h.
 * These expand to bare tokens, not string literals; the RND_* macros turn
 * their arguments into text with '#'. */
wolfSSL 15:117db924cf7c 876 #define RA %%r8
wolfSSL 15:117db924cf7c 877 #define RB %%r9
wolfSSL 15:117db924cf7c 878 #define RC %%r10
wolfSSL 15:117db924cf7c 879 #define RD %%r11
wolfSSL 15:117db924cf7c 880 #define RE %%r12
wolfSSL 15:117db924cf7c 881 #define RF %%r13
wolfSSL 15:117db924cf7c 882 #define RG %%r14
wolfSSL 15:117db924cf7c 883 #define RH %%r15
wolfSSL 15:117db924cf7c 884
/* The same eight registers as a string list - presumably for an inline-asm
 * clobber list; the use is outside this section. */
wolfSSL 15:117db924cf7c 885 #define STATE_REGS "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
wolfSSL 15:117db924cf7c 886
/* Scratch registers of the round macros, given directly as string literals.
 * WX is the base address from which each round reads the 64-bit value at
 * (i)*8(WX) that it adds to h. */
wolfSSL 15:117db924cf7c 887 #define L1 "%%rax"
wolfSSL 15:117db924cf7c 888 #define L2 "%%rcx"
wolfSSL 15:117db924cf7c 889 #define L3 "%%rdx"
wolfSSL 15:117db924cf7c 890 #define L4 "%%rbx"
wolfSSL 15:117db924cf7c 891 #define WX "%%rsp"
wolfSSL 15:117db924cf7c 892
/* String list of the scratch registers L1..L4 - presumably for a clobber
 * list; the use is outside this section. */
wolfSSL 15:117db924cf7c 893 #define WORK_REGS "rax", "rbx", "rcx", "rdx"
wolfSSL 15:117db924cf7c 894
/* One scalar SHA-512 round ("variant 0"), cut into twelve small fragments
 * RND_0_1 .. RND_0_12 so callers can interleave other instructions between
 * them. a..h are the working-state registers, i the index of the 64-bit value
 * read from (i)*8(WX).
 *
 * Register contract visible in the fragments:
 *  - on entry L1 must already hold e (RND_0_12 / RND_1_12 leave the next
 *    round's e there) and L4 must hold b ^ c (left by a variant-1 round);
 *    what loads them before the very first round is outside this section
 *  - on exit L3 holds a ^ b, which the following variant-1 round consumes
 *
 * RND_0_2_A and RND_0_2_B are the two halves of RND_0_2.
 */
wolfSSL 15:117db924cf7c 895 #define RND_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 896 /* L1 = e >>> 23 */ \
wolfSSL 15:117db924cf7c 897 "rorq $23, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 898
wolfSSL 15:117db924cf7c 899 #define RND_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 900 /* L3 = a */ \
wolfSSL 15:117db924cf7c 901 "movq "#a", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 902 /* L2 = f */ \
wolfSSL 15:117db924cf7c 903 "movq "#f", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 904 /* h += W_X[i] */ \
wolfSSL 15:117db924cf7c 905 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 906 /* L2 = f ^ g */ \
wolfSSL 15:117db924cf7c 907 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 908
wolfSSL 15:117db924cf7c 909 #define RND_0_2_A(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 910 /* L3 = a */ \
wolfSSL 15:117db924cf7c 911 "movq "#a", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 912 /* L2 = f */ \
wolfSSL 15:117db924cf7c 913 "movq "#f", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 914
wolfSSL 15:117db924cf7c 915 #define RND_0_2_B(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 916 /* h += W_X[i] */ \
wolfSSL 15:117db924cf7c 917 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 918 /* L2 = f ^ g */ \
wolfSSL 15:117db924cf7c 919 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 920
wolfSSL 15:117db924cf7c 921 #define RND_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 922 /* L1 = (e >>> 23) ^ e */ \
wolfSSL 15:117db924cf7c 923 "xorq "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 924 /* L2 = (f ^ g) & e */ \
wolfSSL 15:117db924cf7c 925 "andq "#e", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 926
wolfSSL 15:117db924cf7c 927 #define RND_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 928 /* L1 = ((e >>> 23) ^ e) >>> 4 */ \
wolfSSL 15:117db924cf7c 929 "rorq $4, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 930 /* L2 = ((f ^ g) & e) ^ g = Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 931 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 932
wolfSSL 15:117db924cf7c 933 #define RND_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 934 /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \
wolfSSL 15:117db924cf7c 935 "xorq "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 936 /* h += Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 937 "addq " L2 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 938
wolfSSL 15:117db924cf7c 939 #define RND_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 940 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 = Sigma1(e) */ \
wolfSSL 15:117db924cf7c 941 "rorq $14, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 942 /* L3 = a ^ b */ \
wolfSSL 15:117db924cf7c 943 "xorq "#b", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 944
wolfSSL 15:117db924cf7c 945 #define RND_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 946 /* h += Sigma1(e) */ \
wolfSSL 15:117db924cf7c 947 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 948 /* L2 = a */ \
wolfSSL 15:117db924cf7c 949 "movq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 950
wolfSSL 15:117db924cf7c 951 #define RND_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 952 /* L4 = (a ^ b) & (b ^ c); L4 held b ^ c on entry */ \
wolfSSL 15:117db924cf7c 953 "andq " L3 ", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 954 /* L2 = a >>> 5 */ \
wolfSSL 15:117db924cf7c 955 "rorq $5, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 956
wolfSSL 15:117db924cf7c 957 #define RND_0_9(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 958 /* L2 = (a >>> 5) ^ a */ \
wolfSSL 15:117db924cf7c 959 "xorq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 960 /* L4 = ((a ^ b) & (b ^ c)) ^ b = Maj(a,b,c) */ \
wolfSSL 15:117db924cf7c 961 "xorq "#b", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 962
wolfSSL 15:117db924cf7c 963 #define RND_0_10(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 964 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
wolfSSL 15:117db924cf7c 965 "rorq $6, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 966 /* d += h */ \
wolfSSL 15:117db924cf7c 967 "addq "#h", "#d"\n\t" \
wolfSSL 15:117db924cf7c 968
wolfSSL 15:117db924cf7c 969 #define RND_0_11(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 970 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
wolfSSL 15:117db924cf7c 971 "xorq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 972 /* h += Maj(a,b,c) (held in L4) */ \
wolfSSL 15:117db924cf7c 973 "addq " L4 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 974
wolfSSL 15:117db924cf7c 975 #define RND_0_12(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 976 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 = Sigma0(a) */ \
wolfSSL 15:117db924cf7c 977 "rorq $28, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 978 /* d (= e next RND) */ \
wolfSSL 15:117db924cf7c 979 "movq "#d", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 980 /* h += Sigma0(a) (held in L2) */ \
wolfSSL 15:117db924cf7c 981 "addq " L2 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 982
/* "Variant 1" of the scalar SHA-512 round, in twelve fragments RND_1_1 ..
 * RND_1_12. It performs the same arithmetic as RND_0_* with the roles of the
 * scratch registers L3 and L4 exchanged:
 *  - on entry L1 must hold e and L3 must hold b ^ c (left by the preceding
 *    variant-0 round as its a ^ b)
 *  - on exit L4 holds a ^ b for the following variant-0 round
 *
 * RND_1_2_A and RND_1_2_B are the two halves of RND_1_2.
 */
wolfSSL 15:117db924cf7c 983 #define RND_1_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 984 /* L1 = e >>> 23 */ \
wolfSSL 15:117db924cf7c 985 "rorq $23, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 986
wolfSSL 15:117db924cf7c 987 #define RND_1_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 988 /* L4 = a */ \
wolfSSL 15:117db924cf7c 989 "movq "#a", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 990 /* L2 = f */ \
wolfSSL 15:117db924cf7c 991 "movq "#f", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 992 /* h += W_X[i] */ \
wolfSSL 15:117db924cf7c 993 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 994 /* L2 = f ^ g */ \
wolfSSL 15:117db924cf7c 995 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 996
wolfSSL 15:117db924cf7c 997 #define RND_1_2_A(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 998 /* L4 = a */ \
wolfSSL 15:117db924cf7c 999 "movq "#a", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1000 /* L2 = f */ \
wolfSSL 15:117db924cf7c 1001 "movq "#f", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1002
wolfSSL 15:117db924cf7c 1003 #define RND_1_2_B(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1004 /* h += W_X[i] */ \
wolfSSL 15:117db924cf7c 1005 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 1006 /* L2 = f ^ g */ \
wolfSSL 15:117db924cf7c 1007 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1008
wolfSSL 15:117db924cf7c 1009 #define RND_1_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1010 /* L1 = (e >>> 23) ^ e */ \
wolfSSL 15:117db924cf7c 1011 "xorq "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1012 /* L2 = (f ^ g) & e */ \
wolfSSL 15:117db924cf7c 1013 "andq "#e", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1014
wolfSSL 15:117db924cf7c 1015 #define RND_1_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1016 /* L1 = ((e >>> 23) ^ e) >>> 4 */ \
wolfSSL 15:117db924cf7c 1017 "rorq $4, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1018 /* L2 = ((f ^ g) & e) ^ g = Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1019 "xorq "#g", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1020
wolfSSL 15:117db924cf7c 1021 #define RND_1_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1022 /* L1 = (((e >>> 23) ^ e) >>> 4) ^ e */ \
wolfSSL 15:117db924cf7c 1023 "xorq "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1024 /* h += Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1025 "addq " L2 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1026
wolfSSL 15:117db924cf7c 1027 #define RND_1_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1028 /* L1 = ((((e >>> 23) ^ e) >>> 4) ^ e) >>> 14 = Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1029 "rorq $14, " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1030 /* L4 = a ^ b */ \
wolfSSL 15:117db924cf7c 1031 "xorq "#b", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1032
wolfSSL 15:117db924cf7c 1033 #define RND_1_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1034 /* h += Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1035 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1036 /* L2 = a */ \
wolfSSL 15:117db924cf7c 1037 "movq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1038
wolfSSL 15:117db924cf7c 1039 #define RND_1_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1040 /* L3 = (a ^ b) & (b ^ c); L3 held b ^ c on entry */ \
wolfSSL 15:117db924cf7c 1041 "andq " L4 ", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1042 /* L2 = a >>> 5 */ \
wolfSSL 15:117db924cf7c 1043 "rorq $5, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1044
wolfSSL 15:117db924cf7c 1045 #define RND_1_9(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1046 /* L2 = (a >>> 5) ^ a */ \
wolfSSL 15:117db924cf7c 1047 "xorq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1048 /* L3 = ((a ^ b) & (b ^ c)) ^ b = Maj(a,b,c) */ \
wolfSSL 15:117db924cf7c 1049 "xorq "#b", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1050
wolfSSL 15:117db924cf7c 1051 #define RND_1_10(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1052 /* L2 = ((a >>> 5) ^ a) >>> 6 */ \
wolfSSL 15:117db924cf7c 1053 "rorq $6, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1054 /* d += h */ \
wolfSSL 15:117db924cf7c 1055 "addq "#h", "#d"\n\t" \
wolfSSL 15:117db924cf7c 1056
wolfSSL 15:117db924cf7c 1057 #define RND_1_11(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1058 /* L2 = (((a >>> 5) ^ a) >>> 6) ^ a */ \
wolfSSL 15:117db924cf7c 1059 "xorq "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1060 /* h += Maj(a,b,c) (held in L3) */ \
wolfSSL 15:117db924cf7c 1061 "addq " L3 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1062
wolfSSL 15:117db924cf7c 1063 #define RND_1_12(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1064 /* L2 = ((((a >>> 5) ^ a) >>> 6) ^ a) >>> 28 = Sigma0(a) */ \
wolfSSL 15:117db924cf7c 1065 "rorq $28, " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1066 /* d (= e next RND) */ \
wolfSSL 15:117db924cf7c 1067 "movq "#d", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1068 /* h += Sigma0(a) (held in L2) */ \
wolfSSL 15:117db924cf7c 1069 "addq " L2 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1070
wolfSSL 15:117db924cf7c 1071
/* Two scalar rounds (index i with state a..h, then index i+1 with the state
 * rotated by one) interleaved with the vector computation of the next two
 * message-schedule words, which replace the contents of the first W argument.
 *
 * The W parameters are the caller's current register assignment for the
 * schedule window; W_M15, W_M7 and XTMP1..4 are not parameters and refer to
 * the fixed temporaries defined above. Vector part, as the lines below show:
 *   W_M15 = vpalignr(W_2 : W_0, 8), W_M7 = vpalignr(W_10 : W_8, 8)
 *   XTMP1 = (W_M15 >>> 1) ^ (W_M15 >>> 8) ^ (W_M15 >> 7)
 *   W_0  += W_M7 + XTMP1
 *   XTMP1 = (W_14 >>> 19) ^ (W_14 >>> 61) ^ (W_14 >> 6)
 *   W_0  += XTMP1
 * where each rotate is built from a right shift OR-ed with a left shift.
 * The statement order is the instruction schedule; do not reorder.
 */
wolfSSL 15:117db924cf7c 1072 #define MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1073 RND_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1074 VPALIGNR(W_M15, W_2, W_0, 8) \
wolfSSL 15:117db924cf7c 1075 VPALIGNR(W_M7, W_10, W_8, 8) \
wolfSSL 15:117db924cf7c 1076 RND_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1077 V_SHIFT_R(XTMP1, W_M15, 1) \
wolfSSL 15:117db924cf7c 1078 V_SHIFT_L(XTMP2, W_M15, 63) \
wolfSSL 15:117db924cf7c 1079 RND_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1080 RND_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1081 V_SHIFT_R(XTMP3, W_M15, 8) \
wolfSSL 15:117db924cf7c 1082 V_SHIFT_L(XTMP4, W_M15, 56) \
wolfSSL 15:117db924cf7c 1083 RND_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1084 RND_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1085 V_OR(XTMP1, XTMP2, XTMP1) \
wolfSSL 15:117db924cf7c 1086 V_OR(XTMP3, XTMP4, XTMP3) \
wolfSSL 15:117db924cf7c 1087 RND_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1088 RND_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1089 V_SHIFT_R(XTMP4, W_M15, 7) \
wolfSSL 15:117db924cf7c 1090 V_XOR(XTMP1, XTMP3, XTMP1) \
wolfSSL 15:117db924cf7c 1091 RND_0_9(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1092 RND_0_10(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1093 V_XOR(XTMP1, XTMP4, XTMP1) \
wolfSSL 15:117db924cf7c 1094 V_ADD(W_0, W_0, W_M7) \
wolfSSL 15:117db924cf7c 1095 RND_0_11(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1096 RND_0_12(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1097 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1098 V_ADD(W_0, W_0, XTMP1) \
wolfSSL 15:117db924cf7c 1099 RND_1_2(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1100 V_SHIFT_R(XTMP1, W_14, 19) \
wolfSSL 15:117db924cf7c 1101 V_SHIFT_L(XTMP2, W_14, 45) \
wolfSSL 15:117db924cf7c 1102 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1103 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1104 V_SHIFT_R(XTMP3, W_14, 61) \
wolfSSL 15:117db924cf7c 1105 V_SHIFT_L(XTMP4, W_14, 3) \
wolfSSL 15:117db924cf7c 1106 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1107 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1108 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1109 V_OR(XTMP1, XTMP2, XTMP1) \
wolfSSL 15:117db924cf7c 1110 V_OR(XTMP3, XTMP4, XTMP3) \
wolfSSL 15:117db924cf7c 1111 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1112 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1113 V_XOR(XTMP1, XTMP3, XTMP1) \
wolfSSL 15:117db924cf7c 1114 V_SHIFT_R(XTMP4, W_14, 6) \
wolfSSL 15:117db924cf7c 1115 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1116 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1117 V_XOR(XTMP1, XTMP4, XTMP1) \
wolfSSL 15:117db924cf7c 1118 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1119 V_ADD(W_0, W_0, XTMP1) \
wolfSSL 15:117db924cf7c 1120
/* Two consecutive scalar rounds with no message-schedule work in between:
 * all twelve RND_0_* fragments for index i on state (a..h), followed by all
 * twelve RND_1_* fragments for index i+1 on the state rotated by one position
 * (h,a,b,c,d,e,f,g).
 */
wolfSSL 15:117db924cf7c 1121 #define RND_ALL_2(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1122 RND_0_1 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1123 RND_0_2 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1124 RND_0_3 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1125 RND_0_4 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1126 RND_0_5 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1127 RND_0_6 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1128 RND_0_7 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1129 RND_0_8 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1130 RND_0_9 (a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1131 RND_0_10(a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1132 RND_0_11(a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1133 RND_0_12(a, b, c, d, e, f, g, h, i ) \
wolfSSL 15:117db924cf7c 1134 RND_1_1 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1135 RND_1_2 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1136 RND_1_3 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1137 RND_1_4 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1138 RND_1_5 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1139 RND_1_6 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1140 RND_1_7 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1141 RND_1_8 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1142 RND_1_9 (h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1143 RND_1_10(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1144 RND_1_11(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1145 RND_1_12(h, a, b, c, d, e, f, g, i+1)
wolfSSL 15:117db924cf7c 1146
wolfSSL 15:117db924cf7c 1147
wolfSSL 15:117db924cf7c 1148 #if defined(HAVE_INTEL_RORX)
wolfSSL 15:117db924cf7c 1149
/* SHA-512 round built on rorxq, which writes the rotated value of one
 * register into a different one, so e and a are rotated directly instead of
 * through a chain of in-place rotates. "Variant 0", in eight fragments.
 *
 * Visible register contract:
 *  - Maj of the previous round is not added inside that round; RND_RORX_0_1
 *    adds it (from L3) to this round's a, which is the previous round's h
 *  - on entry L4 must hold b ^ c; on exit L4 holds Maj(a,b,c) of this round,
 *    which RND_RORX_1_1 adds, and L3 holds a ^ b
 *  - unlike RND_0_*, L1 is not expected to hold e on entry
 */
wolfSSL 15:117db924cf7c 1150 #define RND_RORX_0_1(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1151 /* L1 = e>>>14 */ \
wolfSSL 15:117db924cf7c 1152 "rorxq $14, "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1153 /* L2 = e>>>18 */ \
wolfSSL 15:117db924cf7c 1154 "rorxq $18, "#e", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1155 /* Prev RND: h += Maj(a,b,c) (prev h is this round's a) */ \
wolfSSL 15:117db924cf7c 1156 "addq " L3 ", "#a"\n\t" \
wolfSSL 15:117db924cf7c 1157
wolfSSL 15:117db924cf7c 1158 #define RND_RORX_0_2(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1159 /* h += w_k */ \
wolfSSL 15:117db924cf7c 1160 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 1161 /* L3 = f */ \
wolfSSL 15:117db924cf7c 1162 "movq "#f", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1163 /* L2 = (e>>>14) ^ (e>>>18) */ \
wolfSSL 15:117db924cf7c 1164 "xorq " L1 ", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1165
wolfSSL 15:117db924cf7c 1166 #define RND_RORX_0_3(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1167 /* L3 = f ^ g */ \
wolfSSL 15:117db924cf7c 1168 "xorq "#g", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1169 /* L1 = e>>>41 */ \
wolfSSL 15:117db924cf7c 1170 "rorxq $41, "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1171 /* L1 = Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1172 "xorq " L2 ", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1173
wolfSSL 15:117db924cf7c 1174 #define RND_RORX_0_4(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1175 /* L3 = (f ^ g) & e */ \
wolfSSL 15:117db924cf7c 1176 "andq "#e", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1177 /* h += Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1178 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1179 /* L1 = a>>>28 */ \
wolfSSL 15:117db924cf7c 1180 "rorxq $28, "#a", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1181
wolfSSL 15:117db924cf7c 1182 #define RND_RORX_0_5(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1183 /* L2 = a>>>34 */ \
wolfSSL 15:117db924cf7c 1184 "rorxq $34, "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1185 /* L3 = Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1186 "xorq "#g", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1187 /* L2 = (a>>>28) ^ (a>>>34) */ \
wolfSSL 15:117db924cf7c 1188 "xorq " L1 ", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1189
wolfSSL 15:117db924cf7c 1190 #define RND_RORX_0_6(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1191 /* L1 = a>>>39 */ \
wolfSSL 15:117db924cf7c 1192 "rorxq $39, "#a", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1193 /* h += Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1194 "addq " L3 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1195 /* L1 = Sigma0(a) */ \
wolfSSL 15:117db924cf7c 1196 "xorq " L2 ", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1197
wolfSSL 15:117db924cf7c 1198 #define RND_RORX_0_7(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1199 /* L3 = b */ \
wolfSSL 15:117db924cf7c 1200 "movq "#b", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1201 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1202 "addq "#h", "#d"\n\t" \
wolfSSL 15:117db924cf7c 1203 /* L3 = a ^ b */ \
wolfSSL 15:117db924cf7c 1204 "xorq "#a", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1205
wolfSSL 15:117db924cf7c 1206 #define RND_RORX_0_8(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1207 /* L4 = (a ^ b) & (b ^ c); L4 held b ^ c on entry */ \
wolfSSL 15:117db924cf7c 1208 "andq " L3 ", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1209 /* h += Sigma0(a) */ \
wolfSSL 15:117db924cf7c 1210 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1211 /* L4 = Maj(a,b,c); added to h by the next round's first fragment */ \
wolfSSL 15:117db924cf7c 1212 "xorq "#b", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1213
/* Odd-round step 1 (asm string fragment).
 * Starts Sigma1(e): L1 = e>>>14, L2 = e>>>18. Also adds L4 to a; callers
 * pass the previous round's h as this round's a, so this applies the Maj
 * value the previous (even) round left in L4.
 * Only a and e are referenced. */
wolfSSL 15:117db924cf7c 1214 #define RND_RORX_1_1(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1215 /* L1 = e>>>14 */ \
wolfSSL 15:117db924cf7c 1216 "rorxq $14, "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1217 /* L2 = e>>>18 */ \
wolfSSL 15:117db924cf7c 1218 "rorxq $18, "#e", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1219 /* Prev RND: h += Maj(a,b,c) */ \
wolfSSL 15:117db924cf7c 1220 "addq " L4 ", "#a"\n\t" \
wolfSSL 15:117db924cf7c 1221
/* Odd-round step 2 (asm string fragment).
 * h += the 64-bit value at offset i*8 from WX (the slots SET_W_X_2 fills
 * with table value + message word), L4 = f, and L2 ^= L1.
 * i is stringized into the address expression, so it must be an
 * assembler-evaluable constant expression. */
wolfSSL 15:117db924cf7c 1222 #define RND_RORX_1_2(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1223 /* h += w_k */ \
wolfSSL 15:117db924cf7c 1224 "addq ("#i")*8(" WX "), "#h"\n\t" \
wolfSSL 15:117db924cf7c 1225 /* L4 = f */ \
wolfSSL 15:117db924cf7c 1226 "movq "#f", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1227 /* L2 = (e>>>14) ^ (e>>>18) */ \
wolfSSL 15:117db924cf7c 1228 "xorq " L1 ", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1229
/* Odd-round step 3 (asm string fragment).
 * L4 ^= g (giving f ^ g) and L1 = (e>>>41) ^ L2, which completes
 * Sigma1(e) = (e>>>14) ^ (e>>>18) ^ (e>>>41).
 * Only e and g are referenced. */
wolfSSL 15:117db924cf7c 1230 #define RND_RORX_1_3(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1231 /* L4 = f ^ g */ \
wolfSSL 15:117db924cf7c 1232 "xorq "#g", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1233 /* L1 = e>>>41 */ \
wolfSSL 15:117db924cf7c 1234 "rorxq $41, "#e", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1235 /* L1 = Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1236 "xorq " L2 ", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1237
/* Odd-round step 4 (asm string fragment).
 * L4 &= e, h += L1 (Sigma1(e)), then L1 is reused to start Sigma0(a)
 * with a>>>28.
 * Only a, e and h are referenced. */
wolfSSL 15:117db924cf7c 1238 #define RND_RORX_1_4(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1239 /* L4 = (f ^ g) & e */ \
wolfSSL 15:117db924cf7c 1240 "andq "#e", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1241 /* h += Sigma1(e) */ \
wolfSSL 15:117db924cf7c 1242 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1243 /* L1 = a>>>28 */ \
wolfSSL 15:117db924cf7c 1244 "rorxq $28, "#a", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1245
/* Odd-round step 5 (asm string fragment).
 * L2 = (a>>>34) ^ L1 (L1 = a>>>28 from step 4) and L4 ^= g, turning
 * (f ^ g) & e into Ch(e,f,g).
 * Only a and g are referenced. */
wolfSSL 15:117db924cf7c 1246 #define RND_RORX_1_5(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1247 /* L2 = a>>>34 */ \
wolfSSL 15:117db924cf7c 1248 "rorxq $34, "#a", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1249 /* L4 = Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1250 "xorq "#g", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1251 /* L2 = (a>>>28) ^ (a>>>34) */ \
wolfSSL 15:117db924cf7c 1252 "xorq " L1 ", " L2 "\n\t" \
wolfSSL 15:117db924cf7c 1253
/* Odd-round step 6 (asm string fragment).
 * L1 = (a>>>39) ^ L2, completing Sigma0(a), and h += L4 (Ch(e,f,g)).
 * Only a and h are referenced. */
wolfSSL 15:117db924cf7c 1254 #define RND_RORX_1_6(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1255 /* L1 = a>>>39 */ \
wolfSSL 15:117db924cf7c 1256 "rorxq $39, "#a", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1257 /* h += Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1258 "addq " L4 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1259 /* L1 = Sigma0(a) */ \
wolfSSL 15:117db924cf7c 1260 "xorq " L2 ", " L1 "\n\t" \
wolfSSL 15:117db924cf7c 1261
/* Odd-round step 7 (asm string fragment).
 * d += h and L4 = a ^ b, which RND_RORX_1_8 ANDs into L3.
 * Only a, b, d and h are referenced. */
wolfSSL 15:117db924cf7c 1262 #define RND_RORX_1_7(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1263 /* L4 = b */ \
wolfSSL 15:117db924cf7c 1264 "movq "#b", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1265 /* d += h + w_k + Sigma1(e) + Ch(e,f,g) */ \
wolfSSL 15:117db924cf7c 1266 "addq "#h", "#d"\n\t" \
wolfSSL 15:117db924cf7c 1267 /* L4 = a ^ b */ \
wolfSSL 15:117db924cf7c 1268 "xorq "#a", " L4 "\n\t" \
wolfSSL 15:117db924cf7c 1269
/* Odd-round step 8 (asm string fragment).
 * L3 = (L3 & L4) ^ b and h += L1 (Sigma0(a)). L3 enters holding the
 * previous round's a ^ b, which is this round's b ^ c because the working
 * variables are rotated by one between rounds; the result is Maj(a,b,c).
 * As in the even round, Maj is left in a scratch register (L3 here) for a
 * later instruction to add to h.
 * Only b and h are referenced. */
wolfSSL 15:117db924cf7c 1270 #define RND_RORX_1_8(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1271 /* L3 = (a ^ b) & (b ^ c) */ \
wolfSSL 15:117db924cf7c 1272 "andq " L4 ", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1273 /* h += Sigma0(a) */ \
wolfSSL 15:117db924cf7c 1274 "addq " L1 ", "#h"\n\t" \
wolfSSL 15:117db924cf7c 1275 /* L3 = Maj(a,b,c) */ \
wolfSSL 15:117db924cf7c 1276 "xorq "#b", " L3 "\n\t" \
wolfSSL 15:117db924cf7c 1277
/* Two complete RORX rounds with no message scheduling.
 * The eight even-round steps run on (a..h) with schedule index i; the
 * eight odd-round steps then run with the working variables rotated by one
 * position (h,a,b,c,d,e,f,g) and index i+1. */
wolfSSL 15:117db924cf7c 1278 #define RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1279 RND_RORX_0_1(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1280 RND_RORX_0_2(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1281 RND_RORX_0_3(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1282 RND_RORX_0_4(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1283 RND_RORX_0_5(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1284 RND_RORX_0_6(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1285 RND_RORX_0_7(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1286 RND_RORX_0_8(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1287 RND_RORX_1_1(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1288 RND_RORX_1_2(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1289 RND_RORX_1_3(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1290 RND_RORX_1_4(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1291 RND_RORX_1_5(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1292 RND_RORX_1_6(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1293 RND_RORX_1_7(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1294 RND_RORX_1_8(h, a, b, c, d, e, f, g, i+1) \
wolfSSL 15:117db924cf7c 1295
/* Four RORX rounds: RND_RORX_ALL_2 on (a..h) at index i, then again with
 * the working variables rotated by two positions (g,h,a,b,c,d,e,f) at
 * index i+2. */
wolfSSL 15:117db924cf7c 1296 #define RND_RORX_ALL_4(a, b, c, d, e, f, g, h, i) \
wolfSSL 15:117db924cf7c 1297 RND_RORX_ALL_2(a, b, c, d, e, f, g, h, i+0) \
wolfSSL 15:117db924cf7c 1298 RND_RORX_ALL_2(g, h, a, b, c, d, e, f, i+2)
wolfSSL 15:117db924cf7c 1299
/* Two RORX rounds (even round at index i, odd round at i+1) interleaved
 * with the vector update of the schedule register W_0.
 *
 * Vector side, assuming VPALIGNR / V_SHIFT_R / V_SHIFT_L / V_OR / V_XOR /
 * V_ADD (defined outside this view) are destination-first wrappers around
 * the matching AVX instructions - TODO confirm:
 *   W_M15 and W_M7 are built from (W_2, W_0) and (W_10, W_8) with an
 *   8-byte alignment shift;
 *   XTMP1 = ror(W_M15,1) ^ ror(W_M15,8) ^ (W_M15 >> 7), and
 *   W_0 += W_M7 + XTMP1;
 *   XTMP1 = ror(W_14,19) ^ ror(W_14,61) ^ (W_14 >> 6), and W_0 += XTMP1.
 * Each rotate is formed as (x >> n) | (x << (64 - n)).
 *
 * W_M15, W_M7 and XTMP1..XTMP4 are not parameters; they resolve to
 * file-level definitions. W_4, W_6 and W_12 are accepted but not
 * referenced in this macro. */
wolfSSL 15:117db924cf7c 1300 #define MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1301 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1302 VPALIGNR(W_M15, W_2, W_0, 8) \
wolfSSL 15:117db924cf7c 1303 VPALIGNR(W_M7, W_10, W_8, 8) \
wolfSSL 15:117db924cf7c 1304 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
/* Pieces of ror(W_M15, 1): >> 1 and << 63 */ \
wolfSSL 15:117db924cf7c 1305 V_SHIFT_R(XTMP1, W_M15, 1) \
wolfSSL 15:117db924cf7c 1306 V_SHIFT_L(XTMP2, W_M15, 63) \
wolfSSL 15:117db924cf7c 1307 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
/* Pieces of ror(W_M15, 8): >> 8 and << 56 */ \
wolfSSL 15:117db924cf7c 1308 V_SHIFT_R(XTMP3, W_M15, 8) \
wolfSSL 15:117db924cf7c 1309 V_SHIFT_L(XTMP4, W_M15, 56) \
wolfSSL 15:117db924cf7c 1310 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1311 V_OR(XTMP1, XTMP2, XTMP1) \
wolfSSL 15:117db924cf7c 1312 V_OR(XTMP3, XTMP4, XTMP3) \
wolfSSL 15:117db924cf7c 1313 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1314 V_SHIFT_R(XTMP4, W_M15, 7) \
wolfSSL 15:117db924cf7c 1315 V_XOR(XTMP1, XTMP3, XTMP1) \
wolfSSL 15:117db924cf7c 1316 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1317 V_XOR(XTMP1, XTMP4, XTMP1) \
wolfSSL 15:117db924cf7c 1318 V_ADD(W_0, W_0, W_M7) \
wolfSSL 15:117db924cf7c 1319 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1320 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1321 V_ADD(W_0, W_0, XTMP1) \
wolfSSL 15:117db924cf7c 1322 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
/* Pieces of ror(W_14, 19): >> 19 and << 45 */ \
wolfSSL 15:117db924cf7c 1323 V_SHIFT_R(XTMP1, W_14, 19) \
wolfSSL 15:117db924cf7c 1324 V_SHIFT_L(XTMP2, W_14, 45) \
wolfSSL 15:117db924cf7c 1325 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
/* Pieces of ror(W_14, 61): >> 61 and << 3 */ \
wolfSSL 15:117db924cf7c 1326 V_SHIFT_R(XTMP3, W_14, 61) \
wolfSSL 15:117db924cf7c 1327 V_SHIFT_L(XTMP4, W_14, 3) \
wolfSSL 15:117db924cf7c 1328 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1329 V_OR(XTMP1, XTMP2, XTMP1) \
wolfSSL 15:117db924cf7c 1330 V_OR(XTMP3, XTMP4, XTMP3) \
wolfSSL 15:117db924cf7c 1331 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1332 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1333 V_XOR(XTMP1, XTMP3, XTMP1) \
wolfSSL 15:117db924cf7c 1334 V_SHIFT_R(XTMP4, W_14, 6) \
wolfSSL 15:117db924cf7c 1335 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1336 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1337 V_XOR(XTMP1, XTMP4, XTMP1) \
wolfSSL 15:117db924cf7c 1338 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1339 V_ADD(W_0, W_0, XTMP1) \
wolfSSL 15:117db924cf7c 1340
wolfSSL 15:117db924cf7c 1341 #endif
wolfSSL 15:117db924cf7c 1342
/* Emit a vmovdqu that loads the asm operand named [mask] into the register
 * named by the argument. INIT_MASK forwards to _INIT_MASK so that a macro
 * argument (e.g. MASK) is expanded to its register name before '#'
 * stringizes it. */
wolfSSL 15:117db924cf7c 1343 #define _INIT_MASK(mask) \
wolfSSL 15:117db924cf7c 1344 "vmovdqu %[mask], %%" #mask "\n\t"
wolfSSL 15:117db924cf7c 1345 #define INIT_MASK(mask) \
wolfSSL 15:117db924cf7c 1346 _INIT_MASK(mask)
wolfSSL 15:117db924cf7c 1347
/* Load two 16-byte chunks, at offsets i1*16 and i2*16 from the address in
 * reg, into xmm1 and xmm2 (unaligned loads), then run vpshufb on each
 * register in place with mask as the shuffle control.
 * LOAD_W_2 forwards to _LOAD_W_2 so macro arguments are expanded before
 * being stringized. */
wolfSSL 15:117db924cf7c 1348 #define _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
wolfSSL 15:117db924cf7c 1349 "vmovdqu " #i1 "*16(%%" #reg "), %%" #xmm1 "\n\t" \
wolfSSL 15:117db924cf7c 1350 "vmovdqu " #i2 "*16(%%" #reg "), %%" #xmm2 "\n\t" \
wolfSSL 15:117db924cf7c 1351 "vpshufb %%" #mask ", %%" #xmm1 ", %%" #xmm1 "\n\t" \
wolfSSL 15:117db924cf7c 1352 "vpshufb %%" #mask ", %%" #xmm2 ", %%" #xmm2 "\n\t"
wolfSSL 15:117db924cf7c 1353 #define LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg) \
wolfSSL 15:117db924cf7c 1354 _LOAD_W_2(i1, i2, xmm1, xmm2, mask, reg)
wolfSSL 15:117db924cf7c 1355
/* Load the 128 bytes at the address in reg into the eight schedule
 * registers W_0..W_14 (16 bytes each), shuffling each with mask. */
wolfSSL 15:117db924cf7c 1356 #define LOAD_W(mask, reg) \
wolfSSL 15:117db924cf7c 1357 /* W_0..W_14 = shuffled buffer[0..15], two 64-bit words per register */ \
wolfSSL 15:117db924cf7c 1358 LOAD_W_2(0, 1, W_0 , W_2 , mask, reg) \
wolfSSL 15:117db924cf7c 1359 LOAD_W_2(2, 3, W_4 , W_6 , mask, reg) \
wolfSSL 15:117db924cf7c 1360 LOAD_W_2(4, 5, W_8 , W_10, mask, reg) \
wolfSSL 15:117db924cf7c 1361 LOAD_W_2(6, 7, W_12, W_14, mask, reg)
wolfSSL 15:117db924cf7c 1362
/* Add the 32 bytes at byte offset i from the address in reg to xmm0 and
 * xmm1 (64-bit lane adds) and store the two sums at byte offsets i and
 * i+16 from WX. xmm8 and xmm9 are hard-coded temporaries and are
 * overwritten. In the transforms below reg points into K512, so the stored
 * values are round constant + message word.
 * SET_W_X_2 forwards to _SET_W_X_2 so macro arguments are expanded before
 * being stringized. */
wolfSSL 15:117db924cf7c 1363 #define _SET_W_X_2(xmm0, xmm1, reg, i) \
wolfSSL 15:117db924cf7c 1364 "vpaddq " #i "+ 0(%%" #reg "), %%" #xmm0 ", %%xmm8\n\t" \
wolfSSL 15:117db924cf7c 1365 "vpaddq " #i "+16(%%" #reg "), %%" #xmm1 ", %%xmm9\n\t" \
wolfSSL 15:117db924cf7c 1366 "vmovdqu %%xmm8, " #i "+ 0(" WX ")\n\t" \
wolfSSL 15:117db924cf7c 1367 "vmovdqu %%xmm9, " #i "+16(" WX ")\n\t" \
wolfSSL 15:117db924cf7c 1368
wolfSSL 15:117db924cf7c 1369 #define SET_W_X_2(xmm0, xmm1, reg, i) \
wolfSSL 15:117db924cf7c 1370 _SET_W_X_2(xmm0, xmm1, reg, i)
wolfSSL 15:117db924cf7c 1371
/* Fill the sixteen 8-byte slots at WX with (16 values at reg) + (current
 * W_0..W_14), 32 bytes per SET_W_X_2 call. Overwrites xmm8/xmm9. */
wolfSSL 15:117db924cf7c 1372 #define SET_W_X(reg) \
wolfSSL 15:117db924cf7c 1373 SET_W_X_2(W_0 , W_2 , reg, 0) \
wolfSSL 15:117db924cf7c 1374 SET_W_X_2(W_4 , W_6 , reg, 32) \
wolfSSL 15:117db924cf7c 1375 SET_W_X_2(W_8 , W_10, reg, 64) \
wolfSSL 15:117db924cf7c 1376 SET_W_X_2(W_12, W_14, reg, 96)
wolfSSL 15:117db924cf7c 1377
/* Load the eight 64-bit words at offsets 0..56 of the [sha512] operand
 * into r8..r15 (word k goes to r(8+k)). */
wolfSSL 15:117db924cf7c 1378 #define LOAD_DIGEST() \
wolfSSL 15:117db924cf7c 1379 "movq (%[sha512]), %%r8 \n\t" \
wolfSSL 15:117db924cf7c 1380 "movq 8(%[sha512]), %%r9 \n\t" \
wolfSSL 15:117db924cf7c 1381 "movq 16(%[sha512]), %%r10\n\t" \
wolfSSL 15:117db924cf7c 1382 "movq 24(%[sha512]), %%r11\n\t" \
wolfSSL 15:117db924cf7c 1383 "movq 32(%[sha512]), %%r12\n\t" \
wolfSSL 15:117db924cf7c 1384 "movq 40(%[sha512]), %%r13\n\t" \
wolfSSL 15:117db924cf7c 1385 "movq 48(%[sha512]), %%r14\n\t" \
wolfSSL 15:117db924cf7c 1386 "movq 56(%[sha512]), %%r15\n\t"
wolfSSL 15:117db924cf7c 1387
/* Add r8..r15 into the eight 64-bit words at offsets 0..56 of [sha512]
 * (memory += register). The registers are left unchanged; the addq
 * instructions modify the flags. */
wolfSSL 15:117db924cf7c 1388 #define STORE_ADD_DIGEST() \
wolfSSL 15:117db924cf7c 1389 "addq %%r8, (%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1390 "addq %%r9, 8(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1391 "addq %%r10, 16(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1392 "addq %%r11, 24(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1393 "addq %%r12, 32(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1394 "addq %%r13, 40(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1395 "addq %%r14, 48(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1396 "addq %%r15, 56(%[sha512])\n\t"
wolfSSL 15:117db924cf7c 1397
/* Add the eight 64-bit words at offsets 0..56 of [sha512] into r8..r15
 * (register += memory). Memory is left unchanged; pair with STORE_DIGEST
 * to write the sums back. */
wolfSSL 15:117db924cf7c 1398 #define ADD_DIGEST() \
wolfSSL 15:117db924cf7c 1399 "addq (%[sha512]), %%r8 \n\t" \
wolfSSL 15:117db924cf7c 1400 "addq 8(%[sha512]), %%r9 \n\t" \
wolfSSL 15:117db924cf7c 1401 "addq 16(%[sha512]), %%r10\n\t" \
wolfSSL 15:117db924cf7c 1402 "addq 24(%[sha512]), %%r11\n\t" \
wolfSSL 15:117db924cf7c 1403 "addq 32(%[sha512]), %%r12\n\t" \
wolfSSL 15:117db924cf7c 1404 "addq 40(%[sha512]), %%r13\n\t" \
wolfSSL 15:117db924cf7c 1405 "addq 48(%[sha512]), %%r14\n\t" \
wolfSSL 15:117db924cf7c 1406 "addq 56(%[sha512]), %%r15\n\t"
wolfSSL 15:117db924cf7c 1407
/* Store r8..r15 to the eight 64-bit words at offsets 0..56 of [sha512].
 * Uses movq only, so the flags are left untouched; the *_Len transforms
 * rely on this to place it between a subl and the jnz that tests it. */
wolfSSL 15:117db924cf7c 1408 #define STORE_DIGEST() \
wolfSSL 15:117db924cf7c 1409 "movq %%r8, (%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1410 "movq %%r9, 8(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1411 "movq %%r10, 16(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1412 "movq %%r11, 24(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1413 "movq %%r12, 32(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1414 "movq %%r13, 40(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1415 "movq %%r14, 48(%[sha512])\n\t" \
wolfSSL 15:117db924cf7c 1416 "movq %%r15, 56(%[sha512])\n\t"
wolfSSL 15:117db924cf7c 1417
wolfSSL 15:117db924cf7c 1418 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 15:117db924cf7c 1419
wolfSSL 15:117db924cf7c 1420
wolfSSL 15:117db924cf7c 1421 /*** Transform Body ***/
wolfSSL 15:117db924cf7c 1422 #if defined(HAVE_INTEL_AVX1)
/* Compress one 128-byte block with the AVX1 round macros.
 *
 * sha512: the eight 64-bit state words are read from offsets 0..56 and the
 *         block is read from offset 64 (shuffled with mBYTE_FLIP_MASK on
 *         load). The state words are updated in place.
 * Returns 0 unconditionally.
 *
 * Structure: a loop that runs 4 times, each pass doing 8 MsgSched2 calls
 * (presumably two rounds each, going by the index step of 2), followed by
 * 8 RND_ALL_2 calls with no further scheduling.
 *
 * The asm moves rsp itself to reserve 136 bytes (16 W+K slots plus the loop
 * counter at 16*8) and restores it before finishing. rax and rsi are
 * written; rsi is named in the clobber list, rax is presumably covered by
 * WORK_REGS (defined outside this view) - confirm. */
wolfSSL 15:117db924cf7c 1423 static int Transform_Sha512_AVX1(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 1424 {
wolfSSL 15:117db924cf7c 1425 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 1426
wolfSSL 15:117db924cf7c 1427 /* 16 Ws plus loop counter. */
wolfSSL 15:117db924cf7c 1428 "subq $136, %%rsp\n\t"
/* rax = address of the message block inside *sha512 */
wolfSSL 15:117db924cf7c 1429 "leaq 64(%[sha512]), %%rax\n\t"
wolfSSL 15:117db924cf7c 1430
wolfSSL 15:117db924cf7c 1431 INIT_MASK(MASK)
wolfSSL 15:117db924cf7c 1432 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 1433
wolfSSL 15:117db924cf7c 1434 LOAD_W(MASK, rax)
wolfSSL 15:117db924cf7c 1435
/* Loop counter: 4 passes through the scheduled-round loop */
wolfSSL 15:117db924cf7c 1436 "movl $4, 16*8(" WX ")\n\t"
/* rsi = current position in the K512 constant table */
wolfSSL 15:117db924cf7c 1437 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 1438 /* b */
wolfSSL 15:117db924cf7c 1439 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1440 /* e */
wolfSSL 15:117db924cf7c 1441 "movq %%r12, " L1 "\n\t"
wolfSSL 15:117db924cf7c 1442 /* b ^ c */
wolfSSL 15:117db924cf7c 1443 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1444
wolfSSL 15:117db924cf7c 1445 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 1446 "1:\n\t"
wolfSSL 15:117db924cf7c 1447
/* Stage K + W for the next 16 rounds, then advance rsi by 16 constants */
wolfSSL 15:117db924cf7c 1448 SET_W_X(rsi)
wolfSSL 15:117db924cf7c 1449
wolfSSL 15:117db924cf7c 1450 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 1451
wolfSSL 15:117db924cf7c 1452 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1453 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1454 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1455 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1456 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1457 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1458 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1459 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1460
wolfSSL 15:117db924cf7c 1461 "subl $1, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1462 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 1463
/* Stage K + W for the closing rounds, which do no further scheduling */
wolfSSL 15:117db924cf7c 1464 SET_W_X(rsi)
wolfSSL 15:117db924cf7c 1465
wolfSSL 15:117db924cf7c 1466 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1467 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1468 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1469 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1470
wolfSSL 15:117db924cf7c 1471 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1472 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1473 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1474 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1475
/* state[k] += working variable k, directly in memory */
wolfSSL 15:117db924cf7c 1476 STORE_ADD_DIGEST()
wolfSSL 15:117db924cf7c 1477
wolfSSL 15:117db924cf7c 1478 "addq $136, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1479
wolfSSL 15:117db924cf7c 1480 :
wolfSSL 15:117db924cf7c 1481 : [mask] "m" (mBYTE_FLIP_MASK),
wolfSSL 15:117db924cf7c 1482 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 1483 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 1484 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 1485 );
wolfSSL 15:117db924cf7c 1486
wolfSSL 15:117db924cf7c 1487 return 0;
wolfSSL 15:117db924cf7c 1488 }
wolfSSL 15:117db924cf7c 1489
/* Multi-block variant of Transform_Sha512_AVX1.
 *
 * sha512: state words at offsets 0..56 are loaded once, kept in r8..r15
 *         across blocks, and written back after every block. The input
 *         pointer is read from offset 224 of *sha512 (field not visible
 *         here - presumably a caller-set data pointer; confirm against the
 *         wc_Sha512 layout).
 * len:    number of bytes to process. The block loop tests its exit
 *         condition only after a block has been processed and leaves only
 *         when len has been reduced to exactly zero, so len must be a
 *         non-zero multiple of 128.
 * Returns 0 unconditionally.
 *
 * NOTE(review): %[len] is declared as an input-only "m" operand but is
 * modified by subl; GCC's extended-asm rules expect modified operands to
 * be outputs ("+m"). It works in practice here because len is not read
 * after the asm - confirm before reusing this pattern.
 * NOTE(review): rdx is written but not named in the clobber list;
 * presumably it is covered by WORK_REGS (defined outside this view). */
wolfSSL 15:117db924cf7c 1490 static int Transform_Sha512_AVX1_Len(wc_Sha512* sha512, word32 len)
wolfSSL 15:117db924cf7c 1491 {
wolfSSL 15:117db924cf7c 1492 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 1493
/* rsi = input pointer, rdx = start of the K512 table */
wolfSSL 15:117db924cf7c 1494 "movq 224(%[sha512]), %%rsi\n\t"
wolfSSL 15:117db924cf7c 1495 "leaq %[K512], %%rdx\n\t"
wolfSSL 15:117db924cf7c 1496
wolfSSL 15:117db924cf7c 1497 INIT_MASK(MASK)
wolfSSL 15:117db924cf7c 1498 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 1499
wolfSSL 15:117db924cf7c 1500 "# Start of processing a block\n"
wolfSSL 15:117db924cf7c 1501 "2:\n\t"
wolfSSL 15:117db924cf7c 1502
wolfSSL 15:117db924cf7c 1503 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
wolfSSL 15:117db924cf7c 1504 * Debug needs more stack space. */
wolfSSL 15:117db924cf7c 1505 "subq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1506
wolfSSL 15:117db924cf7c 1507 LOAD_W(MASK, rsi)
wolfSSL 15:117db924cf7c 1508
/* Loop counter: 4 passes through the scheduled-round loop */
wolfSSL 15:117db924cf7c 1509 "movl $4, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1510 /* b */
wolfSSL 15:117db924cf7c 1511 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1512 /* e */
wolfSSL 15:117db924cf7c 1513 "movq %%r12, " L1 "\n\t"
wolfSSL 15:117db924cf7c 1514 /* b ^ c */
wolfSSL 15:117db924cf7c 1515 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1516
wolfSSL 15:117db924cf7c 1517 SET_W_X(rdx)
wolfSSL 15:117db924cf7c 1518
wolfSSL 15:117db924cf7c 1519 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 1520 "1:\n\t"
wolfSSL 15:117db924cf7c 1521
/* Advance the K512 cursor and park it in the slot at 17*8(%rsp); it is
 * reloaded after the rounds, so rdx is evidently needed as scratch there. */
wolfSSL 15:117db924cf7c 1522 "addq $128, %%rdx\n\t"
wolfSSL 15:117db924cf7c 1523 "movq %%rdx, 17*8(%%rsp)\n\t"
wolfSSL 15:117db924cf7c 1524
wolfSSL 15:117db924cf7c 1525 MsgSched2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1526 MsgSched2(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1527 MsgSched2(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1528 MsgSched2(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1529 MsgSched2(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1530 MsgSched2(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1531 MsgSched2(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1532 MsgSched2(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1533
wolfSSL 15:117db924cf7c 1534 "movq 17*8(%%rsp), %%rdx\n\t"
wolfSSL 15:117db924cf7c 1535
/* Stage K + W for the next 16 rounds; none of these instructions touch
 * the flags, so the subl/jne pair below is unaffected. */
wolfSSL 15:117db924cf7c 1536 SET_W_X(rdx)
wolfSSL 15:117db924cf7c 1537
wolfSSL 15:117db924cf7c 1538 "subl $1, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1539 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 1540
wolfSSL 15:117db924cf7c 1541 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1542 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1543 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1544 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1545
wolfSSL 15:117db924cf7c 1546 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1547 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1548 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1549 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1550
/* r8..r15 += old state, so the registers hold the new state for the next block */
wolfSSL 15:117db924cf7c 1551 ADD_DIGEST()
wolfSSL 15:117db924cf7c 1552
/* rsp is restored before %[len] is touched; the existing note above says
 * len may be addressed relative to rsp. */
wolfSSL 15:117db924cf7c 1553 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1554 "leaq %[K512], %%rdx\n\t"
wolfSSL 15:117db924cf7c 1555 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 1556 "subl $128, %[len]\n\t"
wolfSSL 15:117db924cf7c 1557
/* movq-only, so the zero flag from subl survives until the jnz */
wolfSSL 15:117db924cf7c 1558 STORE_DIGEST()
wolfSSL 15:117db924cf7c 1559
wolfSSL 15:117db924cf7c 1560 "jnz 2b\n\t"
wolfSSL 15:117db924cf7c 1561
wolfSSL 15:117db924cf7c 1562 :
wolfSSL 15:117db924cf7c 1563 : [mask] "m" (mBYTE_FLIP_MASK),
wolfSSL 15:117db924cf7c 1564 [len] "m" (len),
wolfSSL 15:117db924cf7c 1565 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 1566 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 1567 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 1568 );
wolfSSL 15:117db924cf7c 1569
wolfSSL 15:117db924cf7c 1570 return 0;
wolfSSL 15:117db924cf7c 1571 }
wolfSSL 15:117db924cf7c 1572 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 15:117db924cf7c 1573
wolfSSL 15:117db924cf7c 1574 #if defined(HAVE_INTEL_AVX2) && defined(HAVE_INTEL_RORX)
/* Compress one 128-byte block using the RORX round macros.
 *
 * sha512: the eight 64-bit state words are read from offsets 0..56 and the
 *         block is read from offset 64 (shuffled with mBYTE_FLIP_MASK on
 *         load). The state words are updated in place.
 * Returns 0 unconditionally.
 *
 * Structure: 4 passes of 8 MsgSched_RORX calls (two rounds each), then
 * 8 RND_RORX_ALL_2 calls (two rounds each) with no further scheduling.
 *
 * The RORX rounds defer each round's Maj(a,b,c) addition to the start of
 * the next round. L3 is therefore zeroed before the first round, and the
 * Maj value left in L3 by the last round is added to r8 after the loop.
 *
 * The asm moves rsp itself to reserve 144 bytes and restores it before the
 * final state update. rax and rsi are written; rsi is named in the clobber
 * list, rax is presumably covered by WORK_REGS (defined outside this view). */
wolfSSL 15:117db924cf7c 1575 static int Transform_Sha512_AVX1_RORX(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 1576 {
wolfSSL 15:117db924cf7c 1577 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 1578
wolfSSL 15:117db924cf7c 1579 /* 16 Ws plus loop counter and K512. */
wolfSSL 15:117db924cf7c 1580 "subq $144, %%rsp\n\t"
/* rax = address of the message block inside *sha512 */
wolfSSL 15:117db924cf7c 1581 "leaq 64(%[sha512]), %%rax\n\t"
wolfSSL 15:117db924cf7c 1582
wolfSSL 15:117db924cf7c 1583 INIT_MASK(MASK)
wolfSSL 15:117db924cf7c 1584 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 1585
wolfSSL 15:117db924cf7c 1586 LOAD_W(MASK, rax)
wolfSSL 15:117db924cf7c 1587
/* Loop counter: 4 passes through the scheduled-round loop */
wolfSSL 15:117db924cf7c 1588 "movl $4, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1589 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 1590 /* L4 = b */
wolfSSL 15:117db924cf7c 1591 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1592 /* L3 = 0 (add to prev h) */
wolfSSL 15:117db924cf7c 1593 "xorq " L3 ", " L3 "\n\t"
wolfSSL 15:117db924cf7c 1594 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 1595 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1596
wolfSSL 15:117db924cf7c 1597 SET_W_X(rsi)
wolfSSL 15:117db924cf7c 1598
wolfSSL 15:117db924cf7c 1599 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 1600 "1:\n\t"
wolfSSL 15:117db924cf7c 1601
wolfSSL 15:117db924cf7c 1602 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 1603
wolfSSL 15:117db924cf7c 1604 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1605 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1606 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1607 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1608 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1609 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1610 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1611 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1612
/* Stage K + W for the next 16 rounds; none of these instructions touch
 * the flags, so the subl/jne pair below is unaffected. */
wolfSSL 15:117db924cf7c 1613 SET_W_X(rsi)
wolfSSL 15:117db924cf7c 1614
wolfSSL 15:117db924cf7c 1615 "subl $1, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1616 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 1617
wolfSSL 15:117db924cf7c 1618 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1619 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1620 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1621 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1622
wolfSSL 15:117db924cf7c 1623 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1624 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1625 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1626 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1627
wolfSSL 15:117db924cf7c 1628 /* Prev RND: h += Maj(a,b,c) */
wolfSSL 15:117db924cf7c 1629 "addq " L3 ", %%r8\n\t"
wolfSSL 15:117db924cf7c 1630 "addq $144, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1631
/* state[k] += working variable k, directly in memory */
wolfSSL 15:117db924cf7c 1632 STORE_ADD_DIGEST()
wolfSSL 15:117db924cf7c 1633
wolfSSL 15:117db924cf7c 1634 :
wolfSSL 15:117db924cf7c 1635 : [mask] "m" (mBYTE_FLIP_MASK),
wolfSSL 15:117db924cf7c 1636 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 1637 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 1638 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 1639 );
wolfSSL 15:117db924cf7c 1640
wolfSSL 15:117db924cf7c 1641 return 0;
wolfSSL 15:117db924cf7c 1642 }
wolfSSL 15:117db924cf7c 1643
/* Multi-block variant of Transform_Sha512_AVX1_RORX.
 *
 * sha512: state words at offsets 0..56 are loaded once, kept in r8..r15
 *         across blocks, and written back after every block. The input
 *         pointer is read from offset 224 of *sha512 (field not visible
 *         here - presumably a caller-set data pointer; confirm against the
 *         wc_Sha512 layout).
 * len:    number of bytes to process. The block loop tests its exit
 *         condition only after a block has been processed and leaves only
 *         when len has been reduced to exactly zero, so len must be a
 *         non-zero multiple of 128.
 * Returns 0 unconditionally.
 *
 * NOTE(review): %[len] is declared as an input-only "m" operand but is
 * modified by subl; GCC's extended-asm rules expect modified operands to
 * be outputs ("+m"). It works in practice here because len is not read
 * after the asm - confirm before reusing this pattern.
 * NOTE(review): rcx is written but not named in the clobber list;
 * presumably it is covered by WORK_REGS (defined outside this view). */
wolfSSL 15:117db924cf7c 1644 static int Transform_Sha512_AVX1_RORX_Len(wc_Sha512* sha512, word32 len)
wolfSSL 15:117db924cf7c 1645 {
wolfSSL 15:117db924cf7c 1646 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 1647
/* rsi = input pointer, rcx = start of the K512 table */
wolfSSL 15:117db924cf7c 1648 "movq 224(%[sha512]), %%rsi\n\t"
wolfSSL 15:117db924cf7c 1649 "leaq %[K512], %%rcx\n\t"
wolfSSL 15:117db924cf7c 1650
wolfSSL 15:117db924cf7c 1651 INIT_MASK(MASK)
wolfSSL 15:117db924cf7c 1652 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 1653
wolfSSL 15:117db924cf7c 1654 "# Start of processing a block\n"
wolfSSL 15:117db924cf7c 1655 "2:\n\t"
wolfSSL 15:117db924cf7c 1656
wolfSSL 15:117db924cf7c 1657 /* 16 Ws plus loop counter and K512. len goes into -4(%rsp).
wolfSSL 15:117db924cf7c 1658 * Debug needs more stack space. */
wolfSSL 15:117db924cf7c 1659 "subq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1660
wolfSSL 15:117db924cf7c 1661 LOAD_W(MASK, rsi)
wolfSSL 15:117db924cf7c 1662
/* Loop counter: 4 passes through the scheduled-round loop */
wolfSSL 15:117db924cf7c 1663 "movl $4, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1664 /* L4 = b */
wolfSSL 15:117db924cf7c 1665 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1666 /* L3 = 0 (add to prev h) */
wolfSSL 15:117db924cf7c 1667 "xorq " L3 ", " L3 "\n\t"
wolfSSL 15:117db924cf7c 1668 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 1669 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 1670
wolfSSL 15:117db924cf7c 1671 SET_W_X(rcx)
wolfSSL 15:117db924cf7c 1672
wolfSSL 15:117db924cf7c 1673 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 1674 "1:\n\t"
wolfSSL 15:117db924cf7c 1675
/* Advance the K512 cursor and park it in the slot at 17*8(%rsp); it is
 * reloaded after the rounds, so rcx is evidently needed as scratch there. */
wolfSSL 15:117db924cf7c 1676 "addq $128, %%rcx\n\t"
wolfSSL 15:117db924cf7c 1677 "movq %%rcx, 17*8(%%rsp)\n\t"
wolfSSL 15:117db924cf7c 1678
wolfSSL 15:117db924cf7c 1679 MsgSched_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1680 MsgSched_RORX(W_2,W_4,W_6,W_8,W_10,W_12,W_14,W_0,RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1681 MsgSched_RORX(W_4,W_6,W_8,W_10,W_12,W_14,W_0,W_2,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1682 MsgSched_RORX(W_6,W_8,W_10,W_12,W_14,W_0,W_2,W_4,RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1683 MsgSched_RORX(W_8,W_10,W_12,W_14,W_0,W_2,W_4,W_6,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1684 MsgSched_RORX(W_10,W_12,W_14,W_0,W_2,W_4,W_6,W_8,RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1685 MsgSched_RORX(W_12,W_14,W_0,W_2,W_4,W_6,W_8,W_10,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1686 MsgSched_RORX(W_14,W_0,W_2,W_4,W_6,W_8,W_10,W_12,RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1687
wolfSSL 15:117db924cf7c 1688 "movq 17*8(%%rsp), %%rcx\n\t"
wolfSSL 15:117db924cf7c 1689
wolfSSL 15:117db924cf7c 1690 SET_W_X(rcx)
wolfSSL 15:117db924cf7c 1691
wolfSSL 15:117db924cf7c 1692 "subl $1, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 1693 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 1694
/* NOTE(review): this repeats the SET_W_X(rcx) issued at the end of the
 * final loop pass; rcx and W_0..W_14 are not modified in between, so it
 * appears to rewrite the same values (the other three transforms have no
 * such extra call). Looks redundant but harmless - confirm before removing. */
wolfSSL 15:117db924cf7c 1695 SET_W_X(rcx)
wolfSSL 15:117db924cf7c 1696
wolfSSL 15:117db924cf7c 1697 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 1698 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 1699 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 1700 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 1701
wolfSSL 15:117db924cf7c 1702 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 1703 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 1704 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 1705 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 1706
wolfSSL 15:117db924cf7c 1707 /* Prev RND: h += Maj(a,b,c) */
wolfSSL 15:117db924cf7c 1708 "addq " L3 ", %%r8\n\t"
/* rsp is restored before %[len] is touched; the existing note above says
 * len may be addressed relative to rsp. */
wolfSSL 15:117db924cf7c 1709 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 1710
/* r8..r15 += old state, so the registers hold the new state for the next block */
wolfSSL 15:117db924cf7c 1711 ADD_DIGEST()
wolfSSL 15:117db924cf7c 1712
wolfSSL 15:117db924cf7c 1713 "leaq %[K512], %%rcx\n\t"
wolfSSL 15:117db924cf7c 1714 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 1715 "subl $128, %[len]\n\t"
wolfSSL 15:117db924cf7c 1716
/* movq-only, so the zero flag from subl survives until the jnz */
wolfSSL 15:117db924cf7c 1717 STORE_DIGEST()
wolfSSL 15:117db924cf7c 1718
wolfSSL 15:117db924cf7c 1719 "jnz 2b\n\t"
wolfSSL 15:117db924cf7c 1720
wolfSSL 15:117db924cf7c 1721 :
wolfSSL 15:117db924cf7c 1722 : [mask] "m" (mBYTE_FLIP_MASK),
wolfSSL 15:117db924cf7c 1723 [len] "m" (len),
wolfSSL 15:117db924cf7c 1724 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 1725 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 1726 : WORK_REGS, STATE_REGS, XMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 1727 );
wolfSSL 15:117db924cf7c 1728
wolfSSL 15:117db924cf7c 1729 return 0;
wolfSSL 15:117db924cf7c 1730 }
wolfSSL 15:117db924cf7c 1731 #endif /* HAVE_INTEL_AVX2 && HAVE_INTEL_RORX */
wolfSSL 15:117db924cf7c 1732
wolfSSL 15:117db924cf7c 1733 #if defined(HAVE_INTEL_AVX2)
/* 32-byte shuffle-index table: within each 8-byte group the indices run
 * 7..0 (low group) and 15..8 (high group), i.e. a byte reversal of every
 * 64-bit word; the 16-byte pattern is repeated for the second 128-bit half.
 * Presumably the vpshufb control for the 256-bit (AVX2) code, mirroring
 * mBYTE_FLIP_MASK - the use site is outside this view.
 * NOTE(review): the initializers need a 64-bit unsigned long (LP64);
 * confirm no supported build uses a 32-bit long here. */
wolfSSL 15:117db924cf7c 1734 static const unsigned long mBYTE_FLIP_MASK_Y[] =
wolfSSL 15:117db924cf7c 1735 { 0x0001020304050607, 0x08090a0b0c0d0e0f,
wolfSSL 15:117db924cf7c 1736 0x0001020304050607, 0x08090a0b0c0d0e0f };
wolfSSL 15:117db924cf7c 1737
/* Register name aliases for the AVX2 code. These expand to bare register
 * names that the asm helper macros stringize after a "%%" prefix. */

/* Message schedule registers: W_Y_n is a 256-bit register, presumably
 * holding four consecutive 64-bit schedule words starting at index n. */
wolfSSL 15:117db924cf7c 1738 #define W_Y_0 ymm0
wolfSSL 15:117db924cf7c 1739 #define W_Y_4 ymm1
wolfSSL 15:117db924cf7c 1740 #define W_Y_8 ymm2
wolfSSL 15:117db924cf7c 1741 #define W_Y_12 ymm3
wolfSSL 15:117db924cf7c 1742
/* Xn / Yn name the 128-bit and 256-bit views of vector register n. */
wolfSSL 15:117db924cf7c 1743 #define X0 xmm0
wolfSSL 15:117db924cf7c 1744 #define X1 xmm1
wolfSSL 15:117db924cf7c 1745 #define X2 xmm2
wolfSSL 15:117db924cf7c 1746 #define X3 xmm3
wolfSSL 15:117db924cf7c 1747 #define X4 xmm4
wolfSSL 15:117db924cf7c 1748 #define X5 xmm5
wolfSSL 15:117db924cf7c 1749 #define X6 xmm6
wolfSSL 15:117db924cf7c 1750 #define X7 xmm7
wolfSSL 15:117db924cf7c 1751 #define X8 xmm8
wolfSSL 15:117db924cf7c 1752 #define X9 xmm9
wolfSSL 15:117db924cf7c 1753 #define Y0 ymm0
wolfSSL 15:117db924cf7c 1754 #define Y1 ymm1
wolfSSL 15:117db924cf7c 1755 #define Y2 ymm2
wolfSSL 15:117db924cf7c 1756 #define Y3 ymm3
wolfSSL 15:117db924cf7c 1757 #define Y4 ymm4
wolfSSL 15:117db924cf7c 1758 #define Y5 ymm5
wolfSSL 15:117db924cf7c 1759 #define Y6 ymm6
wolfSSL 15:117db924cf7c 1760 #define Y7 ymm7
wolfSSL 15:117db924cf7c 1761
/* Scratch registers for the schedule computation, plus the shuffle mask. */
wolfSSL 15:117db924cf7c 1762 #define W_Y_M15 ymm12
wolfSSL 15:117db924cf7c 1763 #define W_Y_M7 ymm13
wolfSSL 15:117db924cf7c 1764 #define W_Y_M2 ymm14
wolfSSL 15:117db924cf7c 1765 #define MASK_Y ymm15
wolfSSL 15:117db924cf7c 1766
/* General-purpose vector temporaries. */
wolfSSL 15:117db924cf7c 1767 #define YTMP1 ymm8
wolfSSL 15:117db924cf7c 1768 #define YTMP2 ymm9
wolfSSL 15:117db924cf7c 1769 #define YTMP3 ymm10
wolfSSL 15:117db924cf7c 1770 #define YTMP4 ymm11
wolfSSL 15:117db924cf7c 1771
/* Comma-separated list of the sixteen 256-bit vector register names
 * ymm0..ymm15, intended for the clobber section of inline-asm statements
 * (the same role XMM_REGS plays in the AVX1 transforms). Every entry uses
 * the ymm spelling so the list matches the registers the AVX2 aliases
 * above refer to. */
#define YMM_REGS \
    "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", \
    "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
wolfSSL 15:117db924cf7c 1775
/* Emit "vperm2I128 $sel, %src2, %src1, %dest" (AT&T operand order).
 * VPERM2I128 forwards to _VPERM2I128 so that register aliases passed as
 * arguments are expanded before '#' stringizes them. */
wolfSSL 15:117db924cf7c 1776 #define _VPERM2I128(dest, src1, src2, sel) \
wolfSSL 15:117db924cf7c 1777 "vperm2I128 $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 1778 #define VPERM2I128(dest, src1, src2, sel) \
wolfSSL 15:117db924cf7c 1779 _VPERM2I128(dest, src1, src2, sel)
wolfSSL 15:117db924cf7c 1780
/* Emit "vpermq $sel, %src, %dest": permute the 64-bit elements of src into
 * dest as selected by the immediate sel.
 * VPERMQ forwards to _VPERMQ so that register aliases passed as arguments
 * are expanded before '#' stringizes them. */
wolfSSL 15:117db924cf7c 1781 #define _VPERMQ(dest, src, sel) \
wolfSSL 15:117db924cf7c 1782 "vpermq $" #sel ", %%" #src ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 1783 #define VPERMQ(dest, src, sel) \
wolfSSL 15:117db924cf7c 1784 _VPERMQ(dest, src, sel)
wolfSSL 15:117db924cf7c 1785
/* Emit "vpblendd $sel, %src2, %src1, %dest": blend the 32-bit elements of
 * src1 and src2 into dest under the immediate mask sel.
 * VPBLENDD forwards to _VPBLENDD so that register aliases passed as
 * arguments are expanded before '#' stringizes them. */
wolfSSL 15:117db924cf7c 1786 #define _VPBLENDD(dest, src1, src2, sel) \
wolfSSL 15:117db924cf7c 1787 "vpblendd $" #sel ", %%" #src2 ", %%" #src1 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 1788 #define VPBLENDD(dest, src1, src2, sel) \
wolfSSL 15:117db924cf7c 1789 _VPBLENDD(dest, src1, src2, sel)
wolfSSL 15:117db924cf7c 1790
/* Emit "vpaddq i*8(%addr), %src1, %dest": dest = src1 + the vector at byte
 * offset i*8 from the address in register addr (64-bit lane adds).
 * V_ADD_I forwards to _V_ADD_I so that aliases passed as arguments are
 * expanded before '#' stringizes them. */
wolfSSL 15:117db924cf7c 1791 #define _V_ADD_I(dest, src1, addr, i) \
wolfSSL 15:117db924cf7c 1792 "vpaddq "#i"*8(%%" #addr "), %%" #src1 ", %%" #dest "\n\t"
wolfSSL 15:117db924cf7c 1793 #define V_ADD_I(dest, src1, addr, i) \
wolfSSL 15:117db924cf7c 1794 _V_ADD_I(dest, src1, addr, i)
wolfSSL 15:117db924cf7c 1795
/* Emit "vmovdqu %src, i*8(%addr)": unaligned store of vector register src
 * to byte offset i*8 from the address in register addr.
 * VMOVDQU_I forwards to _VMOVDQU_I so that aliases passed as arguments are
 * expanded before '#' stringizes them. */
wolfSSL 15:117db924cf7c 1796 #define _VMOVDQU_I(addr, i, src) \
wolfSSL 15:117db924cf7c 1797 "vmovdqu %%" #src ", " #i "*8(%%" #addr ")\n\t"
wolfSSL 15:117db924cf7c 1798 #define VMOVDQU_I(addr, i, src) \
wolfSSL 15:117db924cf7c 1799 _VMOVDQU_I(addr, i, src)
wolfSSL 15:117db924cf7c 1800
/*
 * MsgSched4_AVX2: four rounds (indices i .. i+3) interleaved with the vector
 * computation of the next four message-schedule words.
 *
 * Parameters
 *   W_Y_0, W_Y_4, W_Y_8, W_Y_12
 *       YMM registers holding the previous sixteen words, oldest first:
 *       W[-16..-13], W[-12..-9], W[-8..-5], W[-4..-1]. W_Y_0 is overwritten
 *       with the four new words. These parameter names shadow the register
 *       alias macros of the same names inside the macro body.
 *   a..h
 *       State register names forwarded to the RND_* step macros; each
 *       successive round passes them rotated by one position.
 *   i
 *       Round index forwarded to the RND_* step macros.
 *
 * Vector work, as spelled out by the inline comments:
 *   1. W_Y_M15 / W_Y_M7 are assembled with VPBLENDD + VPERMQ so that they
 *      hold W[-15..-12] and W[-7..-4].
 *   2. s0(W[-15]) = (W >>> 1) ^ (W >>> 8) ^ (W >> 7), each rotate being built
 *      from a right shift, a left shift and an OR.
 *   3. W_Y_0 += W_Y_M7 + s0.
 *   4. s1(W[-2]) = (W >>> 19) ^ (W >>> 61) ^ (W >> 6) is applied in two
 *      passes. The first pass uses W[-2], W[-1] taken from W_Y_12 with the
 *      other two lanes zeroed. The second pass uses the just-computed
 *      W[0], W[1] moved to the upper lanes with the lower lanes zeroed,
 *      because the last two new words depend on the first two.
 *
 * NOTE(review): RND_0_n / RND_1_n, V_SHIFT_R, V_SHIFT_L, V_OR, V_XOR and
 * V_ADD are defined outside this view. RND_x_1..RND_x_12 look like the
 * twelve pieces of one scalar round, and RND_1_2_A + RND_1_2_B like a split
 * of RND_1_2 - confirm against their definitions. The relative order of the
 * scalar and vector lines is deliberate instruction scheduling; do not
 * reorder.
 */
wolfSSL 15:117db924cf7c 1801 #define MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1802 RND_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1803 /* W[-13]..W[-15], W[-12] */ \
wolfSSL 15:117db924cf7c 1804 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
wolfSSL 15:117db924cf7c 1805 /* W[-5]..W[-7], W[-4] */ \
wolfSSL 15:117db924cf7c 1806 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
wolfSSL 15:117db924cf7c 1807 RND_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1808 RND_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1809 /* W_Y_M15 = W[-12]..W[-15] */ \
wolfSSL 15:117db924cf7c 1810 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
wolfSSL 15:117db924cf7c 1811 RND_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1812 /* W_Y_M7 = W[-4]..W[-7] */ \
wolfSSL 15:117db924cf7c 1813 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
wolfSSL 15:117db924cf7c 1814 RND_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1815 RND_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1816 /* W[-15] >> 1 */ \
wolfSSL 15:117db924cf7c 1817 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
wolfSSL 15:117db924cf7c 1818 RND_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1819 /* W[-15] << 63 */ \
wolfSSL 15:117db924cf7c 1820 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
wolfSSL 15:117db924cf7c 1821 RND_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1822 /* W[-15] >> 8 */ \
wolfSSL 15:117db924cf7c 1823 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
wolfSSL 15:117db924cf7c 1824 RND_0_9(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1825 /* W[-15] << 56 */ \
wolfSSL 15:117db924cf7c 1826 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
wolfSSL 15:117db924cf7c 1827 RND_0_10(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1828 /* W[-15] >>> 1 */ \
wolfSSL 15:117db924cf7c 1829 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1830 RND_0_11(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1831 /* W[-15] >>> 8 */ \
wolfSSL 15:117db924cf7c 1832 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 1833 RND_0_12(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1834 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1835 /* W[-15] >> 7 */ \
wolfSSL 15:117db924cf7c 1836 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
wolfSSL 15:117db924cf7c 1837 RND_1_2_A(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1838 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
wolfSSL 15:117db924cf7c 1839 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 1840 RND_1_2_B(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1841 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
wolfSSL 15:117db924cf7c 1842 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 1843 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1844 /* W[0] = W[-16] + W[-7] */ \
wolfSSL 15:117db924cf7c 1845 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
wolfSSL 15:117db924cf7c 1846 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1847 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
wolfSSL 15:117db924cf7c 1848 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 1849 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1850 /* 0, 0, W[-1], W[-2] */ \
wolfSSL 15:117db924cf7c 1851 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
wolfSSL 15:117db924cf7c 1852 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1853 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1854 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1855 /* W[-2] >> 19 */ \
wolfSSL 15:117db924cf7c 1856 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
wolfSSL 15:117db924cf7c 1857 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1858 /* W[-2] << 45 */ \
wolfSSL 15:117db924cf7c 1859 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
wolfSSL 15:117db924cf7c 1860 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1861 /* W[-2] >> 61 */ \
wolfSSL 15:117db924cf7c 1862 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
wolfSSL 15:117db924cf7c 1863 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1864 /* W[-2] << 3 */ \
wolfSSL 15:117db924cf7c 1865 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
wolfSSL 15:117db924cf7c 1866 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1867 RND_0_1(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1868 /* W[-2] >>> 19 */ \
wolfSSL 15:117db924cf7c 1869 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1870 RND_0_2(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1871 /* W[-2] >>> 61 */ \
wolfSSL 15:117db924cf7c 1872 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 1873 RND_0_3(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1874 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
wolfSSL 15:117db924cf7c 1875 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 1876 RND_0_4(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1877 /* W[-2] >> 6 */ \
wolfSSL 15:117db924cf7c 1878 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
wolfSSL 15:117db924cf7c 1879 RND_0_5(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1880 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
wolfSSL 15:117db924cf7c 1881 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 1882 RND_0_6(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1883 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
wolfSSL 15:117db924cf7c 1884 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 1885 RND_0_7(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1886 RND_0_8(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1887 /* W[1], W[0], 0, 0 */ \
wolfSSL 15:117db924cf7c 1888 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
wolfSSL 15:117db924cf7c 1889 RND_0_9(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1890 RND_0_10(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1891 /* W[-2] >> 19 */ \
wolfSSL 15:117db924cf7c 1892 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
wolfSSL 15:117db924cf7c 1893 RND_0_11(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1894 /* W[-2] << 45 */ \
wolfSSL 15:117db924cf7c 1895 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
wolfSSL 15:117db924cf7c 1896 RND_0_12(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 1897 RND_1_1(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1898 /* W[-2] >> 61 */ \
wolfSSL 15:117db924cf7c 1899 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
wolfSSL 15:117db924cf7c 1900 RND_1_2(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1901 /* W[-2] << 3 */ \
wolfSSL 15:117db924cf7c 1902 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
wolfSSL 15:117db924cf7c 1903 RND_1_3(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1904 /* W[-2] >>> 19 */ \
wolfSSL 15:117db924cf7c 1905 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1906 RND_1_4(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1907 /* W[-2] >>> 61 */ \
wolfSSL 15:117db924cf7c 1908 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 1909 RND_1_5(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1910 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
wolfSSL 15:117db924cf7c 1911 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 1912 RND_1_6(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1913 /* W[-2] >> 6 */ \
wolfSSL 15:117db924cf7c 1914 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
wolfSSL 15:117db924cf7c 1915 RND_1_7(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1916 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
wolfSSL 15:117db924cf7c 1917 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 1918 RND_1_8(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1919 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
wolfSSL 15:117db924cf7c 1920 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 1921 RND_1_9(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1922 RND_1_10(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1923 RND_1_11(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1924 RND_1_12(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 1925
/*
 * MsgSched2_AVX2: two rounds (indices i and i+1) interleaved with the vector
 * computation of the next message-schedule words into W_0.
 *
 * Parameters
 *   W_0 .. W_14  eight vector registers, each covering two consecutive
 *                schedule positions (W_0 = W[-16],W[-15] ... W_14 =
 *                W[-2],W[-1]). Only W_0, W_2, W_8, W_10 and W_14 are read by
 *                the body; W_4, W_6 and W_12 are accepted but unused. W_0 is
 *                overwritten with the result.
 *   a..h         state register names forwarded to the RND_* step macros;
 *                the second round passes them rotated by one position.
 *   i            round index forwarded to the RND_* step macros.
 *
 * Vector work:
 *   W_Y_M15 = VPALIGNR(W_2, W_0, 8)   -> the W[-15] window
 *   W_Y_M7  = VPALIGNR(W_10, W_8, 8)  -> the W[-7] window
 *   s0 = (W_Y_M15 >>> 1) ^ (W_Y_M15 >>> 8) ^ (W_Y_M15 >> 7)
 *   s1 = (W_14 >>> 19) ^ (W_14 >>> 61) ^ (W_14 >> 6)
 *   W_0 = W_0 + W_Y_M7 + s0 + s1
 * Rotates are built from a right shift, a left shift and an OR.
 *
 * NOTE(review): VPALIGNR, the V_* helpers and the RND_* step macros are
 * defined outside this view. This two-words-per-register layout presumably
 * pairs with LOAD_BLOCK2_W_Y (one input block per 128-bit lane) - confirm
 * against the caller.
 */
wolfSSL 15:117db924cf7c 1926 #define MsgSched2_AVX2(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1927 RND_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1928 VPALIGNR(W_Y_M15, W_2, W_0, 8) \
wolfSSL 15:117db924cf7c 1929 VPALIGNR(W_Y_M7, W_10, W_8, 8) \
wolfSSL 15:117db924cf7c 1930 RND_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1931 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
wolfSSL 15:117db924cf7c 1932 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
wolfSSL 15:117db924cf7c 1933 RND_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1934 RND_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1935 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
wolfSSL 15:117db924cf7c 1936 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
wolfSSL 15:117db924cf7c 1937 RND_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1938 RND_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1939 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1940 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 1941 RND_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1942 RND_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1943 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
wolfSSL 15:117db924cf7c 1944 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 1945 RND_0_9(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1946 RND_0_10(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1947 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 1948 V_ADD(W_0, W_0, W_Y_M7) \
wolfSSL 15:117db924cf7c 1949 RND_0_11(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1950 RND_0_12(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1951 RND_1_1(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1952 V_ADD(W_0, W_0, YTMP1) \
wolfSSL 15:117db924cf7c 1953 RND_1_2(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1954 V_SHIFT_R(YTMP1, W_14, 19) \
wolfSSL 15:117db924cf7c 1955 V_SHIFT_L(YTMP2, W_14, 45) \
wolfSSL 15:117db924cf7c 1956 RND_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1957 RND_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1958 V_SHIFT_R(YTMP3, W_14, 61) \
wolfSSL 15:117db924cf7c 1959 V_SHIFT_L(YTMP4, W_14, 3) \
wolfSSL 15:117db924cf7c 1960 RND_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1961 RND_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1962 RND_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1963 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1964 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 1965 RND_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1966 RND_1_9(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1967 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 1968 V_SHIFT_R(YTMP4, W_14, 6) \
wolfSSL 15:117db924cf7c 1969 RND_1_10(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1970 RND_1_11(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1971 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 1972 RND_1_12(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 1973 V_ADD(W_0, W_0, YTMP1) \
wolfSSL 15:117db924cf7c 1974
/*
 * MsgSched4_AVX2_RORX_SET: same message-schedule computation as
 * MsgSched4_AVX2 (four new words written into W_Y_0), but interleaved with
 * the RND_RORX_* step macros, which come in eight pieces per round instead
 * of twelve. As a final "SET" step it adds the quadwords at i*8(%rsi) to the
 * new words and stores the sum at i*8(%rsp).
 *
 * Parameters
 *   W_Y_0, W_Y_4, W_Y_8, W_Y_12
 *       YMM registers holding W[-16..-13], W[-12..-9], W[-8..-5] and
 *       W[-4..-1]; W_Y_0 is overwritten with the four new words.
 *   a..h
 *       State register names forwarded to the RND_RORX_* step macros; each
 *       successive round passes them rotated by one position.
 *   i
 *       Round index forwarded to the step macros, also used as the quadword
 *       offset for the final add and store.
 *
 * The registers rsi and rsp are hard-coded in the last two vector lines.
 * NOTE(review): rsi presumably points at the round constants matching index
 * i and rsp at the W + K scratch area - confirm against the calling asm
 * block, which is outside this view. The RND_RORX_* macros and the V_*
 * helpers are also defined outside this view.
 */
wolfSSL 15:117db924cf7c 1975 #define MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1976 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1977 /* W[-13]..W[-15], W[-12] */ \
wolfSSL 15:117db924cf7c 1978 VPBLENDD(W_Y_M15, W_Y_0, W_Y_4, 0x03) \
wolfSSL 15:117db924cf7c 1979 /* W[-5]..W[-7], W[-4] */ \
wolfSSL 15:117db924cf7c 1980 VPBLENDD(W_Y_M7, W_Y_8, W_Y_12, 0x03) \
wolfSSL 15:117db924cf7c 1981 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1982 /* W_Y_M15 = W[-12]..W[-15] */ \
wolfSSL 15:117db924cf7c 1983 VPERMQ(W_Y_M15, W_Y_M15, 0x39) \
wolfSSL 15:117db924cf7c 1984 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1985 /* W_Y_M7 = W[-4]..W[-7] */ \
wolfSSL 15:117db924cf7c 1986 VPERMQ(W_Y_M7, W_Y_M7, 0x39) \
wolfSSL 15:117db924cf7c 1987 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1988 /* W[-15] >> 1 */ \
wolfSSL 15:117db924cf7c 1989 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
wolfSSL 15:117db924cf7c 1990 /* W[-15] << 63 */ \
wolfSSL 15:117db924cf7c 1991 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
wolfSSL 15:117db924cf7c 1992 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 1993 /* W[-15] >> 8 */ \
wolfSSL 15:117db924cf7c 1994 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
wolfSSL 15:117db924cf7c 1995 /* W[-15] << 56 */ \
wolfSSL 15:117db924cf7c 1996 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
wolfSSL 15:117db924cf7c 1997 /* W[-15] >>> 1 */ \
wolfSSL 15:117db924cf7c 1998 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 1999 /* W[-15] >>> 8 */ \
wolfSSL 15:117db924cf7c 2000 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 2001 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2002 /* W[-15] >> 7 */ \
wolfSSL 15:117db924cf7c 2003 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
wolfSSL 15:117db924cf7c 2004 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2005 /* 0, 0, W[-1], W[-2] */ \
wolfSSL 15:117db924cf7c 2006 VPERM2I128(W_Y_M2, W_Y_12, W_Y_12, 0x81) \
wolfSSL 15:117db924cf7c 2007 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2008 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2009 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) */ \
wolfSSL 15:117db924cf7c 2010 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 2011 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2012 /* (W[-15] >>> 1) ^ (W[-15] >>> 8) ^ (W[-15] >> 7) */ \
wolfSSL 15:117db924cf7c 2013 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 2014 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2015 /* W[0] = W[-16] + W[-7] */ \
wolfSSL 15:117db924cf7c 2016 V_ADD(W_Y_0, W_Y_0, W_Y_M7) \
wolfSSL 15:117db924cf7c 2017 /* W[0] = W[-16] + W[-7] + s0(W[-15]) */ \
wolfSSL 15:117db924cf7c 2018 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 2019 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2020 /* W[-2] >> 19 */ \
wolfSSL 15:117db924cf7c 2021 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
wolfSSL 15:117db924cf7c 2022 /* W[-2] << 45 */ \
wolfSSL 15:117db924cf7c 2023 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
wolfSSL 15:117db924cf7c 2024 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2025 /* W[-2] >> 61 */ \
wolfSSL 15:117db924cf7c 2026 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
wolfSSL 15:117db924cf7c 2027 /* W[-2] << 3 */ \
wolfSSL 15:117db924cf7c 2028 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
wolfSSL 15:117db924cf7c 2029 /* W[-2] >>> 19 */ \
wolfSSL 15:117db924cf7c 2030 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 2031 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2032 /* W[-2] >>> 61 */ \
wolfSSL 15:117db924cf7c 2033 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 2034 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2035 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
wolfSSL 15:117db924cf7c 2036 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 2037 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2038 /* W[-2] >> 6 */ \
wolfSSL 15:117db924cf7c 2039 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
wolfSSL 15:117db924cf7c 2040 RND_RORX_0_1(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2041 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
wolfSSL 15:117db924cf7c 2042 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 2043 RND_RORX_0_2(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2044 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
wolfSSL 15:117db924cf7c 2045 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 2046 RND_RORX_0_3(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2047 /* W[1], W[0], 0, 0 */ \
wolfSSL 15:117db924cf7c 2048 VPERM2I128(W_Y_M2, W_Y_0, W_Y_0, 0x08) \
wolfSSL 15:117db924cf7c 2049 RND_RORX_0_4(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2050 RND_RORX_0_5(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2051 /* W[-2] >> 19 */ \
wolfSSL 15:117db924cf7c 2052 V_SHIFT_R(YTMP1, W_Y_M2, 19) \
wolfSSL 15:117db924cf7c 2053 /* W[-2] << 45 */ \
wolfSSL 15:117db924cf7c 2054 V_SHIFT_L(YTMP2, W_Y_M2, 45) \
wolfSSL 15:117db924cf7c 2055 RND_RORX_0_6(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2056 /* W[-2] >> 61 */ \
wolfSSL 15:117db924cf7c 2057 V_SHIFT_R(YTMP3, W_Y_M2, 61) \
wolfSSL 15:117db924cf7c 2058 /* W[-2] << 3 */ \
wolfSSL 15:117db924cf7c 2059 V_SHIFT_L(YTMP4, W_Y_M2, 3) \
wolfSSL 15:117db924cf7c 2060 /* W[-2] >>> 19 */ \
wolfSSL 15:117db924cf7c 2061 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 2062 RND_RORX_0_7(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2063 /* W[-2] >>> 61 */ \
wolfSSL 15:117db924cf7c 2064 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 2065 RND_RORX_0_8(g,h,a,b,c,d,e,f,i+2) \
wolfSSL 15:117db924cf7c 2066 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) */ \
wolfSSL 15:117db924cf7c 2067 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 2068 RND_RORX_1_1(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2069 /* W[-2] >> 6 */ \
wolfSSL 15:117db924cf7c 2070 V_SHIFT_R(YTMP4, W_Y_M2, 6) \
wolfSSL 15:117db924cf7c 2071 RND_RORX_1_2(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2072 RND_RORX_1_3(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2073 /* (W[-2] >>> 19) ^ (W[-2] >>> 61) ^ (W[-2] >> 6) */ \
wolfSSL 15:117db924cf7c 2074 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 2075 RND_RORX_1_4(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2076 RND_RORX_1_5(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2077 /* W[0] = W[-16] + W[-7] + s0(W[-15]) + s1(W[-2]) */ \
wolfSSL 15:117db924cf7c 2078 V_ADD(W_Y_0, W_Y_0, YTMP1) \
wolfSSL 15:117db924cf7c 2079 RND_RORX_1_6(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2080 V_ADD_I(YTMP1, W_Y_0, rsi, i) \
wolfSSL 15:117db924cf7c 2081 RND_RORX_1_7(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2082 RND_RORX_1_8(f,g,h,a,b,c,d,e,i+3) \
wolfSSL 15:117db924cf7c 2083 VMOVDQU_I(rsp, i, YTMP1) \
wolfSSL 15:117db924cf7c 2084
/*
 * MsgSched2_AVX2_RORX: same vector computation as MsgSched2_AVX2 (new
 * schedule words written into W_0), but interleaved with the RND_RORX_* step
 * macros, which come in eight pieces per round instead of twelve.
 *
 * Parameters
 *   W_0 .. W_14  eight vector registers covering W[-16] .. W[-1], two
 *                schedule positions each. Only W_0, W_2, W_8, W_10 and W_14
 *                are read by the body; W_4, W_6 and W_12 are accepted but
 *                unused. W_0 is overwritten with the result.
 *   a..h         state register names forwarded to the step macros; the
 *                second round passes them rotated by one position.
 *   i            round index forwarded to the step macros (rounds i, i+1).
 *
 * Result: W_0 = W_0 + W_Y_M7 + s0(W_Y_M15) + s1(W_14), where W_Y_M15 and
 * W_Y_M7 are formed with VPALIGNR from (W_2, W_0) and (W_10, W_8).
 *
 * NOTE(review): VPALIGNR, the V_* helpers and RND_RORX_* are defined outside
 * this view.
 */
wolfSSL 15:117db924cf7c 2085 #define MsgSched2_AVX2_RORX(W_0,W_2,W_4,W_6,W_8,W_10,W_12,W_14,a,b,c,d,e, \
wolfSSL 15:117db924cf7c 2086 f,g,h,i) \
wolfSSL 15:117db924cf7c 2087 RND_RORX_0_1(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2088 VPALIGNR(W_Y_M15, W_2, W_0, 8) \
wolfSSL 15:117db924cf7c 2089 VPALIGNR(W_Y_M7, W_10, W_8, 8) \
wolfSSL 15:117db924cf7c 2090 RND_RORX_0_2(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2091 V_SHIFT_R(YTMP1, W_Y_M15, 1) \
wolfSSL 15:117db924cf7c 2092 V_SHIFT_L(YTMP2, W_Y_M15, 63) \
wolfSSL 15:117db924cf7c 2093 RND_RORX_0_3(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2094 V_SHIFT_R(YTMP3, W_Y_M15, 8) \
wolfSSL 15:117db924cf7c 2095 V_SHIFT_L(YTMP4, W_Y_M15, 56) \
wolfSSL 15:117db924cf7c 2096 RND_RORX_0_4(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2097 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 2098 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 2099 RND_RORX_0_5(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2100 V_SHIFT_R(YTMP4, W_Y_M15, 7) \
wolfSSL 15:117db924cf7c 2101 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 2102 RND_RORX_0_6(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2103 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 2104 V_ADD(W_0, W_0, W_Y_M7) \
wolfSSL 15:117db924cf7c 2105 RND_RORX_0_7(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2106 RND_RORX_0_8(a,b,c,d,e,f,g,h,i) \
wolfSSL 15:117db924cf7c 2107 V_ADD(W_0, W_0, YTMP1) \
wolfSSL 15:117db924cf7c 2108 RND_RORX_1_1(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2109 V_SHIFT_R(YTMP1, W_14, 19) \
wolfSSL 15:117db924cf7c 2110 V_SHIFT_L(YTMP2, W_14, 45) \
wolfSSL 15:117db924cf7c 2111 RND_RORX_1_2(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2112 V_SHIFT_R(YTMP3, W_14, 61) \
wolfSSL 15:117db924cf7c 2113 V_SHIFT_L(YTMP4, W_14, 3) \
wolfSSL 15:117db924cf7c 2114 RND_RORX_1_3(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2115 V_OR(YTMP1, YTMP2, YTMP1) \
wolfSSL 15:117db924cf7c 2116 V_OR(YTMP3, YTMP4, YTMP3) \
wolfSSL 15:117db924cf7c 2117 RND_RORX_1_4(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2118 RND_RORX_1_5(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2119 V_XOR(YTMP1, YTMP3, YTMP1) \
wolfSSL 15:117db924cf7c 2120 V_SHIFT_R(YTMP4, W_14, 6) \
wolfSSL 15:117db924cf7c 2121 RND_RORX_1_6(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2122 RND_RORX_1_7(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2123 V_XOR(YTMP1, YTMP4, YTMP1) \
wolfSSL 15:117db924cf7c 2124 RND_RORX_1_8(h,a,b,c,d,e,f,g,i+1) \
wolfSSL 15:117db924cf7c 2125 V_ADD(W_0, W_0, YTMP1) \
wolfSSL 15:117db924cf7c 2126
wolfSSL 15:117db924cf7c 2127
/*
 * INIT_MASK_Y(mask): emits
 *     vmovdqu %[mask], %reg
 * where reg is the macro argument and %[mask] is the inline-assembly operand
 * named "mask". The text %[mask] sits inside a string literal, so the macro
 * parameter is not substituted there; only the destination register comes
 * from the argument. The enclosing asm statement must therefore provide an
 * operand called [mask].
 *
 * NOTE(review): Transform_Sha512_AVX2 in this file invokes INIT_MASK(MASK_Y)
 * rather than this macro; INIT_MASK is defined outside this view.
 */
wolfSSL 15:117db924cf7c 2128 #define _INIT_MASK_Y(mask) \
wolfSSL 15:117db924cf7c 2129 "vmovdqu %[mask], %%"#mask"\n\t"
wolfSSL 15:117db924cf7c 2130 #define INIT_MASK_Y(mask) \
wolfSSL 15:117db924cf7c 2131 _INIT_MASK_Y(mask)
wolfSSL 15:117db924cf7c 2132
/*
 * LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i): loads 64 bytes starting at
 * byte offset i from the address in register reg into two YMM registers
 * (offsets i+0 and i+32, unaligned loads), then shuffles the bytes of each
 * register with vpshufb using the register given as mask.
 *
 * The parameters called ymm0/ymm1/ymm2 are macro parameters, not fixed
 * register names; callers pass register aliases such as W_Y_0. The outer
 * macro lets those aliases expand before the inner macro stringifies them.
 */
wolfSSL 15:117db924cf7c 2133 /* Load into YMM registers and swap endian. */
wolfSSL 15:117db924cf7c 2134 #define _LOAD_BLOCK_W_Y_2(mask, ymm0, ymm1, reg, i) \
wolfSSL 15:117db924cf7c 2135 /* buffer[0..15] => ymm0..ymm3; */ \
wolfSSL 15:117db924cf7c 2136 "vmovdqu " #i "+ 0(%%" #reg "), %%" #ymm0 "\n\t" \
wolfSSL 15:117db924cf7c 2137 "vmovdqu " #i "+32(%%" #reg "), %%" #ymm1 "\n\t" \
wolfSSL 15:117db924cf7c 2138 "vpshufb %%" #mask ", %%" #ymm0 ", %%" #ymm0 "\n\t" \
wolfSSL 15:117db924cf7c 2139 "vpshufb %%" #mask ", %%" #ymm1 ", %%" #ymm1 "\n\t"
wolfSSL 15:117db924cf7c 2140
wolfSSL 15:117db924cf7c 2141 #define LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i) \
wolfSSL 15:117db924cf7c 2142 _LOAD_BLOCK_W_Y_2(mask, ymm1, ymm2, reg, i)
wolfSSL 15:117db924cf7c 2143
/*
 * LOAD_BLOCK_W_Y(mask, reg): loads the 128 bytes at the address in register
 * reg into W_Y_0, W_Y_4, W_Y_8 and W_Y_12 (32 bytes each, in that order) and
 * byte-shuffles each register with mask.
 */
wolfSSL 15:117db924cf7c 2144 #define LOAD_BLOCK_W_Y(mask, reg) \
wolfSSL 15:117db924cf7c 2145 LOAD_BLOCK_W_Y_2(mask, W_Y_0, W_Y_4 , reg, 0) \
wolfSSL 15:117db924cf7c 2146 LOAD_BLOCK_W_Y_2(mask, W_Y_8, W_Y_12, reg, 64)
wolfSSL 15:117db924cf7c 2147
/*
 * SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i): adds 64 bytes of quadwords read
 * from byte offset i at the address in register reg to the two source
 * registers, and stores the sums at the same byte offset inside WX:
 *     ymm2 = ymm0 + mem[reg + i +  0]   -> stored at i +  0 (WX)
 *     ymm3 = ymm1 + mem[reg + i + 32]   -> stored at i + 32 (WX)
 * ymm0/ymm1 are left unchanged; ymm2/ymm3 are overwritten as temporaries.
 * All four names are macro parameters, not fixed registers.
 *
 * NOTE(review): WX is a string macro defined outside this view; it
 * presumably names the base register of the W + K scratch area - confirm.
 */
wolfSSL 15:117db924cf7c 2148 #define _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
wolfSSL 15:117db924cf7c 2149 "vpaddq " #i "+ 0(%%" #reg "), %%" #ymm0 ", %%" #ymm2 "\n\t" \
wolfSSL 15:117db924cf7c 2150 "vpaddq " #i "+32(%%" #reg "), %%" #ymm1 ", %%" #ymm3 "\n\t" \
wolfSSL 15:117db924cf7c 2151 "vmovdqu %%" #ymm2 ", " #i "+ 0(" WX ")\n\t" \
wolfSSL 15:117db924cf7c 2152 "vmovdqu %%" #ymm3 ", " #i "+32(" WX ")\n\t"
wolfSSL 15:117db924cf7c 2153
wolfSSL 15:117db924cf7c 2154 #define SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i) \
wolfSSL 15:117db924cf7c 2155 _SET_W_Y_2(ymm0, ymm1, ymm2, ymm3, reg, i)
wolfSSL 15:117db924cf7c 2156
/*
 * SET_BLOCK_W_Y(reg): for the sixteen schedule words held in W_Y_0, W_Y_4,
 * W_Y_8 and W_Y_12, adds the sixteen quadwords at the address in register
 * reg and stores the 128 bytes of sums at WX. YTMP1 and YTMP2 are
 * overwritten.
 */
wolfSSL 15:117db924cf7c 2157 #define SET_BLOCK_W_Y(reg) \
wolfSSL 15:117db924cf7c 2158 SET_W_Y_2(W_Y_0, W_Y_4 , YTMP1, YTMP2, reg, 0) \
wolfSSL 15:117db924cf7c 2159 SET_W_Y_2(W_Y_8, W_Y_12, YTMP1, YTMP2, reg, 64)
wolfSSL 15:117db924cf7c 2160
/*
 * LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i): builds two YMM
 * registers from data that lies 128 bytes apart.
 *   - 16 bytes at reg+i and reg+i+16 are loaded into X0 and X1.
 *   - 16 bytes at reg+i+128 and reg+i+144 are loaded into X8 and X9.
 *   - vinserti128 $1 places X8 / X9 into the upper 128 bits of Y0 / Y1.
 *   - vpshufb with mask then shuffles the bytes of Y0 and Y1.
 * X8 and X9 are temporaries and are overwritten.
 *
 * NOTE(review): this only yields the intended result if X0/X1 are the low
 * 128-bit halves of Y0/Y1; those aliases are defined outside this view. The
 * 128-byte distance presumably corresponds to the next input block, so each
 * 128-bit lane carries one block - confirm against the caller.
 */
wolfSSL 15:117db924cf7c 2161 /* Load into YMM registers and swap endian. */
wolfSSL 15:117db924cf7c 2162 #define _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
wolfSSL 15:117db924cf7c 2163 "vmovdqu " #i "+ 0(%%" #reg "), %%" #X0 "\n\t" \
wolfSSL 15:117db924cf7c 2164 "vmovdqu " #i "+ 16(%%" #reg "), %%" #X1 "\n\t" \
wolfSSL 15:117db924cf7c 2165 "vmovdqu " #i "+128(%%" #reg "), %%" #X8 "\n\t" \
wolfSSL 15:117db924cf7c 2166 "vmovdqu " #i "+144(%%" #reg "), %%" #X9 "\n\t" \
wolfSSL 15:117db924cf7c 2167 "vinserti128 $1, %%" #X8 ", %%" #Y0 ", %%" #Y0 "\n\t" \
wolfSSL 15:117db924cf7c 2168 "vinserti128 $1, %%" #X9 ", %%" #Y1 ", %%" #Y1 "\n\t" \
wolfSSL 15:117db924cf7c 2169 "vpshufb %%" #mask ", %%" #Y0 ", %%" #Y0 "\n\t" \
wolfSSL 15:117db924cf7c 2170 "vpshufb %%" #mask ", %%" #Y1 ", %%" #Y1 "\n\t"
wolfSSL 15:117db924cf7c 2171
wolfSSL 15:117db924cf7c 2172 #define LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i) \
wolfSSL 15:117db924cf7c 2173 _LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, i)
wolfSSL 15:117db924cf7c 2174
/*
 * LOAD_BLOCK2_W_Y(mask, reg): fills Y0..Y7 by applying LOAD_BLOCK2_W_Y_2 at
 * byte offsets 0, 32, 64 and 96 from the address in register reg. Each Yn
 * receives 16 bytes from the first 128 bytes in its low half and the
 * matching 16 bytes from 128 bytes further on in its high half, then is
 * byte-shuffled with mask. X8 and X9 are reused as temporaries by every
 * step.
 *
 * NOTE(review): Y0..Y7 and X0..X9 are register aliases defined outside this
 * view.
 */
wolfSSL 15:117db924cf7c 2175 #define LOAD_BLOCK2_W_Y(mask, reg) \
wolfSSL 15:117db924cf7c 2176 LOAD_BLOCK2_W_Y_2(mask, Y0, Y1, X0, X1, X8, X9, reg, 0) \
wolfSSL 15:117db924cf7c 2177 LOAD_BLOCK2_W_Y_2(mask, Y2, Y3, X2, X3, X8, X9, reg, 32) \
wolfSSL 15:117db924cf7c 2178 LOAD_BLOCK2_W_Y_2(mask, Y4, Y5, X4, X5, X8, X9, reg, 64) \
wolfSSL 15:117db924cf7c 2179 LOAD_BLOCK2_W_Y_2(mask, Y6, Y7, X6, X7, X8, X9, reg, 96) \
wolfSSL 15:117db924cf7c 2180
/*
 * SET_BLOCK2_W_Y(reg): for Y0..Y7, adds the 256 bytes of quadwords at the
 * address in register reg (byte offsets 0, 64, 128, 192; two registers per
 * step) and stores the sums at the same offsets inside WX. YTMP1 and YTMP2
 * are overwritten.
 */
wolfSSL 15:117db924cf7c 2181 #define SET_BLOCK2_W_Y(reg) \
wolfSSL 15:117db924cf7c 2182 SET_W_Y_2(Y0, Y1, YTMP1, YTMP2, reg, 0) \
wolfSSL 15:117db924cf7c 2183 SET_W_Y_2(Y2, Y3, YTMP1, YTMP2, reg, 64) \
wolfSSL 15:117db924cf7c 2184 SET_W_Y_2(Y4, Y5, YTMP1, YTMP2, reg, 128) \
wolfSSL 15:117db924cf7c 2185 SET_W_Y_2(Y6, Y7, YTMP1, YTMP2, reg, 192)
wolfSSL 15:117db924cf7c 2186
/*
 * K512_AVX2: the 80 SHA-512 round constants laid out for 256-bit loads.
 * Every consecutive pair of constants is stored twice in a row (160 entries
 * in total), so a 32-byte load yields the same two constants in both
 * 128-bit halves of a YMM register.
 *
 * K512_AVX2_END points at entry 128, i.e. just past the duplicated copies of
 * the first 64 constants.
 *
 * NOTE(review): neither symbol is referenced by Transform_Sha512_AVX2, which
 * uses K512 instead. K512_AVX2_END is presumably the loop bound for the
 * rounds that still compute new schedule words in the two-block code path -
 * confirm against its users, which are outside this view.
 */
wolfSSL 15:117db924cf7c 2187 static const word64 K512_AVX2[160] = {
wolfSSL 15:117db924cf7c 2188 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
wolfSSL 15:117db924cf7c 2189 W64LIT(0x428a2f98d728ae22), W64LIT(0x7137449123ef65cd),
wolfSSL 15:117db924cf7c 2190 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
wolfSSL 15:117db924cf7c 2191 W64LIT(0xb5c0fbcfec4d3b2f), W64LIT(0xe9b5dba58189dbbc),
wolfSSL 15:117db924cf7c 2192 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
wolfSSL 15:117db924cf7c 2193 W64LIT(0x3956c25bf348b538), W64LIT(0x59f111f1b605d019),
wolfSSL 15:117db924cf7c 2194 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
wolfSSL 15:117db924cf7c 2195 W64LIT(0x923f82a4af194f9b), W64LIT(0xab1c5ed5da6d8118),
wolfSSL 15:117db924cf7c 2196 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
wolfSSL 15:117db924cf7c 2197 W64LIT(0xd807aa98a3030242), W64LIT(0x12835b0145706fbe),
wolfSSL 15:117db924cf7c 2198 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
wolfSSL 15:117db924cf7c 2199 W64LIT(0x243185be4ee4b28c), W64LIT(0x550c7dc3d5ffb4e2),
wolfSSL 15:117db924cf7c 2200 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
wolfSSL 15:117db924cf7c 2201 W64LIT(0x72be5d74f27b896f), W64LIT(0x80deb1fe3b1696b1),
wolfSSL 15:117db924cf7c 2202 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
wolfSSL 15:117db924cf7c 2203 W64LIT(0x9bdc06a725c71235), W64LIT(0xc19bf174cf692694),
wolfSSL 15:117db924cf7c 2204 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
wolfSSL 15:117db924cf7c 2205 W64LIT(0xe49b69c19ef14ad2), W64LIT(0xefbe4786384f25e3),
wolfSSL 15:117db924cf7c 2206 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
wolfSSL 15:117db924cf7c 2207 W64LIT(0x0fc19dc68b8cd5b5), W64LIT(0x240ca1cc77ac9c65),
wolfSSL 15:117db924cf7c 2208 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
wolfSSL 15:117db924cf7c 2209 W64LIT(0x2de92c6f592b0275), W64LIT(0x4a7484aa6ea6e483),
wolfSSL 15:117db924cf7c 2210 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
wolfSSL 15:117db924cf7c 2211 W64LIT(0x5cb0a9dcbd41fbd4), W64LIT(0x76f988da831153b5),
wolfSSL 15:117db924cf7c 2212 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
wolfSSL 15:117db924cf7c 2213 W64LIT(0x983e5152ee66dfab), W64LIT(0xa831c66d2db43210),
wolfSSL 15:117db924cf7c 2214 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
wolfSSL 15:117db924cf7c 2215 W64LIT(0xb00327c898fb213f), W64LIT(0xbf597fc7beef0ee4),
wolfSSL 15:117db924cf7c 2216 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
wolfSSL 15:117db924cf7c 2217 W64LIT(0xc6e00bf33da88fc2), W64LIT(0xd5a79147930aa725),
wolfSSL 15:117db924cf7c 2218 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
wolfSSL 15:117db924cf7c 2219 W64LIT(0x06ca6351e003826f), W64LIT(0x142929670a0e6e70),
wolfSSL 15:117db924cf7c 2220 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
wolfSSL 15:117db924cf7c 2221 W64LIT(0x27b70a8546d22ffc), W64LIT(0x2e1b21385c26c926),
wolfSSL 15:117db924cf7c 2222 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
wolfSSL 15:117db924cf7c 2223 W64LIT(0x4d2c6dfc5ac42aed), W64LIT(0x53380d139d95b3df),
wolfSSL 15:117db924cf7c 2224 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
wolfSSL 15:117db924cf7c 2225 W64LIT(0x650a73548baf63de), W64LIT(0x766a0abb3c77b2a8),
wolfSSL 15:117db924cf7c 2226 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
wolfSSL 15:117db924cf7c 2227 W64LIT(0x81c2c92e47edaee6), W64LIT(0x92722c851482353b),
wolfSSL 15:117db924cf7c 2228 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
wolfSSL 15:117db924cf7c 2229 W64LIT(0xa2bfe8a14cf10364), W64LIT(0xa81a664bbc423001),
wolfSSL 15:117db924cf7c 2230 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
wolfSSL 15:117db924cf7c 2231 W64LIT(0xc24b8b70d0f89791), W64LIT(0xc76c51a30654be30),
wolfSSL 15:117db924cf7c 2232 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
wolfSSL 15:117db924cf7c 2233 W64LIT(0xd192e819d6ef5218), W64LIT(0xd69906245565a910),
wolfSSL 15:117db924cf7c 2234 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
wolfSSL 15:117db924cf7c 2235 W64LIT(0xf40e35855771202a), W64LIT(0x106aa07032bbd1b8),
wolfSSL 15:117db924cf7c 2236 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
wolfSSL 15:117db924cf7c 2237 W64LIT(0x19a4c116b8d2d0c8), W64LIT(0x1e376c085141ab53),
wolfSSL 15:117db924cf7c 2238 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
wolfSSL 15:117db924cf7c 2239 W64LIT(0x2748774cdf8eeb99), W64LIT(0x34b0bcb5e19b48a8),
wolfSSL 15:117db924cf7c 2240 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
wolfSSL 15:117db924cf7c 2241 W64LIT(0x391c0cb3c5c95a63), W64LIT(0x4ed8aa4ae3418acb),
wolfSSL 15:117db924cf7c 2242 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
wolfSSL 15:117db924cf7c 2243 W64LIT(0x5b9cca4f7763e373), W64LIT(0x682e6ff3d6b2b8a3),
wolfSSL 15:117db924cf7c 2244 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
wolfSSL 15:117db924cf7c 2245 W64LIT(0x748f82ee5defb2fc), W64LIT(0x78a5636f43172f60),
wolfSSL 15:117db924cf7c 2246 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
wolfSSL 15:117db924cf7c 2247 W64LIT(0x84c87814a1f0ab72), W64LIT(0x8cc702081a6439ec),
wolfSSL 15:117db924cf7c 2248 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
wolfSSL 15:117db924cf7c 2249 W64LIT(0x90befffa23631e28), W64LIT(0xa4506cebde82bde9),
wolfSSL 15:117db924cf7c 2250 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
wolfSSL 15:117db924cf7c 2251 W64LIT(0xbef9a3f7b2c67915), W64LIT(0xc67178f2e372532b),
wolfSSL 15:117db924cf7c 2252 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
wolfSSL 15:117db924cf7c 2253 W64LIT(0xca273eceea26619c), W64LIT(0xd186b8c721c0c207),
wolfSSL 15:117db924cf7c 2254 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
wolfSSL 15:117db924cf7c 2255 W64LIT(0xeada7dd6cde0eb1e), W64LIT(0xf57d4f7fee6ed178),
wolfSSL 15:117db924cf7c 2256 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
wolfSSL 15:117db924cf7c 2257 W64LIT(0x06f067aa72176fba), W64LIT(0x0a637dc5a2c898a6),
wolfSSL 15:117db924cf7c 2258 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
wolfSSL 15:117db924cf7c 2259 W64LIT(0x113f9804bef90dae), W64LIT(0x1b710b35131c471b),
wolfSSL 15:117db924cf7c 2260 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
wolfSSL 15:117db924cf7c 2261 W64LIT(0x28db77f523047d84), W64LIT(0x32caab7b40c72493),
wolfSSL 15:117db924cf7c 2262 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
wolfSSL 15:117db924cf7c 2263 W64LIT(0x3c9ebe0a15c9bebc), W64LIT(0x431d67c49c100d4c),
wolfSSL 15:117db924cf7c 2264 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
wolfSSL 15:117db924cf7c 2265 W64LIT(0x4cc5d4becb3e42b6), W64LIT(0x597f299cfc657e2a),
wolfSSL 15:117db924cf7c 2266 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817),
wolfSSL 15:117db924cf7c 2267 W64LIT(0x5fcb6fab3ad6faec), W64LIT(0x6c44198c4a475817)
wolfSSL 15:117db924cf7c 2268 };
wolfSSL 15:117db924cf7c 2269 static const word64* K512_AVX2_END = &K512_AVX2[128];
wolfSSL 15:117db924cf7c 2270
/*
 * Transform_Sha512_AVX2: runs the compression function over one input block
 * with a single inline-assembly statement that uses AVX2 for the message
 * schedule.
 *
 * sha512  hash context; the block is read from byte offset 64 of the
 *         structure, and the digest is loaded and updated through
 *         LOAD_DIGEST / STORE_ADD_DIGEST.
 * returns 0 in all cases.
 *
 * Outline of the asm body:
 *   - reserve 136 bytes below rsp (16 quadwords of W + K plus a counter);
 *   - load the shuffle mask and the digest, then load and byte-shuffle the
 *     128-byte block into W_Y_0 / W_Y_4 / W_Y_8 / W_Y_12;
 *   - point rsi at K512 and store W + K for the first 16 rounds;
 *   - loop four times: advance rsi by 128 bytes (16 constants), run
 *     4 x MsgSched4_AVX2 (16 rounds plus the next 16 schedule words), then
 *     store the new W + K; this covers 64 rounds;
 *   - run the closing RND_ALL_2 sequence with round indices 0..14 without
 *     further scheduling;
 *   - add the working state into the digest and release the stack space.
 *
 * NOTE(review): the asm changes rsp itself, so the memory operands [mask]
 * and [K512] must not be addressed relative to rsp; nothing in this view
 * guarantees that. Offset 64 presumably is the message buffer that follows
 * a 64-byte digest in wc_Sha512 - confirm against the structure definition.
 * INIT_MASK, LOAD_DIGEST, STORE_ADD_DIGEST, RND_ALL_2, WX, L1, L4, RA..RH,
 * WORK_REGS and STATE_REGS are defined outside this view; RND_ALL_2
 * presumably performs two rounds per invocation.
 */
wolfSSL 15:117db924cf7c 2271 static int Transform_Sha512_AVX2(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 2272 {
wolfSSL 15:117db924cf7c 2273 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 2274
wolfSSL 15:117db924cf7c 2275 /* 16 Ws plus loop counter and K512. */
wolfSSL 15:117db924cf7c 2276 "subq $136, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2277 "leaq 64(%[sha512]), %%rax\n\t"
wolfSSL 15:117db924cf7c 2278
wolfSSL 15:117db924cf7c 2279 INIT_MASK(MASK_Y)
wolfSSL 15:117db924cf7c 2280 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 2281
wolfSSL 15:117db924cf7c 2282 LOAD_BLOCK_W_Y(MASK_Y, rax)
wolfSSL 15:117db924cf7c 2283
/* Loop counter lives just past the 16 stored quadwords: 4 passes of 16 rounds. */
wolfSSL 15:117db924cf7c 2284 "movl $4, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 2285 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2286 /* b */
wolfSSL 15:117db924cf7c 2287 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2288 /* e */
wolfSSL 15:117db924cf7c 2289 "movq %%r12, " L1 "\n\t"
wolfSSL 15:117db924cf7c 2290 /* b ^ c */
wolfSSL 15:117db924cf7c 2291 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2292
/* Store W + K for rounds 0..15 before entering the loop. */
wolfSSL 15:117db924cf7c 2293 SET_BLOCK_W_Y(rsi)
wolfSSL 15:117db924cf7c 2294
wolfSSL 15:117db924cf7c 2295 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 2296 "1:\n\t"
wolfSSL 15:117db924cf7c 2297
/* Advance to the next 16 round constants (16 * 8 bytes). */
wolfSSL 15:117db924cf7c 2298 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2299
wolfSSL 15:117db924cf7c 2300 MsgSched4_AVX2(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2301 MsgSched4_AVX2(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 2302 MsgSched4_AVX2(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 2303 MsgSched4_AVX2(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 2304
/* Store W + K for the 16 rounds that follow. */
wolfSSL 15:117db924cf7c 2305 SET_BLOCK_W_Y(rsi)
wolfSSL 15:117db924cf7c 2306
wolfSSL 15:117db924cf7c 2307 "subl $1, 16*8(" WX ")\n\t"
wolfSSL 15:117db924cf7c 2308 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 2309
/* Closing rounds: no further message scheduling is interleaved here. */
wolfSSL 15:117db924cf7c 2310 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2311 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 2)
wolfSSL 15:117db924cf7c 2312 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 2313 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB, 6)
wolfSSL 15:117db924cf7c 2314
wolfSSL 15:117db924cf7c 2315 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 2316 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,10)
wolfSSL 15:117db924cf7c 2317 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 2318 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 2319
wolfSSL 15:117db924cf7c 2320 STORE_ADD_DIGEST()
wolfSSL 15:117db924cf7c 2321
/* Release the 136 bytes reserved at the top of the asm block. */
wolfSSL 15:117db924cf7c 2322 "addq $136, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2323
wolfSSL 15:117db924cf7c 2324 :
wolfSSL 15:117db924cf7c 2325 : [mask] "m" (mBYTE_FLIP_MASK_Y),
wolfSSL 15:117db924cf7c 2326 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 2327 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 2328 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 2329 );
wolfSSL 15:117db924cf7c 2330
wolfSSL 15:117db924cf7c 2331 return 0;
wolfSSL 15:117db924cf7c 2332 }
wolfSSL 15:117db924cf7c 2333
wolfSSL 15:117db924cf7c 2334 static int Transform_Sha512_AVX2_Len(wc_Sha512* sha512, word32 len)
wolfSSL 15:117db924cf7c 2335 {
wolfSSL 15:117db924cf7c 2336 if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
wolfSSL 15:117db924cf7c 2337 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 2338 Transform_Sha512_AVX2(sha512);
wolfSSL 15:117db924cf7c 2339 sha512->data += WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 2340 len -= WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 2341 if (len == 0)
wolfSSL 15:117db924cf7c 2342 return 0;
wolfSSL 15:117db924cf7c 2343 }
wolfSSL 15:117db924cf7c 2344
wolfSSL 15:117db924cf7c 2345 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 2346
wolfSSL 15:117db924cf7c 2347 "movq 224(%[sha512]), %%rcx\n\t"
wolfSSL 15:117db924cf7c 2348
wolfSSL 15:117db924cf7c 2349 INIT_MASK(MASK_Y)
wolfSSL 15:117db924cf7c 2350 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 2351
wolfSSL 15:117db924cf7c 2352 "# Start of processing two blocks\n"
wolfSSL 15:117db924cf7c 2353 "2:\n\t"
wolfSSL 15:117db924cf7c 2354
wolfSSL 15:117db924cf7c 2355 "subq $1344, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2356 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2357
wolfSSL 15:117db924cf7c 2358 /* L4 = b */
wolfSSL 15:117db924cf7c 2359 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2360 /* e */
wolfSSL 15:117db924cf7c 2361 "movq %%r12, " L1 "\n\t"
wolfSSL 15:117db924cf7c 2362
wolfSSL 15:117db924cf7c 2363 LOAD_BLOCK2_W_Y(MASK_Y, rcx)
wolfSSL 15:117db924cf7c 2364
wolfSSL 15:117db924cf7c 2365 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 2366 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2367 "\n"
wolfSSL 15:117db924cf7c 2368 "1:\n\t"
wolfSSL 15:117db924cf7c 2369 SET_BLOCK2_W_Y(rsi)
wolfSSL 15:117db924cf7c 2370 MsgSched2_AVX2(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2371 MsgSched2_AVX2(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
wolfSSL 15:117db924cf7c 2372 MsgSched2_AVX2(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
wolfSSL 15:117db924cf7c 2373 MsgSched2_AVX2(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
wolfSSL 15:117db924cf7c 2374 MsgSched2_AVX2(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
wolfSSL 15:117db924cf7c 2375 MsgSched2_AVX2(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
wolfSSL 15:117db924cf7c 2376 MsgSched2_AVX2(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
wolfSSL 15:117db924cf7c 2377 MsgSched2_AVX2(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
wolfSSL 15:117db924cf7c 2378 "addq $256, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2379 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2380 "cmpq %[K512_END], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2381 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 2382
wolfSSL 15:117db924cf7c 2383 SET_BLOCK2_W_Y(rsi)
wolfSSL 15:117db924cf7c 2384 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2385 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
wolfSSL 15:117db924cf7c 2386 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
wolfSSL 15:117db924cf7c 2387 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)
wolfSSL 15:117db924cf7c 2388
wolfSSL 15:117db924cf7c 2389 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
wolfSSL 15:117db924cf7c 2390 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
wolfSSL 15:117db924cf7c 2391 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
wolfSSL 15:117db924cf7c 2392 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
wolfSSL 15:117db924cf7c 2393 "subq $1024, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2394
wolfSSL 15:117db924cf7c 2395 ADD_DIGEST()
wolfSSL 15:117db924cf7c 2396 STORE_DIGEST()
wolfSSL 15:117db924cf7c 2397
wolfSSL 15:117db924cf7c 2398 /* L4 = b */
wolfSSL 15:117db924cf7c 2399 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2400 /* e */
wolfSSL 15:117db924cf7c 2401 "movq %%r12, " L1 "\n\t"
wolfSSL 15:117db924cf7c 2402 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 2403 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2404
wolfSSL 15:117db924cf7c 2405 "movq $5, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2406 "\n"
wolfSSL 15:117db924cf7c 2407 "3:\n\t"
wolfSSL 15:117db924cf7c 2408 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
wolfSSL 15:117db924cf7c 2409 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
wolfSSL 15:117db924cf7c 2410 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
wolfSSL 15:117db924cf7c 2411 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 2412
wolfSSL 15:117db924cf7c 2413 RND_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
wolfSSL 15:117db924cf7c 2414 RND_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
wolfSSL 15:117db924cf7c 2415 RND_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
wolfSSL 15:117db924cf7c 2416 RND_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
wolfSSL 15:117db924cf7c 2417 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2418 "subq $1, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2419 "jnz 3b\n\t"
wolfSSL 15:117db924cf7c 2420
wolfSSL 15:117db924cf7c 2421 ADD_DIGEST()
wolfSSL 15:117db924cf7c 2422
wolfSSL 15:117db924cf7c 2423 "movq 224(%[sha512]), %%rcx\n\t"
wolfSSL 15:117db924cf7c 2424 "addq $64, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2425 "addq $256, %%rcx\n\t"
wolfSSL 15:117db924cf7c 2426 "subl $256, %[len]\n\t"
wolfSSL 15:117db924cf7c 2427 "movq %%rcx, 224(%[sha512])\n\t"
wolfSSL 15:117db924cf7c 2428
wolfSSL 15:117db924cf7c 2429 STORE_DIGEST()
wolfSSL 15:117db924cf7c 2430
wolfSSL 15:117db924cf7c 2431 "jnz 2b\n\t"
wolfSSL 15:117db924cf7c 2432
wolfSSL 15:117db924cf7c 2433 :
wolfSSL 15:117db924cf7c 2434 : [mask] "m" (mBYTE_FLIP_MASK_Y),
wolfSSL 15:117db924cf7c 2435 [len] "m" (len),
wolfSSL 15:117db924cf7c 2436 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 2437 [K512] "m" (K512_AVX2),
wolfSSL 15:117db924cf7c 2438 [K512_END] "m" (K512_AVX2_END)
wolfSSL 15:117db924cf7c 2439 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 2440 );
wolfSSL 15:117db924cf7c 2441
wolfSSL 15:117db924cf7c 2442 return 0;
wolfSSL 15:117db924cf7c 2443 }
wolfSSL 15:117db924cf7c 2444
wolfSSL 15:117db924cf7c 2445 #ifdef HAVE_INTEL_RORX
/* Run the SHA-512 compression function for one block with AVX2 inline
 * assembly, using the RORX round macros (GCC extended asm, x86-64).
 *
 * sha512  Hash context. The assembly reads the message words starting 64
 *         bytes into the structure and accesses the digest through the
 *         LOAD_DIGEST()/STORE_ADD_DIGEST() macros.
 *         NOTE(review): offset 64 is presumably sha512->buffer (the
 *         multi-block caller copies the block there first) - confirm against
 *         the wc_Sha512 layout.
 * returns 0 always.
 *
 * The asm statement moves %rsp itself: 136 bytes are reserved on entry and
 * released again before the digest is stored. The helper macros used here
 * are defined elsewhere in the file.
 */
wolfSSL 15:117db924cf7c 2446 static int Transform_Sha512_AVX2_RORX(wc_Sha512* sha512)
wolfSSL 15:117db924cf7c 2447 {
wolfSSL 15:117db924cf7c 2448 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 2449
wolfSSL 15:117db924cf7c 2450 /* Reserve 136 bytes of stack: 16 Ws (16*8 bytes) plus the loop counter kept at offset 16*8. */
wolfSSL 15:117db924cf7c 2451 "subq $136, %%rsp\n\t"
/* L2 = address of the message block inside the context.
 * NOTE(review): the load below names rcx, so L2 is presumably %rcx. */
wolfSSL 15:117db924cf7c 2452 "leaq 64(%[sha512]), " L2 "\n\t"
wolfSSL 15:117db924cf7c 2453
wolfSSL 15:117db924cf7c 2454 INIT_MASK(MASK_Y)
wolfSSL 15:117db924cf7c 2455 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 2456
wolfSSL 15:117db924cf7c 2457 LOAD_BLOCK_W_Y(MASK_Y, rcx)
wolfSSL 15:117db924cf7c 2458
/* Loop counter = 4 passes of the message-schedule loop below */
wolfSSL 15:117db924cf7c 2459 "movl $4, 16*8(" WX ")\n\t"
/* rsi walks the K512 round-constant table */
wolfSSL 15:117db924cf7c 2460 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2461 /* L4 = b */
wolfSSL 15:117db924cf7c 2462 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2463 /* L3 = 0 (add to prev h) */
wolfSSL 15:117db924cf7c 2464 "xorq " L3 ", " L3 "\n\t"
wolfSSL 15:117db924cf7c 2465 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 2466 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2467
wolfSSL 15:117db924cf7c 2468 SET_BLOCK_W_Y(rsi)
wolfSSL 15:117db924cf7c 2469
wolfSSL 15:117db924cf7c 2470 "# Start of 16 rounds\n"
wolfSSL 15:117db924cf7c 2471 "1:\n\t"
wolfSSL 15:117db924cf7c 2472
/* Advance to the next 16 round constants (16 * 8 = 128 bytes) */
wolfSSL 15:117db924cf7c 2473 "addq $128, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2474
wolfSSL 15:117db924cf7c 2475 MsgSched4_AVX2_RORX_SET(W_Y_0,W_Y_4,W_Y_8,W_Y_12,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2476 MsgSched4_AVX2_RORX_SET(W_Y_4,W_Y_8,W_Y_12,W_Y_0,RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 2477 MsgSched4_AVX2_RORX_SET(W_Y_8,W_Y_12,W_Y_0,W_Y_4,RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 2478 MsgSched4_AVX2_RORX_SET(W_Y_12,W_Y_0,W_Y_4,W_Y_8,RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 2479
/* Decrement the in-memory loop counter; repeat until it reaches zero */
wolfSSL 15:117db924cf7c 2480 "subl $1, 16*8(%%rsp)\n\t"
wolfSSL 15:117db924cf7c 2481 "jnz 1b\n\t"
wolfSSL 15:117db924cf7c 2482
/* Remaining rounds, run without further message scheduling.
 * NOTE(review): presumably the last 16 of the 80 rounds - confirm against
 * the RND_RORX_ALL_4 macro. */
wolfSSL 15:117db924cf7c 2483 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2484 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD, 4)
wolfSSL 15:117db924cf7c 2485 RND_RORX_ALL_4(RA,RB,RC,RD,RE,RF,RG,RH, 8)
wolfSSL 15:117db924cf7c 2486 RND_RORX_ALL_4(RE,RF,RG,RH,RA,RB,RC,RD,12)
wolfSSL 15:117db924cf7c 2487 /* Prev RND: h += Maj(a,b,c) - the value still pending in L3 is added into r8 */
wolfSSL 15:117db924cf7c 2488 "addq " L3 ", %%r8\n\t"
/* Release the 136 bytes reserved at the top of the statement */
wolfSSL 15:117db924cf7c 2489 "addq $136, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2490
wolfSSL 15:117db924cf7c 2491 STORE_ADD_DIGEST()
wolfSSL 15:117db924cf7c 2492
/* No outputs; every register the code touches is listed as a clobber */
wolfSSL 15:117db924cf7c 2493 :
wolfSSL 15:117db924cf7c 2494 : [mask] "m" (mBYTE_FLIP_MASK_Y),
wolfSSL 15:117db924cf7c 2495 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 2496 [K512] "m" (K512)
wolfSSL 15:117db924cf7c 2497 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 2498 );
wolfSSL 15:117db924cf7c 2499
wolfSSL 15:117db924cf7c 2500 return 0;
wolfSSL 15:117db924cf7c 2501 }
wolfSSL 15:117db924cf7c 2502
wolfSSL 15:117db924cf7c 2503 static int Transform_Sha512_AVX2_RORX_Len(wc_Sha512* sha512, word32 len)
wolfSSL 15:117db924cf7c 2504 {
wolfSSL 15:117db924cf7c 2505 if ((len & WC_SHA512_BLOCK_SIZE) != 0) {
wolfSSL 15:117db924cf7c 2506 XMEMCPY(sha512->buffer, sha512->data, WC_SHA512_BLOCK_SIZE);
wolfSSL 15:117db924cf7c 2507 Transform_Sha512_AVX2_RORX(sha512);
wolfSSL 15:117db924cf7c 2508 sha512->data += WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 2509 len -= WC_SHA512_BLOCK_SIZE;
wolfSSL 15:117db924cf7c 2510 if (len == 0)
wolfSSL 15:117db924cf7c 2511 return 0;
wolfSSL 15:117db924cf7c 2512 }
wolfSSL 15:117db924cf7c 2513
wolfSSL 15:117db924cf7c 2514 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 2515
wolfSSL 15:117db924cf7c 2516 "movq 224(%[sha512]), %%rax\n\t"
wolfSSL 15:117db924cf7c 2517
wolfSSL 15:117db924cf7c 2518 INIT_MASK(MASK_Y)
wolfSSL 15:117db924cf7c 2519 LOAD_DIGEST()
wolfSSL 15:117db924cf7c 2520
wolfSSL 15:117db924cf7c 2521 "# Start of processing two blocks\n"
wolfSSL 15:117db924cf7c 2522 "2:\n\t"
wolfSSL 15:117db924cf7c 2523
wolfSSL 15:117db924cf7c 2524 "subq $1344, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2525 "leaq %[K512], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2526
wolfSSL 15:117db924cf7c 2527 /* L4 = b */
wolfSSL 15:117db924cf7c 2528 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2529 /* L3 = 0 (add to prev h) */
wolfSSL 15:117db924cf7c 2530 "xorq " L3 ", " L3 "\n\t"
wolfSSL 15:117db924cf7c 2531
wolfSSL 15:117db924cf7c 2532 LOAD_BLOCK2_W_Y(MASK_Y, rax)
wolfSSL 15:117db924cf7c 2533
wolfSSL 15:117db924cf7c 2534 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 2535 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2536 "\n"
wolfSSL 15:117db924cf7c 2537 "1:\n\t"
wolfSSL 15:117db924cf7c 2538 SET_BLOCK2_W_Y(rsi)
wolfSSL 15:117db924cf7c 2539 MsgSched2_AVX2_RORX(Y0,Y1,Y2,Y3,Y4,Y5,Y6,Y7,RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2540 MsgSched2_AVX2_RORX(Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y0,RG,RH,RA,RB,RC,RD,RE,RF, 4)
wolfSSL 15:117db924cf7c 2541 MsgSched2_AVX2_RORX(Y2,Y3,Y4,Y5,Y6,Y7,Y0,Y1,RE,RF,RG,RH,RA,RB,RC,RD, 8)
wolfSSL 15:117db924cf7c 2542 MsgSched2_AVX2_RORX(Y3,Y4,Y5,Y6,Y7,Y0,Y1,Y2,RC,RD,RE,RF,RG,RH,RA,RB,12)
wolfSSL 15:117db924cf7c 2543 MsgSched2_AVX2_RORX(Y4,Y5,Y6,Y7,Y0,Y1,Y2,Y3,RA,RB,RC,RD,RE,RF,RG,RH,16)
wolfSSL 15:117db924cf7c 2544 MsgSched2_AVX2_RORX(Y5,Y6,Y7,Y0,Y1,Y2,Y3,Y4,RG,RH,RA,RB,RC,RD,RE,RF,20)
wolfSSL 15:117db924cf7c 2545 MsgSched2_AVX2_RORX(Y6,Y7,Y0,Y1,Y2,Y3,Y4,Y5,RE,RF,RG,RH,RA,RB,RC,RD,24)
wolfSSL 15:117db924cf7c 2546 MsgSched2_AVX2_RORX(Y7,Y0,Y1,Y2,Y3,Y4,Y5,Y6,RC,RD,RE,RF,RG,RH,RA,RB,28)
wolfSSL 15:117db924cf7c 2547 "addq $256, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2548 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2549 "cmpq %[K512_END], %%rsi\n\t"
wolfSSL 15:117db924cf7c 2550 "jne 1b\n\t"
wolfSSL 15:117db924cf7c 2551
wolfSSL 15:117db924cf7c 2552 SET_BLOCK2_W_Y(rsi)
wolfSSL 15:117db924cf7c 2553 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 0)
wolfSSL 15:117db924cf7c 2554 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 4)
wolfSSL 15:117db924cf7c 2555 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD, 8)
wolfSSL 15:117db924cf7c 2556 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,12)
wolfSSL 15:117db924cf7c 2557
wolfSSL 15:117db924cf7c 2558 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,16)
wolfSSL 15:117db924cf7c 2559 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,20)
wolfSSL 15:117db924cf7c 2560 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,24)
wolfSSL 15:117db924cf7c 2561 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,28)
wolfSSL 15:117db924cf7c 2562 "addq " L3 ", %%r8\n\t"
wolfSSL 15:117db924cf7c 2563 "subq $1024, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2564
wolfSSL 15:117db924cf7c 2565 ADD_DIGEST()
wolfSSL 15:117db924cf7c 2566 STORE_DIGEST()
wolfSSL 15:117db924cf7c 2567
wolfSSL 15:117db924cf7c 2568 /* L4 = b */
wolfSSL 15:117db924cf7c 2569 "movq %%r9, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2570 /* L3 = 0 (add to prev h) */
wolfSSL 15:117db924cf7c 2571 "xorq " L3 ", " L3 "\n\t"
wolfSSL 15:117db924cf7c 2572 /* L4 = b ^ c */
wolfSSL 15:117db924cf7c 2573 "xorq %%r10, " L4 "\n\t"
wolfSSL 15:117db924cf7c 2574
wolfSSL 15:117db924cf7c 2575 "movq $5, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2576 "\n"
wolfSSL 15:117db924cf7c 2577 "3:\n\t"
wolfSSL 15:117db924cf7c 2578 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH, 2)
wolfSSL 15:117db924cf7c 2579 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF, 6)
wolfSSL 15:117db924cf7c 2580 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,10)
wolfSSL 15:117db924cf7c 2581 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,14)
wolfSSL 15:117db924cf7c 2582
wolfSSL 15:117db924cf7c 2583 RND_RORX_ALL_2(RA,RB,RC,RD,RE,RF,RG,RH,18)
wolfSSL 15:117db924cf7c 2584 RND_RORX_ALL_2(RG,RH,RA,RB,RC,RD,RE,RF,22)
wolfSSL 15:117db924cf7c 2585 RND_RORX_ALL_2(RE,RF,RG,RH,RA,RB,RC,RD,26)
wolfSSL 15:117db924cf7c 2586 RND_RORX_ALL_2(RC,RD,RE,RF,RG,RH,RA,RB,30)
wolfSSL 15:117db924cf7c 2587 "addq $256, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2588 "subq $1, %%rsi\n\t"
wolfSSL 15:117db924cf7c 2589 "jnz 3b\n\t"
wolfSSL 15:117db924cf7c 2590
wolfSSL 15:117db924cf7c 2591 "addq " L3 ", %%r8\n\t"
wolfSSL 15:117db924cf7c 2592
wolfSSL 15:117db924cf7c 2593 ADD_DIGEST()
wolfSSL 15:117db924cf7c 2594
wolfSSL 15:117db924cf7c 2595 "movq 224(%[sha512]), %%rax\n\t"
wolfSSL 15:117db924cf7c 2596 "addq $64, %%rsp\n\t"
wolfSSL 15:117db924cf7c 2597 "addq $256, %%rax\n\t"
wolfSSL 15:117db924cf7c 2598 "subl $256, %[len]\n\t"
wolfSSL 15:117db924cf7c 2599 "movq %%rax, 224(%[sha512])\n\t"
wolfSSL 15:117db924cf7c 2600
wolfSSL 15:117db924cf7c 2601 STORE_DIGEST()
wolfSSL 15:117db924cf7c 2602
wolfSSL 15:117db924cf7c 2603 "jnz 2b\n\t"
wolfSSL 15:117db924cf7c 2604
wolfSSL 15:117db924cf7c 2605 :
wolfSSL 15:117db924cf7c 2606 : [mask] "m" (mBYTE_FLIP_MASK_Y),
wolfSSL 15:117db924cf7c 2607 [len] "m" (len),
wolfSSL 15:117db924cf7c 2608 [sha512] "r" (sha512),
wolfSSL 15:117db924cf7c 2609 [K512] "m" (K512_AVX2),
wolfSSL 15:117db924cf7c 2610 [K512_END] "m" (K512_AVX2_END)
wolfSSL 15:117db924cf7c 2611 : WORK_REGS, STATE_REGS, YMM_REGS, "memory", "rsi"
wolfSSL 15:117db924cf7c 2612 );
wolfSSL 15:117db924cf7c 2613
wolfSSL 15:117db924cf7c 2614 return 0;
wolfSSL 15:117db924cf7c 2615 }
wolfSSL 15:117db924cf7c 2616 #endif /* HAVE_INTEL_RORX */
wolfSSL 15:117db924cf7c 2617 #endif /* HAVE_INTEL_AVX2 */
wolfSSL 15:117db924cf7c 2618
wolfSSL 15:117db924cf7c 2619 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 2620
wolfSSL 15:117db924cf7c 2621
wolfSSL 15:117db924cf7c 2622 /* -------------------------------------------------------------------------- */
wolfSSL 15:117db924cf7c 2623 /* SHA384 */
wolfSSL 15:117db924cf7c 2624 /* -------------------------------------------------------------------------- */
wolfSSL 15:117db924cf7c 2625 #ifdef WOLFSSL_SHA384
wolfSSL 15:117db924cf7c 2626
wolfSSL 15:117db924cf7c 2627 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_HASH)
wolfSSL 15:117db924cf7c 2628 /* functions defined in wolfcrypt/src/port/caam/caam_sha.c */
wolfSSL 15:117db924cf7c 2629 #else
wolfSSL 15:117db924cf7c 2630
wolfSSL 15:117db924cf7c 2631 static int InitSha384(wc_Sha384* sha384)
wolfSSL 15:117db924cf7c 2632 {
wolfSSL 15:117db924cf7c 2633 if (sha384 == NULL) {
wolfSSL 15:117db924cf7c 2634 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2635 }
wolfSSL 15:117db924cf7c 2636
wolfSSL 15:117db924cf7c 2637 sha384->digest[0] = W64LIT(0xcbbb9d5dc1059ed8);
wolfSSL 15:117db924cf7c 2638 sha384->digest[1] = W64LIT(0x629a292a367cd507);
wolfSSL 15:117db924cf7c 2639 sha384->digest[2] = W64LIT(0x9159015a3070dd17);
wolfSSL 15:117db924cf7c 2640 sha384->digest[3] = W64LIT(0x152fecd8f70e5939);
wolfSSL 15:117db924cf7c 2641 sha384->digest[4] = W64LIT(0x67332667ffc00b31);
wolfSSL 15:117db924cf7c 2642 sha384->digest[5] = W64LIT(0x8eb44a8768581511);
wolfSSL 15:117db924cf7c 2643 sha384->digest[6] = W64LIT(0xdb0c2e0d64f98fa7);
wolfSSL 15:117db924cf7c 2644 sha384->digest[7] = W64LIT(0x47b5481dbefa4fa4);
wolfSSL 15:117db924cf7c 2645
wolfSSL 15:117db924cf7c 2646 sha384->buffLen = 0;
wolfSSL 15:117db924cf7c 2647 sha384->loLen = 0;
wolfSSL 15:117db924cf7c 2648 sha384->hiLen = 0;
wolfSSL 15:117db924cf7c 2649
wolfSSL 15:117db924cf7c 2650 return 0;
wolfSSL 15:117db924cf7c 2651 }
wolfSSL 15:117db924cf7c 2652
wolfSSL 15:117db924cf7c 2653 int wc_Sha384Update(wc_Sha384* sha384, const byte* data, word32 len)
wolfSSL 15:117db924cf7c 2654 {
wolfSSL 15:117db924cf7c 2655 if (sha384 == NULL || (data == NULL && len > 0)) {
wolfSSL 15:117db924cf7c 2656 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2657 }
wolfSSL 15:117db924cf7c 2658
wolfSSL 15:117db924cf7c 2659 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
wolfSSL 15:117db924cf7c 2660 if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
wolfSSL 15:117db924cf7c 2661 #if defined(HAVE_INTEL_QA)
wolfSSL 15:117db924cf7c 2662 return IntelQaSymSha384(&sha384->asyncDev, NULL, data, len);
wolfSSL 15:117db924cf7c 2663 #endif
wolfSSL 15:117db924cf7c 2664 }
wolfSSL 15:117db924cf7c 2665 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 2666
wolfSSL 15:117db924cf7c 2667 return Sha512Update((wc_Sha512*)sha384, data, len);
wolfSSL 15:117db924cf7c 2668 }
wolfSSL 15:117db924cf7c 2669
wolfSSL 15:117db924cf7c 2670
wolfSSL 15:117db924cf7c 2671 int wc_Sha384FinalRaw(wc_Sha384* sha384, byte* hash)
wolfSSL 15:117db924cf7c 2672 {
wolfSSL 15:117db924cf7c 2673 #ifdef LITTLE_ENDIAN_ORDER
wolfSSL 15:117db924cf7c 2674 word64 digest[WC_SHA384_DIGEST_SIZE / sizeof(word64)];
wolfSSL 15:117db924cf7c 2675 #endif
wolfSSL 15:117db924cf7c 2676
wolfSSL 15:117db924cf7c 2677 if (sha384 == NULL || hash == NULL) {
wolfSSL 15:117db924cf7c 2678 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2679 }
wolfSSL 15:117db924cf7c 2680
wolfSSL 15:117db924cf7c 2681 #ifdef LITTLE_ENDIAN_ORDER
wolfSSL 15:117db924cf7c 2682 ByteReverseWords64((word64*)digest, (word64*)sha384->digest,
wolfSSL 15:117db924cf7c 2683 WC_SHA384_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 2684 XMEMCPY(hash, digest, WC_SHA384_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 2685 #else
wolfSSL 15:117db924cf7c 2686 XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 2687 #endif
wolfSSL 15:117db924cf7c 2688
wolfSSL 15:117db924cf7c 2689 return 0;
wolfSSL 15:117db924cf7c 2690 }
wolfSSL 15:117db924cf7c 2691
wolfSSL 15:117db924cf7c 2692 int wc_Sha384Final(wc_Sha384* sha384, byte* hash)
wolfSSL 15:117db924cf7c 2693 {
wolfSSL 15:117db924cf7c 2694 int ret;
wolfSSL 15:117db924cf7c 2695
wolfSSL 15:117db924cf7c 2696 if (sha384 == NULL || hash == NULL) {
wolfSSL 15:117db924cf7c 2697 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2698 }
wolfSSL 15:117db924cf7c 2699
wolfSSL 15:117db924cf7c 2700 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
wolfSSL 15:117db924cf7c 2701 if (sha384->asyncDev.marker == WOLFSSL_ASYNC_MARKER_SHA384) {
wolfSSL 15:117db924cf7c 2702 #if defined(HAVE_INTEL_QA)
wolfSSL 15:117db924cf7c 2703 return IntelQaSymSha384(&sha384->asyncDev, hash, NULL,
wolfSSL 15:117db924cf7c 2704 WC_SHA384_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 2705 #endif
wolfSSL 15:117db924cf7c 2706 }
wolfSSL 15:117db924cf7c 2707 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 2708
wolfSSL 15:117db924cf7c 2709 ret = Sha512Final((wc_Sha512*)sha384);
wolfSSL 15:117db924cf7c 2710 if (ret != 0)
wolfSSL 15:117db924cf7c 2711 return ret;
wolfSSL 15:117db924cf7c 2712
wolfSSL 15:117db924cf7c 2713 XMEMCPY(hash, sha384->digest, WC_SHA384_DIGEST_SIZE);
wolfSSL 15:117db924cf7c 2714
wolfSSL 15:117db924cf7c 2715 return InitSha384(sha384); /* reset state */
wolfSSL 15:117db924cf7c 2716 }
wolfSSL 15:117db924cf7c 2717
wolfSSL 15:117db924cf7c 2718
wolfSSL 15:117db924cf7c 2719 /* Hardware Acceleration */
wolfSSL 15:117db924cf7c 2720 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 2721 int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
wolfSSL 15:117db924cf7c 2722 {
wolfSSL 15:117db924cf7c 2723 int ret = InitSha384(sha384);
wolfSSL 15:117db924cf7c 2724
wolfSSL 15:117db924cf7c 2725 (void)heap;
wolfSSL 15:117db924cf7c 2726 (void)devId;
wolfSSL 15:117db924cf7c 2727
wolfSSL 15:117db924cf7c 2728 Sha512_SetTransform();
wolfSSL 15:117db924cf7c 2729
wolfSSL 15:117db924cf7c 2730 return ret;
wolfSSL 15:117db924cf7c 2731 }
wolfSSL 15:117db924cf7c 2732 #else
wolfSSL 15:117db924cf7c 2733 int wc_InitSha384_ex(wc_Sha384* sha384, void* heap, int devId)
wolfSSL 15:117db924cf7c 2734 {
wolfSSL 15:117db924cf7c 2735 int ret;
wolfSSL 15:117db924cf7c 2736
wolfSSL 15:117db924cf7c 2737 if (sha384 == NULL) {
wolfSSL 15:117db924cf7c 2738 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2739 }
wolfSSL 15:117db924cf7c 2740
wolfSSL 15:117db924cf7c 2741 sha384->heap = heap;
wolfSSL 15:117db924cf7c 2742 ret = InitSha384(sha384);
wolfSSL 15:117db924cf7c 2743 if (ret != 0)
wolfSSL 15:117db924cf7c 2744 return ret;
wolfSSL 15:117db924cf7c 2745
wolfSSL 15:117db924cf7c 2746 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 2747 sha384->W = NULL;
wolfSSL 15:117db924cf7c 2748 #endif
wolfSSL 15:117db924cf7c 2749
wolfSSL 15:117db924cf7c 2750 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
wolfSSL 15:117db924cf7c 2751 ret = wolfAsync_DevCtxInit(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384,
wolfSSL 15:117db924cf7c 2752 sha384->heap, devId);
wolfSSL 15:117db924cf7c 2753 #else
wolfSSL 15:117db924cf7c 2754 (void)devId;
wolfSSL 15:117db924cf7c 2755 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 2756
wolfSSL 15:117db924cf7c 2757 return ret;
wolfSSL 15:117db924cf7c 2758 }
wolfSSL 15:117db924cf7c 2759 #endif
wolfSSL 15:117db924cf7c 2760 #endif /* WOLFSSL_IMX6_CAAM */
wolfSSL 15:117db924cf7c 2761
wolfSSL 15:117db924cf7c 2762 int wc_InitSha384(wc_Sha384* sha384)
wolfSSL 15:117db924cf7c 2763 {
wolfSSL 15:117db924cf7c 2764 return wc_InitSha384_ex(sha384, NULL, INVALID_DEVID);
wolfSSL 15:117db924cf7c 2765 }
wolfSSL 15:117db924cf7c 2766
wolfSSL 15:117db924cf7c 2767 void wc_Sha384Free(wc_Sha384* sha384)
wolfSSL 15:117db924cf7c 2768 {
wolfSSL 15:117db924cf7c 2769 if (sha384 == NULL)
wolfSSL 15:117db924cf7c 2770 return;
wolfSSL 15:117db924cf7c 2771
wolfSSL 15:117db924cf7c 2772 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 2773 if (sha384->W != NULL) {
wolfSSL 15:117db924cf7c 2774 XFREE(sha384->W, NULL, DYNAMIC_TYPE_TMP_BUFFER);
wolfSSL 15:117db924cf7c 2775 sha384->W = NULL;
wolfSSL 15:117db924cf7c 2776 }
wolfSSL 15:117db924cf7c 2777 #endif
wolfSSL 15:117db924cf7c 2778
wolfSSL 15:117db924cf7c 2779 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_SHA384)
wolfSSL 15:117db924cf7c 2780 wolfAsync_DevCtxFree(&sha384->asyncDev, WOLFSSL_ASYNC_MARKER_SHA384);
wolfSSL 15:117db924cf7c 2781 #endif /* WOLFSSL_ASYNC_CRYPT */
wolfSSL 15:117db924cf7c 2782 }
wolfSSL 15:117db924cf7c 2783
wolfSSL 15:117db924cf7c 2784 #endif /* WOLFSSL_SHA384 */
wolfSSL 15:117db924cf7c 2785
wolfSSL 15:117db924cf7c 2786 #endif /* HAVE_FIPS */
wolfSSL 15:117db924cf7c 2787
wolfSSL 15:117db924cf7c 2788 #ifdef WOLFSSL_SHA512
wolfSSL 15:117db924cf7c 2789
wolfSSL 15:117db924cf7c 2790 int wc_Sha512GetHash(wc_Sha512* sha512, byte* hash)
wolfSSL 15:117db924cf7c 2791 {
wolfSSL 15:117db924cf7c 2792 int ret;
wolfSSL 15:117db924cf7c 2793 wc_Sha512 tmpSha512;
wolfSSL 15:117db924cf7c 2794
wolfSSL 15:117db924cf7c 2795 if (sha512 == NULL || hash == NULL)
wolfSSL 15:117db924cf7c 2796 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2797
wolfSSL 15:117db924cf7c 2798 ret = wc_Sha512Copy(sha512, &tmpSha512);
wolfSSL 15:117db924cf7c 2799 if (ret == 0) {
wolfSSL 15:117db924cf7c 2800 ret = wc_Sha512Final(&tmpSha512, hash);
wolfSSL 15:117db924cf7c 2801 wc_Sha512Free(&tmpSha512);
wolfSSL 15:117db924cf7c 2802 }
wolfSSL 15:117db924cf7c 2803 return ret;
wolfSSL 15:117db924cf7c 2804 }
wolfSSL 15:117db924cf7c 2805
wolfSSL 15:117db924cf7c 2806 int wc_Sha512Copy(wc_Sha512* src, wc_Sha512* dst)
wolfSSL 15:117db924cf7c 2807 {
wolfSSL 15:117db924cf7c 2808 int ret = 0;
wolfSSL 15:117db924cf7c 2809
wolfSSL 15:117db924cf7c 2810 if (src == NULL || dst == NULL)
wolfSSL 15:117db924cf7c 2811 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2812
wolfSSL 15:117db924cf7c 2813 XMEMCPY(dst, src, sizeof(wc_Sha512));
wolfSSL 15:117db924cf7c 2814 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 2815 dst->W = NULL;
wolfSSL 15:117db924cf7c 2816 #endif
wolfSSL 15:117db924cf7c 2817
wolfSSL 15:117db924cf7c 2818 #ifdef WOLFSSL_ASYNC_CRYPT
wolfSSL 15:117db924cf7c 2819 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
wolfSSL 15:117db924cf7c 2820 #endif
wolfSSL 15:117db924cf7c 2821
wolfSSL 15:117db924cf7c 2822 return ret;
wolfSSL 15:117db924cf7c 2823 }
wolfSSL 15:117db924cf7c 2824
wolfSSL 15:117db924cf7c 2825 #endif /* WOLFSSL_SHA512 */
wolfSSL 15:117db924cf7c 2826
wolfSSL 15:117db924cf7c 2827 #ifdef WOLFSSL_SHA384
wolfSSL 15:117db924cf7c 2828
wolfSSL 15:117db924cf7c 2829 int wc_Sha384GetHash(wc_Sha384* sha384, byte* hash)
wolfSSL 15:117db924cf7c 2830 {
wolfSSL 15:117db924cf7c 2831 int ret;
wolfSSL 15:117db924cf7c 2832 wc_Sha384 tmpSha384;
wolfSSL 15:117db924cf7c 2833
wolfSSL 15:117db924cf7c 2834 if (sha384 == NULL || hash == NULL)
wolfSSL 15:117db924cf7c 2835 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2836
wolfSSL 15:117db924cf7c 2837 ret = wc_Sha384Copy(sha384, &tmpSha384);
wolfSSL 15:117db924cf7c 2838 if (ret == 0) {
wolfSSL 15:117db924cf7c 2839 ret = wc_Sha384Final(&tmpSha384, hash);
wolfSSL 15:117db924cf7c 2840 wc_Sha384Free(&tmpSha384);
wolfSSL 15:117db924cf7c 2841 }
wolfSSL 15:117db924cf7c 2842 return ret;
wolfSSL 15:117db924cf7c 2843 }
wolfSSL 15:117db924cf7c 2844 int wc_Sha384Copy(wc_Sha384* src, wc_Sha384* dst)
wolfSSL 15:117db924cf7c 2845 {
wolfSSL 15:117db924cf7c 2846 int ret = 0;
wolfSSL 15:117db924cf7c 2847
wolfSSL 15:117db924cf7c 2848 if (src == NULL || dst == NULL)
wolfSSL 15:117db924cf7c 2849 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 2850
wolfSSL 15:117db924cf7c 2851 XMEMCPY(dst, src, sizeof(wc_Sha384));
wolfSSL 15:117db924cf7c 2852 #ifdef WOLFSSL_SMALL_STACK_CACHE
wolfSSL 15:117db924cf7c 2853 dst->W = NULL;
wolfSSL 15:117db924cf7c 2854 #endif
wolfSSL 15:117db924cf7c 2855
wolfSSL 15:117db924cf7c 2856 #ifdef WOLFSSL_ASYNC_CRYPT
wolfSSL 15:117db924cf7c 2857 ret = wolfAsync_DevCopy(&src->asyncDev, &dst->asyncDev);
wolfSSL 15:117db924cf7c 2858 #endif
wolfSSL 15:117db924cf7c 2859
wolfSSL 15:117db924cf7c 2860 return ret;
wolfSSL 15:117db924cf7c 2861 }
wolfSSL 15:117db924cf7c 2862
wolfSSL 15:117db924cf7c 2863 #endif /* WOLFSSL_SHA384 */
wolfSSL 15:117db924cf7c 2864
wolfSSL 15:117db924cf7c 2865 #endif /* WOLFSSL_SHA512 || WOLFSSL_SHA384 */
wolfSSL 15:117db924cf7c 2866