Xuyi Wang / wolfSSL

Dependents:   OS

Committer:
wolfSSL
Date:
Sat Aug 18 22:20:43 2018 +0000
Revision:
15:117db924cf7c
wolfSSL 3.15.3

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 15:117db924cf7c 1 /* chacha.c
wolfSSL 15:117db924cf7c 2 *
wolfSSL 15:117db924cf7c 3 * Copyright (C) 2006-2017 wolfSSL Inc.
wolfSSL 15:117db924cf7c 4 *
wolfSSL 15:117db924cf7c 5 * This file is part of wolfSSL.
wolfSSL 15:117db924cf7c 6 *
wolfSSL 15:117db924cf7c 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 15:117db924cf7c 8 * it under the terms of the GNU General Public License as published by
wolfSSL 15:117db924cf7c 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 15:117db924cf7c 10 * (at your option) any later version.
wolfSSL 15:117db924cf7c 11 *
wolfSSL 15:117db924cf7c 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 15:117db924cf7c 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 15:117db924cf7c 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 15:117db924cf7c 15 * GNU General Public License for more details.
wolfSSL 15:117db924cf7c 16 *
wolfSSL 15:117db924cf7c 17 * You should have received a copy of the GNU General Public License
wolfSSL 15:117db924cf7c 18 * along with this program; if not, write to the Free Software
wolfSSL 15:117db924cf7c 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 15:117db924cf7c 20 *
wolfSSL 15:117db924cf7c 21 * based on
wolfSSL 15:117db924cf7c 22 * chacha-ref.c version 20080118
wolfSSL 15:117db924cf7c 23 * D. J. Bernstein
wolfSSL 15:117db924cf7c 24 * Public domain.
wolfSSL 15:117db924cf7c 25 */
wolfSSL 15:117db924cf7c 26
wolfSSL 15:117db924cf7c 27
wolfSSL 15:117db924cf7c 28
wolfSSL 15:117db924cf7c 29 #ifdef HAVE_CONFIG_H
wolfSSL 15:117db924cf7c 30 #include <config.h>
wolfSSL 15:117db924cf7c 31 #endif
wolfSSL 15:117db924cf7c 32
wolfSSL 15:117db924cf7c 33 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 15:117db924cf7c 34
wolfSSL 15:117db924cf7c 35 #ifdef HAVE_CHACHA
wolfSSL 15:117db924cf7c 36
wolfSSL 15:117db924cf7c 37 #include <wolfssl/wolfcrypt/chacha.h>
wolfSSL 15:117db924cf7c 38 #include <wolfssl/wolfcrypt/error-crypt.h>
wolfSSL 15:117db924cf7c 39 #include <wolfssl/wolfcrypt/logging.h>
wolfSSL 15:117db924cf7c 40 #include <wolfssl/wolfcrypt/cpuid.h>
wolfSSL 15:117db924cf7c 41 #ifdef NO_INLINE
wolfSSL 15:117db924cf7c 42 #include <wolfssl/wolfcrypt/misc.h>
wolfSSL 15:117db924cf7c 43 #else
wolfSSL 15:117db924cf7c 44 #define WOLFSSL_MISC_INCLUDED
wolfSSL 15:117db924cf7c 45 #include <wolfcrypt/src/misc.c>
wolfSSL 15:117db924cf7c 46 #endif
wolfSSL 15:117db924cf7c 47
wolfSSL 15:117db924cf7c 48 #ifdef CHACHA_AEAD_TEST
wolfSSL 15:117db924cf7c 49 #include <stdio.h>
wolfSSL 15:117db924cf7c 50 #endif
wolfSSL 15:117db924cf7c 51
wolfSSL 15:117db924cf7c 52 #ifdef USE_INTEL_CHACHA_SPEEDUP
wolfSSL 15:117db924cf7c 53 #include <emmintrin.h>
wolfSSL 15:117db924cf7c 54 #include <immintrin.h>
wolfSSL 15:117db924cf7c 55
wolfSSL 15:117db924cf7c 56 #if defined(__GNUC__) && ((__GNUC__ < 4) || \
wolfSSL 15:117db924cf7c 57 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
wolfSSL 15:117db924cf7c 58 #define NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 59 #endif
wolfSSL 15:117db924cf7c 60 #if defined(__clang__) && ((__clang_major__ < 3) || \
wolfSSL 15:117db924cf7c 61 (__clang_major__ == 3 && __clang_minor__ <= 5))
wolfSSL 15:117db924cf7c 62 #define NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 63 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
wolfSSL 15:117db924cf7c 64 #undef NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 65 #endif
wolfSSL 15:117db924cf7c 66
wolfSSL 15:117db924cf7c 67 #ifndef NO_AVX2_SUPPORT
wolfSSL 15:117db924cf7c 68 #define HAVE_INTEL_AVX2
wolfSSL 15:117db924cf7c 69 #endif
wolfSSL 15:117db924cf7c 70
wolfSSL 15:117db924cf7c 71 #if defined(_MSC_VER)
wolfSSL 15:117db924cf7c 72 #define CHACHA20_NOINLINE __declspec(noinline)
wolfSSL 15:117db924cf7c 73 #elif defined(__GNUC__)
wolfSSL 15:117db924cf7c 74 #define CHACHA20_NOINLINE __attribute__((noinline))
wolfSSL 15:117db924cf7c 75 #else
wolfSSL 15:117db924cf7c 76 #define CHACHA20_NOINLINE
wolfSSL 15:117db924cf7c 77 #endif
wolfSSL 15:117db924cf7c 78
wolfSSL 15:117db924cf7c 79 static int cpuidFlagsSet = 0;
wolfSSL 15:117db924cf7c 80 static int cpuidFlags = 0;
wolfSSL 15:117db924cf7c 81 #endif
wolfSSL 15:117db924cf7c 82
/* LITTLE32(x): convert a 32-bit word between host byte order and
 * little-endian order. Byte-swaps on big-endian builds, no-op otherwise. */
#ifdef BIG_ENDIAN_ORDER
    #define LITTLE32(x) ByteReverseWord32(x)
#else
    #define LITTLE32(x) (x)
#endif

/* Number of rounds */
#define ROUNDS 20

/* U32C(v): give the literal v an unsigned suffix.
 * U32V(v): reduce v to its low 32 bits as a word32. */
#define U32C(v) (v##U)
#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF))

/* U8TO32_LITTLE(p): read the four bytes at p as a little-endian 32-bit word.
 *
 * The word is assembled byte by byte, so p needs no particular alignment and
 * no byte buffer is accessed through a word32 pointer. The value produced is
 * the same on little- and big-endian hosts.
 *
 * NOTE: p is evaluated more than once; do not pass an expression with side
 * effects. */
#define U8TO32_LITTLE(p)                                \
    ( ((word32)((const byte*)(p))[0])         |         \
      ((word32)((const byte*)(p))[1] <<  8)   |         \
      ((word32)((const byte*)(p))[2] << 16)   |         \
      ((word32)((const byte*)(p))[3] << 24) )

/* 32-bit building blocks of the quarter-round: rotate left, xor, and
 * addition modulo 2^32. */
#define ROTATE(v,c) rotlFixed(v, c)
#define XOR(v,w)    ((v) ^ (w))
#define PLUS(v,w)   (U32V((v) + (w)))
#define PLUSONE(v)  (PLUS((v),1))

/* One ChaCha quarter-round on words a, b, c, d of a local array that must be
 * named x. Expands to a sequence of complete statements (each ends with its
 * own ';'), so it is invoked without a trailing semicolon and must not be
 * used as the unbraced body of an if/else or loop. */
#define QUARTERROUND(a,b,c,d) \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
wolfSSL 15:117db924cf7c 106
wolfSSL 15:117db924cf7c 107
wolfSSL 15:117db924cf7c 108
/* QUARTERROUND_INTEL_ASM: expands to the assembly text (a run of adjacent
 * string literals) for one ChaCha quarter-round applied to four independent
 * register groups (a0,b0,c0,d0) .. (a3,b3,c3,d3) using three-operand vector
 * instructions. It is meant to be pasted inside an asm statement that
 * supplies the operands %[x], %[rotl16] and %[rotl8].
 *
 *   a0..d3  vector register names, one per state word of each group
 *   t1      scratch register used by the shift-based rotates
 *   o1      byte offset from %[x] of the memory slot that holds c3
 *
 * The sequence follows the scalar QUARTERROUND step for step:
 *   a += b; d ^= a; d = vpshufb(d, %[rotl16])
 *   c += d; b ^= c; b = (b << 12) ^ (b >> 20)
 *   a += b; d ^= a; d = vpshufb(d, %[rotl8])
 *   c += d; b ^= c; b = (b << 7) ^ (b >> 25)
 * (by their names and position, the two byte shuffles are the rotate-left by
 * 16 and by 8; the shuffle masks themselves are defined by the caller).
 *
 * c3 is loaded from o1(%[x]) at the start of each half and written back to
 * the same slot before t1 is first overwritten. The XMM/YMM wrappers pass the
 * same register for c3 and t1, which is why c3 has to live in memory while
 * the shift-based rotates run. */
wolfSSL 15:117db924cf7c 109 #define QUARTERROUND_INTEL_ASM(a0,b0,c0,d0, \
wolfSSL 15:117db924cf7c 110 a1,b1,c1,d1, \
wolfSSL 15:117db924cf7c 111 a2,b2,c2,d2, \
wolfSSL 15:117db924cf7c 112 a3,b3,c3,d3, \
wolfSSL 15:117db924cf7c 113 t1,o1) \
wolfSSL 15:117db924cf7c 114 "vpaddd "#b0", "#a0", "#a0"\n\t" \
wolfSSL 15:117db924cf7c 115 "vpxor "#a0", "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 116 "vmovdqa "#o1"(%[x]), "#c3"\n\t" \
wolfSSL 15:117db924cf7c 117 "vpshufb %[rotl16], "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 118 "vpaddd "#d0", "#c0", "#c0"\n\t" \
wolfSSL 15:117db924cf7c 119 "vpxor "#c0", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 120 "vpaddd "#b1", "#a1", "#a1"\n\t" \
wolfSSL 15:117db924cf7c 121 "vpxor "#a1", "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 122 "vpshufb %[rotl16], "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 123 "vpaddd "#d1", "#c1", "#c1"\n\t" \
wolfSSL 15:117db924cf7c 124 "vpxor "#c1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 125 "vpaddd "#b2", "#a2", "#a2"\n\t" \
wolfSSL 15:117db924cf7c 126 "vpxor "#a2", "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 127 "vpshufb %[rotl16], "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 128 "vpaddd "#d2", "#c2", "#c2"\n\t" \
wolfSSL 15:117db924cf7c 129 "vpxor "#c2", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 130 "vpaddd "#b3", "#a3", "#a3"\n\t" \
wolfSSL 15:117db924cf7c 131 "vpxor "#a3", "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 132 "vpshufb %[rotl16], "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 133 "vpaddd "#d3", "#c3", "#c3"\n\t" \
wolfSSL 15:117db924cf7c 134 "vpxor "#c3", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 135 "vmovdqa "#c3", "#o1"(%[x])\n\t" \
wolfSSL 15:117db924cf7c 136 "vpsrld $20, "#b0", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 137 "vpslld $12, "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 138 "vpxor "#t1", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 139 "vpsrld $20, "#b1", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 140 "vpslld $12, "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 141 "vpxor "#t1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 142 "vpsrld $20, "#b2", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 143 "vpslld $12, "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 144 "vpxor "#t1", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 145 "vpsrld $20, "#b3", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 146 "vpslld $12, "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 147 "vpxor "#t1", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 148 "vpaddd "#b0", "#a0", "#a0"\n\t" \
wolfSSL 15:117db924cf7c 149 "vpxor "#a0", "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 150 "vmovdqa "#o1"(%[x]), "#c3"\n\t" \
wolfSSL 15:117db924cf7c 151 "vpshufb %[rotl8], "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 152 "vpaddd "#d0", "#c0", "#c0"\n\t" \
wolfSSL 15:117db924cf7c 153 "vpxor "#c0", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 154 "vpaddd "#b1", "#a1", "#a1"\n\t" \
wolfSSL 15:117db924cf7c 155 "vpxor "#a1", "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 156 "vpshufb %[rotl8], "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 157 "vpaddd "#d1", "#c1", "#c1"\n\t" \
wolfSSL 15:117db924cf7c 158 "vpxor "#c1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 159 "vpaddd "#b2", "#a2", "#a2"\n\t" \
wolfSSL 15:117db924cf7c 160 "vpxor "#a2", "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 161 "vpshufb %[rotl8], "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 162 "vpaddd "#d2", "#c2", "#c2"\n\t" \
wolfSSL 15:117db924cf7c 163 "vpxor "#c2", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 164 "vpaddd "#b3", "#a3", "#a3"\n\t" \
wolfSSL 15:117db924cf7c 165 "vpxor "#a3", "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 166 "vpshufb %[rotl8], "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 167 "vpaddd "#d3", "#c3", "#c3"\n\t" \
wolfSSL 15:117db924cf7c 168 "vpxor "#c3", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 169 "vmovdqa "#c3", "#o1"(%[x])\n\t" \
wolfSSL 15:117db924cf7c 170 "vpsrld $25, "#b0", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 171 "vpslld $7, "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 172 "vpxor "#t1", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 173 "vpsrld $25, "#b1", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 174 "vpslld $7, "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 175 "vpxor "#t1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 176 "vpsrld $25, "#b2", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 177 "vpslld $7, "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 178 "vpxor "#t1", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 179 "vpsrld $25, "#b3", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 180 "vpslld $7, "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 181 "vpxor "#t1", "#b3", "#b3"\n\t"
wolfSSL 15:117db924cf7c 182
/* QUARTERROUND_INTEL_ASM_2: same quarter-round sequence and parameters as
 * QUARTERROUND_INTEL_ASM, except that the register loaded from and stored
 * back to o1(%[x]) around the shift-based rotates is c1 instead of c3.
 *
 *   a0..d3  vector register names, one per state word of each group
 *   t1      scratch register used by the shift-based rotates
 *   o1      byte offset from %[x] of the memory slot that holds c1
 *
 * The _2 wrappers (QUARTERROUND_XMM_2 / QUARTERROUND_YMM_2) pass the same
 * register for c1 and t1. The enclosing asm statement must supply the
 * operands %[x], %[rotl16] and %[rotl8]. */
wolfSSL 15:117db924cf7c 183 #define QUARTERROUND_INTEL_ASM_2(a0,b0,c0,d0, \
wolfSSL 15:117db924cf7c 184 a1,b1,c1,d1, \
wolfSSL 15:117db924cf7c 185 a2,b2,c2,d2, \
wolfSSL 15:117db924cf7c 186 a3,b3,c3,d3, \
wolfSSL 15:117db924cf7c 187 t1,o1) \
wolfSSL 15:117db924cf7c 188 "vpaddd "#b0", "#a0", "#a0"\n\t" \
wolfSSL 15:117db924cf7c 189 "vpxor "#a0", "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 190 "vmovdqa "#o1"(%[x]), "#c1"\n\t" \
wolfSSL 15:117db924cf7c 191 "vpshufb %[rotl16], "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 192 "vpaddd "#d0", "#c0", "#c0"\n\t" \
wolfSSL 15:117db924cf7c 193 "vpxor "#c0", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 194 "vpaddd "#b1", "#a1", "#a1"\n\t" \
wolfSSL 15:117db924cf7c 195 "vpxor "#a1", "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 196 "vpshufb %[rotl16], "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 197 "vpaddd "#d1", "#c1", "#c1"\n\t" \
wolfSSL 15:117db924cf7c 198 "vpxor "#c1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 199 "vpaddd "#b2", "#a2", "#a2"\n\t" \
wolfSSL 15:117db924cf7c 200 "vpxor "#a2", "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 201 "vpshufb %[rotl16], "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 202 "vpaddd "#d2", "#c2", "#c2"\n\t" \
wolfSSL 15:117db924cf7c 203 "vpxor "#c2", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 204 "vpaddd "#b3", "#a3", "#a3"\n\t" \
wolfSSL 15:117db924cf7c 205 "vpxor "#a3", "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 206 "vpshufb %[rotl16], "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 207 "vpaddd "#d3", "#c3", "#c3"\n\t" \
wolfSSL 15:117db924cf7c 208 "vpxor "#c3", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 209 "vmovdqa "#c1", "#o1"(%[x])\n\t" \
wolfSSL 15:117db924cf7c 210 "vpsrld $20, "#b0", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 211 "vpslld $12, "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 212 "vpxor "#t1", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 213 "vpsrld $20, "#b1", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 214 "vpslld $12, "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 215 "vpxor "#t1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 216 "vpsrld $20, "#b2", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 217 "vpslld $12, "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 218 "vpxor "#t1", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 219 "vpsrld $20, "#b3", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 220 "vpslld $12, "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 221 "vpxor "#t1", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 222 "vpaddd "#b0", "#a0", "#a0"\n\t" \
wolfSSL 15:117db924cf7c 223 "vpxor "#a0", "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 224 "vmovdqa "#o1"(%[x]), "#c1"\n\t" \
wolfSSL 15:117db924cf7c 225 "vpshufb %[rotl8], "#d0", "#d0"\n\t" \
wolfSSL 15:117db924cf7c 226 "vpaddd "#d0", "#c0", "#c0"\n\t" \
wolfSSL 15:117db924cf7c 227 "vpxor "#c0", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 228 "vpaddd "#b1", "#a1", "#a1"\n\t" \
wolfSSL 15:117db924cf7c 229 "vpxor "#a1", "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 230 "vpshufb %[rotl8], "#d1", "#d1"\n\t" \
wolfSSL 15:117db924cf7c 231 "vpaddd "#d1", "#c1", "#c1"\n\t" \
wolfSSL 15:117db924cf7c 232 "vpxor "#c1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 233 "vpaddd "#b2", "#a2", "#a2"\n\t" \
wolfSSL 15:117db924cf7c 234 "vpxor "#a2", "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 235 "vpshufb %[rotl8], "#d2", "#d2"\n\t" \
wolfSSL 15:117db924cf7c 236 "vpaddd "#d2", "#c2", "#c2"\n\t" \
wolfSSL 15:117db924cf7c 237 "vpxor "#c2", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 238 "vpaddd "#b3", "#a3", "#a3"\n\t" \
wolfSSL 15:117db924cf7c 239 "vpxor "#a3", "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 240 "vpshufb %[rotl8], "#d3", "#d3"\n\t" \
wolfSSL 15:117db924cf7c 241 "vpaddd "#d3", "#c3", "#c3"\n\t" \
wolfSSL 15:117db924cf7c 242 "vpxor "#c3", "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 243 "vmovdqa "#c1", "#o1"(%[x])\n\t" \
wolfSSL 15:117db924cf7c 244 "vpsrld $25, "#b0", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 245 "vpslld $7, "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 246 "vpxor "#t1", "#b0", "#b0"\n\t" \
wolfSSL 15:117db924cf7c 247 "vpsrld $25, "#b1", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 248 "vpslld $7, "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 249 "vpxor "#t1", "#b1", "#b1"\n\t" \
wolfSSL 15:117db924cf7c 250 "vpsrld $25, "#b2", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 251 "vpslld $7, "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 252 "vpxor "#t1", "#b2", "#b2"\n\t" \
wolfSSL 15:117db924cf7c 253 "vpsrld $25, "#b3", "#t1"\n\t" \
wolfSSL 15:117db924cf7c 254 "vpslld $7, "#b3", "#b3"\n\t" \
wolfSSL 15:117db924cf7c 255 "vpxor "#t1", "#b3", "#b3"\n\t"
wolfSSL 15:117db924cf7c 256
wolfSSL 15:117db924cf7c 257
/* Register assignments for the vector quarter-round macros.
 *
 * QUARTERROUND_XMM / QUARTERROUND_YMM: registers 0-3 act as a, 4-7 as b,
 * 8-11 as c and 12-15 as d, i.e. the groups (0,4,8,12) (1,5,9,13)
 * (2,6,10,14) (3,7,11,15) - the same index pattern as the first four
 * QUARTERROUND calls in wc_Chacha_wordtobyte.
 *
 * QUARTERROUND_XMM_2 / QUARTERROUND_YMM_2: the groups (0,5,10,15)
 * (1,6,11,12) (2,7,8,13) (3,4,9,14) - the same index pattern as the last
 * four QUARTERROUND calls in wc_Chacha_wordtobyte.
 *
 * In all four, register 11 doubles as the scratch register t1, and the value
 * it normally holds is kept at byte offset 48 (XMM) or 96 (YMM) from %[x]
 * while it is used as scratch. */
wolfSSL 15:117db924cf7c 258 #define QUARTERROUND_XMM() \
wolfSSL 15:117db924cf7c 259 QUARTERROUND_INTEL_ASM(%%xmm0,%%xmm4,%%xmm8,%%xmm12, \
wolfSSL 15:117db924cf7c 260 %%xmm1,%%xmm5,%%xmm9,%%xmm13, \
wolfSSL 15:117db924cf7c 261 %%xmm2,%%xmm6,%%xmm10,%%xmm14, \
wolfSSL 15:117db924cf7c 262 %%xmm3,%%xmm7,%%xmm11,%%xmm15, \
wolfSSL 15:117db924cf7c 263 %%xmm11,48)
wolfSSL 15:117db924cf7c 264 #define QUARTERROUND_XMM_2() \
wolfSSL 15:117db924cf7c 265 QUARTERROUND_INTEL_ASM_2(%%xmm0,%%xmm5,%%xmm10,%%xmm15, \
wolfSSL 15:117db924cf7c 266 %%xmm1,%%xmm6,%%xmm11,%%xmm12, \
wolfSSL 15:117db924cf7c 267 %%xmm2,%%xmm7,%%xmm8,%%xmm13, \
wolfSSL 15:117db924cf7c 268 %%xmm3,%%xmm4,%%xmm9,%%xmm14, \
wolfSSL 15:117db924cf7c 269 %%xmm11,48)
wolfSSL 15:117db924cf7c 270
wolfSSL 15:117db924cf7c 271 #define QUARTERROUND_YMM() \
wolfSSL 15:117db924cf7c 272 QUARTERROUND_INTEL_ASM(%%ymm0,%%ymm4,%%ymm8,%%ymm12, \
wolfSSL 15:117db924cf7c 273 %%ymm1,%%ymm5,%%ymm9,%%ymm13, \
wolfSSL 15:117db924cf7c 274 %%ymm2,%%ymm6,%%ymm10,%%ymm14, \
wolfSSL 15:117db924cf7c 275 %%ymm3,%%ymm7,%%ymm11,%%ymm15, \
wolfSSL 15:117db924cf7c 276 %%ymm11,96)
wolfSSL 15:117db924cf7c 277 #define QUARTERROUND_YMM_2() \
wolfSSL 15:117db924cf7c 278 QUARTERROUND_INTEL_ASM_2(%%ymm0,%%ymm5,%%ymm10,%%ymm15, \
wolfSSL 15:117db924cf7c 279 %%ymm1,%%ymm6,%%ymm11,%%ymm12, \
wolfSSL 15:117db924cf7c 280 %%ymm2,%%ymm7,%%ymm8,%%ymm13, \
wolfSSL 15:117db924cf7c 281 %%ymm3,%%ymm4,%%ymm9,%%ymm14, \
wolfSSL 15:117db924cf7c 282 %%ymm11,96)
wolfSSL 15:117db924cf7c 283
wolfSSL 15:117db924cf7c 284 /**
wolfSSL 15:117db924cf7c 285 * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version
wolfSSL 15:117db924cf7c 286 * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB.
wolfSSL 15:117db924cf7c 287 */
wolfSSL 15:117db924cf7c 288 int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
wolfSSL 15:117db924cf7c 289 {
wolfSSL 15:117db924cf7c 290 word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */
wolfSSL 15:117db924cf7c 291
wolfSSL 15:117db924cf7c 292 #ifdef CHACHA_AEAD_TEST
wolfSSL 15:117db924cf7c 293 word32 i;
wolfSSL 15:117db924cf7c 294 printf("NONCE : ");
wolfSSL 15:117db924cf7c 295 for (i = 0; i < CHACHA_IV_BYTES; i++) {
wolfSSL 15:117db924cf7c 296 printf("%02x", inIv[i]);
wolfSSL 15:117db924cf7c 297 }
wolfSSL 15:117db924cf7c 298 printf("\n\n");
wolfSSL 15:117db924cf7c 299 #endif
wolfSSL 15:117db924cf7c 300
wolfSSL 15:117db924cf7c 301 if (ctx == NULL)
wolfSSL 15:117db924cf7c 302 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 303
wolfSSL 15:117db924cf7c 304 XMEMCPY(temp, inIv, CHACHA_IV_BYTES);
wolfSSL 15:117db924cf7c 305
wolfSSL 15:117db924cf7c 306 ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */
wolfSSL 15:117db924cf7c 307 ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */
wolfSSL 15:117db924cf7c 308 ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */
wolfSSL 15:117db924cf7c 309 ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */
wolfSSL 15:117db924cf7c 310
wolfSSL 15:117db924cf7c 311 return 0;
wolfSSL 15:117db924cf7c 312 }
wolfSSL 15:117db924cf7c 313
wolfSSL 15:117db924cf7c 314 /* "expand 32-byte k" as unsigned 32 byte */
wolfSSL 15:117db924cf7c 315 static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
wolfSSL 15:117db924cf7c 316 /* "expand 16-byte k" as unsigned 16 byte */
wolfSSL 15:117db924cf7c 317 static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
wolfSSL 15:117db924cf7c 318
wolfSSL 15:117db924cf7c 319 /**
wolfSSL 15:117db924cf7c 320 * Key setup. 8 word iv (nonce)
wolfSSL 15:117db924cf7c 321 */
wolfSSL 15:117db924cf7c 322 int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
wolfSSL 15:117db924cf7c 323 {
wolfSSL 15:117db924cf7c 324 const word32* constants;
wolfSSL 15:117db924cf7c 325 const byte* k;
wolfSSL 15:117db924cf7c 326
wolfSSL 15:117db924cf7c 327 #ifdef XSTREAM_ALIGN
wolfSSL 15:117db924cf7c 328 word32 alignKey[8];
wolfSSL 15:117db924cf7c 329 #endif
wolfSSL 15:117db924cf7c 330
wolfSSL 15:117db924cf7c 331 if (ctx == NULL)
wolfSSL 15:117db924cf7c 332 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 333
wolfSSL 15:117db924cf7c 334 if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ)
wolfSSL 15:117db924cf7c 335 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 336
wolfSSL 15:117db924cf7c 337 #ifdef XSTREAM_ALIGN
wolfSSL 15:117db924cf7c 338 if ((wolfssl_word)key % 4) {
wolfSSL 15:117db924cf7c 339 WOLFSSL_MSG("wc_ChachaSetKey unaligned key");
wolfSSL 15:117db924cf7c 340 XMEMCPY(alignKey, key, keySz);
wolfSSL 15:117db924cf7c 341 k = (byte*)alignKey;
wolfSSL 15:117db924cf7c 342 }
wolfSSL 15:117db924cf7c 343 else {
wolfSSL 15:117db924cf7c 344 k = key;
wolfSSL 15:117db924cf7c 345 }
wolfSSL 15:117db924cf7c 346 #else
wolfSSL 15:117db924cf7c 347 k = key;
wolfSSL 15:117db924cf7c 348 #endif /* XSTREAM_ALIGN */
wolfSSL 15:117db924cf7c 349
wolfSSL 15:117db924cf7c 350 #ifdef CHACHA_AEAD_TEST
wolfSSL 15:117db924cf7c 351 word32 i;
wolfSSL 15:117db924cf7c 352 printf("ChaCha key used :\n");
wolfSSL 15:117db924cf7c 353 for (i = 0; i < keySz; i++) {
wolfSSL 15:117db924cf7c 354 printf("%02x", key[i]);
wolfSSL 15:117db924cf7c 355 if ((i + 1) % 8 == 0)
wolfSSL 15:117db924cf7c 356 printf("\n");
wolfSSL 15:117db924cf7c 357 }
wolfSSL 15:117db924cf7c 358 printf("\n\n");
wolfSSL 15:117db924cf7c 359 #endif
wolfSSL 15:117db924cf7c 360
wolfSSL 15:117db924cf7c 361 ctx->X[4] = U8TO32_LITTLE(k + 0);
wolfSSL 15:117db924cf7c 362 ctx->X[5] = U8TO32_LITTLE(k + 4);
wolfSSL 15:117db924cf7c 363 ctx->X[6] = U8TO32_LITTLE(k + 8);
wolfSSL 15:117db924cf7c 364 ctx->X[7] = U8TO32_LITTLE(k + 12);
wolfSSL 15:117db924cf7c 365 if (keySz == CHACHA_MAX_KEY_SZ) {
wolfSSL 15:117db924cf7c 366 k += 16;
wolfSSL 15:117db924cf7c 367 constants = sigma;
wolfSSL 15:117db924cf7c 368 }
wolfSSL 15:117db924cf7c 369 else {
wolfSSL 15:117db924cf7c 370 constants = tau;
wolfSSL 15:117db924cf7c 371 }
wolfSSL 15:117db924cf7c 372 ctx->X[ 8] = U8TO32_LITTLE(k + 0);
wolfSSL 15:117db924cf7c 373 ctx->X[ 9] = U8TO32_LITTLE(k + 4);
wolfSSL 15:117db924cf7c 374 ctx->X[10] = U8TO32_LITTLE(k + 8);
wolfSSL 15:117db924cf7c 375 ctx->X[11] = U8TO32_LITTLE(k + 12);
wolfSSL 15:117db924cf7c 376 ctx->X[ 0] = constants[0];
wolfSSL 15:117db924cf7c 377 ctx->X[ 1] = constants[1];
wolfSSL 15:117db924cf7c 378 ctx->X[ 2] = constants[2];
wolfSSL 15:117db924cf7c 379 ctx->X[ 3] = constants[3];
wolfSSL 15:117db924cf7c 380
wolfSSL 15:117db924cf7c 381 return 0;
wolfSSL 15:117db924cf7c 382 }
wolfSSL 15:117db924cf7c 383
wolfSSL 15:117db924cf7c 384 /**
wolfSSL 15:117db924cf7c 385 * Converts word into bytes with rotations having been done.
wolfSSL 15:117db924cf7c 386 */
wolfSSL 15:117db924cf7c 387 static WC_INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS],
wolfSSL 15:117db924cf7c 388 const word32 input[CHACHA_CHUNK_WORDS])
wolfSSL 15:117db924cf7c 389 {
wolfSSL 15:117db924cf7c 390 word32 x[CHACHA_CHUNK_WORDS];
wolfSSL 15:117db924cf7c 391 word32 i;
wolfSSL 15:117db924cf7c 392
wolfSSL 15:117db924cf7c 393 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
wolfSSL 15:117db924cf7c 394 x[i] = input[i];
wolfSSL 15:117db924cf7c 395 }
wolfSSL 15:117db924cf7c 396
wolfSSL 15:117db924cf7c 397 for (i = (ROUNDS); i > 0; i -= 2) {
wolfSSL 15:117db924cf7c 398 QUARTERROUND(0, 4, 8, 12)
wolfSSL 15:117db924cf7c 399 QUARTERROUND(1, 5, 9, 13)
wolfSSL 15:117db924cf7c 400 QUARTERROUND(2, 6, 10, 14)
wolfSSL 15:117db924cf7c 401 QUARTERROUND(3, 7, 11, 15)
wolfSSL 15:117db924cf7c 402 QUARTERROUND(0, 5, 10, 15)
wolfSSL 15:117db924cf7c 403 QUARTERROUND(1, 6, 11, 12)
wolfSSL 15:117db924cf7c 404 QUARTERROUND(2, 7, 8, 13)
wolfSSL 15:117db924cf7c 405 QUARTERROUND(3, 4, 9, 14)
wolfSSL 15:117db924cf7c 406 }
wolfSSL 15:117db924cf7c 407
wolfSSL 15:117db924cf7c 408 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
wolfSSL 15:117db924cf7c 409 x[i] = PLUS(x[i], input[i]);
wolfSSL 15:117db924cf7c 410 }
wolfSSL 15:117db924cf7c 411
wolfSSL 15:117db924cf7c 412 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
wolfSSL 15:117db924cf7c 413 output[i] = LITTLE32(x[i]);
wolfSSL 15:117db924cf7c 414 }
wolfSSL 15:117db924cf7c 415 }
wolfSSL 15:117db924cf7c 416
wolfSSL 15:117db924cf7c 417
wolfSSL 15:117db924cf7c 418 #ifdef USE_INTEL_CHACHA_SPEEDUP
wolfSSL 15:117db924cf7c 419
/* QUARTERROUND_2_X64: expands to the assembly text for two ChaCha
 * quarter-rounds on 32-bit general-purpose registers, with the instructions
 * of the two interleaved one for one.
 *
 *   r11,r12,r13,r14  registers holding a, b, c, d of the first quarter-round
 *   r21,r22,r23,r24  registers holding a, b, c, d of the second
 *
 * Each quarter-round is: a += b; d ^= a; d <<<= 16; c += d; b ^= c;
 * b <<<= 12; a += b; d ^= a; d <<<= 8; c += d; b ^= c; b <<<= 7.
 *
 * The last line of the macro ends in a line continuation, so the line that
 * follows the definition must stay blank. */
wolfSSL 15:117db924cf7c 420 #define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
wolfSSL 15:117db924cf7c 421 "addl "#r12", "#r11"\n\t" \
wolfSSL 15:117db924cf7c 422 "addl "#r22", "#r21"\n\t" \
wolfSSL 15:117db924cf7c 423 "xorl "#r11", "#r14"\n\t" \
wolfSSL 15:117db924cf7c 424 "xorl "#r21", "#r24"\n\t" \
wolfSSL 15:117db924cf7c 425 "roll $16, "#r14"\n\t" \
wolfSSL 15:117db924cf7c 426 "roll $16, "#r24"\n\t" \
wolfSSL 15:117db924cf7c 427 "addl "#r14", "#r13"\n\t" \
wolfSSL 15:117db924cf7c 428 "addl "#r24", "#r23"\n\t" \
wolfSSL 15:117db924cf7c 429 "xorl "#r13", "#r12"\n\t" \
wolfSSL 15:117db924cf7c 430 "xorl "#r23", "#r22"\n\t" \
wolfSSL 15:117db924cf7c 431 "roll $12, "#r12"\n\t" \
wolfSSL 15:117db924cf7c 432 "roll $12, "#r22"\n\t" \
wolfSSL 15:117db924cf7c 433 "addl "#r12", "#r11"\n\t" \
wolfSSL 15:117db924cf7c 434 "addl "#r22", "#r21"\n\t" \
wolfSSL 15:117db924cf7c 435 "xorl "#r11", "#r14"\n\t" \
wolfSSL 15:117db924cf7c 436 "xorl "#r21", "#r24"\n\t" \
wolfSSL 15:117db924cf7c 437 "roll $8, "#r14"\n\t" \
wolfSSL 15:117db924cf7c 438 "roll $8, "#r24"\n\t" \
wolfSSL 15:117db924cf7c 439 "addl "#r14", "#r13"\n\t" \
wolfSSL 15:117db924cf7c 440 "addl "#r24", "#r23"\n\t" \
wolfSSL 15:117db924cf7c 441 "xorl "#r13", "#r12"\n\t" \
wolfSSL 15:117db924cf7c 442 "xorl "#r23", "#r22"\n\t" \
wolfSSL 15:117db924cf7c 443 "roll $7, "#r12"\n\t" \
wolfSSL 15:117db924cf7c 444 "roll $7, "#r22"\n\t" \
wolfSSL 15:117db924cf7c 445
/* CHACHA_CRYPT_X64: expands to the assembly text that runs the ChaCha rounds
 * on the 16-word state at %[input] using general-purpose registers. It is
 * the common front part of the CHACHA_*_X64 asm statements.
 *
 * Stack frame (40 bytes are reserved here and are NOT released by this
 * macro; the macro that uses it must add them back to %rsp):
 *    0(%rsp)       loop counter byte, starts at 10
 *    8..23(%rsp)   state words 8..11
 *   24(%rsp)       saved %rdi
 *   32(%rsp)       saved %rsi
 *
 * Register use during the rounds:
 *   eax, ebx, ecx, edx     state words 0..3
 *   r8d..r11d              state words 4..7
 *   r12d..r15d             state words 12..15
 *   esi, edi               whichever pair of words 8..11 the current
 *                          quarter-rounds need; the other pair sits on the
 *                          stack and the two are swapped as required
 *
 * Each of the 10 loop iterations issues four QUARTERROUND_2_X64, i.e. eight
 * quarter-rounds: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) followed by
 * (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
 *
 * On exit %rsi and %rdi hold their original values again, the registers for
 * words 0..7 and 12..15 have had the corresponding %[input] words added, and
 * words 8..11 are on the stack at 8..23(%rsp) WITHOUT the input words added.
 *
 * The last line of the macro ends in a line continuation, so the line that
 * follows the definition must stay blank. */
wolfSSL 15:117db924cf7c 446 #define CHACHA_CRYPT_X64() \
wolfSSL 15:117db924cf7c 447 "subq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 448 "movq 32(%[input]), %%rax\n\t" \
wolfSSL 15:117db924cf7c 449 "movq 40(%[input]), %%rdx\n\t" \
wolfSSL 15:117db924cf7c 450 "movq %%rax, 8(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 451 "movq %%rdx, 16(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 452 "movl 0(%[input]), %%eax\n\t" \
wolfSSL 15:117db924cf7c 453 "movl 4(%[input]), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 454 "movl 8(%[input]), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 455 "movl 12(%[input]), %%edx\n\t" \
wolfSSL 15:117db924cf7c 456 "movl 16(%[input]), %%r8d\n\t" \
wolfSSL 15:117db924cf7c 457 "movl 20(%[input]), %%r9d\n\t" \
wolfSSL 15:117db924cf7c 458 "movl 24(%[input]), %%r10d\n\t" \
wolfSSL 15:117db924cf7c 459 "movl 28(%[input]), %%r11d\n\t" \
wolfSSL 15:117db924cf7c 460 "movl 48(%[input]), %%r12d\n\t" \
wolfSSL 15:117db924cf7c 461 "movl 52(%[input]), %%r13d\n\t" \
wolfSSL 15:117db924cf7c 462 "movl 56(%[input]), %%r14d\n\t" \
wolfSSL 15:117db924cf7c 463 "movl 60(%[input]), %%r15d\n\t" \
wolfSSL 15:117db924cf7c 464 "movb $10, (%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 465 "movq %%rsi, 32(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 466 "movq %%rdi, 24(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 467 "movl 8(%%rsp), %%esi\n\t" \
wolfSSL 15:117db924cf7c 468 "movl 12(%%rsp), %%edi\n\t" \
wolfSSL 15:117db924cf7c 469 "\n" \
wolfSSL 15:117db924cf7c 470 "1:\n\t" \
wolfSSL 15:117db924cf7c 471 QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \
wolfSSL 15:117db924cf7c 472 %%ebx, %%r9d, %%edi, %%r13d) \
wolfSSL 15:117db924cf7c 473 "movl %%esi, 8(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 474 "movl %%edi, 12(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 475 "movl 16(%%rsp), %%esi\n\t" \
wolfSSL 15:117db924cf7c 476 "movl 20(%%rsp), %%edi\n\t" \
wolfSSL 15:117db924cf7c 477 QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \
wolfSSL 15:117db924cf7c 478 %%edx, %%r11d, %%edi, %%r15d) \
wolfSSL 15:117db924cf7c 479 QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \
wolfSSL 15:117db924cf7c 480 %%ebx, %%r10d, %%edi, %%r12d) \
wolfSSL 15:117db924cf7c 481 "movl %%esi, 16(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 482 "movl %%edi, 20(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 483 "movl 8(%%rsp), %%esi\n\t" \
wolfSSL 15:117db924cf7c 484 "movl 12(%%rsp), %%edi\n\t" \
wolfSSL 15:117db924cf7c 485 QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \
wolfSSL 15:117db924cf7c 486 %%edx, %%r8d, %%edi, %%r14d) \
wolfSSL 15:117db924cf7c 487 "decb (%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 488 "jnz 1b\n\t" \
wolfSSL 15:117db924cf7c 489 "movl %%esi, 8(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 490 "movl %%edi, 12(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 491 "movq 32(%%rsp), %%rsi\n\t" \
wolfSSL 15:117db924cf7c 492 "movq 24(%%rsp), %%rdi\n\t" \
wolfSSL 15:117db924cf7c 493 "addl 0(%[input]), %%eax\n\t" \
wolfSSL 15:117db924cf7c 494 "addl 4(%[input]), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 495 "addl 8(%[input]), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 496 "addl 12(%[input]), %%edx\n\t" \
wolfSSL 15:117db924cf7c 497 "addl 16(%[input]), %%r8d\n\t" \
wolfSSL 15:117db924cf7c 498 "addl 20(%[input]), %%r9d\n\t" \
wolfSSL 15:117db924cf7c 499 "addl 24(%[input]), %%r10d\n\t" \
wolfSSL 15:117db924cf7c 500 "addl 28(%[input]), %%r11d\n\t" \
wolfSSL 15:117db924cf7c 501 "addl 48(%[input]), %%r12d\n\t" \
wolfSSL 15:117db924cf7c 502 "addl 52(%[input]), %%r13d\n\t" \
wolfSSL 15:117db924cf7c 503 "addl 56(%[input]), %%r14d\n\t" \
wolfSSL 15:117db924cf7c 504 "addl 60(%[input]), %%r15d\n\t" \
wolfSSL 15:117db924cf7c 505
/* CHACHA_PARTIAL_CHUNK_X64: expands to one complete __asm__ __volatile__
 * statement (without a trailing ';') that generates a block of keystream
 * and XORs `bytes` bytes of it with the message.
 *
 * It refers to these names from the scope where it is expanded:
 *   ctx->X  state words, bound to %[input]; the word at byte offset 48 is
 *           incremented by one after the block has been generated
 *   x       buffer that receives the 64 keystream bytes, bound to %[c]
 *   c       destination pointer, bound to %[output]
 *   m       source (message) pointer, bound to %[m]
 *   bytes   number of bytes to process, bound to %[bytes]
 * Note that the asm operand called %[c] is the variable x, while the
 * variable c is the operand called %[output].
 *
 * Steps:
 *   1. CHACHA_CRYPT_X64 runs the rounds.
 *   2. Words 0..7 and 12..15 are stored from registers to x. Words 8..11 are
 *      loaded from the stack, get their %[input] words added, and are stored
 *      to x.
 *   3. The counter word is incremented and the 40-byte stack frame released.
 *      The "m"-constrained operands are only read after %rsp is restored -
 *      presumably because the compiler may address them relative to %rsp.
 *   4. XOR loop, with %edx as the byte index: first (bytes & 7) bytes are
 *      processed one at a time (label 2), then 8 bytes at a time (label 4)
 *      until the index equals bytes (label 3).
 *
 * When (bytes & 7) is zero, control jumps straight to label 4 and processes
 * 8 bytes before the index is first compared with bytes, so bytes must not
 * be 0. NOTE(review): callers presumably only use this for 0 < bytes < 64 -
 * confirm at the call site. */
wolfSSL 15:117db924cf7c 506 #define CHACHA_PARTIAL_CHUNK_X64() \
wolfSSL 15:117db924cf7c 507 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 508 CHACHA_CRYPT_X64() \
wolfSSL 15:117db924cf7c 509 "movl %%eax , 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 510 "movl %%ebx , 4(%[c])\n\t" \
wolfSSL 15:117db924cf7c 511 "movl %%ecx , 8(%[c])\n\t" \
wolfSSL 15:117db924cf7c 512 "movl %%edx , 12(%[c])\n\t" \
wolfSSL 15:117db924cf7c 513 "movl %%r8d , 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 514 "movl %%r9d , 20(%[c])\n\t" \
wolfSSL 15:117db924cf7c 515 "movl %%r10d, 24(%[c])\n\t" \
wolfSSL 15:117db924cf7c 516 "movl %%r11d, 28(%[c])\n\t" \
wolfSSL 15:117db924cf7c 517 "movl %%r12d, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 518 "movl %%r13d, 52(%[c])\n\t" \
wolfSSL 15:117db924cf7c 519 "movl %%r14d, 56(%[c])\n\t" \
wolfSSL 15:117db924cf7c 520 "movl %%r15d, 60(%[c])\n\t" \
wolfSSL 15:117db924cf7c 521 "movl 8(%%rsp), %%eax\n\t" \
wolfSSL 15:117db924cf7c 522 "movl 12(%%rsp), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 523 "movl 16(%%rsp), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 524 "movl 20(%%rsp), %%edx\n\t" \
wolfSSL 15:117db924cf7c 525 "addl 32(%[input]), %%eax\n\t" \
wolfSSL 15:117db924cf7c 526 "addl 36(%[input]), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 527 "addl 40(%[input]), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 528 "addl 44(%[input]), %%edx\n\t" \
wolfSSL 15:117db924cf7c 529 "movl %%eax , 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 530 "movl %%ebx , 36(%[c])\n\t" \
wolfSSL 15:117db924cf7c 531 "movl %%ecx , 40(%[c])\n\t" \
wolfSSL 15:117db924cf7c 532 "movl %%edx , 44(%[c])\n\t" \
wolfSSL 15:117db924cf7c 533 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 534 "addq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 535 "movq %[output], %%rax\n\t" \
wolfSSL 15:117db924cf7c 536 "movq %[m], %%rbx\n\t" \
wolfSSL 15:117db924cf7c 537 "movl %[bytes], %%r8d\n\t" \
wolfSSL 15:117db924cf7c 538 "xorq %%rdx, %%rdx\n\t" \
wolfSSL 15:117db924cf7c 539 "movl %%r8d, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 540 "andl $7, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 541 "jz 4f\n\t" \
wolfSSL 15:117db924cf7c 542 "\n" \
wolfSSL 15:117db924cf7c 543 "2:\n\t" \
wolfSSL 15:117db924cf7c 544 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 545 "xorb (%%rbx,%%rdx,1), %%cl\n\t" \
wolfSSL 15:117db924cf7c 546 "movb %%cl, (%%rax,%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 547 "incl %%edx\n\t" \
wolfSSL 15:117db924cf7c 548 "cmpl %%r9d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 549 "jne 2b\n\t" \
wolfSSL 15:117db924cf7c 550 "je 3f\n\t" \
wolfSSL 15:117db924cf7c 551 "\n" \
wolfSSL 15:117db924cf7c 552 "4:\n\t" \
wolfSSL 15:117db924cf7c 553 "movq (%[c],%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 554 "xorq (%%rbx,%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 555 "movq %%rcx, (%%rax,%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 556 "addl $8, %%edx\n\t" \
wolfSSL 15:117db924cf7c 557 "\n" \
wolfSSL 15:117db924cf7c 558 "3:\n\t" \
wolfSSL 15:117db924cf7c 559 "cmpl %%r8d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 560 "jne 4b\n\t" \
wolfSSL 15:117db924cf7c 561 : \
wolfSSL 15:117db924cf7c 562 : [input] "r" (ctx->X), [c] "r" (x), \
wolfSSL 15:117db924cf7c 563 [output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \
wolfSSL 15:117db924cf7c 564 : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
wolfSSL 15:117db924cf7c 565 "r14", "r15", "memory" \
wolfSSL 15:117db924cf7c 566 )
wolfSSL 15:117db924cf7c 567
wolfSSL 15:117db924cf7c 568
/* CHACHA_CHUNK_X64: expands to one __asm__ __volatile__ statement that
 * processes a full 64-byte chunk using general-purpose registers only.
 *
 * Uses the caller-scope names ctx, c and m:
 *   [input] = ctx->X (register), [c] = c (register), [m] = m (memory operand).
 *
 * Sequence, as written below:
 *   1. CHACHA_CRYPT_X64() runs first (defined elsewhere in this file).
 *   2. rsi is saved to 32(%rsp) and loaded with the message pointer.
 *      eax, ebx, ecx, edx and r8d-r11d are XORed with the message words at
 *      byte offsets 0..28, r12d-r15d with the words at offsets 48..60; rsi is
 *      restored and the twelve registers are stored to c at those offsets.
 *   3. The message pointer is reloaded into r8. Four words are read from
 *      8..20(%rsp), added to the words at 32..44(input), XORed with the
 *      message words at 32..44 and stored to c at 32..44.
 *   4. The 32-bit word at 48(input) is incremented by one.
 *
 * NOTE(review): rsp is raised by 40 around each read of %[m] and once more on
 * the last line; this presumably mirrors a 40-byte stack adjustment made
 * inside CHACHA_CRYPT_X64 so that a stack-relative %[m] resolves correctly -
 * confirm against that macro.
 * NOTE(review): rsi is preserved by hand and does not appear in the clobber
 * list.
 */
wolfSSL 15:117db924cf7c 569 #define CHACHA_CHUNK_X64() \
wolfSSL 15:117db924cf7c 570 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 571 CHACHA_CRYPT_X64() \
wolfSSL 15:117db924cf7c 572 "movq %%rsi, 32(%%rsp)\n\t" \
wolfSSL 15:117db924cf7c 573 "addq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 574 "movq %[m], %%rsi\n\t" \
wolfSSL 15:117db924cf7c 575 "subq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 576 "xorl 0(%%rsi), %%eax\n\t" \
wolfSSL 15:117db924cf7c 577 "xorl 4(%%rsi), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 578 "xorl 8(%%rsi), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 579 "xorl 12(%%rsi), %%edx\n\t" \
wolfSSL 15:117db924cf7c 580 "xorl 16(%%rsi), %%r8d\n\t" \
wolfSSL 15:117db924cf7c 581 "xorl 20(%%rsi), %%r9d\n\t" \
wolfSSL 15:117db924cf7c 582 "xorl 24(%%rsi), %%r10d\n\t" \
wolfSSL 15:117db924cf7c 583 "xorl 28(%%rsi), %%r11d\n\t" \
wolfSSL 15:117db924cf7c 584 "xorl 48(%%rsi), %%r12d\n\t" \
wolfSSL 15:117db924cf7c 585 "xorl 52(%%rsi), %%r13d\n\t" \
wolfSSL 15:117db924cf7c 586 "xorl 56(%%rsi), %%r14d\n\t" \
wolfSSL 15:117db924cf7c 587 "xorl 60(%%rsi), %%r15d\n\t" \
wolfSSL 15:117db924cf7c 588 "movq 32(%%rsp), %%rsi\n\t" \
wolfSSL 15:117db924cf7c 589 "movl %%eax , 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 590 "movl %%ebx , 4(%[c])\n\t" \
wolfSSL 15:117db924cf7c 591 "movl %%ecx , 8(%[c])\n\t" \
wolfSSL 15:117db924cf7c 592 "movl %%edx , 12(%[c])\n\t" \
wolfSSL 15:117db924cf7c 593 "movl %%r8d , 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 594 "movl %%r9d , 20(%[c])\n\t" \
wolfSSL 15:117db924cf7c 595 "movl %%r10d, 24(%[c])\n\t" \
wolfSSL 15:117db924cf7c 596 "movl %%r11d, 28(%[c])\n\t" \
wolfSSL 15:117db924cf7c 597 "movl %%r12d, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 598 "movl %%r13d, 52(%[c])\n\t" \
wolfSSL 15:117db924cf7c 599 "movl %%r14d, 56(%[c])\n\t" \
wolfSSL 15:117db924cf7c 600 "movl %%r15d, 60(%[c])\n\t" \
wolfSSL 15:117db924cf7c 601 "addq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 602 "movq %[m], %%r8\n\t" \
wolfSSL 15:117db924cf7c 603 "subq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 604 "movl 8(%%rsp), %%eax\n\t" \
wolfSSL 15:117db924cf7c 605 "movl 12(%%rsp), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 606 "movl 16(%%rsp), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 607 "movl 20(%%rsp), %%edx\n\t" \
wolfSSL 15:117db924cf7c 608 "addl 32(%[input]), %%eax\n\t" \
wolfSSL 15:117db924cf7c 609 "addl 36(%[input]), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 610 "addl 40(%[input]), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 611 "addl 44(%[input]), %%edx\n\t" \
wolfSSL 15:117db924cf7c 612 "xorl 32(%%r8), %%eax\n\t" \
wolfSSL 15:117db924cf7c 613 "xorl 36(%%r8), %%ebx\n\t" \
wolfSSL 15:117db924cf7c 614 "xorl 40(%%r8), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 615 "xorl 44(%%r8), %%edx\n\t" \
wolfSSL 15:117db924cf7c 616 "movl %%eax , 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 617 "movl %%ebx , 36(%[c])\n\t" \
wolfSSL 15:117db924cf7c 618 "movl %%ecx , 40(%[c])\n\t" \
wolfSSL 15:117db924cf7c 619 "movl %%edx , 44(%[c])\n\t" \
wolfSSL 15:117db924cf7c 620 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 621 "addq $40, %%rsp\n\t" \
wolfSSL 15:117db924cf7c 622 : \
wolfSSL 15:117db924cf7c 623 : [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \
wolfSSL 15:117db924cf7c 624 : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \
wolfSSL 15:117db924cf7c 625 "r14", "r15", "memory" \
wolfSSL 15:117db924cf7c 626 )
wolfSSL 15:117db924cf7c 627
wolfSSL 15:117db924cf7c 628
/* Process `bytes` bytes from m into c with the general-purpose-register
 * implementation.
 *
 * ctx    ChaCha object; the asm macros take ctx->X as their [input] operand.
 * m      source bytes.
 * c      destination bytes.
 * bytes  number of bytes to process; zero returns immediately.
 *
 * Chunks of CHACHA_CHUNK_BYTES go through CHACHA_CHUNK_X64(); whatever is
 * left afterwards goes through CHACHA_PARTIAL_CHUNK_X64(). Both macros refer
 * to the locals ctx, m, c, bytes and x by name, so those names are part of
 * the contract between this function and the macros.
 */
wolfSSL 15:117db924cf7c 629 static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
wolfSSL 15:117db924cf7c 630 word32 bytes)
wolfSSL 15:117db924cf7c 631 {
/* Scratch block handed to CHACHA_PARTIAL_CHUNK_X64() as its [c] operand. */
wolfSSL 15:117db924cf7c 632 word32 x[CHACHA_CHUNK_WORDS];
wolfSSL 15:117db924cf7c 633
wolfSSL 15:117db924cf7c 634 if (bytes == 0)
wolfSSL 15:117db924cf7c 635 return;
wolfSSL 15:117db924cf7c 636
/* Whole chunks: the macro processes one chunk, the pointers and the count
 * are advanced here in C. */
wolfSSL 15:117db924cf7c 637 for (; bytes >= CHACHA_CHUNK_BYTES;) {
wolfSSL 15:117db924cf7c 638 CHACHA_CHUNK_X64();
wolfSSL 15:117db924cf7c 639 bytes -= CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 640 c += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 641 m += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 642 }
/* Trailing bytes that do not fill a whole chunk. */
wolfSSL 15:117db924cf7c 643 if (bytes > 0) {
wolfSSL 15:117db924cf7c 644 CHACHA_PARTIAL_CHUNK_X64();
wolfSSL 15:117db924cf7c 645 }
wolfSSL 15:117db924cf7c 646 }
wolfSSL 15:117db924cf7c 647
/* Byte-shuffle masks passed to pshufb/vpshufb as the %[rotl8] and %[rotl16]
 * asm operands. Within every 32-bit lane the selected source bytes are
 * (3,0,1,2) for rotl8 and (2,3,0,1) for rotl16, i.e. a left rotation of the
 * lane by 8 and by 16 bits respectively. */
wolfSSL 15:117db924cf7c 648 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
wolfSSL 15:117db924cf7c 649 static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
wolfSSL 15:117db924cf7c 650 static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
wolfSSL 15:117db924cf7c 651 #endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
wolfSSL 15:117db924cf7c 652
wolfSSL 15:117db924cf7c 653 #ifdef HAVE_INTEL_AVX1
/* QUARTERROUND_2_AVX: assembly text for one ChaCha double round on a single
 * block held as four rows in xmm0 (a), xmm1 (b), xmm2 (c) and xmm3 (d);
 * xmm4 is used as a temporary.
 *
 * Each half performs, on all four lanes at once:
 *   a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7;
 * The 16- and 8-bit rotations are byte shuffles (pshufb with the %[rotl16]
 * and %[rotl8] operands); the 12- and 7-bit rotations are a shift-left /
 * shift-right pair merged with pxor.
 *
 * Between the halves, pshufd $0x39 / $0x4e / $0x93 rotates the lanes of rows
 * b / c / d by one / two / three positions so that the same code then works
 * on the diagonals; after the second half the inverse immediates
 * ($0x93 / $0x4e / $0x39) put the lanes back.
 *
 * The enclosing asm statement must provide the [rotl8] and [rotl16] operands
 * and list xmm0-xmm4 as clobbered. The mnemonics are the legacy (non-VEX)
 * SSE forms.
 */
wolfSSL 15:117db924cf7c 654 #define QUARTERROUND_2_AVX() \
wolfSSL 15:117db924cf7c 655 "paddd %%xmm1, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 656 "pxor %%xmm0, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 657 "pshufb %[rotl16], %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 658 "paddd %%xmm3, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 659 "pxor %%xmm2, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 660 "movdqa %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 661 "pslld $12, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 662 "psrld $20, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 663 "pxor %%xmm4, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 664 "paddd %%xmm1, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 665 "pxor %%xmm0, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 666 "pshufb %[rotl8], %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 667 "paddd %%xmm3, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 668 "pxor %%xmm2, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 669 "movdqa %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 670 "pslld $7, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 671 "psrld $25, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 672 "pxor %%xmm4, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 673 "# Swap words for next round\n\t" \
wolfSSL 15:117db924cf7c 674 "pshufd $0x39, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 675 "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 676 "pshufd $0x93, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 677 "paddd %%xmm1, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 678 "pxor %%xmm0, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 679 "pshufb %[rotl16], %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 680 "paddd %%xmm3, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 681 "pxor %%xmm2, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 682 "movdqa %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 683 "pslld $12, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 684 "psrld $20, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 685 "pxor %%xmm4, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 686 "paddd %%xmm1, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 687 "pxor %%xmm0, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 688 "pshufb %[rotl8], %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 689 "paddd %%xmm3, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 690 "pxor %%xmm2, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 691 "movdqa %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 692 "pslld $7, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 693 "psrld $25, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 694 "pxor %%xmm4, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 695 "# Swap words back\n\t" \
wolfSSL 15:117db924cf7c 696 "pshufd $0x93, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 697 "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 698 "pshufd $0x39, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 699
/* CHACHA_CRYPT_AVX: assembly text that turns the 64 bytes at %[input] into
 * one 64-byte output block in xmm0-xmm3.
 *
 * The four 16-byte rows are loaded into xmm0-xmm3, QUARTERROUND_2_AVX() is
 * executed ten times (al counts down from 10, looping on local label 1), and
 * the original rows are reloaded into xmm4-xmm7 and added lane-wise to the
 * result.
 *
 * Requirements on the enclosing asm statement: an [input] register operand,
 * the [rotl8]/[rotl16] operands needed by QUARTERROUND_2_AVX(), and clobbers
 * covering al and xmm0-xmm7. Local label 1 is taken.
 */
wolfSSL 15:117db924cf7c 700 #define CHACHA_CRYPT_AVX() \
wolfSSL 15:117db924cf7c 701 "movdqu 0(%[input]), %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 702 "movdqu 16(%[input]), %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 703 "movdqu 32(%[input]), %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 704 "movdqu 48(%[input]), %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 705 "movb $10, %%al\n\t" \
wolfSSL 15:117db924cf7c 706 "\n" \
wolfSSL 15:117db924cf7c 707 "1:\n\t" \
wolfSSL 15:117db924cf7c 708 QUARTERROUND_2_AVX() \
wolfSSL 15:117db924cf7c 709 "decb %%al\n\t" \
wolfSSL 15:117db924cf7c 710 "jnz 1b\n\t" \
wolfSSL 15:117db924cf7c 711 "movdqu 0(%[input]), %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 712 "movdqu 16(%[input]), %%xmm5\n\t" \
wolfSSL 15:117db924cf7c 713 "movdqu 32(%[input]), %%xmm6\n\t" \
wolfSSL 15:117db924cf7c 714 "movdqu 48(%[input]), %%xmm7\n\t" \
wolfSSL 15:117db924cf7c 715 "paddd %%xmm4, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 716 "paddd %%xmm5, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 717 "paddd %%xmm6, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 718 "paddd %%xmm7, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 719
/* CHACHA_PARTIAL_CHUNK_AVX: expands to one asm statement that processes a
 * trailing chunk of `bytes` bytes (the caller in view invokes it once fewer
 * than CHACHA_CHUNK_BYTES bytes remain).
 *
 * Uses the caller-scope names ctx, x, c, bytes and m:
 *   [input] = ctx->X, [c] = x (scratch buffer), [output] = c,
 *   [bytes] = bytes, [m] = m.
 *
 * CHACHA_CRYPT_AVX() leaves a 64-byte block in xmm0-xmm3; it is stored to the
 * scratch buffer and the 32-bit word at 48(input) is incremented by one.
 * The copy-out then writes scratch[i] ^ m[i] to output[i] for i in
 * [0, bytes): rdx is the running index, r8d = bytes and r9d = bytes & 7.
 * When r9d is non-zero the byte loop at label 2 handles the first r9d bytes;
 * the loop at label 4 then continues eight bytes at a time until the index
 * equals r8d. Because the byte loop runs first, the eight-byte loop never
 * touches memory beyond index bytes-1.
 *
 * NOTE(review): the "xrm" constraint on [rotl8]/[rotl16] also permits a
 * general-purpose register, which pshufb cannot take as an operand - confirm
 * the compiler never selects that alternative for a __m128i.
 */
wolfSSL 15:117db924cf7c 720 #define CHACHA_PARTIAL_CHUNK_AVX() \
wolfSSL 15:117db924cf7c 721 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 722 CHACHA_CRYPT_AVX() \
wolfSSL 15:117db924cf7c 723 "movdqu %%xmm0, 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 724 "movdqu %%xmm1, 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 725 "movdqu %%xmm2, 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 726 "movdqu %%xmm3, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 727 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 728 "movl %[bytes], %%r8d\n\t" \
wolfSSL 15:117db924cf7c 729 "xorq %%rdx, %%rdx\n\t" \
wolfSSL 15:117db924cf7c 730 "movl %%r8d, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 731 "andl $7, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 732 "jz 4f\n\t" \
wolfSSL 15:117db924cf7c 733 "\n" \
wolfSSL 15:117db924cf7c 734 "2:\n\t" \
wolfSSL 15:117db924cf7c 735 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 736 "xorb (%[m],%%rdx,1), %%cl\n\t" \
wolfSSL 15:117db924cf7c 737 "movb %%cl, (%[output],%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 738 "incl %%edx\n\t" \
wolfSSL 15:117db924cf7c 739 "cmpl %%r9d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 740 "jne 2b\n\t" \
wolfSSL 15:117db924cf7c 741 "je 3f\n\t" \
wolfSSL 15:117db924cf7c 742 "\n" \
wolfSSL 15:117db924cf7c 743 "4:\n\t" \
wolfSSL 15:117db924cf7c 744 "movq (%[c],%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 745 "xorq (%[m],%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 746 "movq %%rcx, (%[output],%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 747 "addl $8, %%edx\n\t" \
wolfSSL 15:117db924cf7c 748 "\n" \
wolfSSL 15:117db924cf7c 749 "3:\n\t" \
wolfSSL 15:117db924cf7c 750 "cmpl %%r8d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 751 "jne 4b\n\t" \
wolfSSL 15:117db924cf7c 752 : \
wolfSSL 15:117db924cf7c 753 : [input] "r" (ctx->X), [c] "r" (x), \
wolfSSL 15:117db924cf7c 754 [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
wolfSSL 15:117db924cf7c 755 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
wolfSSL 15:117db924cf7c 756 : "eax", "ecx", "edx", "r8", "r9", "memory", \
wolfSSL 15:117db924cf7c 757 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
wolfSSL 15:117db924cf7c 758 )
wolfSSL 15:117db924cf7c 759
wolfSSL 15:117db924cf7c 760
/* CHACHA_CHUNK_AVX: expands to one asm statement that processes one full
 * 64-byte chunk.
 *
 * Uses the caller-scope names ctx, c and m:
 *   [input] = ctx->X, [c] = c (destination), [m] = m (source).
 *
 * CHACHA_CRYPT_AVX() leaves a 64-byte block in xmm0-xmm3; the 64 source
 * bytes are loaded into xmm4-xmm7 with unaligned loads, XORed into
 * xmm0-xmm3 and written to the destination with unaligned stores. Finally
 * the 32-bit word at 48(input) is incremented by one. Advancing c, m and the
 * remaining length is left to the caller.
 */
wolfSSL 15:117db924cf7c 761 #define CHACHA_CHUNK_AVX() \
wolfSSL 15:117db924cf7c 762 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 763 CHACHA_CRYPT_AVX() \
wolfSSL 15:117db924cf7c 764 "movdqu 0(%[m]), %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 765 "movdqu 16(%[m]), %%xmm5\n\t" \
wolfSSL 15:117db924cf7c 766 "movdqu 32(%[m]), %%xmm6\n\t" \
wolfSSL 15:117db924cf7c 767 "movdqu 48(%[m]), %%xmm7\n\t" \
wolfSSL 15:117db924cf7c 768 "pxor %%xmm4, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 769 "pxor %%xmm5, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 770 "pxor %%xmm6, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 771 "pxor %%xmm7, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 772 "movdqu %%xmm0, 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 773 "movdqu %%xmm1, 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 774 "movdqu %%xmm2, 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 775 "movdqu %%xmm3, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 776 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 777 : \
wolfSSL 15:117db924cf7c 778 : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
wolfSSL 15:117db924cf7c 779 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
wolfSSL 15:117db924cf7c 780 : "rax", "memory", \
wolfSSL 15:117db924cf7c 781 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \
wolfSSL 15:117db924cf7c 782 )
wolfSSL 15:117db924cf7c 783
/* Process `bytes` bytes from m into c using the AVX1 code paths.
 *
 * ctx    ChaCha object; ctx->X is read as the 16-word state and the 32-bit
 *        word at byte offset 48 is advanced as blocks are produced.
 * m      source bytes.
 * c      destination bytes.
 * bytes  number of bytes to process; zero returns immediately.
 *
 * Three stages:
 *   1. While at least 256 bytes remain, the asm statement below produces four
 *      64-byte blocks per iteration, one block per 32-bit lane of
 *      xmm0-xmm15.
 *   2. Remaining full chunks go through CHACHA_CHUNK_AVX().
 *   3. A final short chunk goes through CHACHA_PARTIAL_CHUNK_AVX().
 *
 * The asm defines the global labels L_end128, L_enc128_loop and L_done, so
 * it must be emitted only once per object file; the function is declared
 * CHACHA20_NOINLINE (presumably for that reason).
 *
 * NOTE(review): "vpshufd $0, off(%[key])" loads 16 bytes starting at `off`;
 * for off = 52, 56 and 60 the load extends past byte 63 of the state.
 * Confirm the object behind ctx->X is large enough for those loads.
 */
wolfSSL 15:117db924cf7c 784 CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m,
wolfSSL 15:117db924cf7c 785 byte* c, word32 bytes)
wolfSSL 15:117db924cf7c 786 {
/* X: sixteen 16-byte vectors holding the four-lane input state.
 * x: eight 16-byte vectors of spill space, also reused as the scratch block
 *    of CHACHA_PARTIAL_CHUNK_AVX(). */
wolfSSL 15:117db924cf7c 787 ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
wolfSSL 15:117db924cf7c 788 ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
wolfSSL 15:117db924cf7c 789 word32 cnt = 0;
/* add:  per-lane offsets 0,1,2,3 applied to the word at state offset 48.
 * four: added to that word in every lane after each 256-byte iteration. */
wolfSSL 15:117db924cf7c 790 static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL };
wolfSSL 15:117db924cf7c 791 static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL };
wolfSSL 15:117db924cf7c 792
wolfSSL 15:117db924cf7c 793 if (bytes == 0)
wolfSSL 15:117db924cf7c 794 return;
wolfSSL 15:117db924cf7c 795
wolfSSL 15:117db924cf7c 796 __asm__ __volatile__ (
/* cnt = bytes / 256 = number of four-block iterations; skip the whole
 * wide path when there is not even one. */
wolfSSL 15:117db924cf7c 797 "movl %[bytes], %[cnt]\n\t"
wolfSSL 15:117db924cf7c 798 "shrl $8, %[cnt]\n\t"
wolfSSL 15:117db924cf7c 799 "jz L_end128\n\t"
wolfSSL 15:117db924cf7c 800
/* Broadcast each of the 16 state words to all four lanes of its own
 * register. */
wolfSSL 15:117db924cf7c 801 "vpshufd $0, (%[key]), %%xmm0\n\t"
wolfSSL 15:117db924cf7c 802 "vpshufd $0, 4(%[key]), %%xmm1\n\t"
wolfSSL 15:117db924cf7c 803 "vpshufd $0, 8(%[key]), %%xmm2\n\t"
wolfSSL 15:117db924cf7c 804 "vpshufd $0, 12(%[key]), %%xmm3\n\t"
wolfSSL 15:117db924cf7c 805 "vpshufd $0, 16(%[key]), %%xmm4\n\t"
wolfSSL 15:117db924cf7c 806 "vpshufd $0, 20(%[key]), %%xmm5\n\t"
wolfSSL 15:117db924cf7c 807 "vpshufd $0, 24(%[key]), %%xmm6\n\t"
wolfSSL 15:117db924cf7c 808 "vpshufd $0, 28(%[key]), %%xmm7\n\t"
wolfSSL 15:117db924cf7c 809 "vpshufd $0, 32(%[key]), %%xmm8\n\t"
wolfSSL 15:117db924cf7c 810 "vpshufd $0, 36(%[key]), %%xmm9\n\t"
wolfSSL 15:117db924cf7c 811 "vpshufd $0, 40(%[key]), %%xmm10\n\t"
wolfSSL 15:117db924cf7c 812 "vpshufd $0, 44(%[key]), %%xmm11\n\t"
wolfSSL 15:117db924cf7c 813 "vpshufd $0, 48(%[key]), %%xmm12\n\t"
wolfSSL 15:117db924cf7c 814 "vpshufd $0, 52(%[key]), %%xmm13\n\t"
wolfSSL 15:117db924cf7c 815 "vpshufd $0, 56(%[key]), %%xmm14\n\t"
wolfSSL 15:117db924cf7c 816 "vpshufd $0, 60(%[key]), %%xmm15\n\t"
wolfSSL 15:117db924cf7c 817
/* Lane i of word 12 becomes the stored value + i. */
wolfSSL 15:117db924cf7c 818 "vpaddd %[add], %%xmm12, %%xmm12\n\t"
wolfSSL 15:117db924cf7c 819
/* Keep the four-lane input state in X for the final addition and for
 * reloading at the top of the next iteration. */
wolfSSL 15:117db924cf7c 820 "vmovdqa %%xmm0, (%[X])\n\t"
wolfSSL 15:117db924cf7c 821 "vmovdqa %%xmm1, 16(%[X])\n\t"
wolfSSL 15:117db924cf7c 822 "vmovdqa %%xmm2, 32(%[X])\n\t"
wolfSSL 15:117db924cf7c 823 "vmovdqa %%xmm3, 48(%[X])\n\t"
wolfSSL 15:117db924cf7c 824 "vmovdqa %%xmm4, 64(%[X])\n\t"
wolfSSL 15:117db924cf7c 825 "vmovdqa %%xmm5, 80(%[X])\n\t"
wolfSSL 15:117db924cf7c 826 "vmovdqa %%xmm6, 96(%[X])\n\t"
wolfSSL 15:117db924cf7c 827 "vmovdqa %%xmm7, 112(%[X])\n\t"
wolfSSL 15:117db924cf7c 828 "vmovdqa %%xmm8, 128(%[X])\n\t"
wolfSSL 15:117db924cf7c 829 "vmovdqa %%xmm9, 144(%[X])\n\t"
wolfSSL 15:117db924cf7c 830 "vmovdqa %%xmm10, 160(%[X])\n\t"
wolfSSL 15:117db924cf7c 831 "vmovdqa %%xmm11, 176(%[X])\n\t"
wolfSSL 15:117db924cf7c 832 "vmovdqa %%xmm12, 192(%[X])\n\t"
wolfSSL 15:117db924cf7c 833 "vmovdqa %%xmm13, 208(%[X])\n\t"
wolfSSL 15:117db924cf7c 834 "vmovdqa %%xmm14, 224(%[X])\n\t"
wolfSSL 15:117db924cf7c 835 "vmovdqa %%xmm15, 240(%[X])\n\t"
wolfSSL 15:117db924cf7c 836 "\n"
wolfSSL 15:117db924cf7c 837 "L_enc128_loop:\n\t"
/* xmm11 is parked at 48(x) for the duration of the rounds and reloaded
 * afterwards (presumably the QUARTERROUND_XMM macros, defined elsewhere,
 * need it as a temporary - confirm). Ten pairs of macros follow. */
wolfSSL 15:117db924cf7c 838 "vmovdqa %%xmm11, 48(%[x])\n\t"
wolfSSL 15:117db924cf7c 839 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 840 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 841 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 842 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 843 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 844 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 845 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 846 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 847 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 848 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 849 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 850 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 851 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 852 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 853 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 854 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 855 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 856 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 857 QUARTERROUND_XMM()
wolfSSL 15:117db924cf7c 858 QUARTERROUND_XMM_2()
wolfSSL 15:117db924cf7c 859 "vmovdqa 48(%[x]), %%xmm11\n\t"
wolfSSL 15:117db924cf7c 860
/* Add the saved input state to the round output. */
wolfSSL 15:117db924cf7c 861 "vpaddd (%[X]), %%xmm0, %%xmm0\n\t"
wolfSSL 15:117db924cf7c 862 "vpaddd 16(%[X]), %%xmm1, %%xmm1\n\t"
wolfSSL 15:117db924cf7c 863 "vpaddd 32(%[X]), %%xmm2, %%xmm2\n\t"
wolfSSL 15:117db924cf7c 864 "vpaddd 48(%[X]), %%xmm3, %%xmm3\n\t"
wolfSSL 15:117db924cf7c 865 "vpaddd 64(%[X]), %%xmm4, %%xmm4\n\t"
wolfSSL 15:117db924cf7c 866 "vpaddd 80(%[X]), %%xmm5, %%xmm5\n\t"
wolfSSL 15:117db924cf7c 867 "vpaddd 96(%[X]), %%xmm6, %%xmm6\n\t"
wolfSSL 15:117db924cf7c 868 "vpaddd 112(%[X]), %%xmm7, %%xmm7\n\t"
wolfSSL 15:117db924cf7c 869 "vpaddd 128(%[X]), %%xmm8, %%xmm8\n\t"
wolfSSL 15:117db924cf7c 870 "vpaddd 144(%[X]), %%xmm9, %%xmm9\n\t"
wolfSSL 15:117db924cf7c 871 "vpaddd 160(%[X]), %%xmm10, %%xmm10\n\t"
wolfSSL 15:117db924cf7c 872 "vpaddd 176(%[X]), %%xmm11, %%xmm11\n\t"
wolfSSL 15:117db924cf7c 873 "vpaddd 192(%[X]), %%xmm12, %%xmm12\n\t"
wolfSSL 15:117db924cf7c 874 "vpaddd 208(%[X]), %%xmm13, %%xmm13\n\t"
wolfSSL 15:117db924cf7c 875 "vpaddd 224(%[X]), %%xmm14, %%xmm14\n\t"
wolfSSL 15:117db924cf7c 876 "vpaddd 240(%[X]), %%xmm15, %%xmm15\n\t"
wolfSSL 15:117db924cf7c 877
/* Spill words 8-15 so xmm8-xmm15 can serve as temporaries while words 0-7
 * are transposed and written out. */
wolfSSL 15:117db924cf7c 878 "vmovdqa %%xmm8, (%[x])\n\t"
wolfSSL 15:117db924cf7c 879 "vmovdqa %%xmm9, 16(%[x])\n\t"
wolfSSL 15:117db924cf7c 880 "vmovdqa %%xmm10, 32(%[x])\n\t"
wolfSSL 15:117db924cf7c 881 "vmovdqa %%xmm11, 48(%[x])\n\t"
wolfSSL 15:117db924cf7c 882 "vmovdqa %%xmm12, 64(%[x])\n\t"
wolfSSL 15:117db924cf7c 883 "vmovdqa %%xmm13, 80(%[x])\n\t"
wolfSSL 15:117db924cf7c 884 "vmovdqa %%xmm14, 96(%[x])\n\t"
wolfSSL 15:117db924cf7c 885 "vmovdqa %%xmm15, 112(%[x])\n\t"
wolfSSL 15:117db924cf7c 886
/* Transpose words 0-7 from "one word x four blocks" per register into
 * "four consecutive words of one block" per register, XOR with the input
 * and store bytes 0-31 of each of the four blocks. */
wolfSSL 15:117db924cf7c 887 "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t"
wolfSSL 15:117db924cf7c 888 "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t"
wolfSSL 15:117db924cf7c 889 "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t"
wolfSSL 15:117db924cf7c 890 "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t"
wolfSSL 15:117db924cf7c 891 "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t"
wolfSSL 15:117db924cf7c 892 "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t"
wolfSSL 15:117db924cf7c 893 "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t"
wolfSSL 15:117db924cf7c 894 "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t"
wolfSSL 15:117db924cf7c 895 "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t"
wolfSSL 15:117db924cf7c 896 "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t"
wolfSSL 15:117db924cf7c 897 "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t"
wolfSSL 15:117db924cf7c 898 "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t"
wolfSSL 15:117db924cf7c 899 "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t"
wolfSSL 15:117db924cf7c 900 "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t"
wolfSSL 15:117db924cf7c 901 "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t"
wolfSSL 15:117db924cf7c 902 "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t"
wolfSSL 15:117db924cf7c 903 "vmovdqu (%[in]), %%xmm8\n\t"
wolfSSL 15:117db924cf7c 904 "vmovdqu 16(%[in]), %%xmm9\n\t"
wolfSSL 15:117db924cf7c 905 "vmovdqu 64(%[in]), %%xmm10\n\t"
wolfSSL 15:117db924cf7c 906 "vmovdqu 80(%[in]), %%xmm11\n\t"
wolfSSL 15:117db924cf7c 907 "vmovdqu 128(%[in]), %%xmm12\n\t"
wolfSSL 15:117db924cf7c 908 "vmovdqu 144(%[in]), %%xmm13\n\t"
wolfSSL 15:117db924cf7c 909 "vmovdqu 192(%[in]), %%xmm14\n\t"
wolfSSL 15:117db924cf7c 910 "vmovdqu 208(%[in]), %%xmm15\n\t"
wolfSSL 15:117db924cf7c 911 "vpxor %%xmm8, %%xmm0, %%xmm0\n\t"
wolfSSL 15:117db924cf7c 912 "vpxor %%xmm9, %%xmm1, %%xmm1\n\t"
wolfSSL 15:117db924cf7c 913 "vpxor %%xmm10, %%xmm2, %%xmm2\n\t"
wolfSSL 15:117db924cf7c 914 "vpxor %%xmm11, %%xmm3, %%xmm3\n\t"
wolfSSL 15:117db924cf7c 915 "vpxor %%xmm12, %%xmm4, %%xmm4\n\t"
wolfSSL 15:117db924cf7c 916 "vpxor %%xmm13, %%xmm5, %%xmm5\n\t"
wolfSSL 15:117db924cf7c 917 "vpxor %%xmm14, %%xmm6, %%xmm6\n\t"
wolfSSL 15:117db924cf7c 918 "vpxor %%xmm15, %%xmm7, %%xmm7\n\t"
wolfSSL 15:117db924cf7c 919 "vmovdqu %%xmm0, (%[out])\n\t"
wolfSSL 15:117db924cf7c 920 "vmovdqu %%xmm1, 16(%[out])\n\t"
wolfSSL 15:117db924cf7c 921 "vmovdqu %%xmm2, 64(%[out])\n\t"
wolfSSL 15:117db924cf7c 922 "vmovdqu %%xmm3, 80(%[out])\n\t"
wolfSSL 15:117db924cf7c 923 "vmovdqu %%xmm4, 128(%[out])\n\t"
wolfSSL 15:117db924cf7c 924 "vmovdqu %%xmm5, 144(%[out])\n\t"
wolfSSL 15:117db924cf7c 925 "vmovdqu %%xmm6, 192(%[out])\n\t"
wolfSSL 15:117db924cf7c 926 "vmovdqu %%xmm7, 208(%[out])\n\t"
wolfSSL 15:117db924cf7c 927
/* Bring back words 8-15 and repeat the transpose/XOR/store for bytes
 * 32-63 of each block. */
wolfSSL 15:117db924cf7c 928 "vmovdqa (%[x]), %%xmm0\n\t"
wolfSSL 15:117db924cf7c 929 "vmovdqa 16(%[x]), %%xmm1\n\t"
wolfSSL 15:117db924cf7c 930 "vmovdqa 32(%[x]), %%xmm2\n\t"
wolfSSL 15:117db924cf7c 931 "vmovdqa 48(%[x]), %%xmm3\n\t"
wolfSSL 15:117db924cf7c 932 "vmovdqa 64(%[x]), %%xmm4\n\t"
wolfSSL 15:117db924cf7c 933 "vmovdqa 80(%[x]), %%xmm5\n\t"
wolfSSL 15:117db924cf7c 934 "vmovdqa 96(%[x]), %%xmm6\n\t"
wolfSSL 15:117db924cf7c 935 "vmovdqa 112(%[x]), %%xmm7\n\t"
wolfSSL 15:117db924cf7c 936
wolfSSL 15:117db924cf7c 937 "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t"
wolfSSL 15:117db924cf7c 938 "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t"
wolfSSL 15:117db924cf7c 939 "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t"
wolfSSL 15:117db924cf7c 940 "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t"
wolfSSL 15:117db924cf7c 941 "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t"
wolfSSL 15:117db924cf7c 942 "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t"
wolfSSL 15:117db924cf7c 943 "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t"
wolfSSL 15:117db924cf7c 944 "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t"
wolfSSL 15:117db924cf7c 945 "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t"
wolfSSL 15:117db924cf7c 946 "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t"
wolfSSL 15:117db924cf7c 947 "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t"
wolfSSL 15:117db924cf7c 948 "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t"
wolfSSL 15:117db924cf7c 949 "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t"
wolfSSL 15:117db924cf7c 950 "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t"
wolfSSL 15:117db924cf7c 951 "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t"
wolfSSL 15:117db924cf7c 952 "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t"
wolfSSL 15:117db924cf7c 953 "vmovdqu 32(%[in]), %%xmm8\n\t"
wolfSSL 15:117db924cf7c 954 "vmovdqu 48(%[in]), %%xmm9\n\t"
wolfSSL 15:117db924cf7c 955 "vmovdqu 96(%[in]), %%xmm10\n\t"
wolfSSL 15:117db924cf7c 956 "vmovdqu 112(%[in]), %%xmm11\n\t"
wolfSSL 15:117db924cf7c 957 "vmovdqu 160(%[in]), %%xmm12\n\t"
wolfSSL 15:117db924cf7c 958 "vmovdqu 176(%[in]), %%xmm13\n\t"
wolfSSL 15:117db924cf7c 959 "vmovdqu 224(%[in]), %%xmm14\n\t"
wolfSSL 15:117db924cf7c 960 "vmovdqu 240(%[in]), %%xmm15\n\t"
wolfSSL 15:117db924cf7c 961 "vpxor %%xmm8, %%xmm0, %%xmm0\n\t"
wolfSSL 15:117db924cf7c 962 "vpxor %%xmm9, %%xmm1, %%xmm1\n\t"
wolfSSL 15:117db924cf7c 963 "vpxor %%xmm10, %%xmm2, %%xmm2\n\t"
wolfSSL 15:117db924cf7c 964 "vpxor %%xmm11, %%xmm3, %%xmm3\n\t"
wolfSSL 15:117db924cf7c 965 "vpxor %%xmm12, %%xmm4, %%xmm4\n\t"
wolfSSL 15:117db924cf7c 966 "vpxor %%xmm13, %%xmm5, %%xmm5\n\t"
wolfSSL 15:117db924cf7c 967 "vpxor %%xmm14, %%xmm6, %%xmm6\n\t"
wolfSSL 15:117db924cf7c 968 "vpxor %%xmm15, %%xmm7, %%xmm7\n\t"
wolfSSL 15:117db924cf7c 969 "vmovdqu %%xmm0, 32(%[out])\n\t"
wolfSSL 15:117db924cf7c 970 "vmovdqu %%xmm1, 48(%[out])\n\t"
wolfSSL 15:117db924cf7c 971 "vmovdqu %%xmm2, 96(%[out])\n\t"
wolfSSL 15:117db924cf7c 972 "vmovdqu %%xmm3, 112(%[out])\n\t"
wolfSSL 15:117db924cf7c 973 "vmovdqu %%xmm4, 160(%[out])\n\t"
wolfSSL 15:117db924cf7c 974 "vmovdqu %%xmm5, 176(%[out])\n\t"
wolfSSL 15:117db924cf7c 975 "vmovdqu %%xmm6, 224(%[out])\n\t"
wolfSSL 15:117db924cf7c 976 "vmovdqu %%xmm7, 240(%[out])\n\t"
wolfSSL 15:117db924cf7c 977
/* Advance the pointers by 256 bytes and the four lane counters by four. */
wolfSSL 15:117db924cf7c 978 "vmovdqa 192(%[X]), %%xmm12\n\t"
wolfSSL 15:117db924cf7c 979 "add $256, %[in]\n\t"
wolfSSL 15:117db924cf7c 980 "add $256, %[out]\n\t"
wolfSSL 15:117db924cf7c 981 "vpaddd %[four], %%xmm12, %%xmm12\n\t"
wolfSSL 15:117db924cf7c 982 "sub $256, %[bytes]\n\t"
wolfSSL 15:117db924cf7c 983 "vmovdqa %%xmm12, 192(%[X])\n\t"
/* The remaining length is a byte count (word32, expected to be an unsigned
 * type - confirm), so the exit test is an unsigned compare. A signed test
 * would leave the loop after one pass whenever the remaining length has
 * bit 31 set, while cnt below still accounts for every 256-byte group. */
wolfSSL 15:117db924cf7c 984 "cmp $256, %[bytes]\n\t"
wolfSSL 15:117db924cf7c 985 "jb L_done\n\t"
wolfSSL 15:117db924cf7c 986
/* Reload the four-lane input state for the next iteration. */
wolfSSL 15:117db924cf7c 987 "vmovdqa (%[X]), %%xmm0\n\t"
wolfSSL 15:117db924cf7c 988 "vmovdqa 16(%[X]), %%xmm1\n\t"
wolfSSL 15:117db924cf7c 989 "vmovdqa 32(%[X]), %%xmm2\n\t"
wolfSSL 15:117db924cf7c 990 "vmovdqa 48(%[X]), %%xmm3\n\t"
wolfSSL 15:117db924cf7c 991 "vmovdqa 64(%[X]), %%xmm4\n\t"
wolfSSL 15:117db924cf7c 992 "vmovdqa 80(%[X]), %%xmm5\n\t"
wolfSSL 15:117db924cf7c 993 "vmovdqa 96(%[X]), %%xmm6\n\t"
wolfSSL 15:117db924cf7c 994 "vmovdqa 112(%[X]), %%xmm7\n\t"
wolfSSL 15:117db924cf7c 995 "vmovdqa 128(%[X]), %%xmm8\n\t"
wolfSSL 15:117db924cf7c 996 "vmovdqa 144(%[X]), %%xmm9\n\t"
wolfSSL 15:117db924cf7c 997 "vmovdqa 160(%[X]), %%xmm10\n\t"
wolfSSL 15:117db924cf7c 998 "vmovdqa 176(%[X]), %%xmm11\n\t"
wolfSSL 15:117db924cf7c 999 "vmovdqa 192(%[X]), %%xmm12\n\t"
wolfSSL 15:117db924cf7c 1000 "vmovdqa 208(%[X]), %%xmm13\n\t"
wolfSSL 15:117db924cf7c 1001 "vmovdqa 224(%[X]), %%xmm14\n\t"
wolfSSL 15:117db924cf7c 1002 "vmovdqa 240(%[X]), %%xmm15\n\t"
wolfSSL 15:117db924cf7c 1003 "jmp L_enc128_loop\n\t"
wolfSSL 15:117db924cf7c 1004
wolfSSL 15:117db924cf7c 1005 "\n"
wolfSSL 15:117db924cf7c 1006 "L_done:\n\t"
wolfSSL 15:117db924cf7c 1007
/* cnt * 4 = number of 64-byte blocks produced above; add it to the word at
 * state offset 48. */
wolfSSL 15:117db924cf7c 1008 "shl $2, %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1009 "add 48(%[key]), %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1010 "movl %[cnt], 48(%[key])\n\t"
wolfSSL 15:117db924cf7c 1011 "\n"
wolfSSL 15:117db924cf7c 1012 "L_end128:\n\t"
wolfSSL 15:117db924cf7c 1013 : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
wolfSSL 15:117db924cf7c 1014 [in] "+r" (m), [out] "+r" (c)
wolfSSL 15:117db924cf7c 1015 : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
wolfSSL 15:117db924cf7c 1016 [add] "xrm" (add), [four] "xrm" (four),
wolfSSL 15:117db924cf7c 1017 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)
wolfSSL 15:117db924cf7c 1018 : "xmm0", "xmm1", "xmm2", "xmm3",
wolfSSL 15:117db924cf7c 1019 "xmm4", "xmm5", "xmm6", "xmm7",
wolfSSL 15:117db924cf7c 1020 "xmm8", "xmm9", "xmm10", "xmm11",
wolfSSL 15:117db924cf7c 1021 "xmm12", "xmm13", "xmm14", "xmm15", "memory"
wolfSSL 15:117db924cf7c 1022 );
wolfSSL 15:117db924cf7c 1023
/* m, c and bytes were updated in place by the asm ("+r" operands); finish
 * the remaining whole chunks one at a time. */
wolfSSL 15:117db924cf7c 1024 for (; bytes >= CHACHA_CHUNK_BYTES;) {
wolfSSL 15:117db924cf7c 1025 CHACHA_CHUNK_AVX();
wolfSSL 15:117db924cf7c 1026 bytes -= CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1027 c += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1028 m += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1029 }
/* Trailing bytes that do not fill a whole chunk. */
wolfSSL 15:117db924cf7c 1030 if (bytes > 0) {
wolfSSL 15:117db924cf7c 1031 CHACHA_PARTIAL_CHUNK_AVX();
wolfSSL 15:117db924cf7c 1032 }
wolfSSL 15:117db924cf7c 1033 }
wolfSSL 15:117db924cf7c 1034 #endif /* HAVE_INTEL_AVX1 */
wolfSSL 15:117db924cf7c 1035
wolfSSL 15:117db924cf7c 1036 #ifdef HAVE_INTEL_AVX2
/* QUARTERROUND_2_AVX2: assembly text for one ChaCha double round on a single
 * block held as four rows in xmm0 (a), xmm1 (b), xmm2 (c) and xmm3 (d);
 * xmm4 is used as a temporary. VEX three-operand forms are used, so no
 * register-to-register copy is needed before the shift pairs.
 *
 * Each half performs, on all four lanes at once:
 *   a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
 *   a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7;
 * The 16- and 8-bit rotations are byte shuffles (vpshufb with the %[rotl16]
 * and %[rotl8] operands); the 12- and 7-bit rotations are a shift-right /
 * shift-left pair merged with vpxor.
 *
 * Between the halves, vpshufd $0x39 / $0x4e / $0x93 rotates the lanes of
 * rows b / c / d by one / two / three positions so that the same code then
 * works on the diagonals; the inverse immediates restore the order at the
 * end.
 *
 * The enclosing asm statement must provide the [rotl8] and [rotl16] operands
 * and list xmm0-xmm4 as clobbered. Register names are written in lower case
 * throughout so the text does not depend on a case-insensitive assembler.
 */
wolfSSL 15:117db924cf7c 1037 #define QUARTERROUND_2_AVX2() \
wolfSSL 15:117db924cf7c 1038 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1039 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1040 "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1041 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1042 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1043 "vpsrld $20, %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 1044 "vpslld $12, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1045 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1046 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1047 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1048 "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1049 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1050 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1051 "vpsrld $25, %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 1052 "vpslld $7, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1053 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1054 "# Swap words for next round\n\t" \
wolfSSL 15:117db924cf7c 1055 "vpshufd $0x39, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1056 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1057 "vpshufd $0x93, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1058 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1059 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1060 "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1061 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1062 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1063 "vpsrld $20, %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 1064 "vpslld $12, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1065 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1066 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1067 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1068 "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1069 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1070 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1071 "vpsrld $25, %%xmm1, %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 1072 "vpslld $7, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1073 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1074 "# Swap words back\n\t" \
wolfSSL 15:117db924cf7c 1075 "vpshufd $0x93, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1076 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1077 "vpshufd $0x39, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1078
/* CHACHA_CRYPT_AVX2: assembly text that turns the 64 bytes at %[input] into
 * one 64-byte output block in xmm0-xmm3.
 *
 * The four 16-byte rows are loaded into xmm8-xmm11 and copied to xmm0-xmm3;
 * QUARTERROUND_2_AVX2() is executed ten times (al counts down from 10,
 * looping on local label 1); the untouched copies in xmm8-xmm11 are then
 * added lane-wise to the result, so the input is read from memory only once.
 *
 * Requirements on the enclosing asm statement: an [input] register operand,
 * the [rotl8]/[rotl16] operands needed by QUARTERROUND_2_AVX2(), and
 * clobbers covering al, xmm0-xmm4 and xmm8-xmm11. Local label 1 is taken.
 */
wolfSSL 15:117db924cf7c 1079 #define CHACHA_CRYPT_AVX2() \
wolfSSL 15:117db924cf7c 1080 "vmovdqu 0(%[input]), %%xmm8\n\t" \
wolfSSL 15:117db924cf7c 1081 "vmovdqu 16(%[input]), %%xmm9\n\t" \
wolfSSL 15:117db924cf7c 1082 "vmovdqu 32(%[input]), %%xmm10\n\t" \
wolfSSL 15:117db924cf7c 1083 "vmovdqu 48(%[input]), %%xmm11\n\t" \
wolfSSL 15:117db924cf7c 1084 "vmovdqu %%xmm8, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1085 "vmovdqu %%xmm9, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1086 "vmovdqu %%xmm10, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1087 "vmovdqu %%xmm11, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1088 "movb $10, %%al\n\t" \
wolfSSL 15:117db924cf7c 1089 "\n" \
wolfSSL 15:117db924cf7c 1090 "1:\n\t" \
wolfSSL 15:117db924cf7c 1091 QUARTERROUND_2_AVX2() \
wolfSSL 15:117db924cf7c 1092 "decb %%al\n\t" \
wolfSSL 15:117db924cf7c 1093 "jnz 1b\n\t" \
wolfSSL 15:117db924cf7c 1094 "vpaddd %%xmm8, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1095 "vpaddd %%xmm9, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1096 "vpaddd %%xmm10, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1097 "vpaddd %%xmm11, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1098
/* CHACHA_PARTIAL_CHUNK_AVX2: expands to one asm statement that processes a
 * trailing chunk of `bytes` bytes.
 *
 * Uses the caller-scope names ctx, x, c, bytes and m:
 *   [input] = ctx->X, [c] = x (scratch buffer), [output] = c,
 *   [bytes] = bytes, [m] = m.
 *
 * CHACHA_CRYPT_AVX2() leaves a 64-byte block in xmm0-xmm3; it is stored to
 * the scratch buffer and the 32-bit word at 48(input) is incremented by one.
 * The copy-out then writes scratch[i] ^ m[i] to output[i] for i in
 * [0, bytes): rdx is the running index, r8d = bytes and r9d = bytes & 7.
 * When r9d is non-zero the byte loop at label 2 handles the first r9d bytes;
 * the loop at label 4 then continues eight bytes at a time until the index
 * equals r8d. Because the byte loop runs first, the eight-byte loop never
 * touches memory beyond index bytes-1.
 *
 * NOTE(review): the scratch buffer must hold at least 64 bytes and `bytes`
 * must be non-zero and no larger than that buffer; neither is checked here.
 */
wolfSSL 15:117db924cf7c 1099 #define CHACHA_PARTIAL_CHUNK_AVX2() \
wolfSSL 15:117db924cf7c 1100 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 1101 CHACHA_CRYPT_AVX2() \
wolfSSL 15:117db924cf7c 1102 "vmovdqu %%xmm0, 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1103 "vmovdqu %%xmm1, 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1104 "vmovdqu %%xmm2, 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1105 "vmovdqu %%xmm3, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1106 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 1107 "movl %[bytes], %%r8d\n\t" \
wolfSSL 15:117db924cf7c 1108 "xorq %%rdx, %%rdx\n\t" \
wolfSSL 15:117db924cf7c 1109 "movl %%r8d, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 1110 "andl $7, %%r9d\n\t" \
wolfSSL 15:117db924cf7c 1111 "jz 4f\n\t" \
wolfSSL 15:117db924cf7c 1112 "\n" \
wolfSSL 15:117db924cf7c 1113 "2:\n\t" \
wolfSSL 15:117db924cf7c 1114 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \
wolfSSL 15:117db924cf7c 1115 "xorb (%[m],%%rdx,1), %%cl\n\t" \
wolfSSL 15:117db924cf7c 1116 "movb %%cl, (%[output],%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 1117 "incl %%edx\n\t" \
wolfSSL 15:117db924cf7c 1118 "cmpl %%r9d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 1119 "jne 2b\n\t" \
wolfSSL 15:117db924cf7c 1120 "je 3f\n\t" \
wolfSSL 15:117db924cf7c 1121 "\n" \
wolfSSL 15:117db924cf7c 1122 "4:\n\t" \
wolfSSL 15:117db924cf7c 1123 "movq (%[c],%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 1124 "xorq (%[m],%%rdx,1), %%rcx\n\t" \
wolfSSL 15:117db924cf7c 1125 "movq %%rcx, (%[output],%%rdx,1)\n\t" \
wolfSSL 15:117db924cf7c 1126 "addl $8, %%edx\n\t" \
wolfSSL 15:117db924cf7c 1127 "\n" \
wolfSSL 15:117db924cf7c 1128 "3:\n\t" \
wolfSSL 15:117db924cf7c 1129 "cmpl %%r8d, %%edx\n\t" \
wolfSSL 15:117db924cf7c 1130 "jne 4b\n\t" \
wolfSSL 15:117db924cf7c 1131 : \
wolfSSL 15:117db924cf7c 1132 : [input] "r" (ctx->X), [c] "r" (x), \
wolfSSL 15:117db924cf7c 1133 [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
wolfSSL 15:117db924cf7c 1134 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
wolfSSL 15:117db924cf7c 1135 : "eax", "ecx", "edx", "r8", "r9", "memory", \
wolfSSL 15:117db924cf7c 1136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
wolfSSL 15:117db924cf7c 1137 "xmm8", "xmm9", "xmm10", "xmm11" \
wolfSSL 15:117db924cf7c 1138 )
wolfSSL 15:117db924cf7c 1139
wolfSSL 15:117db924cf7c 1140
/* CHACHA_CHUNK_AVX2: process one full 64-byte block.
 *
 * Expands to a single inline-asm statement. It takes no arguments and uses
 * these names from the scope it is expanded in: ctx, c, m, rotl8 and rotl16.
 *
 * Steps, in order:
 *   1. Run CHACHA_CRYPT_AVX2() (defined earlier in this file), which leaves
 *      its result in xmm0..xmm3.
 *   2. Load 64 bytes from m with unaligned loads, XOR them into xmm0..xmm3
 *      and store the 64 result bytes to c with unaligned stores.
 *   3. Add 1 to the 32-bit word at byte offset 48 of ctx->X.
 *
 * The macro does not advance m, c or any byte count; the caller does that
 * after each expansion. rax and xmm0..xmm11 are declared as clobbered.
 */
wolfSSL 15:117db924cf7c 1141 #define CHACHA_CHUNK_AVX2() \
wolfSSL 15:117db924cf7c 1142 __asm__ __volatile__ ( \
wolfSSL 15:117db924cf7c 1143 CHACHA_CRYPT_AVX2() \
wolfSSL 15:117db924cf7c 1144 "vmovdqu 0(%[m]), %%xmm4\n\t" \
wolfSSL 15:117db924cf7c 1145 "vmovdqu 16(%[m]), %%xmm5\n\t" \
wolfSSL 15:117db924cf7c 1146 "vmovdqu 32(%[m]), %%xmm6\n\t" \
wolfSSL 15:117db924cf7c 1147 "vmovdqu 48(%[m]), %%xmm7\n\t" \
wolfSSL 15:117db924cf7c 1148 "vpxor %%xmm4, %%xmm0, %%xmm0\n\t" \
wolfSSL 15:117db924cf7c 1149 "vpxor %%xmm5, %%xmm1, %%xmm1\n\t" \
wolfSSL 15:117db924cf7c 1150 "vpxor %%xmm6, %%xmm2, %%xmm2\n\t" \
wolfSSL 15:117db924cf7c 1151 "vpxor %%xmm7, %%xmm3, %%xmm3\n\t" \
wolfSSL 15:117db924cf7c 1152 "vmovdqu %%xmm0, 0(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1153 "vmovdqu %%xmm1, 16(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1154 "vmovdqu %%xmm2, 32(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1155 "vmovdqu %%xmm3, 48(%[c])\n\t" \
wolfSSL 15:117db924cf7c 1156 "addl $1, 48(%[input])\n\t" \
wolfSSL 15:117db924cf7c 1157 : \
wolfSSL 15:117db924cf7c 1158 : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
wolfSSL 15:117db924cf7c 1159 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
wolfSSL 15:117db924cf7c 1160 : "rax", "memory", \
wolfSSL 15:117db924cf7c 1161 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
wolfSSL 15:117db924cf7c 1162 "xmm8", "xmm9", "xmm10", "xmm11" \
wolfSSL 15:117db924cf7c 1163 )
wolfSSL 15:117db924cf7c 1164
wolfSSL 15:117db924cf7c 1165
wolfSSL 15:117db924cf7c 1166 static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
wolfSSL 15:117db924cf7c 1167 word32 bytes)
wolfSSL 15:117db924cf7c 1168 {
wolfSSL 15:117db924cf7c 1169 ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
wolfSSL 15:117db924cf7c 1170 ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
wolfSSL 15:117db924cf7c 1171 word32 cnt = 0;
wolfSSL 15:117db924cf7c 1172 static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL,
wolfSSL 15:117db924cf7c 1173 0x0000000500000004UL,0x0000000700000006UL };
wolfSSL 15:117db924cf7c 1174 static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL,
wolfSSL 15:117db924cf7c 1175 0x0000000800000008UL,0x0000000800000008UL };
wolfSSL 15:117db924cf7c 1176 static const __m256i rotl8_256 =
wolfSSL 15:117db924cf7c 1177 { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
wolfSSL 15:117db924cf7c 1178 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
wolfSSL 15:117db924cf7c 1179 static const __m256i rotl16_256 =
wolfSSL 15:117db924cf7c 1180 { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
wolfSSL 15:117db924cf7c 1181 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
wolfSSL 15:117db924cf7c 1182
wolfSSL 15:117db924cf7c 1183 if (bytes == 0)
wolfSSL 15:117db924cf7c 1184 return;
wolfSSL 15:117db924cf7c 1185
wolfSSL 15:117db924cf7c 1186 __asm__ __volatile__ (
wolfSSL 15:117db924cf7c 1187 "movl %[bytes], %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1188 "shrl $9, %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1189 "jz L_end256\n\t"
wolfSSL 15:117db924cf7c 1190
wolfSSL 15:117db924cf7c 1191 "vpbroadcastd (%[key]), %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1192 "vpbroadcastd 4(%[key]), %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1193 "vpbroadcastd 8(%[key]), %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1194 "vpbroadcastd 12(%[key]), %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1195 "vpbroadcastd 16(%[key]), %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1196 "vpbroadcastd 20(%[key]), %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1197 "vpbroadcastd 24(%[key]), %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1198 "vpbroadcastd 28(%[key]), %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1199 "vpbroadcastd 32(%[key]), %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1200 "vpbroadcastd 36(%[key]), %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1201 "vpbroadcastd 40(%[key]), %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1202 "vpbroadcastd 44(%[key]), %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1203 "vpbroadcastd 48(%[key]), %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1204 "vpbroadcastd 52(%[key]), %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1205 "vpbroadcastd 56(%[key]), %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1206 "vpbroadcastd 60(%[key]), %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1207
wolfSSL 15:117db924cf7c 1208 "vpaddd %[add], %%ymm12, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1209
wolfSSL 15:117db924cf7c 1210 "vmovdqa %%ymm0, (%[X])\n\t"
wolfSSL 15:117db924cf7c 1211 "vmovdqa %%ymm1, 32(%[X])\n\t"
wolfSSL 15:117db924cf7c 1212 "vmovdqa %%ymm2, 64(%[X])\n\t"
wolfSSL 15:117db924cf7c 1213 "vmovdqa %%ymm3, 96(%[X])\n\t"
wolfSSL 15:117db924cf7c 1214 "vmovdqa %%ymm4, 128(%[X])\n\t"
wolfSSL 15:117db924cf7c 1215 "vmovdqa %%ymm5, 160(%[X])\n\t"
wolfSSL 15:117db924cf7c 1216 "vmovdqa %%ymm6, 192(%[X])\n\t"
wolfSSL 15:117db924cf7c 1217 "vmovdqa %%ymm7, 224(%[X])\n\t"
wolfSSL 15:117db924cf7c 1218 "vmovdqa %%ymm8, 256(%[X])\n\t"
wolfSSL 15:117db924cf7c 1219 "vmovdqa %%ymm9, 288(%[X])\n\t"
wolfSSL 15:117db924cf7c 1220 "vmovdqa %%ymm10, 320(%[X])\n\t"
wolfSSL 15:117db924cf7c 1221 "vmovdqa %%ymm11, 352(%[X])\n\t"
wolfSSL 15:117db924cf7c 1222 "vmovdqa %%ymm12, 384(%[X])\n\t"
wolfSSL 15:117db924cf7c 1223 "vmovdqa %%ymm13, 416(%[X])\n\t"
wolfSSL 15:117db924cf7c 1224 "vmovdqa %%ymm14, 448(%[X])\n\t"
wolfSSL 15:117db924cf7c 1225 "vmovdqa %%ymm15, 480(%[X])\n\t"
wolfSSL 15:117db924cf7c 1226 "\n"
wolfSSL 15:117db924cf7c 1227 "L_enc256_loop:\n\t"
wolfSSL 15:117db924cf7c 1228 "vmovdqa %%ymm11, 96(%[x])\n\t"
wolfSSL 15:117db924cf7c 1229 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1230 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1231 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1232 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1233 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1234 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1235 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1236 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1237 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1238 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1239 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1240 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1241 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1242 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1243 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1244 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1245 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1246 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1247 QUARTERROUND_YMM()
wolfSSL 15:117db924cf7c 1248 QUARTERROUND_YMM_2()
wolfSSL 15:117db924cf7c 1249 "vmovdqa 96(%[x]), %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1250
wolfSSL 15:117db924cf7c 1251 "vpaddd (%[X]), %%ymm0, %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1252 "vpaddd 32(%[X]), %%ymm1, %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1253 "vpaddd 64(%[X]), %%ymm2, %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1254 "vpaddd 96(%[X]), %%ymm3, %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1255 "vpaddd 128(%[X]), %%ymm4, %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1256 "vpaddd 160(%[X]), %%ymm5, %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1257 "vpaddd 192(%[X]), %%ymm6, %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1258 "vpaddd 224(%[X]), %%ymm7, %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1259 "vpaddd 256(%[X]), %%ymm8, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1260 "vpaddd 288(%[X]), %%ymm9, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1261 "vpaddd 320(%[X]), %%ymm10, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1262 "vpaddd 352(%[X]), %%ymm11, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1263 "vpaddd 384(%[X]), %%ymm12, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1264 "vpaddd 416(%[X]), %%ymm13, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1265 "vpaddd 448(%[X]), %%ymm14, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1266 "vpaddd 480(%[X]), %%ymm15, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1267
wolfSSL 15:117db924cf7c 1268 "vmovdqa %%ymm8, (%[x])\n\t"
wolfSSL 15:117db924cf7c 1269 "vmovdqa %%ymm9, 32(%[x])\n\t"
wolfSSL 15:117db924cf7c 1270 "vmovdqa %%ymm10, 64(%[x])\n\t"
wolfSSL 15:117db924cf7c 1271 "vmovdqa %%ymm11, 96(%[x])\n\t"
wolfSSL 15:117db924cf7c 1272 "vmovdqa %%ymm12, 128(%[x])\n\t"
wolfSSL 15:117db924cf7c 1273 "vmovdqa %%ymm13, 160(%[x])\n\t"
wolfSSL 15:117db924cf7c 1274 "vmovdqa %%ymm14, 192(%[x])\n\t"
wolfSSL 15:117db924cf7c 1275 "vmovdqa %%ymm15, 224(%[x])\n\t"
wolfSSL 15:117db924cf7c 1276
wolfSSL 15:117db924cf7c 1277 "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1278 "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1279 "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1280 "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1281 "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1282 "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1283 "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1284 "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1285 "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1286 "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1287 "vpunpckhqdq %%ymm9, %%ymm8, %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1288 "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1289 "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1290 "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1291 "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1292 "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1293 "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1294 "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1295 "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1296 "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1297 "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1298 "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1299 "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1300 "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1301
wolfSSL 15:117db924cf7c 1302 "vmovdqu (%[in]), %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1303 "vmovdqu 64(%[in]), %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1304 "vmovdqu 128(%[in]), %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1305 "vmovdqu 192(%[in]), %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1306 "vmovdqu 256(%[in]), %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1307 "vmovdqu 320(%[in]), %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1308 "vmovdqu 384(%[in]), %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1309 "vmovdqu 448(%[in]), %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1310 "vpxor %%ymm0, %%ymm8, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1311 "vpxor %%ymm1, %%ymm9, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1312 "vpxor %%ymm2, %%ymm10, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1313 "vpxor %%ymm3, %%ymm11, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1314 "vpxor %%ymm4, %%ymm12, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1315 "vpxor %%ymm5, %%ymm13, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1316 "vpxor %%ymm6, %%ymm14, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1317 "vpxor %%ymm7, %%ymm15, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1318 "vmovdqu %%ymm8, (%[out])\n\t"
wolfSSL 15:117db924cf7c 1319 "vmovdqu %%ymm9, 64(%[out])\n\t"
wolfSSL 15:117db924cf7c 1320 "vmovdqu %%ymm10, 128(%[out])\n\t"
wolfSSL 15:117db924cf7c 1321 "vmovdqu %%ymm11, 192(%[out])\n\t"
wolfSSL 15:117db924cf7c 1322 "vmovdqu %%ymm12, 256(%[out])\n\t"
wolfSSL 15:117db924cf7c 1323 "vmovdqu %%ymm13, 320(%[out])\n\t"
wolfSSL 15:117db924cf7c 1324 "vmovdqu %%ymm14, 384(%[out])\n\t"
wolfSSL 15:117db924cf7c 1325 "vmovdqu %%ymm15, 448(%[out])\n\t"
wolfSSL 15:117db924cf7c 1326
wolfSSL 15:117db924cf7c 1327 "vmovdqa (%[x]), %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1328 "vmovdqa 32(%[x]), %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1329 "vmovdqa 64(%[x]), %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1330 "vmovdqa 96(%[x]), %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1331 "vmovdqa 128(%[x]), %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1332 "vmovdqa 160(%[x]), %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1333 "vmovdqa 192(%[x]), %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1334 "vmovdqa 224(%[x]), %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1335
wolfSSL 15:117db924cf7c 1336 "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1337 "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1338 "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1339 "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1340 "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1341 "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1342 "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1343 "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1344 "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1345 "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1346 "vpunpckhqdq %%ymm9 , %%ymm8, %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1347 "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1348 "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1349 "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1350 "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1351 "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1352 "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1353 "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1354 "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1355 "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1356 "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1357 "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1358 "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1359 "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1360
wolfSSL 15:117db924cf7c 1361 "vmovdqu 32(%[in]), %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1362 "vmovdqu 96(%[in]), %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1363 "vmovdqu 160(%[in]), %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1364 "vmovdqu 224(%[in]), %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1365 "vmovdqu 288(%[in]), %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1366 "vmovdqu 352(%[in]), %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1367 "vmovdqu 416(%[in]), %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1368 "vmovdqu 480(%[in]), %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1369 "vpxor %%ymm0, %%ymm8, %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1370 "vpxor %%ymm1, %%ymm9, %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1371 "vpxor %%ymm2, %%ymm10, %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1372 "vpxor %%ymm3, %%ymm11, %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1373 "vpxor %%ymm4, %%ymm12, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1374 "vpxor %%ymm5, %%ymm13, %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1375 "vpxor %%ymm6, %%ymm14, %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1376 "vpxor %%ymm7, %%ymm15, %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1377 "vmovdqu %%ymm8, 32(%[out])\n\t"
wolfSSL 15:117db924cf7c 1378 "vmovdqu %%ymm9, 96(%[out])\n\t"
wolfSSL 15:117db924cf7c 1379 "vmovdqu %%ymm10, 160(%[out])\n\t"
wolfSSL 15:117db924cf7c 1380 "vmovdqu %%ymm11, 224(%[out])\n\t"
wolfSSL 15:117db924cf7c 1381 "vmovdqu %%ymm12, 288(%[out])\n\t"
wolfSSL 15:117db924cf7c 1382 "vmovdqu %%ymm13, 352(%[out])\n\t"
wolfSSL 15:117db924cf7c 1383 "vmovdqu %%ymm14, 416(%[out])\n\t"
wolfSSL 15:117db924cf7c 1384 "vmovdqu %%ymm15, 480(%[out])\n\t"
wolfSSL 15:117db924cf7c 1385
wolfSSL 15:117db924cf7c 1386 "vmovdqa 384(%[X]), %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1387 "add $512, %[in]\n\t"
wolfSSL 15:117db924cf7c 1388 "add $512, %[out]\n\t"
wolfSSL 15:117db924cf7c 1389 "vpaddd %[eight], %%ymm12, %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1390 "sub $512, %[bytes]\n\t"
wolfSSL 15:117db924cf7c 1391 "vmovdqa %%ymm12, 384(%[X])\n\t"
wolfSSL 15:117db924cf7c 1392 "cmp $512, %[bytes]\n\t"
wolfSSL 15:117db924cf7c 1393 "jl L_done256\n\t"
wolfSSL 15:117db924cf7c 1394
wolfSSL 15:117db924cf7c 1395 "vmovdqa (%[X]), %%ymm0\n\t"
wolfSSL 15:117db924cf7c 1396 "vmovdqa 32(%[X]), %%ymm1\n\t"
wolfSSL 15:117db924cf7c 1397 "vmovdqa 64(%[X]), %%ymm2\n\t"
wolfSSL 15:117db924cf7c 1398 "vmovdqa 96(%[X]), %%ymm3\n\t"
wolfSSL 15:117db924cf7c 1399 "vmovdqa 128(%[X]), %%ymm4\n\t"
wolfSSL 15:117db924cf7c 1400 "vmovdqa 160(%[X]), %%ymm5\n\t"
wolfSSL 15:117db924cf7c 1401 "vmovdqa 192(%[X]), %%ymm6\n\t"
wolfSSL 15:117db924cf7c 1402 "vmovdqa 224(%[X]), %%ymm7\n\t"
wolfSSL 15:117db924cf7c 1403 "vmovdqa 256(%[X]), %%ymm8\n\t"
wolfSSL 15:117db924cf7c 1404 "vmovdqa 288(%[X]), %%ymm9\n\t"
wolfSSL 15:117db924cf7c 1405 "vmovdqa 320(%[X]), %%ymm10\n\t"
wolfSSL 15:117db924cf7c 1406 "vmovdqa 352(%[X]), %%ymm11\n\t"
wolfSSL 15:117db924cf7c 1407 "vmovdqa 384(%[X]), %%ymm12\n\t"
wolfSSL 15:117db924cf7c 1408 "vmovdqa 416(%[X]), %%ymm13\n\t"
wolfSSL 15:117db924cf7c 1409 "vmovdqa 448(%[X]), %%ymm14\n\t"
wolfSSL 15:117db924cf7c 1410 "vmovdqa 480(%[X]), %%ymm15\n\t"
wolfSSL 15:117db924cf7c 1411 "jmp L_enc256_loop\n\t"
wolfSSL 15:117db924cf7c 1412 "\n"
wolfSSL 15:117db924cf7c 1413 "L_done256:\n\t"
wolfSSL 15:117db924cf7c 1414 "shl $3, %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1415 "add 48(%[key]), %[cnt]\n\t"
wolfSSL 15:117db924cf7c 1416 "movl %[cnt], 48(%[key])\n\t"
wolfSSL 15:117db924cf7c 1417 "\n"
wolfSSL 15:117db924cf7c 1418 "L_end256:\n\t"
wolfSSL 15:117db924cf7c 1419 : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
wolfSSL 15:117db924cf7c 1420 [in] "+r" (m), [out] "+r" (c)
wolfSSL 15:117db924cf7c 1421 : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
wolfSSL 15:117db924cf7c 1422 [add] "m" (add), [eight] "m" (eight),
wolfSSL 15:117db924cf7c 1423 [rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256)
wolfSSL 15:117db924cf7c 1424 : "ymm0", "ymm1", "ymm2", "ymm3",
wolfSSL 15:117db924cf7c 1425 "ymm4", "ymm5", "ymm6", "ymm7",
wolfSSL 15:117db924cf7c 1426 "ymm8", "ymm9", "ymm10", "ymm11",
wolfSSL 15:117db924cf7c 1427 "ymm12", "ymm13", "ymm14", "ymm15", "memory"
wolfSSL 15:117db924cf7c 1428 );
wolfSSL 15:117db924cf7c 1429
wolfSSL 15:117db924cf7c 1430 /* AVX code optimised for multiples of 256 bytes. */
wolfSSL 15:117db924cf7c 1431 if (bytes == 256) {
wolfSSL 15:117db924cf7c 1432 chacha_encrypt_avx(ctx, m, c, bytes);
wolfSSL 15:117db924cf7c 1433 bytes -= 256;
wolfSSL 15:117db924cf7c 1434 }
wolfSSL 15:117db924cf7c 1435
wolfSSL 15:117db924cf7c 1436 for (; bytes >= CHACHA_CHUNK_BYTES;) {
wolfSSL 15:117db924cf7c 1437 CHACHA_CHUNK_AVX2();
wolfSSL 15:117db924cf7c 1438 bytes -= CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1439 c += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1440 m += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1441 }
wolfSSL 15:117db924cf7c 1442 if (bytes > 0) {
wolfSSL 15:117db924cf7c 1443 CHACHA_PARTIAL_CHUNK_AVX2();
wolfSSL 15:117db924cf7c 1444 }
wolfSSL 15:117db924cf7c 1445 }
wolfSSL 15:117db924cf7c 1446 #endif /* HAVE_INTEL_AVX2 */
wolfSSL 15:117db924cf7c 1447 #endif /* USE_INTEL_CHACHA_SPEEDUP */
wolfSSL 15:117db924cf7c 1448
wolfSSL 15:117db924cf7c 1449 /**
wolfSSL 15:117db924cf7c 1450 * Encrypt a stream of bytes
wolfSSL 15:117db924cf7c 1451 */
wolfSSL 15:117db924cf7c 1452 static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
wolfSSL 15:117db924cf7c 1453 word32 bytes)
wolfSSL 15:117db924cf7c 1454 {
wolfSSL 15:117db924cf7c 1455 byte* output;
wolfSSL 15:117db924cf7c 1456 word32 temp[CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
wolfSSL 15:117db924cf7c 1457 word32 i;
wolfSSL 15:117db924cf7c 1458
wolfSSL 15:117db924cf7c 1459 output = (byte*)temp;
wolfSSL 15:117db924cf7c 1460
wolfSSL 15:117db924cf7c 1461 for (; bytes > 0;) {
wolfSSL 15:117db924cf7c 1462 wc_Chacha_wordtobyte(temp, ctx->X);
wolfSSL 15:117db924cf7c 1463 ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
wolfSSL 15:117db924cf7c 1464 if (bytes <= CHACHA_CHUNK_BYTES) {
wolfSSL 15:117db924cf7c 1465 for (i = 0; i < bytes; ++i) {
wolfSSL 15:117db924cf7c 1466 c[i] = m[i] ^ output[i];
wolfSSL 15:117db924cf7c 1467 }
wolfSSL 15:117db924cf7c 1468 return;
wolfSSL 15:117db924cf7c 1469 }
wolfSSL 15:117db924cf7c 1470 for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
wolfSSL 15:117db924cf7c 1471 c[i] = m[i] ^ output[i];
wolfSSL 15:117db924cf7c 1472 }
wolfSSL 15:117db924cf7c 1473 bytes -= CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1474 c += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1475 m += CHACHA_CHUNK_BYTES;
wolfSSL 15:117db924cf7c 1476 }
wolfSSL 15:117db924cf7c 1477 }
wolfSSL 15:117db924cf7c 1478
wolfSSL 15:117db924cf7c 1479 /**
wolfSSL 15:117db924cf7c 1480 * API to encrypt/decrypt a message of any size.
wolfSSL 15:117db924cf7c 1481 */
wolfSSL 15:117db924cf7c 1482 int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
wolfSSL 15:117db924cf7c 1483 word32 msglen)
wolfSSL 15:117db924cf7c 1484 {
wolfSSL 15:117db924cf7c 1485 if (ctx == NULL)
wolfSSL 15:117db924cf7c 1486 return BAD_FUNC_ARG;
wolfSSL 15:117db924cf7c 1487
wolfSSL 15:117db924cf7c 1488 #ifdef USE_INTEL_CHACHA_SPEEDUP
wolfSSL 15:117db924cf7c 1489 if (!cpuidFlagsSet) {
wolfSSL 15:117db924cf7c 1490 cpuidFlags = cpuid_get_flags();
wolfSSL 15:117db924cf7c 1491 cpuidFlagsSet = 1;
wolfSSL 15:117db924cf7c 1492 }
wolfSSL 15:117db924cf7c 1493
wolfSSL 15:117db924cf7c 1494 #ifdef HAVE_INTEL_AVX2
wolfSSL 15:117db924cf7c 1495 if (IS_INTEL_AVX2(cpuidFlags)) {
wolfSSL 15:117db924cf7c 1496 chacha_encrypt_avx2(ctx, input, output, msglen);
wolfSSL 15:117db924cf7c 1497 return 0;
wolfSSL 15:117db924cf7c 1498 }
wolfSSL 15:117db924cf7c 1499 #endif
wolfSSL 15:117db924cf7c 1500 if (IS_INTEL_AVX1(cpuidFlags)) {
wolfSSL 15:117db924cf7c 1501 chacha_encrypt_avx(ctx, input, output, msglen);
wolfSSL 15:117db924cf7c 1502 return 0;
wolfSSL 15:117db924cf7c 1503 }
wolfSSL 15:117db924cf7c 1504 else {
wolfSSL 15:117db924cf7c 1505 chacha_encrypt_x64(ctx, input, output, msglen);
wolfSSL 15:117db924cf7c 1506 return 0;
wolfSSL 15:117db924cf7c 1507 }
wolfSSL 15:117db924cf7c 1508 #endif
wolfSSL 15:117db924cf7c 1509 wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
wolfSSL 15:117db924cf7c 1510
wolfSSL 15:117db924cf7c 1511 return 0;
wolfSSL 15:117db924cf7c 1512 }
wolfSSL 15:117db924cf7c 1513
wolfSSL 15:117db924cf7c 1514 #endif /* HAVE_CHACHA*/
wolfSSL 15:117db924cf7c 1515
wolfSSL 15:117db924cf7c 1516