Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
wolfcrypt/src/chacha.c@15:117db924cf7c, 2018-08-18 (annotated)
- Committer:
- wolfSSL
- Date:
- Sat Aug 18 22:20:43 2018 +0000
- Revision:
- 15:117db924cf7c
wolfSSL 3.15.3
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
wolfSSL | 15:117db924cf7c | 1 | /* chacha.c |
wolfSSL | 15:117db924cf7c | 2 | * |
wolfSSL | 15:117db924cf7c | 3 | * Copyright (C) 2006-2017 wolfSSL Inc. |
wolfSSL | 15:117db924cf7c | 4 | * |
wolfSSL | 15:117db924cf7c | 5 | * This file is part of wolfSSL. |
wolfSSL | 15:117db924cf7c | 6 | * |
wolfSSL | 15:117db924cf7c | 7 | * wolfSSL is free software; you can redistribute it and/or modify |
wolfSSL | 15:117db924cf7c | 8 | * it under the terms of the GNU General Public License as published by |
wolfSSL | 15:117db924cf7c | 9 | * the Free Software Foundation; either version 2 of the License, or |
wolfSSL | 15:117db924cf7c | 10 | * (at your option) any later version. |
wolfSSL | 15:117db924cf7c | 11 | * |
wolfSSL | 15:117db924cf7c | 12 | * wolfSSL is distributed in the hope that it will be useful, |
wolfSSL | 15:117db924cf7c | 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
wolfSSL | 15:117db924cf7c | 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
wolfSSL | 15:117db924cf7c | 15 | * GNU General Public License for more details. |
wolfSSL | 15:117db924cf7c | 16 | * |
wolfSSL | 15:117db924cf7c | 17 | * You should have received a copy of the GNU General Public License |
wolfSSL | 15:117db924cf7c | 18 | * along with this program; if not, write to the Free Software |
wolfSSL | 15:117db924cf7c | 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA |
wolfSSL | 15:117db924cf7c | 20 | * |
wolfSSL | 15:117db924cf7c | 21 | * based from |
wolfSSL | 15:117db924cf7c | 22 | * chacha-ref.c version 20080118 |
wolfSSL | 15:117db924cf7c | 23 | * D. J. Bernstein |
wolfSSL | 15:117db924cf7c | 24 | * Public domain. |
wolfSSL | 15:117db924cf7c | 25 | */ |
wolfSSL | 15:117db924cf7c | 26 | |
wolfSSL | 15:117db924cf7c | 27 | |
wolfSSL | 15:117db924cf7c | 28 | |
wolfSSL | 15:117db924cf7c | 29 | #ifdef HAVE_CONFIG_H |
wolfSSL | 15:117db924cf7c | 30 | #include <config.h> |
wolfSSL | 15:117db924cf7c | 31 | #endif |
wolfSSL | 15:117db924cf7c | 32 | |
wolfSSL | 15:117db924cf7c | 33 | #include <wolfssl/wolfcrypt/settings.h> |
wolfSSL | 15:117db924cf7c | 34 | |
wolfSSL | 15:117db924cf7c | 35 | #ifdef HAVE_CHACHA |
wolfSSL | 15:117db924cf7c | 36 | |
wolfSSL | 15:117db924cf7c | 37 | #include <wolfssl/wolfcrypt/chacha.h> |
wolfSSL | 15:117db924cf7c | 38 | #include <wolfssl/wolfcrypt/error-crypt.h> |
wolfSSL | 15:117db924cf7c | 39 | #include <wolfssl/wolfcrypt/logging.h> |
wolfSSL | 15:117db924cf7c | 40 | #include <wolfssl/wolfcrypt/cpuid.h> |
wolfSSL | 15:117db924cf7c | 41 | #ifdef NO_INLINE |
wolfSSL | 15:117db924cf7c | 42 | #include <wolfssl/wolfcrypt/misc.h> |
wolfSSL | 15:117db924cf7c | 43 | #else |
wolfSSL | 15:117db924cf7c | 44 | #define WOLFSSL_MISC_INCLUDED |
wolfSSL | 15:117db924cf7c | 45 | #include <wolfcrypt/src/misc.c> |
wolfSSL | 15:117db924cf7c | 46 | #endif |
wolfSSL | 15:117db924cf7c | 47 | |
wolfSSL | 15:117db924cf7c | 48 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 49 | #include <stdio.h> |
wolfSSL | 15:117db924cf7c | 50 | #endif |
wolfSSL | 15:117db924cf7c | 51 | |
/* Build-time configuration for the Intel accelerated code paths. Everything
 * in this section is compiled only when USE_INTEL_CHACHA_SPEEDUP is defined.
 */
#ifdef USE_INTEL_CHACHA_SPEEDUP
    #include <emmintrin.h>
    #include <immintrin.h>

    /* GCC 4.8 and older: flag AVX2 as unsupported. */
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #define NO_AVX2_SUPPORT
    #endif
    /* clang 3.5 and older: flag AVX2 as unsupported. For any other clang,
     * remove a NO_AVX2_SUPPORT that the GCC version test above may have set.
     * NOTE(review): presumably needed because clang also defines the
     * __GNUC__ version macros with an old version number - confirm. */
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        #undef NO_AVX2_SUPPORT
    #endif

    /* AVX2 code is enabled unless one of the compiler checks above (or the
     * build configuration) defined NO_AVX2_SUPPORT. */
    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif

    /* Function attribute that forbids inlining, spelled per compiler;
     * expands to nothing on compilers that are neither MSVC nor GNU-like. */
    #if defined(_MSC_VER)
        #define CHACHA20_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define CHACHA20_NOINLINE __attribute__((noinline))
    #else
        #define CHACHA20_NOINLINE
    #endif

    /* File-scope CPU feature state, both starting at 0. The code that fills
     * and reads them is outside this part of the file. */
    static int cpuidFlagsSet = 0;
    static int cpuidFlags = 0;
#endif
wolfSSL | 15:117db924cf7c | 82 | |
/* Convert a 32-bit word between host byte order and little-endian order.
 * Big-endian builds byte-swap the word; all other builds pass it through. */
#ifdef BIG_ENDIAN_ORDER
    #define LITTLE32(x) ByteReverseWord32(x)
#else
    #define LITTLE32(x) (x)
#endif

/* Number of rounds */
#define ROUNDS 20

/* U32C: give an integer constant an unsigned suffix.
 * U32V: truncate a value to 32 bits. */
#define U32C(v) (v##U)
#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF))

/* Load a little-endian 32-bit word from the four bytes starting at p.
 *
 * The bytes are read one at a time instead of dereferencing p as a word32
 * pointer, so the load is valid for any alignment of p, does not read a byte
 * buffer through an incompatible lvalue type, and yields the same value on
 * little- and big-endian hosts without a byte swap.
 *
 * p is evaluated four times; do not pass an expression with side effects. */
#define U8TO32_LITTLE(p)                                 \
    ( ((word32)(((const byte*)(p))[0]))                  \
    | ((word32)(((const byte*)(p))[1]) <<  8)            \
    | ((word32)(((const byte*)(p))[2]) << 16)            \
    | ((word32)(((const byte*)(p))[3]) << 24) )

/* 32-bit building blocks of the quarter round: rotate left, xor,
 * addition modulo 2^32 and increment modulo 2^32. */
#define ROTATE(v,c) rotlFixed(v, c)
#define XOR(v,w)    ((v) ^ (w))
#define PLUS(v,w)   (U32V((v) + (w)))
#define PLUSONE(v)  (PLUS((v),1))
wolfSSL | 15:117db924cf7c | 100 | |
/* One ChaCha quarter round, applied in place to the words at indices a, b, c
 * and d of an array named x. The macro refers to x directly, so the caller
 * must have a suitable array of that name in scope.
 *
 * Each of the four steps adds one word into another, xors the sum into a
 * third word and rotates that word left (by 16, 12, 8 and 7 bits).
 *
 * The expansion is a sequence of statements that already ends in ';'. It is
 * written without a trailing semicolon at the call site and must not be used
 * as the unbraced body of an if/else or loop. */
#define QUARTERROUND(a,b,c,d) \
    x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
    x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
    x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
    x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);
wolfSSL | 15:117db924cf7c | 106 | |
wolfSSL | 15:117db924cf7c | 107 | |
wolfSSL | 15:117db924cf7c | 108 | |
/* Expands to the assembler text (adjacent string literals, operands in
 * source-then-destination order) for four quarter rounds executed side by
 * side on vector registers.
 *
 *   aN,bN,cN,dN  registers holding the a/b/c/d words of quarter round N
 *   t1           scratch register for the shift based rotations
 *   o1           byte offset from the %[x] operand of a memory slot in which
 *                c3 is kept while t1 is in use
 *
 * c3 is loaded from o1(%[x]) before it is needed and stored back once it has
 * been used, ahead of every group of instructions that writes t1. This lets
 * the caller pass the same register for c3 and t1.
 *
 * Rotations by 16 and 8 bits are byte shuffles (vpshufb) using the %[rotl16]
 * and %[rotl8] operands; rotations by 12 and 7 bits are built from a right
 * shift into t1, a left shift and an xor.
 *
 * The enclosing asm statement has to provide the %[x], %[rotl16] and
 * %[rotl8] operands; it is not part of this macro. */
#define QUARTERROUND_INTEL_ASM(a0,b0,c0,d0, \
                               a1,b1,c1,d1, \
                               a2,b2,c2,d2, \
                               a3,b3,c3,d3, \
                               t1,o1) \
    "vpaddd "#b0", "#a0", "#a0"\n\t" \
    "vpxor "#a0", "#d0", "#d0"\n\t" \
    "vmovdqa "#o1"(%[x]), "#c3"\n\t" \
    "vpshufb %[rotl16], "#d0", "#d0"\n\t" \
    "vpaddd "#d0", "#c0", "#c0"\n\t" \
    "vpxor "#c0", "#b0", "#b0"\n\t" \
    "vpaddd "#b1", "#a1", "#a1"\n\t" \
    "vpxor "#a1", "#d1", "#d1"\n\t" \
    "vpshufb %[rotl16], "#d1", "#d1"\n\t" \
    "vpaddd "#d1", "#c1", "#c1"\n\t" \
    "vpxor "#c1", "#b1", "#b1"\n\t" \
    "vpaddd "#b2", "#a2", "#a2"\n\t" \
    "vpxor "#a2", "#d2", "#d2"\n\t" \
    "vpshufb %[rotl16], "#d2", "#d2"\n\t" \
    "vpaddd "#d2", "#c2", "#c2"\n\t" \
    "vpxor "#c2", "#b2", "#b2"\n\t" \
    "vpaddd "#b3", "#a3", "#a3"\n\t" \
    "vpxor "#a3", "#d3", "#d3"\n\t" \
    "vpshufb %[rotl16], "#d3", "#d3"\n\t" \
    "vpaddd "#d3", "#c3", "#c3"\n\t" \
    "vpxor "#c3", "#b3", "#b3"\n\t" \
    "vmovdqa "#c3", "#o1"(%[x])\n\t" \
    "vpsrld $20, "#b0", "#t1"\n\t" \
    "vpslld $12, "#b0", "#b0"\n\t" \
    "vpxor "#t1", "#b0", "#b0"\n\t" \
    "vpsrld $20, "#b1", "#t1"\n\t" \
    "vpslld $12, "#b1", "#b1"\n\t" \
    "vpxor "#t1", "#b1", "#b1"\n\t" \
    "vpsrld $20, "#b2", "#t1"\n\t" \
    "vpslld $12, "#b2", "#b2"\n\t" \
    "vpxor "#t1", "#b2", "#b2"\n\t" \
    "vpsrld $20, "#b3", "#t1"\n\t" \
    "vpslld $12, "#b3", "#b3"\n\t" \
    "vpxor "#t1", "#b3", "#b3"\n\t" \
    "vpaddd "#b0", "#a0", "#a0"\n\t" \
    "vpxor "#a0", "#d0", "#d0"\n\t" \
    "vmovdqa "#o1"(%[x]), "#c3"\n\t" \
    "vpshufb %[rotl8], "#d0", "#d0"\n\t" \
    "vpaddd "#d0", "#c0", "#c0"\n\t" \
    "vpxor "#c0", "#b0", "#b0"\n\t" \
    "vpaddd "#b1", "#a1", "#a1"\n\t" \
    "vpxor "#a1", "#d1", "#d1"\n\t" \
    "vpshufb %[rotl8], "#d1", "#d1"\n\t" \
    "vpaddd "#d1", "#c1", "#c1"\n\t" \
    "vpxor "#c1", "#b1", "#b1"\n\t" \
    "vpaddd "#b2", "#a2", "#a2"\n\t" \
    "vpxor "#a2", "#d2", "#d2"\n\t" \
    "vpshufb %[rotl8], "#d2", "#d2"\n\t" \
    "vpaddd "#d2", "#c2", "#c2"\n\t" \
    "vpxor "#c2", "#b2", "#b2"\n\t" \
    "vpaddd "#b3", "#a3", "#a3"\n\t" \
    "vpxor "#a3", "#d3", "#d3"\n\t" \
    "vpshufb %[rotl8], "#d3", "#d3"\n\t" \
    "vpaddd "#d3", "#c3", "#c3"\n\t" \
    "vpxor "#c3", "#b3", "#b3"\n\t" \
    "vmovdqa "#c3", "#o1"(%[x])\n\t" \
    "vpsrld $25, "#b0", "#t1"\n\t" \
    "vpslld $7, "#b0", "#b0"\n\t" \
    "vpxor "#t1", "#b0", "#b0"\n\t" \
    "vpsrld $25, "#b1", "#t1"\n\t" \
    "vpslld $7, "#b1", "#b1"\n\t" \
    "vpxor "#t1", "#b1", "#b1"\n\t" \
    "vpsrld $25, "#b2", "#t1"\n\t" \
    "vpslld $7, "#b2", "#b2"\n\t" \
    "vpxor "#t1", "#b2", "#b2"\n\t" \
    "vpsrld $25, "#b3", "#t1"\n\t" \
    "vpslld $7, "#b3", "#b3"\n\t" \
    "vpxor "#t1", "#b3", "#b3"\n\t"
wolfSSL | 15:117db924cf7c | 182 | |
/* Variant of the four-way vector quarter round that keeps c1 (instead of c3)
 * in memory while the scratch register is in use.
 *
 *   aN,bN,cN,dN  registers holding the a/b/c/d words of quarter round N
 *   t1           scratch register for the shift based rotations
 *   o1           byte offset from the %[x] operand of a memory slot in which
 *                c1 is kept while t1 is in use
 *
 * c1 is loaded from o1(%[x]) before it is needed and stored back ahead of
 * every group of instructions that writes t1, so the caller may pass the
 * same register for c1 and t1.
 *
 * Rotations by 16 and 8 bits are byte shuffles (vpshufb) using the %[rotl16]
 * and %[rotl8] operands; rotations by 12 and 7 bits are built from a right
 * shift into t1, a left shift and an xor.
 *
 * The enclosing asm statement has to provide the %[x], %[rotl16] and
 * %[rotl8] operands; it is not part of this macro. */
#define QUARTERROUND_INTEL_ASM_2(a0,b0,c0,d0, \
                                 a1,b1,c1,d1, \
                                 a2,b2,c2,d2, \
                                 a3,b3,c3,d3, \
                                 t1,o1) \
    "vpaddd "#b0", "#a0", "#a0"\n\t" \
    "vpxor "#a0", "#d0", "#d0"\n\t" \
    "vmovdqa "#o1"(%[x]), "#c1"\n\t" \
    "vpshufb %[rotl16], "#d0", "#d0"\n\t" \
    "vpaddd "#d0", "#c0", "#c0"\n\t" \
    "vpxor "#c0", "#b0", "#b0"\n\t" \
    "vpaddd "#b1", "#a1", "#a1"\n\t" \
    "vpxor "#a1", "#d1", "#d1"\n\t" \
    "vpshufb %[rotl16], "#d1", "#d1"\n\t" \
    "vpaddd "#d1", "#c1", "#c1"\n\t" \
    "vpxor "#c1", "#b1", "#b1"\n\t" \
    "vpaddd "#b2", "#a2", "#a2"\n\t" \
    "vpxor "#a2", "#d2", "#d2"\n\t" \
    "vpshufb %[rotl16], "#d2", "#d2"\n\t" \
    "vpaddd "#d2", "#c2", "#c2"\n\t" \
    "vpxor "#c2", "#b2", "#b2"\n\t" \
    "vpaddd "#b3", "#a3", "#a3"\n\t" \
    "vpxor "#a3", "#d3", "#d3"\n\t" \
    "vpshufb %[rotl16], "#d3", "#d3"\n\t" \
    "vpaddd "#d3", "#c3", "#c3"\n\t" \
    "vpxor "#c3", "#b3", "#b3"\n\t" \
    "vmovdqa "#c1", "#o1"(%[x])\n\t" \
    "vpsrld $20, "#b0", "#t1"\n\t" \
    "vpslld $12, "#b0", "#b0"\n\t" \
    "vpxor "#t1", "#b0", "#b0"\n\t" \
    "vpsrld $20, "#b1", "#t1"\n\t" \
    "vpslld $12, "#b1", "#b1"\n\t" \
    "vpxor "#t1", "#b1", "#b1"\n\t" \
    "vpsrld $20, "#b2", "#t1"\n\t" \
    "vpslld $12, "#b2", "#b2"\n\t" \
    "vpxor "#t1", "#b2", "#b2"\n\t" \
    "vpsrld $20, "#b3", "#t1"\n\t" \
    "vpslld $12, "#b3", "#b3"\n\t" \
    "vpxor "#t1", "#b3", "#b3"\n\t" \
    "vpaddd "#b0", "#a0", "#a0"\n\t" \
    "vpxor "#a0", "#d0", "#d0"\n\t" \
    "vmovdqa "#o1"(%[x]), "#c1"\n\t" \
    "vpshufb %[rotl8], "#d0", "#d0"\n\t" \
    "vpaddd "#d0", "#c0", "#c0"\n\t" \
    "vpxor "#c0", "#b0", "#b0"\n\t" \
    "vpaddd "#b1", "#a1", "#a1"\n\t" \
    "vpxor "#a1", "#d1", "#d1"\n\t" \
    "vpshufb %[rotl8], "#d1", "#d1"\n\t" \
    "vpaddd "#d1", "#c1", "#c1"\n\t" \
    "vpxor "#c1", "#b1", "#b1"\n\t" \
    "vpaddd "#b2", "#a2", "#a2"\n\t" \
    "vpxor "#a2", "#d2", "#d2"\n\t" \
    "vpshufb %[rotl8], "#d2", "#d2"\n\t" \
    "vpaddd "#d2", "#c2", "#c2"\n\t" \
    "vpxor "#c2", "#b2", "#b2"\n\t" \
    "vpaddd "#b3", "#a3", "#a3"\n\t" \
    "vpxor "#a3", "#d3", "#d3"\n\t" \
    "vpshufb %[rotl8], "#d3", "#d3"\n\t" \
    "vpaddd "#d3", "#c3", "#c3"\n\t" \
    "vpxor "#c3", "#b3", "#b3"\n\t" \
    "vmovdqa "#c1", "#o1"(%[x])\n\t" \
    "vpsrld $25, "#b0", "#t1"\n\t" \
    "vpslld $7, "#b0", "#b0"\n\t" \
    "vpxor "#t1", "#b0", "#b0"\n\t" \
    "vpsrld $25, "#b1", "#t1"\n\t" \
    "vpslld $7, "#b1", "#b1"\n\t" \
    "vpxor "#t1", "#b1", "#b1"\n\t" \
    "vpsrld $25, "#b2", "#t1"\n\t" \
    "vpslld $7, "#b2", "#b2"\n\t" \
    "vpxor "#t1", "#b2", "#b2"\n\t" \
    "vpsrld $25, "#b3", "#t1"\n\t" \
    "vpslld $7, "#b3", "#b3"\n\t" \
    "vpxor "#t1", "#b3", "#b3"\n\t"
wolfSSL | 15:117db924cf7c | 256 | |
wolfSSL | 15:117db924cf7c | 257 | |
/* Register assignments for the vector quarter round macros.
 *
 * Register N holds state word N, so the plain variants work on the word
 * groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) and the _2 variants
 * on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14), the same two groupings
 * used by the C implementation's QUARTERROUND calls.
 *
 * Register 11 doubles as the scratch register, which is why the word it
 * normally holds is parked in memory by the underlying macros; the last
 * argument is the byte offset of that memory slot from %[x] (48 for the xmm
 * variants, 96 for the ymm variants). */
#define QUARTERROUND_XMM() \
    QUARTERROUND_INTEL_ASM(%%xmm0,%%xmm4,%%xmm8,%%xmm12, \
                           %%xmm1,%%xmm5,%%xmm9,%%xmm13, \
                           %%xmm2,%%xmm6,%%xmm10,%%xmm14, \
                           %%xmm3,%%xmm7,%%xmm11,%%xmm15, \
                           %%xmm11,48)
#define QUARTERROUND_XMM_2() \
    QUARTERROUND_INTEL_ASM_2(%%xmm0,%%xmm5,%%xmm10,%%xmm15, \
                             %%xmm1,%%xmm6,%%xmm11,%%xmm12, \
                             %%xmm2,%%xmm7,%%xmm8,%%xmm13, \
                             %%xmm3,%%xmm4,%%xmm9,%%xmm14, \
                             %%xmm11,48)

#define QUARTERROUND_YMM() \
    QUARTERROUND_INTEL_ASM(%%ymm0,%%ymm4,%%ymm8,%%ymm12, \
                           %%ymm1,%%ymm5,%%ymm9,%%ymm13, \
                           %%ymm2,%%ymm6,%%ymm10,%%ymm14, \
                           %%ymm3,%%ymm7,%%ymm11,%%ymm15, \
                           %%ymm11,96)
#define QUARTERROUND_YMM_2() \
    QUARTERROUND_INTEL_ASM_2(%%ymm0,%%ymm5,%%ymm10,%%ymm15, \
                             %%ymm1,%%ymm6,%%ymm11,%%ymm12, \
                             %%ymm2,%%ymm7,%%ymm8,%%ymm13, \
                             %%ymm3,%%ymm4,%%ymm9,%%ymm14, \
                             %%ymm11,96)
wolfSSL | 15:117db924cf7c | 283 | |
wolfSSL | 15:117db924cf7c | 284 | /** |
wolfSSL | 15:117db924cf7c | 285 | * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version |
wolfSSL | 15:117db924cf7c | 286 | * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. |
wolfSSL | 15:117db924cf7c | 287 | */ |
wolfSSL | 15:117db924cf7c | 288 | int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) |
wolfSSL | 15:117db924cf7c | 289 | { |
wolfSSL | 15:117db924cf7c | 290 | word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ |
wolfSSL | 15:117db924cf7c | 291 | |
wolfSSL | 15:117db924cf7c | 292 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 293 | word32 i; |
wolfSSL | 15:117db924cf7c | 294 | printf("NONCE : "); |
wolfSSL | 15:117db924cf7c | 295 | for (i = 0; i < CHACHA_IV_BYTES; i++) { |
wolfSSL | 15:117db924cf7c | 296 | printf("%02x", inIv[i]); |
wolfSSL | 15:117db924cf7c | 297 | } |
wolfSSL | 15:117db924cf7c | 298 | printf("\n\n"); |
wolfSSL | 15:117db924cf7c | 299 | #endif |
wolfSSL | 15:117db924cf7c | 300 | |
wolfSSL | 15:117db924cf7c | 301 | if (ctx == NULL) |
wolfSSL | 15:117db924cf7c | 302 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 303 | |
wolfSSL | 15:117db924cf7c | 304 | XMEMCPY(temp, inIv, CHACHA_IV_BYTES); |
wolfSSL | 15:117db924cf7c | 305 | |
wolfSSL | 15:117db924cf7c | 306 | ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ |
wolfSSL | 15:117db924cf7c | 307 | ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ |
wolfSSL | 15:117db924cf7c | 308 | ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ |
wolfSSL | 15:117db924cf7c | 309 | ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ |
wolfSSL | 15:117db924cf7c | 310 | |
wolfSSL | 15:117db924cf7c | 311 | return 0; |
wolfSSL | 15:117db924cf7c | 312 | } |
wolfSSL | 15:117db924cf7c | 313 | |
/* The ASCII string "expand 32-byte k" as four little-endian 32-bit words */
static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
/* The ASCII string "expand 16-byte k" as four little-endian 32-bit words */
static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
wolfSSL | 15:117db924cf7c | 318 | |
wolfSSL | 15:117db924cf7c | 319 | /** |
wolfSSL | 15:117db924cf7c | 320 | * Key setup. 8 word iv (nonce) |
wolfSSL | 15:117db924cf7c | 321 | */ |
wolfSSL | 15:117db924cf7c | 322 | int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) |
wolfSSL | 15:117db924cf7c | 323 | { |
wolfSSL | 15:117db924cf7c | 324 | const word32* constants; |
wolfSSL | 15:117db924cf7c | 325 | const byte* k; |
wolfSSL | 15:117db924cf7c | 326 | |
wolfSSL | 15:117db924cf7c | 327 | #ifdef XSTREAM_ALIGN |
wolfSSL | 15:117db924cf7c | 328 | word32 alignKey[8]; |
wolfSSL | 15:117db924cf7c | 329 | #endif |
wolfSSL | 15:117db924cf7c | 330 | |
wolfSSL | 15:117db924cf7c | 331 | if (ctx == NULL) |
wolfSSL | 15:117db924cf7c | 332 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 333 | |
wolfSSL | 15:117db924cf7c | 334 | if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) |
wolfSSL | 15:117db924cf7c | 335 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 336 | |
wolfSSL | 15:117db924cf7c | 337 | #ifdef XSTREAM_ALIGN |
wolfSSL | 15:117db924cf7c | 338 | if ((wolfssl_word)key % 4) { |
wolfSSL | 15:117db924cf7c | 339 | WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); |
wolfSSL | 15:117db924cf7c | 340 | XMEMCPY(alignKey, key, keySz); |
wolfSSL | 15:117db924cf7c | 341 | k = (byte*)alignKey; |
wolfSSL | 15:117db924cf7c | 342 | } |
wolfSSL | 15:117db924cf7c | 343 | else { |
wolfSSL | 15:117db924cf7c | 344 | k = key; |
wolfSSL | 15:117db924cf7c | 345 | } |
wolfSSL | 15:117db924cf7c | 346 | #else |
wolfSSL | 15:117db924cf7c | 347 | k = key; |
wolfSSL | 15:117db924cf7c | 348 | #endif /* XSTREAM_ALIGN */ |
wolfSSL | 15:117db924cf7c | 349 | |
wolfSSL | 15:117db924cf7c | 350 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 351 | word32 i; |
wolfSSL | 15:117db924cf7c | 352 | printf("ChaCha key used :\n"); |
wolfSSL | 15:117db924cf7c | 353 | for (i = 0; i < keySz; i++) { |
wolfSSL | 15:117db924cf7c | 354 | printf("%02x", key[i]); |
wolfSSL | 15:117db924cf7c | 355 | if ((i + 1) % 8 == 0) |
wolfSSL | 15:117db924cf7c | 356 | printf("\n"); |
wolfSSL | 15:117db924cf7c | 357 | } |
wolfSSL | 15:117db924cf7c | 358 | printf("\n\n"); |
wolfSSL | 15:117db924cf7c | 359 | #endif |
wolfSSL | 15:117db924cf7c | 360 | |
wolfSSL | 15:117db924cf7c | 361 | ctx->X[4] = U8TO32_LITTLE(k + 0); |
wolfSSL | 15:117db924cf7c | 362 | ctx->X[5] = U8TO32_LITTLE(k + 4); |
wolfSSL | 15:117db924cf7c | 363 | ctx->X[6] = U8TO32_LITTLE(k + 8); |
wolfSSL | 15:117db924cf7c | 364 | ctx->X[7] = U8TO32_LITTLE(k + 12); |
wolfSSL | 15:117db924cf7c | 365 | if (keySz == CHACHA_MAX_KEY_SZ) { |
wolfSSL | 15:117db924cf7c | 366 | k += 16; |
wolfSSL | 15:117db924cf7c | 367 | constants = sigma; |
wolfSSL | 15:117db924cf7c | 368 | } |
wolfSSL | 15:117db924cf7c | 369 | else { |
wolfSSL | 15:117db924cf7c | 370 | constants = tau; |
wolfSSL | 15:117db924cf7c | 371 | } |
wolfSSL | 15:117db924cf7c | 372 | ctx->X[ 8] = U8TO32_LITTLE(k + 0); |
wolfSSL | 15:117db924cf7c | 373 | ctx->X[ 9] = U8TO32_LITTLE(k + 4); |
wolfSSL | 15:117db924cf7c | 374 | ctx->X[10] = U8TO32_LITTLE(k + 8); |
wolfSSL | 15:117db924cf7c | 375 | ctx->X[11] = U8TO32_LITTLE(k + 12); |
wolfSSL | 15:117db924cf7c | 376 | ctx->X[ 0] = constants[0]; |
wolfSSL | 15:117db924cf7c | 377 | ctx->X[ 1] = constants[1]; |
wolfSSL | 15:117db924cf7c | 378 | ctx->X[ 2] = constants[2]; |
wolfSSL | 15:117db924cf7c | 379 | ctx->X[ 3] = constants[3]; |
wolfSSL | 15:117db924cf7c | 380 | |
wolfSSL | 15:117db924cf7c | 381 | return 0; |
wolfSSL | 15:117db924cf7c | 382 | } |
wolfSSL | 15:117db924cf7c | 383 | |
wolfSSL | 15:117db924cf7c | 384 | /** |
wolfSSL | 15:117db924cf7c | 385 | * Converts word into bytes with rotations having been done. |
wolfSSL | 15:117db924cf7c | 386 | */ |
wolfSSL | 15:117db924cf7c | 387 | static WC_INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS], |
wolfSSL | 15:117db924cf7c | 388 | const word32 input[CHACHA_CHUNK_WORDS]) |
wolfSSL | 15:117db924cf7c | 389 | { |
wolfSSL | 15:117db924cf7c | 390 | word32 x[CHACHA_CHUNK_WORDS]; |
wolfSSL | 15:117db924cf7c | 391 | word32 i; |
wolfSSL | 15:117db924cf7c | 392 | |
wolfSSL | 15:117db924cf7c | 393 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 15:117db924cf7c | 394 | x[i] = input[i]; |
wolfSSL | 15:117db924cf7c | 395 | } |
wolfSSL | 15:117db924cf7c | 396 | |
wolfSSL | 15:117db924cf7c | 397 | for (i = (ROUNDS); i > 0; i -= 2) { |
wolfSSL | 15:117db924cf7c | 398 | QUARTERROUND(0, 4, 8, 12) |
wolfSSL | 15:117db924cf7c | 399 | QUARTERROUND(1, 5, 9, 13) |
wolfSSL | 15:117db924cf7c | 400 | QUARTERROUND(2, 6, 10, 14) |
wolfSSL | 15:117db924cf7c | 401 | QUARTERROUND(3, 7, 11, 15) |
wolfSSL | 15:117db924cf7c | 402 | QUARTERROUND(0, 5, 10, 15) |
wolfSSL | 15:117db924cf7c | 403 | QUARTERROUND(1, 6, 11, 12) |
wolfSSL | 15:117db924cf7c | 404 | QUARTERROUND(2, 7, 8, 13) |
wolfSSL | 15:117db924cf7c | 405 | QUARTERROUND(3, 4, 9, 14) |
wolfSSL | 15:117db924cf7c | 406 | } |
wolfSSL | 15:117db924cf7c | 407 | |
wolfSSL | 15:117db924cf7c | 408 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 15:117db924cf7c | 409 | x[i] = PLUS(x[i], input[i]); |
wolfSSL | 15:117db924cf7c | 410 | } |
wolfSSL | 15:117db924cf7c | 411 | |
wolfSSL | 15:117db924cf7c | 412 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 15:117db924cf7c | 413 | output[i] = LITTLE32(x[i]); |
wolfSSL | 15:117db924cf7c | 414 | } |
wolfSSL | 15:117db924cf7c | 415 | } |
wolfSSL | 15:117db924cf7c | 416 | |
wolfSSL | 15:117db924cf7c | 417 | |
wolfSSL | 15:117db924cf7c | 418 | #ifdef USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 15:117db924cf7c | 419 | |
/* Expands to the assembler text for two quarter rounds carried out in step
 * on 32-bit registers, with the instructions of the two interleaved.
 *
 *   r11,r12,r13,r14  a, b, c, d words of the first quarter round
 *   r21,r22,r23,r24  a, b, c, d words of the second quarter round
 *
 * Each step is add, xor, rotate left; the rotate amounts are 16, 12, 8, 7.
 *
 * The last line of the definition ends in a line continuation, so the line
 * that follows the macro has to stay empty. */
#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
    "addl "#r12", "#r11"\n\t" \
    "addl "#r22", "#r21"\n\t" \
    "xorl "#r11", "#r14"\n\t" \
    "xorl "#r21", "#r24"\n\t" \
    "roll $16, "#r14"\n\t" \
    "roll $16, "#r24"\n\t" \
    "addl "#r14", "#r13"\n\t" \
    "addl "#r24", "#r23"\n\t" \
    "xorl "#r13", "#r12"\n\t" \
    "xorl "#r23", "#r22"\n\t" \
    "roll $12, "#r12"\n\t" \
    "roll $12, "#r22"\n\t" \
    "addl "#r12", "#r11"\n\t" \
    "addl "#r22", "#r21"\n\t" \
    "xorl "#r11", "#r14"\n\t" \
    "xorl "#r21", "#r24"\n\t" \
    "roll $8, "#r14"\n\t" \
    "roll $8, "#r24"\n\t" \
    "addl "#r14", "#r13"\n\t" \
    "addl "#r24", "#r23"\n\t" \
    "xorl "#r13", "#r12"\n\t" \
    "xorl "#r23", "#r22"\n\t" \
    "roll $7, "#r12"\n\t" \
    "roll $7, "#r22"\n\t" \
wolfSSL | 15:117db924cf7c | 445 | |
/* Expands to the assembler text that runs the rounds over the 16 word state
 * at the %[input] operand, using general purpose registers.
 *
 * Layout while the code runs:
 *   eax, ebx, ecx, edx    state words 0..3
 *   r8d .. r11d           state words 4..7
 *   r12d .. r15d          state words 12..15
 *   8..23(%rsp)           state words 8..11; esi/edi hold the pair that is
 *                         currently being worked on
 *   (%rsp)                one byte loop counter, starts at 10
 *   24(%rsp), 32(%rsp)    saved rdi and rsi
 *
 * Every loop pass performs eight quarter rounds (two per QUARTERROUND_2_X64),
 * first on the word groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) and
 * then on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14).
 *
 * After the loop rsi and rdi are restored and the input words 0..7 and
 * 12..15 are added to their registers. Words 8..11 are left on the stack
 * without the input added, and the 40 bytes taken from the stack at the
 * start are not given back here; both are left to the code that follows
 * this macro.
 *
 * The last line of the definition ends in a line continuation, so the line
 * that follows the macro has to stay empty. */
#define CHACHA_CRYPT_X64() \
    "subq $40, %%rsp\n\t" \
    "movq 32(%[input]), %%rax\n\t" \
    "movq 40(%[input]), %%rdx\n\t" \
    "movq %%rax, 8(%%rsp)\n\t" \
    "movq %%rdx, 16(%%rsp)\n\t" \
    "movl 0(%[input]), %%eax\n\t" \
    "movl 4(%[input]), %%ebx\n\t" \
    "movl 8(%[input]), %%ecx\n\t" \
    "movl 12(%[input]), %%edx\n\t" \
    "movl 16(%[input]), %%r8d\n\t" \
    "movl 20(%[input]), %%r9d\n\t" \
    "movl 24(%[input]), %%r10d\n\t" \
    "movl 28(%[input]), %%r11d\n\t" \
    "movl 48(%[input]), %%r12d\n\t" \
    "movl 52(%[input]), %%r13d\n\t" \
    "movl 56(%[input]), %%r14d\n\t" \
    "movl 60(%[input]), %%r15d\n\t" \
    "movb $10, (%%rsp)\n\t" \
    "movq %%rsi, 32(%%rsp)\n\t" \
    "movq %%rdi, 24(%%rsp)\n\t" \
    "movl 8(%%rsp), %%esi\n\t" \
    "movl 12(%%rsp), %%edi\n\t" \
    "\n" \
    "1:\n\t" \
    QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \
                       %%ebx, %%r9d, %%edi, %%r13d) \
    "movl %%esi, 8(%%rsp)\n\t" \
    "movl %%edi, 12(%%rsp)\n\t" \
    "movl 16(%%rsp), %%esi\n\t" \
    "movl 20(%%rsp), %%edi\n\t" \
    QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \
                       %%edx, %%r11d, %%edi, %%r15d) \
    QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \
                       %%ebx, %%r10d, %%edi, %%r12d) \
    "movl %%esi, 16(%%rsp)\n\t" \
    "movl %%edi, 20(%%rsp)\n\t" \
    "movl 8(%%rsp), %%esi\n\t" \
    "movl 12(%%rsp), %%edi\n\t" \
    QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \
                       %%edx, %%r8d, %%edi, %%r14d) \
    "decb (%%rsp)\n\t" \
    "jnz 1b\n\t" \
    "movl %%esi, 8(%%rsp)\n\t" \
    "movl %%edi, 12(%%rsp)\n\t" \
    "movq 32(%%rsp), %%rsi\n\t" \
    "movq 24(%%rsp), %%rdi\n\t" \
    "addl 0(%[input]), %%eax\n\t" \
    "addl 4(%[input]), %%ebx\n\t" \
    "addl 8(%[input]), %%ecx\n\t" \
    "addl 12(%[input]), %%edx\n\t" \
    "addl 16(%[input]), %%r8d\n\t" \
    "addl 20(%[input]), %%r9d\n\t" \
    "addl 24(%[input]), %%r10d\n\t" \
    "addl 28(%[input]), %%r11d\n\t" \
    "addl 48(%[input]), %%r12d\n\t" \
    "addl 52(%[input]), %%r13d\n\t" \
    "addl 56(%[input]), %%r14d\n\t" \
    "addl 60(%[input]), %%r15d\n\t" \
wolfSSL | 15:117db924cf7c | 505 | |
/* CHACHA_PARTIAL_CHUNK_X64: a single inline-asm statement that writes one
 * block of key stream into the scratch array x and then XORs the first
 * `bytes` bytes of it with the message into the destination.
 *
 * Names captured from the expansion site: ctx, x, c, m, bytes.
 * Operand naming is easy to misread: asm operand [c] is bound to the scratch
 * array x, while the C variable c (the destination) is operand [output].
 *
 * Sequence:
 *  - CHACHA_CRYPT_X64() (defined earlier in the file) runs first.  This macro
 *    then expects finished words 0-7 and 12-15 in eax, ebx, ecx, edx and
 *    r8d-r15d, and words 8-11, before the input words are added, at
 *    8..20(%rsp).
 *  - Words 0-7 and 12-15 are stored to x.  Words 8-11 are loaded from the
 *    stack, the words at byte offsets 32..44 of ctx->X are added, and the
 *    sums are stored to x at offsets 32..44.
 *  - The 32-bit word at byte offset 48 of ctx->X is incremented by one.
 *  - %rsp is raised by 40 before any "m" operand ([output], [m], [bytes]) is
 *    read.  The matching reservation is presumably made inside
 *    CHACHA_CRYPT_X64; it is not visible from here - confirm.
 *  - XOR loop with the byte index in rdx: the first (bytes & 7) bytes are
 *    handled one at a time (label 2), the remainder eight at a time
 *    (label 4).  The `je 3f` that follows `jne 2b` is always taken; it only
 *    steps over the body at label 4.
 *
 * Requirements on bytes that follow from the loop: the exit test at label 3
 * is an equality compare against bytes, so bytes must be non-zero; and only
 * 64 bytes of x are written above, so bytes must not exceed 64.
 *
 * NOTE(review): %rsp is changed inside the asm without the compiler's
 * knowledge.  Confirm that the stack area used below the original %rsp cannot
 * overlap compiler-managed storage (for example a red zone) on the target.
 */
wolfSSL | 15:117db924cf7c | 506 | #define CHACHA_PARTIAL_CHUNK_X64() \ |
wolfSSL | 15:117db924cf7c | 507 | __asm__ __volatile__ ( \ |
wolfSSL | 15:117db924cf7c | 508 | CHACHA_CRYPT_X64() \ |
wolfSSL | 15:117db924cf7c | 509 | "movl %%eax , 0(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 510 | "movl %%ebx , 4(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 511 | "movl %%ecx , 8(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 512 | "movl %%edx , 12(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 513 | "movl %%r8d , 16(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 514 | "movl %%r9d , 20(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 515 | "movl %%r10d, 24(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 516 | "movl %%r11d, 28(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 517 | "movl %%r12d, 48(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 518 | "movl %%r13d, 52(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 519 | "movl %%r14d, 56(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 520 | "movl %%r15d, 60(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 521 | "movl 8(%%rsp), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 522 | "movl 12(%%rsp), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 523 | "movl 16(%%rsp), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 524 | "movl 20(%%rsp), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 525 | "addl 32(%[input]), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 526 | "addl 36(%[input]), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 527 | "addl 40(%[input]), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 528 | "addl 44(%[input]), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 529 | "movl %%eax , 32(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 530 | "movl %%ebx , 36(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 531 | "movl %%ecx , 40(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 532 | "movl %%edx , 44(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 533 | "addl $1, 48(%[input])\n\t" \ |
wolfSSL | 15:117db924cf7c | 534 | "addq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 535 | "movq %[output], %%rax\n\t" \ |
wolfSSL | 15:117db924cf7c | 536 | "movq %[m], %%rbx\n\t" \ |
wolfSSL | 15:117db924cf7c | 537 | "movl %[bytes], %%r8d\n\t" \ |
wolfSSL | 15:117db924cf7c | 538 | "xorq %%rdx, %%rdx\n\t" \ |
wolfSSL | 15:117db924cf7c | 539 | "movl %%r8d, %%r9d\n\t" \ |
wolfSSL | 15:117db924cf7c | 540 | "andl $7, %%r9d\n\t" \ |
wolfSSL | 15:117db924cf7c | 541 | "jz 4f\n\t" \ |
wolfSSL | 15:117db924cf7c | 542 | "\n" \ |
wolfSSL | 15:117db924cf7c | 543 | "2:\n\t" \ |
wolfSSL | 15:117db924cf7c | 544 | "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 545 | "xorb (%%rbx,%%rdx,1), %%cl\n\t" \ |
wolfSSL | 15:117db924cf7c | 546 | "movb %%cl, (%%rax,%%rdx,1)\n\t" \ |
wolfSSL | 15:117db924cf7c | 547 | "incl %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 548 | "cmpl %%r9d, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 549 | "jne 2b\n\t" \ |
wolfSSL | 15:117db924cf7c | 550 | "je 3f\n\t" \ |
wolfSSL | 15:117db924cf7c | 551 | "\n" \ |
wolfSSL | 15:117db924cf7c | 552 | "4:\n\t" \ |
wolfSSL | 15:117db924cf7c | 553 | "movq (%[c],%%rdx,1), %%rcx\n\t" \ |
wolfSSL | 15:117db924cf7c | 554 | "xorq (%%rbx,%%rdx,1), %%rcx\n\t" \ |
wolfSSL | 15:117db924cf7c | 555 | "movq %%rcx, (%%rax,%%rdx,1)\n\t" \ |
wolfSSL | 15:117db924cf7c | 556 | "addl $8, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 557 | "\n" \ |
wolfSSL | 15:117db924cf7c | 558 | "3:\n\t" \ |
wolfSSL | 15:117db924cf7c | 559 | "cmpl %%r8d, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 560 | "jne 4b\n\t" \ |
wolfSSL | 15:117db924cf7c | 561 | : \ |
wolfSSL | 15:117db924cf7c | 562 | : [input] "r" (ctx->X), [c] "r" (x), \ |
wolfSSL | 15:117db924cf7c | 563 | [output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \ |
wolfSSL | 15:117db924cf7c | 564 | : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ |
wolfSSL | 15:117db924cf7c | 565 | "r14", "r15", "memory" \ |
wolfSSL | 15:117db924cf7c | 566 | ) |
wolfSSL | 15:117db924cf7c | 567 | |
wolfSSL | 15:117db924cf7c | 568 | |
/* CHACHA_CHUNK_X64: a single inline-asm statement that encrypts one full
 * block: 64 bytes are read at m, XORed with one block of key stream and
 * written to c (byte offsets 0..63 of each).
 *
 * Names captured from the expansion site: ctx, c, m.  Here asm operand [c]
 * is bound to the destination pointer c itself.
 *
 * Sequence:
 *  - CHACHA_CRYPT_X64() (defined earlier in the file) runs first.  This macro
 *    then expects finished words 0-7 and 12-15 in eax, ebx, ecx, edx and
 *    r8d-r15d, and words 8-11, before the input words are added, at
 *    8..20(%rsp).
 *  - rsi is saved at 32(%rsp) and then used as the message pointer.  Each
 *    read of the "m" operand %[m] is bracketed by addq/subq $40 on %rsp,
 *    presumably so that an %rsp-relative address chosen by the compiler
 *    resolves correctly.
 *  - Words 0-7 and 12-15 are XORed with the message.  rsi is restored before
 *    %[c] is used, and the results are stored to c.
 *  - r8 is free once r8d has been stored, so it carries the message pointer
 *    for the second part: words 8-11 are loaded from the stack, the words at
 *    byte offsets 32..44 of ctx->X are added, the message is XORed in and the
 *    results are stored to c at offsets 32..44.
 *  - The 32-bit word at byte offset 48 of ctx->X is incremented by one and
 *    %rsp is raised by 40 for the last time.
 *
 * NOTE(review): %rsp is changed inside the asm without the compiler's
 * knowledge.  Confirm that the 40-byte area used below the original %rsp
 * cannot overlap compiler-managed storage (for example a red zone) on the
 * target.
 */
wolfSSL | 15:117db924cf7c | 569 | #define CHACHA_CHUNK_X64() \ |
wolfSSL | 15:117db924cf7c | 570 | __asm__ __volatile__ ( \ |
wolfSSL | 15:117db924cf7c | 571 | CHACHA_CRYPT_X64() \ |
wolfSSL | 15:117db924cf7c | 572 | "movq %%rsi, 32(%%rsp)\n\t" \ |
wolfSSL | 15:117db924cf7c | 573 | "addq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 574 | "movq %[m], %%rsi\n\t" \ |
wolfSSL | 15:117db924cf7c | 575 | "subq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 576 | "xorl 0(%%rsi), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 577 | "xorl 4(%%rsi), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 578 | "xorl 8(%%rsi), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 579 | "xorl 12(%%rsi), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 580 | "xorl 16(%%rsi), %%r8d\n\t" \ |
wolfSSL | 15:117db924cf7c | 581 | "xorl 20(%%rsi), %%r9d\n\t" \ |
wolfSSL | 15:117db924cf7c | 582 | "xorl 24(%%rsi), %%r10d\n\t" \ |
wolfSSL | 15:117db924cf7c | 583 | "xorl 28(%%rsi), %%r11d\n\t" \ |
wolfSSL | 15:117db924cf7c | 584 | "xorl 48(%%rsi), %%r12d\n\t" \ |
wolfSSL | 15:117db924cf7c | 585 | "xorl 52(%%rsi), %%r13d\n\t" \ |
wolfSSL | 15:117db924cf7c | 586 | "xorl 56(%%rsi), %%r14d\n\t" \ |
wolfSSL | 15:117db924cf7c | 587 | "xorl 60(%%rsi), %%r15d\n\t" \ |
wolfSSL | 15:117db924cf7c | 588 | "movq 32(%%rsp), %%rsi\n\t" \ |
wolfSSL | 15:117db924cf7c | 589 | "movl %%eax , 0(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 590 | "movl %%ebx , 4(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 591 | "movl %%ecx , 8(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 592 | "movl %%edx , 12(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 593 | "movl %%r8d , 16(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 594 | "movl %%r9d , 20(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 595 | "movl %%r10d, 24(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 596 | "movl %%r11d, 28(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 597 | "movl %%r12d, 48(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 598 | "movl %%r13d, 52(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 599 | "movl %%r14d, 56(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 600 | "movl %%r15d, 60(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 601 | "addq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 602 | "movq %[m], %%r8\n\t" \ |
wolfSSL | 15:117db924cf7c | 603 | "subq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 604 | "movl 8(%%rsp), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 605 | "movl 12(%%rsp), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 606 | "movl 16(%%rsp), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 607 | "movl 20(%%rsp), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 608 | "addl 32(%[input]), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 609 | "addl 36(%[input]), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 610 | "addl 40(%[input]), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 611 | "addl 44(%[input]), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 612 | "xorl 32(%%r8), %%eax\n\t" \ |
wolfSSL | 15:117db924cf7c | 613 | "xorl 36(%%r8), %%ebx\n\t" \ |
wolfSSL | 15:117db924cf7c | 614 | "xorl 40(%%r8), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 615 | "xorl 44(%%r8), %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 616 | "movl %%eax , 32(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 617 | "movl %%ebx , 36(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 618 | "movl %%ecx , 40(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 619 | "movl %%edx , 44(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 620 | "addl $1, 48(%[input])\n\t" \ |
wolfSSL | 15:117db924cf7c | 621 | "addq $40, %%rsp\n\t" \ |
wolfSSL | 15:117db924cf7c | 622 | : \ |
wolfSSL | 15:117db924cf7c | 623 | : [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \ |
wolfSSL | 15:117db924cf7c | 624 | : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ |
wolfSSL | 15:117db924cf7c | 625 | "r14", "r15", "memory" \ |
wolfSSL | 15:117db924cf7c | 626 | ) |
wolfSSL | 15:117db924cf7c | 627 | |
wolfSSL | 15:117db924cf7c | 628 | |
/* Encrypt (XOR with key stream) `bytes` bytes from m into c using the plain
 * x86-64 inline-asm macros.
 *
 * ctx    cipher state; the macros read ctx->X and add one to the 32-bit word
 *        at byte offset 48 of it for every block they generate.
 * m      input bytes.
 * c      output bytes.
 * bytes  number of bytes to process; zero returns immediately.
 *
 * The locals and parameters ctx, m, c, bytes and x are referenced by name
 * inside CHACHA_CHUNK_X64() and CHACHA_PARTIAL_CHUNK_X64(), so they must keep
 * these exact names.
 *
 * A trailing partial block still generates a whole block of key stream and
 * advances the state word by one; the unused remainder of that block lives
 * only in the local x and is not carried over to a later call.
 */
wolfSSL | 15:117db924cf7c | 629 | static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 15:117db924cf7c | 630 | word32 bytes) |
wolfSSL | 15:117db924cf7c | 631 | { |
/* Scratch block that receives the key stream in the partial-chunk path. */
wolfSSL | 15:117db924cf7c | 632 | word32 x[CHACHA_CHUNK_WORDS]; |
wolfSSL | 15:117db924cf7c | 633 | |
wolfSSL | 15:117db924cf7c | 634 | if (bytes == 0) |
wolfSSL | 15:117db924cf7c | 635 | return; |
wolfSSL | 15:117db924cf7c | 636 | |
/* Whole blocks: the macro handles byte offsets 0..63 of m and c, then the
 * pointers and the count move on by CHACHA_CHUNK_BYTES (defined elsewhere,
 * expected to be 64). */
wolfSSL | 15:117db924cf7c | 637 | for (; bytes >= CHACHA_CHUNK_BYTES;) { |
wolfSSL | 15:117db924cf7c | 638 | CHACHA_CHUNK_X64(); |
wolfSSL | 15:117db924cf7c | 639 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 640 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 641 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 642 | } |
/* Remainder: the partial macro must not be expanded with bytes == 0, because
 * its loop only terminates on equality with bytes. */
wolfSSL | 15:117db924cf7c | 643 | if (bytes > 0) { |
wolfSSL | 15:117db924cf7c | 644 | CHACHA_PARTIAL_CHUNK_X64(); |
wolfSSL | 15:117db924cf7c | 645 | } |
wolfSSL | 15:117db924cf7c | 646 | } |
wolfSSL | 15:117db924cf7c | 647 | |
/* Byte-selection masks passed to pshufb as the %[rotl8] / %[rotl16] asm
 * operands.  Each mask permutes the four bytes inside every 32-bit lane so
 * that the lane ends up rotated left by 8 bits (rotl8) or by 16 bits
 * (rotl16); e.g. the low four selector bytes of rotl8 are 03,00,01,02.
 * The initialisers give the two 64-bit halves of the vector, low half first;
 * this relies on the compiler accepting a brace initialiser for __m128i.
 */
wolfSSL | 15:117db924cf7c | 648 | #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) |
wolfSSL | 15:117db924cf7c | 649 | static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; |
wolfSSL | 15:117db924cf7c | 650 | static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; |
wolfSSL | 15:117db924cf7c | 651 | #endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */ |
wolfSSL | 15:117db924cf7c | 652 | |
wolfSSL | 15:117db924cf7c | 653 | #ifdef HAVE_INTEL_AVX1 |
/* QUARTERROUND_2_AVX: asm text (string literals only) for one double round
 * on a single block held as four rows: a = xmm0, b = xmm1, c = xmm2,
 * d = xmm3, with xmm4 as scratch.  All four 32-bit lanes are processed in
 * parallel.
 *
 * First half (rows as loaded):
 *   a += b; d ^= a; d <<<= 16;   (pshufb with %[rotl16])
 *   c += d; b ^= c; b <<<= 12;   (pslld 12 / psrld 20 / pxor)
 *   a += b; d ^= a; d <<<= 8;    (pshufb with %[rotl8])
 *   c += d; b ^= c; b <<<= 7;    (pslld 7 / psrld 25 / pxor)
 * Then pshufd rotates the lanes of b, c and d by one, two and three
 * positions (0x39, 0x4e, 0x93), the same four steps are repeated, and the
 * inverse shuffles (0x93, 0x4e, 0x39) put the lanes back.
 *
 * The enclosing asm statement must provide the %[rotl16] and %[rotl8]
 * operands and list xmm0-xmm4 as clobbered.  The instructions are written in
 * the two-operand (non-VEX) form.
 */
wolfSSL | 15:117db924cf7c | 654 | #define QUARTERROUND_2_AVX() \ |
wolfSSL | 15:117db924cf7c | 655 | "paddd %%xmm1, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 656 | "pxor %%xmm0, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 657 | "pshufb %[rotl16], %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 658 | "paddd %%xmm3, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 659 | "pxor %%xmm2, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 660 | "movdqa %%xmm1, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 661 | "pslld $12, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 662 | "psrld $20, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 663 | "pxor %%xmm4, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 664 | "paddd %%xmm1, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 665 | "pxor %%xmm0, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 666 | "pshufb %[rotl8], %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 667 | "paddd %%xmm3, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 668 | "pxor %%xmm2, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 669 | "movdqa %%xmm1, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 670 | "pslld $7, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 671 | "psrld $25, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 672 | "pxor %%xmm4, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 673 | "# Swap words for next round\n\t" \ |
wolfSSL | 15:117db924cf7c | 674 | "pshufd $0x39, %%xmm1, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 675 | "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 676 | "pshufd $0x93, %%xmm3, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 677 | "paddd %%xmm1, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 678 | "pxor %%xmm0, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 679 | "pshufb %[rotl16], %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 680 | "paddd %%xmm3, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 681 | "pxor %%xmm2, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 682 | "movdqa %%xmm1, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 683 | "pslld $12, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 684 | "psrld $20, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 685 | "pxor %%xmm4, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 686 | "paddd %%xmm1, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 687 | "pxor %%xmm0, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 688 | "pshufb %[rotl8], %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 689 | "paddd %%xmm3, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 690 | "pxor %%xmm2, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 691 | "movdqa %%xmm1, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 692 | "pslld $7, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 693 | "psrld $25, %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 694 | "pxor %%xmm4, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 695 | "# Swap words back\n\t" \ |
wolfSSL | 15:117db924cf7c | 696 | "pshufd $0x93, %%xmm1, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 697 | "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 698 | "pshufd $0x39, %%xmm3, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 699 | |
/* CHACHA_CRYPT_AVX: asm text (string literals only) that computes one block
 * of key stream from the 64 bytes at %[input] and leaves it in xmm0-xmm3.
 *
 *  - Loads the four 16-byte rows of the input into xmm0-xmm3 with unaligned
 *    loads.
 *  - Uses al as a down-counter starting at 10 and runs QUARTERROUND_2_AVX()
 *    once per iteration (local label 1).
 *  - Reloads the original input into xmm4-xmm7 and adds it lane-wise to
 *    xmm0-xmm3.
 *
 * The enclosing asm statement must supply the [input], [rotl8] and [rotl16]
 * operands, must not reuse local label 1, and must list rax/eax and
 * xmm0-xmm7 as clobbered.
 */
wolfSSL | 15:117db924cf7c | 700 | #define CHACHA_CRYPT_AVX() \ |
wolfSSL | 15:117db924cf7c | 701 | "movdqu 0(%[input]), %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 702 | "movdqu 16(%[input]), %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 703 | "movdqu 32(%[input]), %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 704 | "movdqu 48(%[input]), %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 705 | "movb $10, %%al\n\t" \ |
wolfSSL | 15:117db924cf7c | 706 | "\n" \ |
wolfSSL | 15:117db924cf7c | 707 | "1:\n\t" \ |
wolfSSL | 15:117db924cf7c | 708 | QUARTERROUND_2_AVX() \ |
wolfSSL | 15:117db924cf7c | 709 | "decb %%al\n\t" \ |
wolfSSL | 15:117db924cf7c | 710 | "jnz 1b\n\t" \ |
wolfSSL | 15:117db924cf7c | 711 | "movdqu 0(%[input]), %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 712 | "movdqu 16(%[input]), %%xmm5\n\t" \ |
wolfSSL | 15:117db924cf7c | 713 | "movdqu 32(%[input]), %%xmm6\n\t" \ |
wolfSSL | 15:117db924cf7c | 714 | "movdqu 48(%[input]), %%xmm7\n\t" \ |
wolfSSL | 15:117db924cf7c | 715 | "paddd %%xmm4, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 716 | "paddd %%xmm5, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 717 | "paddd %%xmm6, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 718 | "paddd %%xmm7, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 719 | |
/* CHACHA_PARTIAL_CHUNK_AVX: a single inline-asm statement that writes one
 * block of key stream into the scratch array x and XORs the first `bytes`
 * bytes of it with the message into the destination.
 *
 * Names captured from the expansion site: ctx, x, c, m, bytes.
 * Operand naming is easy to misread: asm operand [c] is bound to the scratch
 * array x, while the C variable c (the destination) is operand [output].
 *
 * Sequence:
 *  - CHACHA_CRYPT_AVX() leaves the key stream in xmm0-xmm3; it is stored to
 *    x with unaligned stores.
 *  - The 32-bit word at byte offset 48 of ctx->X is incremented by one.
 *  - XOR loop with the byte index in rdx, r8d = bytes and r9d = bytes & 7:
 *    the first (bytes & 7) bytes are handled one at a time (label 2), the
 *    remainder eight at a time (label 4).  The `je 3f` that follows `jne 2b`
 *    is always taken; it only steps over the body at label 4.
 *
 * Requirements on bytes that follow from the loop: the exit test at label 3
 * is an equality compare against bytes, so bytes must be non-zero; and only
 * 64 bytes of x are written above, so bytes must not exceed 64.
 */
wolfSSL | 15:117db924cf7c | 720 | #define CHACHA_PARTIAL_CHUNK_AVX() \ |
wolfSSL | 15:117db924cf7c | 721 | __asm__ __volatile__ ( \ |
wolfSSL | 15:117db924cf7c | 722 | CHACHA_CRYPT_AVX() \ |
wolfSSL | 15:117db924cf7c | 723 | "movdqu %%xmm0, 0(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 724 | "movdqu %%xmm1, 16(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 725 | "movdqu %%xmm2, 32(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 726 | "movdqu %%xmm3, 48(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 727 | "addl $1, 48(%[input])\n\t" \ |
wolfSSL | 15:117db924cf7c | 728 | "movl %[bytes], %%r8d\n\t" \ |
wolfSSL | 15:117db924cf7c | 729 | "xorq %%rdx, %%rdx\n\t" \ |
wolfSSL | 15:117db924cf7c | 730 | "movl %%r8d, %%r9d\n\t" \ |
wolfSSL | 15:117db924cf7c | 731 | "andl $7, %%r9d\n\t" \ |
wolfSSL | 15:117db924cf7c | 732 | "jz 4f\n\t" \ |
wolfSSL | 15:117db924cf7c | 733 | "\n" \ |
wolfSSL | 15:117db924cf7c | 734 | "2:\n\t" \ |
wolfSSL | 15:117db924cf7c | 735 | "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ |
wolfSSL | 15:117db924cf7c | 736 | "xorb (%[m],%%rdx,1), %%cl\n\t" \ |
wolfSSL | 15:117db924cf7c | 737 | "movb %%cl, (%[output],%%rdx,1)\n\t" \ |
wolfSSL | 15:117db924cf7c | 738 | "incl %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 739 | "cmpl %%r9d, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 740 | "jne 2b\n\t" \ |
wolfSSL | 15:117db924cf7c | 741 | "je 3f\n\t" \ |
wolfSSL | 15:117db924cf7c | 742 | "\n" \ |
wolfSSL | 15:117db924cf7c | 743 | "4:\n\t" \ |
wolfSSL | 15:117db924cf7c | 744 | "movq (%[c],%%rdx,1), %%rcx\n\t" \ |
wolfSSL | 15:117db924cf7c | 745 | "xorq (%[m],%%rdx,1), %%rcx\n\t" \ |
wolfSSL | 15:117db924cf7c | 746 | "movq %%rcx, (%[output],%%rdx,1)\n\t" \ |
wolfSSL | 15:117db924cf7c | 747 | "addl $8, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 748 | "\n" \ |
wolfSSL | 15:117db924cf7c | 749 | "3:\n\t" \ |
wolfSSL | 15:117db924cf7c | 750 | "cmpl %%r8d, %%edx\n\t" \ |
wolfSSL | 15:117db924cf7c | 751 | "jne 4b\n\t" \ |
wolfSSL | 15:117db924cf7c | 752 | : \ |
wolfSSL | 15:117db924cf7c | 753 | : [input] "r" (ctx->X), [c] "r" (x), \ |
wolfSSL | 15:117db924cf7c | 754 | [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \ |
wolfSSL | 15:117db924cf7c | 755 | [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ |
wolfSSL | 15:117db924cf7c | 756 | : "eax", "ecx", "edx", "r8", "r9", "memory", \ |
wolfSSL | 15:117db924cf7c | 757 | "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ |
wolfSSL | 15:117db924cf7c | 758 | ) |
wolfSSL | 15:117db924cf7c | 759 | |
wolfSSL | 15:117db924cf7c | 760 | |
/* CHACHA_CHUNK_AVX: a single inline-asm statement that encrypts one full
 * block: 64 bytes are read at m, XORed with one block of key stream and
 * written to c.
 *
 * Names captured from the expansion site: ctx, c, m.  Here asm operand [c]
 * is bound to the destination pointer c itself.
 *
 *  - CHACHA_CRYPT_AVX() leaves the key stream in xmm0-xmm3.
 *  - The message is loaded into xmm4-xmm7 (unaligned), XORed into
 *    xmm0-xmm3, and the result is stored to c (unaligned).
 *  - The 32-bit word at byte offset 48 of ctx->X is incremented by one.
 *
 * rax is listed as clobbered because CHACHA_CRYPT_AVX() uses al as its loop
 * counter.
 */
wolfSSL | 15:117db924cf7c | 761 | #define CHACHA_CHUNK_AVX() \ |
wolfSSL | 15:117db924cf7c | 762 | __asm__ __volatile__ ( \ |
wolfSSL | 15:117db924cf7c | 763 | CHACHA_CRYPT_AVX() \ |
wolfSSL | 15:117db924cf7c | 764 | "movdqu 0(%[m]), %%xmm4\n\t" \ |
wolfSSL | 15:117db924cf7c | 765 | "movdqu 16(%[m]), %%xmm5\n\t" \ |
wolfSSL | 15:117db924cf7c | 766 | "movdqu 32(%[m]), %%xmm6\n\t" \ |
wolfSSL | 15:117db924cf7c | 767 | "movdqu 48(%[m]), %%xmm7\n\t" \ |
wolfSSL | 15:117db924cf7c | 768 | "pxor %%xmm4, %%xmm0\n\t" \ |
wolfSSL | 15:117db924cf7c | 769 | "pxor %%xmm5, %%xmm1\n\t" \ |
wolfSSL | 15:117db924cf7c | 770 | "pxor %%xmm6, %%xmm2\n\t" \ |
wolfSSL | 15:117db924cf7c | 771 | "pxor %%xmm7, %%xmm3\n\t" \ |
wolfSSL | 15:117db924cf7c | 772 | "movdqu %%xmm0, 0(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 773 | "movdqu %%xmm1, 16(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 774 | "movdqu %%xmm2, 32(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 775 | "movdqu %%xmm3, 48(%[c])\n\t" \ |
wolfSSL | 15:117db924cf7c | 776 | "addl $1, 48(%[input])\n\t" \ |
wolfSSL | 15:117db924cf7c | 777 | : \ |
wolfSSL | 15:117db924cf7c | 778 | : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \ |
wolfSSL | 15:117db924cf7c | 779 | [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ |
wolfSSL | 15:117db924cf7c | 780 | : "rax", "memory", \ |
wolfSSL | 15:117db924cf7c | 781 | "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ |
wolfSSL | 15:117db924cf7c | 782 | ) |
wolfSSL | 15:117db924cf7c | 783 | |
wolfSSL | 15:117db924cf7c | 784 | CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, |
wolfSSL | 15:117db924cf7c | 785 | byte* c, word32 bytes) |
wolfSSL | 15:117db924cf7c | 786 | { |
wolfSSL | 15:117db924cf7c | 787 | ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 15:117db924cf7c | 788 | ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 15:117db924cf7c | 789 | word32 cnt = 0; |
wolfSSL | 15:117db924cf7c | 790 | static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL }; |
wolfSSL | 15:117db924cf7c | 791 | static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL }; |
wolfSSL | 15:117db924cf7c | 792 | |
wolfSSL | 15:117db924cf7c | 793 | if (bytes == 0) |
wolfSSL | 15:117db924cf7c | 794 | return; |
wolfSSL | 15:117db924cf7c | 795 | |
wolfSSL | 15:117db924cf7c | 796 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 797 | "movl %[bytes], %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 798 | "shrl $8, %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 799 | "jz L_end128\n\t" |
wolfSSL | 15:117db924cf7c | 800 | |
wolfSSL | 15:117db924cf7c | 801 | "vpshufd $0, (%[key]), %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 802 | "vpshufd $0, 4(%[key]), %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 803 | "vpshufd $0, 8(%[key]), %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 804 | "vpshufd $0, 12(%[key]), %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 805 | "vpshufd $0, 16(%[key]), %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 806 | "vpshufd $0, 20(%[key]), %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 807 | "vpshufd $0, 24(%[key]), %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 808 | "vpshufd $0, 28(%[key]), %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 809 | "vpshufd $0, 32(%[key]), %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 810 | "vpshufd $0, 36(%[key]), %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 811 | "vpshufd $0, 40(%[key]), %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 812 | "vpshufd $0, 44(%[key]), %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 813 | "vpshufd $0, 48(%[key]), %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 814 | "vpshufd $0, 52(%[key]), %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 815 | "vpshufd $0, 56(%[key]), %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 816 | "vpshufd $0, 60(%[key]), %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 817 | |
wolfSSL | 15:117db924cf7c | 818 | "vpaddd %[add], %%xmm12, %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 819 | |
wolfSSL | 15:117db924cf7c | 820 | "vmovdqa %%xmm0, (%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 821 | "vmovdqa %%xmm1, 16(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 822 | "vmovdqa %%xmm2, 32(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 823 | "vmovdqa %%xmm3, 48(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 824 | "vmovdqa %%xmm4, 64(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 825 | "vmovdqa %%xmm5, 80(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 826 | "vmovdqa %%xmm6, 96(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 827 | "vmovdqa %%xmm7, 112(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 828 | "vmovdqa %%xmm8, 128(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 829 | "vmovdqa %%xmm9, 144(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 830 | "vmovdqa %%xmm10, 160(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 831 | "vmovdqa %%xmm11, 176(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 832 | "vmovdqa %%xmm12, 192(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 833 | "vmovdqa %%xmm13, 208(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 834 | "vmovdqa %%xmm14, 224(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 835 | "vmovdqa %%xmm15, 240(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 836 | "\n" |
wolfSSL | 15:117db924cf7c | 837 | "L_enc128_loop:\n\t" |
wolfSSL | 15:117db924cf7c | 838 | "vmovdqa %%xmm11, 48(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 839 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 840 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 841 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 842 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 843 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 844 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 845 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 846 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 847 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 848 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 849 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 850 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 851 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 852 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 853 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 854 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 855 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 856 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 857 | QUARTERROUND_XMM() |
wolfSSL | 15:117db924cf7c | 858 | QUARTERROUND_XMM_2() |
wolfSSL | 15:117db924cf7c | 859 | "vmovdqa 48(%[x]), %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 860 | |
wolfSSL | 15:117db924cf7c | 861 | "vpaddd (%[X]), %%xmm0, %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 862 | "vpaddd 16(%[X]), %%xmm1, %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 863 | "vpaddd 32(%[X]), %%xmm2, %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 864 | "vpaddd 48(%[X]), %%xmm3, %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 865 | "vpaddd 64(%[X]), %%xmm4, %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 866 | "vpaddd 80(%[X]), %%xmm5, %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 867 | "vpaddd 96(%[X]), %%xmm6, %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 868 | "vpaddd 112(%[X]), %%xmm7, %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 869 | "vpaddd 128(%[X]), %%xmm8, %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 870 | "vpaddd 144(%[X]), %%xmm9, %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 871 | "vpaddd 160(%[X]), %%xmm10, %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 872 | "vpaddd 176(%[X]), %%xmm11, %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 873 | "vpaddd 192(%[X]), %%xmm12, %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 874 | "vpaddd 208(%[X]), %%xmm13, %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 875 | "vpaddd 224(%[X]), %%xmm14, %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 876 | "vpaddd 240(%[X]), %%xmm15, %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 877 | |
wolfSSL | 15:117db924cf7c | 878 | "vmovdqa %%xmm8, (%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 879 | "vmovdqa %%xmm9, 16(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 880 | "vmovdqa %%xmm10, 32(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 881 | "vmovdqa %%xmm11, 48(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 882 | "vmovdqa %%xmm12, 64(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 883 | "vmovdqa %%xmm13, 80(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 884 | "vmovdqa %%xmm14, 96(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 885 | "vmovdqa %%xmm15, 112(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 886 | |
wolfSSL | 15:117db924cf7c | 887 | "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 888 | "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 889 | "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 890 | "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 891 | "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 892 | "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 893 | "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 894 | "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 895 | "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 896 | "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 897 | "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 898 | "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 899 | "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 900 | "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 901 | "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 902 | "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 903 | "vmovdqu (%[in]), %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 904 | "vmovdqu 16(%[in]), %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 905 | "vmovdqu 64(%[in]), %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 906 | "vmovdqu 80(%[in]), %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 907 | "vmovdqu 128(%[in]), %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 908 | "vmovdqu 144(%[in]), %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 909 | "vmovdqu 192(%[in]), %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 910 | "vmovdqu 208(%[in]), %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 911 | "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 912 | "vpxor %%xmm9, %%xmm1, %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 913 | "vpxor %%xmm10, %%xmm2, %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 914 | "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 915 | "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 916 | "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 917 | "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 918 | "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 919 | "vmovdqu %%xmm0, (%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 920 | "vmovdqu %%xmm1, 16(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 921 | "vmovdqu %%xmm2, 64(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 922 | "vmovdqu %%xmm3, 80(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 923 | "vmovdqu %%xmm4, 128(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 924 | "vmovdqu %%xmm5, 144(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 925 | "vmovdqu %%xmm6, 192(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 926 | "vmovdqu %%xmm7, 208(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 927 | |
wolfSSL | 15:117db924cf7c | 928 | "vmovdqa (%[x]), %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 929 | "vmovdqa 16(%[x]), %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 930 | "vmovdqa 32(%[x]), %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 931 | "vmovdqa 48(%[x]), %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 932 | "vmovdqa 64(%[x]), %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 933 | "vmovdqa 80(%[x]), %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 934 | "vmovdqa 96(%[x]), %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 935 | "vmovdqa 112(%[x]), %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 936 | |
wolfSSL | 15:117db924cf7c | 937 | "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 938 | "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 939 | "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 940 | "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 941 | "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 942 | "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 943 | "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 944 | "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 945 | "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 946 | "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 947 | "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 948 | "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 949 | "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 950 | "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 951 | "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 952 | "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 953 | "vmovdqu 32(%[in]), %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 954 | "vmovdqu 48(%[in]), %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 955 | "vmovdqu 96(%[in]), %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 956 | "vmovdqu 112(%[in]), %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 957 | "vmovdqu 160(%[in]), %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 958 | "vmovdqu 176(%[in]), %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 959 | "vmovdqu 224(%[in]), %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 960 | "vmovdqu 240(%[in]), %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 961 | "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 962 | "vpxor %%xmm9, %%xmm1, %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 963 | "vpxor %%xmm10, %%xmm2, %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 964 | "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 965 | "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 966 | "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 967 | "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 968 | "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 969 | "vmovdqu %%xmm0, 32(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 970 | "vmovdqu %%xmm1, 48(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 971 | "vmovdqu %%xmm2, 96(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 972 | "vmovdqu %%xmm3, 112(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 973 | "vmovdqu %%xmm4, 160(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 974 | "vmovdqu %%xmm5, 176(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 975 | "vmovdqu %%xmm6, 224(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 976 | "vmovdqu %%xmm7, 240(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 977 | |
wolfSSL | 15:117db924cf7c | 978 | "vmovdqa 192(%[X]), %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 979 | "add $256, %[in]\n\t" |
wolfSSL | 15:117db924cf7c | 980 | "add $256, %[out]\n\t" |
wolfSSL | 15:117db924cf7c | 981 | "vpaddd %[four], %%xmm12, %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 982 | "sub $256, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 983 | "vmovdqa %%xmm12, 192(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 984 | "cmp $256, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 985 | "jl L_done\n\t" |
wolfSSL | 15:117db924cf7c | 986 | |
wolfSSL | 15:117db924cf7c | 987 | "vmovdqa (%[X]), %%xmm0\n\t" |
wolfSSL | 15:117db924cf7c | 988 | "vmovdqa 16(%[X]), %%xmm1\n\t" |
wolfSSL | 15:117db924cf7c | 989 | "vmovdqa 32(%[X]), %%xmm2\n\t" |
wolfSSL | 15:117db924cf7c | 990 | "vmovdqa 48(%[X]), %%xmm3\n\t" |
wolfSSL | 15:117db924cf7c | 991 | "vmovdqa 64(%[X]), %%xmm4\n\t" |
wolfSSL | 15:117db924cf7c | 992 | "vmovdqa 80(%[X]), %%xmm5\n\t" |
wolfSSL | 15:117db924cf7c | 993 | "vmovdqa 96(%[X]), %%xmm6\n\t" |
wolfSSL | 15:117db924cf7c | 994 | "vmovdqa 112(%[X]), %%xmm7\n\t" |
wolfSSL | 15:117db924cf7c | 995 | "vmovdqa 128(%[X]), %%xmm8\n\t" |
wolfSSL | 15:117db924cf7c | 996 | "vmovdqa 144(%[X]), %%xmm9\n\t" |
wolfSSL | 15:117db924cf7c | 997 | "vmovdqa 160(%[X]), %%xmm10\n\t" |
wolfSSL | 15:117db924cf7c | 998 | "vmovdqa 176(%[X]), %%xmm11\n\t" |
wolfSSL | 15:117db924cf7c | 999 | "vmovdqa 192(%[X]), %%xmm12\n\t" |
wolfSSL | 15:117db924cf7c | 1000 | "vmovdqa 208(%[X]), %%xmm13\n\t" |
wolfSSL | 15:117db924cf7c | 1001 | "vmovdqa 224(%[X]), %%xmm14\n\t" |
wolfSSL | 15:117db924cf7c | 1002 | "vmovdqa 240(%[X]), %%xmm15\n\t" |
wolfSSL | 15:117db924cf7c | 1003 | "jmp L_enc128_loop\n\t" |
wolfSSL | 15:117db924cf7c | 1004 | |
wolfSSL | 15:117db924cf7c | 1005 | "\n" |
wolfSSL | 15:117db924cf7c | 1006 | "L_done:\n\t" |
wolfSSL | 15:117db924cf7c | 1007 | |
wolfSSL | 15:117db924cf7c | 1008 | "shl $2, %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1009 | "add 48(%[key]), %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1010 | "movl %[cnt], 48(%[key])\n\t" |
wolfSSL | 15:117db924cf7c | 1011 | "\n" |
wolfSSL | 15:117db924cf7c | 1012 | "L_end128:\n\t" |
wolfSSL | 15:117db924cf7c | 1013 | : [bytes] "+r" (bytes), [cnt] "+r" (cnt), |
wolfSSL | 15:117db924cf7c | 1014 | [in] "+r" (m), [out] "+r" (c) |
wolfSSL | 15:117db924cf7c | 1015 | : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), |
wolfSSL | 15:117db924cf7c | 1016 | [add] "xrm" (add), [four] "xrm" (four), |
wolfSSL | 15:117db924cf7c | 1017 | [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) |
wolfSSL | 15:117db924cf7c | 1018 | : "xmm0", "xmm1", "xmm2", "xmm3", |
wolfSSL | 15:117db924cf7c | 1019 | "xmm4", "xmm5", "xmm6", "xmm7", |
wolfSSL | 15:117db924cf7c | 1020 | "xmm8", "xmm9", "xmm10", "xmm11", |
wolfSSL | 15:117db924cf7c | 1021 | "xmm12", "xmm13", "xmm14", "xmm15", "memory" |
wolfSSL | 15:117db924cf7c | 1022 | ); |
wolfSSL | 15:117db924cf7c | 1023 | |
wolfSSL | 15:117db924cf7c | 1024 | for (; bytes >= CHACHA_CHUNK_BYTES;) { |
wolfSSL | 15:117db924cf7c | 1025 | CHACHA_CHUNK_AVX(); |
wolfSSL | 15:117db924cf7c | 1026 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1027 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1028 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1029 | } |
wolfSSL | 15:117db924cf7c | 1030 | if (bytes > 0) { |
wolfSSL | 15:117db924cf7c | 1031 | CHACHA_PARTIAL_CHUNK_AVX(); |
wolfSSL | 15:117db924cf7c | 1032 | } |
wolfSSL | 15:117db924cf7c | 1033 | } |
wolfSSL | 15:117db924cf7c | 1034 | #endif /* HAVE_INTEL_AVX1 */ |
wolfSSL | 15:117db924cf7c | 1035 | |
wolfSSL | 15:117db924cf7c | 1036 | #ifdef HAVE_INTEL_AVX2 |
/*
 * Inline-asm fragment: two back-to-back quarter-round passes over the state
 * rows held in xmm0..xmm3 (xmm4 is used as scratch).
 *
 * Each pass is the add / xor / rotate sequence with rotate amounts 16, 12,
 * 8 and 7.  The rotates by 16 and 8 are byte shuffles through the [rotl16]
 * and [rotl8] operands; the rotates by 12 and 7 are built from a
 * shift-right / shift-left pair combined with xor.
 *
 * Between the two passes the 32-bit words of xmm1, xmm2 and xmm3 are rotated
 * by one, two and three positions (vpshufd 0x39 / 0x4e / 0x93); the inverse
 * shuffles at the end put the words back in their original order.
 *
 * The enclosing asm statement must provide operands named [rotl8] and
 * [rotl16] and should list xmm0..xmm4 as clobbered.
 *
 * The last line intentionally carries no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define QUARTERROUND_2_AVX2() \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
        "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
        "vpsrld $20, %%xmm1, %%xmm4\n\t" \
        "vpslld $12, %%xmm1, %%xmm1\n\t" \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
        "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
        "vpsrld $25, %%xmm1, %%xmm4\n\t" \
        "vpslld $7, %%xmm1, %%xmm1\n\t" \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
        "# Swap words for next round\n\t" \
        "vpshufd $0x39, %%xmm1, %%xmm1\n\t" \
        "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
        "vpshufd $0x93, %%xmm3, %%xmm3\n\t" \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
        "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
        "vpsrld $20, %%xmm1, %%xmm4\n\t" \
        "vpslld $12, %%xmm1, %%xmm1\n\t" \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
        "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \
        "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \
        "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \
        "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \
        "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \
        "vpsrld $25, %%xmm1, %%xmm4\n\t" \
        "vpslld $7, %%xmm1, %%xmm1\n\t" \
        "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \
        "# Swap words back\n\t" \
        "vpshufd $0x93, %%xmm1, %%xmm1\n\t" \
        "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \
        "vpshufd $0x39, %%xmm3, %%xmm3\n\t"
wolfSSL | 15:117db924cf7c | 1078 | |
/*
 * Inline-asm fragment: turn the 64 bytes of state at [input] into one
 * 64-byte output block, left in xmm0..xmm3.
 *
 * The four 16-byte rows are loaded into xmm8..xmm11 and copied to
 * xmm0..xmm3.  QUARTERROUND_2_AVX2 is then executed ten times, with %al as
 * the down-counter, and finally the saved rows in xmm8..xmm11 are added back
 * word-wise.  The memory at [input] is only read by this fragment.
 *
 * Uses numeric local label 1.  The enclosing asm statement must provide an
 * operand named [input] plus whatever QUARTERROUND_2_AVX2 requires, and
 * should declare the a register, xmm0..xmm4 and xmm8..xmm11 as clobbered.
 *
 * The last line intentionally carries no line continuation, so the macro
 * ends here regardless of what follows it in the file.
 */
#define CHACHA_CRYPT_AVX2() \
        "vmovdqu 0(%[input]), %%xmm8\n\t" \
        "vmovdqu 16(%[input]), %%xmm9\n\t" \
        "vmovdqu 32(%[input]), %%xmm10\n\t" \
        "vmovdqu 48(%[input]), %%xmm11\n\t" \
        "vmovdqu %%xmm8, %%xmm0\n\t" \
        "vmovdqu %%xmm9, %%xmm1\n\t" \
        "vmovdqu %%xmm10, %%xmm2\n\t" \
        "vmovdqu %%xmm11, %%xmm3\n\t" \
        /* Ten iterations of the double pass. */ \
        "movb $10, %%al\n\t" \
        "\n" \
        "1:\n\t" \
        QUARTERROUND_2_AVX2() \
        "decb %%al\n\t" \
        "jnz 1b\n\t" \
        /* Feed-forward: add the untouched input rows to the result. */ \
        "vpaddd %%xmm8, %%xmm0, %%xmm0\n\t" \
        "vpaddd %%xmm9, %%xmm1, %%xmm1\n\t" \
        "vpaddd %%xmm10, %%xmm2, %%xmm2\n\t" \
        "vpaddd %%xmm11, %%xmm3, %%xmm3\n\t"
wolfSSL | 15:117db924cf7c | 1098 | |
/*
 * Statement-like macro: process the final, short piece of a message.
 *
 * Runs CHACHA_CRYPT_AVX2 on ctx->X, stores the resulting 64 bytes at the
 * start of the local buffer x, adds one to the 32-bit word at byte offset 48
 * of ctx->X, and then XORs `bytes` bytes of m with the start of x, writing
 * the result to c.  The XOR handles the first (bytes & 7) bytes one at a
 * time and continues eight bytes at a time until the running offset equals
 * `bytes`.
 *
 * The macro picks up the caller's ctx, x, c, m, bytes, rotl8 and rotl16 by
 * name.  All of them are input operands: the variables themselves are not
 * changed, only the memory behind ctx->X, x and c.
 *
 * NOTE(review): the exit test is an equality compare, so bytes == 0 would
 * not terminate; the visible call site guards with `if (bytes > 0)`.
 * Only 64 bytes of x are filled here, so this presumably also expects
 * bytes < 64 - confirm against callers.
 */
#define CHACHA_PARTIAL_CHUNK_AVX2() \
    __asm__ __volatile__ ( \
        CHACHA_CRYPT_AVX2() \
        /* In this macro [c] is the scratch buffer x, not the output. */ \
        "vmovdqu %%xmm0, 0(%[c])\n\t" \
        "vmovdqu %%xmm1, 16(%[c])\n\t" \
        "vmovdqu %%xmm2, 32(%[c])\n\t" \
        "vmovdqu %%xmm3, 48(%[c])\n\t" \
        "addl $1, 48(%[input])\n\t" \
        /* r8d = length, r9d = length & 7, rdx = running byte offset. */ \
        "movl %[bytes], %%r8d\n\t" \
        "xorq %%rdx, %%rdx\n\t" \
        "movl %%r8d, %%r9d\n\t" \
        "andl $7, %%r9d\n\t" \
        "jz 4f\n\t" \
        "\n" \
        /* One byte per pass for the first (length & 7) bytes. */ \
        "2:\n\t" \
        "movzbl (%[c],%%rdx,1), %%ecx\n\t" \
        "xorb (%[m],%%rdx,1), %%cl\n\t" \
        "movb %%cl, (%[output],%%rdx,1)\n\t" \
        "incl %%edx\n\t" \
        "cmpl %%r9d, %%edx\n\t" \
        "jne 2b\n\t" \
        /* Falling out of jne means "equal", so this always goes to 3. */ \
        "je 3f\n\t" \
        "\n" \
        /* Eight bytes per pass for the remainder. */ \
        "4:\n\t" \
        "movq (%[c],%%rdx,1), %%rcx\n\t" \
        "xorq (%[m],%%rdx,1), %%rcx\n\t" \
        "movq %%rcx, (%[output],%%rdx,1)\n\t" \
        "addl $8, %%edx\n\t" \
        "\n" \
        /* Done once the offset has reached the requested length. */ \
        "3:\n\t" \
        "cmpl %%r8d, %%edx\n\t" \
        "jne 4b\n\t" \
        : \
        : [input] "r" (ctx->X), [c] "r" (x), \
          [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \
          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
        : "eax", "ecx", "edx", "r8", "r9", "memory", \
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
          "xmm8", "xmm9", "xmm10", "xmm11" \
    )
wolfSSL | 15:117db924cf7c | 1139 | |
wolfSSL | 15:117db924cf7c | 1140 | |
/*
 * Statement-like macro: process exactly one 64-byte chunk.
 *
 * Runs CHACHA_CRYPT_AVX2 on ctx->X, XORs the four result registers with the
 * 64 bytes at m, stores the 64 bytes at c, and adds one to the 32-bit word
 * at byte offset 48 of ctx->X.
 *
 * The macro picks up the caller's ctx, c, m, rotl8 and rotl16 by name.  All
 * of them are input operands, so c and m are not advanced here; the caller
 * is responsible for stepping its pointers and remaining length.
 */
#define CHACHA_CHUNK_AVX2() \
    __asm__ __volatile__ ( \
        CHACHA_CRYPT_AVX2() \
        /* Load 64 input bytes (unaligned loads). */ \
        "vmovdqu 0(%[m]), %%xmm4\n\t" \
        "vmovdqu 16(%[m]), %%xmm5\n\t" \
        "vmovdqu 32(%[m]), %%xmm6\n\t" \
        "vmovdqu 48(%[m]), %%xmm7\n\t" \
        /* Combine with the generated block. */ \
        "vpxor %%xmm4, %%xmm0, %%xmm0\n\t" \
        "vpxor %%xmm5, %%xmm1, %%xmm1\n\t" \
        "vpxor %%xmm6, %%xmm2, %%xmm2\n\t" \
        "vpxor %%xmm7, %%xmm3, %%xmm3\n\t" \
        /* Write 64 output bytes (unaligned stores). */ \
        "vmovdqu %%xmm0, 0(%[c])\n\t" \
        "vmovdqu %%xmm1, 16(%[c])\n\t" \
        "vmovdqu %%xmm2, 32(%[c])\n\t" \
        "vmovdqu %%xmm3, 48(%[c])\n\t" \
        /* Step the word at offset 48 of the state for the next chunk. */ \
        "addl $1, 48(%[input])\n\t" \
        : \
        : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \
          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \
        : "rax", "memory", \
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
          "xmm8", "xmm9", "xmm10", "xmm11" \
    )
wolfSSL | 15:117db924cf7c | 1164 | |
wolfSSL | 15:117db924cf7c | 1165 | |
wolfSSL | 15:117db924cf7c | 1166 | static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 15:117db924cf7c | 1167 | word32 bytes) |
wolfSSL | 15:117db924cf7c | 1168 | { |
wolfSSL | 15:117db924cf7c | 1169 | ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 15:117db924cf7c | 1170 | ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 15:117db924cf7c | 1171 | word32 cnt = 0; |
wolfSSL | 15:117db924cf7c | 1172 | static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL, |
wolfSSL | 15:117db924cf7c | 1173 | 0x0000000500000004UL,0x0000000700000006UL }; |
wolfSSL | 15:117db924cf7c | 1174 | static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL, |
wolfSSL | 15:117db924cf7c | 1175 | 0x0000000800000008UL,0x0000000800000008UL }; |
wolfSSL | 15:117db924cf7c | 1176 | static const __m256i rotl8_256 = |
wolfSSL | 15:117db924cf7c | 1177 | { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, |
wolfSSL | 15:117db924cf7c | 1178 | 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; |
wolfSSL | 15:117db924cf7c | 1179 | static const __m256i rotl16_256 = |
wolfSSL | 15:117db924cf7c | 1180 | { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, |
wolfSSL | 15:117db924cf7c | 1181 | 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; |
wolfSSL | 15:117db924cf7c | 1182 | |
wolfSSL | 15:117db924cf7c | 1183 | if (bytes == 0) |
wolfSSL | 15:117db924cf7c | 1184 | return; |
wolfSSL | 15:117db924cf7c | 1185 | |
wolfSSL | 15:117db924cf7c | 1186 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 1187 | "movl %[bytes], %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1188 | "shrl $9, %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1189 | "jz L_end256\n\t" |
wolfSSL | 15:117db924cf7c | 1190 | |
wolfSSL | 15:117db924cf7c | 1191 | "vpbroadcastd (%[key]), %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1192 | "vpbroadcastd 4(%[key]), %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1193 | "vpbroadcastd 8(%[key]), %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1194 | "vpbroadcastd 12(%[key]), %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1195 | "vpbroadcastd 16(%[key]), %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1196 | "vpbroadcastd 20(%[key]), %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1197 | "vpbroadcastd 24(%[key]), %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1198 | "vpbroadcastd 28(%[key]), %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1199 | "vpbroadcastd 32(%[key]), %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1200 | "vpbroadcastd 36(%[key]), %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1201 | "vpbroadcastd 40(%[key]), %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1202 | "vpbroadcastd 44(%[key]), %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1203 | "vpbroadcastd 48(%[key]), %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1204 | "vpbroadcastd 52(%[key]), %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1205 | "vpbroadcastd 56(%[key]), %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1206 | "vpbroadcastd 60(%[key]), %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1207 | |
wolfSSL | 15:117db924cf7c | 1208 | "vpaddd %[add], %%ymm12, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1209 | |
wolfSSL | 15:117db924cf7c | 1210 | "vmovdqa %%ymm0, (%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1211 | "vmovdqa %%ymm1, 32(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1212 | "vmovdqa %%ymm2, 64(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1213 | "vmovdqa %%ymm3, 96(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1214 | "vmovdqa %%ymm4, 128(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1215 | "vmovdqa %%ymm5, 160(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1216 | "vmovdqa %%ymm6, 192(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1217 | "vmovdqa %%ymm7, 224(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1218 | "vmovdqa %%ymm8, 256(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1219 | "vmovdqa %%ymm9, 288(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1220 | "vmovdqa %%ymm10, 320(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1221 | "vmovdqa %%ymm11, 352(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1222 | "vmovdqa %%ymm12, 384(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1223 | "vmovdqa %%ymm13, 416(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1224 | "vmovdqa %%ymm14, 448(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1225 | "vmovdqa %%ymm15, 480(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1226 | "\n" |
wolfSSL | 15:117db924cf7c | 1227 | "L_enc256_loop:\n\t" |
wolfSSL | 15:117db924cf7c | 1228 | "vmovdqa %%ymm11, 96(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1229 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1230 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1231 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1232 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1233 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1234 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1235 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1236 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1237 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1238 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1239 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1240 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1241 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1242 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1243 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1244 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1245 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1246 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1247 | QUARTERROUND_YMM() |
wolfSSL | 15:117db924cf7c | 1248 | QUARTERROUND_YMM_2() |
wolfSSL | 15:117db924cf7c | 1249 | "vmovdqa 96(%[x]), %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1250 | |
wolfSSL | 15:117db924cf7c | 1251 | "vpaddd (%[X]), %%ymm0, %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1252 | "vpaddd 32(%[X]), %%ymm1, %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1253 | "vpaddd 64(%[X]), %%ymm2, %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1254 | "vpaddd 96(%[X]), %%ymm3, %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1255 | "vpaddd 128(%[X]), %%ymm4, %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1256 | "vpaddd 160(%[X]), %%ymm5, %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1257 | "vpaddd 192(%[X]), %%ymm6, %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1258 | "vpaddd 224(%[X]), %%ymm7, %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1259 | "vpaddd 256(%[X]), %%ymm8, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1260 | "vpaddd 288(%[X]), %%ymm9, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1261 | "vpaddd 320(%[X]), %%ymm10, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1262 | "vpaddd 352(%[X]), %%ymm11, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1263 | "vpaddd 384(%[X]), %%ymm12, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1264 | "vpaddd 416(%[X]), %%ymm13, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1265 | "vpaddd 448(%[X]), %%ymm14, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1266 | "vpaddd 480(%[X]), %%ymm15, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1267 | |
wolfSSL | 15:117db924cf7c | 1268 | "vmovdqa %%ymm8, (%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1269 | "vmovdqa %%ymm9, 32(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1270 | "vmovdqa %%ymm10, 64(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1271 | "vmovdqa %%ymm11, 96(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1272 | "vmovdqa %%ymm12, 128(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1273 | "vmovdqa %%ymm13, 160(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1274 | "vmovdqa %%ymm14, 192(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1275 | "vmovdqa %%ymm15, 224(%[x])\n\t" |
wolfSSL | 15:117db924cf7c | 1276 | |
wolfSSL | 15:117db924cf7c | 1277 | "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1278 | "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1279 | "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1280 | "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1281 | "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1282 | "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1283 | "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1284 | "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1285 | "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1286 | "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1287 | "vpunpckhqdq %%ymm9, %%ymm8, %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1288 | "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1289 | "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1290 | "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1291 | "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1292 | "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1293 | "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1294 | "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1295 | "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1296 | "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1297 | "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1298 | "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1299 | "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1300 | "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1301 | |
wolfSSL | 15:117db924cf7c | 1302 | "vmovdqu (%[in]), %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1303 | "vmovdqu 64(%[in]), %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1304 | "vmovdqu 128(%[in]), %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1305 | "vmovdqu 192(%[in]), %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1306 | "vmovdqu 256(%[in]), %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1307 | "vmovdqu 320(%[in]), %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1308 | "vmovdqu 384(%[in]), %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1309 | "vmovdqu 448(%[in]), %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1310 | "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1311 | "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1312 | "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1313 | "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1314 | "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1315 | "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1316 | "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1317 | "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1318 | "vmovdqu %%ymm8, (%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1319 | "vmovdqu %%ymm9, 64(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1320 | "vmovdqu %%ymm10, 128(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1321 | "vmovdqu %%ymm11, 192(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1322 | "vmovdqu %%ymm12, 256(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1323 | "vmovdqu %%ymm13, 320(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1324 | "vmovdqu %%ymm14, 384(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1325 | "vmovdqu %%ymm15, 448(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1326 | |
wolfSSL | 15:117db924cf7c | 1327 | "vmovdqa (%[x]), %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1328 | "vmovdqa 32(%[x]), %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1329 | "vmovdqa 64(%[x]), %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1330 | "vmovdqa 96(%[x]), %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1331 | "vmovdqa 128(%[x]), %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1332 | "vmovdqa 160(%[x]), %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1333 | "vmovdqa 192(%[x]), %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1334 | "vmovdqa 224(%[x]), %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1335 | |
wolfSSL | 15:117db924cf7c | 1336 | "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1337 | "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1338 | "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1339 | "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1340 | "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1341 | "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1342 | "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1343 | "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1344 | "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1345 | "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1346 | "vpunpckhqdq %%ymm9 , %%ymm8, %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1347 | "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1348 | "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1349 | "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1350 | "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1351 | "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1352 | "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1353 | "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1354 | "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1355 | "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1356 | "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1357 | "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1358 | "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1359 | "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1360 | |
wolfSSL | 15:117db924cf7c | 1361 | "vmovdqu 32(%[in]), %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1362 | "vmovdqu 96(%[in]), %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1363 | "vmovdqu 160(%[in]), %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1364 | "vmovdqu 224(%[in]), %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1365 | "vmovdqu 288(%[in]), %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1366 | "vmovdqu 352(%[in]), %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1367 | "vmovdqu 416(%[in]), %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1368 | "vmovdqu 480(%[in]), %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1369 | "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1370 | "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1371 | "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1372 | "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1373 | "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1374 | "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1375 | "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1376 | "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1377 | "vmovdqu %%ymm8, 32(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1378 | "vmovdqu %%ymm9, 96(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1379 | "vmovdqu %%ymm10, 160(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1380 | "vmovdqu %%ymm11, 224(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1381 | "vmovdqu %%ymm12, 288(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1382 | "vmovdqu %%ymm13, 352(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1383 | "vmovdqu %%ymm14, 416(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1384 | "vmovdqu %%ymm15, 480(%[out])\n\t" |
wolfSSL | 15:117db924cf7c | 1385 | |
wolfSSL | 15:117db924cf7c | 1386 | "vmovdqa 384(%[X]), %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1387 | "add $512, %[in]\n\t" |
wolfSSL | 15:117db924cf7c | 1388 | "add $512, %[out]\n\t" |
wolfSSL | 15:117db924cf7c | 1389 | "vpaddd %[eight], %%ymm12, %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1390 | "sub $512, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 1391 | "vmovdqa %%ymm12, 384(%[X])\n\t" |
wolfSSL | 15:117db924cf7c | 1392 | "cmp $512, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 1393 | "jl L_done256\n\t" |
wolfSSL | 15:117db924cf7c | 1394 | |
wolfSSL | 15:117db924cf7c | 1395 | "vmovdqa (%[X]), %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 1396 | "vmovdqa 32(%[X]), %%ymm1\n\t" |
wolfSSL | 15:117db924cf7c | 1397 | "vmovdqa 64(%[X]), %%ymm2\n\t" |
wolfSSL | 15:117db924cf7c | 1398 | "vmovdqa 96(%[X]), %%ymm3\n\t" |
wolfSSL | 15:117db924cf7c | 1399 | "vmovdqa 128(%[X]), %%ymm4\n\t" |
wolfSSL | 15:117db924cf7c | 1400 | "vmovdqa 160(%[X]), %%ymm5\n\t" |
wolfSSL | 15:117db924cf7c | 1401 | "vmovdqa 192(%[X]), %%ymm6\n\t" |
wolfSSL | 15:117db924cf7c | 1402 | "vmovdqa 224(%[X]), %%ymm7\n\t" |
wolfSSL | 15:117db924cf7c | 1403 | "vmovdqa 256(%[X]), %%ymm8\n\t" |
wolfSSL | 15:117db924cf7c | 1404 | "vmovdqa 288(%[X]), %%ymm9\n\t" |
wolfSSL | 15:117db924cf7c | 1405 | "vmovdqa 320(%[X]), %%ymm10\n\t" |
wolfSSL | 15:117db924cf7c | 1406 | "vmovdqa 352(%[X]), %%ymm11\n\t" |
wolfSSL | 15:117db924cf7c | 1407 | "vmovdqa 384(%[X]), %%ymm12\n\t" |
wolfSSL | 15:117db924cf7c | 1408 | "vmovdqa 416(%[X]), %%ymm13\n\t" |
wolfSSL | 15:117db924cf7c | 1409 | "vmovdqa 448(%[X]), %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 1410 | "vmovdqa 480(%[X]), %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 1411 | "jmp L_enc256_loop\n\t" |
wolfSSL | 15:117db924cf7c | 1412 | "\n" |
wolfSSL | 15:117db924cf7c | 1413 | "L_done256:\n\t" |
wolfSSL | 15:117db924cf7c | 1414 | "shl $3, %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1415 | "add 48(%[key]), %[cnt]\n\t" |
wolfSSL | 15:117db924cf7c | 1416 | "movl %[cnt], 48(%[key])\n\t" |
wolfSSL | 15:117db924cf7c | 1417 | "\n" |
wolfSSL | 15:117db924cf7c | 1418 | "L_end256:\n\t" |
wolfSSL | 15:117db924cf7c | 1419 | : [bytes] "+r" (bytes), [cnt] "+r" (cnt), |
wolfSSL | 15:117db924cf7c | 1420 | [in] "+r" (m), [out] "+r" (c) |
wolfSSL | 15:117db924cf7c | 1421 | : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), |
wolfSSL | 15:117db924cf7c | 1422 | [add] "m" (add), [eight] "m" (eight), |
wolfSSL | 15:117db924cf7c | 1423 | [rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256) |
wolfSSL | 15:117db924cf7c | 1424 | : "ymm0", "ymm1", "ymm2", "ymm3", |
wolfSSL | 15:117db924cf7c | 1425 | "ymm4", "ymm5", "ymm6", "ymm7", |
wolfSSL | 15:117db924cf7c | 1426 | "ymm8", "ymm9", "ymm10", "ymm11", |
wolfSSL | 15:117db924cf7c | 1427 | "ymm12", "ymm13", "ymm14", "ymm15", "memory" |
wolfSSL | 15:117db924cf7c | 1428 | ); |
wolfSSL | 15:117db924cf7c | 1429 | |
wolfSSL | 15:117db924cf7c | 1430 | /* AVX code optimised for multiples of 256 bytes. */ |
wolfSSL | 15:117db924cf7c | 1431 | if (bytes == 256) { |
wolfSSL | 15:117db924cf7c | 1432 | chacha_encrypt_avx(ctx, m, c, bytes); |
wolfSSL | 15:117db924cf7c | 1433 | bytes -= 256; |
wolfSSL | 15:117db924cf7c | 1434 | } |
wolfSSL | 15:117db924cf7c | 1435 | |
wolfSSL | 15:117db924cf7c | 1436 | for (; bytes >= CHACHA_CHUNK_BYTES;) { |
wolfSSL | 15:117db924cf7c | 1437 | CHACHA_CHUNK_AVX2(); |
wolfSSL | 15:117db924cf7c | 1438 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1439 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1440 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1441 | } |
wolfSSL | 15:117db924cf7c | 1442 | if (bytes > 0) { |
wolfSSL | 15:117db924cf7c | 1443 | CHACHA_PARTIAL_CHUNK_AVX2(); |
wolfSSL | 15:117db924cf7c | 1444 | } |
wolfSSL | 15:117db924cf7c | 1445 | } |
wolfSSL | 15:117db924cf7c | 1446 | #endif /* HAVE_INTEL_AVX2 */ |
wolfSSL | 15:117db924cf7c | 1447 | #endif /* USE_INTEL_CHACHA_SPEEDUP */ |
wolfSSL | 15:117db924cf7c | 1448 | |
wolfSSL | 15:117db924cf7c | 1449 | /** |
wolfSSL | 15:117db924cf7c | 1450 | * Encrypt a stream of bytes |
wolfSSL | 15:117db924cf7c | 1451 | */ |
wolfSSL | 15:117db924cf7c | 1452 | static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 15:117db924cf7c | 1453 | word32 bytes) |
wolfSSL | 15:117db924cf7c | 1454 | { |
wolfSSL | 15:117db924cf7c | 1455 | byte* output; |
wolfSSL | 15:117db924cf7c | 1456 | word32 temp[CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 15:117db924cf7c | 1457 | word32 i; |
wolfSSL | 15:117db924cf7c | 1458 | |
wolfSSL | 15:117db924cf7c | 1459 | output = (byte*)temp; |
wolfSSL | 15:117db924cf7c | 1460 | |
wolfSSL | 15:117db924cf7c | 1461 | for (; bytes > 0;) { |
wolfSSL | 15:117db924cf7c | 1462 | wc_Chacha_wordtobyte(temp, ctx->X); |
wolfSSL | 15:117db924cf7c | 1463 | ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); |
wolfSSL | 15:117db924cf7c | 1464 | if (bytes <= CHACHA_CHUNK_BYTES) { |
wolfSSL | 15:117db924cf7c | 1465 | for (i = 0; i < bytes; ++i) { |
wolfSSL | 15:117db924cf7c | 1466 | c[i] = m[i] ^ output[i]; |
wolfSSL | 15:117db924cf7c | 1467 | } |
wolfSSL | 15:117db924cf7c | 1468 | return; |
wolfSSL | 15:117db924cf7c | 1469 | } |
wolfSSL | 15:117db924cf7c | 1470 | for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { |
wolfSSL | 15:117db924cf7c | 1471 | c[i] = m[i] ^ output[i]; |
wolfSSL | 15:117db924cf7c | 1472 | } |
wolfSSL | 15:117db924cf7c | 1473 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1474 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1475 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 15:117db924cf7c | 1476 | } |
wolfSSL | 15:117db924cf7c | 1477 | } |
wolfSSL | 15:117db924cf7c | 1478 | |
wolfSSL | 15:117db924cf7c | 1479 | /** |
wolfSSL | 15:117db924cf7c | 1480 | * API to encrypt/decrypt a message of any size. |
wolfSSL | 15:117db924cf7c | 1481 | */ |
wolfSSL | 15:117db924cf7c | 1482 | int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, |
wolfSSL | 15:117db924cf7c | 1483 | word32 msglen) |
wolfSSL | 15:117db924cf7c | 1484 | { |
wolfSSL | 15:117db924cf7c | 1485 | if (ctx == NULL) |
wolfSSL | 15:117db924cf7c | 1486 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1487 | |
wolfSSL | 15:117db924cf7c | 1488 | #ifdef USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1489 | if (!cpuidFlagsSet) { |
wolfSSL | 15:117db924cf7c | 1490 | cpuidFlags = cpuid_get_flags(); |
wolfSSL | 15:117db924cf7c | 1491 | cpuidFlagsSet = 1; |
wolfSSL | 15:117db924cf7c | 1492 | } |
wolfSSL | 15:117db924cf7c | 1493 | |
wolfSSL | 15:117db924cf7c | 1494 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 1495 | if (IS_INTEL_AVX2(cpuidFlags)) { |
wolfSSL | 15:117db924cf7c | 1496 | chacha_encrypt_avx2(ctx, input, output, msglen); |
wolfSSL | 15:117db924cf7c | 1497 | return 0; |
wolfSSL | 15:117db924cf7c | 1498 | } |
wolfSSL | 15:117db924cf7c | 1499 | #endif |
wolfSSL | 15:117db924cf7c | 1500 | if (IS_INTEL_AVX1(cpuidFlags)) { |
wolfSSL | 15:117db924cf7c | 1501 | chacha_encrypt_avx(ctx, input, output, msglen); |
wolfSSL | 15:117db924cf7c | 1502 | return 0; |
wolfSSL | 15:117db924cf7c | 1503 | } |
wolfSSL | 15:117db924cf7c | 1504 | else { |
wolfSSL | 15:117db924cf7c | 1505 | chacha_encrypt_x64(ctx, input, output, msglen); |
wolfSSL | 15:117db924cf7c | 1506 | return 0; |
wolfSSL | 15:117db924cf7c | 1507 | } |
wolfSSL | 15:117db924cf7c | 1508 | #endif |
wolfSSL | 15:117db924cf7c | 1509 | wc_Chacha_encrypt_bytes(ctx, input, output, msglen); |
wolfSSL | 15:117db924cf7c | 1510 | |
wolfSSL | 15:117db924cf7c | 1511 | return 0; |
wolfSSL | 15:117db924cf7c | 1512 | } |
wolfSSL | 15:117db924cf7c | 1513 | |
wolfSSL | 15:117db924cf7c | 1514 | #endif /* HAVE_CHACHA*/ |
wolfSSL | 15:117db924cf7c | 1515 | |
wolfSSL | 15:117db924cf7c | 1516 |