wolfSSL SSL/TLS library, supporting protocol versions up to TLS 1.3
Dependents: CyaSSL-Twitter-OAuth4Tw Example-client-tls-cert TwitterReader TweetTest ... more
wolfcrypt/src/chacha.c@13:f67a6c6013ca, 2017-08-22 (annotated)
- Committer:
- wolfSSL
- Date:
- Tue Aug 22 10:48:22 2017 +0000
- Revision:
- 13:f67a6c6013ca
wolfSSL3.12.0 with TLS1.3
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
wolfSSL | 13:f67a6c6013ca | 1 | /* chacha.c |
wolfSSL | 13:f67a6c6013ca | 2 | * |
wolfSSL | 13:f67a6c6013ca | 3 | * Copyright (C) 2006-2016 wolfSSL Inc. |
wolfSSL | 13:f67a6c6013ca | 4 | * |
wolfSSL | 13:f67a6c6013ca | 5 | * This file is part of wolfSSL. |
wolfSSL | 13:f67a6c6013ca | 6 | * |
wolfSSL | 13:f67a6c6013ca | 7 | * wolfSSL is free software; you can redistribute it and/or modify |
wolfSSL | 13:f67a6c6013ca | 8 | * it under the terms of the GNU General Public License as published by |
wolfSSL | 13:f67a6c6013ca | 9 | * the Free Software Foundation; either version 2 of the License, or |
wolfSSL | 13:f67a6c6013ca | 10 | * (at your option) any later version. |
wolfSSL | 13:f67a6c6013ca | 11 | * |
wolfSSL | 13:f67a6c6013ca | 12 | * wolfSSL is distributed in the hope that it will be useful, |
wolfSSL | 13:f67a6c6013ca | 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
wolfSSL | 13:f67a6c6013ca | 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
wolfSSL | 13:f67a6c6013ca | 15 | * GNU General Public License for more details. |
wolfSSL | 13:f67a6c6013ca | 16 | * |
wolfSSL | 13:f67a6c6013ca | 17 | * You should have received a copy of the GNU General Public License |
wolfSSL | 13:f67a6c6013ca | 18 | * along with this program; if not, write to the Free Software |
wolfSSL | 13:f67a6c6013ca | 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA |
wolfSSL | 13:f67a6c6013ca | 20 | * |
wolfSSL | 13:f67a6c6013ca | 21 | * based from |
wolfSSL | 13:f67a6c6013ca | 22 | * chacha-ref.c version 20080118 |
wolfSSL | 13:f67a6c6013ca | 23 | * D. J. Bernstein |
wolfSSL | 13:f67a6c6013ca | 24 | * Public domain. |
wolfSSL | 13:f67a6c6013ca | 25 | */ |
wolfSSL | 13:f67a6c6013ca | 26 | |
wolfSSL | 13:f67a6c6013ca | 27 | |
wolfSSL | 13:f67a6c6013ca | 28 | |
wolfSSL | 13:f67a6c6013ca | 29 | #ifdef HAVE_CONFIG_H |
wolfSSL | 13:f67a6c6013ca | 30 | #include <config.h> |
wolfSSL | 13:f67a6c6013ca | 31 | #endif |
wolfSSL | 13:f67a6c6013ca | 32 | |
wolfSSL | 13:f67a6c6013ca | 33 | #include <wolfssl/wolfcrypt/settings.h> |
wolfSSL | 13:f67a6c6013ca | 34 | |
wolfSSL | 13:f67a6c6013ca | 35 | #ifdef HAVE_CHACHA |
wolfSSL | 13:f67a6c6013ca | 36 | |
wolfSSL | 13:f67a6c6013ca | 37 | #include <wolfssl/wolfcrypt/chacha.h> |
wolfSSL | 13:f67a6c6013ca | 38 | #include <wolfssl/wolfcrypt/error-crypt.h> |
wolfSSL | 13:f67a6c6013ca | 39 | #include <wolfssl/wolfcrypt/logging.h> |
wolfSSL | 13:f67a6c6013ca | 40 | #include <wolfssl/wolfcrypt/cpuid.h> |
wolfSSL | 13:f67a6c6013ca | 41 | #ifdef NO_INLINE |
wolfSSL | 13:f67a6c6013ca | 42 | #include <wolfssl/wolfcrypt/misc.h> |
wolfSSL | 13:f67a6c6013ca | 43 | #else |
wolfSSL | 13:f67a6c6013ca | 44 | #define WOLFSSL_MISC_INCLUDED |
wolfSSL | 13:f67a6c6013ca | 45 | #include <wolfcrypt/src/misc.c> |
wolfSSL | 13:f67a6c6013ca | 46 | #endif |
wolfSSL | 13:f67a6c6013ca | 47 | |
wolfSSL | 13:f67a6c6013ca | 48 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 13:f67a6c6013ca | 49 | #include <stdio.h> |
wolfSSL | 13:f67a6c6013ca | 50 | #endif |
wolfSSL | 13:f67a6c6013ca | 51 | |
wolfSSL | 13:f67a6c6013ca | 52 | #ifdef WOLFSSL_X86_64_BUILD |
wolfSSL | 13:f67a6c6013ca | 53 | #if defined(USE_INTEL_SPEEDUP) && !defined(NO_CHACHA_ASM) |
wolfSSL | 13:f67a6c6013ca | 54 | #define USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 13:f67a6c6013ca | 55 | #endif |
wolfSSL | 13:f67a6c6013ca | 56 | #endif |
wolfSSL | 13:f67a6c6013ca | 57 | |
wolfSSL | 13:f67a6c6013ca | 58 | #ifdef USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 13:f67a6c6013ca | 59 | #include <emmintrin.h> |
wolfSSL | 13:f67a6c6013ca | 60 | #include <immintrin.h> |
wolfSSL | 13:f67a6c6013ca | 61 | #define HAVE_INTEL_AVX1 |
wolfSSL | 13:f67a6c6013ca | 62 | #define HAVE_INTEL_AVX2 |
wolfSSL | 13:f67a6c6013ca | 63 | #endif |
wolfSSL | 13:f67a6c6013ca | 64 | /* LITTLE32(x): ByteReverseWord32(x) on BIG_ENDIAN_ORDER builds, x unchanged otherwise */ |
wolfSSL | 13:f67a6c6013ca | 65 | #ifdef BIG_ENDIAN_ORDER |
wolfSSL | 13:f67a6c6013ca | 66 | #define LITTLE32(x) ByteReverseWord32(x) |
wolfSSL | 13:f67a6c6013ca | 67 | #else |
wolfSSL | 13:f67a6c6013ca | 68 | #define LITTLE32(x) (x) |
wolfSSL | 13:f67a6c6013ca | 69 | #endif |
wolfSSL | 13:f67a6c6013ca | 70 | |
wolfSSL | 13:f67a6c6013ca | 71 | /* Number of rounds; wc_Chacha_wordtobyte() consumes them two per loop pass */ |
wolfSSL | 13:f67a6c6013ca | 72 | #define ROUNDS 20 |
wolfSSL | 13:f67a6c6013ca | 73 | |
wolfSSL | 13:f67a6c6013ca | 74 | #define U32C(v) (v##U) /* paste an unsigned suffix onto the literal v */ |
wolfSSL | 13:f67a6c6013ca | 75 | #define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) /* keep only the low 32 bits of v */ |
wolfSSL | 13:f67a6c6013ca | 76 | #define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0]) /* load a little-endian word; p is dereferenced as word32*, so it must be suitably aligned */ |
wolfSSL | 13:f67a6c6013ca | 77 | |
wolfSSL | 13:f67a6c6013ca | 78 | #define ROTATE(v,c) rotlFixed(v, c) /* thin wrapper over rotlFixed() */ |
wolfSSL | 13:f67a6c6013ca | 79 | #define XOR(v,w) ((v) ^ (w)) |
wolfSSL | 13:f67a6c6013ca | 80 | #define PLUS(v,w) (U32V((v) + (w))) /* sum truncated to 32 bits */ |
wolfSSL | 13:f67a6c6013ca | 81 | #define PLUSONE(v) (PLUS((v),1)) |
wolfSSL | 13:f67a6c6013ca | 82 | /* Quarter-round on words a,b,c,d of an array that must be named x at the point of use; expands to complete statements, so call sites add no ';' */ |
wolfSSL | 13:f67a6c6013ca | 83 | #define QUARTERROUND(a,b,c,d) \ |
wolfSSL | 13:f67a6c6013ca | 84 | x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ |
wolfSSL | 13:f67a6c6013ca | 85 | x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ |
wolfSSL | 13:f67a6c6013ca | 86 | x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ |
wolfSSL | 13:f67a6c6013ca | 87 | x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); |
wolfSSL | 13:f67a6c6013ca | 88 | /* Inline-asm text for one ChaCha quarter-round applied to four (a,b,c,d) register groups: a += b; d ^= a; d rotl 16; c += d; b ^= c; b rotl 12; then the same with rotl 8 and rotl 7. */ |
wolfSSL | 13:f67a6c6013ca | 89 | /* Rotates by 16 and 8 are vpshufb with the %[rotl16] / %[rotl8] asm operands; rotates by 12 and 7 are a vpsrld/vpslld pair merged with vpxor through scratch register t1. */ |
wolfSSL | 13:f67a6c6013ca | 90 | /* c3 is loaded from o1(%[x]) before its use in each half and stored back afterwards, so its value sits in memory while t1 is in use (the instantiations in this file pass the same register for t1 and c3). */ |
wolfSSL | 13:f67a6c6013ca | 91 | #define QUARTERROUND_INTEL_ASM(a0,b0,c0,d0, \ |
wolfSSL | 13:f67a6c6013ca | 92 | a1,b1,c1,d1, \ |
wolfSSL | 13:f67a6c6013ca | 93 | a2,b2,c2,d2, \ |
wolfSSL | 13:f67a6c6013ca | 94 | a3,b3,c3,d3, \ |
wolfSSL | 13:f67a6c6013ca | 95 | t1,o1) \ |
wolfSSL | 13:f67a6c6013ca | 96 | "vpaddd "#b0", "#a0", "#a0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 97 | "vpxor "#a0", "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 98 | "vmovdqa "#o1"(%[x]), "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 99 | "vpshufb %[rotl16], "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 100 | "vpaddd "#d0", "#c0", "#c0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 101 | "vpxor "#c0", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 102 | "vpaddd "#b1", "#a1", "#a1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 103 | "vpxor "#a1", "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 104 | "vpshufb %[rotl16], "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 105 | "vpaddd "#d1", "#c1", "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 106 | "vpxor "#c1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 107 | "vpaddd "#b2", "#a2", "#a2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 108 | "vpxor "#a2", "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 109 | "vpshufb %[rotl16], "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 110 | "vpaddd "#d2", "#c2", "#c2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 111 | "vpxor "#c2", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 112 | "vpaddd "#b3", "#a3", "#a3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 113 | "vpxor "#a3", "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 114 | "vpshufb %[rotl16], "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 115 | "vpaddd "#d3", "#c3", "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 116 | "vpxor "#c3", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 117 | "vmovdqa "#c3", "#o1"(%[x])\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 118 | "vpsrld $20, "#b0", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 119 | "vpslld $12, "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 120 | "vpxor "#t1", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 121 | "vpsrld $20, "#b1", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 122 | "vpslld $12, "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 123 | "vpxor "#t1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 124 | "vpsrld $20, "#b2", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 125 | "vpslld $12, "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 126 | "vpxor "#t1", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 127 | "vpsrld $20, "#b3", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 128 | "vpslld $12, "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 129 | "vpxor "#t1", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 130 | "vpaddd "#b0", "#a0", "#a0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 131 | "vpxor "#a0", "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 132 | "vmovdqa "#o1"(%[x]), "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 133 | "vpshufb %[rotl8], "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 134 | "vpaddd "#d0", "#c0", "#c0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 135 | "vpxor "#c0", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 136 | "vpaddd "#b1", "#a1", "#a1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 137 | "vpxor "#a1", "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 138 | "vpshufb %[rotl8], "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 139 | "vpaddd "#d1", "#c1", "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 140 | "vpxor "#c1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 141 | "vpaddd "#b2", "#a2", "#a2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 142 | "vpxor "#a2", "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 143 | "vpshufb %[rotl8], "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 144 | "vpaddd "#d2", "#c2", "#c2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 145 | "vpxor "#c2", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 146 | "vpaddd "#b3", "#a3", "#a3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 147 | "vpxor "#a3", "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 148 | "vpshufb %[rotl8], "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 149 | "vpaddd "#d3", "#c3", "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 150 | "vpxor "#c3", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 151 | "vmovdqa "#c3", "#o1"(%[x])\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 152 | "vpsrld $25, "#b0", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 153 | "vpslld $7, "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 154 | "vpxor "#t1", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 155 | "vpsrld $25, "#b1", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 156 | "vpslld $7, "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 157 | "vpxor "#t1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 158 | "vpsrld $25, "#b2", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 159 | "vpslld $7, "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 160 | "vpxor "#t1", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 161 | "vpsrld $25, "#b3", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 162 | "vpslld $7, "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 163 | "vpxor "#t1", "#b3", "#b3"\n\t" |
wolfSSL | 13:f67a6c6013ca | 164 | /* Same instruction sequence as QUARTERROUND_INTEL_ASM, except that c1 (not c3) is the register loaded from and stored back to o1(%[x]) around its use in each half. */ |
wolfSSL | 13:f67a6c6013ca | 165 | #define QUARTERROUND_INTEL_ASM_2(a0,b0,c0,d0, \ |
wolfSSL | 13:f67a6c6013ca | 166 | a1,b1,c1,d1, \ |
wolfSSL | 13:f67a6c6013ca | 167 | a2,b2,c2,d2, \ |
wolfSSL | 13:f67a6c6013ca | 168 | a3,b3,c3,d3, \ |
wolfSSL | 13:f67a6c6013ca | 169 | t1,o1) \ |
wolfSSL | 13:f67a6c6013ca | 170 | "vpaddd "#b0", "#a0", "#a0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 171 | "vpxor "#a0", "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 172 | "vmovdqa "#o1"(%[x]), "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 173 | "vpshufb %[rotl16], "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 174 | "vpaddd "#d0", "#c0", "#c0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 175 | "vpxor "#c0", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 176 | "vpaddd "#b1", "#a1", "#a1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 177 | "vpxor "#a1", "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 178 | "vpshufb %[rotl16], "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 179 | "vpaddd "#d1", "#c1", "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 180 | "vpxor "#c1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 181 | "vpaddd "#b2", "#a2", "#a2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 182 | "vpxor "#a2", "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 183 | "vpshufb %[rotl16], "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 184 | "vpaddd "#d2", "#c2", "#c2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 185 | "vpxor "#c2", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 186 | "vpaddd "#b3", "#a3", "#a3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 187 | "vpxor "#a3", "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 188 | "vpshufb %[rotl16], "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 189 | "vpaddd "#d3", "#c3", "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 190 | "vpxor "#c3", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 191 | "vmovdqa "#c1", "#o1"(%[x])\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 192 | "vpsrld $20, "#b0", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 193 | "vpslld $12, "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 194 | "vpxor "#t1", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 195 | "vpsrld $20, "#b1", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 196 | "vpslld $12, "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 197 | "vpxor "#t1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 198 | "vpsrld $20, "#b2", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 199 | "vpslld $12, "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 200 | "vpxor "#t1", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 201 | "vpsrld $20, "#b3", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 202 | "vpslld $12, "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 203 | "vpxor "#t1", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 204 | "vpaddd "#b0", "#a0", "#a0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 205 | "vpxor "#a0", "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 206 | "vmovdqa "#o1"(%[x]), "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 207 | "vpshufb %[rotl8], "#d0", "#d0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 208 | "vpaddd "#d0", "#c0", "#c0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 209 | "vpxor "#c0", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 210 | "vpaddd "#b1", "#a1", "#a1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 211 | "vpxor "#a1", "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 212 | "vpshufb %[rotl8], "#d1", "#d1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 213 | "vpaddd "#d1", "#c1", "#c1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 214 | "vpxor "#c1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 215 | "vpaddd "#b2", "#a2", "#a2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 216 | "vpxor "#a2", "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 217 | "vpshufb %[rotl8], "#d2", "#d2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 218 | "vpaddd "#d2", "#c2", "#c2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 219 | "vpxor "#c2", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 220 | "vpaddd "#b3", "#a3", "#a3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 221 | "vpxor "#a3", "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 222 | "vpshufb %[rotl8], "#d3", "#d3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 223 | "vpaddd "#d3", "#c3", "#c3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 224 | "vpxor "#c3", "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 225 | "vmovdqa "#c1", "#o1"(%[x])\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 226 | "vpsrld $25, "#b0", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 227 | "vpslld $7, "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 228 | "vpxor "#t1", "#b0", "#b0"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 229 | "vpsrld $25, "#b1", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 230 | "vpslld $7, "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 231 | "vpxor "#t1", "#b1", "#b1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 232 | "vpsrld $25, "#b2", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 233 | "vpslld $7, "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 234 | "vpxor "#t1", "#b2", "#b2"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 235 | "vpsrld $25, "#b3", "#t1"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 236 | "vpslld $7, "#b3", "#b3"\n\t" \ |
wolfSSL | 13:f67a6c6013ca | 237 | "vpxor "#t1", "#b3", "#b3"\n\t" |
wolfSSL | 13:f67a6c6013ca | 238 | /* Register bindings for the asm quarter-round macros. Plain form uses groups (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15); the _2 form uses (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14). */ |
wolfSSL | 13:f67a6c6013ca | 239 | /* Register 11 is passed both as scratch t1 and as a c register, so the macros keep that c value in memory at o1(%[x]) while t1 is in use. xmm forms use offset 48 for that slot. */ |
wolfSSL | 13:f67a6c6013ca | 240 | #define QUARTERROUND_XMM() \ |
wolfSSL | 13:f67a6c6013ca | 241 | QUARTERROUND_INTEL_ASM(%%xmm0,%%xmm4,%%xmm8,%%xmm12, \ |
wolfSSL | 13:f67a6c6013ca | 242 | %%xmm1,%%xmm5,%%xmm9,%%xmm13, \ |
wolfSSL | 13:f67a6c6013ca | 243 | %%xmm2,%%xmm6,%%xmm10,%%xmm14, \ |
wolfSSL | 13:f67a6c6013ca | 244 | %%xmm3,%%xmm7,%%xmm11,%%xmm15, \ |
wolfSSL | 13:f67a6c6013ca | 245 | %%xmm11,48) |
wolfSSL | 13:f67a6c6013ca | 246 | #define QUARTERROUND_XMM_2() \ |
wolfSSL | 13:f67a6c6013ca | 247 | QUARTERROUND_INTEL_ASM_2(%%xmm0,%%xmm5,%%xmm10,%%xmm15, \ |
wolfSSL | 13:f67a6c6013ca | 248 | %%xmm1,%%xmm6,%%xmm11,%%xmm12, \ |
wolfSSL | 13:f67a6c6013ca | 249 | %%xmm2,%%xmm7,%%xmm8,%%xmm13, \ |
wolfSSL | 13:f67a6c6013ca | 250 | %%xmm3,%%xmm4,%%xmm9,%%xmm14, \ |
wolfSSL | 13:f67a6c6013ca | 251 | %%xmm11,48) |
wolfSSL | 13:f67a6c6013ca | 252 | /* ymm forms: the same bindings on ymm registers, with the spill slot at offset 96 instead of 48 */ |
wolfSSL | 13:f67a6c6013ca | 253 | #define QUARTERROUND_YMM() \ |
wolfSSL | 13:f67a6c6013ca | 254 | QUARTERROUND_INTEL_ASM(%%ymm0,%%ymm4,%%ymm8,%%ymm12, \ |
wolfSSL | 13:f67a6c6013ca | 255 | %%ymm1,%%ymm5,%%ymm9,%%ymm13, \ |
wolfSSL | 13:f67a6c6013ca | 256 | %%ymm2,%%ymm6,%%ymm10,%%ymm14, \ |
wolfSSL | 13:f67a6c6013ca | 257 | %%ymm3,%%ymm7,%%ymm11,%%ymm15, \ |
wolfSSL | 13:f67a6c6013ca | 258 | %%ymm11,96) |
wolfSSL | 13:f67a6c6013ca | 259 | #define QUARTERROUND_YMM_2() \ |
wolfSSL | 13:f67a6c6013ca | 260 | QUARTERROUND_INTEL_ASM_2(%%ymm0,%%ymm5,%%ymm10,%%ymm15, \ |
wolfSSL | 13:f67a6c6013ca | 261 | %%ymm1,%%ymm6,%%ymm11,%%ymm12, \ |
wolfSSL | 13:f67a6c6013ca | 262 | %%ymm2,%%ymm7,%%ymm8,%%ymm13, \ |
wolfSSL | 13:f67a6c6013ca | 263 | %%ymm3,%%ymm4,%%ymm9,%%ymm14, \ |
wolfSSL | 13:f67a6c6013ca | 264 | %%ymm11,96) |
wolfSSL | 13:f67a6c6013ca | 265 | |
wolfSSL | 13:f67a6c6013ca | 266 | /** |
wolfSSL | 13:f67a6c6013ca | 267 | * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version |
wolfSSL | 13:f67a6c6013ca | 268 | * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. |
wolfSSL | 13:f67a6c6013ca | 269 | */ |
wolfSSL | 13:f67a6c6013ca | 270 | int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) |
wolfSSL | 13:f67a6c6013ca | 271 | { |
wolfSSL | 13:f67a6c6013ca | 272 | word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ |
wolfSSL | 13:f67a6c6013ca | 273 | |
wolfSSL | 13:f67a6c6013ca | 274 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 13:f67a6c6013ca | 275 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 276 | printf("NONCE : "); |
wolfSSL | 13:f67a6c6013ca | 277 | for (i = 0; i < CHACHA_IV_BYTES; i++) { |
wolfSSL | 13:f67a6c6013ca | 278 | printf("%02x", inIv[i]); |
wolfSSL | 13:f67a6c6013ca | 279 | } |
wolfSSL | 13:f67a6c6013ca | 280 | printf("\n\n"); |
wolfSSL | 13:f67a6c6013ca | 281 | #endif |
wolfSSL | 13:f67a6c6013ca | 282 | |
wolfSSL | 13:f67a6c6013ca | 283 | if (ctx == NULL) |
wolfSSL | 13:f67a6c6013ca | 284 | return BAD_FUNC_ARG; |
wolfSSL | 13:f67a6c6013ca | 285 | |
wolfSSL | 13:f67a6c6013ca | 286 | XMEMCPY(temp, inIv, CHACHA_IV_BYTES); |
wolfSSL | 13:f67a6c6013ca | 287 | |
wolfSSL | 13:f67a6c6013ca | 288 | ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ |
wolfSSL | 13:f67a6c6013ca | 289 | ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ |
wolfSSL | 13:f67a6c6013ca | 290 | ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ |
wolfSSL | 13:f67a6c6013ca | 291 | ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ |
wolfSSL | 13:f67a6c6013ca | 292 | |
wolfSSL | 13:f67a6c6013ca | 293 | return 0; |
wolfSSL | 13:f67a6c6013ca | 294 | } |
wolfSSL | 13:f67a6c6013ca | 295 | |
wolfSSL | 13:f67a6c6013ca | 296 | /* ASCII "expand 32-byte k" packed as four little-endian 32-bit words; selected by wc_Chacha_SetKey() for 32-byte keys */ |
wolfSSL | 13:f67a6c6013ca | 297 | static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; |
wolfSSL | 13:f67a6c6013ca | 298 | /* ASCII "expand 16-byte k" packed as four little-endian 32-bit words; selected by wc_Chacha_SetKey() for 16-byte keys */ |
wolfSSL | 13:f67a6c6013ca | 299 | static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; |
wolfSSL | 13:f67a6c6013ca | 300 | |
wolfSSL | 13:f67a6c6013ca | 301 | /** |
wolfSSL | 13:f67a6c6013ca | 302 | * Key setup. 8 word iv (nonce) |
wolfSSL | 13:f67a6c6013ca | 303 | */ |
wolfSSL | 13:f67a6c6013ca | 304 | int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) |
wolfSSL | 13:f67a6c6013ca | 305 | { |
wolfSSL | 13:f67a6c6013ca | 306 | const word32* constants; |
wolfSSL | 13:f67a6c6013ca | 307 | const byte* k; |
wolfSSL | 13:f67a6c6013ca | 308 | |
wolfSSL | 13:f67a6c6013ca | 309 | #ifdef XSTREAM_ALIGN |
wolfSSL | 13:f67a6c6013ca | 310 | word32 alignKey[8]; |
wolfSSL | 13:f67a6c6013ca | 311 | #endif |
wolfSSL | 13:f67a6c6013ca | 312 | |
wolfSSL | 13:f67a6c6013ca | 313 | if (ctx == NULL) |
wolfSSL | 13:f67a6c6013ca | 314 | return BAD_FUNC_ARG; |
wolfSSL | 13:f67a6c6013ca | 315 | |
wolfSSL | 13:f67a6c6013ca | 316 | if (keySz != 16 && keySz != 32) |
wolfSSL | 13:f67a6c6013ca | 317 | return BAD_FUNC_ARG; |
wolfSSL | 13:f67a6c6013ca | 318 | |
wolfSSL | 13:f67a6c6013ca | 319 | #ifdef XSTREAM_ALIGN |
wolfSSL | 13:f67a6c6013ca | 320 | if ((wolfssl_word)key % 4) { |
wolfSSL | 13:f67a6c6013ca | 321 | WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); |
wolfSSL | 13:f67a6c6013ca | 322 | XMEMCPY(alignKey, key, keySz); |
wolfSSL | 13:f67a6c6013ca | 323 | k = (byte*)alignKey; |
wolfSSL | 13:f67a6c6013ca | 324 | } |
wolfSSL | 13:f67a6c6013ca | 325 | else { |
wolfSSL | 13:f67a6c6013ca | 326 | k = key; |
wolfSSL | 13:f67a6c6013ca | 327 | } |
wolfSSL | 13:f67a6c6013ca | 328 | #else |
wolfSSL | 13:f67a6c6013ca | 329 | k = key; |
wolfSSL | 13:f67a6c6013ca | 330 | #endif /* XSTREAM_ALIGN */ |
wolfSSL | 13:f67a6c6013ca | 331 | |
wolfSSL | 13:f67a6c6013ca | 332 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 13:f67a6c6013ca | 333 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 334 | printf("ChaCha key used :\n"); |
wolfSSL | 13:f67a6c6013ca | 335 | for (i = 0; i < keySz; i++) { |
wolfSSL | 13:f67a6c6013ca | 336 | printf("%02x", key[i]); |
wolfSSL | 13:f67a6c6013ca | 337 | if ((i + 1) % 8 == 0) |
wolfSSL | 13:f67a6c6013ca | 338 | printf("\n"); |
wolfSSL | 13:f67a6c6013ca | 339 | } |
wolfSSL | 13:f67a6c6013ca | 340 | printf("\n\n"); |
wolfSSL | 13:f67a6c6013ca | 341 | #endif |
wolfSSL | 13:f67a6c6013ca | 342 | |
wolfSSL | 13:f67a6c6013ca | 343 | ctx->X[4] = U8TO32_LITTLE(k + 0); |
wolfSSL | 13:f67a6c6013ca | 344 | ctx->X[5] = U8TO32_LITTLE(k + 4); |
wolfSSL | 13:f67a6c6013ca | 345 | ctx->X[6] = U8TO32_LITTLE(k + 8); |
wolfSSL | 13:f67a6c6013ca | 346 | ctx->X[7] = U8TO32_LITTLE(k + 12); |
wolfSSL | 13:f67a6c6013ca | 347 | if (keySz == 32) { |
wolfSSL | 13:f67a6c6013ca | 348 | k += 16; |
wolfSSL | 13:f67a6c6013ca | 349 | constants = sigma; |
wolfSSL | 13:f67a6c6013ca | 350 | } |
wolfSSL | 13:f67a6c6013ca | 351 | else { |
wolfSSL | 13:f67a6c6013ca | 352 | constants = tau; |
wolfSSL | 13:f67a6c6013ca | 353 | } |
wolfSSL | 13:f67a6c6013ca | 354 | ctx->X[ 8] = U8TO32_LITTLE(k + 0); |
wolfSSL | 13:f67a6c6013ca | 355 | ctx->X[ 9] = U8TO32_LITTLE(k + 4); |
wolfSSL | 13:f67a6c6013ca | 356 | ctx->X[10] = U8TO32_LITTLE(k + 8); |
wolfSSL | 13:f67a6c6013ca | 357 | ctx->X[11] = U8TO32_LITTLE(k + 12); |
wolfSSL | 13:f67a6c6013ca | 358 | ctx->X[ 0] = constants[0]; |
wolfSSL | 13:f67a6c6013ca | 359 | ctx->X[ 1] = constants[1]; |
wolfSSL | 13:f67a6c6013ca | 360 | ctx->X[ 2] = constants[2]; |
wolfSSL | 13:f67a6c6013ca | 361 | ctx->X[ 3] = constants[3]; |
wolfSSL | 13:f67a6c6013ca | 362 | |
wolfSSL | 13:f67a6c6013ca | 363 | return 0; |
wolfSSL | 13:f67a6c6013ca | 364 | } |
wolfSSL | 13:f67a6c6013ca | 365 | |
wolfSSL | 13:f67a6c6013ca | 366 | /** |
wolfSSL | 13:f67a6c6013ca | 367 | * Converts word into bytes with rotations having been done. |
wolfSSL | 13:f67a6c6013ca | 368 | */ |
wolfSSL | 13:f67a6c6013ca | 369 | static INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS], |
wolfSSL | 13:f67a6c6013ca | 370 | const word32 input[CHACHA_CHUNK_WORDS]) |
wolfSSL | 13:f67a6c6013ca | 371 | { |
wolfSSL | 13:f67a6c6013ca | 372 | word32 x[CHACHA_CHUNK_WORDS]; |
wolfSSL | 13:f67a6c6013ca | 373 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 374 | |
wolfSSL | 13:f67a6c6013ca | 375 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 13:f67a6c6013ca | 376 | x[i] = input[i]; |
wolfSSL | 13:f67a6c6013ca | 377 | } |
wolfSSL | 13:f67a6c6013ca | 378 | |
wolfSSL | 13:f67a6c6013ca | 379 | for (i = (ROUNDS); i > 0; i -= 2) { |
wolfSSL | 13:f67a6c6013ca | 380 | QUARTERROUND(0, 4, 8, 12) |
wolfSSL | 13:f67a6c6013ca | 381 | QUARTERROUND(1, 5, 9, 13) |
wolfSSL | 13:f67a6c6013ca | 382 | QUARTERROUND(2, 6, 10, 14) |
wolfSSL | 13:f67a6c6013ca | 383 | QUARTERROUND(3, 7, 11, 15) |
wolfSSL | 13:f67a6c6013ca | 384 | QUARTERROUND(0, 5, 10, 15) |
wolfSSL | 13:f67a6c6013ca | 385 | QUARTERROUND(1, 6, 11, 12) |
wolfSSL | 13:f67a6c6013ca | 386 | QUARTERROUND(2, 7, 8, 13) |
wolfSSL | 13:f67a6c6013ca | 387 | QUARTERROUND(3, 4, 9, 14) |
wolfSSL | 13:f67a6c6013ca | 388 | } |
wolfSSL | 13:f67a6c6013ca | 389 | |
wolfSSL | 13:f67a6c6013ca | 390 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 13:f67a6c6013ca | 391 | x[i] = PLUS(x[i], input[i]); |
wolfSSL | 13:f67a6c6013ca | 392 | } |
wolfSSL | 13:f67a6c6013ca | 393 | |
wolfSSL | 13:f67a6c6013ca | 394 | for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { |
wolfSSL | 13:f67a6c6013ca | 395 | output[i] = LITTLE32(x[i]); |
wolfSSL | 13:f67a6c6013ca | 396 | } |
wolfSSL | 13:f67a6c6013ca | 397 | } |
wolfSSL | 13:f67a6c6013ca | 398 | |
wolfSSL | 13:f67a6c6013ca | 399 | |
wolfSSL | 13:f67a6c6013ca | 400 | #ifdef USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 13:f67a6c6013ca | 401 | |
wolfSSL | 13:f67a6c6013ca | 402 | #ifdef HAVE_INTEL_AVX1 |
wolfSSL | 13:f67a6c6013ca | 403 | static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 13:f67a6c6013ca | 404 | word32 bytes) |
wolfSSL | 13:f67a6c6013ca | 405 | { |
wolfSSL | 13:f67a6c6013ca | 406 | ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 13:f67a6c6013ca | 407 | ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 13:f67a6c6013ca | 408 | byte* output; |
wolfSSL | 13:f67a6c6013ca | 409 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 410 | word32 cnt = 0; |
wolfSSL | 13:f67a6c6013ca | 411 | static const word64 add[2] = { 0x0000000100000000UL,0x0000000300000002UL }; |
wolfSSL | 13:f67a6c6013ca | 412 | static const word64 four[2] = { 0x0000000400000004UL,0x0000000400000004UL }; |
wolfSSL | 13:f67a6c6013ca | 413 | static const word64 rotl8[2] = |
wolfSSL | 13:f67a6c6013ca | 414 | { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; |
wolfSSL | 13:f67a6c6013ca | 415 | static const word64 rotl16[2] = |
wolfSSL | 13:f67a6c6013ca | 416 | { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; |
wolfSSL | 13:f67a6c6013ca | 417 | |
wolfSSL | 13:f67a6c6013ca | 418 | if (bytes == 0) |
wolfSSL | 13:f67a6c6013ca | 419 | return; |
wolfSSL | 13:f67a6c6013ca | 420 | |
wolfSSL | 13:f67a6c6013ca | 421 | __asm__ __volatile__ ( |
wolfSSL | 13:f67a6c6013ca | 422 | "movl %[bytes], %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 423 | "shrl $8, %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 424 | "jz L_end128\n\t" |
wolfSSL | 13:f67a6c6013ca | 425 | |
wolfSSL | 13:f67a6c6013ca | 426 | "vpshufd $0, (%[key]), %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 427 | "vpshufd $0, 4(%[key]), %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 428 | "vpshufd $0, 8(%[key]), %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 429 | "vpshufd $0, 12(%[key]), %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 430 | "vpshufd $0, 16(%[key]), %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 431 | "vpshufd $0, 20(%[key]), %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 432 | "vpshufd $0, 24(%[key]), %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 433 | "vpshufd $0, 28(%[key]), %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 434 | "vpshufd $0, 32(%[key]), %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 435 | "vpshufd $0, 36(%[key]), %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 436 | "vpshufd $0, 40(%[key]), %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 437 | "vpshufd $0, 44(%[key]), %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 438 | "vpshufd $0, 48(%[key]), %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 439 | "vpshufd $0, 52(%[key]), %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 440 | "vpshufd $0, 56(%[key]), %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 441 | "vpshufd $0, 60(%[key]), %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 442 | |
wolfSSL | 13:f67a6c6013ca | 443 | "vpaddd %[add], %%xmm12, %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 444 | |
wolfSSL | 13:f67a6c6013ca | 445 | "vmovdqa %%xmm0, (%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 446 | "vmovdqa %%xmm1, 16(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 447 | "vmovdqa %%xmm2, 32(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 448 | "vmovdqa %%xmm3, 48(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 449 | "vmovdqa %%xmm4, 64(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 450 | "vmovdqa %%xmm5, 80(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 451 | "vmovdqa %%xmm6, 96(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 452 | "vmovdqa %%xmm7, 112(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 453 | "vmovdqa %%xmm8, 128(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 454 | "vmovdqa %%xmm9, 144(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 455 | "vmovdqa %%xmm10, 160(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 456 | "vmovdqa %%xmm11, 176(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 457 | "vmovdqa %%xmm12, 192(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 458 | "vmovdqa %%xmm13, 208(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 459 | "vmovdqa %%xmm14, 224(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 460 | "vmovdqa %%xmm15, 240(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 461 | "\n" |
wolfSSL | 13:f67a6c6013ca | 462 | "L_enc128_loop:\n\t" |
wolfSSL | 13:f67a6c6013ca | 463 | "vmovdqa %%xmm11, 48(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 464 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 465 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 466 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 467 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 468 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 469 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 470 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 471 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 472 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 473 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 474 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 475 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 476 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 477 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 478 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 479 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 480 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 481 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 482 | QUARTERROUND_XMM() |
wolfSSL | 13:f67a6c6013ca | 483 | QUARTERROUND_XMM_2() |
wolfSSL | 13:f67a6c6013ca | 484 | "vmovdqa 48(%[x]), %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 485 | |
wolfSSL | 13:f67a6c6013ca | 486 | "vpaddd (%[X]), %%xmm0, %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 487 | "vpaddd 16(%[X]), %%xmm1, %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 488 | "vpaddd 32(%[X]), %%xmm2, %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 489 | "vpaddd 48(%[X]), %%xmm3, %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 490 | "vpaddd 64(%[X]), %%xmm4, %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 491 | "vpaddd 80(%[X]), %%xmm5, %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 492 | "vpaddd 96(%[X]), %%xmm6, %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 493 | "vpaddd 112(%[X]), %%xmm7, %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 494 | "vpaddd 128(%[X]), %%xmm8, %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 495 | "vpaddd 144(%[X]), %%xmm9, %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 496 | "vpaddd 160(%[X]), %%xmm10, %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 497 | "vpaddd 176(%[X]), %%xmm11, %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 498 | "vpaddd 192(%[X]), %%xmm12, %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 499 | "vpaddd 208(%[X]), %%xmm13, %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 500 | "vpaddd 224(%[X]), %%xmm14, %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 501 | "vpaddd 240(%[X]), %%xmm15, %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 502 | |
wolfSSL | 13:f67a6c6013ca | 503 | "vmovdqa %%xmm8, (%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 504 | "vmovdqa %%xmm9, 16(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 505 | "vmovdqa %%xmm10, 32(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 506 | "vmovdqa %%xmm11, 48(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 507 | "vmovdqa %%xmm12, 64(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 508 | "vmovdqa %%xmm13, 80(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 509 | "vmovdqa %%xmm14, 96(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 510 | "vmovdqa %%xmm15, 112(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 511 | |
wolfSSL | 13:f67a6c6013ca | 512 | "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 513 | "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 514 | "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 515 | "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 516 | "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 517 | "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 518 | "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 519 | "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 520 | "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 521 | "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 522 | "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 523 | "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 524 | "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 525 | "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 526 | "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 527 | "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 528 | "vmovdqu (%[in]), %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 529 | "vmovdqu 16(%[in]), %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 530 | "vmovdqu 64(%[in]), %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 531 | "vmovdqu 80(%[in]), %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 532 | "vmovdqu 128(%[in]), %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 533 | "vmovdqu 144(%[in]), %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 534 | "vmovdqu 192(%[in]), %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 535 | "vmovdqu 208(%[in]), %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 536 | "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 537 | "vpxor %%xmm9, %%xmm1, %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 538 | "vpxor %%xmm10, %%xmm2, %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 539 | "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 540 | "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 541 | "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 542 | "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 543 | "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 544 | "vmovdqu %%xmm0, (%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 545 | "vmovdqu %%xmm1, 16(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 546 | "vmovdqu %%xmm2, 64(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 547 | "vmovdqu %%xmm3, 80(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 548 | "vmovdqu %%xmm4, 128(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 549 | "vmovdqu %%xmm5, 144(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 550 | "vmovdqu %%xmm6, 192(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 551 | "vmovdqu %%xmm7, 208(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 552 | |
wolfSSL | 13:f67a6c6013ca | 553 | "vmovdqa (%[x]), %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 554 | "vmovdqa 16(%[x]), %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 555 | "vmovdqa 32(%[x]), %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 556 | "vmovdqa 48(%[x]), %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 557 | "vmovdqa 64(%[x]), %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 558 | "vmovdqa 80(%[x]), %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 559 | "vmovdqa 96(%[x]), %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 560 | "vmovdqa 112(%[x]), %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 561 | |
wolfSSL | 13:f67a6c6013ca | 562 | "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 563 | "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 564 | "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 565 | "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 566 | "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 567 | "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 568 | "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 569 | "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 570 | "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 571 | "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 572 | "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 573 | "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 574 | "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 575 | "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 576 | "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 577 | "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 578 | "vmovdqu 32(%[in]), %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 579 | "vmovdqu 48(%[in]), %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 580 | "vmovdqu 96(%[in]), %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 581 | "vmovdqu 112(%[in]), %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 582 | "vmovdqu 160(%[in]), %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 583 | "vmovdqu 176(%[in]), %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 584 | "vmovdqu 224(%[in]), %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 585 | "vmovdqu 240(%[in]), %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 586 | "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 587 | "vpxor %%xmm9, %%xmm1, %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 588 | "vpxor %%xmm10, %%xmm2, %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 589 | "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 590 | "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 591 | "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 592 | "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 593 | "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 594 | "vmovdqu %%xmm0, 32(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 595 | "vmovdqu %%xmm1, 48(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 596 | "vmovdqu %%xmm2, 96(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 597 | "vmovdqu %%xmm3, 112(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 598 | "vmovdqu %%xmm4, 160(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 599 | "vmovdqu %%xmm5, 176(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 600 | "vmovdqu %%xmm6, 224(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 601 | "vmovdqu %%xmm7, 240(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 602 | |
wolfSSL | 13:f67a6c6013ca | 603 | "vmovdqa 192(%[X]), %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 604 | "add $256, %[in]\n\t" |
wolfSSL | 13:f67a6c6013ca | 605 | "add $256, %[out]\n\t" |
wolfSSL | 13:f67a6c6013ca | 606 | "vpaddd %[four], %%xmm12, %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 607 | "sub $256, %[bytes]\n\t" |
wolfSSL | 13:f67a6c6013ca | 608 | "vmovdqa %%xmm12, 192(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 609 | "cmp $256, %[bytes]\n\t" |
wolfSSL | 13:f67a6c6013ca | 610 | "jl L_done\n\t" |
wolfSSL | 13:f67a6c6013ca | 611 | |
wolfSSL | 13:f67a6c6013ca | 612 | "vmovdqa (%[X]), %%xmm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 613 | "vmovdqa 16(%[X]), %%xmm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 614 | "vmovdqa 32(%[X]), %%xmm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 615 | "vmovdqa 48(%[X]), %%xmm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 616 | "vmovdqa 64(%[X]), %%xmm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 617 | "vmovdqa 80(%[X]), %%xmm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 618 | "vmovdqa 96(%[X]), %%xmm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 619 | "vmovdqa 112(%[X]), %%xmm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 620 | "vmovdqa 128(%[X]), %%xmm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 621 | "vmovdqa 144(%[X]), %%xmm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 622 | "vmovdqa 160(%[X]), %%xmm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 623 | "vmovdqa 176(%[X]), %%xmm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 624 | "vmovdqa 192(%[X]), %%xmm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 625 | "vmovdqa 208(%[X]), %%xmm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 626 | "vmovdqa 224(%[X]), %%xmm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 627 | "vmovdqa 240(%[X]), %%xmm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 628 | "jmp L_enc128_loop\n\t" |
wolfSSL | 13:f67a6c6013ca | 629 | |
wolfSSL | 13:f67a6c6013ca | 630 | "\n" |
wolfSSL | 13:f67a6c6013ca | 631 | "L_done:\n\t" |
wolfSSL | 13:f67a6c6013ca | 632 | |
wolfSSL | 13:f67a6c6013ca | 633 | "shl $2, %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 634 | "add 48(%[key]), %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 635 | "movl %[cnt], 48(%[key])\n\t" |
wolfSSL | 13:f67a6c6013ca | 636 | "\n" |
wolfSSL | 13:f67a6c6013ca | 637 | "L_end128:" |
wolfSSL | 13:f67a6c6013ca | 638 | : [bytes] "+r" (bytes), [cnt] "+r" (cnt), |
wolfSSL | 13:f67a6c6013ca | 639 | [in] "+r" (m), [out] "+r" (c) |
wolfSSL | 13:f67a6c6013ca | 640 | : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), |
wolfSSL | 13:f67a6c6013ca | 641 | [add] "m" (add), [four] "m" (four), |
wolfSSL | 13:f67a6c6013ca | 642 | [rotl8] "m" (rotl8), [rotl16] "m" (rotl16) |
wolfSSL | 13:f67a6c6013ca | 643 | : "xmm0", "xmm1", "xmm2", "xmm3", |
wolfSSL | 13:f67a6c6013ca | 644 | "xmm4", "xmm5", "xmm6", "xmm7", |
wolfSSL | 13:f67a6c6013ca | 645 | "xmm8", "xmm9", "xmm10", "xmm11", |
wolfSSL | 13:f67a6c6013ca | 646 | "xmm12", "xmm13", "xmm14", "xmm15", "memory" |
wolfSSL | 13:f67a6c6013ca | 647 | ); |
wolfSSL | 13:f67a6c6013ca | 648 | |
wolfSSL | 13:f67a6c6013ca | 649 | output = (byte*)x; |
wolfSSL | 13:f67a6c6013ca | 650 | for (; bytes > 0;) { |
wolfSSL | 13:f67a6c6013ca | 651 | wc_Chacha_wordtobyte(x, ctx->X); |
wolfSSL | 13:f67a6c6013ca | 652 | ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); |
wolfSSL | 13:f67a6c6013ca | 653 | if (bytes <= CHACHA_CHUNK_BYTES) { |
wolfSSL | 13:f67a6c6013ca | 654 | for (i = 0; i < bytes; ++i) { |
wolfSSL | 13:f67a6c6013ca | 655 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 656 | } |
wolfSSL | 13:f67a6c6013ca | 657 | return; |
wolfSSL | 13:f67a6c6013ca | 658 | } |
wolfSSL | 13:f67a6c6013ca | 659 | for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { |
wolfSSL | 13:f67a6c6013ca | 660 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 661 | } |
wolfSSL | 13:f67a6c6013ca | 662 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 663 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 664 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 665 | } |
wolfSSL | 13:f67a6c6013ca | 666 | } |
wolfSSL | 13:f67a6c6013ca | 667 | #endif /* HAVE_INTEL_AVX1 */ |
wolfSSL | 13:f67a6c6013ca | 668 | |
wolfSSL | 13:f67a6c6013ca | 669 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 13:f67a6c6013ca | 670 | static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 13:f67a6c6013ca | 671 | word32 bytes) |
wolfSSL | 13:f67a6c6013ca | 672 | { |
wolfSSL | 13:f67a6c6013ca | 673 | ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 13:f67a6c6013ca | 674 | ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 13:f67a6c6013ca | 675 | byte* output; |
wolfSSL | 13:f67a6c6013ca | 676 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 677 | word32 cnt = 0; |
wolfSSL | 13:f67a6c6013ca | 678 | static const word64 add[4] = { 0x0000000100000000UL, 0x0000000300000002UL, |
wolfSSL | 13:f67a6c6013ca | 679 | 0x0000000500000004UL, 0x0000000700000006UL }; |
wolfSSL | 13:f67a6c6013ca | 680 | static const word64 eight[4] = |
wolfSSL | 13:f67a6c6013ca | 681 | { 0x0000000800000008UL, 0x0000000800000008UL, |
wolfSSL | 13:f67a6c6013ca | 682 | 0x0000000800000008UL, 0x0000000800000008UL }; |
wolfSSL | 13:f67a6c6013ca | 683 | static const word64 rotl8[4] = |
wolfSSL | 13:f67a6c6013ca | 684 | { 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL, |
wolfSSL | 13:f67a6c6013ca | 685 | 0x0605040702010003UL, 0x0e0d0c0f0a09080bUL }; |
wolfSSL | 13:f67a6c6013ca | 686 | static const word64 rotl16[4] = |
wolfSSL | 13:f67a6c6013ca | 687 | { 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL, |
wolfSSL | 13:f67a6c6013ca | 688 | 0x0504070601000302UL, 0x0d0c0f0e09080b0aUL }; |
wolfSSL | 13:f67a6c6013ca | 689 | |
wolfSSL | 13:f67a6c6013ca | 690 | if (bytes == 0) |
wolfSSL | 13:f67a6c6013ca | 691 | return; |
wolfSSL | 13:f67a6c6013ca | 692 | |
wolfSSL | 13:f67a6c6013ca | 693 | __asm__ __volatile__ ( |
wolfSSL | 13:f67a6c6013ca | 694 | "movl %[bytes], %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 695 | "shrl $9, %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 696 | "jz L_end256\n\t" |
wolfSSL | 13:f67a6c6013ca | 697 | |
wolfSSL | 13:f67a6c6013ca | 698 | "vpbroadcastd (%[key]), %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 699 | "vpbroadcastd 4(%[key]), %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 700 | "vpbroadcastd 8(%[key]), %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 701 | "vpbroadcastd 12(%[key]), %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 702 | "vpbroadcastd 16(%[key]), %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 703 | "vpbroadcastd 20(%[key]), %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 704 | "vpbroadcastd 24(%[key]), %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 705 | "vpbroadcastd 28(%[key]), %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 706 | "vpbroadcastd 32(%[key]), %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 707 | "vpbroadcastd 36(%[key]), %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 708 | "vpbroadcastd 40(%[key]), %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 709 | "vpbroadcastd 44(%[key]), %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 710 | "vpbroadcastd 48(%[key]), %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 711 | "vpbroadcastd 52(%[key]), %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 712 | "vpbroadcastd 56(%[key]), %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 713 | "vpbroadcastd 60(%[key]), %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 714 | |
wolfSSL | 13:f67a6c6013ca | 715 | "vpaddd %[add], %%ymm12, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 716 | |
wolfSSL | 13:f67a6c6013ca | 717 | "vmovdqa %%ymm0, (%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 718 | "vmovdqa %%ymm1, 32(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 719 | "vmovdqa %%ymm2, 64(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 720 | "vmovdqa %%ymm3, 96(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 721 | "vmovdqa %%ymm4, 128(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 722 | "vmovdqa %%ymm5, 160(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 723 | "vmovdqa %%ymm6, 192(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 724 | "vmovdqa %%ymm7, 224(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 725 | "vmovdqa %%ymm8, 256(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 726 | "vmovdqa %%ymm9, 288(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 727 | "vmovdqa %%ymm10, 320(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 728 | "vmovdqa %%ymm11, 352(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 729 | "vmovdqa %%ymm12, 384(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 730 | "vmovdqa %%ymm13, 416(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 731 | "vmovdqa %%ymm14, 448(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 732 | "vmovdqa %%ymm15, 480(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 733 | "\n" |
wolfSSL | 13:f67a6c6013ca | 734 | "L_enc256_loop:\n\t" |
wolfSSL | 13:f67a6c6013ca | 735 | "vmovdqa %%ymm11, 96(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 736 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 737 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 738 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 739 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 740 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 741 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 742 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 743 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 744 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 745 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 746 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 747 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 748 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 749 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 750 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 751 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 752 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 753 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 754 | QUARTERROUND_YMM() |
wolfSSL | 13:f67a6c6013ca | 755 | QUARTERROUND_YMM_2() |
wolfSSL | 13:f67a6c6013ca | 756 | "vmovdqa 96(%[x]), %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 757 | |
wolfSSL | 13:f67a6c6013ca | 758 | "vpaddd (%[X]), %%ymm0, %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 759 | "vpaddd 32(%[X]), %%ymm1, %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 760 | "vpaddd 64(%[X]), %%ymm2, %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 761 | "vpaddd 96(%[X]), %%ymm3, %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 762 | "vpaddd 128(%[X]), %%ymm4, %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 763 | "vpaddd 160(%[X]), %%ymm5, %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 764 | "vpaddd 192(%[X]), %%ymm6, %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 765 | "vpaddd 224(%[X]), %%ymm7, %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 766 | "vpaddd 256(%[X]), %%ymm8, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 767 | "vpaddd 288(%[X]), %%ymm9, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 768 | "vpaddd 320(%[X]), %%ymm10, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 769 | "vpaddd 352(%[X]), %%ymm11, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 770 | "vpaddd 384(%[X]), %%ymm12, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 771 | "vpaddd 416(%[X]), %%ymm13, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 772 | "vpaddd 448(%[X]), %%ymm14, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 773 | "vpaddd 480(%[X]), %%ymm15, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 774 | |
wolfSSL | 13:f67a6c6013ca | 775 | "vmovdqa %%ymm8, (%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 776 | "vmovdqa %%ymm9, 32(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 777 | "vmovdqa %%ymm10, 64(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 778 | "vmovdqa %%ymm11, 96(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 779 | "vmovdqa %%ymm12, 128(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 780 | "vmovdqa %%ymm13, 160(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 781 | "vmovdqa %%ymm14, 192(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 782 | "vmovdqa %%ymm15, 224(%[x])\n\t" |
wolfSSL | 13:f67a6c6013ca | 783 | |
wolfSSL | 13:f67a6c6013ca | 784 | "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 785 | "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 786 | "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 787 | "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 788 | "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 789 | "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 790 | "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 791 | "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 792 | "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 793 | "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 794 | "vpunpckhqdq %%ymm9, %%ymm8, %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 795 | "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 796 | "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 797 | "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 798 | "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 799 | "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 800 | "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 801 | "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 802 | "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 803 | "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 804 | "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 805 | "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 806 | "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 807 | "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 808 | |
wolfSSL | 13:f67a6c6013ca | 809 | "vmovdqu (%[in]), %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 810 | "vmovdqu 64(%[in]), %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 811 | "vmovdqu 128(%[in]), %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 812 | "vmovdqu 192(%[in]), %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 813 | "vmovdqu 256(%[in]), %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 814 | "vmovdqu 320(%[in]), %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 815 | "vmovdqu 384(%[in]), %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 816 | "vmovdqu 448(%[in]), %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 817 | "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 818 | "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 819 | "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 820 | "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 821 | "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 822 | "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 823 | "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 824 | "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 825 | "vmovdqu %%ymm8, (%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 826 | "vmovdqu %%ymm9, 64(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 827 | "vmovdqu %%ymm10, 128(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 828 | "vmovdqu %%ymm11, 192(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 829 | "vmovdqu %%ymm12, 256(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 830 | "vmovdqu %%ymm13, 320(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 831 | "vmovdqu %%ymm14, 384(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 832 | "vmovdqu %%ymm15, 448(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 833 | |
wolfSSL | 13:f67a6c6013ca | 834 | "vmovdqa (%[x]), %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 835 | "vmovdqa 32(%[x]), %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 836 | "vmovdqa 64(%[x]), %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 837 | "vmovdqa 96(%[x]), %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 838 | "vmovdqa 128(%[x]), %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 839 | "vmovdqa 160(%[x]), %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 840 | "vmovdqa 192(%[x]), %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 841 | "vmovdqa 224(%[x]), %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 842 | |
wolfSSL | 13:f67a6c6013ca | 843 | "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 844 | "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 845 | "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 846 | "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 847 | "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 848 | "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 849 | "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 850 | "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 851 | "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 852 | "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 853 | "vpunpckhqdq %%ymm9 , %%ymm8, %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 854 | "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 855 | "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 856 | "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 857 | "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 858 | "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 859 | "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 860 | "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 861 | "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 862 | "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 863 | "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 864 | "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 865 | "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 866 | "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 867 | |
wolfSSL | 13:f67a6c6013ca | 868 | "vmovdqu 32(%[in]), %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 869 | "vmovdqu 96(%[in]), %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 870 | "vmovdqu 160(%[in]), %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 871 | "vmovdqu 224(%[in]), %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 872 | "vmovdqu 288(%[in]), %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 873 | "vmovdqu 352(%[in]), %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 874 | "vmovdqu 416(%[in]), %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 875 | "vmovdqu 480(%[in]), %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 876 | "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 877 | "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 878 | "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 879 | "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 880 | "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 881 | "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 882 | "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 883 | "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 884 | "vmovdqu %%ymm8, 32(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 885 | "vmovdqu %%ymm9, 96(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 886 | "vmovdqu %%ymm10, 160(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 887 | "vmovdqu %%ymm11, 224(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 888 | "vmovdqu %%ymm12, 288(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 889 | "vmovdqu %%ymm13, 352(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 890 | "vmovdqu %%ymm14, 416(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 891 | "vmovdqu %%ymm15, 480(%[out])\n\t" |
wolfSSL | 13:f67a6c6013ca | 892 | |
wolfSSL | 13:f67a6c6013ca | 893 | "vmovdqa 384(%[X]), %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 894 | "add $512, %[in]\n\t" |
wolfSSL | 13:f67a6c6013ca | 895 | "add $512, %[out]\n\t" |
wolfSSL | 13:f67a6c6013ca | 896 | "vpaddd %[eight], %%ymm12, %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 897 | "sub $512, %[bytes]\n\t" |
wolfSSL | 13:f67a6c6013ca | 898 | "vmovdqa %%ymm12, 384(%[X])\n\t" |
wolfSSL | 13:f67a6c6013ca | 899 | "cmp $512, %[bytes]\n\t" |
wolfSSL | 13:f67a6c6013ca | 900 | "jl L_done256\n\t" |
wolfSSL | 13:f67a6c6013ca | 901 | |
wolfSSL | 13:f67a6c6013ca | 902 | "vmovdqa (%[X]), %%ymm0\n\t" |
wolfSSL | 13:f67a6c6013ca | 903 | "vmovdqa 32(%[X]), %%ymm1\n\t" |
wolfSSL | 13:f67a6c6013ca | 904 | "vmovdqa 64(%[X]), %%ymm2\n\t" |
wolfSSL | 13:f67a6c6013ca | 905 | "vmovdqa 96(%[X]), %%ymm3\n\t" |
wolfSSL | 13:f67a6c6013ca | 906 | "vmovdqa 128(%[X]), %%ymm4\n\t" |
wolfSSL | 13:f67a6c6013ca | 907 | "vmovdqa 160(%[X]), %%ymm5\n\t" |
wolfSSL | 13:f67a6c6013ca | 908 | "vmovdqa 192(%[X]), %%ymm6\n\t" |
wolfSSL | 13:f67a6c6013ca | 909 | "vmovdqa 224(%[X]), %%ymm7\n\t" |
wolfSSL | 13:f67a6c6013ca | 910 | "vmovdqa 256(%[X]), %%ymm8\n\t" |
wolfSSL | 13:f67a6c6013ca | 911 | "vmovdqa 288(%[X]), %%ymm9\n\t" |
wolfSSL | 13:f67a6c6013ca | 912 | "vmovdqa 320(%[X]), %%ymm10\n\t" |
wolfSSL | 13:f67a6c6013ca | 913 | "vmovdqa 352(%[X]), %%ymm11\n\t" |
wolfSSL | 13:f67a6c6013ca | 914 | "vmovdqa 384(%[X]), %%ymm12\n\t" |
wolfSSL | 13:f67a6c6013ca | 915 | "vmovdqa 416(%[X]), %%ymm13\n\t" |
wolfSSL | 13:f67a6c6013ca | 916 | "vmovdqa 448(%[X]), %%ymm14\n\t" |
wolfSSL | 13:f67a6c6013ca | 917 | "vmovdqa 480(%[X]), %%ymm15\n\t" |
wolfSSL | 13:f67a6c6013ca | 918 | "jmp L_enc256_loop\n\t" |
wolfSSL | 13:f67a6c6013ca | 919 | "\n" |
wolfSSL | 13:f67a6c6013ca | 920 | "L_done256:\n\t" |
wolfSSL | 13:f67a6c6013ca | 921 | "shl $3, %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 922 | "add 48(%[key]), %[cnt]\n\t" |
wolfSSL | 13:f67a6c6013ca | 923 | "movl %[cnt], 48(%[key])\n\t" |
wolfSSL | 13:f67a6c6013ca | 924 | "\n" |
wolfSSL | 13:f67a6c6013ca | 925 | "L_end256:" |
wolfSSL | 13:f67a6c6013ca | 926 | : [bytes] "+r" (bytes), [cnt] "+r" (cnt), |
wolfSSL | 13:f67a6c6013ca | 927 | [in] "+r" (m), [out] "+r" (c) |
wolfSSL | 13:f67a6c6013ca | 928 | : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), |
wolfSSL | 13:f67a6c6013ca | 929 | [add] "m" (add), [eight] "m" (eight), |
wolfSSL | 13:f67a6c6013ca | 930 | [rotl8] "m" (rotl8), [rotl16] "m" (rotl16) |
wolfSSL | 13:f67a6c6013ca | 931 | : "ymm0", "ymm1", "ymm2", "ymm3", |
wolfSSL | 13:f67a6c6013ca | 932 | "ymm4", "ymm5", "ymm6", "ymm7", |
wolfSSL | 13:f67a6c6013ca | 933 | "ymm8", "ymm9", "ymm10", "ymm11", |
wolfSSL | 13:f67a6c6013ca | 934 | "ymm12", "ymm13", "ymm14", "ymm15", "memory" |
wolfSSL | 13:f67a6c6013ca | 935 | ); |
wolfSSL | 13:f67a6c6013ca | 936 | |
wolfSSL | 13:f67a6c6013ca | 937 | output = (byte*)x; |
wolfSSL | 13:f67a6c6013ca | 938 | for (; bytes > 0;) { |
wolfSSL | 13:f67a6c6013ca | 939 | wc_Chacha_wordtobyte(x, ctx->X); |
wolfSSL | 13:f67a6c6013ca | 940 | ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); |
wolfSSL | 13:f67a6c6013ca | 941 | if (bytes <= CHACHA_CHUNK_BYTES) { |
wolfSSL | 13:f67a6c6013ca | 942 | for (i = 0; i < bytes; ++i) { |
wolfSSL | 13:f67a6c6013ca | 943 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 944 | } |
wolfSSL | 13:f67a6c6013ca | 945 | return; |
wolfSSL | 13:f67a6c6013ca | 946 | } |
wolfSSL | 13:f67a6c6013ca | 947 | for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { |
wolfSSL | 13:f67a6c6013ca | 948 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 949 | } |
wolfSSL | 13:f67a6c6013ca | 950 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 951 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 952 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 953 | } |
wolfSSL | 13:f67a6c6013ca | 954 | } |
wolfSSL | 13:f67a6c6013ca | 955 | #endif /* HAVE_INTEL_AVX2 */ |
wolfSSL | 13:f67a6c6013ca | 956 | #endif /* USE_INTEL_CHACHA_SPEEDUP */ |
wolfSSL | 13:f67a6c6013ca | 957 | |
wolfSSL | 13:f67a6c6013ca | 958 | /** |
wolfSSL | 13:f67a6c6013ca | 959 | * Encrypt a stream of bytes |
wolfSSL | 13:f67a6c6013ca | 960 | */ |
wolfSSL | 13:f67a6c6013ca | 961 | static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, |
wolfSSL | 13:f67a6c6013ca | 962 | word32 bytes) |
wolfSSL | 13:f67a6c6013ca | 963 | { |
wolfSSL | 13:f67a6c6013ca | 964 | byte* output; |
wolfSSL | 13:f67a6c6013ca | 965 | word32 temp[CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ |
wolfSSL | 13:f67a6c6013ca | 966 | word32 i; |
wolfSSL | 13:f67a6c6013ca | 967 | |
wolfSSL | 13:f67a6c6013ca | 968 | output = (byte*)temp; |
wolfSSL | 13:f67a6c6013ca | 969 | |
wolfSSL | 13:f67a6c6013ca | 970 | for (; bytes > 0;) { |
wolfSSL | 13:f67a6c6013ca | 971 | wc_Chacha_wordtobyte(temp, ctx->X); |
wolfSSL | 13:f67a6c6013ca | 972 | ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); |
wolfSSL | 13:f67a6c6013ca | 973 | if (bytes <= CHACHA_CHUNK_BYTES) { |
wolfSSL | 13:f67a6c6013ca | 974 | for (i = 0; i < bytes; ++i) { |
wolfSSL | 13:f67a6c6013ca | 975 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 976 | } |
wolfSSL | 13:f67a6c6013ca | 977 | return; |
wolfSSL | 13:f67a6c6013ca | 978 | } |
wolfSSL | 13:f67a6c6013ca | 979 | for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { |
wolfSSL | 13:f67a6c6013ca | 980 | c[i] = m[i] ^ output[i]; |
wolfSSL | 13:f67a6c6013ca | 981 | } |
wolfSSL | 13:f67a6c6013ca | 982 | bytes -= CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 983 | c += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 984 | m += CHACHA_CHUNK_BYTES; |
wolfSSL | 13:f67a6c6013ca | 985 | } |
wolfSSL | 13:f67a6c6013ca | 986 | } |
wolfSSL | 13:f67a6c6013ca | 987 | |
wolfSSL | 13:f67a6c6013ca | 988 | /** |
wolfSSL | 13:f67a6c6013ca | 989 | * API to encrypt/decrypt a message of any size. |
wolfSSL | 13:f67a6c6013ca | 990 | */ |
wolfSSL | 13:f67a6c6013ca | 991 | int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, |
wolfSSL | 13:f67a6c6013ca | 992 | word32 msglen) |
wolfSSL | 13:f67a6c6013ca | 993 | { |
wolfSSL | 13:f67a6c6013ca | 994 | if (ctx == NULL) |
wolfSSL | 13:f67a6c6013ca | 995 | return BAD_FUNC_ARG; |
wolfSSL | 13:f67a6c6013ca | 996 | |
wolfSSL | 13:f67a6c6013ca | 997 | #ifdef USE_INTEL_CHACHA_SPEEDUP |
wolfSSL | 13:f67a6c6013ca | 998 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 13:f67a6c6013ca | 999 | if (IS_INTEL_AVX2(cpuid_get_flags())) |
wolfSSL | 13:f67a6c6013ca | 1000 | chacha_encrypt_avx2(ctx, input, output, msglen); |
wolfSSL | 13:f67a6c6013ca | 1001 | else |
wolfSSL | 13:f67a6c6013ca | 1002 | #endif |
wolfSSL | 13:f67a6c6013ca | 1003 | chacha_encrypt_avx(ctx, input, output, msglen); |
wolfSSL | 13:f67a6c6013ca | 1004 | return 0; |
wolfSSL | 13:f67a6c6013ca | 1005 | #endif |
wolfSSL | 13:f67a6c6013ca | 1006 | wc_Chacha_encrypt_bytes(ctx, input, output, msglen); |
wolfSSL | 13:f67a6c6013ca | 1007 | |
wolfSSL | 13:f67a6c6013ca | 1008 | return 0; |
wolfSSL | 13:f67a6c6013ca | 1009 | } |
wolfSSL | 13:f67a6c6013ca | 1010 | |
wolfSSL | 13:f67a6c6013ca | 1011 | #endif /* HAVE_CHACHA*/ |
wolfSSL | 13:f67a6c6013ca | 1012 | |
wolfSSL | 13:f67a6c6013ca | 1013 |