wolfSSL (wolfCrypt) library — ChaCha20 stream cipher implementation
Embed:
(wiki syntax)
Show/hide line numbers
chacha.c
00001 /* chacha.c 00002 * 00003 * Copyright (C) 2006-2017 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 * 00021 * based from 00022 * chacha-ref.c version 20080118 00023 * D. J. Bernstein 00024 * Public domain. 
00025 */ 00026 00027 00028 00029 #ifdef HAVE_CONFIG_H 00030 #include <config.h> 00031 #endif 00032 00033 #include <wolfcrypt/settings.h> 00034 00035 #ifdef HAVE_CHACHA 00036 00037 #include <wolfcrypt/chacha.h> 00038 #include <wolfcrypt/error-crypt.h> 00039 #include <wolfcrypt/logging.h> 00040 #include <wolfcrypt/cpuid.h> 00041 #ifdef NO_INLINE 00042 #include <wolfcrypt/misc.h> 00043 #else 00044 #define WOLFSSL_MISC_INCLUDED 00045 #include <wolfcrypt/src/misc.c> 00046 #endif 00047 00048 #ifdef CHACHA_AEAD_TEST 00049 #include <stdio.h> 00050 #endif 00051 00052 #ifdef USE_INTEL_CHACHA_SPEEDUP 00053 #include <emmintrin.h> 00054 #include <immintrin.h> 00055 00056 #if defined(__GNUC__) && ((__GNUC__ < 4) || \ 00057 (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) 00058 #define NO_AVX2_SUPPORT 00059 #endif 00060 #if defined(__clang__) && ((__clang_major__ < 3) || \ 00061 (__clang_major__ == 3 && __clang_minor__ <= 5)) 00062 #define NO_AVX2_SUPPORT 00063 #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) 00064 #undef NO_AVX2_SUPPORT 00065 #endif 00066 00067 #ifndef NO_AVX2_SUPPORT 00068 #define HAVE_INTEL_AVX2 00069 #endif 00070 00071 #if defined(_MSC_VER) 00072 #define CHACHA20_NOINLINE __declspec(noinline) 00073 #elif defined(__GNUC__) 00074 #define CHACHA20_NOINLINE __attribute__((noinline)) 00075 #else 00076 #define CHACHA20_NOINLINE 00077 #endif 00078 00079 static int cpuidFlagsSet = 0; 00080 static int cpuidFlags = 0; 00081 #endif 00082 00083 #ifdef BIG_ENDIAN_ORDER 00084 #define LITTLE32(x) ByteReverseWord32(x) 00085 #else 00086 #define LITTLE32(x) (x) 00087 #endif 00088 00089 /* Number of rounds */ 00090 #define ROUNDS 20 00091 00092 #define U32C(v) (v##U) 00093 #define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF)) 00094 #define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0]) 00095 00096 #define ROTATE(v,c) rotlFixed(v, c) 00097 #define XOR(v,w) ((v) ^ (w)) 00098 #define PLUS(v,w) (U32V((v) + (w))) 00099 #define PLUSONE(v) (PLUS((v),1)) 00100 00101 #define QUARTERROUND(a,b,c,d) \ 
00102 x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \ 00103 x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \ 00104 x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \ 00105 x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7); 00106 00107 00108 00109 #define QUARTERROUND_INTEL_ASM(a0,b0,c0,d0, \ 00110 a1,b1,c1,d1, \ 00111 a2,b2,c2,d2, \ 00112 a3,b3,c3,d3, \ 00113 t1,o1) \ 00114 "vpaddd "#b0", "#a0", "#a0"\n\t" \ 00115 "vpxor "#a0", "#d0", "#d0"\n\t" \ 00116 "vmovdqa "#o1"(%[x]), "#c3"\n\t" \ 00117 "vpshufb %[rotl16], "#d0", "#d0"\n\t" \ 00118 "vpaddd "#d0", "#c0", "#c0"\n\t" \ 00119 "vpxor "#c0", "#b0", "#b0"\n\t" \ 00120 "vpaddd "#b1", "#a1", "#a1"\n\t" \ 00121 "vpxor "#a1", "#d1", "#d1"\n\t" \ 00122 "vpshufb %[rotl16], "#d1", "#d1"\n\t" \ 00123 "vpaddd "#d1", "#c1", "#c1"\n\t" \ 00124 "vpxor "#c1", "#b1", "#b1"\n\t" \ 00125 "vpaddd "#b2", "#a2", "#a2"\n\t" \ 00126 "vpxor "#a2", "#d2", "#d2"\n\t" \ 00127 "vpshufb %[rotl16], "#d2", "#d2"\n\t" \ 00128 "vpaddd "#d2", "#c2", "#c2"\n\t" \ 00129 "vpxor "#c2", "#b2", "#b2"\n\t" \ 00130 "vpaddd "#b3", "#a3", "#a3"\n\t" \ 00131 "vpxor "#a3", "#d3", "#d3"\n\t" \ 00132 "vpshufb %[rotl16], "#d3", "#d3"\n\t" \ 00133 "vpaddd "#d3", "#c3", "#c3"\n\t" \ 00134 "vpxor "#c3", "#b3", "#b3"\n\t" \ 00135 "vmovdqa "#c3", "#o1"(%[x])\n\t" \ 00136 "vpsrld $20, "#b0", "#t1"\n\t" \ 00137 "vpslld $12, "#b0", "#b0"\n\t" \ 00138 "vpxor "#t1", "#b0", "#b0"\n\t" \ 00139 "vpsrld $20, "#b1", "#t1"\n\t" \ 00140 "vpslld $12, "#b1", "#b1"\n\t" \ 00141 "vpxor "#t1", "#b1", "#b1"\n\t" \ 00142 "vpsrld $20, "#b2", "#t1"\n\t" \ 00143 "vpslld $12, "#b2", "#b2"\n\t" \ 00144 "vpxor "#t1", "#b2", "#b2"\n\t" \ 00145 "vpsrld $20, "#b3", "#t1"\n\t" \ 00146 "vpslld $12, "#b3", "#b3"\n\t" \ 00147 "vpxor "#t1", "#b3", "#b3"\n\t" \ 00148 "vpaddd "#b0", "#a0", "#a0"\n\t" \ 00149 "vpxor "#a0", "#d0", "#d0"\n\t" \ 00150 "vmovdqa "#o1"(%[x]), "#c3"\n\t" \ 00151 "vpshufb %[rotl8], "#d0", "#d0"\n\t" \ 00152 "vpaddd "#d0", "#c0", 
"#c0"\n\t" \ 00153 "vpxor "#c0", "#b0", "#b0"\n\t" \ 00154 "vpaddd "#b1", "#a1", "#a1"\n\t" \ 00155 "vpxor "#a1", "#d1", "#d1"\n\t" \ 00156 "vpshufb %[rotl8], "#d1", "#d1"\n\t" \ 00157 "vpaddd "#d1", "#c1", "#c1"\n\t" \ 00158 "vpxor "#c1", "#b1", "#b1"\n\t" \ 00159 "vpaddd "#b2", "#a2", "#a2"\n\t" \ 00160 "vpxor "#a2", "#d2", "#d2"\n\t" \ 00161 "vpshufb %[rotl8], "#d2", "#d2"\n\t" \ 00162 "vpaddd "#d2", "#c2", "#c2"\n\t" \ 00163 "vpxor "#c2", "#b2", "#b2"\n\t" \ 00164 "vpaddd "#b3", "#a3", "#a3"\n\t" \ 00165 "vpxor "#a3", "#d3", "#d3"\n\t" \ 00166 "vpshufb %[rotl8], "#d3", "#d3"\n\t" \ 00167 "vpaddd "#d3", "#c3", "#c3"\n\t" \ 00168 "vpxor "#c3", "#b3", "#b3"\n\t" \ 00169 "vmovdqa "#c3", "#o1"(%[x])\n\t" \ 00170 "vpsrld $25, "#b0", "#t1"\n\t" \ 00171 "vpslld $7, "#b0", "#b0"\n\t" \ 00172 "vpxor "#t1", "#b0", "#b0"\n\t" \ 00173 "vpsrld $25, "#b1", "#t1"\n\t" \ 00174 "vpslld $7, "#b1", "#b1"\n\t" \ 00175 "vpxor "#t1", "#b1", "#b1"\n\t" \ 00176 "vpsrld $25, "#b2", "#t1"\n\t" \ 00177 "vpslld $7, "#b2", "#b2"\n\t" \ 00178 "vpxor "#t1", "#b2", "#b2"\n\t" \ 00179 "vpsrld $25, "#b3", "#t1"\n\t" \ 00180 "vpslld $7, "#b3", "#b3"\n\t" \ 00181 "vpxor "#t1", "#b3", "#b3"\n\t" 00182 00183 #define QUARTERROUND_INTEL_ASM_2(a0,b0,c0,d0, \ 00184 a1,b1,c1,d1, \ 00185 a2,b2,c2,d2, \ 00186 a3,b3,c3,d3, \ 00187 t1,o1) \ 00188 "vpaddd "#b0", "#a0", "#a0"\n\t" \ 00189 "vpxor "#a0", "#d0", "#d0"\n\t" \ 00190 "vmovdqa "#o1"(%[x]), "#c1"\n\t" \ 00191 "vpshufb %[rotl16], "#d0", "#d0"\n\t" \ 00192 "vpaddd "#d0", "#c0", "#c0"\n\t" \ 00193 "vpxor "#c0", "#b0", "#b0"\n\t" \ 00194 "vpaddd "#b1", "#a1", "#a1"\n\t" \ 00195 "vpxor "#a1", "#d1", "#d1"\n\t" \ 00196 "vpshufb %[rotl16], "#d1", "#d1"\n\t" \ 00197 "vpaddd "#d1", "#c1", "#c1"\n\t" \ 00198 "vpxor "#c1", "#b1", "#b1"\n\t" \ 00199 "vpaddd "#b2", "#a2", "#a2"\n\t" \ 00200 "vpxor "#a2", "#d2", "#d2"\n\t" \ 00201 "vpshufb %[rotl16], "#d2", "#d2"\n\t" \ 00202 "vpaddd "#d2", "#c2", "#c2"\n\t" \ 00203 "vpxor "#c2", "#b2", "#b2"\n\t" \ 00204 "vpaddd 
"#b3", "#a3", "#a3"\n\t" \ 00205 "vpxor "#a3", "#d3", "#d3"\n\t" \ 00206 "vpshufb %[rotl16], "#d3", "#d3"\n\t" \ 00207 "vpaddd "#d3", "#c3", "#c3"\n\t" \ 00208 "vpxor "#c3", "#b3", "#b3"\n\t" \ 00209 "vmovdqa "#c1", "#o1"(%[x])\n\t" \ 00210 "vpsrld $20, "#b0", "#t1"\n\t" \ 00211 "vpslld $12, "#b0", "#b0"\n\t" \ 00212 "vpxor "#t1", "#b0", "#b0"\n\t" \ 00213 "vpsrld $20, "#b1", "#t1"\n\t" \ 00214 "vpslld $12, "#b1", "#b1"\n\t" \ 00215 "vpxor "#t1", "#b1", "#b1"\n\t" \ 00216 "vpsrld $20, "#b2", "#t1"\n\t" \ 00217 "vpslld $12, "#b2", "#b2"\n\t" \ 00218 "vpxor "#t1", "#b2", "#b2"\n\t" \ 00219 "vpsrld $20, "#b3", "#t1"\n\t" \ 00220 "vpslld $12, "#b3", "#b3"\n\t" \ 00221 "vpxor "#t1", "#b3", "#b3"\n\t" \ 00222 "vpaddd "#b0", "#a0", "#a0"\n\t" \ 00223 "vpxor "#a0", "#d0", "#d0"\n\t" \ 00224 "vmovdqa "#o1"(%[x]), "#c1"\n\t" \ 00225 "vpshufb %[rotl8], "#d0", "#d0"\n\t" \ 00226 "vpaddd "#d0", "#c0", "#c0"\n\t" \ 00227 "vpxor "#c0", "#b0", "#b0"\n\t" \ 00228 "vpaddd "#b1", "#a1", "#a1"\n\t" \ 00229 "vpxor "#a1", "#d1", "#d1"\n\t" \ 00230 "vpshufb %[rotl8], "#d1", "#d1"\n\t" \ 00231 "vpaddd "#d1", "#c1", "#c1"\n\t" \ 00232 "vpxor "#c1", "#b1", "#b1"\n\t" \ 00233 "vpaddd "#b2", "#a2", "#a2"\n\t" \ 00234 "vpxor "#a2", "#d2", "#d2"\n\t" \ 00235 "vpshufb %[rotl8], "#d2", "#d2"\n\t" \ 00236 "vpaddd "#d2", "#c2", "#c2"\n\t" \ 00237 "vpxor "#c2", "#b2", "#b2"\n\t" \ 00238 "vpaddd "#b3", "#a3", "#a3"\n\t" \ 00239 "vpxor "#a3", "#d3", "#d3"\n\t" \ 00240 "vpshufb %[rotl8], "#d3", "#d3"\n\t" \ 00241 "vpaddd "#d3", "#c3", "#c3"\n\t" \ 00242 "vpxor "#c3", "#b3", "#b3"\n\t" \ 00243 "vmovdqa "#c1", "#o1"(%[x])\n\t" \ 00244 "vpsrld $25, "#b0", "#t1"\n\t" \ 00245 "vpslld $7, "#b0", "#b0"\n\t" \ 00246 "vpxor "#t1", "#b0", "#b0"\n\t" \ 00247 "vpsrld $25, "#b1", "#t1"\n\t" \ 00248 "vpslld $7, "#b1", "#b1"\n\t" \ 00249 "vpxor "#t1", "#b1", "#b1"\n\t" \ 00250 "vpsrld $25, "#b2", "#t1"\n\t" \ 00251 "vpslld $7, "#b2", "#b2"\n\t" \ 00252 "vpxor "#t1", "#b2", "#b2"\n\t" \ 00253 "vpsrld $25, "#b3", 
"#t1"\n\t" \ 00254 "vpslld $7, "#b3", "#b3"\n\t" \ 00255 "vpxor "#t1", "#b3", "#b3"\n\t" 00256 00257 00258 #define QUARTERROUND_XMM() \ 00259 QUARTERROUND_INTEL_ASM(%%xmm0,%%xmm4,%%xmm8,%%xmm12, \ 00260 %%xmm1,%%xmm5,%%xmm9,%%xmm13, \ 00261 %%xmm2,%%xmm6,%%xmm10,%%xmm14, \ 00262 %%xmm3,%%xmm7,%%xmm11,%%xmm15, \ 00263 %%xmm11,48) 00264 #define QUARTERROUND_XMM_2() \ 00265 QUARTERROUND_INTEL_ASM_2(%%xmm0,%%xmm5,%%xmm10,%%xmm15, \ 00266 %%xmm1,%%xmm6,%%xmm11,%%xmm12, \ 00267 %%xmm2,%%xmm7,%%xmm8,%%xmm13, \ 00268 %%xmm3,%%xmm4,%%xmm9,%%xmm14, \ 00269 %%xmm11,48) 00270 00271 #define QUARTERROUND_YMM() \ 00272 QUARTERROUND_INTEL_ASM(%%ymm0,%%ymm4,%%ymm8,%%ymm12, \ 00273 %%ymm1,%%ymm5,%%ymm9,%%ymm13, \ 00274 %%ymm2,%%ymm6,%%ymm10,%%ymm14, \ 00275 %%ymm3,%%ymm7,%%ymm11,%%ymm15, \ 00276 %%ymm11,96) 00277 #define QUARTERROUND_YMM_2() \ 00278 QUARTERROUND_INTEL_ASM_2(%%ymm0,%%ymm5,%%ymm10,%%ymm15, \ 00279 %%ymm1,%%ymm6,%%ymm11,%%ymm12, \ 00280 %%ymm2,%%ymm7,%%ymm8,%%ymm13, \ 00281 %%ymm3,%%ymm4,%%ymm9,%%ymm14, \ 00282 %%ymm11,96) 00283 00284 /** 00285 * Set up iv(nonce). Earlier versions used 64 bits instead of 96, this version 00286 * uses the typical AEAD 96 bit nonce and can do record sizes of 256 GB. 
00287 */ 00288 int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter) 00289 { 00290 word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */ 00291 00292 #ifdef CHACHA_AEAD_TEST 00293 word32 i; 00294 printf("NONCE : "); 00295 for (i = 0; i < CHACHA_IV_BYTES; i++) { 00296 printf("%02x", inIv[i]); 00297 } 00298 printf("\n\n"); 00299 #endif 00300 00301 if (ctx == NULL) 00302 return BAD_FUNC_ARG; 00303 00304 XMEMCPY(temp, inIv, CHACHA_IV_BYTES); 00305 00306 ctx->X[CHACHA_IV_BYTES+0] = counter; /* block counter */ 00307 ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */ 00308 ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */ 00309 ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */ 00310 00311 return 0; 00312 } 00313 00314 /* "expand 32-byte k" as unsigned 32 byte */ 00315 static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; 00316 /* "expand 16-byte k" as unsigned 16 byte */ 00317 static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574}; 00318 00319 /** 00320 * Key setup. 
8 word iv (nonce) 00321 */ 00322 int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz) 00323 { 00324 const word32* constants; 00325 const byte* k; 00326 00327 #ifdef XSTREAM_ALIGN 00328 word32 alignKey[8]; 00329 #endif 00330 00331 if (ctx == NULL) 00332 return BAD_FUNC_ARG; 00333 00334 if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ) 00335 return BAD_FUNC_ARG; 00336 00337 #ifdef XSTREAM_ALIGN 00338 if ((wolfssl_word)key % 4) { 00339 WOLFSSL_MSG("wc_ChachaSetKey unaligned key"); 00340 XMEMCPY(alignKey, key, keySz); 00341 k = (byte*)alignKey; 00342 } 00343 else { 00344 k = key; 00345 } 00346 #else 00347 k = key; 00348 #endif /* XSTREAM_ALIGN */ 00349 00350 #ifdef CHACHA_AEAD_TEST 00351 word32 i; 00352 printf("ChaCha key used :\n"); 00353 for (i = 0; i < keySz; i++) { 00354 printf("%02x", key[i]); 00355 if ((i + 1) % 8 == 0) 00356 printf("\n"); 00357 } 00358 printf("\n\n"); 00359 #endif 00360 00361 ctx->X[4] = U8TO32_LITTLE(k + 0); 00362 ctx->X[5] = U8TO32_LITTLE(k + 4); 00363 ctx->X[6] = U8TO32_LITTLE(k + 8); 00364 ctx->X[7] = U8TO32_LITTLE(k + 12); 00365 if (keySz == CHACHA_MAX_KEY_SZ) { 00366 k += 16; 00367 constants = sigma; 00368 } 00369 else { 00370 constants = tau; 00371 } 00372 ctx->X[ 8] = U8TO32_LITTLE(k + 0); 00373 ctx->X[ 9] = U8TO32_LITTLE(k + 4); 00374 ctx->X[10] = U8TO32_LITTLE(k + 8); 00375 ctx->X[11] = U8TO32_LITTLE(k + 12); 00376 ctx->X[ 0] = constants[0]; 00377 ctx->X[ 1] = constants[1]; 00378 ctx->X[ 2] = constants[2]; 00379 ctx->X[ 3] = constants[3]; 00380 00381 return 0; 00382 } 00383 00384 /** 00385 * Converts word into bytes with rotations having been done. 
00386 */ 00387 static WC_INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS], 00388 const word32 input[CHACHA_CHUNK_WORDS]) 00389 { 00390 word32 x[CHACHA_CHUNK_WORDS]; 00391 word32 i; 00392 00393 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { 00394 x[i] = input[i]; 00395 } 00396 00397 for (i = (ROUNDS); i > 0; i -= 2) { 00398 QUARTERROUND(0, 4, 8, 12) 00399 QUARTERROUND(1, 5, 9, 13) 00400 QUARTERROUND(2, 6, 10, 14) 00401 QUARTERROUND(3, 7, 11, 15) 00402 QUARTERROUND(0, 5, 10, 15) 00403 QUARTERROUND(1, 6, 11, 12) 00404 QUARTERROUND(2, 7, 8, 13) 00405 QUARTERROUND(3, 4, 9, 14) 00406 } 00407 00408 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { 00409 x[i] = PLUS(x[i], input[i]); 00410 } 00411 00412 for (i = 0; i < CHACHA_CHUNK_WORDS; i++) { 00413 output[i] = LITTLE32(x[i]); 00414 } 00415 } 00416 00417 00418 #ifdef USE_INTEL_CHACHA_SPEEDUP 00419 00420 #define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \ 00421 "addl "#r12", "#r11"\n\t" \ 00422 "addl "#r22", "#r21"\n\t" \ 00423 "xorl "#r11", "#r14"\n\t" \ 00424 "xorl "#r21", "#r24"\n\t" \ 00425 "roll $16, "#r14"\n\t" \ 00426 "roll $16, "#r24"\n\t" \ 00427 "addl "#r14", "#r13"\n\t" \ 00428 "addl "#r24", "#r23"\n\t" \ 00429 "xorl "#r13", "#r12"\n\t" \ 00430 "xorl "#r23", "#r22"\n\t" \ 00431 "roll $12, "#r12"\n\t" \ 00432 "roll $12, "#r22"\n\t" \ 00433 "addl "#r12", "#r11"\n\t" \ 00434 "addl "#r22", "#r21"\n\t" \ 00435 "xorl "#r11", "#r14"\n\t" \ 00436 "xorl "#r21", "#r24"\n\t" \ 00437 "roll $8, "#r14"\n\t" \ 00438 "roll $8, "#r24"\n\t" \ 00439 "addl "#r14", "#r13"\n\t" \ 00440 "addl "#r24", "#r23"\n\t" \ 00441 "xorl "#r13", "#r12"\n\t" \ 00442 "xorl "#r23", "#r22"\n\t" \ 00443 "roll $7, "#r12"\n\t" \ 00444 "roll $7, "#r22"\n\t" \ 00445 00446 #define CHACHA_CRYPT_X64() \ 00447 "subq $40, %%rsp\n\t" \ 00448 "movq 32(%[input]), %%rax\n\t" \ 00449 "movq 40(%[input]), %%rdx\n\t" \ 00450 "movq %%rax, 8(%%rsp)\n\t" \ 00451 "movq %%rdx, 16(%%rsp)\n\t" \ 00452 "movl 0(%[input]), %%eax\n\t" \ 00453 "movl 
4(%[input]), %%ebx\n\t" \ 00454 "movl 8(%[input]), %%ecx\n\t" \ 00455 "movl 12(%[input]), %%edx\n\t" \ 00456 "movl 16(%[input]), %%r8d\n\t" \ 00457 "movl 20(%[input]), %%r9d\n\t" \ 00458 "movl 24(%[input]), %%r10d\n\t" \ 00459 "movl 28(%[input]), %%r11d\n\t" \ 00460 "movl 48(%[input]), %%r12d\n\t" \ 00461 "movl 52(%[input]), %%r13d\n\t" \ 00462 "movl 56(%[input]), %%r14d\n\t" \ 00463 "movl 60(%[input]), %%r15d\n\t" \ 00464 "movb $10, (%%rsp)\n\t" \ 00465 "movq %%rsi, 32(%%rsp)\n\t" \ 00466 "movq %%rdi, 24(%%rsp)\n\t" \ 00467 "movl 8(%%rsp), %%esi\n\t" \ 00468 "movl 12(%%rsp), %%edi\n\t" \ 00469 "\n" \ 00470 "1:\n\t" \ 00471 QUARTERROUND_2_X64(%%eax, %%r8d, %%esi, %%r12d, \ 00472 %%ebx, %%r9d, %%edi, %%r13d) \ 00473 "movl %%esi, 8(%%rsp)\n\t" \ 00474 "movl %%edi, 12(%%rsp)\n\t" \ 00475 "movl 16(%%rsp), %%esi\n\t" \ 00476 "movl 20(%%rsp), %%edi\n\t" \ 00477 QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d, \ 00478 %%edx, %%r11d, %%edi, %%r15d) \ 00479 QUARTERROUND_2_X64(%%eax, %%r9d, %%esi, %%r15d, \ 00480 %%ebx, %%r10d, %%edi, %%r12d) \ 00481 "movl %%esi, 16(%%rsp)\n\t" \ 00482 "movl %%edi, 20(%%rsp)\n\t" \ 00483 "movl 8(%%rsp), %%esi\n\t" \ 00484 "movl 12(%%rsp), %%edi\n\t" \ 00485 QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d, \ 00486 %%edx, %%r8d, %%edi, %%r14d) \ 00487 "decb (%%rsp)\n\t" \ 00488 "jnz 1b\n\t" \ 00489 "movl %%esi, 8(%%rsp)\n\t" \ 00490 "movl %%edi, 12(%%rsp)\n\t" \ 00491 "movq 32(%%rsp), %%rsi\n\t" \ 00492 "movq 24(%%rsp), %%rdi\n\t" \ 00493 "addl 0(%[input]), %%eax\n\t" \ 00494 "addl 4(%[input]), %%ebx\n\t" \ 00495 "addl 8(%[input]), %%ecx\n\t" \ 00496 "addl 12(%[input]), %%edx\n\t" \ 00497 "addl 16(%[input]), %%r8d\n\t" \ 00498 "addl 20(%[input]), %%r9d\n\t" \ 00499 "addl 24(%[input]), %%r10d\n\t" \ 00500 "addl 28(%[input]), %%r11d\n\t" \ 00501 "addl 48(%[input]), %%r12d\n\t" \ 00502 "addl 52(%[input]), %%r13d\n\t" \ 00503 "addl 56(%[input]), %%r14d\n\t" \ 00504 "addl 60(%[input]), %%r15d\n\t" \ 00505 00506 #define CHACHA_PARTIAL_CHUNK_X64() \ 
00507 __asm__ __volatile__ ( \ 00508 CHACHA_CRYPT_X64() \ 00509 "movl %%eax , 0(%[c])\n\t" \ 00510 "movl %%ebx , 4(%[c])\n\t" \ 00511 "movl %%ecx , 8(%[c])\n\t" \ 00512 "movl %%edx , 12(%[c])\n\t" \ 00513 "movl %%r8d , 16(%[c])\n\t" \ 00514 "movl %%r9d , 20(%[c])\n\t" \ 00515 "movl %%r10d, 24(%[c])\n\t" \ 00516 "movl %%r11d, 28(%[c])\n\t" \ 00517 "movl %%r12d, 48(%[c])\n\t" \ 00518 "movl %%r13d, 52(%[c])\n\t" \ 00519 "movl %%r14d, 56(%[c])\n\t" \ 00520 "movl %%r15d, 60(%[c])\n\t" \ 00521 "movl 8(%%rsp), %%eax\n\t" \ 00522 "movl 12(%%rsp), %%ebx\n\t" \ 00523 "movl 16(%%rsp), %%ecx\n\t" \ 00524 "movl 20(%%rsp), %%edx\n\t" \ 00525 "addl 32(%[input]), %%eax\n\t" \ 00526 "addl 36(%[input]), %%ebx\n\t" \ 00527 "addl 40(%[input]), %%ecx\n\t" \ 00528 "addl 44(%[input]), %%edx\n\t" \ 00529 "movl %%eax , 32(%[c])\n\t" \ 00530 "movl %%ebx , 36(%[c])\n\t" \ 00531 "movl %%ecx , 40(%[c])\n\t" \ 00532 "movl %%edx , 44(%[c])\n\t" \ 00533 "addl $1, 48(%[input])\n\t" \ 00534 "addq $40, %%rsp\n\t" \ 00535 "movq %[output], %%rax\n\t" \ 00536 "movq %[m], %%rbx\n\t" \ 00537 "movl %[bytes], %%r8d\n\t" \ 00538 "xorq %%rdx, %%rdx\n\t" \ 00539 "movl %%r8d, %%r9d\n\t" \ 00540 "andl $7, %%r9d\n\t" \ 00541 "jz 4f\n\t" \ 00542 "\n" \ 00543 "2:\n\t" \ 00544 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ 00545 "xorb (%%rbx,%%rdx,1), %%cl\n\t" \ 00546 "movb %%cl, (%%rax,%%rdx,1)\n\t" \ 00547 "incl %%edx\n\t" \ 00548 "cmpl %%r9d, %%edx\n\t" \ 00549 "jne 2b\n\t" \ 00550 "je 3f\n\t" \ 00551 "\n" \ 00552 "4:\n\t" \ 00553 "movq (%[c],%%rdx,1), %%rcx\n\t" \ 00554 "xorq (%%rbx,%%rdx,1), %%rcx\n\t" \ 00555 "movq %%rcx, (%%rax,%%rdx,1)\n\t" \ 00556 "addl $8, %%edx\n\t" \ 00557 "\n" \ 00558 "3:\n\t" \ 00559 "cmpl %%r8d, %%edx\n\t" \ 00560 "jne 4b\n\t" \ 00561 : \ 00562 : [input] "r" (ctx->X), [c] "r" (x), \ 00563 [output] "m" (c), [bytes] "m" (bytes), [m] "m" (m) \ 00564 : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ 00565 "r14", "r15", "memory" \ 00566 ) 00567 00568 00569 #define 
CHACHA_CHUNK_X64() \ 00570 __asm__ __volatile__ ( \ 00571 CHACHA_CRYPT_X64() \ 00572 "movq %%rsi, 32(%%rsp)\n\t" \ 00573 "addq $40, %%rsp\n\t" \ 00574 "movq %[m], %%rsi\n\t" \ 00575 "subq $40, %%rsp\n\t" \ 00576 "xorl 0(%%rsi), %%eax\n\t" \ 00577 "xorl 4(%%rsi), %%ebx\n\t" \ 00578 "xorl 8(%%rsi), %%ecx\n\t" \ 00579 "xorl 12(%%rsi), %%edx\n\t" \ 00580 "xorl 16(%%rsi), %%r8d\n\t" \ 00581 "xorl 20(%%rsi), %%r9d\n\t" \ 00582 "xorl 24(%%rsi), %%r10d\n\t" \ 00583 "xorl 28(%%rsi), %%r11d\n\t" \ 00584 "xorl 48(%%rsi), %%r12d\n\t" \ 00585 "xorl 52(%%rsi), %%r13d\n\t" \ 00586 "xorl 56(%%rsi), %%r14d\n\t" \ 00587 "xorl 60(%%rsi), %%r15d\n\t" \ 00588 "movq 32(%%rsp), %%rsi\n\t" \ 00589 "movl %%eax , 0(%[c])\n\t" \ 00590 "movl %%ebx , 4(%[c])\n\t" \ 00591 "movl %%ecx , 8(%[c])\n\t" \ 00592 "movl %%edx , 12(%[c])\n\t" \ 00593 "movl %%r8d , 16(%[c])\n\t" \ 00594 "movl %%r9d , 20(%[c])\n\t" \ 00595 "movl %%r10d, 24(%[c])\n\t" \ 00596 "movl %%r11d, 28(%[c])\n\t" \ 00597 "movl %%r12d, 48(%[c])\n\t" \ 00598 "movl %%r13d, 52(%[c])\n\t" \ 00599 "movl %%r14d, 56(%[c])\n\t" \ 00600 "movl %%r15d, 60(%[c])\n\t" \ 00601 "addq $40, %%rsp\n\t" \ 00602 "movq %[m], %%r8\n\t" \ 00603 "subq $40, %%rsp\n\t" \ 00604 "movl 8(%%rsp), %%eax\n\t" \ 00605 "movl 12(%%rsp), %%ebx\n\t" \ 00606 "movl 16(%%rsp), %%ecx\n\t" \ 00607 "movl 20(%%rsp), %%edx\n\t" \ 00608 "addl 32(%[input]), %%eax\n\t" \ 00609 "addl 36(%[input]), %%ebx\n\t" \ 00610 "addl 40(%[input]), %%ecx\n\t" \ 00611 "addl 44(%[input]), %%edx\n\t" \ 00612 "xorl 32(%%r8), %%eax\n\t" \ 00613 "xorl 36(%%r8), %%ebx\n\t" \ 00614 "xorl 40(%%r8), %%ecx\n\t" \ 00615 "xorl 44(%%r8), %%edx\n\t" \ 00616 "movl %%eax , 32(%[c])\n\t" \ 00617 "movl %%ebx , 36(%[c])\n\t" \ 00618 "movl %%ecx , 40(%[c])\n\t" \ 00619 "movl %%edx , 44(%[c])\n\t" \ 00620 "addl $1, 48(%[input])\n\t" \ 00621 "addq $40, %%rsp\n\t" \ 00622 : \ 00623 : [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m) \ 00624 : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13", \ 00625 
"r14", "r15", "memory" \ 00626 ) 00627 00628 00629 static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c, 00630 word32 bytes) 00631 { 00632 word32 x[CHACHA_CHUNK_WORDS]; 00633 00634 if (bytes == 0) 00635 return; 00636 00637 for (; bytes >= CHACHA_CHUNK_BYTES;) { 00638 CHACHA_CHUNK_X64(); 00639 bytes -= CHACHA_CHUNK_BYTES; 00640 c += CHACHA_CHUNK_BYTES; 00641 m += CHACHA_CHUNK_BYTES; 00642 } 00643 if (bytes > 0) { 00644 CHACHA_PARTIAL_CHUNK_X64(); 00645 } 00646 } 00647 00648 #if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2) 00649 static const __m128i rotl8 = { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; 00650 static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; 00651 #endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */ 00652 00653 #ifdef HAVE_INTEL_AVX1 00654 #define QUARTERROUND_2_AVX() \ 00655 "paddd %%xmm1, %%xmm0\n\t" \ 00656 "pxor %%xmm0, %%xmm3\n\t" \ 00657 "pshufb %[rotl16], %%xmm3\n\t" \ 00658 "paddd %%xmm3, %%xmm2\n\t" \ 00659 "pxor %%xmm2, %%xmm1\n\t" \ 00660 "movdqa %%xmm1, %%xmm4\n\t" \ 00661 "pslld $12, %%xmm1\n\t" \ 00662 "psrld $20, %%xmm4\n\t" \ 00663 "pxor %%xmm4, %%xmm1\n\t" \ 00664 "paddd %%xmm1, %%xmm0\n\t" \ 00665 "pxor %%xmm0, %%xmm3\n\t" \ 00666 "pshufb %[rotl8], %%xmm3\n\t" \ 00667 "paddd %%xmm3, %%xmm2\n\t" \ 00668 "pxor %%xmm2, %%xmm1\n\t" \ 00669 "movdqa %%xmm1, %%xmm4\n\t" \ 00670 "pslld $7, %%xmm1\n\t" \ 00671 "psrld $25, %%xmm4\n\t" \ 00672 "pxor %%xmm4, %%xmm1\n\t" \ 00673 "# Swap words for next round\n\t" \ 00674 "pshufd $0x39, %%xmm1, %%xmm1\n\t" \ 00675 "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 00676 "pshufd $0x93, %%xmm3, %%xmm3\n\t" \ 00677 "paddd %%xmm1, %%xmm0\n\t" \ 00678 "pxor %%xmm0, %%xmm3\n\t" \ 00679 "pshufb %[rotl16], %%xmm3\n\t" \ 00680 "paddd %%xmm3, %%xmm2\n\t" \ 00681 "pxor %%xmm2, %%xmm1\n\t" \ 00682 "movdqa %%xmm1, %%xmm4\n\t" \ 00683 "pslld $12, %%xmm1\n\t" \ 00684 "psrld $20, %%xmm4\n\t" \ 00685 "pxor %%xmm4, %%xmm1\n\t" \ 00686 "paddd %%xmm1, %%xmm0\n\t" \ 00687 "pxor %%xmm0, 
%%xmm3\n\t" \ 00688 "pshufb %[rotl8], %%xmm3\n\t" \ 00689 "paddd %%xmm3, %%xmm2\n\t" \ 00690 "pxor %%xmm2, %%xmm1\n\t" \ 00691 "movdqa %%xmm1, %%xmm4\n\t" \ 00692 "pslld $7, %%xmm1\n\t" \ 00693 "psrld $25, %%xmm4\n\t" \ 00694 "pxor %%xmm4, %%xmm1\n\t" \ 00695 "# Swap words back\n\t" \ 00696 "pshufd $0x93, %%xmm1, %%xmm1\n\t" \ 00697 "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 00698 "pshufd $0x39, %%xmm3, %%xmm3\n\t" \ 00699 00700 #define CHACHA_CRYPT_AVX() \ 00701 "movdqu 0(%[input]), %%xmm0\n\t" \ 00702 "movdqu 16(%[input]), %%xmm1\n\t" \ 00703 "movdqu 32(%[input]), %%xmm2\n\t" \ 00704 "movdqu 48(%[input]), %%xmm3\n\t" \ 00705 "movb $10, %%al\n\t" \ 00706 "\n" \ 00707 "1:\n\t" \ 00708 QUARTERROUND_2_AVX() \ 00709 "decb %%al\n\t" \ 00710 "jnz 1b\n\t" \ 00711 "movdqu 0(%[input]), %%xmm4\n\t" \ 00712 "movdqu 16(%[input]), %%xmm5\n\t" \ 00713 "movdqu 32(%[input]), %%xmm6\n\t" \ 00714 "movdqu 48(%[input]), %%xmm7\n\t" \ 00715 "paddd %%xmm4, %%xmm0\n\t" \ 00716 "paddd %%xmm5, %%xmm1\n\t" \ 00717 "paddd %%xmm6, %%xmm2\n\t" \ 00718 "paddd %%xmm7, %%xmm3\n\t" \ 00719 00720 #define CHACHA_PARTIAL_CHUNK_AVX() \ 00721 __asm__ __volatile__ ( \ 00722 CHACHA_CRYPT_AVX() \ 00723 "movdqu %%xmm0, 0(%[c])\n\t" \ 00724 "movdqu %%xmm1, 16(%[c])\n\t" \ 00725 "movdqu %%xmm2, 32(%[c])\n\t" \ 00726 "movdqu %%xmm3, 48(%[c])\n\t" \ 00727 "addl $1, 48(%[input])\n\t" \ 00728 "movl %[bytes], %%r8d\n\t" \ 00729 "xorq %%rdx, %%rdx\n\t" \ 00730 "movl %%r8d, %%r9d\n\t" \ 00731 "andl $7, %%r9d\n\t" \ 00732 "jz 4f\n\t" \ 00733 "\n" \ 00734 "2:\n\t" \ 00735 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ 00736 "xorb (%[m],%%rdx,1), %%cl\n\t" \ 00737 "movb %%cl, (%[output],%%rdx,1)\n\t" \ 00738 "incl %%edx\n\t" \ 00739 "cmpl %%r9d, %%edx\n\t" \ 00740 "jne 2b\n\t" \ 00741 "je 3f\n\t" \ 00742 "\n" \ 00743 "4:\n\t" \ 00744 "movq (%[c],%%rdx,1), %%rcx\n\t" \ 00745 "xorq (%[m],%%rdx,1), %%rcx\n\t" \ 00746 "movq %%rcx, (%[output],%%rdx,1)\n\t" \ 00747 "addl $8, %%edx\n\t" \ 00748 "\n" \ 00749 "3:\n\t" \ 00750 "cmpl %%r8d, 
%%edx\n\t" \ 00751 "jne 4b\n\t" \ 00752 : \ 00753 : [input] "r" (ctx->X), [c] "r" (x), \ 00754 [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \ 00755 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ 00756 : "eax", "ecx", "edx", "r8", "r9", "memory", \ 00757 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ 00758 ) 00759 00760 00761 #define CHACHA_CHUNK_AVX() \ 00762 __asm__ __volatile__ ( \ 00763 CHACHA_CRYPT_AVX() \ 00764 "movdqu 0(%[m]), %%xmm4\n\t" \ 00765 "movdqu 16(%[m]), %%xmm5\n\t" \ 00766 "movdqu 32(%[m]), %%xmm6\n\t" \ 00767 "movdqu 48(%[m]), %%xmm7\n\t" \ 00768 "pxor %%xmm4, %%xmm0\n\t" \ 00769 "pxor %%xmm5, %%xmm1\n\t" \ 00770 "pxor %%xmm6, %%xmm2\n\t" \ 00771 "pxor %%xmm7, %%xmm3\n\t" \ 00772 "movdqu %%xmm0, 0(%[c])\n\t" \ 00773 "movdqu %%xmm1, 16(%[c])\n\t" \ 00774 "movdqu %%xmm2, 32(%[c])\n\t" \ 00775 "movdqu %%xmm3, 48(%[c])\n\t" \ 00776 "addl $1, 48(%[input])\n\t" \ 00777 : \ 00778 : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \ 00779 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ 00780 : "rax", "memory", \ 00781 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" \ 00782 ) 00783 00784 CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m, 00785 byte* c, word32 bytes) 00786 { 00787 ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ 00788 ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ 00789 word32 cnt = 0; 00790 static const __m128i add = { 0x0000000100000000UL,0x0000000300000002UL }; 00791 static const __m128i four = { 0x0000000400000004UL,0x0000000400000004UL }; 00792 00793 if (bytes == 0) 00794 return; 00795 00796 __asm__ __volatile__ ( 00797 "movl %[bytes], %[cnt]\n\t" 00798 "shrl $8, %[cnt]\n\t" 00799 "jz L_end128\n\t" 00800 00801 "vpshufd $0, (%[key]), %%xmm0\n\t" 00802 "vpshufd $0, 4(%[key]), %%xmm1\n\t" 00803 "vpshufd $0, 8(%[key]), %%xmm2\n\t" 00804 "vpshufd $0, 12(%[key]), %%xmm3\n\t" 00805 "vpshufd $0, 16(%[key]), %%xmm4\n\t" 00806 
"vpshufd $0, 20(%[key]), %%xmm5\n\t" 00807 "vpshufd $0, 24(%[key]), %%xmm6\n\t" 00808 "vpshufd $0, 28(%[key]), %%xmm7\n\t" 00809 "vpshufd $0, 32(%[key]), %%xmm8\n\t" 00810 "vpshufd $0, 36(%[key]), %%xmm9\n\t" 00811 "vpshufd $0, 40(%[key]), %%xmm10\n\t" 00812 "vpshufd $0, 44(%[key]), %%xmm11\n\t" 00813 "vpshufd $0, 48(%[key]), %%xmm12\n\t" 00814 "vpshufd $0, 52(%[key]), %%xmm13\n\t" 00815 "vpshufd $0, 56(%[key]), %%xmm14\n\t" 00816 "vpshufd $0, 60(%[key]), %%xmm15\n\t" 00817 00818 "vpaddd %[add], %%xmm12, %%xmm12\n\t" 00819 00820 "vmovdqa %%xmm0, (%[X])\n\t" 00821 "vmovdqa %%xmm1, 16(%[X])\n\t" 00822 "vmovdqa %%xmm2, 32(%[X])\n\t" 00823 "vmovdqa %%xmm3, 48(%[X])\n\t" 00824 "vmovdqa %%xmm4, 64(%[X])\n\t" 00825 "vmovdqa %%xmm5, 80(%[X])\n\t" 00826 "vmovdqa %%xmm6, 96(%[X])\n\t" 00827 "vmovdqa %%xmm7, 112(%[X])\n\t" 00828 "vmovdqa %%xmm8, 128(%[X])\n\t" 00829 "vmovdqa %%xmm9, 144(%[X])\n\t" 00830 "vmovdqa %%xmm10, 160(%[X])\n\t" 00831 "vmovdqa %%xmm11, 176(%[X])\n\t" 00832 "vmovdqa %%xmm12, 192(%[X])\n\t" 00833 "vmovdqa %%xmm13, 208(%[X])\n\t" 00834 "vmovdqa %%xmm14, 224(%[X])\n\t" 00835 "vmovdqa %%xmm15, 240(%[X])\n\t" 00836 "\n" 00837 "L_enc128_loop:\n\t" 00838 "vmovdqa %%xmm11, 48(%[x])\n\t" 00839 QUARTERROUND_XMM() 00840 QUARTERROUND_XMM_2() 00841 QUARTERROUND_XMM() 00842 QUARTERROUND_XMM_2() 00843 QUARTERROUND_XMM() 00844 QUARTERROUND_XMM_2() 00845 QUARTERROUND_XMM() 00846 QUARTERROUND_XMM_2() 00847 QUARTERROUND_XMM() 00848 QUARTERROUND_XMM_2() 00849 QUARTERROUND_XMM() 00850 QUARTERROUND_XMM_2() 00851 QUARTERROUND_XMM() 00852 QUARTERROUND_XMM_2() 00853 QUARTERROUND_XMM() 00854 QUARTERROUND_XMM_2() 00855 QUARTERROUND_XMM() 00856 QUARTERROUND_XMM_2() 00857 QUARTERROUND_XMM() 00858 QUARTERROUND_XMM_2() 00859 "vmovdqa 48(%[x]), %%xmm11\n\t" 00860 00861 "vpaddd (%[X]), %%xmm0, %%xmm0\n\t" 00862 "vpaddd 16(%[X]), %%xmm1, %%xmm1\n\t" 00863 "vpaddd 32(%[X]), %%xmm2, %%xmm2\n\t" 00864 "vpaddd 48(%[X]), %%xmm3, %%xmm3\n\t" 00865 "vpaddd 64(%[X]), %%xmm4, %%xmm4\n\t" 00866 
"vpaddd 80(%[X]), %%xmm5, %%xmm5\n\t" 00867 "vpaddd 96(%[X]), %%xmm6, %%xmm6\n\t" 00868 "vpaddd 112(%[X]), %%xmm7, %%xmm7\n\t" 00869 "vpaddd 128(%[X]), %%xmm8, %%xmm8\n\t" 00870 "vpaddd 144(%[X]), %%xmm9, %%xmm9\n\t" 00871 "vpaddd 160(%[X]), %%xmm10, %%xmm10\n\t" 00872 "vpaddd 176(%[X]), %%xmm11, %%xmm11\n\t" 00873 "vpaddd 192(%[X]), %%xmm12, %%xmm12\n\t" 00874 "vpaddd 208(%[X]), %%xmm13, %%xmm13\n\t" 00875 "vpaddd 224(%[X]), %%xmm14, %%xmm14\n\t" 00876 "vpaddd 240(%[X]), %%xmm15, %%xmm15\n\t" 00877 00878 "vmovdqa %%xmm8, (%[x])\n\t" 00879 "vmovdqa %%xmm9, 16(%[x])\n\t" 00880 "vmovdqa %%xmm10, 32(%[x])\n\t" 00881 "vmovdqa %%xmm11, 48(%[x])\n\t" 00882 "vmovdqa %%xmm12, 64(%[x])\n\t" 00883 "vmovdqa %%xmm13, 80(%[x])\n\t" 00884 "vmovdqa %%xmm14, 96(%[x])\n\t" 00885 "vmovdqa %%xmm15, 112(%[x])\n\t" 00886 00887 "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" 00888 "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" 00889 "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" 00890 "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" 00891 "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" 00892 "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" 00893 "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" 00894 "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" 00895 "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" 00896 "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" 00897 "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" 00898 "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" 00899 "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" 00900 "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" 00901 "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" 00902 "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" 00903 "vmovdqu (%[in]), %%xmm8\n\t" 00904 "vmovdqu 16(%[in]), %%xmm9\n\t" 00905 "vmovdqu 64(%[in]), %%xmm10\n\t" 00906 "vmovdqu 80(%[in]), %%xmm11\n\t" 00907 "vmovdqu 128(%[in]), %%xmm12\n\t" 00908 "vmovdqu 144(%[in]), %%xmm13\n\t" 00909 "vmovdqu 192(%[in]), %%xmm14\n\t" 00910 "vmovdqu 208(%[in]), %%xmm15\n\t" 00911 "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" 00912 "vpxor %%xmm9, %%xmm1, %%xmm1\n\t" 00913 "vpxor %%xmm10, %%xmm2, 
%%xmm2\n\t" 00914 "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" 00915 "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" 00916 "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" 00917 "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" 00918 "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" 00919 "vmovdqu %%xmm0, (%[out])\n\t" 00920 "vmovdqu %%xmm1, 16(%[out])\n\t" 00921 "vmovdqu %%xmm2, 64(%[out])\n\t" 00922 "vmovdqu %%xmm3, 80(%[out])\n\t" 00923 "vmovdqu %%xmm4, 128(%[out])\n\t" 00924 "vmovdqu %%xmm5, 144(%[out])\n\t" 00925 "vmovdqu %%xmm6, 192(%[out])\n\t" 00926 "vmovdqu %%xmm7, 208(%[out])\n\t" 00927 00928 "vmovdqa (%[x]), %%xmm0\n\t" 00929 "vmovdqa 16(%[x]), %%xmm1\n\t" 00930 "vmovdqa 32(%[x]), %%xmm2\n\t" 00931 "vmovdqa 48(%[x]), %%xmm3\n\t" 00932 "vmovdqa 64(%[x]), %%xmm4\n\t" 00933 "vmovdqa 80(%[x]), %%xmm5\n\t" 00934 "vmovdqa 96(%[x]), %%xmm6\n\t" 00935 "vmovdqa 112(%[x]), %%xmm7\n\t" 00936 00937 "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t" 00938 "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t" 00939 "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t" 00940 "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t" 00941 "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t" 00942 "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t" 00943 "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t" 00944 "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t" 00945 "vpunpcklqdq %%xmm9, %%xmm8, %%xmm0\n\t" 00946 "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t" 00947 "vpunpckhqdq %%xmm9, %%xmm8, %%xmm2\n\t" 00948 "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t" 00949 "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t" 00950 "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t" 00951 "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t" 00952 "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t" 00953 "vmovdqu 32(%[in]), %%xmm8\n\t" 00954 "vmovdqu 48(%[in]), %%xmm9\n\t" 00955 "vmovdqu 96(%[in]), %%xmm10\n\t" 00956 "vmovdqu 112(%[in]), %%xmm11\n\t" 00957 "vmovdqu 160(%[in]), %%xmm12\n\t" 00958 "vmovdqu 176(%[in]), %%xmm13\n\t" 00959 "vmovdqu 224(%[in]), %%xmm14\n\t" 00960 "vmovdqu 240(%[in]), %%xmm15\n\t" 00961 "vpxor %%xmm8, %%xmm0, %%xmm0\n\t" 00962 "vpxor %%xmm9, %%xmm1, 
%%xmm1\n\t" 00963 "vpxor %%xmm10, %%xmm2, %%xmm2\n\t" 00964 "vpxor %%xmm11, %%xmm3, %%xmm3\n\t" 00965 "vpxor %%xmm12, %%xmm4, %%xmm4\n\t" 00966 "vpxor %%xmm13, %%xmm5, %%xmm5\n\t" 00967 "vpxor %%xmm14, %%xmm6, %%xmm6\n\t" 00968 "vpxor %%xmm15, %%xmm7, %%xmm7\n\t" 00969 "vmovdqu %%xmm0, 32(%[out])\n\t" 00970 "vmovdqu %%xmm1, 48(%[out])\n\t" 00971 "vmovdqu %%xmm2, 96(%[out])\n\t" 00972 "vmovdqu %%xmm3, 112(%[out])\n\t" 00973 "vmovdqu %%xmm4, 160(%[out])\n\t" 00974 "vmovdqu %%xmm5, 176(%[out])\n\t" 00975 "vmovdqu %%xmm6, 224(%[out])\n\t" 00976 "vmovdqu %%xmm7, 240(%[out])\n\t" 00977 00978 "vmovdqa 192(%[X]), %%xmm12\n\t" 00979 "add $256, %[in]\n\t" 00980 "add $256, %[out]\n\t" 00981 "vpaddd %[four], %%xmm12, %%xmm12\n\t" 00982 "sub $256, %[bytes]\n\t" 00983 "vmovdqa %%xmm12, 192(%[X])\n\t" 00984 "cmp $256, %[bytes]\n\t" 00985 "jl L_done\n\t" 00986 00987 "vmovdqa (%[X]), %%xmm0\n\t" 00988 "vmovdqa 16(%[X]), %%xmm1\n\t" 00989 "vmovdqa 32(%[X]), %%xmm2\n\t" 00990 "vmovdqa 48(%[X]), %%xmm3\n\t" 00991 "vmovdqa 64(%[X]), %%xmm4\n\t" 00992 "vmovdqa 80(%[X]), %%xmm5\n\t" 00993 "vmovdqa 96(%[X]), %%xmm6\n\t" 00994 "vmovdqa 112(%[X]), %%xmm7\n\t" 00995 "vmovdqa 128(%[X]), %%xmm8\n\t" 00996 "vmovdqa 144(%[X]), %%xmm9\n\t" 00997 "vmovdqa 160(%[X]), %%xmm10\n\t" 00998 "vmovdqa 176(%[X]), %%xmm11\n\t" 00999 "vmovdqa 192(%[X]), %%xmm12\n\t" 01000 "vmovdqa 208(%[X]), %%xmm13\n\t" 01001 "vmovdqa 224(%[X]), %%xmm14\n\t" 01002 "vmovdqa 240(%[X]), %%xmm15\n\t" 01003 "jmp L_enc128_loop\n\t" 01004 01005 "\n" 01006 "L_done:\n\t" 01007 01008 "shl $2, %[cnt]\n\t" 01009 "add 48(%[key]), %[cnt]\n\t" 01010 "movl %[cnt], 48(%[key])\n\t" 01011 "\n" 01012 "L_end128:\n\t" 01013 : [bytes] "+r" (bytes), [cnt] "+r" (cnt), 01014 [in] "+r" (m), [out] "+r" (c) 01015 : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), 01016 [add] "xrm" (add), [four] "xrm" (four), 01017 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) 01018 : "xmm0", "xmm1", "xmm2", "xmm3", 01019 "xmm4", "xmm5", "xmm6", "xmm7", 01020 "xmm8", 
"xmm9", "xmm10", "xmm11", 01021 "xmm12", "xmm13", "xmm14", "xmm15", "memory" 01022 ); 01023 01024 for (; bytes >= CHACHA_CHUNK_BYTES;) { 01025 CHACHA_CHUNK_AVX(); 01026 bytes -= CHACHA_CHUNK_BYTES; 01027 c += CHACHA_CHUNK_BYTES; 01028 m += CHACHA_CHUNK_BYTES; 01029 } 01030 if (bytes > 0) { 01031 CHACHA_PARTIAL_CHUNK_AVX(); 01032 } 01033 } 01034 #endif /* HAVE_INTEL_AVX1 */ 01035 01036 #ifdef HAVE_INTEL_AVX2 01037 #define QUARTERROUND_2_AVX2() \ 01038 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \ 01039 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \ 01040 "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \ 01041 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \ 01042 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ 01043 "vpsrld $20, %%xmm1, %%xmm4\n\t" \ 01044 "vpslld $12, %%xmm1, %%xmm1\n\t" \ 01045 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \ 01046 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \ 01047 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \ 01048 "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \ 01049 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \ 01050 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ 01051 "vpsrld $25, %%xmm1, %%xmm4\n\t" \ 01052 "vpslld $7, %%xmm1, %%xmm1\n\t" \ 01053 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \ 01054 "# Swap words for next round\n\t" \ 01055 "vpshufd $0x39, %%xmm1, %%xmm1\n\t" \ 01056 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 01057 "vpshufd $0x93, %%xmm3, %%xmm3\n\t" \ 01058 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \ 01059 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \ 01060 "vpshufb %[rotl16], %%xmm3, %%xmm3\n\t" \ 01061 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \ 01062 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ 01063 "vpsrld $20, %%xmm1, %%xmm4\n\t" \ 01064 "vpslld $12, %%xmm1, %%xmm1\n\t" \ 01065 "vpxor %%xmm4, %%xmm1, %%xmm1\n\t" \ 01066 "vpaddd %%xmm1, %%xmm0, %%xmm0\n\t" \ 01067 "vpxor %%xmm0, %%xmm3, %%xmm3\n\t" \ 01068 "vpshufb %[rotl8], %%xmm3, %%xmm3\n\t" \ 01069 "vpaddd %%xmm3, %%xmm2, %%xmm2\n\t" \ 01070 "vpxor %%xmm2, %%xmm1, %%xmm1\n\t" \ 01071 "vpsrld $25, %%Xmm1, %%xmm4\n\t" \ 01072 "vpslld $7, %%xmm1, %%xmm1\n\t" \ 01073 "vpxor %%xmm4, %%xmm1, 
%%xmm1\n\t" \ 01074 "# Swap words back\n\t" \ 01075 "vpshufd $0x93, %%xmm1, %%xmm1\n\t" \ 01076 "vpshufd $0x4e, %%xmm2, %%xmm2\n\t" \ 01077 "vpshufd $0x39, %%xmm3, %%xmm3\n\t" \ 01078 01079 #define CHACHA_CRYPT_AVX2() \ 01080 "vmovdqu 0(%[input]), %%xmm8\n\t" \ 01081 "vmovdqu 16(%[input]), %%xmm9\n\t" \ 01082 "vmovdqu 32(%[input]), %%xmm10\n\t" \ 01083 "vmovdqu 48(%[input]), %%xmm11\n\t" \ 01084 "vmovdqu %%xmm8, %%xmm0\n\t" \ 01085 "vmovdqu %%xmm9, %%xmm1\n\t" \ 01086 "vmovdqu %%xmm10, %%xmm2\n\t" \ 01087 "vmovdqu %%xmm11, %%xmm3\n\t" \ 01088 "movb $10, %%al\n\t" \ 01089 "\n" \ 01090 "1:\n\t" \ 01091 QUARTERROUND_2_AVX2() \ 01092 "decb %%al\n\t" \ 01093 "jnz 1b\n\t" \ 01094 "vpaddd %%xmm8, %%xmm0, %%xmm0\n\t" \ 01095 "vpaddd %%xmm9, %%xmm1, %%xmm1\n\t" \ 01096 "vpaddd %%xmm10, %%xmm2, %%xmm2\n\t" \ 01097 "vpaddd %%xmm11, %%xmm3, %%xmm3\n\t" \ 01098 01099 #define CHACHA_PARTIAL_CHUNK_AVX2() \ 01100 __asm__ __volatile__ ( \ 01101 CHACHA_CRYPT_AVX2() \ 01102 "vmovdqu %%xmm0, 0(%[c])\n\t" \ 01103 "vmovdqu %%xmm1, 16(%[c])\n\t" \ 01104 "vmovdqu %%xmm2, 32(%[c])\n\t" \ 01105 "vmovdqu %%xmm3, 48(%[c])\n\t" \ 01106 "addl $1, 48(%[input])\n\t" \ 01107 "movl %[bytes], %%r8d\n\t" \ 01108 "xorq %%rdx, %%rdx\n\t" \ 01109 "movl %%r8d, %%r9d\n\t" \ 01110 "andl $7, %%r9d\n\t" \ 01111 "jz 4f\n\t" \ 01112 "\n" \ 01113 "2:\n\t" \ 01114 "movzbl (%[c],%%rdx,1), %%ecx\n\t" \ 01115 "xorb (%[m],%%rdx,1), %%cl\n\t" \ 01116 "movb %%cl, (%[output],%%rdx,1)\n\t" \ 01117 "incl %%edx\n\t" \ 01118 "cmpl %%r9d, %%edx\n\t" \ 01119 "jne 2b\n\t" \ 01120 "je 3f\n\t" \ 01121 "\n" \ 01122 "4:\n\t" \ 01123 "movq (%[c],%%rdx,1), %%rcx\n\t" \ 01124 "xorq (%[m],%%rdx,1), %%rcx\n\t" \ 01125 "movq %%rcx, (%[output],%%rdx,1)\n\t" \ 01126 "addl $8, %%edx\n\t" \ 01127 "\n" \ 01128 "3:\n\t" \ 01129 "cmpl %%r8d, %%edx\n\t" \ 01130 "jne 4b\n\t" \ 01131 : \ 01132 : [input] "r" (ctx->X), [c] "r" (x), \ 01133 [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m), \ 01134 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ 
01135 : "eax", "ecx", "edx", "r8", "r9", "memory", \ 01136 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \ 01137 "xmm8", "xmm9", "xmm10", "xmm11" \ 01138 ) 01139 01140 01141 #define CHACHA_CHUNK_AVX2() \ 01142 __asm__ __volatile__ ( \ 01143 CHACHA_CRYPT_AVX2() \ 01144 "vmovdqu 0(%[m]), %%xmm4\n\t" \ 01145 "vmovdqu 16(%[m]), %%xmm5\n\t" \ 01146 "vmovdqu 32(%[m]), %%xmm6\n\t" \ 01147 "vmovdqu 48(%[m]), %%xmm7\n\t" \ 01148 "vpxor %%xmm4, %%xmm0, %%xmm0\n\t" \ 01149 "vpxor %%xmm5, %%xmm1, %%xmm1\n\t" \ 01150 "vpxor %%xmm6, %%xmm2, %%xmm2\n\t" \ 01151 "vpxor %%xmm7, %%xmm3, %%xmm3\n\t" \ 01152 "vmovdqu %%xmm0, 0(%[c])\n\t" \ 01153 "vmovdqu %%xmm1, 16(%[c])\n\t" \ 01154 "vmovdqu %%xmm2, 32(%[c])\n\t" \ 01155 "vmovdqu %%xmm3, 48(%[c])\n\t" \ 01156 "addl $1, 48(%[input])\n\t" \ 01157 : \ 01158 : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m), \ 01159 [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16) \ 01160 : "rax", "memory", \ 01161 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \ 01162 "xmm8", "xmm9", "xmm10", "xmm11" \ 01163 ) 01164 01165 01166 static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c, 01167 word32 bytes) 01168 { 01169 ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ 01170 ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ 01171 word32 cnt = 0; 01172 static const __m256i add = { 0x0000000100000000UL,0x0000000300000002UL, 01173 0x0000000500000004UL,0x0000000700000006UL }; 01174 static const __m256i eight = { 0x0000000800000008UL,0x0000000800000008UL, 01175 0x0000000800000008UL,0x0000000800000008UL }; 01176 static const __m256i rotl8_256 = 01177 { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL, 01178 0x0605040702010003UL,0x0e0d0c0f0a09080bUL }; 01179 static const __m256i rotl16_256 = 01180 { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL, 01181 0x0504070601000302UL,0x0d0c0f0e09080b0aUL }; 01182 01183 if (bytes == 0) 01184 return; 01185 01186 __asm__ __volatile__ ( 01187 "movl 
%[bytes], %[cnt]\n\t" 01188 "shrl $9, %[cnt]\n\t" 01189 "jz L_end256\n\t" 01190 01191 "vpbroadcastd (%[key]), %%ymm0\n\t" 01192 "vpbroadcastd 4(%[key]), %%ymm1\n\t" 01193 "vpbroadcastd 8(%[key]), %%ymm2\n\t" 01194 "vpbroadcastd 12(%[key]), %%ymm3\n\t" 01195 "vpbroadcastd 16(%[key]), %%ymm4\n\t" 01196 "vpbroadcastd 20(%[key]), %%ymm5\n\t" 01197 "vpbroadcastd 24(%[key]), %%ymm6\n\t" 01198 "vpbroadcastd 28(%[key]), %%ymm7\n\t" 01199 "vpbroadcastd 32(%[key]), %%ymm8\n\t" 01200 "vpbroadcastd 36(%[key]), %%ymm9\n\t" 01201 "vpbroadcastd 40(%[key]), %%ymm10\n\t" 01202 "vpbroadcastd 44(%[key]), %%ymm11\n\t" 01203 "vpbroadcastd 48(%[key]), %%ymm12\n\t" 01204 "vpbroadcastd 52(%[key]), %%ymm13\n\t" 01205 "vpbroadcastd 56(%[key]), %%ymm14\n\t" 01206 "vpbroadcastd 60(%[key]), %%ymm15\n\t" 01207 01208 "vpaddd %[add], %%ymm12, %%ymm12\n\t" 01209 01210 "vmovdqa %%ymm0, (%[X])\n\t" 01211 "vmovdqa %%ymm1, 32(%[X])\n\t" 01212 "vmovdqa %%ymm2, 64(%[X])\n\t" 01213 "vmovdqa %%ymm3, 96(%[X])\n\t" 01214 "vmovdqa %%ymm4, 128(%[X])\n\t" 01215 "vmovdqa %%ymm5, 160(%[X])\n\t" 01216 "vmovdqa %%ymm6, 192(%[X])\n\t" 01217 "vmovdqa %%ymm7, 224(%[X])\n\t" 01218 "vmovdqa %%ymm8, 256(%[X])\n\t" 01219 "vmovdqa %%ymm9, 288(%[X])\n\t" 01220 "vmovdqa %%ymm10, 320(%[X])\n\t" 01221 "vmovdqa %%ymm11, 352(%[X])\n\t" 01222 "vmovdqa %%ymm12, 384(%[X])\n\t" 01223 "vmovdqa %%ymm13, 416(%[X])\n\t" 01224 "vmovdqa %%ymm14, 448(%[X])\n\t" 01225 "vmovdqa %%ymm15, 480(%[X])\n\t" 01226 "\n" 01227 "L_enc256_loop:\n\t" 01228 "vmovdqa %%ymm11, 96(%[x])\n\t" 01229 QUARTERROUND_YMM() 01230 QUARTERROUND_YMM_2() 01231 QUARTERROUND_YMM() 01232 QUARTERROUND_YMM_2() 01233 QUARTERROUND_YMM() 01234 QUARTERROUND_YMM_2() 01235 QUARTERROUND_YMM() 01236 QUARTERROUND_YMM_2() 01237 QUARTERROUND_YMM() 01238 QUARTERROUND_YMM_2() 01239 QUARTERROUND_YMM() 01240 QUARTERROUND_YMM_2() 01241 QUARTERROUND_YMM() 01242 QUARTERROUND_YMM_2() 01243 QUARTERROUND_YMM() 01244 QUARTERROUND_YMM_2() 01245 QUARTERROUND_YMM() 01246 QUARTERROUND_YMM_2() 01247 
QUARTERROUND_YMM() 01248 QUARTERROUND_YMM_2() 01249 "vmovdqa 96(%[x]), %%ymm11\n\t" 01250 01251 "vpaddd (%[X]), %%ymm0, %%ymm0\n\t" 01252 "vpaddd 32(%[X]), %%ymm1, %%ymm1\n\t" 01253 "vpaddd 64(%[X]), %%ymm2, %%ymm2\n\t" 01254 "vpaddd 96(%[X]), %%ymm3, %%ymm3\n\t" 01255 "vpaddd 128(%[X]), %%ymm4, %%ymm4\n\t" 01256 "vpaddd 160(%[X]), %%ymm5, %%ymm5\n\t" 01257 "vpaddd 192(%[X]), %%ymm6, %%ymm6\n\t" 01258 "vpaddd 224(%[X]), %%ymm7, %%ymm7\n\t" 01259 "vpaddd 256(%[X]), %%ymm8, %%ymm8\n\t" 01260 "vpaddd 288(%[X]), %%ymm9, %%ymm9\n\t" 01261 "vpaddd 320(%[X]), %%ymm10, %%ymm10\n\t" 01262 "vpaddd 352(%[X]), %%ymm11, %%ymm11\n\t" 01263 "vpaddd 384(%[X]), %%ymm12, %%ymm12\n\t" 01264 "vpaddd 416(%[X]), %%ymm13, %%ymm13\n\t" 01265 "vpaddd 448(%[X]), %%ymm14, %%ymm14\n\t" 01266 "vpaddd 480(%[X]), %%ymm15, %%ymm15\n\t" 01267 01268 "vmovdqa %%ymm8, (%[x])\n\t" 01269 "vmovdqa %%ymm9, 32(%[x])\n\t" 01270 "vmovdqa %%ymm10, 64(%[x])\n\t" 01271 "vmovdqa %%ymm11, 96(%[x])\n\t" 01272 "vmovdqa %%ymm12, 128(%[x])\n\t" 01273 "vmovdqa %%ymm13, 160(%[x])\n\t" 01274 "vmovdqa %%ymm14, 192(%[x])\n\t" 01275 "vmovdqa %%ymm15, 224(%[x])\n\t" 01276 01277 "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" 01278 "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" 01279 "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" 01280 "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" 01281 "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" 01282 "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" 01283 "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" 01284 "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" 01285 "vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" 01286 "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" 01287 "vpunpckhqdq %%ymm9, %%ymm8, %%ymm2\n\t" 01288 "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" 01289 "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" 01290 "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" 01291 "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" 01292 "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" 01293 "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" 01294 "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" 
01295 "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" 01296 "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" 01297 "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" 01298 "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" 01299 "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" 01300 "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" 01301 01302 "vmovdqu (%[in]), %%ymm0\n\t" 01303 "vmovdqu 64(%[in]), %%ymm1\n\t" 01304 "vmovdqu 128(%[in]), %%ymm2\n\t" 01305 "vmovdqu 192(%[in]), %%ymm3\n\t" 01306 "vmovdqu 256(%[in]), %%ymm4\n\t" 01307 "vmovdqu 320(%[in]), %%ymm5\n\t" 01308 "vmovdqu 384(%[in]), %%ymm6\n\t" 01309 "vmovdqu 448(%[in]), %%ymm7\n\t" 01310 "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" 01311 "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" 01312 "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" 01313 "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" 01314 "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" 01315 "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" 01316 "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" 01317 "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" 01318 "vmovdqu %%ymm8, (%[out])\n\t" 01319 "vmovdqu %%ymm9, 64(%[out])\n\t" 01320 "vmovdqu %%ymm10, 128(%[out])\n\t" 01321 "vmovdqu %%ymm11, 192(%[out])\n\t" 01322 "vmovdqu %%ymm12, 256(%[out])\n\t" 01323 "vmovdqu %%ymm13, 320(%[out])\n\t" 01324 "vmovdqu %%ymm14, 384(%[out])\n\t" 01325 "vmovdqu %%ymm15, 448(%[out])\n\t" 01326 01327 "vmovdqa (%[x]), %%ymm0\n\t" 01328 "vmovdqa 32(%[x]), %%ymm1\n\t" 01329 "vmovdqa 64(%[x]), %%ymm2\n\t" 01330 "vmovdqa 96(%[x]), %%ymm3\n\t" 01331 "vmovdqa 128(%[x]), %%ymm4\n\t" 01332 "vmovdqa 160(%[x]), %%ymm5\n\t" 01333 "vmovdqa 192(%[x]), %%ymm6\n\t" 01334 "vmovdqa 224(%[x]), %%ymm7\n\t" 01335 01336 "vpunpckldq %%ymm1, %%ymm0, %%ymm8\n\t" 01337 "vpunpckldq %%ymm3, %%ymm2, %%ymm9\n\t" 01338 "vpunpckhdq %%ymm1, %%ymm0, %%ymm12\n\t" 01339 "vpunpckhdq %%ymm3, %%ymm2, %%ymm13\n\t" 01340 "vpunpckldq %%ymm5, %%ymm4, %%ymm10\n\t" 01341 "vpunpckldq %%ymm7, %%ymm6, %%ymm11\n\t" 01342 "vpunpckhdq %%ymm5, %%ymm4, %%ymm14\n\t" 01343 "vpunpckhdq %%ymm7, %%ymm6, %%ymm15\n\t" 01344 
"vpunpcklqdq %%ymm9, %%ymm8, %%ymm0\n\t" 01345 "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t" 01346 "vpunpckhqdq %%ymm9 , %%ymm8, %%ymm2\n\t" 01347 "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t" 01348 "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t" 01349 "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t" 01350 "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t" 01351 "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t" 01352 "vperm2i128 $0x20, %%ymm1, %%ymm0, %%ymm8\n\t" 01353 "vperm2i128 $0x20, %%ymm3, %%ymm2, %%ymm9\n\t" 01354 "vperm2i128 $0x31, %%ymm1, %%ymm0, %%ymm12\n\t" 01355 "vperm2i128 $0x31, %%ymm3, %%ymm2, %%ymm13\n\t" 01356 "vperm2i128 $0x20, %%ymm5, %%ymm4, %%ymm10\n\t" 01357 "vperm2i128 $0x20, %%ymm7, %%ymm6, %%ymm11\n\t" 01358 "vperm2i128 $0x31, %%ymm5, %%ymm4, %%ymm14\n\t" 01359 "vperm2i128 $0x31, %%ymm7, %%ymm6, %%ymm15\n\t" 01360 01361 "vmovdqu 32(%[in]), %%ymm0\n\t" 01362 "vmovdqu 96(%[in]), %%ymm1\n\t" 01363 "vmovdqu 160(%[in]), %%ymm2\n\t" 01364 "vmovdqu 224(%[in]), %%ymm3\n\t" 01365 "vmovdqu 288(%[in]), %%ymm4\n\t" 01366 "vmovdqu 352(%[in]), %%ymm5\n\t" 01367 "vmovdqu 416(%[in]), %%ymm6\n\t" 01368 "vmovdqu 480(%[in]), %%ymm7\n\t" 01369 "vpxor %%ymm0, %%ymm8, %%ymm8\n\t" 01370 "vpxor %%ymm1, %%ymm9, %%ymm9\n\t" 01371 "vpxor %%ymm2, %%ymm10, %%ymm10\n\t" 01372 "vpxor %%ymm3, %%ymm11, %%ymm11\n\t" 01373 "vpxor %%ymm4, %%ymm12, %%ymm12\n\t" 01374 "vpxor %%ymm5, %%ymm13, %%ymm13\n\t" 01375 "vpxor %%ymm6, %%ymm14, %%ymm14\n\t" 01376 "vpxor %%ymm7, %%ymm15, %%ymm15\n\t" 01377 "vmovdqu %%ymm8, 32(%[out])\n\t" 01378 "vmovdqu %%ymm9, 96(%[out])\n\t" 01379 "vmovdqu %%ymm10, 160(%[out])\n\t" 01380 "vmovdqu %%ymm11, 224(%[out])\n\t" 01381 "vmovdqu %%ymm12, 288(%[out])\n\t" 01382 "vmovdqu %%ymm13, 352(%[out])\n\t" 01383 "vmovdqu %%ymm14, 416(%[out])\n\t" 01384 "vmovdqu %%ymm15, 480(%[out])\n\t" 01385 01386 "vmovdqa 384(%[X]), %%ymm12\n\t" 01387 "add $512, %[in]\n\t" 01388 "add $512, %[out]\n\t" 01389 "vpaddd %[eight], %%ymm12, %%ymm12\n\t" 01390 "sub $512, %[bytes]\n\t" 01391 "vmovdqa %%ymm12, 
384(%[X])\n\t" 01392 "cmp $512, %[bytes]\n\t" 01393 "jl L_done256\n\t" 01394 01395 "vmovdqa (%[X]), %%ymm0\n\t" 01396 "vmovdqa 32(%[X]), %%ymm1\n\t" 01397 "vmovdqa 64(%[X]), %%ymm2\n\t" 01398 "vmovdqa 96(%[X]), %%ymm3\n\t" 01399 "vmovdqa 128(%[X]), %%ymm4\n\t" 01400 "vmovdqa 160(%[X]), %%ymm5\n\t" 01401 "vmovdqa 192(%[X]), %%ymm6\n\t" 01402 "vmovdqa 224(%[X]), %%ymm7\n\t" 01403 "vmovdqa 256(%[X]), %%ymm8\n\t" 01404 "vmovdqa 288(%[X]), %%ymm9\n\t" 01405 "vmovdqa 320(%[X]), %%ymm10\n\t" 01406 "vmovdqa 352(%[X]), %%ymm11\n\t" 01407 "vmovdqa 384(%[X]), %%ymm12\n\t" 01408 "vmovdqa 416(%[X]), %%ymm13\n\t" 01409 "vmovdqa 448(%[X]), %%ymm14\n\t" 01410 "vmovdqa 480(%[X]), %%ymm15\n\t" 01411 "jmp L_enc256_loop\n\t" 01412 "\n" 01413 "L_done256:\n\t" 01414 "shl $3, %[cnt]\n\t" 01415 "add 48(%[key]), %[cnt]\n\t" 01416 "movl %[cnt], 48(%[key])\n\t" 01417 "\n" 01418 "L_end256:\n\t" 01419 : [bytes] "+r" (bytes), [cnt] "+r" (cnt), 01420 [in] "+r" (m), [out] "+r" (c) 01421 : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X), 01422 [add] "m" (add), [eight] "m" (eight), 01423 [rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256) 01424 : "ymm0", "ymm1", "ymm2", "ymm3", 01425 "ymm4", "ymm5", "ymm6", "ymm7", 01426 "ymm8", "ymm9", "ymm10", "ymm11", 01427 "ymm12", "ymm13", "ymm14", "ymm15", "memory" 01428 ); 01429 01430 /* AVX code optimised for multiples of 256 bytes. 
*/ 01431 if (bytes == 256) { 01432 chacha_encrypt_avx(ctx, m, c, bytes); 01433 bytes -= 256; 01434 } 01435 01436 for (; bytes >= CHACHA_CHUNK_BYTES;) { 01437 CHACHA_CHUNK_AVX2(); 01438 bytes -= CHACHA_CHUNK_BYTES; 01439 c += CHACHA_CHUNK_BYTES; 01440 m += CHACHA_CHUNK_BYTES; 01441 } 01442 if (bytes > 0) { 01443 CHACHA_PARTIAL_CHUNK_AVX2(); 01444 } 01445 } 01446 #endif /* HAVE_INTEL_AVX2 */ 01447 #endif /* USE_INTEL_CHACHA_SPEEDUP */ 01448 01449 /** 01450 * Encrypt a stream of bytes 01451 */ 01452 static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c, 01453 word32 bytes) 01454 { 01455 byte* output; 01456 word32 temp[CHACHA_CHUNK_WORDS]; /* used to make sure aligned */ 01457 word32 i; 01458 01459 output = (byte*)temp; 01460 01461 for (; bytes > 0;) { 01462 wc_Chacha_wordtobyte(temp, ctx->X); 01463 ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]); 01464 if (bytes <= CHACHA_CHUNK_BYTES) { 01465 for (i = 0; i < bytes; ++i) { 01466 c[i] = m[i] ^ output[i]; 01467 } 01468 return; 01469 } 01470 for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) { 01471 c[i] = m[i] ^ output[i]; 01472 } 01473 bytes -= CHACHA_CHUNK_BYTES; 01474 c += CHACHA_CHUNK_BYTES; 01475 m += CHACHA_CHUNK_BYTES; 01476 } 01477 } 01478 01479 /** 01480 * API to encrypt/decrypt a message of any size. 
01481 */ 01482 int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input, 01483 word32 msglen) 01484 { 01485 if (ctx == NULL) 01486 return BAD_FUNC_ARG; 01487 01488 #ifdef USE_INTEL_CHACHA_SPEEDUP 01489 if (!cpuidFlagsSet) { 01490 cpuidFlags = cpuid_get_flags(); 01491 cpuidFlagsSet = 1; 01492 } 01493 01494 #ifdef HAVE_INTEL_AVX2 01495 if (IS_INTEL_AVX2(cpuidFlags)) { 01496 chacha_encrypt_avx2(ctx, input, output, msglen); 01497 return 0; 01498 } 01499 #endif 01500 if (IS_INTEL_AVX1(cpuidFlags)) { 01501 chacha_encrypt_avx(ctx, input, output, msglen); 01502 return 0; 01503 } 01504 else { 01505 chacha_encrypt_x64(ctx, input, output, msglen); 01506 return 0; 01507 } 01508 #endif 01509 wc_Chacha_encrypt_bytes(ctx, input, output, msglen); 01510 01511 return 0; 01512 } 01513 01514 #endif /* HAVE_CHACHA*/ 01515 01516
Generated on Tue Jul 12 2022 16:58:05 by 1.7.2