chacha.c

/* chacha.c
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 *
 *  based on
 *  chacha-ref.c version 20080118
 *  D. J. Bernstein
 *  Public domain.
 */



#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfcrypt/settings.h>

#ifdef HAVE_CHACHA

#include <wolfcrypt/chacha.h>
#include <wolfcrypt/error-crypt.h>
#include <wolfcrypt/logging.h>
#include <wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#ifdef CHACHA_AEAD_TEST
    #include <stdio.h>
#endif

#ifdef USE_INTEL_CHACHA_SPEEDUP
    #include <emmintrin.h>
    #include <immintrin.h>

    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #define NO_AVX2_SUPPORT
    #endif
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        #undef NO_AVX2_SUPPORT
    #endif

    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif

    #if defined(_MSC_VER)
        #define CHACHA20_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define CHACHA20_NOINLINE __attribute__((noinline))
    #else
        #define CHACHA20_NOINLINE
    #endif

    static int cpuidFlagsSet = 0;
    static int cpuidFlags = 0;
#endif

#ifdef BIG_ENDIAN_ORDER
    #define LITTLE32(x) ByteReverseWord32(x)
#else
    #define LITTLE32(x) (x)
#endif

/* Number of rounds */
#define ROUNDS  20

#define U32C(v) (v##U)
#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF))
#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0])

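/*
 * Note: U8TO32_LITTLE reads a whole word through a word32* cast, so the
 * byte pointer must be 4-byte aligned on strict-alignment targets (see
 * the XSTREAM_ALIGN handling in wc_Chacha_SetKey below). A byte-wise
 * equivalent, shown only as an illustrative sketch and not used here:
 *
 *     #define U8TO32_LITTLE_BYTEWISE(p)                          \
 *         (((word32)((p)[0]))       | ((word32)((p)[1]) <<  8) | \
 *          ((word32)((p)[2]) << 16) | ((word32)((p)[3]) << 24))
 */
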
#define ROTATE(v,c) rotlFixed(v, c)
#define XOR(v,w)    ((v) ^ (w))
#define PLUS(v,w)   (U32V((v) + (w)))
#define PLUSONE(v)  (PLUS((v),1))

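/*
 * One ChaCha quarter round: four add/xor/rotate steps with rotation
 * distances 16, 12, 8 and 7, as in D. J. Bernstein's ChaCha design.
 * wc_Chacha_wordtobyte() below runs ROUNDS/2 = 10 double rounds, each
 * four column quarter rounds followed by four diagonal ones.
 */
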
#define QUARTERROUND(a,b,c,d) \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]),16); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]),12); \
  x[a] = PLUS(x[a],x[b]); x[d] = ROTATE(XOR(x[d],x[a]), 8); \
  x[c] = PLUS(x[c],x[d]); x[b] = ROTATE(XOR(x[b],x[c]), 7);



#define QUARTERROUND_INTEL_ASM(a0,b0,c0,d0,   \
                               a1,b1,c1,d1,   \
                               a2,b2,c2,d2,   \
                               a3,b3,c3,d3,   \
                               t1,o1)         \
       "vpaddd  "#b0", "#a0", "#a0"\n\t"      \
       "vpxor   "#a0", "#d0", "#d0"\n\t"      \
       "vmovdqa "#o1"(%[x]), "#c3"\n\t"       \
       "vpshufb %[rotl16], "#d0", "#d0"\n\t"  \
       "vpaddd  "#d0", "#c0", "#c0"\n\t"      \
       "vpxor   "#c0", "#b0", "#b0"\n\t"      \
       "vpaddd  "#b1", "#a1", "#a1"\n\t"      \
       "vpxor   "#a1", "#d1", "#d1"\n\t"      \
       "vpshufb %[rotl16], "#d1", "#d1"\n\t"  \
       "vpaddd  "#d1", "#c1", "#c1"\n\t"      \
       "vpxor   "#c1", "#b1", "#b1"\n\t"      \
       "vpaddd  "#b2", "#a2", "#a2"\n\t"      \
       "vpxor   "#a2", "#d2", "#d2"\n\t"      \
       "vpshufb %[rotl16], "#d2", "#d2"\n\t"  \
       "vpaddd  "#d2", "#c2", "#c2"\n\t"      \
       "vpxor   "#c2", "#b2", "#b2"\n\t"      \
       "vpaddd  "#b3", "#a3", "#a3"\n\t"      \
       "vpxor   "#a3", "#d3", "#d3"\n\t"      \
       "vpshufb %[rotl16], "#d3", "#d3"\n\t"  \
       "vpaddd  "#d3", "#c3", "#c3"\n\t"      \
       "vpxor   "#c3", "#b3", "#b3"\n\t"      \
       "vmovdqa "#c3", "#o1"(%[x])\n\t"       \
       "vpsrld  $20, "#b0", "#t1"\n\t"        \
       "vpslld  $12, "#b0", "#b0"\n\t"        \
       "vpxor   "#t1", "#b0", "#b0"\n\t"      \
       "vpsrld  $20, "#b1", "#t1"\n\t"        \
       "vpslld  $12, "#b1", "#b1"\n\t"        \
       "vpxor   "#t1", "#b1", "#b1"\n\t"      \
       "vpsrld  $20, "#b2", "#t1"\n\t"        \
       "vpslld  $12, "#b2", "#b2"\n\t"        \
       "vpxor   "#t1", "#b2", "#b2"\n\t"      \
       "vpsrld  $20, "#b3", "#t1"\n\t"        \
       "vpslld  $12, "#b3", "#b3"\n\t"        \
       "vpxor   "#t1", "#b3", "#b3"\n\t"      \
       "vpaddd  "#b0", "#a0", "#a0"\n\t"      \
       "vpxor   "#a0", "#d0", "#d0"\n\t"      \
       "vmovdqa "#o1"(%[x]), "#c3"\n\t"       \
       "vpshufb %[rotl8], "#d0", "#d0"\n\t"   \
       "vpaddd  "#d0", "#c0", "#c0"\n\t"      \
       "vpxor   "#c0", "#b0", "#b0"\n\t"      \
       "vpaddd  "#b1", "#a1", "#a1"\n\t"      \
       "vpxor   "#a1", "#d1", "#d1"\n\t"      \
       "vpshufb %[rotl8], "#d1", "#d1"\n\t"   \
       "vpaddd  "#d1", "#c1", "#c1"\n\t"      \
       "vpxor   "#c1", "#b1", "#b1"\n\t"      \
       "vpaddd  "#b2", "#a2", "#a2"\n\t"      \
       "vpxor   "#a2", "#d2", "#d2"\n\t"      \
       "vpshufb %[rotl8], "#d2", "#d2"\n\t"   \
       "vpaddd  "#d2", "#c2", "#c2"\n\t"      \
       "vpxor   "#c2", "#b2", "#b2"\n\t"      \
       "vpaddd  "#b3", "#a3", "#a3"\n\t"      \
       "vpxor   "#a3", "#d3", "#d3"\n\t"      \
       "vpshufb %[rotl8], "#d3", "#d3"\n\t"   \
       "vpaddd  "#d3", "#c3", "#c3"\n\t"      \
       "vpxor   "#c3", "#b3", "#b3"\n\t"      \
       "vmovdqa "#c3", "#o1"(%[x])\n\t"       \
       "vpsrld  $25, "#b0", "#t1"\n\t"        \
       "vpslld   $7, "#b0", "#b0"\n\t"        \
       "vpxor   "#t1", "#b0", "#b0"\n\t"      \
       "vpsrld  $25, "#b1", "#t1"\n\t"        \
       "vpslld   $7, "#b1", "#b1"\n\t"        \
       "vpxor   "#t1", "#b1", "#b1"\n\t"      \
       "vpsrld  $25, "#b2", "#t1"\n\t"        \
       "vpslld   $7, "#b2", "#b2"\n\t"        \
       "vpxor   "#t1", "#b2", "#b2"\n\t"      \
       "vpsrld  $25, "#b3", "#t1"\n\t"        \
       "vpslld   $7, "#b3", "#b3"\n\t"        \
       "vpxor   "#t1", "#b3", "#b3"\n\t"

#define QUARTERROUND_INTEL_ASM_2(a0,b0,c0,d0, \
                                 a1,b1,c1,d1, \
                                 a2,b2,c2,d2, \
                                 a3,b3,c3,d3, \
                                 t1,o1)       \
       "vpaddd  "#b0", "#a0", "#a0"\n\t"      \
       "vpxor   "#a0", "#d0", "#d0"\n\t"      \
       "vmovdqa "#o1"(%[x]), "#c1"\n\t"       \
       "vpshufb %[rotl16], "#d0", "#d0"\n\t"  \
       "vpaddd  "#d0", "#c0", "#c0"\n\t"      \
       "vpxor   "#c0", "#b0", "#b0"\n\t"      \
       "vpaddd  "#b1", "#a1", "#a1"\n\t"      \
       "vpxor   "#a1", "#d1", "#d1"\n\t"      \
       "vpshufb %[rotl16], "#d1", "#d1"\n\t"  \
       "vpaddd  "#d1", "#c1", "#c1"\n\t"      \
       "vpxor   "#c1", "#b1", "#b1"\n\t"      \
       "vpaddd  "#b2", "#a2", "#a2"\n\t"      \
       "vpxor   "#a2", "#d2", "#d2"\n\t"      \
       "vpshufb %[rotl16], "#d2", "#d2"\n\t"  \
       "vpaddd  "#d2", "#c2", "#c2"\n\t"      \
       "vpxor   "#c2", "#b2", "#b2"\n\t"      \
       "vpaddd  "#b3", "#a3", "#a3"\n\t"      \
       "vpxor   "#a3", "#d3", "#d3"\n\t"      \
       "vpshufb %[rotl16], "#d3", "#d3"\n\t"  \
       "vpaddd  "#d3", "#c3", "#c3"\n\t"      \
       "vpxor   "#c3", "#b3", "#b3"\n\t"      \
       "vmovdqa "#c1", "#o1"(%[x])\n\t"       \
       "vpsrld  $20, "#b0", "#t1"\n\t"        \
       "vpslld  $12, "#b0", "#b0"\n\t"        \
       "vpxor   "#t1", "#b0", "#b0"\n\t"      \
       "vpsrld  $20, "#b1", "#t1"\n\t"        \
       "vpslld  $12, "#b1", "#b1"\n\t"        \
       "vpxor   "#t1", "#b1", "#b1"\n\t"      \
       "vpsrld  $20, "#b2", "#t1"\n\t"        \
       "vpslld  $12, "#b2", "#b2"\n\t"        \
       "vpxor   "#t1", "#b2", "#b2"\n\t"      \
       "vpsrld  $20, "#b3", "#t1"\n\t"        \
       "vpslld  $12, "#b3", "#b3"\n\t"        \
       "vpxor   "#t1", "#b3", "#b3"\n\t"      \
       "vpaddd  "#b0", "#a0", "#a0"\n\t"      \
       "vpxor   "#a0", "#d0", "#d0"\n\t"      \
       "vmovdqa "#o1"(%[x]), "#c1"\n\t"       \
       "vpshufb %[rotl8], "#d0", "#d0"\n\t"   \
       "vpaddd  "#d0", "#c0", "#c0"\n\t"      \
       "vpxor   "#c0", "#b0", "#b0"\n\t"      \
       "vpaddd  "#b1", "#a1", "#a1"\n\t"      \
       "vpxor   "#a1", "#d1", "#d1"\n\t"      \
       "vpshufb %[rotl8], "#d1", "#d1"\n\t"   \
       "vpaddd  "#d1", "#c1", "#c1"\n\t"      \
       "vpxor   "#c1", "#b1", "#b1"\n\t"      \
       "vpaddd  "#b2", "#a2", "#a2"\n\t"      \
       "vpxor   "#a2", "#d2", "#d2"\n\t"      \
       "vpshufb %[rotl8], "#d2", "#d2"\n\t"   \
       "vpaddd  "#d2", "#c2", "#c2"\n\t"      \
       "vpxor   "#c2", "#b2", "#b2"\n\t"      \
       "vpaddd  "#b3", "#a3", "#a3"\n\t"      \
       "vpxor   "#a3", "#d3", "#d3"\n\t"      \
       "vpshufb %[rotl8], "#d3", "#d3"\n\t"   \
       "vpaddd  "#d3", "#c3", "#c3"\n\t"      \
       "vpxor   "#c3", "#b3", "#b3"\n\t"      \
       "vmovdqa "#c1", "#o1"(%[x])\n\t"       \
       "vpsrld  $25, "#b0", "#t1"\n\t"        \
       "vpslld   $7, "#b0", "#b0"\n\t"        \
       "vpxor   "#t1", "#b0", "#b0"\n\t"      \
       "vpsrld  $25, "#b1", "#t1"\n\t"        \
       "vpslld   $7, "#b1", "#b1"\n\t"        \
       "vpxor   "#t1", "#b1", "#b1"\n\t"      \
       "vpsrld  $25, "#b2", "#t1"\n\t"        \
       "vpslld   $7, "#b2", "#b2"\n\t"        \
       "vpxor   "#t1", "#b2", "#b2"\n\t"      \
       "vpsrld  $25, "#b3", "#t1"\n\t"        \
       "vpslld   $7, "#b3", "#b3"\n\t"        \
       "vpxor   "#t1", "#b3", "#b3"\n\t"


#define QUARTERROUND_XMM()                                      \
        QUARTERROUND_INTEL_ASM(%%xmm0,%%xmm4,%%xmm8,%%xmm12,    \
                               %%xmm1,%%xmm5,%%xmm9,%%xmm13,    \
                               %%xmm2,%%xmm6,%%xmm10,%%xmm14,   \
                               %%xmm3,%%xmm7,%%xmm11,%%xmm15,   \
                               %%xmm11,48)
#define QUARTERROUND_XMM_2()                                    \
        QUARTERROUND_INTEL_ASM_2(%%xmm0,%%xmm5,%%xmm10,%%xmm15, \
                                 %%xmm1,%%xmm6,%%xmm11,%%xmm12, \
                                 %%xmm2,%%xmm7,%%xmm8,%%xmm13,  \
                                 %%xmm3,%%xmm4,%%xmm9,%%xmm14,  \
                                 %%xmm11,48)

#define QUARTERROUND_YMM()                                      \
        QUARTERROUND_INTEL_ASM(%%ymm0,%%ymm4,%%ymm8,%%ymm12,    \
                               %%ymm1,%%ymm5,%%ymm9,%%ymm13,    \
                               %%ymm2,%%ymm6,%%ymm10,%%ymm14,   \
                               %%ymm3,%%ymm7,%%ymm11,%%ymm15,   \
                               %%ymm11,96)
#define QUARTERROUND_YMM_2()                                    \
        QUARTERROUND_INTEL_ASM_2(%%ymm0,%%ymm5,%%ymm10,%%ymm15, \
                                 %%ymm1,%%ymm6,%%ymm11,%%ymm12, \
                                 %%ymm2,%%ymm7,%%ymm8,%%ymm13,  \
                                 %%ymm3,%%ymm4,%%ymm9,%%ymm14,  \
                                 %%ymm11,96)
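
/*
 * The interleaved quarter rounds above keep more live values than there
 * are vector registers, so one 'c' row is spilled to offset o1 of x[]
 * (48 for the xmm variants, 96 for ymm) and reloaded around its uses.
 * t1 is a scratch register for the rotates by 12 and 7, done as
 * shift/shift/xor since there is no packed rotate instruction; the
 * rotates by 16 and 8 use vpshufb byte shuffles instead.
 */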

/**
  * Set up the IV (nonce). Earlier versions used a 64-bit nonce; this version
  * uses the typical AEAD 96-bit nonce and can handle record sizes up to
  * 256 GB.
  */
int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
{
    word32 temp[CHACHA_IV_WORDS]; /* used for alignment of memory */

#ifdef CHACHA_AEAD_TEST
    word32 i;
    printf("NONCE : ");
    for (i = 0; i < CHACHA_IV_BYTES; i++) {
        printf("%02x", inIv[i]);
    }
    printf("\n\n");
#endif

    if (ctx == NULL)
        return BAD_FUNC_ARG;

    XMEMCPY(temp, inIv, CHACHA_IV_BYTES);

    ctx->X[CHACHA_IV_BYTES+0] = counter;           /* block counter */
    ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */
    ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */
    ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */

    return 0;
}

/* "expand 32-byte k", as four little-endian 32-bit words */
static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
/* "expand 16-byte k", as four little-endian 32-bit words */
static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};

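/*
 * The constant words above are just those ASCII strings loaded as
 * little-endian 32-bit words: for example 0x61707865 is
 * 'e' | ('x' << 8) | ('p' << 16) | ('a' << 24), the first four bytes of
 * "expand 32-byte k".
 */
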
/**
  * Key setup. Accepts 32-byte (8 word) or 16-byte (4 word) keys.
  */
int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
{
    const word32* constants;
    const byte*   k;

#ifdef XSTREAM_ALIGN
    word32 alignKey[8];
#endif

    if (ctx == NULL)
        return BAD_FUNC_ARG;

    if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ)
        return BAD_FUNC_ARG;

#ifdef XSTREAM_ALIGN
    if ((wolfssl_word)key % 4) {
        WOLFSSL_MSG("wc_ChachaSetKey unaligned key");
        XMEMCPY(alignKey, key, keySz);
        k = (byte*)alignKey;
    }
    else {
        k = key;
    }
#else
    k = key;
#endif /* XSTREAM_ALIGN */

#ifdef CHACHA_AEAD_TEST
    word32 i;
    printf("ChaCha key used :\n");
    for (i = 0; i < keySz; i++) {
        printf("%02x", key[i]);
        if ((i + 1) % 8 == 0)
           printf("\n");
    }
    printf("\n\n");
#endif

    ctx->X[4] = U8TO32_LITTLE(k +  0);
    ctx->X[5] = U8TO32_LITTLE(k +  4);
    ctx->X[6] = U8TO32_LITTLE(k +  8);
    ctx->X[7] = U8TO32_LITTLE(k + 12);
    if (keySz == CHACHA_MAX_KEY_SZ) {
        k += 16;
        constants = sigma;
    }
    else {
        constants = tau;
    }
    ctx->X[ 8] = U8TO32_LITTLE(k +  0);
    ctx->X[ 9] = U8TO32_LITTLE(k +  4);
    ctx->X[10] = U8TO32_LITTLE(k +  8);
    ctx->X[11] = U8TO32_LITTLE(k + 12);
    ctx->X[ 0] = constants[0];
    ctx->X[ 1] = constants[1];
    ctx->X[ 2] = constants[2];
    ctx->X[ 3] = constants[3];

    return 0;
}

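/*
 * Typical call sequence (an illustrative sketch, with error checks
 * omitted; wc_Chacha_Process() is declared in wolfcrypt/chacha.h, and
 * key, nonce, msg, msgSz and cipher are hypothetical caller values):
 *
 *     ChaCha ctx;
 *     wc_Chacha_SetKey(&ctx, key, CHACHA_MAX_KEY_SZ);  // 32-byte key
 *     wc_Chacha_SetIV(&ctx, nonce, 0);                 // 96-bit nonce, ctr 0
 *     wc_Chacha_Process(&ctx, cipher, msg, msgSz);     // msg XOR keystream
 */
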
/**
  * Generates one keystream block: runs the ChaCha rounds over the input
  * state, adds the input back in, and serializes the result as
  * little-endian words.
  */
static WC_INLINE void wc_Chacha_wordtobyte(word32 output[CHACHA_CHUNK_WORDS],
    const word32 input[CHACHA_CHUNK_WORDS])
{
    word32 x[CHACHA_CHUNK_WORDS];
    word32 i;

    for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
        x[i] = input[i];
    }

    for (i = (ROUNDS); i > 0; i -= 2) {
        QUARTERROUND(0, 4,  8, 12)
        QUARTERROUND(1, 5,  9, 13)
        QUARTERROUND(2, 6, 10, 14)
        QUARTERROUND(3, 7, 11, 15)
        QUARTERROUND(0, 5, 10, 15)
        QUARTERROUND(1, 6, 11, 12)
        QUARTERROUND(2, 7,  8, 13)
        QUARTERROUND(3, 4,  9, 14)
    }

    for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
        x[i] = PLUS(x[i], input[i]);
    }

    for (i = 0; i < CHACHA_CHUNK_WORDS; i++) {
        output[i] = LITTLE32(x[i]);
    }
}


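/*
 * Sketch of how the portable C path consumes these keystream blocks
 * (illustration only; the real stream-processing code also handles
 * partial blocks):
 *
 *     word32 keystream[CHACHA_CHUNK_WORDS];
 *     while (bytes >= CHACHA_CHUNK_BYTES) {
 *         wc_Chacha_wordtobyte(keystream, ctx->X);
 *         ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
 *         for (i = 0; i < CHACHA_CHUNK_BYTES; i++)
 *             c[i] = m[i] ^ ((byte*)keystream)[i];
 *         bytes -= CHACHA_CHUNK_BYTES;
 *         c += CHACHA_CHUNK_BYTES;
 *         m += CHACHA_CHUNK_BYTES;
 *     }
 */
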
#ifdef USE_INTEL_CHACHA_SPEEDUP

#define QUARTERROUND_2_X64(r11, r12, r13, r14, r21, r22, r23, r24) \
        "addl   "#r12", "#r11"\n\t"                                \
        "addl   "#r22", "#r21"\n\t"                                \
        "xorl   "#r11", "#r14"\n\t"                                \
        "xorl   "#r21", "#r24"\n\t"                                \
        "roll   $16, "#r14"\n\t"                                   \
        "roll   $16, "#r24"\n\t"                                   \
        "addl   "#r14", "#r13"\n\t"                                \
        "addl   "#r24", "#r23"\n\t"                                \
        "xorl   "#r13", "#r12"\n\t"                                \
        "xorl   "#r23", "#r22"\n\t"                                \
        "roll   $12, "#r12"\n\t"                                   \
        "roll   $12, "#r22"\n\t"                                   \
        "addl   "#r12", "#r11"\n\t"                                \
        "addl   "#r22", "#r21"\n\t"                                \
        "xorl   "#r11", "#r14"\n\t"                                \
        "xorl   "#r21", "#r24"\n\t"                                \
        "roll   $8, "#r14"\n\t"                                    \
        "roll   $8, "#r24"\n\t"                                    \
        "addl   "#r14", "#r13"\n\t"                                \
        "addl   "#r24", "#r23"\n\t"                                \
        "xorl   "#r13", "#r12"\n\t"                                \
        "xorl   "#r23", "#r22"\n\t"                                \
        "roll   $7, "#r12"\n\t"                                    \
        "roll   $7, "#r22"\n\t"                                    \

#define CHACHA_CRYPT_X64()                                                     \
        "subq   $40, %%rsp\n\t"                                                \
        "movq   32(%[input]), %%rax\n\t"                                       \
        "movq   40(%[input]), %%rdx\n\t"                                       \
        "movq   %%rax,  8(%%rsp)\n\t"                                          \
        "movq   %%rdx, 16(%%rsp)\n\t"                                          \
        "movl    0(%[input]), %%eax\n\t"                                       \
        "movl    4(%[input]), %%ebx\n\t"                                       \
        "movl    8(%[input]), %%ecx\n\t"                                       \
        "movl   12(%[input]), %%edx\n\t"                                       \
        "movl   16(%[input]), %%r8d\n\t"                                       \
        "movl   20(%[input]), %%r9d\n\t"                                       \
        "movl   24(%[input]), %%r10d\n\t"                                      \
        "movl   28(%[input]), %%r11d\n\t"                                      \
        "movl   48(%[input]), %%r12d\n\t"                                      \
        "movl   52(%[input]), %%r13d\n\t"                                      \
        "movl   56(%[input]), %%r14d\n\t"                                      \
        "movl   60(%[input]), %%r15d\n\t"                                      \
        "movb   $10, (%%rsp)\n\t"                                              \
        "movq   %%rsi, 32(%%rsp)\n\t"                                          \
        "movq   %%rdi, 24(%%rsp)\n\t"                                          \
        "movl    8(%%rsp), %%esi\n\t"                                          \
        "movl   12(%%rsp), %%edi\n\t"                                          \
        "\n"                                                                   \
        "1:\n\t"                                                               \
        QUARTERROUND_2_X64(%%eax,  %%r8d, %%esi, %%r12d,                       \
                           %%ebx,  %%r9d, %%edi, %%r13d)                       \
        "movl   %%esi,  8(%%rsp)\n\t"                                          \
        "movl   %%edi, 12(%%rsp)\n\t"                                          \
        "movl   16(%%rsp), %%esi\n\t"                                          \
        "movl   20(%%rsp), %%edi\n\t"                                          \
        QUARTERROUND_2_X64(%%ecx, %%r10d, %%esi, %%r14d,                       \
                           %%edx, %%r11d, %%edi, %%r15d)                       \
        QUARTERROUND_2_X64(%%eax,  %%r9d, %%esi, %%r15d,                       \
                           %%ebx, %%r10d, %%edi, %%r12d)                       \
        "movl   %%esi, 16(%%rsp)\n\t"                                          \
        "movl   %%edi, 20(%%rsp)\n\t"                                          \
        "movl    8(%%rsp), %%esi\n\t"                                          \
        "movl   12(%%rsp), %%edi\n\t"                                          \
        QUARTERROUND_2_X64(%%ecx, %%r11d, %%esi, %%r13d,                       \
                           %%edx,  %%r8d, %%edi, %%r14d)                       \
        "decb   (%%rsp)\n\t"                                                   \
        "jnz    1b\n\t"                                                        \
        "movl   %%esi,  8(%%rsp)\n\t"                                          \
        "movl   %%edi, 12(%%rsp)\n\t"                                          \
        "movq   32(%%rsp), %%rsi\n\t"                                          \
        "movq   24(%%rsp), %%rdi\n\t"                                          \
        "addl    0(%[input]), %%eax\n\t"                                       \
        "addl    4(%[input]), %%ebx\n\t"                                       \
        "addl    8(%[input]), %%ecx\n\t"                                       \
        "addl   12(%[input]), %%edx\n\t"                                       \
        "addl   16(%[input]), %%r8d\n\t"                                       \
        "addl   20(%[input]), %%r9d\n\t"                                       \
        "addl   24(%[input]), %%r10d\n\t"                                      \
        "addl   28(%[input]), %%r11d\n\t"                                      \
        "addl   48(%[input]), %%r12d\n\t"                                      \
        "addl   52(%[input]), %%r13d\n\t"                                      \
        "addl   56(%[input]), %%r14d\n\t"                                      \
        "addl   60(%[input]), %%r15d\n\t"                                      \

#define CHACHA_PARTIAL_CHUNK_X64()                                             \
    __asm__ __volatile__ (                                                     \
        CHACHA_CRYPT_X64()                                                     \
        "movl   %%eax ,  0(%[c])\n\t"                                          \
        "movl   %%ebx ,  4(%[c])\n\t"                                          \
        "movl   %%ecx ,  8(%[c])\n\t"                                          \
        "movl   %%edx , 12(%[c])\n\t"                                          \
        "movl   %%r8d , 16(%[c])\n\t"                                          \
        "movl   %%r9d , 20(%[c])\n\t"                                          \
        "movl   %%r10d, 24(%[c])\n\t"                                          \
        "movl   %%r11d, 28(%[c])\n\t"                                          \
        "movl   %%r12d, 48(%[c])\n\t"                                          \
        "movl   %%r13d, 52(%[c])\n\t"                                          \
        "movl   %%r14d, 56(%[c])\n\t"                                          \
        "movl   %%r15d, 60(%[c])\n\t"                                          \
        "movl    8(%%rsp), %%eax\n\t"                                          \
        "movl   12(%%rsp), %%ebx\n\t"                                          \
        "movl   16(%%rsp), %%ecx\n\t"                                          \
        "movl   20(%%rsp), %%edx\n\t"                                          \
        "addl   32(%[input]), %%eax\n\t"                                       \
        "addl   36(%[input]), %%ebx\n\t"                                       \
        "addl   40(%[input]), %%ecx\n\t"                                       \
        "addl   44(%[input]), %%edx\n\t"                                       \
        "movl   %%eax , 32(%[c])\n\t"                                          \
        "movl   %%ebx , 36(%[c])\n\t"                                          \
        "movl   %%ecx , 40(%[c])\n\t"                                          \
        "movl   %%edx , 44(%[c])\n\t"                                          \
        "addl   $1, 48(%[input])\n\t"                                          \
        "addq   $40, %%rsp\n\t"                                                \
        "movq   %[output], %%rax\n\t"                                          \
        "movq   %[m], %%rbx\n\t"                                               \
        "movl   %[bytes], %%r8d\n\t"                                           \
        "xorq   %%rdx, %%rdx\n\t"                                              \
        "movl   %%r8d, %%r9d\n\t"                                              \
        "andl   $7, %%r9d\n\t"                                                 \
        "jz     4f\n\t"                                                        \
        "\n"                                                                   \
        "2:\n\t"                                                               \
        "movzbl (%[c],%%rdx,1), %%ecx\n\t"                                     \
        "xorb   (%%rbx,%%rdx,1), %%cl\n\t"                                     \
        "movb   %%cl, (%%rax,%%rdx,1)\n\t"                                     \
        "incl   %%edx\n\t"                                                     \
        "cmpl   %%r9d, %%edx\n\t"                                              \
        "jne    2b\n\t"                                                        \
        "je     3f\n\t"                                                        \
        "\n"                                                                   \
        "4:\n\t"                                                               \
        "movq   (%[c],%%rdx,1), %%rcx\n\t"                                     \
        "xorq   (%%rbx,%%rdx,1), %%rcx\n\t"                                    \
        "movq   %%rcx, (%%rax,%%rdx,1)\n\t"                                    \
        "addl   $8, %%edx\n\t"                                                 \
        "\n"                                                                   \
        "3:\n\t"                                                               \
        "cmpl   %%r8d, %%edx\n\t"                                              \
        "jne    4b\n\t"                                                        \
        :                                                                      \
        : [input] "r" (ctx->X), [c] "r" (x),                                   \
          [output] "m" (c), [bytes] "m" (bytes), [m] "m" (m)                   \
        : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13",  \
          "r14", "r15", "memory"                                               \
    )


#define CHACHA_CHUNK_X64()                                                     \
    __asm__ __volatile__ (                                                     \
        CHACHA_CRYPT_X64()                                                     \
        "movq   %%rsi, 32(%%rsp)\n\t"                                          \
        "addq   $40, %%rsp\n\t"                                                \
        "movq   %[m], %%rsi\n\t"                                               \
        "subq   $40, %%rsp\n\t"                                                \
        "xorl    0(%%rsi), %%eax\n\t"                                          \
        "xorl    4(%%rsi), %%ebx\n\t"                                          \
        "xorl    8(%%rsi), %%ecx\n\t"                                          \
        "xorl   12(%%rsi), %%edx\n\t"                                          \
        "xorl   16(%%rsi), %%r8d\n\t"                                          \
        "xorl   20(%%rsi), %%r9d\n\t"                                          \
        "xorl   24(%%rsi), %%r10d\n\t"                                         \
        "xorl   28(%%rsi), %%r11d\n\t"                                         \
        "xorl   48(%%rsi), %%r12d\n\t"                                         \
        "xorl   52(%%rsi), %%r13d\n\t"                                         \
        "xorl   56(%%rsi), %%r14d\n\t"                                         \
        "xorl   60(%%rsi), %%r15d\n\t"                                         \
        "movq   32(%%rsp), %%rsi\n\t"                                          \
        "movl   %%eax ,  0(%[c])\n\t"                                          \
        "movl   %%ebx ,  4(%[c])\n\t"                                          \
        "movl   %%ecx ,  8(%[c])\n\t"                                          \
        "movl   %%edx , 12(%[c])\n\t"                                          \
        "movl   %%r8d , 16(%[c])\n\t"                                          \
        "movl   %%r9d , 20(%[c])\n\t"                                          \
        "movl   %%r10d, 24(%[c])\n\t"                                          \
        "movl   %%r11d, 28(%[c])\n\t"                                          \
        "movl   %%r12d, 48(%[c])\n\t"                                          \
        "movl   %%r13d, 52(%[c])\n\t"                                          \
        "movl   %%r14d, 56(%[c])\n\t"                                          \
        "movl   %%r15d, 60(%[c])\n\t"                                          \
        "addq   $40, %%rsp\n\t"                                                \
        "movq   %[m], %%r8\n\t"                                                \
        "subq   $40, %%rsp\n\t"                                                \
        "movl    8(%%rsp), %%eax\n\t"                                          \
        "movl   12(%%rsp), %%ebx\n\t"                                          \
        "movl   16(%%rsp), %%ecx\n\t"                                          \
        "movl   20(%%rsp), %%edx\n\t"                                          \
        "addl   32(%[input]), %%eax\n\t"                                       \
        "addl   36(%[input]), %%ebx\n\t"                                       \
        "addl   40(%[input]), %%ecx\n\t"                                       \
        "addl   44(%[input]), %%edx\n\t"                                       \
        "xorl   32(%%r8), %%eax\n\t"                                           \
        "xorl   36(%%r8), %%ebx\n\t"                                           \
        "xorl   40(%%r8), %%ecx\n\t"                                           \
        "xorl   44(%%r8), %%edx\n\t"                                           \
        "movl   %%eax , 32(%[c])\n\t"                                          \
        "movl   %%ebx , 36(%[c])\n\t"                                          \
        "movl   %%ecx , 40(%[c])\n\t"                                          \
        "movl   %%edx , 44(%[c])\n\t"                                          \
        "addl   $1, 48(%[input])\n\t"                                          \
        "addq   $40, %%rsp\n\t"                                                \
        :                                                                      \
        : [input] "r" (ctx->X), [c] "r" (c), [m] "m" (m)                       \
        : "eax", "ebx", "ecx", "edx", "r8", "r9", "r10", "r11", "r12", "r13",  \
          "r14", "r15", "memory"                                               \
    )


static void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c,
                               word32 bytes)
{
    word32 x[CHACHA_CHUNK_WORDS];

    if (bytes == 0)
        return;

    for (; bytes >= CHACHA_CHUNK_BYTES;) {
        CHACHA_CHUNK_X64();
        bytes -= CHACHA_CHUNK_BYTES;
        c += CHACHA_CHUNK_BYTES;
        m += CHACHA_CHUNK_BYTES;
    }
    if (bytes > 0) {
        CHACHA_PARTIAL_CHUNK_X64();
    }
}

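/*
 * The tail logic in CHACHA_PARTIAL_CHUNK_X64 above XORs bytes % 8
 * leftover bytes one at a time (label 2), then the rest eight bytes at a
 * time (label 4). A rough C equivalent of that tail (sketch only):
 *
 *     word32 i = 0;
 *     for (; i < (bytes & 7); i++)   // single bytes first
 *         c[i] = m[i] ^ ((byte*)x)[i];
 *     for (; i < bytes; i += 8)      // then 8-byte words
 *         *(word64*)(c + i) = *(word64*)(m + i) ^ *(word64*)((byte*)x + i);
 */
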
#if defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)
static const __m128i rotl8 =  { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
static const __m128i rotl16 = { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
#endif /* HAVE_INTEL_AVX1 || HAVE_INTEL_AVX2 */
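
/*
 * rotl8 and rotl16 are pshufb/vpshufb byte-shuffle masks: rotating each
 * 32-bit lane by 8 or 16 bits is a pure byte permutation. For example the
 * low four mask bytes of rotl16 are 02 03 00 01, so destination byte 0
 * takes source byte 2 of its lane, a rotate by 16 bits.
 */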

#ifdef HAVE_INTEL_AVX1
#define QUARTERROUND_2_AVX()               \
        "paddd  %%xmm1, %%xmm0\n\t"        \
        "pxor   %%xmm0, %%xmm3\n\t"        \
        "pshufb %[rotl16], %%xmm3\n\t"     \
        "paddd  %%xmm3, %%xmm2\n\t"        \
        "pxor   %%xmm2, %%xmm1\n\t"        \
        "movdqa %%xmm1, %%xmm4\n\t"        \
        "pslld  $12, %%xmm1\n\t"           \
        "psrld  $20, %%xmm4\n\t"           \
        "pxor   %%xmm4, %%xmm1\n\t"        \
        "paddd  %%xmm1, %%xmm0\n\t"        \
        "pxor   %%xmm0, %%xmm3\n\t"        \
        "pshufb %[rotl8], %%xmm3\n\t"      \
        "paddd  %%xmm3, %%xmm2\n\t"        \
        "pxor   %%xmm2, %%xmm1\n\t"        \
        "movdqa %%xmm1, %%xmm4\n\t"        \
        "pslld  $7, %%xmm1\n\t"            \
        "psrld  $25, %%xmm4\n\t"           \
        "pxor   %%xmm4, %%xmm1\n\t"        \
        "# Swap words for next round\n\t"  \
        "pshufd $0x39, %%xmm1, %%xmm1\n\t" \
        "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
        "pshufd $0x93, %%xmm3, %%xmm3\n\t" \
        "paddd  %%xmm1, %%xmm0\n\t"        \
        "pxor   %%xmm0, %%xmm3\n\t"        \
        "pshufb %[rotl16], %%xmm3\n\t"     \
        "paddd  %%xmm3, %%xmm2\n\t"        \
        "pxor   %%xmm2, %%xmm1\n\t"        \
        "movdqa %%xmm1, %%xmm4\n\t"        \
        "pslld  $12, %%xmm1\n\t"           \
        "psrld  $20, %%xmm4\n\t"           \
        "pxor   %%xmm4, %%xmm1\n\t"        \
        "paddd  %%xmm1, %%xmm0\n\t"        \
        "pxor   %%xmm0, %%xmm3\n\t"        \
        "pshufb %[rotl8], %%xmm3\n\t"      \
        "paddd  %%xmm3, %%xmm2\n\t"        \
        "pxor   %%xmm2, %%xmm1\n\t"        \
        "movdqa %%xmm1, %%xmm4\n\t"        \
        "pslld  $7, %%xmm1\n\t"            \
        "psrld  $25, %%xmm4\n\t"           \
        "pxor   %%xmm4, %%xmm1\n\t"        \
        "# Swap words back\n\t"            \
        "pshufd $0x93, %%xmm1, %%xmm1\n\t" \
        "pshufd $0x4e, %%xmm2, %%xmm2\n\t" \
        "pshufd $0x39, %%xmm3, %%xmm3\n\t" \

#define CHACHA_CRYPT_AVX()                                                     \
        "movdqu  0(%[input]), %%xmm0\n\t"                                      \
        "movdqu 16(%[input]), %%xmm1\n\t"                                      \
        "movdqu 32(%[input]), %%xmm2\n\t"                                      \
        "movdqu 48(%[input]), %%xmm3\n\t"                                      \
        "movb   $10, %%al\n\t"                                                 \
        "\n"                                                                   \
        "1:\n\t"                                                               \
        QUARTERROUND_2_AVX()                                                   \
        "decb   %%al\n\t"                                                      \
        "jnz    1b\n\t"                                                        \
        "movdqu  0(%[input]), %%xmm4\n\t"                                      \
        "movdqu 16(%[input]), %%xmm5\n\t"                                      \
        "movdqu 32(%[input]), %%xmm6\n\t"                                      \
        "movdqu 48(%[input]), %%xmm7\n\t"                                      \
        "paddd  %%xmm4, %%xmm0\n\t"                                            \
        "paddd  %%xmm5, %%xmm1\n\t"                                            \
        "paddd  %%xmm6, %%xmm2\n\t"                                            \
        "paddd  %%xmm7, %%xmm3\n\t"                                            \

#define CHACHA_PARTIAL_CHUNK_AVX()                                             \
    __asm__ __volatile__ (                                                     \
        CHACHA_CRYPT_AVX()                                                     \
        "movdqu %%xmm0,  0(%[c])\n\t"                                          \
        "movdqu %%xmm1, 16(%[c])\n\t"                                          \
        "movdqu %%xmm2, 32(%[c])\n\t"                                          \
        "movdqu %%xmm3, 48(%[c])\n\t"                                          \
        "addl   $1, 48(%[input])\n\t"                                          \
        "movl   %[bytes], %%r8d\n\t"                                           \
        "xorq   %%rdx, %%rdx\n\t"                                              \
        "movl   %%r8d, %%r9d\n\t"                                              \
        "andl   $7, %%r9d\n\t"                                                 \
        "jz     4f\n\t"                                                        \
        "\n"                                                                   \
        "2:\n\t"                                                               \
        "movzbl (%[c],%%rdx,1), %%ecx\n\t"                                     \
        "xorb   (%[m],%%rdx,1), %%cl\n\t"                                      \
        "movb   %%cl, (%[output],%%rdx,1)\n\t"                                 \
        "incl   %%edx\n\t"                                                     \
        "cmpl   %%r9d, %%edx\n\t"                                              \
        "jne    2b\n\t"                                                        \
        "je     3f\n\t"                                                        \
        "\n"                                                                   \
        "4:\n\t"                                                               \
        "movq   (%[c],%%rdx,1), %%rcx\n\t"                                     \
        "xorq   (%[m],%%rdx,1), %%rcx\n\t"                                     \
        "movq   %%rcx, (%[output],%%rdx,1)\n\t"                                \
        "addl   $8, %%edx\n\t"                                                 \
        "\n"                                                                   \
        "3:\n\t"                                                               \
        "cmpl   %%r8d, %%edx\n\t"                                              \
        "jne    4b\n\t"                                                        \
        :                                                                      \
        : [input] "r" (ctx->X), [c] "r" (x),                                   \
          [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m),                  \
          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)                       \
        : "eax", "ecx", "edx", "r8", "r9", "memory",                           \
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"       \
    )


#define CHACHA_CHUNK_AVX()                                                     \
    __asm__ __volatile__ (                                                     \
        CHACHA_CRYPT_AVX()                                                     \
        "movdqu  0(%[m]), %%xmm4\n\t"                                          \
        "movdqu 16(%[m]), %%xmm5\n\t"                                          \
        "movdqu 32(%[m]), %%xmm6\n\t"                                          \
        "movdqu 48(%[m]), %%xmm7\n\t"                                          \
        "pxor   %%xmm4, %%xmm0\n\t"                                            \
        "pxor   %%xmm5, %%xmm1\n\t"                                            \
        "pxor   %%xmm6, %%xmm2\n\t"                                            \
        "pxor   %%xmm7, %%xmm3\n\t"                                            \
        "movdqu %%xmm0,  0(%[c])\n\t"                                          \
        "movdqu %%xmm1, 16(%[c])\n\t"                                          \
        "movdqu %%xmm2, 32(%[c])\n\t"                                          \
        "movdqu %%xmm3, 48(%[c])\n\t"                                          \
        "addl   $1, 48(%[input])\n\t"                                          \
        :                                                                      \
        : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m),                      \
          [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)                       \
        : "rax", "memory",                                                     \
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"       \
    )

CHACHA20_NOINLINE static void chacha_encrypt_avx(ChaCha* ctx, const byte* m,
                                                 byte* c, word32 bytes)
{
    ALIGN128 word32 X[4*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
    ALIGN128 word32 x[2*CHACHA_CHUNK_WORDS]; /* used to make sure aligned */
    word32 cnt = 0;
    static const __m128i add =    { 0x0000000100000000UL,0x0000000300000002UL };
    static const __m128i four =   { 0x0000000400000004UL,0x0000000400000004UL };

    if (bytes == 0)
        return;

    __asm__ __volatile__ (
       "movl    %[bytes], %[cnt]\n\t"
       "shrl    $8, %[cnt]\n\t"
       "jz      L_end128\n\t"

       "vpshufd $0,   (%[key]), %%xmm0\n\t"
       "vpshufd $0,  4(%[key]), %%xmm1\n\t"
       "vpshufd $0,  8(%[key]), %%xmm2\n\t"
       "vpshufd $0, 12(%[key]), %%xmm3\n\t"
       "vpshufd $0, 16(%[key]), %%xmm4\n\t"
       "vpshufd $0, 20(%[key]), %%xmm5\n\t"
       "vpshufd $0, 24(%[key]), %%xmm6\n\t"
       "vpshufd $0, 28(%[key]), %%xmm7\n\t"
       "vpshufd $0, 32(%[key]), %%xmm8\n\t"
       "vpshufd $0, 36(%[key]), %%xmm9\n\t"
       "vpshufd $0, 40(%[key]), %%xmm10\n\t"
       "vpshufd $0, 44(%[key]), %%xmm11\n\t"
       "vpshufd $0, 48(%[key]), %%xmm12\n\t"
       "vpshufd $0, 52(%[key]), %%xmm13\n\t"
       "vpshufd $0, 56(%[key]), %%xmm14\n\t"
       "vpshufd $0, 60(%[key]), %%xmm15\n\t"

       "vpaddd  %[add], %%xmm12, %%xmm12\n\t"

       "vmovdqa %%xmm0,     (%[X])\n\t"
       "vmovdqa %%xmm1,   16(%[X])\n\t"
       "vmovdqa %%xmm2,   32(%[X])\n\t"
       "vmovdqa %%xmm3,   48(%[X])\n\t"
       "vmovdqa %%xmm4,   64(%[X])\n\t"
       "vmovdqa %%xmm5,   80(%[X])\n\t"
       "vmovdqa %%xmm6,   96(%[X])\n\t"
       "vmovdqa %%xmm7,  112(%[X])\n\t"
       "vmovdqa %%xmm8,  128(%[X])\n\t"
       "vmovdqa %%xmm9,  144(%[X])\n\t"
       "vmovdqa %%xmm10, 160(%[X])\n\t"
       "vmovdqa %%xmm11, 176(%[X])\n\t"
       "vmovdqa %%xmm12, 192(%[X])\n\t"
       "vmovdqa %%xmm13, 208(%[X])\n\t"
       "vmovdqa %%xmm14, 224(%[X])\n\t"
       "vmovdqa %%xmm15, 240(%[X])\n\t"
       "\n"
   "L_enc128_loop:\n\t"
       "vmovdqa %%xmm11, 48(%[x])\n\t"
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       QUARTERROUND_XMM()
       QUARTERROUND_XMM_2()
       "vmovdqa 48(%[x]), %%xmm11\n\t"

       "vpaddd     (%[X]), %%xmm0,  %%xmm0\n\t"
       "vpaddd   16(%[X]), %%xmm1,  %%xmm1\n\t"
       "vpaddd   32(%[X]), %%xmm2,  %%xmm2\n\t"
       "vpaddd   48(%[X]), %%xmm3,  %%xmm3\n\t"
       "vpaddd   64(%[X]), %%xmm4,  %%xmm4\n\t"
       "vpaddd   80(%[X]), %%xmm5,  %%xmm5\n\t"
       "vpaddd   96(%[X]), %%xmm6,  %%xmm6\n\t"
       "vpaddd  112(%[X]), %%xmm7,  %%xmm7\n\t"
       "vpaddd  128(%[X]), %%xmm8,  %%xmm8\n\t"
       "vpaddd  144(%[X]), %%xmm9,  %%xmm9\n\t"
       "vpaddd  160(%[X]), %%xmm10, %%xmm10\n\t"
       "vpaddd  176(%[X]), %%xmm11, %%xmm11\n\t"
       "vpaddd  192(%[X]), %%xmm12, %%xmm12\n\t"
       "vpaddd  208(%[X]), %%xmm13, %%xmm13\n\t"
       "vpaddd  224(%[X]), %%xmm14, %%xmm14\n\t"
       "vpaddd  240(%[X]), %%xmm15, %%xmm15\n\t"

       "vmovdqa %%xmm8,     (%[x])\n\t"
       "vmovdqa %%xmm9,   16(%[x])\n\t"
       "vmovdqa %%xmm10,  32(%[x])\n\t"
       "vmovdqa %%xmm11,  48(%[x])\n\t"
       "vmovdqa %%xmm12,  64(%[x])\n\t"
       "vmovdqa %%xmm13,  80(%[x])\n\t"
       "vmovdqa %%xmm14,  96(%[x])\n\t"
       "vmovdqa %%xmm15, 112(%[x])\n\t"

       "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t"
       "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t"
       "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t"
       "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t"
       "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t"
       "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t"
       "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t"
       "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t"
       "vpunpcklqdq %%xmm9,  %%xmm8,  %%xmm0\n\t"
       "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t"
       "vpunpckhqdq %%xmm9,  %%xmm8,  %%xmm2\n\t"
       "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t"
       "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t"
       "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t"
       "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t"
       "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t"
       "vmovdqu    (%[in]), %%xmm8\n\t"
       "vmovdqu  16(%[in]), %%xmm9\n\t"
       "vmovdqu  64(%[in]), %%xmm10\n\t"
       "vmovdqu  80(%[in]), %%xmm11\n\t"
       "vmovdqu 128(%[in]), %%xmm12\n\t"
       "vmovdqu 144(%[in]), %%xmm13\n\t"
       "vmovdqu 192(%[in]), %%xmm14\n\t"
       "vmovdqu 208(%[in]), %%xmm15\n\t"
       "vpxor   %%xmm8,  %%xmm0, %%xmm0\n\t"
       "vpxor   %%xmm9,  %%xmm1, %%xmm1\n\t"
       "vpxor   %%xmm10, %%xmm2, %%xmm2\n\t"
       "vpxor   %%xmm11, %%xmm3, %%xmm3\n\t"
       "vpxor   %%xmm12, %%xmm4, %%xmm4\n\t"
       "vpxor   %%xmm13, %%xmm5, %%xmm5\n\t"
       "vpxor   %%xmm14, %%xmm6, %%xmm6\n\t"
       "vpxor   %%xmm15, %%xmm7, %%xmm7\n\t"
       "vmovdqu %%xmm0,    (%[out])\n\t"
       "vmovdqu %%xmm1,  16(%[out])\n\t"
       "vmovdqu %%xmm2,  64(%[out])\n\t"
       "vmovdqu %%xmm3,  80(%[out])\n\t"
       "vmovdqu %%xmm4, 128(%[out])\n\t"
       "vmovdqu %%xmm5, 144(%[out])\n\t"
       "vmovdqu %%xmm6, 192(%[out])\n\t"
       "vmovdqu %%xmm7, 208(%[out])\n\t"

       "vmovdqa    (%[x]), %%xmm0\n\t"
       "vmovdqa  16(%[x]), %%xmm1\n\t"
       "vmovdqa  32(%[x]), %%xmm2\n\t"
       "vmovdqa  48(%[x]), %%xmm3\n\t"
       "vmovdqa  64(%[x]), %%xmm4\n\t"
       "vmovdqa  80(%[x]), %%xmm5\n\t"
       "vmovdqa  96(%[x]), %%xmm6\n\t"
       "vmovdqa 112(%[x]), %%xmm7\n\t"

       "vpunpckldq %%xmm1, %%xmm0, %%xmm8\n\t"
       "vpunpckldq %%xmm3, %%xmm2, %%xmm9\n\t"
       "vpunpckhdq %%xmm1, %%xmm0, %%xmm12\n\t"
       "vpunpckhdq %%xmm3, %%xmm2, %%xmm13\n\t"
       "vpunpckldq %%xmm5, %%xmm4, %%xmm10\n\t"
       "vpunpckldq %%xmm7, %%xmm6, %%xmm11\n\t"
       "vpunpckhdq %%xmm5, %%xmm4, %%xmm14\n\t"
       "vpunpckhdq %%xmm7, %%xmm6, %%xmm15\n\t"
       "vpunpcklqdq %%xmm9,  %%xmm8,  %%xmm0\n\t"
       "vpunpcklqdq %%xmm11, %%xmm10, %%xmm1\n\t"
       "vpunpckhqdq %%xmm9,  %%xmm8,  %%xmm2\n\t"
       "vpunpckhqdq %%xmm11, %%xmm10, %%xmm3\n\t"
       "vpunpcklqdq %%xmm13, %%xmm12, %%xmm4\n\t"
       "vpunpcklqdq %%xmm15, %%xmm14, %%xmm5\n\t"
       "vpunpckhqdq %%xmm13, %%xmm12, %%xmm6\n\t"
       "vpunpckhqdq %%xmm15, %%xmm14, %%xmm7\n\t"
       "vmovdqu  32(%[in]), %%xmm8\n\t"
       "vmovdqu  48(%[in]), %%xmm9\n\t"
       "vmovdqu  96(%[in]), %%xmm10\n\t"
       "vmovdqu 112(%[in]), %%xmm11\n\t"
       "vmovdqu 160(%[in]), %%xmm12\n\t"
       "vmovdqu 176(%[in]), %%xmm13\n\t"
       "vmovdqu 224(%[in]), %%xmm14\n\t"
       "vmovdqu 240(%[in]), %%xmm15\n\t"
       "vpxor   %%xmm8,  %%xmm0, %%xmm0\n\t"
       "vpxor   %%xmm9,  %%xmm1, %%xmm1\n\t"
       "vpxor   %%xmm10, %%xmm2, %%xmm2\n\t"
       "vpxor   %%xmm11, %%xmm3, %%xmm3\n\t"
       "vpxor   %%xmm12, %%xmm4, %%xmm4\n\t"
       "vpxor   %%xmm13, %%xmm5, %%xmm5\n\t"
       "vpxor   %%xmm14, %%xmm6, %%xmm6\n\t"
       "vpxor   %%xmm15, %%xmm7, %%xmm7\n\t"
       "vmovdqu %%xmm0,  32(%[out])\n\t"
       "vmovdqu %%xmm1,  48(%[out])\n\t"
       "vmovdqu %%xmm2,  96(%[out])\n\t"
       "vmovdqu %%xmm3, 112(%[out])\n\t"
       "vmovdqu %%xmm4, 160(%[out])\n\t"
       "vmovdqu %%xmm5, 176(%[out])\n\t"
       "vmovdqu %%xmm6, 224(%[out])\n\t"
       "vmovdqu %%xmm7, 240(%[out])\n\t"

       "vmovdqa 192(%[X]), %%xmm12\n\t"
       "add $256, %[in]\n\t"
       "add $256, %[out]\n\t"
       "vpaddd  %[four], %%xmm12, %%xmm12\n\t"
       "sub $256, %[bytes]\n\t"
       "vmovdqa %%xmm12, 192(%[X])\n\t"
       "cmp $256, %[bytes]\n\t"
       "jl  L_done\n\t"

       "vmovdqa    (%[X]), %%xmm0\n\t"
       "vmovdqa  16(%[X]), %%xmm1\n\t"
       "vmovdqa  32(%[X]), %%xmm2\n\t"
       "vmovdqa  48(%[X]), %%xmm3\n\t"
       "vmovdqa  64(%[X]), %%xmm4\n\t"
       "vmovdqa  80(%[X]), %%xmm5\n\t"
       "vmovdqa  96(%[X]), %%xmm6\n\t"
       "vmovdqa 112(%[X]), %%xmm7\n\t"
       "vmovdqa 128(%[X]), %%xmm8\n\t"
       "vmovdqa 144(%[X]), %%xmm9\n\t"
       "vmovdqa 160(%[X]), %%xmm10\n\t"
       "vmovdqa 176(%[X]), %%xmm11\n\t"
       "vmovdqa 192(%[X]), %%xmm12\n\t"
       "vmovdqa 208(%[X]), %%xmm13\n\t"
       "vmovdqa 224(%[X]), %%xmm14\n\t"
       "vmovdqa 240(%[X]), %%xmm15\n\t"
       "jmp L_enc128_loop\n\t"

       "\n"
   "L_done:\n\t"

       "shl $2, %[cnt]\n\t"
       "add 48(%[key]), %[cnt]\n\t"
       "movl    %[cnt], 48(%[key])\n\t"
       "\n"
   "L_end128:\n\t"
       : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
         [in] "+r" (m), [out] "+r" (c)
       : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
         [add] "xrm" (add), [four] "xrm" (four),
         [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)
       : "xmm0", "xmm1", "xmm2", "xmm3",
         "xmm4", "xmm5", "xmm6", "xmm7",
         "xmm8", "xmm9", "xmm10", "xmm11",
         "xmm12", "xmm13", "xmm14", "xmm15", "memory"
    );

    for (; bytes >= CHACHA_CHUNK_BYTES;) {
        CHACHA_CHUNK_AVX();
        bytes -= CHACHA_CHUNK_BYTES;
        c += CHACHA_CHUNK_BYTES;
        m += CHACHA_CHUNK_BYTES;
    }
    if (bytes > 0) {
        CHACHA_PARTIAL_CHUNK_AVX();
    }
}
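
/*
 * Note on the 256-byte loop above: vpshufd $0 broadcasts each state word
 * into all four lanes of a register, so the sixteen xmm registers hold
 * four ChaCha states column-wise. 'add' = {0,1,2,3} staggers the four
 * block counters and 'four' advances them each iteration; the vpunpck
 * sequences then transpose the lanes back into four consecutive 64-byte
 * blocks before XORing with the message.
 */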
01034 #endif /* HAVE_INTEL_AVX1 */
01035 
01036 #ifdef HAVE_INTEL_AVX2
01037 #define QUARTERROUND_2_AVX2()                          \
01038         "vpaddd     %%xmm1, %%xmm0, %%xmm0\n\t"    \
01039         "vpxor      %%xmm0, %%xmm3, %%xmm3\n\t"    \
01040         "vpshufb    %[rotl16], %%xmm3, %%xmm3\n\t" \
01041         "vpaddd     %%xmm3, %%xmm2, %%xmm2\n\t"    \
01042         "vpxor      %%xmm2, %%xmm1, %%xmm1\n\t"    \
01043         "vpsrld     $20, %%xmm1, %%xmm4\n\t"       \
01044         "vpslld     $12, %%xmm1, %%xmm1\n\t"       \
01045         "vpxor      %%xmm4, %%xmm1, %%xmm1\n\t"    \
01046         "vpaddd     %%xmm1, %%xmm0, %%xmm0\n\t"    \
01047         "vpxor      %%xmm0, %%xmm3, %%xmm3\n\t"    \
01048         "vpshufb    %[rotl8], %%xmm3, %%xmm3\n\t"  \
01049         "vpaddd     %%xmm3, %%xmm2, %%xmm2\n\t"    \
01050         "vpxor      %%xmm2, %%xmm1, %%xmm1\n\t"    \
01051         "vpsrld     $25, %%xmm1, %%xmm4\n\t"       \
01052         "vpslld     $7, %%xmm1, %%xmm1\n\t"        \
01053         "vpxor      %%xmm4, %%xmm1, %%xmm1\n\t"    \
01054         "# Swap words for next round\n\t"              \
01055         "vpshufd    $0x39, %%xmm1, %%xmm1\n\t"     \
01056         "vpshufd    $0x4e, %%xmm2, %%xmm2\n\t"     \
01057         "vpshufd    $0x93, %%xmm3, %%xmm3\n\t"     \
01058         "vpaddd     %%xmm1, %%xmm0, %%xmm0\n\t"    \
01059         "vpxor      %%xmm0, %%xmm3, %%xmm3\n\t"    \
01060         "vpshufb    %[rotl16], %%xmm3, %%xmm3\n\t" \
01061         "vpaddd     %%xmm3, %%xmm2, %%xmm2\n\t"    \
01062         "vpxor      %%xmm2, %%xmm1, %%xmm1\n\t"    \
01063         "vpsrld     $20, %%xmm1, %%xmm4\n\t"       \
01064         "vpslld     $12, %%xmm1, %%xmm1\n\t"       \
01065         "vpxor      %%xmm4, %%xmm1, %%xmm1\n\t"    \
01066         "vpaddd     %%xmm1, %%xmm0, %%xmm0\n\t"    \
01067         "vpxor      %%xmm0, %%xmm3, %%xmm3\n\t"    \
01068         "vpshufb    %[rotl8], %%xmm3, %%xmm3\n\t"  \
01069         "vpaddd     %%xmm3, %%xmm2, %%xmm2\n\t"    \
01070         "vpxor      %%xmm2, %%xmm1, %%xmm1\n\t"    \
01071         "vpsrld     $25, %%Xmm1, %%xmm4\n\t"       \
01072         "vpslld     $7, %%xmm1, %%xmm1\n\t"        \
01073         "vpxor      %%xmm4, %%xmm1, %%xmm1\n\t"    \
01074         "# Swap words back\n\t"                        \
01075         "vpshufd    $0x93, %%xmm1, %%xmm1\n\t"     \
01076         "vpshufd    $0x4e, %%xmm2, %%xmm2\n\t"     \
01077         "vpshufd    $0x39, %%xmm3, %%xmm3\n\t"     \
01078 
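
/* QUARTERROUND_2_AVX2() performs one ChaCha double round (column round,
 * lane rotation, diagonal round, inverse rotation) on a single 64-byte
 * block held as four row vectors in xmm0..xmm3.  The 16- and 8-bit
 * rotations are byte-aligned, so a single vpshufb with a permutation
 * mask does each; the 12- and 7-bit rotations use two shifts plus an
 * XOR.  Illustrative scalar form of such a rotate (not part of the
 * build; v and l are placeholder names):
 *
 *     word32 hi = v << l;
 *     word32 lo = v >> (32 - l);
 *     v = hi ^ lo;    // the halves do not overlap, so XOR acts as OR
 */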
01079 #define CHACHA_CRYPT_AVX2()                                                    \
01080         "vmovdqu     0(%[input]), %%xmm8\n\t"                              \
01081         "vmovdqu    16(%[input]), %%xmm9\n\t"                              \
01082         "vmovdqu    32(%[input]), %%xmm10\n\t"                             \
01083         "vmovdqu    48(%[input]), %%xmm11\n\t"                             \
01084         "vmovdqu    %%xmm8, %%xmm0\n\t"                                    \
01085         "vmovdqu    %%xmm9, %%xmm1\n\t"                                    \
01086         "vmovdqu    %%xmm10, %%xmm2\n\t"                                   \
01087         "vmovdqu    %%xmm11, %%xmm3\n\t"                                   \
01088         "movb       $10, %%al\n\t"                                         \
01089         "\n"                                                                   \
01090         "1:\n\t"                                                               \
01091         QUARTERROUND_2_AVX2()                                                  \
01092         "decb       %%al\n\t"                                              \
01093         "jnz        1b\n\t"                                                \
01094         "vpaddd     %%xmm8, %%xmm0, %%xmm0\n\t"                            \
01095         "vpaddd     %%xmm9, %%xmm1, %%xmm1\n\t"                            \
01096         "vpaddd     %%xmm10, %%xmm2, %%xmm2\n\t"                           \
01097         "vpaddd     %%xmm11, %%xmm3, %%xmm3\n\t"                           \
01098 
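/* CHACHA_CRYPT_AVX2() produces one 64-byte keystream block: the 16-word
 * state is loaded from ctx->X into xmm8..xmm11 and copied to the working
 * registers, 10 double rounds are run (20 rounds, matching ROUNDS), and
 * the saved input state is added back in, i.e. the scalar step
 *
 *     for (i = 0; i < 16; i++)
 *         x[i] = PLUS(x[i], input[i]);
 *
 * done four words per vector instruction. */
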
01099 #define CHACHA_PARTIAL_CHUNK_AVX2()                                            \
01100     __asm__ __volatile__ (                                                     \
01101         CHACHA_CRYPT_AVX2()                                                    \
01102         "vmovdqu    %%xmm0,  0(%[c])\n\t"                                  \
01103         "vmovdqu    %%xmm1, 16(%[c])\n\t"                                  \
01104         "vmovdqu    %%xmm2, 32(%[c])\n\t"                                  \
01105         "vmovdqu    %%xmm3, 48(%[c])\n\t"                                  \
01106         "addl       $1, 48(%[input])\n\t"                                  \
01107         "movl       %[bytes], %%r8d\n\t"                                   \
01108         "xorq       %%rdx, %%rdx\n\t"                                      \
01109         "movl       %%r8d, %%r9d\n\t"                                      \
01110         "andl       $7, %%r9d\n\t"                                         \
01111         "jz     4f\n\t"                                                \
01112         "\n"                                                                   \
01113         "2:\n\t"                                                               \
01114         "movzbl     (%[c],%%rdx,1), %%ecx\n\t"                             \
01115         "xorb       (%[m],%%rdx,1), %%cl\n\t"                              \
01116         "movb       %%cl, (%[output],%%rdx,1)\n\t"                         \
01117         "incl       %%edx\n\t"                                             \
01118         "cmpl       %%r9d, %%edx\n\t"                                      \
01119         "jne        2b\n\t"                                                \
01120         "je     3f\n\t"                                                \
01121         "\n"                                                                   \
01122         "4:\n\t"                                                               \
01123         "movq       (%[c],%%rdx,1), %%rcx\n\t"                             \
01124         "xorq       (%[m],%%rdx,1), %%rcx\n\t"                             \
01125         "movq       %%rcx, (%[output],%%rdx,1)\n\t"                        \
01126         "addl       $8, %%edx\n\t"                                         \
01127         "\n"                                                                   \
01128         "3:\n\t"                                                               \
01129         "cmpl       %%r8d, %%edx\n\t"                                      \
01130         "jne        4b\n\t"                                                \
01131         :                                                                      \
01132         : [input] "r" (ctx->X), [c] "r" (x),                                   \
01133           [output] "r" (c), [bytes] "r" (bytes), [m] "r" (m),                  \
01134           [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)                       \
01135         : "eax", "ecx", "edx", "r8", "r9", "memory",                           \
01136           "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",      \
01137           "xmm8", "xmm9", "xmm10", "xmm11"                                     \
01138     )
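
/* CHACHA_PARTIAL_CHUNK_AVX2() handles a trailing block of fewer than 64
 * bytes.  The keystream is written to the aligned scratch buffer, the
 * block counter at byte offset 48 of the state is incremented, and the
 * message is XORed out in two passes: first bytes % 8 single bytes, then
 * whole 8-byte words.  Rough C equivalent of the tail (illustrative
 * sketch; stream, load64 and store64 are placeholder names):
 *
 *     for (i = 0; i < (bytes & 7); i++)
 *         output[i] = m[i] ^ stream[i];
 *     for (; i < bytes; i += 8)
 *         store64(output + i, load64(m + i) ^ load64(stream + i));
 */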
01139 
01140 
01141 #define CHACHA_CHUNK_AVX2()                                                    \
01142     __asm__ __volatile__ (                                                     \
01143         CHACHA_CRYPT_AVX2()                                                    \
01144         "vmovdqu     0(%[m]), %%xmm4\n\t"                                  \
01145         "vmovdqu    16(%[m]), %%xmm5\n\t"                                  \
01146         "vmovdqu    32(%[m]), %%xmm6\n\t"                                  \
01147         "vmovdqu    48(%[m]), %%xmm7\n\t"                                  \
01148         "vpxor      %%xmm4, %%xmm0, %%xmm0\n\t"                            \
01149         "vpxor      %%xmm5, %%xmm1, %%xmm1\n\t"                            \
01150         "vpxor      %%xmm6, %%xmm2, %%xmm2\n\t"                            \
01151         "vpxor      %%xmm7, %%xmm3, %%xmm3\n\t"                            \
01152         "vmovdqu    %%xmm0,  0(%[c])\n\t"                                  \
01153         "vmovdqu    %%xmm1, 16(%[c])\n\t"                                  \
01154         "vmovdqu    %%xmm2, 32(%[c])\n\t"                                  \
01155         "vmovdqu    %%xmm3, 48(%[c])\n\t"                                  \
01156         "addl       $1, 48(%[input])\n\t"                                  \
01157         :                                                                      \
01158         : [input] "r" (ctx->X), [c] "r" (c), [m] "r" (m),                      \
01159           [rotl8] "xrm" (rotl8), [rotl16] "xrm" (rotl16)                       \
01160         : "rax", "memory",                                                     \
01161           "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",      \
01162           "xmm8", "xmm9", "xmm10", "xmm11"                                     \
01163     )
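
/* CHACHA_CHUNK_AVX2() is the full-block variant: with at least 64 bytes
 * left, the message is loaded, XORed against the keystream registers and
 * stored straight to the output with no intermediate buffer, and the
 * block counter at byte offset 48 is incremented. */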
01164 
01165 
01166 static void chacha_encrypt_avx2(ChaCha* ctx, const byte* m, byte* c,
01167                                  word32 bytes)
01168 {
01169     ALIGN256 word32 X[8*CHACHA_CHUNK_WORDS]; /* 8-lane state; kept 256-bit aligned */
01170     ALIGN256 word32 x[4*CHACHA_CHUNK_WORDS]; /* aligned scratch for half the state */
01171     word32 cnt = 0;
01172     static const __m256i add    = { 0x0000000100000000UL,0x0000000300000002UL,
01173                                     0x0000000500000004UL,0x0000000700000006UL };
01174     static const __m256i eight  = { 0x0000000800000008UL,0x0000000800000008UL,
01175                                     0x0000000800000008UL,0x0000000800000008UL };
01176     static const __m256i rotl8_256  =
01177                                   { 0x0605040702010003UL,0x0e0d0c0f0a09080bUL,
01178                                     0x0605040702010003UL,0x0e0d0c0f0a09080bUL };
01179     static const __m256i rotl16_256 =
01180                                   { 0x0504070601000302UL,0x0d0c0f0e09080b0aUL,
01181                                     0x0504070601000302UL,0x0d0c0f0e09080b0aUL };
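    /* add holds the per-lane counter offsets 0..7; eight advances all
     * eight lane counters after each 512-byte iteration; the rotl8_256/
     * rotl16_256 masks feed vpshufb to rotate each 32-bit lane left by
     * 8 and 16 bits. */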
01182 
01183     if (bytes == 0)
01184         return;
01185 
01186     __asm__ __volatile__ (
01187        "movl    %[bytes], %[cnt]\n\t"
01188        "shrl    $9, %[cnt]\n\t"
01189        "jz      L_end256\n\t"
01190 
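       "# Expand the state to structure-of-arrays form: each ymm holds one\n\t"
       "# state word replicated across 8 lanes (8 blocks); adding [add]\n\t"
       "# gives the lanes distinct block counters n..n+7\n\t"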
01191        "vpbroadcastd      (%[key]), %%ymm0\n\t"
01192        "vpbroadcastd     4(%[key]), %%ymm1\n\t"
01193        "vpbroadcastd     8(%[key]), %%ymm2\n\t"
01194        "vpbroadcastd    12(%[key]), %%ymm3\n\t"
01195        "vpbroadcastd    16(%[key]), %%ymm4\n\t"
01196        "vpbroadcastd    20(%[key]), %%ymm5\n\t"
01197        "vpbroadcastd    24(%[key]), %%ymm6\n\t"
01198        "vpbroadcastd    28(%[key]), %%ymm7\n\t"
01199        "vpbroadcastd    32(%[key]), %%ymm8\n\t"
01200        "vpbroadcastd    36(%[key]), %%ymm9\n\t"
01201        "vpbroadcastd    40(%[key]), %%ymm10\n\t"
01202        "vpbroadcastd    44(%[key]), %%ymm11\n\t"
01203        "vpbroadcastd    48(%[key]), %%ymm12\n\t"
01204        "vpbroadcastd    52(%[key]), %%ymm13\n\t"
01205        "vpbroadcastd    56(%[key]), %%ymm14\n\t"
01206        "vpbroadcastd    60(%[key]), %%ymm15\n\t"
01207 
01208        "vpaddd  %[add], %%ymm12, %%ymm12\n\t"
01209 
01210        "vmovdqa %%ymm0,     (%[X])\n\t"
01211        "vmovdqa %%ymm1,   32(%[X])\n\t"
01212        "vmovdqa %%ymm2,   64(%[X])\n\t"
01213        "vmovdqa %%ymm3,   96(%[X])\n\t"
01214        "vmovdqa %%ymm4,  128(%[X])\n\t"
01215        "vmovdqa %%ymm5,  160(%[X])\n\t"
01216        "vmovdqa %%ymm6,  192(%[X])\n\t"
01217        "vmovdqa %%ymm7,  224(%[X])\n\t"
01218        "vmovdqa %%ymm8,  256(%[X])\n\t"
01219        "vmovdqa %%ymm9,  288(%[X])\n\t"
01220        "vmovdqa %%ymm10, 320(%[X])\n\t"
01221        "vmovdqa %%ymm11, 352(%[X])\n\t"
01222        "vmovdqa %%ymm12, 384(%[X])\n\t"
01223        "vmovdqa %%ymm13, 416(%[X])\n\t"
01224        "vmovdqa %%ymm14, 448(%[X])\n\t"
01225        "vmovdqa %%ymm15, 480(%[X])\n\t"
01226        "\n"
01227    "L_enc256_loop:\n\t"
01228        "vmovdqa %%ymm11, 96(%[x])\n\t"
01229        QUARTERROUND_YMM()
01230        QUARTERROUND_YMM_2()
01231        QUARTERROUND_YMM()
01232        QUARTERROUND_YMM_2()
01233        QUARTERROUND_YMM()
01234        QUARTERROUND_YMM_2()
01235        QUARTERROUND_YMM()
01236        QUARTERROUND_YMM_2()
01237        QUARTERROUND_YMM()
01238        QUARTERROUND_YMM_2()
01239        QUARTERROUND_YMM()
01240        QUARTERROUND_YMM_2()
01241        QUARTERROUND_YMM()
01242        QUARTERROUND_YMM_2()
01243        QUARTERROUND_YMM()
01244        QUARTERROUND_YMM_2()
01245        QUARTERROUND_YMM()
01246        QUARTERROUND_YMM_2()
01247        QUARTERROUND_YMM()
01248        QUARTERROUND_YMM_2()
01249        "vmovdqa 96(%[x]), %%ymm11\n\t"
01250 
01251        "vpaddd     (%[X]), %%ymm0,  %%ymm0\n\t"
01252        "vpaddd   32(%[X]), %%ymm1,  %%ymm1\n\t"
01253        "vpaddd   64(%[X]), %%ymm2,  %%ymm2\n\t"
01254        "vpaddd   96(%[X]), %%ymm3,  %%ymm3\n\t"
01255        "vpaddd  128(%[X]), %%ymm4,  %%ymm4\n\t"
01256        "vpaddd  160(%[X]), %%ymm5,  %%ymm5\n\t"
01257        "vpaddd  192(%[X]), %%ymm6,  %%ymm6\n\t"
01258        "vpaddd  224(%[X]), %%ymm7,  %%ymm7\n\t"
01259        "vpaddd  256(%[X]), %%ymm8,  %%ymm8\n\t"
01260        "vpaddd  288(%[X]), %%ymm9,  %%ymm9\n\t"
01261        "vpaddd  320(%[X]), %%ymm10, %%ymm10\n\t"
01262        "vpaddd  352(%[X]), %%ymm11, %%ymm11\n\t"
01263        "vpaddd  384(%[X]), %%ymm12, %%ymm12\n\t"
01264        "vpaddd  416(%[X]), %%ymm13, %%ymm13\n\t"
01265        "vpaddd  448(%[X]), %%ymm14, %%ymm14\n\t"
01266        "vpaddd  480(%[X]), %%ymm15, %%ymm15\n\t"
01267 
01268        "vmovdqa %%ymm8,     (%[x])\n\t"
01269        "vmovdqa %%ymm9,   32(%[x])\n\t"
01270        "vmovdqa %%ymm10,  64(%[x])\n\t"
01271        "vmovdqa %%ymm11,  96(%[x])\n\t"
01272        "vmovdqa %%ymm12, 128(%[x])\n\t"
01273        "vmovdqa %%ymm13, 160(%[x])\n\t"
01274        "vmovdqa %%ymm14, 192(%[x])\n\t"
01275        "vmovdqa %%ymm15, 224(%[x])\n\t"
01276 
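       "# Transpose state words 0..7 from lane order back to block order:\n\t"
       "# dword/qword unpacks plus vperm2i128 rebuild the first 32 bytes\n\t"
       "# of each of the 8 blocks (stored below at a 64-byte stride)\n\t"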
01277        "vpunpckldq  %%ymm1, %%ymm0, %%ymm8\n\t"
01278        "vpunpckldq  %%ymm3, %%ymm2, %%ymm9\n\t"
01279        "vpunpckhdq  %%ymm1, %%ymm0, %%ymm12\n\t"
01280        "vpunpckhdq  %%ymm3, %%ymm2, %%ymm13\n\t"
01281        "vpunpckldq  %%ymm5, %%ymm4, %%ymm10\n\t"
01282        "vpunpckldq  %%ymm7, %%ymm6, %%ymm11\n\t"
01283        "vpunpckhdq  %%ymm5, %%ymm4, %%ymm14\n\t"
01284        "vpunpckhdq  %%ymm7, %%ymm6, %%ymm15\n\t"
01285        "vpunpcklqdq %%ymm9,  %%ymm8,  %%ymm0\n\t"
01286        "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t"
01287        "vpunpckhqdq %%ymm9,  %%ymm8,  %%ymm2\n\t"
01288        "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t"
01289        "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t"
01290        "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t"
01291        "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t"
01292        "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t"
01293        "vperm2i128  $0x20, %%ymm1, %%ymm0, %%ymm8\n\t"
01294        "vperm2i128  $0x20, %%ymm3, %%ymm2, %%ymm9\n\t"
01295        "vperm2i128  $0x31, %%ymm1, %%ymm0, %%ymm12\n\t"
01296        "vperm2i128  $0x31, %%ymm3, %%ymm2, %%ymm13\n\t"
01297        "vperm2i128  $0x20, %%ymm5, %%ymm4, %%ymm10\n\t"
01298        "vperm2i128  $0x20, %%ymm7, %%ymm6, %%ymm11\n\t"
01299        "vperm2i128  $0x31, %%ymm5, %%ymm4, %%ymm14\n\t"
01300        "vperm2i128  $0x31, %%ymm7, %%ymm6, %%ymm15\n\t"
01301 
01302        "vmovdqu    (%[in]), %%ymm0\n\t"
01303        "vmovdqu  64(%[in]), %%ymm1\n\t"
01304        "vmovdqu 128(%[in]), %%ymm2\n\t"
01305        "vmovdqu 192(%[in]), %%ymm3\n\t"
01306        "vmovdqu 256(%[in]), %%ymm4\n\t"
01307        "vmovdqu 320(%[in]), %%ymm5\n\t"
01308        "vmovdqu 384(%[in]), %%ymm6\n\t"
01309        "vmovdqu 448(%[in]), %%ymm7\n\t"
01310        "vpxor   %%ymm0, %%ymm8,  %%ymm8\n\t"
01311        "vpxor   %%ymm1, %%ymm9,  %%ymm9\n\t"
01312        "vpxor   %%ymm2, %%ymm10, %%ymm10\n\t"
01313        "vpxor   %%ymm3, %%ymm11, %%ymm11\n\t"
01314        "vpxor   %%ymm4, %%ymm12, %%ymm12\n\t"
01315        "vpxor   %%ymm5, %%ymm13, %%ymm13\n\t"
01316        "vpxor   %%ymm6, %%ymm14, %%ymm14\n\t"
01317        "vpxor   %%ymm7, %%ymm15, %%ymm15\n\t"
01318        "vmovdqu %%ymm8,     (%[out])\n\t"
01319        "vmovdqu %%ymm9,   64(%[out])\n\t"
01320        "vmovdqu %%ymm10, 128(%[out])\n\t"
01321        "vmovdqu %%ymm11, 192(%[out])\n\t"
01322        "vmovdqu %%ymm12, 256(%[out])\n\t"
01323        "vmovdqu %%ymm13, 320(%[out])\n\t"
01324        "vmovdqu %%ymm14, 384(%[out])\n\t"
01325        "vmovdqu %%ymm15, 448(%[out])\n\t"
01326 
01327        "vmovdqa    (%[x]), %%ymm0\n\t"
01328        "vmovdqa  32(%[x]), %%ymm1\n\t"
01329        "vmovdqa  64(%[x]), %%ymm2\n\t"
01330        "vmovdqa  96(%[x]), %%ymm3\n\t"
01331        "vmovdqa 128(%[x]), %%ymm4\n\t"
01332        "vmovdqa 160(%[x]), %%ymm5\n\t"
01333        "vmovdqa 192(%[x]), %%ymm6\n\t"
01334        "vmovdqa 224(%[x]), %%ymm7\n\t"
01335 
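       "# Same transpose for state words 8..15, reloaded from the scratch\n\t"
       "# buffer: these become the second 32 bytes of each block\n\t"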
01336        "vpunpckldq  %%ymm1, %%ymm0, %%ymm8\n\t"
01337        "vpunpckldq  %%ymm3, %%ymm2, %%ymm9\n\t"
01338        "vpunpckhdq  %%ymm1, %%ymm0, %%ymm12\n\t"
01339        "vpunpckhdq  %%ymm3, %%ymm2, %%ymm13\n\t"
01340        "vpunpckldq  %%ymm5, %%ymm4, %%ymm10\n\t"
01341        "vpunpckldq  %%ymm7, %%ymm6, %%ymm11\n\t"
01342        "vpunpckhdq  %%ymm5, %%ymm4, %%ymm14\n\t"
01343        "vpunpckhdq  %%ymm7, %%ymm6, %%ymm15\n\t"
01344        "vpunpcklqdq %%ymm9,  %%ymm8,  %%ymm0\n\t"
01345        "vpunpcklqdq %%ymm11, %%ymm10, %%ymm1\n\t"
01346        "vpunpckhqdq %%ymm9 , %%ymm8,  %%ymm2\n\t"
01347        "vpunpckhqdq %%ymm11, %%ymm10, %%ymm3\n\t"
01348        "vpunpcklqdq %%ymm13, %%ymm12, %%ymm4\n\t"
01349        "vpunpcklqdq %%ymm15, %%ymm14, %%ymm5\n\t"
01350        "vpunpckhqdq %%ymm13, %%ymm12, %%ymm6\n\t"
01351        "vpunpckhqdq %%ymm15, %%ymm14, %%ymm7\n\t"
01352        "vperm2i128  $0x20, %%ymm1, %%ymm0, %%ymm8\n\t"
01353        "vperm2i128  $0x20, %%ymm3, %%ymm2, %%ymm9\n\t"
01354        "vperm2i128  $0x31, %%ymm1, %%ymm0, %%ymm12\n\t"
01355        "vperm2i128  $0x31, %%ymm3, %%ymm2, %%ymm13\n\t"
01356        "vperm2i128  $0x20, %%ymm5, %%ymm4, %%ymm10\n\t"
01357        "vperm2i128  $0x20, %%ymm7, %%ymm6, %%ymm11\n\t"
01358        "vperm2i128  $0x31, %%ymm5, %%ymm4, %%ymm14\n\t"
01359        "vperm2i128  $0x31, %%ymm7, %%ymm6, %%ymm15\n\t"
01360 
01361        "vmovdqu  32(%[in]), %%ymm0\n\t"
01362        "vmovdqu  96(%[in]), %%ymm1\n\t"
01363        "vmovdqu 160(%[in]), %%ymm2\n\t"
01364        "vmovdqu 224(%[in]), %%ymm3\n\t"
01365        "vmovdqu 288(%[in]), %%ymm4\n\t"
01366        "vmovdqu 352(%[in]), %%ymm5\n\t"
01367        "vmovdqu 416(%[in]), %%ymm6\n\t"
01368        "vmovdqu 480(%[in]), %%ymm7\n\t"
01369        "vpxor   %%ymm0, %%ymm8,  %%ymm8\n\t"
01370        "vpxor   %%ymm1, %%ymm9,  %%ymm9\n\t"
01371        "vpxor   %%ymm2, %%ymm10, %%ymm10\n\t"
01372        "vpxor   %%ymm3, %%ymm11, %%ymm11\n\t"
01373        "vpxor   %%ymm4, %%ymm12, %%ymm12\n\t"
01374        "vpxor   %%ymm5, %%ymm13, %%ymm13\n\t"
01375        "vpxor   %%ymm6, %%ymm14, %%ymm14\n\t"
01376        "vpxor   %%ymm7, %%ymm15, %%ymm15\n\t"
01377        "vmovdqu %%ymm8,   32(%[out])\n\t"
01378        "vmovdqu %%ymm9,   96(%[out])\n\t"
01379        "vmovdqu %%ymm10, 160(%[out])\n\t"
01380        "vmovdqu %%ymm11, 224(%[out])\n\t"
01381        "vmovdqu %%ymm12, 288(%[out])\n\t"
01382        "vmovdqu %%ymm13, 352(%[out])\n\t"
01383        "vmovdqu %%ymm14, 416(%[out])\n\t"
01384        "vmovdqu %%ymm15, 480(%[out])\n\t"
01385 
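       "# Eight more blocks done: advance pointers, bump the lane counters\n\t"
       "# by 8, and loop while a full 512 bytes of input remain\n\t"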
01386        "vmovdqa 384(%[X]), %%ymm12\n\t"
01387        "add $512, %[in]\n\t"
01388        "add $512, %[out]\n\t"
01389        "vpaddd  %[eight], %%ymm12, %%ymm12\n\t"
01390        "sub $512, %[bytes]\n\t"
01391        "vmovdqa %%ymm12, 384(%[X])\n\t"
01392        "cmp $512, %[bytes]\n\t"
01393        "jl  L_done256\n\t"
01394 
01395        "vmovdqa    (%[X]), %%ymm0\n\t"
01396        "vmovdqa  32(%[X]), %%ymm1\n\t"
01397        "vmovdqa  64(%[X]), %%ymm2\n\t"
01398        "vmovdqa  96(%[X]), %%ymm3\n\t"
01399        "vmovdqa 128(%[X]), %%ymm4\n\t"
01400        "vmovdqa 160(%[X]), %%ymm5\n\t"
01401        "vmovdqa 192(%[X]), %%ymm6\n\t"
01402        "vmovdqa 224(%[X]), %%ymm7\n\t"
01403        "vmovdqa 256(%[X]), %%ymm8\n\t"
01404        "vmovdqa 288(%[X]), %%ymm9\n\t"
01405        "vmovdqa 320(%[X]), %%ymm10\n\t"
01406        "vmovdqa 352(%[X]), %%ymm11\n\t"
01407        "vmovdqa 384(%[X]), %%ymm12\n\t"
01408        "vmovdqa 416(%[X]), %%ymm13\n\t"
01409        "vmovdqa 448(%[X]), %%ymm14\n\t"
01410        "vmovdqa 480(%[X]), %%ymm15\n\t"
01411        "jmp L_enc256_loop\n\t"
01412        "\n"
01413    "L_done256:\n\t"
01414        "shl $3, %[cnt]\n\t"
01415        "add 48(%[key]), %[cnt]\n\t"
01416        "movl    %[cnt], 48(%[key])\n\t"
01417        "\n"
01418    "L_end256:\n\t"
01419        : [bytes] "+r" (bytes), [cnt] "+r" (cnt),
01420          [in] "+r" (m), [out] "+r" (c)
01421        : [X] "r" (X), [x] "r" (x), [key] "r" (ctx->X),
01422          [add] "m" (add), [eight] "m" (eight),
01423          [rotl8] "m" (rotl8_256), [rotl16] "m" (rotl16_256)
01424        : "ymm0", "ymm1", "ymm2", "ymm3",
01425          "ymm4", "ymm5", "ymm6", "ymm7",
01426          "ymm8", "ymm9", "ymm10", "ymm11",
01427          "ymm12", "ymm13", "ymm14", "ymm15", "memory"
01428     );
01429 
01430     /* If exactly 256 bytes remain, the AVX1 routine consumes them in one pass. */
01431     if (bytes == 256) {
01432         chacha_encrypt_avx(ctx, m, c, bytes);
01433         bytes -= 256;
01434     }
01435 
01436     for (; bytes >= CHACHA_CHUNK_BYTES;) {
01437         CHACHA_CHUNK_AVX2();
01438         bytes -= CHACHA_CHUNK_BYTES;
01439         c += CHACHA_CHUNK_BYTES;
01440         m += CHACHA_CHUNK_BYTES;
01441     }
01442     if (bytes > 0) {
01443         CHACHA_PARTIAL_CHUNK_AVX2();
01444     }
01445 }
01446 #endif /* HAVE_INTEL_AVX2 */
01447 #endif /* USE_INTEL_CHACHA_SPEEDUP */
01448 
01449 /**
01450   * Encrypt a stream of bytes: generate a 64-byte keystream block, XOR it into the message, then bump the 32-bit block counter (state word 12).
01451   */
01452 static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
01453                                     word32 bytes)
01454 {
01455     byte*  output;
01456     word32 temp[CHACHA_CHUNK_WORDS]; /* word32 buffer keeps keystream block aligned */
01457     word32 i;
01458 
01459     output = (byte*)temp;
01460 
01461     for (; bytes > 0;) {
01462         wc_Chacha_wordtobyte(temp, ctx->X);
01463         ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
01464         if (bytes <= CHACHA_CHUNK_BYTES) {
01465             for (i = 0; i < bytes; ++i) {
01466                 c[i] = m[i] ^ output[i];
01467             }
01468             return;
01469         }
01470         for (i = 0; i < CHACHA_CHUNK_BYTES; ++i) {
01471             c[i] = m[i] ^ output[i];
01472         }
01473         bytes -= CHACHA_CHUNK_BYTES;
01474         c += CHACHA_CHUNK_BYTES;
01475         m += CHACHA_CHUNK_BYTES;
01476     }
01477 }
01478 
01479 /**
01480   * API to encrypt/decrypt a message of any size.
01481   */
01482 int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
01483                       word32 msglen)
01484 {
01485     if (ctx == NULL)
01486         return BAD_FUNC_ARG;
01487 
01488 #ifdef USE_INTEL_CHACHA_SPEEDUP
01489     if (!cpuidFlagsSet) {
01490         cpuidFlags = cpuid_get_flags();
01491         cpuidFlagsSet = 1;
01492     }
01493 
01494     #ifdef HAVE_INTEL_AVX2
01495     if (IS_INTEL_AVX2(cpuidFlags)) {
01496         chacha_encrypt_avx2(ctx, input, output, msglen);
01497         return 0;
01498     }
01499     #endif
01500     if (IS_INTEL_AVX1(cpuidFlags)) {
01501         chacha_encrypt_avx(ctx, input, output, msglen);
01502         return 0;
01503     }
01504     else {
01505         chacha_encrypt_x64(ctx, input, output, msglen);
01506         return 0;
01507     }
01508 #endif
01509     wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
01510 
01511     return 0;
01512 }
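
/* Typical use of this API (illustrative sketch: error checks elided, and
 * key, iv, plain and cipher are caller-supplied buffers):
 *
 *     ChaCha ctx;
 *     wc_Chacha_SetKey(&ctx, key, 32);       // 256-bit key
 *     wc_Chacha_SetIV(&ctx, iv, 0);          // 12-byte IV, counter = 0
 *     wc_Chacha_Process(&ctx, cipher, plain, sizeof(plain));
 *
 * Decryption is the same call with cipher and plain swapped, since the
 * keystream is simply XORed with the input. */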
01513 
01514 #endif /* HAVE_CHACHA*/
01515 
01516