Xuyi Wang / wolfcrypt

Dependents:   OS

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers poly1305.c Source File

poly1305.c

00001 /* poly1305.c
00002  *
00003  * Copyright (C) 2006-2017 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 /*
00023  * Based off the public domain implementations by Andrew Moon
00024  * and Daniel J. Bernstein
00025  */
00026 
00027 #ifdef HAVE_CONFIG_H
00028     #include <config.h>
00029 #endif
00030 
00031 #include <wolfcrypt/settings.h>
00032 
00033 #ifdef HAVE_POLY1305
00034 #include <wolfcrypt/poly1305.h>
00035 #include <wolfcrypt/error-crypt.h>
00036 #include <wolfcrypt/logging.h>
00037 #include <wolfcrypt/cpuid.h>
00038 #ifdef NO_INLINE
00039     #include <wolfcrypt/misc.h>
00040 #else
00041     #define WOLFSSL_MISC_INCLUDED
00042     #include <wolfcrypt/src/misc.c>
00043 #endif
00044 #ifdef CHACHA_AEAD_TEST
00045     #include <stdio.h>
00046 #endif
00047 
00048 #ifdef _MSC_VER
00049     /* 4127 warning constant while(1)  */
00050     #pragma warning(disable: 4127)
00051 #endif
00052 
00053 #ifdef USE_INTEL_SPEEDUP
00054     #include <emmintrin.h>
00055     #include <immintrin.h>
00056 
00057     #if defined(__GNUC__) && ((__GNUC__ < 4) || \
00058                               (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
00059         #define NO_AVX2_SUPPORT
00060     #endif
00061     #if defined(__clang__) && ((__clang_major__ < 3) || \
00062                                (__clang_major__ == 3 && __clang_minor__ <= 5))
00063         #define NO_AVX2_SUPPORT
00064     #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
00065         #undef NO_AVX2_SUPPORT
00066     #endif
00067 
00068     #define HAVE_INTEL_AVX1
00069     #ifndef NO_AVX2_SUPPORT
00070         #define HAVE_INTEL_AVX2
00071     #endif
00072 #endif
00073 
#ifdef USE_INTEL_SPEEDUP
/* Runtime-detected CPU feature flags and a "detection done" latch.
 * NOTE(review): presumably filled in via the cpuid.h helpers on first use -
 * the code that sets them is not visible in this chunk; confirm. */
static word32 intel_flags = 0;
static word32 cpu_flags_set = 0;
#endif
00078 
00079 #if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
00080     #if defined(_MSC_VER)
00081         #define POLY1305_NOINLINE __declspec(noinline)
00082     #elif defined(__GNUC__)
00083         #define POLY1305_NOINLINE __attribute__((noinline))
00084     #else
00085         #define POLY1305_NOINLINE
00086     #endif
00087 
00088     #if defined(_MSC_VER)
00089         #include <intrin.h>
00090 
00091         typedef struct word128 {
00092             word64 lo;
00093             word64 hi;
00094         } word128;
00095 
00096         #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
00097         #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \
00098                                out.hi += (out.lo < t) + in.hi; }
00099         #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \
00100                                  out.hi += (out.lo < t); }
00101         #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
00102         #define LO(in) (in.lo)
00103 
00104     #elif defined(__GNUC__)
00105         #if defined(__SIZEOF_INT128__)
00106             typedef unsigned __int128 word128;
00107         #else
00108             typedef unsigned word128 __attribute__((mode(TI)));
00109         #endif
00110 
00111         #define MUL(out, x, y) out = ((word128)x * y)
00112         #define ADD(out, in) out += in
00113         #define ADDLO(out, in) out += in
00114         #define SHR(in, shift) (word64)(in >> (shift))
00115         #define LO(in) (word64)(in)
00116     #endif
00117 #endif
00118 
00119 #ifdef USE_INTEL_SPEEDUP
00120 #ifdef HAVE_INTEL_AVX1
00121 /* Process one block (16 bytes) of data.
00122  *
00123  * ctx  Poly1305 context.
00124  * m    One block of message data.
00125  */
/* Process one block (16 bytes) of data.
 *
 * Computes h = ((h + m + hibit) * r) mod (2^130 - 5), keeping h as three
 * 64-bit limbs at byte offsets 24/32/40 of ctx and r as two limbs at 0/8.
 *
 * The [nfin] operand is ctx->finished, which doubles as the per-block
 * 2^128 "high bit": it is 1 for full blocks (set by poly1305_setkey_avx)
 * and cleared to 0 by poly1305_final_avx before the padded last block,
 * where the appended 0x01 byte supplies the bit instead.
 *
 * The loads at 352(ctx,h2,8) and 408(ctx,h2,8) read precomputed small
 * multiples i*r[0] and i*r[1] (filled by poly1305_setkey_avx into ctx->hm),
 * indexed by the tiny h[2] limb, replacing two multiplies with table
 * lookups.  NOTE(review): these byte offsets are tied to the Poly1305
 * struct layout in poly1305.h - confirm before reordering struct fields.
 *
 * ctx  Poly1305 context.
 * m    One block of message data.
 */
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
{
        /* rbx is callee-saved in the SysV ABI but listed as a clobber, so
         * the compiler preserves it around this asm. */
        __asm__ __volatile__ (
            "movq   (%[ctx]), %%r15\n\t"
            "movq   24(%[ctx]), %%r8\n\t"
            "movq   32(%[ctx]), %%r9\n\t"
            "movq   40(%[ctx]), %%r10\n\t"
            "xorq   %%rbx, %%rbx\n\t"
            "movb   %[nfin], %%bl\n\t"
            "# h += m\n\t"
            "movq    (%[m]), %%r11\n\t"
            "movq   8(%[m]), %%r12\n\t"
            "addq   %%r11, %%r8\n\t"
            "adcq   %%r12, %%r9\n\t"
            "movq   8(%[ctx]), %%rax\n\t"
            "adcq   %%rbx, %%r10\n\t"
            "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
            "mulq   %%r8\n\t"
            "movq   %%rax, %%r12\n\t"
            "movq   %%rdx, %%r13\n\t"
            "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
            "movq   %%r15, %%rax\n\t"
            "mulq   %%r9\n\t"
            "addq   %%rax, %%r12\n\t"
            "movq   %%r15, %%rax\n\t"
            "adcq   %%rdx, %%r13\n\t"
            "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
            "mulq   %%r8\n\t"
            "movq   %%rax, %%r11\n\t"
            "movq   %%rdx, %%r8\n\t"
            "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
            "movq   8(%[ctx]), %%rax\n\t"
            "mulq   %%r9\n\t"
            "#   r[0] * h[2] +> t2\n\t"
            "addq   352(%[ctx],%%r10,8), %%r13\n\t"
            "movq   %%rdx, %%r14\n\t"
            "addq   %%r8, %%r12\n\t"
            "adcq   %%rax, %%r13\n\t"
            "#   r[1] * h[2] +> t3\n\t"
            "adcq   408(%[ctx],%%r10,8), %%r14\n\t"
            "# r * h in r14, r13, r12, r11 \n\t"
            "# h = (r * h) mod 2^130 - 5\n\t"
            "movq   %%r13, %%r10\n\t"
            "andq     $-4, %%r13\n\t"
            "andq      $3, %%r10\n\t"
            "addq   %%r13, %%r11\n\t"
            "movq   %%r13, %%r8\n\t"
            "adcq   %%r14, %%r12\n\t"
            "adcq      $0, %%r10\n\t"
            "shrdq     $2, %%r14, %%r8\n\t"
            "shrq      $2, %%r14\n\t"
            "addq   %%r11, %%r8\n\t"
            "adcq   %%r14, %%r12\n\t"
            "movq   %%r12, %%r9\n\t"
            "adcq      $0, %%r10\n\t"
            "# h in r10, r9, r8 \n\t"
            "# Store h to ctx\n\t"
            "movq       %%r8, 24(%[ctx])\n\t"
            "movq       %%r9, 32(%[ctx])\n\t"
            "movq       %%r10, 40(%[ctx])\n\t"
            :
            : [m] "r" (m), [ctx] "r" (ctx), [nfin] "m" (ctx->finished)
            : "rax", "rdx", "r11", "r12", "r13", "r14", "r15", "rbx",
              "r8", "r9", "r10", "memory"
        );
}
00192 
00193 /* Process multiple blocks (n * 16 bytes) of data.
00194  *
00195  * ctx    Poly1305 context.
00196  * m      Blocks of message data.
00197  * bytes  The number of bytes to process.
00198  */
/* Process multiple blocks (n * 16 bytes) of data.
 *
 * Loops the same h = ((h + m + 2^128) * r) mod (2^130 - 5) step as
 * poly1305_block_avx while at least 16 bytes remain; any tail of fewer
 * than 16 bytes is left for the caller to buffer.  Precondition: callers
 * must pass bytes >= 16 or the first iteration reads past the message.
 *
 * Unlike poly1305_block_avx there is no [nfin] operand: the table loads
 * use offsets 360/416, one 8-byte entry past the 352/408 used there, i.e.
 * they index h[2] + 1 and so fold the always-present 2^128 high bit of a
 * full block into the lookup.  NOTE(review): offsets depend on the
 * Poly1305 struct layout in poly1305.h - confirm before changing it.
 *
 * POLY1305_NOINLINE is required: the asm label L_avx_start is not a local
 * (%=) label, so inlining this function twice would define it twice.
 *
 * ctx    Poly1305 context.
 * m      Blocks of message data.
 * bytes  The number of bytes to process.
 */
POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx,
                                           const unsigned char* m, size_t bytes)
{
        __asm__ __volatile__ (
            "movq   (%[ctx]), %%r15\n\t"
            "movq   24(%[ctx]), %%r8\n\t"
            "movq   32(%[ctx]), %%r9\n\t"
            "movq   40(%[ctx]), %%r10\n"
        "L_avx_start:\n\t"
            "# h += m\n\t"
            "movq    (%[m]), %%r11\n\t"
            "movq   8(%[m]), %%r12\n\t"
            "addq   %%r11, %%r8\n\t"
            "adcq   %%r12, %%r9\n\t"
            "movq   8(%[ctx]), %%rax\n\t"
            "adcq   $0, %%r10\n\t"
            "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
            "mulq   %%r8\n\t"
            "movq   %%rax, %%r12\n\t"
            "movq   %%rdx, %%r13\n\t"
            "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
            "movq   %%r15, %%rax\n\t"
            "mulq   %%r9\n\t"
            "addq   %%rax, %%r12\n\t"
            "movq   %%r15, %%rax\n\t"
            "adcq   %%rdx, %%r13\n\t"
            "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
            "mulq   %%r8\n\t"
            "movq   %%rax, %%r11\n\t"
            "movq   %%rdx, %%r8\n\t"
            "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
            "movq   8(%[ctx]), %%rax\n\t"
            "mulq   %%r9\n\t"
            "#   r[0] * h[2] +> t2\n\t"
            "addq   360(%[ctx],%%r10,8), %%r13\n\t"
            "movq   %%rdx, %%r14\n\t"
            "addq   %%r8, %%r12\n\t"
            "adcq   %%rax, %%r13\n\t"
            "#   r[1] * h[2] +> t3\n\t"
            "adcq   416(%[ctx],%%r10,8), %%r14\n\t"
            "# r * h in r14, r13, r12, r11 \n\t"
            "# h = (r * h) mod 2^130 - 5\n\t"
            "movq   %%r13, %%r10\n\t"
            "andq     $-4, %%r13\n\t"
            "andq      $3, %%r10\n\t"
            "addq   %%r13, %%r11\n\t"
            "movq   %%r13, %%r8\n\t"
            "adcq   %%r14, %%r12\n\t"
            "adcq      $0, %%r10\n\t"
            "shrdq     $2, %%r14, %%r8\n\t"
            "shrq      $2, %%r14\n\t"
            "addq   %%r11, %%r8\n\t"
            "adcq   %%r14, %%r12\n\t"
            "movq   %%r12, %%r9\n\t"
            "adcq      $0, %%r10\n\t"
            "# h in r10, r9, r8 \n\t"
            "# Next block from message\n\t"
            "addq   $16, %[m]\n\t"
            "subq   $16, %[bytes]\n\t"
            "cmp        $16, %[bytes]\n\t"
            "jge    L_avx_start\n\t"
            "# Store h to ctx\n\t"
            "movq   %%r8, 24(%[ctx])\n\t"
            "movq   %%r9, 32(%[ctx])\n\t"
            "movq   %%r10, 40(%[ctx])\n\t"
            : [m] "+r" (m), [bytes] "+r" (bytes)
            : [ctx] "r" (ctx)
            : "rax", "rdx", "r11", "r12", "r13", "r14", "r15",
              "r8", "r9", "r10", "memory"
        );
}
00270 
00271 /* Set the key to use when processing data.
00272  * Initialize the context.
00273  *
00274  * ctx  Poly1305 context.
00275  * key  The key data (16 bytes).
00276  */
00277 static void poly1305_setkey_avx(Poly1305* ctx, const byte* key)
00278 {
00279     int i;
00280 
00281     ctx->r[0] = *(word64*)(key + 0) & 0x0ffffffc0fffffffL;
00282     ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL;
00283 
00284     for (i=0; i<7; i++) {
00285         ctx->hm[i + 0] = ctx->r[0] * i;
00286         ctx->hm[i + 7] = ctx->r[1] * i;
00287     }
00288 
00289     /* h (accumulator) = 0 */
00290     ctx->h[0] = 0;
00291     ctx->h[1] = 0;
00292     ctx->h[2] = 0;
00293 
00294     /* save pad for later */
00295     ctx->pad[0] = *(word64*)(key + 16);
00296     ctx->pad[1] = *(word64*)(key + 24);
00297 
00298     ctx->leftover = 0;
00299     ctx->finished = 1;
00300 }
00301 
00302 /* Calculate the final result - authentication data.
00303  * Zeros out the private data in the context.
00304  *
00305  * ctx  Poly1305 context.
00306  * mac  Buffer to hold 16 bytes.
00307  */
/* Calculate the final result - authentication data.
 * Zeros out the private data in the context.
 *
 * Any buffered partial block is padded with a 0x01 byte then zeros and
 * processed with ctx->finished = 0, so poly1305_block_avx does not add
 * the usual 2^128 high bit (the 0x01 supplies it).  The accumulator is
 * then fully reduced mod 2^130 - 5, the 128-bit pad from the key is
 * added, and the low 16 bytes are written out as the MAC.
 *
 * ctx  Poly1305 context.
 * mac  Buffer to hold 16 bytes.
 */
static void poly1305_final_avx(Poly1305* ctx, byte* mac)
{
    word64 h0, h1, h2;

    /* process the remaining block */
    if (ctx->leftover) {
        size_t i = ctx->leftover;
        ctx->buffer[i] = 1;
        for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
            ctx->buffer[i] = 0;
        ctx->finished = 0;
        poly1305_block_avx(ctx, ctx->buffer);
    }

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    /* h %= p */
    /* h = (h + pad) */
    /* First fold h2's bits above 2 back in (x mod 2^130-5: top * 5 via
     * lea r13,[r13+r13*4]).  Then conditionally select h + 5 when
     * h + 5 carries past 2^130 (the "between 2^130-5 and 2^130-1" fixup),
     * add the pad with plain carry (mod 2^128), and store the MAC. */
    __asm__ __volatile__ (
        "# mod 2^130 - 5\n\t"
        "movq   %[h2],  %%r13\n\t"
        "andq    $0x3, %[h2]\n\t"
        "shrq    $0x2, %%r13\n\t"
        "leaq   (%%r13, %%r13, 4), %%r13\n\t"
        "add     %%r13, %[h0]\n\t"
        "adc       $0, %[h1]\n\t"
        "adc       $0, %[h2]\n\t"
        "# Fixup when between (1 << 130) - 1 and (1 << 130) - 5\n\t"
        "movq   %[h0], %%r13\n\t"
        "movq   %[h1], %%r14\n\t"
        "movq   %[h2], %%r15\n\t"
        "addq   $5, %%r13\n\t"
        "adcq   $0, %%r14\n\t"
        "adcq   $0, %%r15\n\t"
        "movq   %%r15, %%r12\n\t"
        "andq   $3, %%r15\n\t"
        "cmpq   $4, %%r12\n\t"
        "cmove  %%r13, %[h0]\n\t"
        "cmove  %%r14, %[h1]\n\t"
        "cmove  %%r15, %[h2]\n\t"
        "# h += pad\n\t"
        "add    %[p0], %[h0]\n\t"
        "adc    %[p1], %[h1]\n\t"
        "movq   %[h0], (%[m])\n\t"
        "movq   %[h1], 8(%[m])\n\t"
        : [h0] "+r" (h0), [h1] "+r" (h1), [h2] "+r" (h2),
          [p0] "+r" (ctx->pad[0]), [p1] "+r" (ctx->pad[1])
        : [m] "r" (mac)
        : "memory", "r15", "r14", "r13", "r12"
    );

    /* zero out the state */
    ctx->h[0] = 0;
    ctx->h[1] = 0;
    ctx->h[2] = 0;
    ctx->r[0] = 0;
    ctx->r[1] = 0;
    ctx->pad[0] = 0;
    ctx->pad[1] = 0;
}
00370 #endif
00371 
00372 #ifdef HAVE_INTEL_AVX2
/* Prevent inlining of functions containing non-local asm labels.
 * Guarded: USE_INTEL_SPEEDUP builds (which HAVE_INTEL_AVX2 implies) have
 * already defined an identical POLY1305_NOINLINE earlier in this file;
 * the guard avoids a redefinition and stays correct if the two ever
 * diverge. */
#ifndef POLY1305_NOINLINE
#if defined(_MSC_VER)
    #define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__GNUC__)
    #define POLY1305_NOINLINE __attribute__((noinline))
#else
    #define POLY1305_NOINLINE
#endif
#endif
00380 
/* Load H into five 256-bit registers.
 *
 * h is the memory location of the data: five 4-lane vectors, one per
 * 26-bit limb, stored 32 bytes apart.
 * h0-h4 each receive one limb of all 4 H values, one limb per 64-bit
 * lane, widened for multiply.
 */
#define LOAD_H(h, h0, h1, h2, h3, h4)  \
    "vmovdqu       ("#h"), "#h0"\n\t"  \
    "vmovdqu     32("#h"), "#h1"\n\t"  \
    "vmovdqu     64("#h"), "#h2"\n\t"  \
    "vmovdqu     96("#h"), "#h3"\n\t"  \
    "vmovdqu    128("#h"), "#h4"\n\t"
00392 
/* Store H, five 256-bit registers, packed.
 *
 * h is the memory location of the data - one 26-bit limb vector per
 * 32 bytes (mirror of LOAD_H).
 * h0-h4 the limb registers of the 4 H values, 26 bits stored in 64.
 * x4 is the xmm register of h4.  NOTE(review): x4 is accepted but unused
 * in this expansion - kept only for call-site symmetry; confirm callers
 * before removing.
 */
#define STORE_H(h, h0, h1, h2, h3, h4, x4)      \
    "vmovdqu     "#h0",    ("#h")\n\t"          \
    "vmovdqu     "#h1",  32("#h")\n\t"          \
    "vmovdqu     "#h2",  64("#h")\n\t"          \
    "vmovdqu     "#h3",  96("#h")\n\t"          \
    "vmovdqu     "#h4", 128("#h")\n\t"
00405 
/* Load four powers of r into position to be multiplied by the 4 H values.
 *
 * Reads the powers from fixed byte offsets 224/256/288/320 of the context
 * (NOTE(review): tied to the Poly1305 struct layout - confirm against
 * poly1305.h) and transposes them with vpermq/vpunpck*/vperm2i128 so that
 * each output register holds one limb of all four powers; the odd limbs
 * r1/r3 are recovered by shifting the packed even registers right 32.
 *
 * r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
 * t0-t3 are temporary registers.
 */
#define LOAD_Rx4(r0, r1, r2, r3, r4,                     \
                 t0, t1, t2, t3)                         \
    "vmovdqu        224(%[ctx]), "#r3"\n\t"          \
    "vmovdqu        256(%[ctx]), "#r2"\n\t"          \
    "vmovdqu        288(%[ctx]), "#r1"\n\t"          \
    "vmovdqu        320(%[ctx]), "#r0"\n\t"          \
    "vpermq     $0xd8, "#r0", "#r0"\n\t"         \
    "vpermq     $0xd8, "#r1", "#r1"\n\t"         \
    "vpermq     $0xd8, "#r2", "#r2"\n\t"         \
    "vpermq     $0xd8, "#r3", "#r3"\n\t"         \
    "vpunpcklqdq    "#r1", "#r0", "#t0"\n\t"         \
    "vpunpckhqdq    "#r1", "#r0", "#t1"\n\t"         \
    "vpunpcklqdq    "#r3", "#r2", "#t2"\n\t"         \
    "vpunpckhqdq    "#r3", "#r2", "#t3"\n\t"         \
    "vperm2i128     $0x20, "#t2", "#t0", "#r0"\n\t"  \
    "vperm2i128     $0x31, "#t2", "#t0", "#r2"\n\t"  \
    "vperm2i128     $0x20, "#t3", "#t1", "#r4"\n\t"  \
    "vpsrlq       $32, "#r0", "#r1"\n\t"         \
    "vpsrlq       $32, "#r2", "#r3"\n\t"
00430 
/* Load the r^4 value into position to be multiplied by all 4 H values.
 *
 * Broadcasts each packed limb of r^4 across all four 64-bit lanes with
 * vpermq; the odd limbs come from the 32-bit-shifted copy in t1.
 *
 * r4 holds r^4 as five 26 bits each in 32.
 * r40-r44 hold the broadcast limbs with 26 bits stored in 64 for multiply.
 * t0-t1 are temporary registers.
 */
#define LOAD_R4(r4, r40, r41, r42, r43, r44, \
                t0, t1)                      \
    "vmovdqu    "#r4", "#t0"\n\t"            \
    "vpermq  $0x0, "#t0", "#r40"\n\t"    \
    "vpsrlq   $32, "#t0", "#t1"\n\t"     \
    "vpermq $0x55, "#t0", "#r42"\n\t"    \
    "vpermq $0xaa, "#t0", "#r44"\n\t"    \
    "vpermq  $0x0, "#t1", "#r41"\n\t"    \
    "vpermq $0x55, "#t1", "#r43"\n\t"
00446 
/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction
 * in multiply.
 *
 * Computed as s = 4*r + r (shift left 2, then add), the usual radix-2^26
 * trick: limb products that land above 2^130 wrap around multiplied by 5.
 *
 * s1-s4 are each 64 bit value in r1-r4 multiplied by 5.
 * r1-r4 are the top 4 limb registers.
 */
#define MUL5(s1, s2, s3, s4, r1, r2, r3, r4) \
    "vpslld    $2, "#r1", "#s1"\n\t"     \
    "vpslld    $2, "#r2", "#s2"\n\t"     \
    "vpslld    $2, "#r3", "#s3"\n\t"     \
    "vpslld    $2, "#r4", "#s4"\n\t"     \
    "vpaddq "#s1", "#r1", "#s1"\n\t"     \
    "vpaddq "#s2", "#r2", "#s2"\n\t"     \
    "vpaddq "#s3", "#r3", "#s3"\n\t"     \
    "vpaddq "#s4", "#r4", "#s4"\n\t"
00462 
/* Add the 4 H values together.
 * Each 64 bits in a register is 26 bits of one of the H values.
 *
 * Horizontal sum per limb: first add the high qword of each 128-bit lane
 * into the low qword (vpsrldq $8 + add), then add the upper 128-bit lane
 * into the lower (vpermq $0x02 + add).  The total ends up in lane 0.
 *
 * h0-h4 contains the 4 H values (one limb per register).
 * t0-t4 are temporary registers.
 */
#define FINALIZE_H(h0, h1, h2, h3, h4,    \
                   t0, t1, t2, t3, t4)    \
    "vpsrldq    $8, "#h0", "#t0"\n\t"     \
    "vpsrldq    $8, "#h1", "#t1"\n\t"     \
    "vpsrldq    $8, "#h2", "#t2"\n\t"     \
    "vpsrldq    $8, "#h3", "#t3"\n\t"     \
    "vpsrldq    $8, "#h4", "#t4"\n\t"     \
    "vpaddq "#h0", "#t0", "#h0"\n\t"  \
    "vpaddq "#h1", "#t1", "#h1"\n\t"  \
    "vpaddq "#h2", "#t2", "#h2"\n\t"  \
    "vpaddq "#h3", "#t3", "#h3"\n\t"  \
    "vpaddq "#h4", "#t4", "#h4"\n\t"  \
    "vpermq $0x02, "#h0", "#t0"\n\t"  \
    "vpermq $0x02, "#h1", "#t1"\n\t"  \
    "vpermq $0x02, "#h2", "#t2"\n\t"  \
    "vpermq $0x02, "#h3", "#t3"\n\t"  \
    "vpermq $0x02, "#h4", "#t4"\n\t"  \
    "vpaddq "#h0", "#t0", "#h0"\n\t"  \
    "vpaddq "#h1", "#t1", "#h1"\n\t"  \
    "vpaddq "#h2", "#t2", "#h2"\n\t"  \
    "vpaddq "#h3", "#t3", "#h3"\n\t"  \
    "vpaddq "#h4", "#t4", "#h4"\n\t"
00491 
/* Move 32 bits from each xmm register to a 32 bit register.
 *
 * vmovd copies the low dword of each vector (where FINALIZE_H left the
 * summed limb) into a general-purpose register.
 *
 * x0-x4 are the xmm version of the ymm registers used.
 * t0-t4 are the 32-bit registers to store data in.
 */
#define MOVE_TO_32(x0, x1, x2, x3, x4,  \
                   t0, t1, t2, t3, t4)  \
    "vmovd  "#x0", "#t0"\n\t"       \
    "vmovd  "#x1", "#t1"\n\t"       \
    "vmovd  "#x2", "#t2"\n\t"       \
    "vmovd  "#x3", "#t3"\n\t"       \
    "vmovd  "#x4", "#t4"\n\t"
00504 
/* Multiply using AVX2 instructions.
 * Each register contains up to 32 bits of data in 64 bits.
 * This is a 4 way parallel multiply.
 *
 * Schoolbook 5x5 limb product: partial products whose limb index would
 * reach 5 or more are generated with the premultiplied-by-5 registers
 * s1-s4 (see MUL5), folding them straight back into limbs 0-4.  Results
 * accumulate unreduced in t0-t4; a following REDUCE brings them back to
 * 26 bits per limb.
 *
 * h0-h4 contain 4 H values with the 32 bits of each per register.
 * r0-r4 contain the 4 powers of r.
 * s1-s4 contain r1-r4 times 5.
 * t0-t4 and v0-v3 are temporary registers; t0-t4 hold the product.
 */
#define MUL_AVX2(h0, h1, h2, h3, h4,        \
                 r0, r1, r2, r3, r4,        \
                 s1, s2, s3, s4,            \
                 t0, t1, t2, t3, t4,        \
                 v0, v1, v2, v3)            \
    "vpmuludq   "#s1", "#h4", "#t0"\n\t"    \
    "vpmuludq   "#s2", "#h3", "#v0"\n\t"    \
    "vpmuludq   "#s2", "#h4", "#t1"\n\t"    \
    "vpmuludq   "#s3", "#h3", "#v1"\n\t"    \
    "vpmuludq   "#s3", "#h4", "#t2"\n\t"    \
    "vpaddq "#t0", "#v0", "#t0"\n\t"    \
    "vpmuludq   "#s3", "#h2", "#v2"\n\t"    \
    "vpmuludq   "#s4", "#h4", "#t3"\n\t"    \
    "vpaddq "#t1", "#v1", "#t1"\n\t"    \
    "vpmuludq   "#s4", "#h1", "#v3"\n\t"    \
    "vpmuludq   "#s4", "#h2", "#v0"\n\t"    \
    "vpaddq "#t0", "#v2", "#t0"\n\t"    \
    "vpmuludq   "#s4", "#h3", "#v1"\n\t"    \
    "vpmuludq   "#r0", "#h3", "#v2"\n\t"    \
    "vpaddq "#t0", "#v3", "#t0"\n\t"    \
    "vpmuludq   "#r0", "#h4", "#t4"\n\t"    \
    "vpaddq "#t1", "#v0", "#t1"\n\t"    \
    "vpmuludq   "#r0", "#h0", "#v3"\n\t"    \
    "vpaddq "#t2", "#v1", "#t2"\n\t"    \
    "vpmuludq   "#r0", "#h1", "#v0"\n\t"    \
    "vpaddq "#t3", "#v2", "#t3"\n\t"    \
    "vpmuludq   "#r0", "#h2", "#v1"\n\t"    \
    "vpmuludq   "#r1", "#h2", "#v2"\n\t"    \
    "vpaddq "#t0", "#v3", "#t0"\n\t"    \
    "vpmuludq   "#r1", "#h3", "#v3"\n\t"    \
    "vpaddq "#t1", "#v0", "#t1"\n\t"    \
    "vpmuludq   "#r1", "#h0", "#v0"\n\t"    \
    "vpaddq "#t2", "#v1", "#t2"\n\t"    \
    "vpmuludq   "#r1", "#h1", "#v1"\n\t"    \
    "vpaddq "#t3", "#v2", "#t3"\n\t"    \
    "vpmuludq   "#r2", "#h1", "#v2"\n\t"    \
    "vpaddq "#t4", "#v3", "#t4"\n\t"    \
    "vpmuludq   "#r2", "#h2", "#v3"\n\t"    \
    "vpaddq "#t1", "#v0", "#t1"\n\t"    \
    "vpmuludq   "#r2", "#h0", "#v0"\n\t"    \
    "vpaddq "#t2", "#v1", "#t2"\n\t"    \
    "vpmuludq   "#r3", "#h0", "#v1"\n\t"    \
    "vpaddq "#t3", "#v2", "#t3"\n\t"    \
    "vpmuludq   "#r3", "#h1", "#v2"\n\t"    \
    "vpaddq "#t4", "#v3", "#t4"\n\t"    \
    "vpmuludq   "#r4", "#h0", "#v3"\n\t"    \
    "vpaddq "#t2", "#v0", "#t2"\n\t"    \
    "vpaddq "#t3", "#v1", "#t3"\n\t"    \
    "vpaddq "#t4", "#v2", "#t4"\n\t"    \
    "vpaddq "#t4", "#v3", "#t4"\n\t"
00564 
/* Load the 4 blocks of the message.
 *
 * Reads 64 bytes, de-interleaves the four 16-byte blocks into dword
 * vectors with vperm2i128/vpunpck*, zero-extends each dword to 64 bits,
 * then shifts limbs 1-3 left by 6/12/18 to re-align the 32-bit pieces to
 * radix-2^26 positions (a following REDUCE masks them down to 26 bits).
 *
 * m the address of the message to load.
 * m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel.
 * hi is the high bits of the 4 m (1 << 128 as not final block) - becomes m4.
 * z is zero.
 */
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z)     \
    "vmovdqu      (%[m]), "#m0"\n\t"             \
    "vmovdqu    32(%[m]), "#m1"\n\t"             \
    "vperm2i128 $0x20, "#m1", "#m0", "#m2"\n\t"  \
    "vperm2i128 $0x31, "#m1", "#m0", "#m0"\n\t"  \
    "vpunpckldq "#m0", "#m2", "#m1"\n\t"         \
    "vpunpckhdq "#m0", "#m2", "#m3"\n\t"         \
    "vpunpckldq  "#z", "#m1", "#m0"\n\t"         \
    "vpunpckhdq  "#z", "#m1", "#m1"\n\t"         \
    "vpunpckldq  "#z", "#m3", "#m2"\n\t"         \
    "vpunpckhdq  "#z", "#m3", "#m3"\n\t"         \
    "vmovdqu    "#hi", "#m4"\n\t"                \
    "vpsllq    $6, "#m1", "#m1"\n\t"         \
    "vpsllq   $12, "#m2", "#m2"\n\t"         \
    "vpsllq   $18, "#m3", "#m3"\n\t"
00587 
00588 
/* Multiply using AVX2 instructions - adding with message.
 * Each register contains up to 32 bits of data in 64 bits.
 * This is a 4 way parallel multiply.
 * The message data is loaded first and the multiplication adds into it.
 *
 * Fusion of LOAD_M and MUL_AVX2: the next 4 blocks are loaded and
 * re-aligned into t0-t4 (see LOAD_M), then the 5x5 limb products of
 * h * r are accumulated on top, i.e. t = m' + h * r unreduced.  A
 * following REDUCE brings the limbs back to 26 bits.
 *
 * h0-h4 contain 4 H values with the 32 bits of each per register.
 * r0-r4 contain the 4 powers of r.
 * s1-s4 contain r1-r4 times 5.
 * t0-t4 and v0-v3 are temporary registers; t0-t4 hold the result.
 * hi is the high bits of the 4 m (1 << 128 as not final block).
 * z is zero.
 */
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4,         \
                     r0, r1, r2, r3, r4,         \
                     s1, s2, s3, s4,             \
                     t0, t1, t2, t3, t4,         \
                     v0, v1, v2, v3,             \
                     hi, z)                      \
    "vmovdqu      (%[m]), "#t0"\n\t"             \
    "vmovdqu    32(%[m]), "#t1"\n\t"             \
    "vperm2i128 $0x20, "#t1", "#t0", "#t2"\n\t"  \
    "vperm2i128 $0x31, "#t1", "#t0", "#t0"\n\t"  \
    "vpunpckldq "#t0", "#t2", "#t1"\n\t"         \
    "vpunpckhdq "#t0", "#t2", "#t3"\n\t"         \
    "vpunpckldq  "#z", "#t1", "#t0"\n\t"         \
    "vpunpckhdq  "#z", "#t1", "#t1"\n\t"         \
    "vpunpckldq  "#z", "#t3", "#t2"\n\t"         \
    "vpunpckhdq  "#z", "#t3", "#t3"\n\t"         \
    "vmovdqu    "#hi", "#t4"\n\t"                \
    "vpsllq    $6, "#t1", "#t1"\n\t"         \
    "vpsllq   $12, "#t2", "#t2"\n\t"         \
    "vpsllq   $18, "#t3", "#t3"\n\t"         \
    "vpmuludq   "#s1", "#h4", "#v0"\n\t"         \
    "vpaddq     "#t0", "#v0", "#t0"\n\t"         \
    "vpmuludq   "#s2", "#h3", "#v0"\n\t"         \
    "vpmuludq   "#s2", "#h4", "#v1"\n\t"         \
    "vpaddq     "#t1", "#v1", "#t1"\n\t"         \
    "vpmuludq   "#s3", "#h3", "#v1"\n\t"         \
    "vpmuludq   "#s3", "#h4", "#v2"\n\t"         \
    "vpaddq     "#t2", "#v2", "#t2"\n\t"         \
    "vpaddq "#t0", "#v0", "#t0"\n\t"         \
    "vpmuludq   "#s3", "#h2", "#v2"\n\t"         \
    "vpmuludq   "#s4", "#h4", "#v3"\n\t"         \
    "vpaddq     "#t3", "#v3", "#t3"\n\t"         \
    "vpaddq "#t1", "#v1", "#t1"\n\t"         \
    "vpmuludq   "#s4", "#h1", "#v3"\n\t"         \
    "vpmuludq   "#s4", "#h2", "#v0"\n\t"         \
    "vpaddq "#t0", "#v2", "#t0"\n\t"         \
    "vpmuludq   "#s4", "#h3", "#v1"\n\t"         \
    "vpmuludq   "#r0", "#h3", "#v2"\n\t"         \
    "vpaddq "#t0", "#v3", "#t0"\n\t"         \
    "vpmuludq   "#r0", "#h4", "#v3"\n\t"         \
    "vpaddq "#t4", "#v3", "#t4"\n\t"         \
    "vpaddq "#t1", "#v0", "#t1"\n\t"         \
    "vpmuludq   "#r0", "#h0", "#v3"\n\t"         \
    "vpaddq "#t2", "#v1", "#t2"\n\t"         \
    "vpmuludq   "#r0", "#h1", "#v0"\n\t"         \
    "vpaddq "#t3", "#v2", "#t3"\n\t"         \
    "vpmuludq   "#r0", "#h2", "#v1"\n\t"         \
    "vpmuludq   "#r1", "#h2", "#v2"\n\t"         \
    "vpaddq "#t0", "#v3", "#t0"\n\t"         \
    "vpmuludq   "#r1", "#h3", "#v3"\n\t"         \
    "vpaddq "#t1", "#v0", "#t1"\n\t"         \
    "vpmuludq   "#r1", "#h0", "#v0"\n\t"         \
    "vpaddq "#t2", "#v1", "#t2"\n\t"         \
    "vpmuludq   "#r1", "#h1", "#v1"\n\t"         \
    "vpaddq "#t3", "#v2", "#t3"\n\t"         \
    "vpmuludq   "#r2", "#h1", "#v2"\n\t"         \
    "vpaddq "#t4", "#v3", "#t4"\n\t"         \
    "vpmuludq   "#r2", "#h2", "#v3"\n\t"         \
    "vpaddq "#t1", "#v0", "#t1"\n\t"         \
    "vpmuludq   "#r2", "#h0", "#v0"\n\t"         \
    "vpaddq "#t2", "#v1", "#t2"\n\t"         \
    "vpmuludq   "#r3", "#h0", "#v1"\n\t"         \
    "vpaddq "#t3", "#v2", "#t3"\n\t"         \
    "vpmuludq   "#r3", "#h1", "#v2"\n\t"         \
    "vpaddq "#t4", "#v3", "#t4"\n\t"         \
    "vpmuludq   "#r4", "#h0", "#v3"\n\t"         \
    "vpaddq "#t2", "#v0", "#t2"\n\t"         \
    "vpaddq "#t3", "#v1", "#t3"\n\t"         \
    "vpaddq "#t4", "#v2", "#t4"\n\t"         \
    "vpaddq "#t4", "#v3", "#t4"\n\t"
00671 
/* Reduce the 64 bits of data to 26 bits.
 *
 * Carry-propagation chain 0->1->2->3->4, with limb 4's overflow wrapped
 * to limb 0 multiplied by 5 (vpslld $2 + vpaddd: 4x + x), per the
 * radix-2^26 representation of 2^130 - 5.  A final partial carry from
 * limb 3 into limb 4 leaves all limbs within 26 bits (plus a small
 * epsilon in limb 1/4 - standard for this lazy-reduction scheme).
 *
 * h0-h4 contain the reduced H values.
 * m0-m4 contain the 4 H values to reduce (clobbered).
 * t0-t2 are temporaries.
 * mask contains the 26-bit mask for each 64 bit value in the 256 bit register.
 */
#define REDUCE(h0, h1, h2, h3, h4,          \
               m0, m1, m2, m3, m4,          \
               t0, t1, t2, mask)            \
    "vpsrlq     $26, "#m0", "#t0"\n\t"  \
    "vpsrlq     $26, "#m3", "#t1"\n\t"  \
    "vpand  "#mask", "#m0", "#m0"\n\t"  \
    "vpand  "#mask", "#m3", "#m3"\n\t"  \
    "vpaddq   "#m1", "#t0", "#m1"\n\t"  \
    "vpaddq   "#m4", "#t1", "#m4"\n\t"  \
                                            \
    "vpsrlq     $26, "#m1", "#t0"\n\t"  \
    "vpsrlq     $26, "#m4", "#t1"\n\t"  \
    "vpand  "#mask", "#m1", "#h1"\n\t"  \
    "vpand  "#mask", "#m4", "#h4"\n\t"  \
    "vpaddq   "#m2", "#t0", "#m2"\n\t"  \
    "vpslld      $2, "#t1", "#t2"\n\t"  \
    "vpaddd   "#t2", "#t1", "#t2"\n\t"  \
                                            \
    "vpsrlq     $26, "#m2", "#t0"\n\t"  \
    "vpaddq   "#m0", "#t2", "#m0"\n\t"  \
    "vpsrlq     $26, "#m0", "#t1"\n\t"  \
    "vpand  "#mask", "#m2", "#h2"\n\t"  \
    "vpand  "#mask", "#m0", "#h0"\n\t"  \
    "vpaddq   "#m3", "#t0", "#m3"\n\t"  \
    "vpaddq   "#h1", "#t1", "#h1"\n\t"  \
                                            \
    "vpsrlq     $26, "#m3", "#t0"\n\t"  \
    "vpand  "#mask", "#m3", "#h3"\n\t"  \
    "vpaddq   "#h4", "#t0", "#h4"\n\t"  \

00709 
/* Process multiple blocks (n * 16 bytes) of data.
 *
 * AVX2 implementation: four 16-byte blocks are consumed per loop
 * iteration, with the hash held as four parallel accumulators. When
 * ctx->finished is set the four lanes are combined and the final value
 * is reduced and written back to ctx->h.
 *
 * ctx    Poly1305 context.
 * m      Blocks of message data.
 * bytes  The number of bytes to process.
 */
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
                                           const unsigned char* m, size_t bytes)
{
    /* Scratch: powers of r (5 limbs x 4 lanes) and their x5 multiples. */
    ALIGN256 word64 r4[5][4];
    ALIGN256 word64 s[4][4];
    /* Pinned registers that receive the five 26-bit result limbs from
     * MOVE_TO_32 on the finishing path. */
    register word32 t0 asm("r8") = 0;
    register word32 t1 asm("r9") = 0;
    register word32 t2 asm("r10") = 0;
    register word32 t3 asm("r11") = 0;
    register word32 t4 asm("r12") = 0;
    /* 26-bit limb mask replicated in each 64-bit lane. */
    static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
                                    0x0000000003ffffff, 0x0000000003ffffff };
    /* Bit 128 of the message block (bit 24 of the top 26-bit limb). */
    static const word64 hibit[4] = { 0x1000000, 0x1000000,
                                     0x1000000, 0x1000000 };

    __asm__ __volatile__ (
        "vpxor      %%ymm15, %%ymm15, %%ymm15\n\t"
        "cmpb       $1, %[started]\n\t"
        "je     L_begin\n\t"
        "cmpb       $1, %[fin]\n\t"
        "je     L_begin\n\t"
        "# Load the message data\n\t"
        LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
        "vmovdqu    %[mask], %%ymm14\n\t"
        "# Reduce, in place, the message data\n\t"
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq       $64, %[m]\n\t"
        "subq       $64, %[bytes]\n\t"
        "jz     L_store\n\t"
        "jmp        L_load_r4\n\t"
        "\n"
    "L_begin:\n\t"
        "# Load the H values.\n\t"
        LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
        "# Check if there is a power of r to load - otherwise use r^4.\n\t"
        "cmpb       $0, %[fin]\n\t"
        "je     L_load_r4\n\t"
        "\n\t"
        "# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
        LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        "jmp        L_mul_5\n\t"
        "\n"
     "L_load_r4:\n\t"
        "# Load r^4 into all four positions.\n\t"
        LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                %%ymm13, %%ymm14)
        "\n"
    "L_mul_5:\n\t"
        "# Multiply top 4 26-bit values of all four H by 5\n\t"
        MUL5(%%ymm10, %%ymm11, %%ymm12, %%ymm13, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
        "# Store powers of r and multiple of 5 for use in multiply.\n\t"
        "vmovdqa    %%ymm10,    (%[s])\n\t"
        "vmovdqa    %%ymm11,  32(%[s])\n\t"
        "vmovdqa    %%ymm12,  64(%[s])\n\t"
        "vmovdqa    %%ymm13,  96(%[s])\n\t"
        "vmovdqa    %%ymm5 ,    (%[r4])\n\t"
        "vmovdqa    %%ymm6 ,  32(%[r4])\n\t"
        "vmovdqa    %%ymm7 ,  64(%[r4])\n\t"
        "vmovdqa    %%ymm8 ,  96(%[r4])\n\t"
        "vmovdqa    %%ymm9 , 128(%[r4])\n\t"
        "vmovdqu    %[mask], %%ymm14\n\t"
        "\n"
        "# If not finished then loop over data\n\t"
        "cmpb       $0x1, %[fin]\n\t"
        "jne        L_start\n\t"
        "# Do last multiply, reduce, add the four H together and move to\n\t"
        "# 32-bit registers\n\t"
        MUL_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                 (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                 (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                 %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        FINALIZE_H(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                   %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
        MOVE_TO_32(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4,
                   %[t0], %[t1], %[t2], %[t3], %[t4])
        "jmp        L_end\n\t"
        "\n"
    "L_start:\n\t"
        MUL_ADD_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                     (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                     (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                     %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                     %%ymm10, %%ymm11, %%ymm12, %%ymm13,
                     %[hibit], %%ymm15)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq       $64, %[m]\n\t"
        "subq       $64, %[bytes]\n\t"
        "jnz        L_start\n\t"
        "\n"
    "L_store:\n\t"
        "# Store four H values - state\n\t"
        STORE_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%xmm4)
        "\n"
    "L_end:\n\t"
        : [m] "+r" (m), [bytes] "+r" (bytes),
          [t0] "+r" (t0), [t1] "+r" (t1), [t2] "+r" (t2),
          [t3] "+r" (t3), [t4] "+r" (t4)
        : [ctx] "r" (ctx), [h] "r" (ctx->hh),
          [r4] "r" (r4), [s] "r" (s),
          [fin] "m" (ctx->finished), [started] "m" (ctx->started),
          [mask] "m" (mask), [hibit] "m" (hibit)
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
          "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
          "memory"
    );

    if (ctx->finished)
    {
        word64 h0, h1, h2, c;

        /* Convert the five 26-bit limbs (t0..t4) to 44/44/42-bit form. */
        h0 = (((word64)(t1 & 0x3FFFF)) << 26) +  t0;
        h1 = (((word64)(t3 &   0x3FF)) << 34) +
             (((word64) t2           ) <<  8) + (t1 >> 18);
        h2 = (((word64) t4           ) << 16) + (t3 >> 10);

        /* Perform modular reduction (partial, mod 2^130 - 5). */
                     c = (h1 >> 44); h1 &= 0xfffffffffff;
        h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
        h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;

        /* Convert from 42/44/44 to 2/64/64 bits used and store result. */
        ctx->h[0] =  h0        | (h1 << 44);
        ctx->h[1] = (h1 >> 20) | (h2 << 24);
        ctx->h[2] =  h2 >> 40;
    }

    /* Subsequent calls accumulate into the four-lane state (ctx->hh). */
    ctx->started = 1;
}
00858 
/* Multiply two 130-bit numbers in 64-bit registers and reduce.
 * 44 + 44 + 42 = 130 bits
 *
 * r0-r2 are the first operand and the result.
 * a0-a2 are the second operand.
 * NOTE: relies on locals s1, s2, c and 128-bit temporaries d0, d1, d2, d
 * being declared in the enclosing scope. The final reduction is partial
 * (r1 may carry); suitable for chaining further multiplies.
 */
#define MUL_64(r0, r1, r2, a0, a1, a2)                                       \
    s1 = a1 * (5 << 2);                                                      \
    s2 = a2 * (5 << 2);                                                      \
    MUL(d0, r0, a0); MUL(d, r1, s2); ADD(d0, d); MUL(d, r2, s1); ADD(d0, d); \
    MUL(d1, r0, a1); MUL(d, r1, a0); ADD(d1, d); MUL(d, r2, s2); ADD(d1, d); \
    MUL(d2, r0, a2); MUL(d, r1, a1); ADD(d2, d); MUL(d, r2, a0); ADD(d2, d); \
                                                                             \
                  c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff;              \
    ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff;              \
    ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff;              \
    r0  += c * 5; c = (r0 >> 44);  r0 =    r0  & 0xfffffffffff;              \
    r1  += c
00877 
/* Square a 130-bit number held as 44/44/42-bit limbs in 64-bit registers
 * and reduce. r0-r2 are both the operand and the result.
 * Saves multiplies versus MUL_64 by doubling the cross terms.
 * NOTE: relies on locals s2, c and 128-bit temporaries d0, d1, d2, d
 * being declared in the enclosing scope.
 */
#define SQR_64(r0, r1, r2)                                      \
    s2 = r2 * (5 << 2);                                         \
    MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d);   \
    MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d);   \
    MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d);   \
                                                                \
                  c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \
    ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \
    ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \
    r0  += c * 5; c = (r0 >> 44);  r0 =    r0  & 0xfffffffffff; \
    r1  += c
00889 
/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
 *
 * r0-r2 contains the 130-bit number as 44/44/42-bit limbs in 64-bit
 * registers.
 * r is the address of where to store the five 26-of-32-bit limbs
 * (the layout consumed by the AVX2 block code).
 */
#define CONV_64_TO_32(r0, r1, r2, r)                      \
    r[0] = (word32)( r0                    ) & 0x3ffffff; \
    r[1] = (word32)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; \
    r[2] = (word32)( r1 >> 8               ) & 0x3ffffff; \
    r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \
    r[4] = (word32)( r2 >> 16              )
00901 
/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
 *
 * The key value r is unpacked from ctx->r into 44/44/42-bit limbs,
 * repeatedly squared/multiplied, and each power is stored as five
 * 26-bit limbs (ctx->r1..ctx->r4) for use by the AVX2 block code.
 *
 * ctx    Poly1305 context.
 */
static void poly1305_calc_powers(Poly1305* ctx)
{
    word64 r0, r1, r2, t0, t1, c;
    word64 r20, r21, r22;
    word64 r30, r31, r32;
    word64 r40, r41, r42;
    /* s1, s2, d0-d2 and d are used implicitly by MUL_64/SQR_64. */
    word64 s1, s2;
    word128 d0, d1, d2, d;

    /* Unpack r into 44/44/42-bit limbs. */
    t0 = ctx->r[0];
    t1 = ctx->r[1];
    r0 = ( t0                    ) & 0xfffffffffff;
    r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff;
    r2 = ((t1 >> 24)             ) & 0x00fffffffff;

    /* Store r^1 */
    CONV_64_TO_32(r0, r1, r2, ctx->r1);

    /* Calc and store r^2 = r * r */
    r20 = r0; r21 = r1; r22 = r2;
    SQR_64(r20, r21, r22);
    CONV_64_TO_32(r20, r21, r22, ctx->r2);

    /* Calc and store r^3 = r^2 * r */
    r30 = r20; r31 = r21; r32 = r22;
    MUL_64(r30, r31, r32, r0, r1, r2);
    CONV_64_TO_32(r30, r31, r32, ctx->r3);

    /* Calc and store r^4 = (r^2)^2 */
    r40 = r20; r41 = r21; r42 = r22;
    SQR_64(r40, r41, r42);
    CONV_64_TO_32(r40, r41, r42, ctx->r4);

}
00940 
/* Set the key to use when processing data.
 * Initialize the context.
 * Calls AVX set key function as final function calls AVX code.
 *
 * ctx  Poly1305 context.
 * key  The key data (32 bytes - enforced by wc_Poly1305SetKey).
 */
static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key)
{
    poly1305_setkey_avx(ctx, key);

    /* Zero the four-lane (AVX2) hash state: 5 * 32 bytes at ctx->hh. */
    __asm__ __volatile__ (
        "vpxor      %%ymm0, %%ymm0, %%ymm0\n\t"
        "vmovdqu    %%ymm0,    (%[hh])\n\t"
        "vmovdqu    %%ymm0,  32(%[hh])\n\t"
        "vmovdqu    %%ymm0,  64(%[hh])\n\t"
        "vmovdqu    %%ymm0,  96(%[hh])\n\t"
        "vmovdqu    %%ymm0, 128(%[hh])\n\t"
        :
        : [hh] "r" (ctx->hh)
        : "memory", "ymm0"
    );

    /* Reset the buffering/processing state. */
    ctx->leftover = 0;
    ctx->finished = 0;
    ctx->started = 0;
}
00968 
/* Calculate the final result - authentication data.
 * Zeros out the private data in the context.
 * Calls AVX final function to quickly process last blocks.
 *
 * ctx  Poly1305 context.
 * mac  Buffer to hold 16 bytes - authentication data.
 */
static void poly1305_final_avx2(Poly1305* ctx, byte* mac)
{
    int i, j;
    int l = (int)ctx->leftover;

    /* With finished set, the AVX2 code folds the four parallel
     * accumulators into ctx->h while processing the buffer. */
    ctx->finished = 1;
    if (ctx->started)
        poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4);

    /* Process any remaining complete 16-byte blocks with the AVX code. */
    i = l & ~(POLY1305_BLOCK_SIZE - 1);
    if (i > 0)
        poly1305_blocks_avx(ctx, ctx->buffer, i);
    ctx->leftover -= i;
    /* Move the remaining partial block to the front of the buffer for
     * poly1305_final_avx to pad and process. */
    for (j = 0; i < l; i++, j++)
        ctx->buffer[j] = ctx->buffer[i];

    poly1305_final_avx(ctx, mac);

    /* zero out the state */
    __asm__ __volatile__ (
        "vpxor      %%ymm0, %%ymm0, %%ymm0\n\t"
        "vmovdqu    %%ymm0,    (%[hh])\n\t"
        "vmovdqu    %%ymm0,  32(%[hh])\n\t"
        "vmovdqu    %%ymm0,  64(%[hh])\n\t"
        "vmovdqu    %%ymm0,  96(%[hh])\n\t"
        "vmovdqu    %%ymm0, 128(%[hh])\n\t"
        "vmovdqu    %%ymm0,    (%[r1])\n\t"
        "vmovdqu    %%ymm0,    (%[r2])\n\t"
        "vmovdqu    %%ymm0,    (%[r3])\n\t"
        "vmovdqu    %%ymm0,    (%[r4])\n\t"
        :
        : [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2),
          [r3] "r" (ctx->r3), [r4] "r" (ctx->r4)
        : "memory", "ymm0"
    );

    ctx->leftover = 0;
    ctx->finished = 0;
    ctx->started = 0;
}
01016 #endif
01017 
01018 #elif defined(POLY130564)
01019 
01020     static word64 U8TO64(const byte* p)
01021     {
01022         return
01023             (((word64)(p[0] & 0xff)      ) |
01024              ((word64)(p[1] & 0xff) <<  8) |
01025              ((word64)(p[2] & 0xff) << 16) |
01026              ((word64)(p[3] & 0xff) << 24) |
01027              ((word64)(p[4] & 0xff) << 32) |
01028              ((word64)(p[5] & 0xff) << 40) |
01029              ((word64)(p[6] & 0xff) << 48) |
01030              ((word64)(p[7] & 0xff) << 56));
01031     }
01032 
01033     static void U64TO8(byte* p, word64 v) {
01034         p[0] = (v      ) & 0xff;
01035         p[1] = (v >>  8) & 0xff;
01036         p[2] = (v >> 16) & 0xff;
01037         p[3] = (v >> 24) & 0xff;
01038         p[4] = (v >> 32) & 0xff;
01039         p[5] = (v >> 40) & 0xff;
01040         p[6] = (v >> 48) & 0xff;
01041         p[7] = (v >> 56) & 0xff;
01042     }
01043 
01044 #else /* if not 64 bit then use 32 bit */
01045 
01046     static word32 U8TO32(const byte *p)
01047     {
01048         return
01049             (((word32)(p[0] & 0xff)      ) |
01050              ((word32)(p[1] & 0xff) <<  8) |
01051              ((word32)(p[2] & 0xff) << 16) |
01052              ((word32)(p[3] & 0xff) << 24));
01053     }
01054 
01055     static void U32TO8(byte *p, word32 v) {
01056         p[0] = (v      ) & 0xff;
01057         p[1] = (v >>  8) & 0xff;
01058         p[2] = (v >> 16) & 0xff;
01059         p[3] = (v >> 24) & 0xff;
01060     }
01061 #endif
01062 
01063 
01064 static void U32TO64(word32 v, byte* p)
01065 {
01066     XMEMSET(p, 0, 8);
01067     p[0] = (v & 0xFF);
01068     p[1] = (v >>  8) & 0xFF;
01069     p[2] = (v >> 16) & 0xFF;
01070     p[3] = (v >> 24) & 0xFF;
01071 }
01072 
/* Process complete 16-byte blocks of message data.
 *
 * ctx    Poly1305 context.
 * m      Message data.
 * bytes  Number of bytes available - only whole blocks are consumed.
 */
static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
                            size_t bytes)
{
#ifdef USE_INTEL_SPEEDUP
    /* AVX2 is handled in wc_Poly1305Update. */
    poly1305_blocks_avx(ctx, m, bytes);
#elif defined(POLY130564)
    /* Bit 128 is set on every block except the final padded one. */
    const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
    word64 r0,r1,r2;
    word64 s1,s2;
    word64 h0,h1,h2;
    word64 c;
    word128 d0,d1,d2,d;

    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    /* Precompute r1*20, r2*20 for the wrap-around terms (x = 2^130 = 5). */
    s1 = r1 * (5 << 2);
    s2 = r2 * (5 << 2);

    while (bytes >= POLY1305_BLOCK_SIZE) {
        word64 t0,t1;

        /* h += m[i] - split into 44/44/42-bit limbs */
        t0 = U8TO64(&m[0]);
        t1 = U8TO64(&m[8]);

        h0 += (( t0                    ) & 0xfffffffffff);
        h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
        h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;

        /* h *= r */
        MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
        MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
        MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);

        /* (partial) h %= p */
                      c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
        ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
        ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
        h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
        h1  += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;

#else /* if not 64 bit then use 32 bit */
    /* Bit 128 is set on every block except the final padded one. */
    const word32 hibit = (ctx->finished) ? 0 : (1 << 24); /* 1 << 128 */
    word32 r0,r1,r2,r3,r4;
    word32 s1,s2,s3,s4;
    word32 h0,h1,h2,h3,h4;
    word64 d0,d1,d2,d3,d4;
    word32 c;


    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];
    r3 = ctx->r[3];
    r4 = ctx->r[4];

    /* Precompute r*5 for the wrap-around terms (2^130 = 5 mod p). */
    s1 = r1 * 5;
    s2 = r2 * 5;
    s3 = r3 * 5;
    s4 = r4 * 5;

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

    while (bytes >= POLY1305_BLOCK_SIZE) {
        /* h += m[i] - split into five 26-bit limbs */
        h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
        h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
        h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
        h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
        h4 += (U8TO32(m+12) >> 8) | hibit;

        /* h *= r */
        d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
             ((word64)h3 * s2) + ((word64)h4 * s1);
        d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
             ((word64)h3 * s3) + ((word64)h4 * s2);
        d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
             ((word64)h3 * s4) + ((word64)h4 * s3);
        d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
             ((word64)h3 * r0) + ((word64)h4 * s4);
        d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
             ((word64)h3 * r1) + ((word64)h4 * r0);

        /* (partial) h %= p */
                      c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
        d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
        d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
        d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
        d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
        h0 += c * 5;  c =  (h0 >> 26); h0 =                h0 & 0x3ffffff;
        h1 += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;
    ctx->h[3] = h3;
    ctx->h[4] = h4;

#endif /* end of 64 bit cpu blocks or 32 bit cpu */
}
01196 
/* Process a single 16-byte block of message data.
 *
 * ctx  Poly1305 context.
 * m    One block (16 bytes) of message data.
 */
static void poly1305_block(Poly1305* ctx, const unsigned char *m)
{
#ifdef USE_INTEL_SPEEDUP
    /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
    poly1305_block_avx(ctx, m);
#else
    poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
#endif
}
01206 
01207 
01208 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
01209 {
01210 #if defined(POLY130564)
01211     word64 t0,t1;
01212 #endif
01213 
01214     if (key == NULL)
01215         return BAD_FUNC_ARG;
01216 
01217 #ifdef CHACHA_AEAD_TEST
01218     word32 k;
01219     printf("Poly key used:\n");
01220     for (k = 0; k < keySz; k++) {
01221         printf("%02x", key[k]);
01222         if ((k+1) % 8 == 0)
01223             printf("\n");
01224     }
01225     printf("\n");
01226 #endif
01227 
01228     if (keySz != 32 || ctx == NULL)
01229         return BAD_FUNC_ARG;
01230 
01231 #ifdef USE_INTEL_SPEEDUP
01232     if (!cpu_flags_set) {
01233         intel_flags = cpuid_get_flags();
01234         cpu_flags_set = 1;
01235     }
01236     #ifdef HAVE_INTEL_AVX2
01237     if (IS_INTEL_AVX2(intel_flags))
01238         poly1305_setkey_avx2(ctx, key);
01239     else
01240     #endif
01241         poly1305_setkey_avx(ctx, key);
01242 #elif defined(POLY130564)
01243 
01244     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
01245     t0 = U8TO64(key + 0);
01246     t1 = U8TO64(key + 8);
01247 
01248     ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
01249     ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
01250     ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;
01251 
01252     /* h (accumulator) = 0 */
01253     ctx->h[0] = 0;
01254     ctx->h[1] = 0;
01255     ctx->h[2] = 0;
01256 
01257     /* save pad for later */
01258     ctx->pad[0] = U8TO64(key + 16);
01259     ctx->pad[1] = U8TO64(key + 24);
01260 
01261     ctx->leftover = 0;
01262     ctx->finished = 0;
01263 
01264 #else /* if not 64 bit then use 32 bit */
01265 
01266     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
01267     ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
01268     ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
01269     ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
01270     ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
01271     ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;
01272 
01273     /* h = 0 */
01274     ctx->h[0] = 0;
01275     ctx->h[1] = 0;
01276     ctx->h[2] = 0;
01277     ctx->h[3] = 0;
01278     ctx->h[4] = 0;
01279 
01280     /* save pad for later */
01281     ctx->pad[0] = U8TO32(key + 16);
01282     ctx->pad[1] = U8TO32(key + 20);
01283     ctx->pad[2] = U8TO32(key + 24);
01284     ctx->pad[3] = U8TO32(key + 28);
01285 
01286     ctx->leftover = 0;
01287     ctx->finished = 0;
01288 
01289 #endif
01290 
01291     return 0;
01292 }
01293 
01294 
01295 int wc_Poly1305Final(Poly1305* ctx, byte* mac)
01296 {
01297 #ifdef USE_INTEL_SPEEDUP
01298 #elif defined(POLY130564)
01299 
01300     word64 h0,h1,h2,c;
01301     word64 g0,g1,g2;
01302     word64 t0,t1;
01303 
01304 #else
01305 
01306     word32 h0,h1,h2,h3,h4,c;
01307     word32 g0,g1,g2,g3,g4;
01308     word64 f;
01309     word32 mask;
01310 
01311 #endif
01312 
01313     if (ctx == NULL)
01314         return BAD_FUNC_ARG;
01315 
01316 #ifdef USE_INTEL_SPEEDUP
01317     #ifdef HAVE_INTEL_AVX2
01318     if (IS_INTEL_AVX2(intel_flags))
01319         poly1305_final_avx2(ctx, mac);
01320     else
01321     #endif
01322         poly1305_final_avx(ctx, mac);
01323 #elif defined(POLY130564)
01324 
01325     /* process the remaining block */
01326     if (ctx->leftover) {
01327         size_t i = ctx->leftover;
01328         ctx->buffer[i] = 1;
01329         for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
01330             ctx->buffer[i] = 0;
01331         ctx->finished = 1;
01332         poly1305_block(ctx, ctx->buffer);
01333     }
01334 
01335     /* fully carry h */
01336     h0 = ctx->h[0];
01337     h1 = ctx->h[1];
01338     h2 = ctx->h[2];
01339 
01340                  c = (h1 >> 44); h1 &= 0xfffffffffff;
01341     h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
01342     h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
01343     h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
01344     h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
01345     h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
01346     h1 += c;
01347 
01348     /* compute h + -p */
01349     g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
01350     g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
01351     g2 = h2 + c - ((word64)1 << 42);
01352 
01353     /* select h if h < p, or h + -p if h >= p */
01354     c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
01355     g0 &= c;
01356     g1 &= c;
01357     g2 &= c;
01358     c = ~c;
01359     h0 = (h0 & c) | g0;
01360     h1 = (h1 & c) | g1;
01361     h2 = (h2 & c) | g2;
01362 
01363     /* h = (h + pad) */
01364     t0 = ctx->pad[0];
01365     t1 = ctx->pad[1];
01366 
01367     h0 += (( t0                    ) & 0xfffffffffff)    ;
01368     c = (h0 >> 44); h0 &= 0xfffffffffff;
01369     h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
01370     c = (h1 >> 44); h1 &= 0xfffffffffff;
01371     h2 += (((t1 >> 24)             ) & 0x3ffffffffff) + c;
01372     h2 &= 0x3ffffffffff;
01373 
01374     /* mac = h % (2^128) */
01375     h0 = ((h0      ) | (h1 << 44));
01376     h1 = ((h1 >> 20) | (h2 << 24));
01377 
01378     U64TO8(mac + 0, h0);
01379     U64TO8(mac + 8, h1);
01380 
01381     /* zero out the state */
01382     ctx->h[0] = 0;
01383     ctx->h[1] = 0;
01384     ctx->h[2] = 0;
01385     ctx->r[0] = 0;
01386     ctx->r[1] = 0;
01387     ctx->r[2] = 0;
01388     ctx->pad[0] = 0;
01389     ctx->pad[1] = 0;
01390 
01391 #else /* if not 64 bit then use 32 bit */
01392 
01393     /* process the remaining block */
01394     if (ctx->leftover) {
01395         size_t i = ctx->leftover;
01396         ctx->buffer[i++] = 1;
01397         for (; i < POLY1305_BLOCK_SIZE; i++)
01398             ctx->buffer[i] = 0;
01399         ctx->finished = 1;
01400         poly1305_block(ctx, ctx->buffer);
01401     }
01402 
01403     /* fully carry h */
01404     h0 = ctx->h[0];
01405     h1 = ctx->h[1];
01406     h2 = ctx->h[2];
01407     h3 = ctx->h[3];
01408     h4 = ctx->h[4];
01409 
01410                  c = h1 >> 26; h1 = h1 & 0x3ffffff;
01411     h2 +=     c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
01412     h3 +=     c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
01413     h4 +=     c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
01414     h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
01415     h1 +=     c;
01416 
01417     /* compute h + -p */
01418     g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
01419     g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
01420     g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
01421     g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
01422     g4 = h4 + c - (1 << 26);
01423 
01424     /* select h if h < p, or h + -p if h >= p */
01425     mask = (g4 >> ((sizeof(word32) * 8) - 1)) - 1;
01426     g0 &= mask;
01427     g1 &= mask;
01428     g2 &= mask;
01429     g3 &= mask;
01430     g4 &= mask;
01431     mask = ~mask;
01432     h0 = (h0 & mask) | g0;
01433     h1 = (h1 & mask) | g1;
01434     h2 = (h2 & mask) | g2;
01435     h3 = (h3 & mask) | g3;
01436     h4 = (h4 & mask) | g4;
01437 
01438     /* h = h % (2^128) */
01439     h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
01440     h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
01441     h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
01442     h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;
01443 
01444     /* mac = (h + pad) % (2^128) */
01445     f = (word64)h0 + ctx->pad[0]            ; h0 = (word32)f;
01446     f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
01447     f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
01448     f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;
01449 
01450     U32TO8(mac + 0, h0);
01451     U32TO8(mac + 4, h1);
01452     U32TO8(mac + 8, h2);
01453     U32TO8(mac + 12, h3);
01454 
01455     /* zero out the state */
01456     ctx->h[0] = 0;
01457     ctx->h[1] = 0;
01458     ctx->h[2] = 0;
01459     ctx->h[3] = 0;
01460     ctx->h[4] = 0;
01461     ctx->r[0] = 0;
01462     ctx->r[1] = 0;
01463     ctx->r[2] = 0;
01464     ctx->r[3] = 0;
01465     ctx->r[4] = 0;
01466     ctx->pad[0] = 0;
01467     ctx->pad[1] = 0;
01468     ctx->pad[2] = 0;
01469     ctx->pad[3] = 0;
01470 
01471 #endif
01472 
01473     return 0;
01474 }
01475 
01476 
01477 int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
01478 {
01479     size_t i;
01480 
01481 #ifdef CHACHA_AEAD_TEST
01482     word32 k;
01483     printf("Raw input to poly:\n");
01484     for (k = 0; k < bytes; k++) {
01485         printf("%02x", m[k]);
01486         if ((k+1) % 16 == 0)
01487             printf("\n");
01488     }
01489     printf("\n");
01490 #endif
01491 
01492     if (ctx == NULL)
01493         return BAD_FUNC_ARG;
01494 
01495 #ifdef USE_INTEL_SPEEDUP
01496     #ifdef HAVE_INTEL_AVX2
01497     if (IS_INTEL_AVX2(intel_flags)) {
01498         /* handle leftover */
01499         if (ctx->leftover) {
01500             size_t want = sizeof(ctx->buffer) - ctx->leftover;
01501             if (want > bytes)
01502                 want = bytes;
01503 
01504             for (i = 0; i < want; i++)
01505                 ctx->buffer[ctx->leftover + i] = m[i];
01506             bytes -= (word32)want;
01507             m += want;
01508             ctx->leftover += want;
01509             if (ctx->leftover < sizeof(ctx->buffer))
01510                 return 0;
01511 
01512             if (!ctx->started)
01513                 poly1305_calc_powers(ctx);
01514             poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
01515             ctx->leftover = 0;
01516         }
01517 
01518         /* process full blocks */
01519         if (bytes >= sizeof(ctx->buffer)) {
01520             size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
01521 
01522             if (!ctx->started)
01523                 poly1305_calc_powers(ctx);
01524             poly1305_blocks_avx2(ctx, m, want);
01525             m += want;
01526             bytes -= (word32)want;
01527         }
01528 
01529         /* store leftover */
01530         if (bytes) {
01531             for (i = 0; i < bytes; i++)
01532                 ctx->buffer[ctx->leftover + i] = m[i];
01533             ctx->leftover += bytes;
01534         }
01535     }
01536     else
01537     #endif
01538 #endif
01539     {
01540         /* handle leftover */
01541         if (ctx->leftover) {
01542             size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
01543             if (want > bytes)
01544                 want = bytes;
01545             for (i = 0; i < want; i++)
01546                 ctx->buffer[ctx->leftover + i] = m[i];
01547             bytes -= (word32)want;
01548             m += want;
01549             ctx->leftover += want;
01550             if (ctx->leftover < POLY1305_BLOCK_SIZE)
01551                 return 0;
01552             poly1305_block(ctx, ctx->buffer);
01553             ctx->leftover = 0;
01554         }
01555 
01556         /* process full blocks */
01557         if (bytes >= POLY1305_BLOCK_SIZE) {
01558             size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
01559             poly1305_blocks(ctx, m, want);
01560             m += want;
01561             bytes -= (word32)want;
01562         }
01563 
01564         /* store leftover */
01565         if (bytes) {
01566             for (i = 0; i < bytes; i++)
01567                 ctx->buffer[ctx->leftover + i] = m[i];
01568             ctx->leftover += bytes;
01569         }
01570     }
01571 
01572     return 0;
01573 }
01574 
01575 
01576 /*  Takes in an initialized Poly1305 struct that has a key loaded and creates
01577     a MAC (tag) using recent TLS AEAD padding scheme.
01578     ctx        : Initialized Poly1305 struct to use
01579     additional : Additional data to use
01580     addSz      : Size of additional buffer
01581     input      : Input buffer to create tag from
01582     sz         : Size of input buffer
01583     tag        : Buffer to hold created tag
01584     tagSz      : Size of input tag buffer (must be at least
01585                  WC_POLY1305_MAC_SZ(16))
01586  */
01587 int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz,
01588                     byte* input, word32 sz, byte* tag, word32 tagSz)
01589 {
01590     int ret;
01591     byte padding[WC_POLY1305_PAD_SZ - 1];
01592     word32 paddingLen;
01593     byte little64[16];
01594 
01595     XMEMSET(padding, 0, sizeof(padding));
01596 
01597     /* sanity check on arguments */
01598     if (ctx == NULL || input == NULL || tag == NULL ||
01599                                                    tagSz < WC_POLY1305_MAC_SZ) {
01600         return BAD_FUNC_ARG;
01601     }
01602 
01603     /* additional allowed to be 0 */
01604     if (addSz > 0) {
01605         if (additional == NULL)
01606             return BAD_FUNC_ARG;
01607 
01608         /* additional data plus padding */
01609         if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
01610             return ret;
01611         }
01612         paddingLen = -((int)addSz) & (WC_POLY1305_PAD_SZ - 1);
01613         if (paddingLen) {
01614             if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) {
01615                 return ret;
01616             }
01617         }
01618     }
01619 
01620     /* input plus padding */
01621     if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
01622         return ret;
01623     }
01624     paddingLen = -((int)sz) & (WC_POLY1305_PAD_SZ - 1);
01625     if (paddingLen) {
01626         if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) {
01627             return ret;
01628         }
01629     }
01630 
01631     /* size of additional data and input as little endian 64 bit types */
01632     U32TO64(addSz, little64);
01633     U32TO64(sz, little64 + 8);
01634     ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
01635     if (ret)
01636     {
01637         return ret;
01638     }
01639 
01640     /* Finalize the auth tag */
01641     ret = wc_Poly1305Final(ctx, tag);
01642 
01643     return ret;
01644 
01645 }
01646 #endif /* HAVE_POLY1305 */
01647 
01648