wolfSSL SSL/TLS library, supporting protocol versions up to TLS 1.3

Dependents:   CyaSSL-Twitter-OAuth4Tw Example-client-tls-cert TwitterReader TweetTest ... more

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers poly1305.c Source File

poly1305.c

00001 /* poly1305.c
00002  *
00003  * Copyright (C) 2006-2020 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 /*
00023  * Based off the public domain implementations by Andrew Moon
00024  * and Daniel J. Bernstein
00025  */
00026 
00027 
00028 #ifdef HAVE_CONFIG_H
00029     #include <config.h>
00030 #endif
00031 
00032 #include <wolfssl/wolfcrypt/settings.h>
00033 
00034 #ifdef HAVE_POLY1305
00035 #include <wolfssl/wolfcrypt/poly1305.h >
00036 #include <wolfssl/wolfcrypt/error-crypt.h >
00037 #include <wolfssl/wolfcrypt/logging.h >
00038 #include <wolfssl/wolfcrypt/cpuid.h>
00039 #ifdef NO_INLINE
00040     #include <wolfssl/wolfcrypt/misc.h>
00041 #else
00042     #define WOLFSSL_MISC_INCLUDED
00043     #include <wolfcrypt/src/misc.c>
00044 #endif
00045 #ifdef CHACHA_AEAD_TEST
00046     #include <stdio.h>
00047 #endif
00048 
00049 #ifdef _MSC_VER
00050     /* 4127 warning constant while(1)  */
00051     #pragma warning(disable: 4127)
00052 #endif
00053 
00054 #ifdef USE_INTEL_SPEEDUP
00055     #include <emmintrin.h>
00056     #include <immintrin.h>
00057 
00058     #if defined(__GNUC__) && ((__GNUC__ < 4) || \
00059                               (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
00060         #undef  NO_AVX2_SUPPORT
00061         #define NO_AVX2_SUPPORT
00062     #endif
00063     #if defined(__clang__) && ((__clang_major__ < 3) || \
00064                                (__clang_major__ == 3 && __clang_minor__ <= 5))
00065         #define NO_AVX2_SUPPORT
00066     #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
00067         #undef NO_AVX2_SUPPORT
00068     #endif
00069 
00070     #define HAVE_INTEL_AVX1
00071     #ifndef NO_AVX2_SUPPORT
00072         #define HAVE_INTEL_AVX2
00073     #endif
00074 #endif
00075 
#ifdef USE_INTEL_SPEEDUP
/* Non-zero once intel_flags has been filled in (done by wc_Poly1305SetKey
 * the first time it runs). */
static word32 cpu_flags_set = 0;
/* Cached result of cpuid_get_flags(); tested with IS_INTEL_AVX2() to choose
 * between the AVX and AVX2 routines. */
static word32 intel_flags = 0;
#endif
00080 
#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
    /* POLY1305_NOINLINE: compiler-specific "do not inline" marker; expands to
     * nothing when the compiler is neither MSVC nor GCC-compatible. */
    #if defined(_MSC_VER)
        #define POLY1305_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define POLY1305_NOINLINE __attribute__((noinline))
    #else
        #define POLY1305_NOINLINE
    #endif

    /* 128-bit arithmetic helpers used by the 64-bit block function:
     *   MUL(out, x, y)   out = x * y            (64x64 -> 128 bit product)
     *   ADD(out, in)     out += in              (128-bit + 128-bit)
     *   ADDLO(out, in)   out += in              (128-bit + 64-bit)
     *   SHR(in, shift)   low 64 bits of (in >> shift)
     *   LO(in)           low 64 bits of in
     * Every macro argument is parenthesized and the multi-statement forms are
     * wrapped in do { } while (0) so they behave as a single statement. */
    #if defined(_MSC_VER)
        #include <intrin.h>

        /* MSVC has no native 128-bit integer; emulate it with two halves. */
        typedef struct word128 {
            word64 lo;
            word64 hi;
        } word128;

        #define MUL(out, x, y) \
            ((out).lo = _umul128((x), (y), &(out).hi))
        #define ADD(out, in) \
            do { \
                word64 poly1305_t_ = (out).lo; \
                (out).lo += (in).lo; \
                /* carry out of the low half is (new lo < old lo) */ \
                (out).hi += ((out).lo < poly1305_t_) + (in).hi; \
            } while (0)
        #define ADDLO(out, in) \
            do { \
                word64 poly1305_t_ = (out).lo; \
                (out).lo += (in); \
                (out).hi += ((out).lo < poly1305_t_); \
            } while (0)
        #define SHR(in, shift) (__shiftright128((in).lo, (in).hi, (shift)))
        #define LO(in) ((in).lo)

    #elif defined(__GNUC__)
        #if defined(__SIZEOF_INT128__)
            typedef unsigned __int128 word128;
        #else
            typedef unsigned word128 __attribute__((mode(TI)));
        #endif

        #define MUL(out, x, y) ((out) = ((word128)(x) * (y)))
        #define ADD(out, in) ((out) += (in))
        #define ADDLO(out, in) ((out) += (in))
        #define SHR(in, shift) ((word64)((in) >> (shift)))
        #define LO(in) ((word64)(in))
    #endif
#endif
00120 
00121 #ifdef USE_INTEL_SPEEDUP
#ifdef __cplusplus
    extern "C" {
#endif

#ifdef HAVE_INTEL_AVX1
/* AVX: absorb exactly one 16-byte block.
 *
 * ctx  Poly1305 context.
 * m    Pointer to the block of message data.
 */
extern void poly1305_block_avx(Poly1305* ctx, const unsigned char *m);
/* AVX: absorb a run of whole blocks (n * 16 bytes).
 *
 * ctx    Poly1305 context.
 * m      Pointer to the message blocks.
 * bytes  Number of bytes to process.
 */
extern void poly1305_blocks_avx(Poly1305* ctx, const unsigned char* m,
                                size_t bytes);
/* AVX: load the key and initialize the context.
 *
 * ctx  Poly1305 context.
 * key  Key data (documented upstream as 16 bytes; wc_Poly1305SetKey hands
 *      over its 32-byte key buffer).
 */
extern void poly1305_setkey_avx(Poly1305* ctx, const byte* key);
/* AVX: produce the authentication data and zero the private data held in
 * the context.
 *
 * ctx  Poly1305 context.
 * mac  Output buffer, 16 bytes.
 */
extern void poly1305_final_avx(Poly1305* ctx, byte* mac);
#endif

#ifdef HAVE_INTEL_AVX2
/* AVX2: absorb a run of whole blocks (n * 16 bytes).
 *
 * ctx    Poly1305 context.
 * m      Pointer to the message blocks.
 * bytes  Number of bytes to process.
 */
extern void poly1305_blocks_avx2(Poly1305* ctx, const unsigned char* m,
                                 size_t bytes);
/* AVX2: compute R^1, R^2, R^3 and R^4 and keep them in the context.
 *
 * ctx    Poly1305 context.
 */
extern void poly1305_calc_powers_avx2(Poly1305* ctx);
/* AVX2: load the key and initialize the context.  Delegates to the AVX
 * set-key routine because finalization runs AVX code.
 *
 * ctx  Poly1305 context.
 * key  Key data (documented upstream as 16 bytes; wc_Poly1305SetKey hands
 *      over its 32-byte key buffer).
 */
extern void poly1305_setkey_avx2(Poly1305* ctx, const byte* key);
/* AVX2: produce the authentication data and zero the private data held in
 * the context.  Uses the AVX final routine to finish the last blocks.
 *
 * ctx  Poly1305 context.
 * mac  Output buffer, 16 bytes of authentication data.
 */
extern void poly1305_final_avx2(Poly1305* ctx, byte* mac);
#endif

#ifdef __cplusplus
    }  /* extern "C" */
#endif
00192 
00193 #elif defined(POLY130564)
00194 #ifndef WOLFSSL_ARMASM
00195     static word64 U8TO64(const byte* p)
00196     {
00197         return
00198             (((word64)(p[0] & 0xff)      ) |
00199              ((word64)(p[1] & 0xff) <<  8) |
00200              ((word64)(p[2] & 0xff) << 16) |
00201              ((word64)(p[3] & 0xff) << 24) |
00202              ((word64)(p[4] & 0xff) << 32) |
00203              ((word64)(p[5] & 0xff) << 40) |
00204              ((word64)(p[6] & 0xff) << 48) |
00205              ((word64)(p[7] & 0xff) << 56));
00206     }
00207 
00208     static void U64TO8(byte* p, word64 v) {
00209         p[0] = (v      ) & 0xff;
00210         p[1] = (v >>  8) & 0xff;
00211         p[2] = (v >> 16) & 0xff;
00212         p[3] = (v >> 24) & 0xff;
00213         p[4] = (v >> 32) & 0xff;
00214         p[5] = (v >> 40) & 0xff;
00215         p[6] = (v >> 48) & 0xff;
00216         p[7] = (v >> 56) & 0xff;
00217     }
00218 #endif/* WOLFSSL_ARMASM */
00219 #else /* if not 64 bit then use 32 bit */
00220 
00221     static word32 U8TO32(const byte *p)
00222     {
00223         return
00224             (((word32)(p[0] & 0xff)      ) |
00225              ((word32)(p[1] & 0xff) <<  8) |
00226              ((word32)(p[2] & 0xff) << 16) |
00227              ((word32)(p[3] & 0xff) << 24));
00228     }
00229 
00230     static void U32TO8(byte *p, word32 v) {
00231         p[0] = (v      ) & 0xff;
00232         p[1] = (v >>  8) & 0xff;
00233         p[2] = (v >> 16) & 0xff;
00234         p[3] = (v >> 24) & 0xff;
00235     }
00236 #endif
00237 
00238 /* convert 32-bit unsigned to little endian 64 bit type as byte array */
00239 static WC_INLINE void u32tole64(const word32 inLe32, byte outLe64[8])
00240 {
00241 #ifndef WOLFSSL_X86_64_BUILD
00242     outLe64[0] = (byte)(inLe32  & 0x000000FF);
00243     outLe64[1] = (byte)((inLe32 & 0x0000FF00) >> 8);
00244     outLe64[2] = (byte)((inLe32 & 0x00FF0000) >> 16);
00245     outLe64[3] = (byte)((inLe32 & 0xFF000000) >> 24);
00246     outLe64[4] = 0;
00247     outLe64[5] = 0;
00248     outLe64[6] = 0;
00249     outLe64[7] = 0;
00250 #else
00251     *(word64*)outLe64 = inLe32;
00252 #endif
00253 }
00254 
00255 
00256 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
00257 void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
00258                      size_t bytes)
00259 {
00260 #ifdef USE_INTEL_SPEEDUP
00261     /* AVX2 is handled in wc_Poly1305Update. */
00262     poly1305_blocks_avx(ctx, m, bytes);
00263 #elif defined(POLY130564)
00264     const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
00265     word64 r0,r1,r2;
00266     word64 s1,s2;
00267     word64 h0,h1,h2;
00268     word64 c;
00269     word128 d0,d1,d2,d;
00270 
00271     r0 = ctx->r[0];
00272     r1 = ctx->r[1];
00273     r2 = ctx->r[2];
00274 
00275     h0 = ctx->h[0];
00276     h1 = ctx->h[1];
00277     h2 = ctx->h[2];
00278 
00279     s1 = r1 * (5 << 2);
00280     s2 = r2 * (5 << 2);
00281 
00282     while (bytes >= POLY1305_BLOCK_SIZE) {
00283         word64 t0,t1;
00284 
00285         /* h += m[i] */
00286         t0 = U8TO64(&m[0]);
00287         t1 = U8TO64(&m[8]);
00288 
00289         h0 += (( t0                    ) & 0xfffffffffff);
00290         h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
00291         h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;
00292 
00293         /* h *= r */
00294         MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
00295         MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
00296         MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);
00297 
00298         /* (partial) h %= p */
00299                       c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
00300         ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
00301         ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
00302         h0  += c * 5; c = (h0 >> 44);  h0 =    h0  & 0xfffffffffff;
00303         h1  += c;
00304 
00305         m += POLY1305_BLOCK_SIZE;
00306         bytes -= POLY1305_BLOCK_SIZE;
00307     }
00308 
00309     ctx->h[0] = h0;
00310     ctx->h[1] = h1;
00311     ctx->h[2] = h2;
00312 
00313 #else /* if not 64 bit then use 32 bit */
00314     const word32 hibit = (ctx->finished) ? 0 : ((word32)1 << 24); /* 1 << 128 */
00315     word32 r0,r1,r2,r3,r4;
00316     word32 s1,s2,s3,s4;
00317     word32 h0,h1,h2,h3,h4;
00318     word64 d0,d1,d2,d3,d4;
00319     word32 c;
00320 
00321 
00322     r0 = ctx->r[0];
00323     r1 = ctx->r[1];
00324     r2 = ctx->r[2];
00325     r3 = ctx->r[3];
00326     r4 = ctx->r[4];
00327 
00328     s1 = r1 * 5;
00329     s2 = r2 * 5;
00330     s3 = r3 * 5;
00331     s4 = r4 * 5;
00332 
00333     h0 = ctx->h[0];
00334     h1 = ctx->h[1];
00335     h2 = ctx->h[2];
00336     h3 = ctx->h[3];
00337     h4 = ctx->h[4];
00338 
00339     while (bytes >= POLY1305_BLOCK_SIZE) {
00340         /* h += m[i] */
00341         h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
00342         h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
00343         h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
00344         h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
00345         h4 += (U8TO32(m+12) >> 8) | hibit;
00346 
00347         /* h *= r */
00348         d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
00349              ((word64)h3 * s2) + ((word64)h4 * s1);
00350         d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
00351              ((word64)h3 * s3) + ((word64)h4 * s2);
00352         d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
00353              ((word64)h3 * s4) + ((word64)h4 * s3);
00354         d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
00355              ((word64)h3 * r0) + ((word64)h4 * s4);
00356         d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
00357              ((word64)h3 * r1) + ((word64)h4 * r0);
00358 
00359         /* (partial) h %= p */
00360                       c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
00361         d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
00362         d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
00363         d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
00364         d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
00365         h0 += c * 5;  c =  (h0 >> 26); h0 =                h0 & 0x3ffffff;
00366         h1 += c;
00367 
00368         m += POLY1305_BLOCK_SIZE;
00369         bytes -= POLY1305_BLOCK_SIZE;
00370     }
00371 
00372     ctx->h[0] = h0;
00373     ctx->h[1] = h1;
00374     ctx->h[2] = h2;
00375     ctx->h[3] = h3;
00376     ctx->h[4] = h4;
00377 
00378 #endif /* end of 64 bit cpu blocks or 32 bit cpu */
00379 }
00380 
00381 void poly1305_block(Poly1305* ctx, const unsigned char *m)
00382 {
00383 #ifdef USE_INTEL_SPEEDUP
00384     /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
00385     poly1305_block_avx(ctx, m);
00386 #else
00387     poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE);
00388 #endif
00389 }
00390 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
00391 
00392 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
00393 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
00394 {
00395 #if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
00396     word64 t0,t1;
00397 #endif
00398 
00399     if (key == NULL)
00400         return BAD_FUNC_ARG;
00401 
00402 #ifdef CHACHA_AEAD_TEST
00403     word32 k;
00404     printf("Poly key used:\n");
00405     for (k = 0; k < keySz; k++) {
00406         printf("%02x", key[k]);
00407         if ((k+1) % 8 == 0)
00408             printf("\n");
00409     }
00410     printf("\n");
00411 #endif
00412 
00413     if (keySz != 32 || ctx == NULL)
00414         return BAD_FUNC_ARG;
00415 
00416 #ifdef USE_INTEL_SPEEDUP
00417     if (!cpu_flags_set) {
00418         intel_flags = cpuid_get_flags();
00419         cpu_flags_set = 1;
00420     }
00421     #ifdef HAVE_INTEL_AVX2
00422     if (IS_INTEL_AVX2(intel_flags))
00423         poly1305_setkey_avx2(ctx, key);
00424     else
00425     #endif
00426         poly1305_setkey_avx(ctx, key);
00427 #elif defined(POLY130564)
00428 
00429     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
00430     t0 = U8TO64(key + 0);
00431     t1 = U8TO64(key + 8);
00432 
00433     ctx->r[0] = ( t0                    ) & 0xffc0fffffff;
00434     ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
00435     ctx->r[2] = ((t1 >> 24)             ) & 0x00ffffffc0f;
00436 
00437     /* h (accumulator) = 0 */
00438     ctx->h[0] = 0;
00439     ctx->h[1] = 0;
00440     ctx->h[2] = 0;
00441 
00442     /* save pad for later */
00443     ctx->pad[0] = U8TO64(key + 16);
00444     ctx->pad[1] = U8TO64(key + 24);
00445 
00446     ctx->leftover = 0;
00447     ctx->finished = 0;
00448 
00449 #else /* if not 64 bit then use 32 bit */
00450 
00451     /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
00452     ctx->r[0] = (U8TO32(key +  0)     ) & 0x3ffffff;
00453     ctx->r[1] = (U8TO32(key +  3) >> 2) & 0x3ffff03;
00454     ctx->r[2] = (U8TO32(key +  6) >> 4) & 0x3ffc0ff;
00455     ctx->r[3] = (U8TO32(key +  9) >> 6) & 0x3f03fff;
00456     ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff;
00457 
00458     /* h = 0 */
00459     ctx->h[0] = 0;
00460     ctx->h[1] = 0;
00461     ctx->h[2] = 0;
00462     ctx->h[3] = 0;
00463     ctx->h[4] = 0;
00464 
00465     /* save pad for later */
00466     ctx->pad[0] = U8TO32(key + 16);
00467     ctx->pad[1] = U8TO32(key + 20);
00468     ctx->pad[2] = U8TO32(key + 24);
00469     ctx->pad[3] = U8TO32(key + 28);
00470 
00471     ctx->leftover = 0;
00472     ctx->finished = 0;
00473 
00474 #endif
00475 
00476     return 0;
00477 }
00478 
00479 int wc_Poly1305Final(Poly1305* ctx, byte* mac)
00480 {
00481 #ifdef USE_INTEL_SPEEDUP
00482 #elif defined(POLY130564)
00483 
00484     word64 h0,h1,h2,c;
00485     word64 g0,g1,g2;
00486     word64 t0,t1;
00487 
00488 #else
00489 
00490     word32 h0,h1,h2,h3,h4,c;
00491     word32 g0,g1,g2,g3,g4;
00492     word64 f;
00493     word32 mask;
00494 
00495 #endif
00496 
00497     if (ctx == NULL)
00498         return BAD_FUNC_ARG;
00499 
00500 #ifdef USE_INTEL_SPEEDUP
00501     #ifdef HAVE_INTEL_AVX2
00502     if (IS_INTEL_AVX2(intel_flags))
00503         poly1305_final_avx2(ctx, mac);
00504     else
00505     #endif
00506         poly1305_final_avx(ctx, mac);
00507 #elif defined(POLY130564)
00508 
00509     /* process the remaining block */
00510     if (ctx->leftover) {
00511         size_t i = ctx->leftover;
00512         ctx->buffer[i] = 1;
00513         for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++)
00514             ctx->buffer[i] = 0;
00515         ctx->finished = 1;
00516         poly1305_block(ctx, ctx->buffer);
00517     }
00518 
00519     /* fully carry h */
00520     h0 = ctx->h[0];
00521     h1 = ctx->h[1];
00522     h2 = ctx->h[2];
00523 
00524                  c = (h1 >> 44); h1 &= 0xfffffffffff;
00525     h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
00526     h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
00527     h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
00528     h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
00529     h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
00530     h1 += c;
00531 
00532     /* compute h + -p */
00533     g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff;
00534     g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff;
00535     g2 = h2 + c - ((word64)1 << 42);
00536 
00537     /* select h if h < p, or h + -p if h >= p */
00538     c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1;
00539     g0 &= c;
00540     g1 &= c;
00541     g2 &= c;
00542     c = ~c;
00543     h0 = (h0 & c) | g0;
00544     h1 = (h1 & c) | g1;
00545     h2 = (h2 & c) | g2;
00546 
00547     /* h = (h + pad) */
00548     t0 = ctx->pad[0];
00549     t1 = ctx->pad[1];
00550 
00551     h0 += (( t0                    ) & 0xfffffffffff)    ;
00552     c = (h0 >> 44); h0 &= 0xfffffffffff;
00553     h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
00554     c = (h1 >> 44); h1 &= 0xfffffffffff;
00555     h2 += (((t1 >> 24)             ) & 0x3ffffffffff) + c;
00556     h2 &= 0x3ffffffffff;
00557 
00558     /* mac = h % (2^128) */
00559     h0 = ((h0      ) | (h1 << 44));
00560     h1 = ((h1 >> 20) | (h2 << 24));
00561 
00562     U64TO8(mac + 0, h0);
00563     U64TO8(mac + 8, h1);
00564 
00565     /* zero out the state */
00566     ctx->h[0] = 0;
00567     ctx->h[1] = 0;
00568     ctx->h[2] = 0;
00569     ctx->r[0] = 0;
00570     ctx->r[1] = 0;
00571     ctx->r[2] = 0;
00572     ctx->pad[0] = 0;
00573     ctx->pad[1] = 0;
00574 
00575 #else /* if not 64 bit then use 32 bit */
00576 
00577     /* process the remaining block */
00578     if (ctx->leftover) {
00579         size_t i = ctx->leftover;
00580         ctx->buffer[i++] = 1;
00581         for (; i < POLY1305_BLOCK_SIZE; i++)
00582             ctx->buffer[i] = 0;
00583         ctx->finished = 1;
00584         poly1305_block(ctx, ctx->buffer);
00585     }
00586 
00587     /* fully carry h */
00588     h0 = ctx->h[0];
00589     h1 = ctx->h[1];
00590     h2 = ctx->h[2];
00591     h3 = ctx->h[3];
00592     h4 = ctx->h[4];
00593 
00594                  c = h1 >> 26; h1 = h1 & 0x3ffffff;
00595     h2 +=     c; c = h2 >> 26; h2 = h2 & 0x3ffffff;
00596     h3 +=     c; c = h3 >> 26; h3 = h3 & 0x3ffffff;
00597     h4 +=     c; c = h4 >> 26; h4 = h4 & 0x3ffffff;
00598     h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff;
00599     h1 +=     c;
00600 
00601     /* compute h + -p */
00602     g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff;
00603     g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff;
00604     g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff;
00605     g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff;
00606     g4 = h4 + c - ((word32)1 << 26);
00607 
00608     /* select h if h < p, or h + -p if h >= p */
00609     mask = ((word32)g4 >> ((sizeof(word32) * 8) - 1)) - 1;
00610     g0 &= mask;
00611     g1 &= mask;
00612     g2 &= mask;
00613     g3 &= mask;
00614     g4 &= mask;
00615     mask = ~mask;
00616     h0 = (h0 & mask) | g0;
00617     h1 = (h1 & mask) | g1;
00618     h2 = (h2 & mask) | g2;
00619     h3 = (h3 & mask) | g3;
00620     h4 = (h4 & mask) | g4;
00621 
00622     /* h = h % (2^128) */
00623     h0 = ((h0      ) | (h1 << 26)) & 0xffffffff;
00624     h1 = ((h1 >>  6) | (h2 << 20)) & 0xffffffff;
00625     h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff;
00626     h3 = ((h3 >> 18) | (h4 <<  8)) & 0xffffffff;
00627 
00628     /* mac = (h + pad) % (2^128) */
00629     f = (word64)h0 + ctx->pad[0]            ; h0 = (word32)f;
00630     f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f;
00631     f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f;
00632     f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f;
00633 
00634     U32TO8(mac + 0, h0);
00635     U32TO8(mac + 4, h1);
00636     U32TO8(mac + 8, h2);
00637     U32TO8(mac + 12, h3);
00638 
00639     /* zero out the state */
00640     ctx->h[0] = 0;
00641     ctx->h[1] = 0;
00642     ctx->h[2] = 0;
00643     ctx->h[3] = 0;
00644     ctx->h[4] = 0;
00645     ctx->r[0] = 0;
00646     ctx->r[1] = 0;
00647     ctx->r[2] = 0;
00648     ctx->r[3] = 0;
00649     ctx->r[4] = 0;
00650     ctx->pad[0] = 0;
00651     ctx->pad[1] = 0;
00652     ctx->pad[2] = 0;
00653     ctx->pad[3] = 0;
00654 
00655 #endif
00656 
00657     return 0;
00658 }
00659 #endif /* !defined(WOLFSSL_ARMASM) || !defined(__aarch64__) */
00660 
00661 
00662 int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
00663 {
00664     size_t i;
00665 
00666 #ifdef CHACHA_AEAD_TEST
00667     word32 k;
00668     printf("Raw input to poly:\n");
00669     for (k = 0; k < bytes; k++) {
00670         printf("%02x", m[k]);
00671         if ((k+1) % 16 == 0)
00672             printf("\n");
00673     }
00674     printf("\n");
00675 #endif
00676 
00677     if (ctx == NULL)
00678         return BAD_FUNC_ARG;
00679 
00680 #ifdef USE_INTEL_SPEEDUP
00681     #ifdef HAVE_INTEL_AVX2
00682     if (IS_INTEL_AVX2(intel_flags)) {
00683         /* handle leftover */
00684         if (ctx->leftover) {
00685             size_t want = sizeof(ctx->buffer) - ctx->leftover;
00686             if (want > bytes)
00687                 want = bytes;
00688 
00689             for (i = 0; i < want; i++)
00690                 ctx->buffer[ctx->leftover + i] = m[i];
00691             bytes -= (word32)want;
00692             m += want;
00693             ctx->leftover += want;
00694             if (ctx->leftover < sizeof(ctx->buffer))
00695                 return 0;
00696 
00697             if (!ctx->started)
00698                 poly1305_calc_powers_avx2(ctx);
00699             poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer));
00700             ctx->leftover = 0;
00701         }
00702 
00703         /* process full blocks */
00704         if (bytes >= sizeof(ctx->buffer)) {
00705             size_t want = bytes & ~(sizeof(ctx->buffer) - 1);
00706 
00707             if (!ctx->started)
00708                 poly1305_calc_powers_avx2(ctx);
00709             poly1305_blocks_avx2(ctx, m, want);
00710             m += want;
00711             bytes -= (word32)want;
00712         }
00713 
00714         /* store leftover */
00715         if (bytes) {
00716             for (i = 0; i < bytes; i++)
00717                 ctx->buffer[ctx->leftover + i] = m[i];
00718             ctx->leftover += bytes;
00719         }
00720     }
00721     else
00722     #endif
00723 #endif
00724     {
00725         /* handle leftover */
00726         if (ctx->leftover) {
00727             size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover);
00728             if (want > bytes)
00729                 want = bytes;
00730             for (i = 0; i < want; i++)
00731                 ctx->buffer[ctx->leftover + i] = m[i];
00732             bytes -= (word32)want;
00733             m += want;
00734             ctx->leftover += want;
00735             if (ctx->leftover < POLY1305_BLOCK_SIZE)
00736                 return 0;
00737             poly1305_block(ctx, ctx->buffer);
00738             ctx->leftover = 0;
00739         }
00740 
00741         /* process full blocks */
00742         if (bytes >= POLY1305_BLOCK_SIZE) {
00743             size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1));
00744             poly1305_blocks(ctx, m, want);
00745             m += want;
00746             bytes -= (word32)want;
00747         }
00748 
00749         /* store leftover */
00750         if (bytes) {
00751             for (i = 0; i < bytes; i++)
00752                 ctx->buffer[ctx->leftover + i] = m[i];
00753             ctx->leftover += bytes;
00754         }
00755     }
00756 
00757     return 0;
00758 }
00759 
00760 /*  Takes a Poly1305 struct that has a key loaded and pads the provided length
00761     ctx        : Initialized Poly1305 struct to use
00762     lenToPad   : Current number of bytes updated that needs padding to 16
00763  */
00764 int wc_Poly1305_Pad(Poly1305* ctx, word32 lenToPad)
00765 {
00766     int ret = 0;
00767     word32 paddingLen;
00768     byte padding[WC_POLY1305_PAD_SZ - 1];
00769 
00770     if (ctx == NULL) {
00771         return BAD_FUNC_ARG;
00772     }
00773     if (lenToPad == 0) {
00774         return 0; /* nothing needs to be done */
00775     }
00776 
00777     XMEMSET(padding, 0, sizeof(padding));
00778 
00779     /* Pad length to 16 bytes */
00780     paddingLen = -(int)lenToPad & (WC_POLY1305_PAD_SZ - 1);
00781     if (paddingLen > 0) {
00782         ret = wc_Poly1305Update(ctx, padding, paddingLen);
00783     }
00784     return ret;
00785 }
00786 
00787 /*  Takes a Poly1305 struct that has a key loaded and adds the AEAD length
00788     encoding in 64-bit little endian
00789     aadSz      : Size of the additional authentication data
00790     dataSz     : Size of the plaintext or ciphertext
00791  */
00792 int wc_Poly1305_EncodeSizes(Poly1305* ctx, word32 aadSz, word32 dataSz)
00793 {
00794     int ret;
00795     byte little64[16]; /* sizeof(word64) * 2 */
00796 
00797     if (ctx == NULL) {
00798         return BAD_FUNC_ARG;
00799     }
00800 
00801     XMEMSET(little64, 0, sizeof(little64));
00802 
00803     /* size of additional data and input data as little endian 64 bit types */
00804     u32tole64(aadSz,  little64);
00805     u32tole64(dataSz, little64 + 8);
00806     ret = wc_Poly1305Update(ctx, little64, sizeof(little64));
00807 
00808     return ret;
00809 }
00810 
00811 /*  Takes in an initialized Poly1305 struct that has a key loaded and creates
00812     a MAC (tag) using recent TLS AEAD padding scheme.
00813     ctx        : Initialized Poly1305 struct to use
00814     additional : Additional data to use
00815     addSz      : Size of additional buffer
00816     input      : Input buffer to create tag from
00817     sz         : Size of input buffer
00818     tag        : Buffer to hold created tag
00819     tagSz      : Size of input tag buffer (must be at least
00820                  WC_POLY1305_MAC_SZ(16))
00821  */
00822 int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz,
00823                     byte* input, word32 sz, byte* tag, word32 tagSz)
00824 {
00825     int ret;
00826 
00827     /* sanity check on arguments */
00828     if (ctx == NULL || input == NULL || tag == NULL ||
00829                                                    tagSz < WC_POLY1305_MAC_SZ) {
00830         return BAD_FUNC_ARG;
00831     }
00832 
00833     /* additional allowed to be 0 */
00834     if (addSz > 0) {
00835         if (additional == NULL)
00836             return BAD_FUNC_ARG;
00837 
00838         /* additional data plus padding */
00839         if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) {
00840             return ret;
00841         }
00842         /* pad additional data */
00843         if ((ret = wc_Poly1305_Pad(ctx, addSz)) != 0) {
00844             return ret;
00845         }
00846     }
00847 
00848     /* input plus padding */
00849     if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) {
00850         return ret;
00851     }
00852     /* pad input data */
00853     if ((ret = wc_Poly1305_Pad(ctx, sz)) != 0) {
00854         return ret;
00855     }
00856 
00857     /* encode size of AAD and input data as little endian 64 bit types */
00858     if ((ret = wc_Poly1305_EncodeSizes(ctx, addSz, sz)) != 0) {
00859         return ret;
00860     }
00861 
00862     /* Finalize the auth tag */
00863     ret = wc_Poly1305Final(ctx, tag);
00864 
00865     return ret;
00866 
00867 }
00868 #endif /* HAVE_POLY1305 */
00869