Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
wolfcrypt/src/poly1305.c@15:117db924cf7c, 2018-08-18 (annotated)
- Committer:
- wolfSSL
- Date:
- Sat Aug 18 22:20:43 2018 +0000
- Revision:
- 15:117db924cf7c
wolfSSL 3.15.3
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
wolfSSL | 15:117db924cf7c | 1 | /* poly1305.c |
wolfSSL | 15:117db924cf7c | 2 | * |
wolfSSL | 15:117db924cf7c | 3 | * Copyright (C) 2006-2017 wolfSSL Inc. |
wolfSSL | 15:117db924cf7c | 4 | * |
wolfSSL | 15:117db924cf7c | 5 | * This file is part of wolfSSL. |
wolfSSL | 15:117db924cf7c | 6 | * |
wolfSSL | 15:117db924cf7c | 7 | * wolfSSL is free software; you can redistribute it and/or modify |
wolfSSL | 15:117db924cf7c | 8 | * it under the terms of the GNU General Public License as published by |
wolfSSL | 15:117db924cf7c | 9 | * the Free Software Foundation; either version 2 of the License, or |
wolfSSL | 15:117db924cf7c | 10 | * (at your option) any later version. |
wolfSSL | 15:117db924cf7c | 11 | * |
wolfSSL | 15:117db924cf7c | 12 | * wolfSSL is distributed in the hope that it will be useful, |
wolfSSL | 15:117db924cf7c | 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
wolfSSL | 15:117db924cf7c | 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
wolfSSL | 15:117db924cf7c | 15 | * GNU General Public License for more details. |
wolfSSL | 15:117db924cf7c | 16 | * |
wolfSSL | 15:117db924cf7c | 17 | * You should have received a copy of the GNU General Public License |
wolfSSL | 15:117db924cf7c | 18 | * along with this program; if not, write to the Free Software |
wolfSSL | 15:117db924cf7c | 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
wolfSSL | 15:117db924cf7c | 20 | */ |
wolfSSL | 15:117db924cf7c | 21 | |
wolfSSL | 15:117db924cf7c | 22 | /* |
wolfSSL | 15:117db924cf7c | 23 | * Based off the public domain implementations by Andrew Moon |
wolfSSL | 15:117db924cf7c | 24 | * and Daniel J. Bernstein |
wolfSSL | 15:117db924cf7c | 25 | */ |
wolfSSL | 15:117db924cf7c | 26 | |
wolfSSL | 15:117db924cf7c | 27 | #ifdef HAVE_CONFIG_H |
wolfSSL | 15:117db924cf7c | 28 | #include <config.h> |
wolfSSL | 15:117db924cf7c | 29 | #endif |
wolfSSL | 15:117db924cf7c | 30 | |
wolfSSL | 15:117db924cf7c | 31 | #include <wolfssl/wolfcrypt/settings.h> |
wolfSSL | 15:117db924cf7c | 32 | |
wolfSSL | 15:117db924cf7c | 33 | #ifdef HAVE_POLY1305 |
wolfSSL | 15:117db924cf7c | 34 | #include <wolfssl/wolfcrypt/poly1305.h> |
wolfSSL | 15:117db924cf7c | 35 | #include <wolfssl/wolfcrypt/error-crypt.h> |
wolfSSL | 15:117db924cf7c | 36 | #include <wolfssl/wolfcrypt/logging.h> |
wolfSSL | 15:117db924cf7c | 37 | #include <wolfssl/wolfcrypt/cpuid.h> |
wolfSSL | 15:117db924cf7c | 38 | #ifdef NO_INLINE |
wolfSSL | 15:117db924cf7c | 39 | #include <wolfssl/wolfcrypt/misc.h> |
wolfSSL | 15:117db924cf7c | 40 | #else |
wolfSSL | 15:117db924cf7c | 41 | #define WOLFSSL_MISC_INCLUDED |
wolfSSL | 15:117db924cf7c | 42 | #include <wolfcrypt/src/misc.c> |
wolfSSL | 15:117db924cf7c | 43 | #endif |
wolfSSL | 15:117db924cf7c | 44 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 45 | #include <stdio.h> |
wolfSSL | 15:117db924cf7c | 46 | #endif |
wolfSSL | 15:117db924cf7c | 47 | |
wolfSSL | 15:117db924cf7c | 48 | #ifdef _MSC_VER |
wolfSSL | 15:117db924cf7c | 49 | /* 4127 warning constant while(1) */ |
wolfSSL | 15:117db924cf7c | 50 | #pragma warning(disable: 4127) |
wolfSSL | 15:117db924cf7c | 51 | #endif |
wolfSSL | 15:117db924cf7c | 52 | |
wolfSSL | 15:117db924cf7c | 53 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 54 | #include <emmintrin.h> |
wolfSSL | 15:117db924cf7c | 55 | #include <immintrin.h> |
wolfSSL | 15:117db924cf7c | 56 | |
wolfSSL | 15:117db924cf7c | 57 | #if defined(__GNUC__) && ((__GNUC__ < 4) || \ |
wolfSSL | 15:117db924cf7c | 58 | (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) |
wolfSSL | 15:117db924cf7c | 59 | #define NO_AVX2_SUPPORT |
wolfSSL | 15:117db924cf7c | 60 | #endif |
wolfSSL | 15:117db924cf7c | 61 | #if defined(__clang__) && ((__clang_major__ < 3) || \ |
wolfSSL | 15:117db924cf7c | 62 | (__clang_major__ == 3 && __clang_minor__ <= 5)) |
wolfSSL | 15:117db924cf7c | 63 | #define NO_AVX2_SUPPORT |
wolfSSL | 15:117db924cf7c | 64 | #elif defined(__clang__) && defined(NO_AVX2_SUPPORT) |
wolfSSL | 15:117db924cf7c | 65 | #undef NO_AVX2_SUPPORT |
wolfSSL | 15:117db924cf7c | 66 | #endif |
wolfSSL | 15:117db924cf7c | 67 | |
wolfSSL | 15:117db924cf7c | 68 | #define HAVE_INTEL_AVX1 |
wolfSSL | 15:117db924cf7c | 69 | #ifndef NO_AVX2_SUPPORT |
wolfSSL | 15:117db924cf7c | 70 | #define HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 71 | #endif |
wolfSSL | 15:117db924cf7c | 72 | #endif |
wolfSSL | 15:117db924cf7c | 73 | |
wolfSSL | 15:117db924cf7c | 74 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 75 | static word32 intel_flags = 0; |
wolfSSL | 15:117db924cf7c | 76 | static word32 cpu_flags_set = 0; |
wolfSSL | 15:117db924cf7c | 77 | #endif |
wolfSSL | 15:117db924cf7c | 78 | |
wolfSSL | 15:117db924cf7c | 79 | #if defined(USE_INTEL_SPEEDUP) || defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 80 | #if defined(_MSC_VER) |
wolfSSL | 15:117db924cf7c | 81 | #define POLY1305_NOINLINE __declspec(noinline) |
wolfSSL | 15:117db924cf7c | 82 | #elif defined(__GNUC__) |
wolfSSL | 15:117db924cf7c | 83 | #define POLY1305_NOINLINE __attribute__((noinline)) |
wolfSSL | 15:117db924cf7c | 84 | #else |
wolfSSL | 15:117db924cf7c | 85 | #define POLY1305_NOINLINE |
wolfSSL | 15:117db924cf7c | 86 | #endif |
wolfSSL | 15:117db924cf7c | 87 | /* 128-bit arithmetic helpers: word128 plus MUL/ADD/ADDLO/SHR/LO, defined per compiler. */ |
wolfSSL | 15:117db924cf7c | 88 | #if defined(_MSC_VER) |
wolfSSL | 15:117db924cf7c | 89 | #include <intrin.h> |
wolfSSL | 15:117db924cf7c | 90 | |
wolfSSL | 15:117db924cf7c | 91 | typedef struct word128 { |
wolfSSL | 15:117db924cf7c | 92 | word64 lo; |
wolfSSL | 15:117db924cf7c | 93 | word64 hi; |
wolfSSL | 15:117db924cf7c | 94 | } word128; |
wolfSSL | 15:117db924cf7c | 95 | /* MSVC: word128 is a lo/hi pair; MUL uses _umul128, ADD/ADDLO carry by hand, SHR uses __shiftright128. */ |
wolfSSL | 15:117db924cf7c | 96 | #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi) |
wolfSSL | 15:117db924cf7c | 97 | #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \ |
wolfSSL | 15:117db924cf7c | 98 | out.hi += (out.lo < t) + in.hi; } |
wolfSSL | 15:117db924cf7c | 99 | #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \ |
wolfSSL | 15:117db924cf7c | 100 | out.hi += (out.lo < t); } |
wolfSSL | 15:117db924cf7c | 101 | #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift))) |
wolfSSL | 15:117db924cf7c | 102 | #define LO(in) (in.lo) |
wolfSSL | 15:117db924cf7c | 103 | |
wolfSSL | 15:117db924cf7c | 104 | #elif defined(__GNUC__) |
wolfSSL | 15:117db924cf7c | 105 | #if defined(__SIZEOF_INT128__) |
wolfSSL | 15:117db924cf7c | 106 | typedef unsigned __int128 word128; |
wolfSSL | 15:117db924cf7c | 107 | #else |
wolfSSL | 15:117db924cf7c | 108 | typedef unsigned word128 __attribute__((mode(TI))); |
wolfSSL | 15:117db924cf7c | 109 | #endif |
wolfSSL | 15:117db924cf7c | 110 | /* GCC: word128 is a native 128-bit unsigned integer. NOTE(review): MUL does not parenthesize x and y. */ |
wolfSSL | 15:117db924cf7c | 111 | #define MUL(out, x, y) out = ((word128)x * y) |
wolfSSL | 15:117db924cf7c | 112 | #define ADD(out, in) out += in |
wolfSSL | 15:117db924cf7c | 113 | #define ADDLO(out, in) out += in |
wolfSSL | 15:117db924cf7c | 114 | #define SHR(in, shift) (word64)(in >> (shift)) |
wolfSSL | 15:117db924cf7c | 115 | #define LO(in) (word64)(in) |
wolfSSL | 15:117db924cf7c | 116 | #endif |
wolfSSL | 15:117db924cf7c | 117 | #endif |
wolfSSL | 15:117db924cf7c | 118 | |
wolfSSL | 15:117db924cf7c | 119 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 120 | #ifdef HAVE_INTEL_AVX1 |
wolfSSL | 15:117db924cf7c | 121 | /* Process one block (16 bytes) of data: add it to h, multiply by r and reduce. |
wolfSSL | 15:117db924cf7c | 122 | * The byte ctx->finished (operand [nfin]) is added with carry into the top word of h. |
wolfSSL | 15:117db924cf7c | 123 | * ctx Poly1305 context; r is read at offsets 0/8, h is read and written at offsets 24/32/40. |
wolfSSL | 15:117db924cf7c | 124 | * m One block of message data; 16 bytes are read. |
wolfSSL | 15:117db924cf7c | 125 | */ |
wolfSSL | 15:117db924cf7c | 126 | static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m) |
wolfSSL | 15:117db924cf7c | 127 | { |
wolfSSL | 15:117db924cf7c | 128 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 129 | "movq (%[ctx]), %%r15\n\t" |
wolfSSL | 15:117db924cf7c | 130 | "movq 24(%[ctx]), %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 131 | "movq 32(%[ctx]), %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 132 | "movq 40(%[ctx]), %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 133 | "xorq %%rbx, %%rbx\n\t" |
wolfSSL | 15:117db924cf7c | 134 | "movb %[nfin], %%bl\n\t" |
wolfSSL | 15:117db924cf7c | 135 | "# h += m\n\t" |
wolfSSL | 15:117db924cf7c | 136 | "movq (%[m]), %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 137 | "movq 8(%[m]), %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 138 | "addq %%r11, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 139 | "adcq %%r12, %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 140 | "movq 8(%[ctx]), %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 141 | "adcq %%rbx, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 142 | "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t" |
wolfSSL | 15:117db924cf7c | 143 | "mulq %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 144 | "movq %%rax, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 145 | "movq %%rdx, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 146 | "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t" |
wolfSSL | 15:117db924cf7c | 147 | "movq %%r15, %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 148 | "mulq %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 149 | "addq %%rax, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 150 | "movq %%r15, %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 151 | "adcq %%rdx, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 152 | "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t" |
wolfSSL | 15:117db924cf7c | 153 | "mulq %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 154 | "movq %%rax, %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 155 | "movq %%rdx, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 156 | "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t" |
wolfSSL | 15:117db924cf7c | 157 | "movq 8(%[ctx]), %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 158 | "mulq %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 159 | "# r[0] * h[2] +> t2\n\t" |
wolfSSL | 15:117db924cf7c | 160 | "addq 352(%[ctx],%%r10,8), %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 161 | "movq %%rdx, %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 162 | "addq %%r8, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 163 | "adcq %%rax, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 164 | "# r[1] * h[2] +> t3\n\t" |
wolfSSL | 15:117db924cf7c | 165 | "adcq 408(%[ctx],%%r10,8), %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 166 | "# r * h in r14, r13, r12, r11 \n\t" |
wolfSSL | 15:117db924cf7c | 167 | "# h = (r * h) mod 2^130 - 5\n\t" |
wolfSSL | 15:117db924cf7c | 168 | "movq %%r13, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 169 | "andq $-4, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 170 | "andq $3, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 171 | "addq %%r13, %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 172 | "movq %%r13, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 173 | "adcq %%r14, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 174 | "adcq $0, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 175 | "shrdq $2, %%r14, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 176 | "shrq $2, %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 177 | "addq %%r11, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 178 | "adcq %%r14, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 179 | "movq %%r12, %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 180 | "adcq $0, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 181 | "# h in r10, r9, r8 \n\t" |
wolfSSL | 15:117db924cf7c | 182 | "# Store h to ctx\n\t" |
wolfSSL | 15:117db924cf7c | 183 | "movq %%r8, 24(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 184 | "movq %%r9, 32(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 185 | "movq %%r10, 40(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 186 | : |
wolfSSL | 15:117db924cf7c | 187 | : [m] "r" (m), [ctx] "r" (ctx), [nfin] "m" (ctx->finished) |
wolfSSL | 15:117db924cf7c | 188 | : "rax", "rdx", "r11", "r12", "r13", "r14", "r15", "rbx", |
wolfSSL | 15:117db924cf7c | 189 | "r8", "r9", "r10", "memory" |
wolfSSL | 15:117db924cf7c | 190 | ); |
wolfSSL | 15:117db924cf7c | 191 | } |
wolfSSL | 15:117db924cf7c | 192 | |
wolfSSL | 15:117db924cf7c | 193 | /* Process multiple blocks (n * 16 bytes) of data, keeping h in registers between blocks. |
wolfSSL | 15:117db924cf7c | 194 | * Table offsets 360/416 are one entry past the 352/408 used by poly1305_block_avx. |
wolfSSL | 15:117db924cf7c | 195 | * ctx Poly1305 context; h is loaded from offsets 24/32/40 and stored back once after the loop. |
wolfSSL | 15:117db924cf7c | 196 | * m Blocks of message data; advanced by 16 per iteration. |
wolfSSL | 15:117db924cf7c | 197 | * bytes The number of bytes to process; the first block is read before it is checked (signed jge). |
wolfSSL | 15:117db924cf7c | 198 | */ |
wolfSSL | 15:117db924cf7c | 199 | POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx, |
wolfSSL | 15:117db924cf7c | 200 | const unsigned char* m, size_t bytes) |
wolfSSL | 15:117db924cf7c | 201 | { |
wolfSSL | 15:117db924cf7c | 202 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 203 | "movq (%[ctx]), %%r15\n\t" |
wolfSSL | 15:117db924cf7c | 204 | "movq 24(%[ctx]), %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 205 | "movq 32(%[ctx]), %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 206 | "movq 40(%[ctx]), %%r10\n" |
wolfSSL | 15:117db924cf7c | 207 | "L_avx_start:\n\t" |
wolfSSL | 15:117db924cf7c | 208 | "# h += m\n\t" |
wolfSSL | 15:117db924cf7c | 209 | "movq (%[m]), %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 210 | "movq 8(%[m]), %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 211 | "addq %%r11, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 212 | "adcq %%r12, %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 213 | "movq 8(%[ctx]), %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 214 | "adcq $0, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 215 | "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t" |
wolfSSL | 15:117db924cf7c | 216 | "mulq %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 217 | "movq %%rax, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 218 | "movq %%rdx, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 219 | "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t" |
wolfSSL | 15:117db924cf7c | 220 | "movq %%r15, %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 221 | "mulq %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 222 | "addq %%rax, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 223 | "movq %%r15, %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 224 | "adcq %%rdx, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 225 | "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t" |
wolfSSL | 15:117db924cf7c | 226 | "mulq %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 227 | "movq %%rax, %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 228 | "movq %%rdx, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 229 | "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t" |
wolfSSL | 15:117db924cf7c | 230 | "movq 8(%[ctx]), %%rax\n\t" |
wolfSSL | 15:117db924cf7c | 231 | "mulq %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 232 | "# r[0] * h[2] +> t2\n\t" |
wolfSSL | 15:117db924cf7c | 233 | "addq 360(%[ctx],%%r10,8), %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 234 | "movq %%rdx, %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 235 | "addq %%r8, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 236 | "adcq %%rax, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 237 | "# r[1] * h[2] +> t3\n\t" |
wolfSSL | 15:117db924cf7c | 238 | "adcq 416(%[ctx],%%r10,8), %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 239 | "# r * h in r14, r13, r12, r11 \n\t" |
wolfSSL | 15:117db924cf7c | 240 | "# h = (r * h) mod 2^130 - 5\n\t" |
wolfSSL | 15:117db924cf7c | 241 | "movq %%r13, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 242 | "andq $-4, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 243 | "andq $3, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 244 | "addq %%r13, %%r11\n\t" |
wolfSSL | 15:117db924cf7c | 245 | "movq %%r13, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 246 | "adcq %%r14, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 247 | "adcq $0, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 248 | "shrdq $2, %%r14, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 249 | "shrq $2, %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 250 | "addq %%r11, %%r8\n\t" |
wolfSSL | 15:117db924cf7c | 251 | "adcq %%r14, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 252 | "movq %%r12, %%r9\n\t" |
wolfSSL | 15:117db924cf7c | 253 | "adcq $0, %%r10\n\t" |
wolfSSL | 15:117db924cf7c | 254 | "# h in r10, r9, r8 \n\t" |
wolfSSL | 15:117db924cf7c | 255 | "# Next block from message\n\t" |
wolfSSL | 15:117db924cf7c | 256 | "addq $16, %[m]\n\t" |
wolfSSL | 15:117db924cf7c | 257 | "subq $16, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 258 | "cmp $16, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 259 | "jge L_avx_start\n\t" |
wolfSSL | 15:117db924cf7c | 260 | "# Store h to ctx\n\t" |
wolfSSL | 15:117db924cf7c | 261 | "movq %%r8, 24(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 262 | "movq %%r9, 32(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 263 | "movq %%r10, 40(%[ctx])\n\t" |
wolfSSL | 15:117db924cf7c | 264 | : [m] "+r" (m), [bytes] "+r" (bytes) |
wolfSSL | 15:117db924cf7c | 265 | : [ctx] "r" (ctx) |
wolfSSL | 15:117db924cf7c | 266 | : "rax", "rdx", "r11", "r12", "r13", "r14", "r15", |
wolfSSL | 15:117db924cf7c | 267 | "r8", "r9", "r10", "memory" |
wolfSSL | 15:117db924cf7c | 268 | ); |
wolfSSL | 15:117db924cf7c | 269 | } |
wolfSSL | 15:117db924cf7c | 270 | |
wolfSSL | 15:117db924cf7c | 271 | /* Set the key to use when processing data. |
wolfSSL | 15:117db924cf7c | 272 | * Initialize the context: masked r, multiples table hm, zeroed h, pad, leftover and finished. |
wolfSSL | 15:117db924cf7c | 273 | * |
wolfSSL | 15:117db924cf7c | 274 | * ctx Poly1305 context. |
wolfSSL | 15:117db924cf7c | 275 | * key The key data (32 bytes: bytes 0-15 are masked into r, bytes 16-31 are saved as pad). |
wolfSSL | 15:117db924cf7c | 276 | */ |
wolfSSL | 15:117db924cf7c | 277 | static void poly1305_setkey_avx(Poly1305* ctx, const byte* key) |
wolfSSL | 15:117db924cf7c | 278 | { |
wolfSSL | 15:117db924cf7c | 279 | int i; |
wolfSSL | 15:117db924cf7c | 280 | /* NOTE(review): key is read through word64 pointer casts - assumes unaligned 64-bit loads are acceptable here. */ |
wolfSSL | 15:117db924cf7c | 281 | ctx->r[0] = *(word64*)(key + 0) & 0x0ffffffc0fffffffL; |
wolfSSL | 15:117db924cf7c | 282 | ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL; |
wolfSSL | 15:117db924cf7c | 283 | /* hm[i] = r[0] * i and hm[i + 7] = r[1] * i, for i = 0..6 */ |
wolfSSL | 15:117db924cf7c | 284 | for (i=0; i<7; i++) { |
wolfSSL | 15:117db924cf7c | 285 | ctx->hm[i + 0] = ctx->r[0] * i; |
wolfSSL | 15:117db924cf7c | 286 | ctx->hm[i + 7] = ctx->r[1] * i; |
wolfSSL | 15:117db924cf7c | 287 | } |
wolfSSL | 15:117db924cf7c | 288 | |
wolfSSL | 15:117db924cf7c | 289 | /* h (accumulator) = 0 */ |
wolfSSL | 15:117db924cf7c | 290 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 291 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 292 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 293 | |
wolfSSL | 15:117db924cf7c | 294 | /* save pad for later */ |
wolfSSL | 15:117db924cf7c | 295 | ctx->pad[0] = *(word64*)(key + 16); |
wolfSSL | 15:117db924cf7c | 296 | ctx->pad[1] = *(word64*)(key + 24); |
wolfSSL | 15:117db924cf7c | 297 | /* finished is read as [nfin] by poly1305_block_avx and set to 0 in poly1305_final_avx */ |
wolfSSL | 15:117db924cf7c | 298 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 299 | ctx->finished = 1; |
wolfSSL | 15:117db924cf7c | 300 | } |
wolfSSL | 15:117db924cf7c | 301 | |
wolfSSL | 15:117db924cf7c | 302 | /* Calculate the final result - authentication data. |
wolfSSL | 15:117db924cf7c | 303 | * Zeros out the private data (h, r and pad) in the context. |
wolfSSL | 15:117db924cf7c | 304 | * Any leftover bytes are padded with a 1 byte then zeros and processed with finished set to 0. |
wolfSSL | 15:117db924cf7c | 305 | * ctx Poly1305 context. |
wolfSSL | 15:117db924cf7c | 306 | * mac Buffer to hold 16 bytes; written with two 64-bit stores. |
wolfSSL | 15:117db924cf7c | 307 | */ |
wolfSSL | 15:117db924cf7c | 308 | static void poly1305_final_avx(Poly1305* ctx, byte* mac) |
wolfSSL | 15:117db924cf7c | 309 | { |
wolfSSL | 15:117db924cf7c | 310 | word64 h0, h1, h2; |
wolfSSL | 15:117db924cf7c | 311 | |
wolfSSL | 15:117db924cf7c | 312 | /* process the remaining block */ |
wolfSSL | 15:117db924cf7c | 313 | if (ctx->leftover) { |
wolfSSL | 15:117db924cf7c | 314 | size_t i = ctx->leftover; |
wolfSSL | 15:117db924cf7c | 315 | ctx->buffer[i] = 1; |
wolfSSL | 15:117db924cf7c | 316 | for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++) |
wolfSSL | 15:117db924cf7c | 317 | ctx->buffer[i] = 0; |
wolfSSL | 15:117db924cf7c | 318 | ctx->finished = 0; |
wolfSSL | 15:117db924cf7c | 319 | poly1305_block_avx(ctx, ctx->buffer); |
wolfSSL | 15:117db924cf7c | 320 | } |
wolfSSL | 15:117db924cf7c | 321 | |
wolfSSL | 15:117db924cf7c | 322 | h0 = ctx->h[0]; |
wolfSSL | 15:117db924cf7c | 323 | h1 = ctx->h[1]; |
wolfSSL | 15:117db924cf7c | 324 | h2 = ctx->h[2]; |
wolfSSL | 15:117db924cf7c | 325 | /* NOTE(review): pad[0]/pad[1] are declared "+r" (read-write) although the asm only reads them. */ |
wolfSSL | 15:117db924cf7c | 326 | /* h %= p */ |
wolfSSL | 15:117db924cf7c | 327 | /* h = (h + pad) */ |
wolfSSL | 15:117db924cf7c | 328 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 329 | "# mod 2^130 - 5\n\t" |
wolfSSL | 15:117db924cf7c | 330 | "movq %[h2], %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 331 | "andq $0x3, %[h2]\n\t" |
wolfSSL | 15:117db924cf7c | 332 | "shrq $0x2, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 333 | "leaq (%%r13, %%r13, 4), %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 334 | "add %%r13, %[h0]\n\t" |
wolfSSL | 15:117db924cf7c | 335 | "adc $0, %[h1]\n\t" |
wolfSSL | 15:117db924cf7c | 336 | "adc $0, %[h2]\n\t" |
wolfSSL | 15:117db924cf7c | 337 | "# Fixup when between (1 << 130) - 1 and (1 << 130) - 5\n\t" |
wolfSSL | 15:117db924cf7c | 338 | "movq %[h0], %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 339 | "movq %[h1], %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 340 | "movq %[h2], %%r15\n\t" |
wolfSSL | 15:117db924cf7c | 341 | "addq $5, %%r13\n\t" |
wolfSSL | 15:117db924cf7c | 342 | "adcq $0, %%r14\n\t" |
wolfSSL | 15:117db924cf7c | 343 | "adcq $0, %%r15\n\t" |
wolfSSL | 15:117db924cf7c | 344 | "movq %%r15, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 345 | "andq $3, %%r15\n\t" |
wolfSSL | 15:117db924cf7c | 346 | "cmpq $4, %%r12\n\t" |
wolfSSL | 15:117db924cf7c | 347 | "cmove %%r13, %[h0]\n\t" |
wolfSSL | 15:117db924cf7c | 348 | "cmove %%r14, %[h1]\n\t" |
wolfSSL | 15:117db924cf7c | 349 | "cmove %%r15, %[h2]\n\t" |
wolfSSL | 15:117db924cf7c | 350 | "# h += pad\n\t" |
wolfSSL | 15:117db924cf7c | 351 | "add %[p0], %[h0]\n\t" |
wolfSSL | 15:117db924cf7c | 352 | "adc %[p1], %[h1]\n\t" |
wolfSSL | 15:117db924cf7c | 353 | "movq %[h0], (%[m])\n\t" |
wolfSSL | 15:117db924cf7c | 354 | "movq %[h1], 8(%[m])\n\t" |
wolfSSL | 15:117db924cf7c | 355 | : [h0] "+r" (h0), [h1] "+r" (h1), [h2] "+r" (h2), |
wolfSSL | 15:117db924cf7c | 356 | [p0] "+r" (ctx->pad[0]), [p1] "+r" (ctx->pad[1]) |
wolfSSL | 15:117db924cf7c | 357 | : [m] "r" (mac) |
wolfSSL | 15:117db924cf7c | 358 | : "memory", "r15", "r14", "r13", "r12" |
wolfSSL | 15:117db924cf7c | 359 | ); |
wolfSSL | 15:117db924cf7c | 360 | |
wolfSSL | 15:117db924cf7c | 361 | /* zero out the state */ |
wolfSSL | 15:117db924cf7c | 362 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 363 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 364 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 365 | ctx->r[0] = 0; |
wolfSSL | 15:117db924cf7c | 366 | ctx->r[1] = 0; |
wolfSSL | 15:117db924cf7c | 367 | ctx->pad[0] = 0; |
wolfSSL | 15:117db924cf7c | 368 | ctx->pad[1] = 0; |
wolfSSL | 15:117db924cf7c | 369 | } |
wolfSSL | 15:117db924cf7c | 370 | #endif |
wolfSSL | 15:117db924cf7c | 371 | |
wolfSSL | 15:117db924cf7c | 372 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 373 | #if defined(_MSC_VER) |
wolfSSL | 15:117db924cf7c | 374 | #define POLY1305_NOINLINE __declspec(noinline) |
wolfSSL | 15:117db924cf7c | 375 | #elif defined(__GNUC__) |
wolfSSL | 15:117db924cf7c | 376 | #define POLY1305_NOINLINE __attribute__((noinline)) |
wolfSSL | 15:117db924cf7c | 377 | #else |
wolfSSL | 15:117db924cf7c | 378 | #define POLY1305_NOINLINE |
wolfSSL | 15:117db924cf7c | 379 | #endif |
wolfSSL | 15:117db924cf7c | 380 | |
wolfSSL | 15:117db924cf7c | 381 | /* Load H into five 256-bit registers. |
wolfSSL | 15:117db924cf7c | 382 | * Emits five vmovdqu loads of 32 bytes each, from h, h+32, h+64, h+96 and h+128. |
wolfSSL | 15:117db924cf7c | 383 | * h is the memory location of the data - 26 of 32 bits. |
wolfSSL | 15:117db924cf7c | 384 | * h0-h4 the 4 H values with 26 bits stored in 64 for multiply. |
wolfSSL | 15:117db924cf7c | 385 | */ |
wolfSSL | 15:117db924cf7c | 386 | #define LOAD_H(h, h0, h1, h2, h3, h4) \ |
wolfSSL | 15:117db924cf7c | 387 | "vmovdqu ("#h"), "#h0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 388 | "vmovdqu 32("#h"), "#h1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 389 | "vmovdqu 64("#h"), "#h2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 390 | "vmovdqu 96("#h"), "#h3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 391 | "vmovdqu 128("#h"), "#h4"\n\t" |
wolfSSL | 15:117db924cf7c | 392 | |
wolfSSL | 15:117db924cf7c | 393 | /* Store H, five 256-bit registers, packed. |
wolfSSL | 15:117db924cf7c | 394 | * Emits five vmovdqu stores of 32 bytes each, to h, h+32, h+64, h+96 and h+128. |
wolfSSL | 15:117db924cf7c | 395 | * h is the memory location of the data - 26 bits in 32. |
wolfSSL | 15:117db924cf7c | 396 | * h0-h4 the 4 H values with 26 bits stored in 64. |
wolfSSL | 15:117db924cf7c | 397 | * x4 is the xmm register of h4 (accepted but not referenced by the macro body). |
wolfSSL | 15:117db924cf7c | 398 | */ |
wolfSSL | 15:117db924cf7c | 399 | #define STORE_H(h, h0, h1, h2, h3, h4, x4) \ |
wolfSSL | 15:117db924cf7c | 400 | "vmovdqu "#h0", ("#h")\n\t" \ |
wolfSSL | 15:117db924cf7c | 401 | "vmovdqu "#h1", 32("#h")\n\t" \ |
wolfSSL | 15:117db924cf7c | 402 | "vmovdqu "#h2", 64("#h")\n\t" \ |
wolfSSL | 15:117db924cf7c | 403 | "vmovdqu "#h3", 96("#h")\n\t" \ |
wolfSSL | 15:117db924cf7c | 404 | "vmovdqu "#h4", 128("#h")\n\t" |
wolfSSL | 15:117db924cf7c | 405 | |
wolfSSL | 15:117db924cf7c | 406 | /* Load four powers of r into position to be multiplied by the 4 H values. |
wolfSSL | 15:117db924cf7c | 407 | * Reads 32 bytes at each of ctx offsets 224, 256, 288 and 320, then rearranges with permutes and unpacks. |
wolfSSL | 15:117db924cf7c | 408 | * r0-r4 holds the loaded values with 26 bits stored in 64 for multiply. |
wolfSSL | 15:117db924cf7c | 409 | * t0-t3 are temporary registers. |
wolfSSL | 15:117db924cf7c | 410 | */ |
wolfSSL | 15:117db924cf7c | 411 | #define LOAD_Rx4(r0, r1, r2, r3, r4, \ |
wolfSSL | 15:117db924cf7c | 412 | t0, t1, t2, t3) \ |
wolfSSL | 15:117db924cf7c | 413 | "vmovdqu 224(%[ctx]), "#r3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 414 | "vmovdqu 256(%[ctx]), "#r2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 415 | "vmovdqu 288(%[ctx]), "#r1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 416 | "vmovdqu 320(%[ctx]), "#r0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 417 | "vpermq $0xd8, "#r0", "#r0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 418 | "vpermq $0xd8, "#r1", "#r1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 419 | "vpermq $0xd8, "#r2", "#r2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 420 | "vpermq $0xd8, "#r3", "#r3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 421 | "vpunpcklqdq "#r1", "#r0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 422 | "vpunpckhqdq "#r1", "#r0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 423 | "vpunpcklqdq "#r3", "#r2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 424 | "vpunpckhqdq "#r3", "#r2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 425 | "vperm2i128 $0x20, "#t2", "#t0", "#r0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 426 | "vperm2i128 $0x31, "#t2", "#t0", "#r2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 427 | "vperm2i128 $0x20, "#t3", "#t1", "#r4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 428 | "vpsrlq $32, "#r0", "#r1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 429 | "vpsrlq $32, "#r2", "#r3"\n\t" |
wolfSSL | 15:117db924cf7c | 430 | |
wolfSSL | 15:117db924cf7c | 431 | /* Load the r^4 value into position to be multiplied by all 4 H values. |
wolfSSL | 15:117db924cf7c | 432 | * |
wolfSSL | 15:117db924cf7c | 433 | * r4 holds r^4 as five 26 bits each in 32. |
wolfSSL | 15:117db924cf7c | 434 | * r40-r44 hold the loaded values with 26 bits stored in 64 for multiply. |
wolfSSL | 15:117db924cf7c | 435 | * t0-t1 are temporary registers. |
wolfSSL | 15:117db924cf7c | 436 | */ |
wolfSSL | 15:117db924cf7c | 437 | #define LOAD_R4(r4, r40, r41, r42, r43, r44, \ |
wolfSSL | 15:117db924cf7c | 438 | t0, t1) \ |
wolfSSL | 15:117db924cf7c | 439 | "vmovdqu "#r4", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 440 | "vpermq $0x0, "#t0", "#r40"\n\t" \ |
wolfSSL | 15:117db924cf7c | 441 | "vpsrlq $32, "#t0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 442 | "vpermq $0x55, "#t0", "#r42"\n\t" \ |
wolfSSL | 15:117db924cf7c | 443 | "vpermq $0xaa, "#t0", "#r44"\n\t" \ |
wolfSSL | 15:117db924cf7c | 444 | "vpermq $0x0, "#t1", "#r41"\n\t" \ |
wolfSSL | 15:117db924cf7c | 445 | "vpermq $0x55, "#t1", "#r43"\n\t" |
wolfSSL | 15:117db924cf7c | 446 | |
wolfSSL | 15:117db924cf7c | 447 | /* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in |
wolfSSL | 15:117db924cf7c | 448 | * multiply. Each product is formed as (r << 2) + r: a 32-bit lane shift, then a 64-bit lane add. |
wolfSSL | 15:117db924cf7c | 449 | * |
wolfSSL | 15:117db924cf7c | 450 | * s1-s4 are each 64 bit value in r1-r4 multiplied by 5. |
wolfSSL | 15:117db924cf7c | 451 | * r1-r4 are the top 4 values to be multiplied; they are read but not modified. |
wolfSSL | 15:117db924cf7c | 452 | */ |
wolfSSL | 15:117db924cf7c | 453 | #define MUL5(s1, s2, s3, s4, r1, r2, r3, r4) \ |
wolfSSL | 15:117db924cf7c | 454 | "vpslld $2, "#r1", "#s1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 455 | "vpslld $2, "#r2", "#s2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 456 | "vpslld $2, "#r3", "#s3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 457 | "vpslld $2, "#r4", "#s4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 458 | "vpaddq "#s1", "#r1", "#s1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 459 | "vpaddq "#s2", "#r2", "#s2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 460 | "vpaddq "#s3", "#r3", "#s3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 461 | "vpaddq "#s4", "#r4", "#s4"\n\t" |
wolfSSL | 15:117db924cf7c | 462 | |
wolfSSL | 15:117db924cf7c | 463 | /* Add the 4 H values together. |
wolfSSL | 15:117db924cf7c | 464 | * Each 64 bits in a register is 26 bits of one of the H values. |
wolfSSL | 15:117db924cf7c | 465 | * Two shift/permute-and-add steps accumulate the lanes of each register into its low 64 bits. |
wolfSSL | 15:117db924cf7c | 466 | * h0-h4 contains the 4 H values. |
wolfSSL | 15:117db924cf7c | 467 | * t0-t4 are temporary registers. |
wolfSSL | 15:117db924cf7c | 468 | */ |
wolfSSL | 15:117db924cf7c | 469 | #define FINALIZE_H(h0, h1, h2, h3, h4, \ |
wolfSSL | 15:117db924cf7c | 470 | t0, t1, t2, t3, t4) \ |
wolfSSL | 15:117db924cf7c | 471 | "vpsrldq $8, "#h0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 472 | "vpsrldq $8, "#h1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 473 | "vpsrldq $8, "#h2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 474 | "vpsrldq $8, "#h3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 475 | "vpsrldq $8, "#h4", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 476 | "vpaddq "#h0", "#t0", "#h0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 477 | "vpaddq "#h1", "#t1", "#h1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 478 | "vpaddq "#h2", "#t2", "#h2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 479 | "vpaddq "#h3", "#t3", "#h3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 480 | "vpaddq "#h4", "#t4", "#h4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 481 | "vpermq $0x02, "#h0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 482 | "vpermq $0x02, "#h1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 483 | "vpermq $0x02, "#h2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 484 | "vpermq $0x02, "#h3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 485 | "vpermq $0x02, "#h4", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 486 | "vpaddq "#h0", "#t0", "#h0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 487 | "vpaddq "#h1", "#t1", "#h1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 488 | "vpaddq "#h2", "#t2", "#h2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 489 | "vpaddq "#h3", "#t3", "#h3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 490 | "vpaddq "#h4", "#t4", "#h4"\n\t" |
wolfSSL | 15:117db924cf7c | 491 | |
wolfSSL | 15:117db924cf7c | 492 | /* Move the low 32 bits of each xmm register to a 32-bit register (one vmovd each). |
wolfSSL | 15:117db924cf7c | 493 | * |
wolfSSL | 15:117db924cf7c | 494 | * x0-x4 are the xmm version of the ymm registers used. |
wolfSSL | 15:117db924cf7c | 495 | * t0-t4 are the 32-bit registers to store data in. |
wolfSSL | 15:117db924cf7c | 496 | */ |
wolfSSL | 15:117db924cf7c | 497 | #define MOVE_TO_32(x0, x1, x2, x3, x4, \ |
wolfSSL | 15:117db924cf7c | 498 | t0, t1, t2, t3, t4) \ |
wolfSSL | 15:117db924cf7c | 499 | "vmovd "#x0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 500 | "vmovd "#x1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 501 | "vmovd "#x2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 502 | "vmovd "#x3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 503 | "vmovd "#x4", "#t4"\n\t" |
wolfSSL | 15:117db924cf7c | 504 | |
wolfSSL | 15:117db924cf7c | 505 | /* Multiply using AVX2 instructions. |
wolfSSL | 15:117db924cf7c | 506 | * Each register contains up to 32 bits of data in 64 bits. |
wolfSSL | 15:117db924cf7c | 507 | * This is a 4 way parallel multiply. |
wolfSSL | 15:117db924cf7c | 508 | * h0-h4 are only read. The unreduced product is left in t0-t4: t0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1, t1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2, t2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3, t3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s4, t4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0. |
wolfSSL | 15:117db924cf7c | 509 | * h0-h4 contain 4 H values with the 32 bits of each per register. |
wolfSSL | 15:117db924cf7c | 510 | * r0-r4 contain the 4 powers of r. |
wolfSSL | 15:117db924cf7c | 511 | * s1-s4 contain r1-r4 times 5. |
wolfSSL | 15:117db924cf7c | 512 | * t0-t4 receive the result and v0-v3 are temporary registers. |
wolfSSL | 15:117db924cf7c | 513 | */ |
wolfSSL | 15:117db924cf7c | 514 | #define MUL_AVX2(h0, h1, h2, h3, h4, \ |
wolfSSL | 15:117db924cf7c | 515 | r0, r1, r2, r3, r4, \ |
wolfSSL | 15:117db924cf7c | 516 | s1, s2, s3, s4, \ |
wolfSSL | 15:117db924cf7c | 517 | t0, t1, t2, t3, t4, \ |
wolfSSL | 15:117db924cf7c | 518 | v0, v1, v2, v3) \ |
wolfSSL | 15:117db924cf7c | 519 | "vpmuludq "#s1", "#h4", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 520 | "vpmuludq "#s2", "#h3", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 521 | "vpmuludq "#s2", "#h4", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 522 | "vpmuludq "#s3", "#h3", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 523 | "vpmuludq "#s3", "#h4", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 524 | "vpaddq "#t0", "#v0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 525 | "vpmuludq "#s3", "#h2", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 526 | "vpmuludq "#s4", "#h4", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 527 | "vpaddq "#t1", "#v1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 528 | "vpmuludq "#s4", "#h1", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 529 | "vpmuludq "#s4", "#h2", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 530 | "vpaddq "#t0", "#v2", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 531 | "vpmuludq "#s4", "#h3", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 532 | "vpmuludq "#r0", "#h3", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 533 | "vpaddq "#t0", "#v3", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 534 | "vpmuludq "#r0", "#h4", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 535 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 536 | "vpmuludq "#r0", "#h0", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 537 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 538 | "vpmuludq "#r0", "#h1", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 539 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 540 | "vpmuludq "#r0", "#h2", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 541 | "vpmuludq "#r1", "#h2", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 542 | "vpaddq "#t0", "#v3", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 543 | "vpmuludq "#r1", "#h3", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 544 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 545 | "vpmuludq "#r1", "#h0", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 546 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 547 | "vpmuludq "#r1", "#h1", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 548 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 549 | "vpmuludq "#r2", "#h1", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 550 | "vpaddq "#t4", "#v3", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 551 | "vpmuludq "#r2", "#h2", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 552 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 553 | "vpmuludq "#r2", "#h0", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 554 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 555 | "vpmuludq "#r3", "#h0", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 556 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 557 | "vpmuludq "#r3", "#h1", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 558 | "vpaddq "#t4", "#v3", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 559 | "vpmuludq "#r4", "#h0", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 560 | "vpaddq "#t2", "#v0", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 561 | "vpaddq "#t3", "#v1", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 562 | "vpaddq "#t4", "#v2", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 563 | "vpaddq "#t4", "#v3", "#t4"\n\t" |
wolfSSL | 15:117db924cf7c | 564 | |
wolfSSL | 15:117db924cf7c | 565 | /* Load the 4 blocks of the message. |
wolfSSL | 15:117db924cf7c | 566 | * Reads 64 bytes with two unaligned 32-byte loads and rearranges them so that m0-m3 each hold one 32-bit word of all 4 blocks, zero-extended to 64 bits. m1, m2 and m3 are then shifted left by 6, 12 and 18 bits and m4 is set to hi, so the values still have to go through REDUCE to become 26-bit limbs. |
wolfSSL | 15:117db924cf7c | 567 | * m the address of the message to load. Note: the body always uses the asm operand named [m]; this argument is not substituted into the text. |
wolfSSL | 15:117db924cf7c | 568 | * m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel. |
wolfSSL | 15:117db924cf7c | 569 | * hi is the high bits of the 4 m (1 << 128 as not final block). |
wolfSSL | 15:117db924cf7c | 570 | * z is zero. |
wolfSSL | 15:117db924cf7c | 571 | */ |
wolfSSL | 15:117db924cf7c | 572 | #define LOAD_M(m, m0, m1, m2, m3, m4, hi, z) \ |
wolfSSL | 15:117db924cf7c | 573 | "vmovdqu (%[m]), "#m0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 574 | "vmovdqu 32(%[m]), "#m1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 575 | "vperm2i128 $0x20, "#m1", "#m0", "#m2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 576 | "vperm2i128 $0x31, "#m1", "#m0", "#m0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 577 | "vpunpckldq "#m0", "#m2", "#m1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 578 | "vpunpckhdq "#m0", "#m2", "#m3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 579 | "vpunpckldq "#z", "#m1", "#m0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 580 | "vpunpckhdq "#z", "#m1", "#m1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 581 | "vpunpckldq "#z", "#m3", "#m2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 582 | "vpunpckhdq "#z", "#m3", "#m3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 583 | "vmovdqu "#hi", "#m4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 584 | "vpsllq $6, "#m1", "#m1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 585 | "vpsllq $12, "#m2", "#m2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 586 | "vpsllq $18, "#m3", "#m3"\n\t" |
wolfSSL | 15:117db924cf7c | 587 | |
wolfSSL | 15:117db924cf7c | 588 | |
wolfSSL | 15:117db924cf7c | 589 | /* Multiply using AVX2 instructions - adding with message. |
wolfSSL | 15:117db924cf7c | 590 | * Each register contains up to 32 bits of data in 64 bits. |
wolfSSL | 15:117db924cf7c | 591 | * This is a 4 way parallel multiply. |
wolfSSL | 15:117db924cf7c | 592 | * The message data is loaded first and the multiplication adds into it. |
wolfSSL | 15:117db924cf7c | 593 | * Loads 64 bytes from the asm operand named [m] into t0-t4 with the same instruction sequence as LOAD_M, then accumulates the same partial products as MUL_AVX2, so t0-t4 end up holding message + h * r (unreduced). h0-h4 are only read. |
wolfSSL | 15:117db924cf7c | 594 | * h0-h4 contain 4 H values with the 32 bits of each per register. |
wolfSSL | 15:117db924cf7c | 595 | * r0-r4 contain the 4 powers of r. |
wolfSSL | 15:117db924cf7c | 596 | * s1-s4 contain r1-r4 times 5. |
wolfSSL | 15:117db924cf7c | 597 | * t0-t4 receive the result and v0-v3 are temporary registers. |
wolfSSL | 15:117db924cf7c | 598 | * hi is the high bits of the 4 m (1 << 128 as not final block). |
wolfSSL | 15:117db924cf7c | 599 | * z is zero. |
wolfSSL | 15:117db924cf7c | 600 | */ |
wolfSSL | 15:117db924cf7c | 601 | #define MUL_ADD_AVX2(h0, h1, h2, h3, h4, \ |
wolfSSL | 15:117db924cf7c | 602 | r0, r1, r2, r3, r4, \ |
wolfSSL | 15:117db924cf7c | 603 | s1, s2, s3, s4, \ |
wolfSSL | 15:117db924cf7c | 604 | t0, t1, t2, t3, t4, \ |
wolfSSL | 15:117db924cf7c | 605 | v0, v1, v2, v3, \ |
wolfSSL | 15:117db924cf7c | 606 | hi, z) \ |
wolfSSL | 15:117db924cf7c | 607 | "vmovdqu (%[m]), "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 608 | "vmovdqu 32(%[m]), "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 609 | "vperm2i128 $0x20, "#t1", "#t0", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 610 | "vperm2i128 $0x31, "#t1", "#t0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 611 | "vpunpckldq "#t0", "#t2", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 612 | "vpunpckhdq "#t0", "#t2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 613 | "vpunpckldq "#z", "#t1", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 614 | "vpunpckhdq "#z", "#t1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 615 | "vpunpckldq "#z", "#t3", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 616 | "vpunpckhdq "#z", "#t3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 617 | "vmovdqu "#hi", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 618 | "vpsllq $6, "#t1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 619 | "vpsllq $12, "#t2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 620 | "vpsllq $18, "#t3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 621 | "vpmuludq "#s1", "#h4", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 622 | "vpaddq "#t0", "#v0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 623 | "vpmuludq "#s2", "#h3", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 624 | "vpmuludq "#s2", "#h4", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 625 | "vpaddq "#t1", "#v1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 626 | "vpmuludq "#s3", "#h3", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 627 | "vpmuludq "#s3", "#h4", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 628 | "vpaddq "#t2", "#v2", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 629 | "vpaddq "#t0", "#v0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 630 | "vpmuludq "#s3", "#h2", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 631 | "vpmuludq "#s4", "#h4", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 632 | "vpaddq "#t3", "#v3", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 633 | "vpaddq "#t1", "#v1", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 634 | "vpmuludq "#s4", "#h1", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 635 | "vpmuludq "#s4", "#h2", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 636 | "vpaddq "#t0", "#v2", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 637 | "vpmuludq "#s4", "#h3", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 638 | "vpmuludq "#r0", "#h3", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 639 | "vpaddq "#t0", "#v3", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 640 | "vpmuludq "#r0", "#h4", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 641 | "vpaddq "#t4", "#v3", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 642 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 643 | "vpmuludq "#r0", "#h0", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 644 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 645 | "vpmuludq "#r0", "#h1", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 646 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 647 | "vpmuludq "#r0", "#h2", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 648 | "vpmuludq "#r1", "#h2", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 649 | "vpaddq "#t0", "#v3", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 650 | "vpmuludq "#r1", "#h3", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 651 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 652 | "vpmuludq "#r1", "#h0", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 653 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 654 | "vpmuludq "#r1", "#h1", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 655 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 656 | "vpmuludq "#r2", "#h1", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 657 | "vpaddq "#t4", "#v3", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 658 | "vpmuludq "#r2", "#h2", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 659 | "vpaddq "#t1", "#v0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 660 | "vpmuludq "#r2", "#h0", "#v0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 661 | "vpaddq "#t2", "#v1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 662 | "vpmuludq "#r3", "#h0", "#v1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 663 | "vpaddq "#t3", "#v2", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 664 | "vpmuludq "#r3", "#h1", "#v2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 665 | "vpaddq "#t4", "#v3", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 666 | "vpmuludq "#r4", "#h0", "#v3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 667 | "vpaddq "#t2", "#v0", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 668 | "vpaddq "#t3", "#v1", "#t3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 669 | "vpaddq "#t4", "#v2", "#t4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 670 | "vpaddq "#t4", "#v3", "#t4"\n\t" |
wolfSSL | 15:117db924cf7c | 671 | |
wolfSSL | 15:117db924cf7c | 672 | /* Reduce the 64 bits of data to 26 bits by propagating carries between the five values. |
wolfSSL | 15:117db924cf7c | 673 | * Each step shifts a value right by 26 to get its carry, masks the value and adds the carry to the next one; the carry out of m4 is multiplied by 5 (shifted left by 2 and added to itself) and added to m0. h1 and h4 receive one more carry after they were masked. |
wolfSSL | 15:117db924cf7c | 674 | * h0-h4 contain the reduced H values. They may name the same registers as m0-m4, as the in-place reduction of the message data in poly1305_blocks_avx2 does. |
wolfSSL | 15:117db924cf7c | 675 | * m0-m4 contain the 4 H values to reduce. They are modified. |
wolfSSL | 15:117db924cf7c | 676 | * t0-t2 are temporaries. Note that the last line of the macro ends with a line continuation, so the blank line that follows it must be kept. |
wolfSSL | 15:117db924cf7c | 677 | * mask contains the 26-bit mask for each 64 bit value in the 256 bit register. |
wolfSSL | 15:117db924cf7c | 678 | */ |
wolfSSL | 15:117db924cf7c | 679 | #define REDUCE(h0, h1, h2, h3, h4, \ |
wolfSSL | 15:117db924cf7c | 680 | m0, m1, m2, m3, m4, \ |
wolfSSL | 15:117db924cf7c | 681 | t0, t1, t2, mask) \ |
wolfSSL | 15:117db924cf7c | 682 | "vpsrlq $26, "#m0", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 683 | "vpsrlq $26, "#m3", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 684 | "vpand "#mask", "#m0", "#m0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 685 | "vpand "#mask", "#m3", "#m3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 686 | "vpaddq "#m1", "#t0", "#m1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 687 | "vpaddq "#m4", "#t1", "#m4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 688 | \ |
wolfSSL | 15:117db924cf7c | 689 | "vpsrlq $26, "#m1", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 690 | "vpsrlq $26, "#m4", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 691 | "vpand "#mask", "#m1", "#h1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 692 | "vpand "#mask", "#m4", "#h4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 693 | "vpaddq "#m2", "#t0", "#m2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 694 | "vpslld $2, "#t1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 695 | "vpaddd "#t2", "#t1", "#t2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 696 | \ |
wolfSSL | 15:117db924cf7c | 697 | "vpsrlq $26, "#m2", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 698 | "vpaddq "#m0", "#t2", "#m0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 699 | "vpsrlq $26, "#m0", "#t1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 700 | "vpand "#mask", "#m2", "#h2"\n\t" \ |
wolfSSL | 15:117db924cf7c | 701 | "vpand "#mask", "#m0", "#h0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 702 | "vpaddq "#m3", "#t0", "#m3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 703 | "vpaddq "#h1", "#t1", "#h1"\n\t" \ |
wolfSSL | 15:117db924cf7c | 704 | \ |
wolfSSL | 15:117db924cf7c | 705 | "vpsrlq $26, "#m3", "#t0"\n\t" \ |
wolfSSL | 15:117db924cf7c | 706 | "vpand "#mask", "#m3", "#h3"\n\t" \ |
wolfSSL | 15:117db924cf7c | 707 | "vpaddq "#h4", "#t0", "#h4"\n\t" \ |
wolfSSL | 15:117db924cf7c | 708 | |
wolfSSL | 15:117db924cf7c | 709 | |
wolfSSL | 15:117db924cf7c | 710 | /* Process multiple blocks (n * 16 bytes) of data, four blocks (64 bytes) per step. |
wolfSSL | 15:117db924cf7c | 711 | * When the context is neither started nor finished, the first 64 bytes of m become the initial four H values; otherwise the H values are loaded from ctx->hh. While not finished, every loop step multiplies H by the r^4 values and adds the next 64 bytes, and the four H values are stored back to ctx->hh at the end. When ctx->finished is 1 no message data is read: H is multiplied by the loaded powers of r, the four values are added together and the result is reduced and written to ctx->h. ctx->started is set to 1 before returning. The asm text defines plain labels such as L_begin, so it must appear only once in the object file. NOTE(review): presumably the reason for POLY1305_NOINLINE - confirm. |
wolfSSL | 15:117db924cf7c | 712 | * ctx Poly1305 context. |
wolfSSL | 15:117db924cf7c | 713 | * m Blocks of message data. Not read when ctx->finished is 1. |
wolfSSL | 15:117db924cf7c | 714 | * bytes The number of bytes to process. Unless ctx->finished is 1 this must be a non-zero multiple of 64: the count is decreased by 64 per step and the loop only exits when it reaches exactly zero. |
wolfSSL | 15:117db924cf7c | 715 | */ |
wolfSSL | 15:117db924cf7c | 716 | POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx, |
wolfSSL | 15:117db924cf7c | 717 | const unsigned char* m, size_t bytes) |
wolfSSL | 15:117db924cf7c | 718 | { |
wolfSSL | 15:117db924cf7c | 719 | ALIGN256 word64 r4[5][4]; /* stack copy of ymm5-ymm9: the powers of r used by the multiply */ |
wolfSSL | 15:117db924cf7c | 720 | ALIGN256 word64 s[4][4]; /* stack copy of ymm10-ymm13: the MUL5 results used by the multiply */ |
wolfSSL | 15:117db924cf7c | 721 | register word32 t0 asm("r8") = 0; /* t0-t4 are written by MOVE_TO_32 on the finished path only */ |
wolfSSL | 15:117db924cf7c | 722 | register word32 t1 asm("r9") = 0; |
wolfSSL | 15:117db924cf7c | 723 | register word32 t2 asm("r10") = 0; |
wolfSSL | 15:117db924cf7c | 724 | register word32 t3 asm("r11") = 0; |
wolfSSL | 15:117db924cf7c | 725 | register word32 t4 asm("r12") = 0; |
wolfSSL | 15:117db924cf7c | 726 | static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff, |
wolfSSL | 15:117db924cf7c | 727 | 0x0000000003ffffff, 0x0000000003ffffff }; /* low 26 bits of every 64-bit lane */ |
wolfSSL | 15:117db924cf7c | 728 | static const word64 hibit[4] = { 0x1000000, 0x1000000, |
wolfSSL | 15:117db924cf7c | 729 | 0x1000000, 0x1000000 }; /* bit 24 of every 64-bit lane, passed as the hi argument of the message load */ |
wolfSSL | 15:117db924cf7c | 730 | |
wolfSSL | 15:117db924cf7c | 731 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 732 | "vpxor %%ymm15, %%ymm15, %%ymm15\n\t" |
wolfSSL | 15:117db924cf7c | 733 | "cmpb $1, %[started]\n\t" |
wolfSSL | 15:117db924cf7c | 734 | "je L_begin\n\t" |
wolfSSL | 15:117db924cf7c | 735 | "cmpb $1, %[fin]\n\t" |
wolfSSL | 15:117db924cf7c | 736 | "je L_begin\n\t" |
wolfSSL | 15:117db924cf7c | 737 | "# Load the message data\n\t" |
wolfSSL | 15:117db924cf7c | 738 | LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15) |
wolfSSL | 15:117db924cf7c | 739 | "vmovdqu %[mask], %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 740 | "# Reduce, in place, the message data\n\t" |
wolfSSL | 15:117db924cf7c | 741 | REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 742 | %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 743 | %%ymm10, %%ymm11, %%ymm12, %%ymm14) |
wolfSSL | 15:117db924cf7c | 744 | "addq $64, %[m]\n\t" |
wolfSSL | 15:117db924cf7c | 745 | "subq $64, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 746 | "jz L_store\n\t" |
wolfSSL | 15:117db924cf7c | 747 | "jmp L_load_r4\n\t" |
wolfSSL | 15:117db924cf7c | 748 | "\n" |
wolfSSL | 15:117db924cf7c | 749 | "L_begin:\n\t" |
wolfSSL | 15:117db924cf7c | 750 | "# Load the H values.\n\t" |
wolfSSL | 15:117db924cf7c | 751 | LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4) |
wolfSSL | 15:117db924cf7c | 752 | "# Check if there is a power of r to load - otherwise use r^4.\n\t" |
wolfSSL | 15:117db924cf7c | 753 | "cmpb $0, %[fin]\n\t" |
wolfSSL | 15:117db924cf7c | 754 | "je L_load_r4\n\t" |
wolfSSL | 15:117db924cf7c | 755 | "\n\t" |
wolfSSL | 15:117db924cf7c | 756 | "# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t" |
wolfSSL | 15:117db924cf7c | 757 | LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 758 | %%ymm10, %%ymm11, %%ymm12, %%ymm13) |
wolfSSL | 15:117db924cf7c | 759 | "jmp L_mul_5\n\t" |
wolfSSL | 15:117db924cf7c | 760 | "\n" |
wolfSSL | 15:117db924cf7c | 761 | "L_load_r4:\n\t" |
wolfSSL | 15:117db924cf7c | 762 | "# Load r^4 into all four positions.\n\t" |
wolfSSL | 15:117db924cf7c | 763 | LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 764 | %%ymm13, %%ymm14) /* NOTE(review): 320 is a hard-coded byte offset into the context, presumably that of ctx->r4 - confirm against the Poly1305 struct */ |
wolfSSL | 15:117db924cf7c | 765 | "\n" |
wolfSSL | 15:117db924cf7c | 766 | "L_mul_5:\n\t" |
wolfSSL | 15:117db924cf7c | 767 | "# Multiply top 4 26-bit values of all four H by 5\n\t" |
wolfSSL | 15:117db924cf7c | 768 | MUL5(%%ymm10, %%ymm11, %%ymm12, %%ymm13, %%ymm6, %%ymm7, %%ymm8, %%ymm9) |
wolfSSL | 15:117db924cf7c | 769 | "# Store powers of r and multiple of 5 for use in multiply.\n\t" |
wolfSSL | 15:117db924cf7c | 770 | "vmovdqa %%ymm10, (%[s])\n\t" |
wolfSSL | 15:117db924cf7c | 771 | "vmovdqa %%ymm11, 32(%[s])\n\t" |
wolfSSL | 15:117db924cf7c | 772 | "vmovdqa %%ymm12, 64(%[s])\n\t" |
wolfSSL | 15:117db924cf7c | 773 | "vmovdqa %%ymm13, 96(%[s])\n\t" |
wolfSSL | 15:117db924cf7c | 774 | "vmovdqa %%ymm5 , (%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 775 | "vmovdqa %%ymm6 , 32(%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 776 | "vmovdqa %%ymm7 , 64(%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 777 | "vmovdqa %%ymm8 , 96(%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 778 | "vmovdqa %%ymm9 , 128(%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 779 | "vmovdqu %[mask], %%ymm14\n\t" |
wolfSSL | 15:117db924cf7c | 780 | "\n" |
wolfSSL | 15:117db924cf7c | 781 | "# If not finished then loop over data\n\t" |
wolfSSL | 15:117db924cf7c | 782 | "cmpb $0x1, %[fin]\n\t" |
wolfSSL | 15:117db924cf7c | 783 | "jne L_start\n\t" |
wolfSSL | 15:117db924cf7c | 784 | "# Do last multiply, reduce, add the four H together and move to\n\t" |
wolfSSL | 15:117db924cf7c | 785 | "# 32-bit registers\n\t" |
wolfSSL | 15:117db924cf7c | 786 | MUL_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 787 | (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]), |
wolfSSL | 15:117db924cf7c | 788 | (%[s]), 32(%[s]), 64(%[s]), 96(%[s]), |
wolfSSL | 15:117db924cf7c | 789 | %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 790 | %%ymm10, %%ymm11, %%ymm12, %%ymm13) |
wolfSSL | 15:117db924cf7c | 791 | REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 792 | %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 793 | %%ymm10, %%ymm11, %%ymm12, %%ymm14) |
wolfSSL | 15:117db924cf7c | 794 | FINALIZE_H(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 795 | %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9) |
wolfSSL | 15:117db924cf7c | 796 | MOVE_TO_32(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, |
wolfSSL | 15:117db924cf7c | 797 | %[t0], %[t1], %[t2], %[t3], %[t4]) |
wolfSSL | 15:117db924cf7c | 798 | "jmp L_end\n\t" |
wolfSSL | 15:117db924cf7c | 799 | "\n" |
wolfSSL | 15:117db924cf7c | 800 | "L_start:\n\t" |
wolfSSL | 15:117db924cf7c | 801 | MUL_ADD_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 802 | (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]), |
wolfSSL | 15:117db924cf7c | 803 | (%[s]), 32(%[s]), 64(%[s]), 96(%[s]), |
wolfSSL | 15:117db924cf7c | 804 | %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 805 | %%ymm10, %%ymm11, %%ymm12, %%ymm13, |
wolfSSL | 15:117db924cf7c | 806 | %[hibit], %%ymm15) |
wolfSSL | 15:117db924cf7c | 807 | REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, |
wolfSSL | 15:117db924cf7c | 808 | %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9, |
wolfSSL | 15:117db924cf7c | 809 | %%ymm10, %%ymm11, %%ymm12, %%ymm14) |
wolfSSL | 15:117db924cf7c | 810 | "addq $64, %[m]\n\t" |
wolfSSL | 15:117db924cf7c | 811 | "subq $64, %[bytes]\n\t" |
wolfSSL | 15:117db924cf7c | 812 | "jnz L_start\n\t" |
wolfSSL | 15:117db924cf7c | 813 | "\n" |
wolfSSL | 15:117db924cf7c | 814 | "L_store:\n\t" |
wolfSSL | 15:117db924cf7c | 815 | "# Store four H values - state\n\t" |
wolfSSL | 15:117db924cf7c | 816 | STORE_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%xmm4) |
wolfSSL | 15:117db924cf7c | 817 | "\n" |
wolfSSL | 15:117db924cf7c | 818 | "L_end:\n\t" |
wolfSSL | 15:117db924cf7c | 819 | : [m] "+r" (m), [bytes] "+r" (bytes), |
wolfSSL | 15:117db924cf7c | 820 | [t0] "+r" (t0), [t1] "+r" (t1), [t2] "+r" (t2), |
wolfSSL | 15:117db924cf7c | 821 | [t3] "+r" (t3), [t4] "+r" (t4) |
wolfSSL | 15:117db924cf7c | 822 | : [ctx] "r" (ctx), [h] "r" (ctx->hh), |
wolfSSL | 15:117db924cf7c | 823 | [r4] "r" (r4), [s] "r" (s), |
wolfSSL | 15:117db924cf7c | 824 | [fin] "m" (ctx->finished), [started] "m" (ctx->started), |
wolfSSL | 15:117db924cf7c | 825 | [mask] "m" (mask), [hibit] "m" (hibit) |
wolfSSL | 15:117db924cf7c | 826 | : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", |
wolfSSL | 15:117db924cf7c | 827 | "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15", |
wolfSSL | 15:117db924cf7c | 828 | "memory" |
wolfSSL | 15:117db924cf7c | 829 | ); |
wolfSSL | 15:117db924cf7c | 830 | |
wolfSSL | 15:117db924cf7c | 831 | if (ctx->finished) |
wolfSSL | 15:117db924cf7c | 832 | { |
wolfSSL | 15:117db924cf7c | 833 | word64 h0, h1, h2, c; |
wolfSSL | 15:117db924cf7c | 834 | |
wolfSSL | 15:117db924cf7c | 835 | /* Convert the five 32-bit values t0-t4 (26 bits apart) to three 64-bit values 44, 44 and 42 bits apart. */ |
wolfSSL | 15:117db924cf7c | 836 | h0 = (((word64)(t1 & 0x3FFFF)) << 26) + t0; |
wolfSSL | 15:117db924cf7c | 837 | h1 = (((word64)(t3 & 0x3FF)) << 34) + |
wolfSSL | 15:117db924cf7c | 838 | (((word64) t2 ) << 8) + (t1 >> 18); |
wolfSSL | 15:117db924cf7c | 839 | h2 = (((word64) t4 ) << 16) + (t3 >> 10); |
wolfSSL | 15:117db924cf7c | 840 | |
wolfSSL | 15:117db924cf7c | 841 | /* Perform modular reduction: carries move h1 to h2, and bits above the 42 kept in h2 are added to h0 multiplied by 5. Two passes. */ |
wolfSSL | 15:117db924cf7c | 842 | c = (h1 >> 44); h1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 843 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; |
wolfSSL | 15:117db924cf7c | 844 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 845 | h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 846 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; |
wolfSSL | 15:117db924cf7c | 847 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 848 | h1 += c; |
wolfSSL | 15:117db924cf7c | 849 | |
wolfSSL | 15:117db924cf7c | 850 | /* Convert from 42/44/44 to 2/64/64 bits used and store result. */ |
wolfSSL | 15:117db924cf7c | 851 | ctx->h[0] = h0 | (h1 << 44); |
wolfSSL | 15:117db924cf7c | 852 | ctx->h[1] = (h1 >> 20) | (h2 << 24); |
wolfSSL | 15:117db924cf7c | 853 | ctx->h[2] = h2 >> 40; |
wolfSSL | 15:117db924cf7c | 854 | } |
wolfSSL | 15:117db924cf7c | 855 | |
wolfSSL | 15:117db924cf7c | 856 | ctx->started = 1; |
wolfSSL | 15:117db924cf7c | 857 | } |
wolfSSL | 15:117db924cf7c | 858 | |
wolfSSL | 15:117db924cf7c | 859 | /* Multiply two 130-bit numbers in 64-bit registers and reduce. |
wolfSSL | 15:117db924cf7c | 860 | * 44 + 44 + 42 = 130 bits |
wolfSSL | 15:117db924cf7c | 861 | * Expands to several statements (the caller supplies the final semicolon) and uses the names s1, s2, d0, d1, d2, d and c from the enclosing scope, plus the helpers MUL, ADD, ADDLO, SHR and LO, which are not defined in this part of the file. s1 and s2 are a1 and a2 times 20 (5 << 2) and stand in for a1 and a2 in the products r2*a1 and r1*a2 (summed into d0) and r2*a2 (summed into d1). |
wolfSSL | 15:117db924cf7c | 862 | * r0-r2 are the first operand and the result. |
wolfSSL | 15:117db924cf7c | 863 | * a0-a2 are the second operand. They are only read. |
wolfSSL | 15:117db924cf7c | 864 | */ |
wolfSSL | 15:117db924cf7c | 865 | #define MUL_64(r0, r1, r2, a0, a1, a2) \ |
wolfSSL | 15:117db924cf7c | 866 | s1 = a1 * (5 << 2); \ |
wolfSSL | 15:117db924cf7c | 867 | s2 = a2 * (5 << 2); \ |
wolfSSL | 15:117db924cf7c | 868 | MUL(d0, r0, a0); MUL(d, r1, s2); ADD(d0, d); MUL(d, r2, s1); ADD(d0, d); \ |
wolfSSL | 15:117db924cf7c | 869 | MUL(d1, r0, a1); MUL(d, r1, a0); ADD(d1, d); MUL(d, r2, s2); ADD(d1, d); \ |
wolfSSL | 15:117db924cf7c | 870 | MUL(d2, r0, a2); MUL(d, r1, a1); ADD(d2, d); MUL(d, r2, a0); ADD(d2, d); \ |
wolfSSL | 15:117db924cf7c | 871 | \ |
wolfSSL | 15:117db924cf7c | 872 | c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 873 | ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 874 | ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \ |
wolfSSL | 15:117db924cf7c | 875 | r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 876 | r1 += c |
wolfSSL | 15:117db924cf7c | 877 | /* Square a 130-bit number (44 + 44 + 42 bits) in 64-bit registers and reduce. r0-r2 are the operand and the result. The cross products are doubled by adding them to themselves, and the carry steps are the same as in MUL_64. Uses s2, d0, d1, d2, d and c from the enclosing scope; the caller supplies the final semicolon. */ |
wolfSSL | 15:117db924cf7c | 878 | #define SQR_64(r0, r1, r2) \ |
wolfSSL | 15:117db924cf7c | 879 | s2 = r2 * (5 << 2); \ |
wolfSSL | 15:117db924cf7c | 880 | MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d); \ |
wolfSSL | 15:117db924cf7c | 881 | MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d); \ |
wolfSSL | 15:117db924cf7c | 882 | MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d); \ |
wolfSSL | 15:117db924cf7c | 883 | \ |
wolfSSL | 15:117db924cf7c | 884 | c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 885 | ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 886 | ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff; \ |
wolfSSL | 15:117db924cf7c | 887 | r0 += c * 5; c = (r0 >> 44); r0 = r0 & 0xfffffffffff; \ |
wolfSSL | 15:117db924cf7c | 888 | r1 += c |
wolfSSL | 15:117db924cf7c | 889 | |
wolfSSL | 15:117db924cf7c | 890 | /* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits. |
wolfSSL | 15:117db924cf7c | 891 | * Expands to five assignments (the caller supplies the final semicolon). r[0] to r[3] are masked to 26 bits; r[4] takes r2 >> 16 without a mask. |
wolfSSL | 15:117db924cf7c | 892 | * r0-r2 contains the 130-bit number in 64-bit registers. |
wolfSSL | 15:117db924cf7c | 893 | * r is the address of where to store the 26 of 32 bits result. |
wolfSSL | 15:117db924cf7c | 894 | */ |
wolfSSL | 15:117db924cf7c | 895 | #define CONV_64_TO_32(r0, r1, r2, r) \ |
wolfSSL | 15:117db924cf7c | 896 | r[0] = (word32)( r0 ) & 0x3ffffff; \ |
wolfSSL | 15:117db924cf7c | 897 | r[1] = (word32)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; \ |
wolfSSL | 15:117db924cf7c | 898 | r[2] = (word32)( r1 >> 8 ) & 0x3ffffff; \ |
wolfSSL | 15:117db924cf7c | 899 | r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \ |
wolfSSL | 15:117db924cf7c | 900 | r[4] = (word32)( r2 >> 16 ) |
wolfSSL | 15:117db924cf7c | 901 | |
wolfSSL | 15:117db924cf7c | 902 | /* Derive r^1, r^2, r^3 and r^4 from the key value in ctx->r and save them |
wolfSSL | 15:117db924cf7c | 903 | * in ctx->r1, ctx->r2, ctx->r3 and ctx->r4 as 26-bit values in 32 bits. |
wolfSSL | 15:117db924cf7c | 904 | * ctx Poly1305 context. ctx->r[0] and ctx->r[1] are read. |
wolfSSL | 15:117db924cf7c | 905 | */ |
wolfSSL | 15:117db924cf7c | 906 | static void poly1305_calc_powers(Poly1305* ctx) |
wolfSSL | 15:117db924cf7c | 907 | { |
wolfSSL | 15:117db924cf7c | 908 | word64 key_lo, key_hi; |
wolfSSL | 15:117db924cf7c | 909 | word64 p10, p11, p12; /* r^1 split 44 + 44 + 42 bits */ |
wolfSSL | 15:117db924cf7c | 910 | word64 p20, p21, p22; /* r^2 */ |
wolfSSL | 15:117db924cf7c | 911 | word64 p30, p31, p32; /* r^3 */ |
wolfSSL | 15:117db924cf7c | 912 | word64 p40, p41, p42; /* r^4 */ |
wolfSSL | 15:117db924cf7c | 913 | word64 s1, s2, c; /* names used inside MUL_64 and SQR_64 */ |
wolfSSL | 15:117db924cf7c | 914 | word128 d0, d1, d2, d; /* names used inside MUL_64 and SQR_64 */ |
wolfSSL | 15:117db924cf7c | 915 | key_lo = ctx->r[0]; |
wolfSSL | 15:117db924cf7c | 916 | key_hi = ctx->r[1]; |
wolfSSL | 15:117db924cf7c | 917 | p10 = key_lo & 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 918 | p11 = ((key_lo >> 44) | (key_hi << 20)) & 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 919 | p12 = (key_hi >> 24) & 0x00fffffffff; |
wolfSSL | 15:117db924cf7c | 920 | |
wolfSSL | 15:117db924cf7c | 921 | /* r^2 = r^1 squared */ |
wolfSSL | 15:117db924cf7c | 922 | p20 = p10; p21 = p11; p22 = p12; |
wolfSSL | 15:117db924cf7c | 923 | SQR_64(p20, p21, p22); |
wolfSSL | 15:117db924cf7c | 924 | |
wolfSSL | 15:117db924cf7c | 925 | /* r^3 = r^2 times r^1 */ |
wolfSSL | 15:117db924cf7c | 926 | p30 = p20; p31 = p21; p32 = p22; |
wolfSSL | 15:117db924cf7c | 927 | MUL_64(p30, p31, p32, p10, p11, p12); |
wolfSSL | 15:117db924cf7c | 928 | |
wolfSSL | 15:117db924cf7c | 929 | /* r^4 = r^2 squared */ |
wolfSSL | 15:117db924cf7c | 930 | p40 = p20; p41 = p21; p42 = p22; |
wolfSSL | 15:117db924cf7c | 931 | SQR_64(p40, p41, p42); |
wolfSSL | 15:117db924cf7c | 932 | |
wolfSSL | 15:117db924cf7c | 933 | /* Save all four powers in the context. */ |
wolfSSL | 15:117db924cf7c | 934 | CONV_64_TO_32(p10, p11, p12, ctx->r1); |
wolfSSL | 15:117db924cf7c | 935 | CONV_64_TO_32(p20, p21, p22, ctx->r2); |
wolfSSL | 15:117db924cf7c | 936 | CONV_64_TO_32(p30, p31, p32, ctx->r3); |
wolfSSL | 15:117db924cf7c | 937 | CONV_64_TO_32(p40, p41, p42, ctx->r4); |
wolfSSL | 15:117db924cf7c | 938 | |
wolfSSL | 15:117db924cf7c | 939 | } |
wolfSSL | 15:117db924cf7c | 940 | |
wolfSSL | 15:117db924cf7c | 941 | /* Set the key to use when processing data. |
wolfSSL | 15:117db924cf7c | 942 | * Initialize the context. |
wolfSSL | 15:117db924cf7c | 943 | * Calls AVX set key function as final function calls AVX code. |
wolfSSL | 15:117db924cf7c | 944 | * After poly1305_setkey_avx returns, five unaligned 32-byte stores clear 160 bytes starting at ctx->hh, and leftover, finished and started are reset to 0. |
wolfSSL | 15:117db924cf7c | 945 | * ctx Poly1305 context. |
wolfSSL | 15:117db924cf7c | 946 | * key The key data (16 bytes). NOTE(review): a Poly1305 key is normally 32 bytes - confirm how many bytes poly1305_setkey_avx reads. |
wolfSSL | 15:117db924cf7c | 947 | */ |
wolfSSL | 15:117db924cf7c | 948 | static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key) |
wolfSSL | 15:117db924cf7c | 949 | { |
wolfSSL | 15:117db924cf7c | 950 | poly1305_setkey_avx(ctx, key); |
wolfSSL | 15:117db924cf7c | 951 | |
wolfSSL | 15:117db924cf7c | 952 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 953 | "vpxor %%ymm0, %%ymm0, %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 954 | "vmovdqu %%ymm0, (%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 955 | "vmovdqu %%ymm0, 32(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 956 | "vmovdqu %%ymm0, 64(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 957 | "vmovdqu %%ymm0, 96(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 958 | "vmovdqu %%ymm0, 128(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 959 | : |
wolfSSL | 15:117db924cf7c | 960 | : [hh] "r" (ctx->hh) |
wolfSSL | 15:117db924cf7c | 961 | : "memory", "ymm0" |
wolfSSL | 15:117db924cf7c | 962 | ); |
wolfSSL | 15:117db924cf7c | 963 | |
wolfSSL | 15:117db924cf7c | 964 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 965 | ctx->finished = 0; |
wolfSSL | 15:117db924cf7c | 966 | ctx->started = 0; |
wolfSSL | 15:117db924cf7c | 967 | } |
wolfSSL | 15:117db924cf7c | 968 | |
wolfSSL | 15:117db924cf7c | 969 | /* Produce the authentication data and wipe the secret state. |
wolfSSL | 15:117db924cf7c | 970 | * Marks the context finished, runs the AVX2 block function once more if it |
wolfSSL | 15:117db924cf7c | 971 | * was started, passes any whole buffered blocks to the AVX block function, |
wolfSSL | 15:117db924cf7c | 972 | * moves the remaining bytes to the front of the buffer and calls the AVX final function. |
wolfSSL | 15:117db924cf7c | 973 | * ctx Poly1305 context. |
wolfSSL | 15:117db924cf7c | 974 | * mac Buffer to hold 16 bytes - authentication data. |
wolfSSL | 15:117db924cf7c | 975 | */ |
wolfSSL | 15:117db924cf7c | 976 | static void poly1305_final_avx2(Poly1305* ctx, byte* mac) |
wolfSSL | 15:117db924cf7c | 977 | { |
wolfSSL | 15:117db924cf7c | 978 | const int pending = (int)ctx->leftover; |
wolfSSL | 15:117db924cf7c | 979 | const int whole = pending & ~(POLY1305_BLOCK_SIZE - 1); |
wolfSSL | 15:117db924cf7c | 980 | int src = whole, dst = 0; |
wolfSSL | 15:117db924cf7c | 981 | ctx->finished = 1; |
wolfSSL | 15:117db924cf7c | 982 | if (ctx->started != 0) |
wolfSSL | 15:117db924cf7c | 983 | poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4); |
wolfSSL | 15:117db924cf7c | 984 | |
wolfSSL | 15:117db924cf7c | 985 | if (whole > 0) |
wolfSSL | 15:117db924cf7c | 986 | poly1305_blocks_avx(ctx, ctx->buffer, whole); |
wolfSSL | 15:117db924cf7c | 987 | ctx->leftover -= whole; |
wolfSSL | 15:117db924cf7c | 988 | /* Move the bytes after the whole blocks to the front of the buffer. */ |
wolfSSL | 15:117db924cf7c | 989 | while (src < pending) |
wolfSSL | 15:117db924cf7c | 990 | ctx->buffer[dst++] = ctx->buffer[src++]; |
wolfSSL | 15:117db924cf7c | 991 | |
wolfSSL | 15:117db924cf7c | 992 | poly1305_final_avx(ctx, mac); |
wolfSSL | 15:117db924cf7c | 993 | |
wolfSSL | 15:117db924cf7c | 994 | /* Clear ctx->hh (160 bytes) and 32 bytes at each of ctx->r1 to ctx->r4. */ |
wolfSSL | 15:117db924cf7c | 995 | __asm__ __volatile__ ( |
wolfSSL | 15:117db924cf7c | 996 | "vpxor %%ymm0, %%ymm0, %%ymm0\n\t" |
wolfSSL | 15:117db924cf7c | 997 | "vmovdqu %%ymm0, (%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 998 | "vmovdqu %%ymm0, 32(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 999 | "vmovdqu %%ymm0, 64(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 1000 | "vmovdqu %%ymm0, 96(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 1001 | "vmovdqu %%ymm0, 128(%[hh])\n\t" |
wolfSSL | 15:117db924cf7c | 1002 | "vmovdqu %%ymm0, (%[r1])\n\t" |
wolfSSL | 15:117db924cf7c | 1003 | "vmovdqu %%ymm0, (%[r2])\n\t" |
wolfSSL | 15:117db924cf7c | 1004 | "vmovdqu %%ymm0, (%[r3])\n\t" |
wolfSSL | 15:117db924cf7c | 1005 | "vmovdqu %%ymm0, (%[r4])\n\t" |
wolfSSL | 15:117db924cf7c | 1006 | : |
wolfSSL | 15:117db924cf7c | 1007 | : [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2), |
wolfSSL | 15:117db924cf7c | 1008 | [r3] "r" (ctx->r3), [r4] "r" (ctx->r4) |
wolfSSL | 15:117db924cf7c | 1009 | : "memory", "ymm0" |
wolfSSL | 15:117db924cf7c | 1010 | ); |
wolfSSL | 15:117db924cf7c | 1011 | |
wolfSSL | 15:117db924cf7c | 1012 | ctx->started = 0; |
wolfSSL | 15:117db924cf7c | 1013 | ctx->finished = 0; |
wolfSSL | 15:117db924cf7c | 1014 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 1015 | } |
wolfSSL | 15:117db924cf7c | 1016 | #endif |
wolfSSL | 15:117db924cf7c | 1017 | |
wolfSSL | 15:117db924cf7c | 1018 | #elif defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 1019 | |
wolfSSL | 15:117db924cf7c | 1020 | static word64 U8TO64(const byte* p) |
wolfSSL | 15:117db924cf7c | 1021 | { |
wolfSSL | 15:117db924cf7c | 1022 | return |
wolfSSL | 15:117db924cf7c | 1023 | (((word64)(p[0] & 0xff) ) | |
wolfSSL | 15:117db924cf7c | 1024 | ((word64)(p[1] & 0xff) << 8) | |
wolfSSL | 15:117db924cf7c | 1025 | ((word64)(p[2] & 0xff) << 16) | |
wolfSSL | 15:117db924cf7c | 1026 | ((word64)(p[3] & 0xff) << 24) | |
wolfSSL | 15:117db924cf7c | 1027 | ((word64)(p[4] & 0xff) << 32) | |
wolfSSL | 15:117db924cf7c | 1028 | ((word64)(p[5] & 0xff) << 40) | |
wolfSSL | 15:117db924cf7c | 1029 | ((word64)(p[6] & 0xff) << 48) | |
wolfSSL | 15:117db924cf7c | 1030 | ((word64)(p[7] & 0xff) << 56)); |
wolfSSL | 15:117db924cf7c | 1031 | } |
wolfSSL | 15:117db924cf7c | 1032 | |
wolfSSL | 15:117db924cf7c | 1033 | static void U64TO8(byte* p, word64 v) { |
wolfSSL | 15:117db924cf7c | 1034 | p[0] = (v ) & 0xff; |
wolfSSL | 15:117db924cf7c | 1035 | p[1] = (v >> 8) & 0xff; |
wolfSSL | 15:117db924cf7c | 1036 | p[2] = (v >> 16) & 0xff; |
wolfSSL | 15:117db924cf7c | 1037 | p[3] = (v >> 24) & 0xff; |
wolfSSL | 15:117db924cf7c | 1038 | p[4] = (v >> 32) & 0xff; |
wolfSSL | 15:117db924cf7c | 1039 | p[5] = (v >> 40) & 0xff; |
wolfSSL | 15:117db924cf7c | 1040 | p[6] = (v >> 48) & 0xff; |
wolfSSL | 15:117db924cf7c | 1041 | p[7] = (v >> 56) & 0xff; |
wolfSSL | 15:117db924cf7c | 1042 | } |
wolfSSL | 15:117db924cf7c | 1043 | |
wolfSSL | 15:117db924cf7c | 1044 | #else /* if not 64 bit then use 32 bit */ |
wolfSSL | 15:117db924cf7c | 1045 | |
wolfSSL | 15:117db924cf7c | 1046 | static word32 U8TO32(const byte *p) |
wolfSSL | 15:117db924cf7c | 1047 | { |
wolfSSL | 15:117db924cf7c | 1048 | return |
wolfSSL | 15:117db924cf7c | 1049 | (((word32)(p[0] & 0xff) ) | |
wolfSSL | 15:117db924cf7c | 1050 | ((word32)(p[1] & 0xff) << 8) | |
wolfSSL | 15:117db924cf7c | 1051 | ((word32)(p[2] & 0xff) << 16) | |
wolfSSL | 15:117db924cf7c | 1052 | ((word32)(p[3] & 0xff) << 24)); |
wolfSSL | 15:117db924cf7c | 1053 | } |
wolfSSL | 15:117db924cf7c | 1054 | |
wolfSSL | 15:117db924cf7c | 1055 | static void U32TO8(byte *p, word32 v) { |
wolfSSL | 15:117db924cf7c | 1056 | p[0] = (v ) & 0xff; |
wolfSSL | 15:117db924cf7c | 1057 | p[1] = (v >> 8) & 0xff; |
wolfSSL | 15:117db924cf7c | 1058 | p[2] = (v >> 16) & 0xff; |
wolfSSL | 15:117db924cf7c | 1059 | p[3] = (v >> 24) & 0xff; |
wolfSSL | 15:117db924cf7c | 1060 | } |
wolfSSL | 15:117db924cf7c | 1061 | #endif |
wolfSSL | 15:117db924cf7c | 1062 | |
wolfSSL | 15:117db924cf7c | 1063 | |
wolfSSL | 15:117db924cf7c | 1064 | static void U32TO64(word32 v, byte* p) |
wolfSSL | 15:117db924cf7c | 1065 | { |
wolfSSL | 15:117db924cf7c | 1066 | XMEMSET(p, 0, 8); |
wolfSSL | 15:117db924cf7c | 1067 | p[0] = (v & 0xFF); |
wolfSSL | 15:117db924cf7c | 1068 | p[1] = (v >> 8) & 0xFF; |
wolfSSL | 15:117db924cf7c | 1069 | p[2] = (v >> 16) & 0xFF; |
wolfSSL | 15:117db924cf7c | 1070 | p[3] = (v >> 24) & 0xFF; |
wolfSSL | 15:117db924cf7c | 1071 | } |
wolfSSL | 15:117db924cf7c | 1072 | |
/* Absorb whole 16 byte blocks of message into the accumulator ctx->h.
 * For every block: h = (h + block) * r, partially reduced mod p, where p is
 * the Poly1305 prime 2^130 - 5. Only complete blocks are consumed; any tail
 * shorter than POLY1305_BLOCK_SIZE is ignored by the loops below.
 *
 * One of three implementations is selected at compile time:
 *   USE_INTEL_SPEEDUP  forwards to poly1305_blocks_avx.
 *   POLY130564         three limbs of 44, 44 and 42 bits in word64.
 *   otherwise          five limbs of 26 bits in word32.
 *
 * ctx    Poly1305 context holding r (key), h (accumulator) and finished.
 * m      Message data.
 * bytes  Number of bytes available at m.
 */
static void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
                            size_t bytes)
{
#ifdef USE_INTEL_SPEEDUP
    /* AVX2 is handled in wc_Poly1305Update. */
    poly1305_blocks_avx(ctx, m, bytes);
#elif defined(POLY130564)
    /* Bit 128 of the block lives at bit 40 of the top limb (44 + 44 + 40).
     * It is left out once ctx->finished is set, because the final partial
     * block carries its own padding byte. */
    const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */
    word64 r0,r1,r2;
    word64 s1,s2;
    word64 h0,h1,h2;
    word64 c;
    word128 d0,d1,d2,d;

    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];

    /* Pre-scaled key limbs for product terms that wrap past the top limb:
     * a factor of 5 from 2^130 = 5 (mod p) and a factor of 4 (the << 2)
     * because the limbs span 44 + 44 + 44 = 132 bits when wrapping. */
    s1 = r1 * (5 << 2);
    s2 = r2 * (5 << 2);

    while (bytes >= POLY1305_BLOCK_SIZE) {
        word64 t0,t1;

        /* h += m[i] */
        t0 = U8TO64(&m[0]);
        t1 = U8TO64(&m[8]);

        /* Split the 128-bit block into 44/44/40-bit pieces and add hibit. */
        h0 += (( t0                    ) & 0xfffffffffff);
        h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
        h2 += (((t1 >> 24)             ) & 0x3ffffffffff) | hibit;

        /* h *= r */
        /* Schoolbook multiply into 128-bit columns d0..d2; d is scratch. */
        MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d);
        MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d);
        MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d);

        /* (partial) h %= p */
        /* Propagate carries limb to limb; the carry out of the top limb is
         * folded back into h0 multiplied by 5. h1 may be left slightly over
         * 44 bits, which is why this is only a partial reduction. */
        c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff;
        ADDLO(d1, c); c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff;
        ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff;
        h1 += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    /* Write the updated accumulator back to the context. */
    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;

#else /* if not 64 bit then use 32 bit */
    /* Bit 128 of the block lives at bit 24 of the top limb (4 * 26 + 24).
     * It is left out once ctx->finished is set. */
    const word32 hibit = (ctx->finished) ? 0 : (1 << 24); /* 1 << 128 */
    word32 r0,r1,r2,r3,r4;
    word32 s1,s2,s3,s4;
    word32 h0,h1,h2,h3,h4;
    word64 d0,d1,d2,d3,d4;
    word32 c;


    r0 = ctx->r[0];
    r1 = ctx->r[1];
    r2 = ctx->r[2];
    r3 = ctx->r[3];
    r4 = ctx->r[4];

    /* Key limbs times 5, used for product terms that wrap past 2^130. */
    s1 = r1 * 5;
    s2 = r2 * 5;
    s3 = r3 * 5;
    s4 = r4 * 5;

    h0 = ctx->h[0];
    h1 = ctx->h[1];
    h2 = ctx->h[2];
    h3 = ctx->h[3];
    h4 = ctx->h[4];

    while (bytes >= POLY1305_BLOCK_SIZE) {
        /* h += m[i] */
        /* Overlapping 32-bit loads are shifted so each yields 26 bits. */
        h0 += (U8TO32(m+ 0)     ) & 0x3ffffff;
        h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff;
        h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff;
        h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff;
        h4 += (U8TO32(m+12) >> 8) | hibit;

        /* h *= r */
        /* Schoolbook multiply into 64-bit columns d0..d4. */
        d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) +
             ((word64)h3 * s2) + ((word64)h4 * s1);
        d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) +
             ((word64)h3 * s3) + ((word64)h4 * s2);
        d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) +
             ((word64)h3 * s4) + ((word64)h4 * s3);
        d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) +
             ((word64)h3 * r0) + ((word64)h4 * s4);
        d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) +
             ((word64)h3 * r1) + ((word64)h4 * r0);

        /* (partial) h %= p */
        /* Propagate carries limb to limb; the carry out of the top limb is
         * folded back into h0 multiplied by 5. */
                      c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff;
        d1 += c;      c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff;
        d2 += c;      c = (word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff;
        d3 += c;      c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff;
        d4 += c;      c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff;
        h0 += c * 5;  c = (h0 >> 26); h0 = h0 & 0x3ffffff;
        h1 += c;

        m += POLY1305_BLOCK_SIZE;
        bytes -= POLY1305_BLOCK_SIZE;
    }

    /* Write the updated accumulator back to the context. */
    ctx->h[0] = h0;
    ctx->h[1] = h1;
    ctx->h[2] = h2;
    ctx->h[3] = h3;
    ctx->h[4] = h4;

#endif /* end of 64 bit cpu blocks or 32 bit cpu */
}
wolfSSL | 15:117db924cf7c | 1196 | |
wolfSSL | 15:117db924cf7c | 1197 | static void poly1305_block(Poly1305* ctx, const unsigned char *m) |
wolfSSL | 15:117db924cf7c | 1198 | { |
wolfSSL | 15:117db924cf7c | 1199 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1200 | /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */ |
wolfSSL | 15:117db924cf7c | 1201 | poly1305_block_avx(ctx, m); |
wolfSSL | 15:117db924cf7c | 1202 | #else |
wolfSSL | 15:117db924cf7c | 1203 | poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE); |
wolfSSL | 15:117db924cf7c | 1204 | #endif |
wolfSSL | 15:117db924cf7c | 1205 | } |
wolfSSL | 15:117db924cf7c | 1206 | |
wolfSSL | 15:117db924cf7c | 1207 | |
wolfSSL | 15:117db924cf7c | 1208 | int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) |
wolfSSL | 15:117db924cf7c | 1209 | { |
wolfSSL | 15:117db924cf7c | 1210 | #if defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 1211 | word64 t0,t1; |
wolfSSL | 15:117db924cf7c | 1212 | #endif |
wolfSSL | 15:117db924cf7c | 1213 | |
wolfSSL | 15:117db924cf7c | 1214 | if (key == NULL) |
wolfSSL | 15:117db924cf7c | 1215 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1216 | |
wolfSSL | 15:117db924cf7c | 1217 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 1218 | word32 k; |
wolfSSL | 15:117db924cf7c | 1219 | printf("Poly key used:\n"); |
wolfSSL | 15:117db924cf7c | 1220 | for (k = 0; k < keySz; k++) { |
wolfSSL | 15:117db924cf7c | 1221 | printf("%02x", key[k]); |
wolfSSL | 15:117db924cf7c | 1222 | if ((k+1) % 8 == 0) |
wolfSSL | 15:117db924cf7c | 1223 | printf("\n"); |
wolfSSL | 15:117db924cf7c | 1224 | } |
wolfSSL | 15:117db924cf7c | 1225 | printf("\n"); |
wolfSSL | 15:117db924cf7c | 1226 | #endif |
wolfSSL | 15:117db924cf7c | 1227 | |
wolfSSL | 15:117db924cf7c | 1228 | if (keySz != 32 || ctx == NULL) |
wolfSSL | 15:117db924cf7c | 1229 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1230 | |
wolfSSL | 15:117db924cf7c | 1231 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1232 | if (!cpu_flags_set) { |
wolfSSL | 15:117db924cf7c | 1233 | intel_flags = cpuid_get_flags(); |
wolfSSL | 15:117db924cf7c | 1234 | cpu_flags_set = 1; |
wolfSSL | 15:117db924cf7c | 1235 | } |
wolfSSL | 15:117db924cf7c | 1236 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 1237 | if (IS_INTEL_AVX2(intel_flags)) |
wolfSSL | 15:117db924cf7c | 1238 | poly1305_setkey_avx2(ctx, key); |
wolfSSL | 15:117db924cf7c | 1239 | else |
wolfSSL | 15:117db924cf7c | 1240 | #endif |
wolfSSL | 15:117db924cf7c | 1241 | poly1305_setkey_avx(ctx, key); |
wolfSSL | 15:117db924cf7c | 1242 | #elif defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 1243 | |
wolfSSL | 15:117db924cf7c | 1244 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ |
wolfSSL | 15:117db924cf7c | 1245 | t0 = U8TO64(key + 0); |
wolfSSL | 15:117db924cf7c | 1246 | t1 = U8TO64(key + 8); |
wolfSSL | 15:117db924cf7c | 1247 | |
wolfSSL | 15:117db924cf7c | 1248 | ctx->r[0] = ( t0 ) & 0xffc0fffffff; |
wolfSSL | 15:117db924cf7c | 1249 | ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; |
wolfSSL | 15:117db924cf7c | 1250 | ctx->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; |
wolfSSL | 15:117db924cf7c | 1251 | |
wolfSSL | 15:117db924cf7c | 1252 | /* h (accumulator) = 0 */ |
wolfSSL | 15:117db924cf7c | 1253 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 1254 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 1255 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 1256 | |
wolfSSL | 15:117db924cf7c | 1257 | /* save pad for later */ |
wolfSSL | 15:117db924cf7c | 1258 | ctx->pad[0] = U8TO64(key + 16); |
wolfSSL | 15:117db924cf7c | 1259 | ctx->pad[1] = U8TO64(key + 24); |
wolfSSL | 15:117db924cf7c | 1260 | |
wolfSSL | 15:117db924cf7c | 1261 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 1262 | ctx->finished = 0; |
wolfSSL | 15:117db924cf7c | 1263 | |
wolfSSL | 15:117db924cf7c | 1264 | #else /* if not 64 bit then use 32 bit */ |
wolfSSL | 15:117db924cf7c | 1265 | |
wolfSSL | 15:117db924cf7c | 1266 | /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ |
wolfSSL | 15:117db924cf7c | 1267 | ctx->r[0] = (U8TO32(key + 0) ) & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1268 | ctx->r[1] = (U8TO32(key + 3) >> 2) & 0x3ffff03; |
wolfSSL | 15:117db924cf7c | 1269 | ctx->r[2] = (U8TO32(key + 6) >> 4) & 0x3ffc0ff; |
wolfSSL | 15:117db924cf7c | 1270 | ctx->r[3] = (U8TO32(key + 9) >> 6) & 0x3f03fff; |
wolfSSL | 15:117db924cf7c | 1271 | ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff; |
wolfSSL | 15:117db924cf7c | 1272 | |
wolfSSL | 15:117db924cf7c | 1273 | /* h = 0 */ |
wolfSSL | 15:117db924cf7c | 1274 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 1275 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 1276 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 1277 | ctx->h[3] = 0; |
wolfSSL | 15:117db924cf7c | 1278 | ctx->h[4] = 0; |
wolfSSL | 15:117db924cf7c | 1279 | |
wolfSSL | 15:117db924cf7c | 1280 | /* save pad for later */ |
wolfSSL | 15:117db924cf7c | 1281 | ctx->pad[0] = U8TO32(key + 16); |
wolfSSL | 15:117db924cf7c | 1282 | ctx->pad[1] = U8TO32(key + 20); |
wolfSSL | 15:117db924cf7c | 1283 | ctx->pad[2] = U8TO32(key + 24); |
wolfSSL | 15:117db924cf7c | 1284 | ctx->pad[3] = U8TO32(key + 28); |
wolfSSL | 15:117db924cf7c | 1285 | |
wolfSSL | 15:117db924cf7c | 1286 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 1287 | ctx->finished = 0; |
wolfSSL | 15:117db924cf7c | 1288 | |
wolfSSL | 15:117db924cf7c | 1289 | #endif |
wolfSSL | 15:117db924cf7c | 1290 | |
wolfSSL | 15:117db924cf7c | 1291 | return 0; |
wolfSSL | 15:117db924cf7c | 1292 | } |
wolfSSL | 15:117db924cf7c | 1293 | |
wolfSSL | 15:117db924cf7c | 1294 | |
wolfSSL | 15:117db924cf7c | 1295 | int wc_Poly1305Final(Poly1305* ctx, byte* mac) |
wolfSSL | 15:117db924cf7c | 1296 | { |
wolfSSL | 15:117db924cf7c | 1297 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1298 | #elif defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 1299 | |
wolfSSL | 15:117db924cf7c | 1300 | word64 h0,h1,h2,c; |
wolfSSL | 15:117db924cf7c | 1301 | word64 g0,g1,g2; |
wolfSSL | 15:117db924cf7c | 1302 | word64 t0,t1; |
wolfSSL | 15:117db924cf7c | 1303 | |
wolfSSL | 15:117db924cf7c | 1304 | #else |
wolfSSL | 15:117db924cf7c | 1305 | |
wolfSSL | 15:117db924cf7c | 1306 | word32 h0,h1,h2,h3,h4,c; |
wolfSSL | 15:117db924cf7c | 1307 | word32 g0,g1,g2,g3,g4; |
wolfSSL | 15:117db924cf7c | 1308 | word64 f; |
wolfSSL | 15:117db924cf7c | 1309 | word32 mask; |
wolfSSL | 15:117db924cf7c | 1310 | |
wolfSSL | 15:117db924cf7c | 1311 | #endif |
wolfSSL | 15:117db924cf7c | 1312 | |
wolfSSL | 15:117db924cf7c | 1313 | if (ctx == NULL) |
wolfSSL | 15:117db924cf7c | 1314 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1315 | |
wolfSSL | 15:117db924cf7c | 1316 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1317 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 1318 | if (IS_INTEL_AVX2(intel_flags)) |
wolfSSL | 15:117db924cf7c | 1319 | poly1305_final_avx2(ctx, mac); |
wolfSSL | 15:117db924cf7c | 1320 | else |
wolfSSL | 15:117db924cf7c | 1321 | #endif |
wolfSSL | 15:117db924cf7c | 1322 | poly1305_final_avx(ctx, mac); |
wolfSSL | 15:117db924cf7c | 1323 | #elif defined(POLY130564) |
wolfSSL | 15:117db924cf7c | 1324 | |
wolfSSL | 15:117db924cf7c | 1325 | /* process the remaining block */ |
wolfSSL | 15:117db924cf7c | 1326 | if (ctx->leftover) { |
wolfSSL | 15:117db924cf7c | 1327 | size_t i = ctx->leftover; |
wolfSSL | 15:117db924cf7c | 1328 | ctx->buffer[i] = 1; |
wolfSSL | 15:117db924cf7c | 1329 | for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++) |
wolfSSL | 15:117db924cf7c | 1330 | ctx->buffer[i] = 0; |
wolfSSL | 15:117db924cf7c | 1331 | ctx->finished = 1; |
wolfSSL | 15:117db924cf7c | 1332 | poly1305_block(ctx, ctx->buffer); |
wolfSSL | 15:117db924cf7c | 1333 | } |
wolfSSL | 15:117db924cf7c | 1334 | |
wolfSSL | 15:117db924cf7c | 1335 | /* fully carry h */ |
wolfSSL | 15:117db924cf7c | 1336 | h0 = ctx->h[0]; |
wolfSSL | 15:117db924cf7c | 1337 | h1 = ctx->h[1]; |
wolfSSL | 15:117db924cf7c | 1338 | h2 = ctx->h[2]; |
wolfSSL | 15:117db924cf7c | 1339 | |
wolfSSL | 15:117db924cf7c | 1340 | c = (h1 >> 44); h1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1341 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; |
wolfSSL | 15:117db924cf7c | 1342 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1343 | h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1344 | h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; |
wolfSSL | 15:117db924cf7c | 1345 | h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1346 | h1 += c; |
wolfSSL | 15:117db924cf7c | 1347 | |
wolfSSL | 15:117db924cf7c | 1348 | /* compute h + -p */ |
wolfSSL | 15:117db924cf7c | 1349 | g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1350 | g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1351 | g2 = h2 + c - ((word64)1 << 42); |
wolfSSL | 15:117db924cf7c | 1352 | |
wolfSSL | 15:117db924cf7c | 1353 | /* select h if h < p, or h + -p if h >= p */ |
wolfSSL | 15:117db924cf7c | 1354 | c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1; |
wolfSSL | 15:117db924cf7c | 1355 | g0 &= c; |
wolfSSL | 15:117db924cf7c | 1356 | g1 &= c; |
wolfSSL | 15:117db924cf7c | 1357 | g2 &= c; |
wolfSSL | 15:117db924cf7c | 1358 | c = ~c; |
wolfSSL | 15:117db924cf7c | 1359 | h0 = (h0 & c) | g0; |
wolfSSL | 15:117db924cf7c | 1360 | h1 = (h1 & c) | g1; |
wolfSSL | 15:117db924cf7c | 1361 | h2 = (h2 & c) | g2; |
wolfSSL | 15:117db924cf7c | 1362 | |
wolfSSL | 15:117db924cf7c | 1363 | /* h = (h + pad) */ |
wolfSSL | 15:117db924cf7c | 1364 | t0 = ctx->pad[0]; |
wolfSSL | 15:117db924cf7c | 1365 | t1 = ctx->pad[1]; |
wolfSSL | 15:117db924cf7c | 1366 | |
wolfSSL | 15:117db924cf7c | 1367 | h0 += (( t0 ) & 0xfffffffffff) ; |
wolfSSL | 15:117db924cf7c | 1368 | c = (h0 >> 44); h0 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1369 | h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; |
wolfSSL | 15:117db924cf7c | 1370 | c = (h1 >> 44); h1 &= 0xfffffffffff; |
wolfSSL | 15:117db924cf7c | 1371 | h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; |
wolfSSL | 15:117db924cf7c | 1372 | h2 &= 0x3ffffffffff; |
wolfSSL | 15:117db924cf7c | 1373 | |
wolfSSL | 15:117db924cf7c | 1374 | /* mac = h % (2^128) */ |
wolfSSL | 15:117db924cf7c | 1375 | h0 = ((h0 ) | (h1 << 44)); |
wolfSSL | 15:117db924cf7c | 1376 | h1 = ((h1 >> 20) | (h2 << 24)); |
wolfSSL | 15:117db924cf7c | 1377 | |
wolfSSL | 15:117db924cf7c | 1378 | U64TO8(mac + 0, h0); |
wolfSSL | 15:117db924cf7c | 1379 | U64TO8(mac + 8, h1); |
wolfSSL | 15:117db924cf7c | 1380 | |
wolfSSL | 15:117db924cf7c | 1381 | /* zero out the state */ |
wolfSSL | 15:117db924cf7c | 1382 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 1383 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 1384 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 1385 | ctx->r[0] = 0; |
wolfSSL | 15:117db924cf7c | 1386 | ctx->r[1] = 0; |
wolfSSL | 15:117db924cf7c | 1387 | ctx->r[2] = 0; |
wolfSSL | 15:117db924cf7c | 1388 | ctx->pad[0] = 0; |
wolfSSL | 15:117db924cf7c | 1389 | ctx->pad[1] = 0; |
wolfSSL | 15:117db924cf7c | 1390 | |
wolfSSL | 15:117db924cf7c | 1391 | #else /* if not 64 bit then use 32 bit */ |
wolfSSL | 15:117db924cf7c | 1392 | |
wolfSSL | 15:117db924cf7c | 1393 | /* process the remaining block */ |
wolfSSL | 15:117db924cf7c | 1394 | if (ctx->leftover) { |
wolfSSL | 15:117db924cf7c | 1395 | size_t i = ctx->leftover; |
wolfSSL | 15:117db924cf7c | 1396 | ctx->buffer[i++] = 1; |
wolfSSL | 15:117db924cf7c | 1397 | for (; i < POLY1305_BLOCK_SIZE; i++) |
wolfSSL | 15:117db924cf7c | 1398 | ctx->buffer[i] = 0; |
wolfSSL | 15:117db924cf7c | 1399 | ctx->finished = 1; |
wolfSSL | 15:117db924cf7c | 1400 | poly1305_block(ctx, ctx->buffer); |
wolfSSL | 15:117db924cf7c | 1401 | } |
wolfSSL | 15:117db924cf7c | 1402 | |
wolfSSL | 15:117db924cf7c | 1403 | /* fully carry h */ |
wolfSSL | 15:117db924cf7c | 1404 | h0 = ctx->h[0]; |
wolfSSL | 15:117db924cf7c | 1405 | h1 = ctx->h[1]; |
wolfSSL | 15:117db924cf7c | 1406 | h2 = ctx->h[2]; |
wolfSSL | 15:117db924cf7c | 1407 | h3 = ctx->h[3]; |
wolfSSL | 15:117db924cf7c | 1408 | h4 = ctx->h[4]; |
wolfSSL | 15:117db924cf7c | 1409 | |
wolfSSL | 15:117db924cf7c | 1410 | c = h1 >> 26; h1 = h1 & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1411 | h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1412 | h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1413 | h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1414 | h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1415 | h1 += c; |
wolfSSL | 15:117db924cf7c | 1416 | |
wolfSSL | 15:117db924cf7c | 1417 | /* compute h + -p */ |
wolfSSL | 15:117db924cf7c | 1418 | g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1419 | g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1420 | g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1421 | g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; |
wolfSSL | 15:117db924cf7c | 1422 | g4 = h4 + c - (1 << 26); |
wolfSSL | 15:117db924cf7c | 1423 | |
wolfSSL | 15:117db924cf7c | 1424 | /* select h if h < p, or h + -p if h >= p */ |
wolfSSL | 15:117db924cf7c | 1425 | mask = (g4 >> ((sizeof(word32) * 8) - 1)) - 1; |
wolfSSL | 15:117db924cf7c | 1426 | g0 &= mask; |
wolfSSL | 15:117db924cf7c | 1427 | g1 &= mask; |
wolfSSL | 15:117db924cf7c | 1428 | g2 &= mask; |
wolfSSL | 15:117db924cf7c | 1429 | g3 &= mask; |
wolfSSL | 15:117db924cf7c | 1430 | g4 &= mask; |
wolfSSL | 15:117db924cf7c | 1431 | mask = ~mask; |
wolfSSL | 15:117db924cf7c | 1432 | h0 = (h0 & mask) | g0; |
wolfSSL | 15:117db924cf7c | 1433 | h1 = (h1 & mask) | g1; |
wolfSSL | 15:117db924cf7c | 1434 | h2 = (h2 & mask) | g2; |
wolfSSL | 15:117db924cf7c | 1435 | h3 = (h3 & mask) | g3; |
wolfSSL | 15:117db924cf7c | 1436 | h4 = (h4 & mask) | g4; |
wolfSSL | 15:117db924cf7c | 1437 | |
wolfSSL | 15:117db924cf7c | 1438 | /* h = h % (2^128) */ |
wolfSSL | 15:117db924cf7c | 1439 | h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; |
wolfSSL | 15:117db924cf7c | 1440 | h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; |
wolfSSL | 15:117db924cf7c | 1441 | h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; |
wolfSSL | 15:117db924cf7c | 1442 | h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; |
wolfSSL | 15:117db924cf7c | 1443 | |
wolfSSL | 15:117db924cf7c | 1444 | /* mac = (h + pad) % (2^128) */ |
wolfSSL | 15:117db924cf7c | 1445 | f = (word64)h0 + ctx->pad[0] ; h0 = (word32)f; |
wolfSSL | 15:117db924cf7c | 1446 | f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f; |
wolfSSL | 15:117db924cf7c | 1447 | f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f; |
wolfSSL | 15:117db924cf7c | 1448 | f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f; |
wolfSSL | 15:117db924cf7c | 1449 | |
wolfSSL | 15:117db924cf7c | 1450 | U32TO8(mac + 0, h0); |
wolfSSL | 15:117db924cf7c | 1451 | U32TO8(mac + 4, h1); |
wolfSSL | 15:117db924cf7c | 1452 | U32TO8(mac + 8, h2); |
wolfSSL | 15:117db924cf7c | 1453 | U32TO8(mac + 12, h3); |
wolfSSL | 15:117db924cf7c | 1454 | |
wolfSSL | 15:117db924cf7c | 1455 | /* zero out the state */ |
wolfSSL | 15:117db924cf7c | 1456 | ctx->h[0] = 0; |
wolfSSL | 15:117db924cf7c | 1457 | ctx->h[1] = 0; |
wolfSSL | 15:117db924cf7c | 1458 | ctx->h[2] = 0; |
wolfSSL | 15:117db924cf7c | 1459 | ctx->h[3] = 0; |
wolfSSL | 15:117db924cf7c | 1460 | ctx->h[4] = 0; |
wolfSSL | 15:117db924cf7c | 1461 | ctx->r[0] = 0; |
wolfSSL | 15:117db924cf7c | 1462 | ctx->r[1] = 0; |
wolfSSL | 15:117db924cf7c | 1463 | ctx->r[2] = 0; |
wolfSSL | 15:117db924cf7c | 1464 | ctx->r[3] = 0; |
wolfSSL | 15:117db924cf7c | 1465 | ctx->r[4] = 0; |
wolfSSL | 15:117db924cf7c | 1466 | ctx->pad[0] = 0; |
wolfSSL | 15:117db924cf7c | 1467 | ctx->pad[1] = 0; |
wolfSSL | 15:117db924cf7c | 1468 | ctx->pad[2] = 0; |
wolfSSL | 15:117db924cf7c | 1469 | ctx->pad[3] = 0; |
wolfSSL | 15:117db924cf7c | 1470 | |
wolfSSL | 15:117db924cf7c | 1471 | #endif |
wolfSSL | 15:117db924cf7c | 1472 | |
wolfSSL | 15:117db924cf7c | 1473 | return 0; |
wolfSSL | 15:117db924cf7c | 1474 | } |
wolfSSL | 15:117db924cf7c | 1475 | |
wolfSSL | 15:117db924cf7c | 1476 | |
wolfSSL | 15:117db924cf7c | 1477 | int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) |
wolfSSL | 15:117db924cf7c | 1478 | { |
wolfSSL | 15:117db924cf7c | 1479 | size_t i; |
wolfSSL | 15:117db924cf7c | 1480 | |
wolfSSL | 15:117db924cf7c | 1481 | #ifdef CHACHA_AEAD_TEST |
wolfSSL | 15:117db924cf7c | 1482 | word32 k; |
wolfSSL | 15:117db924cf7c | 1483 | printf("Raw input to poly:\n"); |
wolfSSL | 15:117db924cf7c | 1484 | for (k = 0; k < bytes; k++) { |
wolfSSL | 15:117db924cf7c | 1485 | printf("%02x", m[k]); |
wolfSSL | 15:117db924cf7c | 1486 | if ((k+1) % 16 == 0) |
wolfSSL | 15:117db924cf7c | 1487 | printf("\n"); |
wolfSSL | 15:117db924cf7c | 1488 | } |
wolfSSL | 15:117db924cf7c | 1489 | printf("\n"); |
wolfSSL | 15:117db924cf7c | 1490 | #endif |
wolfSSL | 15:117db924cf7c | 1491 | |
wolfSSL | 15:117db924cf7c | 1492 | if (ctx == NULL) |
wolfSSL | 15:117db924cf7c | 1493 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1494 | |
wolfSSL | 15:117db924cf7c | 1495 | #ifdef USE_INTEL_SPEEDUP |
wolfSSL | 15:117db924cf7c | 1496 | #ifdef HAVE_INTEL_AVX2 |
wolfSSL | 15:117db924cf7c | 1497 | if (IS_INTEL_AVX2(intel_flags)) { |
wolfSSL | 15:117db924cf7c | 1498 | /* handle leftover */ |
wolfSSL | 15:117db924cf7c | 1499 | if (ctx->leftover) { |
wolfSSL | 15:117db924cf7c | 1500 | size_t want = sizeof(ctx->buffer) - ctx->leftover; |
wolfSSL | 15:117db924cf7c | 1501 | if (want > bytes) |
wolfSSL | 15:117db924cf7c | 1502 | want = bytes; |
wolfSSL | 15:117db924cf7c | 1503 | |
wolfSSL | 15:117db924cf7c | 1504 | for (i = 0; i < want; i++) |
wolfSSL | 15:117db924cf7c | 1505 | ctx->buffer[ctx->leftover + i] = m[i]; |
wolfSSL | 15:117db924cf7c | 1506 | bytes -= (word32)want; |
wolfSSL | 15:117db924cf7c | 1507 | m += want; |
wolfSSL | 15:117db924cf7c | 1508 | ctx->leftover += want; |
wolfSSL | 15:117db924cf7c | 1509 | if (ctx->leftover < sizeof(ctx->buffer)) |
wolfSSL | 15:117db924cf7c | 1510 | return 0; |
wolfSSL | 15:117db924cf7c | 1511 | |
wolfSSL | 15:117db924cf7c | 1512 | if (!ctx->started) |
wolfSSL | 15:117db924cf7c | 1513 | poly1305_calc_powers(ctx); |
wolfSSL | 15:117db924cf7c | 1514 | poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer)); |
wolfSSL | 15:117db924cf7c | 1515 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 1516 | } |
wolfSSL | 15:117db924cf7c | 1517 | |
wolfSSL | 15:117db924cf7c | 1518 | /* process full blocks */ |
wolfSSL | 15:117db924cf7c | 1519 | if (bytes >= sizeof(ctx->buffer)) { |
wolfSSL | 15:117db924cf7c | 1520 | size_t want = bytes & ~(sizeof(ctx->buffer) - 1); |
wolfSSL | 15:117db924cf7c | 1521 | |
wolfSSL | 15:117db924cf7c | 1522 | if (!ctx->started) |
wolfSSL | 15:117db924cf7c | 1523 | poly1305_calc_powers(ctx); |
wolfSSL | 15:117db924cf7c | 1524 | poly1305_blocks_avx2(ctx, m, want); |
wolfSSL | 15:117db924cf7c | 1525 | m += want; |
wolfSSL | 15:117db924cf7c | 1526 | bytes -= (word32)want; |
wolfSSL | 15:117db924cf7c | 1527 | } |
wolfSSL | 15:117db924cf7c | 1528 | |
wolfSSL | 15:117db924cf7c | 1529 | /* store leftover */ |
wolfSSL | 15:117db924cf7c | 1530 | if (bytes) { |
wolfSSL | 15:117db924cf7c | 1531 | for (i = 0; i < bytes; i++) |
wolfSSL | 15:117db924cf7c | 1532 | ctx->buffer[ctx->leftover + i] = m[i]; |
wolfSSL | 15:117db924cf7c | 1533 | ctx->leftover += bytes; |
wolfSSL | 15:117db924cf7c | 1534 | } |
wolfSSL | 15:117db924cf7c | 1535 | } |
wolfSSL | 15:117db924cf7c | 1536 | else |
wolfSSL | 15:117db924cf7c | 1537 | #endif |
wolfSSL | 15:117db924cf7c | 1538 | #endif |
wolfSSL | 15:117db924cf7c | 1539 | { |
wolfSSL | 15:117db924cf7c | 1540 | /* handle leftover */ |
wolfSSL | 15:117db924cf7c | 1541 | if (ctx->leftover) { |
wolfSSL | 15:117db924cf7c | 1542 | size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover); |
wolfSSL | 15:117db924cf7c | 1543 | if (want > bytes) |
wolfSSL | 15:117db924cf7c | 1544 | want = bytes; |
wolfSSL | 15:117db924cf7c | 1545 | for (i = 0; i < want; i++) |
wolfSSL | 15:117db924cf7c | 1546 | ctx->buffer[ctx->leftover + i] = m[i]; |
wolfSSL | 15:117db924cf7c | 1547 | bytes -= (word32)want; |
wolfSSL | 15:117db924cf7c | 1548 | m += want; |
wolfSSL | 15:117db924cf7c | 1549 | ctx->leftover += want; |
wolfSSL | 15:117db924cf7c | 1550 | if (ctx->leftover < POLY1305_BLOCK_SIZE) |
wolfSSL | 15:117db924cf7c | 1551 | return 0; |
wolfSSL | 15:117db924cf7c | 1552 | poly1305_block(ctx, ctx->buffer); |
wolfSSL | 15:117db924cf7c | 1553 | ctx->leftover = 0; |
wolfSSL | 15:117db924cf7c | 1554 | } |
wolfSSL | 15:117db924cf7c | 1555 | |
wolfSSL | 15:117db924cf7c | 1556 | /* process full blocks */ |
wolfSSL | 15:117db924cf7c | 1557 | if (bytes >= POLY1305_BLOCK_SIZE) { |
wolfSSL | 15:117db924cf7c | 1558 | size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1)); |
wolfSSL | 15:117db924cf7c | 1559 | poly1305_blocks(ctx, m, want); |
wolfSSL | 15:117db924cf7c | 1560 | m += want; |
wolfSSL | 15:117db924cf7c | 1561 | bytes -= (word32)want; |
wolfSSL | 15:117db924cf7c | 1562 | } |
wolfSSL | 15:117db924cf7c | 1563 | |
wolfSSL | 15:117db924cf7c | 1564 | /* store leftover */ |
wolfSSL | 15:117db924cf7c | 1565 | if (bytes) { |
wolfSSL | 15:117db924cf7c | 1566 | for (i = 0; i < bytes; i++) |
wolfSSL | 15:117db924cf7c | 1567 | ctx->buffer[ctx->leftover + i] = m[i]; |
wolfSSL | 15:117db924cf7c | 1568 | ctx->leftover += bytes; |
wolfSSL | 15:117db924cf7c | 1569 | } |
wolfSSL | 15:117db924cf7c | 1570 | } |
wolfSSL | 15:117db924cf7c | 1571 | |
wolfSSL | 15:117db924cf7c | 1572 | return 0; |
wolfSSL | 15:117db924cf7c | 1573 | } |
wolfSSL | 15:117db924cf7c | 1574 | |
wolfSSL | 15:117db924cf7c | 1575 | |
wolfSSL | 15:117db924cf7c | 1576 | /* Takes in an initialized Poly1305 struct that has a key loaded and creates |
wolfSSL | 15:117db924cf7c | 1577 | a MAC (tag) using recent TLS AEAD padding scheme. |
wolfSSL | 15:117db924cf7c | 1578 | ctx : Initialized Poly1305 struct to use |
wolfSSL | 15:117db924cf7c | 1579 | additional : Additional data to use |
wolfSSL | 15:117db924cf7c | 1580 | addSz : Size of additional buffer |
wolfSSL | 15:117db924cf7c | 1581 | input : Input buffer to create tag from |
wolfSSL | 15:117db924cf7c | 1582 | sz : Size of input buffer |
wolfSSL | 15:117db924cf7c | 1583 | tag : Buffer to hold created tag |
wolfSSL | 15:117db924cf7c | 1584 | tagSz : Size of input tag buffer (must be at least |
wolfSSL | 15:117db924cf7c | 1585 | WC_POLY1305_MAC_SZ(16)) |
wolfSSL | 15:117db924cf7c | 1586 | */ |
wolfSSL | 15:117db924cf7c | 1587 | int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz, |
wolfSSL | 15:117db924cf7c | 1588 | byte* input, word32 sz, byte* tag, word32 tagSz) |
wolfSSL | 15:117db924cf7c | 1589 | { |
wolfSSL | 15:117db924cf7c | 1590 | int ret; |
wolfSSL | 15:117db924cf7c | 1591 | byte padding[WC_POLY1305_PAD_SZ - 1]; |
wolfSSL | 15:117db924cf7c | 1592 | word32 paddingLen; |
wolfSSL | 15:117db924cf7c | 1593 | byte little64[16]; |
wolfSSL | 15:117db924cf7c | 1594 | |
wolfSSL | 15:117db924cf7c | 1595 | XMEMSET(padding, 0, sizeof(padding)); |
wolfSSL | 15:117db924cf7c | 1596 | |
wolfSSL | 15:117db924cf7c | 1597 | /* sanity check on arguments */ |
wolfSSL | 15:117db924cf7c | 1598 | if (ctx == NULL || input == NULL || tag == NULL || |
wolfSSL | 15:117db924cf7c | 1599 | tagSz < WC_POLY1305_MAC_SZ) { |
wolfSSL | 15:117db924cf7c | 1600 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1601 | } |
wolfSSL | 15:117db924cf7c | 1602 | |
wolfSSL | 15:117db924cf7c | 1603 | /* additional allowed to be 0 */ |
wolfSSL | 15:117db924cf7c | 1604 | if (addSz > 0) { |
wolfSSL | 15:117db924cf7c | 1605 | if (additional == NULL) |
wolfSSL | 15:117db924cf7c | 1606 | return BAD_FUNC_ARG; |
wolfSSL | 15:117db924cf7c | 1607 | |
wolfSSL | 15:117db924cf7c | 1608 | /* additional data plus padding */ |
wolfSSL | 15:117db924cf7c | 1609 | if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) { |
wolfSSL | 15:117db924cf7c | 1610 | return ret; |
wolfSSL | 15:117db924cf7c | 1611 | } |
wolfSSL | 15:117db924cf7c | 1612 | paddingLen = -((int)addSz) & (WC_POLY1305_PAD_SZ - 1); |
wolfSSL | 15:117db924cf7c | 1613 | if (paddingLen) { |
wolfSSL | 15:117db924cf7c | 1614 | if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) { |
wolfSSL | 15:117db924cf7c | 1615 | return ret; |
wolfSSL | 15:117db924cf7c | 1616 | } |
wolfSSL | 15:117db924cf7c | 1617 | } |
wolfSSL | 15:117db924cf7c | 1618 | } |
wolfSSL | 15:117db924cf7c | 1619 | |
wolfSSL | 15:117db924cf7c | 1620 | /* input plus padding */ |
wolfSSL | 15:117db924cf7c | 1621 | if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) { |
wolfSSL | 15:117db924cf7c | 1622 | return ret; |
wolfSSL | 15:117db924cf7c | 1623 | } |
wolfSSL | 15:117db924cf7c | 1624 | paddingLen = -((int)sz) & (WC_POLY1305_PAD_SZ - 1); |
wolfSSL | 15:117db924cf7c | 1625 | if (paddingLen) { |
wolfSSL | 15:117db924cf7c | 1626 | if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) { |
wolfSSL | 15:117db924cf7c | 1627 | return ret; |
wolfSSL | 15:117db924cf7c | 1628 | } |
wolfSSL | 15:117db924cf7c | 1629 | } |
wolfSSL | 15:117db924cf7c | 1630 | |
wolfSSL | 15:117db924cf7c | 1631 | /* size of additional data and input as little endian 64 bit types */ |
wolfSSL | 15:117db924cf7c | 1632 | U32TO64(addSz, little64); |
wolfSSL | 15:117db924cf7c | 1633 | U32TO64(sz, little64 + 8); |
wolfSSL | 15:117db924cf7c | 1634 | ret = wc_Poly1305Update(ctx, little64, sizeof(little64)); |
wolfSSL | 15:117db924cf7c | 1635 | if (ret) |
wolfSSL | 15:117db924cf7c | 1636 | { |
wolfSSL | 15:117db924cf7c | 1637 | return ret; |
wolfSSL | 15:117db924cf7c | 1638 | } |
wolfSSL | 15:117db924cf7c | 1639 | |
wolfSSL | 15:117db924cf7c | 1640 | /* Finalize the auth tag */ |
wolfSSL | 15:117db924cf7c | 1641 | ret = wc_Poly1305Final(ctx, tag); |
wolfSSL | 15:117db924cf7c | 1642 | |
wolfSSL | 15:117db924cf7c | 1643 | return ret; |
wolfSSL | 15:117db924cf7c | 1644 | |
wolfSSL | 15:117db924cf7c | 1645 | } |
wolfSSL | 15:117db924cf7c | 1646 | #endif /* HAVE_POLY1305 */ |
wolfSSL | 15:117db924cf7c | 1647 | |
wolfSSL | 15:117db924cf7c | 1648 |