Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
poly1305.c
/* poly1305.c
 *
 * Copyright (C) 2006-2017 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/*
 * Based off the public domain implementations by Andrew Moon
 * and Daniel J. Bernstein
 */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfcrypt/settings.h>

#ifdef HAVE_POLY1305
#include <wolfcrypt/poly1305.h>
#include <wolfcrypt/error-crypt.h>
#include <wolfcrypt/logging.h>
#include <wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif
#ifdef CHACHA_AEAD_TEST
    #include <stdio.h>
#endif

#ifdef _MSC_VER
    /* 4127 warning constant while(1) */
    #pragma warning(disable: 4127)
#endif

#ifdef USE_INTEL_SPEEDUP
    #include <emmintrin.h>
    #include <immintrin.h>

    /* AVX2 intrinsics/asm require GCC > 4.8 and clang > 3.5. */
    #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                              (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
        #define NO_AVX2_SUPPORT
    #endif
    #if defined(__clang__) && ((__clang_major__ < 3) || \
                               (__clang_major__ == 3 && __clang_minor__ <= 5))
        #define NO_AVX2_SUPPORT
    #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
        /* clang defines __GNUC__ as well; undo the GCC version check above. */
        #undef NO_AVX2_SUPPORT
    #endif

    #define HAVE_INTEL_AVX1
    #ifndef NO_AVX2_SUPPORT
        #define HAVE_INTEL_AVX2
    #endif
#endif

#ifdef USE_INTEL_SPEEDUP
/* CPUID feature flags, queried once and cached. */
static word32 intel_flags = 0;
static word32 cpu_flags_set = 0;
#endif

#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
    #if defined(_MSC_VER)
        #define POLY1305_NOINLINE __declspec(noinline)
    #elif defined(__GNUC__)
        #define POLY1305_NOINLINE __attribute__((noinline))
    #else
        #define POLY1305_NOINLINE
    #endif

    #if defined(_MSC_VER)
        #include <intrin.h>

        /* MSVC has no native 128-bit integer; emulate with two 64-bit halves. */
        typedef struct word128 {
            word64 lo;
            word64 hi;
        } word128;

        /* 64x64->128 multiply via the _umul128 intrinsic. */
        #define MUL(out, x, y) out.lo = _umul128((x), (y), &out.hi)
        /* 128-bit add with manual carry: (out.lo < t) detects low-half wrap. */
        #define ADD(out, in) { word64 t = out.lo; out.lo += in.lo; \
                               out.hi += (out.lo < t) + in.hi; }
        /* Add a 64-bit value into a 128-bit value, propagating the carry. */
        #define ADDLO(out, in) { word64 t = out.lo; out.lo += in; \
                                 out.hi += (out.lo < t); }
        #define SHR(in, shift) (__shiftright128(in.lo, in.hi, (shift)))
        #define LO(in) (in.lo)

    #elif defined(__GNUC__)
        #if defined(__SIZEOF_INT128__)
            typedef unsigned __int128 word128;
        #else
            typedef unsigned word128 __attribute__((mode(TI)));
        #endif

        /* Native 128-bit arithmetic; same interface as the MSVC macros. */
        #define MUL(out, x, y) out = ((word128)x * y)
        #define ADD(out, in) out += in
        #define ADDLO(out, in) out += in
        #define SHR(in, shift) (word64)(in >> (shift))
        #define LO(in) (word64)(in)
    #endif
#endif

#ifdef USE_INTEL_SPEEDUP
#ifdef HAVE_INTEL_AVX1
/* Process one block (16 bytes) of data.
 *
 * h (130 bits over ctx offsets 24/32/40) is updated in place:
 *   h = (h + m + (nfin << 128)) * r  mod 2^130 - 5
 * where nfin is ctx->finished (the 2^128 pad bit is only added while not
 * processing the final, short block).
 *
 * The r[0]*h[2] and r[1]*h[2] products are looked up from tables of small
 * multiples of r inside ctx (offsets 352 and 408 = 352 + 7*8 — presumably
 * ctx->hm as filled by poly1305_setkey_avx; confirm against poly1305.h
 * struct layout).
 *
 * ctx  Poly1305 context.
 * m    One block of message data.
 */
static void poly1305_block_avx(Poly1305* ctx, const unsigned char *m)
{
    __asm__ __volatile__ (
        /* r15 = r[0], r10:r9:r8 = h[2]:h[1]:h[0], bl = finished flag */
        "movq (%[ctx]), %%r15\n\t"
        "movq 24(%[ctx]), %%r8\n\t"
        "movq 32(%[ctx]), %%r9\n\t"
        "movq 40(%[ctx]), %%r10\n\t"
        "xorq %%rbx, %%rbx\n\t"
        "movb %[nfin], %%bl\n\t"
        "# h += m\n\t"
        "movq (%[m]), %%r11\n\t"
        "movq 8(%[m]), %%r12\n\t"
        "addq %%r11, %%r8\n\t"
        "adcq %%r12, %%r9\n\t"
        "movq 8(%[ctx]), %%rax\n\t"
        "adcq %%rbx, %%r10\n\t"
        "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t"
        "mulq %%r8\n\t"
        "movq %%rax, %%r12\n\t"
        "movq %%rdx, %%r13\n\t"
        "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t"
        "movq %%r15, %%rax\n\t"
        "mulq %%r9\n\t"
        "addq %%rax, %%r12\n\t"
        "movq %%r15, %%rax\n\t"
        "adcq %%rdx, %%r13\n\t"
        "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t"
        "mulq %%r8\n\t"
        "movq %%rax, %%r11\n\t"
        "movq %%rdx, %%r8\n\t"
        "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t"
        "movq 8(%[ctx]), %%rax\n\t"
        "mulq %%r9\n\t"
        "# r[0] * h[2] +> t2\n\t"
        "addq 352(%[ctx],%%r10,8), %%r13\n\t"
        "movq %%rdx, %%r14\n\t"
        "addq %%r8, %%r12\n\t"
        "adcq %%rax, %%r13\n\t"
        "# r[1] * h[2] +> t3\n\t"
        "adcq 408(%[ctx],%%r10,8), %%r14\n\t"
        "# r * h in r14, r13, r12, r11 \n\t"
        "# h = (r * h) mod 2^130 - 5\n\t"
        /* Split the 256-bit product at bit 130 and fold the top back in
         * using 2^130 = 5 (mod p): top + (top >> 2). */
        "movq %%r13, %%r10\n\t"
        "andq $-4, %%r13\n\t"
        "andq $3, %%r10\n\t"
        "addq %%r13, %%r11\n\t"
        "movq %%r13, %%r8\n\t"
        "adcq %%r14, %%r12\n\t"
        "adcq $0, %%r10\n\t"
        "shrdq $2, %%r14, %%r8\n\t"
        "shrq $2, %%r14\n\t"
        "addq %%r11, %%r8\n\t"
        "adcq %%r14, %%r12\n\t"
        "movq %%r12, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "# h in r10, r9, r8 \n\t"
        "# Store h to ctx\n\t"
        "movq %%r8, 24(%[ctx])\n\t"
        "movq %%r9, 32(%[ctx])\n\t"
        "movq %%r10, 40(%[ctx])\n\t"
        :
        : [m] "r" (m), [ctx] "r" (ctx), [nfin] "m" (ctx->finished)
        : "rax", "rdx", "r11", "r12", "r13", "r14", "r15", "rbx",
          "r8", "r9", "r10", "memory"
    );
}

/* Process multiple blocks (n * 16 bytes) of data.
 *
 * ctx    Poly1305 context.
 * m      Blocks of message data.
 * bytes  The number of bytes to process.
00198 */ 00199 POLY1305_NOINLINE static void poly1305_blocks_avx(Poly1305* ctx, 00200 const unsigned char* m, size_t bytes) 00201 { 00202 __asm__ __volatile__ ( 00203 "movq (%[ctx]), %%r15\n\t" 00204 "movq 24(%[ctx]), %%r8\n\t" 00205 "movq 32(%[ctx]), %%r9\n\t" 00206 "movq 40(%[ctx]), %%r10\n" 00207 "L_avx_start:\n\t" 00208 "# h += m\n\t" 00209 "movq (%[m]), %%r11\n\t" 00210 "movq 8(%[m]), %%r12\n\t" 00211 "addq %%r11, %%r8\n\t" 00212 "adcq %%r12, %%r9\n\t" 00213 "movq 8(%[ctx]), %%rax\n\t" 00214 "adcq $0, %%r10\n\t" 00215 "# r[1] * h[0] => rdx, rax ==> t2, t1\n\t" 00216 "mulq %%r8\n\t" 00217 "movq %%rax, %%r12\n\t" 00218 "movq %%rdx, %%r13\n\t" 00219 "# r[0] * h[1] => rdx, rax ++> t2, t1\n\t" 00220 "movq %%r15, %%rax\n\t" 00221 "mulq %%r9\n\t" 00222 "addq %%rax, %%r12\n\t" 00223 "movq %%r15, %%rax\n\t" 00224 "adcq %%rdx, %%r13\n\t" 00225 "# r[0] * h[0] => rdx, rax ==> t4, t0\n\t" 00226 "mulq %%r8\n\t" 00227 "movq %%rax, %%r11\n\t" 00228 "movq %%rdx, %%r8\n\t" 00229 "# r[1] * h[1] => rdx, rax =+> t3, t2\n\t" 00230 "movq 8(%[ctx]), %%rax\n\t" 00231 "mulq %%r9\n\t" 00232 "# r[0] * h[2] +> t2\n\t" 00233 "addq 360(%[ctx],%%r10,8), %%r13\n\t" 00234 "movq %%rdx, %%r14\n\t" 00235 "addq %%r8, %%r12\n\t" 00236 "adcq %%rax, %%r13\n\t" 00237 "# r[1] * h[2] +> t3\n\t" 00238 "adcq 416(%[ctx],%%r10,8), %%r14\n\t" 00239 "# r * h in r14, r13, r12, r11 \n\t" 00240 "# h = (r * h) mod 2^130 - 5\n\t" 00241 "movq %%r13, %%r10\n\t" 00242 "andq $-4, %%r13\n\t" 00243 "andq $3, %%r10\n\t" 00244 "addq %%r13, %%r11\n\t" 00245 "movq %%r13, %%r8\n\t" 00246 "adcq %%r14, %%r12\n\t" 00247 "adcq $0, %%r10\n\t" 00248 "shrdq $2, %%r14, %%r8\n\t" 00249 "shrq $2, %%r14\n\t" 00250 "addq %%r11, %%r8\n\t" 00251 "adcq %%r14, %%r12\n\t" 00252 "movq %%r12, %%r9\n\t" 00253 "adcq $0, %%r10\n\t" 00254 "# h in r10, r9, r8 \n\t" 00255 "# Next block from message\n\t" 00256 "addq $16, %[m]\n\t" 00257 "subq $16, %[bytes]\n\t" 00258 "cmp $16, %[bytes]\n\t" 00259 "jge L_avx_start\n\t" 00260 "# Store h to ctx\n\t" 
00261 "movq %%r8, 24(%[ctx])\n\t" 00262 "movq %%r9, 32(%[ctx])\n\t" 00263 "movq %%r10, 40(%[ctx])\n\t" 00264 : [m] "+r" (m), [bytes] "+r" (bytes) 00265 : [ctx] "r" (ctx) 00266 : "rax", "rdx", "r11", "r12", "r13", "r14", "r15", 00267 "r8", "r9", "r10", "memory" 00268 ); 00269 } 00270 00271 /* Set the key to use when processing data. 00272 * Initialize the context. 00273 * 00274 * ctx Poly1305 context. 00275 * key The key data (16 bytes). 00276 */ 00277 static void poly1305_setkey_avx(Poly1305* ctx, const byte* key) 00278 { 00279 int i; 00280 00281 ctx->r[0] = *(word64*)(key + 0) & 0x0ffffffc0fffffffL; 00282 ctx->r[1] = *(word64*)(key + 8) & 0x0ffffffc0ffffffcL; 00283 00284 for (i=0; i<7; i++) { 00285 ctx->hm[i + 0] = ctx->r[0] * i; 00286 ctx->hm[i + 7] = ctx->r[1] * i; 00287 } 00288 00289 /* h (accumulator) = 0 */ 00290 ctx->h[0] = 0; 00291 ctx->h[1] = 0; 00292 ctx->h[2] = 0; 00293 00294 /* save pad for later */ 00295 ctx->pad[0] = *(word64*)(key + 16); 00296 ctx->pad[1] = *(word64*)(key + 24); 00297 00298 ctx->leftover = 0; 00299 ctx->finished = 1; 00300 } 00301 00302 /* Calculate the final result - authentication data. 00303 * Zeros out the private data in the context. 00304 * 00305 * ctx Poly1305 context. 00306 * mac Buffer to hold 16 bytes. 
00307 */ 00308 static void poly1305_final_avx(Poly1305* ctx, byte* mac) 00309 { 00310 word64 h0, h1, h2; 00311 00312 /* process the remaining block */ 00313 if (ctx->leftover) { 00314 size_t i = ctx->leftover; 00315 ctx->buffer[i] = 1; 00316 for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++) 00317 ctx->buffer[i] = 0; 00318 ctx->finished = 0; 00319 poly1305_block_avx(ctx, ctx->buffer); 00320 } 00321 00322 h0 = ctx->h[0]; 00323 h1 = ctx->h[1]; 00324 h2 = ctx->h[2]; 00325 00326 /* h %= p */ 00327 /* h = (h + pad) */ 00328 __asm__ __volatile__ ( 00329 "# mod 2^130 - 5\n\t" 00330 "movq %[h2], %%r13\n\t" 00331 "andq $0x3, %[h2]\n\t" 00332 "shrq $0x2, %%r13\n\t" 00333 "leaq (%%r13, %%r13, 4), %%r13\n\t" 00334 "add %%r13, %[h0]\n\t" 00335 "adc $0, %[h1]\n\t" 00336 "adc $0, %[h2]\n\t" 00337 "# Fixup when between (1 << 130) - 1 and (1 << 130) - 5\n\t" 00338 "movq %[h0], %%r13\n\t" 00339 "movq %[h1], %%r14\n\t" 00340 "movq %[h2], %%r15\n\t" 00341 "addq $5, %%r13\n\t" 00342 "adcq $0, %%r14\n\t" 00343 "adcq $0, %%r15\n\t" 00344 "movq %%r15, %%r12\n\t" 00345 "andq $3, %%r15\n\t" 00346 "cmpq $4, %%r12\n\t" 00347 "cmove %%r13, %[h0]\n\t" 00348 "cmove %%r14, %[h1]\n\t" 00349 "cmove %%r15, %[h2]\n\t" 00350 "# h += pad\n\t" 00351 "add %[p0], %[h0]\n\t" 00352 "adc %[p1], %[h1]\n\t" 00353 "movq %[h0], (%[m])\n\t" 00354 "movq %[h1], 8(%[m])\n\t" 00355 : [h0] "+r" (h0), [h1] "+r" (h1), [h2] "+r" (h2), 00356 [p0] "+r" (ctx->pad[0]), [p1] "+r" (ctx->pad[1]) 00357 : [m] "r" (mac) 00358 : "memory", "r15", "r14", "r13", "r12" 00359 ); 00360 00361 /* zero out the state */ 00362 ctx->h[0] = 0; 00363 ctx->h[1] = 0; 00364 ctx->h[2] = 0; 00365 ctx->r[0] = 0; 00366 ctx->r[1] = 0; 00367 ctx->pad[0] = 0; 00368 ctx->pad[1] = 0; 00369 } 00370 #endif 00371 00372 #ifdef HAVE_INTEL_AVX2 00373 #if defined(_MSC_VER) 00374 #define POLY1305_NOINLINE __declspec(noinline) 00375 #elif defined(__GNUC__) 00376 #define POLY1305_NOINLINE __attribute__((noinline)) 00377 #else 00378 #define POLY1305_NOINLINE 00379 #endif 

/* Load H into five 256-bit registers.
 *
 * h is the memory location of the data - 26 of 32 bits.
 * h0-h4 the 4 H values with 26 bits stored in 64 for multiply.
 */
#define LOAD_H(h, h0, h1, h2, h3, h4)           \
    "vmovdqu     ("#h"), "#h0"\n\t"             \
    "vmovdqu   32("#h"), "#h1"\n\t"             \
    "vmovdqu   64("#h"), "#h2"\n\t"             \
    "vmovdqu   96("#h"), "#h3"\n\t"             \
    "vmovdqu  128("#h"), "#h4"\n\t"

/* Store H, five 256-bit registers, packed.
 *
 * h is the memory location of the data - 26 bits in 32.
 * h0-h4 the 4 H values with 26 bits stored in 64.
 * x4 is the xmm register of h4.
 * NOTE(review): the x4 parameter is not referenced in this expansion.
 */
#define STORE_H(h, h0, h1, h2, h3, h4, x4)      \
    "vmovdqu  "#h0",    ("#h")\n\t"             \
    "vmovdqu  "#h1",  32("#h")\n\t"             \
    "vmovdqu  "#h2",  64("#h")\n\t"             \
    "vmovdqu  "#h3",  96("#h")\n\t"             \
    "vmovdqu  "#h4", 128("#h")\n\t"

/* Load four powers of r into position to be multiplied by the 4 H values.
 *
 * Reads the four stored powers from ctx (offsets 224..320 — presumably
 * ctx->r4/r3/r2/r1; confirm against poly1305.h) and transposes them so
 * each 64-bit lane of r0-r4 holds one limb of one power.
 *
 * r0-r4 holds the loaded values with 26 bits stored in 64 for multiply.
 * t0-t3 are temporary registers.
 */
#define LOAD_Rx4(r0, r1, r2, r3, r4,                \
                 t0, t1, t2, t3)                    \
    "vmovdqu  224(%[ctx]), "#r3"\n\t"               \
    "vmovdqu  256(%[ctx]), "#r2"\n\t"               \
    "vmovdqu  288(%[ctx]), "#r1"\n\t"               \
    "vmovdqu  320(%[ctx]), "#r0"\n\t"               \
    "vpermq   $0xd8, "#r0", "#r0"\n\t"              \
    "vpermq   $0xd8, "#r1", "#r1"\n\t"              \
    "vpermq   $0xd8, "#r2", "#r2"\n\t"              \
    "vpermq   $0xd8, "#r3", "#r3"\n\t"              \
    "vpunpcklqdq  "#r1", "#r0", "#t0"\n\t"          \
    "vpunpckhqdq  "#r1", "#r0", "#t1"\n\t"          \
    "vpunpcklqdq  "#r3", "#r2", "#t2"\n\t"          \
    "vpunpckhqdq  "#r3", "#r2", "#t3"\n\t"          \
    "vperm2i128  $0x20, "#t2", "#t0", "#r0"\n\t"    \
    "vperm2i128  $0x31, "#t2", "#t0", "#r2"\n\t"    \
    "vperm2i128  $0x20, "#t3", "#t1", "#r4"\n\t"    \
    "vpsrlq   $32, "#r0", "#r1"\n\t"                \
    "vpsrlq   $32, "#r2", "#r3"\n\t"

/* Load the r^4 value into position to be multiplied by all 4 H values.
 *
 * r4 holds r^4 as five 26 bits each in 32.
 * r0-r4 holds the loaded values with 26 bits stored in 64 for multiply
 *       (each limb broadcast to every lane).
 * t0-t1 are temporary registers.
 */
#define LOAD_R4(r4, r40, r41, r42, r43, r44,    \
                t0, t1)                         \
    "vmovdqu  "#r4", "#t0"\n\t"                 \
    "vpermq   $0x0,  "#t0", "#r40"\n\t"         \
    "vpsrlq   $32,   "#t0", "#t1"\n\t"          \
    "vpermq   $0x55, "#t0", "#r42"\n\t"         \
    "vpermq   $0xaa, "#t0", "#r44"\n\t"         \
    "vpermq   $0x0,  "#t1", "#r41"\n\t"         \
    "vpermq   $0x55, "#t1", "#r43"\n\t"

/* Multiply the top 4 26-bit values in 64 bits of each H by 5 for reduction in
 * multiply.
 *
 * 5*x computed as (x << 2) + x.
 *
 * s1-s4 are each 64 bit value in r1-r4 multiplied by 5.
 * r1-r4 are the top 4 values.
 */
#define MUL5(s1, s2, s3, s4, r1, r2, r3, r4)    \
    "vpslld   $2, "#r1", "#s1"\n\t"             \
    "vpslld   $2, "#r2", "#s2"\n\t"             \
    "vpslld   $2, "#r3", "#s3"\n\t"             \
    "vpslld   $2, "#r4", "#s4"\n\t"             \
    "vpaddq   "#s1", "#r1", "#s1"\n\t"          \
    "vpaddq   "#s2", "#r2", "#s2"\n\t"          \
    "vpaddq   "#s3", "#r3", "#s3"\n\t"          \
    "vpaddq   "#s4", "#r4", "#s4"\n\t"

/* Add the 4 H values together.
 * Each 64 bits in a register is 26 bits of one of the H values.
 *
 * Two folding steps: add the upper 64 bits of each 128-bit lane into the
 * lower, then add the upper 128-bit lane into the lower.
 *
 * h0-h4 contains the 4 H values.
 * t0-t4 are temporary registers.
 */
#define FINALIZE_H(h0, h1, h2, h3, h4,          \
                   t0, t1, t2, t3, t4)          \
    "vpsrldq  $8, "#h0", "#t0"\n\t"             \
    "vpsrldq  $8, "#h1", "#t1"\n\t"             \
    "vpsrldq  $8, "#h2", "#t2"\n\t"             \
    "vpsrldq  $8, "#h3", "#t3"\n\t"             \
    "vpsrldq  $8, "#h4", "#t4"\n\t"             \
    "vpaddq   "#h0", "#t0", "#h0"\n\t"          \
    "vpaddq   "#h1", "#t1", "#h1"\n\t"          \
    "vpaddq   "#h2", "#t2", "#h2"\n\t"          \
    "vpaddq   "#h3", "#t3", "#h3"\n\t"          \
    "vpaddq   "#h4", "#t4", "#h4"\n\t"          \
    "vpermq   $0x02, "#h0", "#t0"\n\t"          \
    "vpermq   $0x02, "#h1", "#t1"\n\t"          \
    "vpermq   $0x02, "#h2", "#t2"\n\t"          \
    "vpermq   $0x02, "#h3", "#t3"\n\t"          \
    "vpermq   $0x02, "#h4", "#t4"\n\t"          \
    "vpaddq   "#h0", "#t0", "#h0"\n\t"          \
    "vpaddq   "#h1", "#t1", "#h1"\n\t"          \
    "vpaddq   "#h2", "#t2", "#h2"\n\t"          \
    "vpaddq   "#h3", "#t3", "#h3"\n\t"          \
    "vpaddq   "#h4", "#t4", "#h4"\n\t"

/* Move 32 bits from each xmm register to a 32 bit register.
 *
 * x0-x4 are the xmm version of the ymm registers used.
 * t0-t4 are the 32-bit registers to store data in.
 */
#define MOVE_TO_32(x0, x1, x2, x3, x4,          \
                   t0, t1, t2, t3, t4)          \
    "vmovd    "#x0", "#t0"\n\t"                 \
    "vmovd    "#x1", "#t1"\n\t"                 \
    "vmovd    "#x2", "#t2"\n\t"                 \
    "vmovd    "#x3", "#t3"\n\t"                 \
    "vmovd    "#x4", "#t4"\n\t"

/* Multiply using AVX2 instructions.
 * Each register contains up to 32 bits of data in 64 bits.
 * This is a 4 way parallel multiply.
 *
 * h0-h4 contain 4 H values with the 32 bits of each per register.
 * r0-r4 contain the 4 powers of r.
 * s1-s4 contain r1-r4 times 5 (for the mod 2^130 - 5 wraparound).
 * t0-t4 and v0-v3 are temporary registers; t0-t4 accumulate the five
 *       output limbs.
 */
#define MUL_AVX2(h0, h1, h2, h3, h4,            \
                 r0, r1, r2, r3, r4,            \
                 s1, s2, s3, s4,                \
                 t0, t1, t2, t3, t4,            \
                 v0, v1, v2, v3)                \
    "vpmuludq "#s1", "#h4", "#t0"\n\t"          \
    "vpmuludq "#s2", "#h3", "#v0"\n\t"          \
    "vpmuludq "#s2", "#h4", "#t1"\n\t"          \
    "vpmuludq "#s3", "#h3", "#v1"\n\t"          \
    "vpmuludq "#s3", "#h4", "#t2"\n\t"          \
    "vpaddq   "#t0", "#v0", "#t0"\n\t"          \
    "vpmuludq "#s3", "#h2", "#v2"\n\t"          \
    "vpmuludq "#s4", "#h4", "#t3"\n\t"          \
    "vpaddq   "#t1", "#v1", "#t1"\n\t"          \
    "vpmuludq "#s4", "#h1", "#v3"\n\t"          \
    "vpmuludq "#s4", "#h2", "#v0"\n\t"          \
    "vpaddq   "#t0", "#v2", "#t0"\n\t"          \
    "vpmuludq "#s4", "#h3", "#v1"\n\t"          \
    "vpmuludq "#r0", "#h3", "#v2"\n\t"          \
    "vpaddq   "#t0", "#v3", "#t0"\n\t"          \
    "vpmuludq "#r0", "#h4", "#t4"\n\t"          \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"          \
    "vpmuludq "#r0", "#h0", "#v3"\n\t"          \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"          \
    "vpmuludq "#r0", "#h1", "#v0"\n\t"          \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"          \
    "vpmuludq "#r0", "#h2", "#v1"\n\t"          \
    "vpmuludq "#r1", "#h2", "#v2"\n\t"          \
    "vpaddq   "#t0", "#v3", "#t0"\n\t"          \
    "vpmuludq "#r1", "#h3", "#v3"\n\t"          \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"          \
    "vpmuludq "#r1", "#h0", "#v0"\n\t"          \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"          \
    "vpmuludq "#r1", "#h1", "#v1"\n\t"          \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"          \
    "vpmuludq "#r2", "#h1", "#v2"\n\t"          \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"          \
    "vpmuludq "#r2", "#h2", "#v3"\n\t"          \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"          \
    "vpmuludq "#r2", "#h0", "#v0"\n\t"          \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"          \
    "vpmuludq "#r3", "#h0", "#v1"\n\t"          \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"          \
    "vpmuludq "#r3", "#h1", "#v2"\n\t"          \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"          \
    "vpmuludq "#r4", "#h0", "#v3"\n\t"          \
    "vpaddq   "#t2", "#v0", "#t2"\n\t"          \
    "vpaddq   "#t3", "#v1", "#t3"\n\t"          \
    "vpaddq   "#t4", "#v2", "#t4"\n\t"          \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"

/* Load the 4 blocks of the message.
 *
 * The 64 message bytes are transposed so each of m0-m3 holds the same
 * 26-bit-aligned slice of all four blocks; m4 gets the high bits.
 *
 * m is the address of the message to load.
 * m0-m4 is the loaded message with 32 bits in 64. Loaded so data is parallel.
 * hi is the high bits of the 4 m (1 << 128 as not final block).
 * z is zero.
 */
#define LOAD_M(m, m0, m1, m2, m3, m4, hi, z)        \
    "vmovdqu    (%[m]), "#m0"\n\t"                  \
    "vmovdqu  32(%[m]), "#m1"\n\t"                  \
    "vperm2i128  $0x20, "#m1", "#m0", "#m2"\n\t"    \
    "vperm2i128  $0x31, "#m1", "#m0", "#m0"\n\t"    \
    "vpunpckldq  "#m0", "#m2", "#m1"\n\t"           \
    "vpunpckhdq  "#m0", "#m2", "#m3"\n\t"           \
    "vpunpckldq  "#z",  "#m1", "#m0"\n\t"           \
    "vpunpckhdq  "#z",  "#m1", "#m1"\n\t"           \
    "vpunpckldq  "#z",  "#m3", "#m2"\n\t"           \
    "vpunpckhdq  "#z",  "#m3", "#m3"\n\t"           \
    "vmovdqu  "#hi", "#m4"\n\t"                     \
    "vpsllq   $6,  "#m1", "#m1"\n\t"                \
    "vpsllq   $12, "#m2", "#m2"\n\t"                \
    "vpsllq   $18, "#m3", "#m3"\n\t"


/* Multiply using AVX2 instructions - adding with message.
 * Each register contains up to 32 bits of data in 64 bits.
 * This is a 4 way parallel multiply.
 * The message data is loaded first and the multiplication adds into it.
 *
 * h0-h4 contain 4 H values with the 32 bits of each per register.
 * r0-r4 contain the 4 powers of r.
 * s1-s4 contain r1-r4 times 5.
 * t0-t4 and v0-v3 are temporary registers.
 * hi is the high bits of the 4 m (1 << 128 as not final block).
 * z is zero.
 */
#define MUL_ADD_AVX2(h0, h1, h2, h3, h4,            \
                     r0, r1, r2, r3, r4,            \
                     s1, s2, s3, s4,                \
                     t0, t1, t2, t3, t4,            \
                     v0, v1, v2, v3,                \
                     hi, z)                         \
    "vmovdqu    (%[m]), "#t0"\n\t"                  \
    "vmovdqu  32(%[m]), "#t1"\n\t"                  \
    "vperm2i128  $0x20, "#t1", "#t0", "#t2"\n\t"    \
    "vperm2i128  $0x31, "#t1", "#t0", "#t0"\n\t"    \
    "vpunpckldq  "#t0", "#t2", "#t1"\n\t"           \
    "vpunpckhdq  "#t0", "#t2", "#t3"\n\t"           \
    "vpunpckldq  "#z",  "#t1", "#t0"\n\t"           \
    "vpunpckhdq  "#z",  "#t1", "#t1"\n\t"           \
    "vpunpckldq  "#z",  "#t3", "#t2"\n\t"           \
    "vpunpckhdq  "#z",  "#t3", "#t3"\n\t"           \
    "vmovdqu  "#hi", "#t4"\n\t"                     \
    "vpsllq   $6,  "#t1", "#t1"\n\t"                \
    "vpsllq   $12, "#t2", "#t2"\n\t"                \
    "vpsllq   $18, "#t3", "#t3"\n\t"                \
    "vpmuludq "#s1", "#h4", "#v0"\n\t"              \
    "vpaddq   "#t0", "#v0", "#t0"\n\t"              \
    "vpmuludq "#s2", "#h3", "#v0"\n\t"              \
    "vpmuludq "#s2", "#h4", "#v1"\n\t"              \
    "vpaddq   "#t1", "#v1", "#t1"\n\t"              \
    "vpmuludq "#s3", "#h3", "#v1"\n\t"              \
    "vpmuludq "#s3", "#h4", "#v2"\n\t"              \
    "vpaddq   "#t2", "#v2", "#t2"\n\t"              \
    "vpaddq   "#t0", "#v0", "#t0"\n\t"              \
    "vpmuludq "#s3", "#h2", "#v2"\n\t"              \
    "vpmuludq "#s4", "#h4", "#v3"\n\t"              \
    "vpaddq   "#t3", "#v3", "#t3"\n\t"              \
    "vpaddq   "#t1", "#v1", "#t1"\n\t"              \
    "vpmuludq "#s4", "#h1", "#v3"\n\t"              \
    "vpmuludq "#s4", "#h2", "#v0"\n\t"              \
    "vpaddq   "#t0", "#v2", "#t0"\n\t"              \
    "vpmuludq "#s4", "#h3", "#v1"\n\t"              \
    "vpmuludq "#r0", "#h3", "#v2"\n\t"              \
    "vpaddq   "#t0", "#v3", "#t0"\n\t"              \
    "vpmuludq "#r0", "#h4", "#v3"\n\t"              \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"              \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"              \
    "vpmuludq "#r0", "#h0", "#v3"\n\t"              \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"              \
    "vpmuludq "#r0", "#h1", "#v0"\n\t"              \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"              \
    "vpmuludq "#r0", "#h2", "#v1"\n\t"              \
    "vpmuludq "#r1", "#h2", "#v2"\n\t"              \
    "vpaddq   "#t0", "#v3", "#t0"\n\t"              \
    "vpmuludq "#r1", "#h3", "#v3"\n\t"              \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"              \
    "vpmuludq "#r1", "#h0", "#v0"\n\t"              \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"              \
    "vpmuludq "#r1", "#h1", "#v1"\n\t"              \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"              \
    "vpmuludq "#r2", "#h1", "#v2"\n\t"              \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"              \
    "vpmuludq "#r2", "#h2", "#v3"\n\t"              \
    "vpaddq   "#t1", "#v0", "#t1"\n\t"              \
    "vpmuludq "#r2", "#h0", "#v0"\n\t"              \
    "vpaddq   "#t2", "#v1", "#t2"\n\t"              \
    "vpmuludq "#r3", "#h0", "#v1"\n\t"              \
    "vpaddq   "#t3", "#v2", "#t3"\n\t"              \
    "vpmuludq "#r3", "#h1", "#v2"\n\t"              \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"              \
    "vpmuludq "#r4", "#h0", "#v3"\n\t"              \
    "vpaddq   "#t2", "#v0", "#t2"\n\t"              \
    "vpaddq   "#t3", "#v1", "#t3"\n\t"              \
    "vpaddq   "#t4", "#v2", "#t4"\n\t"              \
    "vpaddq   "#t4", "#v3", "#t4"\n\t"

/* Reduce the 64 bits of data to 26 bits.
 *
 * Standard 26-bit limb carry chain; the carry out of limb 4 wraps to
 * limb 0 multiplied by 5 (computed as (c << 2) + c).
 *
 * h0-h4 contain the reduced H values.
 * m0-m4 contain the 4 H values to reduce.
 * t0-t2 are temporaries.
 * mask contains the 26-bit mask for each 64 bit value in the 256 bit register.
 */
#define REDUCE(h0, h1, h2, h3, h4,              \
               m0, m1, m2, m3, m4,              \
               t0, t1, t2, mask)                \
    "vpsrlq   $26, "#m0", "#t0"\n\t"            \
    "vpsrlq   $26, "#m3", "#t1"\n\t"            \
    "vpand    "#mask", "#m0", "#m0"\n\t"        \
    "vpand    "#mask", "#m3", "#m3"\n\t"        \
    "vpaddq   "#m1", "#t0", "#m1"\n\t"          \
    "vpaddq   "#m4", "#t1", "#m4"\n\t"          \
                                                \
    "vpsrlq   $26, "#m1", "#t0"\n\t"            \
    "vpsrlq   $26, "#m4", "#t1"\n\t"            \
    "vpand    "#mask", "#m1", "#h1"\n\t"        \
    "vpand    "#mask", "#m4", "#h4"\n\t"        \
    "vpaddq   "#m2", "#t0", "#m2"\n\t"          \
    "vpslld   $2, "#t1", "#t2"\n\t"             \
    "vpaddd   "#t2", "#t1", "#t2"\n\t"          \
                                                \
    "vpsrlq   $26, "#m2", "#t0"\n\t"            \
    "vpaddq   "#m0", "#t2", "#m0"\n\t"          \
    "vpsrlq   $26, "#m0", "#t1"\n\t"            \
    "vpand    "#mask", "#m2", "#h2"\n\t"        \
    "vpand    "#mask", "#m0", "#h0"\n\t"        \
    "vpaddq   "#m3", "#t0", "#m3"\n\t"          \
    "vpaddq   "#h1", "#t1", "#h1"\n\t"          \
                                                \
    "vpsrlq   $26, "#m3", "#t0"\n\t"            \
    "vpand    "#mask", "#m3", "#h3"\n\t"        \
    "vpaddq   "#h4", "#t0", "#h4"\n\t"


/* Process multiple blocks (n * 16 bytes) of data.
 *
 * ctx    Poly1305 context.
 * m      Blocks of message data.
 * bytes  The number of bytes to process.
 */
POLY1305_NOINLINE static void poly1305_blocks_avx2(Poly1305* ctx,
                                                   const unsigned char* m,
                                                   size_t bytes)
{
    /* Scratch for the four powers of r and their 5x multiples, aligned
     * for vmovdqa stores. */
    ALIGN256 word64 r4[5][4];
    ALIGN256 word64 s[4][4];
    /* Pin the 32-bit outputs of MOVE_TO_32 to specific registers so the
     * asm template can name them. */
    register word32 t0 asm("r8") = 0;
    register word32 t1 asm("r9") = 0;
    register word32 t2 asm("r10") = 0;
    register word32 t3 asm("r11") = 0;
    register word32 t4 asm("r12") = 0;
    /* 26-bit limb mask in each 64-bit lane. */
    static const word64 mask[4] = { 0x0000000003ffffff, 0x0000000003ffffff,
                                    0x0000000003ffffff, 0x0000000003ffffff };
    /* Bit 24 of limb 4 == message bit 128 (the Poly1305 pad bit). */
    static const word64 hibit[4] = { 0x1000000, 0x1000000,
                                     0x1000000, 0x1000000 };

    __asm__ __volatile__ (
        "vpxor          %%ymm15, %%ymm15, %%ymm15\n\t"
        "cmpb           $1, %[started]\n\t"
        "je             L_begin\n\t"
        "cmpb           $1, %[fin]\n\t"
        "je             L_begin\n\t"
        "# Load the message data\n\t"
        LOAD_M(m, %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %[hibit], %%ymm15)
        "vmovdqu        %[mask], %%ymm14\n\t"
        "# Reduce, in place, the message data\n\t"
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq           $64, %[m]\n\t"
        "subq           $64, %[bytes]\n\t"
        "jz             L_store\n\t"
        "jmp            L_load_r4\n\t"
        "\n"
        "L_begin:\n\t"
        "# Load the H values.\n\t"
        LOAD_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4)
        "# Check if there is a power of r to load - otherwise use r^4.\n\t"
        "cmpb           $0, %[fin]\n\t"
        "je             L_load_r4\n\t"
        "\n\t"
        "# Load the 4 powers of r - r^4, r^3, r^2, r^1.\n\t"
        LOAD_Rx4(%%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        "jmp            L_mul_5\n\t"
        "\n"
        "L_load_r4:\n\t"
        "# Load r^4 into all four positions.\n\t"
        LOAD_R4(320(%[ctx]), %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                %%ymm13, %%ymm14)
        "\n"
        "L_mul_5:\n\t"
        "# Multiply top 4 26-bit values of all four H by 5\n\t"
        MUL5(%%ymm10, %%ymm11, %%ymm12, %%ymm13, %%ymm6, %%ymm7, %%ymm8,
             %%ymm9)
        "# Store powers of r and multiple of 5 for use in multiply.\n\t"
        "vmovdqa        %%ymm10,    (%[s])\n\t"
        "vmovdqa        %%ymm11,  32(%[s])\n\t"
        "vmovdqa        %%ymm12,  64(%[s])\n\t"
        "vmovdqa        %%ymm13,  96(%[s])\n\t"
        "vmovdqa        %%ymm5 ,    (%[r4])\n\t"
        "vmovdqa        %%ymm6 ,  32(%[r4])\n\t"
        "vmovdqa        %%ymm7 ,  64(%[r4])\n\t"
        "vmovdqa        %%ymm8 ,  96(%[r4])\n\t"
        "vmovdqa        %%ymm9 , 128(%[r4])\n\t"
        "vmovdqu        %[mask], %%ymm14\n\t"
        "\n"
        "# If not finished then loop over data\n\t"
        "cmpb           $0x1, %[fin]\n\t"
        "jne            L_start\n\t"
        "# Do last multiply, reduce, add the four H together and move to\n\t"
        "# 32-bit registers\n\t"
        MUL_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                 (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                 (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                 %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                 %%ymm10, %%ymm11, %%ymm12, %%ymm13)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        FINALIZE_H(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                   %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9)
        MOVE_TO_32(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4,
                   %[t0], %[t1], %[t2], %[t3], %[t4])
        "jmp            L_end\n\t"
        "\n"
        "L_start:\n\t"
        MUL_ADD_AVX2(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
                     (%[r4]), 32(%[r4]), 64(%[r4]), 96(%[r4]), 128(%[r4]),
                     (%[s]), 32(%[s]), 64(%[s]), 96(%[s]),
                     %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
                     %%ymm10, %%ymm11, %%ymm12, %%ymm13,
                     %[hibit], %%ymm15)
        REDUCE(%%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4,
               %%ymm5, %%ymm6, %%ymm7, %%ymm8, %%ymm9,
               %%ymm10, %%ymm11, %%ymm12, %%ymm14)
        "addq           $64, %[m]\n\t"
        "subq           $64, %[bytes]\n\t"
        "jnz            L_start\n\t"
        "\n"
        "L_store:\n\t"
        "# Store four H values - state\n\t"
        STORE_H(%[h], %%ymm0, %%ymm1, %%ymm2, %%ymm3, %%ymm4, %%xmm4)
        "\n"
        "L_end:\n\t"
        : [m] "+r" (m), [bytes] "+r" (bytes),
          [t0] "+r" (t0), [t1] "+r" (t1), [t2] "+r" (t2),
          [t3] "+r" (t3), [t4] "+r" (t4)
        : [ctx] "r" (ctx), [h] "r" (ctx->hh),
          [r4] "r" (r4), [s] "r" (s),
          [fin] "m" (ctx->finished), [started] "m" (ctx->started),
          [mask] "m" (mask), [hibit] "m" (hibit)
        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
          "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14",
          "ymm15", "memory"
    );

    if (ctx->finished)
    {
        word64 h0, h1, h2, c;

        /* Convert to 64-bit form: repack the five 26-bit limbs in
         * t0..t4 into 44/44/42-bit limbs h0/h1/h2. */
        h0 = (((word64)(t1 & 0x3FFFF)) << 26) + t0;
        h1 = (((word64)(t3 &   0x3FF)) << 34) +
             (((word64) t2           ) <<  8) + (t1 >> 18);
        h2 = (((word64) t4           ) << 16) + (t3 >> 10);

        /* Perform modular reduction (carry chain with 2^130 = 5 mod p,
         * run twice to fully propagate). */
        c = (h1 >> 44); h1 &= 0xfffffffffff;
        h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;     c = (h1 >> 44); h1 &= 0xfffffffffff;
        h2 += c;     c = (h2 >> 42); h2 &= 0x3ffffffffff;
        h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff;
        h1 += c;

        /* Convert from 42/44/44 to 2/64/64 bits used and store result. */
        ctx->h[0] = h0 | (h1 << 44);
        ctx->h[1] = (h1 >> 20) | (h2 << 24);
        ctx->h[2] = h2 >> 40;
    }

    ctx->started = 1;
}

/* Multiply two 130-bit numbers in 64-bit registers and reduce.
 * 44 + 44 + 42 = 130 bits
 *
 * r0-r2 are the first operand and the result.
 * a0-a2 are the second operand.
 */
#define MUL_64(r0, r1, r2, a0, a1, a2)                                        \
    /* (5 << 2) == 20 == 4*5: pre-scale high limbs for the wraparound. */     \
    s1 = a1 * (5 << 2);                                                       \
    s2 = a2 * (5 << 2);                                                       \
    MUL(d0, r0, a0); MUL(d, r1, s2); ADD(d0, d); MUL(d, r2, s1); ADD(d0, d);  \
    MUL(d1, r0, a1); MUL(d, r1, a0); ADD(d1, d); MUL(d, r2, s2); ADD(d1, d);  \
    MUL(d2, r0, a2); MUL(d, r1, a1); ADD(d2, d); MUL(d, r2, a0); ADD(d2, d);  \
                                                                              \
    /* Carry chain: 44/44/42-bit limbs; carry out of d2 wraps times 5. */     \
                 c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff;                \
    ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff;               \
    ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff;               \
    r0 += c * 5;  c = (r0 >> 44);  r0 = r0 & 0xfffffffffff;                   \
    r1 += c

/* Square a 130-bit number held as 44/44/42-bit limbs r0-r2, in place.
 * Uses the word128 d0-d2/d temporaries and s2/c from the caller's scope,
 * like MUL_64.
 */
#define SQR_64(r0, r1, r2)                                                    \
    s2 = r2 * (5 << 2);                                                       \
    /* ADD(dx, dx) doubles the cross products. */                             \
    MUL(d0, r1, s2); ADD(d0, d0); MUL(d, r0, r0); ADD(d0, d);                 \
    MUL(d1, r0, r1); ADD(d1, d1); MUL(d, r2, s2); ADD(d1, d);                 \
    MUL(d2, r0, r2); ADD(d2, d2); MUL(d, r1, r1); ADD(d2, d);                 \
                                                                              \
                 c = SHR(d0, 44); r0 = LO(d0) & 0xfffffffffff;                \
    ADDLO(d1, c); c = SHR(d1, 44); r1 = LO(d1) & 0xfffffffffff;               \
    ADDLO(d2, c); c = SHR(d2, 42); r2 = LO(d2) & 0x3ffffffffff;               \
    r0 += c * 5;  c = (r0 >> 44);  r0 = r0 & 0xfffffffffff;                   \
    r1 += c

/* Store the 130-bit number in 64-bit registers as 26-bit values in 32 bits.
 *
 * r0-r2 contains the 130-bit number in 64-bit registers.
 * r is the address of where to store the 26 of 32 bits result.
 */
#define CONV_64_TO_32(r0, r1, r2, r)                    \
    r[0] = (word32)( r0                  ) & 0x3ffffff; \
    r[1] = (word32)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; \
    r[2] = (word32)( r1 >>  8            ) & 0x3ffffff; \
    r[3] = (word32)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; \
    r[4] = (word32)( r2 >> 16            )

/* Calculate R^1, R^2, R^3 and R^4 and store them in the context.
 *
 * ctx  Poly1305 context.
00905 */ 00906 static void poly1305_calc_powers(Poly1305* ctx) 00907 { 00908 word64 r0, r1, r2, t0, t1, c; 00909 word64 r20, r21, r22; 00910 word64 r30, r31, r32; 00911 word64 r40, r41, r42; 00912 word64 s1, s2; 00913 word128 d0, d1, d2, d; 00914 00915 t0 = ctx->r[0]; 00916 t1 = ctx->r[1]; 00917 r0 = ( t0 ) & 0xfffffffffff; 00918 r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffffffff; 00919 r2 = ((t1 >> 24) ) & 0x00fffffffff; 00920 00921 /* Store r^1 */ 00922 CONV_64_TO_32(r0, r1, r2, ctx->r1); 00923 00924 /* Calc and store r^2 */ 00925 r20 = r0; r21 = r1; r22 = r2; 00926 SQR_64(r20, r21, r22); 00927 CONV_64_TO_32(r20, r21, r22, ctx->r2); 00928 00929 /* Calc and store r^3 */ 00930 r30 = r20; r31 = r21; r32 = r22; 00931 MUL_64(r30, r31, r32, r0, r1, r2); 00932 CONV_64_TO_32(r30, r31, r32, ctx->r3); 00933 00934 /* Calc and store r^4 */ 00935 r40 = r20; r41 = r21; r42 = r22; 00936 SQR_64(r40, r41, r42); 00937 CONV_64_TO_32(r40, r41, r42, ctx->r4); 00938 00939 } 00940 00941 /* Set the key to use when processing data. 00942 * Initialize the context. 00943 * Calls AVX set key function as final function calls AVX code. 00944 * 00945 * ctx Poly1305 context. 00946 * key The key data (16 bytes). 00947 */ 00948 static void poly1305_setkey_avx2(Poly1305* ctx, const byte* key) 00949 { 00950 poly1305_setkey_avx(ctx, key); 00951 00952 __asm__ __volatile__ ( 00953 "vpxor %%ymm0, %%ymm0, %%ymm0\n\t" 00954 "vmovdqu %%ymm0, (%[hh])\n\t" 00955 "vmovdqu %%ymm0, 32(%[hh])\n\t" 00956 "vmovdqu %%ymm0, 64(%[hh])\n\t" 00957 "vmovdqu %%ymm0, 96(%[hh])\n\t" 00958 "vmovdqu %%ymm0, 128(%[hh])\n\t" 00959 : 00960 : [hh] "r" (ctx->hh) 00961 : "memory", "ymm0" 00962 ); 00963 00964 ctx->leftover = 0; 00965 ctx->finished = 0; 00966 ctx->started = 0; 00967 } 00968 00969 /* Calculate the final result - authentication data. 00970 * Zeros out the private data in the context. 00971 * Calls AVX final function to quickly process last blocks. 00972 * 00973 * ctx Poly1305 context. 
00974 * mac Buffer to hold 16 bytes - authentication data. 00975 */ 00976 static void poly1305_final_avx2(Poly1305* ctx, byte* mac) 00977 { 00978 int i, j; 00979 int l = (int)ctx->leftover; 00980 00981 ctx->finished = 1; 00982 if (ctx->started) 00983 poly1305_blocks_avx2(ctx, ctx->buffer, POLY1305_BLOCK_SIZE * 4); 00984 00985 i = l & ~(POLY1305_BLOCK_SIZE - 1); 00986 if (i > 0) 00987 poly1305_blocks_avx(ctx, ctx->buffer, i); 00988 ctx->leftover -= i; 00989 for (j = 0; i < l; i++, j++) 00990 ctx->buffer[j] = ctx->buffer[i]; 00991 00992 poly1305_final_avx(ctx, mac); 00993 00994 /* zero out the state */ 00995 __asm__ __volatile__ ( 00996 "vpxor %%ymm0, %%ymm0, %%ymm0\n\t" 00997 "vmovdqu %%ymm0, (%[hh])\n\t" 00998 "vmovdqu %%ymm0, 32(%[hh])\n\t" 00999 "vmovdqu %%ymm0, 64(%[hh])\n\t" 01000 "vmovdqu %%ymm0, 96(%[hh])\n\t" 01001 "vmovdqu %%ymm0, 128(%[hh])\n\t" 01002 "vmovdqu %%ymm0, (%[r1])\n\t" 01003 "vmovdqu %%ymm0, (%[r2])\n\t" 01004 "vmovdqu %%ymm0, (%[r3])\n\t" 01005 "vmovdqu %%ymm0, (%[r4])\n\t" 01006 : 01007 : [hh] "r" (ctx->hh), [r1] "r" (ctx->r1), [r2] "r" (ctx->r2), 01008 [r3] "r" (ctx->r3), [r4] "r" (ctx->r4) 01009 : "memory", "ymm0" 01010 ); 01011 01012 ctx->leftover = 0; 01013 ctx->finished = 0; 01014 ctx->started = 0; 01015 } 01016 #endif 01017 01018 #elif defined(POLY130564) 01019 01020 static word64 U8TO64(const byte* p) 01021 { 01022 return 01023 (((word64)(p[0] & 0xff) ) | 01024 ((word64)(p[1] & 0xff) << 8) | 01025 ((word64)(p[2] & 0xff) << 16) | 01026 ((word64)(p[3] & 0xff) << 24) | 01027 ((word64)(p[4] & 0xff) << 32) | 01028 ((word64)(p[5] & 0xff) << 40) | 01029 ((word64)(p[6] & 0xff) << 48) | 01030 ((word64)(p[7] & 0xff) << 56)); 01031 } 01032 01033 static void U64TO8(byte* p, word64 v) { 01034 p[0] = (v ) & 0xff; 01035 p[1] = (v >> 8) & 0xff; 01036 p[2] = (v >> 16) & 0xff; 01037 p[3] = (v >> 24) & 0xff; 01038 p[4] = (v >> 32) & 0xff; 01039 p[5] = (v >> 40) & 0xff; 01040 p[6] = (v >> 48) & 0xff; 01041 p[7] = (v >> 56) & 0xff; 01042 } 01043 01044 
#else /* if not 64 bit then use 32 bit */ 01045 01046 static word32 U8TO32(const byte *p) 01047 { 01048 return 01049 (((word32)(p[0] & 0xff) ) | 01050 ((word32)(p[1] & 0xff) << 8) | 01051 ((word32)(p[2] & 0xff) << 16) | 01052 ((word32)(p[3] & 0xff) << 24)); 01053 } 01054 01055 static void U32TO8(byte *p, word32 v) { 01056 p[0] = (v ) & 0xff; 01057 p[1] = (v >> 8) & 0xff; 01058 p[2] = (v >> 16) & 0xff; 01059 p[3] = (v >> 24) & 0xff; 01060 } 01061 #endif 01062 01063 01064 static void U32TO64(word32 v, byte* p) 01065 { 01066 XMEMSET(p, 0, 8); 01067 p[0] = (v & 0xFF); 01068 p[1] = (v >> 8) & 0xFF; 01069 p[2] = (v >> 16) & 0xFF; 01070 p[3] = (v >> 24) & 0xFF; 01071 } 01072 01073 static void poly1305_blocks(Poly1305* ctx, const unsigned char *m, 01074 size_t bytes) 01075 { 01076 #ifdef USE_INTEL_SPEEDUP 01077 /* AVX2 is handled in wc_Poly1305Update. */ 01078 poly1305_blocks_avx(ctx, m, bytes); 01079 #elif defined(POLY130564) 01080 const word64 hibit = (ctx->finished) ? 0 : ((word64)1 << 40); /* 1 << 128 */ 01081 word64 r0,r1,r2; 01082 word64 s1,s2; 01083 word64 h0,h1,h2; 01084 word64 c; 01085 word128 d0,d1,d2,d; 01086 01087 r0 = ctx->r[0]; 01088 r1 = ctx->r[1]; 01089 r2 = ctx->r[2]; 01090 01091 h0 = ctx->h[0]; 01092 h1 = ctx->h[1]; 01093 h2 = ctx->h[2]; 01094 01095 s1 = r1 * (5 << 2); 01096 s2 = r2 * (5 << 2); 01097 01098 while (bytes >= POLY1305_BLOCK_SIZE) { 01099 word64 t0,t1; 01100 01101 /* h += m[i] */ 01102 t0 = U8TO64(&m[0]); 01103 t1 = U8TO64(&m[8]); 01104 01105 h0 += (( t0 ) & 0xfffffffffff); 01106 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff); 01107 h2 += (((t1 >> 24) ) & 0x3ffffffffff) | hibit; 01108 01109 /* h *= r */ 01110 MUL(d0, h0, r0); MUL(d, h1, s2); ADD(d0, d); MUL(d, h2, s1); ADD(d0, d); 01111 MUL(d1, h0, r1); MUL(d, h1, r0); ADD(d1, d); MUL(d, h2, s2); ADD(d1, d); 01112 MUL(d2, h0, r2); MUL(d, h1, r1); ADD(d2, d); MUL(d, h2, r0); ADD(d2, d); 01113 01114 /* (partial) h %= p */ 01115 c = SHR(d0, 44); h0 = LO(d0) & 0xfffffffffff; 01116 ADDLO(d1, c); 
c = SHR(d1, 44); h1 = LO(d1) & 0xfffffffffff; 01117 ADDLO(d2, c); c = SHR(d2, 42); h2 = LO(d2) & 0x3ffffffffff; 01118 h0 += c * 5; c = (h0 >> 44); h0 = h0 & 0xfffffffffff; 01119 h1 += c; 01120 01121 m += POLY1305_BLOCK_SIZE; 01122 bytes -= POLY1305_BLOCK_SIZE; 01123 } 01124 01125 ctx->h[0] = h0; 01126 ctx->h[1] = h1; 01127 ctx->h[2] = h2; 01128 01129 #else /* if not 64 bit then use 32 bit */ 01130 const word32 hibit = (ctx->finished) ? 0 : (1 << 24); /* 1 << 128 */ 01131 word32 r0,r1,r2,r3,r4; 01132 word32 s1,s2,s3,s4; 01133 word32 h0,h1,h2,h3,h4; 01134 word64 d0,d1,d2,d3,d4; 01135 word32 c; 01136 01137 01138 r0 = ctx->r[0]; 01139 r1 = ctx->r[1]; 01140 r2 = ctx->r[2]; 01141 r3 = ctx->r[3]; 01142 r4 = ctx->r[4]; 01143 01144 s1 = r1 * 5; 01145 s2 = r2 * 5; 01146 s3 = r3 * 5; 01147 s4 = r4 * 5; 01148 01149 h0 = ctx->h[0]; 01150 h1 = ctx->h[1]; 01151 h2 = ctx->h[2]; 01152 h3 = ctx->h[3]; 01153 h4 = ctx->h[4]; 01154 01155 while (bytes >= POLY1305_BLOCK_SIZE) { 01156 /* h += m[i] */ 01157 h0 += (U8TO32(m+ 0) ) & 0x3ffffff; 01158 h1 += (U8TO32(m+ 3) >> 2) & 0x3ffffff; 01159 h2 += (U8TO32(m+ 6) >> 4) & 0x3ffffff; 01160 h3 += (U8TO32(m+ 9) >> 6) & 0x3ffffff; 01161 h4 += (U8TO32(m+12) >> 8) | hibit; 01162 01163 /* h *= r */ 01164 d0 = ((word64)h0 * r0) + ((word64)h1 * s4) + ((word64)h2 * s3) + 01165 ((word64)h3 * s2) + ((word64)h4 * s1); 01166 d1 = ((word64)h0 * r1) + ((word64)h1 * r0) + ((word64)h2 * s4) + 01167 ((word64)h3 * s3) + ((word64)h4 * s2); 01168 d2 = ((word64)h0 * r2) + ((word64)h1 * r1) + ((word64)h2 * r0) + 01169 ((word64)h3 * s4) + ((word64)h4 * s3); 01170 d3 = ((word64)h0 * r3) + ((word64)h1 * r2) + ((word64)h2 * r1) + 01171 ((word64)h3 * r0) + ((word64)h4 * s4); 01172 d4 = ((word64)h0 * r4) + ((word64)h1 * r3) + ((word64)h2 * r2) + 01173 ((word64)h3 * r1) + ((word64)h4 * r0); 01174 01175 /* (partial) h %= p */ 01176 c = (word32)(d0 >> 26); h0 = (word32)d0 & 0x3ffffff; 01177 d1 += c; c = (word32)(d1 >> 26); h1 = (word32)d1 & 0x3ffffff; 01178 d2 += c; c = 
(word32)(d2 >> 26); h2 = (word32)d2 & 0x3ffffff; 01179 d3 += c; c = (word32)(d3 >> 26); h3 = (word32)d3 & 0x3ffffff; 01180 d4 += c; c = (word32)(d4 >> 26); h4 = (word32)d4 & 0x3ffffff; 01181 h0 += c * 5; c = (h0 >> 26); h0 = h0 & 0x3ffffff; 01182 h1 += c; 01183 01184 m += POLY1305_BLOCK_SIZE; 01185 bytes -= POLY1305_BLOCK_SIZE; 01186 } 01187 01188 ctx->h[0] = h0; 01189 ctx->h[1] = h1; 01190 ctx->h[2] = h2; 01191 ctx->h[3] = h3; 01192 ctx->h[4] = h4; 01193 01194 #endif /* end of 64 bit cpu blocks or 32 bit cpu */ 01195 } 01196 01197 static void poly1305_block(Poly1305* ctx, const unsigned char *m) 01198 { 01199 #ifdef USE_INTEL_SPEEDUP 01200 /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */ 01201 poly1305_block_avx(ctx, m); 01202 #else 01203 poly1305_blocks(ctx, m, POLY1305_BLOCK_SIZE); 01204 #endif 01205 } 01206 01207 01208 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz) 01209 { 01210 #if defined(POLY130564) 01211 word64 t0,t1; 01212 #endif 01213 01214 if (key == NULL) 01215 return BAD_FUNC_ARG; 01216 01217 #ifdef CHACHA_AEAD_TEST 01218 word32 k; 01219 printf("Poly key used:\n"); 01220 for (k = 0; k < keySz; k++) { 01221 printf("%02x", key[k]); 01222 if ((k+1) % 8 == 0) 01223 printf("\n"); 01224 } 01225 printf("\n"); 01226 #endif 01227 01228 if (keySz != 32 || ctx == NULL) 01229 return BAD_FUNC_ARG; 01230 01231 #ifdef USE_INTEL_SPEEDUP 01232 if (!cpu_flags_set) { 01233 intel_flags = cpuid_get_flags(); 01234 cpu_flags_set = 1; 01235 } 01236 #ifdef HAVE_INTEL_AVX2 01237 if (IS_INTEL_AVX2(intel_flags)) 01238 poly1305_setkey_avx2(ctx, key); 01239 else 01240 #endif 01241 poly1305_setkey_avx(ctx, key); 01242 #elif defined(POLY130564) 01243 01244 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 01245 t0 = U8TO64(key + 0); 01246 t1 = U8TO64(key + 8); 01247 01248 ctx->r[0] = ( t0 ) & 0xffc0fffffff; 01249 ctx->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff; 01250 ctx->r[2] = ((t1 >> 24) ) & 0x00ffffffc0f; 01251 01252 /* h 
(accumulator) = 0 */ 01253 ctx->h[0] = 0; 01254 ctx->h[1] = 0; 01255 ctx->h[2] = 0; 01256 01257 /* save pad for later */ 01258 ctx->pad[0] = U8TO64(key + 16); 01259 ctx->pad[1] = U8TO64(key + 24); 01260 01261 ctx->leftover = 0; 01262 ctx->finished = 0; 01263 01264 #else /* if not 64 bit then use 32 bit */ 01265 01266 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */ 01267 ctx->r[0] = (U8TO32(key + 0) ) & 0x3ffffff; 01268 ctx->r[1] = (U8TO32(key + 3) >> 2) & 0x3ffff03; 01269 ctx->r[2] = (U8TO32(key + 6) >> 4) & 0x3ffc0ff; 01270 ctx->r[3] = (U8TO32(key + 9) >> 6) & 0x3f03fff; 01271 ctx->r[4] = (U8TO32(key + 12) >> 8) & 0x00fffff; 01272 01273 /* h = 0 */ 01274 ctx->h[0] = 0; 01275 ctx->h[1] = 0; 01276 ctx->h[2] = 0; 01277 ctx->h[3] = 0; 01278 ctx->h[4] = 0; 01279 01280 /* save pad for later */ 01281 ctx->pad[0] = U8TO32(key + 16); 01282 ctx->pad[1] = U8TO32(key + 20); 01283 ctx->pad[2] = U8TO32(key + 24); 01284 ctx->pad[3] = U8TO32(key + 28); 01285 01286 ctx->leftover = 0; 01287 ctx->finished = 0; 01288 01289 #endif 01290 01291 return 0; 01292 } 01293 01294 01295 int wc_Poly1305Final(Poly1305* ctx, byte* mac) 01296 { 01297 #ifdef USE_INTEL_SPEEDUP 01298 #elif defined(POLY130564) 01299 01300 word64 h0,h1,h2,c; 01301 word64 g0,g1,g2; 01302 word64 t0,t1; 01303 01304 #else 01305 01306 word32 h0,h1,h2,h3,h4,c; 01307 word32 g0,g1,g2,g3,g4; 01308 word64 f; 01309 word32 mask; 01310 01311 #endif 01312 01313 if (ctx == NULL) 01314 return BAD_FUNC_ARG; 01315 01316 #ifdef USE_INTEL_SPEEDUP 01317 #ifdef HAVE_INTEL_AVX2 01318 if (IS_INTEL_AVX2(intel_flags)) 01319 poly1305_final_avx2(ctx, mac); 01320 else 01321 #endif 01322 poly1305_final_avx(ctx, mac); 01323 #elif defined(POLY130564) 01324 01325 /* process the remaining block */ 01326 if (ctx->leftover) { 01327 size_t i = ctx->leftover; 01328 ctx->buffer[i] = 1; 01329 for (i = i + 1; i < POLY1305_BLOCK_SIZE; i++) 01330 ctx->buffer[i] = 0; 01331 ctx->finished = 1; 01332 poly1305_block(ctx, ctx->buffer); 01333 } 01334 01335 /* fully carry 
h */ 01336 h0 = ctx->h[0]; 01337 h1 = ctx->h[1]; 01338 h2 = ctx->h[2]; 01339 01340 c = (h1 >> 44); h1 &= 0xfffffffffff; 01341 h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; 01342 h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; 01343 h1 += c; c = (h1 >> 44); h1 &= 0xfffffffffff; 01344 h2 += c; c = (h2 >> 42); h2 &= 0x3ffffffffff; 01345 h0 += c * 5; c = (h0 >> 44); h0 &= 0xfffffffffff; 01346 h1 += c; 01347 01348 /* compute h + -p */ 01349 g0 = h0 + 5; c = (g0 >> 44); g0 &= 0xfffffffffff; 01350 g1 = h1 + c; c = (g1 >> 44); g1 &= 0xfffffffffff; 01351 g2 = h2 + c - ((word64)1 << 42); 01352 01353 /* select h if h < p, or h + -p if h >= p */ 01354 c = (g2 >> ((sizeof(word64) * 8) - 1)) - 1; 01355 g0 &= c; 01356 g1 &= c; 01357 g2 &= c; 01358 c = ~c; 01359 h0 = (h0 & c) | g0; 01360 h1 = (h1 & c) | g1; 01361 h2 = (h2 & c) | g2; 01362 01363 /* h = (h + pad) */ 01364 t0 = ctx->pad[0]; 01365 t1 = ctx->pad[1]; 01366 01367 h0 += (( t0 ) & 0xfffffffffff) ; 01368 c = (h0 >> 44); h0 &= 0xfffffffffff; 01369 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c; 01370 c = (h1 >> 44); h1 &= 0xfffffffffff; 01371 h2 += (((t1 >> 24) ) & 0x3ffffffffff) + c; 01372 h2 &= 0x3ffffffffff; 01373 01374 /* mac = h % (2^128) */ 01375 h0 = ((h0 ) | (h1 << 44)); 01376 h1 = ((h1 >> 20) | (h2 << 24)); 01377 01378 U64TO8(mac + 0, h0); 01379 U64TO8(mac + 8, h1); 01380 01381 /* zero out the state */ 01382 ctx->h[0] = 0; 01383 ctx->h[1] = 0; 01384 ctx->h[2] = 0; 01385 ctx->r[0] = 0; 01386 ctx->r[1] = 0; 01387 ctx->r[2] = 0; 01388 ctx->pad[0] = 0; 01389 ctx->pad[1] = 0; 01390 01391 #else /* if not 64 bit then use 32 bit */ 01392 01393 /* process the remaining block */ 01394 if (ctx->leftover) { 01395 size_t i = ctx->leftover; 01396 ctx->buffer[i++] = 1; 01397 for (; i < POLY1305_BLOCK_SIZE; i++) 01398 ctx->buffer[i] = 0; 01399 ctx->finished = 1; 01400 poly1305_block(ctx, ctx->buffer); 01401 } 01402 01403 /* fully carry h */ 01404 h0 = ctx->h[0]; 01405 h1 = ctx->h[1]; 01406 h2 = ctx->h[2]; 01407 h3 = 
ctx->h[3]; 01408 h4 = ctx->h[4]; 01409 01410 c = h1 >> 26; h1 = h1 & 0x3ffffff; 01411 h2 += c; c = h2 >> 26; h2 = h2 & 0x3ffffff; 01412 h3 += c; c = h3 >> 26; h3 = h3 & 0x3ffffff; 01413 h4 += c; c = h4 >> 26; h4 = h4 & 0x3ffffff; 01414 h0 += c * 5; c = h0 >> 26; h0 = h0 & 0x3ffffff; 01415 h1 += c; 01416 01417 /* compute h + -p */ 01418 g0 = h0 + 5; c = g0 >> 26; g0 &= 0x3ffffff; 01419 g1 = h1 + c; c = g1 >> 26; g1 &= 0x3ffffff; 01420 g2 = h2 + c; c = g2 >> 26; g2 &= 0x3ffffff; 01421 g3 = h3 + c; c = g3 >> 26; g3 &= 0x3ffffff; 01422 g4 = h4 + c - (1 << 26); 01423 01424 /* select h if h < p, or h + -p if h >= p */ 01425 mask = (g4 >> ((sizeof(word32) * 8) - 1)) - 1; 01426 g0 &= mask; 01427 g1 &= mask; 01428 g2 &= mask; 01429 g3 &= mask; 01430 g4 &= mask; 01431 mask = ~mask; 01432 h0 = (h0 & mask) | g0; 01433 h1 = (h1 & mask) | g1; 01434 h2 = (h2 & mask) | g2; 01435 h3 = (h3 & mask) | g3; 01436 h4 = (h4 & mask) | g4; 01437 01438 /* h = h % (2^128) */ 01439 h0 = ((h0 ) | (h1 << 26)) & 0xffffffff; 01440 h1 = ((h1 >> 6) | (h2 << 20)) & 0xffffffff; 01441 h2 = ((h2 >> 12) | (h3 << 14)) & 0xffffffff; 01442 h3 = ((h3 >> 18) | (h4 << 8)) & 0xffffffff; 01443 01444 /* mac = (h + pad) % (2^128) */ 01445 f = (word64)h0 + ctx->pad[0] ; h0 = (word32)f; 01446 f = (word64)h1 + ctx->pad[1] + (f >> 32); h1 = (word32)f; 01447 f = (word64)h2 + ctx->pad[2] + (f >> 32); h2 = (word32)f; 01448 f = (word64)h3 + ctx->pad[3] + (f >> 32); h3 = (word32)f; 01449 01450 U32TO8(mac + 0, h0); 01451 U32TO8(mac + 4, h1); 01452 U32TO8(mac + 8, h2); 01453 U32TO8(mac + 12, h3); 01454 01455 /* zero out the state */ 01456 ctx->h[0] = 0; 01457 ctx->h[1] = 0; 01458 ctx->h[2] = 0; 01459 ctx->h[3] = 0; 01460 ctx->h[4] = 0; 01461 ctx->r[0] = 0; 01462 ctx->r[1] = 0; 01463 ctx->r[2] = 0; 01464 ctx->r[3] = 0; 01465 ctx->r[4] = 0; 01466 ctx->pad[0] = 0; 01467 ctx->pad[1] = 0; 01468 ctx->pad[2] = 0; 01469 ctx->pad[3] = 0; 01470 01471 #endif 01472 01473 return 0; 01474 } 01475 01476 01477 int 
wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes) 01478 { 01479 size_t i; 01480 01481 #ifdef CHACHA_AEAD_TEST 01482 word32 k; 01483 printf("Raw input to poly:\n"); 01484 for (k = 0; k < bytes; k++) { 01485 printf("%02x", m[k]); 01486 if ((k+1) % 16 == 0) 01487 printf("\n"); 01488 } 01489 printf("\n"); 01490 #endif 01491 01492 if (ctx == NULL) 01493 return BAD_FUNC_ARG; 01494 01495 #ifdef USE_INTEL_SPEEDUP 01496 #ifdef HAVE_INTEL_AVX2 01497 if (IS_INTEL_AVX2(intel_flags)) { 01498 /* handle leftover */ 01499 if (ctx->leftover) { 01500 size_t want = sizeof(ctx->buffer) - ctx->leftover; 01501 if (want > bytes) 01502 want = bytes; 01503 01504 for (i = 0; i < want; i++) 01505 ctx->buffer[ctx->leftover + i] = m[i]; 01506 bytes -= (word32)want; 01507 m += want; 01508 ctx->leftover += want; 01509 if (ctx->leftover < sizeof(ctx->buffer)) 01510 return 0; 01511 01512 if (!ctx->started) 01513 poly1305_calc_powers(ctx); 01514 poly1305_blocks_avx2(ctx, ctx->buffer, sizeof(ctx->buffer)); 01515 ctx->leftover = 0; 01516 } 01517 01518 /* process full blocks */ 01519 if (bytes >= sizeof(ctx->buffer)) { 01520 size_t want = bytes & ~(sizeof(ctx->buffer) - 1); 01521 01522 if (!ctx->started) 01523 poly1305_calc_powers(ctx); 01524 poly1305_blocks_avx2(ctx, m, want); 01525 m += want; 01526 bytes -= (word32)want; 01527 } 01528 01529 /* store leftover */ 01530 if (bytes) { 01531 for (i = 0; i < bytes; i++) 01532 ctx->buffer[ctx->leftover + i] = m[i]; 01533 ctx->leftover += bytes; 01534 } 01535 } 01536 else 01537 #endif 01538 #endif 01539 { 01540 /* handle leftover */ 01541 if (ctx->leftover) { 01542 size_t want = (POLY1305_BLOCK_SIZE - ctx->leftover); 01543 if (want > bytes) 01544 want = bytes; 01545 for (i = 0; i < want; i++) 01546 ctx->buffer[ctx->leftover + i] = m[i]; 01547 bytes -= (word32)want; 01548 m += want; 01549 ctx->leftover += want; 01550 if (ctx->leftover < POLY1305_BLOCK_SIZE) 01551 return 0; 01552 poly1305_block(ctx, ctx->buffer); 01553 ctx->leftover = 0; 01554 } 
01555 01556 /* process full blocks */ 01557 if (bytes >= POLY1305_BLOCK_SIZE) { 01558 size_t want = (bytes & ~(POLY1305_BLOCK_SIZE - 1)); 01559 poly1305_blocks(ctx, m, want); 01560 m += want; 01561 bytes -= (word32)want; 01562 } 01563 01564 /* store leftover */ 01565 if (bytes) { 01566 for (i = 0; i < bytes; i++) 01567 ctx->buffer[ctx->leftover + i] = m[i]; 01568 ctx->leftover += bytes; 01569 } 01570 } 01571 01572 return 0; 01573 } 01574 01575 01576 /* Takes in an initialized Poly1305 struct that has a key loaded and creates 01577 a MAC (tag) using recent TLS AEAD padding scheme. 01578 ctx : Initialized Poly1305 struct to use 01579 additional : Additional data to use 01580 addSz : Size of additional buffer 01581 input : Input buffer to create tag from 01582 sz : Size of input buffer 01583 tag : Buffer to hold created tag 01584 tagSz : Size of input tag buffer (must be at least 01585 WC_POLY1305_MAC_SZ(16)) 01586 */ 01587 int wc_Poly1305_MAC(Poly1305* ctx, byte* additional, word32 addSz, 01588 byte* input, word32 sz, byte* tag, word32 tagSz) 01589 { 01590 int ret; 01591 byte padding[WC_POLY1305_PAD_SZ - 1]; 01592 word32 paddingLen; 01593 byte little64[16]; 01594 01595 XMEMSET(padding, 0, sizeof(padding)); 01596 01597 /* sanity check on arguments */ 01598 if (ctx == NULL || input == NULL || tag == NULL || 01599 tagSz < WC_POLY1305_MAC_SZ) { 01600 return BAD_FUNC_ARG; 01601 } 01602 01603 /* additional allowed to be 0 */ 01604 if (addSz > 0) { 01605 if (additional == NULL) 01606 return BAD_FUNC_ARG; 01607 01608 /* additional data plus padding */ 01609 if ((ret = wc_Poly1305Update(ctx, additional, addSz)) != 0) { 01610 return ret; 01611 } 01612 paddingLen = -((int)addSz) & (WC_POLY1305_PAD_SZ - 1); 01613 if (paddingLen) { 01614 if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) { 01615 return ret; 01616 } 01617 } 01618 } 01619 01620 /* input plus padding */ 01621 if ((ret = wc_Poly1305Update(ctx, input, sz)) != 0) { 01622 return ret; 01623 } 01624 paddingLen 
= -((int)sz) & (WC_POLY1305_PAD_SZ - 1); 01625 if (paddingLen) { 01626 if ((ret = wc_Poly1305Update(ctx, padding, paddingLen)) != 0) { 01627 return ret; 01628 } 01629 } 01630 01631 /* size of additional data and input as little endian 64 bit types */ 01632 U32TO64(addSz, little64); 01633 U32TO64(sz, little64 + 8); 01634 ret = wc_Poly1305Update(ctx, little64, sizeof(little64)); 01635 if (ret) 01636 { 01637 return ret; 01638 } 01639 01640 /* Finalize the auth tag */ 01641 ret = wc_Poly1305Final(ctx, tag); 01642 01643 return ret; 01644 01645 } 01646 #endif /* HAVE_POLY1305 */ 01647 01648
Generated on Tue Jul 12 2022 16:58:06 by Doxygen 1.7.2