wolfSSL 3.11.1 for TLS1.3 beta

Fork of wolfSSL by wolf SSL

Committer:
wolfSSL
Date:
Thu Apr 28 00:57:21 2016 +0000
Revision:
4:1b0d80432c79
wolfSSL 3.9.0

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 4:1b0d80432c79 1 /* asm.c
wolfSSL 4:1b0d80432c79 2 *
wolfSSL 4:1b0d80432c79 3 * Copyright (C) 2006-2016 wolfSSL Inc.
wolfSSL 4:1b0d80432c79 4 *
wolfSSL 4:1b0d80432c79 5 * This file is part of wolfSSL.
wolfSSL 4:1b0d80432c79 6 *
wolfSSL 4:1b0d80432c79 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 4:1b0d80432c79 8 * it under the terms of the GNU General Public License as published by
wolfSSL 4:1b0d80432c79 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 4:1b0d80432c79 10 * (at your option) any later version.
wolfSSL 4:1b0d80432c79 11 *
wolfSSL 4:1b0d80432c79 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 4:1b0d80432c79 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 4:1b0d80432c79 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 4:1b0d80432c79 15 * GNU General Public License for more details.
wolfSSL 4:1b0d80432c79 16 *
wolfSSL 4:1b0d80432c79 17 * You should have received a copy of the GNU General Public License
wolfSSL 4:1b0d80432c79 18 * along with this program; if not, write to the Free Software
wolfSSL 4:1b0d80432c79 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 4:1b0d80432c79 20 */
wolfSSL 4:1b0d80432c79 21
wolfSSL 4:1b0d80432c79 22
wolfSSL 4:1b0d80432c79 23 #ifdef HAVE_CONFIG_H
wolfSSL 4:1b0d80432c79 24 #include <config.h>
wolfSSL 4:1b0d80432c79 25 #endif
wolfSSL 4:1b0d80432c79 26
wolfSSL 4:1b0d80432c79 27 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 4:1b0d80432c79 28
wolfSSL 4:1b0d80432c79 29 /*
wolfSSL 4:1b0d80432c79 30 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
wolfSSL 4:1b0d80432c79 31 * http://math.libtomcrypt.com
wolfSSL 4:1b0d80432c79 32 */
wolfSSL 4:1b0d80432c79 33
wolfSSL 4:1b0d80432c79 34
wolfSSL 4:1b0d80432c79 35 /******************************************************************/
wolfSSL 4:1b0d80432c79 36 /* fp_montgomery_reduce.c asm or generic */
wolfSSL 4:1b0d80432c79 37
wolfSSL 4:1b0d80432c79 38
wolfSSL 4:1b0d80432c79 39 /* Each platform needs to query leaf 7 (sub-leaf 0) from cpuid to see if BMI2
wolfSSL 4:1b0d80432c79 40 * (MULX) and ADX are supported. Also, set up a macro for proper linkage w/o ABI conflicts
wolfSSL 4:1b0d80432c79 41 */
wolfSSL 4:1b0d80432c79 42
wolfSSL 4:1b0d80432c79 43 #if defined(HAVE_INTEL_MULX)
wolfSSL 4:1b0d80432c79 44 #ifndef _MSC_VER
wolfSSL 4:1b0d80432c79 45 #define cpuid(reg, leaf, sub)\
wolfSSL 4:1b0d80432c79 46 __asm__ __volatile__ ("cpuid":\
wolfSSL 4:1b0d80432c79 47 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
wolfSSL 4:1b0d80432c79 48 "a" (leaf), "c"(sub));
wolfSSL 4:1b0d80432c79 49
wolfSSL 4:1b0d80432c79 50 #define XASM_LINK(f) asm(f)
wolfSSL 4:1b0d80432c79 51 #else
wolfSSL 4:1b0d80432c79 52
wolfSSL 4:1b0d80432c79 53 #include <intrin.h>
wolfSSL 4:1b0d80432c79 54 #define cpuid(reg, leaf, sub) __cpuidex((int*)(reg), (leaf), (sub)) /* MSVC: three-argument form matching the GCC macro and the cpuid(reg, leaf, sub) call sites, so the sub-leaf is forwarded */
wolfSSL 4:1b0d80432c79 55
wolfSSL 4:1b0d80432c79 56 #define XASM_LINK(f)
wolfSSL 4:1b0d80432c79 57
wolfSSL 4:1b0d80432c79 58 #endif /* _MSC_VER */
wolfSSL 4:1b0d80432c79 59
wolfSSL 4:1b0d80432c79 60 #define EAX 0
wolfSSL 4:1b0d80432c79 61 #define EBX 1
wolfSSL 4:1b0d80432c79 62 #define ECX 2
wolfSSL 4:1b0d80432c79 63 #define EDX 3
wolfSSL 4:1b0d80432c79 64
wolfSSL 4:1b0d80432c79 65 #define CPUID_AVX1 0x1
wolfSSL 4:1b0d80432c79 66 #define CPUID_AVX2 0x2
wolfSSL 4:1b0d80432c79 67 #define CPUID_RDRAND 0x4
wolfSSL 4:1b0d80432c79 68 #define CPUID_RDSEED 0x8
wolfSSL 4:1b0d80432c79 69 #define CPUID_BMI2 0x10 /* MULX, RORX */
wolfSSL 4:1b0d80432c79 70 #define CPUID_ADX 0x20 /* ADCX, ADOX */
wolfSSL 4:1b0d80432c79 71
wolfSSL 4:1b0d80432c79 72 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
wolfSSL 4:1b0d80432c79 73 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
wolfSSL 4:1b0d80432c79 74 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
wolfSSL 4:1b0d80432c79 75 #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
wolfSSL 4:1b0d80432c79 76 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
wolfSSL 4:1b0d80432c79 77 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
wolfSSL 4:1b0d80432c79 78 #define SET_FLAGS
wolfSSL 4:1b0d80432c79 79
wolfSSL 4:1b0d80432c79 80 static word32 cpuid_check = 0 ;
wolfSSL 4:1b0d80432c79 81 static word32 cpuid_flags = 0 ;
wolfSSL 4:1b0d80432c79 82
wolfSSL 4:1b0d80432c79 83 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { /* Returns bit `bit` of cpuid output register reg[num] (index EAX..EDX) for (leaf, sub) when the vendor string is "GenuineIntel"; returns 0 for any other vendor. */
wolfSSL 4:1b0d80432c79 84 int got_intel_cpu=0;
wolfSSL 4:1b0d80432c79 85 unsigned int reg[5]; /* reg[0..3] receive EAX, EBX, ECX, EDX from the cpuid macro */
wolfSSL 4:1b0d80432c79 86
wolfSSL 4:1b0d80432c79 87 reg[4] = '\0' ; /* spare fifth slot is zeroed; the fixed-length memcmp() checks below never read it */
wolfSSL 4:1b0d80432c79 88 cpuid(reg, 0, 0); /* leaf 0: the vendor string is compared below in EBX, EDX, ECX order */
wolfSSL 4:1b0d80432c79 89 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 4:1b0d80432c79 90 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 4:1b0d80432c79 91 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 4:1b0d80432c79 92 got_intel_cpu = 1;
wolfSSL 4:1b0d80432c79 93 }
wolfSSL 4:1b0d80432c79 94 if (got_intel_cpu) { /* NOTE(review): leaf is not compared against the maximum basic leaf that leaf 0 reports in EAX - confirm every target CPU implements the requested leaf */
wolfSSL 4:1b0d80432c79 95 cpuid(reg, leaf, sub);
wolfSSL 4:1b0d80432c79 96 return((reg[num]>>bit)&0x1) ; /* isolate the single requested feature bit */
wolfSSL 4:1b0d80432c79 97 }
wolfSSL 4:1b0d80432c79 98 return 0 ; /* non-Intel vendor: report the feature as absent */
wolfSSL 4:1b0d80432c79 99 }
wolfSSL 4:1b0d80432c79 100
wolfSSL 4:1b0d80432c79 101 INLINE static int set_cpuid_flags(void) { /* Probes BMI2 and ADX support into cpuid_flags the first time it is called; returns 0 if the probe ran on this call, 1 if cpuid_check showed it had already run. */
wolfSSL 4:1b0d80432c79 102 if(cpuid_check == 0) {
wolfSSL 4:1b0d80432c79 103 if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; } /* leaf 7, sub-leaf 0, EBX bit 8 */
wolfSSL 4:1b0d80432c79 104 if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; } /* leaf 7, sub-leaf 0, EBX bit 19 */
wolfSSL 4:1b0d80432c79 105 cpuid_check = 1 ; /* remember that detection has been done */
wolfSSL 4:1b0d80432c79 106 return 0 ;
wolfSSL 4:1b0d80432c79 107 }
wolfSSL 4:1b0d80432c79 108 return 1 ;
wolfSSL 4:1b0d80432c79 109 }
wolfSSL 4:1b0d80432c79 110
wolfSSL 4:1b0d80432c79 111 #define RETURN return
wolfSSL 4:1b0d80432c79 112 #define IF_HAVE_INTEL_MULX(func, ret) \
wolfSSL 4:1b0d80432c79 113 if(cpuid_check==0)set_cpuid_flags() ; \
wolfSSL 4:1b0d80432c79 114 if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
wolfSSL 4:1b0d80432c79 115
wolfSSL 4:1b0d80432c79 116 #else
wolfSSL 4:1b0d80432c79 117 #define IF_HAVE_INTEL_MULX(func, ret)
wolfSSL 4:1b0d80432c79 118 #endif
wolfSSL 4:1b0d80432c79 119
wolfSSL 4:1b0d80432c79 120 #if defined(TFM_X86) && !defined(TFM_SSE2)
wolfSSL 4:1b0d80432c79 121 /* x86-32 code */
wolfSSL 4:1b0d80432c79 122
wolfSSL 4:1b0d80432c79 123 #define MONT_START
wolfSSL 4:1b0d80432c79 124 #define MONT_FINI
wolfSSL 4:1b0d80432c79 125 #define LOOP_END
wolfSSL 4:1b0d80432c79 126 #define LOOP_START \
wolfSSL 4:1b0d80432c79 127 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 128
wolfSSL 4:1b0d80432c79 129 #define INNERMUL \
wolfSSL 4:1b0d80432c79 130 __asm__( \
wolfSSL 4:1b0d80432c79 131 "movl %5,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 132 "mull %4 \n\t" \
wolfSSL 4:1b0d80432c79 133 "addl %1,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 134 "adcl $0,%%edx \n\t" \
wolfSSL 4:1b0d80432c79 135 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 136 "adcl $0,%%edx \n\t" \
wolfSSL 4:1b0d80432c79 137 "movl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 138 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 4:1b0d80432c79 139 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
wolfSSL 4:1b0d80432c79 140 : "%eax", "%edx", "cc")
wolfSSL 4:1b0d80432c79 141
wolfSSL 4:1b0d80432c79 142 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 143 __asm__( \
wolfSSL 4:1b0d80432c79 144 "addl %1,%0 \n\t" \
wolfSSL 4:1b0d80432c79 145 "setb %%al \n\t" \
wolfSSL 4:1b0d80432c79 146 "movzbl %%al,%1 \n\t" \
wolfSSL 4:1b0d80432c79 147 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 4:1b0d80432c79 148 :"0"(_c[LO]), "1"(cy) \
wolfSSL 4:1b0d80432c79 149 : "%eax", "cc")
wolfSSL 4:1b0d80432c79 150
wolfSSL 4:1b0d80432c79 151 /******************************************************************/
wolfSSL 4:1b0d80432c79 152 #elif defined(TFM_X86_64)
wolfSSL 4:1b0d80432c79 153 /* x86-64 code */
wolfSSL 4:1b0d80432c79 154
wolfSSL 4:1b0d80432c79 155 #define MONT_START
wolfSSL 4:1b0d80432c79 156 #define MONT_FINI
wolfSSL 4:1b0d80432c79 157 #define LOOP_END
wolfSSL 4:1b0d80432c79 158 #define LOOP_START \
wolfSSL 4:1b0d80432c79 159 mu = c[x] * mp;
wolfSSL 4:1b0d80432c79 160
wolfSSL 4:1b0d80432c79 161 #define INNERMUL \
wolfSSL 4:1b0d80432c79 162 __asm__( \
wolfSSL 4:1b0d80432c79 163 "movq %5,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 164 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 165 "addq %1,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 166 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 167 "addq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 168 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 169 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 170 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 4:1b0d80432c79 171 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
wolfSSL 4:1b0d80432c79 172 : "%rax", "%rdx", "cc")
wolfSSL 4:1b0d80432c79 173
wolfSSL 4:1b0d80432c79 174 #if defined(HAVE_INTEL_MULX)
wolfSSL 4:1b0d80432c79 175 #define MULX_INIT(a0, c0, cy)\
wolfSSL 4:1b0d80432c79 176 __asm__ volatile( \
wolfSSL 4:1b0d80432c79 177 "xorq %%r10, %%r10\n\t" \
wolfSSL 4:1b0d80432c79 178 "movq %1,%%rdx\n\t" \
wolfSSL 4:1b0d80432c79 179 "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \
wolfSSL 4:1b0d80432c79 180 "adoxq %%r10, %%r10\n\t" /* Reset OF */ \
wolfSSL 4:1b0d80432c79 181 :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9", "%r10","%r11","%r12","%rdx") ; \
wolfSSL 4:1b0d80432c79 182
wolfSSL 4:1b0d80432c79 183 #define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
wolfSSL 4:1b0d80432c79 184 { \
wolfSSL 4:1b0d80432c79 185 __asm__ volatile ( \
wolfSSL 4:1b0d80432c79 186 "movq %3, %%rdx\n\t" \
wolfSSL 4:1b0d80432c79 187 "mulx %%r11,%%r9, %%r8 \n\t" \
wolfSSL 4:1b0d80432c79 188 "movq %2, %%r12\n\t" \
wolfSSL 4:1b0d80432c79 189 "adoxq %%r9,%0 \n\t" \
wolfSSL 4:1b0d80432c79 190 "adcxq %%r8,%1 \n\t" \
wolfSSL 4:1b0d80432c79 191 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 4:1b0d80432c79 192 ); }
wolfSSL 4:1b0d80432c79 193
wolfSSL 4:1b0d80432c79 194
wolfSSL 4:1b0d80432c79 195 #define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
wolfSSL 4:1b0d80432c79 196 { \
wolfSSL 4:1b0d80432c79 197 __asm__ volatile ( \
wolfSSL 4:1b0d80432c79 198 "movq %3, %%rdx\n\t" \
wolfSSL 4:1b0d80432c79 199 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 4:1b0d80432c79 200 "movq %2, %%r11\n\t" \
wolfSSL 4:1b0d80432c79 201 "adoxq %%r9,%0 \n\t" \
wolfSSL 4:1b0d80432c79 202 "adcxq %%r8,%1 \n\t" \
wolfSSL 4:1b0d80432c79 203 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 4:1b0d80432c79 204 ); }
wolfSSL 4:1b0d80432c79 205
wolfSSL 4:1b0d80432c79 206 #define MULX_LOAD_R1(val)\
wolfSSL 4:1b0d80432c79 207 __asm__ volatile ( \
wolfSSL 4:1b0d80432c79 208 "movq %0, %%r11\n\t"\
wolfSSL 4:1b0d80432c79 209 ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\
wolfSSL 4:1b0d80432c79 210 ) ;
wolfSSL 4:1b0d80432c79 211
wolfSSL 4:1b0d80432c79 212 #define MULX_INNERMUL_LAST(c0, c1, rdx)\
wolfSSL 4:1b0d80432c79 213 { \
wolfSSL 4:1b0d80432c79 214 __asm__ volatile ( \
wolfSSL 4:1b0d80432c79 215 "movq %2, %%rdx\n\t" \
wolfSSL 4:1b0d80432c79 216 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 4:1b0d80432c79 217 "movq $0, %%r10 \n\t" \
wolfSSL 4:1b0d80432c79 218 "adoxq %%r10, %%r9 \n\t" \
wolfSSL 4:1b0d80432c79 219 "adcq $0,%%r8 \n\t" \
wolfSSL 4:1b0d80432c79 220 "addq %%r9,%0 \n\t" \
wolfSSL 4:1b0d80432c79 221 "adcq $0,%%r8 \n\t" \
wolfSSL 4:1b0d80432c79 222 "movq %%r8,%1 \n\t" \
wolfSSL 4:1b0d80432c79 223 :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\
wolfSSL 4:1b0d80432c79 224 ); }
wolfSSL 4:1b0d80432c79 225
wolfSSL 4:1b0d80432c79 226 #define MULX_INNERMUL8(x,y,z,cy)\
wolfSSL 4:1b0d80432c79 227 { word64 rdx = y ;\
wolfSSL 4:1b0d80432c79 228 MULX_LOAD_R1(x[0]) ;\
wolfSSL 4:1b0d80432c79 229 MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
wolfSSL 4:1b0d80432c79 230 MULX_INNERMUL_R1(_c0, _c1, x[1], rdx) ;\
wolfSSL 4:1b0d80432c79 231 MULX_INNERMUL_R2(_c1, _c2, x[2], rdx) ;\
wolfSSL 4:1b0d80432c79 232 MULX_INNERMUL_R1(_c2, _c3, x[3], rdx) ;\
wolfSSL 4:1b0d80432c79 233 MULX_INNERMUL_R2(_c3, _c4, x[4], rdx) ;\
wolfSSL 4:1b0d80432c79 234 MULX_INNERMUL_R1(_c4, _c5, x[5], rdx) ;\
wolfSSL 4:1b0d80432c79 235 MULX_INNERMUL_R2(_c5, _c6, x[6], rdx) ;\
wolfSSL 4:1b0d80432c79 236 MULX_INNERMUL_R1(_c6, _c7, x[7], rdx) ;\
wolfSSL 4:1b0d80432c79 237 MULX_INNERMUL_LAST(_c7, cy, rdx) ;\
wolfSSL 4:1b0d80432c79 238 }
wolfSSL 4:1b0d80432c79 239 #define INNERMUL8_MULX \
wolfSSL 4:1b0d80432c79 240 {\
wolfSSL 4:1b0d80432c79 241 MULX_INNERMUL8(tmpm, mu, _c, cy);\
wolfSSL 4:1b0d80432c79 242 }
wolfSSL 4:1b0d80432c79 243 #endif
wolfSSL 4:1b0d80432c79 244
wolfSSL 4:1b0d80432c79 245 #define INNERMUL8 \
wolfSSL 4:1b0d80432c79 246 __asm__( \
wolfSSL 4:1b0d80432c79 247 "movq 0(%5),%%rax \n\t" \
wolfSSL 4:1b0d80432c79 248 "movq 0(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 249 "movq 0x8(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 250 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 251 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 252 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 253 "movq 0x8(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 254 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 255 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 256 "movq %%rax,0(%0) \n\t" \
wolfSSL 4:1b0d80432c79 257 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 258 \
wolfSSL 4:1b0d80432c79 259 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 260 "movq 0x10(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 261 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 262 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 263 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 264 "movq 0x10(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 265 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 266 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 267 "movq %%rax,0x8(%0) \n\t" \
wolfSSL 4:1b0d80432c79 268 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 269 \
wolfSSL 4:1b0d80432c79 270 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 271 "movq 0x18(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 272 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 273 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 274 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 275 "movq 0x18(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 276 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 277 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 278 "movq %%rax,0x10(%0) \n\t" \
wolfSSL 4:1b0d80432c79 279 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 280 \
wolfSSL 4:1b0d80432c79 281 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 282 "movq 0x20(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 283 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 284 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 285 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 286 "movq 0x20(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 287 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 288 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 289 "movq %%rax,0x18(%0) \n\t" \
wolfSSL 4:1b0d80432c79 290 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 291 \
wolfSSL 4:1b0d80432c79 292 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 293 "movq 0x28(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 294 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 295 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 296 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 297 "movq 0x28(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 298 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 299 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 300 "movq %%rax,0x20(%0) \n\t" \
wolfSSL 4:1b0d80432c79 301 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 302 \
wolfSSL 4:1b0d80432c79 303 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 304 "movq 0x30(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 305 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 306 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 307 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 308 "movq 0x30(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 309 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 310 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 311 "movq %%rax,0x28(%0) \n\t" \
wolfSSL 4:1b0d80432c79 312 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 313 \
wolfSSL 4:1b0d80432c79 314 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 315 "movq 0x38(%5),%%r11 \n\t" \
wolfSSL 4:1b0d80432c79 316 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 317 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 318 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 319 "movq 0x38(%2),%%r10 \n\t" \
wolfSSL 4:1b0d80432c79 320 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 321 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 322 "movq %%rax,0x30(%0) \n\t" \
wolfSSL 4:1b0d80432c79 323 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 324 \
wolfSSL 4:1b0d80432c79 325 "movq %%r11,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 326 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 327 "addq %%r10,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 328 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 329 "addq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 330 "adcq $0,%%rdx \n\t" \
wolfSSL 4:1b0d80432c79 331 "movq %%rax,0x38(%0) \n\t" \
wolfSSL 4:1b0d80432c79 332 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 333 \
wolfSSL 4:1b0d80432c79 334 :"=r"(_c), "=r"(cy) \
wolfSSL 4:1b0d80432c79 335 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
wolfSSL 4:1b0d80432c79 336 : "%rax", "%rdx", "%r10", "%r11", "cc")\
wolfSSL 4:1b0d80432c79 337
wolfSSL 4:1b0d80432c79 338 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 339 __asm__( \
wolfSSL 4:1b0d80432c79 340 "addq %1,%0 \n\t" \
wolfSSL 4:1b0d80432c79 341 "setb %%al \n\t" \
wolfSSL 4:1b0d80432c79 342 "movzbq %%al,%1 \n\t" \
wolfSSL 4:1b0d80432c79 343 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 4:1b0d80432c79 344 :"0"(_c[LO]), "1"(cy) \
wolfSSL 4:1b0d80432c79 345 : "%rax", "cc")
wolfSSL 4:1b0d80432c79 346
wolfSSL 4:1b0d80432c79 347 /******************************************************************/
wolfSSL 4:1b0d80432c79 348 #elif defined(TFM_SSE2)
wolfSSL 4:1b0d80432c79 349 /* SSE2 code (assumes 32-bit fp_digits) */
wolfSSL 4:1b0d80432c79 350 /* MMX register assignments (the asm below uses %mm registers, not %xmm):
wolfSSL 4:1b0d80432c79 351 * mm0 *tmpm++, then Mu * (*tmpm++)
wolfSSL 4:1b0d80432c79 352 * mm1 c[x], then Mu
wolfSSL 4:1b0d80432c79 353 * mm2 mp
wolfSSL 4:1b0d80432c79 354 * mm3 cy
wolfSSL 4:1b0d80432c79 355 * mm4 _c[LO]
wolfSSL 4:1b0d80432c79 356 */
wolfSSL 4:1b0d80432c79 357
wolfSSL 4:1b0d80432c79 358 #define MONT_START \
wolfSSL 4:1b0d80432c79 359 __asm__("movd %0,%%mm2"::"g"(mp))
wolfSSL 4:1b0d80432c79 360
wolfSSL 4:1b0d80432c79 361 #define MONT_FINI \
wolfSSL 4:1b0d80432c79 362 __asm__("emms")
wolfSSL 4:1b0d80432c79 363
wolfSSL 4:1b0d80432c79 364 #define LOOP_START \
wolfSSL 4:1b0d80432c79 365 __asm__( \
wolfSSL 4:1b0d80432c79 366 "movd %0,%%mm1 \n\t" \
wolfSSL 4:1b0d80432c79 367 "pxor %%mm3,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 368 "pmuludq %%mm2,%%mm1 \n\t" \
wolfSSL 4:1b0d80432c79 369 :: "g"(c[x]))
wolfSSL 4:1b0d80432c79 370
wolfSSL 4:1b0d80432c79 371 /* pmuludq on mmx registers does a 32x32->64 multiply. */
wolfSSL 4:1b0d80432c79 372 #define INNERMUL \
wolfSSL 4:1b0d80432c79 373 __asm__( \
wolfSSL 4:1b0d80432c79 374 "movd %1,%%mm4 \n\t" \
wolfSSL 4:1b0d80432c79 375 "movd %2,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 376 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 377 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 378 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 379 "movd %%mm3,%0 \n\t" \
wolfSSL 4:1b0d80432c79 380 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 381 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
wolfSSL 4:1b0d80432c79 382
wolfSSL 4:1b0d80432c79 383 #define INNERMUL8 \
wolfSSL 4:1b0d80432c79 384 __asm__( \
wolfSSL 4:1b0d80432c79 385 "movd 0(%1),%%mm4 \n\t" \
wolfSSL 4:1b0d80432c79 386 "movd 0(%2),%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 387 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 388 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 389 "movd 4(%2),%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 390 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 391 "movd 4(%1),%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 392 "movd %%mm3,0(%0) \n\t" \
wolfSSL 4:1b0d80432c79 393 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 394 \
wolfSSL 4:1b0d80432c79 395 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 396 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 397 "movd 8(%2),%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 398 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 399 "movd 8(%1),%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 400 "movd %%mm3,4(%0) \n\t" \
wolfSSL 4:1b0d80432c79 401 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 402 \
wolfSSL 4:1b0d80432c79 403 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 404 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 405 "movd 12(%2),%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 406 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 407 "movd 12(%1),%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 408 "movd %%mm3,8(%0) \n\t" \
wolfSSL 4:1b0d80432c79 409 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 410 \
wolfSSL 4:1b0d80432c79 411 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 412 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 413 "movd 16(%2),%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 414 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 415 "movd 16(%1),%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 416 "movd %%mm3,12(%0) \n\t" \
wolfSSL 4:1b0d80432c79 417 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 418 \
wolfSSL 4:1b0d80432c79 419 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 420 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 421 "movd 20(%2),%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 422 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 423 "movd 20(%1),%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 424 "movd %%mm3,16(%0) \n\t" \
wolfSSL 4:1b0d80432c79 425 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 426 \
wolfSSL 4:1b0d80432c79 427 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 428 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 429 "movd 24(%2),%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 430 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 431 "movd 24(%1),%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 432 "movd %%mm3,20(%0) \n\t" \
wolfSSL 4:1b0d80432c79 433 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 434 \
wolfSSL 4:1b0d80432c79 435 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 436 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 4:1b0d80432c79 437 "movd 28(%2),%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 438 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 439 "movd 28(%1),%%mm6 \n\t" \
wolfSSL 4:1b0d80432c79 440 "movd %%mm3,24(%0) \n\t" \
wolfSSL 4:1b0d80432c79 441 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 442 \
wolfSSL 4:1b0d80432c79 443 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 444 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 4:1b0d80432c79 445 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 446 "movd %%mm3,28(%0) \n\t" \
wolfSSL 4:1b0d80432c79 447 "psrlq $32, %%mm3 \n\t" \
wolfSSL 4:1b0d80432c79 448 :"=r"(_c) : "0"(_c), "r"(tmpm) );
wolfSSL 4:1b0d80432c79 449
wolfSSL 4:1b0d80432c79 450 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
wolfSSL 4:1b0d80432c79 451 pointer */
wolfSSL 4:1b0d80432c79 452
wolfSSL 4:1b0d80432c79 453 #define LOOP_END \
wolfSSL 4:1b0d80432c79 454 __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
wolfSSL 4:1b0d80432c79 455
wolfSSL 4:1b0d80432c79 456 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 457 __asm__( \
wolfSSL 4:1b0d80432c79 458 "addl %1,%0 \n\t" \
wolfSSL 4:1b0d80432c79 459 "setb %%al \n\t" \
wolfSSL 4:1b0d80432c79 460 "movzbl %%al,%1 \n\t" \
wolfSSL 4:1b0d80432c79 461 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 4:1b0d80432c79 462 :"0"(_c[LO]), "1"(cy) \
wolfSSL 4:1b0d80432c79 463 : "%eax", "cc")
wolfSSL 4:1b0d80432c79 464
wolfSSL 4:1b0d80432c79 465 /******************************************************************/
wolfSSL 4:1b0d80432c79 466 #elif defined(TFM_ARM)
wolfSSL 4:1b0d80432c79 467 /* ARMv4 code */
wolfSSL 4:1b0d80432c79 468
wolfSSL 4:1b0d80432c79 469 #define MONT_START
wolfSSL 4:1b0d80432c79 470 #define MONT_FINI
wolfSSL 4:1b0d80432c79 471 #define LOOP_END
wolfSSL 4:1b0d80432c79 472 #define LOOP_START \
wolfSSL 4:1b0d80432c79 473 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 474
wolfSSL 4:1b0d80432c79 475
wolfSSL 4:1b0d80432c79 476 #ifdef __thumb__
wolfSSL 4:1b0d80432c79 477
wolfSSL 4:1b0d80432c79 478 #define INNERMUL \
wolfSSL 4:1b0d80432c79 479 __asm__( \
wolfSSL 4:1b0d80432c79 480 " LDR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 481 " ADDS r0,r0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 482 " ITE CS \n\t" \
wolfSSL 4:1b0d80432c79 483 " MOVCS %0,#1 \n\t" \
wolfSSL 4:1b0d80432c79 484 " MOVCC %0,#0 \n\t" \
wolfSSL 4:1b0d80432c79 485 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 486 " STR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 487 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
wolfSSL 4:1b0d80432c79 488
wolfSSL 4:1b0d80432c79 489 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 490 __asm__( \
wolfSSL 4:1b0d80432c79 491 " LDR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 492 " ADDS r0,r0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 493 " STR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 494 " ITE CS \n\t" \
wolfSSL 4:1b0d80432c79 495 " MOVCS %0,#1 \n\t" \
wolfSSL 4:1b0d80432c79 496 " MOVCC %0,#0 \n\t" \
wolfSSL 4:1b0d80432c79 497 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
wolfSSL 4:1b0d80432c79 498
wolfSSL 4:1b0d80432c79 499
wolfSSL 4:1b0d80432c79 500 /* TAO thumb mode uses ite (if then else) to detect carry directly
wolfSSL 4:1b0d80432c79 501 * fixed unmatched constraint warning by changing 1 to m */
wolfSSL 4:1b0d80432c79 502
wolfSSL 4:1b0d80432c79 503 #else /* __thumb__ */
wolfSSL 4:1b0d80432c79 504
wolfSSL 4:1b0d80432c79 505 #define INNERMUL \
wolfSSL 4:1b0d80432c79 506 __asm__( \
wolfSSL 4:1b0d80432c79 507 " LDR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 508 " ADDS r0,r0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 509 " MOVCS %0,#1 \n\t" \
wolfSSL 4:1b0d80432c79 510 " MOVCC %0,#0 \n\t" \
wolfSSL 4:1b0d80432c79 511 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 512 " STR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 513 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
wolfSSL 4:1b0d80432c79 514
wolfSSL 4:1b0d80432c79 515 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 516 __asm__( \
wolfSSL 4:1b0d80432c79 517 " LDR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 518 " ADDS r0,r0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 519 " STR r0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 520 " MOVCS %0,#1 \n\t" \
wolfSSL 4:1b0d80432c79 521 " MOVCC %0,#0 \n\t" \
wolfSSL 4:1b0d80432c79 522 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
wolfSSL 4:1b0d80432c79 523
wolfSSL 4:1b0d80432c79 524 #endif /* __thumb__ */
wolfSSL 4:1b0d80432c79 525
wolfSSL 4:1b0d80432c79 526 #elif defined(TFM_PPC32)
wolfSSL 4:1b0d80432c79 527
wolfSSL 4:1b0d80432c79 528 /* PPC32 */
wolfSSL 4:1b0d80432c79 529 #define MONT_START
wolfSSL 4:1b0d80432c79 530 #define MONT_FINI
wolfSSL 4:1b0d80432c79 531 #define LOOP_END
wolfSSL 4:1b0d80432c79 532 #define LOOP_START \
wolfSSL 4:1b0d80432c79 533 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 534
wolfSSL 4:1b0d80432c79 535 #define INNERMUL \
wolfSSL 4:1b0d80432c79 536 __asm__( \
wolfSSL 4:1b0d80432c79 537 " mullw 16,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 538 " mulhwu 17,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 539 " addc 16,16,%0 \n\t" \
wolfSSL 4:1b0d80432c79 540 " addze 17,17 \n\t" \
wolfSSL 4:1b0d80432c79 541 " lwz 18,%1 \n\t" \
wolfSSL 4:1b0d80432c79 542 " addc 16,16,18 \n\t" \
wolfSSL 4:1b0d80432c79 543 " addze %0,17 \n\t" \
wolfSSL 4:1b0d80432c79 544 " stw 16,%1 \n\t" \
wolfSSL 4:1b0d80432c79 545 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
wolfSSL 4:1b0d80432c79 546
wolfSSL 4:1b0d80432c79 547 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 548 __asm__( \
wolfSSL 4:1b0d80432c79 549 " lwz 16,%1 \n\t" \
wolfSSL 4:1b0d80432c79 550 " addc 16,16,%0 \n\t" \
wolfSSL 4:1b0d80432c79 551 " stw 16,%1 \n\t" \
wolfSSL 4:1b0d80432c79 552 " xor %0,%0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 553 " addze %0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 554 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
wolfSSL 4:1b0d80432c79 555
wolfSSL 4:1b0d80432c79 556 #elif defined(TFM_PPC64)
wolfSSL 4:1b0d80432c79 557
wolfSSL 4:1b0d80432c79 558 /* PPC64 */
wolfSSL 4:1b0d80432c79 559 #define MONT_START
wolfSSL 4:1b0d80432c79 560 #define MONT_FINI
wolfSSL 4:1b0d80432c79 561 #define LOOP_END
wolfSSL 4:1b0d80432c79 562 #define LOOP_START \
wolfSSL 4:1b0d80432c79 563 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 564
wolfSSL 4:1b0d80432c79 565 #define INNERMUL \
wolfSSL 4:1b0d80432c79 566 __asm__( \
wolfSSL 4:1b0d80432c79 567 " mulld 16,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 568 " mulhdu 17,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 569 " addc 16,16,%0 \n\t" \
wolfSSL 4:1b0d80432c79 570 " addze 17,17 \n\t" \
wolfSSL 4:1b0d80432c79 571 " ldx 18,0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 572 " addc 16,16,18 \n\t" \
wolfSSL 4:1b0d80432c79 573 " addze %0,17 \n\t" \
wolfSSL 4:1b0d80432c79 574 " sdx 16,0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 575 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
wolfSSL 4:1b0d80432c79 576
wolfSSL 4:1b0d80432c79 577 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 578 __asm__( \
wolfSSL 4:1b0d80432c79 579 " ldx 16,0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 580 " addc 16,16,%0 \n\t" \
wolfSSL 4:1b0d80432c79 581 " sdx 16,0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 582 " xor %0,%0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 583 " addze %0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 584 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
wolfSSL 4:1b0d80432c79 585
wolfSSL 4:1b0d80432c79 586 /******************************************************************/
wolfSSL 4:1b0d80432c79 587
wolfSSL 4:1b0d80432c79 588 #elif defined(TFM_AVR32)
wolfSSL 4:1b0d80432c79 589
wolfSSL 4:1b0d80432c79 590 /* AVR32 */
wolfSSL 4:1b0d80432c79 591 #define MONT_START
wolfSSL 4:1b0d80432c79 592 #define MONT_FINI
wolfSSL 4:1b0d80432c79 593 #define LOOP_END
wolfSSL 4:1b0d80432c79 594 #define LOOP_START \
wolfSSL 4:1b0d80432c79 595 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 596
wolfSSL 4:1b0d80432c79 597 #define INNERMUL \
wolfSSL 4:1b0d80432c79 598 __asm__( \
wolfSSL 4:1b0d80432c79 599 " ld.w r2,%1 \n\t" \
wolfSSL 4:1b0d80432c79 600 " add r2,%0 \n\t" \
wolfSSL 4:1b0d80432c79 601 " eor r3,r3 \n\t" \
wolfSSL 4:1b0d80432c79 602 " acr r3 \n\t" \
wolfSSL 4:1b0d80432c79 603 " macu.d r2,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 604 " st.w %1,r2 \n\t" \
wolfSSL 4:1b0d80432c79 605 " mov %0,r3 \n\t" \
wolfSSL 4:1b0d80432c79 606 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
wolfSSL 4:1b0d80432c79 607
wolfSSL 4:1b0d80432c79 608 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 609 __asm__( \
wolfSSL 4:1b0d80432c79 610 " ld.w r2,%1 \n\t" \
wolfSSL 4:1b0d80432c79 611 " add r2,%0 \n\t" \
wolfSSL 4:1b0d80432c79 612 " st.w %1,r2 \n\t" \
wolfSSL 4:1b0d80432c79 613 " eor %0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 614 " acr %0 \n\t" \
wolfSSL 4:1b0d80432c79 615 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
wolfSSL 4:1b0d80432c79 616
wolfSSL 4:1b0d80432c79 617 #else
wolfSSL 4:1b0d80432c79 618
wolfSSL 4:1b0d80432c79 619 /* ISO C code */
wolfSSL 4:1b0d80432c79 620 #define MONT_START
wolfSSL 4:1b0d80432c79 621 #define MONT_FINI
wolfSSL 4:1b0d80432c79 622 #define LOOP_END
wolfSSL 4:1b0d80432c79 623 #define LOOP_START \
wolfSSL 4:1b0d80432c79 624 mu = c[x] * mp
wolfSSL 4:1b0d80432c79 625
wolfSSL 4:1b0d80432c79 626 #define INNERMUL \
wolfSSL 4:1b0d80432c79 627 do { fp_word t; \
wolfSSL 4:1b0d80432c79 628 t = ((fp_word)_c[0] + (fp_word)cy) + \
wolfSSL 4:1b0d80432c79 629 (((fp_word)mu) * ((fp_word)*tmpm++)); \
wolfSSL 4:1b0d80432c79 630 _c[0] = (fp_digit)t; \
wolfSSL 4:1b0d80432c79 631 cy = (fp_digit)(t >> DIGIT_BIT); \
wolfSSL 4:1b0d80432c79 632 } while (0)
wolfSSL 4:1b0d80432c79 633
wolfSSL 4:1b0d80432c79 634 #define PROPCARRY \
wolfSSL 4:1b0d80432c79 635 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
wolfSSL 4:1b0d80432c79 636
wolfSSL 4:1b0d80432c79 637 #endif
wolfSSL 4:1b0d80432c79 638 /******************************************************************/
wolfSSL 4:1b0d80432c79 639
wolfSSL 4:1b0d80432c79 640
wolfSSL 4:1b0d80432c79 641 #define LO 0
wolfSSL 4:1b0d80432c79 642 /* end fp_montgomery_reduce.c asm */
wolfSSL 4:1b0d80432c79 643
wolfSSL 4:1b0d80432c79 644
wolfSSL 4:1b0d80432c79 645 /* start fp_sqr_comba.c asm */
wolfSSL 4:1b0d80432c79 646 #if defined(TFM_X86)
wolfSSL 4:1b0d80432c79 647
wolfSSL 4:1b0d80432c79 648 /* x86-32 optimized */
wolfSSL 4:1b0d80432c79 649
/* x86-32 squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2.
 * The non-empty macros already end in ';'. */
wolfSSL 4:1b0d80432c79 650 #define COMBA_START
wolfSSL 4:1b0d80432c79 651
wolfSSL 4:1b0d80432c79 652 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 653 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 654
wolfSSL 4:1b0d80432c79 655 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 656 x = c0;
wolfSSL 4:1b0d80432c79 657
wolfSSL 4:1b0d80432c79 658 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 659 x = c1;
wolfSSL 4:1b0d80432c79 660
wolfSSL 4:1b0d80432c79 661 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 662 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 663
wolfSSL 4:1b0d80432c79 664 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 665
/* x86-32: c2:c1:c0 += i * i. i (a memory operand) is loaded into eax and
 * squared with mull, leaving the product in edx:eax, which is then added in
 * with carry. j is not referenced. */
wolfSSL 4:1b0d80432c79 666 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 667 __asm__( \
wolfSSL 4:1b0d80432c79 668 "movl %6,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 669 "mull %%eax \n\t" \
wolfSSL 4:1b0d80432c79 670 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 671 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 672 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 673 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 674
/* x86-32: c2:c1:c0 += 2 * (i * j). The edx:eax product of i and j is added
 * to the accumulator twice, each time with full carry propagation. */
wolfSSL 4:1b0d80432c79 675 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 676 __asm__( \
wolfSSL 4:1b0d80432c79 677 "movl %6,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 678 "mull %7 \n\t" \
wolfSSL 4:1b0d80432c79 679 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 680 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 681 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 682 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 683 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 684 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 685 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");
wolfSSL 4:1b0d80432c79 686
/* x86-32: sc2:sc1:sc0 = i * j. The low word of the product goes to sc0, the
 * high word to sc1, and sc2 is zeroed with xorl. sc0..sc2 are outputs only,
 * so i and j are operands %3 and %4. */
wolfSSL 4:1b0d80432c79 687 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 688 __asm__( \
wolfSSL 4:1b0d80432c79 689 "movl %3,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 690 "mull %4 \n\t" \
wolfSSL 4:1b0d80432c79 691 "movl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 692 "movl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 693 "xorl %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 694 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 695
wolfSSL 4:1b0d80432c79 696 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 4:1b0d80432c79 697
/* x86-32: sc2:sc1:sc0 += i * j. The edx:eax product is added into the
 * secondary accumulator with carry. */
wolfSSL 4:1b0d80432c79 698 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 699 __asm__( \
wolfSSL 4:1b0d80432c79 700 "movl %6,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 701 "mull %7 \n\t" \
wolfSSL 4:1b0d80432c79 702 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 703 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 704 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 705 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 706
/* x86-32: c2:c1:c0 += 2 * sc2:sc1:sc0. The sc triple (operands %6..%8) is
 * added to the main accumulator twice, with carry between words. */
wolfSSL 4:1b0d80432c79 707 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 708 __asm__( \
wolfSSL 4:1b0d80432c79 709 "addl %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 710 "adcl %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 711 "adcl %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 712 "addl %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 713 "adcl %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 714 "adcl %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 715 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 4:1b0d80432c79 716
wolfSSL 4:1b0d80432c79 717 #elif defined(TFM_X86_64)
wolfSSL 4:1b0d80432c79 718 /* x86-64 optimized */
wolfSSL 4:1b0d80432c79 719
/* x86-64 squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. */
wolfSSL 4:1b0d80432c79 720 #define COMBA_START
wolfSSL 4:1b0d80432c79 721
wolfSSL 4:1b0d80432c79 722 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 723 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 724
wolfSSL 4:1b0d80432c79 725 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 726 x = c0;
wolfSSL 4:1b0d80432c79 727
wolfSSL 4:1b0d80432c79 728 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 729 x = c1;
wolfSSL 4:1b0d80432c79 730
wolfSSL 4:1b0d80432c79 731 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 732 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 733
wolfSSL 4:1b0d80432c79 734 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 735
/* x86-64: c2:c1:c0 += i * i. i is loaded into rax and squared with mulq
 * (product in rdx:rax), then added in with carry. j is not referenced. */
wolfSSL 4:1b0d80432c79 736 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 737 __asm__( \
wolfSSL 4:1b0d80432c79 738 "movq %6,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 739 "mulq %%rax \n\t" \
wolfSSL 4:1b0d80432c79 740 "addq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 741 "adcq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 742 "adcq $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 743 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
wolfSSL 4:1b0d80432c79 744
/* x86-64: c2:c1:c0 += 2 * (i * j). The rdx:rax product is added to the
 * accumulator twice, each time with full carry propagation. */
wolfSSL 4:1b0d80432c79 745 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 746 __asm__( \
wolfSSL 4:1b0d80432c79 747 "movq %6,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 748 "mulq %7 \n\t" \
wolfSSL 4:1b0d80432c79 749 "addq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 750 "adcq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 751 "adcq $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 752 "addq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 753 "adcq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 754 "adcq $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 755 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 4:1b0d80432c79 756
/* x86-64: sc2:sc1:sc0 = i * j. Low word of the product to sc0, high word to
 * sc1, sc2 zeroed with xorq. sc0..sc2 are outputs only, so i and j are
 * operands %3 and %4. */
wolfSSL 4:1b0d80432c79 757 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 758 __asm__( \
wolfSSL 4:1b0d80432c79 759 "movq %3,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 760 "mulq %4 \n\t" \
wolfSSL 4:1b0d80432c79 761 "movq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 762 "movq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 763 "xorq %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 764 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 4:1b0d80432c79 765
wolfSSL 4:1b0d80432c79 766 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 4:1b0d80432c79 767
/* x86-64: sc2:sc1:sc0 += i * j. The rdx:rax product is added into the
 * secondary accumulator with carry. */
wolfSSL 4:1b0d80432c79 768 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 769 __asm__( \
wolfSSL 4:1b0d80432c79 770 "movq %6,%%rax \n\t" \
wolfSSL 4:1b0d80432c79 771 "mulq %7 \n\t" \
wolfSSL 4:1b0d80432c79 772 "addq %%rax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 773 "adcq %%rdx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 774 "adcq $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 775 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 4:1b0d80432c79 776
/* x86-64: c2:c1:c0 += 2 * sc2:sc1:sc0. The sc triple (operands %6..%8) is
 * added to the main accumulator twice, with carry between words. */
wolfSSL 4:1b0d80432c79 777 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 778 __asm__( \
wolfSSL 4:1b0d80432c79 779 "addq %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 780 "adcq %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 781 "adcq %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 782 "addq %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 783 "adcq %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 784 "adcq %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 785 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 4:1b0d80432c79 786
wolfSSL 4:1b0d80432c79 787 #elif defined(TFM_SSE2)
wolfSSL 4:1b0d80432c79 788
wolfSSL 4:1b0d80432c79 789 /* SSE2 Optimized */
/* SSE2 squaring hooks. COMBA_START expands to nothing. CLEAR_CARRY zeroes
 * c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x) stores c1;
 * CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. COMBA_FINI issues
 * emms, since the macros in this variant use the MMX registers mm0/mm1. */
wolfSSL 4:1b0d80432c79 790 #define COMBA_START
wolfSSL 4:1b0d80432c79 791
wolfSSL 4:1b0d80432c79 792 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 793 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 794
wolfSSL 4:1b0d80432c79 795 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 796 x = c0;
wolfSSL 4:1b0d80432c79 797
wolfSSL 4:1b0d80432c79 798 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 799 x = c1;
wolfSSL 4:1b0d80432c79 800
wolfSSL 4:1b0d80432c79 801 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 802 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 803
wolfSSL 4:1b0d80432c79 804 #define COMBA_FINI \
wolfSSL 4:1b0d80432c79 805 __asm__("emms");
wolfSSL 4:1b0d80432c79 806
/* SSE2: c2:c1:c0 += i * i. i is squared in mm0 with pmuludq; the low and
 * high 32-bit halves are moved through eax and added in with carry. j is not
 * referenced.
 * NOTE(review): mm0 is modified but not listed in the clobbers. */
wolfSSL 4:1b0d80432c79 807 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 808 __asm__( \
wolfSSL 4:1b0d80432c79 809 "movd %6,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 810 "pmuludq %%mm0,%%mm0\n\t" \
wolfSSL 4:1b0d80432c79 811 "movd %%mm0,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 812 "psrlq $32,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 813 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 814 "movd %%mm0,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 815 "adcl %%eax,%1 \n\t" \
wolfSSL 4:1b0d80432c79 816 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 817 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
wolfSSL 4:1b0d80432c79 818
/* SSE2: c2:c1:c0 += 2 * (i * j). pmuludq forms the product in mm0; its low
 * and high halves are copied to eax and edx and added to the accumulator
 * twice.
 * NOTE(review): mm0 and mm1 are modified but not listed in the clobbers. */
wolfSSL 4:1b0d80432c79 819 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 820 __asm__( \
wolfSSL 4:1b0d80432c79 821 "movd %6,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 822 "movd %7,%%mm1 \n\t" \
wolfSSL 4:1b0d80432c79 823 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 4:1b0d80432c79 824 "movd %%mm0,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 825 "psrlq $32,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 826 "movd %%mm0,%%edx \n\t" \
wolfSSL 4:1b0d80432c79 827 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 828 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 829 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 830 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 831 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 832 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 833 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 834
/* SSE2: sc2:sc1:sc0 = i * j. The product is formed in mm0; its low half is
 * moved to sc0, its high half to sc1, and sc2 is zeroed with xorl.
 * NOTE(review): this asm has no clobber list, although it modifies mm0, mm1
 * and (via xorl) the flags. */
wolfSSL 4:1b0d80432c79 835 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 836 __asm__( \
wolfSSL 4:1b0d80432c79 837 "movd %3,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 838 "movd %4,%%mm1 \n\t" \
wolfSSL 4:1b0d80432c79 839 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 4:1b0d80432c79 840 "movd %%mm0,%0 \n\t" \
wolfSSL 4:1b0d80432c79 841 "psrlq $32,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 842 "movd %%mm0,%1 \n\t" \
wolfSSL 4:1b0d80432c79 843 "xorl %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 844 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
wolfSSL 4:1b0d80432c79 845
wolfSSL 4:1b0d80432c79 846 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 4:1b0d80432c79 847
/* SSE2: sc2:sc1:sc0 += i * j. The mm0 product is split into eax/edx and
 * added into the secondary accumulator with carry.
 * NOTE(review): mm0 and mm1 are modified but not listed in the clobbers. */
wolfSSL 4:1b0d80432c79 848 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 849 __asm__( \
wolfSSL 4:1b0d80432c79 850 "movd %6,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 851 "movd %7,%%mm1 \n\t" \
wolfSSL 4:1b0d80432c79 852 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 4:1b0d80432c79 853 "movd %%mm0,%%eax \n\t" \
wolfSSL 4:1b0d80432c79 854 "psrlq $32,%%mm0 \n\t" \
wolfSSL 4:1b0d80432c79 855 "movd %%mm0,%%edx \n\t" \
wolfSSL 4:1b0d80432c79 856 "addl %%eax,%0 \n\t" \
wolfSSL 4:1b0d80432c79 857 "adcl %%edx,%1 \n\t" \
wolfSSL 4:1b0d80432c79 858 "adcl $0,%2 \n\t" \
wolfSSL 4:1b0d80432c79 859 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 860
/* SSE2 variant (plain integer adds): c2:c1:c0 += 2 * sc2:sc1:sc0. The sc
 * triple (operands %6..%8) is added twice, with carry between words. */
wolfSSL 4:1b0d80432c79 861 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 862 __asm__( \
wolfSSL 4:1b0d80432c79 863 "addl %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 864 "adcl %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 865 "adcl %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 866 "addl %6,%0 \n\t" \
wolfSSL 4:1b0d80432c79 867 "adcl %7,%1 \n\t" \
wolfSSL 4:1b0d80432c79 868 "adcl %8,%2 \n\t" \
wolfSSL 4:1b0d80432c79 869 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 4:1b0d80432c79 870
wolfSSL 4:1b0d80432c79 871 #elif defined(TFM_ARM)
wolfSSL 4:1b0d80432c79 872
wolfSSL 4:1b0d80432c79 873 /* ARM code */
wolfSSL 4:1b0d80432c79 874
/* ARM squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. */
wolfSSL 4:1b0d80432c79 875 #define COMBA_START
wolfSSL 4:1b0d80432c79 876
wolfSSL 4:1b0d80432c79 877 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 878 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 879
wolfSSL 4:1b0d80432c79 880 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 881 x = c0;
wolfSSL 4:1b0d80432c79 882
wolfSSL 4:1b0d80432c79 883 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 884 x = c1;
wolfSSL 4:1b0d80432c79 885
wolfSSL 4:1b0d80432c79 886 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 887 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 888
wolfSSL 4:1b0d80432c79 889 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 890
wolfSSL 4:1b0d80432c79 891 /* ARM: c2:c1:c0 += i * i. UMULL puts the square in r1:r0 (high:low), which is added in with carry; j is not referenced */
wolfSSL 4:1b0d80432c79 892 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 893 __asm__( \
wolfSSL 4:1b0d80432c79 894 " UMULL r0,r1,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 895 " ADDS %0,%0,r0 \n\t" \
wolfSSL 4:1b0d80432c79 896 " ADCS %1,%1,r1 \n\t" \
wolfSSL 4:1b0d80432c79 897 " ADC %2,%2,#0 \n\t" \
wolfSSL 4:1b0d80432c79 898 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
wolfSSL 4:1b0d80432c79 899
wolfSSL 4:1b0d80432c79 900 /* ARM: c2:c1:c0 += 2 * (i * j). For squaring some of the terms are doubled, so the r1:r0 product is added twice */
wolfSSL 4:1b0d80432c79 901 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 902 __asm__( \
wolfSSL 4:1b0d80432c79 903 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 904 " ADDS %0,%0,r0 \n\t" \
wolfSSL 4:1b0d80432c79 905 " ADCS %1,%1,r1 \n\t" \
wolfSSL 4:1b0d80432c79 906 " ADC %2,%2,#0 \n\t" \
wolfSSL 4:1b0d80432c79 907 " ADDS %0,%0,r0 \n\t" \
wolfSSL 4:1b0d80432c79 908 " ADCS %1,%1,r1 \n\t" \
wolfSSL 4:1b0d80432c79 909 " ADC %2,%2,#0 \n\t" \
wolfSSL 4:1b0d80432c79 910 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 4:1b0d80432c79 911
/* ARM: sc2:sc1:sc0 = i * j. UMULL writes the product straight into sc0 (low)
 * and sc1 (high); SUB %2,%2,%2 zeroes sc2. sc0..sc2 are outputs only, so i
 * and j are operands %3 and %4. */
wolfSSL 4:1b0d80432c79 912 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 913 __asm__( \
wolfSSL 4:1b0d80432c79 914 " UMULL %0,%1,%3,%4 \n\t" \
wolfSSL 4:1b0d80432c79 915 " SUB %2,%2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 916 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
wolfSSL 4:1b0d80432c79 917
wolfSSL 4:1b0d80432c79 918 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 4:1b0d80432c79 919
/* ARM: sc2:sc1:sc0 += i * j. The r1:r0 product from UMULL is added into the
 * secondary accumulator with carry. */
wolfSSL 4:1b0d80432c79 920 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 921 __asm__( \
wolfSSL 4:1b0d80432c79 922 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 923 " ADDS %0,%0,r0 \n\t" \
wolfSSL 4:1b0d80432c79 924 " ADCS %1,%1,r1 \n\t" \
wolfSSL 4:1b0d80432c79 925 " ADC %2,%2,#0 \n\t" \
wolfSSL 4:1b0d80432c79 926 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 4:1b0d80432c79 927
/* ARM: c2:c1:c0 += 2 * sc2:sc1:sc0. Here the sc triple comes first in the
 * input list (operands %3..%5) and is added twice with carry. */
wolfSSL 4:1b0d80432c79 928 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 929 __asm__( \
wolfSSL 4:1b0d80432c79 930 " ADDS %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 931 " ADCS %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 932 " ADC %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 933 " ADDS %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 934 " ADCS %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 935 " ADC %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 936 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 4:1b0d80432c79 937
wolfSSL 4:1b0d80432c79 938 #elif defined(TFM_PPC32)
wolfSSL 4:1b0d80432c79 939
wolfSSL 4:1b0d80432c79 940 /* PPC32 */
wolfSSL 4:1b0d80432c79 941
/* PPC32 squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. */
wolfSSL 4:1b0d80432c79 942 #define COMBA_START
wolfSSL 4:1b0d80432c79 943
wolfSSL 4:1b0d80432c79 944 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 945 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 946
wolfSSL 4:1b0d80432c79 947 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 948 x = c0;
wolfSSL 4:1b0d80432c79 949
wolfSSL 4:1b0d80432c79 950 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 951 x = c1;
wolfSSL 4:1b0d80432c79 952
wolfSSL 4:1b0d80432c79 953 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 954 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 955
wolfSSL 4:1b0d80432c79 956 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 957
wolfSSL 4:1b0d80432c79 958 /* PPC32: c2:c1:c0 += i * i. Low (mullw) and high (mulhwu) halves of the square pass through scratch register 16 and are added in with carry; j is not referenced */
wolfSSL 4:1b0d80432c79 959 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 960 __asm__( \
wolfSSL 4:1b0d80432c79 961 " mullw 16,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 962 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 963 " mulhwu 16,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 964 " adde %1,%1,16 \n\t" \
wolfSSL 4:1b0d80432c79 965 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 966 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 4:1b0d80432c79 967
wolfSSL 4:1b0d80432c79 968 /* PPC32: c2:c1:c0 += 2 * (i * j). For squaring some of the terms are doubled, so the product held in registers 16 (low) and 17 (high) is added twice */
wolfSSL 4:1b0d80432c79 969 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 970 __asm__( \
wolfSSL 4:1b0d80432c79 971 " mullw 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 972 " mulhwu 17,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 973 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 974 " adde %1,%1,17 \n\t" \
wolfSSL 4:1b0d80432c79 975 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 976 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 977 " adde %1,%1,17 \n\t" \
wolfSSL 4:1b0d80432c79 978 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 979 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 4:1b0d80432c79 980
/* PPC32: sc2:sc1:sc0 = i * j. mullw writes the low half to sc0, mulhwu the
 * high half to sc1, and xor zeroes sc2.
 * NOTE(review): sc0..sc2 are still tied in as inputs ("0","1","2") although
 * the asm only overwrites them; the x86 and ARM variants in this file dropped
 * those inputs to silence a warning. Left unchanged here because removing
 * them renumbers %6/%7 and affects register allocation. */
wolfSSL 4:1b0d80432c79 981 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 982 __asm__( \
wolfSSL 4:1b0d80432c79 983 " mullw %0,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 984 " mulhwu %1,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 985 " xor %2,%2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 986 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 4:1b0d80432c79 987
/* PPC32: sc2:sc1:sc0 += i * j. The low and high halves of the product pass
 * through scratch register 16 and are added in with carry. */
wolfSSL 4:1b0d80432c79 988 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 989 __asm__( \
wolfSSL 4:1b0d80432c79 990 " mullw 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 991 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 992 " mulhwu 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 993 " adde %1,%1,16 \n\t" \
wolfSSL 4:1b0d80432c79 994 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 995 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 4:1b0d80432c79 996
/* PPC32: c2:c1:c0 += 2 * sc2:sc1:sc0. The sc triple is operands %3..%5 and
 * is added twice using addc/adde. */
wolfSSL 4:1b0d80432c79 997 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 998 __asm__( \
wolfSSL 4:1b0d80432c79 999 " addc %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1000 " adde %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1001 " adde %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1002 " addc %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1003 " adde %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1004 " adde %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1005 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 4:1b0d80432c79 1006
wolfSSL 4:1b0d80432c79 1007 #elif defined(TFM_PPC64)
wolfSSL 4:1b0d80432c79 1008 /* PPC64 */
wolfSSL 4:1b0d80432c79 1009
/* PPC64 squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. */
wolfSSL 4:1b0d80432c79 1010 #define COMBA_START
wolfSSL 4:1b0d80432c79 1011
wolfSSL 4:1b0d80432c79 1012 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 1013 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1014
wolfSSL 4:1b0d80432c79 1015 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1016 x = c0;
wolfSSL 4:1b0d80432c79 1017
wolfSSL 4:1b0d80432c79 1018 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1019 x = c1;
wolfSSL 4:1b0d80432c79 1020
wolfSSL 4:1b0d80432c79 1021 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 1022 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1023
wolfSSL 4:1b0d80432c79 1024 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1025
wolfSSL 4:1b0d80432c79 1026 /* PPC64: c2:c1:c0 += i * i. Low (mulld) and high (mulhdu) halves of the square pass through scratch register 16 and are added in with carry; j is not referenced */
wolfSSL 4:1b0d80432c79 1027 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 1028 __asm__( \
wolfSSL 4:1b0d80432c79 1029 " mulld 16,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 1030 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 1031 " mulhdu 16,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 1032 " adde %1,%1,16 \n\t" \
wolfSSL 4:1b0d80432c79 1033 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1034 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 4:1b0d80432c79 1035
wolfSSL 4:1b0d80432c79 1036 /* PPC64: c2:c1:c0 += 2 * (i * j). For squaring some of the terms are doubled, so the product held in registers 16 (low) and 17 (high) is added twice */
wolfSSL 4:1b0d80432c79 1037 #define SQRADD2(i, j) \
wolfSSL 4:1b0d80432c79 1038 __asm__( \
wolfSSL 4:1b0d80432c79 1039 " mulld 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1040 " mulhdu 17,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1041 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 1042 " adde %1,%1,17 \n\t" \
wolfSSL 4:1b0d80432c79 1043 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1044 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 1045 " adde %1,%1,17 \n\t" \
wolfSSL 4:1b0d80432c79 1046 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1047 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 4:1b0d80432c79 1048
/* PPC64: sc2:sc1:sc0 = i * j. mulld writes the low half to sc0, mulhdu the
 * high half to sc1, and xor zeroes sc2.
 * NOTE(review): sc0..sc2 are still tied in as inputs ("0","1","2") although
 * the asm only overwrites them; the x86 and ARM variants in this file dropped
 * those inputs to silence a warning. Left unchanged here because removing
 * them renumbers %6/%7 and affects register allocation. */
wolfSSL 4:1b0d80432c79 1049 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 1050 __asm__( \
wolfSSL 4:1b0d80432c79 1051 " mulld %0,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1052 " mulhdu %1,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1053 " xor %2,%2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1054 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 4:1b0d80432c79 1055
/* PPC64: sc2:sc1:sc0 += i * j. The low and high halves of the product pass
 * through scratch register 16 and are added in with carry. */
wolfSSL 4:1b0d80432c79 1056 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 1057 __asm__( \
wolfSSL 4:1b0d80432c79 1058 " mulld 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1059 " addc %0,%0,16 \n\t" \
wolfSSL 4:1b0d80432c79 1060 " mulhdu 16,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1061 " adde %1,%1,16 \n\t" \
wolfSSL 4:1b0d80432c79 1062 " addze %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1063 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 4:1b0d80432c79 1064
/* PPC64: c2:c1:c0 += 2 * sc2:sc1:sc0. The sc triple is operands %3..%5 and
 * is added twice using addc/adde. */
wolfSSL 4:1b0d80432c79 1065 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 1066 __asm__( \
wolfSSL 4:1b0d80432c79 1067 " addc %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1068 " adde %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1069 " adde %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1070 " addc %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1071 " adde %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1072 " adde %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1073 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 4:1b0d80432c79 1074
wolfSSL 4:1b0d80432c79 1075
wolfSSL 4:1b0d80432c79 1076 #elif defined(TFM_AVR32)
wolfSSL 4:1b0d80432c79 1077
wolfSSL 4:1b0d80432c79 1078 /* AVR32 */
wolfSSL 4:1b0d80432c79 1079
/* AVR32 squaring hooks. COMBA_START and COMBA_FINI expand to nothing.
 * CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and COMBA_STORE2(x)
 * stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and clears c2. */
wolfSSL 4:1b0d80432c79 1080 #define COMBA_START
wolfSSL 4:1b0d80432c79 1081
wolfSSL 4:1b0d80432c79 1082 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 1083 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1084
wolfSSL 4:1b0d80432c79 1085 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1086 x = c0;
wolfSSL 4:1b0d80432c79 1087
wolfSSL 4:1b0d80432c79 1088 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1089 x = c1;
wolfSSL 4:1b0d80432c79 1090
wolfSSL 4:1b0d80432c79 1091 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 1092 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1093
wolfSSL 4:1b0d80432c79 1094 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1095
wolfSSL 4:1b0d80432c79 1096 /* AVR32: c2:c1:c0 += i * i. mulu.d leaves the square in the r2/r3 pair, which is added to c0 and c1; acr then folds the carry into c2. j is not referenced */
wolfSSL 4:1b0d80432c79 1097 #define SQRADD(i, j) \
wolfSSL 4:1b0d80432c79 1098 __asm__( \
wolfSSL 4:1b0d80432c79 1099 " mulu.d r2,%6,%6 \n\t" \
wolfSSL 4:1b0d80432c79 1100 " add %0,%0,r2 \n\t" \
wolfSSL 4:1b0d80432c79 1101 " adc %1,%1,r3 \n\t" \
wolfSSL 4:1b0d80432c79 1102 " acr %2 \n\t" \
wolfSSL 4:1b0d80432c79 1103 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
wolfSSL 4:1b0d80432c79 1104
/* AVR32: c2:c1:c0 += 2 * (i * j).  For squaring some of the terms are
 * doubled, so the product that mulu.d leaves in r2/r3 is added to the
 * accumulator twice, with acr folding the carry into c2 each time.
 *
 * Fix: both "acr %2," lines carried a stray trailing comma, which leaves the
 * instruction with a dangling operand separator.  The sibling SQRADD and
 * SQRADDAC macros use "acr %2"; this macro now matches them.
 */
#define SQRADD2(i, j) \
__asm__( \
" mulu.d r2,%6,%7 \n\t" \
" add %0,%0,r2 \n\t" \
" adc %1,%1,r3 \n\t" \
" acr %2 \n\t" \
" add %0,%0,r2 \n\t" \
" adc %1,%1,r3 \n\t" \
" acr %2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
wolfSSL 4:1b0d80432c79 1116
/* AVR32: sc2:sc1:sc0 = i * j. The r2/r3 product from mulu.d is copied to
 * sc0 and sc1, and eor zeroes sc2.
 * NOTE(review): sc0..sc2 are still tied in as inputs ("0","1","2") although
 * the asm only overwrites them; the x86 and ARM variants in this file dropped
 * those inputs to silence a warning. */
wolfSSL 4:1b0d80432c79 1117 #define SQRADDSC(i, j) \
wolfSSL 4:1b0d80432c79 1118 __asm__( \
wolfSSL 4:1b0d80432c79 1119 " mulu.d r2,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1120 " mov %0,r2 \n\t" \
wolfSSL 4:1b0d80432c79 1121 " mov %1,r3 \n\t" \
wolfSSL 4:1b0d80432c79 1122 " eor %2,%2 \n\t" \
wolfSSL 4:1b0d80432c79 1123 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
wolfSSL 4:1b0d80432c79 1124
/* AVR32: sc2:sc1:sc0 += i * j. The r2/r3 product is added to sc0 and sc1,
 * and acr folds the carry into sc2. */
wolfSSL 4:1b0d80432c79 1125 #define SQRADDAC(i, j) \
wolfSSL 4:1b0d80432c79 1126 __asm__( \
wolfSSL 4:1b0d80432c79 1127 " mulu.d r2,%6,%7 \n\t" \
wolfSSL 4:1b0d80432c79 1128 " add %0,%0,r2 \n\t" \
wolfSSL 4:1b0d80432c79 1129 " adc %1,%1,r3 \n\t" \
wolfSSL 4:1b0d80432c79 1130 " acr %2 \n\t" \
wolfSSL 4:1b0d80432c79 1131 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
wolfSSL 4:1b0d80432c79 1132
/* AVR32: c2:c1:c0 += 2 * sc2:sc1:sc0. The sc triple is operands %3..%5 and
 * is added twice using add/adc. */
wolfSSL 4:1b0d80432c79 1133 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 1134 __asm__( \
wolfSSL 4:1b0d80432c79 1135 " add %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1136 " adc %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1137 " adc %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1138 " add %0,%0,%3 \n\t" \
wolfSSL 4:1b0d80432c79 1139 " adc %1,%1,%4 \n\t" \
wolfSSL 4:1b0d80432c79 1140 " adc %2,%2,%5 \n\t" \
wolfSSL 4:1b0d80432c79 1141 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 4:1b0d80432c79 1142
wolfSSL 4:1b0d80432c79 1143
wolfSSL 4:1b0d80432c79 1144 #else
wolfSSL 4:1b0d80432c79 1145
/* Fallback when no assembler variant above was selected: TFM_ISO is defined
 * to mark that the portable C macros are in use. COMBA_START and COMBA_FINI
 * expand to nothing; CLEAR_CARRY zeroes c0..c2; COMBA_STORE(x) stores c0 and
 * COMBA_STORE2(x) stores c1; CARRY_FORWARD moves c1 into c0, c2 into c1 and
 * clears c2. */
wolfSSL 4:1b0d80432c79 1146 #define TFM_ISO
wolfSSL 4:1b0d80432c79 1147
wolfSSL 4:1b0d80432c79 1148 /* ISO C portable code */
wolfSSL 4:1b0d80432c79 1149
wolfSSL 4:1b0d80432c79 1150 #define COMBA_START
wolfSSL 4:1b0d80432c79 1151
wolfSSL 4:1b0d80432c79 1152 #define CLEAR_CARRY \
wolfSSL 4:1b0d80432c79 1153 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1154
wolfSSL 4:1b0d80432c79 1155 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1156 x = c0;
wolfSSL 4:1b0d80432c79 1157
wolfSSL 4:1b0d80432c79 1158 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1159 x = c1;
wolfSSL 4:1b0d80432c79 1160
wolfSSL 4:1b0d80432c79 1161 #define CARRY_FORWARD \
wolfSSL 4:1b0d80432c79 1162 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1163
wolfSSL 4:1b0d80432c79 1164 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1165
/* Portable C: c2:c1:c0 += i * j.
 *
 * i, j - digit expressions.  Each is evaluated exactly once and is fully
 *        parenthesised, so the fp_word cast applies to the whole argument
 *        rather than only to its first operand (e.g. SQRADD(a + b, c)).
 *
 * Works on the caller's c0, c1 and c2: the double-width product is added to
 * c0, the overflow is carried into c1, and the overflow of that into c2.
 * The ';' after while (0) is kept so existing call sites are unaffected.
 */
#define SQRADD(i, j)                                          \
   do { fp_word t;                                            \
   t = (fp_word)c0 + ((fp_word)(i)) * ((fp_word)(j));         \
   c0 = (fp_digit)t;                                          \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                        \
   c1 = (fp_digit)t;                                          \
   c2 += (fp_digit)(t >> DIGIT_BIT);                          \
   } while (0);
wolfSSL 4:1b0d80432c79 1173
wolfSSL 4:1b0d80432c79 1174
/* Portable C: c2:c1:c0 += 2 * (i * j).  For squaring some of the terms are
 * doubled, so the same product is added to the accumulator twice.
 *
 * i, j - digit expressions.  Each is evaluated exactly once and is fully
 *        parenthesised, so the fp_word cast applies to the whole argument.
 *
 * Besides c0, c1 and c2 this macro uses an fp_word scratch variable named tt
 * that must already be declared in the caller's scope (it is not declared
 * here, exactly as before).  The ';' after while (0) is kept so existing
 * call sites are unaffected.
 */
#define SQRADD2(i, j)                                         \
   do { fp_word t;                                            \
   t = ((fp_word)(i)) * ((fp_word)(j));                       \
   tt = (fp_word)c0 + t;                                      \
   c0 = (fp_digit)tt;                                         \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT);                      \
   c1 = (fp_digit)tt;                                         \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                         \
   tt = (fp_word)c0 + t;                                      \
   c0 = (fp_digit)tt;                                         \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT);                      \
   c1 = (fp_digit)tt;                                         \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                         \
   } while (0);
wolfSSL 4:1b0d80432c79 1186
/* Portable C: sc2:sc1:sc0 = i * j (starts a fresh secondary accumulator).
 *
 * i, j - digit expressions.  Each is evaluated exactly once and is fully
 *        parenthesised, so the fp_word cast applies to the whole argument.
 *
 * sc0 receives the low digit of the product, sc1 the high digit (now with an
 * explicit fp_digit cast, matching the sibling macros) and sc2 is set to 0.
 * The ';' after while (0) is kept so existing call sites are unaffected.
 */
#define SQRADDSC(i, j)                                        \
   do { fp_word t;                                            \
   t = ((fp_word)(i)) * ((fp_word)(j));                       \
   sc0 = (fp_digit)t;                                         \
   sc1 = (fp_digit)(t >> DIGIT_BIT);                          \
   sc2 = 0;                                                   \
   } while (0);
wolfSSL 4:1b0d80432c79 1192
/* Portable C: sc2:sc1:sc0 += i * j (accumulates into the secondary sum).
 *
 * i, j - digit expressions.  Each is evaluated exactly once and is fully
 *        parenthesised, so the fp_word cast applies to the whole argument.
 *
 * Works on the caller's sc0, sc1 and sc2 with the same carry chain as SQRADD.
 * The ';' after while (0) is kept so existing call sites are unaffected.
 */
#define SQRADDAC(i, j)                                        \
   do { fp_word t;                                            \
   t = (fp_word)sc0 + ((fp_word)(i)) * ((fp_word)(j));        \
   sc0 = (fp_digit)t;                                         \
   t = (fp_word)sc1 + (t >> DIGIT_BIT);                       \
   sc1 = (fp_digit)t;                                         \
   sc2 += (fp_digit)(t >> DIGIT_BIT);                         \
   } while (0);
wolfSSL 4:1b0d80432c79 1199
/* Portable C: c2:c1:c0 += 2 * sc2:sc1:sc0. Each sc word is added twice in
 * fp_word arithmetic; the part above DIGIT_BIT is carried into the next word.
 * Takes no arguments and works on the caller's c0..c2 and sc0..sc2. */
wolfSSL 4:1b0d80432c79 1200 #define SQRADDDB \
wolfSSL 4:1b0d80432c79 1201 do { fp_word t; \
wolfSSL 4:1b0d80432c79 1202 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
wolfSSL 4:1b0d80432c79 1203 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
wolfSSL 4:1b0d80432c79 1204 c1 = (fp_digit)t; \
wolfSSL 4:1b0d80432c79 1205 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
wolfSSL 4:1b0d80432c79 1206 } while (0);
wolfSSL 4:1b0d80432c79 1207
wolfSSL 4:1b0d80432c79 1208 #endif
wolfSSL 4:1b0d80432c79 1209
wolfSSL 4:1b0d80432c79 1210 #ifdef TFM_SMALL_SET
wolfSSL 4:1b0d80432c79 1211 #include "fp_sqr_comba_small_set.i"
wolfSSL 4:1b0d80432c79 1212 #endif
wolfSSL 4:1b0d80432c79 1213
wolfSSL 4:1b0d80432c79 1214 #if defined(TFM_SQR3)
wolfSSL 4:1b0d80432c79 1215 #include "fp_sqr_comba_3.i"
wolfSSL 4:1b0d80432c79 1216 #endif
wolfSSL 4:1b0d80432c79 1217 #if defined(TFM_SQR4)
wolfSSL 4:1b0d80432c79 1218 #include "fp_sqr_comba_4.i"
wolfSSL 4:1b0d80432c79 1219 #endif
wolfSSL 4:1b0d80432c79 1220 #if defined(TFM_SQR6)
wolfSSL 4:1b0d80432c79 1221 #include "fp_sqr_comba_6.i"
wolfSSL 4:1b0d80432c79 1222 #endif
wolfSSL 4:1b0d80432c79 1223 #if defined(TFM_SQR7)
wolfSSL 4:1b0d80432c79 1224 #include "fp_sqr_comba_7.i"
wolfSSL 4:1b0d80432c79 1225 #endif
wolfSSL 4:1b0d80432c79 1226 #if defined(TFM_SQR8)
wolfSSL 4:1b0d80432c79 1227 #include "fp_sqr_comba_8.i"
wolfSSL 4:1b0d80432c79 1228 #endif
wolfSSL 4:1b0d80432c79 1229 #if defined(TFM_SQR9)
wolfSSL 4:1b0d80432c79 1230 #include "fp_sqr_comba_9.i"
wolfSSL 4:1b0d80432c79 1231 #endif
wolfSSL 4:1b0d80432c79 1232 #if defined(TFM_SQR12)
wolfSSL 4:1b0d80432c79 1233 #include "fp_sqr_comba_12.i"
wolfSSL 4:1b0d80432c79 1234 #endif
wolfSSL 4:1b0d80432c79 1235 #if defined(TFM_SQR17)
wolfSSL 4:1b0d80432c79 1236 #include "fp_sqr_comba_17.i"
wolfSSL 4:1b0d80432c79 1237 #endif
wolfSSL 4:1b0d80432c79 1238 #if defined(TFM_SQR20)
wolfSSL 4:1b0d80432c79 1239 #include "fp_sqr_comba_20.i"
wolfSSL 4:1b0d80432c79 1240 #endif
wolfSSL 4:1b0d80432c79 1241 #if defined(TFM_SQR24)
wolfSSL 4:1b0d80432c79 1242 #include "fp_sqr_comba_24.i"
wolfSSL 4:1b0d80432c79 1243 #endif
wolfSSL 4:1b0d80432c79 1244 #if defined(TFM_SQR28)
wolfSSL 4:1b0d80432c79 1245 #include "fp_sqr_comba_28.i"
wolfSSL 4:1b0d80432c79 1246 #endif
wolfSSL 4:1b0d80432c79 1247 #if defined(TFM_SQR32)
wolfSSL 4:1b0d80432c79 1248 #include "fp_sqr_comba_32.i"
wolfSSL 4:1b0d80432c79 1249 #endif
wolfSSL 4:1b0d80432c79 1250 #if defined(TFM_SQR48)
wolfSSL 4:1b0d80432c79 1251 #include "fp_sqr_comba_48.i"
wolfSSL 4:1b0d80432c79 1252 #endif
wolfSSL 4:1b0d80432c79 1253 #if defined(TFM_SQR64)
wolfSSL 4:1b0d80432c79 1254 #include "fp_sqr_comba_64.i"
wolfSSL 4:1b0d80432c79 1255 #endif
wolfSSL 4:1b0d80432c79 1256 /* end fp_sqr_comba.c asm */
wolfSSL 4:1b0d80432c79 1257
wolfSSL 4:1b0d80432c79 1258 /* start fp_mul_comba.c asm */
wolfSSL 4:1b0d80432c79 1259 /* these are the combas. Worship them. */
wolfSSL 4:1b0d80432c79 1260 #if defined(TFM_X86)
wolfSSL 4:1b0d80432c79 1261 /* Generic x86 optimized code */
wolfSSL 4:1b0d80432c79 1262
wolfSSL 4:1b0d80432c79 1263 /* anything you need at the start (nothing for x86-32) */
wolfSSL 4:1b0d80432c79 1264 #define COMBA_START
wolfSSL 4:1b0d80432c79 1265
wolfSSL 4:1b0d80432c79 1266 /* clear the chaining variables c0, c1 and c2 */
wolfSSL 4:1b0d80432c79 1267 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1268 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1269
wolfSSL 4:1b0d80432c79 1270 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1271 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1272 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1273
wolfSSL 4:1b0d80432c79 1274 /* store the first sum (c0) into x */
wolfSSL 4:1b0d80432c79 1275 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1276 x = c0;
wolfSSL 4:1b0d80432c79 1277
wolfSSL 4:1b0d80432c79 1278 /* store the second sum [carry] (c1) into x */
wolfSSL 4:1b0d80432c79 1279 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1280 x = c1;
wolfSSL 4:1b0d80432c79 1281
wolfSSL 4:1b0d80432c79 1282 /* anything you need at the end (nothing for x86-32) */
wolfSSL 4:1b0d80432c79 1283 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1284
wolfSSL 4:1b0d80432c79 1285 /* multiply i and j and add the 64-bit product edx:eax into the accumulator c2:c1:c0 */
wolfSSL 4:1b0d80432c79 1286 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1287 __asm__( \
wolfSSL 4:1b0d80432c79 1288 "movl %6,%%eax \n\t" /* eax = i */ \
wolfSSL 4:1b0d80432c79 1289 "mull %7 \n\t" /* edx:eax = eax * j (unsigned) */ \
wolfSSL 4:1b0d80432c79 1290 "addl %%eax,%0 \n\t" /* c0 += low word */ \
wolfSSL 4:1b0d80432c79 1291 "adcl %%edx,%1 \n\t" /* c1 += high word + carry */ \
wolfSSL 4:1b0d80432c79 1292 "adcl $0,%2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1293 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 4:1b0d80432c79 1294
wolfSSL 4:1b0d80432c79 1295 #elif defined(TFM_X86_64)
wolfSSL 4:1b0d80432c79 1296 /* x86-64 optimized: helper macros for the comba multiplier */
wolfSSL 4:1b0d80432c79 1297 /* c0, c1 and c2 are not declared here; the code expanding these macros must provide them */
wolfSSL 4:1b0d80432c79 1298 /* anything you need at the start (expands to nothing on this target) */
wolfSSL 4:1b0d80432c79 1299 #define COMBA_START
wolfSSL 4:1b0d80432c79 1300
wolfSSL 4:1b0d80432c79 1301 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1302 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1303 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1304
wolfSSL 4:1b0d80432c79 1305 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1306 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1307 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1308
wolfSSL 4:1b0d80432c79 1309 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1310 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1311 x = c0;
wolfSSL 4:1b0d80432c79 1312
wolfSSL 4:1b0d80432c79 1313 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1314 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1315 x = c1;
wolfSSL 4:1b0d80432c79 1316
wolfSSL 4:1b0d80432c79 1317 /* anything you need at the end (expands to nothing on this target) */
wolfSSL 4:1b0d80432c79 1318 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1319
wolfSSL 4:1b0d80432c79 1320 /* multiply i and j and add the 128-bit product rdx:rax into the accumulator c2:c1:c0 */
wolfSSL 4:1b0d80432c79 1321 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1322 __asm__ ( \
wolfSSL 4:1b0d80432c79 1323 "movq %6,%%rax \n\t" /* rax = i */ \
wolfSSL 4:1b0d80432c79 1324 "mulq %7 \n\t" /* rdx:rax = rax * j (unsigned) */ \
wolfSSL 4:1b0d80432c79 1325 "addq %%rax,%0 \n\t" /* c0 += low word */ \
wolfSSL 4:1b0d80432c79 1326 "adcq %%rdx,%1 \n\t" /* c1 += high word + carry */ \
wolfSSL 4:1b0d80432c79 1327 "adcq $0,%2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1328 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 4:1b0d80432c79 1329 /* NOTE(review): "g" also permits an immediate operand, which mulq cannot encode - confirm callers never pass a constant j */
wolfSSL 4:1b0d80432c79 1330
wolfSSL 4:1b0d80432c79 1331 #if defined(HAVE_INTEL_MULX)
wolfSSL 4:1b0d80432c79 1332 #define MULADD_MULX(b0, c0, c1, rdx) /* c0 += low half of rdx*b0 (carry via OF), c1 += high half (carry via CF) */ \
wolfSSL 4:1b0d80432c79 1333 __asm__ volatile ( \
wolfSSL 4:1b0d80432c79 1334 "movq %3, %%rdx\n\t" /* mulx takes its implicit multiplicand from rdx */ \
wolfSSL 4:1b0d80432c79 1335 "mulx %2,%%r9, %%r8 \n\t" /* r8:r9 = rdx * b0 (r9 = low half, r8 = high half); flags untouched */ \
wolfSSL 4:1b0d80432c79 1336 "adoxq %%r9,%0 \n\t" /* c0 += r9 + OF; carry-out goes to OF only */ \
wolfSSL 4:1b0d80432c79 1337 "adcxq %%r8,%1 \n\t" /* c1 += r8 + CF; carry-out goes to CF only */ \
wolfSSL 4:1b0d80432c79 1338 :"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx" /* NOTE(review): OF/CF are consumed from, and left for, neighbouring asm statements - confirm the compiler keeps them intact */ \
wolfSSL 4:1b0d80432c79 1339 )
wolfSSL 4:1b0d80432c79 1340
wolfSSL 4:1b0d80432c79 1341 /* fold the pending OF into c0 and the pending CF into c1 by adding zero through adox/adcx; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1342 #define MULADD_MULX_ADD_CARRY(c0, c1)\
wolfSSL 4:1b0d80432c79 1343 __asm__ volatile(\
wolfSSL 4:1b0d80432c79 1344 "mov $0, %%r10\n\t" /* r10 = 0; mov leaves the flags unchanged */ \
wolfSSL 4:1b0d80432c79 1345 "movq %1, %%r8\n\t" /* copies c1 into r8; r8 is not read again inside this statement */ \
wolfSSL 4:1b0d80432c79 1346 "adox %%r10, %0\n\t" /* c0 += 0 + OF */ \
wolfSSL 4:1b0d80432c79 1347 "adcx %%r10, %1\n\t" /* c1 += 0 + CF */ \
wolfSSL 4:1b0d80432c79 1348 :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
wolfSSL 4:1b0d80432c79 1349 /* start a new adox/adcx chain: clear CF and OF, then load a0 into rdx; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1350 #define MULADD_SET_A(a0)\
wolfSSL 4:1b0d80432c79 1351 __asm__ volatile("add $0, %%r8\n\t" /* adding 0 cannot carry or overflow, so CF = OF = 0 */ \
wolfSSL 4:1b0d80432c79 1352 "movq %0,%%rdx\n\t" /* rdx = a0; rdx is also listed as clobbered and MULADD_MULX reloads it itself */ \
wolfSSL 4:1b0d80432c79 1353 ::"r"(a0):"%r8","%r9","%r10","%rdx") ;
wolfSSL 4:1b0d80432c79 1354 /* add a->dp[ix] * (b3:b2:b1:b0) into c->dp[iz..iz+5]; uses ix, iz, cp, c0, c1 and b0..b3 from the expanding scope (parameter b itself is unused) */
wolfSSL 4:1b0d80432c79 1355 #define MULADD_BODY(a,b,c)\
wolfSSL 4:1b0d80432c79 1356 { word64 rdx = a->dp[ix] ; /* current digit of a */ \
wolfSSL 4:1b0d80432c79 1357 cp = &(c->dp[iz]) ; /* first result digit touched by this step */ \
wolfSSL 4:1b0d80432c79 1358 c0 = cp[0] ; c1 = cp[1]; \
wolfSSL 4:1b0d80432c79 1359 MULADD_SET_A(rdx) ; /* clears CF/OF before the chain starts */ \
wolfSSL 4:1b0d80432c79 1360 MULADD_MULX(b0, c0, c1, rdx) ;\
wolfSSL 4:1b0d80432c79 1361 cp[0]=c0; c0=cp[2]; /* c0 and c1 swap low/high roles on every step */ \
wolfSSL 4:1b0d80432c79 1362 MULADD_MULX(b1, c1, c0, rdx) ;\
wolfSSL 4:1b0d80432c79 1363 cp[1]=c1; c1=cp[3]; \
wolfSSL 4:1b0d80432c79 1364 MULADD_MULX(b2, c0, c1, rdx) ;\
wolfSSL 4:1b0d80432c79 1365 cp[2]=c0; c0=cp[4]; \
wolfSSL 4:1b0d80432c79 1366 MULADD_MULX(b3, c1, c0, rdx) ;\
wolfSSL 4:1b0d80432c79 1367 cp[3]=c1; c1=cp[5]; \
wolfSSL 4:1b0d80432c79 1368 MULADD_MULX_ADD_CARRY(c0, c1); /* fold the two outstanding carries into digits 4 and 5 */ \
wolfSSL 4:1b0d80432c79 1369 cp[4]=c0; cp[5]=c1; \
wolfSSL 4:1b0d80432c79 1370 }
wolfSSL 4:1b0d80432c79 1371 /* c = a * b using MULX, four digits of b per outer pass; ix, iy, iz and pa come from the expanding scope; the expansion ends in "};" */
wolfSSL 4:1b0d80432c79 1372 #define TFM_INTEL_MUL_COMBA(a, b, c)\
wolfSSL 4:1b0d80432c79 1373 for(ix=0; ix<pa; ix++)c->dp[ix]=0 ; /* zero the first pa result digits */ \
wolfSSL 4:1b0d80432c79 1374 for(iy=0; (iy<b->used); iy+=4) { \
wolfSSL 4:1b0d80432c79 1375 fp_digit *bp ; \
wolfSSL 4:1b0d80432c79 1376 bp = &(b->dp[iy+0]) ; \
wolfSSL 4:1b0d80432c79 1377 fp_digit b0 = bp[0] , b1= bp[1], /* NOTE(review): bp[1..3] are read even if they lie past b->used - presumably zero, confirm */ \
wolfSSL 4:1b0d80432c79 1378 b2= bp[2], b3= bp[3]; \
wolfSSL 4:1b0d80432c79 1379 ix=0, iz=iy; /* result index iz runs at ix + iy */ \
wolfSSL 4:1b0d80432c79 1380 while(ix<a->used) { \
wolfSSL 4:1b0d80432c79 1381 fp_digit c0, c1; \
wolfSSL 4:1b0d80432c79 1382 fp_digit *cp ; \
wolfSSL 4:1b0d80432c79 1383 MULADD_BODY(a,b,c); /* updates c->dp[iz..iz+5] */ \
wolfSSL 4:1b0d80432c79 1384 ix++ ; iz++ ; \
wolfSSL 4:1b0d80432c79 1385 } \
wolfSSL 4:1b0d80432c79 1386 };
wolfSSL 4:1b0d80432c79 1387 #endif
wolfSSL 4:1b0d80432c79 1388
wolfSSL 4:1b0d80432c79 1389 #elif defined(TFM_SSE2)
wolfSSL 4:1b0d80432c79 1390 /* use SSE2 optimizations: helper macros for the comba multiplier */
wolfSSL 4:1b0d80432c79 1391 /* c0, c1 and c2 are not declared here; the code expanding these macros must provide them */
wolfSSL 4:1b0d80432c79 1392 /* anything you need at the start (expands to nothing on this target) */
wolfSSL 4:1b0d80432c79 1393 #define COMBA_START
wolfSSL 4:1b0d80432c79 1394
wolfSSL 4:1b0d80432c79 1395 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1396 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1397 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1398
wolfSSL 4:1b0d80432c79 1399 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1400 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1401 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1402
wolfSSL 4:1b0d80432c79 1403 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1404 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1405 x = c0;
wolfSSL 4:1b0d80432c79 1406
wolfSSL 4:1b0d80432c79 1407 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1408 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1409 x = c1;
wolfSSL 4:1b0d80432c79 1410
wolfSSL 4:1b0d80432c79 1411 /* anything you need at the end: emms empties the MMX state, which MULADD uses (mm0/mm1) on this target */
wolfSSL 4:1b0d80432c79 1412 #define COMBA_FINI \
wolfSSL 4:1b0d80432c79 1413 __asm__("emms");
wolfSSL 4:1b0d80432c79 1414
wolfSSL 4:1b0d80432c79 1415 /* multiply i and j with pmuludq and add the 64-bit product into c2:c1:c0; mm0/mm1 are modified but not in the clobber list */
wolfSSL 4:1b0d80432c79 1416 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1417 __asm__( \
wolfSSL 4:1b0d80432c79 1418 "movd %6,%%mm0 \n\t" /* mm0 = i */ \
wolfSSL 4:1b0d80432c79 1419 "movd %7,%%mm1 \n\t" /* mm1 = j */ \
wolfSSL 4:1b0d80432c79 1420 "pmuludq %%mm1,%%mm0\n\t" /* mm0 = 64-bit unsigned product of the low 32 bits */ \
wolfSSL 4:1b0d80432c79 1421 "movd %%mm0,%%eax \n\t" /* eax = low word of the product */ \
wolfSSL 4:1b0d80432c79 1422 "psrlq $32,%%mm0 \n\t" /* bring the high word down to bits 0..31 */ \
wolfSSL 4:1b0d80432c79 1423 "addl %%eax,%0 \n\t" /* c0 += low word */ \
wolfSSL 4:1b0d80432c79 1424 "movd %%mm0,%%eax \n\t" /* eax = high word; movd does not disturb the carry flag */ \
wolfSSL 4:1b0d80432c79 1425 "adcl %%eax,%1 \n\t" /* c1 += high word + carry */ \
wolfSSL 4:1b0d80432c79 1426 "adcl $0,%2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1427 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
wolfSSL 4:1b0d80432c79 1428
wolfSSL 4:1b0d80432c79 1429 #elif defined(TFM_ARM)
wolfSSL 4:1b0d80432c79 1430 /* ARM code: helper macros for the comba multiplier; c0, c1, c2 must be provided by the expanding code */
wolfSSL 4:1b0d80432c79 1431 /* start-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1432 #define COMBA_START
wolfSSL 4:1b0d80432c79 1433 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1434 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1435 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1436 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1437 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1438 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1439 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1440 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1441 x = c0;
wolfSSL 4:1b0d80432c79 1442 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1443 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1444 x = c1;
wolfSSL 4:1b0d80432c79 1445 /* end-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1446 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1447 /* multiply i and j (UMULL) and add the 64-bit product r1:r0 into the accumulator c2:c1:c0 */
wolfSSL 4:1b0d80432c79 1448 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1449 __asm__( \
wolfSSL 4:1b0d80432c79 1450 " UMULL r0,r1,%6,%7 \n\t" /* r0 = low word, r1 = high word of i * j */ \
wolfSSL 4:1b0d80432c79 1451 " ADDS %0,%0,r0 \n\t" /* c0 += low word, setting the carry flag */ \
wolfSSL 4:1b0d80432c79 1452 " ADCS %1,%1,r1 \n\t" /* c1 += high word + carry */ \
wolfSSL 4:1b0d80432c79 1453 " ADC %2,%2,#0 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1454 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 4:1b0d80432c79 1455
wolfSSL 4:1b0d80432c79 1456 #elif defined(TFM_PPC32)
wolfSSL 4:1b0d80432c79 1457 /* For 32-bit PPC: helper macros for the comba multiplier; c0, c1, c2 must be provided by the expanding code */
wolfSSL 4:1b0d80432c79 1458 /* start-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1459 #define COMBA_START
wolfSSL 4:1b0d80432c79 1460 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1461 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1462 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1463 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1464 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1465 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1466 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1467 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1468 x = c0;
wolfSSL 4:1b0d80432c79 1469 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1470 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1471 x = c1;
wolfSSL 4:1b0d80432c79 1472 /* end-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1473 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1474
wolfSSL 4:1b0d80432c79 1475 /* c2:c1:c0 += i * j, using r16 as scratch. untested: will mulhwu change the flags? Docs say no (the carry from addc must survive it) */
wolfSSL 4:1b0d80432c79 1476 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1477 __asm__( \
wolfSSL 4:1b0d80432c79 1478 " mullw 16,%6,%7 \n\t" /* r16 = low word of i * j */ \
wolfSSL 4:1b0d80432c79 1479 " addc %0,%0,16 \n\t" /* c0 += low word, recording the carry */ \
wolfSSL 4:1b0d80432c79 1480 " mulhwu 16,%6,%7 \n\t" /* r16 = high word of the unsigned product */ \
wolfSSL 4:1b0d80432c79 1481 " adde %1,%1,16 \n\t" /* c1 += high word + carry */ \
wolfSSL 4:1b0d80432c79 1482 " addze %2,%2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1483 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 4:1b0d80432c79 1484
wolfSSL 4:1b0d80432c79 1485 #elif defined(TFM_PPC64)
wolfSSL 4:1b0d80432c79 1486 /* For 64-bit PPC: helper macros for the comba multiplier; c0, c1, c2 must be provided by the expanding code */
wolfSSL 4:1b0d80432c79 1487 /* start-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1488 #define COMBA_START
wolfSSL 4:1b0d80432c79 1489 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1490 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1491 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1492 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1493 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1494 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1495 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1496 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1497 x = c0;
wolfSSL 4:1b0d80432c79 1498 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1499 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1500 x = c1;
wolfSSL 4:1b0d80432c79 1501 /* end-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1502 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1503
wolfSSL 4:1b0d80432c79 1504 /* c2:c1:c0 += i * j, using r16 as scratch. untested: will mulhdu change the flags? Docs say no (the carry from addc must survive it) */
wolfSSL 4:1b0d80432c79 1505 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1506 __asm__( \
wolfSSL 4:1b0d80432c79 1507 " mulld 16,%6,%7 \n\t" /* r16 = low doubleword of i * j */ \
wolfSSL 4:1b0d80432c79 1508 " addc %0,%0,16 \n\t" /* c0 += low doubleword, recording the carry */ \
wolfSSL 4:1b0d80432c79 1509 " mulhdu 16,%6,%7 \n\t" /* r16 = high doubleword of the unsigned product */ \
wolfSSL 4:1b0d80432c79 1510 " adde %1,%1,16 \n\t" /* c1 += high doubleword + carry */ \
wolfSSL 4:1b0d80432c79 1511 " addze %2,%2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1512 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 4:1b0d80432c79 1513
wolfSSL 4:1b0d80432c79 1514 #elif defined(TFM_AVR32)
wolfSSL 4:1b0d80432c79 1515
wolfSSL 4:1b0d80432c79 1516 /* AVR32 code: the helper macros are plain C, MULADD for this target uses AVR32 assembly; c0, c1, c2 must be provided by the expanding code */
wolfSSL 4:1b0d80432c79 1517 /* start-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1518 #define COMBA_START
wolfSSL 4:1b0d80432c79 1519 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1520 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1521 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1522 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1523 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1524 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1525 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1526 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1527 x = c0;
wolfSSL 4:1b0d80432c79 1528 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1529 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1530 x = c1;
wolfSSL 4:1b0d80432c79 1531 /* end-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1532 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1533 /* multiply i and j (mulu.d, scratch registers r2/r3) and add the product into the accumulator c2:c1:c0 */
wolfSSL 4:1b0d80432c79 1534 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1535 __asm__( \
wolfSSL 4:1b0d80432c79 1536 " mulu.d r2,%6,%7 \n\t" /* product of i and j; presumably r2 = low word, r3 = high word */ \
wolfSSL 4:1b0d80432c79 1537 " add %0,r2 \n\t" /* c0 += r2 */ \
wolfSSL 4:1b0d80432c79 1538 " adc %1,%1,r3 \n\t" /* c1 += r3 + carry */ \
wolfSSL 4:1b0d80432c79 1539 " acr %2 \n\t" /* c2 += carry */ \
wolfSSL 4:1b0d80432c79 1540 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
wolfSSL 4:1b0d80432c79 1541
wolfSSL 4:1b0d80432c79 1542 #else
wolfSSL 4:1b0d80432c79 1543 /* ISO C code: portable helper macros for the comba multiplier; c0, c1, c2 must be provided by the expanding code */
wolfSSL 4:1b0d80432c79 1544 /* start-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1545 #define COMBA_START
wolfSSL 4:1b0d80432c79 1546 /* clear the chaining variables c0, c1, c2; the expansion already ends in ';' */
wolfSSL 4:1b0d80432c79 1547 #define COMBA_CLEAR \
wolfSSL 4:1b0d80432c79 1548 c0 = c1 = c2 = 0;
wolfSSL 4:1b0d80432c79 1549 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 4:1b0d80432c79 1550 #define COMBA_FORWARD \
wolfSSL 4:1b0d80432c79 1551 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 4:1b0d80432c79 1552 /* store the first sum: x = c0 (lowest accumulator word) */
wolfSSL 4:1b0d80432c79 1553 #define COMBA_STORE(x) \
wolfSSL 4:1b0d80432c79 1554 x = c0;
wolfSSL 4:1b0d80432c79 1555 /* store the second sum [carry]: x = c1 */
wolfSSL 4:1b0d80432c79 1556 #define COMBA_STORE2(x) \
wolfSSL 4:1b0d80432c79 1557 x = c1;
wolfSSL 4:1b0d80432c79 1558 /* end-of-routine hook; expands to nothing on this target */
wolfSSL 4:1b0d80432c79 1559 #define COMBA_FINI
wolfSSL 4:1b0d80432c79 1560 /* c2:c1:c0 += i * j in portable C; assumes fp_word can hold a full digit product plus carries (types are defined outside this chunk - confirm) */
wolfSSL 4:1b0d80432c79 1561 #define MULADD(i, j) \
wolfSSL 4:1b0d80432c79 1562 do { fp_word t; \
wolfSSL 4:1b0d80432c79 1563 t = (fp_word)c0 + ((fp_word)(i)) * ((fp_word)(j)); c0 = (fp_digit)t; /* arguments parenthesized so compound expressions are cast as a whole */ \
wolfSSL 4:1b0d80432c79 1564 t = (fp_word)c1 + (t >> DIGIT_BIT); /* add the upper part of the first sum to c1 */ \
wolfSSL 4:1b0d80432c79 1565 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); /* whatever overflows c1 goes to c2 */ \
wolfSSL 4:1b0d80432c79 1566 } while (0);
wolfSSL 4:1b0d80432c79 1567
wolfSSL 4:1b0d80432c79 1568 #endif
wolfSSL 4:1b0d80432c79 1569
wolfSSL 4:1b0d80432c79 1570
wolfSSL 4:1b0d80432c79 1571 #ifdef TFM_SMALL_SET
wolfSSL 4:1b0d80432c79 1572 #include "fp_mul_comba_small_set.i"
wolfSSL 4:1b0d80432c79 1573 #endif
wolfSSL 4:1b0d80432c79 1574
wolfSSL 4:1b0d80432c79 1575 #if defined(TFM_MUL3)
wolfSSL 4:1b0d80432c79 1576 #include "fp_mul_comba_3.i"
wolfSSL 4:1b0d80432c79 1577 #endif
wolfSSL 4:1b0d80432c79 1578 #if defined(TFM_MUL4)
wolfSSL 4:1b0d80432c79 1579 #include "fp_mul_comba_4.i"
wolfSSL 4:1b0d80432c79 1580 #endif
wolfSSL 4:1b0d80432c79 1581 #if defined(TFM_MUL6)
wolfSSL 4:1b0d80432c79 1582 #include "fp_mul_comba_6.i"
wolfSSL 4:1b0d80432c79 1583 #endif
wolfSSL 4:1b0d80432c79 1584 #if defined(TFM_MUL7)
wolfSSL 4:1b0d80432c79 1585 #include "fp_mul_comba_7.i"
wolfSSL 4:1b0d80432c79 1586 #endif
wolfSSL 4:1b0d80432c79 1587 #if defined(TFM_MUL8)
wolfSSL 4:1b0d80432c79 1588 #include "fp_mul_comba_8.i"
wolfSSL 4:1b0d80432c79 1589 #endif
wolfSSL 4:1b0d80432c79 1590 #if defined(TFM_MUL9)
wolfSSL 4:1b0d80432c79 1591 #include "fp_mul_comba_9.i"
wolfSSL 4:1b0d80432c79 1592 #endif
wolfSSL 4:1b0d80432c79 1593 #if defined(TFM_MUL12)
wolfSSL 4:1b0d80432c79 1594 #include "fp_mul_comba_12.i"
wolfSSL 4:1b0d80432c79 1595 #endif
wolfSSL 4:1b0d80432c79 1596 #if defined(TFM_MUL17)
wolfSSL 4:1b0d80432c79 1597 #include "fp_mul_comba_17.i"
wolfSSL 4:1b0d80432c79 1598 #endif
wolfSSL 4:1b0d80432c79 1599 #if defined(TFM_MUL20)
wolfSSL 4:1b0d80432c79 1600 #include "fp_mul_comba_20.i"
wolfSSL 4:1b0d80432c79 1601 #endif
wolfSSL 4:1b0d80432c79 1602 #if defined(TFM_MUL24)
wolfSSL 4:1b0d80432c79 1603 #include "fp_mul_comba_24.i"
wolfSSL 4:1b0d80432c79 1604 #endif
wolfSSL 4:1b0d80432c79 1605 #if defined(TFM_MUL28)
wolfSSL 4:1b0d80432c79 1606 #include "fp_mul_comba_28.i"
wolfSSL 4:1b0d80432c79 1607 #endif
wolfSSL 4:1b0d80432c79 1608 #if defined(TFM_MUL32)
wolfSSL 4:1b0d80432c79 1609 #include "fp_mul_comba_32.i"
wolfSSL 4:1b0d80432c79 1610 #endif
wolfSSL 4:1b0d80432c79 1611 #if defined(TFM_MUL48)
wolfSSL 4:1b0d80432c79 1612 #include "fp_mul_comba_48.i"
wolfSSL 4:1b0d80432c79 1613 #endif
wolfSSL 4:1b0d80432c79 1614 #if defined(TFM_MUL64)
wolfSSL 4:1b0d80432c79 1615 #include "fp_mul_comba_64.i"
wolfSSL 4:1b0d80432c79 1616 #endif
wolfSSL 4:1b0d80432c79 1617
wolfSSL 4:1b0d80432c79 1618 /* end fp_mul_comba.c asm */
wolfSSL 4:1b0d80432c79 1619
wolfSSL 4:1b0d80432c79 1620