wolfSSL SSL/TLS library, support up to TLS1.3

Dependents:   CyaSSL-Twitter-OAuth4Tw Example-client-tls-cert TwitterReader TweetTest ... more

Committer:
wolfSSL
Date:
Tue Aug 22 10:48:22 2017 +0000
Revision:
13:f67a6c6013ca
Parent:
11:cee25a834751
wolfSSL3.12.0 with TLS1.3

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 11:cee25a834751 1 /* asm.c
wolfSSL 11:cee25a834751 2 *
wolfSSL 11:cee25a834751 3 * Copyright (C) 2006-2016 wolfSSL Inc.
wolfSSL 11:cee25a834751 4 *
wolfSSL 11:cee25a834751 5 * This file is part of wolfSSL.
wolfSSL 11:cee25a834751 6 *
wolfSSL 11:cee25a834751 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 11:cee25a834751 8 * it under the terms of the GNU General Public License as published by
wolfSSL 11:cee25a834751 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 11:cee25a834751 10 * (at your option) any later version.
wolfSSL 11:cee25a834751 11 *
wolfSSL 11:cee25a834751 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 11:cee25a834751 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 11:cee25a834751 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 11:cee25a834751 15 * GNU General Public License for more details.
wolfSSL 11:cee25a834751 16 *
wolfSSL 11:cee25a834751 17 * You should have received a copy of the GNU General Public License
wolfSSL 11:cee25a834751 18 * along with this program; if not, write to the Free Software
wolfSSL 11:cee25a834751 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 11:cee25a834751 20 */
wolfSSL 11:cee25a834751 21
wolfSSL 11:cee25a834751 22
wolfSSL 11:cee25a834751 23 #ifdef HAVE_CONFIG_H
wolfSSL 11:cee25a834751 24 #include <config.h>
wolfSSL 11:cee25a834751 25 #endif
wolfSSL 11:cee25a834751 26
wolfSSL 11:cee25a834751 27 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 11:cee25a834751 28
wolfSSL 11:cee25a834751 29 /*
wolfSSL 11:cee25a834751 30 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
wolfSSL 11:cee25a834751 31 * http://math.libtomcrypt.com
wolfSSL 11:cee25a834751 32 */
wolfSSL 11:cee25a834751 33
wolfSSL 11:cee25a834751 34
wolfSSL 11:cee25a834751 35 /******************************************************************/
wolfSSL 11:cee25a834751 36 /* fp_montgomery_reduce.c asm or generic */
wolfSSL 11:cee25a834751 37
wolfSSL 11:cee25a834751 38
wolfSSL 11:cee25a834751 39 /* Each platform needs to query leaf 7 from cpuid to see if BMI2 (MULX) and
wolfSSL 11:cee25a834751 40 * ADX are supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
wolfSSL 11:cee25a834751 41 */
wolfSSL 11:cee25a834751 42
wolfSSL 11:cee25a834751 43 #if defined(HAVE_INTEL_MULX)
#ifndef _MSC_VER
    /* GCC-style compilers: run CPUID with EAX=leaf and ECX=sub and store
     * EAX, EBX, ECX, EDX into reg[0..3]. */
    #define cpuid(reg, leaf, sub)\
        __asm__ __volatile__ ("cpuid":\
            "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
            "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else

    #include <intrin.h>
    /* MSVC: take the same three arguments as the GCC variant so that the
     * shared call sites cpuid(reg, leaf, sub) expand correctly; __cpuidex
     * passes the sub-leaf in ECX, which the two-argument __cpuid cannot. */
    #define cpuid(reg, leaf, sub) __cpuidex((int*)(reg), (leaf), (sub))

    #define XASM_LINK(f)

#endif /* _MSC_VER */
wolfSSL 11:cee25a834751 59
wolfSSL 11:cee25a834751 60 #define EAX 0
wolfSSL 11:cee25a834751 61 #define EBX 1
wolfSSL 11:cee25a834751 62 #define ECX 2
wolfSSL 11:cee25a834751 63 #define EDX 3
wolfSSL 11:cee25a834751 64
wolfSSL 11:cee25a834751 65 #define CPUID_AVX1 0x1
wolfSSL 11:cee25a834751 66 #define CPUID_AVX2 0x2
wolfSSL 11:cee25a834751 67 #define CPUID_RDRAND 0x4
wolfSSL 11:cee25a834751 68 #define CPUID_RDSEED 0x8
wolfSSL 11:cee25a834751 69 #define CPUID_BMI2 0x10 /* MULX, RORX */
wolfSSL 11:cee25a834751 70 #define CPUID_ADX 0x20 /* ADCX, ADOX */
wolfSSL 11:cee25a834751 71
wolfSSL 11:cee25a834751 72 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
wolfSSL 11:cee25a834751 73 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
wolfSSL 11:cee25a834751 74 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
wolfSSL 11:cee25a834751 75 #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
wolfSSL 11:cee25a834751 76 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
wolfSSL 11:cee25a834751 77 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
wolfSSL 11:cee25a834751 78 #define SET_FLAGS
wolfSSL 11:cee25a834751 79
wolfSSL 11:cee25a834751 80 static word32 cpuid_check = 0 ;
wolfSSL 11:cee25a834751 81 static word32 cpuid_flags = 0 ;
wolfSSL 11:cee25a834751 82
wolfSSL 11:cee25a834751 83 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
wolfSSL 11:cee25a834751 84 int got_intel_cpu=0;
wolfSSL 11:cee25a834751 85 unsigned int reg[5];
wolfSSL 11:cee25a834751 86
wolfSSL 11:cee25a834751 87 reg[4] = '\0' ;
wolfSSL 11:cee25a834751 88 cpuid(reg, 0, 0);
wolfSSL 11:cee25a834751 89 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 11:cee25a834751 90 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 11:cee25a834751 91 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 11:cee25a834751 92 got_intel_cpu = 1;
wolfSSL 11:cee25a834751 93 }
wolfSSL 11:cee25a834751 94 if (got_intel_cpu) {
wolfSSL 11:cee25a834751 95 cpuid(reg, leaf, sub);
wolfSSL 11:cee25a834751 96 return((reg[num]>>bit)&0x1) ;
wolfSSL 11:cee25a834751 97 }
wolfSSL 11:cee25a834751 98 return 0 ;
wolfSSL 11:cee25a834751 99 }
wolfSSL 11:cee25a834751 100
wolfSSL 11:cee25a834751 101 INLINE static int set_cpuid_flags(void) {
wolfSSL 11:cee25a834751 102 if(cpuid_check == 0) {
wolfSSL 11:cee25a834751 103 if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
wolfSSL 11:cee25a834751 104 if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; }
wolfSSL 11:cee25a834751 105 cpuid_check = 1 ;
wolfSSL 11:cee25a834751 106 return 0 ;
wolfSSL 11:cee25a834751 107 }
wolfSSL 11:cee25a834751 108 return 1 ;
wolfSSL 11:cee25a834751 109 }
wolfSSL 11:cee25a834751 110
wolfSSL 11:cee25a834751 111 #define RETURN return
wolfSSL 11:cee25a834751 112 #define IF_HAVE_INTEL_MULX(func, ret) \
wolfSSL 11:cee25a834751 113 if(cpuid_check==0)set_cpuid_flags() ; \
wolfSSL 11:cee25a834751 114 if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
wolfSSL 11:cee25a834751 115
wolfSSL 11:cee25a834751 116 #else
wolfSSL 11:cee25a834751 117 #define IF_HAVE_INTEL_MULX(func, ret)
wolfSSL 11:cee25a834751 118 #endif
wolfSSL 11:cee25a834751 119
wolfSSL 11:cee25a834751 120 #if defined(TFM_X86) && !defined(TFM_SSE2)
wolfSSL 11:cee25a834751 121 /* x86-32 code */
wolfSSL 11:cee25a834751 122
wolfSSL 11:cee25a834751 123 #define MONT_START
wolfSSL 11:cee25a834751 124 #define MONT_FINI
wolfSSL 11:cee25a834751 125 #define LOOP_END
wolfSSL 11:cee25a834751 126 #define LOOP_START \
wolfSSL 11:cee25a834751 127 mu = c[x] * mp
wolfSSL 11:cee25a834751 128
wolfSSL 11:cee25a834751 129 #define INNERMUL \
wolfSSL 11:cee25a834751 130 __asm__( \
wolfSSL 11:cee25a834751 131 "movl %5,%%eax \n\t" \
wolfSSL 11:cee25a834751 132 "mull %4 \n\t" \
wolfSSL 11:cee25a834751 133 "addl %1,%%eax \n\t" \
wolfSSL 11:cee25a834751 134 "adcl $0,%%edx \n\t" \
wolfSSL 11:cee25a834751 135 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 136 "adcl $0,%%edx \n\t" \
wolfSSL 11:cee25a834751 137 "movl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 138 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 11:cee25a834751 139 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
wolfSSL 11:cee25a834751 140 : "%eax", "%edx", "cc")
wolfSSL 11:cee25a834751 141
wolfSSL 11:cee25a834751 142 #define PROPCARRY \
wolfSSL 11:cee25a834751 143 __asm__( \
wolfSSL 11:cee25a834751 144 "addl %1,%0 \n\t" \
wolfSSL 11:cee25a834751 145 "setb %%al \n\t" \
wolfSSL 11:cee25a834751 146 "movzbl %%al,%1 \n\t" \
wolfSSL 11:cee25a834751 147 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 11:cee25a834751 148 :"0"(_c[LO]), "1"(cy) \
wolfSSL 11:cee25a834751 149 : "%eax", "cc")
wolfSSL 11:cee25a834751 150
wolfSSL 11:cee25a834751 151 /******************************************************************/
wolfSSL 11:cee25a834751 152 #elif defined(TFM_X86_64)
wolfSSL 11:cee25a834751 153 /* x86-64 code */
wolfSSL 11:cee25a834751 154
wolfSSL 11:cee25a834751 155 #define MONT_START
wolfSSL 11:cee25a834751 156 #define MONT_FINI
wolfSSL 11:cee25a834751 157 #define LOOP_END
wolfSSL 11:cee25a834751 158 #define LOOP_START \
wolfSSL 11:cee25a834751 159 mu = c[x] * mp;
wolfSSL 11:cee25a834751 160
wolfSSL 11:cee25a834751 161 #define INNERMUL \
wolfSSL 11:cee25a834751 162 __asm__( \
wolfSSL 11:cee25a834751 163 "movq %5,%%rax \n\t" \
wolfSSL 11:cee25a834751 164 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 165 "addq %1,%%rax \n\t" \
wolfSSL 11:cee25a834751 166 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 167 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 168 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 169 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 170 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 11:cee25a834751 171 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
wolfSSL 11:cee25a834751 172 : "%rax", "%rdx", "cc")
wolfSSL 11:cee25a834751 173
wolfSSL 11:cee25a834751 174 #if defined(HAVE_INTEL_MULX)
wolfSSL 11:cee25a834751 175 #define MULX_INIT(a0, c0, cy)\
wolfSSL 11:cee25a834751 176 __asm__ volatile( \
wolfSSL 11:cee25a834751 177 "xorq %%r10, %%r10\n\t" \
wolfSSL 11:cee25a834751 178 "movq %1,%%rdx\n\t" \
wolfSSL 11:cee25a834751 179 "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \
wolfSSL 11:cee25a834751 180 "adoxq %%r10, %%r10\n\t" /* Reset OF */ \
wolfSSL 11:cee25a834751 181 :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9", "%r10","%r11","%r12","%rdx") ; \
wolfSSL 11:cee25a834751 182
wolfSSL 11:cee25a834751 183 #define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
wolfSSL 11:cee25a834751 184 { \
wolfSSL 11:cee25a834751 185 __asm__ volatile ( \
wolfSSL 11:cee25a834751 186 "movq %3, %%rdx\n\t" \
wolfSSL 11:cee25a834751 187 "mulx %%r11,%%r9, %%r8 \n\t" \
wolfSSL 11:cee25a834751 188 "movq %2, %%r12\n\t" \
wolfSSL 11:cee25a834751 189 "adoxq %%r9,%0 \n\t" \
wolfSSL 11:cee25a834751 190 "adcxq %%r8,%1 \n\t" \
wolfSSL 11:cee25a834751 191 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 11:cee25a834751 192 ); }
wolfSSL 11:cee25a834751 193
wolfSSL 11:cee25a834751 194
wolfSSL 11:cee25a834751 195 #define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
wolfSSL 11:cee25a834751 196 { \
wolfSSL 11:cee25a834751 197 __asm__ volatile ( \
wolfSSL 11:cee25a834751 198 "movq %3, %%rdx\n\t" \
wolfSSL 11:cee25a834751 199 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 11:cee25a834751 200 "movq %2, %%r11\n\t" \
wolfSSL 11:cee25a834751 201 "adoxq %%r9,%0 \n\t" \
wolfSSL 11:cee25a834751 202 "adcxq %%r8,%1 \n\t" \
wolfSSL 11:cee25a834751 203 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 11:cee25a834751 204 ); }
wolfSSL 11:cee25a834751 205
wolfSSL 11:cee25a834751 206 #define MULX_LOAD_R1(val)\
wolfSSL 11:cee25a834751 207 __asm__ volatile ( \
wolfSSL 11:cee25a834751 208 "movq %0, %%r11\n\t"\
wolfSSL 11:cee25a834751 209 ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\
wolfSSL 11:cee25a834751 210 ) ;
wolfSSL 11:cee25a834751 211
wolfSSL 11:cee25a834751 212 #define MULX_INNERMUL_LAST(c0, c1, rdx)\
wolfSSL 11:cee25a834751 213 { \
wolfSSL 11:cee25a834751 214 __asm__ volatile ( \
wolfSSL 11:cee25a834751 215 "movq %2, %%rdx\n\t" \
wolfSSL 11:cee25a834751 216 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 11:cee25a834751 217 "movq $0, %%r10 \n\t" \
wolfSSL 11:cee25a834751 218 "adoxq %%r10, %%r9 \n\t" \
wolfSSL 11:cee25a834751 219 "adcq $0,%%r8 \n\t" \
wolfSSL 11:cee25a834751 220 "addq %%r9,%0 \n\t" \
wolfSSL 11:cee25a834751 221 "adcq $0,%%r8 \n\t" \
wolfSSL 11:cee25a834751 222 "movq %%r8,%1 \n\t" \
wolfSSL 11:cee25a834751 223 :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\
wolfSSL 11:cee25a834751 224 ); }
wolfSSL 11:cee25a834751 225
wolfSSL 11:cee25a834751 226 #define MULX_INNERMUL8(x,y,z,cy)\
wolfSSL 11:cee25a834751 227 { word64 rdx = y ;\
wolfSSL 11:cee25a834751 228 MULX_LOAD_R1(x[0]) ;\
wolfSSL 11:cee25a834751 229 MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
wolfSSL 11:cee25a834751 230 MULX_INNERMUL_R1(_c0, _c1, x[1], rdx) ;\
wolfSSL 11:cee25a834751 231 MULX_INNERMUL_R2(_c1, _c2, x[2], rdx) ;\
wolfSSL 11:cee25a834751 232 MULX_INNERMUL_R1(_c2, _c3, x[3], rdx) ;\
wolfSSL 11:cee25a834751 233 MULX_INNERMUL_R2(_c3, _c4, x[4], rdx) ;\
wolfSSL 11:cee25a834751 234 MULX_INNERMUL_R1(_c4, _c5, x[5], rdx) ;\
wolfSSL 11:cee25a834751 235 MULX_INNERMUL_R2(_c5, _c6, x[6], rdx) ;\
wolfSSL 11:cee25a834751 236 MULX_INNERMUL_R1(_c6, _c7, x[7], rdx) ;\
wolfSSL 11:cee25a834751 237 MULX_INNERMUL_LAST(_c7, cy, rdx) ;\
wolfSSL 11:cee25a834751 238 }
wolfSSL 11:cee25a834751 239 #define INNERMUL8_MULX \
wolfSSL 11:cee25a834751 240 {\
wolfSSL 11:cee25a834751 241 MULX_INNERMUL8(tmpm, mu, _c, cy);\
wolfSSL 11:cee25a834751 242 }
wolfSSL 11:cee25a834751 243 #endif
wolfSSL 11:cee25a834751 244
wolfSSL 11:cee25a834751 245 #define INNERMUL8 \
wolfSSL 11:cee25a834751 246 __asm__( \
wolfSSL 11:cee25a834751 247 "movq 0(%5),%%rax \n\t" \
wolfSSL 11:cee25a834751 248 "movq 0(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 249 "movq 0x8(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 250 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 251 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 252 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 253 "movq 0x8(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 254 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 255 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 256 "movq %%rax,0(%0) \n\t" \
wolfSSL 11:cee25a834751 257 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 258 \
wolfSSL 11:cee25a834751 259 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 260 "movq 0x10(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 261 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 262 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 263 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 264 "movq 0x10(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 265 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 266 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 267 "movq %%rax,0x8(%0) \n\t" \
wolfSSL 11:cee25a834751 268 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 269 \
wolfSSL 11:cee25a834751 270 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 271 "movq 0x18(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 272 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 273 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 274 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 275 "movq 0x18(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 276 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 277 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 278 "movq %%rax,0x10(%0) \n\t" \
wolfSSL 11:cee25a834751 279 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 280 \
wolfSSL 11:cee25a834751 281 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 282 "movq 0x20(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 283 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 284 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 285 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 286 "movq 0x20(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 287 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 288 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 289 "movq %%rax,0x18(%0) \n\t" \
wolfSSL 11:cee25a834751 290 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 291 \
wolfSSL 11:cee25a834751 292 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 293 "movq 0x28(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 294 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 295 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 296 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 297 "movq 0x28(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 298 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 299 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 300 "movq %%rax,0x20(%0) \n\t" \
wolfSSL 11:cee25a834751 301 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 302 \
wolfSSL 11:cee25a834751 303 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 304 "movq 0x30(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 305 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 306 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 307 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 308 "movq 0x30(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 309 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 310 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 311 "movq %%rax,0x28(%0) \n\t" \
wolfSSL 11:cee25a834751 312 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 313 \
wolfSSL 11:cee25a834751 314 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 315 "movq 0x38(%5),%%r11 \n\t" \
wolfSSL 11:cee25a834751 316 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 317 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 318 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 319 "movq 0x38(%2),%%r10 \n\t" \
wolfSSL 11:cee25a834751 320 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 321 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 322 "movq %%rax,0x30(%0) \n\t" \
wolfSSL 11:cee25a834751 323 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 324 \
wolfSSL 11:cee25a834751 325 "movq %%r11,%%rax \n\t" \
wolfSSL 11:cee25a834751 326 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 327 "addq %%r10,%%rax \n\t" \
wolfSSL 11:cee25a834751 328 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 329 "addq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 330 "adcq $0,%%rdx \n\t" \
wolfSSL 11:cee25a834751 331 "movq %%rax,0x38(%0) \n\t" \
wolfSSL 11:cee25a834751 332 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 333 \
wolfSSL 11:cee25a834751 334 :"=r"(_c), "=r"(cy) \
wolfSSL 11:cee25a834751 335 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
wolfSSL 11:cee25a834751 336 : "%rax", "%rdx", "%r10", "%r11", "cc")\
wolfSSL 11:cee25a834751 337
wolfSSL 11:cee25a834751 338 #define PROPCARRY \
wolfSSL 11:cee25a834751 339 __asm__( \
wolfSSL 11:cee25a834751 340 "addq %1,%0 \n\t" \
wolfSSL 11:cee25a834751 341 "setb %%al \n\t" \
wolfSSL 11:cee25a834751 342 "movzbq %%al,%1 \n\t" \
wolfSSL 11:cee25a834751 343 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 11:cee25a834751 344 :"0"(_c[LO]), "1"(cy) \
wolfSSL 11:cee25a834751 345 : "%rax", "cc")
wolfSSL 11:cee25a834751 346
wolfSSL 11:cee25a834751 347 /******************************************************************/
wolfSSL 11:cee25a834751 348 #elif defined(TFM_SSE2)
wolfSSL 11:cee25a834751 349 /* SSE2 code (assumes 32-bit fp_digits) */
wolfSSL 11:cee25a834751 350 /* MMX register assignments:
wolfSSL 11:cee25a834751 351 * mm0 *tmpm++, then Mu * (*tmpm++)
wolfSSL 11:cee25a834751 352 * mm1 c[x], then Mu
wolfSSL 11:cee25a834751 353 * mm2 mp
wolfSSL 11:cee25a834751 354 * mm3 cy
wolfSSL 11:cee25a834751 355 * mm4 _c[LO]
wolfSSL 11:cee25a834751 356 */
wolfSSL 11:cee25a834751 357
wolfSSL 11:cee25a834751 358 #define MONT_START \
wolfSSL 11:cee25a834751 359 __asm__("movd %0,%%mm2"::"g"(mp))
wolfSSL 11:cee25a834751 360
wolfSSL 11:cee25a834751 361 #define MONT_FINI \
wolfSSL 11:cee25a834751 362 __asm__("emms")
wolfSSL 11:cee25a834751 363
wolfSSL 11:cee25a834751 364 #define LOOP_START \
wolfSSL 11:cee25a834751 365 __asm__( \
wolfSSL 11:cee25a834751 366 "movd %0,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 367 "pxor %%mm3,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 368 "pmuludq %%mm2,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 369 :: "g"(c[x]))
wolfSSL 11:cee25a834751 370
wolfSSL 11:cee25a834751 371 /* pmuludq on mmx registers does a 32x32->64 multiply. */
wolfSSL 11:cee25a834751 372 #define INNERMUL \
wolfSSL 11:cee25a834751 373 __asm__( \
wolfSSL 11:cee25a834751 374 "movd %1,%%mm4 \n\t" \
wolfSSL 11:cee25a834751 375 "movd %2,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 376 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 377 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 378 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 379 "movd %%mm3,%0 \n\t" \
wolfSSL 11:cee25a834751 380 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 381 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
wolfSSL 11:cee25a834751 382
wolfSSL 11:cee25a834751 383 #define INNERMUL8 \
wolfSSL 11:cee25a834751 384 __asm__( \
wolfSSL 11:cee25a834751 385 "movd 0(%1),%%mm4 \n\t" \
wolfSSL 11:cee25a834751 386 "movd 0(%2),%%mm0 \n\t" \
wolfSSL 11:cee25a834751 387 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 388 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 389 "movd 4(%2),%%mm5 \n\t" \
wolfSSL 11:cee25a834751 390 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 391 "movd 4(%1),%%mm6 \n\t" \
wolfSSL 11:cee25a834751 392 "movd %%mm3,0(%0) \n\t" \
wolfSSL 11:cee25a834751 393 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 394 \
wolfSSL 11:cee25a834751 395 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 396 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 11:cee25a834751 397 "movd 8(%2),%%mm6 \n\t" \
wolfSSL 11:cee25a834751 398 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 399 "movd 8(%1),%%mm7 \n\t" \
wolfSSL 11:cee25a834751 400 "movd %%mm3,4(%0) \n\t" \
wolfSSL 11:cee25a834751 401 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 402 \
wolfSSL 11:cee25a834751 403 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 404 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 11:cee25a834751 405 "movd 12(%2),%%mm7 \n\t" \
wolfSSL 11:cee25a834751 406 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 407 "movd 12(%1),%%mm5 \n\t" \
wolfSSL 11:cee25a834751 408 "movd %%mm3,8(%0) \n\t" \
wolfSSL 11:cee25a834751 409 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 410 \
wolfSSL 11:cee25a834751 411 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 412 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 11:cee25a834751 413 "movd 16(%2),%%mm5 \n\t" \
wolfSSL 11:cee25a834751 414 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 415 "movd 16(%1),%%mm6 \n\t" \
wolfSSL 11:cee25a834751 416 "movd %%mm3,12(%0) \n\t" \
wolfSSL 11:cee25a834751 417 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 418 \
wolfSSL 11:cee25a834751 419 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 420 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 11:cee25a834751 421 "movd 20(%2),%%mm6 \n\t" \
wolfSSL 11:cee25a834751 422 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 423 "movd 20(%1),%%mm7 \n\t" \
wolfSSL 11:cee25a834751 424 "movd %%mm3,16(%0) \n\t" \
wolfSSL 11:cee25a834751 425 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 426 \
wolfSSL 11:cee25a834751 427 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 428 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 11:cee25a834751 429 "movd 24(%2),%%mm7 \n\t" \
wolfSSL 11:cee25a834751 430 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 431 "movd 24(%1),%%mm5 \n\t" \
wolfSSL 11:cee25a834751 432 "movd %%mm3,20(%0) \n\t" \
wolfSSL 11:cee25a834751 433 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 434 \
wolfSSL 11:cee25a834751 435 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 436 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 11:cee25a834751 437 "movd 28(%2),%%mm5 \n\t" \
wolfSSL 11:cee25a834751 438 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 439 "movd 28(%1),%%mm6 \n\t" \
wolfSSL 11:cee25a834751 440 "movd %%mm3,24(%0) \n\t" \
wolfSSL 11:cee25a834751 441 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 442 \
wolfSSL 11:cee25a834751 443 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 444 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 11:cee25a834751 445 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 11:cee25a834751 446 "movd %%mm3,28(%0) \n\t" \
wolfSSL 11:cee25a834751 447 "psrlq $32, %%mm3 \n\t" \
wolfSSL 11:cee25a834751 448 :"=r"(_c) : "0"(_c), "r"(tmpm) );
wolfSSL 11:cee25a834751 449
wolfSSL 11:cee25a834751 450 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
wolfSSL 11:cee25a834751 451 pointer */
wolfSSL 11:cee25a834751 452
wolfSSL 11:cee25a834751 453 #define LOOP_END \
wolfSSL 11:cee25a834751 454 __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
wolfSSL 11:cee25a834751 455
wolfSSL 11:cee25a834751 456 #define PROPCARRY \
wolfSSL 11:cee25a834751 457 __asm__( \
wolfSSL 11:cee25a834751 458 "addl %1,%0 \n\t" \
wolfSSL 11:cee25a834751 459 "setb %%al \n\t" \
wolfSSL 11:cee25a834751 460 "movzbl %%al,%1 \n\t" \
wolfSSL 11:cee25a834751 461 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 11:cee25a834751 462 :"0"(_c[LO]), "1"(cy) \
wolfSSL 11:cee25a834751 463 : "%eax", "cc")
wolfSSL 11:cee25a834751 464
wolfSSL 11:cee25a834751 465 /******************************************************************/
wolfSSL 11:cee25a834751 466 #elif defined(TFM_ARM)
wolfSSL 11:cee25a834751 467 /* ARMv4 code */
wolfSSL 11:cee25a834751 468
wolfSSL 11:cee25a834751 469 #define MONT_START
wolfSSL 11:cee25a834751 470 #define MONT_FINI
wolfSSL 11:cee25a834751 471 #define LOOP_END
wolfSSL 11:cee25a834751 472 #define LOOP_START \
wolfSSL 11:cee25a834751 473 mu = c[x] * mp
wolfSSL 11:cee25a834751 474
wolfSSL 11:cee25a834751 475
wolfSSL 11:cee25a834751 476 #ifdef __thumb__
wolfSSL 11:cee25a834751 477
wolfSSL 11:cee25a834751 478 #define INNERMUL \
wolfSSL 11:cee25a834751 479 __asm__( \
wolfSSL 11:cee25a834751 480 " LDR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 481 " ADDS r0,r0,%0 \n\t" \
wolfSSL 11:cee25a834751 482 " ITE CS \n\t" \
wolfSSL 11:cee25a834751 483 " MOVCS %0,#1 \n\t" \
wolfSSL 11:cee25a834751 484 " MOVCC %0,#0 \n\t" \
wolfSSL 11:cee25a834751 485 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 486 " STR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 487 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
wolfSSL 11:cee25a834751 488
wolfSSL 11:cee25a834751 489 #define PROPCARRY \
wolfSSL 11:cee25a834751 490 __asm__( \
wolfSSL 11:cee25a834751 491 " LDR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 492 " ADDS r0,r0,%0 \n\t" \
wolfSSL 11:cee25a834751 493 " STR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 494 " ITE CS \n\t" \
wolfSSL 11:cee25a834751 495 " MOVCS %0,#1 \n\t" \
wolfSSL 11:cee25a834751 496 " MOVCC %0,#0 \n\t" \
wolfSSL 11:cee25a834751 497 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
wolfSSL 11:cee25a834751 498
wolfSSL 11:cee25a834751 499
wolfSSL 11:cee25a834751 500 /* TAO thumb mode uses ite (if then else) to detect carry directly
wolfSSL 11:cee25a834751 501 * fixed unmatched constraint warning by changing 1 to m */
wolfSSL 11:cee25a834751 502
wolfSSL 11:cee25a834751 503 #else /* __thumb__ */
wolfSSL 11:cee25a834751 504
wolfSSL 11:cee25a834751 505 #define INNERMUL \
wolfSSL 11:cee25a834751 506 __asm__( \
wolfSSL 11:cee25a834751 507 " LDR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 508 " ADDS r0,r0,%0 \n\t" \
wolfSSL 11:cee25a834751 509 " MOVCS %0,#1 \n\t" \
wolfSSL 11:cee25a834751 510 " MOVCC %0,#0 \n\t" \
wolfSSL 11:cee25a834751 511 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 512 " STR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 513 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
wolfSSL 11:cee25a834751 514
wolfSSL 11:cee25a834751 515 #define PROPCARRY \
wolfSSL 11:cee25a834751 516 __asm__( \
wolfSSL 11:cee25a834751 517 " LDR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 518 " ADDS r0,r0,%0 \n\t" \
wolfSSL 11:cee25a834751 519 " STR r0,%1 \n\t" \
wolfSSL 11:cee25a834751 520 " MOVCS %0,#1 \n\t" \
wolfSSL 11:cee25a834751 521 " MOVCC %0,#0 \n\t" \
wolfSSL 11:cee25a834751 522 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
wolfSSL 11:cee25a834751 523
wolfSSL 11:cee25a834751 524 #endif /* __thumb__ */
wolfSSL 11:cee25a834751 525
wolfSSL 11:cee25a834751 526 #elif defined(TFM_PPC32)
wolfSSL 11:cee25a834751 527
wolfSSL 11:cee25a834751 528 /* PPC32 */
wolfSSL 11:cee25a834751 529 #define MONT_START
wolfSSL 11:cee25a834751 530 #define MONT_FINI
wolfSSL 11:cee25a834751 531 #define LOOP_END
wolfSSL 11:cee25a834751 532 #define LOOP_START \
wolfSSL 11:cee25a834751 533 mu = c[x] * mp
wolfSSL 11:cee25a834751 534
wolfSSL 11:cee25a834751 535 #define INNERMUL \
wolfSSL 11:cee25a834751 536 __asm__( \
wolfSSL 11:cee25a834751 537 " mullw 16,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 538 " mulhwu 17,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 539 " addc 16,16,%2 \n\t" \
wolfSSL 11:cee25a834751 540 " addze 17,17 \n\t" \
wolfSSL 11:cee25a834751 541 " addc %1,16,%5 \n\t" \
wolfSSL 11:cee25a834751 542 " addze %0,17 \n\t" \
wolfSSL 11:cee25a834751 543 :"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
wolfSSL 11:cee25a834751 544
wolfSSL 11:cee25a834751 545 #define PROPCARRY \
wolfSSL 11:cee25a834751 546 __asm__( \
wolfSSL 11:cee25a834751 547 " addc %1,%3,%2 \n\t" \
wolfSSL 11:cee25a834751 548 " xor %0,%2,%2 \n\t" \
wolfSSL 11:cee25a834751 549 " addze %0,%2 \n\t" \
wolfSSL 11:cee25a834751 550 :"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
wolfSSL 11:cee25a834751 551
wolfSSL 11:cee25a834751 552 #elif defined(TFM_PPC64)
wolfSSL 11:cee25a834751 553
wolfSSL 11:cee25a834751 554 /* PPC64 */
wolfSSL 11:cee25a834751 555 #define MONT_START
wolfSSL 11:cee25a834751 556 #define MONT_FINI
wolfSSL 11:cee25a834751 557 #define LOOP_END
wolfSSL 11:cee25a834751 558 #define LOOP_START \
wolfSSL 11:cee25a834751 559 mu = c[x] * mp
wolfSSL 11:cee25a834751 560
wolfSSL 11:cee25a834751 561 #define INNERMUL \
wolfSSL 11:cee25a834751 562 __asm__( \
wolfSSL 11:cee25a834751 563 " mulld 16,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 564 " mulhdu 17,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 565 " addc 16,16,%0 \n\t" \
wolfSSL 11:cee25a834751 566 " addze 17,17 \n\t" \
wolfSSL 11:cee25a834751 567 " ldx 18,0,%1 \n\t" \
wolfSSL 11:cee25a834751 568 " addc 16,16,18 \n\t" \
wolfSSL 11:cee25a834751 569 " addze %0,17 \n\t" \
wolfSSL 11:cee25a834751 570 " sdx 16,0,%1 \n\t" \
wolfSSL 11:cee25a834751 571 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
wolfSSL 11:cee25a834751 572
wolfSSL 11:cee25a834751 573 #define PROPCARRY \
wolfSSL 11:cee25a834751 574 __asm__( \
wolfSSL 11:cee25a834751 575 " ldx 16,0,%1 \n\t" \
wolfSSL 11:cee25a834751 576 " addc 16,16,%0 \n\t" \
wolfSSL 11:cee25a834751 577 " sdx 16,0,%1 \n\t" \
wolfSSL 11:cee25a834751 578 " xor %0,%0,%0 \n\t" \
wolfSSL 11:cee25a834751 579 " addze %0,%0 \n\t" \
wolfSSL 11:cee25a834751 580 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
wolfSSL 11:cee25a834751 581
wolfSSL 11:cee25a834751 582 /******************************************************************/
wolfSSL 11:cee25a834751 583
wolfSSL 11:cee25a834751 584 #elif defined(TFM_AVR32)
wolfSSL 11:cee25a834751 585
wolfSSL 11:cee25a834751 586 /* AVR32 */
wolfSSL 11:cee25a834751 587 #define MONT_START
wolfSSL 11:cee25a834751 588 #define MONT_FINI
wolfSSL 11:cee25a834751 589 #define LOOP_END
wolfSSL 11:cee25a834751 590 #define LOOP_START \
wolfSSL 11:cee25a834751 591 mu = c[x] * mp
wolfSSL 11:cee25a834751 592
wolfSSL 11:cee25a834751 593 #define INNERMUL \
wolfSSL 11:cee25a834751 594 __asm__( \
wolfSSL 11:cee25a834751 595 " ld.w r2,%1 \n\t" \
wolfSSL 11:cee25a834751 596 " add r2,%0 \n\t" \
wolfSSL 11:cee25a834751 597 " eor r3,r3 \n\t" \
wolfSSL 11:cee25a834751 598 " acr r3 \n\t" \
wolfSSL 11:cee25a834751 599 " macu.d r2,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 600 " st.w %1,r2 \n\t" \
wolfSSL 11:cee25a834751 601 " mov %0,r3 \n\t" \
wolfSSL 11:cee25a834751 602 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
wolfSSL 11:cee25a834751 603
wolfSSL 11:cee25a834751 604 #define PROPCARRY \
wolfSSL 11:cee25a834751 605 __asm__( \
wolfSSL 11:cee25a834751 606 " ld.w r2,%1 \n\t" \
wolfSSL 11:cee25a834751 607 " add r2,%0 \n\t" \
wolfSSL 11:cee25a834751 608 " st.w %1,r2 \n\t" \
wolfSSL 11:cee25a834751 609 " eor %0,%0 \n\t" \
wolfSSL 11:cee25a834751 610 " acr %0 \n\t" \
wolfSSL 11:cee25a834751 611 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
wolfSSL 11:cee25a834751 612
wolfSSL 11:cee25a834751 613 #else
wolfSSL 11:cee25a834751 614
/* ISO C code */

/* These hooks need no work in the portable build. */
#define MONT_START
#define MONT_FINI
#define LOOP_END

/* Row multiplier: mu = c[x] * mp.  No trailing semicolon is supplied; c, x,
 * mp and mu are taken from the expansion site. */
#define LOOP_START \
   mu = c[x] * mp

/* Inner step: _c[0] + cy + mu * (*tmpm) is formed in an fp_word; the low
 * digit is written back to _c[0] and the part above DIGIT_BIT becomes the
 * new cy.  tmpm is post-incremented exactly once per expansion. */
#define INNERMUL                                                      \
   do {                                                               \
       const fp_word prod = ((fp_word)mu) * ((fp_word)*tmpm++);       \
       const fp_word acc  = ((fp_word)_c[0] + (fp_word)cy) + prod;    \
       _c[0] = (fp_digit)acc;                                         \
       cy    = (fp_digit)(acc >> DIGIT_BIT);                          \
   } while (0)

/* Carry propagation: adds cy into _c[0]; cy becomes 1 when the digit
 * wrapped around (the sum is smaller than the carry added) and 0 otherwise. */
#define PROPCARRY                                                     \
   do {                                                               \
       const fp_digit sum = (_c[0] += cy);                            \
       cy = (sum < cy);                                               \
   } while (0)
wolfSSL 11:cee25a834751 632
wolfSSL 11:cee25a834751 633 #endif
wolfSSL 11:cee25a834751 634 /******************************************************************/
wolfSSL 11:cee25a834751 635
wolfSSL 11:cee25a834751 636
/* LO is the constant 0. NOTE(review): its users are outside this chunk - confirm what it selects or indexes. */
wolfSSL 11:cee25a834751 637 #define LO 0
wolfSSL 11:cee25a834751 638 /* end fp_montgomery_reduce.c asm */
wolfSSL 11:cee25a834751 639
wolfSSL 11:cee25a834751 640
wolfSSL 11:cee25a834751 641 /* start fp_sqr_comba.c asm */
wolfSSL 11:cee25a834751 642 #if defined(TFM_X86)
wolfSSL 11:cee25a834751 643
wolfSSL 11:cee25a834751 644 /* x86-32 optimized */
/* Squaring comba macros for TFM_X86.  They operate on the variables c0, c1, c2 and sc0, sc1, sc2 of the expansion site; every macro that expands to code already ends in ';'. */
wolfSSL 11:cee25a834751 645
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 646 #define COMBA_START
wolfSSL 11:cee25a834751 647
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 648 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 649 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 650
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 651 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 652 x = c0;
wolfSSL 11:cee25a834751 653
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 654 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 655 x = c1;
wolfSSL 11:cee25a834751 656
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 657 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 658 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 659
/* COMBA_FINI: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 660 #define COMBA_FINI
wolfSSL 11:cee25a834751 661
/* SQRADD(i, j): loads i into eax, multiplies eax by itself and adds the edx:eax result into c0/c1 with the carry going into c2.  Only i is an asm operand; j is not referenced. */
wolfSSL 11:cee25a834751 662 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 663 __asm__( \
wolfSSL 11:cee25a834751 664 "movl %6,%%eax \n\t" \
wolfSSL 11:cee25a834751 665 "mull %%eax \n\t" \
wolfSSL 11:cee25a834751 666 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 667 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 668 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 669 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 670
/* SQRADD2(i, j): multiplies i by j once and adds the edx:eax product into c2:c1:c0 twice. */
wolfSSL 11:cee25a834751 671 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 672 __asm__( \
wolfSSL 11:cee25a834751 673 "movl %6,%%eax \n\t" \
wolfSSL 11:cee25a834751 674 "mull %7 \n\t" \
wolfSSL 11:cee25a834751 675 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 676 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 677 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 678 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 679 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 680 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 681 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");
wolfSSL 11:cee25a834751 682
/* SQRADDSC(i, j): stores the low and high words of i * j in sc0 and sc1 and zeroes sc2; sc0..sc2 are outputs only. */
wolfSSL 11:cee25a834751 683 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 684 __asm__( \
wolfSSL 11:cee25a834751 685 "movl %3,%%eax \n\t" \
wolfSSL 11:cee25a834751 686 "mull %4 \n\t" \
wolfSSL 11:cee25a834751 687 "movl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 688 "movl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 689 "xorl %2,%2 \n\t" \
wolfSSL 11:cee25a834751 690 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 691
wolfSSL 11:cee25a834751 692 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 11:cee25a834751 693
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 694 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 695 __asm__( \
wolfSSL 11:cee25a834751 696 "movl %6,%%eax \n\t" \
wolfSSL 11:cee25a834751 697 "mull %7 \n\t" \
wolfSSL 11:cee25a834751 698 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 699 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 700 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 701 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 702
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice, propagating the carry between words. */
wolfSSL 11:cee25a834751 703 #define SQRADDDB \
wolfSSL 11:cee25a834751 704 __asm__( \
wolfSSL 11:cee25a834751 705 "addl %6,%0 \n\t" \
wolfSSL 11:cee25a834751 706 "adcl %7,%1 \n\t" \
wolfSSL 11:cee25a834751 707 "adcl %8,%2 \n\t" \
wolfSSL 11:cee25a834751 708 "addl %6,%0 \n\t" \
wolfSSL 11:cee25a834751 709 "adcl %7,%1 \n\t" \
wolfSSL 11:cee25a834751 710 "adcl %8,%2 \n\t" \
wolfSSL 11:cee25a834751 711 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 11:cee25a834751 712
wolfSSL 11:cee25a834751 713 #elif defined(TFM_X86_64)
wolfSSL 11:cee25a834751 714 /* x86-64 optimized */
/* Squaring comba macros for TFM_X86_64.  They operate on the variables c0, c1, c2 and sc0, sc1, sc2 of the expansion site; every macro that expands to code already ends in ';'. */
wolfSSL 11:cee25a834751 715
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 716 #define COMBA_START
wolfSSL 11:cee25a834751 717
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 718 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 719 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 720
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 721 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 722 x = c0;
wolfSSL 11:cee25a834751 723
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 724 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 725 x = c1;
wolfSSL 11:cee25a834751 726
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 727 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 728 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 729
/* COMBA_FINI: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 730 #define COMBA_FINI
wolfSSL 11:cee25a834751 731
/* SQRADD(i, j): loads i into rax, multiplies rax by itself and adds the rdx:rax result into c0/c1 with the carry going into c2.  Only i is an asm operand; j is not referenced.  NOTE(review): i uses the "x" constraint while the sibling macros use "g" - confirm this is intended. */
wolfSSL 11:cee25a834751 732 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 733 __asm__( \
wolfSSL 11:cee25a834751 734 "movq %6,%%rax \n\t" \
wolfSSL 11:cee25a834751 735 "mulq %%rax \n\t" \
wolfSSL 11:cee25a834751 736 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 737 "adcq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 738 "adcq $0,%2 \n\t" \
wolfSSL 11:cee25a834751 739 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
wolfSSL 11:cee25a834751 740
/* SQRADD2(i, j): multiplies i by j once and adds the rdx:rax product into c2:c1:c0 twice. */
wolfSSL 11:cee25a834751 741 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 742 __asm__( \
wolfSSL 11:cee25a834751 743 "movq %6,%%rax \n\t" \
wolfSSL 11:cee25a834751 744 "mulq %7 \n\t" \
wolfSSL 11:cee25a834751 745 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 746 "adcq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 747 "adcq $0,%2 \n\t" \
wolfSSL 11:cee25a834751 748 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 749 "adcq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 750 "adcq $0,%2 \n\t" \
wolfSSL 11:cee25a834751 751 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 11:cee25a834751 752
/* SQRADDSC(i, j): stores the low and high words of i * j in sc0 and sc1 and zeroes sc2; sc0..sc2 are outputs only. */
wolfSSL 11:cee25a834751 753 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 754 __asm__( \
wolfSSL 11:cee25a834751 755 "movq %3,%%rax \n\t" \
wolfSSL 11:cee25a834751 756 "mulq %4 \n\t" \
wolfSSL 11:cee25a834751 757 "movq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 758 "movq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 759 "xorq %2,%2 \n\t" \
wolfSSL 11:cee25a834751 760 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 11:cee25a834751 761
wolfSSL 11:cee25a834751 762 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 11:cee25a834751 763
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 764 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 765 __asm__( \
wolfSSL 11:cee25a834751 766 "movq %6,%%rax \n\t" \
wolfSSL 11:cee25a834751 767 "mulq %7 \n\t" \
wolfSSL 11:cee25a834751 768 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 769 "adcq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 770 "adcq $0,%2 \n\t" \
wolfSSL 11:cee25a834751 771 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 11:cee25a834751 772
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice, propagating the carry between words. */
wolfSSL 11:cee25a834751 773 #define SQRADDDB \
wolfSSL 11:cee25a834751 774 __asm__( \
wolfSSL 11:cee25a834751 775 "addq %6,%0 \n\t" \
wolfSSL 11:cee25a834751 776 "adcq %7,%1 \n\t" \
wolfSSL 11:cee25a834751 777 "adcq %8,%2 \n\t" \
wolfSSL 11:cee25a834751 778 "addq %6,%0 \n\t" \
wolfSSL 11:cee25a834751 779 "adcq %7,%1 \n\t" \
wolfSSL 11:cee25a834751 780 "adcq %8,%2 \n\t" \
wolfSSL 11:cee25a834751 781 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 11:cee25a834751 782
wolfSSL 11:cee25a834751 783 #elif defined(TFM_SSE2)
wolfSSL 11:cee25a834751 784
wolfSSL 11:cee25a834751 785 /* SSE2 Optimized */
/* Squaring comba macros for TFM_SSE2.  Products are formed with pmuludq in mm0/mm1 and then moved to general registers; the asm statements do not list mm0/mm1 as clobbers.  Every macro that expands to code already ends in ';'. */
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 786 #define COMBA_START
wolfSSL 11:cee25a834751 787
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 788 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 789 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 790
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 791 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 792 x = c0;
wolfSSL 11:cee25a834751 793
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 794 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 795 x = c1;
wolfSSL 11:cee25a834751 796
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 797 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 798 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 799
/* COMBA_FINI: issues the emms instruction at the end of a comba pass. */
wolfSSL 11:cee25a834751 800 #define COMBA_FINI \
wolfSSL 11:cee25a834751 801 __asm__("emms");
wolfSSL 11:cee25a834751 802
/* SQRADD(i, j): squares i in mm0 and adds the two 32-bit halves of the result into c0/c1 (via eax) with the carry going into c2.  Only i is an asm operand; j is not referenced. */
wolfSSL 11:cee25a834751 803 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 804 __asm__( \
wolfSSL 11:cee25a834751 805 "movd %6,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 806 "pmuludq %%mm0,%%mm0\n\t" \
wolfSSL 11:cee25a834751 807 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 808 "psrlq $32,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 809 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 810 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 811 "adcl %%eax,%1 \n\t" \
wolfSSL 11:cee25a834751 812 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 813 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
wolfSSL 11:cee25a834751 814
/* SQRADD2(i, j): multiplies i by j once (low half in eax, high half in edx) and adds that product into c2:c1:c0 twice. */
wolfSSL 11:cee25a834751 815 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 816 __asm__( \
wolfSSL 11:cee25a834751 817 "movd %6,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 818 "movd %7,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 819 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 11:cee25a834751 820 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 821 "psrlq $32,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 822 "movd %%mm0,%%edx \n\t" \
wolfSSL 11:cee25a834751 823 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 824 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 825 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 826 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 827 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 828 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 829 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 830
/* SQRADDSC(i, j): stores the low and high halves of i * j in sc0 and sc1 and zeroes sc2; sc0..sc2 are outputs only.  This statement has no clobber list. */
wolfSSL 11:cee25a834751 831 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 832 __asm__( \
wolfSSL 11:cee25a834751 833 "movd %3,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 834 "movd %4,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 835 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 11:cee25a834751 836 "movd %%mm0,%0 \n\t" \
wolfSSL 11:cee25a834751 837 "psrlq $32,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 838 "movd %%mm0,%1 \n\t" \
wolfSSL 11:cee25a834751 839 "xorl %2,%2 \n\t" \
wolfSSL 11:cee25a834751 840 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
wolfSSL 11:cee25a834751 841
wolfSSL 11:cee25a834751 842 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 11:cee25a834751 843
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 844 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 845 __asm__( \
wolfSSL 11:cee25a834751 846 "movd %6,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 847 "movd %7,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 848 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 11:cee25a834751 849 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 850 "psrlq $32,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 851 "movd %%mm0,%%edx \n\t" \
wolfSSL 11:cee25a834751 852 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 853 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 854 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 855 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 856
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice, propagating the carry between words. */
wolfSSL 11:cee25a834751 857 #define SQRADDDB \
wolfSSL 11:cee25a834751 858 __asm__( \
wolfSSL 11:cee25a834751 859 "addl %6,%0 \n\t" \
wolfSSL 11:cee25a834751 860 "adcl %7,%1 \n\t" \
wolfSSL 11:cee25a834751 861 "adcl %8,%2 \n\t" \
wolfSSL 11:cee25a834751 862 "addl %6,%0 \n\t" \
wolfSSL 11:cee25a834751 863 "adcl %7,%1 \n\t" \
wolfSSL 11:cee25a834751 864 "adcl %8,%2 \n\t" \
wolfSSL 11:cee25a834751 865 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 11:cee25a834751 866
wolfSSL 11:cee25a834751 867 #elif defined(TFM_ARM)
wolfSSL 11:cee25a834751 868
wolfSSL 11:cee25a834751 869 /* ARM code */
/* Squaring comba macros for TFM_ARM.  r0/r1 are used as scratch registers and declared as clobbers where used; every macro that expands to code already ends in ';'. */
wolfSSL 11:cee25a834751 870
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 871 #define COMBA_START
wolfSSL 11:cee25a834751 872
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 873 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 874 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 875
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 876 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 877 x = c0;
wolfSSL 11:cee25a834751 878
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 879 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 880 x = c1;
wolfSSL 11:cee25a834751 881
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 882 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 883 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 884
/* COMBA_FINI: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 885 #define COMBA_FINI
wolfSSL 11:cee25a834751 886
wolfSSL 11:cee25a834751 887 /* SQRADD(i, j): multiplies i by itself (UMULL into r0/r1) and adds the product into c0/c1, carrying into c2; j is not an asm operand */
wolfSSL 11:cee25a834751 888 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 889 __asm__( \
wolfSSL 11:cee25a834751 890 " UMULL r0,r1,%6,%6 \n\t" \
wolfSSL 11:cee25a834751 891 " ADDS %0,%0,r0 \n\t" \
wolfSSL 11:cee25a834751 892 " ADCS %1,%1,r1 \n\t" \
wolfSSL 11:cee25a834751 893 " ADC %2,%2,#0 \n\t" \
wolfSSL 11:cee25a834751 894 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
wolfSSL 11:cee25a834751 895
wolfSSL 11:cee25a834751 896 /* SQRADD2(i, j): for squaring some of the terms are doubled; multiplies i by j once and adds the product into c2:c1:c0 twice */
wolfSSL 11:cee25a834751 897 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 898 __asm__( \
wolfSSL 11:cee25a834751 899 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 900 " ADDS %0,%0,r0 \n\t" \
wolfSSL 11:cee25a834751 901 " ADCS %1,%1,r1 \n\t" \
wolfSSL 11:cee25a834751 902 " ADC %2,%2,#0 \n\t" \
wolfSSL 11:cee25a834751 903 " ADDS %0,%0,r0 \n\t" \
wolfSSL 11:cee25a834751 904 " ADCS %1,%1,r1 \n\t" \
wolfSSL 11:cee25a834751 905 " ADC %2,%2,#0 \n\t" \
wolfSSL 11:cee25a834751 906 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 11:cee25a834751 907
/* SQRADDSC(i, j): writes the low and high words of i * j to sc0 and sc1 and zeroes sc2 (SUB of sc2 from itself); sc0..sc2 are outputs only. */
wolfSSL 11:cee25a834751 908 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 909 __asm__( \
wolfSSL 11:cee25a834751 910 " UMULL %0,%1,%3,%4 \n\t" \
wolfSSL 11:cee25a834751 911 " SUB %2,%2,%2 \n\t" \
wolfSSL 11:cee25a834751 912 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
wolfSSL 11:cee25a834751 913
wolfSSL 11:cee25a834751 914 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
wolfSSL 11:cee25a834751 915
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 916 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 917 __asm__( \
wolfSSL 11:cee25a834751 918 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 919 " ADDS %0,%0,r0 \n\t" \
wolfSSL 11:cee25a834751 920 " ADCS %1,%1,r1 \n\t" \
wolfSSL 11:cee25a834751 921 " ADC %2,%2,#0 \n\t" \
wolfSSL 11:cee25a834751 922 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 11:cee25a834751 923
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice.  Here sc0..sc2 are operands %3..%5 and the tied c0..c2 inputs come last. */
wolfSSL 11:cee25a834751 924 #define SQRADDDB \
wolfSSL 11:cee25a834751 925 __asm__( \
wolfSSL 11:cee25a834751 926 " ADDS %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 927 " ADCS %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 928 " ADC %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 929 " ADDS %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 930 " ADCS %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 931 " ADC %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 932 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 11:cee25a834751 933
wolfSSL 11:cee25a834751 934 #elif defined(TFM_PPC32)
wolfSSL 11:cee25a834751 935
wolfSSL 11:cee25a834751 936 /* PPC32 */
/* Squaring comba macros for TFM_PPC32.  Registers 16 and 17 are used as scratch and declared as clobbers where used; every macro that expands to code already ends in ';'. */
wolfSSL 11:cee25a834751 937
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 938 #define COMBA_START
wolfSSL 11:cee25a834751 939
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 940 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 941 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 942
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 943 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 944 x = c0;
wolfSSL 11:cee25a834751 945
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 946 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 947 x = c1;
wolfSSL 11:cee25a834751 948
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 949 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 950 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 951
/* COMBA_FINI: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 952 #define COMBA_FINI
wolfSSL 11:cee25a834751 953
wolfSSL 11:cee25a834751 954 /* SQRADD(i, j): multiplies i by itself (mullw/mulhwu into register 16) and adds the low and high words into c0/c1, carrying into c2; j is not an asm operand */
wolfSSL 11:cee25a834751 955 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 956 __asm__( \
wolfSSL 11:cee25a834751 957 " mullw 16,%6,%6 \n\t" \
wolfSSL 11:cee25a834751 958 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 959 " mulhwu 16,%6,%6 \n\t" \
wolfSSL 11:cee25a834751 960 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 961 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 962 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 11:cee25a834751 963
wolfSSL 11:cee25a834751 964 /* SQRADD2(i, j): for squaring some of the terms are doubled; multiplies i by j once (registers 16/17) and adds the product into c2:c1:c0 twice */
wolfSSL 11:cee25a834751 965 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 966 __asm__( \
wolfSSL 11:cee25a834751 967 " mullw 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 968 " mulhwu 17,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 969 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 970 " adde %1,%1,17 \n\t" \
wolfSSL 11:cee25a834751 971 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 972 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 973 " adde %1,%1,17 \n\t" \
wolfSSL 11:cee25a834751 974 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 975 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 11:cee25a834751 976
/* SQRADDSC(i, j): writes the low and high words of i * j to sc0 and sc1 and zeroes sc2 (xor with itself).  NOTE(review): unlike the x86/SSE2/ARM variants, sc0..sc2 are still listed as tied inputs here although their previous values are overwritten. */
wolfSSL 11:cee25a834751 977 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 978 __asm__( \
wolfSSL 11:cee25a834751 979 " mullw %0,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 980 " mulhwu %1,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 981 " xor %2,%2,%2 \n\t" \
wolfSSL 11:cee25a834751 982 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 11:cee25a834751 983
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 984 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 985 __asm__( \
wolfSSL 11:cee25a834751 986 " mullw 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 987 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 988 " mulhwu 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 989 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 990 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 991 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 11:cee25a834751 992
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice.  Here sc0..sc2 are operands %3..%5 and the tied c0..c2 inputs come last. */
wolfSSL 11:cee25a834751 993 #define SQRADDDB \
wolfSSL 11:cee25a834751 994 __asm__( \
wolfSSL 11:cee25a834751 995 " addc %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 996 " adde %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 997 " adde %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 998 " addc %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 999 " adde %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 1000 " adde %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 1001 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 11:cee25a834751 1002
wolfSSL 11:cee25a834751 1003 #elif defined(TFM_PPC64)
wolfSSL 11:cee25a834751 1004 /* PPC64 */
/* Squaring comba macros for TFM_PPC64 (mulld/mulhdu instead of the 32-bit multiplies).  Registers 16 and 17 are used as scratch and declared as clobbers where used; every macro that expands to code already ends in ';'. */
wolfSSL 11:cee25a834751 1005
/* COMBA_START: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 1006 #define COMBA_START
wolfSSL 11:cee25a834751 1007
/* CLEAR_CARRY: zeroes c0, c1 and c2. */
wolfSSL 11:cee25a834751 1008 #define CLEAR_CARRY \
wolfSSL 11:cee25a834751 1009 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1010
/* COMBA_STORE(x): assigns c0 to x. */
wolfSSL 11:cee25a834751 1011 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1012 x = c0;
wolfSSL 11:cee25a834751 1013
/* COMBA_STORE2(x): assigns c1 to x. */
wolfSSL 11:cee25a834751 1014 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1015 x = c1;
wolfSSL 11:cee25a834751 1016
/* CARRY_FORWARD: moves c1 into c0 and c2 into c1, then clears c2. */
wolfSSL 11:cee25a834751 1017 #define CARRY_FORWARD \
wolfSSL 11:cee25a834751 1018 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1019
/* COMBA_FINI: expands to nothing in this variant. */
wolfSSL 11:cee25a834751 1020 #define COMBA_FINI
wolfSSL 11:cee25a834751 1021
wolfSSL 11:cee25a834751 1022 /* SQRADD(i, j): multiplies i by itself (mulld/mulhdu into register 16) and adds the low and high words into c0/c1, carrying into c2; j is not an asm operand */
wolfSSL 11:cee25a834751 1023 #define SQRADD(i, j) \
wolfSSL 11:cee25a834751 1024 __asm__( \
wolfSSL 11:cee25a834751 1025 " mulld 16,%6,%6 \n\t" \
wolfSSL 11:cee25a834751 1026 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1027 " mulhdu 16,%6,%6 \n\t" \
wolfSSL 11:cee25a834751 1028 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 1029 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1030 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 11:cee25a834751 1031
wolfSSL 11:cee25a834751 1032 /* SQRADD2(i, j): for squaring some of the terms are doubled; multiplies i by j once (registers 16/17) and adds the product into c2:c1:c0 twice */
wolfSSL 11:cee25a834751 1033 #define SQRADD2(i, j) \
wolfSSL 11:cee25a834751 1034 __asm__( \
wolfSSL 11:cee25a834751 1035 " mulld 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1036 " mulhdu 17,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1037 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1038 " adde %1,%1,17 \n\t" \
wolfSSL 11:cee25a834751 1039 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1040 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1041 " adde %1,%1,17 \n\t" \
wolfSSL 11:cee25a834751 1042 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1043 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 11:cee25a834751 1044
/* SQRADDSC(i, j): writes the low and high words of i * j to sc0 and sc1 and zeroes sc2 (xor with itself).  NOTE(review): unlike the x86/SSE2/ARM variants, sc0..sc2 are still listed as tied inputs here although their previous values are overwritten. */
wolfSSL 11:cee25a834751 1045 #define SQRADDSC(i, j) \
wolfSSL 11:cee25a834751 1046 __asm__( \
wolfSSL 11:cee25a834751 1047 " mulld %0,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1048 " mulhdu %1,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1049 " xor %2,%2,%2 \n\t" \
wolfSSL 11:cee25a834751 1050 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 11:cee25a834751 1051
/* SQRADDAC(i, j): adds the product i * j into sc2:sc1:sc0. */
wolfSSL 11:cee25a834751 1052 #define SQRADDAC(i, j) \
wolfSSL 11:cee25a834751 1053 __asm__( \
wolfSSL 11:cee25a834751 1054 " mulld 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1055 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1056 " mulhdu 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1057 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 1058 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1059 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 11:cee25a834751 1060
/* SQRADDDB: adds sc2:sc1:sc0 into c2:c1:c0 twice.  Here sc0..sc2 are operands %3..%5 and the tied c0..c2 inputs come last. */
wolfSSL 11:cee25a834751 1061 #define SQRADDDB \
wolfSSL 11:cee25a834751 1062 __asm__( \
wolfSSL 11:cee25a834751 1063 " addc %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 1064 " adde %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 1065 " adde %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 1066 " addc %0,%0,%3 \n\t" \
wolfSSL 11:cee25a834751 1067 " adde %1,%1,%4 \n\t" \
wolfSSL 11:cee25a834751 1068 " adde %2,%2,%5 \n\t" \
wolfSSL 11:cee25a834751 1069 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 11:cee25a834751 1070
wolfSSL 11:cee25a834751 1071
wolfSSL 11:cee25a834751 1072 #elif defined(TFM_AVR32)
wolfSSL 11:cee25a834751 1073
/* AVR32 */

/*
 * Squaring comba macros for TFM_AVR32.  They operate on the accumulator
 * variables c0, c1, c2 and the secondary accumulator sc0, sc1, sc2 of the
 * expansion site.  r2/r3 are used as scratch registers.  Every asm statement
 * that uses add/adc/acr/eor declares "cc" as a clobber because those
 * instructions change the status flags.  Every macro that expands to code
 * already ends in ';'.
 */

/* Nothing to do at the start of a comba pass. */
#define COMBA_START

/* Zero the three accumulator words. */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the lowest accumulator word into x. */
#define COMBA_STORE(x) \
   x = c0;

/* Store the middle accumulator word into x. */
#define COMBA_STORE2(x) \
   x = c1;

/* Shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0. */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to do at the end of a comba pass. */
#define COMBA_FINI

/* Multiplies i by itself and adds the r3:r2 product into c0/c1, carrying
 * into c2.  Only i is an asm operand; j is not referenced. */
#define SQRADD(i, j)              \
__asm__(                          \
   " mulu.d r2,%6,%6        \n\t" \
   " add    %0,%0,r2        \n\t" \
   " adc    %1,%1,r3        \n\t" \
   " acr    %2              \n\t" \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3","cc");

/* for squaring some of the terms are doubled... */
/* Multiplies i by j once and adds the r3:r2 product into c2:c1:c0 twice.
 * The carry step is "acr %2" with a single operand, as in SQRADD and
 * SQRADDAC (the previous text had a stray trailing comma after %2). */
#define SQRADD2(i, j)             \
__asm__(                          \
   " mulu.d r2,%6,%7        \n\t" \
   " add    %0,%0,r2        \n\t" \
   " adc    %1,%1,r3        \n\t" \
   " acr    %2              \n\t" \
   " add    %0,%0,r2        \n\t" \
   " adc    %1,%1,r3        \n\t" \
   " acr    %2              \n\t" \
   :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3", "cc");

/* Writes the low and high words of i * j to sc0 and sc1 and zeroes sc2.
 * sc0..sc2 are still passed as tied inputs, exactly as before, so i and j
 * remain operands %6 and %7. */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulu.d r2,%6,%7        \n\t" \
   " mov    %0,r2           \n\t" \
   " mov    %1,r3           \n\t" \
   " eor    %2,%2           \n\t" \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3", "cc");

/* Adds the product i * j into sc2:sc1:sc0. */
#define SQRADDAC(i, j)            \
__asm__(                          \
   " mulu.d r2,%6,%7        \n\t" \
   " add    %0,%0,r2        \n\t" \
   " adc    %1,%1,r3        \n\t" \
   " acr    %2              \n\t" \
   :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3", "cc");

/* Adds sc2:sc1:sc0 into c2:c1:c0 twice.  sc0..sc2 are operands %3..%5 and
 * the tied c0..c2 inputs come last. */
#define SQRADDDB                  \
__asm__(                          \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
   :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 11:cee25a834751 1138
wolfSSL 11:cee25a834751 1139
wolfSSL 11:cee25a834751 1140 #else
wolfSSL 11:cee25a834751 1141
#define TFM_ISO

/* ISO C portable code */

/*
 * Squaring comba macros, portable variant.  They operate on the fp_digit
 * accumulator c2:c1:c0 and the secondary accumulator sc2:sc1:sc0 that must
 * be in scope at the expansion site.  Macro arguments are wrapped in
 * parentheses before being cast to fp_word, so an argument that is itself
 * an expression (for example a conditional expression) is widened as a
 * whole and the multiplication is carried out at fp_word width.  Each
 * argument is evaluated exactly once.  Every macro that expands to code
 * keeps its trailing ';' so existing call sites are unaffected.
 */

/* Nothing to do at the start of a comba pass. */
#define COMBA_START

/* Zero the three accumulator digits. */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* Store the lowest accumulator digit into x. */
#define COMBA_STORE(x) \
   (x) = c0;

/* Store the middle accumulator digit into x. */
#define COMBA_STORE2(x) \
   (x) = c1;

/* Shift the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0. */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to do at the end of a comba pass. */
#define COMBA_FINI

/* Adds the product i * j into c2:c1:c0. */
#define SQRADD(i, j)                                               \
   do { fp_word t;                                                 \
   t = c0 + ((fp_word)(i)) * ((fp_word)(j)); c0 = (fp_digit)t;     \
   t = c1 + (t >> DIGIT_BIT);                c1 = (fp_digit)t;     \
   c2 += (fp_digit)(t >> DIGIT_BIT);                               \
   } while (0);


/* for squaring some of the terms are doubled... */
/* Computes i * j once and adds it into c2:c1:c0 twice.  Uses an fp_word
 * named tt that this macro does not declare; it must be declared at the
 * expansion site. */
#define SQRADD2(i, j)                                              \
   do { fp_word t;                                                 \
   t  = ((fp_word)(i)) * ((fp_word)(j));                           \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;        \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;        \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                              \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;        \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;        \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                              \
   } while (0);

/* Sets sc1:sc0 to the product i * j and clears sc2. */
#define SQRADDSC(i, j)                                             \
   do { fp_word t;                                                 \
   t = ((fp_word)(i)) * ((fp_word)(j));                            \
   sc0 = (fp_digit)t; sc1 = (fp_digit)(t >> DIGIT_BIT); sc2 = 0;   \
   } while (0);

/* Adds the product i * j into sc2:sc1:sc0. */
#define SQRADDAC(i, j)                                             \
   do { fp_word t;                                                 \
   t = sc0 + ((fp_word)(i)) * ((fp_word)(j)); sc0 = (fp_digit)t;   \
   t = sc1 + (t >> DIGIT_BIT);                sc1 = (fp_digit)t;   \
   sc2 += (fp_digit)(t >> DIGIT_BIT);                              \
   } while (0);

/* Adds twice sc2:sc1:sc0 into c2:c1:c0. */
#define SQRADDDB                                                   \
   do { fp_word t;                                                 \
   t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t;     \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT);    \
   c1 = (fp_digit)t;                                               \
   c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
   } while (0);
wolfSSL 11:cee25a834751 1203
wolfSSL 11:cee25a834751 1204 #endif
wolfSSL 11:cee25a834751 1205
wolfSSL 11:cee25a834751 1206 #ifdef TFM_SMALL_SET
wolfSSL 11:cee25a834751 1207 #include "fp_sqr_comba_small_set.i"
wolfSSL 11:cee25a834751 1208 #endif
wolfSSL 11:cee25a834751 1209
wolfSSL 11:cee25a834751 1210 #if defined(TFM_SQR3) && FP_SIZE >= 6
wolfSSL 11:cee25a834751 1211 #include "fp_sqr_comba_3.i"
wolfSSL 11:cee25a834751 1212 #endif
wolfSSL 11:cee25a834751 1213 #if defined(TFM_SQR4) && FP_SIZE >= 8
wolfSSL 11:cee25a834751 1214 #include "fp_sqr_comba_4.i"
wolfSSL 11:cee25a834751 1215 #endif
wolfSSL 11:cee25a834751 1216 #if defined(TFM_SQR6) && FP_SIZE >= 12
wolfSSL 11:cee25a834751 1217 #include "fp_sqr_comba_6.i"
wolfSSL 11:cee25a834751 1218 #endif
wolfSSL 11:cee25a834751 1219 #if defined(TFM_SQR7) && FP_SIZE >= 14
wolfSSL 11:cee25a834751 1220 #include "fp_sqr_comba_7.i"
wolfSSL 11:cee25a834751 1221 #endif
wolfSSL 11:cee25a834751 1222 #if defined(TFM_SQR8) && FP_SIZE >= 16
wolfSSL 11:cee25a834751 1223 #include "fp_sqr_comba_8.i"
wolfSSL 11:cee25a834751 1224 #endif
wolfSSL 11:cee25a834751 1225 #if defined(TFM_SQR9) && FP_SIZE >= 18
wolfSSL 11:cee25a834751 1226 #include "fp_sqr_comba_9.i"
wolfSSL 11:cee25a834751 1227 #endif
wolfSSL 11:cee25a834751 1228 #if defined(TFM_SQR12) && FP_SIZE >= 24
wolfSSL 11:cee25a834751 1229 #include "fp_sqr_comba_12.i"
wolfSSL 11:cee25a834751 1230 #endif
wolfSSL 11:cee25a834751 1231 #if defined(TFM_SQR17) && FP_SIZE >= 34
wolfSSL 11:cee25a834751 1232 #include "fp_sqr_comba_17.i"
wolfSSL 11:cee25a834751 1233 #endif
wolfSSL 11:cee25a834751 1234 #if defined(TFM_SQR20) && FP_SIZE >= 40
wolfSSL 11:cee25a834751 1235 #include "fp_sqr_comba_20.i"
wolfSSL 11:cee25a834751 1236 #endif
wolfSSL 11:cee25a834751 1237 #if defined(TFM_SQR24) && FP_SIZE >= 48
wolfSSL 11:cee25a834751 1238 #include "fp_sqr_comba_24.i"
wolfSSL 11:cee25a834751 1239 #endif
wolfSSL 11:cee25a834751 1240 #if defined(TFM_SQR28) && FP_SIZE >= 56
wolfSSL 11:cee25a834751 1241 #include "fp_sqr_comba_28.i"
wolfSSL 11:cee25a834751 1242 #endif
wolfSSL 11:cee25a834751 1243 #if defined(TFM_SQR32) && FP_SIZE >= 64
wolfSSL 11:cee25a834751 1244 #include "fp_sqr_comba_32.i"
wolfSSL 11:cee25a834751 1245 #endif
wolfSSL 11:cee25a834751 1246 #if defined(TFM_SQR48) && FP_SIZE >= 96
wolfSSL 11:cee25a834751 1247 #include "fp_sqr_comba_48.i"
wolfSSL 11:cee25a834751 1248 #endif
wolfSSL 11:cee25a834751 1249 #if defined(TFM_SQR64) && FP_SIZE >= 128
wolfSSL 11:cee25a834751 1250 #include "fp_sqr_comba_64.i"
wolfSSL 11:cee25a834751 1251 #endif
wolfSSL 11:cee25a834751 1252 /* end fp_sqr_comba.c asm */
wolfSSL 11:cee25a834751 1253
wolfSSL 11:cee25a834751 1254 /* start fp_mul_comba.c asm */
wolfSSL 11:cee25a834751 1255 /* these are the combas. Worship them. */
wolfSSL 11:cee25a834751 1256 #if defined(TFM_X86)
wolfSSL 11:cee25a834751 1257 /* Generic x86 optimized code: 32-bit inline asm working on the c0/c1/c2 accumulator words */
wolfSSL 11:cee25a834751 1258
wolfSSL 11:cee25a834751 1259 /* anything you need at the start (expands to nothing for this target) */
wolfSSL 11:cee25a834751 1260 #define COMBA_START
wolfSSL 11:cee25a834751 1261
wolfSSL 11:cee25a834751 1262 /* clear the chaining variables c0, c1, c2 (they are declared by the code that expands the macro, not here) */
wolfSSL 11:cee25a834751 1263 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1264 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1265
wolfSSL 11:cee25a834751 1266 /* forward the carry to the next digit: c1 moves down into c0, c2 into c1, and c2 restarts at 0 */
wolfSSL 11:cee25a834751 1267 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1268 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1269
wolfSSL 11:cee25a834751 1270 /* store the first sum: assigns c0 to x */
wolfSSL 11:cee25a834751 1271 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1272 x = c0;
wolfSSL 11:cee25a834751 1273
wolfSSL 11:cee25a834751 1274 /* store the second sum [carry]: assigns c1 to x */
wolfSSL 11:cee25a834751 1275 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1276 x = c1;
wolfSSL 11:cee25a834751 1277
wolfSSL 11:cee25a834751 1278 /* anything you need at the end (expands to nothing for this target) */
wolfSSL 11:cee25a834751 1279 #define COMBA_FINI
wolfSSL 11:cee25a834751 1280
wolfSSL 11:cee25a834751 1281 /* this should multiply i and j: mull leaves the 64-bit product in edx:eax, which is added into c0 (low) and c1 (high, with carry); the last carry goes into c2. i and j are "m" operands, so pass addressable lvalues. The expansion already ends in ';'. */
wolfSSL 11:cee25a834751 1282 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1283 __asm__( \
wolfSSL 11:cee25a834751 1284 "movl %6,%%eax \n\t" \
wolfSSL 11:cee25a834751 1285 "mull %7 \n\t" \
wolfSSL 11:cee25a834751 1286 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 1287 "adcl %%edx,%1 \n\t" \
wolfSSL 11:cee25a834751 1288 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 1289 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 11:cee25a834751 1290
wolfSSL 11:cee25a834751 1291 #elif defined(TFM_X86_64)
wolfSSL 11:cee25a834751 1292 /* x86-64 optimized: 64-bit inline asm working on the c0/c1/c2 accumulator words */
wolfSSL 11:cee25a834751 1293
wolfSSL 11:cee25a834751 1294 /* anything you need at the start (expands to nothing for this target) */
wolfSSL 11:cee25a834751 1295 #define COMBA_START
wolfSSL 11:cee25a834751 1296
wolfSSL 11:cee25a834751 1297 /* clear the chaining variables c0, c1, c2 (they are declared by the code that expands the macro, not here) */
wolfSSL 11:cee25a834751 1298 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1299 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1300
wolfSSL 11:cee25a834751 1301 /* forward the carry to the next digit: c1 moves down into c0, c2 into c1, and c2 restarts at 0 */
wolfSSL 11:cee25a834751 1302 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1303 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1304
wolfSSL 11:cee25a834751 1305 /* store the first sum: assigns c0 to x */
wolfSSL 11:cee25a834751 1306 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1307 x = c0;
wolfSSL 11:cee25a834751 1308
wolfSSL 11:cee25a834751 1309 /* store the second sum [carry]: assigns c1 to x */
wolfSSL 11:cee25a834751 1310 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1311 x = c1;
wolfSSL 11:cee25a834751 1312
wolfSSL 11:cee25a834751 1313 /* anything you need at the end (expands to nothing for this target) */
wolfSSL 11:cee25a834751 1314 #define COMBA_FINI
wolfSSL 11:cee25a834751 1315
wolfSSL 11:cee25a834751 1316 /* this should multiply i and j: mulq leaves the 128-bit product in rdx:rax, which is added into c0 (low) and c1 (high, with carry); the last carry goes into c2. NOTE(review): %7 uses the "g" constraint, which also admits immediates that mulq cannot encode - presumably callers always pass variables; confirm. */
wolfSSL 11:cee25a834751 1317 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1318 __asm__ ( \
wolfSSL 11:cee25a834751 1319 "movq %6,%%rax \n\t" \
wolfSSL 11:cee25a834751 1320 "mulq %7 \n\t" \
wolfSSL 11:cee25a834751 1321 "addq %%rax,%0 \n\t" \
wolfSSL 11:cee25a834751 1322 "adcq %%rdx,%1 \n\t" \
wolfSSL 11:cee25a834751 1323 "adcq $0,%2 \n\t" \
wolfSSL 11:cee25a834751 1324 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 11:cee25a834751 1325
wolfSSL 11:cee25a834751 1326 /* MULX/ADX path. MULADD_MULX copies operand rdx into %rdx, multiplies it by b0 with mulx (low half in r9, high half in r8), then adds the low half to c0 with adox (OF carry chain) and the high half to c1 with adcx (CF carry chain). It both consumes and produces CF/OF, so it is only meaningful inside the sequence built by MULADD_BODY. */
wolfSSL 11:cee25a834751 1327 #if defined(HAVE_INTEL_MULX)
wolfSSL 11:cee25a834751 1328 #define MULADD_MULX(b0, c0, c1, rdx)\
wolfSSL 11:cee25a834751 1329 __asm__ volatile ( \
wolfSSL 11:cee25a834751 1330 "movq %3, %%rdx\n\t" \
wolfSSL 11:cee25a834751 1331 "mulx %2,%%r9, %%r8 \n\t" \
wolfSSL 11:cee25a834751 1332 "adoxq %%r9,%0 \n\t" \
wolfSSL 11:cee25a834751 1333 "adcxq %%r8,%1 \n\t" \
wolfSSL 11:cee25a834751 1334 :"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx"\
wolfSSL 11:cee25a834751 1335 )
wolfSSL 11:cee25a834751 1336
wolfSSL 11:cee25a834751 1337 /* flush the two pending carries: with r10 = 0, adox adds the remaining OF to c0 and adcx adds the remaining CF to c1. The copy of c1 into r8 is not read by any later instruction in this statement. */
wolfSSL 11:cee25a834751 1338 #define MULADD_MULX_ADD_CARRY(c0, c1)\
wolfSSL 11:cee25a834751 1339 __asm__ volatile(\
wolfSSL 11:cee25a834751 1340 "mov $0, %%r10\n\t"\
wolfSSL 11:cee25a834751 1341 "movq %1, %%r8\n\t"\
wolfSSL 11:cee25a834751 1342 "adox %%r10, %0\n\t"\
wolfSSL 11:cee25a834751 1343 "adcx %%r10, %1\n\t"\
wolfSSL 11:cee25a834751 1344 :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
wolfSSL 11:cee25a834751 1345 /* start a new row: adding 0 to r8 leaves both CF and OF clear, then a0 is copied to rdx. NOTE(review): rdx is also listed as clobbered and MULADD_MULX reloads it itself, so the copy looks redundant - confirm. */
wolfSSL 11:cee25a834751 1346 #define MULADD_SET_A(a0)\
wolfSSL 11:cee25a834751 1347 __asm__ volatile("add $0, %%r8\n\t" \
wolfSSL 11:cee25a834751 1348 "movq %0,%%rdx\n\t" \
wolfSSL 11:cee25a834751 1349 ::"r"(a0):"%r8","%r9","%r10","%rdx") ;
wolfSSL 11:cee25a834751 1350 /* multiply one digit of a (a->dp[ix]) by the four digits b0..b3 and accumulate into c->dp[iz] .. c->dp[iz+5]. Uses ix, iz, b0..b3, c0, c1 and cp from the expansion site. NOTE(review): the CF/OF chains are carried across separate asm statements with C assignments in between, and nothing in the constraints tells the compiler to keep the flags intact - confirm against the generated code. */
wolfSSL 11:cee25a834751 1351 #define MULADD_BODY(a,b,c)\
wolfSSL 11:cee25a834751 1352 { word64 rdx = a->dp[ix] ; \
wolfSSL 11:cee25a834751 1353 cp = &(c->dp[iz]) ; \
wolfSSL 11:cee25a834751 1354 c0 = cp[0] ; c1 = cp[1]; \
wolfSSL 11:cee25a834751 1355 MULADD_SET_A(rdx) ; \
wolfSSL 11:cee25a834751 1356 MULADD_MULX(b0, c0, c1, rdx) ;\
wolfSSL 11:cee25a834751 1357 cp[0]=c0; c0=cp[2]; \
wolfSSL 11:cee25a834751 1358 MULADD_MULX(b1, c1, c0, rdx) ;\
wolfSSL 11:cee25a834751 1359 cp[1]=c1; c1=cp[3]; \
wolfSSL 11:cee25a834751 1360 MULADD_MULX(b2, c0, c1, rdx) ;\
wolfSSL 11:cee25a834751 1361 cp[2]=c0; c0=cp[4]; \
wolfSSL 11:cee25a834751 1362 MULADD_MULX(b3, c1, c0, rdx) ;\
wolfSSL 11:cee25a834751 1363 cp[3]=c1; c1=cp[5]; \
wolfSSL 11:cee25a834751 1364 MULADD_MULX_ADD_CARRY(c0, c1);\
wolfSSL 11:cee25a834751 1365 cp[4]=c0; cp[5]=c1; \
wolfSSL 11:cee25a834751 1366 }
wolfSSL 11:cee25a834751 1367 /* whole-product loop: zero c->dp[0] .. c->dp[pa-1], then for every group of four digits of b run MULADD_BODY once per used digit of a. ix, iy, iz and pa come from the expansion site. NOTE(review): bp[1]..bp[3] are read even when b->used is not a multiple of 4 - presumably the unused digits are zero; confirm. */
wolfSSL 11:cee25a834751 1368 #define TFM_INTEL_MUL_COMBA(a, b, c)\
wolfSSL 11:cee25a834751 1369 for(ix=0; ix<pa; ix++)c->dp[ix]=0 ; \
wolfSSL 11:cee25a834751 1370 for(iy=0; (iy<b->used); iy+=4) { \
wolfSSL 11:cee25a834751 1371 fp_digit *bp ; \
wolfSSL 11:cee25a834751 1372 bp = &(b->dp[iy+0]) ; \
wolfSSL 11:cee25a834751 1373 fp_digit b0 = bp[0] , b1= bp[1], \
wolfSSL 11:cee25a834751 1374 b2= bp[2], b3= bp[3]; \
wolfSSL 11:cee25a834751 1375 ix=0, iz=iy; \
wolfSSL 11:cee25a834751 1376 while(ix<a->used) { \
wolfSSL 11:cee25a834751 1377 fp_digit c0, c1; \
wolfSSL 11:cee25a834751 1378 fp_digit *cp ; \
wolfSSL 11:cee25a834751 1379 MULADD_BODY(a,b,c); \
wolfSSL 11:cee25a834751 1380 ix++ ; iz++ ; \
wolfSSL 11:cee25a834751 1381 } \
wolfSSL 11:cee25a834751 1382 };
wolfSSL 11:cee25a834751 1383 #endif
wolfSSL 11:cee25a834751 1384
wolfSSL 11:cee25a834751 1385 #elif defined(TFM_SSE2)
wolfSSL 11:cee25a834751 1386 /* use SSE2 optimizations: the product is formed in MMX registers, the sums in 32-bit general registers */
wolfSSL 11:cee25a834751 1387
wolfSSL 11:cee25a834751 1388 /* anything you need at the start (expands to nothing for this target) */
wolfSSL 11:cee25a834751 1389 #define COMBA_START
wolfSSL 11:cee25a834751 1390
wolfSSL 11:cee25a834751 1391 /* clear the chaining variables c0, c1, c2 (they are declared by the code that expands the macro, not here) */
wolfSSL 11:cee25a834751 1392 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1393 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1394
wolfSSL 11:cee25a834751 1395 /* forward the carry to the next digit: c1 moves down into c0, c2 into c1, and c2 restarts at 0 */
wolfSSL 11:cee25a834751 1396 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1397 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1398
wolfSSL 11:cee25a834751 1399 /* store the first sum: assigns c0 to x */
wolfSSL 11:cee25a834751 1400 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1401 x = c0;
wolfSSL 11:cee25a834751 1402
wolfSSL 11:cee25a834751 1403 /* store the second sum [carry]: assigns c1 to x */
wolfSSL 11:cee25a834751 1404 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1405 x = c1;
wolfSSL 11:cee25a834751 1406
wolfSSL 11:cee25a834751 1407 /* anything you need at the end: issues emms once the MMX registers written by MULADD are no longer needed */
wolfSSL 11:cee25a834751 1408 #define COMBA_FINI \
wolfSSL 11:cee25a834751 1409 __asm__("emms");
wolfSSL 11:cee25a834751 1410
wolfSSL 11:cee25a834751 1411 /* this should multiply i and j: pmuludq forms the 64-bit product in mm0; its low 32 bits are added to c0, its high 32 bits (after the 32-bit right shift) to c1 with carry, and the last carry goes into c2. NOTE(review): mm0/mm1 are written but not listed as clobbers - confirm this is intentional. */
wolfSSL 11:cee25a834751 1412 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1413 __asm__( \
wolfSSL 11:cee25a834751 1414 "movd %6,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 1415 "movd %7,%%mm1 \n\t" \
wolfSSL 11:cee25a834751 1416 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 11:cee25a834751 1417 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 1418 "psrlq $32,%%mm0 \n\t" \
wolfSSL 11:cee25a834751 1419 "addl %%eax,%0 \n\t" \
wolfSSL 11:cee25a834751 1420 "movd %%mm0,%%eax \n\t" \
wolfSSL 11:cee25a834751 1421 "adcl %%eax,%1 \n\t" \
wolfSSL 11:cee25a834751 1422 "adcl $0,%2 \n\t" \
wolfSSL 11:cee25a834751 1423 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
wolfSSL 11:cee25a834751 1424
wolfSSL 11:cee25a834751 1425 #elif defined(TFM_ARM)
wolfSSL 11:cee25a834751 1426 /* ARM code: the c0/c1/c2 accumulator macros (clear, shift down one digit, store c0, store c1) followed by an inline-asm MULADD; COMBA_START and COMBA_FINI expand to nothing. c0, c1 and c2 are declared by the code that expands the macros. */
wolfSSL 11:cee25a834751 1427
wolfSSL 11:cee25a834751 1428 #define COMBA_START
wolfSSL 11:cee25a834751 1429
wolfSSL 11:cee25a834751 1430 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1431 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1432
wolfSSL 11:cee25a834751 1433 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1434 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1435
wolfSSL 11:cee25a834751 1436 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1437 x = c0;
wolfSSL 11:cee25a834751 1438
wolfSSL 11:cee25a834751 1439 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1440 x = c1;
wolfSSL 11:cee25a834751 1441
wolfSSL 11:cee25a834751 1442 #define COMBA_FINI
wolfSSL 11:cee25a834751 1443 /* multiply i and j: UMULL puts the 64-bit product in r1:r0 (r0 low, r1 high); ADDS/ADCS add it into c0 and c1, and ADC adds the last carry to c2. r0 and r1 are scratch and listed as clobbers. */
wolfSSL 11:cee25a834751 1444 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1445 __asm__( \
wolfSSL 11:cee25a834751 1446 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1447 " ADDS %0,%0,r0 \n\t" \
wolfSSL 11:cee25a834751 1448 " ADCS %1,%1,r1 \n\t" \
wolfSSL 11:cee25a834751 1449 " ADC %2,%2,#0 \n\t" \
wolfSSL 11:cee25a834751 1450 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 11:cee25a834751 1451
wolfSSL 11:cee25a834751 1452 #elif defined(TFM_PPC32)
wolfSSL 11:cee25a834751 1453 /* For 32-bit PPC: the c0/c1/c2 accumulator macros (clear, shift down one digit, store c0, store c1) followed by an inline-asm MULADD; COMBA_START and COMBA_FINI expand to nothing. c0, c1 and c2 are declared by the code that expands the macros. */
wolfSSL 11:cee25a834751 1454
wolfSSL 11:cee25a834751 1455 #define COMBA_START
wolfSSL 11:cee25a834751 1456
wolfSSL 11:cee25a834751 1457 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1458 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1459
wolfSSL 11:cee25a834751 1460 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1461 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1462
wolfSSL 11:cee25a834751 1463 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1464 x = c0;
wolfSSL 11:cee25a834751 1465
wolfSSL 11:cee25a834751 1466 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1467 x = c1;
wolfSSL 11:cee25a834751 1468
wolfSSL 11:cee25a834751 1469 #define COMBA_FINI
wolfSSL 11:cee25a834751 1470
wolfSSL 11:cee25a834751 1471 /* multiply i and j using register 16 as scratch: mullw gives the low product word (added to c0 with addc), mulhwu the high word (added to c1 with adde), and addze adds the last carry to c2. untested: will mulhwu change the flags? Docs say no (the carry produced by addc has to survive until adde) */
wolfSSL 11:cee25a834751 1472 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1473 __asm__( \
wolfSSL 11:cee25a834751 1474 " mullw 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1475 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1476 " mulhwu 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1477 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 1478 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1479 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 11:cee25a834751 1480
wolfSSL 11:cee25a834751 1481 #elif defined(TFM_PPC64)
wolfSSL 11:cee25a834751 1482 /* For 64-bit PPC: the c0/c1/c2 accumulator macros (clear, shift down one digit, store c0, store c1); COMBA_START and COMBA_FINI expand to nothing. c0, c1 and c2 are declared by the code that expands the macros. */
wolfSSL 11:cee25a834751 1483
wolfSSL 11:cee25a834751 1484 #define COMBA_START
wolfSSL 11:cee25a834751 1485
wolfSSL 11:cee25a834751 1486 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1487 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1488
wolfSSL 11:cee25a834751 1489 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1490 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1491
wolfSSL 11:cee25a834751 1492 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1493 x = c0;
wolfSSL 11:cee25a834751 1494
wolfSSL 11:cee25a834751 1495 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1496 x = c1;
wolfSSL 11:cee25a834751 1497
wolfSSL 11:cee25a834751 1498 #define COMBA_FINI
wolfSSL 11:cee25a834751 1499
wolfSSL 11:cee25a834751 1500 /* multiply i and j using register 16 as scratch: mulld gives the low product doubleword (added to c0 with addc), mulhdu the high doubleword (added to c1 with adde), and addze adds the last carry to c2. untested: will mulhdu change the flags? Docs say no (the carry produced by addc has to survive until adde) */
wolfSSL 11:cee25a834751 1501 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1502 __asm__( \
wolfSSL 11:cee25a834751 1503 " mulld 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1504 " addc %0,%0,16 \n\t" \
wolfSSL 11:cee25a834751 1505 " mulhdu 16,%6,%7 \n\t" \
wolfSSL 11:cee25a834751 1506 " adde %1,%1,16 \n\t" \
wolfSSL 11:cee25a834751 1507 " addze %2,%2 \n\t" \
wolfSSL 11:cee25a834751 1508 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 11:cee25a834751 1509
wolfSSL 11:cee25a834751 1510 #elif defined(TFM_AVR32)
wolfSSL 11:cee25a834751 1511
wolfSSL 11:cee25a834751 1512 /* AVR32 code: the c0/c1/c2 accumulator macros (clear, shift down one digit, store c0, store c1); COMBA_START and COMBA_FINI expand to nothing. c0, c1 and c2 are declared by the code that expands the macros. */
wolfSSL 11:cee25a834751 1513
wolfSSL 11:cee25a834751 1514 #define COMBA_START
wolfSSL 11:cee25a834751 1515
wolfSSL 11:cee25a834751 1516 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1517 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1518
wolfSSL 11:cee25a834751 1519 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1520 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1521
wolfSSL 11:cee25a834751 1522 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1523 x = c0;
wolfSSL 11:cee25a834751 1524
wolfSSL 11:cee25a834751 1525 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1526 x = c1;
wolfSSL 11:cee25a834751 1527
wolfSSL 11:cee25a834751 1528 #define COMBA_FINI
wolfSSL 11:cee25a834751 1529 /* multiply i and j: mulu.d leaves the 64-bit product in r3:r2 (r2 low, r3 high); add/adc add it into c0 and c1, and acr adds the last carry to c2. r2 and r3 are scratch and listed as clobbers. */
wolfSSL 11:cee25a834751 1530 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1531 __asm__( \
wolfSSL 11:cee25a834751 1532 " mulu.d r2,%6,%7 \n\t"\
wolfSSL 11:cee25a834751 1533 " add %0,r2 \n\t"\
wolfSSL 11:cee25a834751 1534 " adc %1,%1,r3 \n\t"\
wolfSSL 11:cee25a834751 1535 " acr %2 \n\t"\
wolfSSL 11:cee25a834751 1536 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
wolfSSL 11:cee25a834751 1537
wolfSSL 11:cee25a834751 1538 #else
wolfSSL 11:cee25a834751 1539 /* ISO C code: portable fallback with no inline asm. The c0/c1/c2 accumulator macros (clear, shift down one digit, store c0, store c1) are followed by a MULADD written with the double-width fp_word type; COMBA_START and COMBA_FINI expand to nothing. c0, c1 and c2 are declared by the code that expands the macros. */
wolfSSL 11:cee25a834751 1540
wolfSSL 11:cee25a834751 1541 #define COMBA_START
wolfSSL 11:cee25a834751 1542
wolfSSL 11:cee25a834751 1543 #define COMBA_CLEAR \
wolfSSL 11:cee25a834751 1544 c0 = c1 = c2 = 0;
wolfSSL 11:cee25a834751 1545
wolfSSL 11:cee25a834751 1546 #define COMBA_FORWARD \
wolfSSL 11:cee25a834751 1547 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 11:cee25a834751 1548
wolfSSL 11:cee25a834751 1549 #define COMBA_STORE(x) \
wolfSSL 11:cee25a834751 1550 x = c0;
wolfSSL 11:cee25a834751 1551
wolfSSL 11:cee25a834751 1552 #define COMBA_STORE2(x) \
wolfSSL 11:cee25a834751 1553 x = c1;
wolfSSL 11:cee25a834751 1554
wolfSSL 11:cee25a834751 1555 #define COMBA_FINI
wolfSSL 11:cee25a834751 1556 /* multiply i and j: t = c0 + i*j is computed in fp_word; its low digit becomes c0, its upper part (t >> DIGIT_BIT) is added to c1, and whatever overflows c1 is added to c2. The expansion already ends in ';'. */
wolfSSL 11:cee25a834751 1557 #define MULADD(i, j) \
wolfSSL 11:cee25a834751 1558 do { fp_word t; \
wolfSSL 11:cee25a834751 1559 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
wolfSSL 11:cee25a834751 1560 t = (fp_word)c1 + (t >> DIGIT_BIT); \
wolfSSL 11:cee25a834751 1561 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \
wolfSSL 11:cee25a834751 1562 } while (0);
wolfSSL 11:cee25a834751 1563
wolfSSL 11:cee25a834751 1564 #endif
wolfSSL 11:cee25a834751 1565
wolfSSL 11:cee25a834751 1566
wolfSSL 11:cee25a834751 1567 #ifdef TFM_SMALL_SET
wolfSSL 11:cee25a834751 1568 #include "fp_mul_comba_small_set.i"
wolfSSL 11:cee25a834751 1569 #endif
wolfSSL 11:cee25a834751 1570
wolfSSL 11:cee25a834751 1571 #if defined(TFM_MUL3) && FP_SIZE >= 6
wolfSSL 11:cee25a834751 1572 #include "fp_mul_comba_3.i"
wolfSSL 11:cee25a834751 1573 #endif
wolfSSL 11:cee25a834751 1574 #if defined(TFM_MUL4) && FP_SIZE >= 8
wolfSSL 11:cee25a834751 1575 #include "fp_mul_comba_4.i"
wolfSSL 11:cee25a834751 1576 #endif
wolfSSL 11:cee25a834751 1577 #if defined(TFM_MUL6) && FP_SIZE >= 12
wolfSSL 11:cee25a834751 1578 #include "fp_mul_comba_6.i"
wolfSSL 11:cee25a834751 1579 #endif
wolfSSL 11:cee25a834751 1580 #if defined(TFM_MUL7) && FP_SIZE >= 14
wolfSSL 11:cee25a834751 1581 #include "fp_mul_comba_7.i"
wolfSSL 11:cee25a834751 1582 #endif
wolfSSL 11:cee25a834751 1583 #if defined(TFM_MUL8) && FP_SIZE >= 16
wolfSSL 11:cee25a834751 1584 #include "fp_mul_comba_8.i"
wolfSSL 11:cee25a834751 1585 #endif
wolfSSL 11:cee25a834751 1586 #if defined(TFM_MUL9) && FP_SIZE >= 18
wolfSSL 11:cee25a834751 1587 #include "fp_mul_comba_9.i"
wolfSSL 11:cee25a834751 1588 #endif
wolfSSL 11:cee25a834751 1589 #if defined(TFM_MUL12) && FP_SIZE >= 24
wolfSSL 11:cee25a834751 1590 #include "fp_mul_comba_12.i"
wolfSSL 11:cee25a834751 1591 #endif
wolfSSL 11:cee25a834751 1592 #if defined(TFM_MUL17) && FP_SIZE >= 34
wolfSSL 11:cee25a834751 1593 #include "fp_mul_comba_17.i"
wolfSSL 11:cee25a834751 1594 #endif
wolfSSL 11:cee25a834751 1595 #if defined(TFM_MUL20) && FP_SIZE >= 40
wolfSSL 11:cee25a834751 1596 #include "fp_mul_comba_20.i"
wolfSSL 11:cee25a834751 1597 #endif
wolfSSL 11:cee25a834751 1598 #if defined(TFM_MUL24) && FP_SIZE >= 48
wolfSSL 11:cee25a834751 1599 #include "fp_mul_comba_24.i"
wolfSSL 11:cee25a834751 1600 #endif
wolfSSL 11:cee25a834751 1601 #if defined(TFM_MUL28) && FP_SIZE >= 56
wolfSSL 11:cee25a834751 1602 #include "fp_mul_comba_28.i"
wolfSSL 11:cee25a834751 1603 #endif
wolfSSL 11:cee25a834751 1604 #if defined(TFM_MUL32) && FP_SIZE >= 64
wolfSSL 11:cee25a834751 1605 #include "fp_mul_comba_32.i"
wolfSSL 11:cee25a834751 1606 #endif
wolfSSL 11:cee25a834751 1607 #if defined(TFM_MUL48) && FP_SIZE >= 96
wolfSSL 11:cee25a834751 1608 #include "fp_mul_comba_48.i"
wolfSSL 11:cee25a834751 1609 #endif
wolfSSL 11:cee25a834751 1610 #if defined(TFM_MUL64) && FP_SIZE >= 128
wolfSSL 11:cee25a834751 1611 #include "fp_mul_comba_64.i"
wolfSSL 11:cee25a834751 1612 #endif
wolfSSL 11:cee25a834751 1613
wolfSSL 11:cee25a834751 1614 /* end fp_mul_comba.c asm */
wolfSSL 11:cee25a834751 1615
wolfSSL 11:cee25a834751 1616