wolfSSL 3.11.1 for TLS1.3 beta

Fork of wolfSSL by wolf SSL

Committer:
wolfSSL
Date:
Fri Jun 26 00:39:20 2015 +0000
Revision:
0:d92f9d21154c
wolfSSL 3.6.0

Who changed what in which revision?

User | Revision | Line number | New contents of line
wolfSSL 0:d92f9d21154c 1 /* asm.c
wolfSSL 0:d92f9d21154c 2 *
wolfSSL 0:d92f9d21154c 3 * Copyright (C) 2006-2015 wolfSSL Inc.
wolfSSL 0:d92f9d21154c 4 *
wolfSSL 0:d92f9d21154c 5 * This file is part of wolfSSL. (formerly known as CyaSSL)
wolfSSL 0:d92f9d21154c 6 *
wolfSSL 0:d92f9d21154c 7 * wolfSSL is free software; you can redistribute it and/or modify
wolfSSL 0:d92f9d21154c 8 * it under the terms of the GNU General Public License as published by
wolfSSL 0:d92f9d21154c 9 * the Free Software Foundation; either version 2 of the License, or
wolfSSL 0:d92f9d21154c 10 * (at your option) any later version.
wolfSSL 0:d92f9d21154c 11 *
wolfSSL 0:d92f9d21154c 12 * wolfSSL is distributed in the hope that it will be useful,
wolfSSL 0:d92f9d21154c 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
wolfSSL 0:d92f9d21154c 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
wolfSSL 0:d92f9d21154c 15 * GNU General Public License for more details.
wolfSSL 0:d92f9d21154c 16 *
wolfSSL 0:d92f9d21154c 17 * You should have received a copy of the GNU General Public License
wolfSSL 0:d92f9d21154c 18 * along with this program; if not, write to the Free Software
wolfSSL 0:d92f9d21154c 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
wolfSSL 0:d92f9d21154c 20 */
wolfSSL 0:d92f9d21154c 21
wolfSSL 0:d92f9d21154c 22 #ifdef HAVE_CONFIG_H
wolfSSL 0:d92f9d21154c 23 #include <config.h>
wolfSSL 0:d92f9d21154c 24 #endif
wolfSSL 0:d92f9d21154c 25
wolfSSL 0:d92f9d21154c 26 #include <wolfssl/wolfcrypt/settings.h>
wolfSSL 0:d92f9d21154c 27
wolfSSL 0:d92f9d21154c 28 /*
wolfSSL 0:d92f9d21154c 29 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
wolfSSL 0:d92f9d21154c 30 * http://math.libtomcrypt.com
wolfSSL 0:d92f9d21154c 31 */
wolfSSL 0:d92f9d21154c 32
wolfSSL 0:d92f9d21154c 33
wolfSSL 0:d92f9d21154c 34 /******************************************************************/
wolfSSL 0:d92f9d21154c 35 /* fp_montgomery_reduce.c asm or generic */
wolfSSL 0:d92f9d21154c 36
wolfSSL 0:d92f9d21154c 37
wolfSSL 0:d92f9d21154c 38 /* Each platform needs to query leaf 7 (sub-leaf 0) from cpuid to see if BMI2
wolfSSL 0:d92f9d21154c 39 * (MULX) and ADX are supported. Also, let's set up a macro for proper linkage w/o ABI conflicts
wolfSSL 0:d92f9d21154c 40 */
wolfSSL 0:d92f9d21154c 41
wolfSSL 0:d92f9d21154c 42 #if defined(HAVE_INTEL_MULX)
wolfSSL 0:d92f9d21154c 43 #ifndef _MSC_VER
wolfSSL 0:d92f9d21154c 44 #define cpuid(reg, leaf, sub)\
wolfSSL 0:d92f9d21154c 45 __asm__ __volatile__ ("cpuid":\
wolfSSL 0:d92f9d21154c 46 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
wolfSSL 0:d92f9d21154c 47 "a" (leaf), "c"(sub));
wolfSSL 0:d92f9d21154c 48
wolfSSL 0:d92f9d21154c 49 #define XASM_LINK(f) asm(f)
wolfSSL 0:d92f9d21154c 50 #else
wolfSSL 0:d92f9d21154c 51
wolfSSL 0:d92f9d21154c 52 #include <intrin.h>
wolfSSL 0:d92f9d21154c 53 #define cpuid(a,b,c) __cpuidex((int*)a,b,c) /* (regs, leaf, sub-leaf): same arity as the non-MSVC macro and the cpuid(reg, leaf, sub) call sites */
wolfSSL 0:d92f9d21154c 54
wolfSSL 0:d92f9d21154c 55 #define XASM_LINK(f)
wolfSSL 0:d92f9d21154c 56
wolfSSL 0:d92f9d21154c 57 #endif /* _MSC_VER */
wolfSSL 0:d92f9d21154c 58
wolfSSL 0:d92f9d21154c 59 #define EAX 0
wolfSSL 0:d92f9d21154c 60 #define EBX 1
wolfSSL 0:d92f9d21154c 61 #define ECX 2
wolfSSL 0:d92f9d21154c 62 #define EDX 3
wolfSSL 0:d92f9d21154c 63
wolfSSL 0:d92f9d21154c 64 #define CPUID_AVX1 0x1
wolfSSL 0:d92f9d21154c 65 #define CPUID_AVX2 0x2
wolfSSL 0:d92f9d21154c 66 #define CPUID_RDRAND 0x4
wolfSSL 0:d92f9d21154c 67 #define CPUID_RDSEED 0x8
wolfSSL 0:d92f9d21154c 68 #define CPUID_BMI2 0x10 /* MULX, RORX */
wolfSSL 0:d92f9d21154c 69 #define CPUID_ADX 0x20 /* ADCX, ADOX */
wolfSSL 0:d92f9d21154c 70
wolfSSL 0:d92f9d21154c 71 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1)
wolfSSL 0:d92f9d21154c 72 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2)
wolfSSL 0:d92f9d21154c 73 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2)
wolfSSL 0:d92f9d21154c 74 #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX)
wolfSSL 0:d92f9d21154c 75 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
wolfSSL 0:d92f9d21154c 76 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
wolfSSL 0:d92f9d21154c 77 #define SET_FLAGS
wolfSSL 0:d92f9d21154c 78
wolfSSL 0:d92f9d21154c 79 static word32 cpuid_check = 0 ;
wolfSSL 0:d92f9d21154c 80 static word32 cpuid_flags = 0 ;
wolfSSL 0:d92f9d21154c 81
wolfSSL 0:d92f9d21154c 82 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { /* returns bit `bit` of register index `num` from CPUID(leaf, sub) on a GenuineIntel CPU, else 0 */
wolfSSL 0:d92f9d21154c 83 int got_intel_cpu=0;
wolfSSL 0:d92f9d21154c 84 unsigned int reg[5];
wolfSSL 0:d92f9d21154c 85 /* leaf 0: EAX = highest supported basic leaf; EBX, EDX, ECX = vendor string */
wolfSSL 0:d92f9d21154c 86 reg[4] = '\0' ;
wolfSSL 0:d92f9d21154c 87 cpuid(reg, 0, 0);
wolfSSL 0:d92f9d21154c 88 if(memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
wolfSSL 0:d92f9d21154c 89 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
wolfSSL 0:d92f9d21154c 90 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
wolfSSL 0:d92f9d21154c 91 got_intel_cpu = 1;
wolfSSL 0:d92f9d21154c 92 }
wolfSSL 0:d92f9d21154c 93 if (got_intel_cpu && (leaf >= 0x80000000U || leaf <= reg[EAX])) { /* skip basic leaves above the CPU's maximum: CPUID would answer with another leaf's data */
wolfSSL 0:d92f9d21154c 94 cpuid(reg, leaf, sub);
wolfSSL 0:d92f9d21154c 95 return((reg[num]>>bit)&0x1) ;
wolfSSL 0:d92f9d21154c 96 }
wolfSSL 0:d92f9d21154c 97 return 0 ;
wolfSSL 0:d92f9d21154c 98 }
wolfSSL 0:d92f9d21154c 99
wolfSSL 0:d92f9d21154c 100 INLINE static int set_cpuid_flags(void) { /* probes BMI2/ADX once; returns 0 when the probe ran on this call, 1 when it had already run */
wolfSSL 0:d92f9d21154c 101 if(cpuid_check != 0) {
wolfSSL 0:d92f9d21154c 102 return 1 ;
wolfSSL 0:d92f9d21154c 103 }
wolfSSL 0:d92f9d21154c 104 if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
wolfSSL 0:d92f9d21154c 105 if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; }
wolfSSL 0:d92f9d21154c 106 cpuid_check = 1 ;
wolfSSL 0:d92f9d21154c 107 return 0 ;
wolfSSL 0:d92f9d21154c 108 }
wolfSSL 0:d92f9d21154c 109
wolfSSL 0:d92f9d21154c 110 #define RETURN return
wolfSSL 0:d92f9d21154c 111 #define IF_HAVE_INTEL_MULX(func, ret) \
wolfSSL 0:d92f9d21154c 112 if(cpuid_check==0)set_cpuid_flags() ; \
wolfSSL 0:d92f9d21154c 113 if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }
wolfSSL 0:d92f9d21154c 114
wolfSSL 0:d92f9d21154c 115 #else
wolfSSL 0:d92f9d21154c 116 #define IF_HAVE_INTEL_MULX(func, ret)
wolfSSL 0:d92f9d21154c 117 #endif
wolfSSL 0:d92f9d21154c 118
wolfSSL 0:d92f9d21154c 119 #if defined(TFM_X86) && !defined(TFM_SSE2)
wolfSSL 0:d92f9d21154c 120 /* x86-32 code */
wolfSSL 0:d92f9d21154c 121
wolfSSL 0:d92f9d21154c 122 #define MONT_START
wolfSSL 0:d92f9d21154c 123 #define MONT_FINI
wolfSSL 0:d92f9d21154c 124 #define LOOP_END
wolfSSL 0:d92f9d21154c 125 #define LOOP_START \
wolfSSL 0:d92f9d21154c 126 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 127
wolfSSL 0:d92f9d21154c 128 #define INNERMUL \
wolfSSL 0:d92f9d21154c 129 __asm__( \
wolfSSL 0:d92f9d21154c 130 "movl %5,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 131 "mull %4 \n\t" \
wolfSSL 0:d92f9d21154c 132 "addl %1,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 133 "adcl $0,%%edx \n\t" \
wolfSSL 0:d92f9d21154c 134 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 135 "adcl $0,%%edx \n\t" \
wolfSSL 0:d92f9d21154c 136 "movl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 137 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 0:d92f9d21154c 138 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \
wolfSSL 0:d92f9d21154c 139 : "%eax", "%edx", "cc")
wolfSSL 0:d92f9d21154c 140
wolfSSL 0:d92f9d21154c 141 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 142 __asm__( \
wolfSSL 0:d92f9d21154c 143 "addl %1,%0 \n\t" \
wolfSSL 0:d92f9d21154c 144 "setb %%al \n\t" \
wolfSSL 0:d92f9d21154c 145 "movzbl %%al,%1 \n\t" \
wolfSSL 0:d92f9d21154c 146 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 0:d92f9d21154c 147 :"0"(_c[LO]), "1"(cy) \
wolfSSL 0:d92f9d21154c 148 : "%eax", "cc")
wolfSSL 0:d92f9d21154c 149
wolfSSL 0:d92f9d21154c 150 /******************************************************************/
wolfSSL 0:d92f9d21154c 151 #elif defined(TFM_X86_64)
wolfSSL 0:d92f9d21154c 152 /* x86-64 code */
wolfSSL 0:d92f9d21154c 153
wolfSSL 0:d92f9d21154c 154 #define MONT_START
wolfSSL 0:d92f9d21154c 155 #define MONT_FINI
wolfSSL 0:d92f9d21154c 156 #define LOOP_END
wolfSSL 0:d92f9d21154c 157 #define LOOP_START \
wolfSSL 0:d92f9d21154c 158 mu = c[x] * mp;
wolfSSL 0:d92f9d21154c 159
wolfSSL 0:d92f9d21154c 160 #define INNERMUL \
wolfSSL 0:d92f9d21154c 161 __asm__( \
wolfSSL 0:d92f9d21154c 162 "movq %5,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 163 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 164 "addq %1,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 165 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 166 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 167 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 168 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 169 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 0:d92f9d21154c 170 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
wolfSSL 0:d92f9d21154c 171 : "%rax", "%rdx", "cc")
wolfSSL 0:d92f9d21154c 172
wolfSSL 0:d92f9d21154c 173 #if defined(HAVE_INTEL_MULX)
wolfSSL 0:d92f9d21154c 174 #define MULX_INIT(a0, c0, cy)\
wolfSSL 0:d92f9d21154c 175 __asm__ volatile( \
wolfSSL 0:d92f9d21154c 176 "xorq %%r10, %%r10\n\t" \
wolfSSL 0:d92f9d21154c 177 "movq %1,%%rdx\n\t" \
wolfSSL 0:d92f9d21154c 178 "addq %2, %0\n\t" /* c0+=cy; Set CF, OF */ \
wolfSSL 0:d92f9d21154c 179 "adoxq %%r10, %%r10\n\t" /* Reset OF */ \
wolfSSL 0:d92f9d21154c 180 :"+m"(c0):"r"(a0),"r"(cy):"%r8","%r9", "%r10","%r11","%r12","%rdx") ; \
wolfSSL 0:d92f9d21154c 181
wolfSSL 0:d92f9d21154c 182 #define MULX_INNERMUL_R1(c0, c1, pre, rdx)\
wolfSSL 0:d92f9d21154c 183 { \
wolfSSL 0:d92f9d21154c 184 __asm__ volatile ( \
wolfSSL 0:d92f9d21154c 185 "movq %3, %%rdx\n\t" \
wolfSSL 0:d92f9d21154c 186 "mulx %%r11,%%r9, %%r8 \n\t" \
wolfSSL 0:d92f9d21154c 187 "movq %2, %%r12\n\t" \
wolfSSL 0:d92f9d21154c 188 "adoxq %%r9,%0 \n\t" \
wolfSSL 0:d92f9d21154c 189 "adcxq %%r8,%1 \n\t" \
wolfSSL 0:d92f9d21154c 190 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 0:d92f9d21154c 191 ); }
wolfSSL 0:d92f9d21154c 192
wolfSSL 0:d92f9d21154c 193
wolfSSL 0:d92f9d21154c 194 #define MULX_INNERMUL_R2(c0, c1, pre, rdx)\
wolfSSL 0:d92f9d21154c 195 { \
wolfSSL 0:d92f9d21154c 196 __asm__ volatile ( \
wolfSSL 0:d92f9d21154c 197 "movq %3, %%rdx\n\t" \
wolfSSL 0:d92f9d21154c 198 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 0:d92f9d21154c 199 "movq %2, %%r11\n\t" \
wolfSSL 0:d92f9d21154c 200 "adoxq %%r9,%0 \n\t" \
wolfSSL 0:d92f9d21154c 201 "adcxq %%r8,%1 \n\t" \
wolfSSL 0:d92f9d21154c 202 :"+r"(c0),"+r"(c1):"m"(pre),"r"(rdx):"%r8","%r9", "%r10", "%r11","%r12","%rdx" \
wolfSSL 0:d92f9d21154c 203 ); }
wolfSSL 0:d92f9d21154c 204
wolfSSL 0:d92f9d21154c 205 #define MULX_LOAD_R1(val)\
wolfSSL 0:d92f9d21154c 206 __asm__ volatile ( \
wolfSSL 0:d92f9d21154c 207 "movq %0, %%r11\n\t"\
wolfSSL 0:d92f9d21154c 208 ::"m"(val):"%r8","%r9", "%r10", "%r11","%r12","%rdx"\
wolfSSL 0:d92f9d21154c 209 ) ;
wolfSSL 0:d92f9d21154c 210
wolfSSL 0:d92f9d21154c 211 #define MULX_INNERMUL_LAST(c0, c1, rdx)\
wolfSSL 0:d92f9d21154c 212 { \
wolfSSL 0:d92f9d21154c 213 __asm__ volatile ( \
wolfSSL 0:d92f9d21154c 214 "movq %2, %%rdx\n\t" \
wolfSSL 0:d92f9d21154c 215 "mulx %%r12,%%r9, %%r8 \n\t" \
wolfSSL 0:d92f9d21154c 216 "movq $0, %%r10 \n\t" \
wolfSSL 0:d92f9d21154c 217 "adoxq %%r10, %%r9 \n\t" \
wolfSSL 0:d92f9d21154c 218 "adcq $0,%%r8 \n\t" \
wolfSSL 0:d92f9d21154c 219 "addq %%r9,%0 \n\t" \
wolfSSL 0:d92f9d21154c 220 "adcq $0,%%r8 \n\t" \
wolfSSL 0:d92f9d21154c 221 "movq %%r8,%1 \n\t" \
wolfSSL 0:d92f9d21154c 222 :"+m"(c0),"=m"(c1):"r"(rdx):"%r8","%r9","%r10", "%r11", "%r12","%rdx"\
wolfSSL 0:d92f9d21154c 223 ); }
wolfSSL 0:d92f9d21154c 224
wolfSSL 0:d92f9d21154c 225 #define MULX_INNERMUL8(x,y,z,cy)\
wolfSSL 0:d92f9d21154c 226 { word64 rdx = y ;\
wolfSSL 0:d92f9d21154c 227 MULX_LOAD_R1(x[0]) ;\
wolfSSL 0:d92f9d21154c 228 MULX_INIT(y, _c0, cy) ; /* rdx=y; z0+=cy; */ \
wolfSSL 0:d92f9d21154c 229 MULX_INNERMUL_R1(_c0, _c1, x[1], rdx) ;\
wolfSSL 0:d92f9d21154c 230 MULX_INNERMUL_R2(_c1, _c2, x[2], rdx) ;\
wolfSSL 0:d92f9d21154c 231 MULX_INNERMUL_R1(_c2, _c3, x[3], rdx) ;\
wolfSSL 0:d92f9d21154c 232 MULX_INNERMUL_R2(_c3, _c4, x[4], rdx) ;\
wolfSSL 0:d92f9d21154c 233 MULX_INNERMUL_R1(_c4, _c5, x[5], rdx) ;\
wolfSSL 0:d92f9d21154c 234 MULX_INNERMUL_R2(_c5, _c6, x[6], rdx) ;\
wolfSSL 0:d92f9d21154c 235 MULX_INNERMUL_R1(_c6, _c7, x[7], rdx) ;\
wolfSSL 0:d92f9d21154c 236 MULX_INNERMUL_LAST(_c7, cy, rdx) ;\
wolfSSL 0:d92f9d21154c 237 }
wolfSSL 0:d92f9d21154c 238 #define INNERMUL8_MULX \
wolfSSL 0:d92f9d21154c 239 {\
wolfSSL 0:d92f9d21154c 240 MULX_INNERMUL8(tmpm, mu, _c, cy);\
wolfSSL 0:d92f9d21154c 241 }
wolfSSL 0:d92f9d21154c 242 #endif
wolfSSL 0:d92f9d21154c 243
wolfSSL 0:d92f9d21154c 244 #define INNERMUL8 \
wolfSSL 0:d92f9d21154c 245 __asm__( \
wolfSSL 0:d92f9d21154c 246 "movq 0(%5),%%rax \n\t" \
wolfSSL 0:d92f9d21154c 247 "movq 0(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 248 "movq 0x8(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 249 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 250 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 251 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 252 "movq 0x8(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 253 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 254 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 255 "movq %%rax,0(%0) \n\t" \
wolfSSL 0:d92f9d21154c 256 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 257 \
wolfSSL 0:d92f9d21154c 258 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 259 "movq 0x10(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 260 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 261 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 262 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 263 "movq 0x10(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 264 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 265 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 266 "movq %%rax,0x8(%0) \n\t" \
wolfSSL 0:d92f9d21154c 267 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 268 \
wolfSSL 0:d92f9d21154c 269 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 270 "movq 0x18(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 271 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 272 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 273 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 274 "movq 0x18(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 275 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 276 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 277 "movq %%rax,0x10(%0) \n\t" \
wolfSSL 0:d92f9d21154c 278 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 279 \
wolfSSL 0:d92f9d21154c 280 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 281 "movq 0x20(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 282 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 283 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 284 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 285 "movq 0x20(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 286 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 287 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 288 "movq %%rax,0x18(%0) \n\t" \
wolfSSL 0:d92f9d21154c 289 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 290 \
wolfSSL 0:d92f9d21154c 291 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 292 "movq 0x28(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 293 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 294 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 295 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 296 "movq 0x28(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 297 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 298 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 299 "movq %%rax,0x20(%0) \n\t" \
wolfSSL 0:d92f9d21154c 300 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 301 \
wolfSSL 0:d92f9d21154c 302 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 303 "movq 0x30(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 304 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 305 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 306 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 307 "movq 0x30(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 308 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 309 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 310 "movq %%rax,0x28(%0) \n\t" \
wolfSSL 0:d92f9d21154c 311 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 312 \
wolfSSL 0:d92f9d21154c 313 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 314 "movq 0x38(%5),%%r11 \n\t" \
wolfSSL 0:d92f9d21154c 315 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 316 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 317 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 318 "movq 0x38(%2),%%r10 \n\t" \
wolfSSL 0:d92f9d21154c 319 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 320 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 321 "movq %%rax,0x30(%0) \n\t" \
wolfSSL 0:d92f9d21154c 322 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 323 \
wolfSSL 0:d92f9d21154c 324 "movq %%r11,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 325 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 326 "addq %%r10,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 327 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 328 "addq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 329 "adcq $0,%%rdx \n\t" \
wolfSSL 0:d92f9d21154c 330 "movq %%rax,0x38(%0) \n\t" \
wolfSSL 0:d92f9d21154c 331 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 332 \
wolfSSL 0:d92f9d21154c 333 :"=r"(_c), "=r"(cy) \
wolfSSL 0:d92f9d21154c 334 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
wolfSSL 0:d92f9d21154c 335 : "%rax", "%rdx", "%r10", "%r11", "cc")\
wolfSSL 0:d92f9d21154c 336
wolfSSL 0:d92f9d21154c 337 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 338 __asm__( \
wolfSSL 0:d92f9d21154c 339 "addq %1,%0 \n\t" \
wolfSSL 0:d92f9d21154c 340 "setb %%al \n\t" \
wolfSSL 0:d92f9d21154c 341 "movzbq %%al,%1 \n\t" \
wolfSSL 0:d92f9d21154c 342 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 0:d92f9d21154c 343 :"0"(_c[LO]), "1"(cy) \
wolfSSL 0:d92f9d21154c 344 : "%rax", "cc")
wolfSSL 0:d92f9d21154c 345
wolfSSL 0:d92f9d21154c 346 /******************************************************************/
wolfSSL 0:d92f9d21154c 347 #elif defined(TFM_SSE2)
wolfSSL 0:d92f9d21154c 348 /* SSE2 code (assumes 32-bit fp_digits) */
wolfSSL 0:d92f9d21154c 349 /* MMX register assignments (the SSE2 instructions below operate on mm registers):
wolfSSL 0:d92f9d21154c 350 * mm0 *tmpm++, then Mu * (*tmpm++)
wolfSSL 0:d92f9d21154c 351 * mm1 c[x], then Mu
wolfSSL 0:d92f9d21154c 352 * mm2 mp
wolfSSL 0:d92f9d21154c 353 * mm3 cy
wolfSSL 0:d92f9d21154c 354 * mm4 _c[LO]
wolfSSL 0:d92f9d21154c 355 */
wolfSSL 0:d92f9d21154c 356
wolfSSL 0:d92f9d21154c 357 #define MONT_START \
wolfSSL 0:d92f9d21154c 358 __asm__("movd %0,%%mm2"::"g"(mp))
wolfSSL 0:d92f9d21154c 359
wolfSSL 0:d92f9d21154c 360 #define MONT_FINI \
wolfSSL 0:d92f9d21154c 361 __asm__("emms")
wolfSSL 0:d92f9d21154c 362
wolfSSL 0:d92f9d21154c 363 #define LOOP_START \
wolfSSL 0:d92f9d21154c 364 __asm__( \
wolfSSL 0:d92f9d21154c 365 "movd %0,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 366 "pxor %%mm3,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 367 "pmuludq %%mm2,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 368 :: "g"(c[x]))
wolfSSL 0:d92f9d21154c 369
wolfSSL 0:d92f9d21154c 370 /* pmuludq on mmx registers does a 32x32->64 multiply. */
wolfSSL 0:d92f9d21154c 371 #define INNERMUL \
wolfSSL 0:d92f9d21154c 372 __asm__( \
wolfSSL 0:d92f9d21154c 373 "movd %1,%%mm4 \n\t" \
wolfSSL 0:d92f9d21154c 374 "movd %2,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 375 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 376 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 377 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 378 "movd %%mm3,%0 \n\t" \
wolfSSL 0:d92f9d21154c 379 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 380 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
wolfSSL 0:d92f9d21154c 381
wolfSSL 0:d92f9d21154c 382 #define INNERMUL8 \
wolfSSL 0:d92f9d21154c 383 __asm__( \
wolfSSL 0:d92f9d21154c 384 "movd 0(%1),%%mm4 \n\t" \
wolfSSL 0:d92f9d21154c 385 "movd 0(%2),%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 386 "paddq %%mm4,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 387 "pmuludq %%mm1,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 388 "movd 4(%2),%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 389 "paddq %%mm0,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 390 "movd 4(%1),%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 391 "movd %%mm3,0(%0) \n\t" \
wolfSSL 0:d92f9d21154c 392 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 393 \
wolfSSL 0:d92f9d21154c 394 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 395 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 396 "movd 8(%2),%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 397 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 398 "movd 8(%1),%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 399 "movd %%mm3,4(%0) \n\t" \
wolfSSL 0:d92f9d21154c 400 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 401 \
wolfSSL 0:d92f9d21154c 402 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 403 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 404 "movd 12(%2),%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 405 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 406 "movd 12(%1),%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 407 "movd %%mm3,8(%0) \n\t" \
wolfSSL 0:d92f9d21154c 408 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 409 \
wolfSSL 0:d92f9d21154c 410 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 411 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 412 "movd 16(%2),%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 413 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 414 "movd 16(%1),%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 415 "movd %%mm3,12(%0) \n\t" \
wolfSSL 0:d92f9d21154c 416 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 417 \
wolfSSL 0:d92f9d21154c 418 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 419 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 420 "movd 20(%2),%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 421 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 422 "movd 20(%1),%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 423 "movd %%mm3,16(%0) \n\t" \
wolfSSL 0:d92f9d21154c 424 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 425 \
wolfSSL 0:d92f9d21154c 426 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 427 "pmuludq %%mm1,%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 428 "movd 24(%2),%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 429 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 430 "movd 24(%1),%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 431 "movd %%mm3,20(%0) \n\t" \
wolfSSL 0:d92f9d21154c 432 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 433 \
wolfSSL 0:d92f9d21154c 434 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 435 "pmuludq %%mm1,%%mm7 \n\t" \
wolfSSL 0:d92f9d21154c 436 "movd 28(%2),%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 437 "paddq %%mm7,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 438 "movd 28(%1),%%mm6 \n\t" \
wolfSSL 0:d92f9d21154c 439 "movd %%mm3,24(%0) \n\t" \
wolfSSL 0:d92f9d21154c 440 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 441 \
wolfSSL 0:d92f9d21154c 442 "paddq %%mm6,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 443 "pmuludq %%mm1,%%mm5 \n\t" \
wolfSSL 0:d92f9d21154c 444 "paddq %%mm5,%%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 445 "movd %%mm3,28(%0) \n\t" \
wolfSSL 0:d92f9d21154c 446 "psrlq $32, %%mm3 \n\t" \
wolfSSL 0:d92f9d21154c 447 :"=r"(_c) : "0"(_c), "r"(tmpm) );
wolfSSL 0:d92f9d21154c 448
wolfSSL 0:d92f9d21154c 449 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
wolfSSL 0:d92f9d21154c 450 pointer */
wolfSSL 0:d92f9d21154c 451
wolfSSL 0:d92f9d21154c 452 #define LOOP_END \
wolfSSL 0:d92f9d21154c 453 __asm__( "movd %%mm3,%0 \n" :"=r"(cy))
wolfSSL 0:d92f9d21154c 454
wolfSSL 0:d92f9d21154c 455 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 456 __asm__( \
wolfSSL 0:d92f9d21154c 457 "addl %1,%0 \n\t" \
wolfSSL 0:d92f9d21154c 458 "setb %%al \n\t" \
wolfSSL 0:d92f9d21154c 459 "movzbl %%al,%1 \n\t" \
wolfSSL 0:d92f9d21154c 460 :"=g"(_c[LO]), "=r"(cy) \
wolfSSL 0:d92f9d21154c 461 :"0"(_c[LO]), "1"(cy) \
wolfSSL 0:d92f9d21154c 462 : "%eax", "cc")
wolfSSL 0:d92f9d21154c 463
wolfSSL 0:d92f9d21154c 464 /******************************************************************/
wolfSSL 0:d92f9d21154c 465 #elif defined(TFM_ARM)
wolfSSL 0:d92f9d21154c 466 /* ARMv4 code */
wolfSSL 0:d92f9d21154c 467
wolfSSL 0:d92f9d21154c 468 #define MONT_START
wolfSSL 0:d92f9d21154c 469 #define MONT_FINI
wolfSSL 0:d92f9d21154c 470 #define LOOP_END
wolfSSL 0:d92f9d21154c 471 #define LOOP_START \
wolfSSL 0:d92f9d21154c 472 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 473
wolfSSL 0:d92f9d21154c 474
wolfSSL 0:d92f9d21154c 475 #ifdef __thumb__
wolfSSL 0:d92f9d21154c 476
wolfSSL 0:d92f9d21154c 477 #define INNERMUL \
wolfSSL 0:d92f9d21154c 478 __asm__( \
wolfSSL 0:d92f9d21154c 479 " LDR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 480 " ADDS r0,r0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 481 " ITE CS \n\t" \
wolfSSL 0:d92f9d21154c 482 " MOVCS %0,#1 \n\t" \
wolfSSL 0:d92f9d21154c 483 " MOVCC %0,#0 \n\t" \
wolfSSL 0:d92f9d21154c 484 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 485 " STR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 486 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
wolfSSL 0:d92f9d21154c 487
wolfSSL 0:d92f9d21154c 488 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 489 __asm__( \
wolfSSL 0:d92f9d21154c 490 " LDR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 491 " ADDS r0,r0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 492 " STR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 493 " ITE CS \n\t" \
wolfSSL 0:d92f9d21154c 494 " MOVCS %0,#1 \n\t" \
wolfSSL 0:d92f9d21154c 495 " MOVCC %0,#0 \n\t" \
wolfSSL 0:d92f9d21154c 496 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
wolfSSL 0:d92f9d21154c 497
wolfSSL 0:d92f9d21154c 498
wolfSSL 0:d92f9d21154c 499 /* TAO thumb mode uses ite (if then else) to detect carry directly
wolfSSL 0:d92f9d21154c 500 * fixed unmatched constraint warning by changing 1 to m */
wolfSSL 0:d92f9d21154c 501
wolfSSL 0:d92f9d21154c 502 #else /* __thumb__ */
wolfSSL 0:d92f9d21154c 503
wolfSSL 0:d92f9d21154c 504 #define INNERMUL \
wolfSSL 0:d92f9d21154c 505 __asm__( \
wolfSSL 0:d92f9d21154c 506 " LDR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 507 " ADDS r0,r0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 508 " MOVCS %0,#1 \n\t" \
wolfSSL 0:d92f9d21154c 509 " MOVCC %0,#0 \n\t" \
wolfSSL 0:d92f9d21154c 510 " UMLAL r0,%0,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 511 " STR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 512 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
wolfSSL 0:d92f9d21154c 513
wolfSSL 0:d92f9d21154c 514 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 515 __asm__( \
wolfSSL 0:d92f9d21154c 516 " LDR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 517 " ADDS r0,r0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 518 " STR r0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 519 " MOVCS %0,#1 \n\t" \
wolfSSL 0:d92f9d21154c 520 " MOVCC %0,#0 \n\t" \
wolfSSL 0:d92f9d21154c 521 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
wolfSSL 0:d92f9d21154c 522
wolfSSL 0:d92f9d21154c 523 #endif /* __thumb__ */
wolfSSL 0:d92f9d21154c 524
wolfSSL 0:d92f9d21154c 525 #elif defined(TFM_PPC32)
wolfSSL 0:d92f9d21154c 526
wolfSSL 0:d92f9d21154c 527 /* PPC32 */
wolfSSL 0:d92f9d21154c 528 #define MONT_START
wolfSSL 0:d92f9d21154c 529 #define MONT_FINI
wolfSSL 0:d92f9d21154c 530 #define LOOP_END
wolfSSL 0:d92f9d21154c 531 #define LOOP_START \
wolfSSL 0:d92f9d21154c 532 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 533
wolfSSL 0:d92f9d21154c 534 #define INNERMUL \
wolfSSL 0:d92f9d21154c 535 __asm__( \
wolfSSL 0:d92f9d21154c 536 " mullw 16,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 537 " mulhwu 17,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 538 " addc 16,16,%0 \n\t" \
wolfSSL 0:d92f9d21154c 539 " addze 17,17 \n\t" \
wolfSSL 0:d92f9d21154c 540 " lwz 18,%1 \n\t" \
wolfSSL 0:d92f9d21154c 541 " addc 16,16,18 \n\t" \
wolfSSL 0:d92f9d21154c 542 " addze %0,17 \n\t" \
wolfSSL 0:d92f9d21154c 543 " stw 16,%1 \n\t" \
wolfSSL 0:d92f9d21154c 544 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
wolfSSL 0:d92f9d21154c 545
wolfSSL 0:d92f9d21154c 546 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 547 __asm__( \
wolfSSL 0:d92f9d21154c 548 " lwz 16,%1 \n\t" \
wolfSSL 0:d92f9d21154c 549 " addc 16,16,%0 \n\t" \
wolfSSL 0:d92f9d21154c 550 " stw 16,%1 \n\t" \
wolfSSL 0:d92f9d21154c 551 " xor %0,%0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 552 " addze %0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 553 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
wolfSSL 0:d92f9d21154c 554
wolfSSL 0:d92f9d21154c 555 #elif defined(TFM_PPC64)
wolfSSL 0:d92f9d21154c 556
wolfSSL 0:d92f9d21154c 557 /* PPC64 */
wolfSSL 0:d92f9d21154c 558 #define MONT_START
wolfSSL 0:d92f9d21154c 559 #define MONT_FINI
wolfSSL 0:d92f9d21154c 560 #define LOOP_END
wolfSSL 0:d92f9d21154c 561 #define LOOP_START \
wolfSSL 0:d92f9d21154c 562 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 563
wolfSSL 0:d92f9d21154c 564 #define INNERMUL \
wolfSSL 0:d92f9d21154c 565 __asm__( \
wolfSSL 0:d92f9d21154c 566 " mulld 16,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 567 " mulhdu 17,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 568 " addc 16,16,%0 \n\t" \
wolfSSL 0:d92f9d21154c 569 " addze 17,17 \n\t" \
wolfSSL 0:d92f9d21154c 570 " ldx 18,0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 571 " addc 16,16,18 \n\t" \
wolfSSL 0:d92f9d21154c 572 " addze %0,17 \n\t" \
wolfSSL 0:d92f9d21154c 573 " sdx 16,0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 574 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
wolfSSL 0:d92f9d21154c 575
wolfSSL 0:d92f9d21154c 576 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 577 __asm__( \
wolfSSL 0:d92f9d21154c 578 " ldx 16,0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 579 " addc 16,16,%0 \n\t" \
wolfSSL 0:d92f9d21154c 580 " sdx 16,0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 581 " xor %0,%0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 582 " addze %0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 583 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
wolfSSL 0:d92f9d21154c 584
wolfSSL 0:d92f9d21154c 585 /******************************************************************/
wolfSSL 0:d92f9d21154c 586
wolfSSL 0:d92f9d21154c 587 #elif defined(TFM_AVR32)
wolfSSL 0:d92f9d21154c 588
wolfSSL 0:d92f9d21154c 589 /* AVR32 */
wolfSSL 0:d92f9d21154c 590 #define MONT_START
wolfSSL 0:d92f9d21154c 591 #define MONT_FINI
wolfSSL 0:d92f9d21154c 592 #define LOOP_END
wolfSSL 0:d92f9d21154c 593 #define LOOP_START \
wolfSSL 0:d92f9d21154c 594 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 595
wolfSSL 0:d92f9d21154c 596 #define INNERMUL \
wolfSSL 0:d92f9d21154c 597 __asm__( \
wolfSSL 0:d92f9d21154c 598 " ld.w r2,%1 \n\t" \
wolfSSL 0:d92f9d21154c 599 " add r2,%0 \n\t" \
wolfSSL 0:d92f9d21154c 600 " eor r3,r3 \n\t" \
wolfSSL 0:d92f9d21154c 601 " acr r3 \n\t" \
wolfSSL 0:d92f9d21154c 602 " macu.d r2,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 603 " st.w %1,r2 \n\t" \
wolfSSL 0:d92f9d21154c 604 " mov %0,r3 \n\t" \
wolfSSL 0:d92f9d21154c 605 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
wolfSSL 0:d92f9d21154c 606
wolfSSL 0:d92f9d21154c 607 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 608 __asm__( \
wolfSSL 0:d92f9d21154c 609 " ld.w r2,%1 \n\t" \
wolfSSL 0:d92f9d21154c 610 " add r2,%0 \n\t" \
wolfSSL 0:d92f9d21154c 611 " st.w %1,r2 \n\t" \
wolfSSL 0:d92f9d21154c 612 " eor %0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 613 " acr %0 \n\t" \
wolfSSL 0:d92f9d21154c 614 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
wolfSSL 0:d92f9d21154c 615
wolfSSL 0:d92f9d21154c 616 #else
wolfSSL 0:d92f9d21154c 617
wolfSSL 0:d92f9d21154c 618 /* ISO C code */
wolfSSL 0:d92f9d21154c 619 #define MONT_START
wolfSSL 0:d92f9d21154c 620 #define MONT_FINI
wolfSSL 0:d92f9d21154c 621 #define LOOP_END
wolfSSL 0:d92f9d21154c 622 #define LOOP_START \
wolfSSL 0:d92f9d21154c 623 mu = c[x] * mp
wolfSSL 0:d92f9d21154c 624
wolfSSL 0:d92f9d21154c 625 #define INNERMUL \
wolfSSL 0:d92f9d21154c 626 do { fp_word t; \
wolfSSL 0:d92f9d21154c 627 t = ((fp_word)_c[0] + (fp_word)cy) + \
wolfSSL 0:d92f9d21154c 628 (((fp_word)mu) * ((fp_word)*tmpm++)); \
wolfSSL 0:d92f9d21154c 629 _c[0] = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 630 cy = (fp_digit)(t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 631 } while (0)
wolfSSL 0:d92f9d21154c 632
wolfSSL 0:d92f9d21154c 633 #define PROPCARRY \
wolfSSL 0:d92f9d21154c 634 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
wolfSSL 0:d92f9d21154c 635
wolfSSL 0:d92f9d21154c 636 #endif
wolfSSL 0:d92f9d21154c 637 /******************************************************************/
wolfSSL 0:d92f9d21154c 638
wolfSSL 0:d92f9d21154c 639
wolfSSL 0:d92f9d21154c 640 #define LO 0
wolfSSL 0:d92f9d21154c 641 /* end fp_montgomery_reduce.c asm */
wolfSSL 0:d92f9d21154c 642
wolfSSL 0:d92f9d21154c 643
wolfSSL 0:d92f9d21154c 644 /* start fp_sqr_comba.c asm */
wolfSSL 0:d92f9d21154c 645 #if defined(TFM_X86)
wolfSSL 0:d92f9d21154c 646
wolfSSL 0:d92f9d21154c 647 /* x86-32 optimized squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 648
wolfSSL 0:d92f9d21154c 649 #define COMBA_START
wolfSSL 0:d92f9d21154c 650
wolfSSL 0:d92f9d21154c 651 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 652 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 653
wolfSSL 0:d92f9d21154c 654 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 655 x = c0;
wolfSSL 0:d92f9d21154c 656
wolfSSL 0:d92f9d21154c 657 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 658 x = c1;
wolfSSL 0:d92f9d21154c 659
wolfSSL 0:d92f9d21154c 660 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 661 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 662
wolfSSL 0:d92f9d21154c 663 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 664 /* SQRADD: c2:c1:c0 += i * i; j is not used */
wolfSSL 0:d92f9d21154c 665 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 666 __asm__( \
wolfSSL 0:d92f9d21154c 667 "movl %6,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 668 "mull %%eax \n\t" \
wolfSSL 0:d92f9d21154c 669 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 670 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 671 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 672 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 673 /* SQRADD2: c2:c1:c0 += 2 * (i * j); the product is computed once and added twice */
wolfSSL 0:d92f9d21154c 674 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 675 __asm__( \
wolfSSL 0:d92f9d21154c 676 "movl %6,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 677 "mull %7 \n\t" \
wolfSSL 0:d92f9d21154c 678 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 679 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 680 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 681 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 682 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 683 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 684 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");
wolfSSL 0:d92f9d21154c 685 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared */
wolfSSL 0:d92f9d21154c 686 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 687 __asm__( \
wolfSSL 0:d92f9d21154c 688 "movl %3,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 689 "mull %4 \n\t" \
wolfSSL 0:d92f9d21154c 690 "movl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 691 "movl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 692 "xorl %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 693 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 694
wolfSSL 0:d92f9d21154c 695 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 (refers to SQRADDSC above) */
wolfSSL 0:d92f9d21154c 696 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 697 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 698 __asm__( \
wolfSSL 0:d92f9d21154c 699 "movl %6,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 700 "mull %7 \n\t" \
wolfSSL 0:d92f9d21154c 701 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 702 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 703 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 704 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 705 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice */
wolfSSL 0:d92f9d21154c 706 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 707 __asm__( \
wolfSSL 0:d92f9d21154c 708 "addl %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 709 "adcl %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 710 "adcl %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 711 "addl %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 712 "adcl %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 713 "adcl %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 714 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 0:d92f9d21154c 715
wolfSSL 0:d92f9d21154c 716 #elif defined(TFM_X86_64)
wolfSSL 0:d92f9d21154c 717 /* x86-64 optimized squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 718
wolfSSL 0:d92f9d21154c 719 #define COMBA_START
wolfSSL 0:d92f9d21154c 720
wolfSSL 0:d92f9d21154c 721 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 722 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 723
wolfSSL 0:d92f9d21154c 724 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 725 x = c0;
wolfSSL 0:d92f9d21154c 726
wolfSSL 0:d92f9d21154c 727 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 728 x = c1;
wolfSSL 0:d92f9d21154c 729
wolfSSL 0:d92f9d21154c 730 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 731 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 732
wolfSSL 0:d92f9d21154c 733 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 734 /* SQRADD: c2:c1:c0 += i * i; j is not used */
wolfSSL 0:d92f9d21154c 735 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 736 __asm__( \
wolfSSL 0:d92f9d21154c 737 "movq %6,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 738 "mulq %%rax \n\t" \
wolfSSL 0:d92f9d21154c 739 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 740 "adcq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 741 "adcq $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 742 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
wolfSSL 0:d92f9d21154c 743 /* SQRADD2: c2:c1:c0 += 2 * (i * j); the product is computed once and added twice */
wolfSSL 0:d92f9d21154c 744 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 745 __asm__( \
wolfSSL 0:d92f9d21154c 746 "movq %6,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 747 "mulq %7 \n\t" \
wolfSSL 0:d92f9d21154c 748 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 749 "adcq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 750 "adcq $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 751 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 752 "adcq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 753 "adcq $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 754 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 0:d92f9d21154c 755 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared */
wolfSSL 0:d92f9d21154c 756 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 757 __asm__( \
wolfSSL 0:d92f9d21154c 758 "movq %3,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 759 "mulq %4 \n\t" \
wolfSSL 0:d92f9d21154c 760 "movq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 761 "movq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 762 "xorq %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 763 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 0:d92f9d21154c 764
wolfSSL 0:d92f9d21154c 765 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 (refers to SQRADDSC above) */
wolfSSL 0:d92f9d21154c 766 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 767 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 768 __asm__( \
wolfSSL 0:d92f9d21154c 769 "movq %6,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 770 "mulq %7 \n\t" \
wolfSSL 0:d92f9d21154c 771 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 772 "adcq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 773 "adcq $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 774 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 0:d92f9d21154c 775 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice */
wolfSSL 0:d92f9d21154c 776 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 777 __asm__( \
wolfSSL 0:d92f9d21154c 778 "addq %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 779 "adcq %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 780 "adcq %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 781 "addq %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 782 "adcq %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 783 "adcq %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 784 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 0:d92f9d21154c 785
wolfSSL 0:d92f9d21154c 786 #elif defined(TFM_SSE2)
wolfSSL 0:d92f9d21154c 787
wolfSSL 0:d92f9d21154c 788 /* SSE2 Optimized squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 789 #define COMBA_START
wolfSSL 0:d92f9d21154c 790
wolfSSL 0:d92f9d21154c 791 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 792 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 793
wolfSSL 0:d92f9d21154c 794 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 795 x = c0;
wolfSSL 0:d92f9d21154c 796
wolfSSL 0:d92f9d21154c 797 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 798 x = c1;
wolfSSL 0:d92f9d21154c 799
wolfSSL 0:d92f9d21154c 800 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 801 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 802 /* COMBA_FINI issues emms; the SQRADD* macros below use the MMX registers mm0/mm1 */
wolfSSL 0:d92f9d21154c 803 #define COMBA_FINI \
wolfSSL 0:d92f9d21154c 804 __asm__("emms");
wolfSSL 0:d92f9d21154c 805 /* SQRADD: c2:c1:c0 += i * i; j is not used */
wolfSSL 0:d92f9d21154c 806 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 807 __asm__( \
wolfSSL 0:d92f9d21154c 808 "movd %6,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 809 "pmuludq %%mm0,%%mm0\n\t" \
wolfSSL 0:d92f9d21154c 810 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 811 "psrlq $32,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 812 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 813 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 814 "adcl %%eax,%1 \n\t" \
wolfSSL 0:d92f9d21154c 815 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 816 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
wolfSSL 0:d92f9d21154c 817 /* SQRADD2: c2:c1:c0 += 2 * (i * j); the product is computed once and added twice */
wolfSSL 0:d92f9d21154c 818 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 819 __asm__( \
wolfSSL 0:d92f9d21154c 820 "movd %6,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 821 "movd %7,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 822 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 0:d92f9d21154c 823 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 824 "psrlq $32,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 825 "movd %%mm0,%%edx \n\t" \
wolfSSL 0:d92f9d21154c 826 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 827 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 828 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 829 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 830 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 831 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 832 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 833 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared */
wolfSSL 0:d92f9d21154c 834 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 835 __asm__( \
wolfSSL 0:d92f9d21154c 836 "movd %3,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 837 "movd %4,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 838 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 0:d92f9d21154c 839 "movd %%mm0,%0 \n\t" \
wolfSSL 0:d92f9d21154c 840 "psrlq $32,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 841 "movd %%mm0,%1 \n\t" \
wolfSSL 0:d92f9d21154c 842 "xorl %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 843 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
wolfSSL 0:d92f9d21154c 844
wolfSSL 0:d92f9d21154c 845 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 (refers to SQRADDSC above) */
wolfSSL 0:d92f9d21154c 846 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 847 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 848 __asm__( \
wolfSSL 0:d92f9d21154c 849 "movd %6,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 850 "movd %7,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 851 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 0:d92f9d21154c 852 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 853 "psrlq $32,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 854 "movd %%mm0,%%edx \n\t" \
wolfSSL 0:d92f9d21154c 855 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 856 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 857 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 858 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 859 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice */
wolfSSL 0:d92f9d21154c 860 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 861 __asm__( \
wolfSSL 0:d92f9d21154c 862 "addl %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 863 "adcl %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 864 "adcl %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 865 "addl %6,%0 \n\t" \
wolfSSL 0:d92f9d21154c 866 "adcl %7,%1 \n\t" \
wolfSSL 0:d92f9d21154c 867 "adcl %8,%2 \n\t" \
wolfSSL 0:d92f9d21154c 868 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
wolfSSL 0:d92f9d21154c 869
wolfSSL 0:d92f9d21154c 870 #elif defined(TFM_ARM)
wolfSSL 0:d92f9d21154c 871
wolfSSL 0:d92f9d21154c 872 /* ARM code: squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 873
wolfSSL 0:d92f9d21154c 874 #define COMBA_START
wolfSSL 0:d92f9d21154c 875
wolfSSL 0:d92f9d21154c 876 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 877 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 878
wolfSSL 0:d92f9d21154c 879 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 880 x = c0;
wolfSSL 0:d92f9d21154c 881
wolfSSL 0:d92f9d21154c 882 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 883 x = c1;
wolfSSL 0:d92f9d21154c 884
wolfSSL 0:d92f9d21154c 885 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 886 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 887
wolfSSL 0:d92f9d21154c 888 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 889
wolfSSL 0:d92f9d21154c 890 /* squares i and adds the product into c2:c1:c0; j is not used */
wolfSSL 0:d92f9d21154c 891 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 892 __asm__( \
wolfSSL 0:d92f9d21154c 893 " UMULL r0,r1,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 894 " ADDS %0,%0,r0 \n\t" \
wolfSSL 0:d92f9d21154c 895 " ADCS %1,%1,r1 \n\t" \
wolfSSL 0:d92f9d21154c 896 " ADC %2,%2,#0 \n\t" \
wolfSSL 0:d92f9d21154c 897 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
wolfSSL 0:d92f9d21154c 898
wolfSSL 0:d92f9d21154c 899 /* for squaring some of the terms are doubled: adds i * j into c2:c1:c0 twice */
wolfSSL 0:d92f9d21154c 900 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 901 __asm__( \
wolfSSL 0:d92f9d21154c 902 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 903 " ADDS %0,%0,r0 \n\t" \
wolfSSL 0:d92f9d21154c 904 " ADCS %1,%1,r1 \n\t" \
wolfSSL 0:d92f9d21154c 905 " ADC %2,%2,#0 \n\t" \
wolfSSL 0:d92f9d21154c 906 " ADDS %0,%0,r0 \n\t" \
wolfSSL 0:d92f9d21154c 907 " ADCS %1,%1,r1 \n\t" \
wolfSSL 0:d92f9d21154c 908 " ADC %2,%2,#0 \n\t" \
wolfSSL 0:d92f9d21154c 909 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 0:d92f9d21154c 910 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared */
wolfSSL 0:d92f9d21154c 911 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 912 __asm__( \
wolfSSL 0:d92f9d21154c 913 " UMULL %0,%1,%3,%4 \n\t" \
wolfSSL 0:d92f9d21154c 914 " SUB %2,%2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 915 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");
wolfSSL 0:d92f9d21154c 916
wolfSSL 0:d92f9d21154c 917 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 (refers to SQRADDSC above) */
wolfSSL 0:d92f9d21154c 918 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 919 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 920 __asm__( \
wolfSSL 0:d92f9d21154c 921 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 922 " ADDS %0,%0,r0 \n\t" \
wolfSSL 0:d92f9d21154c 923 " ADCS %1,%1,r1 \n\t" \
wolfSSL 0:d92f9d21154c 924 " ADC %2,%2,#0 \n\t" \
wolfSSL 0:d92f9d21154c 925 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 0:d92f9d21154c 926 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice (here sc0-sc2 are operands %3-%5) */
wolfSSL 0:d92f9d21154c 927 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 928 __asm__( \
wolfSSL 0:d92f9d21154c 929 " ADDS %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 930 " ADCS %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 931 " ADC %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 932 " ADDS %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 933 " ADCS %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 934 " ADC %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 935 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 0:d92f9d21154c 936
wolfSSL 0:d92f9d21154c 937 #elif defined(TFM_PPC32)
wolfSSL 0:d92f9d21154c 938
wolfSSL 0:d92f9d21154c 939 /* PPC32 squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 940
wolfSSL 0:d92f9d21154c 941 #define COMBA_START
wolfSSL 0:d92f9d21154c 942
wolfSSL 0:d92f9d21154c 943 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 944 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 945
wolfSSL 0:d92f9d21154c 946 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 947 x = c0;
wolfSSL 0:d92f9d21154c 948
wolfSSL 0:d92f9d21154c 949 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 950 x = c1;
wolfSSL 0:d92f9d21154c 951
wolfSSL 0:d92f9d21154c 952 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 953 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 954
wolfSSL 0:d92f9d21154c 955 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 956
wolfSSL 0:d92f9d21154c 957 /* squares i and adds the product into c2:c1:c0; j is not used */
wolfSSL 0:d92f9d21154c 958 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 959 __asm__( \
wolfSSL 0:d92f9d21154c 960 " mullw 16,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 961 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 962 " mulhwu 16,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 963 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 964 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 965 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 0:d92f9d21154c 966
wolfSSL 0:d92f9d21154c 967 /* for squaring some of the terms are doubled: adds i * j into c2:c1:c0 twice */
wolfSSL 0:d92f9d21154c 968 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 969 __asm__( \
wolfSSL 0:d92f9d21154c 970 " mullw 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 971 " mulhwu 17,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 972 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 973 " adde %1,%1,17 \n\t" \
wolfSSL 0:d92f9d21154c 974 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 975 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 976 " adde %1,%1,17 \n\t" \
wolfSSL 0:d92f9d21154c 977 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 978 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 0:d92f9d21154c 979 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared; sc0-sc2 are still listed as inputs here, so i and j are operands %6 and %7 */
wolfSSL 0:d92f9d21154c 980 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 981 __asm__( \
wolfSSL 0:d92f9d21154c 982 " mullw %0,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 983 " mulhwu %1,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 984 " xor %2,%2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 985 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 0:d92f9d21154c 986 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 987 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 988 __asm__( \
wolfSSL 0:d92f9d21154c 989 " mullw 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 990 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 991 " mulhwu 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 992 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 993 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 994 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 0:d92f9d21154c 995 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice (here sc0-sc2 are operands %3-%5) */
wolfSSL 0:d92f9d21154c 996 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 997 __asm__( \
wolfSSL 0:d92f9d21154c 998 " addc %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 999 " adde %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1000 " adde %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1001 " addc %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 1002 " adde %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1003 " adde %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1004 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 0:d92f9d21154c 1005
wolfSSL 0:d92f9d21154c 1006 #elif defined(TFM_PPC64)
wolfSSL 0:d92f9d21154c 1007 /* PPC64 squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 1008
wolfSSL 0:d92f9d21154c 1009 #define COMBA_START
wolfSSL 0:d92f9d21154c 1010
wolfSSL 0:d92f9d21154c 1011 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 1012 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1013
wolfSSL 0:d92f9d21154c 1014 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1015 x = c0;
wolfSSL 0:d92f9d21154c 1016
wolfSSL 0:d92f9d21154c 1017 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1018 x = c1;
wolfSSL 0:d92f9d21154c 1019
wolfSSL 0:d92f9d21154c 1020 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 1021 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1022
wolfSSL 0:d92f9d21154c 1023 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1024
wolfSSL 0:d92f9d21154c 1025 /* squares i and adds the product into c2:c1:c0; j is not used */
wolfSSL 0:d92f9d21154c 1026 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 1027 __asm__( \
wolfSSL 0:d92f9d21154c 1028 " mulld 16,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 1029 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1030 " mulhdu 16,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 1031 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 1032 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1033 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
wolfSSL 0:d92f9d21154c 1034
wolfSSL 0:d92f9d21154c 1035 /* for squaring some of the terms are doubled: adds i * j into c2:c1:c0 twice */
wolfSSL 0:d92f9d21154c 1036 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 1037 __asm__( \
wolfSSL 0:d92f9d21154c 1038 " mulld 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1039 " mulhdu 17,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1040 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1041 " adde %1,%1,17 \n\t" \
wolfSSL 0:d92f9d21154c 1042 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1043 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1044 " adde %1,%1,17 \n\t" \
wolfSSL 0:d92f9d21154c 1045 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1046 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
wolfSSL 0:d92f9d21154c 1047 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared; sc0-sc2 are still listed as inputs here, so i and j are operands %6 and %7 */
wolfSSL 0:d92f9d21154c 1048 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 1049 __asm__( \
wolfSSL 0:d92f9d21154c 1050 " mulld %0,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1051 " mulhdu %1,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1052 " xor %2,%2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1053 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
wolfSSL 0:d92f9d21154c 1054 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 1055 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 1056 __asm__( \
wolfSSL 0:d92f9d21154c 1057 " mulld 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1058 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1059 " mulhdu 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1060 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 1061 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1062 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
wolfSSL 0:d92f9d21154c 1063 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice (here sc0-sc2 are operands %3-%5) */
wolfSSL 0:d92f9d21154c 1064 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 1065 __asm__( \
wolfSSL 0:d92f9d21154c 1066 " addc %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 1067 " adde %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1068 " adde %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1069 " addc %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 1070 " adde %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1071 " adde %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1072 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 0:d92f9d21154c 1073
wolfSSL 0:d92f9d21154c 1074
wolfSSL 0:d92f9d21154c 1075 #elif defined(TFM_AVR32)
wolfSSL 0:d92f9d21154c 1076
wolfSSL 0:d92f9d21154c 1077 /* AVR32 squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator. Every macro that changes the flags declares the "cc" clobber. */
wolfSSL 0:d92f9d21154c 1078
wolfSSL 0:d92f9d21154c 1079 #define COMBA_START
wolfSSL 0:d92f9d21154c 1080
wolfSSL 0:d92f9d21154c 1081 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 1082 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1083
wolfSSL 0:d92f9d21154c 1084 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1085 x = c0;
wolfSSL 0:d92f9d21154c 1086
wolfSSL 0:d92f9d21154c 1087 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1088 x = c1;
wolfSSL 0:d92f9d21154c 1089
wolfSSL 0:d92f9d21154c 1090 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 1091 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1092
wolfSSL 0:d92f9d21154c 1093 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1094
wolfSSL 0:d92f9d21154c 1095 /* squares i and adds the product into c2:c1:c0; j is not used */
wolfSSL 0:d92f9d21154c 1096 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 1097 __asm__( \
wolfSSL 0:d92f9d21154c 1098 " mulu.d r2,%6,%6 \n\t" \
wolfSSL 0:d92f9d21154c 1099 " add %0,%0,r2 \n\t" \
wolfSSL 0:d92f9d21154c 1100 " adc %1,%1,r3 \n\t" \
wolfSSL 0:d92f9d21154c 1101 " acr %2 \n\t" \
wolfSSL 0:d92f9d21154c 1102 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3","cc");
wolfSSL 0:d92f9d21154c 1103
wolfSSL 0:d92f9d21154c 1104 /* for squaring some of the terms are doubled: adds i * j into c2:c1:c0 twice */
wolfSSL 0:d92f9d21154c 1105 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 1106 __asm__( \
wolfSSL 0:d92f9d21154c 1107 " mulu.d r2,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1108 " add %0,%0,r2 \n\t" \
wolfSSL 0:d92f9d21154c 1109 " adc %1,%1,r3 \n\t" \
wolfSSL 0:d92f9d21154c 1110 " acr %2 \n\t" \
wolfSSL 0:d92f9d21154c 1111 " add %0,%0,r2 \n\t" \
wolfSSL 0:d92f9d21154c 1112 " adc %1,%1,r3 \n\t" \
wolfSSL 0:d92f9d21154c 1113 " acr %2 \n\t" \
wolfSSL 0:d92f9d21154c 1114 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3", "cc");
wolfSSL 0:d92f9d21154c 1115 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 cleared; sc0-sc2 are still listed as inputs here, so i and j are operands %6 and %7 */
wolfSSL 0:d92f9d21154c 1116 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 1117 __asm__( \
wolfSSL 0:d92f9d21154c 1118 " mulu.d r2,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1119 " mov %0,r2 \n\t" \
wolfSSL 0:d92f9d21154c 1120 " mov %1,r3 \n\t" \
wolfSSL 0:d92f9d21154c 1121 " eor %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1122 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3", "cc");
wolfSSL 0:d92f9d21154c 1123 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 1124 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 1125 __asm__( \
wolfSSL 0:d92f9d21154c 1126 " mulu.d r2,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1127 " add %0,%0,r2 \n\t" \
wolfSSL 0:d92f9d21154c 1128 " adc %1,%1,r3 \n\t" \
wolfSSL 0:d92f9d21154c 1129 " acr %2 \n\t" \
wolfSSL 0:d92f9d21154c 1130 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3", "cc");
wolfSSL 0:d92f9d21154c 1131 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; the sc words are added twice (here sc0-sc2 are operands %3-%5) */
wolfSSL 0:d92f9d21154c 1132 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 1133 __asm__( \
wolfSSL 0:d92f9d21154c 1134 " add %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 1135 " adc %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1136 " adc %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1137 " add %0,%0,%3 \n\t" \
wolfSSL 0:d92f9d21154c 1138 " adc %1,%1,%4 \n\t" \
wolfSSL 0:d92f9d21154c 1139 " adc %2,%2,%5 \n\t" \
wolfSSL 0:d92f9d21154c 1140 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
wolfSSL 0:d92f9d21154c 1141
wolfSSL 0:d92f9d21154c 1142
wolfSSL 0:d92f9d21154c 1143 #else
wolfSSL 0:d92f9d21154c 1144
wolfSSL 0:d92f9d21154c 1145 #define TFM_ISO
wolfSSL 0:d92f9d21154c 1146 /* TFM_ISO is defined only in this fallback branch, i.e. when none of the TFM_* assembler targets tested above is defined */
wolfSSL 0:d92f9d21154c 1147 /* ISO C portable code: squaring macros; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 1148
wolfSSL 0:d92f9d21154c 1149 #define COMBA_START
wolfSSL 0:d92f9d21154c 1150
wolfSSL 0:d92f9d21154c 1151 #define CLEAR_CARRY \
wolfSSL 0:d92f9d21154c 1152 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1153
wolfSSL 0:d92f9d21154c 1154 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1155 x = c0;
wolfSSL 0:d92f9d21154c 1156
wolfSSL 0:d92f9d21154c 1157 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1158 x = c1;
wolfSSL 0:d92f9d21154c 1159
wolfSSL 0:d92f9d21154c 1160 #define CARRY_FORWARD \
wolfSSL 0:d92f9d21154c 1161 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1162
wolfSSL 0:d92f9d21154c 1163 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1164
wolfSSL 0:d92f9d21154c 1165 /* adds i * j into c2:c1:c0, carrying the bits above DIGIT_BIT from each word into the next */
wolfSSL 0:d92f9d21154c 1166 #define SQRADD(i, j) \
wolfSSL 0:d92f9d21154c 1167 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1168 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1169 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1170 c2 +=(fp_digit) (t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1171 } while (0);
wolfSSL 0:d92f9d21154c 1172
wolfSSL 0:d92f9d21154c 1173
wolfSSL 0:d92f9d21154c 1174 /* for squaring some of the terms are doubled: adds i * j into c2:c1:c0 twice; tt is not declared here and must exist in the expanding scope */
wolfSSL 0:d92f9d21154c 1175 #define SQRADD2(i, j) \
wolfSSL 0:d92f9d21154c 1176 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1177 t = ((fp_word)i) * ((fp_word)j); \
wolfSSL 0:d92f9d21154c 1178 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
wolfSSL 0:d92f9d21154c 1179 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
wolfSSL 0:d92f9d21154c 1180 c2 +=(fp_digit)( tt >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1181 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \
wolfSSL 0:d92f9d21154c 1182 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \
wolfSSL 0:d92f9d21154c 1183 c2 +=(fp_digit) (tt >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1184 } while (0);
wolfSSL 0:d92f9d21154c 1185 /* SQRADDSC: sc2:sc1:sc0 = i * j, with sc2 set to 0 */
wolfSSL 0:d92f9d21154c 1186 #define SQRADDSC(i, j) \
wolfSSL 0:d92f9d21154c 1187 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1188 t = ((fp_word)i) * ((fp_word)j); \
wolfSSL 0:d92f9d21154c 1189 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \
wolfSSL 0:d92f9d21154c 1190 } while (0);
wolfSSL 0:d92f9d21154c 1191 /* SQRADDAC: sc2:sc1:sc0 += i * j */
wolfSSL 0:d92f9d21154c 1192 #define SQRADDAC(i, j) \
wolfSSL 0:d92f9d21154c 1193 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1194 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1195 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1196 sc2 += (fp_digit)(t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1197 } while (0);
wolfSSL 0:d92f9d21154c 1198 /* SQRADDDB: c2:c1:c0 += 2 * sc2:sc1:sc0; each sc word is added twice together with the carry from the word below */
wolfSSL 0:d92f9d21154c 1199 #define SQRADDDB \
wolfSSL 0:d92f9d21154c 1200 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1201 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1202 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1203 c1 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1204 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \
wolfSSL 0:d92f9d21154c 1205 } while (0);
wolfSSL 0:d92f9d21154c 1206
wolfSSL 0:d92f9d21154c 1207 #endif
wolfSSL 0:d92f9d21154c 1208
wolfSSL 0:d92f9d21154c 1209 #ifdef TFM_SMALL_SET
wolfSSL 0:d92f9d21154c 1210 #include "fp_sqr_comba_small_set.i"
wolfSSL 0:d92f9d21154c 1211 #endif
wolfSSL 0:d92f9d21154c 1212
wolfSSL 0:d92f9d21154c 1213 #if defined(TFM_SQR3)
wolfSSL 0:d92f9d21154c 1214 #include "fp_sqr_comba_3.i"
wolfSSL 0:d92f9d21154c 1215 #endif
wolfSSL 0:d92f9d21154c 1216 #if defined(TFM_SQR4)
wolfSSL 0:d92f9d21154c 1217 #include "fp_sqr_comba_4.i"
wolfSSL 0:d92f9d21154c 1218 #endif
wolfSSL 0:d92f9d21154c 1219 #if defined(TFM_SQR6)
wolfSSL 0:d92f9d21154c 1220 #include "fp_sqr_comba_6.i"
wolfSSL 0:d92f9d21154c 1221 #endif
wolfSSL 0:d92f9d21154c 1222 #if defined(TFM_SQR7)
wolfSSL 0:d92f9d21154c 1223 #include "fp_sqr_comba_7.i"
wolfSSL 0:d92f9d21154c 1224 #endif
wolfSSL 0:d92f9d21154c 1225 #if defined(TFM_SQR8)
wolfSSL 0:d92f9d21154c 1226 #include "fp_sqr_comba_8.i"
wolfSSL 0:d92f9d21154c 1227 #endif
wolfSSL 0:d92f9d21154c 1228 #if defined(TFM_SQR9)
wolfSSL 0:d92f9d21154c 1229 #include "fp_sqr_comba_9.i"
wolfSSL 0:d92f9d21154c 1230 #endif
wolfSSL 0:d92f9d21154c 1231 #if defined(TFM_SQR12)
wolfSSL 0:d92f9d21154c 1232 #include "fp_sqr_comba_12.i"
wolfSSL 0:d92f9d21154c 1233 #endif
wolfSSL 0:d92f9d21154c 1234 #if defined(TFM_SQR17)
wolfSSL 0:d92f9d21154c 1235 #include "fp_sqr_comba_17.i"
wolfSSL 0:d92f9d21154c 1236 #endif
wolfSSL 0:d92f9d21154c 1237 #if defined(TFM_SQR20)
wolfSSL 0:d92f9d21154c 1238 #include "fp_sqr_comba_20.i"
wolfSSL 0:d92f9d21154c 1239 #endif
wolfSSL 0:d92f9d21154c 1240 #if defined(TFM_SQR24)
wolfSSL 0:d92f9d21154c 1241 #include "fp_sqr_comba_24.i"
wolfSSL 0:d92f9d21154c 1242 #endif
wolfSSL 0:d92f9d21154c 1243 #if defined(TFM_SQR28)
wolfSSL 0:d92f9d21154c 1244 #include "fp_sqr_comba_28.i"
wolfSSL 0:d92f9d21154c 1245 #endif
wolfSSL 0:d92f9d21154c 1246 #if defined(TFM_SQR32)
wolfSSL 0:d92f9d21154c 1247 #include "fp_sqr_comba_32.i"
wolfSSL 0:d92f9d21154c 1248 #endif
wolfSSL 0:d92f9d21154c 1249 #if defined(TFM_SQR48)
wolfSSL 0:d92f9d21154c 1250 #include "fp_sqr_comba_48.i"
wolfSSL 0:d92f9d21154c 1251 #endif
wolfSSL 0:d92f9d21154c 1252 #if defined(TFM_SQR64)
wolfSSL 0:d92f9d21154c 1253 #include "fp_sqr_comba_64.i"
wolfSSL 0:d92f9d21154c 1254 #endif
wolfSSL 0:d92f9d21154c 1255 /* end fp_sqr_comba.c asm */
wolfSSL 0:d92f9d21154c 1256
wolfSSL 0:d92f9d21154c 1257 /* start fp_mul_comba.c asm */
wolfSSL 0:d92f9d21154c 1258 /* these are the combas. Worship them. */
wolfSSL 0:d92f9d21154c 1259 #if defined(TFM_X86)
wolfSSL 0:d92f9d21154c 1260 /* Generic x86 optimized code; c0, c1 and c2 are the low, middle and high words of a three-word accumulator */
wolfSSL 0:d92f9d21154c 1261
wolfSSL 0:d92f9d21154c 1262 /* anything you need at the start */
wolfSSL 0:d92f9d21154c 1263 #define COMBA_START
wolfSSL 0:d92f9d21154c 1264
wolfSSL 0:d92f9d21154c 1265 /* clear the chaining variables c0, c1 and c2 */
wolfSSL 0:d92f9d21154c 1266 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1267 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1268
wolfSSL 0:d92f9d21154c 1269 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
wolfSSL 0:d92f9d21154c 1270 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1271 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1272
wolfSSL 0:d92f9d21154c 1273 /* store the first sum (c0) into x */
wolfSSL 0:d92f9d21154c 1274 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1275 x = c0;
wolfSSL 0:d92f9d21154c 1276
wolfSSL 0:d92f9d21154c 1277 /* store the second sum [carry] (c1) into x */
wolfSSL 0:d92f9d21154c 1278 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1279 x = c1;
wolfSSL 0:d92f9d21154c 1280
wolfSSL 0:d92f9d21154c 1281 /* anything you need at the end */
wolfSSL 0:d92f9d21154c 1282 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1283
wolfSSL 0:d92f9d21154c 1284 /* multiplies i and j and adds the product into c2:c1:c0 */
wolfSSL 0:d92f9d21154c 1285 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1286 __asm__( \
wolfSSL 0:d92f9d21154c 1287 "movl %6,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 1288 "mull %7 \n\t" \
wolfSSL 0:d92f9d21154c 1289 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 1290 "adcl %%edx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 1291 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1292 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
wolfSSL 0:d92f9d21154c 1293
wolfSSL 0:d92f9d21154c 1294 #elif defined(TFM_X86_64)
wolfSSL 0:d92f9d21154c 1295 /* x86-64 optimized: 64-bit MULADD built on the one-operand mulq */
wolfSSL 0:d92f9d21154c 1296 /* c0, c1 and c2 are not declared here; the code expanding these macros supplies them */
wolfSSL 0:d92f9d21154c 1297 /* hook for anything needed at the start; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1298 #define COMBA_START
wolfSSL 0:d92f9d21154c 1299
wolfSSL 0:d92f9d21154c 1300 /* clear the chaining variables: c0, c1 and c2 all become 0 */
wolfSSL 0:d92f9d21154c 1301 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1302 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1303
wolfSSL 0:d92f9d21154c 1304 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1305 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1306 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1307
wolfSSL 0:d92f9d21154c 1308 /* store the first sum: x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1309 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1310 x = c0;
wolfSSL 0:d92f9d21154c 1311
wolfSSL 0:d92f9d21154c 1312 /* store the second sum [carry]: x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1313 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1314 x = c1;
wolfSSL 0:d92f9d21154c 1315
wolfSSL 0:d92f9d21154c 1316 /* hook for anything needed at the end; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1317 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1318 /* NOTE(review): i and j use the "g" constraint, which also admits immediates that mulq cannot take - confirm callers only pass lvalues */
wolfSSL 0:d92f9d21154c 1319 /* c2:c1:c0 += i * j; i is loaded into rax, mulq leaves the product in rdx:rax; clobbers rax, rdx, flags */
wolfSSL 0:d92f9d21154c 1320 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1321 __asm__ ( \
wolfSSL 0:d92f9d21154c 1322 "movq %6,%%rax \n\t" \
wolfSSL 0:d92f9d21154c 1323 "mulq %7 \n\t" \
wolfSSL 0:d92f9d21154c 1324 "addq %%rax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 1325 "adcq %%rdx,%1 \n\t" \
wolfSSL 0:d92f9d21154c 1326 "adcq $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1327 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
wolfSSL 0:d92f9d21154c 1328 /* Row multiply built on mulx/adox/adcx; compiled only when HAVE_INTEL_MULX is defined */
wolfSSL 0:d92f9d21154c 1329 /* MULADD_MULX: loads the rdx argument into %rdx, mulx puts rdx * b0 in r8:r9, adox adds r9 to c0, adcx adds r8 to c1 */
wolfSSL 0:d92f9d21154c 1330 #if defined(HAVE_INTEL_MULX)
wolfSSL 0:d92f9d21154c 1331 #define MULADD_MULX(b0, c0, c1, rdx)\
wolfSSL 0:d92f9d21154c 1332 __asm__ volatile ( \
wolfSSL 0:d92f9d21154c 1333 "movq %3, %%rdx\n\t" \
wolfSSL 0:d92f9d21154c 1334 "mulx %2,%%r9, %%r8 \n\t" \
wolfSSL 0:d92f9d21154c 1335 "adoxq %%r9,%0 \n\t" \
wolfSSL 0:d92f9d21154c 1336 "adcxq %%r8,%1 \n\t" \
wolfSSL 0:d92f9d21154c 1337 :"+r"(c0),"+r"(c1):"r"(b0), "r"(rdx):"%r8","%r9","%r10","%rdx"\
wolfSSL 0:d92f9d21154c 1338 )
wolfSSL 0:d92f9d21154c 1339 /* NOTE(review): the adox/adcx chains appear to rely on OF/CF surviving between separate asm statements - confirm the compiler cannot disturb them */
wolfSSL 0:d92f9d21154c 1340 /* MULADD_MULX_ADD_CARRY: zeroes r10 and adds it with adox to c0 and with adcx to c1, folding pending carries in; the copy of c1 into r8 is not used afterwards */
wolfSSL 0:d92f9d21154c 1341 #define MULADD_MULX_ADD_CARRY(c0, c1)\
wolfSSL 0:d92f9d21154c 1342 __asm__ volatile(\
wolfSSL 0:d92f9d21154c 1343 "mov $0, %%r10\n\t"\
wolfSSL 0:d92f9d21154c 1344 "movq %1, %%r8\n\t"\
wolfSSL 0:d92f9d21154c 1345 "adox %%r10, %0\n\t"\
wolfSSL 0:d92f9d21154c 1346 "adcx %%r10, %1\n\t"\
wolfSSL 0:d92f9d21154c 1347 :"+r"(c0),"+r"(c1)::"%r8","%r9","%r10","%rdx") ;
wolfSSL 0:d92f9d21154c 1348 /* MULADD_SET_A: "add $0, r8" presumably resets the carry flags before a new chain (TODO confirm), then a0 is copied into rdx */
wolfSSL 0:d92f9d21154c 1349 #define MULADD_SET_A(a0)\
wolfSSL 0:d92f9d21154c 1350 __asm__ volatile("add $0, %%r8\n\t" \
wolfSSL 0:d92f9d21154c 1351 "movq %0,%%rdx\n\t" \
wolfSSL 0:d92f9d21154c 1352 ::"r"(a0):"%r8","%r9","%r10","%rdx") ;
wolfSSL 0:d92f9d21154c 1353 /* MULADD_BODY: adds a->dp[ix] * (b0..b3) into c->dp[iz..iz+5]; ix, iz, b0..b3, c0, c1 and cp come from the expansion site */
wolfSSL 0:d92f9d21154c 1354 #define MULADD_BODY(a,b,c)\
wolfSSL 0:d92f9d21154c 1355 { word64 rdx = a->dp[ix] ; \
wolfSSL 0:d92f9d21154c 1356 cp = &(c->dp[iz]) ; \
wolfSSL 0:d92f9d21154c 1357 c0 = cp[0] ; c1 = cp[1]; \
wolfSSL 0:d92f9d21154c 1358 MULADD_SET_A(rdx) ; \
wolfSSL 0:d92f9d21154c 1359 MULADD_MULX(b0, c0, c1, rdx) ;\
wolfSSL 0:d92f9d21154c 1360 cp[0]=c0; c0=cp[2]; \
wolfSSL 0:d92f9d21154c 1361 MULADD_MULX(b1, c1, c0, rdx) ;\
wolfSSL 0:d92f9d21154c 1362 cp[1]=c1; c1=cp[3]; \
wolfSSL 0:d92f9d21154c 1363 MULADD_MULX(b2, c0, c1, rdx) ;\
wolfSSL 0:d92f9d21154c 1364 cp[2]=c0; c0=cp[4]; \
wolfSSL 0:d92f9d21154c 1365 MULADD_MULX(b3, c1, c0, rdx) ;\
wolfSSL 0:d92f9d21154c 1366 cp[3]=c1; c1=cp[5]; \
wolfSSL 0:d92f9d21154c 1367 MULADD_MULX_ADD_CARRY(c0, c1);\
wolfSSL 0:d92f9d21154c 1368 cp[4]=c0; cp[5]=c1; \
wolfSSL 0:d92f9d21154c 1369 }
wolfSSL 0:d92f9d21154c 1370 /* TFM_INTEL_MUL_COMBA: zeroes c->dp[0..pa-1], then takes b four digits at a time and runs MULADD_BODY for every digit of a; ix, iy, iz, pa come from the expansion site. NOTE(review): bp[1..3] are read even when iy+3 >= b->used - presumably unused digits are zero, confirm */
wolfSSL 0:d92f9d21154c 1371 #define TFM_INTEL_MUL_COMBA(a, b, c)\
wolfSSL 0:d92f9d21154c 1372 for(ix=0; ix<pa; ix++)c->dp[ix]=0 ; \
wolfSSL 0:d92f9d21154c 1373 for(iy=0; (iy<b->used); iy+=4) { \
wolfSSL 0:d92f9d21154c 1374 fp_digit *bp ; \
wolfSSL 0:d92f9d21154c 1375 bp = &(b->dp[iy+0]) ; \
wolfSSL 0:d92f9d21154c 1376 fp_digit b0 = bp[0] , b1= bp[1], \
wolfSSL 0:d92f9d21154c 1377 b2= bp[2], b3= bp[3]; \
wolfSSL 0:d92f9d21154c 1378 ix=0, iz=iy; \
wolfSSL 0:d92f9d21154c 1379 while(ix<a->used) { \
wolfSSL 0:d92f9d21154c 1380 fp_digit c0, c1; \
wolfSSL 0:d92f9d21154c 1381 fp_digit *cp ; \
wolfSSL 0:d92f9d21154c 1382 MULADD_BODY(a,b,c); \
wolfSSL 0:d92f9d21154c 1383 ix++ ; iz++ ; \
wolfSSL 0:d92f9d21154c 1384 } \
wolfSSL 0:d92f9d21154c 1385 };
wolfSSL 0:d92f9d21154c 1386 #endif
wolfSSL 0:d92f9d21154c 1387
wolfSSL 0:d92f9d21154c 1388 #elif defined(TFM_SSE2)
wolfSSL 0:d92f9d21154c 1389 /* use SSE2 optimizations: the 32x32 product is formed with pmuludq in the MMX registers */
wolfSSL 0:d92f9d21154c 1390 /* c0, c1 and c2 are not declared here; the code expanding these macros supplies them */
wolfSSL 0:d92f9d21154c 1391 /* hook for anything needed at the start; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1392 #define COMBA_START
wolfSSL 0:d92f9d21154c 1393
wolfSSL 0:d92f9d21154c 1394 /* clear the chaining variables: c0, c1 and c2 all become 0 */
wolfSSL 0:d92f9d21154c 1395 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1396 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1397
wolfSSL 0:d92f9d21154c 1398 /* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1399 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1400 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1401
wolfSSL 0:d92f9d21154c 1402 /* store the first sum: x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1403 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1404 x = c0;
wolfSSL 0:d92f9d21154c 1405
wolfSSL 0:d92f9d21154c 1406 /* store the second sum [carry]: x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1407 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1408 x = c1;
wolfSSL 0:d92f9d21154c 1409
wolfSSL 0:d92f9d21154c 1410 /* end hook: issues emms, since MULADD below works in the MMX registers */
wolfSSL 0:d92f9d21154c 1411 #define COMBA_FINI \
wolfSSL 0:d92f9d21154c 1412 __asm__("emms");
wolfSSL 0:d92f9d21154c 1413 /* NOTE(review): mm0 and mm1 are used as scratch below but are not in the clobber list - confirm this is safe for the callers */
wolfSSL 0:d92f9d21154c 1414 /* c2:c1:c0 += i * j; pmuludq forms the product in mm0, its low then high 32 bits pass through eax into c0 and c1 */
wolfSSL 0:d92f9d21154c 1415 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1416 __asm__( \
wolfSSL 0:d92f9d21154c 1417 "movd %6,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 1418 "movd %7,%%mm1 \n\t" \
wolfSSL 0:d92f9d21154c 1419 "pmuludq %%mm1,%%mm0\n\t" \
wolfSSL 0:d92f9d21154c 1420 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 1421 "psrlq $32,%%mm0 \n\t" \
wolfSSL 0:d92f9d21154c 1422 "addl %%eax,%0 \n\t" \
wolfSSL 0:d92f9d21154c 1423 "movd %%mm0,%%eax \n\t" \
wolfSSL 0:d92f9d21154c 1424 "adcl %%eax,%1 \n\t" \
wolfSSL 0:d92f9d21154c 1425 "adcl $0,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1426 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
wolfSSL 0:d92f9d21154c 1427
wolfSSL 0:d92f9d21154c 1428 #elif defined(TFM_ARM)
wolfSSL 0:d92f9d21154c 1429 /* ARM code: MULADD is built on UMULL; c0, c1, c2 are supplied by the code expanding these macros */
wolfSSL 0:d92f9d21154c 1430 /* start hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1431 #define COMBA_START
wolfSSL 0:d92f9d21154c 1432 /* zero all three accumulator words */
wolfSSL 0:d92f9d21154c 1433 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1434 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1435 /* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1436 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1437 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1438 /* x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1439 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1440 x = c0;
wolfSSL 0:d92f9d21154c 1441 /* x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1442 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1443 x = c1;
wolfSSL 0:d92f9d21154c 1444 /* end hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1445 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1446 /* c2:c1:c0 += i * j; UMULL leaves the product in r1:r0, then ADDS/ADCS/ADC ripple it in; clobbers r0, r1, flags */
wolfSSL 0:d92f9d21154c 1447 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1448 __asm__( \
wolfSSL 0:d92f9d21154c 1449 " UMULL r0,r1,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1450 " ADDS %0,%0,r0 \n\t" \
wolfSSL 0:d92f9d21154c 1451 " ADCS %1,%1,r1 \n\t" \
wolfSSL 0:d92f9d21154c 1452 " ADC %2,%2,#0 \n\t" \
wolfSSL 0:d92f9d21154c 1453 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
wolfSSL 0:d92f9d21154c 1454
wolfSSL 0:d92f9d21154c 1455 #elif defined(TFM_PPC32)
wolfSSL 0:d92f9d21154c 1456 /* For 32-bit PPC: MULADD uses mullw/mulhwu; c0, c1, c2 are supplied by the code expanding these macros */
wolfSSL 0:d92f9d21154c 1457 /* start hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1458 #define COMBA_START
wolfSSL 0:d92f9d21154c 1459 /* zero all three accumulator words */
wolfSSL 0:d92f9d21154c 1460 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1461 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1462 /* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1463 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1464 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1465 /* x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1466 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1467 x = c0;
wolfSSL 0:d92f9d21154c 1468 /* x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1469 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1470 x = c1;
wolfSSL 0:d92f9d21154c 1471 /* end hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1472 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1473 /* c2:c1:c0 += i * j; register 16 is scratch for the low (mullw) then high (mulhwu) product word */
wolfSSL 0:d92f9d21154c 1474 /* untested: will mulhwu change the flags (the carry left by addc for adde)? Docs say no */
wolfSSL 0:d92f9d21154c 1475 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1476 __asm__( \
wolfSSL 0:d92f9d21154c 1477 " mullw 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1478 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1479 " mulhwu 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1480 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 1481 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1482 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 0:d92f9d21154c 1483
wolfSSL 0:d92f9d21154c 1484 #elif defined(TFM_PPC64)
wolfSSL 0:d92f9d21154c 1485 /* For 64-bit PPC: MULADD uses mulld/mulhdu; c0, c1, c2 are supplied by the code expanding these macros */
wolfSSL 0:d92f9d21154c 1486 /* start hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1487 #define COMBA_START
wolfSSL 0:d92f9d21154c 1488 /* zero all three accumulator words */
wolfSSL 0:d92f9d21154c 1489 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1490 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1491 /* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1492 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1493 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1494 /* x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1495 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1496 x = c0;
wolfSSL 0:d92f9d21154c 1497 /* x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1498 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1499 x = c1;
wolfSSL 0:d92f9d21154c 1500 /* end hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1501 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1502 /* c2:c1:c0 += i * j; register 16 is scratch for the low (mulld) then high (mulhdu) product word */
wolfSSL 0:d92f9d21154c 1503 /* untested: will mulhdu change the flags (the carry left by addc for adde)? Docs say no */
wolfSSL 0:d92f9d21154c 1504 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1505 __asm__( \
wolfSSL 0:d92f9d21154c 1506 " mulld 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1507 " addc %0,%0,16 \n\t" \
wolfSSL 0:d92f9d21154c 1508 " mulhdu 16,%6,%7 \n\t" \
wolfSSL 0:d92f9d21154c 1509 " adde %1,%1,16 \n\t" \
wolfSSL 0:d92f9d21154c 1510 " addze %2,%2 \n\t" \
wolfSSL 0:d92f9d21154c 1511 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
wolfSSL 0:d92f9d21154c 1512
wolfSSL 0:d92f9d21154c 1513 #elif defined(TFM_AVR32)
wolfSSL 0:d92f9d21154c 1514
wolfSSL 0:d92f9d21154c 1515 /* AVR32 code: MULADD uses mulu.d; c0, c1, c2 are supplied by the code expanding these macros */
wolfSSL 0:d92f9d21154c 1516 /* start hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1517 #define COMBA_START
wolfSSL 0:d92f9d21154c 1518 /* zero all three accumulator words */
wolfSSL 0:d92f9d21154c 1519 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1520 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1521 /* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1522 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1523 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1524 /* x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1525 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1526 x = c0;
wolfSSL 0:d92f9d21154c 1527 /* x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1528 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1529 x = c1;
wolfSSL 0:d92f9d21154c 1530 /* end hook; expands to nothing for this target */
wolfSSL 0:d92f9d21154c 1531 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1532 /* c2:c1:c0 += i * j; mulu.d targets r2 and r3 is then read as the upper product word, acr adds the final carry to c2; clobbers r2, r3 */
wolfSSL 0:d92f9d21154c 1533 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1534 __asm__( \
wolfSSL 0:d92f9d21154c 1535 " mulu.d r2,%6,%7 \n\t"\
wolfSSL 0:d92f9d21154c 1536 " add %0,r2 \n\t"\
wolfSSL 0:d92f9d21154c 1537 " adc %1,%1,r3 \n\t"\
wolfSSL 0:d92f9d21154c 1538 " acr %2 \n\t"\
wolfSSL 0:d92f9d21154c 1539 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
wolfSSL 0:d92f9d21154c 1540
wolfSSL 0:d92f9d21154c 1541 #else
wolfSSL 0:d92f9d21154c 1542 /* ISO C code: fallback without inline assembly; c0, c1, c2 are supplied by the code expanding these macros */
wolfSSL 0:d92f9d21154c 1543 /* start hook; expands to nothing here */
wolfSSL 0:d92f9d21154c 1544 #define COMBA_START
wolfSSL 0:d92f9d21154c 1545 /* zero all three accumulator words */
wolfSSL 0:d92f9d21154c 1546 #define COMBA_CLEAR \
wolfSSL 0:d92f9d21154c 1547 c0 = c1 = c2 = 0;
wolfSSL 0:d92f9d21154c 1548 /* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 (macro already ends in ';') */
wolfSSL 0:d92f9d21154c 1549 #define COMBA_FORWARD \
wolfSSL 0:d92f9d21154c 1550 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
wolfSSL 0:d92f9d21154c 1551 /* x receives the low word c0 */
wolfSSL 0:d92f9d21154c 1552 #define COMBA_STORE(x) \
wolfSSL 0:d92f9d21154c 1553 x = c0;
wolfSSL 0:d92f9d21154c 1554 /* x receives the middle word c1 */
wolfSSL 0:d92f9d21154c 1555 #define COMBA_STORE2(x) \
wolfSSL 0:d92f9d21154c 1556 x = c1;
wolfSSL 0:d92f9d21154c 1557 /* end hook; expands to nothing here */
wolfSSL 0:d92f9d21154c 1558 #define COMBA_FINI
wolfSSL 0:d92f9d21154c 1559 /* c2:c1:c0 += i * j in fp_word arithmetic: each sum keeps its low fp_digit and passes (t >> DIGIT_BIT) upward */
wolfSSL 0:d92f9d21154c 1560 #define MULADD(i, j) \
wolfSSL 0:d92f9d21154c 1561 do { fp_word t; \
wolfSSL 0:d92f9d21154c 1562 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \
wolfSSL 0:d92f9d21154c 1563 t = (fp_word)c1 + (t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1564 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \
wolfSSL 0:d92f9d21154c 1565 } while (0);
wolfSSL 0:d92f9d21154c 1566
wolfSSL 0:d92f9d21154c 1567 #endif
wolfSSL 0:d92f9d21154c 1568
wolfSSL 0:d92f9d21154c 1569
wolfSSL 0:d92f9d21154c 1570 #ifdef TFM_SMALL_SET
wolfSSL 0:d92f9d21154c 1571 #include "fp_mul_comba_small_set.i"
wolfSSL 0:d92f9d21154c 1572 #endif
wolfSSL 0:d92f9d21154c 1573
wolfSSL 0:d92f9d21154c 1574 #if defined(TFM_MUL3)
wolfSSL 0:d92f9d21154c 1575 #include "fp_mul_comba_3.i"
wolfSSL 0:d92f9d21154c 1576 #endif
wolfSSL 0:d92f9d21154c 1577 #if defined(TFM_MUL4)
wolfSSL 0:d92f9d21154c 1578 #include "fp_mul_comba_4.i"
wolfSSL 0:d92f9d21154c 1579 #endif
wolfSSL 0:d92f9d21154c 1580 #if defined(TFM_MUL6)
wolfSSL 0:d92f9d21154c 1581 #include "fp_mul_comba_6.i"
wolfSSL 0:d92f9d21154c 1582 #endif
wolfSSL 0:d92f9d21154c 1583 #if defined(TFM_MUL7)
wolfSSL 0:d92f9d21154c 1584 #include "fp_mul_comba_7.i"
wolfSSL 0:d92f9d21154c 1585 #endif
wolfSSL 0:d92f9d21154c 1586 #if defined(TFM_MUL8)
wolfSSL 0:d92f9d21154c 1587 #include "fp_mul_comba_8.i"
wolfSSL 0:d92f9d21154c 1588 #endif
wolfSSL 0:d92f9d21154c 1589 #if defined(TFM_MUL9)
wolfSSL 0:d92f9d21154c 1590 #include "fp_mul_comba_9.i"
wolfSSL 0:d92f9d21154c 1591 #endif
wolfSSL 0:d92f9d21154c 1592 #if defined(TFM_MUL12)
wolfSSL 0:d92f9d21154c 1593 #include "fp_mul_comba_12.i"
wolfSSL 0:d92f9d21154c 1594 #endif
wolfSSL 0:d92f9d21154c 1595 #if defined(TFM_MUL17)
wolfSSL 0:d92f9d21154c 1596 #include "fp_mul_comba_17.i"
wolfSSL 0:d92f9d21154c 1597 #endif
wolfSSL 0:d92f9d21154c 1598 #if defined(TFM_MUL20)
wolfSSL 0:d92f9d21154c 1599 #include "fp_mul_comba_20.i"
wolfSSL 0:d92f9d21154c 1600 #endif
wolfSSL 0:d92f9d21154c 1601 #if defined(TFM_MUL24)
wolfSSL 0:d92f9d21154c 1602 #include "fp_mul_comba_24.i"
wolfSSL 0:d92f9d21154c 1603 #endif
wolfSSL 0:d92f9d21154c 1604 #if defined(TFM_MUL28)
wolfSSL 0:d92f9d21154c 1605 #include "fp_mul_comba_28.i"
wolfSSL 0:d92f9d21154c 1606 #endif
wolfSSL 0:d92f9d21154c 1607 #if defined(TFM_MUL32)
wolfSSL 0:d92f9d21154c 1608 #include "fp_mul_comba_32.i"
wolfSSL 0:d92f9d21154c 1609 #endif
wolfSSL 0:d92f9d21154c 1610 #if defined(TFM_MUL48)
wolfSSL 0:d92f9d21154c 1611 #include "fp_mul_comba_48.i"
wolfSSL 0:d92f9d21154c 1612 #endif
wolfSSL 0:d92f9d21154c 1613 #if defined(TFM_MUL64)
wolfSSL 0:d92f9d21154c 1614 #include "fp_mul_comba_64.i"
wolfSSL 0:d92f9d21154c 1615 #endif
wolfSSL 0:d92f9d21154c 1616
wolfSSL 0:d92f9d21154c 1617 /* end fp_mul_comba.c asm */
wolfSSL 0:d92f9d21154c 1618
wolfSSL 0:d92f9d21154c 1619