Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
asm.c
00001 /* asm.c 00002 * 00003 * Copyright (C) 2006-2017 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 */ 00021 00022 00023 #ifdef HAVE_CONFIG_H 00024 #include <config.h> 00025 #endif 00026 00027 #include <wolfcrypt/settings.h> 00028 00029 /* 00030 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca, 00031 * http://math.libtomcrypt.com 00032 */ 00033 00034 00035 /******************************************************************/ 00036 /* fp_montgomery_reduce.c asm or generic */ 00037 00038 00039 /* Each platform needs to query info type 1 from cpuid to see if aesni is 00040 * supported. 
Also, let's setup a macro for proper linkage w/o ABI conflicts
 */

#if defined(HAVE_INTEL_MULX)
#ifndef _MSC_VER
    /* GCC/Clang: issue CPUID directly via inline asm.
     * 'reg' must be an array of at least 4 unsigned ints; results are
     * written in the order EAX, EBX, ECX, EDX. */
    #define cpuid(reg, leaf, sub)\
            __asm__ __volatile__ ("cpuid":\
             "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
             "a" (leaf), "c"(sub));

    #define XASM_LINK(f) asm(f)
#else
    /* MSVC: no GNU inline asm; use the compiler intrinsic instead. */
    #include <intrin.h>
    #define cpuid(a,b,c) __cpuidex((int*)a,b,c)

    #define XASM_LINK(f)

#endif /* _MSC_VER */

/* Indices of each output register in the cpuid() result array. */
#define EAX 0
#define EBX 1
#define ECX 2
#define EDX 3

/* Bit flags accumulated into cpuid_flags below. */
#define CPUID_AVX1   0x1
#define CPUID_AVX2   0x2
#define CPUID_RDRAND 0x4
#define CPUID_RDSEED 0x8
#define CPUID_BMI2   0x10    /* MULX, RORX */
#define CPUID_ADX    0x20    /* ADCX, ADOX */

#define IS_INTEL_AVX1   (cpuid_flags&CPUID_AVX1)
#define IS_INTEL_AVX2   (cpuid_flags&CPUID_AVX2)
#define IS_INTEL_BMI2   (cpuid_flags&CPUID_BMI2)
#define IS_INTEL_ADX    (cpuid_flags&CPUID_ADX)
#define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND)
#define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED)
#define SET_FLAGS

/* cpuid_check is non-zero once cpuid_flags has been populated.
 * NOTE(review): lazily initialized, unsynchronized globals -- presumably the
 * first call happens before any threading; confirm against wolfSSL init. */
static word32 cpuid_check = 0 ;
static word32 cpuid_flags = 0 ;

/* Return bit 'bit' of output register reg[num] from cpuid(leaf, sub),
 * or 0 when the CPU is neither GenuineIntel nor AuthenticAMD. */
static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
    int got_intel_cpu = 0;
    int got_amd_cpu = 0;
    unsigned int reg[5];

    reg[4] = '\0' ;
    cpuid(reg, 0, 0);

    /* check for intel cpu: leaf 0 returns the vendor string
     * "GenuineIntel" split across EBX, EDX, ECX (in that order) */
    if( memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
        got_intel_cpu = 1;
    }

    /* check for AMD cpu: vendor string "AuthenticAMD" */
    if( memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 &&
        memcmp((char *)&(reg[EDX]), "enti", 4) == 0 &&
        memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) {
        got_amd_cpu = 1;
    }
    if (got_intel_cpu || got_amd_cpu) {
        cpuid(reg, leaf, sub);
        return((reg[num]>>bit)&0x1) ;
    }
    return 0 ;
}

/* Populate cpuid_flags on first call: BMI2 and ADX live in leaf 7,
 * sub-leaf 0, EBX bits 8 and 19 respectively.
 * Returns 0 when detection ran, 1 when flags were already set. */
WC_INLINE static int set_cpuid_flags(void) {
    if(cpuid_check == 0) {
        if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; }
        if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX  ; }
        cpuid_check = 1 ;
        return 0 ;
    }
    return 1 ;
}

#define RETURN return
/* Execute 'func' then 'ret' only when the CPU supports both BMI2 and ADX
 * (the instructions MULX_INNERMUL8 below relies on). */
#define IF_HAVE_INTEL_MULX(func, ret)    \
    if(cpuid_check==0)set_cpuid_flags() ; \
    if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; }

#else
    #define IF_HAVE_INTEL_MULX(func, ret)
#endif

#if defined(TFM_X86) && !defined(TFM_SSE2)
/* x86-32 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
/* mu = next Montgomery multiplier digit */
#define LOOP_START \
   mu = c[x] * mp

/* one inner-loop digit: _c[LO] += mu * (*tmpm++) + cy; cy = new carry */
#define INNERMUL \
__asm__( \
    "movl %5,%%eax \n\t" \
    "mull %4 \n\t" \
    "addl %1,%%eax \n\t" \
    "adcl $0,%%edx \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl $0,%%edx \n\t" \
    "movl %%edx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
: "%eax", "%edx", "cc")

/* fold the pending carry into _c[LO]; cy becomes 0 or 1 (setb) */
#define PROPCARRY \
__asm__( \
    "addl %1,%0 \n\t" \
    "setb %%al \n\t" \
    "movzbl %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_X86_64)
/* x86-64 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

/* one inner-loop digit, 64-bit: _c[LO] += mu * (*tmpm++) + cy */
#define INNERMUL \
__asm__( \
    "movq %5,%%rax \n\t" \
    "mulq %4 \n\t" \
    "addq %1,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rdx,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \
: "%rax", "%rdx", "cc")

#if defined(HAVE_INTEL_MULX)
/* Eight 64-bit digits at once using MULX (multiplier implicit in rdx) with
 * the dual ADCX/ADOX carry chains (requires BMI2 + ADX; see
 * IF_HAVE_INTEL_MULX above).
 * NOTE(review): parameter 'z' is unused and 'c_mulx' is taken from the
 * caller's scope rather than from the macro arguments -- confirm the
 * invocation site in fp_montgomery_reduce. */
#define MULX_INNERMUL8(x,y,z,cy) \
    __asm__ volatile ( \
        "movq %[yn], %%rdx\n\t" \
        "xorq %%rcx, %%rcx\n\t" \
        "movq 0(%[c]), %%r8\n\t" \
        "movq 8(%[c]), %%r9\n\t" \
        "movq 16(%[c]), %%r10\n\t" \
        "movq 24(%[c]), %%r11\n\t" \
        "movq 32(%[c]), %%r12\n\t" \
        "movq 40(%[c]), %%r13\n\t" \
        "movq 48(%[c]), %%r14\n\t" \
        "movq 56(%[c]), %%r15\n\t" \
        \
        "mulx 0(%[xp]), %%rax, %%rcx\n\t" \
        "adcxq %[cy], %%r8\n\t" \
        "adoxq %%rax, %%r8\n\t" \
        "mulx 8(%[xp]), %%rax, %[cy]\n\t" \
        "adcxq %%rcx, %%r9\n\t" \
        "adoxq %%rax, %%r9\n\t" \
        "mulx 16(%[xp]), %%rax, %%rcx\n\t" \
        "adcxq %[cy], %%r10\n\t" \
        "adoxq %%rax, %%r10\n\t" \
        "mulx 24(%[xp]), %%rax, %[cy]\n\t" \
        "adcxq %%rcx, %%r11\n\t" \
        "adoxq %%rax, %%r11\n\t" \
        "mulx 32(%[xp]), %%rax, %%rcx\n\t" \
        "adcxq %[cy], %%r12\n\t" \
        "adoxq %%rax, %%r12\n\t" \
        "mulx 40(%[xp]), %%rax, %[cy]\n\t" \
        "adcxq %%rcx, %%r13\n\t" \
        "adoxq %%rax, %%r13\n\t" \
        "mulx 48(%[xp]), %%rax, %%rcx\n\t" \
        "adcxq %[cy], %%r14\n\t" \
        "adoxq %%rax, %%r14\n\t" \
        "adcxq %%rcx, %%r15\n\t" \
        "mulx 56(%[xp]), %%rax, %[cy]\n\t" \
        "movq $0, %%rdx\n\t" \
        "adoxq %%rdx, %%rax\n\t" \
        "adcxq %%rdx, %[cy]\n\t" \
        "adoxq %%rdx, %[cy]\n\t" \
        "addq %%rax, %%r15\n\t" \
        "adcq $0, %[cy]\n\t" \
        \
        "movq %%r8, 0(%[c])\n\t" \
        "movq %%r9, 8(%[c])\n\t" \
        "movq %%r10, 16(%[c])\n\t" \
        "movq %%r11, 24(%[c])\n\t" \
        "movq %%r12, 32(%[c])\n\t" \
        "movq %%r13, 40(%[c])\n\t" \
        "movq %%r14, 48(%[c])\n\t" \
        "movq %%r15, 56(%[c])\n\t" \
        : [cy] "+r" (cy) \
        : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y) \
        :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \
         "%rdx", "%rax", "%rcx" \
    )
/* Eight-digit Montgomery inner loop via the MULX/ADCX/ADOX path above. */
#define INNERMUL8_MULX \
{\
    MULX_INNERMUL8(tmpm, mu, _c, cy);\
}
#endif

/* Classic mulq-based eight-digit inner loop.  Each step multiplies one
 * modulus digit by mu, adds the running column and carry, stores the low
 * half and keeps the high half as the next carry (%1 = cy). */
#define INNERMUL8 \
 __asm__( \
    "movq 0(%5),%%rax \n\t" \
    "movq 0(%2),%%r10 \n\t" \
    "movq 0x8(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x8(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x10(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x10(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x8(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x18(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x18(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x10(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x20(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x20(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x18(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x28(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x28(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x20(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x30(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x30(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x28(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "movq 0x38(%5),%%r11 \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq 0x38(%2),%%r10 \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x30(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
    "movq %%r11,%%rax \n\t" \
    "mulq %4 \n\t" \
    "addq %%r10,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "addq %3,%%rax \n\t" \
    "adcq $0,%%rdx \n\t" \
    "movq %%rax,0x38(%0) \n\t" \
    "movq %%rdx,%1 \n\t" \
    \
:"=r"(_c), "=r"(cy) \
: "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\
: "%rax", "%rdx", "%r10", "%r11", "cc")

/* fold the pending carry into _c[LO]; cy becomes 0 or 1 */
#define PROPCARRY \
__asm__( \
    "addq %1,%0 \n\t" \
    "setb %%al \n\t" \
    "movzbq %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%rax", "cc")

/******************************************************************/
#elif defined(TFM_SSE2)
/* SSE2 code (assumes 32-bit fp_digits) */
/* XMM register assignments:
 * xmm0  *tmpm++, then Mu * (*tmpm++)
 * xmm1  c[x], then Mu
 * xmm2  mp
 * xmm3  cy
 * xmm4  _c[LO]
 */

#define MONT_START \
   __asm__("movd %0,%%mm2"::"g"(mp))

#define MONT_FINI \
   __asm__("emms")

#define LOOP_START \
__asm__( \
    "movd %0,%%mm1 \n\t" \
    "pxor %%mm3,%%mm3 \n\t" \
    "pmuludq %%mm2,%%mm1 \n\t" \
:: "g"(c[x]))

/* pmuludq on mmx registers does a 32x32->64 multiply. */
#define INNERMUL \
__asm__( \
    "movd %1,%%mm4 \n\t" \
    "movd %2,%%mm0 \n\t" \
    "paddq %%mm4,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm0 \n\t" \
    "paddq %%mm0,%%mm3 \n\t" \
    "movd %%mm3,%0 \n\t" \
    "psrlq $32, %%mm3 \n\t" \
:"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );

/* eight digits per iteration; mm3 carries the running 64-bit column */
#define INNERMUL8 \
__asm__( \
    "movd 0(%1),%%mm4 \n\t" \
    "movd 0(%2),%%mm0 \n\t" \
    "paddq %%mm4,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm0 \n\t" \
    "movd 4(%2),%%mm5 \n\t" \
    "paddq %%mm0,%%mm3 \n\t" \
    "movd 4(%1),%%mm6 \n\t" \
    "movd %%mm3,0(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm6,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "movd 8(%2),%%mm6 \n\t" \
    "paddq %%mm5,%%mm3 \n\t" \
    "movd 8(%1),%%mm7 \n\t" \
    "movd %%mm3,4(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm7,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm6 \n\t" \
    "movd 12(%2),%%mm7 \n\t" \
    "paddq %%mm6,%%mm3 \n\t" \
    "movd 12(%1),%%mm5 \n\t" \
    "movd %%mm3,8(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm5,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm7 \n\t" \
    "movd 16(%2),%%mm5 \n\t" \
    "paddq %%mm7,%%mm3 \n\t" \
    "movd 16(%1),%%mm6 \n\t" \
    "movd %%mm3,12(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm6,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "movd 20(%2),%%mm6 \n\t" \
    "paddq %%mm5,%%mm3 \n\t" \
    "movd 20(%1),%%mm7 \n\t" \
    "movd %%mm3,16(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm7,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm6 \n\t" \
    "movd 24(%2),%%mm7 \n\t" \
    "paddq %%mm6,%%mm3 \n\t" \
    "movd 24(%1),%%mm5 \n\t" \
    "movd %%mm3,20(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm5,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm7 \n\t" \
    "movd 28(%2),%%mm5 \n\t" \
    "paddq %%mm7,%%mm3 \n\t" \
    "movd 28(%1),%%mm6 \n\t" \
    "movd %%mm3,24(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
    \
    "paddq %%mm6,%%mm3 \n\t" \
    "pmuludq %%mm1,%%mm5 \n\t" \
    "paddq %%mm5,%%mm3 \n\t" \
    "movd %%mm3,28(%0) \n\t" \
    "psrlq $32, %%mm3 \n\t" \
:"=r"(_c) : "0"(_c), "r"(tmpm) );

/* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
   pointer */

#define LOOP_END \
__asm__( "movd %%mm3,%0 \n" :"=r"(cy))

#define PROPCARRY \
__asm__( \
    "addl %1,%0 \n\t" \
    "setb %%al \n\t" \
    "movzbl %%al,%1 \n\t" \
:"=g"(_c[LO]), "=r"(cy) \
:"0"(_c[LO]), "1"(cy) \
: "%eax", "cc")

/******************************************************************/
#elif defined(TFM_ARM)
/* ARMv4 code */

#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp


#ifdef __thumb__

/* Thumb-2 variant: ITE makes the conditional MOVs valid in an IT block */
#define INNERMUL \
__asm__( \
    " LDR r0,%1 \n\t" \
    " ADDS r0,r0,%0 \n\t" \
    " ITE CS \n\t" \
    " MOVCS %0,#1 \n\t" \
    " MOVCC %0,#0 \n\t" \
    " UMLAL r0,%0,%3,%4 \n\t" \
    " STR r0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");

#define PROPCARRY \
__asm__( \
    " LDR r0,%1 \n\t" \
    " ADDS r0,r0,%0 \n\t" \
    " STR r0,%1 \n\t" \
    " ITE CS \n\t" \
    " MOVCS %0,#1 \n\t" \
    " MOVCC %0,#0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");


/* TAO thumb mode uses ite (if then else) to detect carry directly
 * fixed unmatched constraint warning by changing 1 to m  */

#else /* __thumb__ */

#define INNERMUL \
__asm__( \
    " LDR r0,%1 \n\t" \
    " ADDS r0,r0,%0 \n\t" \
    " MOVCS %0,#1 \n\t" \
    " MOVCC %0,#0 \n\t" \
    " UMLAL r0,%0,%3,%4 \n\t" \
    " STR r0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");

#define PROPCARRY \
__asm__( \
    " LDR r0,%1 \n\t" \
    " ADDS r0,r0,%0 \n\t" \
    " STR r0,%1 \n\t" \
    " MOVCS %0,#1 \n\t" \
    " MOVCC %0,#0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");

#endif /* __thumb__ */

#elif defined(TFM_PPC32)

/* PPC32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
__asm__( \
    " mullw 16,%3,%4 \n\t" \
    " mulhwu 17,%3,%4 \n\t" \
    " addc 16,16,%2 \n\t" \
    " addze 17,17 \n\t" \
    " addc %1,16,%5 \n\t" \
    " addze %0,17 \n\t" \
:"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;

#define PROPCARRY \
__asm__( \
    " addc %1,%3,%2 \n\t" \
    " xor %0,%2,%2 \n\t" \
    " addze %0,%2 \n\t" \
:"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");

#elif defined(TFM_PPC64)

/* PPC64 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

/* NOTE(review): "addc r16,16,%0" (bare 16 vs r16) and the "sdx" mnemonic
 * (standard PPC64 store-indexed is stdx) look suspicious -- verify this
 * path assembles before relying on it. */
#define INNERMUL \
__asm__( \
    " mulld r16,%3,%4 \n\t" \
    " mulhdu r17,%3,%4 \n\t" \
    " addc r16,16,%0 \n\t" \
    " addze r17,r17 \n\t" \
    " ldx r18,0,%1 \n\t" \
    " addc r16,r16,r18 \n\t" \
    " addze %0,r17 \n\t" \
    " sdx r16,0,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;

#define PROPCARRY \
__asm__( \
    " ldx r16,0,%1 \n\t" \
    " addc r16,r16,%0 \n\t" \
    " sdx r16,0,%1 \n\t" \
    " xor %0,%0,%0 \n\t" \
    " addze %0,%0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");

/******************************************************************/

#elif defined(TFM_AVR32)

/* AVR32 */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
__asm__( \
    " ld.w r2,%1 \n\t" \
    " add r2,%0 \n\t" \
    " eor r3,r3 \n\t" \
    " acr r3 \n\t" \
    " macu.d r2,%3,%4 \n\t" \
    " st.w %1,r2 \n\t" \
    " mov %0,r3 \n\t" \
:"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");

#define PROPCARRY \
__asm__( \
    " ld.w r2,%1 \n\t" \
    " add r2,%0 \n\t" \
    " st.w %1,r2 \n\t" \
    " eor %0,%0 \n\t" \
    " acr %0 \n\t" \
:"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");

/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
__asm__( \
    " multu %3,%4 \n\t" \
    " mflo $12 \n\t" \
    " mfhi $13 \n\t" \
    " addu $12,$12,%0 \n\t" \
    " sltu $10,$12,%0 \n\t" \
    " addu $13,$13,$10 \n\t" \
    " lw $10,%1 \n\t" \
    " addu $12,$12,$10 \n\t" \
    " sltu $10,$12,$10 \n\t" \
    " addu %0,$13,$10 \n\t" \
    " sw $12,%1 \n\t" \
:"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY \
__asm__( \
    " lw $10,%1 \n\t" \
    " addu $10,$10,%0 \n\t" \
    " sw $10,%1 \n\t" \
    " sltu %0,$10,%0 \n\t" \
:"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code: portable fallback using the double-width fp_word type */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   do { fp_word t; \
   t = ((fp_word)_c[0] + (fp_word)cy) + \
                (((fp_word)mu) * ((fp_word)*tmpm++)); \
   _c[0] = (fp_digit)t; \
   cy = (fp_digit)(t >> DIGIT_BIT); \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/


#define LO 0
/* end fp_montogomery_reduce.c asm */


/* start fp_sqr_comba.c asm */
#if defined(TFM_X86)

/* x86-32 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* (c2:c1:c0) += i * i */
#define SQRADD(i, j) \
__asm__( \
    "movl %6,%%eax \n\t" \
    "mull %%eax \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");

/* (c2:c1:c0) += 2 * i * j */
#define SQRADD2(i, j) \
__asm__( \
    "movl %6,%%eax \n\t" \
    "mull %7 \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc");

/* (sc2:sc1:sc0) = i * j */
#define SQRADDSC(i, j) \
__asm__( \
    "movl %3,%%eax \n\t" \
    "mull %4 \n\t" \
    "movl %%eax,%0 \n\t" \
    "movl %%edx,%1 \n\t" \
    "xorl %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");

/* (sc2:sc1:sc0) += i * j */
#define SQRADDAC(i, j) \
__asm__( \
    "movl %6,%%eax \n\t" \
    "mull %7 \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");

/* (c2:c1:c0) += 2 * (sc2:sc1:sc0) */
#define SQRADDDB \
__asm__( \
    "addl %6,%0 \n\t" \
    "adcl %7,%1 \n\t" \
    "adcl %8,%2 \n\t" \
    "addl %6,%0 \n\t" \
    "adcl %7,%1 \n\t" \
    "adcl %8,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_X86_64)
/* x86-64 optimized */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* NOTE(review): the "x" constraint on i below is the SSE register class --
 * unusual for an integer operand feeding movq/mulq; confirm intentional. */
#define SQRADD(i, j) \
__asm__( \
    "movq %6,%%rax \n\t" \
    "mulq %%rax \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");

#define SQRADD2(i, j) \
__asm__( \
    "movq %6,%%rax \n\t" \
    "mulq %7 \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    "adcq $0,%2 \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    "adcq $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDSC(i, j) \
__asm__( \
    "movq %3,%%rax \n\t" \
    "mulq %4 \n\t" \
    "movq %%rax,%0 \n\t" \
    "movq %%rdx,%1 \n\t" \
    "xorq %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDAC(i, j) \
__asm__( \
    "movq %6,%%rax \n\t" \
    "mulq %7 \n\t" \
    "addq %%rax,%0 \n\t" \
    "adcq %%rdx,%1 \n\t" \
    "adcq $0,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");

#define SQRADDDB \
__asm__( \
    "addq %6,%0 \n\t" \
    "adcq %7,%1 \n\t" \
    "adcq %8,%2 \n\t" \
    "addq %6,%0 \n\t" \
    "adcq %7,%1 \n\t" \
    "adcq %8,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_SSE2)

/* SSE2 Optimized */
#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI \
   __asm__("emms");

#define SQRADD(i, j) \
__asm__( \
    "movd %6,%%mm0 \n\t" \
    "pmuludq %%mm0,%%mm0\n\t" \
    "movd %%mm0,%%eax \n\t" \
    "psrlq $32,%%mm0 \n\t" \
    "addl %%eax,%0 \n\t" \
    "movd %%mm0,%%eax \n\t" \
    "adcl %%eax,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

#define SQRADD2(i, j) \
__asm__( \
    "movd %6,%%mm0 \n\t" \
    "movd %7,%%mm1 \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%%eax \n\t" \
    "psrlq $32,%%mm0 \n\t" \
    "movd %%mm0,%%edx \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");

#define SQRADDSC(i, j) \
__asm__( \
    "movd %3,%%mm0 \n\t" \
    "movd %4,%%mm1 \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%0 \n\t" \
    "psrlq $32,%%mm0 \n\t" \
    "movd %%mm0,%1 \n\t" \
    "xorl %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
__asm__( \
    "movd %6,%%mm0 \n\t" \
    "movd %7,%%mm1 \n\t" \
    "pmuludq %%mm1,%%mm0\n\t" \
    "movd %%mm0,%%eax \n\t" \
    "psrlq $32,%%mm0 \n\t" \
    "movd %%mm0,%%edx \n\t" \
    "addl %%eax,%0 \n\t" \
    "adcl %%edx,%1 \n\t" \
    "adcl $0,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc");

#define SQRADDDB \
__asm__( \
    "addl %6,%0 \n\t" \
    "adcl %7,%1 \n\t" \
    "adcl %8,%2 \n\t" \
    "addl %6,%0 \n\t" \
    "adcl %7,%1 \n\t" \
    "adcl %8,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_ARM)

/* ARM code */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
__asm__( \
    " UMULL r0,r1,%6,%6 \n\t" \
    " ADDS %0,%0,r0 \n\t" \
    " ADCS %1,%1,r1 \n\t" \
    " ADC %2,%2,#0 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
__asm__( \
    " UMULL r0,r1,%6,%7 \n\t" \
    " ADDS %0,%0,r0 \n\t" \
    " ADCS %1,%1,r1 \n\t" \
    " ADC %2,%2,#0 \n\t" \
    " ADDS %0,%0,r0 \n\t" \
    " ADCS %1,%1,r1 \n\t" \
    " ADC %2,%2,#0 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDSC(i, j) \
__asm__( \
    " UMULL %0,%1,%3,%4 \n\t" \
    " SUB %2,%2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

#define SQRADDAC(i, j) \
__asm__( \
    " UMULL r0,r1,%6,%7 \n\t" \
    " ADDS %0,%0,r0 \n\t" \
    " ADCS %1,%1,r1 \n\t" \
    " ADC %2,%2,#0 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

#define SQRADDDB \
__asm__( \
    " ADDS %0,%0,%3 \n\t" \
    " ADCS %1,%1,%4 \n\t" \
    " ADC %2,%2,%5 \n\t" \
    " ADDS %0,%0,%3 \n\t" \
    " ADCS %1,%1,%4 \n\t" \
    " ADC %2,%2,%5 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC32)

/* PPC32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
__asm__( \
    " mullw 16,%6,%6 \n\t" \
    " addc %0,%0,16 \n\t" \
    " mulhwu 16,%6,%6 \n\t" \
    " adde %1,%1,16 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
__asm__( \
    " mullw 16,%6,%7 \n\t" \
    " mulhwu 17,%6,%7 \n\t" \
    " addc %0,%0,16 \n\t" \
    " adde %1,%1,17 \n\t" \
    " addze %2,%2 \n\t" \
    " addc %0,%0,16 \n\t" \
    " adde %1,%1,17 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

#define SQRADDSC(i, j) \
__asm__( \
    " mullw %0,%6,%7 \n\t" \
    " mulhwu %1,%6,%7 \n\t" \
    " xor %2,%2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j) \
__asm__( \
    " mullw 16,%6,%7 \n\t" \
    " addc %0,%0,16 \n\t" \
    " mulhwu 16,%6,%7 \n\t" \
    " adde %1,%1,16 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

#define SQRADDDB \
__asm__( \
    " addc %0,%0,%3 \n\t" \
    " adde %1,%1,%4 \n\t" \
    " adde %2,%2,%5 \n\t" \
    " addc %0,%0,%3 \n\t" \
    " adde %1,%1,%4 \n\t" \
    " adde %2,%2,%5 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC64)
/* PPC64 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
__asm__( \
    " mulld r16,%6,%6 \n\t" \
    " addc %0,%0,r16 \n\t" \
    " mulhdu r16,%6,%6 \n\t" \
    " adde %1,%1,r16 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");

/* for squaring some of the terms are doubled... */
#define SQRADD2(i, j) \
__asm__( \
    " mulld r16,%6,%7 \n\t" \
    " mulhdu r17,%6,%7 \n\t" \
    " addc %0,%0,r16 \n\t" \
    " adde %1,%1,r17 \n\t" \
    " addze %2,%2 \n\t" \
    " addc %0,%0,r16 \n\t" \
    " adde %1,%1,r17 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");

#define SQRADDSC(i, j) \
__asm__( \
    " mulld %0,%6,%7 \n\t" \
    " mulhdu %1,%6,%7 \n\t" \
    " xor %2,%2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

#define SQRADDAC(i, j) \
__asm__( \
    " mulld r16,%6,%7 \n\t" \
    " addc %0,%0,r16 \n\t" \
    " mulhdu r16,%6,%7 \n\t" \
    " adde %1,%1,r16 \n\t" \
    " addze %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");

#define SQRADDDB \
__asm__( \
    " addc %0,%0,%3 \n\t" \
    " adde %1,%1,%4 \n\t" \
    " adde %2,%2,%5 \n\t" \
    " addc %0,%0,%3 \n\t" \
    " adde %1,%1,%4 \n\t" \
    " adde %2,%2,%5 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


#elif defined(TFM_AVR32)

/* AVR32 */

#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2 */
#define SQRADD(i, j) \
__asm__( \
    " mulu.d r2,%6,%6 \n\t" \
    " add %0,%0,r2 \n\t" \
    " adc %1,%1,r3 \n\t" \
    " acr %2 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");

/* for squaring some of the terms are doubled...
 * NOTE(review): the trailing comma in "acr %2," below differs from the
 * bare "acr %2" used elsewhere -- verify it assembles on AVR32. */
#define SQRADD2(i, j) \
__asm__( \
    " mulu.d r2,%6,%7 \n\t" \
    " add %0,%0,r2 \n\t" \
    " adc %1,%1,r3 \n\t" \
    " acr %2, \n\t" \
    " add %0,%0,r2 \n\t" \
    " adc %1,%1,r3 \n\t" \
    " acr %2, \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDSC(i, j) \
__asm__( \
    " mulu.d r2,%6,%7 \n\t" \
    " mov %0,r2 \n\t" \
    " mov %1,r3 \n\t" \
    " eor %2,%2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");

#define SQRADDAC(i, j) \
__asm__( \
    " mulu.d r2,%6,%7 \n\t" \
    " add %0,%0,r2 \n\t" \
    " adc %1,%1,r3 \n\t" \
    " acr %2 \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");

#define SQRADDDB \
__asm__( \
    " add %0,%0,%3 \n\t" \
    " adc %1,%1,%4 \n\t" \
    " adc %2,%2,%5 \n\t" \
    " add %0,%0,%3 \n\t" \
    " adc %1,%1,%4 \n\t" \
    " adc %2,%2,%5 \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_MIPS)

/* MIPS */
#define COMBA_START

#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

#define COMBA_STORE(x) \
   x = c0;

#define COMBA_STORE2(x) \
   x = c1;

#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2
 * (continues past this chunk of the file) */
#define SQRADD(i, j) \
__asm__( \
    " multu %6,%6 \n\t" \
    " mflo $12 \n\t" \
    " mfhi $13 \n\t" \
    " addu %0,%0,$12 \n\t" \
    " sltu $12,%0,$12 \n\t" \
    " addu %1,%1,$13 \n\t" \
    " sltu $13,%1,$13 \n\t" \
" addu %1,%1,$12 \n\t" \ 01202 " sltu $12,%1,$12 \n\t" \ 01203 " addu %2,%2,$13 \n\t" \ 01204 " addu %2,%2,$12 \n\t" \ 01205 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13"); 01206 01207 /* for squaring some of the terms are doubled... */ 01208 #define SQRADD2(i, j) \ 01209 __asm__( \ 01210 " multu %6,%7 \n\t" \ 01211 " mflo $12 \n\t" \ 01212 " mfhi $13 \n\t" \ 01213 \ 01214 " addu %0,%0,$12 \n\t" \ 01215 " sltu $14,%0,$12 \n\t" \ 01216 " addu %1,%1,$13 \n\t" \ 01217 " sltu $15,%1,$13 \n\t" \ 01218 " addu %1,%1,$14 \n\t" \ 01219 " sltu $14,%1,$14 \n\t" \ 01220 " addu %2,%2,$15 \n\t" \ 01221 " addu %2,%2,$14 \n\t" \ 01222 \ 01223 " addu %0,%0,$12 \n\t" \ 01224 " sltu $14,%0,$12 \n\t" \ 01225 " addu %1,%1,$13 \n\t" \ 01226 " sltu $15,%1,$13 \n\t" \ 01227 " addu %1,%1,$14 \n\t" \ 01228 " sltu $14,%1,$14 \n\t" \ 01229 " addu %2,%2,$15 \n\t" \ 01230 " addu %2,%2,$14 \n\t" \ 01231 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15"); 01232 01233 #define SQRADDSC(i, j) \ 01234 __asm__( \ 01235 " multu %6,%7 \n\t" \ 01236 " mflo %0 \n\t" \ 01237 " mfhi %1 \n\t" \ 01238 " xor %2,%2,%2 \n\t" \ 01239 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 01240 01241 #define SQRADDAC(i, j) \ 01242 __asm__( \ 01243 " multu %6,%7 \n\t" \ 01244 " mflo $12 \n\t" \ 01245 " mfhi $13 \n\t" \ 01246 " addu %0,%0,$12 \n\t" \ 01247 " sltu $12,%0,$12 \n\t" \ 01248 " addu %1,%1,$13 \n\t" \ 01249 " sltu $13,%1,$13 \n\t" \ 01250 " addu %1,%1,$12 \n\t" \ 01251 " sltu $12,%1,$12 \n\t" \ 01252 " addu %2,%2,$13 \n\t" \ 01253 " addu %2,%2,$12 \n\t" \ 01254 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14"); 01255 01256 #define SQRADDDB \ 01257 __asm__( \ 01258 " addu %0,%0,%3 \n\t" \ 01259 " sltu $10,%0,%3 \n\t" \ 01260 " addu %1,%1,$10 \n\t" \ 01261 " sltu $10,%1,$10 \n\t" \ 01262 " addu %1,%1,%4 \n\t" \ 01263 " sltu $11,%1,%4 \n\t" \ 01264 " 
addu %2,%2,$10 \n\t" \ 01265 " addu %2,%2,$11 \n\t" \ 01266 " addu %2,%2,%5 \n\t" \ 01267 \ 01268 " addu %0,%0,%3 \n\t" \ 01269 " sltu $10,%0,%3 \n\t" \ 01270 " addu %1,%1,$10 \n\t" \ 01271 " sltu $10,%1,$10 \n\t" \ 01272 " addu %1,%1,%4 \n\t" \ 01273 " sltu $11,%1,%4 \n\t" \ 01274 " addu %2,%2,$10 \n\t" \ 01275 " addu %2,%2,$11 \n\t" \ 01276 " addu %2,%2,%5 \n\t" \ 01277 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11"); 01278 01279 #else 01280 01281 #define TFM_ISO 01282 01283 /* ISO C portable code */ 01284 01285 #define COMBA_START 01286 01287 #define CLEAR_CARRY \ 01288 c0 = c1 = c2 = 0; 01289 01290 #define COMBA_STORE(x) \ 01291 x = c0; 01292 01293 #define COMBA_STORE2(x) \ 01294 x = c1; 01295 01296 #define CARRY_FORWARD \ 01297 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01298 01299 #define COMBA_FINI 01300 01301 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01302 #define SQRADD(i, j) \ 01303 do { fp_word t; \ 01304 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 01305 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \ 01306 c2 +=(fp_digit) (t >> DIGIT_BIT); \ 01307 } while (0); 01308 01309 01310 /* for squaring some of the terms are doubled... 
*/ 01311 #define SQRADD2(i, j) \ 01312 do { fp_word t; \ 01313 t = ((fp_word)i) * ((fp_word)j); \ 01314 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01315 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01316 c2 +=(fp_digit)(tt >> DIGIT_BIT); \ 01317 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01318 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01319 c2 +=(fp_digit)(tt >> DIGIT_BIT); \ 01320 } while (0); 01321 01322 #define SQRADDSC(i, j) \ 01323 do { fp_word t; \ 01324 t = ((fp_word)i) * ((fp_word)j); \ 01325 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \ 01326 } while (0); 01327 01328 #define SQRADDAC(i, j) \ 01329 do { fp_word t; \ 01330 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \ 01331 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \ 01332 sc2 += (fp_digit)(t >> DIGIT_BIT); \ 01333 } while (0); 01334 01335 #define SQRADDDB \ 01336 do { fp_word t; \ 01337 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \ 01338 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \ 01339 c1 = (fp_digit)t; \ 01340 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \ 01341 } while (0); 01342 01343 #endif 01344 01345 #ifdef TFM_SMALL_SET 01346 #include "fp_sqr_comba_small_set.i" 01347 #endif 01348 01349 #if defined(TFM_SQR3) && FP_SIZE >= 6 01350 #include "fp_sqr_comba_3.i" 01351 #endif 01352 #if defined(TFM_SQR4) && FP_SIZE >= 8 01353 #include "fp_sqr_comba_4.i" 01354 #endif 01355 #if defined(TFM_SQR6) && FP_SIZE >= 12 01356 #include "fp_sqr_comba_6.i" 01357 #endif 01358 #if defined(TFM_SQR7) && FP_SIZE >= 14 01359 #include "fp_sqr_comba_7.i" 01360 #endif 01361 #if defined(TFM_SQR8) && FP_SIZE >= 16 01362 #include "fp_sqr_comba_8.i" 01363 #endif 01364 #if defined(TFM_SQR9) && FP_SIZE >= 18 01365 #include "fp_sqr_comba_9.i" 01366 #endif 01367 #if defined(TFM_SQR12) && FP_SIZE >= 24 01368 #include "fp_sqr_comba_12.i" 01369 #endif 01370 #if defined(TFM_SQR17) && FP_SIZE >= 34 01371 #include 
"fp_sqr_comba_17.i" 01372 #endif 01373 #if defined(TFM_SQR20) && FP_SIZE >= 40 01374 #include "fp_sqr_comba_20.i" 01375 #endif 01376 #if defined(TFM_SQR24) && FP_SIZE >= 48 01377 #include "fp_sqr_comba_24.i" 01378 #endif 01379 #if defined(TFM_SQR28) && FP_SIZE >= 56 01380 #include "fp_sqr_comba_28.i" 01381 #endif 01382 #if defined(TFM_SQR32) && FP_SIZE >= 64 01383 #include "fp_sqr_comba_32.i" 01384 #endif 01385 #if defined(TFM_SQR48) && FP_SIZE >= 96 01386 #include "fp_sqr_comba_48.i" 01387 #endif 01388 #if defined(TFM_SQR64) && FP_SIZE >= 128 01389 #include "fp_sqr_comba_64.i" 01390 #endif 01391 /* end fp_sqr_comba.c asm */ 01392 01393 /* start fp_mul_comba.c asm */ 01394 /* these are the combas. Worship them. */ 01395 #if defined(TFM_X86) 01396 /* Generic x86 optimized code */ 01397 01398 /* anything you need at the start */ 01399 #define COMBA_START 01400 01401 /* clear the chaining variables */ 01402 #define COMBA_CLEAR \ 01403 c0 = c1 = c2 = 0; 01404 01405 /* forward the carry to the next digit */ 01406 #define COMBA_FORWARD \ 01407 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01408 01409 /* store the first sum */ 01410 #define COMBA_STORE(x) \ 01411 x = c0; 01412 01413 /* store the second sum [carry] */ 01414 #define COMBA_STORE2(x) \ 01415 x = c1; 01416 01417 /* anything you need at the end */ 01418 #define COMBA_FINI 01419 01420 /* this should multiply i and j */ 01421 #define MULADD(i, j) \ 01422 __asm__( \ 01423 "movl %6,%%eax \n\t" \ 01424 "mull %7 \n\t" \ 01425 "addl %%eax,%0 \n\t" \ 01426 "adcl %%edx,%1 \n\t" \ 01427 "adcl $0,%2 \n\t" \ 01428 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 01429 01430 #elif defined(TFM_X86_64) 01431 /* x86-64 optimized */ 01432 01433 /* anything you need at the start */ 01434 #define COMBA_START 01435 01436 /* clear the chaining variables */ 01437 #define COMBA_CLEAR \ 01438 c0 = c1 = c2 = 0; 01439 01440 /* forward the carry to the next digit */ 01441 #define COMBA_FORWARD 
\ 01442 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01443 01444 /* store the first sum */ 01445 #define COMBA_STORE(x) \ 01446 x = c0; 01447 01448 /* store the second sum [carry] */ 01449 #define COMBA_STORE2(x) \ 01450 x = c1; 01451 01452 /* anything you need at the end */ 01453 #define COMBA_FINI 01454 01455 /* this should multiply i and j */ 01456 #define MULADD(i, j) \ 01457 __asm__ ( \ 01458 "movq %6,%%rax \n\t" \ 01459 "mulq %7 \n\t" \ 01460 "addq %%rax,%0 \n\t" \ 01461 "adcq %%rdx,%1 \n\t" \ 01462 "adcq $0,%2 \n\t" \ 01463 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 01464 01465 01466 #if defined(HAVE_INTEL_MULX) 01467 #define MULADD_BODY(a,b,c) \ 01468 __asm__ volatile( \ 01469 "movq %[a0],%%rdx\n\t" \ 01470 "xorq %%rcx, %%rcx\n\t" \ 01471 "movq 0(%[cp]),%%r8\n\t" \ 01472 "movq 8(%[cp]),%%r9\n\t" \ 01473 "movq 16(%[cp]),%%r10\n\t" \ 01474 "movq 24(%[cp]),%%r11\n\t" \ 01475 "movq 32(%[cp]),%%r12\n\t" \ 01476 "movq 40(%[cp]),%%r13\n\t" \ 01477 \ 01478 "mulx (%[bp]),%%rax, %%rbx\n\t" \ 01479 "adoxq %%rax, %%r8\n\t" \ 01480 "mulx 8(%[bp]),%%rax, %%rcx\n\t" \ 01481 "adcxq %%rbx, %%r9\n\t" \ 01482 "adoxq %%rax, %%r9\n\t" \ 01483 "mulx 16(%[bp]),%%rax, %%rbx\n\t" \ 01484 "adcxq %%rcx, %%r10\n\t" \ 01485 "adoxq %%rax, %%r10\n\t" \ 01486 "mulx 24(%[bp]),%%rax, %%rcx\n\t" \ 01487 "adcxq %%rbx, %%r11\n\t" \ 01488 "adoxq %%rax, %%r11\n\t" \ 01489 "adcxq %%rcx, %%r12\n\t" \ 01490 "mov $0, %%rdx\n\t" \ 01491 "adox %%rdx, %%r12\n\t" \ 01492 "adcx %%rdx, %%r13\n\t" \ 01493 \ 01494 "movq %%r8, 0(%[cp])\n\t" \ 01495 "movq %%r9, 8(%[cp])\n\t" \ 01496 "movq %%r10, 16(%[cp])\n\t" \ 01497 "movq %%r11, 24(%[cp])\n\t" \ 01498 "movq %%r12, 32(%[cp])\n\t" \ 01499 "movq %%r13, 40(%[cp])\n\t" \ 01500 : \ 01501 : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])), \ 01502 [cp] "r" (&(c->dp[iz])) \ 01503 : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", \ 01504 "%rdx", "%rax", "%rcx", "%rbx" \ 01505 ) 01506 01507 #define TFM_INTEL_MUL_COMBA(a, 
b, c) \ 01508 for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \ 01509 for (ix=0; ix<a->used; ix++) { \ 01510 for (iy=0; iy<b->used; iy+=4) { \ 01511 iz = ix + iy; \ 01512 MULADD_BODY(a, b, c); \ 01513 } \ 01514 } 01515 #endif 01516 01517 #elif defined(TFM_SSE2) 01518 /* use SSE2 optimizations */ 01519 01520 /* anything you need at the start */ 01521 #define COMBA_START 01522 01523 /* clear the chaining variables */ 01524 #define COMBA_CLEAR \ 01525 c0 = c1 = c2 = 0; 01526 01527 /* forward the carry to the next digit */ 01528 #define COMBA_FORWARD \ 01529 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01530 01531 /* store the first sum */ 01532 #define COMBA_STORE(x) \ 01533 x = c0; 01534 01535 /* store the second sum [carry] */ 01536 #define COMBA_STORE2(x) \ 01537 x = c1; 01538 01539 /* anything you need at the end */ 01540 #define COMBA_FINI \ 01541 __asm__("emms"); 01542 01543 /* this should multiply i and j */ 01544 #define MULADD(i, j) \ 01545 __asm__( \ 01546 "movd %6,%%mm0 \n\t" \ 01547 "movd %7,%%mm1 \n\t" \ 01548 "pmuludq %%mm1,%%mm0\n\t" \ 01549 "movd %%mm0,%%eax \n\t" \ 01550 "psrlq $32,%%mm0 \n\t" \ 01551 "addl %%eax,%0 \n\t" \ 01552 "movd %%mm0,%%eax \n\t" \ 01553 "adcl %%eax,%1 \n\t" \ 01554 "adcl $0,%2 \n\t" \ 01555 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc"); 01556 01557 #elif defined(TFM_ARM) 01558 /* ARM code */ 01559 01560 #define COMBA_START 01561 01562 #define COMBA_CLEAR \ 01563 c0 = c1 = c2 = 0; 01564 01565 #define COMBA_FORWARD \ 01566 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01567 01568 #define COMBA_STORE(x) \ 01569 x = c0; 01570 01571 #define COMBA_STORE2(x) \ 01572 x = c1; 01573 01574 #define COMBA_FINI 01575 01576 #define MULADD(i, j) \ 01577 __asm__( \ 01578 " UMULL r0,r1,%6,%7 \n\t" \ 01579 " ADDS %0,%0,r0 \n\t" \ 01580 " ADCS %1,%1,r1 \n\t" \ 01581 " ADC %2,%2,#0 \n\t" \ 01582 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 01583 01584 #elif 
defined(TFM_PPC32) 01585 /* For 32-bit PPC */ 01586 01587 #define COMBA_START 01588 01589 #define COMBA_CLEAR \ 01590 c0 = c1 = c2 = 0; 01591 01592 #define COMBA_FORWARD \ 01593 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01594 01595 #define COMBA_STORE(x) \ 01596 x = c0; 01597 01598 #define COMBA_STORE2(x) \ 01599 x = c1; 01600 01601 #define COMBA_FINI 01602 01603 /* untested: will mulhwu change the flags? Docs say no */ 01604 #define MULADD(i, j) \ 01605 __asm__( \ 01606 " mullw 16,%6,%7 \n\t" \ 01607 " addc %0,%0,16 \n\t" \ 01608 " mulhwu 16,%6,%7 \n\t" \ 01609 " adde %1,%1,16 \n\t" \ 01610 " addze %2,%2 \n\t" \ 01611 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01612 01613 #elif defined(TFM_PPC64) 01614 /* For 64-bit PPC */ 01615 01616 #define COMBA_START 01617 01618 #define COMBA_CLEAR \ 01619 c0 = c1 = c2 = 0; 01620 01621 #define COMBA_FORWARD \ 01622 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01623 01624 #define COMBA_STORE(x) \ 01625 x = c0; 01626 01627 #define COMBA_STORE2(x) \ 01628 x = c1; 01629 01630 #define COMBA_FINI 01631 01632 /* untested: will mulhdu change the flags? 
Docs say no */ 01633 #define MULADD(i, j) \ 01634 ____asm__( \ 01635 " mulld r16,%6,%7 \n\t" \ 01636 " addc %0,%0,16 \n\t" \ 01637 " mulhdu r16,%6,%7 \n\t" \ 01638 " adde %1,%1,16 \n\t" \ 01639 " addze %2,%2 \n\t" \ 01640 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16"); 01641 01642 #elif defined(TFM_AVR32) 01643 01644 /* ISO C code */ 01645 01646 #define COMBA_START 01647 01648 #define COMBA_CLEAR \ 01649 c0 = c1 = c2 = 0; 01650 01651 #define COMBA_FORWARD \ 01652 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01653 01654 #define COMBA_STORE(x) \ 01655 x = c0; 01656 01657 #define COMBA_STORE2(x) \ 01658 x = c1; 01659 01660 #define COMBA_FINI 01661 01662 #define MULADD(i, j) \ 01663 ____asm__( \ 01664 " mulu.d r2,%6,%7 \n\t"\ 01665 " add %0,r2 \n\t"\ 01666 " adc %1,%1,r3 \n\t"\ 01667 " acr %2 \n\t"\ 01668 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3"); 01669 01670 #elif defined(TFM_MIPS) 01671 01672 /* MIPS */ 01673 #define COMBA_START 01674 01675 #define COMBA_CLEAR \ 01676 c0 = c1 = c2 = 0; 01677 01678 #define COMBA_FORWARD \ 01679 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01680 01681 #define COMBA_STORE(x) \ 01682 x = c0; 01683 01684 #define COMBA_STORE2(x) \ 01685 x = c1; 01686 01687 #define COMBA_FINI 01688 01689 #define MULADD(i, j) \ 01690 __asm__( \ 01691 " multu %6,%7 \n\t" \ 01692 " mflo $12 \n\t" \ 01693 " mfhi $13 \n\t" \ 01694 " addu %0,%0,$12 \n\t" \ 01695 " sltu $12,%0,$12 \n\t" \ 01696 " addu %1,%1,$13 \n\t" \ 01697 " sltu $13,%1,$13 \n\t" \ 01698 " addu %1,%1,$12 \n\t" \ 01699 " sltu $12,%1,$12 \n\t" \ 01700 " addu %2,%2,$13 \n\t" \ 01701 " addu %2,%2,$12 \n\t" \ 01702 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13"); 01703 01704 #else 01705 /* ISO C code */ 01706 01707 #define COMBA_START 01708 01709 #define COMBA_CLEAR \ 01710 c0 = c1 = c2 = 0; 01711 01712 #define COMBA_FORWARD \ 01713 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01714 01715 
/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI

/* ISO C portable multiply-accumulate: the three-digit accumulator
 * c2:c1:c0 += i * j.  The product is formed in the double-width fp_word,
 * the low digit is folded into c0, and the high part is carried into
 * c1 and c2 via the DIGIT_BIT shifts. */
#define MULADD(i, j)                                     \
   do { fp_word t;                                       \
   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);        \
   c0 = (fp_digit)t;                                     \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                   \
   c1 = (fp_digit)t;                                     \
   c2 += (fp_digit)(t >> DIGIT_BIT);                     \
   } while (0);

#endif


/* Pull in the unrolled comba multipliers that were enabled at configure
 * time; each .i file is guarded so FP_SIZE must be large enough to hold
 * the double-width result. */
#ifdef TFM_SMALL_SET
#include "fp_mul_comba_small_set.i"
#endif

#if defined(TFM_MUL3) && FP_SIZE >= 6
#include "fp_mul_comba_3.i"
#endif
#if defined(TFM_MUL4) && FP_SIZE >= 8
#include "fp_mul_comba_4.i"
#endif
#if defined(TFM_MUL6) && FP_SIZE >= 12
#include "fp_mul_comba_6.i"
#endif
#if defined(TFM_MUL7) && FP_SIZE >= 14
#include "fp_mul_comba_7.i"
#endif
#if defined(TFM_MUL8) && FP_SIZE >= 16
#include "fp_mul_comba_8.i"
#endif
#if defined(TFM_MUL9) && FP_SIZE >= 18
#include "fp_mul_comba_9.i"
#endif
#if defined(TFM_MUL12) && FP_SIZE >= 24
#include "fp_mul_comba_12.i"
#endif
#if defined(TFM_MUL17) && FP_SIZE >= 34
#include "fp_mul_comba_17.i"
#endif
#if defined(TFM_MUL20) && FP_SIZE >= 40
#include "fp_mul_comba_20.i"
#endif
#if defined(TFM_MUL24) && FP_SIZE >= 48
#include "fp_mul_comba_24.i"
#endif
#if defined(TFM_MUL28) && FP_SIZE >= 56
#include "fp_mul_comba_28.i"
#endif
#if defined(TFM_MUL32) && FP_SIZE >= 64
#include "fp_mul_comba_32.i"
#endif
#if defined(TFM_MUL48) && FP_SIZE >= 96
#include "fp_mul_comba_48.i"
#endif
#if defined(TFM_MUL64) && FP_SIZE >= 128
#include "fp_mul_comba_64.i"
#endif

/* end fp_mul_comba.c asm */

Generated on Tue Jul 12 2022 16:58:03 by
1.7.2