ssh lib
Embed:
(wiki syntax)
Show/hide line numbers
asm.c
00001 /* asm.c 00002 * 00003 * Copyright (C) 2006-2017 wolfSSL Inc. 00004 * 00005 * This file is part of wolfSSL. 00006 * 00007 * wolfSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * wolfSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA 00020 */ 00021 00022 00023 #ifdef HAVE_CONFIG_H 00024 #include <config.h> 00025 #endif 00026 00027 #include <wolfcrypt/settings.h> 00028 00029 /* 00030 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca, 00031 * http://math.libtomcrypt.com 00032 */ 00033 00034 00035 /******************************************************************/ 00036 /* fp_montgomery_reduce.c asm or generic */ 00037 00038 00039 /* Each platform needs to query info type 1 from cpuid to see if aesni is 00040 * supported. Also, let's setup a macro for proper linkage w/o ABI conflicts 00041 */ 00042 00043 #if defined(HAVE_INTEL_MULX) 00044 #ifndef _MSC_VER 00045 #define cpuid(reg, leaf, sub)\ 00046 __asm__ __volatile__ ("cpuid":\ 00047 "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\ 00048 "a" (leaf), "c"(sub)); 00049 00050 #define XASM_LINK(f) asm(f) 00051 #else 00052 00053 #include <intrin.h> 00054 #define cpuid(a,b,c) __cpuidex((int*)a,b,c) 00055 00056 #define XASM_LINK(f) 00057 00058 #endif /* _MSC_VER */ 00059 00060 #define EAX 0 00061 #define EBX 1 00062 #define ECX 2 00063 #define EDX 3 00064 00065 #define CPUID_AVX1 0x1 00066 #define CPUID_AVX2 0x2 00067 #define CPUID_RDRAND 0x4 00068 #define CPUID_RDSEED 0x8 00069 #define CPUID_BMI2 0x10 /* MULX, RORX */ 00070 #define CPUID_ADX 0x20 /* ADCX, ADOX */ 00071 00072 #define IS_INTEL_AVX1 (cpuid_flags&CPUID_AVX1) 00073 #define IS_INTEL_AVX2 (cpuid_flags&CPUID_AVX2) 00074 #define IS_INTEL_BMI2 (cpuid_flags&CPUID_BMI2) 00075 #define IS_INTEL_ADX (cpuid_flags&CPUID_ADX) 00076 #define IS_INTEL_RDRAND (cpuid_flags&CPUID_RDRAND) 00077 #define IS_INTEL_RDSEED (cpuid_flags&CPUID_RDSEED) 00078 #define SET_FLAGS 00079 00080 static word32 cpuid_check = 0 ; 00081 static word32 cpuid_flags = 0 ; 00082 00083 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) { 00084 int got_intel_cpu = 0; 00085 int got_amd_cpu = 0; 00086 unsigned int reg[5]; 00087 00088 reg[4] = '\0' ; 00089 cpuid(reg, 0, 0); 00090 00091 /* check for intel cpu */ 00092 if( memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 && 00093 memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 && 00094 memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) { 00095 got_intel_cpu = 1; 00096 } 00097 00098 /* check for AMD cpu */ 00099 if( memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 && 00100 memcmp((char *)&(reg[EDX]), "enti", 4) == 0 && 00101 memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) { 00102 got_amd_cpu = 1; 00103 } 00104 if (got_intel_cpu || got_amd_cpu) { 00105 cpuid(reg, leaf, sub); 00106 return((reg[num]>>bit)&0x1) ; 00107 } 00108 return 0 ; 00109 } 00110 00111 WC_INLINE static int set_cpuid_flags(void) { 00112 if(cpuid_check == 0) { 00113 if(cpuid_flag(7, 0, EBX, 8)){ cpuid_flags |= CPUID_BMI2 ; } 00114 if(cpuid_flag(7, 0, EBX,19)){ cpuid_flags |= CPUID_ADX ; } 00115 cpuid_check = 1 ; 00116 return 0 ; 00117 } 00118 return 1 ; 00119 } 00120 00121 #define RETURN return 00122 #define IF_HAVE_INTEL_MULX(func, ret) \ 00123 if(cpuid_check==0)set_cpuid_flags() ; \ 00124 if(IS_INTEL_BMI2 && IS_INTEL_ADX){ func; ret ; } 00125 00126 #else 00127 #define IF_HAVE_INTEL_MULX(func, ret) 00128 #endif 00129 00130 #if defined(TFM_X86) && !defined(TFM_SSE2) 00131 /* x86-32 code */ 00132 00133 #define MONT_START 00134 #define MONT_FINI 00135 #define LOOP_END 00136 #define LOOP_START \ 00137 mu = c[x] * mp 00138 00139 #define INNERMUL \ 00140 __asm__( \ 00141 "movl %5,%%eax \n\t" \ 00142 "mull %4 \n\t" \ 00143 "addl %1,%%eax \n\t" \ 00144 "adcl $0,%%edx \n\t" \ 00145 "addl %%eax,%0 \n\t" \ 00146 "adcl $0,%%edx \n\t" \ 00147 "movl %%edx,%1 \n\t" \ 00148 :"=g"(_c[LO]), "=r"(cy) \ 00149 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00150 : "%eax", "%edx", "cc") 00151 00152 #define PROPCARRY \ 00153 __asm__( \ 00154 "addl %1,%0 \n\t" \ 00155 "setb %%al \n\t" \ 00156 "movzbl %%al,%1 \n\t" \ 00157 :"=g"(_c[LO]), "=r"(cy) \ 00158 :"0"(_c[LO]), "1"(cy) \ 00159 : "%eax", "cc") 00160 00161 /******************************************************************/ 00162 #elif defined(TFM_X86_64) 00163 /* x86-64 code */ 00164 00165 #define MONT_START 00166 #define MONT_FINI 00167 #define LOOP_END 00168 #define LOOP_START \ 00169 mu = c[x] * mp 00170 00171 #define INNERMUL \ 00172 __asm__( \ 00173 "movq %5,%%rax \n\t" \ 00174 "mulq %4 \n\t" \ 00175 "addq %1,%%rax \n\t" \ 00176 "adcq $0,%%rdx \n\t" \ 00177 "addq %%rax,%0 \n\t" \ 00178 "adcq $0,%%rdx \n\t" \ 00179 "movq %%rdx,%1 \n\t" \ 00180 :"=g"(_c[LO]), "=r"(cy) \ 00181 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00182 : "%rax", "%rdx", "cc") 00183 00184 #if defined(HAVE_INTEL_MULX) 00185 #define MULX_INNERMUL8(x,y,z,cy) \ 00186 __asm__ volatile ( \ 00187 "movq %[yn], %%rdx\n\t" \ 00188 "xorq %%rcx, %%rcx\n\t" \ 00189 "movq 0(%[c]), %%r8\n\t" \ 00190 "movq 8(%[c]), %%r9\n\t" \ 00191 "movq 16(%[c]), %%r10\n\t" \ 00192 "movq 24(%[c]), %%r11\n\t" \ 00193 "movq 32(%[c]), %%r12\n\t" \ 00194 "movq 40(%[c]), %%r13\n\t" \ 00195 "movq 48(%[c]), %%r14\n\t" \ 00196 "movq 56(%[c]), %%r15\n\t" \ 00197 \ 00198 "mulx 0(%[xp]), %%rax, %%rcx\n\t" \ 00199 "adcxq %[cy], %%r8\n\t" \ 00200 "adoxq %%rax, %%r8\n\t" \ 00201 "mulx 8(%[xp]), %%rax, %[cy]\n\t" \ 00202 "adcxq %%rcx, %%r9\n\t" \ 00203 "adoxq %%rax, %%r9\n\t" \ 00204 "mulx 16(%[xp]), %%rax, %%rcx\n\t" \ 00205 "adcxq %[cy], %%r10\n\t" \ 00206 "adoxq %%rax, %%r10\n\t" \ 00207 "mulx 24(%[xp]), %%rax, %[cy]\n\t" \ 00208 "adcxq %%rcx, %%r11\n\t" \ 00209 "adoxq %%rax, %%r11\n\t" \ 00210 "mulx 32(%[xp]), %%rax, %%rcx\n\t" \ 00211 "adcxq %[cy], %%r12\n\t" \ 00212 "adoxq %%rax, %%r12\n\t" \ 00213 "mulx 40(%[xp]), %%rax, %[cy]\n\t" \ 00214 "adcxq %%rcx, %%r13\n\t" \ 00215 "adoxq %%rax, %%r13\n\t" \ 00216 "mulx 48(%[xp]), %%rax, %%rcx\n\t" \ 00217 "adcxq %[cy], %%r14\n\t" \ 00218 "adoxq %%rax, %%r14\n\t" \ 00219 "adcxq %%rcx, %%r15\n\t" \ 00220 "mulx 56(%[xp]), %%rax, %[cy]\n\t" \ 00221 "movq $0, %%rdx\n\t" \ 00222 "adoxq %%rdx, %%rax\n\t" \ 00223 "adcxq %%rdx, %[cy]\n\t" \ 00224 "adoxq %%rdx, %[cy]\n\t" \ 00225 "addq %%rax, %%r15\n\t" \ 00226 "adcq $0, %[cy]\n\t" \ 00227 \ 00228 "movq %%r8, 0(%[c])\n\t" \ 00229 "movq %%r9, 8(%[c])\n\t" \ 00230 "movq %%r10, 16(%[c])\n\t" \ 00231 "movq %%r11, 24(%[c])\n\t" \ 00232 "movq %%r12, 32(%[c])\n\t" \ 00233 "movq %%r13, 40(%[c])\n\t" \ 00234 "movq %%r14, 48(%[c])\n\t" \ 00235 "movq %%r15, 56(%[c])\n\t" \ 00236 : [cy] "+r" (cy) \ 00237 : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y) \ 00238 :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \ 00239 "%rdx", "%rax", "%rcx" \ 00240 ) 00241 00242 #define INNERMUL8_MULX \ 00243 {\ 00244 MULX_INNERMUL8(tmpm, mu, _c, cy);\ 00245 } 00246 #endif 00247 00248 #define INNERMUL8 \ 00249 __asm__( \ 00250 "movq 0(%5),%%rax \n\t" \ 00251 "movq 0(%2),%%r10 \n\t" \ 00252 "movq 0x8(%5),%%r11 \n\t" \ 00253 "mulq %4 \n\t" \ 00254 "addq %%r10,%%rax \n\t" \ 00255 "adcq $0,%%rdx \n\t" \ 00256 "movq 0x8(%2),%%r10 \n\t" \ 00257 "addq %3,%%rax \n\t" \ 00258 "adcq $0,%%rdx \n\t" \ 00259 "movq %%rax,0(%0) \n\t" \ 00260 "movq %%rdx,%1 \n\t" \ 00261 \ 00262 "movq %%r11,%%rax \n\t" \ 00263 "movq 0x10(%5),%%r11 \n\t" \ 00264 "mulq %4 \n\t" \ 00265 "addq %%r10,%%rax \n\t" \ 00266 "adcq $0,%%rdx \n\t" \ 00267 "movq 0x10(%2),%%r10 \n\t" \ 00268 "addq %3,%%rax \n\t" \ 00269 "adcq $0,%%rdx \n\t" \ 00270 "movq %%rax,0x8(%0) \n\t" \ 00271 "movq %%rdx,%1 \n\t" \ 00272 \ 00273 "movq %%r11,%%rax \n\t" \ 00274 "movq 0x18(%5),%%r11 \n\t" \ 00275 "mulq %4 \n\t" \ 00276 "addq %%r10,%%rax \n\t" \ 00277 "adcq $0,%%rdx \n\t" \ 00278 "movq 0x18(%2),%%r10 \n\t" \ 00279 "addq %3,%%rax \n\t" \ 00280 "adcq $0,%%rdx \n\t" \ 00281 "movq %%rax,0x10(%0) \n\t" \ 00282 "movq %%rdx,%1 \n\t" \ 00283 \ 00284 "movq %%r11,%%rax \n\t" \ 00285 "movq 0x20(%5),%%r11 \n\t" \ 00286 "mulq %4 \n\t" \ 00287 "addq %%r10,%%rax \n\t" \ 00288 "adcq $0,%%rdx \n\t" \ 00289 "movq 0x20(%2),%%r10 \n\t" \ 00290 "addq %3,%%rax \n\t" \ 00291 "adcq $0,%%rdx \n\t" \ 00292 "movq %%rax,0x18(%0) \n\t" \ 00293 "movq %%rdx,%1 \n\t" \ 00294 \ 00295 "movq %%r11,%%rax \n\t" \ 00296 "movq 0x28(%5),%%r11 \n\t" \ 00297 "mulq %4 \n\t" \ 00298 "addq %%r10,%%rax \n\t" \ 00299 "adcq $0,%%rdx \n\t" \ 00300 "movq 0x28(%2),%%r10 \n\t" \ 00301 "addq %3,%%rax \n\t" \ 00302 "adcq $0,%%rdx \n\t" \ 00303 "movq %%rax,0x20(%0) \n\t" \ 00304 "movq %%rdx,%1 \n\t" \ 00305 \ 00306 "movq %%r11,%%rax \n\t" \ 00307 "movq 0x30(%5),%%r11 \n\t" \ 00308 "mulq %4 \n\t" \ 00309 "addq %%r10,%%rax \n\t" \ 00310 "adcq $0,%%rdx \n\t" \ 00311 "movq 0x30(%2),%%r10 \n\t" \ 00312 "addq %3,%%rax \n\t" \ 00313 "adcq $0,%%rdx \n\t" \ 00314 "movq %%rax,0x28(%0) \n\t" \ 00315 "movq %%rdx,%1 \n\t" \ 00316 \ 00317 "movq %%r11,%%rax \n\t" \ 00318 "movq 0x38(%5),%%r11 \n\t" \ 00319 "mulq %4 \n\t" \ 00320 "addq %%r10,%%rax \n\t" \ 00321 "adcq $0,%%rdx \n\t" \ 00322 "movq 0x38(%2),%%r10 \n\t" \ 00323 "addq %3,%%rax \n\t" \ 00324 "adcq $0,%%rdx \n\t" \ 00325 "movq %%rax,0x30(%0) \n\t" \ 00326 "movq %%rdx,%1 \n\t" \ 00327 \ 00328 "movq %%r11,%%rax \n\t" \ 00329 "mulq %4 \n\t" \ 00330 "addq %%r10,%%rax \n\t" \ 00331 "adcq $0,%%rdx \n\t" \ 00332 "addq %3,%%rax \n\t" \ 00333 "adcq $0,%%rdx \n\t" \ 00334 "movq %%rax,0x38(%0) \n\t" \ 00335 "movq %%rdx,%1 \n\t" \ 00336 \ 00337 :"=r"(_c), "=r"(cy) \ 00338 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ 00339 : "%rax", "%rdx", "%r10", "%r11", "cc") 00340 00341 #define PROPCARRY \ 00342 __asm__( \ 00343 "addq %1,%0 \n\t" \ 00344 "setb %%al \n\t" \ 00345 "movzbq %%al,%1 \n\t" \ 00346 :"=g"(_c[LO]), "=r"(cy) \ 00347 :"0"(_c[LO]), "1"(cy) \ 00348 : "%rax", "cc") 00349 00350 /******************************************************************/ 00351 #elif defined(TFM_SSE2) 00352 /* SSE2 code (assumes 32-bit fp_digits) */ 00353 /* XMM register assignments: 00354 * xmm0 *tmpm++, then Mu * (*tmpm++) 00355 * xmm1 c[x], then Mu 00356 * xmm2 mp 00357 * xmm3 cy 00358 * xmm4 _c[LO] 00359 */ 00360 00361 #define MONT_START \ 00362 __asm__("movd %0,%%mm2"::"g"(mp)) 00363 00364 #define MONT_FINI \ 00365 __asm__("emms") 00366 00367 #define LOOP_START \ 00368 __asm__( \ 00369 "movd %0,%%mm1 \n\t" \ 00370 "pxor %%mm3,%%mm3 \n\t" \ 00371 "pmuludq %%mm2,%%mm1 \n\t" \ 00372 :: "g"(c[x])) 00373 00374 /* pmuludq on mmx registers does a 32x32->64 multiply. */ 00375 #define INNERMUL \ 00376 __asm__( \ 00377 "movd %1,%%mm4 \n\t" \ 00378 "movd %2,%%mm0 \n\t" \ 00379 "paddq %%mm4,%%mm3 \n\t" \ 00380 "pmuludq %%mm1,%%mm0 \n\t" \ 00381 "paddq %%mm0,%%mm3 \n\t" \ 00382 "movd %%mm3,%0 \n\t" \ 00383 "psrlq $32, %%mm3 \n\t" \ 00384 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) ); 00385 00386 #define INNERMUL8 \ 00387 __asm__( \ 00388 "movd 0(%1),%%mm4 \n\t" \ 00389 "movd 0(%2),%%mm0 \n\t" \ 00390 "paddq %%mm4,%%mm3 \n\t" \ 00391 "pmuludq %%mm1,%%mm0 \n\t" \ 00392 "movd 4(%2),%%mm5 \n\t" \ 00393 "paddq %%mm0,%%mm3 \n\t" \ 00394 "movd 4(%1),%%mm6 \n\t" \ 00395 "movd %%mm3,0(%0) \n\t" \ 00396 "psrlq $32, %%mm3 \n\t" \ 00397 \ 00398 "paddq %%mm6,%%mm3 \n\t" \ 00399 "pmuludq %%mm1,%%mm5 \n\t" \ 00400 "movd 8(%2),%%mm6 \n\t" \ 00401 "paddq %%mm5,%%mm3 \n\t" \ 00402 "movd 8(%1),%%mm7 \n\t" \ 00403 "movd %%mm3,4(%0) \n\t" \ 00404 "psrlq $32, %%mm3 \n\t" \ 00405 \ 00406 "paddq %%mm7,%%mm3 \n\t" \ 00407 "pmuludq %%mm1,%%mm6 \n\t" \ 00408 "movd 12(%2),%%mm7 \n\t" \ 00409 "paddq %%mm6,%%mm3 \n\t" \ 00410 "movd 12(%1),%%mm5 \n\t" \ 00411 "movd %%mm3,8(%0) \n\t" \ 00412 "psrlq $32, %%mm3 \n\t" \ 00413 \ 00414 "paddq %%mm5,%%mm3 \n\t" \ 00415 "pmuludq %%mm1,%%mm7 \n\t" \ 00416 "movd 16(%2),%%mm5 \n\t" \ 00417 "paddq %%mm7,%%mm3 \n\t" \ 00418 "movd 16(%1),%%mm6 \n\t" \ 00419 "movd %%mm3,12(%0) \n\t" \ 00420 "psrlq $32, %%mm3 \n\t" \ 00421 \ 00422 "paddq %%mm6,%%mm3 \n\t" \ 00423 "pmuludq %%mm1,%%mm5 \n\t" \ 00424 "movd 20(%2),%%mm6 \n\t" \ 00425 "paddq %%mm5,%%mm3 \n\t" \ 00426 "movd 20(%1),%%mm7 \n\t" \ 00427 "movd %%mm3,16(%0) \n\t" \ 00428 "psrlq $32, %%mm3 \n\t" \ 00429 \ 00430 "paddq %%mm7,%%mm3 \n\t" \ 00431 "pmuludq %%mm1,%%mm6 \n\t" \ 00432 "movd 24(%2),%%mm7 \n\t" \ 00433 "paddq %%mm6,%%mm3 \n\t" \ 00434 "movd 24(%1),%%mm5 \n\t" \ 00435 "movd %%mm3,20(%0) \n\t" \ 00436 "psrlq $32, %%mm3 \n\t" \ 00437 \ 00438 "paddq %%mm5,%%mm3 \n\t" \ 00439 "pmuludq %%mm1,%%mm7 \n\t" \ 00440 "movd 28(%2),%%mm5 \n\t" \ 00441 "paddq %%mm7,%%mm3 \n\t" \ 00442 "movd 28(%1),%%mm6 \n\t" \ 00443 "movd %%mm3,24(%0) \n\t" \ 00444 "psrlq $32, %%mm3 \n\t" \ 00445 \ 00446 "paddq %%mm6,%%mm3 \n\t" \ 00447 "pmuludq %%mm1,%%mm5 \n\t" \ 00448 "paddq %%mm5,%%mm3 \n\t" \ 00449 "movd %%mm3,28(%0) \n\t" \ 00450 "psrlq $32, %%mm3 \n\t" \ 00451 :"=r"(_c) : "0"(_c), "r"(tmpm) ); 00452 00453 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack 00454 pointer */ 00455 00456 #define LOOP_END \ 00457 __asm__( "movd %%mm3,%0 \n" :"=r"(cy)) 00458 00459 #define PROPCARRY \ 00460 __asm__( \ 00461 "addl %1,%0 \n\t" \ 00462 "setb %%al \n\t" \ 00463 "movzbl %%al,%1 \n\t" \ 00464 :"=g"(_c[LO]), "=r"(cy) \ 00465 :"0"(_c[LO]), "1"(cy) \ 00466 : "%eax", "cc") 00467 00468 /******************************************************************/ 00469 #elif defined(TFM_ARM) 00470 /* ARMv4 code */ 00471 00472 #define MONT_START 00473 #define MONT_FINI 00474 #define LOOP_END 00475 #define LOOP_START \ 00476 mu = c[x] * mp 00477 00478 00479 #ifdef __thumb__ 00480 00481 #define INNERMUL \ 00482 __asm__( \ 00483 " LDR r0,%1 \n\t" \ 00484 " ADDS r0,r0,%0 \n\t" \ 00485 " ITE CS \n\t" \ 00486 " MOVCS %0,#1 \n\t" \ 00487 " MOVCC %0,#0 \n\t" \ 00488 " UMLAL r0,%0,%3,%4 \n\t" \ 00489 " STR r0,%1 \n\t" \ 00490 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc"); 00491 00492 #define PROPCARRY \ 00493 __asm__( \ 00494 " LDR r0,%1 \n\t" \ 00495 " ADDS r0,r0,%0 \n\t" \ 00496 " STR r0,%1 \n\t" \ 00497 " ITE CS \n\t" \ 00498 " MOVCS %0,#1 \n\t" \ 00499 " MOVCC %0,#0 \n\t" \ 00500 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc"); 00501 00502 00503 /* TAO thumb mode uses ite (if then else) to detect carry directly 00504 * fixed unmatched constraint warning by changing 1 to m */ 00505 00506 #else /* __thumb__ */ 00507 00508 #define INNERMUL \ 00509 __asm__( \ 00510 " LDR r0,%1 \n\t" \ 00511 " ADDS r0,r0,%0 \n\t" \ 00512 " MOVCS %0,#1 \n\t" \ 00513 " MOVCC %0,#0 \n\t" \ 00514 " UMLAL r0,%0,%3,%4 \n\t" \ 00515 " STR r0,%1 \n\t" \ 00516 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc"); 00517 00518 #define PROPCARRY \ 00519 __asm__( \ 00520 " LDR r0,%1 \n\t" \ 00521 " ADDS r0,r0,%0 \n\t" \ 00522 " STR r0,%1 \n\t" \ 00523 " MOVCS %0,#1 \n\t" \ 00524 " MOVCC %0,#0 \n\t" \ 00525 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc"); 00526 00527 #endif /* __thumb__ */ 00528 00529 #elif defined(TFM_PPC32) 00530 00531 /* PPC32 */ 00532 #define MONT_START 00533 #define MONT_FINI 00534 #define LOOP_END 00535 #define LOOP_START \ 00536 mu = c[x] * mp 00537 00538 #define INNERMUL \ 00539 __asm__( \ 00540 " mullw 16,%3,%4 \n\t" \ 00541 " mulhwu 17,%3,%4 \n\t" \ 00542 " addc 16,16,%2 \n\t" \ 00543 " addze 17,17 \n\t" \ 00544 " addc %1,16,%5 \n\t" \ 00545 " addze %0,17 \n\t" \ 00546 :"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm; 00547 00548 #define PROPCARRY \ 00549 __asm__( \ 00550 " addc %1,%3,%2 \n\t" \ 00551 " xor %0,%2,%2 \n\t" \ 00552 " addze %0,%2 \n\t" \ 00553 :"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc"); 00554 00555 #elif defined(TFM_PPC64) 00556 00557 /* PPC64 */ 00558 #define MONT_START 00559 #define MONT_FINI 00560 #define LOOP_END 00561 #define LOOP_START \ 00562 mu = c[x] * mp 00563 00564 #define INNERMUL \ 00565 __asm__( \ 00566 " mulld r16,%3,%4 \n\t" \ 00567 " mulhdu r17,%3,%4 \n\t" \ 00568 " addc r16,16,%0 \n\t" \ 00569 " addze r17,r17 \n\t" \ 00570 " ldx r18,0,%1 \n\t" \ 00571 " addc r16,r16,r18 \n\t" \ 00572 " addze %0,r17 \n\t" \ 00573 " sdx r16,0,%1 \n\t" \ 00574 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm; 00575 00576 #define PROPCARRY \ 00577 __asm__( \ 00578 " ldx r16,0,%1 \n\t" \ 00579 " addc r16,r16,%0 \n\t" \ 00580 " sdx r16,0,%1 \n\t" \ 00581 " xor %0,%0,%0 \n\t" \ 00582 " addze %0,%0 \n\t" \ 00583 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc"); 00584 00585 /******************************************************************/ 00586 00587 #elif defined(TFM_AVR32) 00588 00589 /* AVR32 */ 00590 #define MONT_START 00591 #define MONT_FINI 00592 #define LOOP_END 00593 #define LOOP_START \ 00594 mu = c[x] * mp 00595 00596 #define INNERMUL \ 00597 __asm__( \ 00598 " ld.w r2,%1 \n\t" \ 00599 " add r2,%0 \n\t" \ 00600 " eor r3,r3 \n\t" \ 00601 " acr r3 \n\t" \ 00602 " macu.d r2,%3,%4 \n\t" \ 00603 " st.w %1,r2 \n\t" \ 00604 " mov %0,r3 \n\t" \ 00605 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3"); 00606 00607 #define PROPCARRY \ 00608 __asm__( \ 00609 " ld.w r2,%1 \n\t" \ 00610 " add r2,%0 \n\t" \ 00611 " st.w %1,r2 \n\t" \ 00612 " eor %0,%0 \n\t" \ 00613 " acr %0 \n\t" \ 00614 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc"); 00615 00616 /******************************************************************/ 00617 #elif defined(TFM_MIPS) 00618 00619 /* MIPS */ 00620 #define MONT_START 00621 #define MONT_FINI 00622 #define LOOP_END 00623 #define LOOP_START \ 00624 mu = c[x] * mp 00625 00626 #define INNERMUL \ 00627 __asm__( \ 00628 " multu %3,%4 \n\t" \ 00629 " mflo $12 \n\t" \ 00630 " mfhi $13 \n\t" \ 00631 " addu $12,$12,%0 \n\t" \ 00632 " sltu $10,$12,%0 \n\t" \ 00633 " addu $13,$13,$10 \n\t" \ 00634 " lw $10,%1 \n\t" \ 00635 " addu $12,$12,$10 \n\t" \ 00636 " sltu $10,$12,$10 \n\t" \ 00637 " addu %0,$13,$10 \n\t" \ 00638 " sw $12,%1 \n\t" \ 00639 :"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm; 00640 00641 #define PROPCARRY \ 00642 __asm__( \ 00643 " lw $10,%1 \n\t" \ 00644 " addu $10,$10,%0 \n\t" \ 00645 " sw $10,%1 \n\t" \ 00646 " sltu %0,$10,%0 \n\t" \ 00647 :"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10"); 00648 00649 /******************************************************************/ 00650 #else 00651 00652 /* ISO C code */ 00653 #define MONT_START 00654 #define MONT_FINI 00655 #define LOOP_END 00656 #define LOOP_START \ 00657 mu = c[x] * mp 00658 00659 #define INNERMUL \ 00660 do { fp_word t; \ 00661 t = ((fp_word)_c[0] + (fp_word)cy) + \ 00662 (((fp_word)mu) * ((fp_word)*tmpm++)); \ 00663 _c[0] = (fp_digit)t; \ 00664 cy = (fp_digit)(t >> DIGIT_BIT); \ 00665 } while (0) 00666 00667 #define PROPCARRY \ 00668 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0) 00669 00670 #endif 00671 /******************************************************************/ 00672 00673 00674 #define LO 0 00675 /* end fp_montogomery_reduce.c asm */ 00676 00677 00678 /* start fp_sqr_comba.c asm */ 00679 #if defined(TFM_X86) 00680 00681 /* x86-32 optimized */ 00682 00683 #define COMBA_START 00684 00685 #define CLEAR_CARRY \ 00686 c0 = c1 = c2 = 0; 00687 00688 #define COMBA_STORE(x) \ 00689 x = c0; 00690 00691 #define COMBA_STORE2(x) \ 00692 x = c1; 00693 00694 #define CARRY_FORWARD \ 00695 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00696 00697 #define COMBA_FINI 00698 00699 #define SQRADD(i, j) \ 00700 __asm__( \ 00701 "movl %6,%%eax \n\t" \ 00702 "mull %%eax \n\t" \ 00703 "addl %%eax,%0 \n\t" \ 00704 "adcl %%edx,%1 \n\t" \ 00705 "adcl $0,%2 \n\t" \ 00706 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc"); 00707 00708 #define SQRADD2(i, j) \ 00709 __asm__( \ 00710 "movl %6,%%eax \n\t" \ 00711 "mull %7 \n\t" \ 00712 "addl %%eax,%0 \n\t" \ 00713 "adcl %%edx,%1 \n\t" \ 00714 "adcl $0,%2 \n\t" \ 00715 "addl %%eax,%0 \n\t" \ 00716 "adcl %%edx,%1 \n\t" \ 00717 "adcl $0,%2 \n\t" \ 00718 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc"); 00719 00720 #define SQRADDSC(i, j) \ 00721 __asm__( \ 00722 "movl %3,%%eax \n\t" \ 00723 "mull %4 \n\t" \ 00724 "movl %%eax,%0 \n\t" \ 00725 "movl %%edx,%1 \n\t" \ 00726 "xorl %2,%2 \n\t" \ 00727 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc"); 00728 00729 #define SQRADDAC(i, j) \ 00730 __asm__( \ 00731 "movl %6,%%eax \n\t" \ 00732 "mull %7 \n\t" \ 00733 "addl %%eax,%0 \n\t" \ 00734 "adcl %%edx,%1 \n\t" \ 00735 "adcl $0,%2 \n\t" \ 00736 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc"); 00737 00738 #define SQRADDDB \ 00739 __asm__( \ 00740 "addl %6,%0 \n\t" \ 00741 "adcl %7,%1 \n\t" \ 00742 "adcl %8,%2 \n\t" \ 00743 "addl %6,%0 \n\t" \ 00744 "adcl %7,%1 \n\t" \ 00745 "adcl %8,%2 \n\t" \ 00746 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00747 00748 #elif defined(TFM_X86_64) 00749 /* x86-64 optimized */ 00750 00751 #define COMBA_START 00752 00753 #define CLEAR_CARRY \ 00754 c0 = c1 = c2 = 0; 00755 00756 #define COMBA_STORE(x) \ 00757 x = c0; 00758 00759 #define COMBA_STORE2(x) \ 00760 x = c1; 00761 00762 #define CARRY_FORWARD \ 00763 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00764 00765 #define COMBA_FINI 00766 00767 #define SQRADD(i, j) \ 00768 __asm__( \ 00769 "movq %6,%%rax \n\t" \ 00770 "mulq %%rax \n\t" \ 00771 "addq %%rax,%0 \n\t" \ 00772 "adcq %%rdx,%1 \n\t" \ 00773 "adcq $0,%2 \n\t" \ 00774 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc"); 00775 00776 #define SQRADD2(i, j) \ 00777 __asm__( \ 00778 "movq %6,%%rax \n\t" \ 00779 "mulq %7 \n\t" \ 00780 "addq %%rax,%0 \n\t" \ 00781 "adcq %%rdx,%1 \n\t" \ 00782 "adcq $0,%2 \n\t" \ 00783 "addq %%rax,%0 \n\t" \ 00784 "adcq %%rdx,%1 \n\t" \ 00785 "adcq $0,%2 \n\t" \ 00786 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00787 00788 #define SQRADDSC(i, j) \ 00789 __asm__( \ 00790 "movq %3,%%rax \n\t" \ 00791 "mulq %4 \n\t" \ 00792 "movq %%rax,%0 \n\t" \ 00793 "movq %%rdx,%1 \n\t" \ 00794 "xorq %2,%2 \n\t" \ 00795 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc"); 00796 00797 #define SQRADDAC(i, j) \ 00798 __asm__( \ 00799 "movq %6,%%rax \n\t" \ 00800 "mulq %7 \n\t" \ 00801 "addq %%rax,%0 \n\t" \ 00802 "adcq %%rdx,%1 \n\t" \ 00803 "adcq $0,%2 \n\t" \ 00804 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00805 00806 #define SQRADDDB \ 00807 __asm__( \ 00808 "addq %6,%0 \n\t" \ 00809 "adcq %7,%1 \n\t" \ 00810 "adcq %8,%2 \n\t" \ 00811 "addq %6,%0 \n\t" \ 00812 "adcq %7,%1 \n\t" \ 00813 "adcq %8,%2 \n\t" \ 00814 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00815 00816 #elif defined(TFM_SSE2) 00817 00818 /* SSE2 Optimized */ 00819 #define COMBA_START 00820 00821 #define CLEAR_CARRY \ 00822 c0 = c1 = c2 = 0; 00823 00824 #define COMBA_STORE(x) \ 00825 x = c0; 00826 00827 #define COMBA_STORE2(x) \ 00828 x = c1; 00829 00830 #define CARRY_FORWARD \ 00831 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00832 00833 #define COMBA_FINI \ 00834 __asm__("emms"); 00835 00836 #define SQRADD(i, j) \ 00837 __asm__( \ 00838 "movd %6,%%mm0 \n\t" \ 00839 "pmuludq %%mm0,%%mm0\n\t" \ 00840 "movd %%mm0,%%eax \n\t" \ 00841 "psrlq $32,%%mm0 \n\t" \ 00842 "addl %%eax,%0 \n\t" \ 00843 "movd %%mm0,%%eax \n\t" \ 00844 "adcl %%eax,%1 \n\t" \ 00845 "adcl $0,%2 \n\t" \ 00846 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc"); 00847 00848 #define SQRADD2(i, j) \ 00849 __asm__( \ 00850 "movd %6,%%mm0 \n\t" \ 00851 "movd %7,%%mm1 \n\t" \ 00852 "pmuludq %%mm1,%%mm0\n\t" \ 00853 "movd %%mm0,%%eax \n\t" \ 00854 "psrlq $32,%%mm0 \n\t" \ 00855 "movd %%mm0,%%edx \n\t" \ 00856 "addl %%eax,%0 \n\t" \ 00857 "adcl %%edx,%1 \n\t" \ 00858 "adcl $0,%2 \n\t" \ 00859 "addl %%eax,%0 \n\t" \ 00860 "adcl %%edx,%1 \n\t" \ 00861 "adcl $0,%2 \n\t" \ 00862 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00863 00864 #define SQRADDSC(i, j) \ 00865 __asm__( \ 00866 "movd %3,%%mm0 \n\t" \ 00867 "movd %4,%%mm1 \n\t" \ 00868 "pmuludq %%mm1,%%mm0\n\t" \ 00869 "movd %%mm0,%0 \n\t" \ 00870 "psrlq $32,%%mm0 \n\t" \ 00871 "movd %%mm0,%1 \n\t" \ 00872 "xorl %2,%2 \n\t" \ 00873 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j)); 00874 00875 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00876 00877 #define SQRADDAC(i, j) \ 00878 __asm__( \ 00879 "movd %6,%%mm0 \n\t" \ 00880 "movd %7,%%mm1 \n\t" \ 00881 "pmuludq %%mm1,%%mm0\n\t" \ 00882 "movd %%mm0,%%eax \n\t" \ 00883 "psrlq $32,%%mm0 \n\t" \ 00884 "movd %%mm0,%%edx \n\t" \ 00885 "addl %%eax,%0 \n\t" \ 00886 "adcl %%edx,%1 \n\t" \ 00887 "adcl $0,%2 \n\t" \ 00888 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00889 00890 #define SQRADDDB \ 00891 __asm__( \ 00892 "addl %6,%0 \n\t" \ 00893 "adcl %7,%1 \n\t" \ 00894 "adcl %8,%2 \n\t" \ 00895 "addl %6,%0 \n\t" \ 00896 "adcl %7,%1 \n\t" \ 00897 "adcl %8,%2 \n\t" \ 00898 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00899 00900 #elif defined(TFM_ARM) 00901 00902 /* ARM code */ 00903 00904 #define COMBA_START 00905 00906 #define CLEAR_CARRY \ 00907 c0 = c1 = c2 = 0; 00908 00909 #define COMBA_STORE(x) \ 00910 x = c0; 00911 00912 #define COMBA_STORE2(x) \ 00913 x = c1; 00914 00915 #define CARRY_FORWARD \ 00916 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00917 00918 #define COMBA_FINI 00919 00920 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00921 #define SQRADD(i, j) \ 00922 __asm__( \ 00923 " UMULL r0,r1,%6,%6 \n\t" \ 00924 " ADDS %0,%0,r0 \n\t" \ 00925 " ADCS %1,%1,r1 \n\t" \ 00926 " ADC %2,%2,#0 \n\t" \ 00927 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc"); 00928 00929 /* for squaring some of the terms are doubled... */ 00930 #define SQRADD2(i, j) \ 00931 __asm__( \ 00932 " UMULL r0,r1,%6,%7 \n\t" \ 00933 " ADDS %0,%0,r0 \n\t" \ 00934 " ADCS %1,%1,r1 \n\t" \ 00935 " ADC %2,%2,#0 \n\t" \ 00936 " ADDS %0,%0,r0 \n\t" \ 00937 " ADCS %1,%1,r1 \n\t" \ 00938 " ADC %2,%2,#0 \n\t" \ 00939 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00940 00941 #define SQRADDSC(i, j) \ 00942 __asm__( \ 00943 " UMULL %0,%1,%3,%4 \n\t" \ 00944 " SUB %2,%2,%2 \n\t" \ 00945 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc"); 00946 00947 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00948 00949 #define SQRADDAC(i, j) \ 00950 __asm__( \ 00951 " UMULL r0,r1,%6,%7 \n\t" \ 00952 " ADDS %0,%0,r0 \n\t" \ 00953 " ADCS %1,%1,r1 \n\t" \ 00954 " ADC %2,%2,#0 \n\t" \ 00955 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00956 00957 #define SQRADDDB \ 00958 __asm__( \ 00959 " ADDS %0,%0,%3 \n\t" \ 00960 " ADCS %1,%1,%4 \n\t" \ 00961 " ADC %2,%2,%5 \n\t" \ 00962 " ADDS %0,%0,%3 \n\t" \ 00963 " ADCS %1,%1,%4 \n\t" \ 00964 " ADC %2,%2,%5 \n\t" \ 00965 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00966 00967 #elif defined(TFM_PPC32) 00968 00969 /* PPC32 */ 00970 00971 #define COMBA_START 00972 00973 #define CLEAR_CARRY \ 00974 c0 = c1 = c2 = 0; 00975 00976 #define COMBA_STORE(x) \ 00977 x = c0; 00978 00979 #define COMBA_STORE2(x) \ 00980 x = c1; 00981 00982 #define CARRY_FORWARD \ 00983 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00984 00985 #define COMBA_FINI 00986 00987 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00988 #define SQRADD(i, j) \ 00989 __asm__( \ 00990 " mullw 16,%6,%6 \n\t" \ 00991 " addc %0,%0,16 \n\t" \ 00992 " mulhwu 16,%6,%6 \n\t" \ 00993 " adde %1,%1,16 \n\t" \ 00994 " addze %2,%2 \n\t" \ 00995 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc"); 00996 00997 /* for squaring some of the terms are doubled... */ 00998 #define SQRADD2(i, j) \ 00999 __asm__( \ 01000 " mullw 16,%6,%7 \n\t" \ 01001 " mulhwu 17,%6,%7 \n\t" \ 01002 " addc %0,%0,16 \n\t" \ 01003 " adde %1,%1,17 \n\t" \ 01004 " addze %2,%2 \n\t" \ 01005 " addc %0,%0,16 \n\t" \ 01006 " adde %1,%1,17 \n\t" \ 01007 " addze %2,%2 \n\t" \ 01008 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc"); 01009 01010 #define SQRADDSC(i, j) \ 01011 __asm__( \ 01012 " mullw %0,%6,%7 \n\t" \ 01013 " mulhwu %1,%6,%7 \n\t" \ 01014 " xor %2,%2,%2 \n\t" \ 01015 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 01016 01017 #define SQRADDAC(i, j) \ 01018 __asm__( \ 01019 " mullw 16,%6,%7 \n\t" \ 01020 " addc %0,%0,16 \n\t" \ 01021 " mulhwu 16,%6,%7 \n\t" \ 01022 " adde %1,%1,16 \n\t" \ 01023 " addze %2,%2 \n\t" \ 01024 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc"); 01025 01026 #define SQRADDDB \ 01027 __asm__( \ 01028 " addc %0,%0,%3 \n\t" \ 01029 " adde %1,%1,%4 \n\t" \ 01030 " adde %2,%2,%5 \n\t" \ 01031 " addc %0,%0,%3 \n\t" \ 01032 " adde %1,%1,%4 \n\t" \ 01033 " adde %2,%2,%5 \n\t" \ 01034 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 01035 01036 #elif defined(TFM_PPC64) 01037 /* PPC64 */ 01038 01039 #define COMBA_START 01040 01041 #define CLEAR_CARRY \ 01042 c0 = c1 = c2 = 0; 01043 01044 #define COMBA_STORE(x) \ 01045 x = c0; 01046 01047 #define COMBA_STORE2(x) \ 01048 x = c1; 01049 01050 #define CARRY_FORWARD \ 01051 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01052 01053 #define COMBA_FINI 01054 01055 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01056 #define SQRADD(i, j) \ 01057 __asm__( \ 01058 " mulld r16,%6,%6 \n\t" \ 01059 " addc %0,%0,r16 \n\t" \ 01060 " mulhdu r16,%6,%6 \n\t" \ 01061 " adde %1,%1,r16 \n\t" \ 01062 " addze %2,%2 \n\t" \ 01063 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc"); 01064 01065 /* for squaring some of the terms are doubled... */ 01066 #define SQRADD2(i, j) \ 01067 __asm__( \ 01068 " mulld r16,%6,%7 \n\t" \ 01069 " mulhdu r17,%6,%7 \n\t" \ 01070 " addc %0,%0,r16 \n\t" \ 01071 " adde %1,%1,r17 \n\t" \ 01072 " addze %2,%2 \n\t" \ 01073 " addc %0,%0,r16 \n\t" \ 01074 " adde %1,%1,r17 \n\t" \ 01075 " addze %2,%2 \n\t" \ 01076 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc"); 01077 01078 #define SQRADDSC(i, j) \ 01079 __asm__( \ 01080 " mulld %0,%6,%7 \n\t" \ 01081 " mulhdu %1,%6,%7 \n\t" \ 01082 " xor %2,%2,%2 \n\t" \ 01083 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 01084 01085 #define SQRADDAC(i, j) \ 01086 __asm__( \ 01087 " mulld r16,%6,%7 \n\t" \ 01088 " addc %0,%0,r16 \n\t" \ 01089 " mulhdu r16,%6,%7 \n\t" \ 01090 " adde %1,%1,r16 \n\t" \ 01091 " addze %2,%2 \n\t" \ 01092 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc"); 01093 01094 #define SQRADDDB \ 01095 __asm__( \ 01096 " addc %0,%0,%3 \n\t" \ 01097 " adde %1,%1,%4 \n\t" \ 01098 " adde %2,%2,%5 \n\t" \ 01099 " addc %0,%0,%3 \n\t" \ 01100 " adde %1,%1,%4 \n\t" \ 01101 " adde %2,%2,%5 \n\t" \ 01102 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 01103 01104 01105 #elif defined(TFM_AVR32) 01106 01107 /* AVR32 */ 01108 01109 #define COMBA_START 01110 01111 #define CLEAR_CARRY \ 01112 c0 = c1 = c2 = 0; 01113 01114 #define COMBA_STORE(x) \ 01115 x = c0; 01116 01117 #define COMBA_STORE2(x) \ 01118 x = c1; 01119 01120 #define CARRY_FORWARD \ 01121 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01122 01123 #define COMBA_FINI 01124 01125 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01126 #define SQRADD(i, j) \ 01127 __asm__( \ 01128 " mulu.d r2,%6,%6 \n\t" \ 01129 " add %0,%0,r2 \n\t" \ 01130 " adc %1,%1,r3 \n\t" \ 01131 " acr %2 \n\t" \ 01132 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3"); 01133 01134 /* for squaring some of the terms are doubled... */ 01135 #define SQRADD2(i, j) \ 01136 __asm__( \ 01137 " mulu.d r2,%6,%7 \n\t" \ 01138 " add %0,%0,r2 \n\t" \ 01139 " adc %1,%1,r3 \n\t" \ 01140 " acr %2, \n\t" \ 01141 " add %0,%0,r2 \n\t" \ 01142 " adc %1,%1,r3 \n\t" \ 01143 " acr %2, \n\t" \ 01144 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3"); 01145 01146 #define SQRADDSC(i, j) \ 01147 __asm__( \ 01148 " mulu.d r2,%6,%7 \n\t" \ 01149 " mov %0,r2 \n\t" \ 01150 " mov %1,r3 \n\t" \ 01151 " eor %2,%2 \n\t" \ 01152 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3"); 01153 01154 #define SQRADDAC(i, j) \ 01155 __asm__( \ 01156 " mulu.d r2,%6,%7 \n\t" \ 01157 " add %0,%0,r2 \n\t" \ 01158 " adc %1,%1,r3 \n\t" \ 01159 " acr %2 \n\t" \ 01160 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3"); 01161 01162 #define SQRADDDB \ 01163 __asm__( \ 01164 " add %0,%0,%3 \n\t" \ 01165 " adc %1,%1,%4 \n\t" \ 01166 " adc %2,%2,%5 \n\t" \ 01167 " add %0,%0,%3 \n\t" \ 01168 " adc %1,%1,%4 \n\t" \ 01169 " adc %2,%2,%5 \n\t" \ 01170 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 01171 01172 #elif defined(TFM_MIPS) 01173 01174 /* MIPS */ 01175 #define COMBA_START 01176 01177 #define CLEAR_CARRY \ 01178 c0 = c1 = c2 = 0; 01179 01180 #define COMBA_STORE(x) \ 01181 x = c0; 01182 01183 #define COMBA_STORE2(x) \ 01184 x = c1; 01185 01186 #define CARRY_FORWARD \ 01187 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01188 01189 #define COMBA_FINI 01190 01191 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01192 #define SQRADD(i, j) \ 01193 __asm__( \ 01194 " multu %6,%6 \n\t" \ 01195 " mflo $12 \n\t" \ 01196 " mfhi $13 \n\t" \ 01197 " addu %0,%0,$12 \n\t" \ 01198 " sltu $12,%0,$12 \n\t" \ 01199 " addu %1,%1,$13 \n\t" \ 01200 " sltu $13,%1,$13 \n\t" \ 01201 " addu %1,%1,$12 \n\t" \ 01202 " sltu $12,%1,$12 \n\t" \ 01203 " addu %2,%2,$13 \n\t" \ 01204 " addu %2,%2,$12 \n\t" \ 01205 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13"); 01206 01207 /* for squaring some of the terms are doubled... */ 01208 #define SQRADD2(i, j) \ 01209 __asm__( \ 01210 " multu %6,%7 \n\t" \ 01211 " mflo $12 \n\t" \ 01212 " mfhi $13 \n\t" \ 01213 \ 01214 " addu %0,%0,$12 \n\t" \ 01215 " sltu $14,%0,$12 \n\t" \ 01216 " addu %1,%1,$13 \n\t" \ 01217 " sltu $15,%1,$13 \n\t" \ 01218 " addu %1,%1,$14 \n\t" \ 01219 " sltu $14,%1,$14 \n\t" \ 01220 " addu %2,%2,$15 \n\t" \ 01221 " addu %2,%2,$14 \n\t" \ 01222 \ 01223 " addu %0,%0,$12 \n\t" \ 01224 " sltu $14,%0,$12 \n\t" \ 01225 " addu %1,%1,$13 \n\t" \ 01226 " sltu $15,%1,$13 \n\t" \ 01227 " addu %1,%1,$14 \n\t" \ 01228 " sltu $14,%1,$14 \n\t" \ 01229 " addu %2,%2,$15 \n\t" \ 01230 " addu %2,%2,$14 \n\t" \ 01231 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15"); 01232 01233 #define SQRADDSC(i, j) \ 01234 __asm__( \ 01235 " multu %6,%7 \n\t" \ 01236 " mflo %0 \n\t" \ 01237 " mfhi %1 \n\t" \ 01238 " xor %2,%2,%2 \n\t" \ 01239 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 01240 01241 #define SQRADDAC(i, j) \ 01242 __asm__( \ 01243 " multu %6,%7 \n\t" \ 01244 " mflo $12 \n\t" \ 01245 " mfhi $13 \n\t" \ 01246 " addu %0,%0,$12 \n\t" \ 01247 " sltu $12,%0,$12 \n\t" \ 01248 " addu %1,%1,$13 \n\t" \ 01249 " sltu $13,%1,$13 \n\t" \ 01250 " addu %1,%1,$12 \n\t" \ 01251 " sltu $12,%1,$12 \n\t" \ 01252 " addu %2,%2,$13 \n\t" \ 01253 " addu %2,%2,$12 \n\t" \ 01254 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14"); 01255 01256 #define SQRADDDB \ 01257 __asm__( \ 01258 " addu %0,%0,%3 \n\t" \ 01259 " sltu $10,%0,%3 \n\t" \ 01260 " addu %1,%1,$10 \n\t" \ 01261 " sltu $10,%1,$10 \n\t" \ 01262 " addu %1,%1,%4 \n\t" \ 01263 " sltu $11,%1,%4 \n\t" \ 01264 " addu %2,%2,$10 \n\t" \ 01265 " addu %2,%2,$11 \n\t" \ 01266 " addu %2,%2,%5 \n\t" \ 01267 \ 01268 " addu %0,%0,%3 \n\t" \ 01269 " sltu $10,%0,%3 \n\t" \ 01270 " addu %1,%1,$10 \n\t" \ 01271 " sltu $10,%1,$10 \n\t" \ 01272 " addu %1,%1,%4 \n\t" \ 01273 " sltu $11,%1,%4 \n\t" \ 01274 " addu %2,%2,$10 \n\t" \ 01275 " addu %2,%2,$11 \n\t" \ 01276 " addu %2,%2,%5 \n\t" \ 01277 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11"); 01278 01279 #else 01280 01281 #define TFM_ISO 01282 01283 /* ISO C portable code */ 01284 01285 #define COMBA_START 01286 01287 #define CLEAR_CARRY \ 01288 c0 = c1 = c2 = 0; 01289 01290 #define COMBA_STORE(x) \ 01291 x = c0; 01292 01293 #define COMBA_STORE2(x) \ 01294 x = c1; 01295 01296 #define CARRY_FORWARD \ 01297 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01298 01299 #define COMBA_FINI 01300 01301 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01302 #define SQRADD(i, j) \ 01303 do { fp_word t; \ 01304 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 01305 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \ 01306 c2 +=(fp_digit) (t >> DIGIT_BIT); \ 01307 } while (0); 01308 01309 01310 /* for squaring some of the terms are doubled... */ 01311 #define SQRADD2(i, j) \ 01312 do { fp_word t; \ 01313 t = ((fp_word)i) * ((fp_word)j); \ 01314 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01315 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01316 c2 +=(fp_digit)(tt >> DIGIT_BIT); \ 01317 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01318 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01319 c2 +=(fp_digit)(tt >> DIGIT_BIT); \ 01320 } while (0); 01321 01322 #define SQRADDSC(i, j) \ 01323 do { fp_word t; \ 01324 t = ((fp_word)i) * ((fp_word)j); \ 01325 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \ 01326 } while (0); 01327 01328 #define SQRADDAC(i, j) \ 01329 do { fp_word t; \ 01330 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \ 01331 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \ 01332 sc2 += (fp_digit)(t >> DIGIT_BIT); \ 01333 } while (0); 01334 01335 #define SQRADDDB \ 01336 do { fp_word t; \ 01337 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \ 01338 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \ 01339 c1 = (fp_digit)t; \ 01340 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \ 01341 } while (0); 01342 01343 #endif 01344 01345 #ifdef TFM_SMALL_SET 01346 #include "fp_sqr_comba_small_set.i" 01347 #endif 01348 01349 #if defined(TFM_SQR3) && FP_SIZE >= 6 01350 #include "fp_sqr_comba_3.i" 01351 #endif 01352 #if defined(TFM_SQR4) && FP_SIZE >= 8 01353 #include "fp_sqr_comba_4.i" 01354 #endif 01355 #if defined(TFM_SQR6) && FP_SIZE >= 12 01356 #include "fp_sqr_comba_6.i" 01357 #endif 01358 #if defined(TFM_SQR7) && FP_SIZE >= 14 01359 #include "fp_sqr_comba_7.i" 01360 #endif 01361 #if defined(TFM_SQR8) && FP_SIZE >= 16 01362 #include "fp_sqr_comba_8.i" 01363 #endif 01364 #if defined(TFM_SQR9) && FP_SIZE >= 18 01365 #include "fp_sqr_comba_9.i" 01366 #endif 01367 #if defined(TFM_SQR12) && FP_SIZE >= 24 01368 #include "fp_sqr_comba_12.i" 01369 #endif 01370 #if defined(TFM_SQR17) && FP_SIZE >= 34 01371 #include "fp_sqr_comba_17.i" 01372 #endif 01373 #if defined(TFM_SQR20) && FP_SIZE >= 40 01374 #include "fp_sqr_comba_20.i" 01375 #endif 01376 #if defined(TFM_SQR24) && FP_SIZE >= 48 01377 #include "fp_sqr_comba_24.i" 01378 #endif 01379 #if defined(TFM_SQR28) && FP_SIZE >= 56 01380 #include "fp_sqr_comba_28.i" 01381 #endif 01382 #if defined(TFM_SQR32) && FP_SIZE >= 64 01383 #include "fp_sqr_comba_32.i" 01384 #endif 01385 #if defined(TFM_SQR48) && FP_SIZE >= 96 01386 #include "fp_sqr_comba_48.i" 01387 #endif 01388 #if defined(TFM_SQR64) && FP_SIZE >= 128 01389 #include "fp_sqr_comba_64.i" 01390 #endif 01391 /* end fp_sqr_comba.c asm */ 01392 01393 /* start fp_mul_comba.c asm */ 01394 /* these are the combas. Worship them. */ 01395 #if defined(TFM_X86) 01396 /* Generic x86 optimized code */ 01397 01398 /* anything you need at the start */ 01399 #define COMBA_START 01400 01401 /* clear the chaining variables */ 01402 #define COMBA_CLEAR \ 01403 c0 = c1 = c2 = 0; 01404 01405 /* forward the carry to the next digit */ 01406 #define COMBA_FORWARD \ 01407 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01408 01409 /* store the first sum */ 01410 #define COMBA_STORE(x) \ 01411 x = c0; 01412 01413 /* store the second sum [carry] */ 01414 #define COMBA_STORE2(x) \ 01415 x = c1; 01416 01417 /* anything you need at the end */ 01418 #define COMBA_FINI 01419 01420 /* this should multiply i and j */ 01421 #define MULADD(i, j) \ 01422 __asm__( \ 01423 "movl %6,%%eax \n\t" \ 01424 "mull %7 \n\t" \ 01425 "addl %%eax,%0 \n\t" \ 01426 "adcl %%edx,%1 \n\t" \ 01427 "adcl $0,%2 \n\t" \ 01428 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 01429 01430 #elif defined(TFM_X86_64) 01431 /* x86-64 optimized */ 01432 01433 /* anything you need at the start */ 01434 #define COMBA_START 01435 01436 /* clear the chaining variables */ 01437 #define COMBA_CLEAR \ 01438 c0 = c1 = c2 = 0; 01439 01440 /* forward the carry to the next digit */ 01441 #define COMBA_FORWARD \ 01442 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01443 01444 /* store the first sum */ 01445 #define COMBA_STORE(x) \ 01446 x = c0; 01447 01448 /* store the second sum [carry] */ 01449 #define COMBA_STORE2(x) \ 01450 x = c1; 01451 01452 /* anything you need at the end */ 01453 #define COMBA_FINI 01454 01455 /* this should multiply i and j */ 01456 #define MULADD(i, j) \ 01457 __asm__ ( \ 01458 "movq %6,%%rax \n\t" \ 01459 "mulq %7 \n\t" \ 01460 "addq %%rax,%0 \n\t" \ 01461 "adcq %%rdx,%1 \n\t" \ 01462 "adcq $0,%2 \n\t" \ 01463 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 01464 01465 01466 #if defined(HAVE_INTEL_MULX) 01467 #define MULADD_BODY(a,b,c) \ 01468 __asm__ volatile( \ 01469 "movq %[a0],%%rdx\n\t" \ 01470 "xorq %%rcx, %%rcx\n\t" \ 01471 "movq 0(%[cp]),%%r8\n\t" \ 01472 "movq 8(%[cp]),%%r9\n\t" \ 01473 "movq 16(%[cp]),%%r10\n\t" \ 01474 "movq 24(%[cp]),%%r11\n\t" \ 01475 "movq 32(%[cp]),%%r12\n\t" \ 01476 "movq 40(%[cp]),%%r13\n\t" \ 01477 \ 01478 "mulx (%[bp]),%%rax, %%rbx\n\t" \ 01479 "adoxq %%rax, %%r8\n\t" \ 01480 "mulx 8(%[bp]),%%rax, %%rcx\n\t" \ 01481 "adcxq %%rbx, %%r9\n\t" \ 01482 "adoxq %%rax, %%r9\n\t" \ 01483 "mulx 16(%[bp]),%%rax, %%rbx\n\t" \ 01484 "adcxq %%rcx, %%r10\n\t" \ 01485 "adoxq %%rax, %%r10\n\t" \ 01486 "mulx 24(%[bp]),%%rax, %%rcx\n\t" \ 01487 "adcxq %%rbx, %%r11\n\t" \ 01488 "adoxq %%rax, %%r11\n\t" \ 01489 "adcxq %%rcx, %%r12\n\t" \ 01490 "mov $0, %%rdx\n\t" \ 01491 "adox %%rdx, %%r12\n\t" \ 01492 "adcx %%rdx, %%r13\n\t" \ 01493 \ 01494 "movq %%r8, 0(%[cp])\n\t" \ 01495 "movq %%r9, 8(%[cp])\n\t" \ 01496 "movq %%r10, 16(%[cp])\n\t" \ 01497 "movq %%r11, 24(%[cp])\n\t" \ 01498 "movq %%r12, 32(%[cp])\n\t" \ 01499 "movq %%r13, 40(%[cp])\n\t" \ 01500 : \ 01501 : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])), \ 01502 [cp] "r" (&(c->dp[iz])) \ 01503 : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", \ 01504 "%rdx", "%rax", "%rcx", "%rbx" \ 01505 ) 01506 01507 #define TFM_INTEL_MUL_COMBA(a, b, c) \ 01508 for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \ 01509 for (ix=0; ix<a->used; ix++) { \ 01510 for (iy=0; iy<b->used; iy+=4) { \ 01511 iz = ix + iy; \ 01512 MULADD_BODY(a, b, c); \ 01513 } \ 01514 } 01515 #endif 01516 01517 #elif defined(TFM_SSE2) 01518 /* use SSE2 optimizations */ 01519 01520 /* anything you need at the start */ 01521 #define COMBA_START 01522 01523 /* clear the chaining variables */ 01524 #define COMBA_CLEAR \ 01525 c0 = c1 = c2 = 0; 01526 01527 /* forward the carry to the next digit */ 01528 #define COMBA_FORWARD \ 01529 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01530 01531 /* store the first sum */ 01532 #define COMBA_STORE(x) \ 01533 x = c0; 01534 01535 /* store the second sum [carry] */ 01536 #define COMBA_STORE2(x) \ 01537 x = c1; 01538 01539 /* anything you need at the end */ 01540 #define COMBA_FINI \ 01541 __asm__("emms"); 01542 01543 /* this should multiply i and j */ 01544 #define MULADD(i, j) \ 01545 __asm__( \ 01546 "movd %6,%%mm0 \n\t" \ 01547 "movd %7,%%mm1 \n\t" \ 01548 "pmuludq %%mm1,%%mm0\n\t" \ 01549 "movd %%mm0,%%eax \n\t" \ 01550 "psrlq $32,%%mm0 \n\t" \ 01551 "addl %%eax,%0 \n\t" \ 01552 "movd %%mm0,%%eax \n\t" \ 01553 "adcl %%eax,%1 \n\t" \ 01554 "adcl $0,%2 \n\t" \ 01555 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc"); 01556 01557 #elif defined(TFM_ARM) 01558 /* ARM code */ 01559 01560 #define COMBA_START 01561 01562 #define COMBA_CLEAR \ 01563 c0 = c1 = c2 = 0; 01564 01565 #define COMBA_FORWARD \ 01566 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01567 01568 #define COMBA_STORE(x) \ 01569 x = c0; 01570 01571 #define COMBA_STORE2(x) \ 01572 x = c1; 01573 01574 #define COMBA_FINI 01575 01576 #define MULADD(i, j) \ 01577 __asm__( \ 01578 " UMULL r0,r1,%6,%7 \n\t" \ 01579 " ADDS %0,%0,r0 \n\t" \ 01580 " ADCS %1,%1,r1 \n\t" \ 01581 " ADC %2,%2,#0 \n\t" \ 01582 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 01583 01584 #elif defined(TFM_PPC32) 01585 /* For 32-bit PPC */ 01586 01587 #define COMBA_START 01588 01589 #define COMBA_CLEAR \ 01590 c0 = c1 = c2 = 0; 01591 01592 #define COMBA_FORWARD \ 01593 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01594 01595 #define COMBA_STORE(x) \ 01596 x = c0; 01597 01598 #define COMBA_STORE2(x) \ 01599 x = c1; 01600 01601 #define COMBA_FINI 01602 01603 /* untested: will mulhwu change the flags? Docs say no */ 01604 #define MULADD(i, j) \ 01605 __asm__( \ 01606 " mullw 16,%6,%7 \n\t" \ 01607 " addc %0,%0,16 \n\t" \ 01608 " mulhwu 16,%6,%7 \n\t" \ 01609 " adde %1,%1,16 \n\t" \ 01610 " addze %2,%2 \n\t" \ 01611 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01612 01613 #elif defined(TFM_PPC64) 01614 /* For 64-bit PPC */ 01615 01616 #define COMBA_START 01617 01618 #define COMBA_CLEAR \ 01619 c0 = c1 = c2 = 0; 01620 01621 #define COMBA_FORWARD \ 01622 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01623 01624 #define COMBA_STORE(x) \ 01625 x = c0; 01626 01627 #define COMBA_STORE2(x) \ 01628 x = c1; 01629 01630 #define COMBA_FINI 01631 01632 /* untested: will mulhdu change the flags? Docs say no */ 01633 #define MULADD(i, j) \ 01634 ____asm__( \ 01635 " mulld r16,%6,%7 \n\t" \ 01636 " addc %0,%0,16 \n\t" \ 01637 " mulhdu r16,%6,%7 \n\t" \ 01638 " adde %1,%1,16 \n\t" \ 01639 " addze %2,%2 \n\t" \ 01640 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16"); 01641 01642 #elif defined(TFM_AVR32) 01643 01644 /* ISO C code */ 01645 01646 #define COMBA_START 01647 01648 #define COMBA_CLEAR \ 01649 c0 = c1 = c2 = 0; 01650 01651 #define COMBA_FORWARD \ 01652 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01653 01654 #define COMBA_STORE(x) \ 01655 x = c0; 01656 01657 #define COMBA_STORE2(x) \ 01658 x = c1; 01659 01660 #define COMBA_FINI 01661 01662 #define MULADD(i, j) \ 01663 ____asm__( \ 01664 " mulu.d r2,%6,%7 \n\t"\ 01665 " add %0,r2 \n\t"\ 01666 " adc %1,%1,r3 \n\t"\ 01667 " acr %2 \n\t"\ 01668 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3"); 01669 01670 #elif defined(TFM_MIPS) 01671 01672 /* MIPS */ 01673 #define COMBA_START 01674 01675 #define COMBA_CLEAR \ 01676 c0 = c1 = c2 = 0; 01677 01678 #define COMBA_FORWARD \ 01679 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01680 01681 #define COMBA_STORE(x) \ 01682 x = c0; 01683 01684 #define COMBA_STORE2(x) \ 01685 x = c1; 01686 01687 #define COMBA_FINI 01688 01689 #define MULADD(i, j) \ 01690 __asm__( \ 01691 " multu %6,%7 \n\t" \ 01692 " mflo $12 \n\t" \ 01693 " mfhi $13 \n\t" \ 01694 " addu %0,%0,$12 \n\t" \ 01695 " sltu $12,%0,$12 \n\t" \ 01696 " addu %1,%1,$13 \n\t" \ 01697 " sltu $13,%1,$13 \n\t" \ 01698 " addu %1,%1,$12 \n\t" \ 01699 " sltu $12,%1,$12 \n\t" \ 01700 " addu %2,%2,$13 \n\t" \ 01701 " addu %2,%2,$12 \n\t" \ 01702 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13"); 01703 01704 #else 01705 /* ISO C code */ 01706 01707 #define COMBA_START 01708 01709 #define COMBA_CLEAR \ 01710 c0 = c1 = c2 = 0; 01711 01712 #define COMBA_FORWARD \ 01713 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01714 01715 #define COMBA_STORE(x) \ 01716 x = c0; 01717 01718 #define COMBA_STORE2(x) \ 01719 x = c1; 01720 01721 #define COMBA_FINI 01722 01723 #define MULADD(i, j) \ 01724 do { fp_word t; \ 01725 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); \ 01726 c0 = (fp_digit)t; \ 01727 t = (fp_word)c1 + (t >> DIGIT_BIT); \ 01728 c1 = (fp_digit)t; \ 01729 c2 += (fp_digit)(t >> DIGIT_BIT); \ 01730 } while (0); 01731 01732 #endif 01733 01734 01735 #ifdef TFM_SMALL_SET 01736 #include "fp_mul_comba_small_set.i" 01737 #endif 01738 01739 #if defined(TFM_MUL3) && FP_SIZE >= 6 01740 #include "fp_mul_comba_3.i" 01741 #endif 01742 #if defined(TFM_MUL4) && FP_SIZE >= 8 01743 #include "fp_mul_comba_4.i" 01744 #endif 01745 #if defined(TFM_MUL6) && FP_SIZE >= 12 01746 #include "fp_mul_comba_6.i" 01747 #endif 01748 #if defined(TFM_MUL7) && FP_SIZE >= 14 01749 #include "fp_mul_comba_7.i" 01750 #endif 01751 #if defined(TFM_MUL8) && FP_SIZE >= 16 01752 #include "fp_mul_comba_8.i" 01753 #endif 01754 #if defined(TFM_MUL9) && FP_SIZE >= 18 01755 #include "fp_mul_comba_9.i" 01756 #endif 01757 #if defined(TFM_MUL12) && FP_SIZE >= 24 01758 #include "fp_mul_comba_12.i" 01759 #endif 01760 #if defined(TFM_MUL17) && FP_SIZE >= 34 01761 #include "fp_mul_comba_17.i" 01762 #endif 01763 #if defined(TFM_MUL20) && FP_SIZE >= 40 01764 #include "fp_mul_comba_20.i" 01765 #endif 01766 #if defined(TFM_MUL24) && FP_SIZE >= 48 01767 #include "fp_mul_comba_24.i" 01768 #endif 01769 #if defined(TFM_MUL28) && FP_SIZE >= 56 01770 #include "fp_mul_comba_28.i" 01771 #endif 01772 #if defined(TFM_MUL32) && FP_SIZE >= 64 01773 #include "fp_mul_comba_32.i" 01774 #endif 01775 #if defined(TFM_MUL48) && FP_SIZE >= 96 01776 #include "fp_mul_comba_48.i" 01777 #endif 01778 #if defined(TFM_MUL64) && FP_SIZE >= 128 01779 #include "fp_mul_comba_64.i" 01780 #endif 01781 01782 /* end fp_mul_comba.c asm */ 01783 01784
Generated on Tue Jul 12 2022 16:58:03 by 1.7.2