Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository's Zip archive or clone it locally using Mercurial.
It is also possible to export all of your personal repositories from the account settings page.
asm.c
00001 /* asm.c 00002 * 00003 * Copyright (C) 2006-2013 wolfSSL Inc. 00004 * 00005 * This file is part of CyaSSL. 00006 * 00007 * CyaSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * CyaSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA 00020 */ 00021 00022 #ifdef HAVE_CONFIG_H 00023 #include <config.h> 00024 #endif 00025 00026 #include <cyassl/ctaocrypt/settings.h> 00027 00028 /* 00029 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca, 00030 * http://math.libtomcrypt.com 00031 */ 00032 00033 00034 /******************************************************************/ 00035 /* fp_montgomery_reduce.c asm or generic */ 00036 #if defined(TFM_X86) && !defined(TFM_SSE2) 00037 /* x86-32 code */ 00038 00039 #define MONT_START 00040 #define MONT_FINI 00041 #define LOOP_END 00042 #define LOOP_START \ 00043 mu = c[x] * mp 00044 00045 #define INNERMUL \ 00046 __asm__( \ 00047 "movl %5,%%eax \n\t" \ 00048 "mull %4 \n\t" \ 00049 "addl %1,%%eax \n\t" \ 00050 "adcl $0,%%edx \n\t" \ 00051 "addl %%eax,%0 \n\t" \ 00052 "adcl $0,%%edx \n\t" \ 00053 "movl %%edx,%1 \n\t" \ 00054 :"=g"(_c[LO]), "=r"(cy) \ 00055 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ 00056 : "%eax", "%edx", "cc") 00057 00058 #define PROPCARRY \ 00059 __asm__( \ 00060 "addl %1,%0 \n\t" \ 00061 "setb %%al \n\t" \ 00062 "movzbl %%al,%1 \n\t" \ 00063 
:"=g"(_c[LO]), "=r"(cy) \ 00064 :"0"(_c[LO]), "1"(cy) \ 00065 : "%eax", "cc") 00066 00067 /******************************************************************/ 00068 #elif defined(TFM_X86_64) 00069 /* x86-64 code */ 00070 00071 #define MONT_START 00072 #define MONT_FINI 00073 #define LOOP_END 00074 #define LOOP_START \ 00075 mu = c[x] * mp 00076 00077 #define INNERMUL \ 00078 __asm__( \ 00079 "movq %5,%%rax \n\t" \ 00080 "mulq %4 \n\t" \ 00081 "addq %1,%%rax \n\t" \ 00082 "adcq $0,%%rdx \n\t" \ 00083 "addq %%rax,%0 \n\t" \ 00084 "adcq $0,%%rdx \n\t" \ 00085 "movq %%rdx,%1 \n\t" \ 00086 :"=g"(_c[LO]), "=r"(cy) \ 00087 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00088 : "%rax", "%rdx", "cc") 00089 00090 #define INNERMUL8 \ 00091 __asm__( \ 00092 "movq 0(%5),%%rax \n\t" \ 00093 "movq 0(%2),%%r10 \n\t" \ 00094 "movq 0x8(%5),%%r11 \n\t" \ 00095 "mulq %4 \n\t" \ 00096 "addq %%r10,%%rax \n\t" \ 00097 "adcq $0,%%rdx \n\t" \ 00098 "movq 0x8(%2),%%r10 \n\t" \ 00099 "addq %3,%%rax \n\t" \ 00100 "adcq $0,%%rdx \n\t" \ 00101 "movq %%rax,0(%0) \n\t" \ 00102 "movq %%rdx,%1 \n\t" \ 00103 \ 00104 "movq %%r11,%%rax \n\t" \ 00105 "movq 0x10(%5),%%r11 \n\t" \ 00106 "mulq %4 \n\t" \ 00107 "addq %%r10,%%rax \n\t" \ 00108 "adcq $0,%%rdx \n\t" \ 00109 "movq 0x10(%2),%%r10 \n\t" \ 00110 "addq %3,%%rax \n\t" \ 00111 "adcq $0,%%rdx \n\t" \ 00112 "movq %%rax,0x8(%0) \n\t" \ 00113 "movq %%rdx,%1 \n\t" \ 00114 \ 00115 "movq %%r11,%%rax \n\t" \ 00116 "movq 0x18(%5),%%r11 \n\t" \ 00117 "mulq %4 \n\t" \ 00118 "addq %%r10,%%rax \n\t" \ 00119 "adcq $0,%%rdx \n\t" \ 00120 "movq 0x18(%2),%%r10 \n\t" \ 00121 "addq %3,%%rax \n\t" \ 00122 "adcq $0,%%rdx \n\t" \ 00123 "movq %%rax,0x10(%0) \n\t" \ 00124 "movq %%rdx,%1 \n\t" \ 00125 \ 00126 "movq %%r11,%%rax \n\t" \ 00127 "movq 0x20(%5),%%r11 \n\t" \ 00128 "mulq %4 \n\t" \ 00129 "addq %%r10,%%rax \n\t" \ 00130 "adcq $0,%%rdx \n\t" \ 00131 "movq 0x20(%2),%%r10 \n\t" \ 00132 "addq %3,%%rax \n\t" \ 00133 "adcq $0,%%rdx \n\t" \ 00134 "movq %%rax,0x18(%0) 
\n\t" \ 00135 "movq %%rdx,%1 \n\t" \ 00136 \ 00137 "movq %%r11,%%rax \n\t" \ 00138 "movq 0x28(%5),%%r11 \n\t" \ 00139 "mulq %4 \n\t" \ 00140 "addq %%r10,%%rax \n\t" \ 00141 "adcq $0,%%rdx \n\t" \ 00142 "movq 0x28(%2),%%r10 \n\t" \ 00143 "addq %3,%%rax \n\t" \ 00144 "adcq $0,%%rdx \n\t" \ 00145 "movq %%rax,0x20(%0) \n\t" \ 00146 "movq %%rdx,%1 \n\t" \ 00147 \ 00148 "movq %%r11,%%rax \n\t" \ 00149 "movq 0x30(%5),%%r11 \n\t" \ 00150 "mulq %4 \n\t" \ 00151 "addq %%r10,%%rax \n\t" \ 00152 "adcq $0,%%rdx \n\t" \ 00153 "movq 0x30(%2),%%r10 \n\t" \ 00154 "addq %3,%%rax \n\t" \ 00155 "adcq $0,%%rdx \n\t" \ 00156 "movq %%rax,0x28(%0) \n\t" \ 00157 "movq %%rdx,%1 \n\t" \ 00158 \ 00159 "movq %%r11,%%rax \n\t" \ 00160 "movq 0x38(%5),%%r11 \n\t" \ 00161 "mulq %4 \n\t" \ 00162 "addq %%r10,%%rax \n\t" \ 00163 "adcq $0,%%rdx \n\t" \ 00164 "movq 0x38(%2),%%r10 \n\t" \ 00165 "addq %3,%%rax \n\t" \ 00166 "adcq $0,%%rdx \n\t" \ 00167 "movq %%rax,0x30(%0) \n\t" \ 00168 "movq %%rdx,%1 \n\t" \ 00169 \ 00170 "movq %%r11,%%rax \n\t" \ 00171 "mulq %4 \n\t" \ 00172 "addq %%r10,%%rax \n\t" \ 00173 "adcq $0,%%rdx \n\t" \ 00174 "addq %3,%%rax \n\t" \ 00175 "adcq $0,%%rdx \n\t" \ 00176 "movq %%rax,0x38(%0) \n\t" \ 00177 "movq %%rdx,%1 \n\t" \ 00178 \ 00179 :"=r"(_c), "=r"(cy) \ 00180 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ 00181 : "%rax", "%rdx", "%r10", "%r11", "cc") 00182 00183 00184 #define PROPCARRY \ 00185 __asm__( \ 00186 "addq %1,%0 \n\t" \ 00187 "setb %%al \n\t" \ 00188 "movzbq %%al,%1 \n\t" \ 00189 :"=g"(_c[LO]), "=r"(cy) \ 00190 :"0"(_c[LO]), "1"(cy) \ 00191 : "%rax", "cc") 00192 00193 /******************************************************************/ 00194 #elif defined(TFM_SSE2) 00195 /* SSE2 code (assumes 32-bit fp_digits) */ 00196 /* XMM register assignments: 00197 * xmm0 *tmpm++, then Mu * (*tmpm++) 00198 * xmm1 c[x], then Mu 00199 * xmm2 mp 00200 * xmm3 cy 00201 * xmm4 _c[LO] 00202 */ 00203 00204 #define MONT_START \ 00205 __asm__("movd %0,%%mm2"::"g"(mp)) 00206 00207 #define 
MONT_FINI \ 00208 __asm__("emms") 00209 00210 #define LOOP_START \ 00211 __asm__( \ 00212 "movd %0,%%mm1 \n\t" \ 00213 "pxor %%mm3,%%mm3 \n\t" \ 00214 "pmuludq %%mm2,%%mm1 \n\t" \ 00215 :: "g"(c[x])) 00216 00217 /* pmuludq on mmx registers does a 32x32->64 multiply. */ 00218 #define INNERMUL \ 00219 __asm__( \ 00220 "movd %1,%%mm4 \n\t" \ 00221 "movd %2,%%mm0 \n\t" \ 00222 "paddq %%mm4,%%mm3 \n\t" \ 00223 "pmuludq %%mm1,%%mm0 \n\t" \ 00224 "paddq %%mm0,%%mm3 \n\t" \ 00225 "movd %%mm3,%0 \n\t" \ 00226 "psrlq $32, %%mm3 \n\t" \ 00227 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) ); 00228 00229 #define INNERMUL8 \ 00230 __asm__( \ 00231 "movd 0(%1),%%mm4 \n\t" \ 00232 "movd 0(%2),%%mm0 \n\t" \ 00233 "paddq %%mm4,%%mm3 \n\t" \ 00234 "pmuludq %%mm1,%%mm0 \n\t" \ 00235 "movd 4(%2),%%mm5 \n\t" \ 00236 "paddq %%mm0,%%mm3 \n\t" \ 00237 "movd 4(%1),%%mm6 \n\t" \ 00238 "movd %%mm3,0(%0) \n\t" \ 00239 "psrlq $32, %%mm3 \n\t" \ 00240 \ 00241 "paddq %%mm6,%%mm3 \n\t" \ 00242 "pmuludq %%mm1,%%mm5 \n\t" \ 00243 "movd 8(%2),%%mm6 \n\t" \ 00244 "paddq %%mm5,%%mm3 \n\t" \ 00245 "movd 8(%1),%%mm7 \n\t" \ 00246 "movd %%mm3,4(%0) \n\t" \ 00247 "psrlq $32, %%mm3 \n\t" \ 00248 \ 00249 "paddq %%mm7,%%mm3 \n\t" \ 00250 "pmuludq %%mm1,%%mm6 \n\t" \ 00251 "movd 12(%2),%%mm7 \n\t" \ 00252 "paddq %%mm6,%%mm3 \n\t" \ 00253 "movd 12(%1),%%mm5 \n\t" \ 00254 "movd %%mm3,8(%0) \n\t" \ 00255 "psrlq $32, %%mm3 \n\t" \ 00256 \ 00257 "paddq %%mm5,%%mm3 \n\t" \ 00258 "pmuludq %%mm1,%%mm7 \n\t" \ 00259 "movd 16(%2),%%mm5 \n\t" \ 00260 "paddq %%mm7,%%mm3 \n\t" \ 00261 "movd 16(%1),%%mm6 \n\t" \ 00262 "movd %%mm3,12(%0) \n\t" \ 00263 "psrlq $32, %%mm3 \n\t" \ 00264 \ 00265 "paddq %%mm6,%%mm3 \n\t" \ 00266 "pmuludq %%mm1,%%mm5 \n\t" \ 00267 "movd 20(%2),%%mm6 \n\t" \ 00268 "paddq %%mm5,%%mm3 \n\t" \ 00269 "movd 20(%1),%%mm7 \n\t" \ 00270 "movd %%mm3,16(%0) \n\t" \ 00271 "psrlq $32, %%mm3 \n\t" \ 00272 \ 00273 "paddq %%mm7,%%mm3 \n\t" \ 00274 "pmuludq %%mm1,%%mm6 \n\t" \ 00275 "movd 24(%2),%%mm7 \n\t" \ 00276 
"paddq %%mm6,%%mm3 \n\t" \ 00277 "movd 24(%1),%%mm5 \n\t" \ 00278 "movd %%mm3,20(%0) \n\t" \ 00279 "psrlq $32, %%mm3 \n\t" \ 00280 \ 00281 "paddq %%mm5,%%mm3 \n\t" \ 00282 "pmuludq %%mm1,%%mm7 \n\t" \ 00283 "movd 28(%2),%%mm5 \n\t" \ 00284 "paddq %%mm7,%%mm3 \n\t" \ 00285 "movd 28(%1),%%mm6 \n\t" \ 00286 "movd %%mm3,24(%0) \n\t" \ 00287 "psrlq $32, %%mm3 \n\t" \ 00288 \ 00289 "paddq %%mm6,%%mm3 \n\t" \ 00290 "pmuludq %%mm1,%%mm5 \n\t" \ 00291 "paddq %%mm5,%%mm3 \n\t" \ 00292 "movd %%mm3,28(%0) \n\t" \ 00293 "psrlq $32, %%mm3 \n\t" \ 00294 :"=r"(_c) : "0"(_c), "r"(tmpm) ); 00295 00296 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack 00297 pointer */ 00298 00299 #define LOOP_END \ 00300 __asm__( "movd %%mm3,%0 \n" :"=r"(cy)) 00301 00302 #define PROPCARRY \ 00303 __asm__( \ 00304 "addl %1,%0 \n\t" \ 00305 "setb %%al \n\t" \ 00306 "movzbl %%al,%1 \n\t" \ 00307 :"=g"(_c[LO]), "=r"(cy) \ 00308 :"0"(_c[LO]), "1"(cy) \ 00309 : "%eax", "cc") 00310 00311 /******************************************************************/ 00312 #elif defined(TFM_ARM) 00313 /* ARMv4 code */ 00314 00315 #define MONT_START 00316 #define MONT_FINI 00317 #define LOOP_END 00318 #define LOOP_START \ 00319 mu = c[x] * mp 00320 00321 00322 #ifdef __thumb__ 00323 00324 #define INNERMUL \ 00325 __asm__( \ 00326 " LDR r0,%1 \n\t" \ 00327 " ADDS r0,r0,%0 \n\t" \ 00328 " ITE CS \n\t" \ 00329 " MOVCS %0,#1 \n\t" \ 00330 " MOVCC %0,#0 \n\t" \ 00331 " UMLAL r0,%0,%3,%4 \n\t" \ 00332 " STR r0,%1 \n\t" \ 00333 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc"); 00334 00335 #define PROPCARRY \ 00336 __asm__( \ 00337 " LDR r0,%1 \n\t" \ 00338 " ADDS r0,r0,%0 \n\t" \ 00339 " STR r0,%1 \n\t" \ 00340 " ITE CS \n\t" \ 00341 " MOVCS %0,#1 \n\t" \ 00342 " MOVCC %0,#0 \n\t" \ 00343 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc"); 00344 00345 00346 /* TAO thumb mode uses ite (if then else) to detect carry directly 00347 * fixed unmatched constraint warning by 
changing 1 to m */ 00348 00349 #else /* __thumb__ */ 00350 00351 #define INNERMUL \ 00352 __asm__( \ 00353 " LDR r0,%1 \n\t" \ 00354 " ADDS r0,r0,%0 \n\t" \ 00355 " MOVCS %0,#1 \n\t" \ 00356 " MOVCC %0,#0 \n\t" \ 00357 " UMLAL r0,%0,%3,%4 \n\t" \ 00358 " STR r0,%1 \n\t" \ 00359 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc"); 00360 00361 #define PROPCARRY \ 00362 __asm__( \ 00363 " LDR r0,%1 \n\t" \ 00364 " ADDS r0,r0,%0 \n\t" \ 00365 " STR r0,%1 \n\t" \ 00366 " MOVCS %0,#1 \n\t" \ 00367 " MOVCC %0,#0 \n\t" \ 00368 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc"); 00369 00370 #endif /* __thumb__ */ 00371 00372 #elif defined(TFM_PPC32) 00373 00374 /* PPC32 */ 00375 #define MONT_START 00376 #define MONT_FINI 00377 #define LOOP_END 00378 #define LOOP_START \ 00379 mu = c[x] * mp 00380 00381 #define INNERMUL \ 00382 __asm__( \ 00383 " mullw 16,%3,%4 \n\t" \ 00384 " mulhwu 17,%3,%4 \n\t" \ 00385 " addc 16,16,%0 \n\t" \ 00386 " addze 17,17 \n\t" \ 00387 " lwz 18,%1 \n\t" \ 00388 " addc 16,16,18 \n\t" \ 00389 " addze %0,17 \n\t" \ 00390 " stw 16,%1 \n\t" \ 00391 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm; 00392 00393 #define PROPCARRY \ 00394 __asm__( \ 00395 " lwz 16,%1 \n\t" \ 00396 " addc 16,16,%0 \n\t" \ 00397 " stw 16,%1 \n\t" \ 00398 " xor %0,%0,%0 \n\t" \ 00399 " addze %0,%0 \n\t" \ 00400 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc"); 00401 00402 #elif defined(TFM_PPC64) 00403 00404 /* PPC64 */ 00405 #define MONT_START 00406 #define MONT_FINI 00407 #define LOOP_END 00408 #define LOOP_START \ 00409 mu = c[x] * mp 00410 00411 #define INNERMUL \ 00412 __asm__( \ 00413 " mulld 16,%3,%4 \n\t" \ 00414 " mulhdu 17,%3,%4 \n\t" \ 00415 " addc 16,16,%0 \n\t" \ 00416 " addze 17,17 \n\t" \ 00417 " ldx 18,0,%1 \n\t" \ 00418 " addc 16,16,18 \n\t" \ 00419 " addze %0,17 \n\t" \ 00420 " sdx 16,0,%1 \n\t" \ 00421 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); 
++tmpm; 00422 00423 #define PROPCARRY \ 00424 __asm__( \ 00425 " ldx 16,0,%1 \n\t" \ 00426 " addc 16,16,%0 \n\t" \ 00427 " sdx 16,0,%1 \n\t" \ 00428 " xor %0,%0,%0 \n\t" \ 00429 " addze %0,%0 \n\t" \ 00430 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc"); 00431 00432 /******************************************************************/ 00433 00434 #elif defined(TFM_AVR32) 00435 00436 /* AVR32 */ 00437 #define MONT_START 00438 #define MONT_FINI 00439 #define LOOP_END 00440 #define LOOP_START \ 00441 mu = c[x] * mp 00442 00443 #define INNERMUL \ 00444 __asm__( \ 00445 " ld.w r2,%1 \n\t" \ 00446 " add r2,%0 \n\t" \ 00447 " eor r3,r3 \n\t" \ 00448 " acr r3 \n\t" \ 00449 " macu.d r2,%3,%4 \n\t" \ 00450 " st.w %1,r2 \n\t" \ 00451 " mov %0,r3 \n\t" \ 00452 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3"); 00453 00454 #define PROPCARRY \ 00455 __asm__( \ 00456 " ld.w r2,%1 \n\t" \ 00457 " add r2,%0 \n\t" \ 00458 " st.w %1,r2 \n\t" \ 00459 " eor %0,%0 \n\t" \ 00460 " acr %0 \n\t" \ 00461 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc"); 00462 00463 #else 00464 00465 /* ISO C code */ 00466 #define MONT_START 00467 #define MONT_FINI 00468 #define LOOP_END 00469 #define LOOP_START \ 00470 mu = c[x] * mp 00471 00472 #define INNERMUL \ 00473 do { fp_word t; \ 00474 t = ((fp_word)_c[0] + (fp_word)cy) + \ 00475 (((fp_word)mu) * ((fp_word)*tmpm++)); \ 00476 _c[0] = (fp_digit)t; \ 00477 cy = (fp_digit)(t >> DIGIT_BIT); \ 00478 } while (0) 00479 00480 #define PROPCARRY \ 00481 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0) 00482 00483 #endif 00484 /******************************************************************/ 00485 00486 00487 #define LO 0 00488 /* end fp_montogomery_reduce.c asm */ 00489 00490 00491 /* start fp_sqr_comba.c asm */ 00492 #if defined(TFM_X86) 00493 00494 /* x86-32 optimized */ 00495 00496 #define COMBA_START 00497 00498 #define CLEAR_CARRY \ 00499 c0 = c1 = c2 = 0; 00500 00501 #define COMBA_STORE(x) \ 00502 x = c0; 00503 00504 
#define COMBA_STORE2(x) \ 00505 x = c1; 00506 00507 #define CARRY_FORWARD \ 00508 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00509 00510 #define COMBA_FINI 00511 00512 #define SQRADD(i, j) \ 00513 __asm__( \ 00514 "movl %6,%%eax \n\t" \ 00515 "mull %%eax \n\t" \ 00516 "addl %%eax,%0 \n\t" \ 00517 "adcl %%edx,%1 \n\t" \ 00518 "adcl $0,%2 \n\t" \ 00519 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc"); 00520 00521 #define SQRADD2(i, j) \ 00522 __asm__( \ 00523 "movl %6,%%eax \n\t" \ 00524 "mull %7 \n\t" \ 00525 "addl %%eax,%0 \n\t" \ 00526 "adcl %%edx,%1 \n\t" \ 00527 "adcl $0,%2 \n\t" \ 00528 "addl %%eax,%0 \n\t" \ 00529 "adcl %%edx,%1 \n\t" \ 00530 "adcl $0,%2 \n\t" \ 00531 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc"); 00532 00533 #define SQRADDSC(i, j) \ 00534 __asm__( \ 00535 "movl %3,%%eax \n\t" \ 00536 "mull %4 \n\t" \ 00537 "movl %%eax,%0 \n\t" \ 00538 "movl %%edx,%1 \n\t" \ 00539 "xorl %2,%2 \n\t" \ 00540 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc"); 00541 00542 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00543 00544 #define SQRADDAC(i, j) \ 00545 __asm__( \ 00546 "movl %6,%%eax \n\t" \ 00547 "mull %7 \n\t" \ 00548 "addl %%eax,%0 \n\t" \ 00549 "adcl %%edx,%1 \n\t" \ 00550 "adcl $0,%2 \n\t" \ 00551 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc"); 00552 00553 #define SQRADDDB \ 00554 __asm__( \ 00555 "addl %6,%0 \n\t" \ 00556 "adcl %7,%1 \n\t" \ 00557 "adcl %8,%2 \n\t" \ 00558 "addl %6,%0 \n\t" \ 00559 "adcl %7,%1 \n\t" \ 00560 "adcl %8,%2 \n\t" \ 00561 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00562 00563 #elif defined(TFM_X86_64) 00564 /* x86-64 optimized */ 00565 00566 #define COMBA_START 00567 00568 #define CLEAR_CARRY \ 00569 c0 = c1 = c2 = 0; 00570 00571 #define COMBA_STORE(x) \ 00572 x = c0; 00573 00574 
#define COMBA_STORE2(x) \ 00575 x = c1; 00576 00577 #define CARRY_FORWARD \ 00578 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00579 00580 #define COMBA_FINI 00581 00582 #define SQRADD(i, j) \ 00583 __asm__( \ 00584 "movq %6,%%rax \n\t" \ 00585 "mulq %%rax \n\t" \ 00586 "addq %%rax,%0 \n\t" \ 00587 "adcq %%rdx,%1 \n\t" \ 00588 "adcq $0,%2 \n\t" \ 00589 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc"); 00590 00591 #define SQRADD2(i, j) \ 00592 __asm__( \ 00593 "movq %6,%%rax \n\t" \ 00594 "mulq %7 \n\t" \ 00595 "addq %%rax,%0 \n\t" \ 00596 "adcq %%rdx,%1 \n\t" \ 00597 "adcq $0,%2 \n\t" \ 00598 "addq %%rax,%0 \n\t" \ 00599 "adcq %%rdx,%1 \n\t" \ 00600 "adcq $0,%2 \n\t" \ 00601 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00602 00603 #define SQRADDSC(i, j) \ 00604 __asm__( \ 00605 "movq %3,%%rax \n\t" \ 00606 "mulq %4 \n\t" \ 00607 "movq %%rax,%0 \n\t" \ 00608 "movq %%rdx,%1 \n\t" \ 00609 "xorq %2,%2 \n\t" \ 00610 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc"); 00611 00612 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00613 00614 #define SQRADDAC(i, j) \ 00615 __asm__( \ 00616 "movq %6,%%rax \n\t" \ 00617 "mulq %7 \n\t" \ 00618 "addq %%rax,%0 \n\t" \ 00619 "adcq %%rdx,%1 \n\t" \ 00620 "adcq $0,%2 \n\t" \ 00621 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00622 00623 #define SQRADDDB \ 00624 __asm__( \ 00625 "addq %6,%0 \n\t" \ 00626 "adcq %7,%1 \n\t" \ 00627 "adcq %8,%2 \n\t" \ 00628 "addq %6,%0 \n\t" \ 00629 "adcq %7,%1 \n\t" \ 00630 "adcq %8,%2 \n\t" \ 00631 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00632 00633 #elif defined(TFM_SSE2) 00634 00635 /* SSE2 Optimized */ 00636 #define COMBA_START 00637 00638 #define CLEAR_CARRY \ 00639 c0 = c1 = c2 = 0; 00640 00641 #define COMBA_STORE(x) \ 00642 x = c0; 00643 00644 #define 
COMBA_STORE2(x) \ 00645 x = c1; 00646 00647 #define CARRY_FORWARD \ 00648 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00649 00650 #define COMBA_FINI \ 00651 __asm__("emms"); 00652 00653 #define SQRADD(i, j) \ 00654 __asm__( \ 00655 "movd %6,%%mm0 \n\t" \ 00656 "pmuludq %%mm0,%%mm0\n\t" \ 00657 "movd %%mm0,%%eax \n\t" \ 00658 "psrlq $32,%%mm0 \n\t" \ 00659 "addl %%eax,%0 \n\t" \ 00660 "movd %%mm0,%%eax \n\t" \ 00661 "adcl %%eax,%1 \n\t" \ 00662 "adcl $0,%2 \n\t" \ 00663 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc"); 00664 00665 #define SQRADD2(i, j) \ 00666 __asm__( \ 00667 "movd %6,%%mm0 \n\t" \ 00668 "movd %7,%%mm1 \n\t" \ 00669 "pmuludq %%mm1,%%mm0\n\t" \ 00670 "movd %%mm0,%%eax \n\t" \ 00671 "psrlq $32,%%mm0 \n\t" \ 00672 "movd %%mm0,%%edx \n\t" \ 00673 "addl %%eax,%0 \n\t" \ 00674 "adcl %%edx,%1 \n\t" \ 00675 "adcl $0,%2 \n\t" \ 00676 "addl %%eax,%0 \n\t" \ 00677 "adcl %%edx,%1 \n\t" \ 00678 "adcl $0,%2 \n\t" \ 00679 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00680 00681 #define SQRADDSC(i, j) \ 00682 __asm__( \ 00683 "movd %3,%%mm0 \n\t" \ 00684 "movd %4,%%mm1 \n\t" \ 00685 "pmuludq %%mm1,%%mm0\n\t" \ 00686 "movd %%mm0,%0 \n\t" \ 00687 "psrlq $32,%%mm0 \n\t" \ 00688 "movd %%mm0,%1 \n\t" \ 00689 "xorl %2,%2 \n\t" \ 00690 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j)); 00691 00692 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00693 00694 #define SQRADDAC(i, j) \ 00695 __asm__( \ 00696 "movd %6,%%mm0 \n\t" \ 00697 "movd %7,%%mm1 \n\t" \ 00698 "pmuludq %%mm1,%%mm0\n\t" \ 00699 "movd %%mm0,%%eax \n\t" \ 00700 "psrlq $32,%%mm0 \n\t" \ 00701 "movd %%mm0,%%edx \n\t" \ 00702 "addl %%eax,%0 \n\t" \ 00703 "adcl %%edx,%1 \n\t" \ 00704 "adcl $0,%2 \n\t" \ 00705 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00706 00707 #define SQRADDDB \ 00708 __asm__( \ 00709 "addl %6,%0 \n\t" \ 00710 "adcl %7,%1 \n\t" \ 
00711 "adcl %8,%2 \n\t" \ 00712 "addl %6,%0 \n\t" \ 00713 "adcl %7,%1 \n\t" \ 00714 "adcl %8,%2 \n\t" \ 00715 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00716 00717 #elif defined(TFM_ARM) 00718 00719 /* ARM code */ 00720 00721 #define COMBA_START 00722 00723 #define CLEAR_CARRY \ 00724 c0 = c1 = c2 = 0; 00725 00726 #define COMBA_STORE(x) \ 00727 x = c0; 00728 00729 #define COMBA_STORE2(x) \ 00730 x = c1; 00731 00732 #define CARRY_FORWARD \ 00733 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00734 00735 #define COMBA_FINI 00736 00737 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00738 #define SQRADD(i, j) \ 00739 __asm__( \ 00740 " UMULL r0,r1,%6,%6 \n\t" \ 00741 " ADDS %0,%0,r0 \n\t" \ 00742 " ADCS %1,%1,r1 \n\t" \ 00743 " ADC %2,%2,#0 \n\t" \ 00744 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc"); 00745 00746 /* for squaring some of the terms are doubled... */ 00747 #define SQRADD2(i, j) \ 00748 __asm__( \ 00749 " UMULL r0,r1,%6,%7 \n\t" \ 00750 " ADDS %0,%0,r0 \n\t" \ 00751 " ADCS %1,%1,r1 \n\t" \ 00752 " ADC %2,%2,#0 \n\t" \ 00753 " ADDS %0,%0,r0 \n\t" \ 00754 " ADCS %1,%1,r1 \n\t" \ 00755 " ADC %2,%2,#0 \n\t" \ 00756 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00757 00758 #define SQRADDSC(i, j) \ 00759 __asm__( \ 00760 " UMULL %0,%1,%3,%4 \n\t" \ 00761 " SUB %2,%2,%2 \n\t" \ 00762 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc"); 00763 00764 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00765 00766 #define SQRADDAC(i, j) \ 00767 __asm__( \ 00768 " UMULL r0,r1,%6,%7 \n\t" \ 00769 " ADDS %0,%0,r0 \n\t" \ 00770 " ADCS %1,%1,r1 \n\t" \ 00771 " ADC %2,%2,#0 \n\t" \ 00772 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00773 00774 #define SQRADDDB \ 00775 __asm__( \ 00776 " ADDS %0,%0,%3 \n\t" \ 00777 " ADCS %1,%1,%4 \n\t" 
\ 00778 " ADC %2,%2,%5 \n\t" \ 00779 " ADDS %0,%0,%3 \n\t" \ 00780 " ADCS %1,%1,%4 \n\t" \ 00781 " ADC %2,%2,%5 \n\t" \ 00782 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00783 00784 #elif defined(TFM_PPC32) 00785 00786 /* PPC32 */ 00787 00788 #define COMBA_START 00789 00790 #define CLEAR_CARRY \ 00791 c0 = c1 = c2 = 0; 00792 00793 #define COMBA_STORE(x) \ 00794 x = c0; 00795 00796 #define COMBA_STORE2(x) \ 00797 x = c1; 00798 00799 #define CARRY_FORWARD \ 00800 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00801 00802 #define COMBA_FINI 00803 00804 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00805 #define SQRADD(i, j) \ 00806 __asm__( \ 00807 " mullw 16,%6,%6 \n\t" \ 00808 " addc %0,%0,16 \n\t" \ 00809 " mulhwu 16,%6,%6 \n\t" \ 00810 " adde %1,%1,16 \n\t" \ 00811 " addze %2,%2 \n\t" \ 00812 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc"); 00813 00814 /* for squaring some of the terms are doubled... 
*/ 00815 #define SQRADD2(i, j) \ 00816 __asm__( \ 00817 " mullw 16,%6,%7 \n\t" \ 00818 " mulhwu 17,%6,%7 \n\t" \ 00819 " addc %0,%0,16 \n\t" \ 00820 " adde %1,%1,17 \n\t" \ 00821 " addze %2,%2 \n\t" \ 00822 " addc %0,%0,16 \n\t" \ 00823 " adde %1,%1,17 \n\t" \ 00824 " addze %2,%2 \n\t" \ 00825 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc"); 00826 00827 #define SQRADDSC(i, j) \ 00828 __asm__( \ 00829 " mullw %0,%6,%7 \n\t" \ 00830 " mulhwu %1,%6,%7 \n\t" \ 00831 " xor %2,%2,%2 \n\t" \ 00832 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 00833 00834 #define SQRADDAC(i, j) \ 00835 __asm__( \ 00836 " mullw 16,%6,%7 \n\t" \ 00837 " addc %0,%0,16 \n\t" \ 00838 " mulhwu 16,%6,%7 \n\t" \ 00839 " adde %1,%1,16 \n\t" \ 00840 " addze %2,%2 \n\t" \ 00841 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc"); 00842 00843 #define SQRADDDB \ 00844 __asm__( \ 00845 " addc %0,%0,%3 \n\t" \ 00846 " adde %1,%1,%4 \n\t" \ 00847 " adde %2,%2,%5 \n\t" \ 00848 " addc %0,%0,%3 \n\t" \ 00849 " adde %1,%1,%4 \n\t" \ 00850 " adde %2,%2,%5 \n\t" \ 00851 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00852 00853 #elif defined(TFM_PPC64) 00854 /* PPC64 */ 00855 00856 #define COMBA_START 00857 00858 #define CLEAR_CARRY \ 00859 c0 = c1 = c2 = 0; 00860 00861 #define COMBA_STORE(x) \ 00862 x = c0; 00863 00864 #define COMBA_STORE2(x) \ 00865 x = c1; 00866 00867 #define CARRY_FORWARD \ 00868 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00869 00870 #define COMBA_FINI 00871 00872 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00873 #define SQRADD(i, j) \ 00874 __asm__( \ 00875 " mulld 16,%6,%6 \n\t" \ 00876 " addc %0,%0,16 \n\t" \ 00877 " mulhdu 16,%6,%6 \n\t" \ 00878 " adde %1,%1,16 \n\t" \ 00879 " addze %2,%2 \n\t" \ 00880 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc"); 00881 00882 /* for 
squaring some of the terms are doubled... */ 00883 #define SQRADD2(i, j) \ 00884 __asm__( \ 00885 " mulld 16,%6,%7 \n\t" \ 00886 " mulhdu 17,%6,%7 \n\t" \ 00887 " addc %0,%0,16 \n\t" \ 00888 " adde %1,%1,17 \n\t" \ 00889 " addze %2,%2 \n\t" \ 00890 " addc %0,%0,16 \n\t" \ 00891 " adde %1,%1,17 \n\t" \ 00892 " addze %2,%2 \n\t" \ 00893 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc"); 00894 00895 #define SQRADDSC(i, j) \ 00896 __asm__( \ 00897 " mulld %0,%6,%7 \n\t" \ 00898 " mulhdu %1,%6,%7 \n\t" \ 00899 " xor %2,%2,%2 \n\t" \ 00900 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 00901 00902 #define SQRADDAC(i, j) \ 00903 __asm__( \ 00904 " mulld 16,%6,%7 \n\t" \ 00905 " addc %0,%0,16 \n\t" \ 00906 " mulhdu 16,%6,%7 \n\t" \ 00907 " adde %1,%1,16 \n\t" \ 00908 " addze %2,%2 \n\t" \ 00909 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc"); 00910 00911 #define SQRADDDB \ 00912 __asm__( \ 00913 " addc %0,%0,%3 \n\t" \ 00914 " adde %1,%1,%4 \n\t" \ 00915 " adde %2,%2,%5 \n\t" \ 00916 " addc %0,%0,%3 \n\t" \ 00917 " adde %1,%1,%4 \n\t" \ 00918 " adde %2,%2,%5 \n\t" \ 00919 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00920 00921 00922 #elif defined(TFM_AVR32) 00923 00924 /* AVR32 */ 00925 00926 #define COMBA_START 00927 00928 #define CLEAR_CARRY \ 00929 c0 = c1 = c2 = 0; 00930 00931 #define COMBA_STORE(x) \ 00932 x = c0; 00933 00934 #define COMBA_STORE2(x) \ 00935 x = c1; 00936 00937 #define CARRY_FORWARD \ 00938 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00939 00940 #define COMBA_FINI 00941 00942 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00943 #define SQRADD(i, j) \ 00944 __asm__( \ 00945 " mulu.d r2,%6,%6 \n\t" \ 00946 " add %0,%0,r2 \n\t" \ 00947 " adc %1,%1,r3 \n\t" \ 00948 " acr %2 \n\t" \ 00949 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3"); 00950 
00951 /* for squaring some of the terms are doubled... */ 00952 #define SQRADD2(i, j) \ 00953 __asm__( \ 00954 " mulu.d r2,%6,%7 \n\t" \ 00955 " add %0,%0,r2 \n\t" \ 00956 " adc %1,%1,r3 \n\t" \ 00957 " acr %2, \n\t" \ 00958 " add %0,%0,r2 \n\t" \ 00959 " adc %1,%1,r3 \n\t" \ 00960 " acr %2, \n\t" \ 00961 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3"); 00962 00963 #define SQRADDSC(i, j) \ 00964 __asm__( \ 00965 " mulu.d r2,%6,%7 \n\t" \ 00966 " mov %0,r2 \n\t" \ 00967 " mov %1,r3 \n\t" \ 00968 " eor %2,%2 \n\t" \ 00969 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3"); 00970 00971 #define SQRADDAC(i, j) \ 00972 __asm__( \ 00973 " mulu.d r2,%6,%7 \n\t" \ 00974 " add %0,%0,r2 \n\t" \ 00975 " adc %1,%1,r3 \n\t" \ 00976 " acr %2 \n\t" \ 00977 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3"); 00978 00979 #define SQRADDDB \ 00980 __asm__( \ 00981 " add %0,%0,%3 \n\t" \ 00982 " adc %1,%1,%4 \n\t" \ 00983 " adc %2,%2,%5 \n\t" \ 00984 " add %0,%0,%3 \n\t" \ 00985 " adc %1,%1,%4 \n\t" \ 00986 " adc %2,%2,%5 \n\t" \ 00987 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00988 00989 00990 #else 00991 00992 #define TFM_ISO 00993 00994 /* ISO C portable code */ 00995 00996 #define COMBA_START 00997 00998 #define CLEAR_CARRY \ 00999 c0 = c1 = c2 = 0; 01000 01001 #define COMBA_STORE(x) \ 01002 x = c0; 01003 01004 #define COMBA_STORE2(x) \ 01005 x = c1; 01006 01007 #define CARRY_FORWARD \ 01008 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01009 01010 #define COMBA_FINI 01011 01012 /* multiplies point i and j, updates carry "c1" and digit c2 */ 01013 #define SQRADD(i, j) \ 01014 do { fp_word t; \ 01015 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 01016 t = c1 + (t >> DIGIT_BIT); c1 = (fp_digit)t; \ 01017 c2 +=(fp_digit) (t >> DIGIT_BIT); \ 01018 } while (0); 01019 01020 01021 /* for squaring some of the 
terms are doubled... */ 01022 #define SQRADD2(i, j) \ 01023 do { fp_word t; \ 01024 t = ((fp_word)i) * ((fp_word)j); \ 01025 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01026 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01027 c2 +=(fp_digit)( tt >> DIGIT_BIT); \ 01028 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 01029 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 01030 c2 +=(fp_digit) (tt >> DIGIT_BIT); \ 01031 } while (0); 01032 01033 #define SQRADDSC(i, j) \ 01034 do { fp_word t; \ 01035 t = ((fp_word)i) * ((fp_word)j); \ 01036 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \ 01037 } while (0); 01038 01039 #define SQRADDAC(i, j) \ 01040 do { fp_word t; \ 01041 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = (fp_digit)t; \ 01042 t = sc1 + (t >> DIGIT_BIT); sc1 = (fp_digit)t; \ 01043 sc2 += (fp_digit)(t >> DIGIT_BIT); \ 01044 } while (0); 01045 01046 #define SQRADDDB \ 01047 do { fp_word t; \ 01048 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t; \ 01049 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); \ 01050 c1 = (fp_digit)t; \ 01051 c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT)); \ 01052 } while (0); 01053 01054 #endif 01055 01056 #ifdef TFM_SMALL_SET 01057 #include "fp_sqr_comba_small_set.i" 01058 #endif 01059 01060 #if defined(TFM_SQR3) 01061 #include "fp_sqr_comba_3.i" 01062 #endif 01063 #if defined(TFM_SQR4) 01064 #include "fp_sqr_comba_4.i" 01065 #endif 01066 #if defined(TFM_SQR6) 01067 #include "fp_sqr_comba_6.i" 01068 #endif 01069 #if defined(TFM_SQR7) 01070 #include "fp_sqr_comba_7.i" 01071 #endif 01072 #if defined(TFM_SQR8) 01073 #include "fp_sqr_comba_8.i" 01074 #endif 01075 #if defined(TFM_SQR9) 01076 #include "fp_sqr_comba_9.i" 01077 #endif 01078 #if defined(TFM_SQR12) 01079 #include "fp_sqr_comba_12.i" 01080 #endif 01081 #if defined(TFM_SQR17) 01082 #include "fp_sqr_comba_17.i" 01083 #endif 01084 #if defined(TFM_SQR20) 01085 #include "fp_sqr_comba_20.i" 01086 #endif 01087 #if 
defined(TFM_SQR24) 01088 #include "fp_sqr_comba_24.i" 01089 #endif 01090 #if defined(TFM_SQR28) 01091 #include "fp_sqr_comba_28.i" 01092 #endif 01093 #if defined(TFM_SQR32) 01094 #include "fp_sqr_comba_32.i" 01095 #endif 01096 #if defined(TFM_SQR48) 01097 #include "fp_sqr_comba_48.i" 01098 #endif 01099 #if defined(TFM_SQR64) 01100 #include "fp_sqr_comba_64.i" 01101 #endif 01102 /* end fp_sqr_comba.c asm */ 01103 01104 /* start fp_mul_comba.c asm */ 01105 /* these are the combas. Worship them. */ 01106 #if defined(TFM_X86) 01107 /* Generic x86 optimized code */ 01108 01109 /* anything you need at the start */ 01110 #define COMBA_START 01111 01112 /* clear the chaining variables */ 01113 #define COMBA_CLEAR \ 01114 c0 = c1 = c2 = 0; 01115 01116 /* forward the carry to the next digit */ 01117 #define COMBA_FORWARD \ 01118 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01119 01120 /* store the first sum */ 01121 #define COMBA_STORE(x) \ 01122 x = c0; 01123 01124 /* store the second sum [carry] */ 01125 #define COMBA_STORE2(x) \ 01126 x = c1; 01127 01128 /* anything you need at the end */ 01129 #define COMBA_FINI 01130 01131 /* this should multiply i and j */ 01132 #define MULADD(i, j) \ 01133 __asm__( \ 01134 "movl %6,%%eax \n\t" \ 01135 "mull %7 \n\t" \ 01136 "addl %%eax,%0 \n\t" \ 01137 "adcl %%edx,%1 \n\t" \ 01138 "adcl $0,%2 \n\t" \ 01139 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 01140 01141 #elif defined(TFM_X86_64) 01142 /* x86-64 optimized */ 01143 01144 /* anything you need at the start */ 01145 #define COMBA_START 01146 01147 /* clear the chaining variables */ 01148 #define COMBA_CLEAR \ 01149 c0 = c1 = c2 = 0; 01150 01151 /* forward the carry to the next digit */ 01152 #define COMBA_FORWARD \ 01153 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01154 01155 /* store the first sum */ 01156 #define COMBA_STORE(x) \ 01157 x = c0; 01158 01159 /* store the second sum [carry] */ 01160 #define COMBA_STORE2(x) \ 01161 x = 
c1; 01162 01163 /* anything you need at the end */ 01164 #define COMBA_FINI 01165 01166 /* this should multiply i and j */ 01167 #define MULADD(i, j) \ 01168 __asm__ ( \ 01169 "movq %6,%%rax \n\t" \ 01170 "mulq %7 \n\t" \ 01171 "addq %%rax,%0 \n\t" \ 01172 "adcq %%rdx,%1 \n\t" \ 01173 "adcq $0,%2 \n\t" \ 01174 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 01175 01176 #elif defined(TFM_SSE2) 01177 /* use SSE2 optimizations */ 01178 01179 /* anything you need at the start */ 01180 #define COMBA_START 01181 01182 /* clear the chaining variables */ 01183 #define COMBA_CLEAR \ 01184 c0 = c1 = c2 = 0; 01185 01186 /* forward the carry to the next digit */ 01187 #define COMBA_FORWARD \ 01188 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01189 01190 /* store the first sum */ 01191 #define COMBA_STORE(x) \ 01192 x = c0; 01193 01194 /* store the second sum [carry] */ 01195 #define COMBA_STORE2(x) \ 01196 x = c1; 01197 01198 /* anything you need at the end */ 01199 #define COMBA_FINI \ 01200 __asm__("emms"); 01201 01202 /* this should multiply i and j */ 01203 #define MULADD(i, j) \ 01204 __asm__( \ 01205 "movd %6,%%mm0 \n\t" \ 01206 "movd %7,%%mm1 \n\t" \ 01207 "pmuludq %%mm1,%%mm0\n\t" \ 01208 "movd %%mm0,%%eax \n\t" \ 01209 "psrlq $32,%%mm0 \n\t" \ 01210 "addl %%eax,%0 \n\t" \ 01211 "movd %%mm0,%%eax \n\t" \ 01212 "adcl %%eax,%1 \n\t" \ 01213 "adcl $0,%2 \n\t" \ 01214 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc"); 01215 01216 #elif defined(TFM_ARM) 01217 /* ARM code */ 01218 01219 #define COMBA_START 01220 01221 #define COMBA_CLEAR \ 01222 c0 = c1 = c2 = 0; 01223 01224 #define COMBA_FORWARD \ 01225 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01226 01227 #define COMBA_STORE(x) \ 01228 x = c0; 01229 01230 #define COMBA_STORE2(x) \ 01231 x = c1; 01232 01233 #define COMBA_FINI 01234 01235 #define MULADD(i, j) \ 01236 __asm__( \ 01237 " UMULL r0,r1,%6,%7 \n\t" \ 01238 " ADDS %0,%0,r0 \n\t" \ 
01239 " ADCS %1,%1,r1 \n\t" \ 01240 " ADC %2,%2,#0 \n\t" \ 01241 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 01242 01243 #elif defined(TFM_PPC32) 01244 /* For 32-bit PPC */ 01245 01246 #define COMBA_START 01247 01248 #define COMBA_CLEAR \ 01249 c0 = c1 = c2 = 0; 01250 01251 #define COMBA_FORWARD \ 01252 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01253 01254 #define COMBA_STORE(x) \ 01255 x = c0; 01256 01257 #define COMBA_STORE2(x) \ 01258 x = c1; 01259 01260 #define COMBA_FINI 01261 01262 /* untested: will mulhwu change the flags? Docs say no */ 01263 #define MULADD(i, j) \ 01264 __asm__( \ 01265 " mullw 16,%6,%7 \n\t" \ 01266 " addc %0,%0,16 \n\t" \ 01267 " mulhwu 16,%6,%7 \n\t" \ 01268 " adde %1,%1,16 \n\t" \ 01269 " addze %2,%2 \n\t" \ 01270 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01271 01272 #elif defined(TFM_PPC64) 01273 /* For 64-bit PPC */ 01274 01275 #define COMBA_START 01276 01277 #define COMBA_CLEAR \ 01278 c0 = c1 = c2 = 0; 01279 01280 #define COMBA_FORWARD \ 01281 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01282 01283 #define COMBA_STORE(x) \ 01284 x = c0; 01285 01286 #define COMBA_STORE2(x) \ 01287 x = c1; 01288 01289 #define COMBA_FINI 01290 01291 /* untested: will mulhwu change the flags? 
Docs say no */ 01292 #define MULADD(i, j) \ 01293 ____asm__( \ 01294 " mulld 16,%6,%7 \n\t" \ 01295 " addc %0,%0,16 \n\t" \ 01296 " mulhdu 16,%6,%7 \n\t" \ 01297 " adde %1,%1,16 \n\t" \ 01298 " addze %2,%2 \n\t" \ 01299 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01300 01301 #elif defined(TFM_AVR32) 01302 01303 /* ISO C code */ 01304 01305 #define COMBA_START 01306 01307 #define COMBA_CLEAR \ 01308 c0 = c1 = c2 = 0; 01309 01310 #define COMBA_FORWARD \ 01311 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01312 01313 #define COMBA_STORE(x) \ 01314 x = c0; 01315 01316 #define COMBA_STORE2(x) \ 01317 x = c1; 01318 01319 #define COMBA_FINI 01320 01321 #define MULADD(i, j) \ 01322 ____asm__( \ 01323 " mulu.d r2,%6,%7 \n\t"\ 01324 " add %0,r2 \n\t"\ 01325 " adc %1,%1,r3 \n\t"\ 01326 " acr %2 \n\t"\ 01327 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3"); 01328 01329 #else 01330 /* ISO C code */ 01331 01332 #define COMBA_START 01333 01334 #define COMBA_CLEAR \ 01335 c0 = c1 = c2 = 0; 01336 01337 #define COMBA_FORWARD \ 01338 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01339 01340 #define COMBA_STORE(x) \ 01341 x = c0; 01342 01343 #define COMBA_STORE2(x) \ 01344 x = c1; 01345 01346 #define COMBA_FINI 01347 01348 #define MULADD(i, j) \ 01349 do { fp_word t; \ 01350 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 01351 t = (fp_word)c1 + (t >> DIGIT_BIT); \ 01352 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \ 01353 } while (0); 01354 01355 #endif 01356 01357 01358 #ifdef TFM_SMALL_SET 01359 #include "fp_mul_comba_small_set.i" 01360 #endif 01361 01362 #if defined(TFM_MUL3) 01363 #include "fp_mul_comba_3.i" 01364 #endif 01365 #if defined(TFM_MUL4) 01366 #include "fp_mul_comba_4.i" 01367 #endif 01368 #if defined(TFM_MUL6) 01369 #include "fp_mul_comba_6.i" 01370 #endif 01371 #if defined(TFM_MUL7) 01372 #include "fp_mul_comba_7.i" 01373 #endif 01374 #if defined(TFM_MUL8) 01375 #include 
"fp_mul_comba_8.i" 01376 #endif 01377 #if defined(TFM_MUL9) 01378 #include "fp_mul_comba_9.i" 01379 #endif 01380 #if defined(TFM_MUL12) 01381 #include "fp_mul_comba_12.i" 01382 #endif 01383 #if defined(TFM_MUL17) 01384 #include "fp_mul_comba_17.i" 01385 #endif 01386 #if defined(TFM_MUL20) 01387 #include "fp_mul_comba_20.i" 01388 #endif 01389 #if defined(TFM_MUL24) 01390 #include "fp_mul_comba_24.i" 01391 #endif 01392 #if defined(TFM_MUL28) 01393 #include "fp_mul_comba_28.i" 01394 #endif 01395 #if defined(TFM_MUL32) 01396 #include "fp_mul_comba_32.i" 01397 #endif 01398 #if defined(TFM_MUL48) 01399 #include "fp_mul_comba_48.i" 01400 #endif 01401 #if defined(TFM_MUL64) 01402 #include "fp_mul_comba_64.i" 01403 #endif 01404 01405 /* end fp_mul_comba.c asm */ 01406
Generated on Tue Jul 12 2022 20:12:50 by
