Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Dependents: TLS_cyassl
asm.c
00001 /* asm.c 00002 * 00003 * Copyright (C) 2006-2013 wolfSSL Inc. 00004 * 00005 * This file is part of CyaSSL. 00006 * 00007 * CyaSSL is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 2 of the License, or 00010 * (at your option) any later version. 00011 * 00012 * CyaSSL is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00015 * GNU General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, write to the Free Software 00019 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA 00020 */ 00021 00022 #ifdef HAVE_CONFIG_H 00023 #include <config.h> 00024 #endif 00025 00026 #include <cyassl/ctaocrypt/settings.h> 00027 00028 /* 00029 * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca, 00030 * http://math.libtomcrypt.com 00031 */ 00032 00033 00034 /******************************************************************/ 00035 /* fp_montgomery_reduce.c asm or generic */ 00036 #if defined(TFM_X86) && !defined(TFM_SSE2) 00037 /* x86-32 code */ 00038 00039 #define MONT_START 00040 #define MONT_FINI 00041 #define LOOP_END 00042 #define LOOP_START \ 00043 mu = c[x] * mp 00044 00045 #define INNERMUL \ 00046 __asm__( \ 00047 "movl %5,%%eax \n\t" \ 00048 "mull %4 \n\t" \ 00049 "addl %1,%%eax \n\t" \ 00050 "adcl $0,%%edx \n\t" \ 00051 "addl %%eax,%0 \n\t" \ 00052 "adcl $0,%%edx \n\t" \ 00053 "movl %%edx,%1 \n\t" \ 00054 :"=g"(_c[LO]), "=r"(cy) \ 00055 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ 00056 : "%eax", "%edx", "cc") 00057 00058 #define PROPCARRY \ 00059 __asm__( \ 00060 "addl %1,%0 \n\t" \ 00061 "setb %%al \n\t" \ 00062 "movzbl %%al,%1 \n\t" \ 00063 
:"=g"(_c[LO]), "=r"(cy) \ 00064 :"0"(_c[LO]), "1"(cy) \ 00065 : "%eax", "cc") 00066 00067 /******************************************************************/ 00068 #elif defined(TFM_X86_64) 00069 /* x86-64 code */ 00070 00071 #define MONT_START 00072 #define MONT_FINI 00073 #define LOOP_END 00074 #define LOOP_START \ 00075 mu = c[x] * mp 00076 00077 #define INNERMUL \ 00078 __asm__( \ 00079 "movq %5,%%rax \n\t" \ 00080 "mulq %4 \n\t" \ 00081 "addq %1,%%rax \n\t" \ 00082 "adcq $0,%%rdx \n\t" \ 00083 "addq %%rax,%0 \n\t" \ 00084 "adcq $0,%%rdx \n\t" \ 00085 "movq %%rdx,%1 \n\t" \ 00086 :"=g"(_c[LO]), "=r"(cy) \ 00087 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00088 : "%rax", "%rdx", "cc") 00089 00090 #define INNERMUL8 \ 00091 __asm__( \ 00092 "movq 0(%5),%%rax \n\t" \ 00093 "movq 0(%2),%%r10 \n\t" \ 00094 "movq 0x8(%5),%%r11 \n\t" \ 00095 "mulq %4 \n\t" \ 00096 "addq %%r10,%%rax \n\t" \ 00097 "adcq $0,%%rdx \n\t" \ 00098 "movq 0x8(%2),%%r10 \n\t" \ 00099 "addq %3,%%rax \n\t" \ 00100 "adcq $0,%%rdx \n\t" \ 00101 "movq %%rax,0(%0) \n\t" \ 00102 "movq %%rdx,%1 \n\t" \ 00103 \ 00104 "movq %%r11,%%rax \n\t" \ 00105 "movq 0x10(%5),%%r11 \n\t" \ 00106 "mulq %4 \n\t" \ 00107 "addq %%r10,%%rax \n\t" \ 00108 "adcq $0,%%rdx \n\t" \ 00109 "movq 0x10(%2),%%r10 \n\t" \ 00110 "addq %3,%%rax \n\t" \ 00111 "adcq $0,%%rdx \n\t" \ 00112 "movq %%rax,0x8(%0) \n\t" \ 00113 "movq %%rdx,%1 \n\t" \ 00114 \ 00115 "movq %%r11,%%rax \n\t" \ 00116 "movq 0x18(%5),%%r11 \n\t" \ 00117 "mulq %4 \n\t" \ 00118 "addq %%r10,%%rax \n\t" \ 00119 "adcq $0,%%rdx \n\t" \ 00120 "movq 0x18(%2),%%r10 \n\t" \ 00121 "addq %3,%%rax \n\t" \ 00122 "adcq $0,%%rdx \n\t" \ 00123 "movq %%rax,0x10(%0) \n\t" \ 00124 "movq %%rdx,%1 \n\t" \ 00125 \ 00126 "movq %%r11,%%rax \n\t" \ 00127 "movq 0x20(%5),%%r11 \n\t" \ 00128 "mulq %4 \n\t" \ 00129 "addq %%r10,%%rax \n\t" \ 00130 "adcq $0,%%rdx \n\t" \ 00131 "movq 0x20(%2),%%r10 \n\t" \ 00132 "addq %3,%%rax \n\t" \ 00133 "adcq $0,%%rdx \n\t" \ 00134 "movq %%rax,0x18(%0) 
\n\t" \ 00135 "movq %%rdx,%1 \n\t" \ 00136 \ 00137 "movq %%r11,%%rax \n\t" \ 00138 "movq 0x28(%5),%%r11 \n\t" \ 00139 "mulq %4 \n\t" \ 00140 "addq %%r10,%%rax \n\t" \ 00141 "adcq $0,%%rdx \n\t" \ 00142 "movq 0x28(%2),%%r10 \n\t" \ 00143 "addq %3,%%rax \n\t" \ 00144 "adcq $0,%%rdx \n\t" \ 00145 "movq %%rax,0x20(%0) \n\t" \ 00146 "movq %%rdx,%1 \n\t" \ 00147 \ 00148 "movq %%r11,%%rax \n\t" \ 00149 "movq 0x30(%5),%%r11 \n\t" \ 00150 "mulq %4 \n\t" \ 00151 "addq %%r10,%%rax \n\t" \ 00152 "adcq $0,%%rdx \n\t" \ 00153 "movq 0x30(%2),%%r10 \n\t" \ 00154 "addq %3,%%rax \n\t" \ 00155 "adcq $0,%%rdx \n\t" \ 00156 "movq %%rax,0x28(%0) \n\t" \ 00157 "movq %%rdx,%1 \n\t" \ 00158 \ 00159 "movq %%r11,%%rax \n\t" \ 00160 "movq 0x38(%5),%%r11 \n\t" \ 00161 "mulq %4 \n\t" \ 00162 "addq %%r10,%%rax \n\t" \ 00163 "adcq $0,%%rdx \n\t" \ 00164 "movq 0x38(%2),%%r10 \n\t" \ 00165 "addq %3,%%rax \n\t" \ 00166 "adcq $0,%%rdx \n\t" \ 00167 "movq %%rax,0x30(%0) \n\t" \ 00168 "movq %%rdx,%1 \n\t" \ 00169 \ 00170 "movq %%r11,%%rax \n\t" \ 00171 "mulq %4 \n\t" \ 00172 "addq %%r10,%%rax \n\t" \ 00173 "adcq $0,%%rdx \n\t" \ 00174 "addq %3,%%rax \n\t" \ 00175 "adcq $0,%%rdx \n\t" \ 00176 "movq %%rax,0x38(%0) \n\t" \ 00177 "movq %%rdx,%1 \n\t" \ 00178 \ 00179 :"=r"(_c), "=r"(cy) \ 00180 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ 00181 : "%rax", "%rdx", "%r10", "%r11", "cc") 00182 00183 00184 #define PROPCARRY \ 00185 __asm__( \ 00186 "addq %1,%0 \n\t" \ 00187 "setb %%al \n\t" \ 00188 "movzbq %%al,%1 \n\t" \ 00189 :"=g"(_c[LO]), "=r"(cy) \ 00190 :"0"(_c[LO]), "1"(cy) \ 00191 : "%rax", "cc") 00192 00193 /******************************************************************/ 00194 #elif defined(TFM_SSE2) 00195 /* SSE2 code (assumes 32-bit fp_digits) */ 00196 /* XMM register assignments: 00197 * xmm0 *tmpm++, then Mu * (*tmpm++) 00198 * xmm1 c[x], then Mu 00199 * xmm2 mp 00200 * xmm3 cy 00201 * xmm4 _c[LO] 00202 */ 00203 00204 #define MONT_START \ 00205 __asm__("movd %0,%%mm2"::"g"(mp)) 00206 00207 #define 
MONT_FINI \ 00208 __asm__("emms") 00209 00210 #define LOOP_START \ 00211 __asm__( \ 00212 "movd %0,%%mm1 \n\t" \ 00213 "pxor %%mm3,%%mm3 \n\t" \ 00214 "pmuludq %%mm2,%%mm1 \n\t" \ 00215 :: "g"(c[x])) 00216 00217 /* pmuludq on mmx registers does a 32x32->64 multiply. */ 00218 #define INNERMUL \ 00219 __asm__( \ 00220 "movd %1,%%mm4 \n\t" \ 00221 "movd %2,%%mm0 \n\t" \ 00222 "paddq %%mm4,%%mm3 \n\t" \ 00223 "pmuludq %%mm1,%%mm0 \n\t" \ 00224 "paddq %%mm0,%%mm3 \n\t" \ 00225 "movd %%mm3,%0 \n\t" \ 00226 "psrlq $32, %%mm3 \n\t" \ 00227 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) ); 00228 00229 #define INNERMUL8 \ 00230 __asm__( \ 00231 "movd 0(%1),%%mm4 \n\t" \ 00232 "movd 0(%2),%%mm0 \n\t" \ 00233 "paddq %%mm4,%%mm3 \n\t" \ 00234 "pmuludq %%mm1,%%mm0 \n\t" \ 00235 "movd 4(%2),%%mm5 \n\t" \ 00236 "paddq %%mm0,%%mm3 \n\t" \ 00237 "movd 4(%1),%%mm6 \n\t" \ 00238 "movd %%mm3,0(%0) \n\t" \ 00239 "psrlq $32, %%mm3 \n\t" \ 00240 \ 00241 "paddq %%mm6,%%mm3 \n\t" \ 00242 "pmuludq %%mm1,%%mm5 \n\t" \ 00243 "movd 8(%2),%%mm6 \n\t" \ 00244 "paddq %%mm5,%%mm3 \n\t" \ 00245 "movd 8(%1),%%mm7 \n\t" \ 00246 "movd %%mm3,4(%0) \n\t" \ 00247 "psrlq $32, %%mm3 \n\t" \ 00248 \ 00249 "paddq %%mm7,%%mm3 \n\t" \ 00250 "pmuludq %%mm1,%%mm6 \n\t" \ 00251 "movd 12(%2),%%mm7 \n\t" \ 00252 "paddq %%mm6,%%mm3 \n\t" \ 00253 "movd 12(%1),%%mm5 \n\t" \ 00254 "movd %%mm3,8(%0) \n\t" \ 00255 "psrlq $32, %%mm3 \n\t" \ 00256 \ 00257 "paddq %%mm5,%%mm3 \n\t" \ 00258 "pmuludq %%mm1,%%mm7 \n\t" \ 00259 "movd 16(%2),%%mm5 \n\t" \ 00260 "paddq %%mm7,%%mm3 \n\t" \ 00261 "movd 16(%1),%%mm6 \n\t" \ 00262 "movd %%mm3,12(%0) \n\t" \ 00263 "psrlq $32, %%mm3 \n\t" \ 00264 \ 00265 "paddq %%mm6,%%mm3 \n\t" \ 00266 "pmuludq %%mm1,%%mm5 \n\t" \ 00267 "movd 20(%2),%%mm6 \n\t" \ 00268 "paddq %%mm5,%%mm3 \n\t" \ 00269 "movd 20(%1),%%mm7 \n\t" \ 00270 "movd %%mm3,16(%0) \n\t" \ 00271 "psrlq $32, %%mm3 \n\t" \ 00272 \ 00273 "paddq %%mm7,%%mm3 \n\t" \ 00274 "pmuludq %%mm1,%%mm6 \n\t" \ 00275 "movd 24(%2),%%mm7 \n\t" \ 00276 
"paddq %%mm6,%%mm3 \n\t" \ 00277 "movd 24(%1),%%mm5 \n\t" \ 00278 "movd %%mm3,20(%0) \n\t" \ 00279 "psrlq $32, %%mm3 \n\t" \ 00280 \ 00281 "paddq %%mm5,%%mm3 \n\t" \ 00282 "pmuludq %%mm1,%%mm7 \n\t" \ 00283 "movd 28(%2),%%mm5 \n\t" \ 00284 "paddq %%mm7,%%mm3 \n\t" \ 00285 "movd 28(%1),%%mm6 \n\t" \ 00286 "movd %%mm3,24(%0) \n\t" \ 00287 "psrlq $32, %%mm3 \n\t" \ 00288 \ 00289 "paddq %%mm6,%%mm3 \n\t" \ 00290 "pmuludq %%mm1,%%mm5 \n\t" \ 00291 "paddq %%mm5,%%mm3 \n\t" \ 00292 "movd %%mm3,28(%0) \n\t" \ 00293 "psrlq $32, %%mm3 \n\t" \ 00294 :"=r"(_c) : "0"(_c), "r"(tmpm) ); 00295 00296 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack 00297 pointer */ 00298 00299 #define LOOP_END \ 00300 __asm__( "movd %%mm3,%0 \n" :"=r"(cy)) 00301 00302 #define PROPCARRY \ 00303 __asm__( \ 00304 "addl %1,%0 \n\t" \ 00305 "setb %%al \n\t" \ 00306 "movzbl %%al,%1 \n\t" \ 00307 :"=g"(_c[LO]), "=r"(cy) \ 00308 :"0"(_c[LO]), "1"(cy) \ 00309 : "%eax", "cc") 00310 00311 /******************************************************************/ 00312 #elif defined(TFM_ARM) 00313 /* ARMv4 code */ 00314 00315 #define MONT_START 00316 #define MONT_FINI 00317 #define LOOP_END 00318 #define LOOP_START \ 00319 mu = c[x] * mp 00320 00321 #define INNERMUL \ 00322 __asm__( \ 00323 " LDR r0,%1 \n\t" \ 00324 " ADDS r0,r0,%0 \n\t" \ 00325 " MOVCS %0,#1 \n\t" \ 00326 " MOVCC %0,#0 \n\t" \ 00327 " UMLAL r0,%0,%3,%4 \n\t" \ 00328 " STR r0,%1 \n\t" \ 00329 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc"); 00330 00331 #define PROPCARRY \ 00332 __asm__( \ 00333 " LDR r0,%1 \n\t" \ 00334 " ADDS r0,r0,%0 \n\t" \ 00335 " STR r0,%1 \n\t" \ 00336 " MOVCS %0,#1 \n\t" \ 00337 " MOVCC %0,#0 \n\t" \ 00338 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc"); 00339 00340 #elif defined(TFM_PPC32) 00341 00342 /* PPC32 */ 00343 #define MONT_START 00344 #define MONT_FINI 00345 #define LOOP_END 00346 #define LOOP_START \ 00347 mu = c[x] * mp 00348 00349 #define INNERMUL \ 
00350 __asm__( \ 00351 " mullw 16,%3,%4 \n\t" \ 00352 " mulhwu 17,%3,%4 \n\t" \ 00353 " addc 16,16,%0 \n\t" \ 00354 " addze 17,17 \n\t" \ 00355 " lwz 18,%1 \n\t" \ 00356 " addc 16,16,18 \n\t" \ 00357 " addze %0,17 \n\t" \ 00358 " stw 16,%1 \n\t" \ 00359 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm; 00360 00361 #define PROPCARRY \ 00362 __asm__( \ 00363 " lwz 16,%1 \n\t" \ 00364 " addc 16,16,%0 \n\t" \ 00365 " stw 16,%1 \n\t" \ 00366 " xor %0,%0,%0 \n\t" \ 00367 " addze %0,%0 \n\t" \ 00368 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc"); 00369 00370 #elif defined(TFM_PPC64) 00371 00372 /* PPC64 */ 00373 #define MONT_START 00374 #define MONT_FINI 00375 #define LOOP_END 00376 #define LOOP_START \ 00377 mu = c[x] * mp 00378 00379 #define INNERMUL \ 00380 __asm__( \ 00381 " mulld 16,%3,%4 \n\t" \ 00382 " mulhdu 17,%3,%4 \n\t" \ 00383 " addc 16,16,%0 \n\t" \ 00384 " addze 17,17 \n\t" \ 00385 " ldx 18,0,%1 \n\t" \ 00386 " addc 16,16,18 \n\t" \ 00387 " addze %0,17 \n\t" \ 00388 " sdx 16,0,%1 \n\t" \ 00389 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm; 00390 00391 #define PROPCARRY \ 00392 __asm__( \ 00393 " ldx 16,0,%1 \n\t" \ 00394 " addc 16,16,%0 \n\t" \ 00395 " sdx 16,0,%1 \n\t" \ 00396 " xor %0,%0,%0 \n\t" \ 00397 " addze %0,%0 \n\t" \ 00398 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc"); 00399 00400 /******************************************************************/ 00401 00402 #elif defined(TFM_AVR32) 00403 00404 /* AVR32 */ 00405 #define MONT_START 00406 #define MONT_FINI 00407 #define LOOP_END 00408 #define LOOP_START \ 00409 mu = c[x] * mp 00410 00411 #define INNERMUL \ 00412 __asm__( \ 00413 " ld.w r2,%1 \n\t" \ 00414 " add r2,%0 \n\t" \ 00415 " eor r3,r3 \n\t" \ 00416 " acr r3 \n\t" \ 00417 " macu.d r2,%3,%4 \n\t" \ 00418 " st.w %1,r2 \n\t" \ 00419 " mov %0,r3 \n\t" \ 00420 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3"); 00421 00422 
#define PROPCARRY \ 00423 __asm__( \ 00424 " ld.w r2,%1 \n\t" \ 00425 " add r2,%0 \n\t" \ 00426 " st.w %1,r2 \n\t" \ 00427 " eor %0,%0 \n\t" \ 00428 " acr %0 \n\t" \ 00429 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc"); 00430 00431 #else 00432 00433 /* ISO C code */ 00434 #define MONT_START 00435 #define MONT_FINI 00436 #define LOOP_END 00437 #define LOOP_START \ 00438 mu = c[x] * mp 00439 00440 #define INNERMUL \ 00441 do { fp_word t; \ 00442 t = ((fp_word)_c[0] + (fp_word)cy) + \ 00443 (((fp_word)mu) * ((fp_word)*tmpm++)); \ 00444 _c[0] = (fp_digit)t; \ 00445 cy = (fp_digit)(t >> DIGIT_BIT); \ 00446 } while (0) 00447 00448 #define PROPCARRY \ 00449 do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0) 00450 00451 #endif 00452 /******************************************************************/ 00453 00454 00455 #define LO 0 00456 /* end fp_montogomery_reduce.c asm */ 00457 00458 00459 /* start fp_sqr_comba.c asm */ 00460 #if defined(TFM_X86) 00461 00462 /* x86-32 optimized */ 00463 00464 #define COMBA_START 00465 00466 #define CLEAR_CARRY \ 00467 c0 = c1 = c2 = 0; 00468 00469 #define COMBA_STORE(x) \ 00470 x = c0; 00471 00472 #define COMBA_STORE2(x) \ 00473 x = c1; 00474 00475 #define CARRY_FORWARD \ 00476 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00477 00478 #define COMBA_FINI 00479 00480 #define SQRADD(i, j) \ 00481 __asm__( \ 00482 "movl %6,%%eax \n\t" \ 00483 "mull %%eax \n\t" \ 00484 "addl %%eax,%0 \n\t" \ 00485 "adcl %%edx,%1 \n\t" \ 00486 "adcl $0,%2 \n\t" \ 00487 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc"); 00488 00489 #define SQRADD2(i, j) \ 00490 __asm__( \ 00491 "movl %6,%%eax \n\t" \ 00492 "mull %7 \n\t" \ 00493 "addl %%eax,%0 \n\t" \ 00494 "adcl %%edx,%1 \n\t" \ 00495 "adcl $0,%2 \n\t" \ 00496 "addl %%eax,%0 \n\t" \ 00497 "adcl %%edx,%1 \n\t" \ 00498 "adcl $0,%2 \n\t" \ 00499 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx", "cc"); 00500 00501 #define 
SQRADDSC(i, j) \ 00502 __asm__( \ 00503 "movl %3,%%eax \n\t" \ 00504 "mull %4 \n\t" \ 00505 "movl %%eax,%0 \n\t" \ 00506 "movl %%edx,%1 \n\t" \ 00507 "xorl %2,%2 \n\t" \ 00508 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc"); 00509 00510 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00511 00512 #define SQRADDAC(i, j) \ 00513 __asm__( \ 00514 "movl %6,%%eax \n\t" \ 00515 "mull %7 \n\t" \ 00516 "addl %%eax,%0 \n\t" \ 00517 "adcl %%edx,%1 \n\t" \ 00518 "adcl $0,%2 \n\t" \ 00519 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc"); 00520 00521 #define SQRADDDB \ 00522 __asm__( \ 00523 "addl %6,%0 \n\t" \ 00524 "adcl %7,%1 \n\t" \ 00525 "adcl %8,%2 \n\t" \ 00526 "addl %6,%0 \n\t" \ 00527 "adcl %7,%1 \n\t" \ 00528 "adcl %8,%2 \n\t" \ 00529 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00530 00531 #elif defined(TFM_X86_64) 00532 /* x86-64 optimized */ 00533 00534 #define COMBA_START 00535 00536 #define CLEAR_CARRY \ 00537 c0 = c1 = c2 = 0; 00538 00539 #define COMBA_STORE(x) \ 00540 x = c0; 00541 00542 #define COMBA_STORE2(x) \ 00543 x = c1; 00544 00545 #define CARRY_FORWARD \ 00546 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00547 00548 #define COMBA_FINI 00549 00550 #define SQRADD(i, j) \ 00551 __asm__( \ 00552 "movq %6,%%rax \n\t" \ 00553 "mulq %%rax \n\t" \ 00554 "addq %%rax,%0 \n\t" \ 00555 "adcq %%rdx,%1 \n\t" \ 00556 "adcq $0,%2 \n\t" \ 00557 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc"); 00558 00559 #define SQRADD2(i, j) \ 00560 __asm__( \ 00561 "movq %6,%%rax \n\t" \ 00562 "mulq %7 \n\t" \ 00563 "addq %%rax,%0 \n\t" \ 00564 "adcq %%rdx,%1 \n\t" \ 00565 "adcq $0,%2 \n\t" \ 00566 "addq %%rax,%0 \n\t" \ 00567 "adcq %%rdx,%1 \n\t" \ 00568 "adcq $0,%2 \n\t" \ 00569 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00570 00571 #define 
SQRADDSC(i, j) \ 00572 __asm__( \ 00573 "movq %3,%%rax \n\t" \ 00574 "mulq %4 \n\t" \ 00575 "movq %%rax,%0 \n\t" \ 00576 "movq %%rdx,%1 \n\t" \ 00577 "xorq %2,%2 \n\t" \ 00578 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc"); 00579 00580 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00581 00582 #define SQRADDAC(i, j) \ 00583 __asm__( \ 00584 "movq %6,%%rax \n\t" \ 00585 "mulq %7 \n\t" \ 00586 "addq %%rax,%0 \n\t" \ 00587 "adcq %%rdx,%1 \n\t" \ 00588 "adcq $0,%2 \n\t" \ 00589 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 00590 00591 #define SQRADDDB \ 00592 __asm__( \ 00593 "addq %6,%0 \n\t" \ 00594 "adcq %7,%1 \n\t" \ 00595 "adcq %8,%2 \n\t" \ 00596 "addq %6,%0 \n\t" \ 00597 "adcq %7,%1 \n\t" \ 00598 "adcq %8,%2 \n\t" \ 00599 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00600 00601 #elif defined(TFM_SSE2) 00602 00603 /* SSE2 Optimized */ 00604 #define COMBA_START 00605 00606 #define CLEAR_CARRY \ 00607 c0 = c1 = c2 = 0; 00608 00609 #define COMBA_STORE(x) \ 00610 x = c0; 00611 00612 #define COMBA_STORE2(x) \ 00613 x = c1; 00614 00615 #define CARRY_FORWARD \ 00616 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00617 00618 #define COMBA_FINI \ 00619 __asm__("emms"); 00620 00621 #define SQRADD(i, j) \ 00622 __asm__( \ 00623 "movd %6,%%mm0 \n\t" \ 00624 "pmuludq %%mm0,%%mm0\n\t" \ 00625 "movd %%mm0,%%eax \n\t" \ 00626 "psrlq $32,%%mm0 \n\t" \ 00627 "addl %%eax,%0 \n\t" \ 00628 "movd %%mm0,%%eax \n\t" \ 00629 "adcl %%eax,%1 \n\t" \ 00630 "adcl $0,%2 \n\t" \ 00631 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc"); 00632 00633 #define SQRADD2(i, j) \ 00634 __asm__( \ 00635 "movd %6,%%mm0 \n\t" \ 00636 "movd %7,%%mm1 \n\t" \ 00637 "pmuludq %%mm1,%%mm0\n\t" \ 00638 "movd %%mm0,%%eax \n\t" \ 00639 "psrlq $32,%%mm0 \n\t" \ 00640 "movd %%mm0,%%edx \n\t" \ 00641 "addl %%eax,%0 \n\t" \ 00642 "adcl 
%%edx,%1 \n\t" \ 00643 "adcl $0,%2 \n\t" \ 00644 "addl %%eax,%0 \n\t" \ 00645 "adcl %%edx,%1 \n\t" \ 00646 "adcl $0,%2 \n\t" \ 00647 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00648 00649 #define SQRADDSC(i, j) \ 00650 __asm__( \ 00651 "movd %3,%%mm0 \n\t" \ 00652 "movd %4,%%mm1 \n\t" \ 00653 "pmuludq %%mm1,%%mm0\n\t" \ 00654 "movd %%mm0,%0 \n\t" \ 00655 "psrlq $32,%%mm0 \n\t" \ 00656 "movd %%mm0,%1 \n\t" \ 00657 "xorl %2,%2 \n\t" \ 00658 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j)); 00659 00660 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */ 00661 00662 #define SQRADDAC(i, j) \ 00663 __asm__( \ 00664 "movd %6,%%mm0 \n\t" \ 00665 "movd %7,%%mm1 \n\t" \ 00666 "pmuludq %%mm1,%%mm0\n\t" \ 00667 "movd %%mm0,%%eax \n\t" \ 00668 "psrlq $32,%%mm0 \n\t" \ 00669 "movd %%mm0,%%edx \n\t" \ 00670 "addl %%eax,%0 \n\t" \ 00671 "adcl %%edx,%1 \n\t" \ 00672 "adcl $0,%2 \n\t" \ 00673 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j) :"%eax","%edx","cc"); 00674 00675 #define SQRADDDB \ 00676 __asm__( \ 00677 "addl %6,%0 \n\t" \ 00678 "adcl %7,%1 \n\t" \ 00679 "adcl %8,%2 \n\t" \ 00680 "addl %6,%0 \n\t" \ 00681 "adcl %7,%1 \n\t" \ 00682 "adcl %8,%2 \n\t" \ 00683 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc"); 00684 00685 #elif defined(TFM_ARM) 00686 00687 /* ARM code */ 00688 00689 #define COMBA_START 00690 00691 #define CLEAR_CARRY \ 00692 c0 = c1 = c2 = 0; 00693 00694 #define COMBA_STORE(x) \ 00695 x = c0; 00696 00697 #define COMBA_STORE2(x) \ 00698 x = c1; 00699 00700 #define CARRY_FORWARD \ 00701 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00702 00703 #define COMBA_FINI 00704 00705 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00706 #define SQRADD(i, j) \ 00707 __asm__( \ 00708 " UMULL r0,r1,%6,%6 \n\t" \ 00709 " ADDS %0,%0,r0 \n\t" \ 00710 " ADCS %1,%1,r1 \n\t" \ 00711 " ADC %2,%2,#0 \n\t" \ 00712 
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc"); 00713 00714 /* for squaring some of the terms are doubled... */ 00715 #define SQRADD2(i, j) \ 00716 __asm__( \ 00717 " UMULL r0,r1,%6,%7 \n\t" \ 00718 " ADDS %0,%0,r0 \n\t" \ 00719 " ADCS %1,%1,r1 \n\t" \ 00720 " ADC %2,%2,#0 \n\t" \ 00721 " ADDS %0,%0,r0 \n\t" \ 00722 " ADCS %1,%1,r1 \n\t" \ 00723 " ADC %2,%2,#0 \n\t" \ 00724 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00725 00726 #define SQRADDSC(i, j) \ 00727 __asm__( \ 00728 " UMULL %0,%1,%6,%7 \n\t" \ 00729 " SUB %2,%2,%2 \n\t" \ 00730 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc"); 00731 00732 #define SQRADDAC(i, j) \ 00733 __asm__( \ 00734 " UMULL r0,r1,%6,%7 \n\t" \ 00735 " ADDS %0,%0,r0 \n\t" \ 00736 " ADCS %1,%1,r1 \n\t" \ 00737 " ADC %2,%2,#0 \n\t" \ 00738 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc"); 00739 00740 #define SQRADDDB \ 00741 __asm__( \ 00742 " ADDS %0,%0,%3 \n\t" \ 00743 " ADCS %1,%1,%4 \n\t" \ 00744 " ADC %2,%2,%5 \n\t" \ 00745 " ADDS %0,%0,%3 \n\t" \ 00746 " ADCS %1,%1,%4 \n\t" \ 00747 " ADC %2,%2,%5 \n\t" \ 00748 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00749 00750 #elif defined(TFM_PPC32) 00751 00752 /* PPC32 */ 00753 00754 #define COMBA_START 00755 00756 #define CLEAR_CARRY \ 00757 c0 = c1 = c2 = 0; 00758 00759 #define COMBA_STORE(x) \ 00760 x = c0; 00761 00762 #define COMBA_STORE2(x) \ 00763 x = c1; 00764 00765 #define CARRY_FORWARD \ 00766 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00767 00768 #define COMBA_FINI 00769 00770 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00771 #define SQRADD(i, j) \ 00772 __asm__( \ 00773 " mullw 16,%6,%6 \n\t" \ 00774 " addc %0,%0,16 \n\t" \ 00775 " mulhwu 16,%6,%6 \n\t" \ 00776 " adde %1,%1,16 \n\t" \ 00777 " addze %2,%2 \n\t" \ 00778 :"=r"(c0), 
"=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc"); 00779 00780 /* for squaring some of the terms are doubled... */ 00781 #define SQRADD2(i, j) \ 00782 __asm__( \ 00783 " mullw 16,%6,%7 \n\t" \ 00784 " mulhwu 17,%6,%7 \n\t" \ 00785 " addc %0,%0,16 \n\t" \ 00786 " adde %1,%1,17 \n\t" \ 00787 " addze %2,%2 \n\t" \ 00788 " addc %0,%0,16 \n\t" \ 00789 " adde %1,%1,17 \n\t" \ 00790 " addze %2,%2 \n\t" \ 00791 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc"); 00792 00793 #define SQRADDSC(i, j) \ 00794 __asm__( \ 00795 " mullw %0,%6,%7 \n\t" \ 00796 " mulhwu %1,%6,%7 \n\t" \ 00797 " xor %2,%2,%2 \n\t" \ 00798 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 00799 00800 #define SQRADDAC(i, j) \ 00801 __asm__( \ 00802 " mullw 16,%6,%7 \n\t" \ 00803 " addc %0,%0,16 \n\t" \ 00804 " mulhwu 16,%6,%7 \n\t" \ 00805 " adde %1,%1,16 \n\t" \ 00806 " addze %2,%2 \n\t" \ 00807 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc"); 00808 00809 #define SQRADDDB \ 00810 __asm__( \ 00811 " addc %0,%0,%3 \n\t" \ 00812 " adde %1,%1,%4 \n\t" \ 00813 " adde %2,%2,%5 \n\t" \ 00814 " addc %0,%0,%3 \n\t" \ 00815 " adde %1,%1,%4 \n\t" \ 00816 " adde %2,%2,%5 \n\t" \ 00817 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00818 00819 #elif defined(TFM_PPC64) 00820 /* PPC64 */ 00821 00822 #define COMBA_START 00823 00824 #define CLEAR_CARRY \ 00825 c0 = c1 = c2 = 0; 00826 00827 #define COMBA_STORE(x) \ 00828 x = c0; 00829 00830 #define COMBA_STORE2(x) \ 00831 x = c1; 00832 00833 #define CARRY_FORWARD \ 00834 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00835 00836 #define COMBA_FINI 00837 00838 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00839 #define SQRADD(i, j) \ 00840 __asm__( \ 00841 " mulld 16,%6,%6 \n\t" \ 00842 " addc %0,%0,16 \n\t" \ 00843 " mulhdu 16,%6,%6 \n\t" \ 00844 " adde %1,%1,16 \n\t" \ 00845 " 
addze %2,%2 \n\t" \ 00846 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc"); 00847 00848 /* for squaring some of the terms are doubled... */ 00849 #define SQRADD2(i, j) \ 00850 __asm__( \ 00851 " mulld 16,%6,%7 \n\t" \ 00852 " mulhdu 17,%6,%7 \n\t" \ 00853 " addc %0,%0,16 \n\t" \ 00854 " adde %1,%1,17 \n\t" \ 00855 " addze %2,%2 \n\t" \ 00856 " addc %0,%0,16 \n\t" \ 00857 " adde %1,%1,17 \n\t" \ 00858 " addze %2,%2 \n\t" \ 00859 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc"); 00860 00861 #define SQRADDSC(i, j) \ 00862 __asm__( \ 00863 " mulld %0,%6,%7 \n\t" \ 00864 " mulhdu %1,%6,%7 \n\t" \ 00865 " xor %2,%2,%2 \n\t" \ 00866 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc"); 00867 00868 #define SQRADDAC(i, j) \ 00869 __asm__( \ 00870 " mulld 16,%6,%7 \n\t" \ 00871 " addc %0,%0,16 \n\t" \ 00872 " mulhdu 16,%6,%7 \n\t" \ 00873 " adde %1,%1,16 \n\t" \ 00874 " addze %2,%2 \n\t" \ 00875 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc"); 00876 00877 #define SQRADDDB \ 00878 __asm__( \ 00879 " addc %0,%0,%3 \n\t" \ 00880 " adde %1,%1,%4 \n\t" \ 00881 " adde %2,%2,%5 \n\t" \ 00882 " addc %0,%0,%3 \n\t" \ 00883 " adde %1,%1,%4 \n\t" \ 00884 " adde %2,%2,%5 \n\t" \ 00885 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00886 00887 00888 #elif defined(TFM_AVR32) 00889 00890 /* AVR32 */ 00891 00892 #define COMBA_START 00893 00894 #define CLEAR_CARRY \ 00895 c0 = c1 = c2 = 0; 00896 00897 #define COMBA_STORE(x) \ 00898 x = c0; 00899 00900 #define COMBA_STORE2(x) \ 00901 x = c1; 00902 00903 #define CARRY_FORWARD \ 00904 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00905 00906 #define COMBA_FINI 00907 00908 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00909 #define SQRADD(i, j) \ 00910 __asm__( \ 00911 " mulu.d r2,%6,%6 \n\t" \ 00912 " add %0,%0,r2 \n\t" \ 00913 " adc %1,%1,r3 
\n\t" \ 00914 " acr %2 \n\t" \ 00915 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3"); 00916 00917 /* for squaring some of the terms are doubled... */ 00918 #define SQRADD2(i, j) \ 00919 __asm__( \ 00920 " mulu.d r2,%6,%7 \n\t" \ 00921 " add %0,%0,r2 \n\t" \ 00922 " adc %1,%1,r3 \n\t" \ 00923 " acr %2, \n\t" \ 00924 " add %0,%0,r2 \n\t" \ 00925 " adc %1,%1,r3 \n\t" \ 00926 " acr %2, \n\t" \ 00927 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3"); 00928 00929 #define SQRADDSC(i, j) \ 00930 __asm__( \ 00931 " mulu.d r2,%6,%7 \n\t" \ 00932 " mov %0,r2 \n\t" \ 00933 " mov %1,r3 \n\t" \ 00934 " eor %2,%2 \n\t" \ 00935 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3"); 00936 00937 #define SQRADDAC(i, j) \ 00938 __asm__( \ 00939 " mulu.d r2,%6,%7 \n\t" \ 00940 " add %0,%0,r2 \n\t" \ 00941 " adc %1,%1,r3 \n\t" \ 00942 " acr %2 \n\t" \ 00943 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3"); 00944 00945 #define SQRADDDB \ 00946 __asm__( \ 00947 " add %0,%0,%3 \n\t" \ 00948 " adc %1,%1,%4 \n\t" \ 00949 " adc %2,%2,%5 \n\t" \ 00950 " add %0,%0,%3 \n\t" \ 00951 " adc %1,%1,%4 \n\t" \ 00952 " adc %2,%2,%5 \n\t" \ 00953 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc"); 00954 00955 00956 #else 00957 00958 #define TFM_ISO 00959 00960 /* ISO C portable code */ 00961 00962 #define COMBA_START 00963 00964 #define CLEAR_CARRY \ 00965 c0 = c1 = c2 = 0; 00966 00967 #define COMBA_STORE(x) \ 00968 x = c0; 00969 00970 #define COMBA_STORE2(x) \ 00971 x = c1; 00972 00973 #define CARRY_FORWARD \ 00974 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 00975 00976 #define COMBA_FINI 00977 00978 /* multiplies point i and j, updates carry "c1" and digit c2 */ 00979 #define SQRADD(i, j) \ 00980 do { fp_word t; \ 00981 t = c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 00982 t = c1 + (t >> DIGIT_BIT); c1 = 
(fp_digit)t; \ 00983 c2 +=(fp_digit) (t >> DIGIT_BIT); \ 00984 } while (0); 00985 00986 00987 /* for squaring some of the terms are doubled... */ 00988 #define SQRADD2(i, j) \ 00989 do { fp_word t; \ 00990 t = ((fp_word)i) * ((fp_word)j); \ 00991 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 00992 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 00993 c2 +=(fp_digit)( tt >> DIGIT_BIT); \ 00994 tt = (fp_word)c0 + t; c0 = (fp_digit)tt; \ 00995 tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt; \ 00996 c2 +=(fp_digit) (tt >> DIGIT_BIT); \ 00997 } while (0); 00998 00999 #define SQRADDSC(i, j) \ 01000 do { fp_word t; \ 01001 t = ((fp_word)i) * ((fp_word)j); \ 01002 sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0; \ 01003 } while (0); 01004 01005 #define SQRADDAC(i, j) \ 01006 do { fp_word t; \ 01007 t = sc0 + ((fp_word)i) * ((fp_word)j); sc0 = t; \ 01008 t = sc1 + (t >> DIGIT_BIT); sc1 = t; sc2 += t >> DIGIT_BIT; \ 01009 } while (0); 01010 01011 #define SQRADDDB \ 01012 do { fp_word t; \ 01013 t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = t; \ 01014 t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT); c1 = t; \ 01015 c2 = c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT); \ 01016 } while (0); 01017 01018 #endif 01019 01020 #ifdef TFM_SMALL_SET 01021 #include "fp_sqr_comba_small_set.i" 01022 #include "fp_sqr_comba_3.i" 01023 #include "fp_sqr_comba_4.i" 01024 #include "fp_sqr_comba_6.i" 01025 #include "fp_sqr_comba_7.i" 01026 #include "fp_sqr_comba_8.i" 01027 #include "fp_sqr_comba_9.i" 01028 #include "fp_sqr_comba_12.i" 01029 #include "fp_sqr_comba_17.i" 01030 #include "fp_sqr_comba_20.i" 01031 #include "fp_sqr_comba_24.i" 01032 #include "fp_sqr_comba_28.i" 01033 #include "fp_sqr_comba_32.i" 01034 #include "fp_sqr_comba_48.i" 01035 #include "fp_sqr_comba_64.i" 01036 #endif 01037 /* end fp_sqr_comba.c asm */ 01038 01039 /* start fp_mul_comba.c asm */ 01040 /* these are the combas. Worship them. 
*/ 01041 #if defined(TFM_X86) 01042 /* Generic x86 optimized code */ 01043 01044 /* anything you need at the start */ 01045 #define COMBA_START 01046 01047 /* clear the chaining variables */ 01048 #define COMBA_CLEAR \ 01049 c0 = c1 = c2 = 0; 01050 01051 /* forward the carry to the next digit */ 01052 #define COMBA_FORWARD \ 01053 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01054 01055 /* store the first sum */ 01056 #define COMBA_STORE(x) \ 01057 x = c0; 01058 01059 /* store the second sum [carry] */ 01060 #define COMBA_STORE2(x) \ 01061 x = c1; 01062 01063 /* anything you need at the end */ 01064 #define COMBA_FINI 01065 01066 /* this should multiply i and j */ 01067 #define MULADD(i, j) \ 01068 __asm__( \ 01069 "movl %6,%%eax \n\t" \ 01070 "mull %7 \n\t" \ 01071 "addl %%eax,%0 \n\t" \ 01072 "adcl %%edx,%1 \n\t" \ 01073 "adcl $0,%2 \n\t" \ 01074 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc"); 01075 01076 #elif defined(TFM_X86_64) 01077 /* x86-64 optimized */ 01078 01079 /* anything you need at the start */ 01080 #define COMBA_START 01081 01082 /* clear the chaining variables */ 01083 #define COMBA_CLEAR \ 01084 c0 = c1 = c2 = 0; 01085 01086 /* forward the carry to the next digit */ 01087 #define COMBA_FORWARD \ 01088 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01089 01090 /* store the first sum */ 01091 #define COMBA_STORE(x) \ 01092 x = c0; 01093 01094 /* store the second sum [carry] */ 01095 #define COMBA_STORE2(x) \ 01096 x = c1; 01097 01098 /* anything you need at the end */ 01099 #define COMBA_FINI 01100 01101 /* this should multiply i and j */ 01102 #define MULADD(i, j) \ 01103 __asm__ ( \ 01104 "movq %6,%%rax \n\t" \ 01105 "mulq %7 \n\t" \ 01106 "addq %%rax,%0 \n\t" \ 01107 "adcq %%rdx,%1 \n\t" \ 01108 "adcq $0,%2 \n\t" \ 01109 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc"); 01110 01111 #elif defined(TFM_SSE2) 01112 /* use SSE2 optimizations */ 01113 01114 /* 
anything you need at the start */ 01115 #define COMBA_START 01116 01117 /* clear the chaining variables */ 01118 #define COMBA_CLEAR \ 01119 c0 = c1 = c2 = 0; 01120 01121 /* forward the carry to the next digit */ 01122 #define COMBA_FORWARD \ 01123 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01124 01125 /* store the first sum */ 01126 #define COMBA_STORE(x) \ 01127 x = c0; 01128 01129 /* store the second sum [carry] */ 01130 #define COMBA_STORE2(x) \ 01131 x = c1; 01132 01133 /* anything you need at the end */ 01134 #define COMBA_FINI \ 01135 __asm__("emms"); 01136 01137 /* this should multiply i and j */ 01138 #define MULADD(i, j) \ 01139 __asm__( \ 01140 "movd %6,%%mm0 \n\t" \ 01141 "movd %7,%%mm1 \n\t" \ 01142 "pmuludq %%mm1,%%mm0\n\t" \ 01143 "movd %%mm0,%%eax \n\t" \ 01144 "psrlq $32,%%mm0 \n\t" \ 01145 "addl %%eax,%0 \n\t" \ 01146 "movd %%mm0,%%eax \n\t" \ 01147 "adcl %%eax,%1 \n\t" \ 01148 "adcl $0,%2 \n\t" \ 01149 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc"); 01150 01151 #elif defined(TFM_ARM) 01152 /* ARM code */ 01153 01154 #define COMBA_START 01155 01156 #define COMBA_CLEAR \ 01157 c0 = c1 = c2 = 0; 01158 01159 #define COMBA_FORWARD \ 01160 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01161 01162 #define COMBA_STORE(x) \ 01163 x = c0; 01164 01165 #define COMBA_STORE2(x) \ 01166 x = c1; 01167 01168 #define COMBA_FINI 01169 01170 #define MULADD(i, j) \ 01171 __asm__( \ 01172 " UMULL r0,r1,%6,%7 \n\t" \ 01173 " ADDS %0,%0,r0 \n\t" \ 01174 " ADCS %1,%1,r1 \n\t" \ 01175 " ADC %2,%2,#0 \n\t" \ 01176 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc"); 01177 01178 #elif defined(TFM_PPC32) 01179 /* For 32-bit PPC */ 01180 01181 #define COMBA_START 01182 01183 #define COMBA_CLEAR \ 01184 c0 = c1 = c2 = 0; 01185 01186 #define COMBA_FORWARD \ 01187 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01188 01189 #define COMBA_STORE(x) \ 01190 x = c0; 01191 01192 #define COMBA_STORE2(x) \ 
01193 x = c1; 01194 01195 #define COMBA_FINI 01196 01197 /* untested: will mulhwu change the flags? Docs say no */ 01198 #define MULADD(i, j) \ 01199 __asm__( \ 01200 " mullw 16,%6,%7 \n\t" \ 01201 " addc %0,%0,16 \n\t" \ 01202 " mulhwu 16,%6,%7 \n\t" \ 01203 " adde %1,%1,16 \n\t" \ 01204 " addze %2,%2 \n\t" \ 01205 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01206 01207 #elif defined(TFM_PPC64) 01208 /* For 64-bit PPC */ 01209 01210 #define COMBA_START 01211 01212 #define COMBA_CLEAR \ 01213 c0 = c1 = c2 = 0; 01214 01215 #define COMBA_FORWARD \ 01216 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01217 01218 #define COMBA_STORE(x) \ 01219 x = c0; 01220 01221 #define COMBA_STORE2(x) \ 01222 x = c1; 01223 01224 #define COMBA_FINI 01225 01226 /* untested: will mulhwu change the flags? Docs say no */ 01227 #define MULADD(i, j) \ 01228 ____asm__( \ 01229 " mulld 16,%6,%7 \n\t" \ 01230 " addc %0,%0,16 \n\t" \ 01231 " mulhdu 16,%6,%7 \n\t" \ 01232 " adde %1,%1,16 \n\t" \ 01233 " addze %2,%2 \n\t" \ 01234 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16"); 01235 01236 #elif defined(TFM_AVR32) 01237 01238 /* ISO C code */ 01239 01240 #define COMBA_START 01241 01242 #define COMBA_CLEAR \ 01243 c0 = c1 = c2 = 0; 01244 01245 #define COMBA_FORWARD \ 01246 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01247 01248 #define COMBA_STORE(x) \ 01249 x = c0; 01250 01251 #define COMBA_STORE2(x) \ 01252 x = c1; 01253 01254 #define COMBA_FINI 01255 01256 #define MULADD(i, j) \ 01257 ____asm__( \ 01258 " mulu.d r2,%6,%7 \n\t"\ 01259 " add %0,r2 \n\t"\ 01260 " adc %1,%1,r3 \n\t"\ 01261 " acr %2 \n\t"\ 01262 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3"); 01263 01264 #else 01265 /* ISO C code */ 01266 01267 #define COMBA_START 01268 01269 #define COMBA_CLEAR \ 01270 c0 = c1 = c2 = 0; 01271 01272 #define COMBA_FORWARD \ 01273 do { c0 = c1; c1 = c2; c2 = 0; } while (0); 01274 01275 #define 
COMBA_STORE(x) \ 01276 x = c0; 01277 01278 #define COMBA_STORE2(x) \ 01279 x = c1; 01280 01281 #define COMBA_FINI 01282 01283 #define MULADD(i, j) \ 01284 do { fp_word t; \ 01285 t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = (fp_digit)t; \ 01286 t = (fp_word)c1 + (t >> DIGIT_BIT); \ 01287 c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT); \ 01288 } while (0); 01289 01290 #endif 01291 01292 01293 #ifdef TFM_SMALL_SET 01294 #include "fp_mul_comba_small_set.i" 01295 #include "fp_mul_comba_3.i" 01296 #include "fp_mul_comba_4.i" 01297 #include "fp_mul_comba_6.i" 01298 #include "fp_mul_comba_7.i" 01299 #include "fp_mul_comba_8.i" 01300 #include "fp_mul_comba_9.i" 01301 #include "fp_mul_comba_12.i" 01302 #include "fp_mul_comba_17.i" 01303 #include "fp_mul_comba_20.i" 01304 #include "fp_mul_comba_24.i" 01305 #include "fp_mul_comba_28.i" 01306 #include "fp_mul_comba_32.i" 01307 #include "fp_mul_comba_48.i" 01308 #include "fp_mul_comba_64.i" 01309 #endif 01310 01311 /* end fp_mul_comba.c asm */ 01312
Generated on Thu Jul 14 2022 20:26:02 by Doxygen 1.7.2