wolf SSL / CyaSSL-2.9.4

Dependents:  

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers asm.c Source File

asm.c

00001 /* asm.c
00002  *
00003  * Copyright (C) 2006-2013 wolfSSL Inc.
00004  *
00005  * This file is part of CyaSSL.
00006  *
00007  * CyaSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * CyaSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
00020  */
00021 
00022 #ifdef HAVE_CONFIG_H
00023     #include <config.h>
00024 #endif
00025 
00026 #include <cyassl/ctaocrypt/settings.h>
00027 
00028 /*
00029  * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
00030  * http://math.libtomcrypt.com
00031  */
00032 
00033 
00034 /******************************************************************/
00035 /* fp_montgomery_reduce.c asm or generic */
00036 #if defined(TFM_X86) && !defined(TFM_SSE2) 
00037 /* x86-32 code */
00038 
00039 #define MONT_START /* expands to nothing on this target */
00040 #define MONT_FINI  /* expands to nothing on this target */
00041 #define LOOP_END   /* expands to nothing on this target */
00042 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00043    mu = c[x] * mp
00044 
00045 #define INNERMUL /* _c[LO] = low word, cy = high word of mu * (*tmpm) + cy + _c[LO]; tmpm is post-incremented */ \
00046 __asm__(                                                      \
00047    "movl %5,%%eax \n\t" /* eax = *tmpm                  */ \
00048    "mull %4       \n\t" /* edx:eax = eax * mu           */ \
00049    "addl %1,%%eax \n\t" /* eax += cy                    */ \
00050    "adcl $0,%%edx \n\t" /* carry of that add into edx   */ \
00051    "addl %%eax,%0 \n\t" /* _c[LO] += eax                */ \
00052    "adcl $0,%%edx \n\t" /* carry of that add into edx   */ \
00053    "movl %%edx,%1 \n\t" /* cy = edx                     */ \
00054 :"=g"(_c[LO]), "=r"(cy)                                   \
00055 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
00056 : "%eax", "%edx", "cc")
00057 
00058 #define PROPCARRY /* _c[LO] += cy; cy = carry out of that add (0 or 1) */ \
00059 __asm__(                                        \
00060    "addl   %1,%0    \n\t" /* _c[LO] += cy  */ \
00061    "setb   %%al     \n\t" /* al = CF       */ \
00062    "movzbl %%al,%1 \n\t"  /* cy = al       */ \
00063 :"=g"(_c[LO]), "=r"(cy)                     \
00064 :"0"(_c[LO]), "1"(cy)                       \
00065 : "%eax", "cc")
00066 
00067 /******************************************************************/
00068 #elif defined(TFM_X86_64)
00069 /* x86-64 code */
00070 
00071 #define MONT_START /* expands to nothing on this target */
00072 #define MONT_FINI  /* expands to nothing on this target */
00073 #define LOOP_END   /* expands to nothing on this target */
00074 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00075    mu = c[x] * mp
00076 
00077 #define INNERMUL /* _c[LO] = low word, cy = high word of mu * (*tmpm) + cy + _c[LO]; tmpm is post-incremented */ \
00078 __asm__(                                                      \
00079    "movq %5,%%rax \n\t" /* rax = *tmpm                  */ \
00080    "mulq %4       \n\t" /* rdx:rax = rax * mu           */ \
00081    "addq %1,%%rax \n\t" /* rax += cy                    */ \
00082    "adcq $0,%%rdx \n\t" /* carry of that add into rdx   */ \
00083    "addq %%rax,%0 \n\t" /* _c[LO] += rax                */ \
00084    "adcq $0,%%rdx \n\t" /* carry of that add into rdx   */ \
00085    "movq %%rdx,%1 \n\t" /* cy = rdx                     */ \
00086 :"=g"(_c[LO]), "=r"(cy)                                   \
00087 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
00088 : "%rax", "%rdx", "cc")
00089 
00090 #define INNERMUL8 /* eight unrolled INNERMUL steps: _c[k] = low, cy = high of mu * tmpm[k] + _c[k] + cy for k = 0..7; neither _c nor tmpm is advanced here */ \
00091  __asm__(                  \
00092  "movq 0(%5),%%rax    \n\t" /* rax = tmpm[0]             */ \
00093  "movq 0(%2),%%r10    \n\t" /* r10 = _c[0]               */ \
00094  "movq 0x8(%5),%%r11  \n\t" /* preload tmpm[1]           */ \
00095  "mulq %4             \n\t" /* rdx:rax = rax * mu        */ \
00096  "addq %%r10,%%rax    \n\t" /* rax += _c[0]              */ \
00097  "adcq $0,%%rdx       \n\t"  \
00098  "movq 0x8(%2),%%r10  \n\t" /* preload _c[1]             */ \
00099  "addq %3,%%rax       \n\t" /* rax += cy (%3 is tied to %1) */ \
00100  "adcq $0,%%rdx       \n\t"  \
00101  "movq %%rax,0(%0)    \n\t" /* _c[0] = rax               */ \
00102  "movq %%rdx,%1       \n\t" /* cy = rdx                  */ \
00103  \
00104  "movq %%r11,%%rax    \n\t"  \
00105  "movq 0x10(%5),%%r11 \n\t"  \
00106  "mulq %4             \n\t"  \
00107  "addq %%r10,%%rax    \n\t"  \
00108  "adcq $0,%%rdx       \n\t"  \
00109  "movq 0x10(%2),%%r10 \n\t"  \
00110  "addq %3,%%rax       \n\t"  \
00111  "adcq $0,%%rdx       \n\t"  \
00112  "movq %%rax,0x8(%0)  \n\t"  \
00113  "movq %%rdx,%1       \n\t"  \
00114  \
00115  "movq %%r11,%%rax    \n\t"  \
00116  "movq 0x18(%5),%%r11 \n\t"  \
00117  "mulq %4             \n\t"  \
00118  "addq %%r10,%%rax    \n\t"  \
00119  "adcq $0,%%rdx       \n\t"  \
00120  "movq 0x18(%2),%%r10 \n\t"  \
00121  "addq %3,%%rax       \n\t"  \
00122  "adcq $0,%%rdx       \n\t"  \
00123  "movq %%rax,0x10(%0) \n\t"  \
00124  "movq %%rdx,%1       \n\t"  \
00125  \
00126  "movq %%r11,%%rax    \n\t"  \
00127  "movq 0x20(%5),%%r11 \n\t"  \
00128  "mulq %4             \n\t"  \
00129  "addq %%r10,%%rax    \n\t"  \
00130  "adcq $0,%%rdx       \n\t"  \
00131  "movq 0x20(%2),%%r10 \n\t"  \
00132  "addq %3,%%rax       \n\t"  \
00133  "adcq $0,%%rdx       \n\t"  \
00134  "movq %%rax,0x18(%0) \n\t"  \
00135  "movq %%rdx,%1       \n\t"  \
00136  \
00137  "movq %%r11,%%rax    \n\t"  \
00138  "movq 0x28(%5),%%r11 \n\t"  \
00139  "mulq %4             \n\t"  \
00140  "addq %%r10,%%rax    \n\t"  \
00141  "adcq $0,%%rdx       \n\t"  \
00142  "movq 0x28(%2),%%r10 \n\t"  \
00143  "addq %3,%%rax       \n\t"  \
00144  "adcq $0,%%rdx       \n\t"  \
00145  "movq %%rax,0x20(%0) \n\t"  \
00146  "movq %%rdx,%1       \n\t"  \
00147  \
00148  "movq %%r11,%%rax    \n\t"  \
00149  "movq 0x30(%5),%%r11 \n\t"  \
00150  "mulq %4             \n\t"  \
00151  "addq %%r10,%%rax    \n\t"  \
00152  "adcq $0,%%rdx       \n\t"  \
00153  "movq 0x30(%2),%%r10 \n\t"  \
00154  "addq %3,%%rax       \n\t"  \
00155  "adcq $0,%%rdx       \n\t"  \
00156  "movq %%rax,0x28(%0) \n\t"  \
00157  "movq %%rdx,%1       \n\t"  \
00158  \
00159  "movq %%r11,%%rax    \n\t"  \
00160  "movq 0x38(%5),%%r11 \n\t"  \
00161  "mulq %4             \n\t"  \
00162  "addq %%r10,%%rax    \n\t"  \
00163  "adcq $0,%%rdx       \n\t"  \
00164  "movq 0x38(%2),%%r10 \n\t"  \
00165  "addq %3,%%rax       \n\t"  \
00166  "adcq $0,%%rdx       \n\t"  \
00167  "movq %%rax,0x30(%0) \n\t"  \
00168  "movq %%rdx,%1       \n\t"  \
00169  \
00170  "movq %%r11,%%rax    \n\t"  \
00171  "mulq %4             \n\t"  \
00172  "addq %%r10,%%rax    \n\t"  \
00173  "adcq $0,%%rdx       \n\t"  \
00174  "addq %3,%%rax       \n\t"  \
00175  "adcq $0,%%rdx       \n\t"  \
00176  "movq %%rax,0x38(%0) \n\t"  \
00177  "movq %%rdx,%1       \n\t"  \
00178  \
00179 :"=r"(_c), "=r"(cy)                    \
00180 : "0"(_c),  "1"(cy), "r"(mu), "r"(tmpm) /* mu in a register: mulq cannot take the immediate that "g" would permit */ \
00181 : "%rax", "%rdx", "%r10", "%r11", "cc", "memory") /* "memory": the asm stores to _c[0..7] and loads tmpm[0..7] through pointer operands */
00182 
00183 
00184 #define PROPCARRY /* _c[LO] += cy; cy = carry out of that add (0 or 1) */ \
00185 __asm__(                                        \
00186    "addq   %1,%0    \n\t" /* _c[LO] += cy  */ \
00187    "setb   %%al     \n\t" /* al = CF       */ \
00188    "movzbq %%al,%1 \n\t"  /* cy = al       */ \
00189 :"=g"(_c[LO]), "=r"(cy)                     \
00190 :"0"(_c[LO]), "1"(cy)                       \
00191 : "%rax", "cc")
00192 
00193 /******************************************************************/
00194 #elif defined(TFM_SSE2)  
00195 /* SSE2 code (assumes 32-bit fp_digits) */
00196 /* MMX register assignments (the code below uses %mm0-%mm7, not XMM):
00197  * mm0  *tmpm++, then Mu * (*tmpm++)
00198  * mm1  c[x], then Mu
00199  * mm2  mp
00200  * mm3  cy
00201  * mm4  _c[LO]
00202  */
00203 
00204 #define MONT_START /* load mp into mm2, which LOOP_START multiplies by */ \
00205    __asm__("movd %0,%%mm2"::"g"(mp))
00206 
00207 #define MONT_FINI /* EMMS: empty the MMX state once the reduction is done */ \
00208    __asm__("emms")
00209 
00210 #define LOOP_START /* mm1 = c[x] * mm2 (the multiplier), mm3 = 0 (running carry) */ \
00211 __asm__(                        \
00212 "movd %0,%%mm1        \n\t" /* mm1 = c[x]        */ \
00213 "pxor %%mm3,%%mm3     \n\t" /* mm3 = 0           */ \
00214 "pmuludq %%mm2,%%mm1  \n\t" /* mm1 = mm1 * mm2   */ \
00215 :: "g"(c[x]))
00216 
00217 /* pmuludq on mmx registers does a 32x32->64 multiply. */
00218 #define INNERMUL /* mm3 += _c[LO] + low32(mm1) * (*tmpm++); _c[LO] = low 32 bits of mm3; mm3 >>= 32. The expansion already ends in ';' */ \
00219 __asm__(                           \
00220    "movd %1,%%mm4        \n\t" /* mm4 = _c[LO]                    */ \
00221    "movd %2,%%mm0        \n\t" /* mm0 = *tmpm                     */ \
00222    "paddq %%mm4,%%mm3    \n\t" /* mm3 += mm4                      */ \
00223    "pmuludq %%mm1,%%mm0  \n\t" /* mm0 = low32(mm0) * low32(mm1)   */ \
00224    "paddq %%mm0,%%mm3    \n\t" /* mm3 += mm0                      */ \
00225    "movd %%mm3,%0        \n\t" /* _c[LO] = low 32 bits of mm3     */ \
00226    "psrlq $32, %%mm3     \n\t" /* keep the high part as the carry */ \
00227 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
00228 
00229 #define INNERMUL8 /* eight unrolled INNERMUL steps on _c[0..7] and tmpm[0..7]; carry stays in mm3, mm4-mm7 are scratch; neither pointer is advanced here. The expansion already ends in ';' */ \
00230 __asm__(                           \
00231    "movd 0(%1),%%mm4     \n\t" /* mm4 = _c[0]                   */ \
00232    "movd 0(%2),%%mm0     \n\t" /* mm0 = tmpm[0]                 */ \
00233    "paddq %%mm4,%%mm3    \n\t" /* mm3 += _c[0]                  */ \
00234    "pmuludq %%mm1,%%mm0  \n\t" /* mm0 = low32(mm0) * low32(mm1) */ \
00235    "movd 4(%2),%%mm5     \n\t" /* preload tmpm[1]               */ \
00236    "paddq %%mm0,%%mm3    \n\t" /* mm3 += product                */ \
00237    "movd 4(%1),%%mm6     \n\t" /* preload _c[1]                 */ \
00238    "movd %%mm3,0(%0)     \n\t" /* _c[0] = low 32 bits of mm3    */ \
00239    "psrlq $32, %%mm3     \n\t" /* carry = high part             */ \
00240 \
00241    "paddq %%mm6,%%mm3    \n\t" \
00242    "pmuludq %%mm1,%%mm5  \n\t" \
00243    "movd 8(%2),%%mm6     \n\t" \
00244    "paddq %%mm5,%%mm3    \n\t" \
00245    "movd 8(%1),%%mm7     \n\t" \
00246    "movd %%mm3,4(%0)     \n\t" \
00247    "psrlq $32, %%mm3     \n\t" \
00248 \
00249    "paddq %%mm7,%%mm3    \n\t" \
00250    "pmuludq %%mm1,%%mm6  \n\t" \
00251    "movd 12(%2),%%mm7    \n\t" \
00252    "paddq %%mm6,%%mm3    \n\t" \
00253    "movd 12(%1),%%mm5     \n\t" \
00254    "movd %%mm3,8(%0)     \n\t" \
00255    "psrlq $32, %%mm3     \n\t" \
00256 \
00257    "paddq %%mm5,%%mm3    \n\t" \
00258    "pmuludq %%mm1,%%mm7  \n\t" \
00259    "movd 16(%2),%%mm5    \n\t" \
00260    "paddq %%mm7,%%mm3    \n\t" \
00261    "movd 16(%1),%%mm6    \n\t" \
00262    "movd %%mm3,12(%0)    \n\t" \
00263    "psrlq $32, %%mm3     \n\t" \
00264 \
00265    "paddq %%mm6,%%mm3    \n\t" \
00266    "pmuludq %%mm1,%%mm5  \n\t" \
00267    "movd 20(%2),%%mm6    \n\t" \
00268    "paddq %%mm5,%%mm3    \n\t" \
00269    "movd 20(%1),%%mm7    \n\t" \
00270    "movd %%mm3,16(%0)    \n\t" \
00271    "psrlq $32, %%mm3     \n\t" \
00272 \
00273    "paddq %%mm7,%%mm3    \n\t" \
00274    "pmuludq %%mm1,%%mm6  \n\t" \
00275    "movd 24(%2),%%mm7    \n\t" \
00276    "paddq %%mm6,%%mm3    \n\t" \
00277    "movd 24(%1),%%mm5     \n\t" \
00278    "movd %%mm3,20(%0)    \n\t" \
00279    "psrlq $32, %%mm3     \n\t" \
00280 \
00281    "paddq %%mm5,%%mm3    \n\t" \
00282    "pmuludq %%mm1,%%mm7  \n\t" \
00283    "movd 28(%2),%%mm5    \n\t" \
00284    "paddq %%mm7,%%mm3    \n\t" \
00285    "movd 28(%1),%%mm6    \n\t" \
00286    "movd %%mm3,24(%0)    \n\t" \
00287    "psrlq $32, %%mm3     \n\t" \
00288 \
00289    "paddq %%mm6,%%mm3    \n\t" \
00290    "pmuludq %%mm1,%%mm5  \n\t" \
00291    "paddq %%mm5,%%mm3    \n\t" \
00292    "movd %%mm3,28(%0)    \n\t" \
00293    "psrlq $32, %%mm3     \n\t" \
00294 :"=r"(_c) : "0"(_c), "r"(tmpm) : "memory" ); /* "memory": the asm stores to _c[0..7] and loads tmpm[0..7] through pointer operands */
00295 
00296 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
00297    pointer */
00298 
00299 #define LOOP_END /* cy = low 32 bits of mm3, the carry left by INNERMUL / INNERMUL8 */ \
00300 __asm__( "movd %%mm3,%0  \n" :"=r"(cy))
00301 
00302 #define PROPCARRY /* _c[LO] += cy; cy = carry out of that add (0 or 1) */ \
00303 __asm__(                                        \
00304    "addl   %1,%0    \n\t" /* _c[LO] += cy  */ \
00305    "setb   %%al     \n\t" /* al = CF       */ \
00306    "movzbl %%al,%1 \n\t"  /* cy = al       */ \
00307 :"=g"(_c[LO]), "=r"(cy)                     \
00308 :"0"(_c[LO]), "1"(cy)                       \
00309 : "%eax", "cc")
00310 
00311 /******************************************************************/
00312 #elif defined(TFM_ARM)
00313    /* ARMv4 code */
00314 
00315 #define MONT_START /* expands to nothing on this target */
00316 #define MONT_FINI  /* expands to nothing on this target */
00317 #define LOOP_END   /* expands to nothing on this target */
00318 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00319    mu = c[x] * mp
00320 
00321 
00322 #ifdef __thumb__
00323 
00324 #define INNERMUL /* Thumb variant: _c[0] = low word, cy = high word of mu * (*tmpm) + cy + _c[0]; tmpm is post-incremented. The expansion already ends in ';' */ \
00325 __asm__(                                \
00326     " LDR    r0,%1            \n\t" /* r0 = _c[0]                         */ \
00327     " ADDS   r0,r0,%0         \n\t" /* r0 += cy, setting the carry flag   */ \
00328     " ITE    CS               \n\t" /* make the next two MOVs conditional */ \
00329     " MOVCS  %0,#1            \n\t" /* cy = 1 if the add carried          */ \
00330     " MOVCC  %0,#0            \n\t" /* cy = 0 otherwise                   */ \
00331     " UMLAL  r0,%0,%3,%4      \n\t" /* cy:r0 += mu * (*tmpm)              */ \
00332     " STR    r0,%1            \n\t" /* _c[0] = r0                         */ \
00333 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
00334 
00335 #define PROPCARRY /* Thumb variant: _c[0] += cy; cy = carry out of that add (0 or 1). The expansion already ends in ';' */ \
00336 __asm__(                               \
00337     " LDR   r0,%1            \n\t" /* r0 = _c[0]                         */ \
00338     " ADDS  r0,r0,%0         \n\t" /* r0 += cy, setting the carry flag   */ \
00339     " STR   r0,%1            \n\t" /* _c[0] = r0                         */ \
00340     " ITE   CS               \n\t" /* make the next two MOVs conditional */ \
00341     " MOVCS %0,#1            \n\t" /* cy = 1 if the add carried          */ \
00342     " MOVCC %0,#0            \n\t" /* cy = 0 otherwise                   */ \
00343 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
00344 
00345 
00346 /* TAO thumb mode uses ite (if then else) to detect carry directly
00347  * fixed unmatched constraint warning by changing 1 to m  */
00348 
00349 #else  /* __thumb__ */
00350 
00351 #define INNERMUL /* ARM variant: _c[0] = low word, cy = high word of mu * (*tmpm) + cy + _c[0]; tmpm is post-incremented. The expansion already ends in ';' */ \
00352 __asm__(                                \
00353     " LDR    r0,%1            \n\t" /* r0 = _c[0]                       */ \
00354     " ADDS   r0,r0,%0         \n\t" /* r0 += cy, setting the carry flag */ \
00355     " MOVCS  %0,#1            \n\t" /* cy = 1 if the add carried        */ \
00356     " MOVCC  %0,#0            \n\t" /* cy = 0 otherwise                 */ \
00357     " UMLAL  r0,%0,%3,%4      \n\t" /* cy:r0 += mu * (*tmpm)            */ \
00358     " STR    r0,%1            \n\t" /* _c[0] = r0                       */ \
00359 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc"); /* _c[0] input is "m", not the matching constraint "1", as in the Thumb variant: an "=m" output cannot be matched by number without a warning */
00360 
00361 #define PROPCARRY /* ARM variant: _c[0] += cy; cy = carry out of that add (0 or 1). The expansion already ends in ';' */ \
00362 __asm__(                               \
00363     " LDR   r0,%1            \n\t" /* r0 = _c[0]                       */ \
00364     " ADDS  r0,r0,%0         \n\t" /* r0 += cy, setting the carry flag */ \
00365     " STR   r0,%1            \n\t" /* _c[0] = r0                       */ \
00366     " MOVCS %0,#1            \n\t" /* cy = 1 if the add carried        */ \
00367     " MOVCC %0,#0            \n\t" /* cy = 0 otherwise                 */ \
00368 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc"); /* _c[0] input is "m", not the matching constraint "1", as in the Thumb variant */
00369 
00370 #endif /* __thumb__ */
00371 
00372 #elif defined(TFM_PPC32)
00373 
00374 /* PPC32 */
00375 #define MONT_START /* expands to nothing on this target */
00376 #define MONT_FINI  /* expands to nothing on this target */
00377 #define LOOP_END   /* expands to nothing on this target */
00378 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00379    mu = c[x] * mp
00380 
00381 #define INNERMUL /* _c[0] = low word, cy = high word of mu * tmpm[0] + cy + _c[0]; expands to TWO statements (the asm, then ++tmpm;), so it is not safe as an unbraced if/else body */ \
00382 __asm__(                                 \
00383    " mullw    16,%3,%4       \n\t" /* r16 = low word of mu * tmpm[0]  */ \
00384    " mulhwu   17,%3,%4       \n\t" /* r17 = high word of mu * tmpm[0] */ \
00385    " addc     16,16,%0       \n\t" /* r16 += cy, recording the carry  */ \
00386    " addze    17,17          \n\t" /* r17 += carry                    */ \
00387    " lwz      18,%1          \n\t" /* r18 = _c[0]                     */ \
00388    " addc     16,16,18       \n\t" /* r16 += r18, recording the carry */ \
00389    " addze    %0,17          \n\t" /* cy = r17 + carry                */ \
00390    " stw      16,%1          \n\t" /* _c[0] = r16                     */ \
00391 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
00392 
00393 #define PROPCARRY /* _c[0] += cy; cy = carry out of that add (0 or 1). The expansion already ends in ';' */ \
00394 __asm__(                                 \
00395    " lwz      16,%1         \n\t" /* r16 = _c[0]                    */ \
00396    " addc     16,16,%0      \n\t" /* r16 += cy, recording the carry */ \
00397    " stw      16,%1         \n\t" /* _c[0] = r16                    */ \
00398    " xor      %0,%0,%0      \n\t" /* cy = 0                         */ \
00399    " addze    %0,%0         \n\t" /* cy = 0 + carry                 */ \
00400 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
00401 
00402 #elif defined(TFM_PPC64)
00403 
00404 /* PPC64 */
00405 #define MONT_START /* expands to nothing on this target */
00406 #define MONT_FINI  /* expands to nothing on this target */
00407 #define LOOP_END   /* expands to nothing on this target */
00408 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00409    mu = c[x] * mp
00410 
00411 #define INNERMUL /* 64-bit counterpart of the PPC32 macro: _c[0] = low word, cy = high word of mu * tmpm[0] + cy + _c[0]; expands to TWO statements (the asm, then ++tmpm;). NOTE(review): %1 is an "=m" memory operand but ldx/sdx are written with register-style operands, and "sdx" does not look like a Power ISA mnemonic (the indexed doubleword store is stdx) - confirm this branch assembles */ \
00412 __asm__(                                 \
00413    " mulld    16,%3,%4       \n\t" /* r16 = low word of mu * tmpm[0]  */ \
00414    " mulhdu   17,%3,%4       \n\t" /* r17 = high word of mu * tmpm[0] */ \
00415    " addc     16,16,%0       \n\t" /* r16 += cy, recording the carry  */ \
00416    " addze    17,17          \n\t" /* r17 += carry                    */ \
00417    " ldx      18,0,%1        \n\t" /* presumably r18 = _c[0]          */ \
00418    " addc     16,16,18       \n\t" /* r16 += r18, recording the carry */ \
00419    " addze    %0,17          \n\t" /* cy = r17 + carry                */ \
00420    " sdx      16,0,%1        \n\t" /* presumably _c[0] = r16          */ \
00421 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
00422 
00423 #define PROPCARRY /* _c[0] += cy; cy = carry out of that add (0 or 1). The expansion already ends in ';'. NOTE(review): same ldx/sdx operand and mnemonic question as INNERMUL on this target - confirm it assembles */ \
00424 __asm__(                                 \
00425    " ldx      16,0,%1       \n\t" /* presumably r16 = _c[0]         */ \
00426    " addc     16,16,%0      \n\t" /* r16 += cy, recording the carry */ \
00427    " sdx      16,0,%1       \n\t" /* presumably _c[0] = r16         */ \
00428    " xor      %0,%0,%0      \n\t" /* cy = 0                         */ \
00429    " addze    %0,%0         \n\t" /* cy = 0 + carry                 */ \
00430 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
00431 
00432 /******************************************************************/
00433 
00434 #elif defined(TFM_AVR32)
00435 
00436 /* AVR32 */
00437 #define MONT_START /* expands to nothing on this target */
00438 #define MONT_FINI  /* expands to nothing on this target */
00439 #define LOOP_END   /* expands to nothing on this target */
00440 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00441    mu = c[x] * mp
00442 
00443 #define INNERMUL /* AVR32 multiply-accumulate step using r2/r3 as scratch; presumably the same contract as the other targets (_c[0] and cy updated from mu * (*tmpm++)). The expansion already ends in ';'. NOTE(review): _c is passed as a pointer in a register here, and unlike PROPCARRY the clobber list has no "cc" even though add is used - confirm */ \
00444 __asm__(                                \
00445     " ld.w   r2,%1            \n\t" /* presumably r2 = word at _c        */ \
00446     " add    r2,%0            \n\t" /* r2 += cy                          */ \
00447     " eor    r3,r3            \n\t" /* r3 = 0                            */ \
00448     " acr    r3               \n\t" /* presumably r3 += carry            */ \
00449     " macu.d r2,%3,%4         \n\t" /* presumably r3:r2 += mu * (*tmpm)  */ \
00450     " st.w   %1,r2            \n\t" /* presumably word at _c = r2        */ \
00451     " mov    %0,r3            \n\t" /* cy = r3                           */ \
00452 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
00453 
00454 #define PROPCARRY /* AVR32 carry propagation using r2 as scratch; presumably _c[0] += cy, cy = carry. The expansion already ends in ';'. NOTE(review): the output operand "=r"(&_c[0]) is not an lvalue, which GCC normally rejects for asm outputs - confirm this branch compiles */ \
00455 __asm__(                                 \
00456    " ld.w     r2,%1         \n\t" /* presumably r2 = word at _c  */ \
00457    " add      r2,%0         \n\t" /* r2 += cy                    */ \
00458    " st.w     %1,r2         \n\t" /* presumably word at _c = r2  */ \
00459    " eor      %0,%0         \n\t" /* cy = 0                      */ \
00460    " acr      %0            \n\t" /* presumably cy += carry      */ \
00461 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
00462 
00463 #else
00464 
00465 /* ISO C code */
00466 #define MONT_START /* expands to nothing in the portable build */
00467 #define MONT_FINI  /* expands to nothing in the portable build */
00468 #define LOOP_END   /* expands to nothing in the portable build */
00469 #define LOOP_START /* sets mu = c[x] * mp; c, x, mp and mu come from the expansion site */ \
00470    mu = c[x] * mp
00471 
00472 #define INNERMUL /* _c[0] = low digit, cy = high digit of mu * (*tmpm) + _c[0] + cy; tmpm is post-incremented */ \
00473    do {                                                           \
00474       fp_word acc = ((fp_word)mu) * ((fp_word)*tmpm++);           \
00475       acc += (fp_word)_c[0];                                      \
00476       acc += (fp_word)cy;                                         \
00477       _c[0] = (fp_digit)acc; cy = (fp_digit)(acc >> DIGIT_BIT);   \
00478    } while (0)
00479 
00480 #define PROPCARRY /* _c[0] += cy; cy = 1 if that add wrapped around, else 0 */ \
00481    do { _c[0] += cy; cy = (_c[0] < cy); } while (0)
00482 
00483 #endif
00484 /******************************************************************/
00485 
00486 
00487 #define LO  0 /* index used as _c[LO] by the x86 / SSE2 macros above */
00488 /* end fp_montgomery_reduce.c asm */
00489 
00490 
00491 /* start fp_sqr_comba.c asm */
00492 #if defined(TFM_X86)
00493 
00494 /* x86-32 optimized */
00495 
00496 #define COMBA_START /* expands to nothing on this target */
00497 
00498 #define CLEAR_CARRY /* zero the three accumulator words; the expansion already ends in ';' */ \
00499    c0 = c1 = c2 = 0;
00500 
00501 #define COMBA_STORE(x) /* x = lowest accumulator word; the expansion already ends in ';' */ \
00502    x = c0;
00503 
00504 #define COMBA_STORE2(x) /* x = middle accumulator word; the expansion already ends in ';' */ \
00505    x = c1;
00506 
00507 #define CARRY_FORWARD /* shift the accumulator down one word: c0 = c1, c1 = c2, c2 = 0; note the ';' after while (0) */ \
00508    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00509 
00510 #define COMBA_FINI /* expands to nothing on this target */
00511 
00512 #define SQRADD(i, j) /* (c2:c1:c0) += i * i; j is not referenced */ \
00513 __asm__(                                            \
00514      "movl  %6,%%eax     \n\t" /* eax = i              */ \
00515      "mull  %%eax        \n\t" /* edx:eax = eax * eax  */ \
00516      "addl  %%eax,%0     \n\t" /* c0 += eax            */ \
00517      "adcl  %%edx,%1     \n\t" /* c1 += edx + carry    */ \
00518      "adcl  $0,%2        \n\t" /* c2 += carry          */ \
00519      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
00520 
00521 #define SQRADD2(i, j) /* (c2:c1:c0) += 2 * i * j, done as two adds of the same product */ \
00522 __asm__(                                            \
00523      "movl  %6,%%eax     \n\t"                            \
00524      "mull  %7           \n\t"                            \
00525      "addl  %%eax,%0     \n\t"                            \
00526      "adcl  %%edx,%1     \n\t"                            \
00527      "adcl  $0,%2        \n\t"                            \
00528      "addl  %%eax,%0     \n\t"                            \
00529      "adcl  %%edx,%1     \n\t"                            \
00530      "adcl  $0,%2        \n\t"                            \
00531      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx", "cc");
00532 
00533 #define SQRADDSC(i, j) /* (sc1:sc0) = i * j, sc2 = 0 */ \
00534 __asm__(                                                     \
00535      "movl  %3,%%eax     \n\t"                            \
00536      "mull  %4           \n\t"                            \
00537      "movl  %%eax,%0     \n\t"                            \
00538      "movl  %%edx,%1     \n\t"                            \
00539      "xorl  %2,%2        \n\t"                            \
00540      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
00541 
00542 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00543 
00544 #define SQRADDAC(i, j) /* (sc2:sc1:sc0) += i * j */ \
00545 __asm__(                                                     \
00546      "movl  %6,%%eax     \n\t"                            \
00547      "mull  %7           \n\t"                            \
00548      "addl  %%eax,%0     \n\t"                            \
00549      "adcl  %%edx,%1     \n\t"                            \
00550      "adcl  $0,%2        \n\t"                            \
00551      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
00552 
00553 #define SQRADDDB /* (c2:c1:c0) += 2 * (sc2:sc1:sc0), done as two three-word adds */ \
00554 __asm__(                                                     \
00555      "addl %6,%0         \n\t"                            \
00556      "adcl %7,%1         \n\t"                            \
00557      "adcl %8,%2         \n\t"                            \
00558      "addl %6,%0         \n\t"                            \
00559      "adcl %7,%1         \n\t"                            \
00560      "adcl %8,%2         \n\t"                            \
00561      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00562 
00563 #elif defined(TFM_X86_64)
00564 /* x86-64 optimized */
00565 
00566 #define COMBA_START /* expands to nothing on this target */
00567 
00568 #define CLEAR_CARRY /* zero the three accumulator words; the expansion already ends in ';' */ \
00569    c0 = c1 = c2 = 0;
00570 
00571 #define COMBA_STORE(x) /* x = lowest accumulator word; the expansion already ends in ';' */ \
00572    x = c0;
00573 
00574 #define COMBA_STORE2(x) /* x = middle accumulator word; the expansion already ends in ';' */ \
00575    x = c1;
00576 
00577 #define CARRY_FORWARD /* shift the accumulator down one word: c0 = c1, c1 = c2, c2 = 0; note the ';' after while (0) */ \
00578    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00579 
00580 #define COMBA_FINI /* expands to nothing on this target */
00581 
00582 #define SQRADD(i, j) /* (c2:c1:c0) += i * i; j is not referenced */ \
00583 __asm__(                                                     \
00584      "movq  %6,%%rax     \n\t" /* rax = i              */ \
00585      "mulq  %%rax        \n\t" /* rdx:rax = rax * rax  */ \
00586      "addq  %%rax,%0     \n\t" /* c0 += rax            */ \
00587      "adcq  %%rdx,%1     \n\t" /* c1 += rdx + carry    */ \
00588      "adcq  $0,%2        \n\t" /* c2 += carry          */ \
00589      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
00590 
00591 #define SQRADD2(i, j) /* (c2:c1:c0) += 2 * i * j, done as two adds of the same product */ \
00592 __asm__(                                                     \
00593      "movq  %6,%%rax     \n\t"                            \
00594      "mulq  %7           \n\t"                            \
00595      "addq  %%rax,%0     \n\t"                            \
00596      "adcq  %%rdx,%1     \n\t"                            \
00597      "adcq  $0,%2        \n\t"                            \
00598      "addq  %%rax,%0     \n\t"                            \
00599      "adcq  %%rdx,%1     \n\t"                            \
00600      "adcq  $0,%2        \n\t"                            \
00601      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
00602 
00603 #define SQRADDSC(i, j) /* (sc1:sc0) = i * j, sc2 = 0 */ \
00604 __asm__(                                                     \
00605      "movq  %3,%%rax     \n\t"                            \
00606      "mulq  %4           \n\t"                            \
00607      "movq  %%rax,%0     \n\t"                            \
00608      "movq  %%rdx,%1     \n\t"                            \
00609      "xorq  %2,%2        \n\t"                            \
00610      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
00611 
00612 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00613 
00614 #define SQRADDAC(i, j) /* (sc2:sc1:sc0) += i * j */ \
00615 __asm__(                                                     \
00616      "movq  %6,%%rax     \n\t"                            \
00617      "mulq  %7           \n\t"                            \
00618      "addq  %%rax,%0     \n\t"                            \
00619      "adcq  %%rdx,%1     \n\t"                            \
00620      "adcq  $0,%2        \n\t"                            \
00621      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
00622 
00623 #define SQRADDDB /* (c2:c1:c0) += 2 * (sc2:sc1:sc0), done as two three-word adds */ \
00624 __asm__(                                                     \
00625      "addq %6,%0         \n\t"                            \
00626      "adcq %7,%1         \n\t"                            \
00627      "adcq %8,%2         \n\t"                            \
00628      "addq %6,%0         \n\t"                            \
00629      "adcq %7,%1         \n\t"                            \
00630      "adcq %8,%2         \n\t"                            \
00631      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00632 
00633 #elif defined(TFM_SSE2)
00634 
00635 /* SSE2 Optimized */
00636 #define COMBA_START /* expands to nothing on this target */
00637 
00638 #define CLEAR_CARRY /* zero the three accumulator words; the expansion already ends in ';' */ \
00639    c0 = c1 = c2 = 0;
00640 
00641 #define COMBA_STORE(x) /* x = lowest accumulator word; the expansion already ends in ';' */ \
00642    x = c0;
00643 
00644 #define COMBA_STORE2(x) /* x = middle accumulator word; the expansion already ends in ';' */ \
00645    x = c1;
00646 
00647 #define CARRY_FORWARD /* shift the accumulator down one word: c0 = c1, c1 = c2, c2 = 0; note the ';' after while (0) */ \
00648    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00649 
00650 #define COMBA_FINI /* EMMS: empty the MMX state used by the SQRADD macros; the expansion already ends in ';' */ \
00651    __asm__("emms");
00652 
00653 #define SQRADD(i, j) /* (c2:c1:c0) += i * i, product formed in mm0; j is not referenced */ \
00654 __asm__(                                            \
00655      "movd  %6,%%mm0     \n\t" /* mm0 = i                       */ \
00656      "pmuludq %%mm0,%%mm0\n\t" /* mm0 = i * i (64-bit)          */ \
00657      "movd  %%mm0,%%eax  \n\t" /* eax = low half                */ \
00658      "psrlq $32,%%mm0    \n\t" /* bring the high half down      */ \
00659      "addl  %%eax,%0     \n\t" /* c0 += low half                */ \
00660      "movd  %%mm0,%%eax  \n\t" /* eax = high half               */ \
00661      "adcl  %%eax,%1     \n\t" /* c1 += high half + carry       */ \
00662      "adcl  $0,%2        \n\t" /* c2 += carry                   */ \
00663      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
00664 
00665 #define SQRADD2(i, j) /* (c2:c1:c0) += 2 * i * j, done as two adds of the same product */ \
00666 __asm__(                                            \
00667      "movd  %6,%%mm0     \n\t"                            \
00668      "movd  %7,%%mm1     \n\t"                            \
00669      "pmuludq %%mm1,%%mm0\n\t"                            \
00670      "movd  %%mm0,%%eax  \n\t"                            \
00671      "psrlq $32,%%mm0    \n\t"                            \
00672      "movd  %%mm0,%%edx  \n\t"                            \
00673      "addl  %%eax,%0     \n\t"                            \
00674      "adcl  %%edx,%1     \n\t"                            \
00675      "adcl  $0,%2        \n\t"                            \
00676      "addl  %%eax,%0     \n\t"                            \
00677      "adcl  %%edx,%1     \n\t"                            \
00678      "adcl  $0,%2        \n\t"                            \
00679      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
00680 
00681 #define SQRADDSC(i, j) /* (sc1:sc0) = i * j, sc2 = 0 */ \
00682 __asm__(                                            \
00683      "movd  %3,%%mm0     \n\t"                            \
00684      "movd  %4,%%mm1     \n\t"                            \
00685      "pmuludq %%mm1,%%mm0\n\t"                            \
00686      "movd  %%mm0,%0     \n\t"                            \
00687      "psrlq $32,%%mm0    \n\t"                            \
00688      "movd  %%mm0,%1     \n\t"                            \
00689      "xorl  %2,%2        \n\t"                            \
00690      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
00691 
00692 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00693 
00694 #define SQRADDAC(i, j) /* (sc2:sc1:sc0) += i * j */ \
00695 __asm__(                                            \
00696      "movd  %6,%%mm0     \n\t"                            \
00697      "movd  %7,%%mm1     \n\t"                            \
00698      "pmuludq %%mm1,%%mm0\n\t"                            \
00699      "movd  %%mm0,%%eax  \n\t"                            \
00700      "psrlq $32,%%mm0    \n\t"                            \
00701      "movd  %%mm0,%%edx  \n\t"                            \
00702      "addl  %%eax,%0     \n\t"                            \
00703      "adcl  %%edx,%1     \n\t"                            \
00704      "adcl  $0,%2        \n\t"                            \
00705      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");
00706 
00707 #define SQRADDDB /* (c2:c1:c0) += 2 * (sc2:sc1:sc0), done as two three-word adds */ \
00708 __asm__(                                                     \
00709      "addl %6,%0         \n\t"                            \
00710      "adcl %7,%1         \n\t"                            \
00711      "adcl %8,%2         \n\t"                            \
00712      "addl %6,%0         \n\t"                            \
00713      "adcl %7,%1         \n\t"                            \
00714      "adcl %8,%2         \n\t"                            \
00715      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00716 
00717 #elif defined(TFM_ARM)
00718 
00719 /* ARM code */
00720 
00721 #define COMBA_START /* expands to nothing on this target */
00722 
00723 #define CLEAR_CARRY /* zero the three accumulator words; the expansion already ends in ';' */ \
00724    c0 = c1 = c2 = 0;
00725 
00726 #define COMBA_STORE(x) /* x = lowest accumulator word; the expansion already ends in ';' */ \
00727    x = c0;
00728 
00729 #define COMBA_STORE2(x) /* x = middle accumulator word; the expansion already ends in ';' */ \
00730    x = c1;
00731 
00732 #define CARRY_FORWARD /* shift the accumulator down one word: c0 = c1, c1 = c2, c2 = 0; note the ';' after while (0) */ \
00733    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00734 
00735 #define COMBA_FINI /* expands to nothing on this target */
00736 
00737 /* squares i and adds the 64-bit product into the accumulator c0,c1,c2 (j is unused) */
00738 #define SQRADD(i, j)                                             \
00739 __asm__(                                                             \
00740 "  UMULL  r0,r1,%6,%6              \n\t"                         \
00741 "  ADDS   %0,%0,r0                 \n\t"                         \
00742 "  ADCS   %1,%1,r1                 \n\t"                         \
00743 "  ADC    %2,%2,#0                 \n\t"                         \
00744 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
00745     
00746 /* for squaring some of the terms are doubled... */
00747 #define SQRADD2(i, j)                                            \
00748 __asm__(                                                             \
00749 "  UMULL  r0,r1,%6,%7              \n\t"                         \
00750 "  ADDS   %0,%0,r0                 \n\t"                         \
00751 "  ADCS   %1,%1,r1                 \n\t"                         \
00752 "  ADC    %2,%2,#0                 \n\t"                         \
00753 "  ADDS   %0,%0,r0                 \n\t"                         \
00754 "  ADCS   %1,%1,r1                 \n\t"                         \
00755 "  ADC    %2,%2,#0                 \n\t"                         \
00756 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
00757 
/* SQRADDSC(i, j): start the secondary accumulator: sc1:sc0 = i*j, sc2 = 0.
 * sc0/sc1/sc2 are output-only operands; per the TAO note they were dropped
 * from the input list to silence a compiler warning. */
#define SQRADDSC(i, j)                                        \
__asm__(                                                      \
   "  UMULL  %[lo],%[mid],%[a],%[b] \n\t"                     \
   "  SUB    %[hi],%[hi],%[hi]      \n\t"                     \
   : [lo] "=r" (sc0), [mid] "=r" (sc1), [hi] "=r" (sc2)       \
   : [a] "r" (i), [b] "r" (j)                                 \
   : "cc");
00765 
/* SQRADDAC(i, j): add the product i*j into the secondary accumulator
 * sc2:sc1:sc0.  r0/r1 receive the 64-bit product. */
#define SQRADDAC(i, j)                                        \
__asm__(                                                      \
   "  UMULL  r0,r1,%[a],%[b]        \n\t"                     \
   "  ADDS   %[lo],%[lo],r0         \n\t"                     \
   "  ADCS   %[mid],%[mid],r1       \n\t"                     \
   "  ADC    %[hi],%[hi],#0         \n\t"                     \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2)       \
   : [a] "r" (i), [b] "r" (j)                                 \
   : "r0", "r1", "cc");
00773 
/* SQRADDDB: add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice,
 * i.e. c += 2 * sc, with carries propagated between digits. */
#define SQRADDDB                                              \
__asm__(                                                      \
   "  ADDS  %[lo],%[lo],%[s0]       \n\t"                     \
   "  ADCS  %[mid],%[mid],%[s1]     \n\t"                     \
   "  ADC   %[hi],%[hi],%[s2]       \n\t"                     \
   "  ADDS  %[lo],%[lo],%[s0]       \n\t"                     \
   "  ADCS  %[mid],%[mid],%[s1]     \n\t"                     \
   "  ADC   %[hi],%[hi],%[s2]       \n\t"                     \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)          \
   : [s0] "r" (sc0), [s1] "r" (sc1), [s2] "r" (sc2)           \
   : "cc");
00783 
00784 #elif defined(TFM_PPC32)
00785 
00786 /* PPC32 */
00787 
/* start-of-pass hook; expands to nothing on PPC32 */
#define COMBA_START

/* zero the three-digit accumulator c2:c1:c0 */
#define CLEAR_CARRY     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* slide the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* end-of-pass hook; expands to nothing on PPC32 */
#define COMBA_FINI
00803 
/* SQRADD(i, j): add the square i*i into the accumulator c2:c1:c0.
 * Only i is referenced; j is ignored.  Register 16 is scratch for the
 * low and then the high product word. */
#define SQRADD(i, j)                                    \
__asm__(                                                \
   " mullw  16,%[a],%[a]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " mulhwu 16,%[a],%[a]        \n\t"                   \
   " adde   %[mid],%[mid],16    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i)                                        \
   : "16", "cc");
00813 
/* SQRADD2(i, j): add the product i*j into c2:c1:c0 twice (the doubled
 * cross terms of a square).  Registers 16/17 hold the low/high product. */
#define SQRADD2(i, j)                                   \
__asm__(                                                \
   " mullw  16,%[a],%[b]        \n\t"                   \
   " mulhwu 17,%[a],%[b]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " adde   %[mid],%[mid],17    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " adde   %[mid],%[mid],17    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i), [b] "r" (j)                           \
   : "16", "17", "cc");
00826 
/* SQRADDSC(i, j): start the secondary accumulator: sc1:sc0 = i*j, sc2 = 0.
 * sc0..sc2 are declared read-write even though every one of them is
 * overwritten by the sequence. */
#define SQRADDSC(i, j)                                  \
__asm__(                                                \
   " mullw  %[lo],%[a],%[b]     \n\t"                   \
   " mulhwu %[mid],%[a],%[b]    \n\t"                   \
   " xor    %[hi],%[hi],%[hi]   \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "cc");
00833 
/* SQRADDAC(i, j): add the product i*j into the secondary accumulator
 * sc2:sc1:sc0.  Register 16 is scratch. */
#define SQRADDAC(i, j)                                  \
__asm__(                                                \
   " mullw  16,%[a],%[b]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " mulhwu 16,%[a],%[b]        \n\t"                   \
   " adde   %[mid],%[mid],16    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "16", "cc");
00842 
/* SQRADDDB: add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice,
 * i.e. c += 2 * sc, with carries propagated between digits. */
#define SQRADDDB                                        \
__asm__(                                                \
   " addc   %[lo],%[lo],%[s0]   \n\t"                   \
   " adde   %[mid],%[mid],%[s1] \n\t"                   \
   " adde   %[hi],%[hi],%[s2]   \n\t"                   \
   " addc   %[lo],%[lo],%[s0]   \n\t"                   \
   " adde   %[mid],%[mid],%[s1] \n\t"                   \
   " adde   %[hi],%[hi],%[s2]   \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [s0] "r" (sc0), [s1] "r" (sc1), [s2] "r" (sc2)     \
   : "cc");
00852 
00853 #elif defined(TFM_PPC64)
00854 /* PPC64 */
00855 
/* start-of-pass hook; expands to nothing on PPC64 */
#define COMBA_START

/* zero the three-digit accumulator c2:c1:c0 */
#define CLEAR_CARRY     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* slide the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* end-of-pass hook; expands to nothing on PPC64 */
#define COMBA_FINI
00871 
/* SQRADD(i, j): add the square i*i into the accumulator c2:c1:c0.
 * Only i is referenced; j is ignored.  Register 16 is scratch for the
 * low and then the high product doubleword. */
#define SQRADD(i, j)                                    \
__asm__(                                                \
   " mulld  16,%[a],%[a]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " mulhdu 16,%[a],%[a]        \n\t"                   \
   " adde   %[mid],%[mid],16    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i)                                        \
   : "16", "cc");
00881 
/* SQRADD2(i, j): add the product i*j into c2:c1:c0 twice (the doubled
 * cross terms of a square).  Registers 16/17 hold the low/high product. */
#define SQRADD2(i, j)                                   \
__asm__(                                                \
   " mulld  16,%[a],%[b]        \n\t"                   \
   " mulhdu 17,%[a],%[b]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " adde   %[mid],%[mid],17    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " adde   %[mid],%[mid],17    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i), [b] "r" (j)                           \
   : "16", "17", "cc");
00894 
/* SQRADDSC(i, j): start the secondary accumulator: sc1:sc0 = i*j, sc2 = 0.
 * sc0..sc2 are declared read-write even though every one of them is
 * overwritten by the sequence. */
#define SQRADDSC(i, j)                                  \
__asm__(                                                \
   " mulld  %[lo],%[a],%[b]     \n\t"                   \
   " mulhdu %[mid],%[a],%[b]    \n\t"                   \
   " xor    %[hi],%[hi],%[hi]   \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "cc");
00901 
/* SQRADDAC(i, j): add the product i*j into the secondary accumulator
 * sc2:sc1:sc0.  Register 16 is scratch. */
#define SQRADDAC(i, j)                                  \
__asm__(                                                \
   " mulld  16,%[a],%[b]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " mulhdu 16,%[a],%[b]        \n\t"                   \
   " adde   %[mid],%[mid],16    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "16", "cc");
00910 
/* SQRADDDB: add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice,
 * i.e. c += 2 * sc, with carries propagated between digits. */
#define SQRADDDB                                        \
__asm__(                                                \
   " addc   %[lo],%[lo],%[s0]   \n\t"                   \
   " adde   %[mid],%[mid],%[s1] \n\t"                   \
   " adde   %[hi],%[hi],%[s2]   \n\t"                   \
   " addc   %[lo],%[lo],%[s0]   \n\t"                   \
   " adde   %[mid],%[mid],%[s1] \n\t"                   \
   " adde   %[hi],%[hi],%[s2]   \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [s0] "r" (sc0), [s1] "r" (sc1), [s2] "r" (sc2)     \
   : "cc");
00920 
00921 
00922 #elif defined(TFM_AVR32)
00923 
00924 /* AVR32 */
00925 
/* start-of-pass hook; expands to nothing on AVR32 */
#define COMBA_START

/* zero the three-digit accumulator c2:c1:c0 */
#define CLEAR_CARRY     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* slide the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* end-of-pass hook; expands to nothing on AVR32 */
#define COMBA_FINI
00941 
/* SQRADD(i, j): add the square i*i into the accumulator c2:c1:c0.
 * Only i is referenced; j is ignored.  mulu.d leaves the 64-bit product in
 * the r2/r3 pair.  The add/adc/acr chain communicates through the carry
 * flag, so the condition codes are declared clobbered (as SQRADDDB in this
 * section already does). */
#define SQRADD(i, j)                                    \
__asm__(                                                \
   " mulu.d r2,%[a],%[a]        \n\t"                   \
   " add    %[lo],%[lo],r2      \n\t"                   \
   " adc    %[mid],%[mid],r3    \n\t"                   \
   " acr    %[hi]               \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i)                                        \
   : "r2", "r3", "cc");
00950 
/* SQRADD2(i, j): add the product i*j into c2:c1:c0 twice (the doubled
 * cross terms of a square).  mulu.d leaves the 64-bit product in r2/r3.
 *
 * Fixes: the two "acr" lines previously carried a stray trailing comma
 * ("acr %2,"), which is not valid assembler syntax; the condition codes
 * are now declared clobbered because the chain relies on the carry flag. */
#define SQRADD2(i, j)                                   \
__asm__(                                                \
   " mulu.d r2,%[a],%[b]        \n\t"                   \
   " add    %[lo],%[lo],r2      \n\t"                   \
   " adc    %[mid],%[mid],r3    \n\t"                   \
   " acr    %[hi]               \n\t"                   \
   " add    %[lo],%[lo],r2      \n\t"                   \
   " adc    %[mid],%[mid],r3    \n\t"                   \
   " acr    %[hi]               \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i), [b] "r" (j)                           \
   : "r2", "r3", "cc");
00962 
/* SQRADDSC(i, j): start the secondary accumulator: sc1:sc0 = i*j, sc2 = 0.
 * The product is formed in r2/r3 and copied out; eor clears sc2.
 * sc0..sc2 are declared read-write even though all three are overwritten.
 * "cc" is listed because eor updates the status flags (SQRADDDB in this
 * section declares it as well). */
#define SQRADDSC(i, j)                                  \
__asm__(                                                \
   " mulu.d r2,%[a],%[b]        \n\t"                   \
   " mov    %[lo],r2            \n\t"                   \
   " mov    %[mid],r3           \n\t"                   \
   " eor    %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "r2", "r3", "cc");
00970 
/* SQRADDAC(i, j): add the product i*j into the secondary accumulator
 * sc2:sc1:sc0.  mulu.d leaves the 64-bit product in r2/r3.  The condition
 * codes are declared clobbered because the add/adc/acr chain uses the
 * carry flag (SQRADDDB in this section declares it as well). */
#define SQRADDAC(i, j)                                  \
__asm__(                                                \
   " mulu.d r2,%[a],%[b]        \n\t"                   \
   " add    %[lo],%[lo],r2      \n\t"                   \
   " adc    %[mid],%[mid],r3    \n\t"                   \
   " acr    %[hi]               \n\t"                   \
   : [lo] "+r" (sc0), [mid] "+r" (sc1), [hi] "+r" (sc2) \
   : [a] "r" (i), [b] "r" (j)                           \
   : "r2", "r3", "cc");
00978 
/* SQRADDDB: add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice,
 * i.e. c += 2 * sc, with carries propagated between digits. */
#define SQRADDDB                                        \
__asm__(                                                \
   " add    %[lo],%[lo],%[s0]   \n\t"                   \
   " adc    %[mid],%[mid],%[s1] \n\t"                   \
   " adc    %[hi],%[hi],%[s2]   \n\t"                   \
   " add    %[lo],%[lo],%[s0]   \n\t"                   \
   " adc    %[mid],%[mid],%[s1] \n\t"                   \
   " adc    %[hi],%[hi],%[s2]   \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [s0] "r" (sc0), [s1] "r" (sc1), [s2] "r" (sc2)     \
   : "cc");
00988 
00989 
00990 #else
00991 
/* marks that the portable (non-assembly) squaring macros are in use */
#define TFM_ISO

/* Portable ISO C fallback */

/* start-of-pass hook; expands to nothing */
#define COMBA_START

/* zero the three-digit accumulator c2:c1:c0 */
#define CLEAR_CARRY     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* slide the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* end-of-pass hook; expands to nothing */
#define COMBA_FINI
01011 
/* SQRADD(i, j): add the double-width product i*j into the three-digit
 * accumulator c2:c1:c0, carrying from each digit into the next. */
#define SQRADD(i, j)                                             \
   do {                                                          \
      fp_word sqradd_w_;                                         \
      sqradd_w_ = ((fp_word)i) * ((fp_word)j) + c0;              \
      c0 = (fp_digit)sqradd_w_;                                  \
      sqradd_w_ = (sqradd_w_ >> DIGIT_BIT) + c1;                 \
      c1 = (fp_digit)sqradd_w_;                                  \
      c2 += (fp_digit)(sqradd_w_ >> DIGIT_BIT);                  \
   } while (0);
01019   
01020 
/* SQRADD2(i, j): add the product i*j into c2:c1:c0 twice (the doubled
 * cross terms of a square).  Uses the fp_word variable `tt`, which must be
 * declared by the code that expands this macro. */
#define SQRADD2(i, j)                                            \
   do {                                                          \
      fp_word sqradd2_p_;                                        \
      sqradd2_p_ = ((fp_word)i) * ((fp_word)j);                  \
      tt = (fp_word)c0 + sqradd2_p_;                             \
      c0 = (fp_digit)tt;                                         \
      tt = (fp_word)c1 + (tt >> DIGIT_BIT);                      \
      c1 = (fp_digit)tt;                                         \
      c2 += (fp_digit)(tt >> DIGIT_BIT);                         \
      tt = (fp_word)c0 + sqradd2_p_;                             \
      c0 = (fp_digit)tt;                                         \
      tt = (fp_word)c1 + (tt >> DIGIT_BIT);                      \
      c1 = (fp_digit)tt;                                         \
      c2 += (fp_digit)(tt >> DIGIT_BIT);                         \
   } while (0);
01032 
/* SQRADDSC(i, j): start the secondary accumulator with a single product:
 * sc0 gets the low digit of i*j, sc1 the high digit, and sc2 is zeroed. */
#define SQRADDSC(i, j)                                           \
   do {                                                          \
      fp_word sqraddsc_p_;                                       \
      sqraddsc_p_ = ((fp_word)i) * ((fp_word)j);                 \
      sc0 = (fp_digit)sqraddsc_p_;                               \
      sc1 = (sqraddsc_p_ >> DIGIT_BIT);                          \
      sc2 = 0;                                                   \
   } while (0);
01038 
/* SQRADDAC(i, j): add the double-width product i*j into the secondary
 * accumulator sc2:sc1:sc0, carrying from each digit into the next. */
#define SQRADDAC(i, j)                                           \
   do {                                                          \
      fp_word sqraddac_w_;                                       \
      sqraddac_w_ = ((fp_word)i) * ((fp_word)j) + sc0;           \
      sc0 = (fp_digit)sqraddac_w_;                               \
      sqraddac_w_ = (sqraddac_w_ >> DIGIT_BIT) + sc1;            \
      sc1 = (fp_digit)sqraddac_w_;                               \
      sc2 += (fp_digit)(sqraddac_w_ >> DIGIT_BIT);               \
   } while (0);
01045 
/* SQRADDDB: fold the secondary accumulator into the main one twice,
 * c2:c1:c0 += 2 * sc2:sc1:sc0, carrying between digits. */
#define SQRADDDB                                                      \
   do {                                                               \
      fp_word sqradddb_w_;                                            \
      sqradddb_w_ = ((fp_word)sc0) + ((fp_word)sc0) + c0;             \
      c0 = (fp_digit)sqradddb_w_;                                     \
      sqradddb_w_ = ((fp_word)sc1) + ((fp_word)sc1) + c1              \
                  + (sqradddb_w_ >> DIGIT_BIT);                       \
      c1 = (fp_digit)sqradddb_w_;                                     \
      c2 += (fp_digit)(((fp_word)sc2) + ((fp_word)sc2)                \
                       + (sqradddb_w_ >> DIGIT_BIT));                 \
   } while (0);
01053 
01054 #endif
01055 
01056 #ifdef TFM_SMALL_SET
01057     #include "fp_sqr_comba_small_set.i"
01058 #endif
01059 
01060 #if defined(TFM_SQR3)
01061     #include "fp_sqr_comba_3.i"
01062 #endif
01063 #if defined(TFM_SQR4)
01064     #include "fp_sqr_comba_4.i"
01065 #endif
01066 #if defined(TFM_SQR6)
01067     #include "fp_sqr_comba_6.i"
01068 #endif
01069 #if defined(TFM_SQR7)
01070     #include "fp_sqr_comba_7.i"
01071 #endif
01072 #if defined(TFM_SQR8)
01073     #include "fp_sqr_comba_8.i"
01074 #endif
01075 #if defined(TFM_SQR9)
01076     #include "fp_sqr_comba_9.i"
01077 #endif
01078 #if defined(TFM_SQR12)
01079     #include "fp_sqr_comba_12.i"
01080 #endif
01081 #if defined(TFM_SQR17)
01082     #include "fp_sqr_comba_17.i"
01083 #endif
01084 #if defined(TFM_SQR20)
01085     #include "fp_sqr_comba_20.i"
01086 #endif
01087 #if defined(TFM_SQR24)
01088     #include "fp_sqr_comba_24.i"
01089 #endif
01090 #if defined(TFM_SQR28)
01091     #include "fp_sqr_comba_28.i"
01092 #endif
01093 #if defined(TFM_SQR32)
01094     #include "fp_sqr_comba_32.i"
01095 #endif
01096 #if defined(TFM_SQR48)
01097     #include "fp_sqr_comba_48.i"
01098 #endif
01099 #if defined(TFM_SQR64)
01100     #include "fp_sqr_comba_64.i"
01101 #endif
01102 /* end fp_sqr_comba.c asm */
01103 
01104 /* start fp_mul_comba.c asm */
01105 /* these are the combas.  Worship them. */
01106 #if defined(TFM_X86)
01107 /* Generic x86 optimized code */
01108 
/* start-of-pass hook; nothing needed for plain x86 */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the first sum (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the second sum, i.e. the carry (c1), to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; nothing needed for plain x86 */
#define COMBA_FINI
01130 
/* MULADD(i, j): add the 64-bit product i*j into the accumulator c2:c1:c0.
 * i and j are taken as memory operands; the product is formed in edx:eax. */
#define MULADD(i, j)                                      \
__asm__(                                                  \
     "movl  %[a],%%eax      \n\t"                         \
     "mull  %[b]            \n\t"                         \
     "addl  %%eax,%[lo]     \n\t"                         \
     "adcl  %%edx,%[mid]    \n\t"                         \
     "adcl  $0,%[hi]        \n\t"                         \
     : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
     : [a] "m" (i), [b] "m" (j)                           \
     : "%eax", "%edx", "cc");
01140 
01141 #elif defined(TFM_X86_64)
01142 /* x86-64 optimized */
01143 
/* start-of-pass hook; nothing needed for x86-64 */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the first sum (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the second sum, i.e. the carry (c1), to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; nothing needed for x86-64 */
#define COMBA_FINI
01165 
/* MULADD(i, j): add the 128-bit product i*j into the accumulator c2:c1:c0.
 * The product is formed in rdx:rax.  i and j must be 64-bit operands.
 *
 * j is constrained to "rm" rather than "g": mulq has no immediate form, so
 * a constant multiplier must be placed in a register or in memory.  i keeps
 * "g" because movq can load an immediate. */
#define MULADD(i, j)                                      \
__asm__(                                                  \
     "movq  %[a],%%rax      \n\t"                         \
     "mulq  %[b]            \n\t"                         \
     "addq  %%rax,%[lo]     \n\t"                         \
     "adcq  %%rdx,%[mid]    \n\t"                         \
     "adcq  $0,%[hi]        \n\t"                         \
     : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
     : [a] "g" (i), [b] "rm" (j)                          \
     : "%rax", "%rdx", "cc");
01175 
01176 #elif defined(TFM_SSE2)
01177 /* use SSE2 optimizations */
01178 
/* start-of-pass hook; nothing needed for the SSE2 variant */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the first sum (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the second sum, i.e. the carry (c1), to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook: issue emms once the MMX-register work is finished */
#define COMBA_FINI __asm__("emms");
01201 
/* MULADD(i, j): add the 64-bit product i*j into the accumulator c2:c1:c0.
 * i and j are loaded from memory into mm0/mm1, multiplied with pmuludq,
 * and the two product halves are moved through eax into the add/adc chain.
 *
 * mm0 and mm1 are written by the sequence, so they are listed as clobbers
 * alongside eax and the flags.  The caller is expected to issue emms
 * (see COMBA_FINI) once the pass is complete. */
#define MULADD(i, j)                                      \
__asm__(                                                  \
    "movd  %[a],%%mm0       \n\t"                         \
    "movd  %[b],%%mm1       \n\t"                         \
    "pmuludq %%mm1,%%mm0    \n\t"                         \
    "movd  %%mm0,%%eax      \n\t"                         \
    "psrlq $32,%%mm0        \n\t"                         \
    "addl  %%eax,%[lo]      \n\t"                         \
    "movd  %%mm0,%%eax      \n\t"                         \
    "adcl  %%eax,%[mid]     \n\t"                         \
    "adcl  $0,%[hi]         \n\t"                         \
    : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)     \
    : [a] "m" (i), [b] "m" (j)                            \
    : "%eax", "%mm0", "%mm1", "cc");
01215 
01216 #elif defined(TFM_ARM)
01217 /* ARM code */
01218 
/* start-of-pass hook; expands to nothing on ARM */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; expands to nothing on ARM */
#define COMBA_FINI
01234 
/* MULADD(i, j): add the 64-bit product i*j into the accumulator c2:c1:c0.
 * r0/r1 receive the low/high product words. */
#define MULADD(i, j)                                          \
__asm__(                                                      \
   "  UMULL  r0,r1,%[a],%[b]        \n\t"                     \
   "  ADDS   %[lo],%[lo],r0         \n\t"                     \
   "  ADCS   %[mid],%[mid],r1       \n\t"                     \
   "  ADC    %[hi],%[hi],#0         \n\t"                     \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)          \
   : [a] "r" (i), [b] "r" (j)                                 \
   : "r0", "r1", "cc");
01242 
01243 #elif defined(TFM_PPC32)
01244 /* For 32-bit PPC */
01245 
/* start-of-pass hook; expands to nothing on PPC32 */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; expands to nothing on PPC32 */
#define COMBA_FINI
01261    
/* MULADD(i, j): add the 64-bit product i*j into the accumulator c2:c1:c0.
 * Register 16 is scratch for the low and then the high product word.
 * Marked untested by the original author: the sequence relies on mulhwu
 * leaving the carry from the preceding addc intact (docs say it does). */
#define MULADD(i, j)                                    \
__asm__(                                                \
   " mullw  16,%[a],%[b]        \n\t"                   \
   " addc   %[lo],%[lo],16      \n\t"                   \
   " mulhwu 16,%[a],%[b]        \n\t"                   \
   " adde   %[mid],%[mid],16    \n\t"                   \
   " addze  %[hi],%[hi]         \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i), [b] "r" (j)                           \
   : "16");
01271 
01272 #elif defined(TFM_PPC64)
01273 /* For 64-bit PPC */
01274 
/* start-of-pass hook; expands to nothing on PPC64 */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; expands to nothing on PPC64 */
#define COMBA_FINI
01290    
/* MULADD(i, j): add the 128-bit product i*j into the accumulator c2:c1:c0.
 * Register 16 is scratch for the low and then the high product doubleword.
 * Marked untested by the original author: the sequence relies on mulhdu
 * leaving the carry from the preceding addc intact (docs say it does).
 *
 * Fix: the asm keyword was spelled "____asm__" (four leading underscores),
 * which is not a recognised keyword and failed to compile. */
#define MULADD(i, j)              \
__asm__(                          \
   " mulld  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
01300 
01301 #elif defined(TFM_AVR32)
01302 
01303 /* AVR32 code */
01304 
/* start-of-pass hook; expands to nothing on AVR32 */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; expands to nothing on AVR32 */
#define COMBA_FINI
01320    
/* MULADD(i, j): add the 64-bit product i*j into the accumulator c2:c1:c0.
 * mulu.d leaves the product in the r2/r3 pair.
 *
 * Fixes: the asm keyword was spelled "____asm__" (four leading
 * underscores) and failed to compile; the condition codes are now declared
 * clobbered because the add/adc/acr chain relies on the carry flag. */
#define MULADD(i, j)                                    \
__asm__(                                                \
   " mulu.d r2,%[a],%[b]        \n\t"                   \
   " add    %[lo],r2            \n\t"                   \
   " adc    %[mid],%[mid],r3    \n\t"                   \
   " acr    %[hi]               \n\t"                   \
   : [lo] "+r" (c0), [mid] "+r" (c1), [hi] "+r" (c2)    \
   : [a] "r" (i), [b] "r" (j)                           \
   : "r2", "r3", "cc");
01328 
01329 #else
01330 /* ISO C code */
01331 
/* start-of-pass hook; expands to nothing in the portable build */
#define COMBA_START

/* zero the chaining variables c2:c1:c0 */
#define COMBA_CLEAR     \
   do {                 \
      c0 = 0;           \
      c1 = 0;           \
      c2 = 0;           \
   } while (0);

/* move the carry on to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD   \
   do {                 \
      c0 = c1;          \
      c1 = c2;          \
      c2 = 0;           \
   } while (0);

/* write the low accumulator digit (c0) to x */
#define COMBA_STORE(x)  (x) = c0;

/* write the middle accumulator digit (c1) to x */
#define COMBA_STORE2(x) (x) = c1;

/* end-of-pass hook; expands to nothing in the portable build */
#define COMBA_FINI
01347    
/* MULADD(i, j): add the double-width product i*j into the three-digit
 * accumulator c2:c1:c0, carrying from each digit into the next. */
#define MULADD(i, j)                                             \
   do {                                                          \
      fp_word muladd_w_;                                         \
      muladd_w_ = ((fp_word)i) * ((fp_word)j) + (fp_word)c0;     \
      c0 = (fp_digit)muladd_w_;                                  \
      muladd_w_ = (muladd_w_ >> DIGIT_BIT) + (fp_word)c1;        \
      c1 = (fp_digit)muladd_w_;                                  \
      c2 += (fp_digit)(muladd_w_ >> DIGIT_BIT);                  \
   } while (0);
01354 
01355 #endif
01356 
01357 
01358 #ifdef TFM_SMALL_SET
01359     #include "fp_mul_comba_small_set.i"
01360 #endif
01361 
01362 #if defined(TFM_MUL3)
01363     #include "fp_mul_comba_3.i"
01364 #endif
01365 #if defined(TFM_MUL4)
01366     #include "fp_mul_comba_4.i"
01367 #endif
01368 #if defined(TFM_MUL6)
01369     #include "fp_mul_comba_6.i"
01370 #endif
01371 #if defined(TFM_MUL7)
01372     #include "fp_mul_comba_7.i"
01373 #endif
01374 #if defined(TFM_MUL8)
01375     #include "fp_mul_comba_8.i"
01376 #endif
01377 #if defined(TFM_MUL9)
01378     #include "fp_mul_comba_9.i"
01379 #endif
01380 #if defined(TFM_MUL12)
01381     #include "fp_mul_comba_12.i"
01382 #endif
01383 #if defined(TFM_MUL17)
01384     #include "fp_mul_comba_17.i"
01385 #endif
01386 #if defined(TFM_MUL20)
01387     #include "fp_mul_comba_20.i"
01388 #endif
01389 #if defined(TFM_MUL24)
01390     #include "fp_mul_comba_24.i"
01391 #endif
01392 #if defined(TFM_MUL28)
01393     #include "fp_mul_comba_28.i"
01394 #endif
01395 #if defined(TFM_MUL32)
01396     #include "fp_mul_comba_32.i"
01397 #endif
01398 #if defined(TFM_MUL48)
01399     #include "fp_mul_comba_48.i"
01400 #endif
01401 #if defined(TFM_MUL64)
01402     #include "fp_mul_comba_64.i"
01403 #endif
01404 
01405 /* end fp_mul_comba.c asm */
01406