Francois Berder / cyassl-lib

Dependents:   TLS_cyassl TLS_cyassl

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers asm.c Source File

asm.c

00001 /* asm.c
00002  *
00003  * Copyright (C) 2006-2013 wolfSSL Inc.
00004  *
00005  * This file is part of CyaSSL.
00006  *
00007  * CyaSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * CyaSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
00020  */
00021 
00022 #ifdef HAVE_CONFIG_H
00023     #include <config.h>
00024 #endif
00025 
00026 #include <cyassl/ctaocrypt/settings.h>
00027 
00028 /*
00029  * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
00030  * http://math.libtomcrypt.com
00031  */
00032 
00033 
00034 /******************************************************************/
00035 /* fp_montgomery_reduce.c asm or generic */
00036 #if defined(TFM_X86) && !defined(TFM_SSE2) 
00037 /* x86-32 code */
00038 
/* x86-32 hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00039 #define MONT_START 
00040 #define MONT_FINI
00041 #define LOOP_END
00042 #define LOOP_START \
00043    mu = c[x] * mp
00044 
/* One multiply-accumulate step (x86-32).
 * Operands: %0/%2 = _c[LO], %1/%3 = cy, %4 = mu, %5 = *tmpm++ (the post-
 * increment of tmpm happens in the operand expression).
 *   edx:eax  = (*tmpm) * mu
 *   edx:eax += cy
 *   _c[LO]  += eax, with the carry of that add folded into edx
 *   cy       = edx
 * Clobbers eax, edx and the flags. */
00045 #define INNERMUL                                          \
00046 __asm__(                                                      \
00047    "movl %5,%%eax \n\t"                                   \
00048    "mull %4       \n\t"                                   \
00049    "addl %1,%%eax \n\t"                                   \
00050    "adcl $0,%%edx \n\t"                                   \
00051    "addl %%eax,%0 \n\t"                                   \
00052    "adcl $0,%%edx \n\t"                                   \
00053    "movl %%edx,%1 \n\t"                                   \
00054 :"=g"(_c[LO]), "=r"(cy)                                   \
00055 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++)              \
00056 : "%eax", "%edx", "cc")
00057 
/* Carry propagation (x86-32): _c[LO] += cy, then cy receives the carry-out
 * of that addition (0 or 1).  The carry flag is captured with setb into al
 * and zero-extended into cy, which is why eax is listed as clobbered. */
00058 #define PROPCARRY                           \
00059 __asm__(                                        \
00060    "addl   %1,%0    \n\t"                   \
00061    "setb   %%al     \n\t"                   \
00062    "movzbl %%al,%1 \n\t"                    \
00063 :"=g"(_c[LO]), "=r"(cy)                     \
00064 :"0"(_c[LO]), "1"(cy)                       \
00065 : "%eax", "cc")
00066 
00067 /******************************************************************/
00068 #elif defined(TFM_X86_64)
00069 /* x86-64 code */
00070 
/* x86-64 hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00071 #define MONT_START 
00072 #define MONT_FINI
00073 #define LOOP_END
00074 #define LOOP_START \
00075    mu = c[x] * mp
00076 
/* One multiply-accumulate step (x86-64).
 * Operands: %0/%2 = _c[LO], %1/%3 = cy, %4 = mu, %5 = *tmpm++ (the post-
 * increment of tmpm happens in the operand expression).
 *   rdx:rax  = (*tmpm) * mu
 *   rdx:rax += cy
 *   _c[LO]  += rax, with the carry of that add folded into rdx
 *   cy       = rdx
 * Clobbers rax, rdx and the flags. */
00077 #define INNERMUL                                          \
00078 __asm__(                                                      \
00079    "movq %5,%%rax \n\t"                                   \
00080    "mulq %4       \n\t"                                   \
00081    "addq %1,%%rax \n\t"                                   \
00082    "adcq $0,%%rdx \n\t"                                   \
00083    "addq %%rax,%0 \n\t"                                   \
00084    "adcq $0,%%rdx \n\t"                                   \
00085    "movq %%rdx,%1 \n\t"                                   \
00086 :"=g"(_c[LO]), "=r"(cy)                                   \
00087 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
00088 : "%rax", "%rdx", "cc")
00089 
/* Eight unrolled multiply-accumulate steps (x86-64).
 * Operands: %0/%2 = _c (pointer), %1/%3 = cy, %4 = mu, %5 = tmpm (pointer).
 * For each k in 0..7 (byte offsets 0x0..0x38):
 *   rdx:rax = tmpm[k] * mu + _c[k] + cy
 *   _c[k]   = rax
 *   cy      = rdx
 * r10 and r11 hold the next _c[] and tmpm[] digits, loaded one step ahead.
 * The template does not advance _c or tmpm itself.
 * Clobbers rax, rdx, r10, r11 and the flags.  "memory" is declared because
 * the template reads and writes memory through the _c and tmpm pointers,
 * which are only passed as register operands; without it the compiler is
 * not told that the digits behind those pointers are accessed. */
00090 #define INNERMUL8 \
00091  __asm__(                  \
00092  "movq 0(%5),%%rax    \n\t"  \
00093  "movq 0(%2),%%r10    \n\t"  \
00094  "movq 0x8(%5),%%r11  \n\t"  \
00095  "mulq %4             \n\t"  \
00096  "addq %%r10,%%rax    \n\t"  \
00097  "adcq $0,%%rdx       \n\t"  \
00098  "movq 0x8(%2),%%r10  \n\t"  \
00099  "addq %3,%%rax       \n\t"  \
00100  "adcq $0,%%rdx       \n\t"  \
00101  "movq %%rax,0(%0)    \n\t"  \
00102  "movq %%rdx,%1       \n\t"  \
00103  \
00104  "movq %%r11,%%rax    \n\t"  \
00105  "movq 0x10(%5),%%r11 \n\t"  \
00106  "mulq %4             \n\t"  \
00107  "addq %%r10,%%rax    \n\t"  \
00108  "adcq $0,%%rdx       \n\t"  \
00109  "movq 0x10(%2),%%r10 \n\t"  \
00110  "addq %3,%%rax       \n\t"  \
00111  "adcq $0,%%rdx       \n\t"  \
00112  "movq %%rax,0x8(%0)  \n\t"  \
00113  "movq %%rdx,%1       \n\t"  \
00114  \
00115  "movq %%r11,%%rax    \n\t"  \
00116  "movq 0x18(%5),%%r11 \n\t"  \
00117  "mulq %4             \n\t"  \
00118  "addq %%r10,%%rax    \n\t"  \
00119  "adcq $0,%%rdx       \n\t"  \
00120  "movq 0x18(%2),%%r10 \n\t"  \
00121  "addq %3,%%rax       \n\t"  \
00122  "adcq $0,%%rdx       \n\t"  \
00123  "movq %%rax,0x10(%0) \n\t"  \
00124  "movq %%rdx,%1       \n\t"  \
00125  \
00126  "movq %%r11,%%rax    \n\t"  \
00127  "movq 0x20(%5),%%r11 \n\t"  \
00128  "mulq %4             \n\t"  \
00129  "addq %%r10,%%rax    \n\t"  \
00130  "adcq $0,%%rdx       \n\t"  \
00131  "movq 0x20(%2),%%r10 \n\t"  \
00132  "addq %3,%%rax       \n\t"  \
00133  "adcq $0,%%rdx       \n\t"  \
00134  "movq %%rax,0x18(%0) \n\t"  \
00135  "movq %%rdx,%1       \n\t"  \
00136  \
00137  "movq %%r11,%%rax    \n\t"  \
00138  "movq 0x28(%5),%%r11 \n\t"  \
00139  "mulq %4             \n\t"  \
00140  "addq %%r10,%%rax    \n\t"  \
00141  "adcq $0,%%rdx       \n\t"  \
00142  "movq 0x28(%2),%%r10 \n\t"  \
00143  "addq %3,%%rax       \n\t"  \
00144  "adcq $0,%%rdx       \n\t"  \
00145  "movq %%rax,0x20(%0) \n\t"  \
00146  "movq %%rdx,%1       \n\t"  \
00147  \
00148  "movq %%r11,%%rax    \n\t"  \
00149  "movq 0x30(%5),%%r11 \n\t"  \
00150  "mulq %4             \n\t"  \
00151  "addq %%r10,%%rax    \n\t"  \
00152  "adcq $0,%%rdx       \n\t"  \
00153  "movq 0x30(%2),%%r10 \n\t"  \
00154  "addq %3,%%rax       \n\t"  \
00155  "adcq $0,%%rdx       \n\t"  \
00156  "movq %%rax,0x28(%0) \n\t"  \
00157  "movq %%rdx,%1       \n\t"  \
00158  \
00159  "movq %%r11,%%rax    \n\t"  \
00160  "movq 0x38(%5),%%r11 \n\t"  \
00161  "mulq %4             \n\t"  \
00162  "addq %%r10,%%rax    \n\t"  \
00163  "adcq $0,%%rdx       \n\t"  \
00164  "movq 0x38(%2),%%r10 \n\t"  \
00165  "addq %3,%%rax       \n\t"  \
00166  "adcq $0,%%rdx       \n\t"  \
00167  "movq %%rax,0x30(%0) \n\t"  \
00168  "movq %%rdx,%1       \n\t"  \
00169  \
00170  "movq %%r11,%%rax    \n\t"  \
00171  "mulq %4             \n\t"  \
00172  "addq %%r10,%%rax    \n\t"  \
00173  "adcq $0,%%rdx       \n\t"  \
00174  "addq %3,%%rax       \n\t"  \
00175  "adcq $0,%%rdx       \n\t"  \
00176  "movq %%rax,0x38(%0) \n\t"  \
00177  "movq %%rdx,%1       \n\t"  \
00178  \
00179 :"=r"(_c), "=r"(cy)                    \
00180 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
00181 : "%rax", "%rdx", "%r10", "%r11", "cc", "memory")
00182 
00183 
/* Carry propagation (x86-64): _c[LO] += cy, then cy receives the carry-out
 * of that addition (0 or 1).  The carry flag is captured with setb into al
 * and zero-extended into cy, which is why rax is listed as clobbered. */
00184 #define PROPCARRY                           \
00185 __asm__(                                        \
00186    "addq   %1,%0    \n\t"                   \
00187    "setb   %%al     \n\t"                   \
00188    "movzbq %%al,%1 \n\t"                    \
00189 :"=g"(_c[LO]), "=r"(cy)                     \
00190 :"0"(_c[LO]), "1"(cy)                       \
00191 : "%rax", "cc")
00192 
00193 /******************************************************************/
00194 #elif defined(TFM_SSE2)  
00195 /* SSE2 code (assumes 32-bit fp_digits) */
00196 /* MMX register assignments:
00197  * mm0  *tmpm++, then Mu * (*tmpm++)
00198  * mm1  c[x], then Mu
00199  * mm2  mp
00200  * mm3  cy
00201  * mm4  _c[LO]
00202  */
00203 
/* MONT_START loads mp into mm2, where it stays for the following asm
 * statements of this section.  MONT_FINI issues emms once the MMX
 * registers are no longer needed. */
00204 #define MONT_START \
00205    __asm__("movd %0,%%mm2"::"g"(mp))
00206 
00207 #define MONT_FINI \
00208    __asm__("emms")
00209 
/* Start of one outer iteration: mm1 = c[x], mm3 = 0 (running carry), then
 * mm1 = mm1 * mm2 via pmuludq.  mm2 is the register MONT_START loads mp
 * into, so mm1 ends up holding the product used as mu by INNERMUL. */
00210 #define LOOP_START          \
00211 __asm__(                        \
00212 "movd %0,%%mm1        \n\t" \
00213 "pxor %%mm3,%%mm3     \n\t" \
00214 "pmuludq %%mm2,%%mm1  \n\t" \
00215 :: "g"(c[x]))
00216 
00217 /* pmuludq on mmx registers does a 32x32->64 multiply. */
/* One multiply-accumulate step (MMX registers, 32-bit digits).
 * Operands: %0/%1 = _c[LO], %2 = *tmpm++ (post-increment happens in the
 * operand expression).
 *   mm3 += _c[LO]
 *   mm3 += mm1 * (*tmpm)      (mm1 is set by LOOP_START)
 *   _c[LO] = low 32 bits of mm3
 *   mm3 >>= 32                (carry kept in mm3 for the next step)
 * The expansion already ends with ';'. */
00218 #define INNERMUL               \
00219 __asm__(                           \
00220    "movd %1,%%mm4        \n\t" \
00221    "movd %2,%%mm0        \n\t" \
00222    "paddq %%mm4,%%mm3    \n\t" \
00223    "pmuludq %%mm1,%%mm0  \n\t" \
00224    "paddq %%mm0,%%mm3    \n\t" \
00225    "movd %%mm3,%0        \n\t" \
00226    "psrlq $32, %%mm3     \n\t" \
00227 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
00228 
/* Eight unrolled multiply-accumulate steps (MMX registers, 32-bit digits).
 * Operands: %0/%1 = _c (pointer), %2 = tmpm (pointer).
 * For each k in 0..7 (byte offsets 0..28):
 *   mm3 += _c[k]
 *   mm3 += mm1 * tmpm[k]
 *   _c[k] = low 32 bits of mm3
 *   mm3 >>= 32
 * mm0 and mm4..mm7 are used as temporaries, with loads for the next digit
 * interleaved into the current step.  The template does not advance _c or
 * tmpm itself.  The expansion already ends with ';'.
 * "memory" is declared because the template reads and writes memory
 * through the _c and tmpm pointers, which are only passed as register
 * operands; without it the compiler is not told that the digits behind
 * those pointers are accessed. */
00229 #define INNERMUL8 \
00230 __asm__(                           \
00231    "movd 0(%1),%%mm4     \n\t" \
00232    "movd 0(%2),%%mm0     \n\t" \
00233    "paddq %%mm4,%%mm3    \n\t" \
00234    "pmuludq %%mm1,%%mm0  \n\t" \
00235    "movd 4(%2),%%mm5     \n\t" \
00236    "paddq %%mm0,%%mm3    \n\t" \
00237    "movd 4(%1),%%mm6     \n\t" \
00238    "movd %%mm3,0(%0)     \n\t" \
00239    "psrlq $32, %%mm3     \n\t" \
00240 \
00241    "paddq %%mm6,%%mm3    \n\t" \
00242    "pmuludq %%mm1,%%mm5  \n\t" \
00243    "movd 8(%2),%%mm6     \n\t" \
00244    "paddq %%mm5,%%mm3    \n\t" \
00245    "movd 8(%1),%%mm7     \n\t" \
00246    "movd %%mm3,4(%0)     \n\t" \
00247    "psrlq $32, %%mm3     \n\t" \
00248 \
00249    "paddq %%mm7,%%mm3    \n\t" \
00250    "pmuludq %%mm1,%%mm6  \n\t" \
00251    "movd 12(%2),%%mm7    \n\t" \
00252    "paddq %%mm6,%%mm3    \n\t" \
00253    "movd 12(%1),%%mm5     \n\t" \
00254    "movd %%mm3,8(%0)     \n\t" \
00255    "psrlq $32, %%mm3     \n\t" \
00256 \
00257    "paddq %%mm5,%%mm3    \n\t" \
00258    "pmuludq %%mm1,%%mm7  \n\t" \
00259    "movd 16(%2),%%mm5    \n\t" \
00260    "paddq %%mm7,%%mm3    \n\t" \
00261    "movd 16(%1),%%mm6    \n\t" \
00262    "movd %%mm3,12(%0)    \n\t" \
00263    "psrlq $32, %%mm3     \n\t" \
00264 \
00265    "paddq %%mm6,%%mm3    \n\t" \
00266    "pmuludq %%mm1,%%mm5  \n\t" \
00267    "movd 20(%2),%%mm6    \n\t" \
00268    "paddq %%mm5,%%mm3    \n\t" \
00269    "movd 20(%1),%%mm7    \n\t" \
00270    "movd %%mm3,16(%0)    \n\t" \
00271    "psrlq $32, %%mm3     \n\t" \
00272 \
00273    "paddq %%mm7,%%mm3    \n\t" \
00274    "pmuludq %%mm1,%%mm6  \n\t" \
00275    "movd 24(%2),%%mm7    \n\t" \
00276    "paddq %%mm6,%%mm3    \n\t" \
00277    "movd 24(%1),%%mm5     \n\t" \
00278    "movd %%mm3,20(%0)    \n\t" \
00279    "psrlq $32, %%mm3     \n\t" \
00280 \
00281    "paddq %%mm5,%%mm3    \n\t" \
00282    "pmuludq %%mm1,%%mm7  \n\t" \
00283    "movd 28(%2),%%mm5    \n\t" \
00284    "paddq %%mm7,%%mm3    \n\t" \
00285    "movd 28(%1),%%mm6    \n\t" \
00286    "movd %%mm3,24(%0)    \n\t" \
00287    "psrlq $32, %%mm3     \n\t" \
00288 \
00289    "paddq %%mm6,%%mm3    \n\t" \
00290    "pmuludq %%mm1,%%mm5  \n\t" \
00291    "paddq %%mm5,%%mm3    \n\t" \
00292    "movd %%mm3,28(%0)    \n\t" \
00293    "psrlq $32, %%mm3     \n\t" \
00294 :"=r"(_c) : "0"(_c), "r"(tmpm) : "memory" );
00295 
00296 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
00297    pointer */
00298 
/* End of the inner loop: copy the low 32 bits of the MMX carry register
 * mm3 into the C variable cy. */
00299 #define LOOP_END \
00300 __asm__( "movd %%mm3,%0  \n" :"=r"(cy))
00301 
/* Carry propagation for the SSE2 build, done with ordinary 32-bit integer
 * instructions: _c[LO] += cy, then cy receives the carry-out of that
 * addition (0 or 1) via setb/movzbl, hence the eax clobber. */
00302 #define PROPCARRY                           \
00303 __asm__(                                        \
00304    "addl   %1,%0    \n\t"                   \
00305    "setb   %%al     \n\t"                   \
00306    "movzbl %%al,%1 \n\t"                    \
00307 :"=g"(_c[LO]), "=r"(cy)                     \
00308 :"0"(_c[LO]), "1"(cy)                       \
00309 : "%eax", "cc")
00310 
00311 /******************************************************************/
00312 #elif defined(TFM_ARM)
00313    /* ARMv4 code */
00314 
/* ARM hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00315 #define MONT_START 
00316 #define MONT_FINI
00317 #define LOOP_END
00318 #define LOOP_START \
00319    mu = c[x] * mp
00320 
/* One multiply-accumulate step (ARM).
 * Operands: %0/%2 = cy, %1/%5 = _c[0] (memory), %3 = mu, %4 = *tmpm++
 * (post-increment happens in the operand expression).
 *   r0 = _c[0] + cy, and cy is replaced by the carry of that add (1 or 0)
 *   UMLAL adds the 64-bit product mu * (*tmpm) to the pair cy:r0
 *   _c[0] = r0
 * Clobbers r0 and the flags.  The expansion already ends with ';'. */
00321 #define INNERMUL                    \
00322 __asm__(                                \
00323     " LDR    r0,%1            \n\t" \
00324     " ADDS   r0,r0,%0         \n\t" \
00325     " MOVCS  %0,#1            \n\t" \
00326     " MOVCC  %0,#0            \n\t" \
00327     " UMLAL  r0,%0,%3,%4      \n\t" \
00328     " STR    r0,%1            \n\t" \
00329 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
00330 
/* Carry propagation (ARM): r0 = _c[0] + cy is stored back to _c[0], then cy
 * is set to 1 if that addition carried out and to 0 otherwise.
 * Clobbers r0 and the flags.  The expansion already ends with ';'. */
00331 #define PROPCARRY                  \
00332 __asm__(                               \
00333     " LDR   r0,%1            \n\t" \
00334     " ADDS  r0,r0,%0         \n\t" \
00335     " STR   r0,%1            \n\t" \
00336     " MOVCS %0,#1            \n\t" \
00337     " MOVCC %0,#0            \n\t" \
00338 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
00339 
00340 #elif defined(TFM_PPC32)
00341 
00342 /* PPC32 */
/* PPC32 hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00343 #define MONT_START 
00344 #define MONT_FINI
00345 #define LOOP_END
00346 #define LOOP_START \
00347    mu = c[x] * mp
00348 
/* One multiply-accumulate step (PPC32).
 * Operands: %0/%2 = cy, %1/%5 = _c[0] (memory), %3 = mu, %4 = tmpm[0].
 *   r17:r16 = mu * tmpm[0]            (mullw low half, mulhwu high half)
 *   r17:r16 += cy
 *   r17:r16 += _c[0]                  (loaded into r18)
 *   _c[0] = r16, cy = r17
 * Clobbers r16, r17, r18 and the condition/carry state.
 * The expansion is two statements, the asm followed by "++tmpm;", and it
 * already ends with ';'. */
00349 #define INNERMUL                     \
00350 __asm__(                                 \
00351    " mullw    16,%3,%4       \n\t"   \
00352    " mulhwu   17,%3,%4       \n\t"   \
00353    " addc     16,16,%0       \n\t"   \
00354    " addze    17,17          \n\t"   \
00355    " lwz      18,%1          \n\t"   \
00356    " addc     16,16,18       \n\t"   \
00357    " addze    %0,17          \n\t"   \
00358    " stw      16,%1          \n\t"   \
00359 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
00360 
/* Carry propagation (PPC32): r16 = _c[0] + cy is stored back to _c[0]; cy
 * is then cleared with xor and set to the carry of the addition by addze.
 * Clobbers r16.  The expansion already ends with ';'. */
00361 #define PROPCARRY                    \
00362 __asm__(                                 \
00363    " lwz      16,%1         \n\t"    \
00364    " addc     16,16,%0      \n\t"    \
00365    " stw      16,%1         \n\t"    \
00366    " xor      %0,%0,%0      \n\t"    \
00367    " addze    %0,%0         \n\t"    \
00368 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
00369 
00370 #elif defined(TFM_PPC64)
00371 
00372 /* PPC64 */
/* PPC64 hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00373 #define MONT_START 
00374 #define MONT_FINI
00375 #define LOOP_END
00376 #define LOOP_START \
00377    mu = c[x] * mp
00378 
/* One multiply-accumulate step (PPC64); same sequence as the PPC32 form
 * but with 64-bit multiplies (mulld / mulhdu).
 * Operands: %0/%2 = cy, %1/%5 = _c[0] (memory), %3 = mu, %4 = tmpm[0].
 *   r17:r16 = mu * tmpm[0]
 *   r17:r16 += cy
 *   r17:r16 += _c[0]                  (loaded into r18)
 *   _c[0] = r16, cy = r17
 * Clobbers r16, r17, r18.  The expansion is two statements, the asm
 * followed by "++tmpm;", and it already ends with ';'.
 * NOTE(review): the load/store use "ldx 18,0,%1" / "sdx 16,0,%1" with %1
 * declared as an "m" operand; "sdx" does not look like a standard Power
 * mnemonic (stdx is the indexed doubleword store) -- confirm this section
 * assembles with the target toolchain. */
00379 #define INNERMUL                     \
00380 __asm__(                                 \
00381    " mulld    16,%3,%4       \n\t"   \
00382    " mulhdu   17,%3,%4       \n\t"   \
00383    " addc     16,16,%0       \n\t"   \
00384    " addze    17,17          \n\t"   \
00385    " ldx      18,0,%1        \n\t"   \
00386    " addc     16,16,18       \n\t"   \
00387    " addze    %0,17          \n\t"   \
00388    " sdx      16,0,%1        \n\t"   \
00389 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "18","cc"); ++tmpm;
00390 
/* Carry propagation (PPC64): r16 = _c[0] + cy is stored back to _c[0]; cy
 * is then cleared with xor and set to the carry of the addition by addze.
 * Clobbers r16.  The expansion already ends with ';'.
 * NOTE(review): uses the same "ldx"/"sdx" forms on an "m" operand as the
 * PPC64 INNERMUL -- confirm they assemble with the target toolchain. */
00391 #define PROPCARRY                    \
00392 __asm__(                                 \
00393    " ldx      16,0,%1       \n\t"    \
00394    " addc     16,16,%0      \n\t"    \
00395    " sdx      16,0,%1       \n\t"    \
00396    " xor      %0,%0,%0      \n\t"    \
00397    " addze    %0,%0         \n\t"    \
00398 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","cc");
00399 
00400 /******************************************************************/
00401 
00402 #elif defined(TFM_AVR32)
00403 
00404 /* AVR32 */
/* AVR32 hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00405 #define MONT_START 
00406 #define MONT_FINI
00407 #define LOOP_END
00408 #define LOOP_START \
00409    mu = c[x] * mp
00410 
/* One multiply-accumulate step (AVR32).
 * Operands: %0/%2 = cy, %1/%5 = _c (pointer held in a register), %3 = mu,
 * %4 = *tmpm++ (post-increment happens in the operand expression).
 *   r2 = word loaded through %1, plus cy
 *   r3 = 0, then the carry of that add is added to r3 (acr)
 *   macu.d accumulates mu * (*tmpm) into the r3:r2 pair
 *   r2 is stored back through %1 and cy = r3
 * Clobbers r2 and r3.  "cc" is declared because add/acr change the status
 * flags (the AVR32 PROPCARRY in this file declares it as well), and
 * "memory" because the word behind the _c pointer is read and written
 * while only the pointer itself is an operand.
 * The expansion already ends with ';'. */
00411 #define INNERMUL                    \
00412 __asm__(                                \
00413     " ld.w   r2,%1            \n\t" \
00414     " add    r2,%0            \n\t" \
00415     " eor    r3,r3            \n\t" \
00416     " acr    r3               \n\t" \
00417     " macu.d r2,%3,%4         \n\t" \
00418     " st.w   %1,r2            \n\t" \
00419     " mov    %0,r3            \n\t" \
00420 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3","cc","memory");
00421 
/* Carry propagation (AVR32): r2 = word loaded through %1, plus cy, stored
 * back through %1; cy is then cleared with eor and receives the carry of
 * the addition via acr.  Clobbers r2 and the flags.
 * The expansion already ends with ';'.
 * NOTE(review): output operand 1 is the address expression &_c[0], which
 * is not an lvalue -- confirm this compiles with the target toolchain. */
00422 #define PROPCARRY                    \
00423 __asm__(                                 \
00424    " ld.w     r2,%1         \n\t"    \
00425    " add      r2,%0         \n\t"    \
00426    " st.w     %1,r2         \n\t"    \
00427    " eor      %0,%0         \n\t"    \
00428    " acr      %0            \n\t"    \
00429 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
00430 
00431 #else
00432 
00433 /* ISO C code */
/* Portable C hooks: MONT_START, MONT_FINI and LOOP_END expand to nothing.
 * LOOP_START assigns mu = c[x] * mp using the c, x, mp and mu variables of
 * the function the macro is expanded in. */
00434 #define MONT_START 
00435 #define MONT_FINI
00436 #define LOOP_END
00437 #define LOOP_START \
00438    mu = c[x] * mp
00439 
/* One multiply-accumulate step in portable C.
 * Computes t = _c[0] + cy + mu * (*tmpm) with every term widened to
 * fp_word, post-increments tmpm, stores the low digit of t in _c[0] and
 * keeps t >> DIGIT_BIT as the new cy.  The temporary t is local to the
 * do/while block; the expansion needs a trailing ';' at the call site. */
00440 #define INNERMUL                                      \
00441    do { fp_word t;                                    \
00442    t  = ((fp_word)_c[0] + (fp_word)cy) +              \
00443                 (((fp_word)mu) * ((fp_word)*tmpm++)); \
00444    _c[0] = (fp_digit)t;                               \
00445    cy = (fp_digit)(t >> DIGIT_BIT);                   \
00446    } while (0)
00447 
/* Carry propagation in portable C: adds cy into _c[0], then sets cy to 1
 * when the stored sum is smaller than the cy that was added (the addition
 * wrapped) and to 0 otherwise. */
00448 #define PROPCARRY \
00449    do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
00450 
00451 #endif
00452 /******************************************************************/
00453 
00454 
/* Index used as _c[LO] by the x86 and SSE2 INNERMUL/PROPCARRY macros.  It
 * is defined after them, which works because macros are expanded at the
 * point of use. */
00455 #define LO  0
00456 /* end fp_montgomery_reduce.c asm */
00457 
00458 
00459 /* start fp_sqr_comba.c asm */
00460 #if defined(TFM_X86)
00461 
00462 /* x86-32 optimized */
00463 
/* Comba squaring helpers (x86-32), operating on the c0/c1/c2 accumulator
 * variables of the function they are expanded in:
 *   COMBA_START, COMBA_FINI  expand to nothing
 *   CLEAR_CARRY              c0 = c1 = c2 = 0
 *   COMBA_STORE(x)           x = c0
 *   COMBA_STORE2(x)          x = c1
 *   CARRY_FORWARD            c0 = c1, c1 = c2, c2 = 0
 * The non-empty bodies already end with ';', so a call site that adds its
 * own ';' produces an extra empty statement. */
00464 #define COMBA_START
00465 
00466 #define CLEAR_CARRY \
00467    c0 = c1 = c2 = 0;
00468 
00469 #define COMBA_STORE(x) \
00470    x = c0;
00471 
00472 #define COMBA_STORE2(x) \
00473    x = c1;
00474 
00475 #define CARRY_FORWARD \
00476    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00477 
00478 #define COMBA_FINI
00479 
00479 
/* Adds i * i to the three-word accumulator c2:c1:c0 (x86-32).
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i (memory).
 * The parameter j is not used by the template.
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00480 #define SQRADD(i, j)                                      \
00481 __asm__(                                            \
00482      "movl  %6,%%eax     \n\t"                            \
00483      "mull  %%eax        \n\t"                            \
00484      "addl  %%eax,%0     \n\t"                            \
00485      "adcl  %%edx,%1     \n\t"                            \
00486      "adcl  $0,%2        \n\t"                            \
00487      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
00488 
/* Adds 2 * i * j to the accumulator c2:c1:c0 (x86-32) by computing the
 * product once in edx:eax and adding it in twice.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i, %7 = j (memory).
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00489 #define SQRADD2(i, j)                                     \
00490 __asm__(                                            \
00491      "movl  %6,%%eax     \n\t"                            \
00492      "mull  %7           \n\t"                            \
00493      "addl  %%eax,%0     \n\t"                            \
00494      "adcl  %%edx,%1     \n\t"                            \
00495      "adcl  $0,%2        \n\t"                            \
00496      "addl  %%eax,%0     \n\t"                            \
00497      "adcl  %%edx,%1     \n\t"                            \
00498      "adcl  $0,%2        \n\t"                            \
00499      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx", "cc");
00500 
/* Initialises the secondary accumulator sc2:sc1:sc0 with i * j (x86-32):
 * sc0 = low word, sc1 = high word, sc2 = 0.
 * Operands: %0..%2 = sc0, sc1, sc2 (outputs only), %3 = i, %4 = j.
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00501 #define SQRADDSC(i, j)                                    \
00502 __asm__(                                                     \
00503      "movl  %3,%%eax     \n\t"                            \
00504      "mull  %4           \n\t"                            \
00505      "movl  %%eax,%0     \n\t"                            \
00506      "movl  %%edx,%1     \n\t"                            \
00507      "xorl  %2,%2        \n\t"                            \
00508      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
00509 
00510 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00511 
/* Adds i * j to the secondary accumulator sc2:sc1:sc0 (x86-32).
 * Operands: %0..%2 = sc0, sc1, sc2 (tied to %3..%5), %6 = i, %7 = j.
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00512 #define SQRADDAC(i, j)                                    \
00513 __asm__(                                                     \
00514      "movl  %6,%%eax     \n\t"                            \
00515      "mull  %7           \n\t"                            \
00516      "addl  %%eax,%0     \n\t"                            \
00517      "adcl  %%edx,%1     \n\t"                            \
00518      "adcl  $0,%2        \n\t"                            \
00519      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
00520 
/* Adds the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice (x86-32),
 * i.e. c += 2 * sc, with carries propagated through all three words.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6..%8 = sc0, sc1, sc2.
 * Clobbers the flags.  The expansion already ends with ';'. */
00521 #define SQRADDDB                                          \
00522 __asm__(                                                     \
00523      "addl %6,%0         \n\t"                            \
00524      "adcl %7,%1         \n\t"                            \
00525      "adcl %8,%2         \n\t"                            \
00526      "addl %6,%0         \n\t"                            \
00527      "adcl %7,%1         \n\t"                            \
00528      "adcl %8,%2         \n\t"                            \
00529      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00530 
00531 #elif defined(TFM_X86_64)
00532 /* x86-64 optimized */
00533 
/* Comba squaring helpers (x86-64), operating on the c0/c1/c2 accumulator
 * variables of the function they are expanded in:
 *   COMBA_START, COMBA_FINI  expand to nothing
 *   CLEAR_CARRY              c0 = c1 = c2 = 0
 *   COMBA_STORE(x)           x = c0
 *   COMBA_STORE2(x)          x = c1
 *   CARRY_FORWARD            c0 = c1, c1 = c2, c2 = 0
 * The non-empty bodies already end with ';', so a call site that adds its
 * own ';' produces an extra empty statement. */
00534 #define COMBA_START
00535 
00536 #define CLEAR_CARRY \
00537    c0 = c1 = c2 = 0;
00538 
00539 #define COMBA_STORE(x) \
00540    x = c0;
00541 
00542 #define COMBA_STORE2(x) \
00543    x = c1;
00544 
00545 #define CARRY_FORWARD \
00546    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00547 
00548 #define COMBA_FINI
00549 
/* Adds i * i to the three-word accumulator c2:c1:c0 (x86-64).
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i.
 * The parameter j is not used by the template.
 * Clobbers rax, rdx and the flags.  The expansion already ends with ';'. */
00550 #define SQRADD(i, j)                                      \
00551 __asm__(                                                     \
00552      "movq  %6,%%rax     \n\t"                            \
00553      "mulq  %%rax        \n\t"                            \
00554      "addq  %%rax,%0     \n\t"                            \
00555      "adcq  %%rdx,%1     \n\t"                            \
00556      "adcq  $0,%2        \n\t"                            \
00557      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
00558 
/* Adds 2 * i * j to the accumulator c2:c1:c0 (x86-64) by computing the
 * product once in rdx:rax and adding it in twice.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i, %7 = j.
 * Clobbers rax, rdx and the flags.  The expansion already ends with ';'. */
00559 #define SQRADD2(i, j)                                     \
00560 __asm__(                                                     \
00561      "movq  %6,%%rax     \n\t"                            \
00562      "mulq  %7           \n\t"                            \
00563      "addq  %%rax,%0     \n\t"                            \
00564      "adcq  %%rdx,%1     \n\t"                            \
00565      "adcq  $0,%2        \n\t"                            \
00566      "addq  %%rax,%0     \n\t"                            \
00567      "adcq  %%rdx,%1     \n\t"                            \
00568      "adcq  $0,%2        \n\t"                            \
00569      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
00570 
/* Initialises the secondary accumulator sc2:sc1:sc0 with i * j (x86-64):
 * sc0 = low word, sc1 = high word, sc2 = 0.
 * Operands: %0..%2 = sc0, sc1, sc2 (outputs only), %3 = i, %4 = j.
 * Clobbers rax, rdx and the flags.  The expansion already ends with ';'. */
00571 #define SQRADDSC(i, j)                                    \
00572 __asm__(                                                     \
00573      "movq  %3,%%rax     \n\t"                            \
00574      "mulq  %4           \n\t"                            \
00575      "movq  %%rax,%0     \n\t"                            \
00576      "movq  %%rdx,%1     \n\t"                            \
00577      "xorq  %2,%2        \n\t"                            \
00578      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
00579 
00580 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00581 
/* Adds i * j to the secondary accumulator sc2:sc1:sc0 (x86-64).
 * Operands: %0..%2 = sc0, sc1, sc2 (tied to %3..%5), %6 = i, %7 = j.
 * Clobbers rax, rdx and the flags.  The expansion already ends with ';'. */
00582 #define SQRADDAC(i, j)                                                         \
00583 __asm__(                                                     \
00584      "movq  %6,%%rax     \n\t"                            \
00585      "mulq  %7           \n\t"                            \
00586      "addq  %%rax,%0     \n\t"                            \
00587      "adcq  %%rdx,%1     \n\t"                            \
00588      "adcq  $0,%2        \n\t"                            \
00589      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
00590 
/* Adds the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice (x86-64),
 * i.e. c += 2 * sc, with carries propagated through all three words.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6..%8 = sc0, sc1, sc2.
 * Clobbers the flags.  The expansion already ends with ';'. */
00591 #define SQRADDDB                                          \
00592 __asm__(                                                     \
00593      "addq %6,%0         \n\t"                            \
00594      "adcq %7,%1         \n\t"                            \
00595      "adcq %8,%2         \n\t"                            \
00596      "addq %6,%0         \n\t"                            \
00597      "adcq %7,%1         \n\t"                            \
00598      "adcq %8,%2         \n\t"                            \
00599      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00600 
00601 #elif defined(TFM_SSE2)
00602 
00603 /* SSE2 Optimized */
/* Comba squaring helpers (SSE2 build), operating on the c0/c1/c2
 * accumulator variables of the function they are expanded in:
 *   COMBA_START              expands to nothing
 *   CLEAR_CARRY              c0 = c1 = c2 = 0
 *   COMBA_STORE(x)           x = c0
 *   COMBA_STORE2(x)          x = c1
 *   CARRY_FORWARD            c0 = c1, c1 = c2, c2 = 0
 *   COMBA_FINI               issues emms after the MMX registers were used
 * The non-empty bodies already end with ';', so a call site that adds its
 * own ';' produces an extra empty statement. */
00604 #define COMBA_START
00605 
00606 #define CLEAR_CARRY \
00607    c0 = c1 = c2 = 0;
00608 
00609 #define COMBA_STORE(x) \
00610    x = c0;
00611 
00612 #define COMBA_STORE2(x) \
00613    x = c1;
00614 
00615 #define CARRY_FORWARD \
00616    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00617 
00618 #define COMBA_FINI \
00619    __asm__("emms");
00620 
/* Adds i * i to the accumulator c2:c1:c0 (SSE2 build).  The 64-bit square
 * is formed in mm0 with pmuludq; its low and high halves are moved through
 * eax and added to c0 and c1, with the final carry going into c2.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i (memory).
 * The parameter j is not used by the template.
 * Clobbers eax and the flags.  The expansion already ends with ';'. */
00621 #define SQRADD(i, j)                                      \
00622 __asm__(                                            \
00623      "movd  %6,%%mm0     \n\t"                            \
00624      "pmuludq %%mm0,%%mm0\n\t"                            \
00625      "movd  %%mm0,%%eax  \n\t"                            \
00626      "psrlq $32,%%mm0    \n\t"                            \
00627      "addl  %%eax,%0     \n\t"                            \
00628      "movd  %%mm0,%%eax  \n\t"                            \
00629      "adcl  %%eax,%1     \n\t"                            \
00630      "adcl  $0,%2        \n\t"                            \
00631      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");
00632 
/* Adds 2 * i * j to the accumulator c2:c1:c0 (SSE2 build).  The product is
 * formed once in mm0 with pmuludq, split into eax (low) and edx (high),
 * and added in twice.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i, %7 = j (memory).
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00633 #define SQRADD2(i, j)                                     \
00634 __asm__(                                            \
00635      "movd  %6,%%mm0     \n\t"                            \
00636      "movd  %7,%%mm1     \n\t"                            \
00637      "pmuludq %%mm1,%%mm0\n\t"                            \
00638      "movd  %%mm0,%%eax  \n\t"                            \
00639      "psrlq $32,%%mm0    \n\t"                            \
00640      "movd  %%mm0,%%edx  \n\t"                            \
00641      "addl  %%eax,%0     \n\t"                            \
00642      "adcl  %%edx,%1     \n\t"                            \
00643      "adcl  $0,%2        \n\t"                            \
00644      "addl  %%eax,%0     \n\t"                            \
00645      "adcl  %%edx,%1     \n\t"                            \
00646      "adcl  $0,%2        \n\t"                            \
00647      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
00648 
/* Initialises the secondary accumulator sc2:sc1:sc0 with i * j (SSE2
 * build): the product is formed in mm0, sc0 receives its low 32 bits, sc1
 * its high 32 bits, and sc2 is zeroed with xorl.
 * Operands: %0..%2 = sc0, sc1, sc2 (outputs only), %3 = i, %4 = j (memory).
 * No clobber list is given.  The expansion already ends with ';'. */
00649 #define SQRADDSC(i, j)                                                         \
00650 __asm__(                                            \
00651      "movd  %3,%%mm0     \n\t"                            \
00652      "movd  %4,%%mm1     \n\t"                            \
00653      "pmuludq %%mm1,%%mm0\n\t"                            \
00654      "movd  %%mm0,%0     \n\t"                            \
00655      "psrlq $32,%%mm0    \n\t"                            \
00656      "movd  %%mm0,%1     \n\t"                            \
00657      "xorl  %2,%2        \n\t"                            \
00658      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));
00659 
00660 /* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */
00661 
/* Adds i * j to the secondary accumulator sc2:sc1:sc0 (SSE2 build).  The
 * product is formed in mm0 with pmuludq and split into eax (low) and edx
 * (high) before the add-with-carry chain.
 * Operands: %0..%2 = sc0, sc1, sc2 (tied to %3..%5), %6 = i, %7 = j.
 * Clobbers eax, edx and the flags.  The expansion already ends with ';'. */
00662 #define SQRADDAC(i, j)                                                         \
00663 __asm__(                                            \
00664      "movd  %6,%%mm0     \n\t"                            \
00665      "movd  %7,%%mm1     \n\t"                            \
00666      "pmuludq %%mm1,%%mm0\n\t"                            \
00667      "movd  %%mm0,%%eax  \n\t"                            \
00668      "psrlq $32,%%mm0    \n\t"                            \
00669      "movd  %%mm0,%%edx  \n\t"                            \
00670      "addl  %%eax,%0     \n\t"                            \
00671      "adcl  %%edx,%1     \n\t"                            \
00672      "adcl  $0,%2        \n\t"                            \
00673      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");
00674 
/* Adds the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice (SSE2
 * build, plain 32-bit integer instructions), i.e. c += 2 * sc.
 * Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6..%8 = sc0, sc1, sc2.
 * Clobbers the flags.  The expansion already ends with ';'. */
00675 #define SQRADDDB                                          \
00676 __asm__(                                                     \
00677      "addl %6,%0         \n\t"                            \
00678      "adcl %7,%1         \n\t"                            \
00679      "adcl %8,%2         \n\t"                            \
00680      "addl %6,%0         \n\t"                            \
00681      "adcl %7,%1         \n\t"                            \
00682      "adcl %8,%2         \n\t"                            \
00683      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00684 
00685 #elif defined(TFM_ARM)
00686 
00687 /* ARM code */
00688 
/* Comba squaring helpers (ARM), operating on the c0/c1/c2 accumulator
 * variables of the function they are expanded in:
 *   COMBA_START, COMBA_FINI  expand to nothing
 *   CLEAR_CARRY              c0 = c1 = c2 = 0
 *   COMBA_STORE(x)           x = c0
 *   COMBA_STORE2(x)          x = c1
 *   CARRY_FORWARD            c0 = c1, c1 = c2, c2 = 0
 * The non-empty bodies already end with ';', so a call site that adds its
 * own ';' produces an extra empty statement. */
00689 #define COMBA_START
00690 
00691 #define CLEAR_CARRY \
00692    c0 = c1 = c2 = 0;
00693 
00694 #define COMBA_STORE(x) \
00695    x = c0;
00696 
00697 #define COMBA_STORE2(x) \
00698    x = c1;
00699 
00700 #define CARRY_FORWARD \
00701    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00702 
00703 #define COMBA_FINI
00704 
00705 /* squares i (r1:r0 = i * i) and adds the result into c0 and c1, with the
       final carry added to c2; the parameter j is not used by the template.
       Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i.
       Clobbers r0, r1 and the flags; the expansion already ends with ';'. */
00706 #define SQRADD(i, j)                                             \
00707 __asm__(                                                             \
00708 "  UMULL  r0,r1,%6,%6              \n\t"                         \
00709 "  ADDS   %0,%0,r0                 \n\t"                         \
00710 "  ADCS   %1,%1,r1                 \n\t"                         \
00711 "  ADC    %2,%2,#0                 \n\t"                         \
00712 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");
00713     
00714 /* for squaring some of the terms are doubled: the product i * j is formed
       once in r1:r0 and added into c2:c1:c0 twice.
       Operands: %0..%2 = c0, c1, c2 (tied to %3..%5), %6 = i, %7 = j.
       Clobbers r0, r1 and the flags; the expansion already ends with ';'. */
00715 #define SQRADD2(i, j)                                            \
00716 __asm__(                                                             \
00717 "  UMULL  r0,r1,%6,%7              \n\t"                         \
00718 "  ADDS   %0,%0,r0                 \n\t"                         \
00719 "  ADCS   %1,%1,r1                 \n\t"                         \
00720 "  ADC    %2,%2,#0                 \n\t"                         \
00721 "  ADDS   %0,%0,r0                 \n\t"                         \
00722 "  ADCS   %1,%1,r1                 \n\t"                         \
00723 "  ADC    %2,%2,#0                 \n\t"                         \
00724 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
00725 
/* Initialises the secondary accumulator sc2:sc1:sc0 with i * j (ARM):
 * UMULL writes the low word to sc0 and the high word to sc1, and
 * SUB sc2,sc2,sc2 zeroes sc2.
 * Operands: %0..%2 = sc0, sc1, sc2, %6 = i, %7 = j.  sc0..sc2 are also
 * listed as tied inputs (%3..%5) although the template overwrites all three.
 * The expansion already ends with ';'. */
00726 #define SQRADDSC(i, j)                                           \
00727 __asm__(                                                             \
00728 "  UMULL  %0,%1,%6,%7              \n\t"                         \
00729 "  SUB    %2,%2,%2                 \n\t"                         \
00730 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "cc");
00731 
/* Adds i * j to the secondary accumulator sc2:sc1:sc0 (ARM).
 * Operands: %0..%2 = sc0, sc1, sc2 (tied to %3..%5), %6 = i, %7 = j.
 * Clobbers r0, r1 and the flags.  The expansion already ends with ';'. */
00732 #define SQRADDAC(i, j)                                           \
00733 __asm__(                                                             \
00734 "  UMULL  r0,r1,%6,%7              \n\t"                         \
00735 "  ADDS   %0,%0,r0                 \n\t"                         \
00736 "  ADCS   %1,%1,r1                 \n\t"                         \
00737 "  ADC    %2,%2,#0                 \n\t"                         \
00738 :"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");
00739 
/* Adds the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice (ARM),
 * i.e. c += 2 * sc.
 * Operand order differs from the x86 variants in this file: here
 * %3..%5 = sc0, sc1, sc2 and the tied c0..c2 inputs come last (%6..%8).
 * Clobbers the flags.  The expansion already ends with ';'. */
00740 #define SQRADDDB                                                 \
00741 __asm__(                                                             \
00742 "  ADDS  %0,%0,%3                     \n\t"                      \
00743 "  ADCS  %1,%1,%4                     \n\t"                      \
00744 "  ADC   %2,%2,%5                     \n\t"                      \
00745 "  ADDS  %0,%0,%3                     \n\t"                      \
00746 "  ADCS  %1,%1,%4                     \n\t"                      \
00747 "  ADC   %2,%2,%5                     \n\t"                      \
00748 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
00749 
00750 #elif defined(TFM_PPC32)
00751 
00752 /* PPC32 */
00753 
/* Accumulator helpers for the PPC32 squaring combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* no set-up needed on this target */
#define COMBA_START

/* zero all three accumulator words */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* write the lowest accumulator word to x */
#define COMBA_STORE(x) \
   x = c0;

/* write the middle accumulator word to x */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* no tear-down needed on this target */
#define COMBA_FINI
00769 
/* PPC32: add the square of i to the accumulator c2:c1:c0.
 * Only operand i is referenced by the asm; j is accepted for interface
 * compatibility and is not used.
 * mullw/mulhwu produce the low/high product word in scratch register 16;
 * addc/adde/addze ripple it, with carry, into c0, c1 and c2.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD(i, j)             \
__asm__(                         \
   " mullw  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
00779 
/* PPC32: add the product i*j to the accumulator c2:c1:c0 twice
 * (cross terms of a square occur two times).
 * The product is formed once in scratch registers 16 (low) and 17 (high);
 * the addc/adde/addze carry chain is then executed twice.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " mulhwu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
00792 
/* PPC32: start a fresh secondary accumulator from a single product.
 * mullw/mulhwu write the low/high word of i*j straight into sc0/sc1;
 * xor of sc2 with itself zeroes sc2.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
00799 
/* PPC32: add the product i*j to the secondary accumulator sc2:sc1:sc0.
 * Scratch register 16 holds first the low, then the high product word;
 * addc/adde/addze carry the sum through sc0, sc1 and sc2.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDAC(i, j)           \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
00808 
/* PPC32: fold the secondary accumulator into the main one, doubled.
 * The addc/adde/adde chain runs twice, adding sc2:sc1:sc0 to c2:c1:c0
 * two times.
 * Variables c0..c2 and sc0..sc2 must be visible where the macro expands. */
#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
00818 
00819 #elif defined(TFM_PPC64)
00820 /* PPC64 */
00821 
/* Accumulator helpers for the PPC64 squaring combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* no set-up needed on this target */
#define COMBA_START

/* zero all three accumulator words */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* write the lowest accumulator word to x */
#define COMBA_STORE(x) \
   x = c0;

/* write the middle accumulator word to x */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* no tear-down needed on this target */
#define COMBA_FINI
00837 
/* PPC64: add the square of i to the accumulator c2:c1:c0.
 * Only operand i is referenced by the asm; j is accepted for interface
 * compatibility and is not used.
 * mulld/mulhdu produce the low/high product doubleword in scratch
 * register 16; addc/adde/addze ripple it, with carry, into c0, c1, c2.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD(i, j)             \
__asm__(                         \
   " mulld  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");
00847 
/* PPC64: add the product i*j to the accumulator c2:c1:c0 twice
 * (cross terms of a square occur two times).
 * The product is formed once in scratch registers 16 (low) and 17 (high);
 * the addc/adde/addze carry chain is then executed twice.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mulld  16,%6,%7       \n\t" \
   " mulhdu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");
00860 
/* PPC64: start a fresh secondary accumulator from a single product.
 * mulld/mulhdu write the low/high doubleword of i*j straight into
 * sc0/sc1; xor of sc2 with itself zeroes sc2.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");
00867 
/* PPC64: add the product i*j to the secondary accumulator sc2:sc1:sc0.
 * Scratch register 16 holds first the low, then the high product
 * doubleword; addc/adde/addze carry the sum through sc0, sc1 and sc2.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDAC(i, j)           \
__asm__(                         \
   " mulld  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");
00876 
/* PPC64: fold the secondary accumulator into the main one, doubled.
 * The addc/adde/adde chain runs twice, adding sc2:sc1:sc0 to c2:c1:c0
 * two times.
 * Variables c0..c2 and sc0..sc2 must be visible where the macro expands. */
#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
00886 
00887 
00888 #elif defined(TFM_AVR32)
00889 
00890 /* AVR32 */
00891 
/* Accumulator helpers for the AVR32 squaring combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* no set-up needed on this target */
#define COMBA_START

/* zero all three accumulator words */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* write the lowest accumulator word to x */
#define COMBA_STORE(x) \
   x = c0;

/* write the middle accumulator word to x */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* no tear-down needed on this target */
#define COMBA_FINI
00907 
/* AVR32: add the square of i to the accumulator c2:c1:c0.
 * Only operand i is referenced by the asm; j is accepted for interface
 * compatibility and is not used.
 * mulu.d leaves the product in scratch registers r2/r3; add/adc/acr
 * ripple it, with carry, into c0, c1 and c2.
 * "cc" is listed as clobbered because add/adc alter the flags, matching
 * the clobber list SQRADDDB already uses on this target.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD(i, j)             \
__asm__(                         \
   " mulu.d r2,%6,%6       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3","cc");
00916 
/* AVR32: add the product i*j to the accumulator c2:c1:c0 twice
 * (cross terms of a square occur two times).
 * mulu.d forms the product once in scratch registers r2/r3; the
 * add/adc/acr carry chain is then executed twice.
 * The carry step is "acr %2" with a single operand, as in SQRADD; a
 * trailing comma after the operand is not valid assembler syntax.
 * "cc" is listed as clobbered because add/adc alter the flags.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3", "cc");
00928 
/* AVR32: start a fresh secondary accumulator from a single product.
 * mulu.d leaves i*j in scratch registers r2/r3, which are copied to
 * sc0/sc1; eor of sc2 with itself zeroes sc2.
 * "cc" is listed as clobbered, matching SQRADDDB on this target.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulu.d r2,%6,%7        \n\t" \
   " mov    %0,r2           \n\t" \
   " mov    %1,r3           \n\t" \
   " eor    %2,%2           \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3", "cc");
00936 
/* AVR32: add the product i*j to the secondary accumulator sc2:sc1:sc0.
 * mulu.d leaves the product in scratch registers r2/r3; add/adc/acr
 * carry the sum through sc0, sc1 and sc2.
 * "cc" is listed as clobbered because add/adc alter the flags.
 * Variables sc0, sc1 and sc2 must be visible where the macro expands. */
#define SQRADDAC(i, j)           \
__asm__(                         \
   " mulu.d r2,%6,%7       \n\t" \
   " add    %0,%0,r2       \n\t" \
   " adc    %1,%1,r3       \n\t" \
   " acr    %2             \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3", "cc");
00944 
/* AVR32: fold the secondary accumulator into the main one, doubled.
 * The add/adc/adc chain runs twice, adding sc2:sc1:sc0 to c2:c1:c0
 * two times.
 * Variables c0..c2 and sc0..sc2 must be visible where the macro expands. */
#define SQRADDDB                  \
__asm__(                          \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
   " add    %0,%0,%3        \n\t" \
   " adc    %1,%1,%4        \n\t" \
   " adc    %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
00954 
00955 
00956 #else
00957 
/* marks that the portable fallback was selected */
#define TFM_ISO

/* ISO C portable code */

/* Accumulator helpers for the portable squaring combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* no set-up needed */
#define COMBA_START

/* zero all three accumulator words */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* write the lowest accumulator word to x */
#define COMBA_STORE(x) \
   x = c0;

/* write the middle accumulator word to x */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0 */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* no tear-down needed */
#define COMBA_FINI
00977 
/* Portable C: add the product i*j to the accumulator c2:c1:c0.
 * The sum is built in a double-width temporary; its low digit goes to
 * c0, the overflow is added to c1, and c1's overflow is added to c2.
 * The arguments are parenthesized before widening so that an expression
 * argument is evaluated in its own type first.
 * Needs fp_digit, fp_word and DIGIT_BIT, plus variables c0, c1 and c2,
 * where the macro expands.  The expansion already ends in ';'. */
#define SQRADD(i, j)                                                     \
   do { fp_word t;                                                       \
   t = (fp_word)c0 + ((fp_word)(i)) * ((fp_word)(j)); c0 = (fp_digit)t;  \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                c1 = (fp_digit)t;  \
   c2 += (fp_digit)(t >> DIGIT_BIT);                                     \
   } while (0);
00985   
00986 
/* Portable C: add the product i*j to the accumulator c2:c1:c0 twice
 * (cross terms of a square occur two times).
 * The product is computed once into t and then accumulated in two
 * identical carry-propagating passes.
 * NOTE: the passes use a double-width variable `tt` that this macro does
 * not declare; it must be provided by the scope where the macro expands,
 * together with fp_digit, fp_word, DIGIT_BIT and c0, c1, c2.
 * The arguments are parenthesized before widening so that an expression
 * argument is evaluated in its own type first. */
#define SQRADD2(i, j)                                                 \
   do { fp_word t;                                                    \
   t  = ((fp_word)(i)) * ((fp_word)(j));                              \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                                 \
   tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
   tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
   c2 += (fp_digit)(tt >> DIGIT_BIT);                                 \
   } while (0);
00998 
/* Portable C: start a fresh secondary accumulator from a single product.
 * sc0 receives the low digit of i*j, sc1 the high digit, sc2 is zeroed.
 * The arguments are parenthesized before widening, and the narrowing
 * stores are written as explicit casts.
 * Needs fp_digit, fp_word and DIGIT_BIT, plus variables sc0, sc1 and sc2,
 * where the macro expands.  The expansion already ends in ';'. */
#define SQRADDSC(i, j)                                                         \
   do { fp_word t;                                                             \
      t   = ((fp_word)(i)) * ((fp_word)(j));                                   \
      sc0 = (fp_digit)t; sc1 = (fp_digit)(t >> DIGIT_BIT); sc2 = 0;            \
   } while (0);
01004 
/* Portable C: add the product i*j to the secondary accumulator
 * sc2:sc1:sc0, propagating carries through a double-width temporary.
 * The arguments are parenthesized before widening, and the narrowing
 * stores are written as explicit casts.
 * Needs fp_digit, fp_word and DIGIT_BIT, plus variables sc0, sc1 and sc2,
 * where the macro expands.  The expansion already ends in ';'. */
#define SQRADDAC(i, j)                                                         \
   do { fp_word t;                                                             \
   t = (fp_word)sc0 + ((fp_word)(i)) * ((fp_word)(j)); sc0 = (fp_digit)t;      \
   t = (fp_word)sc1 + (t >> DIGIT_BIT);                sc1 = (fp_digit)t;      \
   sc2 += (fp_digit)(t >> DIGIT_BIT);                                          \
   } while (0);
01010 
/* Portable C: fold the secondary accumulator into the main one, doubled:
 * c2:c1:c0 += 2 * sc2:sc1:sc0, digit by digit with carry.
 * The narrowing stores are written as explicit casts.
 * Needs fp_digit, fp_word and DIGIT_BIT, plus variables c0..c2 and
 * sc0..sc2, where the macro expands.  The expansion already ends in ';'. */
#define SQRADDDB                                                               \
   do { fp_word t;                                                             \
   t = ((fp_word)sc0) + ((fp_word)sc0) + c0;                                   \
   c0 = (fp_digit)t;                                                           \
   t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT);                \
   c1 = (fp_digit)t;                                                           \
   c2 = (fp_digit)(c2 + ((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT));   \
   } while (0);
01017 
01018 #endif
01019 
01020 #ifdef TFM_SMALL_SET
01021 #include "fp_sqr_comba_small_set.i"
01022 #include "fp_sqr_comba_3.i"
01023 #include "fp_sqr_comba_4.i"
01024 #include "fp_sqr_comba_6.i"
01025 #include "fp_sqr_comba_7.i"
01026 #include "fp_sqr_comba_8.i"
01027 #include "fp_sqr_comba_9.i"
01028 #include "fp_sqr_comba_12.i"
01029 #include "fp_sqr_comba_17.i"
01030 #include "fp_sqr_comba_20.i"
01031 #include "fp_sqr_comba_24.i"
01032 #include "fp_sqr_comba_28.i"
01033 #include "fp_sqr_comba_32.i"
01034 #include "fp_sqr_comba_48.i"
01035 #include "fp_sqr_comba_64.i"
01036 #endif
01037 /* end fp_sqr_comba.c asm */
01038 
01039 /* start fp_mul_comba.c asm */
01040 /* these are the combas.  Worship them. */
01041 #if defined(TFM_X86)
01042 /* Generic x86 optimized code */
01043 
/* Accumulator helpers for the x86-32 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01065 
/* x86-32: add the product i*j to the accumulator c2:c1:c0.
 * i is loaded into eax and multiplied by j with mull, leaving the product
 * in edx:eax; addl/adcl/adcl then ripple it, with carry, into c0, c1, c2.
 * i and j are passed as memory operands; eax, edx and the flags are
 * declared clobbered.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)                                      \
__asm__(                                                  \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");
01075 
01076 #elif defined(TFM_X86_64)
01077 /* x86-64 optimized */
01078 
/* Accumulator helpers for the x86-64 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01100 
/* x86-64: add the product i*j to the accumulator c2:c1:c0.
 * i is loaded into rax and multiplied by j with mulq, leaving the product
 * in rdx:rax; addq/adcq/adcq then ripple it, with carry, into c0, c1, c2.
 * i and j use the general "g" constraint; rax, rdx and the flags are
 * declared clobbered.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)                                      \
__asm__  (                                                \
     "movq  %6,%%rax     \n\t"                            \
     "mulq  %7           \n\t"                            \
     "addq  %%rax,%0     \n\t"                            \
     "adcq  %%rdx,%1     \n\t"                            \
     "adcq  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
01110 
01111 #elif defined(TFM_SSE2)
01112 /* use SSE2 optimizations */
01113 
/* Accumulator helpers for the SSE2 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook: issues emms, since MULADD on this target works in the
 * mm0/mm1 registers */
#define COMBA_FINI \
   __asm__("emms");
01136 
/* SSE2: add the product i*j to the accumulator c2:c1:c0.
 * i and j are loaded into mm0/mm1 and multiplied with pmuludq; the low
 * half of the result is moved to eax and added to c0, then the result is
 * shifted right by 32 and the high half is added with carry to c1; the
 * final carry goes into c2.
 * i and j are passed as memory operands; eax and the flags are declared
 * clobbered (mm0/mm1 are not listed in the clobbers).
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)                                     \
__asm__(                                                 \
    "movd  %6,%%mm0     \n\t"                            \
    "movd  %7,%%mm1     \n\t"                            \
    "pmuludq %%mm1,%%mm0\n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "psrlq $32,%%mm0    \n\t"                            \
    "addl  %%eax,%0     \n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "adcl  %%eax,%1     \n\t"                            \
    "adcl  $0,%2        \n\t"                            \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");
01150 
01151 #elif defined(TFM_ARM)
01152 /* ARM code */
01153 
/* Accumulator helpers for the ARM multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01169 
/* ARM: add the product i*j to the accumulator c2:c1:c0.
 * UMULL leaves the product in scratch registers r0 (low) / r1 (high);
 * ADDS/ADCS/ADC then ripple it, with carry, through c0, c1 and c2.
 * r0, r1 and the flags are declared clobbered.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)                                          \
__asm__(                                                      \
"  UMULL  r0,r1,%6,%7           \n\t"                         \
"  ADDS   %0,%0,r0              \n\t"                         \
"  ADCS   %1,%1,r1              \n\t"                         \
"  ADC    %2,%2,#0              \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
01177 
01178 #elif defined(TFM_PPC32)
01179 /* For 32-bit PPC */
01180 
/* Accumulator helpers for the PPC32 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01196    
/* PPC32: add the product i*j to the accumulator c2:c1:c0.
 * Scratch register 16 holds first the low word (mullw), then the high
 * word (mulhwu) of the product; addc/adde/addze carry the sum through
 * c0, c1 and c2.
 * untested: the sequence relies on mulhwu leaving the carry from the
 * preceding addc intact.  Docs say it does.
 * "cc" is listed as clobbered for consistency with the PPC32 SQRADD
 * family of macros in this file.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)             \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "cc");
01206 
01207 #elif defined(TFM_PPC64)
01208 /* For 64-bit PPC */
01209 
/* Accumulator helpers for the PPC64 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01225    
/* PPC64: add the product i*j to the accumulator c2:c1:c0.
 * Scratch register 16 holds first the low doubleword (mulld), then the
 * high doubleword (mulhdu) of the product; addc/adde/addze carry the sum
 * through c0, c1 and c2.
 * untested: the sequence relies on mulhdu leaving the carry from the
 * preceding addc intact.  Docs say it does.
 * The asm keyword is spelled __asm__ (two leading underscores), as in
 * every other macro of this file.
 * "cc" is listed as clobbered for consistency with the PPC64 SQRADD
 * family of macros in this file.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)             \
__asm__(                         \
   " mulld  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhdu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "cc");
01235 
01236 #elif defined(TFM_AVR32)
01237 
/* AVR32 code */
01239 
/* Accumulator helpers for the AVR32 multiplication combas.  They act on
 * variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty on this target */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty on this target */
#define COMBA_FINI
01255    
/* AVR32: add the product i*j to the accumulator c2:c1:c0.
 * mulu.d leaves the product in scratch registers r2/r3; add/adc/acr
 * ripple it, with carry, into c0, c1 and c2.
 * The asm keyword is spelled __asm__ (two leading underscores), as in
 * every other macro of this file.
 * "cc" is listed as clobbered because add/adc alter the flags, matching
 * the clobber list SQRADDDB already uses on this target.
 * Variables c0, c1 and c2 must be visible where the macro expands. */
#define MULADD(i, j)             \
__asm__(                         \
   " mulu.d r2,%6,%7        \n\t"\
   " add    %0,r2           \n\t"\
   " adc    %1,%1,r3        \n\t"\
   " acr    %2              \n\t"\
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3","cc");
01263 
01264 #else
01265 /* ISO C code */
01266 
/* Accumulator helpers for the portable multiplication combas.  They act
 * on variables c0, c1 and c2 that must exist where the macro expands.
 * Every non-empty expansion already ends in ';'. */

/* set-up hook; empty in the portable build */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: c0 <- c1, c1 <- c2, c2 <- 0 */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (c0) into x */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] (c1) into x */
#define COMBA_STORE2(x) \
   x = c1;

/* tear-down hook; empty in the portable build */
#define COMBA_FINI
01282    
/* Portable C: add the product i*j to the accumulator c2:c1:c0.
 * The sum is built in a double-width temporary; its low digit goes to
 * c0, the overflow is added to c1, and c1's overflow is added to c2.
 * The arguments are parenthesized before widening so that an expression
 * argument is evaluated in its own type first.
 * Needs fp_digit, fp_word and DIGIT_BIT, plus variables c0, c1 and c2,
 * where the macro expands.  The expansion already ends in ';'. */
#define MULADD(i, j)                                                    \
   do { fp_word t;                                                      \
   t = (fp_word)c0 + ((fp_word)(i)) * ((fp_word)(j)); c0 = (fp_digit)t; \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                                  \
   c1 = (fp_digit)t; c2 += (fp_digit)(t >> DIGIT_BIT);                  \
   } while (0);
01289 
01290 #endif
01291 
01292 
01293 #ifdef TFM_SMALL_SET
01294 #include "fp_mul_comba_small_set.i"
01295 #include "fp_mul_comba_3.i"
01296 #include "fp_mul_comba_4.i"
01297 #include "fp_mul_comba_6.i"
01298 #include "fp_mul_comba_7.i"
01299 #include "fp_mul_comba_8.i"
01300 #include "fp_mul_comba_9.i"
01301 #include "fp_mul_comba_12.i"
01302 #include "fp_mul_comba_17.i"
01303 #include "fp_mul_comba_20.i"
01304 #include "fp_mul_comba_24.i"
01305 #include "fp_mul_comba_28.i"
01306 #include "fp_mul_comba_32.i"
01307 #include "fp_mul_comba_48.i"
01308 #include "fp_mul_comba_64.i"
01309 #endif
01310 
01311 /* end fp_mul_comba.c asm */
01312