ssh lib

Dependents:   OS

Embed: (wiki syntax)

« Back to documentation index

asm.c Source File

asm.c

00001 /* asm.c
00002  *
00003  * Copyright (C) 2006-2017 wolfSSL Inc.
00004  *
00005  * This file is part of wolfSSL.
00006  *
00007  * wolfSSL is free software; you can redistribute it and/or modify
00008  * it under the terms of the GNU General Public License as published by
00009  * the Free Software Foundation; either version 2 of the License, or
00010  * (at your option) any later version.
00011  *
00012  * wolfSSL is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  * GNU General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU General Public License
00018  * along with this program; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
00020  */
00021 
00022 
00023 #ifdef HAVE_CONFIG_H
00024     #include <config.h>
00025 #endif
00026 
00027 #include <wolfcrypt/settings.h>
00028 
00029 /*
00030  * Based on public domain TomsFastMath 0.10 by Tom St Denis, tomstdenis@iahu.ca,
00031  * http://math.libtomcrypt.com
00032  */
00033 
00034 
00035 /******************************************************************/
00036 /* fp_montgomery_reduce.c asm or generic */
00037 
00038 
00039 /* Each platform needs to query cpuid (leaf 7) to see if MULX/ADX is
00040  * supported. Also, set up a macro for proper linkage w/o ABI conflicts
00041  */
00042 
00043 #if defined(HAVE_INTEL_MULX)
/* cpuid(reg, leaf, sub): fill reg[0..3] with EAX..EDX for the given
 * CPUID leaf/sub-leaf.  GCC/Clang use inline asm; MSVC uses __cpuidex. */
00044 #ifndef _MSC_VER
00045     #define cpuid(reg, leaf, sub)\
00046             __asm__ __volatile__ ("cpuid":\
00047              "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3]) :\
00048              "a" (leaf), "c"(sub));
00049 
00050     #define XASM_LINK(f) asm(f)
00051 #else
00052 
00053     #include <intrin.h>
00054     #define cpuid(a,b,c) __cpuidex((int*)a,b,c)
00055 
00056     #define XASM_LINK(f)
00057 
00058 #endif /* _MSC_VER */
00059 
/* Indices of the EAX..EDX values within the cpuid output array. */
00060 #define EAX 0
00061 #define EBX 1
00062 #define ECX 2
00063 #define EDX 3
00064 
/* Feature bits accumulated in cpuid_flags. */
00065 #define CPUID_AVX1   0x1
00066 #define CPUID_AVX2   0x2
00067 #define CPUID_RDRAND 0x4
00068 #define CPUID_RDSEED 0x8
00069 #define CPUID_BMI2   0x10   /* MULX, RORX */
00070 #define CPUID_ADX    0x20   /* ADCX, ADOX */
00071 
00072 #define IS_INTEL_AVX1       (cpuid_flags&CPUID_AVX1)
00073 #define IS_INTEL_AVX2       (cpuid_flags&CPUID_AVX2)
00074 #define IS_INTEL_BMI2       (cpuid_flags&CPUID_BMI2)
00075 #define IS_INTEL_ADX        (cpuid_flags&CPUID_ADX)
00076 #define IS_INTEL_RDRAND     (cpuid_flags&CPUID_RDRAND)
00077 #define IS_INTEL_RDSEED     (cpuid_flags&CPUID_RDSEED)
00078 #define SET_FLAGS
00079 
/* cpuid_check: 0 until the probe has run once (see set_cpuid_flags);
 * cpuid_flags then caches the CPUID_* feature bits. */
00080 static word32 cpuid_check = 0 ;
00081 static word32 cpuid_flags = 0 ;
/* Return bit 'bit' of output register reg[num] from CPUID (leaf, sub),
 * or 0 when the CPU is neither GenuineIntel nor AuthenticAMD.
 * num is one of EAX..EDX above; result is 0 or 1. */
00083 static word32 cpuid_flag(word32 leaf, word32 sub, word32 num, word32 bit) {
00084     int got_intel_cpu = 0;
00085     int got_amd_cpu = 0;
00086     unsigned int reg[5];
00087 
    /* extra fifth element NUL-terminates the 12-byte vendor string if it
     * is ever read as a char* (leaf 0 leaves EBX/EDX/ECX = vendor id) */
00088     reg[4] = '\0' ;
00089     cpuid(reg, 0, 0);
00090 
00091     /* check for intel cpu */
00092     if( memcmp((char *)&(reg[EBX]), "Genu", 4) == 0 &&
00093         memcmp((char *)&(reg[EDX]), "ineI", 4) == 0 &&
00094         memcmp((char *)&(reg[ECX]), "ntel", 4) == 0) {
00095         got_intel_cpu = 1;
00096     }
00097 
00098     /* check for AMD cpu */
00099     if( memcmp((char *)&(reg[EBX]), "Auth", 4) == 0 &&
00100         memcmp((char *)&(reg[EDX]), "enti", 4) == 0 &&
00101         memcmp((char *)&(reg[ECX]), "cAMD", 4) == 0) {
00102         got_amd_cpu = 1;
00103     }
00104     if (got_intel_cpu || got_amd_cpu) {
00105         cpuid(reg, leaf, sub);
00106         return((reg[num]>>bit)&0x1) ;
00107     }
00108     return 0 ;
00109 }
00110 
00111 WC_INLINE static int set_cpuid_flags(void) {
00112     if(cpuid_check == 0) {
00113         if(cpuid_flag(7, 0, EBX, 8)){  cpuid_flags |= CPUID_BMI2 ; }
00114         if(cpuid_flag(7, 0, EBX,19)){  cpuid_flags |= CPUID_ADX  ; }
00115         cpuid_check = 1 ;
00116         return 0 ;
00117     }
00118     return 1 ;
00119 }
00120 
/* Run 'func' then 'ret' only when the CPU supports both BMI2 and ADX
 * (required by the MULX code paths); probes CPUID lazily on first use. */
00121 #define RETURN return
00122 #define IF_HAVE_INTEL_MULX(func, ret)    \
00123    if(cpuid_check==0)set_cpuid_flags() ; \
00124    if(IS_INTEL_BMI2 && IS_INTEL_ADX){  func;  ret ;  }
00125 
00126 #else
/* No MULX support compiled in: the guard expands to nothing. */
00127     #define IF_HAVE_INTEL_MULX(func, ret)
00128 #endif
00129 
00130 #if defined(TFM_X86) && !defined(TFM_SSE2)
00131 /* x86-32 code */
00132 
00133 #define MONT_START
00134 #define MONT_FINI
00135 #define LOOP_END
00136 #define LOOP_START \
00137    mu = c[x] * mp
00138 
/* _c[LO] += mu * (*tmpm++) + cy; cy receives the high-word carry out. */
00139 #define INNERMUL                                          \
00140 __asm__(                                                  \
00141    "movl %5,%%eax \n\t"                                   \
00142    "mull %4       \n\t"                                   \
00143    "addl %1,%%eax \n\t"                                   \
00144    "adcl $0,%%edx \n\t"                                   \
00145    "addl %%eax,%0 \n\t"                                   \
00146    "adcl $0,%%edx \n\t"                                   \
00147    "movl %%edx,%1 \n\t"                                   \
00148 :"=g"(_c[LO]), "=r"(cy)                                   \
00149 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
00150 : "%eax", "%edx", "cc")
00151 
/* _c[LO] += cy; cy becomes the carry out (0 or 1, via setb). */
00152 #define PROPCARRY                           \
00153 __asm__(                                    \
00154    "addl   %1,%0    \n\t"                   \
00155    "setb   %%al     \n\t"                   \
00156    "movzbl %%al,%1 \n\t"                    \
00157 :"=g"(_c[LO]), "=r"(cy)                     \
00158 :"0"(_c[LO]), "1"(cy)                       \
00159 : "%eax", "cc")
00160 
00161 /******************************************************************/
00162 #elif defined(TFM_X86_64)
00163 /* x86-64 code */
00164 
00165 #define MONT_START
00166 #define MONT_FINI
00167 #define LOOP_END
00168 #define LOOP_START \
00169    mu = c[x] * mp
00170 
/* _c[LO] += mu * (*tmpm++) + cy; cy receives the high-word carry out. */
00171 #define INNERMUL                                          \
00172 __asm__(                                                  \
00173    "movq %5,%%rax \n\t"                                   \
00174    "mulq %4       \n\t"                                   \
00175    "addq %1,%%rax \n\t"                                   \
00176    "adcq $0,%%rdx \n\t"                                   \
00177    "addq %%rax,%0 \n\t"                                   \
00178    "adcq $0,%%rdx \n\t"                                   \
00179    "movq %%rdx,%1 \n\t"                                   \
00180 :"=g"(_c[LO]), "=r"(cy)                                   \
00181 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++)              \
00182 : "%rax", "%rdx", "cc")
00183 
00184 #if defined(HAVE_INTEL_MULX)
/* Eight-limb multiply-accumulate using BMI2 MULX (implicit rdx source)
 * with the ADX dual carry chains (ADCX/ADOX): c[0..7] += y * x[0..7]
 * plus incoming cy, final carry left in cy.
 * NOTE(review): parameter z is unused; the destination array operand
 * is the variable c_mulx at the expansion site — confirm intent. */
00185 #define MULX_INNERMUL8(x,y,z,cy)                                       \
00186     __asm__  volatile (                                                \
00187         "movq   %[yn], %%rdx\n\t"                                      \
00188         "xorq   %%rcx, %%rcx\n\t"                                      \
00189         "movq   0(%[c]), %%r8\n\t"                                     \
00190         "movq   8(%[c]), %%r9\n\t"                                     \
00191         "movq   16(%[c]), %%r10\n\t"                                   \
00192         "movq   24(%[c]), %%r11\n\t"                                   \
00193         "movq   32(%[c]), %%r12\n\t"                                   \
00194         "movq   40(%[c]), %%r13\n\t"                                   \
00195         "movq   48(%[c]), %%r14\n\t"                                   \
00196         "movq   56(%[c]), %%r15\n\t"                                   \
00197                                                                        \
00198         "mulx   0(%[xp]), %%rax, %%rcx\n\t"                            \
00199         "adcxq  %[cy], %%r8\n\t"                                       \
00200         "adoxq  %%rax, %%r8\n\t"                                       \
00201         "mulx   8(%[xp]), %%rax, %[cy]\n\t"                            \
00202         "adcxq  %%rcx, %%r9\n\t"                                       \
00203         "adoxq  %%rax, %%r9\n\t"                                       \
00204         "mulx   16(%[xp]), %%rax, %%rcx\n\t"                           \
00205         "adcxq  %[cy], %%r10\n\t"                                      \
00206         "adoxq  %%rax, %%r10\n\t"                                      \
00207         "mulx   24(%[xp]), %%rax, %[cy]\n\t"                           \
00208         "adcxq  %%rcx, %%r11\n\t"                                      \
00209         "adoxq  %%rax, %%r11\n\t"                                      \
00210         "mulx   32(%[xp]), %%rax, %%rcx\n\t"                           \
00211         "adcxq  %[cy], %%r12\n\t"                                      \
00212         "adoxq  %%rax, %%r12\n\t"                                      \
00213         "mulx   40(%[xp]), %%rax, %[cy]\n\t"                           \
00214         "adcxq  %%rcx, %%r13\n\t"                                      \
00215         "adoxq  %%rax, %%r13\n\t"                                      \
00216         "mulx   48(%[xp]), %%rax, %%rcx\n\t"                           \
00217         "adcxq  %[cy], %%r14\n\t"                                      \
00218         "adoxq  %%rax, %%r14\n\t"                                      \
00219         "adcxq  %%rcx, %%r15\n\t"                                      \
00220         "mulx   56(%[xp]), %%rax, %[cy]\n\t"                           \
00221         "movq   $0, %%rdx\n\t"                                         \
00222         "adoxq  %%rdx, %%rax\n\t"                                      \
00223         "adcxq  %%rdx, %[cy]\n\t"                                      \
00224         "adoxq  %%rdx, %[cy]\n\t"                                      \
00225         "addq   %%rax, %%r15\n\t"                                      \
00226         "adcq   $0, %[cy]\n\t"                                         \
00227                                                                        \
00228         "movq   %%r8,   0(%[c])\n\t"                                   \
00229         "movq   %%r9,   8(%[c])\n\t"                                   \
00230         "movq   %%r10, 16(%[c])\n\t"                                   \
00231         "movq   %%r11, 24(%[c])\n\t"                                   \
00232         "movq   %%r12, 32(%[c])\n\t"                                   \
00233         "movq   %%r13, 40(%[c])\n\t"                                   \
00234         "movq   %%r14, 48(%[c])\n\t"                                   \
00235         "movq   %%r15, 56(%[c])\n\t"                                   \
00236         : [cy] "+r" (cy)                                               \
00237         : [xp] "r" (x), [c] "r" (c_mulx), [yn] "rm" (y)                \
00238         :"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", \
00239          "%rdx", "%rax", "%rcx" \
00240     )
00241 
/* Expansion-site wrapper using the loop variables tmpm/mu/_c/cy. */
00242 #define INNERMUL8_MULX \
00243 {\
00244     MULX_INNERMUL8(tmpm, mu, _c, cy);\
00245 }
00246 #endif
00247 
/* Unrolled 8-limb inner loop: for k = 0..7,
 * _c[k] = tmpm[k] * mu + _c[k] + cy, with the running high word
 * written back into cy (%1, tied to %3) after every limb.
 * r11 pipelines the next tmpm limb; r10 the next _c limb. */
00248 #define INNERMUL8 \
00249  __asm__(                    \
00250  "movq 0(%5),%%rax    \n\t"  \
00251  "movq 0(%2),%%r10    \n\t"  \
00252  "movq 0x8(%5),%%r11  \n\t"  \
00253  "mulq %4             \n\t"  \
00254  "addq %%r10,%%rax    \n\t"  \
00255  "adcq $0,%%rdx       \n\t"  \
00256  "movq 0x8(%2),%%r10  \n\t"  \
00257  "addq %3,%%rax       \n\t"  \
00258  "adcq $0,%%rdx       \n\t"  \
00259  "movq %%rax,0(%0)    \n\t"  \
00260  "movq %%rdx,%1       \n\t"  \
00261  \
00262  "movq %%r11,%%rax    \n\t"  \
00263  "movq 0x10(%5),%%r11 \n\t"  \
00264  "mulq %4             \n\t"  \
00265  "addq %%r10,%%rax    \n\t"  \
00266  "adcq $0,%%rdx       \n\t"  \
00267  "movq 0x10(%2),%%r10 \n\t"  \
00268  "addq %3,%%rax       \n\t"  \
00269  "adcq $0,%%rdx       \n\t"  \
00270  "movq %%rax,0x8(%0)  \n\t"  \
00271  "movq %%rdx,%1       \n\t"  \
00272  \
00273  "movq %%r11,%%rax    \n\t"  \
00274  "movq 0x18(%5),%%r11 \n\t"  \
00275  "mulq %4             \n\t"  \
00276  "addq %%r10,%%rax    \n\t"  \
00277  "adcq $0,%%rdx       \n\t"  \
00278  "movq 0x18(%2),%%r10 \n\t"  \
00279  "addq %3,%%rax       \n\t"  \
00280  "adcq $0,%%rdx       \n\t"  \
00281  "movq %%rax,0x10(%0) \n\t"  \
00282  "movq %%rdx,%1       \n\t"  \
00283  \
00284  "movq %%r11,%%rax    \n\t"  \
00285  "movq 0x20(%5),%%r11 \n\t"  \
00286  "mulq %4             \n\t"  \
00287  "addq %%r10,%%rax    \n\t"  \
00288  "adcq $0,%%rdx       \n\t"  \
00289  "movq 0x20(%2),%%r10 \n\t"  \
00290  "addq %3,%%rax       \n\t"  \
00291  "adcq $0,%%rdx       \n\t"  \
00292  "movq %%rax,0x18(%0) \n\t"  \
00293  "movq %%rdx,%1       \n\t"  \
00294  \
00295  "movq %%r11,%%rax    \n\t"  \
00296  "movq 0x28(%5),%%r11 \n\t"  \
00297  "mulq %4             \n\t"  \
00298  "addq %%r10,%%rax    \n\t"  \
00299  "adcq $0,%%rdx       \n\t"  \
00300  "movq 0x28(%2),%%r10 \n\t"  \
00301  "addq %3,%%rax       \n\t"  \
00302  "adcq $0,%%rdx       \n\t"  \
00303  "movq %%rax,0x20(%0) \n\t"  \
00304  "movq %%rdx,%1       \n\t"  \
00305  \
00306  "movq %%r11,%%rax    \n\t"  \
00307  "movq 0x30(%5),%%r11 \n\t"  \
00308  "mulq %4             \n\t"  \
00309  "addq %%r10,%%rax    \n\t"  \
00310  "adcq $0,%%rdx       \n\t"  \
00311  "movq 0x30(%2),%%r10 \n\t"  \
00312  "addq %3,%%rax       \n\t"  \
00313  "adcq $0,%%rdx       \n\t"  \
00314  "movq %%rax,0x28(%0) \n\t"  \
00315  "movq %%rdx,%1       \n\t"  \
00316  \
00317  "movq %%r11,%%rax    \n\t"  \
00318  "movq 0x38(%5),%%r11 \n\t"  \
00319  "mulq %4             \n\t"  \
00320  "addq %%r10,%%rax    \n\t"  \
00321  "adcq $0,%%rdx       \n\t"  \
00322  "movq 0x38(%2),%%r10 \n\t"  \
00323  "addq %3,%%rax       \n\t"  \
00324  "adcq $0,%%rdx       \n\t"  \
00325  "movq %%rax,0x30(%0) \n\t"  \
00326  "movq %%rdx,%1       \n\t"  \
00327  \
00328  "movq %%r11,%%rax    \n\t"  \
00329  "mulq %4             \n\t"  \
00330  "addq %%r10,%%rax    \n\t"  \
00331  "adcq $0,%%rdx       \n\t"  \
00332  "addq %3,%%rax       \n\t"  \
00333  "adcq $0,%%rdx       \n\t"  \
00334  "movq %%rax,0x38(%0) \n\t"  \
00335  "movq %%rdx,%1       \n\t"  \
00336  \
00337 :"=r"(_c), "=r"(cy)                    \
00338 : "0"(_c),  "1"(cy), "g"(mu), "r"(tmpm)\
00339 : "%rax", "%rdx", "%r10", "%r11", "cc")
00340 
/* _c[LO] += cy; cy becomes the carry out (0 or 1, via setb). */
00341 #define PROPCARRY                           \
00342 __asm__(                                    \
00343    "addq   %1,%0    \n\t"                   \
00344    "setb   %%al     \n\t"                   \
00345    "movzbq %%al,%1 \n\t"                    \
00346 :"=g"(_c[LO]), "=r"(cy)                     \
00347 :"0"(_c[LO]), "1"(cy)                       \
00348 : "%rax", "cc")
00349 
00350 /******************************************************************/
00351 #elif defined(TFM_SSE2)
00352 /* SSE2 code (assumes 32-bit fp_digits) */
/* MMX register assignments (the code below uses %%mm0..%%mm7): */
00353 /*
00354  * mm0  *tmpm++, then Mu * (*tmpm++)
00355  * mm1  c[x], then Mu
00356  * mm2  mp
00357  * mm3  cy
00358  * mm4  _c[LO]
00359  */
00360 
/* Load mp into mm2 for the whole reduction pass. */
00361 #define MONT_START \
00362    __asm__("movd %0,%%mm2"::"g"(mp))
00363 
/* Leave MMX state so x87 code can run again. */
00364 #define MONT_FINI \
00365    __asm__("emms")
00366 
/* mm1 = mu = c[x] * mp; mm3 (carry) cleared. */
00367 #define LOOP_START          \
00368 __asm__(                    \
00369 "movd %0,%%mm1        \n\t" \
00370 "pxor %%mm3,%%mm3     \n\t" \
00371 "pmuludq %%mm2,%%mm1  \n\t" \
00372 :: "g"(c[x]))
00373 
00374 /* pmuludq on mmx registers does a 32x32->64 multiply. */
/* _c[LO] += mu * (*tmpm++) + cy (carry kept in mm3 across calls). */
00375 #define INNERMUL               \
00376 __asm__(                       \
00377    "movd %1,%%mm4        \n\t" \
00378    "movd %2,%%mm0        \n\t" \
00379    "paddq %%mm4,%%mm3    \n\t" \
00380    "pmuludq %%mm1,%%mm0  \n\t" \
00381    "paddq %%mm0,%%mm3    \n\t" \
00382    "movd %%mm3,%0        \n\t" \
00383    "psrlq $32, %%mm3     \n\t" \
00384 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) );
00385 
/* Unrolled 8-limb version of INNERMUL; carry stays in mm3 throughout. */
00386 #define INNERMUL8 \
00387 __asm__(                       \
00388    "movd 0(%1),%%mm4     \n\t" \
00389    "movd 0(%2),%%mm0     \n\t" \
00390    "paddq %%mm4,%%mm3    \n\t" \
00391    "pmuludq %%mm1,%%mm0  \n\t" \
00392    "movd 4(%2),%%mm5     \n\t" \
00393    "paddq %%mm0,%%mm3    \n\t" \
00394    "movd 4(%1),%%mm6     \n\t" \
00395    "movd %%mm3,0(%0)     \n\t" \
00396    "psrlq $32, %%mm3     \n\t" \
00397 \
00398    "paddq %%mm6,%%mm3    \n\t" \
00399    "pmuludq %%mm1,%%mm5  \n\t" \
00400    "movd 8(%2),%%mm6     \n\t" \
00401    "paddq %%mm5,%%mm3    \n\t" \
00402    "movd 8(%1),%%mm7     \n\t" \
00403    "movd %%mm3,4(%0)     \n\t" \
00404    "psrlq $32, %%mm3     \n\t" \
00405 \
00406    "paddq %%mm7,%%mm3    \n\t" \
00407    "pmuludq %%mm1,%%mm6  \n\t" \
00408    "movd 12(%2),%%mm7    \n\t" \
00409    "paddq %%mm6,%%mm3    \n\t" \
00410    "movd 12(%1),%%mm5     \n\t" \
00411    "movd %%mm3,8(%0)     \n\t" \
00412    "psrlq $32, %%mm3     \n\t" \
00413 \
00414    "paddq %%mm5,%%mm3    \n\t" \
00415    "pmuludq %%mm1,%%mm7  \n\t" \
00416    "movd 16(%2),%%mm5    \n\t" \
00417    "paddq %%mm7,%%mm3    \n\t" \
00418    "movd 16(%1),%%mm6    \n\t" \
00419    "movd %%mm3,12(%0)    \n\t" \
00420    "psrlq $32, %%mm3     \n\t" \
00421 \
00422    "paddq %%mm6,%%mm3    \n\t" \
00423    "pmuludq %%mm1,%%mm5  \n\t" \
00424    "movd 20(%2),%%mm6    \n\t" \
00425    "paddq %%mm5,%%mm3    \n\t" \
00426    "movd 20(%1),%%mm7    \n\t" \
00427    "movd %%mm3,16(%0)    \n\t" \
00428    "psrlq $32, %%mm3     \n\t" \
00429 \
00430    "paddq %%mm7,%%mm3    \n\t" \
00431    "pmuludq %%mm1,%%mm6  \n\t" \
00432    "movd 24(%2),%%mm7    \n\t" \
00433    "paddq %%mm6,%%mm3    \n\t" \
00434    "movd 24(%1),%%mm5     \n\t" \
00435    "movd %%mm3,20(%0)    \n\t" \
00436    "psrlq $32, %%mm3     \n\t" \
00437 \
00438    "paddq %%mm5,%%mm3    \n\t" \
00439    "pmuludq %%mm1,%%mm7  \n\t" \
00440    "movd 28(%2),%%mm5    \n\t" \
00441    "paddq %%mm7,%%mm3    \n\t" \
00442    "movd 28(%1),%%mm6    \n\t" \
00443    "movd %%mm3,24(%0)    \n\t" \
00444    "psrlq $32, %%mm3     \n\t" \
00445 \
00446    "paddq %%mm6,%%mm3    \n\t" \
00447    "pmuludq %%mm1,%%mm5  \n\t" \
00448    "paddq %%mm5,%%mm3    \n\t" \
00449    "movd %%mm3,28(%0)    \n\t" \
00450    "psrlq $32, %%mm3     \n\t" \
00451 :"=r"(_c) : "0"(_c), "r"(tmpm) );
00452 
00453 /* TAO switched tmpm from "g" to "r" after gcc tried to index the indexed stack
00454    pointer */
00455 
/* Spill the carry held in mm3 back into cy. */
00456 #define LOOP_END \
00457 __asm__( "movd %%mm3,%0  \n" :"=r"(cy))
00458 
/* _c[LO] += cy; cy becomes the carry out (0 or 1, via setb). */
00459 #define PROPCARRY                           \
00460 __asm__(                                    \
00461    "addl   %1,%0    \n\t"                   \
00462    "setb   %%al     \n\t"                   \
00463    "movzbl %%al,%1 \n\t"                    \
00464 :"=g"(_c[LO]), "=r"(cy)                     \
00465 :"0"(_c[LO]), "1"(cy)                       \
00466 : "%eax", "cc")
00467 
00468 /******************************************************************/
00469 #elif defined(TFM_ARM)
00470    /* ARMv4 code */
00471 
00472 #define MONT_START
00473 #define MONT_FINI
00474 #define LOOP_END
00475 #define LOOP_START \
00476    mu = c[x] * mp
00477 
00478 
00479 #ifdef __thumb__
00480 
/* _c[0] += mu * (*tmpm++) + cy: ADDS folds cy into the limb, the
 * ITE CS pair converts the carry flag into 0/1 in cy, then UMLAL
 * accumulates the 64-bit product into (r0, cy). */
00481 #define INNERMUL                    \
00482 __asm__(                            \
00483     " LDR    r0,%1            \n\t" \
00484     " ADDS   r0,r0,%0         \n\t" \
00485     " ITE    CS               \n\t" \
00486     " MOVCS  %0,#1            \n\t" \
00487     " MOVCC  %0,#0            \n\t" \
00488     " UMLAL  r0,%0,%3,%4      \n\t" \
00489     " STR    r0,%1            \n\t" \
00490 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"m"(_c[0]):"r0","cc");
00491 
/* _c[0] += cy; cy = carry out (0 or 1 via the conditional moves). */
00492 #define PROPCARRY                  \
00493 __asm__(                           \
00494     " LDR   r0,%1            \n\t" \
00495     " ADDS  r0,r0,%0         \n\t" \
00496     " STR   r0,%1            \n\t" \
00497     " ITE   CS               \n\t" \
00498     " MOVCS %0,#1            \n\t" \
00499     " MOVCC %0,#0            \n\t" \
00500 :"=r"(cy),"=m"(_c[0]):"0"(cy),"m"(_c[0]):"r0","cc");
00501 
00502 
00503 /* TAO thumb mode uses ite (if then else) to detect carry directly
00504  * fixed unmatched constraint warning by changing 1 to m  */
00505 
00506 #else  /* __thumb__ */
00507 
/* ARM-mode variant: same algorithm as the thumb INNERMUL above, using
 * conditionally-executed MOVCS/MOVCC instead of an IT block. */
00508 #define INNERMUL                    \
00509 __asm__(                            \
00510     " LDR    r0,%1            \n\t" \
00511     " ADDS   r0,r0,%0         \n\t" \
00512     " MOVCS  %0,#1            \n\t" \
00513     " MOVCC  %0,#0            \n\t" \
00514     " UMLAL  r0,%0,%3,%4      \n\t" \
00515     " STR    r0,%1            \n\t" \
00516 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","cc");
00517 
00518 #define PROPCARRY                  \
00519 __asm__(                           \
00520     " LDR   r0,%1            \n\t" \
00521     " ADDS  r0,r0,%0         \n\t" \
00522     " STR   r0,%1            \n\t" \
00523     " MOVCS %0,#1            \n\t" \
00524     " MOVCC %0,#0            \n\t" \
00525 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","cc");
00526 
00527 #endif /* __thumb__ */
00528 
00529 #elif defined(TFM_PPC32)
00530 
00531 /* PPC32 */
00532 #define MONT_START
00533 #define MONT_FINI
00534 #define LOOP_END
00535 #define LOOP_START \
00536    mu = c[x] * mp
00537 
/* _c[0] += mu * tmpm[0] + cy; cy = high-word carry out.
 * Registers 16/17 hold the 64-bit product low/high halves. */
00538 #define INNERMUL                     \
00539 __asm__(                             \
00540    " mullw    16,%3,%4       \n\t"   \
00541    " mulhwu   17,%3,%4       \n\t"   \
00542    " addc     16,16,%2       \n\t"   \
00543    " addze    17,17          \n\t"   \
00544    " addc     %1,16,%5       \n\t"   \
00545    " addze    %0,17          \n\t"   \
00546 :"=r"(cy),"=r"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", "cc"); ++tmpm;
00547 
/* _c[0] += cy; then cy = carry out: the xor zeroes %0 (which shares a
 * register with %2 via the "0" tie) and addze adds in the CA bit. */
00548 #define PROPCARRY                    \
00549 __asm__(                             \
00550    " addc     %1,%3,%2      \n\t"    \
00551    " xor      %0,%2,%2      \n\t"    \
00552    " addze    %0,%2         \n\t"    \
00553 :"=r"(cy),"=r"(_c[0]):"0"(cy),"1"(_c[0]):"cc");
00554 
00555 #elif defined(TFM_PPC64)
00556 
00557 /* PPC64 */
00558 #define MONT_START
00559 #define MONT_FINI
00560 #define LOOP_END
00561 #define LOOP_START \
00562    mu = c[x] * mp

/* _c[0] += mu * tmpm[0] + cy; cy = high-word carry out.
 * Fixes vs. original: the store used the non-existent mnemonic "sdx"
 * (store doubleword indexed is "stdx" in the Power ISA), and
 * "addc r16,16,%0" spelled r16 as a bare "16" — normalized to "r16"
 * (same register, explicit spelling). */
00564 #define INNERMUL                      \
00565 __asm__(                              \
00566    " mulld    r16,%3,%4       \n\t"   \
00567    " mulhdu   r17,%3,%4       \n\t"   \
00568    " addc     r16,r16,%0      \n\t"   \
00569    " addze    r17,r17         \n\t"   \
00570    " ldx      r18,0,%1        \n\t"   \
00571    " addc     r16,r16,r18     \n\t"   \
00572    " addze    %0,r17          \n\t"   \
00573    " stdx     r16,0,%1        \n\t"   \
00574 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","cc"); ++tmpm;

/* _c[0] += cy; cy = carry out (0 or 1): cy is zeroed by the xor and
 * addze adds in the CA bit.  Same "sdx" -> "stdx" fix as above. */
00576 #define PROPCARRY                     \
00577 __asm__(                              \
00578    " ldx      r16,0,%1       \n\t"    \
00579    " addc     r16,r16,%0     \n\t"    \
00580    " stdx     r16,0,%1       \n\t"    \
00581    " xor      %0,%0,%0       \n\t"    \
00582    " addze    %0,%0          \n\t"    \
00583 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","cc");
00584 
00585 /******************************************************************/
00586 
00587 #elif defined(TFM_AVR32)
00588 
00589 /* AVR32 */
00590 #define MONT_START
00591 #define MONT_FINI
00592 #define LOOP_END
00593 #define LOOP_START \
00594    mu = c[x] * mp
00595 
/* _c limb += mu * (*tmpm++) + cy: eor/acr capture the add's carry in
 * r3, macu.d accumulates the 64-bit product into r2:r3, and r3 becomes
 * the new cy. */
00596 #define INNERMUL                    \
00597 __asm__(                            \
00598     " ld.w   r2,%1            \n\t" \
00599     " add    r2,%0            \n\t" \
00600     " eor    r3,r3            \n\t" \
00601     " acr    r3               \n\t" \
00602     " macu.d r2,%3,%4         \n\t" \
00603     " st.w   %1,r2            \n\t" \
00604     " mov    %0,r3            \n\t" \
00605 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3");
00606 
/* _c limb += cy; cy = carry out via eor/acr.
 * NOTE(review): the "=r"(&_c[0]) output operand below is not an
 * lvalue, so this looks like it cannot compile as written — verify on
 * an AVR32 toolchain before relying on this path (compare the
 * "=r"(_c) form used by INNERMUL above). */
00607 #define PROPCARRY                    \
00608 __asm__(                             \
00609    " ld.w     r2,%1         \n\t"    \
00610    " add      r2,%0         \n\t"    \
00611    " st.w     %1,r2         \n\t"    \
00612    " eor      %0,%0         \n\t"    \
00613    " acr      %0            \n\t"    \
00614 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","cc");
00615 
00616 /******************************************************************/
00617 #elif defined(TFM_MIPS)
00618 
00619 /* MIPS */
00620 #define MONT_START
00621 #define MONT_FINI
00622 #define LOOP_END
00623 #define LOOP_START \
00624    mu = c[x] * mp
00625 
/* _c[0] += mu * tmpm[0] + cy; cy = high-word carry out.  MIPS has no
 * carry flag, so each carry is reconstructed with sltu (result < addend). */
00626 #define INNERMUL                     \
00627 __asm__(                             \
00628    " multu    %3,%4          \n\t"   \
00629    " mflo     $12            \n\t"   \
00630    " mfhi     $13            \n\t"   \
00631    " addu     $12,$12,%0     \n\t"   \
00632    " sltu     $10,$12,%0     \n\t"   \
00633    " addu     $13,$13,$10    \n\t"   \
00634    " lw       $10,%1         \n\t"   \
00635    " addu     $12,$12,$10    \n\t"   \
00636    " sltu     $10,$12,$10    \n\t"   \
00637    " addu     %0,$13,$10     \n\t"   \
00638    " sw       $12,%1         \n\t"   \
00639 :"+r"(cy),"+m"(_c[0]):""(cy),"r"(mu),"r"(tmpm[0]),""(_c[0]):"$10","$12","$13"); ++tmpm;
00640 
/* _c[0] += cy; cy = carry out (sltu detects the wrap). */
00641 #define PROPCARRY                    \
00642 __asm__(                             \
00643    " lw       $10,%1        \n\t"    \
00644    " addu     $10,$10,%0    \n\t"    \
00645    " sw       $10,%1        \n\t"    \
00646    " sltu     %0,$10,%0     \n\t"    \
00647 :"+r"(cy),"+m"(_c[0]):""(cy),""(_c[0]):"$10");
00648 
00649 /******************************************************************/
00650 #else
00651 
00652 /* ISO C code */
00653 #define MONT_START
00654 #define MONT_FINI
00655 #define LOOP_END
00656 #define LOOP_START \
00657    mu = c[x] * mp
00658 
/* Portable fallback: accumulate digit + carry + product in a
 * double-width fp_word, then split into stored digit and new carry. */
00659 #define INNERMUL                                      \
00660    do { fp_word t;                                    \
00661    t  = ((fp_word)_c[0] + (fp_word)cy) +              \
00662                 (((fp_word)mu) * ((fp_word)*tmpm++)); \
00663    _c[0] = (fp_digit)t;                               \
00664    cy = (fp_digit)(t >> DIGIT_BIT);                   \
00665    } while (0)
00666 
/* _c[0] += cy; cy = 1 iff the addition wrapped. */
00667 #define PROPCARRY \
00668    do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)
00669 
00670 #endif
00671 /******************************************************************/
00672 
00673 
00674 #define LO  0
00675 /* end fp_montgomery_reduce.c asm */
00676 
00677 
00678 /* start fp_sqr_comba.c asm */
00679 #if defined(TFM_X86)
00680 
00681 /* x86-32 optimized */
00682 
00683 #define COMBA_START
00684 
00685 #define CLEAR_CARRY \
00686    c0 = c1 = c2 = 0;
00687 
00688 #define COMBA_STORE(x) \
00689    x = c0;
00690 
00691 #define COMBA_STORE2(x) \
00692    x = c1;
00693 
00694 #define CARRY_FORWARD \
00695    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00696 
00697 #define COMBA_FINI
00698 
/* (c2,c1,c0) += i*i   (squaring term; j is intentionally unused) */
00699 #define SQRADD(i, j)                                      \
00700 __asm__(                                                  \
00701      "movl  %6,%%eax     \n\t"                            \
00702      "mull  %%eax        \n\t"                            \
00703      "addl  %%eax,%0     \n\t"                            \
00704      "adcl  %%edx,%1     \n\t"                            \
00705      "adcl  $0,%2        \n\t"                            \
00706      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","cc");
00707 
/* (c2,c1,c0) += 2*i*j (cross term added twice) */
00708 #define SQRADD2(i, j)                                     \
00709 __asm__(                                                  \
00710      "movl  %6,%%eax     \n\t"                            \
00711      "mull  %7           \n\t"                            \
00712      "addl  %%eax,%0     \n\t"                            \
00713      "adcl  %%edx,%1     \n\t"                            \
00714      "adcl  $0,%2        \n\t"                            \
00715      "addl  %%eax,%0     \n\t"                            \
00716      "adcl  %%edx,%1     \n\t"                            \
00717      "adcl  $0,%2        \n\t"                            \
00718      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx", "cc");
00719 
/* (sc2,sc1,sc0) = i*j   (start a sub-accumulator chain) */
00720 #define SQRADDSC(i, j)                                    \
00721 __asm__(                                                     \
00722      "movl  %3,%%eax     \n\t"                            \
00723      "mull  %4           \n\t"                            \
00724      "movl  %%eax,%0     \n\t"                            \
00725      "movl  %%edx,%1     \n\t"                            \
00726      "xorl  %2,%2        \n\t"                            \
00727      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%eax","%edx","cc");
00728 
/* (sc2,sc1,sc0) += i*j   (accumulate into the sub-chain) */
00729 #define SQRADDAC(i, j)                                    \
00730 __asm__(                                                  \
00731      "movl  %6,%%eax     \n\t"                            \
00732      "mull  %7           \n\t"                            \
00733      "addl  %%eax,%0     \n\t"                            \
00734      "adcl  %%edx,%1     \n\t"                            \
00735      "adcl  $0,%2        \n\t"                            \
00736      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","cc");
00737 
/* (c2,c1,c0) += 2*(sc2,sc1,sc0)   (fold the doubled sub-chain in) */
00738 #define SQRADDDB                                          \
00739 __asm__(                                                  \
00740      "addl %6,%0         \n\t"                            \
00741      "adcl %7,%1         \n\t"                            \
00742      "adcl %8,%2         \n\t"                            \
00743      "addl %6,%0         \n\t"                            \
00744      "adcl %7,%1         \n\t"                            \
00745      "adcl %8,%2         \n\t"                            \
00746      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00747 
00748 #elif defined(TFM_X86_64)
00749 /* x86-64 optimized */
00750 
00751 #define COMBA_START
00752 
00753 #define CLEAR_CARRY \
00754    c0 = c1 = c2 = 0;
00755 
00756 #define COMBA_STORE(x) \
00757    x = c0;
00758 
00759 #define COMBA_STORE2(x) \
00760    x = c1;
00761 
00762 #define CARRY_FORWARD \
00763    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
00764 
00765 #define COMBA_FINI
00766 
/* (c2,c1,c0) += i*i   (squaring term; j is intentionally unused)
 * NOTE(review): the "x"(i) constraint picks an SSE register for an
 * integer digit, so "movq %6,%%rax" becomes a movq from an xmm
 * register; compare the "m"(i) form in the 32-bit version — confirm
 * this is intended. */
00767 #define SQRADD(i, j)                                      \
00768 __asm__(                                                  \
00769      "movq  %6,%%rax     \n\t"                            \
00770      "mulq  %%rax        \n\t"                            \
00771      "addq  %%rax,%0     \n\t"                            \
00772      "adcq  %%rdx,%1     \n\t"                            \
00773      "adcq  $0,%2        \n\t"                            \
00774      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "x"(i) :"%rax","%rdx","cc");
00775 
/* (c2,c1,c0) += 2*i*j (cross term added twice) */
00776 #define SQRADD2(i, j)                                     \
00777 __asm__(                                                  \
00778      "movq  %6,%%rax     \n\t"                            \
00779      "mulq  %7           \n\t"                            \
00780      "addq  %%rax,%0     \n\t"                            \
00781      "adcq  %%rdx,%1     \n\t"                            \
00782      "adcq  $0,%2        \n\t"                            \
00783      "addq  %%rax,%0     \n\t"                            \
00784      "adcq  %%rdx,%1     \n\t"                            \
00785      "adcq  $0,%2        \n\t"                            \
00786      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
00787 
/* (sc2,sc1,sc0) = i*j   (start a sub-accumulator chain) */
00788 #define SQRADDSC(i, j)                                    \
00789 __asm__(                                                  \
00790      "movq  %3,%%rax     \n\t"                            \
00791      "mulq  %4           \n\t"                            \
00792      "movq  %%rax,%0     \n\t"                            \
00793      "movq  %%rdx,%1     \n\t"                            \
00794      "xorq  %2,%2        \n\t"                            \
00795      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "g"(i), "g"(j) :"%rax","%rdx","cc");
00796 
/* (sc2,sc1,sc0) += i*j   (accumulate into the sub-chain) */
00797 #define SQRADDAC(i, j)                                                         \
00798 __asm__(                                                  \
00799      "movq  %6,%%rax     \n\t"                            \
00800      "mulq  %7           \n\t"                            \
00801      "addq  %%rax,%0     \n\t"                            \
00802      "adcq  %%rdx,%1     \n\t"                            \
00803      "adcq  $0,%2        \n\t"                            \
00804      :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
00805 
/* (c2,c1,c0) += 2*(sc2,sc1,sc0)   (fold the doubled sub-chain in) */
00806 #define SQRADDDB                                          \
00807 __asm__(                                                  \
00808      "addq %6,%0         \n\t"                            \
00809      "adcq %7,%1         \n\t"                            \
00810      "adcq %8,%2         \n\t"                            \
00811      "addq %6,%0         \n\t"                            \
00812      "adcq %7,%1         \n\t"                            \
00813      "adcq %8,%2         \n\t"                            \
00814      :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
00815 
#elif defined(TFM_SSE2)

/* SSE2 Optimized */

/* 32-bit Comba squaring using MMX/SSE2 PMULUDQ (32x32 -> 64-bit multiply).
 * c2:c1:c0 is the column accumulator; sc2:sc1:sc0 is the saved chain. */

#define COMBA_START

/* clear the column accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low digit of the finished column */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle digit (carry) of the finished column */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one digit for the next column */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* EMMS exits MMX state so subsequent x87 FP code is safe */
#define COMBA_FINI \
   __asm__("emms");

/* c2:c1:c0 += i*i (j unused); PMULUDQ gives the 64-bit square in mm0 */
#define SQRADD(i, j)                                      \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "pmuludq %%mm0,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "adcl  %%eax,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","cc");

/* c2:c1:c0 += 2*i*j; product added twice so carries propagate */
#define SQRADD2(i, j)                                     \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

/* sc2:sc1:sc0 = i*j (start a saved product chain; sc2 zeroed) */
#define SQRADDSC(i, j)                                                         \
__asm__(                                                  \
     "movd  %3,%%mm0     \n\t"                            \
     "movd  %4,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%0     \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%1     \n\t"                            \
     "xorl  %2,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "m"(i), "m"(j));

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

/* sc2:sc1:sc0 += i*j (accumulate into the saved chain) */
#define SQRADDAC(i, j)                                                         \
__asm__(                                                  \
     "movd  %6,%%mm0     \n\t"                            \
     "movd  %7,%%mm1     \n\t"                            \
     "pmuludq %%mm1,%%mm0\n\t"                            \
     "movd  %%mm0,%%eax  \n\t"                            \
     "psrlq $32,%%mm0    \n\t"                            \
     "movd  %%mm0,%%edx  \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "m"(i), "m"(j)  :"%eax","%edx","cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (saved chain added twice) */
#define SQRADDDB                                          \
__asm__(                                                  \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     "addl %6,%0         \n\t"                            \
     "adcl %7,%1         \n\t"                            \
     "adcl %8,%2         \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");

#elif defined(TFM_ARM)

/* ARM code */

/* Comba squaring via UMULL (32x32 -> 64-bit multiply into r0:r1).
 * c2:c1:c0 is the column accumulator; sc2:sc1:sc0 is the saved chain. */

#define COMBA_START

/* clear the column accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low digit of the finished column */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle digit (carry) of the finished column */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one digit for the next column */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2
 * (c2:c1:c0 += i*i; j is unused when squaring) */
#define SQRADD(i, j)                                             \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%6              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "cc");

/* for squaring some of the terms are doubled...
 * c2:c1:c0 += 2*i*j; the product is added twice so carries propagate */
#define SQRADD2(i, j)                                            \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");

/* sc2:sc1:sc0 = i*j (SUB x,x,x zeroes sc2) */
#define SQRADDSC(i, j)                                           \
__asm__(                                                         \
"  UMULL  %0,%1,%3,%4              \n\t"                         \
"  SUB    %2,%2,%2                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "r"(i), "r"(j) : "cc");

/* TAO removed sc0,1,2 as input to remove warning so %6,%7 become %3,%4 */

/* sc2:sc1:sc0 += i*j (accumulate into the saved chain) */
#define SQRADDAC(i, j)                                           \
__asm__(                                                         \
"  UMULL  r0,r1,%6,%7              \n\t"                         \
"  ADDS   %0,%0,r0                 \n\t"                         \
"  ADCS   %1,%1,r1                 \n\t"                         \
"  ADC    %2,%2,#0                 \n\t"                         \
:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (saved chain added twice) */
#define SQRADDDB                                                 \
__asm__(                                                         \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
"  ADDS  %0,%0,%3                     \n\t"                      \
"  ADCS  %1,%1,%4                     \n\t"                      \
"  ADC   %2,%2,%5                     \n\t"                      \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC32)

/* PPC32 */

/* Comba squaring via mullw/mulhwu (low/high 32 bits of the product).
 * GPRs 16/17 are used as scratch; c2:c1:c0 is the column accumulator. */

#define COMBA_START

/* clear the column accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low digit of the finished column */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle digit (carry) of the finished column */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one digit for the next column */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2
 * (c2:c1:c0 += i*i; j is unused when squaring) */
#define SQRADD(i, j)             \
__asm__(                         \
   " mullw  16,%6,%6       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%6       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"16","cc");

/* for squaring some of the terms are doubled...
 * c2:c1:c0 += 2*i*j; product computed once, added twice */
#define SQRADD2(i, j)            \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " mulhwu 17,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
   " addc   %0,%0,16       \n\t" \
   " adde   %1,%1,17       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16", "17","cc");

/* sc2:sc1:sc0 = i*j (sc2 zeroed by xor) */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mullw  %0,%6,%7        \n\t" \
   " mulhwu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

/* sc2:sc1:sc0 += i*j (accumulate into the saved chain) */
#define SQRADDAC(i, j)           \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"16", "cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (saved chain added twice) */
#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");

#elif defined(TFM_PPC64)
/* PPC64 */

/* Comba squaring via mulld/mulhdu (low/high 64 bits of the product).
 * r16/r17 are used as scratch; c2:c1:c0 is the column accumulator. */

#define COMBA_START

/* clear the column accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low digit of the finished column */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle digit (carry) of the finished column */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one digit for the next column */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2
 * (c2:c1:c0 += i*i; j is unused when squaring) */
#define SQRADD(i, j)              \
__asm__(                          \
   " mulld  r16,%6,%6       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " mulhdu r16,%6,%6       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r16","cc");

/* for squaring some of the terms are doubled...
 * c2:c1:c0 += 2*i*j; product computed once, added twice */
#define SQRADD2(i, j)             \
__asm__(                          \
   " mulld  r16,%6,%7       \n\t" \
   " mulhdu r17,%6,%7       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2           \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " adde   %1,%1,r17       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16", "r17","cc");

/* sc2:sc1:sc0 = i*j (sc2 zeroed by xor) */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " mulld  %0,%6,%7        \n\t" \
   " mulhdu %1,%6,%7        \n\t" \
   " xor    %2,%2,%2        \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

/* sc2:sc1:sc0 += i*j (accumulate into the saved chain) */
#define SQRADDAC(i, j)            \
__asm__(                          \
   " mulld  r16,%6,%7       \n\t" \
   " addc   %0,%0,r16       \n\t" \
   " mulhdu r16,%6,%7       \n\t" \
   " adde   %1,%1,r16       \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r16", "cc");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (saved chain added twice) */
#define SQRADDDB                  \
__asm__(                          \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
   " addc   %0,%0,%3        \n\t" \
   " adde   %1,%1,%4        \n\t" \
   " adde   %2,%2,%5        \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");


01105 #elif defined(TFM_AVR32)
01106 
01107 /* AVR32 */
01108 
01109 #define COMBA_START
01110 
01111 #define CLEAR_CARRY \
01112    c0 = c1 = c2 = 0;
01113 
01114 #define COMBA_STORE(x) \
01115    x = c0;
01116 
01117 #define COMBA_STORE2(x) \
01118    x = c1;
01119 
01120 #define CARRY_FORWARD \
01121    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
01122 
01123 #define COMBA_FINI
01124 
01125 /* multiplies point i and j, updates carry "c1" and digit c2 */
01126 #define SQRADD(i, j)             \
01127 __asm__(                         \
01128    " mulu.d r2,%6,%6       \n\t" \
01129    " add    %0,%0,r2       \n\t" \
01130    " adc    %1,%1,r3       \n\t" \
01131    " acr    %2             \n\t" \
01132 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"r2","r3");
01133 
01134 /* for squaring some of the terms are doubled... */
01135 #define SQRADD2(i, j)            \
01136 __asm__(                         \
01137    " mulu.d r2,%6,%7       \n\t" \
01138    " add    %0,%0,r2       \n\t" \
01139    " adc    %1,%1,r3       \n\t" \
01140    " acr    %2,            \n\t" \
01141    " add    %0,%0,r2       \n\t" \
01142    " adc    %1,%1,r3       \n\t" \
01143    " acr    %2,            \n\t" \
01144 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2", "r3");
01145 
01146 #define SQRADDSC(i, j)            \
01147 __asm__(                          \
01148    " mulu.d r2,%6,%7        \n\t" \
01149    " mov    %0,r2           \n\t" \
01150    " mov    %1,r3           \n\t" \
01151    " eor    %2,%2           \n\t" \
01152 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "r2", "r3");
01153 
01154 #define SQRADDAC(i, j)           \
01155 __asm__(                         \
01156    " mulu.d r2,%6,%7       \n\t" \
01157    " add    %0,%0,r2       \n\t" \
01158    " adc    %1,%1,r3       \n\t" \
01159    " acr    %2             \n\t" \
01160 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"r2", "r3");
01161 
01162 #define SQRADDDB                  \
01163 __asm__(                          \
01164    " add    %0,%0,%3        \n\t" \
01165    " adc    %1,%1,%4        \n\t" \
01166    " adc    %2,%2,%5        \n\t" \
01167    " add    %0,%0,%3        \n\t" \
01168    " adc    %1,%1,%4        \n\t" \
01169    " adc    %2,%2,%5        \n\t" \
01170 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "cc");
01171 
#elif defined(TFM_MIPS)

/* MIPS */

/* Comba squaring without carry flags: MIPS has no add-with-carry, so
 * carries are recovered with sltu (set-if-unsigned-less-than) after each
 * addition.  $10-$15 are used as scratch; c2:c1:c0 is the accumulator. */

#define COMBA_START

/* clear the column accumulator */
#define CLEAR_CARRY \
   c0 = c1 = c2 = 0;

/* store the low digit of the finished column */
#define COMBA_STORE(x) \
   x = c0;

/* store the middle digit (carry) of the finished column */
#define COMBA_STORE2(x) \
   x = c1;

/* shift the accumulator down one digit for the next column */
#define CARRY_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

#define COMBA_FINI

/* multiplies point i and j, updates carry "c1" and digit c2
 * (c2:c1:c0 += i*i; j is unused when squaring; product read via mflo/mfhi) */
#define SQRADD(i, j)              \
__asm__(                          \
   " multu  %6,%6          \n\t"  \
   " mflo   $12            \n\t"  \
   " mfhi   $13            \n\t"  \
   " addu    %0,%0,$12     \n\t"  \
   " sltu   $12,%0,$12     \n\t"  \
   " addu    %1,%1,$13     \n\t"  \
   " sltu   $13,%1,$13     \n\t"  \
   " addu    %1,%1,$12     \n\t"  \
   " sltu   $12,%1,$12     \n\t"  \
   " addu    %2,%2,$13     \n\t"  \
   " addu    %2,%2,$12     \n\t"  \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");

/* for squaring some of the terms are doubled...
 * c2:c1:c0 += 2*i*j; product computed once, the add/carry sequence run twice */
#define SQRADD2(i, j)            \
__asm__(                         \
   " multu  %6,%7          \n\t" \
   " mflo   $12            \n\t" \
   " mfhi   $13            \n\t" \
                                 \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $14,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $15,%1,$13     \n\t" \
   " addu    %1,%1,$14     \n\t" \
   " sltu   $14,%1,$14     \n\t" \
   " addu    %2,%2,$15     \n\t" \
   " addu    %2,%2,$14     \n\t" \
                                 \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $14,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $15,%1,$13     \n\t" \
   " addu    %1,%1,$14     \n\t" \
   " sltu   $14,%1,$14     \n\t" \
   " addu    %2,%2,$15     \n\t" \
   " addu    %2,%2,$14     \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");

/* sc2:sc1:sc0 = i*j (sc2 zeroed by xor) */
#define SQRADDSC(i, j)            \
__asm__(                          \
   " multu  %6,%7          \n\t"  \
   " mflo   %0             \n\t"  \
   " mfhi   %1             \n\t"  \
   " xor    %2,%2,%2       \n\t"  \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "cc");

/* sc2:sc1:sc0 += i*j (accumulate into the saved chain)
 * NOTE(review): $14 is listed as clobbered but not referenced in this asm */
#define SQRADDAC(i, j)           \
__asm__(                         \
   " multu  %6,%7          \n\t" \
   " mflo   $12            \n\t" \
   " mfhi   $13            \n\t" \
   " addu    %0,%0,$12     \n\t" \
   " sltu   $12,%0,$12     \n\t" \
   " addu    %1,%1,$13     \n\t" \
   " sltu   $13,%1,$13     \n\t" \
   " addu    %1,%1,$12     \n\t" \
   " sltu   $12,%1,$12     \n\t" \
   " addu    %2,%2,$13     \n\t" \
   " addu    %2,%2,$12     \n\t" \
:"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");

/* c2:c1:c0 += 2 * sc2:sc1:sc0 (saved chain added twice, carries via sltu) */
#define SQRADDDB                  \
__asm__(                          \
   " addu    %0,%0,%3       \n\t" \
   " sltu   $10,%0,%3       \n\t" \
   " addu    %1,%1,$10      \n\t" \
   " sltu   $10,%1,$10      \n\t" \
   " addu    %1,%1,%4       \n\t" \
   " sltu   $11,%1,%4       \n\t" \
   " addu    %2,%2,$10      \n\t" \
   " addu    %2,%2,$11      \n\t" \
   " addu    %2,%2,%5       \n\t" \
                                  \
   " addu    %0,%0,%3       \n\t" \
   " sltu   $10,%0,%3       \n\t" \
   " addu    %1,%1,$10      \n\t" \
   " sltu   $10,%1,$10      \n\t" \
   " addu    %1,%1,%4       \n\t" \
   " sltu   $11,%1,%4       \n\t" \
   " addu    %2,%2,$10      \n\t" \
   " addu    %2,%2,$11      \n\t" \
   " addu    %2,%2,%5       \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");

01279 #else
01280 
01281 #define TFM_ISO
01282 
01283 /* ISO C portable code */
01284 
01285 #define COMBA_START
01286 
01287 #define CLEAR_CARRY \
01288    c0 = c1 = c2 = 0;
01289 
01290 #define COMBA_STORE(x) \
01291    x = c0;
01292 
01293 #define COMBA_STORE2(x) \
01294    x = c1;
01295 
01296 #define CARRY_FORWARD \
01297    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
01298 
01299 #define COMBA_FINI
01300 
01301 /* multiplies point i and j, updates carry "c1" and digit c2 */
01302 #define SQRADD(i, j)                                 \
01303    do { fp_word t;                                   \
01304    t = c0 + ((fp_word)i) * ((fp_word)j);  c0 = (fp_digit)t;    \
01305    t = c1 + (t >> DIGIT_BIT);             c1 = (fp_digit)t;    \
01306                                           c2 +=(fp_digit) (t >> DIGIT_BIT); \
01307    } while (0);
01308 
01309 
01310 /* for squaring some of the terms are doubled... */
01311 #define SQRADD2(i, j)                                                 \
01312    do { fp_word t;                                                    \
01313    t  = ((fp_word)i) * ((fp_word)j);                                  \
01314    tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;           \
01315    tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;           \
01316                                          c2 +=(fp_digit)(tt >> DIGIT_BIT);     \
01317    tt = (fp_word)c0 + t;                 c0 = (fp_digit)tt;                    \
01318    tt = (fp_word)c1 + (tt >> DIGIT_BIT); c1 = (fp_digit)tt;            \
01319                                          c2 +=(fp_digit)(tt >> DIGIT_BIT);     \
01320    } while (0);
01321 
01322 #define SQRADDSC(i, j)                                                         \
01323    do { fp_word t;                                                             \
01324       t =  ((fp_word)i) * ((fp_word)j);                                        \
01325       sc0 = (fp_digit)t; sc1 = (t >> DIGIT_BIT); sc2 = 0;                      \
01326    } while (0);
01327 
01328 #define SQRADDAC(i, j)                                                         \
01329    do { fp_word t;                                                             \
01330    t = sc0 + ((fp_word)i) * ((fp_word)j);  sc0 =  (fp_digit)t;                 \
01331    t = sc1 + (t >> DIGIT_BIT);             sc1 =  (fp_digit)t;                 \
01332                                            sc2 += (fp_digit)(t >> DIGIT_BIT);  \
01333    } while (0);
01334 
01335 #define SQRADDDB                                                               \
01336    do { fp_word t;                                                             \
01337    t = ((fp_word)sc0) + ((fp_word)sc0) + c0; c0 = (fp_digit)t;                 \
01338    t = ((fp_word)sc1) + ((fp_word)sc1) + c1 + (t >> DIGIT_BIT);                \
01339                                              c1 = (fp_digit)t;                 \
01340    c2 = c2 + (fp_digit)(((fp_word)sc2) + ((fp_word)sc2) + (t >> DIGIT_BIT));   \
01341    } while (0);
01342 
01343 #endif
01344 
01345 #ifdef TFM_SMALL_SET
01346     #include "fp_sqr_comba_small_set.i"
01347 #endif
01348 
01349 #if defined(TFM_SQR3) && FP_SIZE >= 6
01350     #include "fp_sqr_comba_3.i"
01351 #endif
01352 #if defined(TFM_SQR4) && FP_SIZE >= 8
01353     #include "fp_sqr_comba_4.i"
01354 #endif
01355 #if defined(TFM_SQR6) && FP_SIZE >= 12
01356     #include "fp_sqr_comba_6.i"
01357 #endif
01358 #if defined(TFM_SQR7) && FP_SIZE >= 14
01359     #include "fp_sqr_comba_7.i"
01360 #endif
01361 #if defined(TFM_SQR8) && FP_SIZE >= 16
01362     #include "fp_sqr_comba_8.i"
01363 #endif
01364 #if defined(TFM_SQR9) && FP_SIZE >= 18
01365     #include "fp_sqr_comba_9.i"
01366 #endif
01367 #if defined(TFM_SQR12) && FP_SIZE >= 24
01368     #include "fp_sqr_comba_12.i"
01369 #endif
01370 #if defined(TFM_SQR17) && FP_SIZE >= 34
01371     #include "fp_sqr_comba_17.i"
01372 #endif
01373 #if defined(TFM_SQR20) && FP_SIZE >= 40
01374     #include "fp_sqr_comba_20.i"
01375 #endif
01376 #if defined(TFM_SQR24) && FP_SIZE >= 48
01377     #include "fp_sqr_comba_24.i"
01378 #endif
01379 #if defined(TFM_SQR28) && FP_SIZE >= 56
01380     #include "fp_sqr_comba_28.i"
01381 #endif
01382 #if defined(TFM_SQR32) && FP_SIZE >= 64
01383     #include "fp_sqr_comba_32.i"
01384 #endif
01385 #if defined(TFM_SQR48) && FP_SIZE >= 96
01386     #include "fp_sqr_comba_48.i"
01387 #endif
01388 #if defined(TFM_SQR64) && FP_SIZE >= 128
01389     #include "fp_sqr_comba_64.i"
01390 #endif
01391 /* end fp_sqr_comba.c asm */
01392 
01393 /* start fp_mul_comba.c asm */
01394 /* these are the combas.  Worship them. */
#if defined(TFM_X86)
/* Generic x86 optimized code */

/* Comba multiplication helpers; c2:c1:c0 is the column accumulator. */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end */
#define COMBA_FINI

/* this should multiply i and j: c2:c1:c0 += i*j via MULL (edx:eax product) */
#define MULADD(i, j)                                      \
__asm__(                                                  \
     "movl  %6,%%eax     \n\t"                            \
     "mull  %7           \n\t"                            \
     "addl  %%eax,%0     \n\t"                            \
     "adcl  %%edx,%1     \n\t"                            \
     "adcl  $0,%2        \n\t"                            \
     :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","%edx","cc");

01430 #elif defined(TFM_X86_64)
01431 /* x86-64 optimized */
01432 
01433 /* anything you need at the start */
01434 #define COMBA_START
01435 
01436 /* clear the chaining variables */
01437 #define COMBA_CLEAR \
01438    c0 = c1 = c2 = 0;
01439 
01440 /* forward the carry to the next digit */
01441 #define COMBA_FORWARD \
01442    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
01443 
01444 /* store the first sum */
01445 #define COMBA_STORE(x) \
01446    x = c0;
01447 
01448 /* store the second sum [carry] */
01449 #define COMBA_STORE2(x) \
01450    x = c1;
01451 
01452 /* anything you need at the end */
01453 #define COMBA_FINI
01454 
01455 /* this should multiply i and j  */
01456 #define MULADD(i, j)                                      \
01457 __asm__  (                                                \
01458      "movq  %6,%%rax     \n\t"                            \
01459      "mulq  %7           \n\t"                            \
01460      "addq  %%rax,%0     \n\t"                            \
01461      "adcq  %%rdx,%1     \n\t"                            \
01462      "adcq  $0,%2        \n\t"                            \
01463      :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j)  :"%rax","%rdx","cc");
01464 
01465 
01466 #if defined(HAVE_INTEL_MULX)
01467 #define MULADD_BODY(a,b,c)                              \
01468     __asm__ volatile(                                   \
01469          "movq  %[a0],%%rdx\n\t"                        \
01470          "xorq  %%rcx, %%rcx\n\t"                       \
01471          "movq  0(%[cp]),%%r8\n\t"                      \
01472          "movq  8(%[cp]),%%r9\n\t"                      \
01473          "movq  16(%[cp]),%%r10\n\t"                    \
01474          "movq  24(%[cp]),%%r11\n\t"                    \
01475          "movq  32(%[cp]),%%r12\n\t"                    \
01476          "movq  40(%[cp]),%%r13\n\t"                    \
01477                                                         \
01478          "mulx  (%[bp]),%%rax, %%rbx\n\t"               \
01479          "adoxq  %%rax, %%r8\n\t"                       \
01480          "mulx  8(%[bp]),%%rax, %%rcx\n\t"              \
01481          "adcxq  %%rbx, %%r9\n\t"                       \
01482          "adoxq  %%rax, %%r9\n\t"                       \
01483          "mulx  16(%[bp]),%%rax, %%rbx\n\t"             \
01484          "adcxq  %%rcx, %%r10\n\t"                      \
01485          "adoxq  %%rax, %%r10\n\t"                      \
01486          "mulx  24(%[bp]),%%rax, %%rcx\n\t"             \
01487          "adcxq  %%rbx, %%r11\n\t"                      \
01488          "adoxq  %%rax, %%r11\n\t"                      \
01489          "adcxq  %%rcx, %%r12\n\t"                      \
01490          "mov $0, %%rdx\n\t"                            \
01491          "adox %%rdx, %%r12\n\t"                        \
01492          "adcx %%rdx, %%r13\n\t"                        \
01493                                                         \
01494          "movq  %%r8, 0(%[cp])\n\t"                     \
01495          "movq  %%r9, 8(%[cp])\n\t"                     \
01496          "movq  %%r10, 16(%[cp])\n\t"                   \
01497          "movq  %%r11, 24(%[cp])\n\t"                   \
01498          "movq  %%r12, 32(%[cp])\n\t"                   \
01499          "movq  %%r13, 40(%[cp])\n\t"                   \
01500       :                                                 \
01501       : [a0] "r" (a->dp[ix]), [bp] "r" (&(b->dp[iy])),  \
01502         [cp] "r" (&(c->dp[iz]))                         \
01503       : "%r8", "%r9", "%r10", "%r11", "%r12", "%r13",   \
01504         "%rdx", "%rax", "%rcx", "%rbx"                  \
01505     )
01506 
01507 #define TFM_INTEL_MUL_COMBA(a, b, c)       \
01508     for (iz=0; iz<pa; iz++) c->dp[iz] = 0; \
01509     for (ix=0; ix<a->used; ix++) {         \
01510         for (iy=0; iy<b->used; iy+=4) {    \
01511             iz = ix + iy;                  \
01512             MULADD_BODY(a, b, c);          \
01513         }                                  \
01514     }
01515 #endif
01516 
#elif defined(TFM_SSE2)
/* use SSE2 optimizations */

/* 32-bit Comba multiplication using PMULUDQ (32x32 -> 64-bit multiply);
 * c2:c1:c0 is the column accumulator. */

/* anything you need at the start */
#define COMBA_START

/* clear the chaining variables */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

/* anything you need at the end; EMMS exits MMX state for later x87 code */
#define COMBA_FINI \
   __asm__("emms");

/* this should multiply i and j: c2:c1:c0 += i*j (64-bit product in mm0) */
#define MULADD(i, j)                                     \
__asm__(                                                 \
    "movd  %6,%%mm0     \n\t"                            \
    "movd  %7,%%mm1     \n\t"                            \
    "pmuludq %%mm1,%%mm0\n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "psrlq $32,%%mm0    \n\t"                            \
    "addl  %%eax,%0     \n\t"                            \
    "movd  %%mm0,%%eax  \n\t"                            \
    "adcl  %%eax,%1     \n\t"                            \
    "adcl  $0,%2        \n\t"                            \
    :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j)  :"%eax","cc");

01557 #elif defined(TFM_ARM)
01558 /* ARM code */
01559 
/* nothing needed at start/end for ARM */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

/* 32x32 -> 64 multiply: UMULL puts the low word in r0 and the high word in
 * r1; ADDS/ADCS/ADC then add the product into (c0, c1, c2) with carry
 * propagation.  Clobbers r0, r1 and the condition flags. */
#define MULADD(i, j)                                          \
__asm__(                                                      \
"  UMULL  r0,r1,%6,%7           \n\t"                         \
"  ADDS   %0,%0,r0              \n\t"                         \
"  ADCS   %1,%1,r1              \n\t"                         \
"  ADC    %2,%2,#0              \n\t"                         \
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
01583 
01584 #elif defined(TFM_PPC32)
01585 /* For 32-bit PPC */
01586 
/* nothing needed at start/end for 32-bit PPC */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

/* untested: will mulhwu change the flags?  Docs say no */
/* mullw produces the low 32 bits of i*j, mulhwu the high 32 bits; register
 * 16 (bare numeric PPC register syntax) is scratch and listed as clobbered.
 * addc/adde/addze propagate the carry through c0, c1, c2. */
#define MULADD(i, j)             \
__asm__(                         \
   " mullw  16,%6,%7       \n\t" \
   " addc   %0,%0,16       \n\t" \
   " mulhwu 16,%6,%7       \n\t" \
   " adde   %1,%1,16       \n\t" \
   " addze  %2,%2          \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
01612 
01613 #elif defined(TFM_PPC64)
01614 /* For 64-bit PPC */
01615 
/* nothing needed at start/end for 64-bit PPC */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI
01631 
/* untested: will mulhdu change the flags?  Docs say no */
/* 64x64 multiply-accumulate into the carry chain (c0, c1, c2): mulld yields
 * the low 64 bits of i*j, mulhdu the high 64 bits; addc/adde/addze ripple
 * the carries.  Register 16 is scratch.
 * Fixes: "____asm__" (four leading underscores) was a typo that cannot
 * compile -- the GNU keyword is "__asm__".  The instruction operands also
 * mixed "r16" with bare "16"; bare numeric register syntax is used
 * throughout (and in the clobber list) to match the 32-bit PPC variant
 * above and to avoid requiring the assembler's symbolic register names. */
#define MULADD(i, j)              \
__asm__(                          \
   " mulld  16,%6,%7        \n\t" \
   " addc   %0,%0,16        \n\t" \
   " mulhdu 16,%6,%7        \n\t" \
   " adde   %1,%1,16        \n\t" \
   " addze  %2,%2           \n\t" \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
01641 
01642 #elif defined(TFM_AVR32)
01643 
/* AVR32 assembly */
01645 
/* nothing needed at start/end for AVR32 */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI
01661 
/* Multiply-accumulate i*j into the carry chain (c0, c1, c2) using AVR32
 * asm: mulu.d writes the 64-bit product into the r3:r2 register pair
 * (presumably low word in r2 -- confirm against the AVR32 ISA doc), then
 * add/adc/acr ripple it into the accumulator.  Clobbers r2 and r3.
 * Fix: "____asm__" (four leading underscores) was a typo that cannot
 * compile -- the GNU keyword is "__asm__". */
#define MULADD(i, j)             \
__asm__(                         \
   " mulu.d r2,%6,%7        \n\t"\
   " add    %0,r2           \n\t"\
   " adc    %1,%1,r3        \n\t"\
   " acr    %2              \n\t"\
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
01669 
01670 #elif defined(TFM_MIPS)
01671 
01672 /* MIPS */
/* nothing needed at start/end for MIPS */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI

/* multu leaves the 64-bit product in HI/LO; mflo/mfhi move it into the
 * scratch registers $12/$13.  MIPS has no carry flag, so each carry-out is
 * recomputed explicitly with sltu (result < addend implies a carry) and
 * added into the next digit.  Clobbers $12 and $13. */
#define MULADD(i, j)              \
__asm__(                          \
   " multu  %6,%7          \n\t"  \
   " mflo   $12            \n\t"  \
   " mfhi   $13            \n\t"  \
   " addu    %0,%0,$12     \n\t"  \
   " sltu   $12,%0,$12     \n\t"  \
   " addu    %1,%1,$13     \n\t"  \
   " sltu   $13,%1,$13     \n\t"  \
   " addu    %1,%1,$12     \n\t"  \
   " sltu   $12,%1,$12     \n\t"  \
   " addu    %2,%2,$13     \n\t"  \
   " addu    %2,%2,$12     \n\t"  \
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
01703 
01704 #else
01705 /* ISO C code */
01706 
/* portable ISO C fallback: nothing needed at start/end */
#define COMBA_START

/* clear the three chaining digits: c0 = low, c1 = middle, c2 = high carry */
#define COMBA_CLEAR \
   c0 = c1 = c2 = 0;

/* forward the carry to the next digit: shift the accumulator down one word.
 * NOTE(review): the trailing ';' after while(0) defeats the do/while(0)
 * idiom inside unbraced if/else -- all call sites appear to use it as a
 * full statement, but worth confirming. */
#define COMBA_FORWARD \
   do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* store the first sum (low word of the column) */
#define COMBA_STORE(x) \
   x = c0;

/* store the second sum [carry] */
#define COMBA_STORE2(x) \
   x = c1;

#define COMBA_FINI
01722 
/* Multiply-accumulate the double-width product i*j into the three-digit
 * accumulator (c0, c1, c2), propagating carries through the fp_word
 * intermediate t.  Requires fp_word to be at least twice as wide as
 * fp_digit (DIGIT_BIT bits).
 * Fixes: dropped the trailing ';' inside the do/while(0) wrapper so the
 * macro expands to exactly one statement and the caller supplies the
 * semicolon -- this keeps MULADD(a,b); legal inside an unbraced if/else.
 * Also trimmed the run of trailing spaces on the #define line. */
#define MULADD(i, j)                                    \
   do { fp_word t;                                      \
   t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j);       \
   c0 = (fp_digit)t;                                    \
   t = (fp_word)c1 + (t >> DIGIT_BIT);                  \
   c1 = (fp_digit)t;                                    \
   c2 += (fp_digit)(t >> DIGIT_BIT);                    \
   } while (0)
01731 
01732 #endif
01733 
01734 
01735 #ifdef TFM_SMALL_SET
01736     #include "fp_mul_comba_small_set.i"
01737 #endif
01738 
01739 #if defined(TFM_MUL3) && FP_SIZE >= 6
01740     #include "fp_mul_comba_3.i"
01741 #endif
01742 #if defined(TFM_MUL4) && FP_SIZE >= 8
01743     #include "fp_mul_comba_4.i"
01744 #endif
01745 #if defined(TFM_MUL6) && FP_SIZE >= 12
01746     #include "fp_mul_comba_6.i"
01747 #endif
01748 #if defined(TFM_MUL7) && FP_SIZE >= 14
01749     #include "fp_mul_comba_7.i"
01750 #endif
01751 #if defined(TFM_MUL8) && FP_SIZE >= 16
01752     #include "fp_mul_comba_8.i"
01753 #endif
01754 #if defined(TFM_MUL9) && FP_SIZE >= 18
01755     #include "fp_mul_comba_9.i"
01756 #endif
01757 #if defined(TFM_MUL12) && FP_SIZE >= 24
01758     #include "fp_mul_comba_12.i"
01759 #endif
01760 #if defined(TFM_MUL17) && FP_SIZE >= 34
01761     #include "fp_mul_comba_17.i"
01762 #endif
01763 #if defined(TFM_MUL20) && FP_SIZE >= 40
01764     #include "fp_mul_comba_20.i"
01765 #endif
01766 #if defined(TFM_MUL24) && FP_SIZE >= 48
01767     #include "fp_mul_comba_24.i"
01768 #endif
01769 #if defined(TFM_MUL28) && FP_SIZE >= 56
01770     #include "fp_mul_comba_28.i"
01771 #endif
01772 #if defined(TFM_MUL32) && FP_SIZE >= 64
01773     #include "fp_mul_comba_32.i"
01774 #endif
01775 #if defined(TFM_MUL48) && FP_SIZE >= 96
01776     #include "fp_mul_comba_48.i"
01777 #endif
01778 #if defined(TFM_MUL64) && FP_SIZE >= 128
01779     #include "fp_mul_comba_64.i"
01780 #endif
01781 
01782 /* end fp_mul_comba.c asm */
01783 
01784