Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Dependents: MiniTLS-HTTPS-Example
fp_montgomery_reduce.c
00001 /* TomsFastMath, a fast ISO C bignum library. 00002 * 00003 * This project is meant to fill in where LibTomMath 00004 * falls short. That is speed ;-) 00005 * 00006 * This project is public domain and free for all purposes. 00007 * 00008 * Tom St Denis, tomstdenis@gmail.com 00009 */ 00010 #include <tfm.h> 00011 00012 /******************************************************************/ 00013 #if defined(TFM_X86) && !defined(TFM_SSE2) 00014 /* x86-32 code */ 00015 00016 #define MONT_START 00017 #define MONT_FINI 00018 #define LOOP_END 00019 #define LOOP_START \ 00020 mu = c[x] * mp 00021 00022 #define INNERMUL \ 00023 asm( \ 00024 "movl %5,%%eax \n\t" \ 00025 "mull %4 \n\t" \ 00026 "addl %1,%%eax \n\t" \ 00027 "adcl $0,%%edx \n\t" \ 00028 "addl %%eax,%0 \n\t" \ 00029 "adcl $0,%%edx \n\t" \ 00030 "movl %%edx,%1 \n\t" \ 00031 :"=g"(_c[LO]), "=r"(cy) \ 00032 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ 00033 : "%eax", "%edx", "%cc") 00034 00035 #define PROPCARRY \ 00036 asm( \ 00037 "addl %1,%0 \n\t" \ 00038 "setb %%al \n\t" \ 00039 "movzbl %%al,%1 \n\t" \ 00040 :"=g"(_c[LO]), "=r"(cy) \ 00041 :"0"(_c[LO]), "1"(cy) \ 00042 : "%eax", "%cc") 00043 00044 /******************************************************************/ 00045 #elif defined(TFM_X86_64) 00046 /* x86-64 code */ 00047 00048 #define MONT_START 00049 #define MONT_FINI 00050 #define LOOP_END 00051 #define LOOP_START \ 00052 mu = c[x] * mp 00053 00054 #define INNERMUL \ 00055 asm( \ 00056 "movq %5,%%rax \n\t" \ 00057 "mulq %4 \n\t" \ 00058 "addq %1,%%rax \n\t" \ 00059 "adcq $0,%%rdx \n\t" \ 00060 "addq %%rax,%0 \n\t" \ 00061 "adcq $0,%%rdx \n\t" \ 00062 "movq %%rdx,%1 \n\t" \ 00063 :"=g"(_c[LO]), "=r"(cy) \ 00064 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00065 : "%rax", "%rdx", "%cc") 00066 00067 #define INNERMUL8 \ 00068 asm( \ 00069 "movq 0(%5),%%rax \n\t" \ 00070 "movq 0(%2),%%r10 \n\t" \ 00071 "movq 0x8(%5),%%r11 \n\t" \ 00072 "mulq %4 \n\t" \ 00073 "addq %%r10,%%rax \n\t" \ 00074 "adcq $0,%%rdx 
\n\t" \ 00075 "movq 0x8(%2),%%r10 \n\t" \ 00076 "addq %3,%%rax \n\t" \ 00077 "adcq $0,%%rdx \n\t" \ 00078 "movq %%rax,0(%0) \n\t" \ 00079 "movq %%rdx,%1 \n\t" \ 00080 \ 00081 "movq %%r11,%%rax \n\t" \ 00082 "movq 0x10(%5),%%r11 \n\t" \ 00083 "mulq %4 \n\t" \ 00084 "addq %%r10,%%rax \n\t" \ 00085 "adcq $0,%%rdx \n\t" \ 00086 "movq 0x10(%2),%%r10 \n\t" \ 00087 "addq %3,%%rax \n\t" \ 00088 "adcq $0,%%rdx \n\t" \ 00089 "movq %%rax,0x8(%0) \n\t" \ 00090 "movq %%rdx,%1 \n\t" \ 00091 \ 00092 "movq %%r11,%%rax \n\t" \ 00093 "movq 0x18(%5),%%r11 \n\t" \ 00094 "mulq %4 \n\t" \ 00095 "addq %%r10,%%rax \n\t" \ 00096 "adcq $0,%%rdx \n\t" \ 00097 "movq 0x18(%2),%%r10 \n\t" \ 00098 "addq %3,%%rax \n\t" \ 00099 "adcq $0,%%rdx \n\t" \ 00100 "movq %%rax,0x10(%0) \n\t" \ 00101 "movq %%rdx,%1 \n\t" \ 00102 \ 00103 "movq %%r11,%%rax \n\t" \ 00104 "movq 0x20(%5),%%r11 \n\t" \ 00105 "mulq %4 \n\t" \ 00106 "addq %%r10,%%rax \n\t" \ 00107 "adcq $0,%%rdx \n\t" \ 00108 "movq 0x20(%2),%%r10 \n\t" \ 00109 "addq %3,%%rax \n\t" \ 00110 "adcq $0,%%rdx \n\t" \ 00111 "movq %%rax,0x18(%0) \n\t" \ 00112 "movq %%rdx,%1 \n\t" \ 00113 \ 00114 "movq %%r11,%%rax \n\t" \ 00115 "movq 0x28(%5),%%r11 \n\t" \ 00116 "mulq %4 \n\t" \ 00117 "addq %%r10,%%rax \n\t" \ 00118 "adcq $0,%%rdx \n\t" \ 00119 "movq 0x28(%2),%%r10 \n\t" \ 00120 "addq %3,%%rax \n\t" \ 00121 "adcq $0,%%rdx \n\t" \ 00122 "movq %%rax,0x20(%0) \n\t" \ 00123 "movq %%rdx,%1 \n\t" \ 00124 \ 00125 "movq %%r11,%%rax \n\t" \ 00126 "movq 0x30(%5),%%r11 \n\t" \ 00127 "mulq %4 \n\t" \ 00128 "addq %%r10,%%rax \n\t" \ 00129 "adcq $0,%%rdx \n\t" \ 00130 "movq 0x30(%2),%%r10 \n\t" \ 00131 "addq %3,%%rax \n\t" \ 00132 "adcq $0,%%rdx \n\t" \ 00133 "movq %%rax,0x28(%0) \n\t" \ 00134 "movq %%rdx,%1 \n\t" \ 00135 \ 00136 "movq %%r11,%%rax \n\t" \ 00137 "movq 0x38(%5),%%r11 \n\t" \ 00138 "mulq %4 \n\t" \ 00139 "addq %%r10,%%rax \n\t" \ 00140 "adcq $0,%%rdx \n\t" \ 00141 "movq 0x38(%2),%%r10 \n\t" \ 00142 "addq %3,%%rax \n\t" \ 00143 "adcq $0,%%rdx \n\t" \ 00144 
"movq %%rax,0x30(%0) \n\t" \ 00145 "movq %%rdx,%1 \n\t" \ 00146 \ 00147 "movq %%r11,%%rax \n\t" \ 00148 "mulq %4 \n\t" \ 00149 "addq %%r10,%%rax \n\t" \ 00150 "adcq $0,%%rdx \n\t" \ 00151 "addq %3,%%rax \n\t" \ 00152 "adcq $0,%%rdx \n\t" \ 00153 "movq %%rax,0x38(%0) \n\t" \ 00154 "movq %%rdx,%1 \n\t" \ 00155 \ 00156 :"=r"(_c), "=r"(cy) \ 00157 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ 00158 : "%rax", "%rdx", "%r10", "%r11", "%cc") 00159 00160 00161 #define PROPCARRY \ 00162 asm( \ 00163 "addq %1,%0 \n\t" \ 00164 "setb %%al \n\t" \ 00165 "movzbq %%al,%1 \n\t" \ 00166 :"=g"(_c[LO]), "=r"(cy) \ 00167 :"0"(_c[LO]), "1"(cy) \ 00168 : "%rax", "%cc") 00169 00170 /******************************************************************/ 00171 #elif defined(TFM_SSE2) 00172 /* SSE2 code (assumes 32-bit fp_digits) */ 00173 /* XMM register assignments: 00174 * xmm0 *tmpm++, then Mu * (*tmpm++) 00175 * xmm1 c[x], then Mu 00176 * xmm2 mp 00177 * xmm3 cy 00178 * xmm4 _c[LO] 00179 */ 00180 00181 #define MONT_START \ 00182 asm("movd %0,%%mm2"::"g"(mp)) 00183 00184 #define MONT_FINI \ 00185 asm("emms") 00186 00187 #define LOOP_START \ 00188 asm( \ 00189 "movd %0,%%mm1 \n\t" \ 00190 "pxor %%mm3,%%mm3 \n\t" \ 00191 "pmuludq %%mm2,%%mm1 \n\t" \ 00192 :: "g"(c[x])) 00193 00194 /* pmuludq on mmx registers does a 32x32->64 multiply. 
*/ 00195 #define INNERMUL \ 00196 asm( \ 00197 "movd %1,%%mm4 \n\t" \ 00198 "movd %2,%%mm0 \n\t" \ 00199 "paddq %%mm4,%%mm3 \n\t" \ 00200 "pmuludq %%mm1,%%mm0 \n\t" \ 00201 "paddq %%mm0,%%mm3 \n\t" \ 00202 "movd %%mm3,%0 \n\t" \ 00203 "psrlq $32, %%mm3 \n\t" \ 00204 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) ); 00205 00206 #define INNERMUL8 \ 00207 asm( \ 00208 "movd 0(%1),%%mm4 \n\t" \ 00209 "movd 0(%2),%%mm0 \n\t" \ 00210 "paddq %%mm4,%%mm3 \n\t" \ 00211 "pmuludq %%mm1,%%mm0 \n\t" \ 00212 "movd 4(%2),%%mm5 \n\t" \ 00213 "paddq %%mm0,%%mm3 \n\t" \ 00214 "movd 4(%1),%%mm6 \n\t" \ 00215 "movd %%mm3,0(%0) \n\t" \ 00216 "psrlq $32, %%mm3 \n\t" \ 00217 \ 00218 "paddq %%mm6,%%mm3 \n\t" \ 00219 "pmuludq %%mm1,%%mm5 \n\t" \ 00220 "movd 8(%2),%%mm6 \n\t" \ 00221 "paddq %%mm5,%%mm3 \n\t" \ 00222 "movd 8(%1),%%mm7 \n\t" \ 00223 "movd %%mm3,4(%0) \n\t" \ 00224 "psrlq $32, %%mm3 \n\t" \ 00225 \ 00226 "paddq %%mm7,%%mm3 \n\t" \ 00227 "pmuludq %%mm1,%%mm6 \n\t" \ 00228 "movd 12(%2),%%mm7 \n\t" \ 00229 "paddq %%mm6,%%mm3 \n\t" \ 00230 "movd 12(%1),%%mm5 \n\t" \ 00231 "movd %%mm3,8(%0) \n\t" \ 00232 "psrlq $32, %%mm3 \n\t" \ 00233 \ 00234 "paddq %%mm5,%%mm3 \n\t" \ 00235 "pmuludq %%mm1,%%mm7 \n\t" \ 00236 "movd 16(%2),%%mm5 \n\t" \ 00237 "paddq %%mm7,%%mm3 \n\t" \ 00238 "movd 16(%1),%%mm6 \n\t" \ 00239 "movd %%mm3,12(%0) \n\t" \ 00240 "psrlq $32, %%mm3 \n\t" \ 00241 \ 00242 "paddq %%mm6,%%mm3 \n\t" \ 00243 "pmuludq %%mm1,%%mm5 \n\t" \ 00244 "movd 20(%2),%%mm6 \n\t" \ 00245 "paddq %%mm5,%%mm3 \n\t" \ 00246 "movd 20(%1),%%mm7 \n\t" \ 00247 "movd %%mm3,16(%0) \n\t" \ 00248 "psrlq $32, %%mm3 \n\t" \ 00249 \ 00250 "paddq %%mm7,%%mm3 \n\t" \ 00251 "pmuludq %%mm1,%%mm6 \n\t" \ 00252 "movd 24(%2),%%mm7 \n\t" \ 00253 "paddq %%mm6,%%mm3 \n\t" \ 00254 "movd 24(%1),%%mm5 \n\t" \ 00255 "movd %%mm3,20(%0) \n\t" \ 00256 "psrlq $32, %%mm3 \n\t" \ 00257 \ 00258 "paddq %%mm5,%%mm3 \n\t" \ 00259 "pmuludq %%mm1,%%mm7 \n\t" \ 00260 "movd 28(%2),%%mm5 \n\t" \ 00261 "paddq %%mm7,%%mm3 \n\t" \ 00262 "movd 
28(%1),%%mm6 \n\t" \ 00263 "movd %%mm3,24(%0) \n\t" \ 00264 "psrlq $32, %%mm3 \n\t" \ 00265 \ 00266 "paddq %%mm6,%%mm3 \n\t" \ 00267 "pmuludq %%mm1,%%mm5 \n\t" \ 00268 "paddq %%mm5,%%mm3 \n\t" \ 00269 "movd %%mm3,28(%0) \n\t" \ 00270 "psrlq $32, %%mm3 \n\t" \ 00271 :"=r"(_c) : "0"(_c), "g"(tmpm) ); 00272 00273 #define LOOP_END \ 00274 asm( "movd %%mm3,%0 \n" :"=r"(cy)) 00275 00276 #define PROPCARRY \ 00277 asm( \ 00278 "addl %1,%0 \n\t" \ 00279 "setb %%al \n\t" \ 00280 "movzbl %%al,%1 \n\t" \ 00281 :"=g"(_c[LO]), "=r"(cy) \ 00282 :"0"(_c[LO]), "1"(cy) \ 00283 : "%eax", "%cc") 00284 00285 /******************************************************************/ 00286 #elif defined(TFM_ARM) 00287 /* ARMv4 code */ 00288 00289 #define MONT_START 00290 #define MONT_FINI 00291 #define LOOP_END 00292 #define LOOP_START \ 00293 mu = c[x] * mp 00294 00295 #define INNERMUL \ 00296 asm( \ 00297 " LDR r0,%1 \n\t" \ 00298 " ADDS r0,r0,%0 \n\t" \ 00299 " MOVCS %0,#1 \n\t" \ 00300 " MOVCC %0,#0 \n\t" \ 00301 " UMLAL r0,%0,%3,%4 \n\t" \ 00302 " STR r0,%1 \n\t" \ 00303 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc"); 00304 00305 #define PROPCARRY \ 00306 asm( \ 00307 " LDR r0,%1 \n\t" \ 00308 " ADDS r0,r0,%0 \n\t" \ 00309 " STR r0,%1 \n\t" \ 00310 " MOVCS %0,#1 \n\t" \ 00311 " MOVCC %0,#0 \n\t" \ 00312 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc"); 00313 00314 /******************************************************************/ 00315 #elif defined(TFM_PPC32) 00316 00317 /* PPC32 */ 00318 #define MONT_START 00319 #define MONT_FINI 00320 #define LOOP_END 00321 #define LOOP_START \ 00322 mu = c[x] * mp 00323 00324 #define INNERMUL \ 00325 asm( \ 00326 " mullw 16,%3,%4 \n\t" \ 00327 " mulhwu 17,%3,%4 \n\t" \ 00328 " addc 16,16,%0 \n\t" \ 00329 " addze 17,17 \n\t" \ 00330 " lwz 18,%1 \n\t" \ 00331 " addc 16,16,18 \n\t" \ 00332 " addze %0,17 \n\t" \ 00333 " stw 16,%1 \n\t" \ 00334 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", 
"18","%cc"); ++tmpm; 00335 00336 #define PROPCARRY \ 00337 asm( \ 00338 " lwz 16,%1 \n\t" \ 00339 " addc 16,16,%0 \n\t" \ 00340 " stw 16,%1 \n\t" \ 00341 " xor %0,%0,%0 \n\t" \ 00342 " addze %0,%0 \n\t" \ 00343 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc"); 00344 00345 /******************************************************************/ 00346 #elif defined(TFM_PPC64) 00347 00348 /* PPC64 */ 00349 #define MONT_START 00350 #define MONT_FINI 00351 #define LOOP_END 00352 #define LOOP_START \ 00353 mu = c[x] * mp 00354 00355 #define INNERMUL \ 00356 asm( \ 00357 " mulld r16,%3,%4 \n\t" \ 00358 " mulhdu r17,%3,%4 \n\t" \ 00359 " addc r16,16,%0 \n\t" \ 00360 " addze r17,r17 \n\t" \ 00361 " ldx r18,0,%1 \n\t" \ 00362 " addc r16,r16,r18 \n\t" \ 00363 " addze %0,r17 \n\t" \ 00364 " sdx r16,0,%1 \n\t" \ 00365 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","%cc"); ++tmpm; 00366 00367 #define PROPCARRY \ 00368 asm( \ 00369 " ldx r16,0,%1 \n\t" \ 00370 " addc r16,r16,%0 \n\t" \ 00371 " sdx r16,0,%1 \n\t" \ 00372 " xor %0,%0,%0 \n\t" \ 00373 " addze %0,%0 \n\t" \ 00374 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc"); 00375 00376 /******************************************************************/ 00377 #elif defined(TFM_AVR32) 00378 00379 /* AVR32 */ 00380 #define MONT_START 00381 #define MONT_FINI 00382 #define LOOP_END 00383 #define LOOP_START \ 00384 mu = c[x] * mp 00385 00386 #define INNERMUL \ 00387 asm( \ 00388 " ld.w r2,%1 \n\t" \ 00389 " add r2,%0 \n\t" \ 00390 " eor r3,r3 \n\t" \ 00391 " acr r3 \n\t" \ 00392 " macu.d r2,%3,%4 \n\t" \ 00393 " st.w %1,r2 \n\t" \ 00394 " mov %0,r3 \n\t" \ 00395 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3"); 00396 00397 #define PROPCARRY \ 00398 asm( \ 00399 " ld.w r2,%1 \n\t" \ 00400 " add r2,%0 \n\t" \ 00401 " st.w %1,r2 \n\t" \ 00402 " eor %0,%0 \n\t" \ 00403 " acr %0 \n\t" \ 00404 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc"); 00405 00406 
/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

/* multu forms the 64-bit product in HI:LO; the sltu instructions
 * materialise carry bits since MIPS has no carry flag */
#define INNERMUL \
asm( \
   " multu %3,%4 \n\t" \
   " mflo $12 \n\t" \
   " mfhi $13 \n\t" \
   " addu $12,$12,%0 \n\t" \
   " sltu $10,$12,%0 \n\t" \
   " addu $13,$13,$10 \n\t" \
   " lw $10,%1 \n\t" \
   " addu $12,$12,$10 \n\t" \
   " sltu $10,$12,$10 \n\t" \
   " addu %0,$13,$10 \n\t" \
   " sw $12,%1 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY \
asm( \
   " lw $10,%1 \n\t" \
   " addu $10,$10,%0 \n\t" \
   " sw $10,%1 \n\t" \
   " sltu %0,$10,%0 \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code: portable fallback using the double-width fp_word type
 * to hold the digit product plus carries */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

/* t = *_c + cy + mu * (*tmpm++); low digit stays in *_c, high part is the
 * new carry (shift by DIGIT_BIT) */
#define INNERMUL \
   do { fp_word t; \
   _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) + \
               (((fp_word)mu) * ((fp_word)*tmpm++)); \
   cy = (t >> DIGIT_BIT); \
   } while (0)

/* add carry into *_c; unsigned wrap (t < cy) detects carry-out */
#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/


/* index of the low digit at the current _c position */
#define LO 0

#ifdef TFM_SMALL_MONT_SET
#include "fp_mont_small.i"
#endif

/* computes x/R == x (mod N) via Montgomery Reduction
 *
 * a  - value to reduce (modified in place)
 * m  - modulus
 * mp - precomputed -1/m mod b (from fp_montgomery_setup)
 *
 * For each of m->used rounds: mu = c[x]*mp makes the low digit of c+x
 * vanish when mu*m is added; carries are then propagated.  Finally the
 * top half of the accumulator is the (shifted) result.
 */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
   fp_digit c[FP_SIZE], *_c, *tmpm, mu;
   int oldused, x, y, pa;

   /* bail if too large */
   /* the accumulator needs 2*m->used+1 digits, so m must fit in half of c */
   if (m->used > (FP_SIZE/2)) {
      return;
   }

#ifdef TFM_SMALL_MONT_SET
   /* unrolled code paths for small moduli */
   if (m->used <= 16) {
      fp_montgomery_reduce_small(a, m, mp);
      return;
   }
#endif

#if defined(USE_MEMSET)
   /* now zero the buff */
   memset(c, 0, sizeof c);
#endif
   pa = m->used;

   /* copy the input */
   oldused = a->used;
   for (x = 0; x < oldused; x++) {
      c[x] = a->dp[x];
   }
#if !defined(USE_MEMSET)
   /* zero only the digits the reduction can touch: 2*pa+1 */
   for (; x < 2*pa+1; x++) {
      c[x] = 0;
   }
#endif
   MONT_START;

   for (x = 0; x < pa; x++) {
      fp_digit cy = 0;
      /* get Mu for this round */
      LOOP_START;
      _c = c + x;
      tmpm = m->dp;
      y = 0;
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
      /* eight-way unrolled inner loop where the arch provides INNERMUL8 */
      for (; y < (pa & ~7); y += 8) {
         INNERMUL8;
         _c += 8;
         tmpm += 8;
      }
#endif
      for (; y < pa; y++) {
         INNERMUL;
         ++_c;
      }
      LOOP_END;
      /* ripple the final carry up through the accumulator */
      while (cy) {
         PROPCARRY;
         ++_c;
      }
   }

   /* now copy out */
   /* result is the top pa+1 digits, i.e. the accumulator shifted down by pa */
   _c = c + pa;
   tmpm = a->dp;
   for (x = 0; x < pa+1; x++) {
      *tmpm++ = *_c++;
   }

   /* zero any stale high digits left over from the input */
   for (; x < oldused; x++) {
      *tmpm++ = 0;
   }

   MONT_FINI;

   a->used = pa+1;
   fp_clamp(a);

   /* if A >= m then A = A - m */
   if (fp_cmp_mag (a, m) != FP_LT) {
      s_fp_sub (a, m, a);
   }
}


/* $Source: /cvs/libtom/tomsfastmath/src/mont/fp_montgomery_reduce.c,v $ */
/* $Revision: 1.2 $ */
/* $Date: 2007/03/14 23:47:42 $ */
Generated on Wed Jul 13 2022 00:22:54 by Doxygen 1.7.2