Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software download the repository Zip archive or clone locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
Fork of MiniTLS-GPL by
fp_montgomery_reduce.c
00001 /* TomsFastMath, a fast ISO C bignum library. 00002 * 00003 * This project is meant to fill in where LibTomMath 00004 * falls short. That is speed ;-) 00005 * 00006 * This project is public domain and free for all purposes. 00007 * 00008 * Tom St Denis, tomstdenis@gmail.com 00009 */ 00010 #include <tfm.h> 00011 00012 /******************************************************************/ 00013 #if defined(TFM_X86) && !defined(TFM_SSE2) 00014 /* x86-32 code */ 00015 00016 #define MONT_START 00017 #define MONT_FINI 00018 #define LOOP_END 00019 #define LOOP_START \ 00020 mu = c[x] * mp 00021 00022 #define INNERMUL \ 00023 asm( \ 00024 "movl %5,%%eax \n\t" \ 00025 "mull %4 \n\t" \ 00026 "addl %1,%%eax \n\t" \ 00027 "adcl $0,%%edx \n\t" \ 00028 "addl %%eax,%0 \n\t" \ 00029 "adcl $0,%%edx \n\t" \ 00030 "movl %%edx,%1 \n\t" \ 00031 :"=g"(_c[LO]), "=r"(cy) \ 00032 :"0"(_c[LO]), "1"(cy), "g"(mu), "g"(*tmpm++) \ 00033 : "%eax", "%edx", "%cc") 00034 00035 #define PROPCARRY \ 00036 asm( \ 00037 "addl %1,%0 \n\t" \ 00038 "setb %%al \n\t" \ 00039 "movzbl %%al,%1 \n\t" \ 00040 :"=g"(_c[LO]), "=r"(cy) \ 00041 :"0"(_c[LO]), "1"(cy) \ 00042 : "%eax", "%cc") 00043 00044 /******************************************************************/ 00045 #elif defined(TFM_X86_64) 00046 /* x86-64 code */ 00047 00048 #define MONT_START 00049 #define MONT_FINI 00050 #define LOOP_END 00051 #define LOOP_START \ 00052 mu = c[x] * mp 00053 00054 #define INNERMUL \ 00055 asm( \ 00056 "movq %5,%%rax \n\t" \ 00057 "mulq %4 \n\t" \ 00058 "addq %1,%%rax \n\t" \ 00059 "adcq $0,%%rdx \n\t" \ 00060 "addq %%rax,%0 \n\t" \ 00061 "adcq $0,%%rdx \n\t" \ 00062 "movq %%rdx,%1 \n\t" \ 00063 :"=g"(_c[LO]), "=r"(cy) \ 00064 :"0"(_c[LO]), "1"(cy), "r"(mu), "r"(*tmpm++) \ 00065 : "%rax", "%rdx", "%cc") 00066 00067 #define INNERMUL8 \ 00068 asm( \ 00069 "movq 0(%5),%%rax \n\t" \ 00070 "movq 0(%2),%%r10 \n\t" \ 00071 "movq 0x8(%5),%%r11 \n\t" \ 00072 "mulq %4 \n\t" \ 00073 "addq %%r10,%%rax \n\t" \ 00074 "adcq $0,%%rdx 
\n\t" \ 00075 "movq 0x8(%2),%%r10 \n\t" \ 00076 "addq %3,%%rax \n\t" \ 00077 "adcq $0,%%rdx \n\t" \ 00078 "movq %%rax,0(%0) \n\t" \ 00079 "movq %%rdx,%1 \n\t" \ 00080 \ 00081 "movq %%r11,%%rax \n\t" \ 00082 "movq 0x10(%5),%%r11 \n\t" \ 00083 "mulq %4 \n\t" \ 00084 "addq %%r10,%%rax \n\t" \ 00085 "adcq $0,%%rdx \n\t" \ 00086 "movq 0x10(%2),%%r10 \n\t" \ 00087 "addq %3,%%rax \n\t" \ 00088 "adcq $0,%%rdx \n\t" \ 00089 "movq %%rax,0x8(%0) \n\t" \ 00090 "movq %%rdx,%1 \n\t" \ 00091 \ 00092 "movq %%r11,%%rax \n\t" \ 00093 "movq 0x18(%5),%%r11 \n\t" \ 00094 "mulq %4 \n\t" \ 00095 "addq %%r10,%%rax \n\t" \ 00096 "adcq $0,%%rdx \n\t" \ 00097 "movq 0x18(%2),%%r10 \n\t" \ 00098 "addq %3,%%rax \n\t" \ 00099 "adcq $0,%%rdx \n\t" \ 00100 "movq %%rax,0x10(%0) \n\t" \ 00101 "movq %%rdx,%1 \n\t" \ 00102 \ 00103 "movq %%r11,%%rax \n\t" \ 00104 "movq 0x20(%5),%%r11 \n\t" \ 00105 "mulq %4 \n\t" \ 00106 "addq %%r10,%%rax \n\t" \ 00107 "adcq $0,%%rdx \n\t" \ 00108 "movq 0x20(%2),%%r10 \n\t" \ 00109 "addq %3,%%rax \n\t" \ 00110 "adcq $0,%%rdx \n\t" \ 00111 "movq %%rax,0x18(%0) \n\t" \ 00112 "movq %%rdx,%1 \n\t" \ 00113 \ 00114 "movq %%r11,%%rax \n\t" \ 00115 "movq 0x28(%5),%%r11 \n\t" \ 00116 "mulq %4 \n\t" \ 00117 "addq %%r10,%%rax \n\t" \ 00118 "adcq $0,%%rdx \n\t" \ 00119 "movq 0x28(%2),%%r10 \n\t" \ 00120 "addq %3,%%rax \n\t" \ 00121 "adcq $0,%%rdx \n\t" \ 00122 "movq %%rax,0x20(%0) \n\t" \ 00123 "movq %%rdx,%1 \n\t" \ 00124 \ 00125 "movq %%r11,%%rax \n\t" \ 00126 "movq 0x30(%5),%%r11 \n\t" \ 00127 "mulq %4 \n\t" \ 00128 "addq %%r10,%%rax \n\t" \ 00129 "adcq $0,%%rdx \n\t" \ 00130 "movq 0x30(%2),%%r10 \n\t" \ 00131 "addq %3,%%rax \n\t" \ 00132 "adcq $0,%%rdx \n\t" \ 00133 "movq %%rax,0x28(%0) \n\t" \ 00134 "movq %%rdx,%1 \n\t" \ 00135 \ 00136 "movq %%r11,%%rax \n\t" \ 00137 "movq 0x38(%5),%%r11 \n\t" \ 00138 "mulq %4 \n\t" \ 00139 "addq %%r10,%%rax \n\t" \ 00140 "adcq $0,%%rdx \n\t" \ 00141 "movq 0x38(%2),%%r10 \n\t" \ 00142 "addq %3,%%rax \n\t" \ 00143 "adcq $0,%%rdx \n\t" \ 00144 
"movq %%rax,0x30(%0) \n\t" \ 00145 "movq %%rdx,%1 \n\t" \ 00146 \ 00147 "movq %%r11,%%rax \n\t" \ 00148 "mulq %4 \n\t" \ 00149 "addq %%r10,%%rax \n\t" \ 00150 "adcq $0,%%rdx \n\t" \ 00151 "addq %3,%%rax \n\t" \ 00152 "adcq $0,%%rdx \n\t" \ 00153 "movq %%rax,0x38(%0) \n\t" \ 00154 "movq %%rdx,%1 \n\t" \ 00155 \ 00156 :"=r"(_c), "=r"(cy) \ 00157 : "0"(_c), "1"(cy), "g"(mu), "r"(tmpm)\ 00158 : "%rax", "%rdx", "%r10", "%r11", "%cc") 00159 00160 00161 #define PROPCARRY \ 00162 asm( \ 00163 "addq %1,%0 \n\t" \ 00164 "setb %%al \n\t" \ 00165 "movzbq %%al,%1 \n\t" \ 00166 :"=g"(_c[LO]), "=r"(cy) \ 00167 :"0"(_c[LO]), "1"(cy) \ 00168 : "%rax", "%cc") 00169 00170 /******************************************************************/ 00171 #elif defined(TFM_SSE2) 00172 /* SSE2 code (assumes 32-bit fp_digits) */ 00173 /* XMM register assignments: 00174 * xmm0 *tmpm++, then Mu * (*tmpm++) 00175 * xmm1 c[x], then Mu 00176 * xmm2 mp 00177 * xmm3 cy 00178 * xmm4 _c[LO] 00179 */ 00180 00181 #define MONT_START \ 00182 asm("movd %0,%%mm2"::"g"(mp)) 00183 00184 #define MONT_FINI \ 00185 asm("emms") 00186 00187 #define LOOP_START \ 00188 asm( \ 00189 "movd %0,%%mm1 \n\t" \ 00190 "pxor %%mm3,%%mm3 \n\t" \ 00191 "pmuludq %%mm2,%%mm1 \n\t" \ 00192 :: "g"(c[x])) 00193 00194 /* pmuludq on mmx registers does a 32x32->64 multiply. 
*/ 00195 #define INNERMUL \ 00196 asm( \ 00197 "movd %1,%%mm4 \n\t" \ 00198 "movd %2,%%mm0 \n\t" \ 00199 "paddq %%mm4,%%mm3 \n\t" \ 00200 "pmuludq %%mm1,%%mm0 \n\t" \ 00201 "paddq %%mm0,%%mm3 \n\t" \ 00202 "movd %%mm3,%0 \n\t" \ 00203 "psrlq $32, %%mm3 \n\t" \ 00204 :"=g"(_c[LO]) : "0"(_c[LO]), "g"(*tmpm++) ); 00205 00206 #define INNERMUL8 \ 00207 asm( \ 00208 "movd 0(%1),%%mm4 \n\t" \ 00209 "movd 0(%2),%%mm0 \n\t" \ 00210 "paddq %%mm4,%%mm3 \n\t" \ 00211 "pmuludq %%mm1,%%mm0 \n\t" \ 00212 "movd 4(%2),%%mm5 \n\t" \ 00213 "paddq %%mm0,%%mm3 \n\t" \ 00214 "movd 4(%1),%%mm6 \n\t" \ 00215 "movd %%mm3,0(%0) \n\t" \ 00216 "psrlq $32, %%mm3 \n\t" \ 00217 \ 00218 "paddq %%mm6,%%mm3 \n\t" \ 00219 "pmuludq %%mm1,%%mm5 \n\t" \ 00220 "movd 8(%2),%%mm6 \n\t" \ 00221 "paddq %%mm5,%%mm3 \n\t" \ 00222 "movd 8(%1),%%mm7 \n\t" \ 00223 "movd %%mm3,4(%0) \n\t" \ 00224 "psrlq $32, %%mm3 \n\t" \ 00225 \ 00226 "paddq %%mm7,%%mm3 \n\t" \ 00227 "pmuludq %%mm1,%%mm6 \n\t" \ 00228 "movd 12(%2),%%mm7 \n\t" \ 00229 "paddq %%mm6,%%mm3 \n\t" \ 00230 "movd 12(%1),%%mm5 \n\t" \ 00231 "movd %%mm3,8(%0) \n\t" \ 00232 "psrlq $32, %%mm3 \n\t" \ 00233 \ 00234 "paddq %%mm5,%%mm3 \n\t" \ 00235 "pmuludq %%mm1,%%mm7 \n\t" \ 00236 "movd 16(%2),%%mm5 \n\t" \ 00237 "paddq %%mm7,%%mm3 \n\t" \ 00238 "movd 16(%1),%%mm6 \n\t" \ 00239 "movd %%mm3,12(%0) \n\t" \ 00240 "psrlq $32, %%mm3 \n\t" \ 00241 \ 00242 "paddq %%mm6,%%mm3 \n\t" \ 00243 "pmuludq %%mm1,%%mm5 \n\t" \ 00244 "movd 20(%2),%%mm6 \n\t" \ 00245 "paddq %%mm5,%%mm3 \n\t" \ 00246 "movd 20(%1),%%mm7 \n\t" \ 00247 "movd %%mm3,16(%0) \n\t" \ 00248 "psrlq $32, %%mm3 \n\t" \ 00249 \ 00250 "paddq %%mm7,%%mm3 \n\t" \ 00251 "pmuludq %%mm1,%%mm6 \n\t" \ 00252 "movd 24(%2),%%mm7 \n\t" \ 00253 "paddq %%mm6,%%mm3 \n\t" \ 00254 "movd 24(%1),%%mm5 \n\t" \ 00255 "movd %%mm3,20(%0) \n\t" \ 00256 "psrlq $32, %%mm3 \n\t" \ 00257 \ 00258 "paddq %%mm5,%%mm3 \n\t" \ 00259 "pmuludq %%mm1,%%mm7 \n\t" \ 00260 "movd 28(%2),%%mm5 \n\t" \ 00261 "paddq %%mm7,%%mm3 \n\t" \ 00262 "movd 
28(%1),%%mm6 \n\t" \ 00263 "movd %%mm3,24(%0) \n\t" \ 00264 "psrlq $32, %%mm3 \n\t" \ 00265 \ 00266 "paddq %%mm6,%%mm3 \n\t" \ 00267 "pmuludq %%mm1,%%mm5 \n\t" \ 00268 "paddq %%mm5,%%mm3 \n\t" \ 00269 "movd %%mm3,28(%0) \n\t" \ 00270 "psrlq $32, %%mm3 \n\t" \ 00271 :"=r"(_c) : "0"(_c), "g"(tmpm) ); 00272 00273 #define LOOP_END \ 00274 asm( "movd %%mm3,%0 \n" :"=r"(cy)) 00275 00276 #define PROPCARRY \ 00277 asm( \ 00278 "addl %1,%0 \n\t" \ 00279 "setb %%al \n\t" \ 00280 "movzbl %%al,%1 \n\t" \ 00281 :"=g"(_c[LO]), "=r"(cy) \ 00282 :"0"(_c[LO]), "1"(cy) \ 00283 : "%eax", "%cc") 00284 00285 /******************************************************************/ 00286 #elif defined(TFM_ARM) 00287 /* ARMv4 code */ 00288 00289 #define MONT_START 00290 #define MONT_FINI 00291 #define LOOP_END 00292 #define LOOP_START \ 00293 mu = c[x] * mp 00294 00295 #define INNERMUL \ 00296 asm( \ 00297 " LDR r0,%1 \n\t" \ 00298 " ADDS r0,r0,%0 \n\t" \ 00299 " MOVCS %0,#1 \n\t" \ 00300 " MOVCC %0,#0 \n\t" \ 00301 " UMLAL r0,%0,%3,%4 \n\t" \ 00302 " STR r0,%1 \n\t" \ 00303 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c[0]):"r0","%cc"); 00304 00305 #define PROPCARRY \ 00306 asm( \ 00307 " LDR r0,%1 \n\t" \ 00308 " ADDS r0,r0,%0 \n\t" \ 00309 " STR r0,%1 \n\t" \ 00310 " MOVCS %0,#1 \n\t" \ 00311 " MOVCC %0,#0 \n\t" \ 00312 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r0","%cc"); 00313 00314 /******************************************************************/ 00315 #elif defined(TFM_PPC32) 00316 00317 /* PPC32 */ 00318 #define MONT_START 00319 #define MONT_FINI 00320 #define LOOP_END 00321 #define LOOP_START \ 00322 mu = c[x] * mp 00323 00324 #define INNERMUL \ 00325 asm( \ 00326 " mullw 16,%3,%4 \n\t" \ 00327 " mulhwu 17,%3,%4 \n\t" \ 00328 " addc 16,16,%0 \n\t" \ 00329 " addze 17,17 \n\t" \ 00330 " lwz 18,%1 \n\t" \ 00331 " addc 16,16,18 \n\t" \ 00332 " addze %0,17 \n\t" \ 00333 " stw 16,%1 \n\t" \ 00334 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"16", "17", 
"18","%cc"); ++tmpm; 00335 00336 #define PROPCARRY \ 00337 asm( \ 00338 " lwz 16,%1 \n\t" \ 00339 " addc 16,16,%0 \n\t" \ 00340 " stw 16,%1 \n\t" \ 00341 " xor %0,%0,%0 \n\t" \ 00342 " addze %0,%0 \n\t" \ 00343 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"16","%cc"); 00344 00345 /******************************************************************/ 00346 #elif defined(TFM_PPC64) 00347 00348 /* PPC64 */ 00349 #define MONT_START 00350 #define MONT_FINI 00351 #define LOOP_END 00352 #define LOOP_START \ 00353 mu = c[x] * mp 00354 00355 #define INNERMUL \ 00356 asm( \ 00357 " mulld r16,%3,%4 \n\t" \ 00358 " mulhdu r17,%3,%4 \n\t" \ 00359 " addc r16,16,%0 \n\t" \ 00360 " addze r17,r17 \n\t" \ 00361 " ldx r18,0,%1 \n\t" \ 00362 " addc r16,r16,r18 \n\t" \ 00363 " addze %0,r17 \n\t" \ 00364 " sdx r16,0,%1 \n\t" \ 00365 :"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"r16", "r17", "r18","%cc"); ++tmpm; 00366 00367 #define PROPCARRY \ 00368 asm( \ 00369 " ldx r16,0,%1 \n\t" \ 00370 " addc r16,r16,%0 \n\t" \ 00371 " sdx r16,0,%1 \n\t" \ 00372 " xor %0,%0,%0 \n\t" \ 00373 " addze %0,%0 \n\t" \ 00374 :"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"r16","%cc"); 00375 00376 /******************************************************************/ 00377 #elif defined(TFM_AVR32) 00378 00379 /* AVR32 */ 00380 #define MONT_START 00381 #define MONT_FINI 00382 #define LOOP_END 00383 #define LOOP_START \ 00384 mu = c[x] * mp 00385 00386 #define INNERMUL \ 00387 asm( \ 00388 " ld.w r2,%1 \n\t" \ 00389 " add r2,%0 \n\t" \ 00390 " eor r3,r3 \n\t" \ 00391 " acr r3 \n\t" \ 00392 " macu.d r2,%3,%4 \n\t" \ 00393 " st.w %1,r2 \n\t" \ 00394 " mov %0,r3 \n\t" \ 00395 :"=r"(cy),"=r"(_c):"0"(cy),"r"(mu),"r"(*tmpm++),"1"(_c):"r2","r3"); 00396 00397 #define PROPCARRY \ 00398 asm( \ 00399 " ld.w r2,%1 \n\t" \ 00400 " add r2,%0 \n\t" \ 00401 " st.w %1,r2 \n\t" \ 00402 " eor %0,%0 \n\t" \ 00403 " acr %0 \n\t" \ 00404 :"=r"(cy),"=r"(&_c[0]):"0"(cy),"1"(&_c[0]):"r2","%cc"); 00405 00406 
/******************************************************************/
#elif defined(TFM_MIPS)

/* MIPS */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

/* hi:lo = mu * tmpm[0]; fold in old *_c and cy via sltu-based carry
 * detection (MIPS has no carry flag); low word -> *_c, high -> cy */
#define INNERMUL \
asm( \
   " multu %3,%4       \n\t" \
   " mflo  $12         \n\t" \
   " mfhi  $13         \n\t" \
   " addu  $12,$12,%0  \n\t" \
   " sltu  $10,$12,%0  \n\t" \
   " addu  $13,$13,$10 \n\t" \
   " lw    $10,%1      \n\t" \
   " addu  $12,$12,$10 \n\t" \
   " sltu  $10,$12,$10 \n\t" \
   " addu  %0,$13,$10  \n\t" \
   " sw    $12,%1      \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"r"(mu),"r"(tmpm[0]),"1"(_c[0]):"$10","$12","$13"); ++tmpm;

#define PROPCARRY \
asm( \
   " lw   $10,%1      \n\t" \
   " addu $10,$10,%0  \n\t" \
   " sw   $10,%1      \n\t" \
   " sltu %0,$10,%0   \n\t" \
:"=r"(cy),"=m"(_c[0]):"0"(cy),"1"(_c[0]):"$10");

/******************************************************************/
#else

/* ISO C code -- portable fallback; fp_word is the double-width type,
 * so the product plus both addends cannot overflow t */
#define MONT_START
#define MONT_FINI
#define LOOP_END
#define LOOP_START \
   mu = c[x] * mp

#define INNERMUL \
   do { fp_word t; \
   _c[0] = t = ((fp_word)_c[0] + (fp_word)cy) + \
               (((fp_word)mu) * ((fp_word)*tmpm++)); \
   cy = (t >> DIGIT_BIT); \
   } while (0)

#define PROPCARRY \
   do { fp_digit t = _c[0] += cy; cy = (t < cy); } while (0)

#endif
/******************************************************************/

/* index of the low digit within _c for the x86-style macros */
#define LO 0

#ifdef TFM_SMALL_MONT_SET
#include "fp_mont_small.i"
#endif

/* computes x/R == x (mod N) via Montgomery Reduction
 *
 * a  - number to reduce; overwritten with the reduced result
 * m  - modulus
 * mp - Montgomery constant for m (presumably from fp_montgomery_setup();
 *      TODO confirm against caller)
 *
 * Silently returns with a unchanged if m is too large for the
 * double-width work buffer.
 */
void fp_montgomery_reduce(fp_int *a, fp_int *m, fp_digit mp)
{
   /* c is the double-width accumulator; _c/tmpm/mu/cy are the free
    * variables the arch macros above operate on */
   fp_digit c[FP_SIZE], *_c, *tmpm, mu;
   int oldused, x, y, pa;

   /* bail if too large */
   if (m->used > (FP_SIZE/2)) {
      return;
   }

#ifdef TFM_SMALL_MONT_SET
   /* delegate small moduli to the unrolled implementations */
   if (m->used <= 16) {
      fp_montgomery_reduce_small(a, m, mp);
      return;
   }
#endif

#if defined(USE_MEMSET)
   /* now zero the buff */
   memset(c, 0, sizeof c);
#endif
   pa = m->used;

   /* copy the input */
   oldused = a->used;
   for (x = 0; x < oldused; x++) {
      c[x] = a->dp[x];
   }
#if !defined(USE_MEMSET)
   /* without memset, only zero the digits the reduction can touch */
   for (; x < 2*pa+1; x++) {
      c[x] = 0;
   }
#endif
   MONT_START;

   /* one pass per digit of the modulus */
   for (x = 0; x < pa; x++) {
      fp_digit cy = 0;
      /* get Mu for this round */
      LOOP_START;
      _c = c + x;
      tmpm = m->dp;
      y = 0;
#if (defined(TFM_SSE2) || defined(TFM_X86_64))
      /* unrolled multiply-accumulate, eight digits at a time */
      for (; y < (pa & ~7); y += 8) {
         INNERMUL8;
         _c += 8;
         tmpm += 8;
      }
#endif

      /* remaining digits one at a time */
      for (; y < pa; y++) {
         INNERMUL;
         ++_c;
      }
      LOOP_END;
      /* ripple the final carry up through the accumulator */
      while (cy) {
         PROPCARRY;
         ++_c;
      }
   }

   /* now copy out (shift right by pa digits == divide by R) */
   _c = c + pa;
   tmpm = a->dp;
   for (x = 0; x < pa+1; x++) {
      *tmpm++ = *_c++;
   }

   /* zero any digits of a above the new top */
   for (; x < oldused; x++) {
      *tmpm++ = 0;
   }

   MONT_FINI;

   a->used = pa+1;
   fp_clamp(a);

   /* if A >= m then A = A - m */
   if (fp_cmp_mag (a, m) != FP_LT) {
      s_fp_sub (a, m, a);
   }
}


/* $Source: /cvs/libtom/tomsfastmath/src/mont/fp_montgomery_reduce.c,v $ */
/* $Revision: 1.2 $ */
/* $Date: 2007/03/14 23:47:42 $ */
Generated on Tue Jul 12 2022 19:20:10 by Doxygen 1.7.2
