sara matheu / Mbed 2 deprecated CurvasElipticas

Dependencies:   mbed CyaSSL

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers ecc.cpp Source File

ecc.cpp

#include "ecc.h"

#include <string.h>

typedef unsigned int uint;

/* Curve parameters for the curve selected at compile time via ECC_CURVE
   (suffixes _1.._5; judging by the word counts and the #if chain below,
   these are secp128r1, secp192r1, secp256r1, secp384r1 and secp256k1 —
   TODO confirm the numeric mapping against ecc.h).
   All big integers in this file are arrays of 32-bit words in
   LITTLE-ENDIAN WORD ORDER: index 0 holds the least significant word. */

/* Field primes p. */
#define Curve_P_1 {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFD}
#define Curve_P_2 {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
#define Curve_P_3 {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000, 0x00000001, 0xFFFFFFFF}
#define Curve_P_4 {0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, \
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
#define Curve_P_5 {0xFFFFFC2F, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}

/* Curve coefficient b (note Curve_B_5 == 7, consistent with secp256k1). */
#define Curve_B_1 {0x2CEE5ED3, 0xD824993C, 0x1079F43D, 0xE87579C1}
#define Curve_B_2 {0xC146B9B1, 0xFEB8DEEC, 0x72243049, 0x0FA7E9AB, 0xE59C80E7, 0x64210519}
#define Curve_B_3 {0x27D2604B, 0x3BCE3C3E, 0xCC53B0F6, 0x651D06B0, 0x769886BC, 0xB3EBBD55, 0xAA3A93E7, 0x5AC635D8}
#define Curve_B_4 {0xD3EC2AEF, 0x2A85C8ED, 0x8A2ED19D, 0xC656398D, 0x5013875A, 0x0314088F, 0xFE814112, 0x181D9C6E, \
    0xE3F82D19, 0x988E056B, 0xE23EE7E4, 0xB3312FA7}
#define Curve_B_5 {0x00000007, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}

/* Base (generator) point G, as an {x, y} coordinate pair. */
#define Curve_G_1 { \
    {0xA52C5B86, 0x0C28607C, 0x8B899B2D, 0x161FF752}, \
    {0xDDED7A83, 0xC02DA292, 0x5BAFEB13, 0xCF5AC839}}

#define Curve_G_2 { \
    {0x82FF1012, 0xF4FF0AFD, 0x43A18800, 0x7CBF20EB, 0xB03090F6, 0x188DA80E}, \
    {0x1E794811, 0x73F977A1, 0x6B24CDD5, 0x631011ED, 0xFFC8DA78, 0x07192B95}}
    
#define Curve_G_3 { \
    {0xD898C296, 0xF4A13945, 0x2DEB33A0, 0x77037D81, 0x63A440F2, 0xF8BCE6E5, 0xE12C4247, 0x6B17D1F2}, \
    {0x37BF51F5, 0xCBB64068, 0x6B315ECE, 0x2BCE3357, 0x7C0F9E16, 0x8EE7EB4A, 0xFE1A7F9B, 0x4FE342E2}}

#define Curve_G_4 { \
    {0x72760AB7, 0x3A545E38, 0xBF55296C, 0x5502F25D, 0x82542A38, 0x59F741E0, 0x8BA79B98, 0x6E1D3B62, \
        0xF320AD74, 0x8EB1C71E, 0xBE8B0537, 0xAA87CA22}, \
    {0x90EA0E5F, 0x7A431D7C, 0x1D7E819D, 0x0A60B1CE, 0xB5F0B8C0, 0xE9DA3113, 0x289A147C, 0xF8F41DBD, \
        0x9292DC29, 0x5D9E98BF, 0x96262C6F, 0x3617DE4A}}

#define Curve_G_5 { \
    {0x16F81798, 0x59F2815B, 0x2DCE28D9, 0x029BFCDB, 0xCE870B07, 0x55A06295, 0xF9DCBBAC, 0x79BE667E}, \
    {0xFB10D4B8, 0x9C47D08F, 0xA6855419, 0xFD17B448, 0x0E1108A8, 0x5DA4FBFC, 0x26A3C465, 0x483ADA77}}

/* Order n of the base point G. */
#define Curve_N_1 {0x9038A115, 0x75A30D1B, 0x00000000, 0xFFFFFFFE}
#define Curve_N_2 {0xB4D22831, 0x146BC9B1, 0x99DEF836, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
#define Curve_N_3 {0xFC632551, 0xF3B9CAC2, 0xA7179E84, 0xBCE6FAAD, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF}
#define Curve_N_4 {0xCCC52973, 0xECEC196A, 0x48B0A77A, 0x581A0DB2, 0xF4372DDF, 0xC7634D81, 0xFFFFFFFF, 0xFFFFFFFF, \
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}
#define Curve_N_5 {0xD0364141, 0xBFD25E8C, 0xAF48A03B, 0xBAAEDCE6, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF}

/* Parameters of the active curve, stitched together from the macros above
   (ECC_CONCAT and NUM_ECC_DIGITS come from ecc.h). */
static uint32_t curve_p[NUM_ECC_DIGITS] = ECC_CONCAT(Curve_P_, ECC_CURVE);
static uint32_t curve_b[NUM_ECC_DIGITS] = ECC_CONCAT(Curve_B_, ECC_CURVE);
static EccPoint curve_G = ECC_CONCAT(Curve_G_, ECC_CURVE);
static uint32_t curve_n[NUM_ECC_DIGITS] = ECC_CONCAT(Curve_N_, ECC_CURVE);
00055 static void vli_clear(uint32_t *p_vli)
00056 {
00057     uint i;
00058     for(i=0; i<NUM_ECC_DIGITS; ++i)
00059     {
00060         p_vli[i] = 0;
00061     }
00062 }
00063 
00064 /* Returns 1 if p_vli == 0, 0 otherwise. */
00065 static int vli_isZero(const uint32_t *p_vli)
00066 {
00067     uint i;
00068     for(i = 0; i < NUM_ECC_DIGITS; ++i)
00069     {
00070         if(p_vli[i])
00071         {
00072             return 0;
00073         }
00074     }
00075     return 1;
00076 }
00077 
/* Returns nonzero if bit p_bit of p_vli is set, zero otherwise.
   Note: the nonzero result is the masked word, not normalized to 1. */
static uint32_t vli_testBit(const uint32_t *p_vli, const unsigned int p_bit)
{
    /* Shift an unsigned 32-bit constant: the original `1 << (p_bit % 32)`
       left-shifts the signed int literal 1, which is undefined behavior
       when p_bit % 32 == 31. */
    return (p_vli[p_bit/32] & ((uint32_t)1 << (p_bit % 32)));
}
00083 
00084 /* Counts the number of 32-bit "digits" in p_vli. */
00085 static uint vli_numDigits(const uint32_t *p_vli)
00086 {
00087     int i;
00088     /* Search from the end until we find a non-zero digit.
00089        We do it in reverse because we expect that most digits will be nonzero. */
00090     for(i = NUM_ECC_DIGITS - 1; i >= 0 && p_vli[i] == 0; --i)
00091     {
00092     }
00093 
00094     return (i + 1);
00095 }
00096 
00097 /* Counts the number of bits required for p_vli. */
00098 static uint vli_numBits(const uint32_t *p_vli)
00099 {
00100     uint i;
00101     uint32_t l_digit;
00102     
00103     uint l_numDigits = vli_numDigits(p_vli);
00104     if(l_numDigits == 0)
00105     {
00106         return 0;
00107     }
00108 
00109     l_digit = p_vli[l_numDigits - 1];
00110     for(i=0; l_digit; ++i)
00111     {
00112         l_digit >>= 1;
00113     }
00114     
00115     return ((l_numDigits - 1) * 32 + i);
00116 }
00117 
00118 /* Sets p_dest = p_src. */
00119 static void vli_set(uint32_t *p_dest, const uint32_t *p_src)
00120 {
00121     uint i;
00122     for(i=0; i<NUM_ECC_DIGITS; ++i)
00123     {
00124         p_dest[i] = p_src[i];
00125     }
00126 }
00127 
00128 /* Returns sign of p_left - p_right. */
00129 static int vli_cmp(const uint32_t *p_left, const uint32_t *p_right)
00130 {
00131     int i;
00132     for(i = NUM_ECC_DIGITS-1; i >= 0; --i)
00133     {
00134         if(p_left[i] > p_right[i])
00135         {
00136             return 1;
00137         }
00138         else if(p_left[i] < p_right[i])
00139         {
00140             return -1;
00141         }
00142     }
00143     return 0;
00144 }
00145 
00146 /* Computes p_result = p_in << c, returning carry. Can modify in place (if p_result == p_in). 0 < p_shift < 32. */
00147 static uint32_t vli_lshift(uint32_t *p_result, const uint32_t *p_in, const unsigned int p_shift)
00148 {
00149     uint32_t l_carry = 0;
00150     uint i;
00151     for(i = 0; i < NUM_ECC_DIGITS; ++i)
00152     {
00153         uint32_t l_temp = p_in[i];
00154         p_result[i] = (l_temp << p_shift) | l_carry;
00155         l_carry = l_temp >> (32 - p_shift);
00156     }
00157     
00158     return l_carry;
00159 }
00160 
00161 /* Computes p_vli = p_vli >> 1. */
00162 static void vli_rshift1(uint32_t *p_vli)
00163 {
00164     uint32_t *l_end = p_vli;
00165     uint32_t l_carry = 0;
00166     
00167     p_vli += NUM_ECC_DIGITS;
00168     while(p_vli-- > l_end)
00169     {
00170         uint32_t l_temp = *p_vli;
00171         *p_vli = (l_temp >> 1) | l_carry;
00172         l_carry = l_temp << 31;
00173     }
00174 }
00175 
/* Computes p_result = p_left + p_right, returning the carry out (0 or 1).
   May operate in place (p_result aliasing p_left and/or p_right).
   On ARM/Thumb targets an inline-assembly loop is used; otherwise a
   portable C loop. */
static uint32_t vli_add(uint32_t *p_result, const uint32_t *p_left, const uint32_t *p_right)
{
#if (ECC_ASM == ecc_asm_thumb || ECC_ASM == ecc_asm_thumb2 || ECC_ASM == ecc_asm_arm)
    uint32_t l_counter = NUM_ECC_DIGITS;
    uint32_t l_carry = 0; /* carry = 0 initially */
    uint32_t l_left;
    uint32_t l_right;
    
    asm volatile (
        ".syntax unified \n\t"
        "1: \n\t"
        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (l_carry = 0 after this). */
        "adcs %[left], %[right] \n\t"     /* Add with carry. */
        "adcs %[carry], %[carry] \n\t"    /* Store carry bit in l_carry. */
        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
        "subs %[ctr], #1 \n\t"            /* Decrement index. */
        "bne 1b \n\t"                     /* Loop until index == 0. */
    #if (ECC_ASM != ecc_asm_thumb2)
        ".syntax divided \n\t"
    #endif
    #if (ECC_ASM == ecc_asm_thumb)
        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right), [ctr] "+l" (l_counter), [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
    #else
        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right), [ctr] "+r" (l_counter), [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
    #endif
        :
        : "cc", "memory"
    );
    return l_carry;
    
#else

    uint32_t l_carry = 0;
    uint i;
    for(i=0; i<NUM_ECC_DIGITS; ++i)
    {
        uint32_t l_sum = p_left[i] + p_right[i] + l_carry;
        /* Carry detection: if l_sum == p_left[i], then p_right[i] + l_carry
           was a multiple of 2^32 (either 0, carry unchanged, or exactly 2^32,
           carry unchanged at 1), so the previous l_carry is still correct
           and must NOT be overwritten. Otherwise a wrap occurred iff the sum
           is smaller than the left operand. */
        if(l_sum != p_left[i])
        {
            l_carry = (l_sum < p_left[i]);
        }
        p_result[i] = l_sum;
    }
    return l_carry;
#endif
}
00225 
/* Computes p_result = p_left - p_right, returning the borrow (0 or 1).
   May operate in place. On ARM/Thumb targets an inline-assembly loop is
   used; otherwise a portable C loop. */
static uint32_t vli_sub(uint32_t *p_result, const uint32_t *p_left, const uint32_t *p_right)
{
#if (ECC_ASM == ecc_asm_thumb || ECC_ASM == ecc_asm_thumb2 || ECC_ASM == ecc_asm_arm)
    uint32_t l_counter = NUM_ECC_DIGITS;
    uint32_t l_carry = 1; /* carry = 1 initially (means don't borrow) */
    uint32_t l_left;
    uint32_t l_right;
    
    asm volatile (
        ".syntax unified \n\t"
        "1: \n\t"
        "ldmia %[lptr]!, {%[left]} \n\t"  /* Load left word. */
        "ldmia %[rptr]!, {%[right]} \n\t" /* Load right word. */
        "lsrs %[carry], #1 \n\t"          /* Set up carry flag (l_carry = 0 after this). */
        "sbcs %[left], %[right] \n\t"     /* Subtract with borrow. */
        "adcs %[carry], %[carry] \n\t"    /* Store carry bit in l_carry. */
        "stmia %[dptr]!, {%[left]} \n\t"  /* Store result word. */
        "subs %[ctr], #1 \n\t"            /* Decrement index. */
        "bne 1b \n\t"                     /* Loop until index == 0. */
    #if (ECC_ASM != ecc_asm_thumb2)
        ".syntax divided \n\t"
    #endif
    #if (ECC_ASM == ecc_asm_thumb)
        : [dptr] "+l" (p_result), [lptr] "+l" (p_left), [rptr] "+l" (p_right), [ctr] "+l" (l_counter), [carry] "+l" (l_carry), [left] "=l" (l_left), [right] "=l" (l_right)
    #else
        : [dptr] "+r" (p_result), [lptr] "+r" (p_left), [rptr] "+r" (p_right), [ctr] "+r" (l_counter), [carry] "+r" (l_carry), [left] "=r" (l_left), [right] "=r" (l_right)
    #endif
        :
        : "cc", "memory"
    );
    /* ARM's carry flag is inverted for subtraction: carry set == no borrow. */
    return !l_carry;

#else

    uint32_t l_borrow = 0;
    uint i;
    for(i=0; i<NUM_ECC_DIGITS; ++i)
    {
        uint32_t l_diff = p_left[i] - p_right[i] - l_borrow;
        /* Borrow detection mirrors the carry trick in vli_add: when
           l_diff == p_left[i], p_right[i] + l_borrow wrapped to 0 and the
           previous borrow remains valid; otherwise a borrow occurred iff
           the difference is larger than the left operand (wrap-around). */
        if(l_diff != p_left[i])
        {
            l_borrow = (l_diff > p_left[i]);
        }
        p_result[i] = l_diff;
    }
    return l_borrow;
#endif
}
00275 
/* Computes p_result = p_left * p_right.
   p_result must have room for 2 * NUM_ECC_DIGITS words. The product is
   built column by column (index k = i + j), accumulating partial products
   into a 96-bit accumulator (c0, c1, c2) / (r01, r2) so carries never
   overflow. Three variants: ARM/Thumb-2 asm with umull, Thumb-1 asm that
   synthesizes the 32x32->64 multiply from 16-bit halves, and portable C. */
static void vli_mult(uint32_t *p_result, const uint32_t *p_left, const uint32_t *p_right)
{
#if (ECC_ASM == ecc_asm_thumb2 || ECC_ASM == ecc_asm_arm)
    uint32_t c0 = 0;
    uint32_t c1 = 0;
    uint32_t c2 = 0;
    uint32_t k = 0;
    uint32_t i;
    uint32_t t0, t1;
    
    /* i and k below are byte offsets (multiples of 4), not word indices. */
    asm volatile (
        ".syntax unified \n\t"
        
        "1: \n\t" /* outer loop (k < NUM_ECC_DIGITS) */
        "movs %[i], #0 \n\t" /* i = 0 */
        "b 3f \n\t"
        
        "2: \n\t" /* outer loop (k >= NUM_ECC_DIGITS) */
        "movs %[i], %[k] \n\t"      /* i = k */
        "subs %[i], %[eccdm1] \n\t" /* i = k - (NUM_ECC_DIGITS - 1) (times 4) */
        
        "3: \n\t" /* inner loop */
        "subs %[t0], %[k], %[i] \n\t" /* t0 = k-i */
        
        "ldr %[t1], [%[right], %[t0]] \n\t" /* t1 = p_right[k-i] */
        "ldr %[t0], [%[left], %[i]] \n\t"   /* t0 = p_left[i] */
        
        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = p_left[i] * p_right[k-i] */
        
        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
        "adcs %[c2], #0 \n\t"    /* add carry to c2 */

        "adds %[i], #4 \n\t"     /* i += 4 */
        "cmp %[i], %[eccd] \n\t" /* i < NUM_ECC_DIGITS (times 4)? */
        "bge 4f \n\t" /* if not, exit the loop */
        "cmp %[i], %[k] \n\t"    /* i <= k? */
        "ble 3b \n\t" /* if so, continue looping */
        
        "4: \n\t" /* end inner loop */
        
        "str %[c0], [%[result], %[k]] \n\t" /* p_result[k] = c0 */
        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
        "movs %[c2], #0 \n\t"       /* c2 = 0 */
        "adds %[k], #4 \n\t"        /* k += 4 */
        "cmp %[k], %[eccd] \n\t"    /* k < NUM_ECC_DIGITS (times 4) ? */
        "blt 1b \n\t" /* if not, loop back, start with i = 0 */
        "cmp %[k], %[eccd2m1] \n\t" /* k < NUM_ECC_DIGITS * 2 - 1 (times 4) ? */
        "blt 2b \n\t" /* if not, loop back, start with i = (k+1) - NUM_ECC_DIGITS */
        /* end outer loop */
        
        "str %[c0], [%[result], %[k]] \n\t" /* p_result[NUM_ECC_DIGITS * 2 - 1] = c0 */
    #if (ECC_ASM != ecc_asm_thumb2)
        ".syntax divided \n\t"
    #endif
        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2), [k] "+r" (k), [i] "=&r" (i), [t0] "=&r" (t0), [t1] "=&r" (t1)
        : [result] "r" (p_result), [left] "r" (p_left), [right] "r" (p_right),
          [eccd] "I" (NUM_ECC_DIGITS * 4), [eccdm1] "I" ((NUM_ECC_DIGITS-1) * 4), [eccd2m1] "I" ((NUM_ECC_DIGITS * 2 - 1) * 4)
        : "cc", "memory"
    );
    
#elif (ECC_ASM == ecc_asm_thumb)

    /* NOTE(review): these initializers assign const pointers to non-const
       register variables (const is discarded); the asm only reads through
       r1/r2, but a (uint32_t *) cast would silence the warning — confirm
       against upstream. */
    register uint32_t *r0 asm("r0") = p_result;
    register uint32_t *r1 asm("r1") = p_left;
    register uint32_t *r2 asm("r2") = p_right;
    
    asm volatile (
        ".syntax unified \n\t"
        "movs r3, #0 \n\t" /* c0 = 0 */
        "movs r4, #0 \n\t" /* c1 = 0 */
        "movs r5, #0 \n\t" /* c2 = 0 */
        "movs r6, #0 \n\t" /* k = 0 */
        
        "push {r0} \n\t" /* keep p_result on the stack */
        
        "1: \n\t" /* outer loop (k < NUM_ECC_DIGITS) */
        "movs r7, #0 \n\t" /* r7 = i = 0 */
        "b 3f \n\t"
        
        "2: \n\t" /* outer loop (k >= NUM_ECC_DIGITS) */
        "movs r7, r6 \n\t"        /* r7 = k */
        "subs r7, %[eccdm1] \n\t" /* r7 = i = k - (NUM_ECC_DIGITS - 1) (times 4) */
        
        "3: \n\t" /* inner loop */
        "push {r3, r4, r5, r6} \n\t" /* push things, r3 (c0) is at the top of stack. */
        "subs r0, r6, r7 \n\t"       /* r0 = k-i */
        
        "ldr r4, [r2, r0] \n\t" /* r4 = p_right[k-i] */
        "ldr r0, [r1, r7] \n\t" /* r0 = p_left[i] */
        
        /* 32x32 -> 64 multiply via 16-bit halves (Thumb-1 has no umull):
           a = (a1<<16)+a0, b = (b1<<16)+b0. */
        "lsrs r3, r0, #16 \n\t" /* r3 = a1 */
        "uxth r0, r0 \n\t"      /* r0 = a0 */
        
        "lsrs r5, r4, #16 \n\t" /* r5 = b1 */
        "uxth r4, r4 \n\t"      /* r4 = b0 */
        
        "movs r6, r3 \n\t"     /* r6 = a1 */
        "muls r6, r5, r6 \n\t" /* r6 = a1*b1 */
        "muls r3, r4, r3 \n\t" /* r3 = b0*a1 */
        "muls r5, r0, r5 \n\t" /* r5 = a0*b1 */
        "muls r0, r4, r0 \n\t" /* r0 = a0*b0 */
        
        "movs r4, #0 \n\t"  /* r4 = 0 */
        "adds r3, r5 \n\t"  /* r3 = b0*a1 + a0*b1 */
        "adcs r4, r4 \n\t"  /* r4 = carry */
        "lsls r4, #16 \n\t" /* r4 = carry << 16 */
        "adds r6, r4 \n\t"  /* r6 = a1*b1 + carry */
        
        "lsls r4, r3, #16 \n\t" /* r4 = (b0*a1 + a0*b1) << 16 */
        "lsrs r3, #16 \n\t"     /* r3 = (b0*a1 + a0*b1) >> 16 */
        "adds r0, r4 \n\t"      /* r0 = low word = a0*b0 + ((b0*a1 + a0*b1) << 16) */
        "adcs r6, r3 \n\t"      /* r6 = high word = a1*b1 + carry + ((b0*a1 + a0*b1) >> 16) */
        
        "pop {r3, r4, r5} \n\t" /* r3 = c0, r4 = c1, r5 = c2 */
        "adds r3, r0 \n\t"      /* add low word to c0 */
        "adcs r4, r6 \n\t"      /* add high word to c1, including carry */
        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
        "adcs r5, r0 \n\t"      /* add carry to c2 */
        
        "pop {r6} \n\t" /* r6 = k */

        "adds r7, #4 \n\t"     /* i += 4 */
        "cmp r7, %[eccd] \n\t" /* i < NUM_ECC_DIGITS (times 4)? */
        "bge 4f \n\t" /* if not, exit the loop */
        "cmp r7, r6 \n\t"      /* i <= k? */
        "ble 3b \n\t" /* if so, continue looping */
        
        "4: \n\t" /* end inner loop */
        
        "ldr r0, [sp, #0] \n\t" /* r0 = p_result */
        
        "str r3, [r0, r6] \n\t"   /* p_result[k] = c0 */
        "mov r3, r4 \n\t"         /* c0 = c1 */
        "mov r4, r5 \n\t"         /* c1 = c2 */
        "movs r5, #0 \n\t"        /* c2 = 0 */
        "adds r6, #4 \n\t"        /* k += 4 */
        "cmp r6, %[eccd] \n\t"    /* k < NUM_ECC_DIGITS (times 4) ? */
        "blt 1b \n\t" /* if not, loop back, start with i = 0 */
        "cmp r6, %[eccd2m1] \n\t" /* k < NUM_ECC_DIGITS * 2 - 1 (times 4) ? */
        "blt 2b \n\t" /* if not, loop back, start with i = (k+1) - NUM_ECC_DIGITS */
        /* end outer loop */
        
        "str r3, [r0, r6] \n\t" /* p_result[NUM_ECC_DIGITS * 2 - 1] = c0 */
        "pop {r0} \n\t"         /* pop p_result off the stack */
        
        ".syntax divided \n\t"
        : 
        : [r0] "l" (r0), [r1] "l" (r1), [r2] "l" (r2), [eccd] "I" (NUM_ECC_DIGITS * 4), [eccdm1] "I" ((NUM_ECC_DIGITS-1) * 4), [eccd2m1] "I" ((NUM_ECC_DIGITS * 2 - 1) * 4)
        : "r3", "r4", "r5", "r6", "r7", "cc", "memory"
    );

#else

    uint64_t r01 = 0; /* low 64 bits of the column accumulator */
    uint32_t r2 = 0;  /* bits 64..95 of the column accumulator */
    
    uint i, k;
    
    /* Compute each digit of p_result in sequence, maintaining the carries. */
    for(k=0; k < NUM_ECC_DIGITS*2 - 1; ++k)
    {
        /* Column k sums p_left[i] * p_right[k-i] over all valid i. */
        uint l_min = (k < NUM_ECC_DIGITS ? 0 : (k + 1) - NUM_ECC_DIGITS);
        for(i=l_min; i<=k && i<NUM_ECC_DIGITS; ++i)
        {
            uint64_t l_product = (uint64_t)p_left[i] * p_right[k-i];
            r01 += l_product;
            r2 += (r01 < l_product); /* carry out of the 64-bit accumulator */
        }
        p_result[k] = (uint32_t)r01;
        r01 = (r01 >> 32) | (((uint64_t)r2) << 32); /* shift accumulator down one column */
        r2 = 0;
    }
    
    p_result[NUM_ECC_DIGITS*2 - 1] = (uint32_t)r01;
#endif
}
00455 
/* Computes p_result = (p_left + p_right) % p_mod.
   Assumes p_left < p_mod and p_right < p_mod, and p_result != p_mod. */
static void vli_modAdd(uint32_t *p_result, const uint32_t *p_left,
    const uint32_t *p_right, const uint32_t *p_mod)
{
    uint32_t l_overflow = vli_add(p_result, p_left, p_right);

    /* Since both inputs are < p_mod, the raw sum is < 2 * p_mod; at most one
       subtraction of p_mod is ever needed. */
    if(l_overflow != 0 || vli_cmp(p_result, p_mod) >= 0)
    {
        vli_sub(p_result, p_result, p_mod);
    }
}
00467 
/* Computes p_result = (p_left - p_right) % p_mod.
   Assumes p_left < p_mod and p_right < p_mod, and p_result != p_mod. */
static void vli_modSub(uint32_t *p_result, const uint32_t *p_left,
    const uint32_t *p_right, const uint32_t *p_mod)
{
    uint32_t l_wrapped = vli_sub(p_result, p_left, p_right);
    if(l_wrapped == 0)
    {
        return;
    }
    /* The subtraction wrapped below zero: p_result currently holds the
       2^N-complement of the true difference. Adding p_mod (letting the
       carry overflow away) yields the correct residue, since -x mod m == m - x. */
    vli_add(p_result, p_result, p_mod);
}
00480 
00481 #if (ECC_CURVE == secp128r1)
00482 
/* Computes p_result = p_product % curve_p for secp128r1.
   See algorithm 5 and 6 from http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf
   Each step folds shifted copies of the high half of the 256-bit product
   (words 4..7) back into the low 128 bits, exploiting the special form of
   the prime; l_carry accumulates the word-9 overflow across the additions
   and is cleared by the final subtraction loop. */
static void vli_mmod_fast(uint32_t *p_result, const uint32_t *p_product)
{
    uint32_t l_tmp[NUM_ECC_DIGITS];
    int l_carry;
    
    vli_set(p_result, p_product);
    
    l_tmp[0] = p_product[4];
    l_tmp[1] = p_product[5];
    l_tmp[2] = p_product[6];
    l_tmp[3] = (p_product[7] & 0x00000001) | (p_product[4] << 1);
    l_carry = vli_add(p_result, p_result, l_tmp);
    
    l_tmp[0] = (p_product[4] >> 31) | (p_product[5] << 1);
    l_tmp[1] = (p_product[5] >> 31) | (p_product[6] << 1);
    l_tmp[2] = (p_product[6] >> 31) | (p_product[7] << 1);
    l_tmp[3] = (p_product[7] >> 31) | ((p_product[4] & 0x80000000) >> 30) | (p_product[5] << 2);
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    l_tmp[0] = (p_product[5] >> 30) | (p_product[6] << 2);
    l_tmp[1] = (p_product[6] >> 30) | (p_product[7] << 2);
    l_tmp[2] = (p_product[7] >> 30);
    l_tmp[3] = ((p_product[5] & 0xC0000000) >> 29) | (p_product[6] << 3);
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    l_tmp[0] = (p_product[6] >> 29) | (p_product[7] << 3);
    l_tmp[1] = (p_product[7] >> 29);
    l_tmp[2] = 0;
    l_tmp[3] = ((p_product[6] & 0xE0000000) >> 28) | (p_product[7] << 4);
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    l_tmp[0] = (p_product[7] >> 28);
    l_tmp[1] = 0;
    l_tmp[2] = 0;
    l_tmp[3] = (p_product[7] & 0xFFFFFFFE);
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    l_tmp[0] = 0;
    l_tmp[1] = 0;
    l_tmp[2] = 0;
    l_tmp[3] = ((p_product[7] & 0xF0000000) >> 27);
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* Final reduction: subtract curve_p while there is overflow or while
       p_result >= curve_p (vli_cmp == 1 means curve_p > p_result). */
    while(l_carry || vli_cmp(curve_p, p_result) != 1)
    {
        l_carry -= vli_sub(p_result, p_result, curve_p);
    }
}
00533 
00534 #elif (ECC_CURVE == secp192r1)
00535 
/* Computes p_result = p_product % curve_p for secp192r1.
   See algorithm 5 and 6 from http://www.isys.uni-klu.ac.at/PDF/2001-0126-MT.pdf
   The prime 2^192 - 2^64 - 1 lets the high half (words 6..11) be folded
   into the low 192 bits with three shifted additions. */
static void vli_mmod_fast(uint32_t *p_result, const uint32_t *p_product)
{
    uint32_t l_tmp[NUM_ECC_DIGITS];
    int l_carry;
    
    vli_set(p_result, p_product);
    
    /* Fold in the high half at weight 2^0. */
    vli_set(l_tmp, &p_product[6]);
    l_carry = vli_add(p_result, p_result, l_tmp);
    
    /* Fold in words 6..9 at weight 2^64. */
    l_tmp[0] = l_tmp[1] = 0;
    l_tmp[2] = p_product[6];
    l_tmp[3] = p_product[7];
    l_tmp[4] = p_product[8];
    l_tmp[5] = p_product[9];
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* Fold in words 10..11 at weights 2^0 and 2^64. */
    l_tmp[0] = l_tmp[2] = p_product[10];
    l_tmp[1] = l_tmp[3] = p_product[11];
    l_tmp[4] = l_tmp[5] = 0;
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* Final reduction: subtract curve_p until p_result < curve_p and no
       overflow remains. */
    while(l_carry || vli_cmp(curve_p, p_result) != 1)
    {
        l_carry -= vli_sub(p_result, p_result, curve_p);
    }
}
00565 
00566 #elif (ECC_CURVE == secp256r1)
00567 
/* Computes p_result = p_product % curve_p for secp256r1,
   from http://www.nsa.gov/ia/_files/nist-routines.pdf
   The 512-bit product is reduced as t + 2*s1 + 2*s2 + s3 + s4 - d1 - d2
   - d3 - d4, where each term is a fixed rearrangement of the product's
   words 8..15. l_carry tracks the signed overflow of the running sum and
   is resolved at the end by adding or subtracting curve_p. */
static void vli_mmod_fast(uint32_t *p_result, const uint32_t *p_product)
{
    uint32_t l_tmp[NUM_ECC_DIGITS];
    int l_carry;
    
    /* t */
    vli_set(p_result, p_product);
    
    /* s1 */
    l_tmp[0] = l_tmp[1] = l_tmp[2] = 0;
    l_tmp[3] = p_product[11];
    l_tmp[4] = p_product[12];
    l_tmp[5] = p_product[13];
    l_tmp[6] = p_product[14];
    l_tmp[7] = p_product[15];
    l_carry = vli_lshift(l_tmp, l_tmp, 1); /* s1 is doubled in the formula */
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* s2 */
    l_tmp[3] = p_product[12];
    l_tmp[4] = p_product[13];
    l_tmp[5] = p_product[14];
    l_tmp[6] = p_product[15];
    l_tmp[7] = 0;
    l_carry += vli_lshift(l_tmp, l_tmp, 1); /* s2 is doubled as well */
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* s3 */
    l_tmp[0] = p_product[8];
    l_tmp[1] = p_product[9];
    l_tmp[2] = p_product[10];
    l_tmp[3] = l_tmp[4] = l_tmp[5] = 0;
    l_tmp[6] = p_product[14];
    l_tmp[7] = p_product[15];
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* s4 */
    l_tmp[0] = p_product[9];
    l_tmp[1] = p_product[10];
    l_tmp[2] = p_product[11];
    l_tmp[3] = p_product[13];
    l_tmp[4] = p_product[14];
    l_tmp[5] = p_product[15];
    l_tmp[6] = p_product[13];
    l_tmp[7] = p_product[8];
    l_carry += vli_add(p_result, p_result, l_tmp);
    
    /* d1 */
    l_tmp[0] = p_product[11];
    l_tmp[1] = p_product[12];
    l_tmp[2] = p_product[13];
    l_tmp[3] = l_tmp[4] = l_tmp[5] = 0;
    l_tmp[6] = p_product[8];
    l_tmp[7] = p_product[10];
    l_carry -= vli_sub(p_result, p_result, l_tmp);
    
    /* d2 */
    l_tmp[0] = p_product[12];
    l_tmp[1] = p_product[13];
    l_tmp[2] = p_product[14];
    l_tmp[3] = p_product[15];
    l_tmp[4] = l_tmp[5] = 0;
    l_tmp[6] = p_product[9];
    l_tmp[7] = p_product[11];
    l_carry -= vli_sub(p_result, p_result, l_tmp);
    
    /* d3 */
    l_tmp[0] = p_product[13];
    l_tmp[1] = p_product[14];
    l_tmp[2] = p_product[15];
    l_tmp[3] = p_product[8];
    l_tmp[4] = p_product[9];
    l_tmp[5] = p_product[10];
    l_tmp[6] = 0;
    l_tmp[7] = p_product[12];
    l_carry -= vli_sub(p_result, p_result, l_tmp);
    
    /* d4 */
    l_tmp[0] = p_product[14];
    l_tmp[1] = p_product[15];
    l_tmp[2] = 0;
    l_tmp[3] = p_product[9];
    l_tmp[4] = p_product[10];
    l_tmp[5] = p_product[11];
    l_tmp[6] = 0;
    l_tmp[7] = p_product[13];
    l_carry -= vli_sub(p_result, p_result, l_tmp);
    
    /* Resolve the accumulated signed overflow. */
    if(l_carry < 0)
    {
        do
        {
            l_carry += vli_add(p_result, p_result, curve_p);
        } while(l_carry < 0);
    }
    else
    {
        /* Subtract curve_p until p_result < curve_p with no overflow left. */
        while(l_carry || vli_cmp(curve_p, p_result) != 1)
        {
            l_carry -= vli_sub(p_result, p_result, curve_p);
        }
    }
}
00673 
00674 #elif (ECC_CURVE == secp384r1)
00675 
/* Computes p_result = p_right * omega, where omega = 2^128 + 2^96 - 2^32 + 1
   (the low part of the secp384r1 prime's special form). p_result must hold
   NUM_ECC_DIGITS + 5 significant words (the caller zeroes 2*NUM_ECC_DIGITS). */
static void omega_mult(uint32_t *p_result, const uint32_t *p_right)
{
    /* Multiply by (2^128 + 2^96 - 2^32 + 1). */
    vli_set(p_result, p_right); /* 1 */
    p_result[3 + NUM_ECC_DIGITS] = vli_add(p_result + 3, p_result + 3, p_right); /* 2^96 + 1 */
    p_result[4 + NUM_ECC_DIGITS] = vli_add(p_result + 4, p_result + 4, p_right); /* 2^128 + 2^96 + 1 */
    if(vli_sub(p_result + 1, p_result + 1, p_right)) /* 2^128 + 2^96 - 2^32 + 1 */
    { /* Propagate borrow if necessary. */
        uint i;
        /* Decrement words above the NUM_ECC_DIGITS-word subtraction window
           until one does not wrap past zero. */
        for(i = 1 + NUM_ECC_DIGITS; ; ++i)
        {
            --p_result[i];
            if(p_result[i] != 0xffffffff)
            {
                break;
            }
        }
    }
}
00695 
00696 /* Computes p_result = p_product % curve_p
00697     see PDF "Comparing Elliptic Curve Cryptography and RSA on 8-bit CPUs"
00698     section "Curve-Specific Optimizations" */
00699 static void vli_mmod_fast(uint32_t *p_result, const uint32_t *p_product)
00700 {
00701     uint32_t l_tmp[2*NUM_ECC_DIGITS];
00702      
00703     while(!vli_isZero(p_product + NUM_ECC_DIGITS)) /* While c1 != 0 */
00704     {
00705         uint32_t l_carry = 0;
00706         uint i;
00707         
00708         vli_clear(l_tmp);
00709         vli_clear(l_tmp + NUM_ECC_DIGITS);
00710         omega_mult(l_tmp, p_product + NUM_ECC_DIGITS); /* tmp = w * c1 */
00711         vli_clear(p_product + NUM_ECC_DIGITS); /* p = c0 */
00712         
00713         /* (c1, c0) = c0 + w * c1 */
00714         for(i=0; i<NUM_ECC_DIGITS+5; ++i)
00715         {
00716             uint32_t l_sum = p_product[i] + l_tmp[i] + l_carry;
00717             if(l_sum != p_product[i])
00718             {
00719                 l_carry = (l_sum < p_product[i]);
00720             }
00721             p_product[i] = l_sum;
00722         }
00723     }
00724     
00725     while(vli_cmp(p_product, curve_p) > 0)
00726     {
00727         vli_sub(p_product, p_product, curve_p);
00728     }
00729     vli_set(p_result, p_product);
00730 }
00731 
00732 #elif (ECC_CURVE == secp256k1)
00733 
/* Computes p_result = p_right * omega, where omega = 2^32 + 977
   (2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1), the low part of the secp256k1
   prime's special form. p_result must hold NUM_ECC_DIGITS + 2 words. */
static void omega_mult(uint32_t *p_result, const uint32_t *p_right)
{
    /* Multiply by (2^32 + 2^9 + 2^8 + 2^7 + 2^6 + 2^4 + 1). */
    uint64_t l_mult = 0x3D1; /* everything except 2^32 */
    uint32_t l_carry = 0;
    uint i;
    /* Schoolbook multiply by the small constant 0x3D1. */
    for(i=0; i<NUM_ECC_DIGITS; ++i)
    {
        uint64_t p = l_mult * p_right[i] + l_carry;
        p_result[i] = (p & 0xffffffff);
        l_carry = p >> 32;
    }
    p_result[NUM_ECC_DIGITS] = l_carry;
    
    p_result[1 + NUM_ECC_DIGITS] = vli_add(p_result + 1, p_result + 1, p_right); /* add the 2^32 multiple */
}
00750 
00751 /* Computes p_result = p_product % curve_p  */
00752 static void vli_mmod_fast(uint32_t *p_result, const uint32_t *p_product)
00753 {
00754     uint32_t l_tmp[2*NUM_ECC_DIGITS];
00755      
00756     while(!vli_isZero(p_product + NUM_ECC_DIGITS)) /* While c1 != 0 */
00757     {
00758         uint32_t l_carry = 0;
00759         uint i;
00760         
00761         vli_clear(l_tmp);
00762         vli_clear(l_tmp + NUM_ECC_DIGITS);
00763         omega_mult(l_tmp, p_product + NUM_ECC_DIGITS); /* tmp = w * c1 */
00764         vli_clear(p_product + NUM_ECC_DIGITS); /* p = c0 */
00765         
00766         /* (c1, c0) = c0 + w * c1 */
00767         for(i=0; i<NUM_ECC_DIGITS+2; ++i)
00768         {
00769             uint32_t l_sum = p_product[i] + l_tmp[i] + l_carry;
00770             if(l_sum != p_product[i])
00771             {
00772                 l_carry = (l_sum < p_product[i]);
00773             }
00774             p_product[i] = l_sum;
00775         }
00776     }
00777     
00778     while(vli_cmp(p_product, curve_p) > 0)
00779     {
00780         vli_sub(p_product, p_product, curve_p);
00781     }
00782     vli_set(p_result, p_product);
00783 }
00784 
00785 #endif
00786 
00787 /* Computes p_result = (p_left * p_right) % curve_p. */
00788 static void vli_modMult_fast(uint32_t *p_result, const uint32_t *p_left, const uint32_t *p_right)
00789 {
00790    uint32_t l_product[2 * NUM_ECC_DIGITS];
00791    vli_mult(l_product, p_left, p_right);
00792    vli_mmod_fast(p_result, l_product);
00793 }
00794 
00795 #if ECC_SQUARE_FUNC
00796 
/* Computes p_result = p_left^2 (double-width, 2*NUM_ECC_DIGITS result).
   Schoolbook squaring, but only the "upper triangle" of partial products is
   walked: for i != k-i the product p_left[i]*p_left[k-i] occurs twice in the
   square, so it is computed once and doubled.  Three implementations are
   selected at preprocessing time:
     - ARM / Thumb-2 inline assembly (umull-based),
     - Thumb-1 inline assembly (16x16 partial products, no umull available),
     - portable C fallback using 64-bit arithmetic. */
static void vli_square(uint32_t *p_result, const uint32_t *p_left)
{
#if (ECC_ASM == ecc_asm_thumb2 || ECC_ASM == ecc_asm_arm)
    /* (c0, c1, c2) form a 96-bit column accumulator; k is the output digit
       index, kept pre-scaled by 4 because it doubles as a byte offset. */
    uint32_t c0 = 0;
    uint32_t c1 = 0;
    uint32_t c2 = 0;
    uint32_t k = 0;
    uint32_t i, tt;
    uint32_t t0, t1;
    
    asm volatile (
        ".syntax unified \n\t"
        
        "1: \n\t" /* outer loop (k < NUM_ECC_DIGITS) */
        "movs %[i], #0 \n\t" /* i = 0 */
        "b 3f \n\t"
        
        "2: \n\t" /* outer loop (k >= NUM_ECC_DIGITS) */
        "movs %[i], %[k] \n\t"      /* i = k */
        "subs %[i], %[eccdm1] \n\t" /* i = k - (NUM_ECC_DIGITS - 1) (times 4) */
        
        "3: \n\t" /* inner loop */
        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
        
        "ldr %[t1], [%[left], %[tt]] \n\t" /* t1 = p_left[k-i] */
        "ldr %[t0], [%[left], %[i]] \n\t"  /* t0 = p_left[i] */
        
        "umull %[t0], %[t1], %[t0], %[t1] \n\t" /* (t0, t1) = p_left[i] * p_right[k-i] */
        
        "cmp %[i], %[tt] \n\t"   /* (i < k-i) ? */
        "bge 4f \n\t" /* if i >= k-i, skip */
        "lsls %[t1], #1 \n\t"    /* high word << 1 */
        "adc %[c2], #0 \n\t"     /* add carry bit to c2 */
        "lsls %[t0], #1 \n\t"       /* low word << 1 */
        "adc %[t1], #0 \n\t"     /* add carry bit to high word */
        
        "4: \n\t"

        "adds %[c0], %[t0] \n\t" /* add low word to c0 */
        "adcs %[c1], %[t1] \n\t" /* add high word to c1, including carry */
        "adc %[c2], #0 \n\t"     /* add carry to c2 */
        
        "adds %[i], #4 \n\t"          /* i += 4 */
        "cmp %[i], %[k] \n\t"         /* i <= k? */
        "bge 5f \n\t" /* if not, exit the loop */
        "subs %[tt], %[k], %[i] \n\t" /* tt = k-i */
        "cmp %[i], %[tt] \n\t"        /* i <= k-i? */
        "ble 3b \n\t" /* if so, continue looping */
        
        "5: \n\t" /* end inner loop */
        
        "str %[c0], [%[result], %[k]] \n\t" /* p_result[k] = c0 */
        "mov %[c0], %[c1] \n\t"     /* c0 = c1 */
        "mov %[c1], %[c2] \n\t"     /* c1 = c2 */
        "movs %[c2], #0 \n\t"       /* c2 = 0 */
        "adds %[k], #4 \n\t"        /* k += 4 */
        "cmp %[k], %[eccd] \n\t"    /* k < NUM_ECC_DIGITS (times 4) ? */
        "blt 1b \n\t" /* if not, loop back, start with i = 0 */
        "cmp %[k], %[eccd2m1] \n\t" /* k < NUM_ECC_DIGITS * 2 - 1 (times 4) ? */
        "blt 2b \n\t" /* if not, loop back, start with i = (k+1) - NUM_ECC_DIGITS */
        /* end outer loop */
        
        "str %[c0], [%[result], %[k]] \n\t" /* p_result[NUM_ECC_DIGITS * 2 - 1] = c0 */
    #if (ECC_ASM != ecc_asm_thumb2)
        ".syntax divided \n\t"
    #endif
        : [c0] "+r" (c0), [c1] "+r" (c1), [c2] "+r" (c2), [k] "+r" (k), [i] "=&r" (i), [tt] "=&r" (tt), [t0] "=&r" (t0), [t1] "=&r" (t1)
        : [result] "r" (p_result), [left] "r" (p_left),
          [eccd] "I" (NUM_ECC_DIGITS * 4), [eccdm1] "I" ((NUM_ECC_DIGITS-1) * 4), [eccd2m1] "I" ((NUM_ECC_DIGITS * 2 - 1) * 4)
        : "cc", "memory"
    );
    
#elif (ECC_ASM == ecc_asm_thumb)

    /* Pin the operands in r0/r1 so the hand-written Thumb-1 code below can
       reference them directly.
       NOTE(review): this assignment drops the const qualifier from p_left —
       compilers warn here; the assembly only ever reads through r1. */
    register uint32_t *r0 asm("r0") = p_result;
    register uint32_t *r1 asm("r1") = p_left;
    
    asm volatile (
        ".syntax unified \n\t"
        "movs r2, #0 \n\t" /* c0 = 0 */
        "movs r3, #0 \n\t" /* c1 = 0 */
        "movs r4, #0 \n\t" /* c2 = 0 */
        "movs r5, #0 \n\t" /* k = 0 */
        
        "push {r0} \n\t" /* keep p_result on the stack */
        
        "1: \n\t" /* outer loop (k < NUM_ECC_DIGITS) */
        "movs r6, #0 \n\t" /* r6 = i = 0 */
        "b 3f \n\t"
        
        "2: \n\t" /* outer loop (k >= NUM_ECC_DIGITS) */
        "movs r6, r5 \n\t"        /* r6 = k */
        "subs r6, %[eccdm1] \n\t" /* r6 = i = k - (NUM_ECC_DIGITS - 1) (times 4) */
        
        "3: \n\t" /* inner loop */
        "push {r2, r3, r4, r5} \n\t" /* push things, r2 (c0) is at the top of stack. */
        "subs r7, r5, r6 \n\t"       /* r7 = k-i */
        
        "ldr r3, [r1, r7] \n\t" /* r3 = p_left[k-i] */
        "ldr r0, [r1, r6] \n\t" /* r0 = p_left[i] */
        
        /* 32x32 -> 64 multiply built from four 16x16 products (no umull on Thumb-1). */
        "lsrs r2, r0, #16 \n\t" /* r2 = a1 */
        "uxth r0, r0 \n\t"      /* r0 = a0 */
        
        "lsrs r4, r3, #16 \n\t" /* r4 = b1 */
        "uxth r3, r3 \n\t"      /* r3 = b0 */
        
        "movs r5, r2 \n\t"     /* r5 = a1 */
        "muls r5, r4, r5 \n\t" /* r5 = a1*b1 */
        "muls r2, r3, r2 \n\t" /* r2 = b0*a1 */
        "muls r4, r0, r4 \n\t" /* r4 = a0*b1 */
        "muls r0, r3, r0 \n\t" /* r0 = a0*b0 */
        
        "movs r3, #0 \n\t"  /* r3 = 0 */
        "adds r2, r4 \n\t"  /* r2 = b0*a1 + a0*b1 */
        "adcs r3, r3 \n\t"  /* r3 = carry */
        "lsls r3, #16 \n\t" /* r3 = carry << 16 */
        "adds r5, r3 \n\t"  /* r5 = a1*b1 + carry */
        
        "lsls r3, r2, #16 \n\t" /* r3 = (b0*a1 + a0*b1) << 16 */
        "lsrs r2, #16 \n\t"     /* r2 = (b0*a1 + a0*b1) >> 16 */
        "adds r0, r3 \n\t"      /* r0 = low word = a0*b0 + ((b0*a1 + a0*b1) << 16) */
        "adcs r5, r2 \n\t"      /* r5 = high word = a1*b1 + carry + ((b0*a1 + a0*b1) >> 16) */
    
        "movs r3, #0 \n\t"  /* r3 = 0 */
        "cmp r6, r7 \n\t"   /* (i < k-i) ? */
        "mov r7, r3 \n\t"   /* r7 = 0 (does not affect condition)*/
        "bge 4f \n\t" /* if i >= k-i, skip */
        "lsls r5, #1 \n\t"  /* high word << 1 */
        "adcs r7, r3 \n\t"  /* r7 = carry bit for c2 */
        "lsls r0, #1 \n\t"  /* low word << 1 */
        "adcs r5, r3 \n\t"  /* add carry from shift to high word */
        
        "4: \n\t"
        "pop {r2, r3, r4} \n\t" /* r2 = c0, r3 = c1, r4 = c2 */
        "adds r2, r0 \n\t"      /* add low word to c0 */
        "adcs r3, r5 \n\t"      /* add high word to c1, including carry */
        "movs r0, #0 \n\t"      /* r0 = 0 (does not affect carry bit) */
        "adcs r4, r0 \n\t"      /* add carry to c2 */
        "adds r4, r7 \n\t"      /* add carry from doubling (if any) */
        
        "pop {r5} \n\t" /* r5 = k */
        
        "adds r6, #4 \n\t"     /* i += 4 */
        "cmp r6, r5 \n\t"      /* i <= k? */
        "bge 5f \n\t" /* if not, exit the loop */
        "subs r7, r5, r6 \n\t" /* r7 = k-i */
        "cmp r6, r7 \n\t"      /* i <= k-i? */
        "ble 3b \n\t" /* if so, continue looping */
        
        "5: \n\t" /* end inner loop */
        
        "ldr r0, [sp, #0] \n\t" /* r0 = p_result */
        
        "str r2, [r0, r5] \n\t"   /* p_result[k] = c0 */
        "mov r2, r3 \n\t"         /* c0 = c1 */
        "mov r3, r4 \n\t"         /* c1 = c2 */
        "movs r4, #0 \n\t"        /* c2 = 0 */
        "adds r5, #4 \n\t"        /* k += 4 */
        "cmp r5, %[eccd] \n\t"    /* k < NUM_ECC_DIGITS (times 4) ? */
        "blt 1b \n\t" /* if not, loop back, start with i = 0 */
        "cmp r5, %[eccd2m1] \n\t" /* k < NUM_ECC_DIGITS * 2 - 1 (times 4) ? */
        "blt 2b \n\t" /* if not, loop back, start with i = (k+1) - NUM_ECC_DIGITS */
        /* end outer loop */
        
        "str r2, [r0, r5] \n\t" /* p_result[NUM_ECC_DIGITS * 2 - 1] = c0 */
        "pop {r0} \n\t"         /* pop p_result off the stack */

        ".syntax divided \n\t"
        : [r0] "+l" (r0), [r1] "+l" (r1)
        : [eccd] "I" (NUM_ECC_DIGITS * 4), [eccdm1] "I" ((NUM_ECC_DIGITS-1) * 4), [eccd2m1] "I" ((NUM_ECC_DIGITS * 2 - 1) * 4)
        : "r2", "r3", "r4", "r5", "r6", "r7", "cc", "memory"
    );

#else

    /* Portable C fallback: r01 is a 64-bit column accumulator, r2 catches the
       bits that overflow past 64 (the doubled cross products can carry high). */
    uint64_t r01 = 0;
    uint32_t r2 = 0;
    
    uint i, k;
    for(k=0; k < NUM_ECC_DIGITS*2 - 1; ++k)
    {
        uint l_min = (k < NUM_ECC_DIGITS ? 0 : (k + 1) - NUM_ECC_DIGITS);
        for(i=l_min; i<=k && i<=k-i; ++i)
        {
            uint64_t l_product = (uint64_t)p_left[i] * p_left[k-i];
            if(i < k-i)
            {
                /* Cross product appears twice in the square: double it and
                   keep the bit shifted out of the top in r2. */
                r2 += l_product >> 63;
                l_product *= 2;
            }
            r01 += l_product;
            r2 += (r01 < l_product); /* carry out of the 64-bit accumulator */
        }
        p_result[k] = (uint32_t)r01;
        r01 = (r01 >> 32) | (((uint64_t)r2) << 32);
        r2 = 0;
    }
    
    p_result[NUM_ECC_DIGITS*2 - 1] = (uint32_t)r01;
#endif
}
01000 
01001 /* Computes p_result = p_left^2 % curve_p. */
01002 static void vli_modSquare_fast(uint32_t *p_result, const uint32_t *p_left)
01003 {
01004     uint32_t l_product[2 * NUM_ECC_DIGITS];
01005     vli_square(l_product, p_left);
01006     vli_mmod_fast(p_result, l_product);
01007 }
01008 
01009 #else /* ECC_SQUARE_FUNC */
01010 
01011 #define vli_square(result, left, size) vli_mult((result), (left), (left), (size))
01012 #define vli_modSquare_fast(result, left) vli_modMult_fast((result), (left), (left))
01013     
01014 #endif /* ECC_SQUARE_FUNC */
01015 
/* Nonzero if the VLI (little-endian digit array) is even.  The argument is
   parenthesized so expression arguments (e.g. EVEN(p + 1)) expand correctly;
   the previous definition expanded `vli[0]` unparenthesized. */
#define EVEN(vli) (!((vli)[0] & 1))
/* Computes p_result = (1 / p_input) % p_mod. All VLIs are the same size.
   Binary extended Euclidean algorithm (right-shift variant): (a, b) start as
   (p_input, p_mod) and are reduced while (u, v) track the corresponding
   modular cofactors, so a * x == u (mod p_mod) style invariants hold until
   a == b == gcd, at which point u is the inverse.
   See "From Euclid's GCD to Montgomery Multiplication to the Great Divide"
   https://labs.oracle.com/techrep/2001/smli_tr-2001-95.pdf */
static void vli_modInv(uint32_t *p_result, const uint32_t *p_input, const uint32_t *p_mod)
{
    uint32_t a[NUM_ECC_DIGITS], b[NUM_ECC_DIGITS], u[NUM_ECC_DIGITS], v[NUM_ECC_DIGITS];
    uint32_t l_carry;
    
    vli_set(a, p_input);
    vli_set(b, p_mod);
    vli_clear(u);
    u[0] = 1;
    vli_clear(v);
    
    int l_cmpResult;
    while((l_cmpResult = vli_cmp(a, b)) != 0)
    {
        l_carry = 0;
        if(EVEN(a))
        {
            /* a even: halve a and halve u (mod p_mod).  If u is odd it is
               first made even by adding p_mod; the carry out of that addition
               becomes the top bit after the shift. */
            vli_rshift1(a);
            if(!EVEN(u))
            {
                l_carry = vli_add(u, u, p_mod);
            }
            vli_rshift1(u);
            if(l_carry)
            {
                u[NUM_ECC_DIGITS-1] |= 0x80000000;
            }
        }
        else if(EVEN(b))
        {
            /* Same halving step applied to (b, v). */
            vli_rshift1(b);
            if(!EVEN(v))
            {
                l_carry = vli_add(v, v, p_mod);
            }
            vli_rshift1(v);
            if(l_carry)
            {
                v[NUM_ECC_DIGITS-1] |= 0x80000000;
            }
        }
        else if(l_cmpResult > 0)
        {
            /* Both odd, a > b: a = (a - b) / 2, u = (u - v) / 2 (mod p_mod). */
            vli_sub(a, a, b);
            vli_rshift1(a);
            if(vli_cmp(u, v) < 0)
            {
                vli_add(u, u, p_mod); /* avoid underflow in u - v */
            }
            vli_sub(u, u, v);
            if(!EVEN(u))
            {
                l_carry = vli_add(u, u, p_mod);
            }
            vli_rshift1(u);
            if(l_carry)
            {
                u[NUM_ECC_DIGITS-1] |= 0x80000000;
            }
        }
        else
        {
            /* Both odd, b > a: mirror case, b = (b - a) / 2, v = (v - u) / 2. */
            vli_sub(b, b, a);
            vli_rshift1(b);
            if(vli_cmp(v, u) < 0)
            {
                vli_add(v, v, p_mod);
            }
            vli_sub(v, v, u);
            if(!EVEN(v))
            {
                l_carry = vli_add(v, v, p_mod);
            }
            vli_rshift1(v);
            if(l_carry)
            {
                v[NUM_ECC_DIGITS-1] |= 0x80000000;
            }
        }
    }
    
    vli_set(p_result, u);
}
01103 
01104 /* ------ Point operations ------ */
01105 
01106 /* Returns 1 if p_point is the point at infinity, 0 otherwise. */
01107 static int EccPoint_isZero(const EccPoint *p_point)
01108 {
01109     return (vli_isZero(p_point->x) && vli_isZero(p_point->y));
01110 }
01111 
01112 /* Point multiplication algorithm using Montgomery's ladder with co-Z coordinates.
01113 From http://eprint.iacr.org/2011/338.pdf
01114 */
01115 
01116 /* Double in place */
01117 #if (ECC_CURVE == secp256k1)
/* In-place point doubling in Jacobian coordinates for secp256k1 (curve a = 0):
   (X1, Y1, Z1) <- 2*(X1, Y1, Z1).  The division by 2 of 3*x1^2 is done by
   adding curve_p first when the value is odd, then shifting right. */
static void EccPoint_double_jacobian(uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    /* t1 = X, t2 = Y, t3 = Z */
    uint32_t t4[NUM_ECC_DIGITS];
    uint32_t t5[NUM_ECC_DIGITS];
    
    if(vli_isZero(Z1))
    {
        return; /* doubling the point at infinity is a no-op */
    }
    
    vli_modSquare_fast(t5, Y1);   /* t5 = y1^2 */
    vli_modMult_fast(t4, X1, t5); /* t4 = x1*y1^2 = A */
    vli_modSquare_fast(X1, X1);   /* t1 = x1^2 */
    vli_modSquare_fast(t5, t5);   /* t5 = y1^4 */
    vli_modMult_fast(Z1, Y1, Z1); /* t3 = y1*z1 = z3 */
    
    vli_modAdd(Y1, X1, X1, curve_p); /* t2 = 2*x1^2 */
    vli_modAdd(Y1, Y1, X1, curve_p); /* t2 = 3*x1^2 */
    if(vli_testBit(Y1, 0))
    {
        /* odd: add p (making the value even) before halving; the carry out
           of the addition becomes the top bit after the shift. */
        uint32_t l_carry = vli_add(Y1, Y1, curve_p);
        vli_rshift1(Y1);
        Y1[NUM_ECC_DIGITS-1] |= l_carry << 31;
    }
    else
    {
        vli_rshift1(Y1);
    }
    /* t2 = 3/2*(x1^2) = B */
    
    vli_modSquare_fast(X1, Y1);   /* t1 = B^2 */
    vli_modSub(X1, X1, t4, curve_p); /* t1 = B^2 - A */
    vli_modSub(X1, X1, t4, curve_p); /* t1 = B^2 - 2A = x3 */
    
    vli_modSub(t4, t4, X1, curve_p); /* t4 = A - x3 */
    vli_modMult_fast(Y1, Y1, t4);    /* t2 = B * (A - x3) */
    vli_modSub(Y1, Y1, t5, curve_p); /* t2 = B * (A - x3) - y1^4 = y3 */
}
01157 #else
/* In-place point doubling in Jacobian coordinates for curves with a = -3
   (the NIST curves): (X1, Y1, Z1) <- 2*(X1, Y1, Z1).
   Uses the a = -3 identity 3*(x1^2 - z1^4) = 3*(x1 - z1^2)*(x1 + z1^2). */
static void EccPoint_double_jacobian(uint32_t *X1, uint32_t *Y1, uint32_t *Z1)
{
    /* t1 = X, t2 = Y, t3 = Z */
    uint32_t t4[NUM_ECC_DIGITS];
    uint32_t t5[NUM_ECC_DIGITS];
    
    if(vli_isZero(Z1))
    {
        return; /* doubling the point at infinity is a no-op */
    }
    
    vli_modSquare_fast(t4, Y1);   /* t4 = y1^2 */
    vli_modMult_fast(t5, X1, t4); /* t5 = x1*y1^2 = A */
    vli_modSquare_fast(t4, t4);   /* t4 = y1^4 */
    vli_modMult_fast(Y1, Y1, Z1); /* t2 = y1*z1 = z3 */
    vli_modSquare_fast(Z1, Z1);   /* t3 = z1^2 */
    
    vli_modAdd(X1, X1, Z1, curve_p); /* t1 = x1 + z1^2 */
    vli_modAdd(Z1, Z1, Z1, curve_p); /* t3 = 2*z1^2 */
    vli_modSub(Z1, X1, Z1, curve_p); /* t3 = x1 - z1^2 */
    vli_modMult_fast(X1, X1, Z1);    /* t1 = x1^2 - z1^4 */
    
    vli_modAdd(Z1, X1, X1, curve_p); /* t3 = 2*(x1^2 - z1^4) */
    vli_modAdd(X1, X1, Z1, curve_p); /* t1 = 3*(x1^2 - z1^4) */
    if(vli_testBit(X1, 0))
    {
        /* odd: add p (making the value even) before halving; the carry out
           of the addition becomes the top bit after the shift. */
        uint32_t l_carry = vli_add(X1, X1, curve_p);
        vli_rshift1(X1);
        X1[NUM_ECC_DIGITS-1] |= l_carry << 31;
    }
    else
    {
        vli_rshift1(X1);
    }
    /* t1 = 3/2*(x1^2 - z1^4) = B */
    
    vli_modSquare_fast(Z1, X1);      /* t3 = B^2 */
    vli_modSub(Z1, Z1, t5, curve_p); /* t3 = B^2 - A */
    vli_modSub(Z1, Z1, t5, curve_p); /* t3 = B^2 - 2A = x3 */
    vli_modSub(t5, t5, Z1, curve_p); /* t5 = A - x3 */
    vli_modMult_fast(X1, X1, t5);    /* t1 = B * (A - x3) */
    vli_modSub(t4, X1, t4, curve_p); /* t4 = B * (A - x3) - y1^4 = y3 */
    
    /* Shuffle results back into the caller's (X, Y, Z) registers. */
    vli_set(X1, Z1);
    vli_set(Z1, Y1);
    vli_set(Y1, t4);
}
01205 #endif
01206 
01207 /* Modify (x1, y1) => (x1 * z^2, y1 * z^3) */
01208 static void apply_z(uint32_t *X1, uint32_t *Y1, uint32_t *Z)
01209 {
01210     uint32_t t1[NUM_ECC_DIGITS];
01211 
01212     vli_modSquare_fast(t1, Z);    /* z^2 */
01213     vli_modMult_fast(X1, X1, t1); /* x1 * z^2 */
01214     vli_modMult_fast(t1, t1, Z);  /* z^3 */
01215     vli_modMult_fast(Y1, Y1, t1); /* y1 * z^3 */
01216 }
01217 
01218 /* P = (x1, y1) => 2P, (x2, y2) => P' */
01219 static void XYcZ_initial_double(uint32_t *X1, uint32_t *Y1, uint32_t *X2, uint32_t *Y2, const uint32_t *p_initialZ)
01220 {
01221     uint32_t z[NUM_ECC_DIGITS];
01222     
01223     vli_set(X2, X1);
01224     vli_set(Y2, Y1);
01225     
01226     vli_clear(z);
01227     z[0] = 1;
01228     if(p_initialZ)
01229     {
01230         vli_set(z, p_initialZ);
01231     }
01232     apply_z(X1, Y1, z);
01233     
01234     EccPoint_double_jacobian(X1, Y1, z);
01235     
01236     apply_z(X2, Y2, z);
01237 }
01238 
/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
   Output P' = (x1', y1', Z3), P + Q = (x3, y3, Z3)
   or P => P', Q => P + Q
   Co-Z addition: both inputs share the same implicit Z; the updated common Z3
   is not produced here — callers track it separately. */
static void XYcZ_add(uint32_t *X1, uint32_t *Y1, uint32_t *X2, uint32_t *Y2)
{
    /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
    uint32_t t5[NUM_ECC_DIGITS];
    
    vli_modSub(t5, X2, X1, curve_p); /* t5 = x2 - x1 */
    vli_modSquare_fast(t5, t5);      /* t5 = (x2 - x1)^2 = A */
    vli_modMult_fast(X1, X1, t5);    /* t1 = x1*A = B */
    vli_modMult_fast(X2, X2, t5);    /* t3 = x2*A = C */
    vli_modSub(Y2, Y2, Y1, curve_p); /* t4 = y2 - y1 */
    vli_modSquare_fast(t5, Y2);      /* t5 = (y2 - y1)^2 = D */
    
    vli_modSub(t5, t5, X1, curve_p); /* t5 = D - B */
    vli_modSub(t5, t5, X2, curve_p); /* t5 = D - B - C = x3 */
    vli_modSub(X2, X2, X1, curve_p); /* t3 = C - B */
    vli_modMult_fast(Y1, Y1, X2);    /* t2 = y1*(C - B) */
    vli_modSub(X2, X1, t5, curve_p); /* t3 = B - x3 */
    vli_modMult_fast(Y2, Y2, X2);    /* t4 = (y2 - y1)*(B - x3) */
    vli_modSub(Y2, Y2, Y1, curve_p); /* t4 = y3 */
    
    vli_set(X2, t5); /* move x3 into the Q slot */
}
01265 
/* Input P = (x1, y1, Z), Q = (x2, y2, Z)
   Output P + Q = (x3, y3, Z3), P - Q = (x3', y3', Z3)
   or P => P - Q, Q => P + Q
   "Conjugate" co-Z addition: produces both the sum and the difference of the
   two co-Z inputs, which is exactly what each Montgomery ladder step needs. */
static void XYcZ_addC(uint32_t *X1, uint32_t *Y1, uint32_t *X2, uint32_t *Y2)
{
    /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */
    uint32_t t5[NUM_ECC_DIGITS];
    uint32_t t6[NUM_ECC_DIGITS];
    uint32_t t7[NUM_ECC_DIGITS];
    
    vli_modSub(t5, X2, X1, curve_p); /* t5 = x2 - x1 */
    vli_modSquare_fast(t5, t5);      /* t5 = (x2 - x1)^2 = A */
    vli_modMult_fast(X1, X1, t5);    /* t1 = x1*A = B */
    vli_modMult_fast(X2, X2, t5);    /* t3 = x2*A = C */
    vli_modAdd(t5, Y2, Y1, curve_p); /* t5 = y2 + y1 */
    vli_modSub(Y2, Y2, Y1, curve_p); /* t4 = y2 - y1 */

    vli_modSub(t6, X2, X1, curve_p); /* t6 = C - B */
    vli_modMult_fast(Y1, Y1, t6);    /* t2 = y1 * (C - B) */
    vli_modAdd(t6, X1, X2, curve_p); /* t6 = B + C */
    vli_modSquare_fast(X2, Y2);      /* t3 = (y2 - y1)^2 */
    vli_modSub(X2, X2, t6, curve_p); /* t3 = x3 */
    
    vli_modSub(t7, X1, X2, curve_p); /* t7 = B - x3 */
    vli_modMult_fast(Y2, Y2, t7);    /* t4 = (y2 - y1)*(B - x3) */
    vli_modSub(Y2, Y2, Y1, curve_p); /* t4 = y3 */
    
    vli_modSquare_fast(t7, t5);      /* t7 = (y2 + y1)^2 = F */
    vli_modSub(t7, t7, t6, curve_p); /* t7 = x3' */
    vli_modSub(t6, t7, X1, curve_p); /* t6 = x3' - B */
    vli_modMult_fast(t6, t6, t5);    /* t6 = (y2 + y1)*(x3' - B) */
    vli_modSub(Y1, t6, Y1, curve_p); /* t2 = y3' */
    
    vli_set(X1, t7); /* move x3' into the P slot */
}
01302 
/* Computes p_result = p_scalar * p_point using a co-Z Montgomery ladder
   (one conjugate-add plus one add per scalar bit, independent of bit values,
   which helps against simple side-channel analysis).  p_initialZ, when
   non-NULL, seeds the Z coordinate (coordinate randomization).
   From http://eprint.iacr.org/2011/338.pdf */
static void EccPoint_mult(EccPoint *p_result, const EccPoint *p_point,
    const uint32_t *p_scalar, const uint32_t *p_initialZ)
{
    /* R0 and R1 */
    uint32_t Rx[2][NUM_ECC_DIGITS];
    uint32_t Ry[2][NUM_ECC_DIGITS];
    uint32_t z[NUM_ECC_DIGITS];
    
    int i, nb;
    
    vli_set(Rx[1], p_point->x);
    vli_set(Ry[1], p_point->y);

    XYcZ_initial_double(Rx[1], Ry[1], Rx[0], Ry[0], p_initialZ);

    /* Ladder over the scalar bits below the (implicit) top bit. */
    for(i = vli_numBits(p_scalar) - 2; i > 0; --i)
    {
        nb = !vli_testBit(p_scalar, i);
        XYcZ_addC(Rx[1-nb], Ry[1-nb], Rx[nb], Ry[nb]);
        XYcZ_add(Rx[nb], Ry[nb], Rx[1-nb], Ry[1-nb]);
    }

    nb = !vli_testBit(p_scalar, 0);
    XYcZ_addC(Rx[1-nb], Ry[1-nb], Rx[nb], Ry[nb]);
    
    /* Find final 1/Z value (recovered from the known affine base point). */
    vli_modSub(z, Rx[1], Rx[0], curve_p); /* X1 - X0 */
    vli_modMult_fast(z, z, Ry[1-nb]);     /* Yb * (X1 - X0) */
    vli_modMult_fast(z, z, p_point->x);   /* xP * Yb * (X1 - X0) */
    vli_modInv(z, z, curve_p);            /* 1 / (xP * Yb * (X1 - X0)) */
    vli_modMult_fast(z, z, p_point->y);   /* yP / (xP * Yb * (X1 - X0)) */
    vli_modMult_fast(z, z, Rx[1-nb]);     /* Xb * yP / (xP * Yb * (X1 - X0)) */
    /* End 1/Z calculation */

    XYcZ_add(Rx[nb], Ry[nb], Rx[1-nb], Ry[1-nb]);
    
    apply_z(Rx[0], Ry[0], z); /* convert the result back to affine */
    
    vli_set(p_result->x, Rx[0]);
    vli_set(p_result->y, Ry[0]);
}
01344 
01345 int ecc_make_key(EccPoint *p_publicKey, uint32_t p_privateKey[NUM_ECC_DIGITS],
01346     const uint32_t p_random[NUM_ECC_DIGITS])
01347 {
01348     /* Make sure the private key is in the range [1, n-1].
01349        For the supported curves, n is always large enough that we only need to subtract once at most. */
01350     vli_set(p_privateKey, p_random);
01351     if(vli_cmp(curve_n, p_privateKey) != 1)
01352     {
01353         vli_sub(p_privateKey, p_privateKey, curve_n);
01354     }
01355     
01356     if(vli_isZero(p_privateKey))
01357     {
01358         return 0; /* The private key cannot be 0 (mod p). */
01359     }
01360     
01361     EccPoint_mult(p_publicKey, &curve_G, p_privateKey, NULL);
01362     return 1;
01363 }
01364 
01365 #if (ECC_CURVE == secp256k1)
01366 /* Compute p_result = x^3 + b */
01367 static void curve_x_side(uint32_t p_result[NUM_ECC_DIGITS], const uint32_t x[NUM_ECC_DIGITS])
01368 {
01369     vli_modSquare_fast(p_result, x); /* r = x^2 */
01370     vli_modMult_fast(p_result, p_result, x); /* r = x^3 */
01371     vli_modAdd(p_result, p_result, curve_b, curve_p); /* r = x^3 + b */
01372 }
01373 #else
01374 /* Compute p_result = x^3 - 3x + b */
01375 static void curve_x_side(uint32_t p_result[NUM_ECC_DIGITS], const uint32_t x[NUM_ECC_DIGITS])
01376 {
01377     uint32_t _3[NUM_ECC_DIGITS] = {3}; /* -a = 3 */
01378     
01379     vli_modSquare_fast(p_result, x); /* r = x^2 */
01380     vli_modSub(p_result, p_result, _3, curve_p); /* r = x^2 - 3 */
01381     vli_modMult_fast(p_result, p_result, x); /* r = x^3 - 3x */
01382     vli_modAdd(p_result, p_result, curve_b, curve_p); /* r = x^3 - 3x + b */
01383 }
01384 #endif
01385 
01386 int ecc_valid_public_key(const EccPoint *p_publicKey)
01387 {
01388     uint32_t l_tmp1[NUM_ECC_DIGITS];
01389     uint32_t l_tmp2[NUM_ECC_DIGITS];
01390     
01391     if(EccPoint_isZero(p_publicKey))
01392     {
01393         return 0;
01394     }
01395     
01396     if(vli_cmp(curve_p, p_publicKey->x) != 1 || vli_cmp(curve_p, p_publicKey->y) != 1)
01397     {
01398         return 0;
01399     }
01400     
01401     vli_modSquare_fast(l_tmp1, p_publicKey->y); /* tmp1 = y^2 */
01402     
01403     curve_x_side(l_tmp2, p_publicKey->x); /* tmp2 = x^3 - 3x + b */
01404     
01405     /* Make sure that y^2 == x^3 + ax + b */
01406     if(vli_cmp(l_tmp1, l_tmp2) != 0)
01407     {
01408         return 0;
01409     }
01410     
01411     return 1;
01412 }
01413 
01414 int ecdh_shared_secret(uint32_t p_secret[NUM_ECC_DIGITS], const EccPoint *p_publicKey,
01415     const uint32_t p_privateKey[NUM_ECC_DIGITS], const uint32_t p_random[NUM_ECC_DIGITS])
01416 {
01417     EccPoint l_product;
01418     
01419     EccPoint_mult(&l_product, p_publicKey, p_privateKey, p_random);
01420     if(EccPoint_isZero(&l_product))
01421     {
01422         return 0;
01423     }
01424     
01425     vli_set(p_secret, l_product.x);
01426     
01427     return 1;
01428 }
01429 
01430 /* -------- ECDSA code -------- */
01431 
/* Computes p_result = (p_left * p_right) % p_mod.
   Generic (slow) reduction by repeated shift-and-subtract, used for moduli
   without a fast reduction (here: the group order n in the ECDSA code). */
static void vli_modMult(uint32_t *p_result, const uint32_t *p_left,
    const uint32_t *p_right, const uint32_t *p_mod)
{
    uint32_t l_product[2 * NUM_ECC_DIGITS];
    uint32_t l_modMultiple[2 * NUM_ECC_DIGITS];
    uint l_digitShift, l_bitShift;
    uint l_productBits;
    uint l_modBits = vli_numBits(p_mod);
    
    vli_mult(l_product, p_left, p_right);
    /* Bit length of the double-width product. */
    l_productBits = vli_numBits(l_product + NUM_ECC_DIGITS);
    if(l_productBits)
    {
        l_productBits += NUM_ECC_DIGITS * 32;
    }
    else
    {
        l_productBits = vli_numBits(l_product);
    }
    
    if(l_productBits < l_modBits)
    { /* l_product < p_mod. */
        vli_set(p_result, l_product);
        return;
    }
    
    /* Shift p_mod by (l_leftBits - l_modBits). This multiplies p_mod by the largest
       power of two possible while still resulting in a number less than p_left. */
    vli_clear(l_modMultiple);
    vli_clear(l_modMultiple + NUM_ECC_DIGITS);
    l_digitShift = (l_productBits - l_modBits) / 32;
    l_bitShift = (l_productBits - l_modBits) % 32;
    if(l_bitShift)
    {
        l_modMultiple[l_digitShift + NUM_ECC_DIGITS] = vli_lshift(l_modMultiple + l_digitShift, p_mod, l_bitShift);
    }
    else
    {
        vli_set(l_modMultiple + l_digitShift, p_mod);
    }

    /* Subtract all multiples of p_mod to get the remainder, halving the
       multiple each round (classic binary long division). */
    vli_clear(p_result);
    p_result[0] = 1; /* Use p_result as a temp var to store 1 (for subtraction) */
    while(l_productBits > NUM_ECC_DIGITS * 32 || vli_cmp(l_modMultiple, p_mod) >= 0)
    {
        int l_cmp = vli_cmp(l_modMultiple + NUM_ECC_DIGITS, l_product + NUM_ECC_DIGITS);
        if(l_cmp < 0 || (l_cmp == 0 && vli_cmp(l_modMultiple, l_product) <= 0))
        {
            /* l_modMultiple <= l_product: subtract it, propagating any borrow
               from the low half into the high half. */
            if(vli_sub(l_product, l_product, l_modMultiple))
            { /* borrow */
                vli_sub(l_product + NUM_ECC_DIGITS, l_product + NUM_ECC_DIGITS, p_result);
            }
            vli_sub(l_product + NUM_ECC_DIGITS, l_product + NUM_ECC_DIGITS, l_modMultiple + NUM_ECC_DIGITS);
        }
        /* Halve l_modMultiple (shift the double-width value right by 1). */
        uint32_t l_carry = (l_modMultiple[NUM_ECC_DIGITS] & 0x01) << 31;
        vli_rshift1(l_modMultiple + NUM_ECC_DIGITS);
        vli_rshift1(l_modMultiple);
        l_modMultiple[NUM_ECC_DIGITS-1] |= l_carry;
        
        --l_productBits;
    }
    vli_set(p_result, l_product);
}
01497 
/* Returns the larger of a and b. */
static inline unsigned int max(const unsigned int a, const unsigned int b)
{
    if(a > b)
    {
        return a;
    }
    return b;
}
01502 
/* Computes the ECDSA signature (r, s) of p_hash with p_privateKey, using the
   per-signature nonce p_random.  Returns 1 on success, 0 when p_random is
   unusable (zero, or yields r == 0) — the caller must retry with a fresh
   random number.
   NOTE(review): p_random must be secret and unique per signature; nonce reuse
   reveals the private key. */
int ecdsa_sign(uint32_t r[NUM_ECC_DIGITS], uint32_t s[NUM_ECC_DIGITS],
    const uint32_t p_privateKey[NUM_ECC_DIGITS], const uint32_t p_random[NUM_ECC_DIGITS],
    const uint32_t p_hash[NUM_ECC_DIGITS])
{
    uint32_t k[NUM_ECC_DIGITS];
    EccPoint p;
    
    if(vli_isZero(p_random))
    { /* The random number must not be 0. */
        return 0;
    }
    
    /* Reduce the nonce below n (one conditional subtraction suffices). */
    vli_set(k, p_random);
    if(vli_cmp(curve_n, k) != 1)
    {
        vli_sub(k, k, curve_n);
    }
    
    /* p = k * G */
    EccPoint_mult(&p, &curve_G, k, NULL);
    
    /* r = x1 (mod n) */
    vli_set(r, p.x);
    if(vli_cmp(curve_n, r) != 1)
    {
        vli_sub(r, r, curve_n);
    }
    if(vli_isZero(r))
    { /* If r == 0, fail (need a different random number). */
        return 0;
    }
    
    vli_modMult(s, r, p_privateKey, curve_n); /* s = r*d */
    vli_modAdd(s, p_hash, s, curve_n); /* s = e + r*d */
    vli_modInv(k, k, curve_n); /* k = 1 / k */
    vli_modMult(s, s, k, curve_n); /* s = (e + r*d) / k */
    
    return 1;
}
01542 
/* Verifies the ECDSA signature (r, s) over p_hash against p_publicKey.
   Returns 1 if valid, 0 otherwise.
   Uses Shamir's trick: u1*G + u2*Q is evaluated in a single double-and-add
   pass over the 4-entry table {O, G, Q, G+Q} indexed by the current bit pair
   of (u1, u2), instead of two separate scalar multiplications. */
int ecdsa_verify(const EccPoint *p_publicKey, const uint32_t p_hash[NUM_ECC_DIGITS],
    const uint32_t r[NUM_ECC_DIGITS], const uint32_t s[NUM_ECC_DIGITS])
{
    uint32_t u1[NUM_ECC_DIGITS], u2[NUM_ECC_DIGITS];
    uint32_t z[NUM_ECC_DIGITS];
    EccPoint l_sum;
    uint32_t rx[NUM_ECC_DIGITS];
    uint32_t ry[NUM_ECC_DIGITS];
    uint32_t tx[NUM_ECC_DIGITS];
    uint32_t ty[NUM_ECC_DIGITS];
    uint32_t tz[NUM_ECC_DIGITS];
    
    if(vli_isZero(r) || vli_isZero(s))
    { /* r, s must not be 0. */
        return 0;
    }
    
    if(vli_cmp(curve_n, r) != 1 || vli_cmp(curve_n, s) != 1)
    { /* r, s must be < n. */
        return 0;
    }

    /* Calculate u1 and u2. */
    vli_modInv(z, s, curve_n); /* Z = s^-1 */
    vli_modMult(u1, p_hash, z, curve_n); /* u1 = e/s */
    vli_modMult(u2, r, z, curve_n); /* u2 = r/s */
    
    /* Calculate l_sum = G + Q (the table entry used when both bits are set),
       via a co-Z add followed by conversion back to affine coordinates. */
    vli_set(l_sum.x, p_publicKey->x);
    vli_set(l_sum.y, p_publicKey->y);
    vli_set(tx, curve_G.x);
    vli_set(ty, curve_G.y);
    vli_modSub(z, l_sum.x, tx, curve_p); /* Z = x2 - x1 */
    XYcZ_add(tx, ty, l_sum.x, l_sum.y);
    vli_modInv(z, z, curve_p); /* Z = 1/Z */
    apply_z(l_sum.x, l_sum.y, z);
    
    /* Use Shamir's trick to calculate u1*G + u2*Q */
    const EccPoint *l_points[4] = {NULL, &curve_G, p_publicKey, &l_sum};
    uint l_numBits = max(vli_numBits(u1), vli_numBits(u2));
    
    /* Start from the table entry selected by the top bit pair. */
    const EccPoint *l_point = l_points[(!!vli_testBit(u1, l_numBits-1)) | ((!!vli_testBit(u2, l_numBits-1)) << 1)];
    vli_set(rx, l_point->x);
    vli_set(ry, l_point->y);
    vli_clear(z);
    z[0] = 1;

    int i;
    for(i = l_numBits - 2; i >= 0; --i)
    {
        EccPoint_double_jacobian(rx, ry, z);
        
        int l_index = (!!vli_testBit(u1, i)) | ((!!vli_testBit(u2, i)) << 1);
        l_point = l_points[l_index];
        if(l_point)
        {
            /* Add the selected (affine) table point; the running Z is updated
               by multiplying in the co-Z correction factor tz. */
            vli_set(tx, l_point->x);
            vli_set(ty, l_point->y);
            apply_z(tx, ty, z);
            vli_modSub(tz, rx, tx, curve_p); /* Z = x2 - x1 */
            XYcZ_add(tx, ty, rx, ry);
            vli_modMult_fast(z, z, tz);
        }
    }

    vli_modInv(z, z, curve_p); /* Z = 1/Z */
    apply_z(rx, ry, z);
    
    /* v = x1 (mod n) */
    if(vli_cmp(curve_n, rx) != 1)
    {
        vli_sub(rx, rx, curve_n);
    }

    /* Accept only if v == r. */
    return (vli_cmp(rx, r) == 0);
}
01620 
01621 void ecc_bytes2native(uint32_t p_native[NUM_ECC_DIGITS], const uint8_t p_bytes[NUM_ECC_DIGITS*4])
01622 {
01623     unsigned i;
01624     for(i=0; i<NUM_ECC_DIGITS; ++i)
01625     {
01626         uint8_t *p_digit = (uint8_t *)(p_bytes + 4 * (NUM_ECC_DIGITS - 1 - i));
01627         p_native[i] = ((uint32_t)p_digit[0] << 24) | ((uint32_t)p_digit[1] << 16) | ((uint32_t)p_digit[2] << 8) | (uint32_t)p_digit[3];
01628     }
01629 }
01630 
01631 void ecc_native2bytes(uint8_t p_bytes[NUM_ECC_DIGITS*4], const uint32_t p_native[NUM_ECC_DIGITS])
01632 {
01633     unsigned i;
01634     for(i=0; i<NUM_ECC_DIGITS; ++i)
01635     {
01636         uint8_t *p_digit = p_bytes + 4 * (NUM_ECC_DIGITS - 1 - i);
01637         p_digit[0] = p_native[i] >> 24;
01638         p_digit[1] = p_native[i] >> 16;
01639         p_digit[2] = p_native[i] >> 8;
01640         p_digit[3] = p_native[i];
01641     }
01642 }
01643 
01644 /* Compute a = sqrt(a) (mod curve_p). */
01645 static void mod_sqrt(uint32_t a[NUM_ECC_DIGITS])
01646 {
01647     unsigned i;
01648     uint32_t p1[NUM_ECC_DIGITS] = {1};
01649     uint32_t l_result[NUM_ECC_DIGITS] = {1};
01650     
01651     /* Since curve_p == 3 (mod 4) for all supported curves, we can
01652        compute sqrt(a) = a^((curve_p + 1) / 4) (mod curve_p). */
01653     vli_add(p1, curve_p, p1); /* p1 = curve_p + 1 */
01654     for(i = vli_numBits(p1) - 1; i > 1; --i)
01655     {
01656         vli_modSquare_fast(l_result, l_result);
01657         if(vli_testBit(p1, i))
01658         {
01659             vli_modMult_fast(l_result, l_result, a);
01660         }
01661     }
01662     vli_set(a, l_result);
01663 }
01664 
01665 void ecc_point_compress(uint8_t p_compressed[NUM_ECC_DIGITS*4 + 1], const EccPoint *p_point)
01666 {
01667     p_compressed[0] = 2 + (p_point->y[0] & 0x01);
01668     ecc_native2bytes(p_compressed + 1, p_point->x);
01669 }
01670 
01671 void ecc_point_decompress(EccPoint *p_point, const uint8_t p_compressed[NUM_ECC_DIGITS*4 + 1])
01672 {
01673     ecc_bytes2native(p_point->x, p_compressed + 1);
01674     curve_x_side(p_point->y, p_point->x);
01675     mod_sqrt(p_point->y);
01676     if((p_point->y[0] & 0x01) != (p_compressed[0] & 0x01))
01677     {
01678         vli_sub(p_point->y, curve_p, p_point->y);
01679     }
01680 }
01681