/*
 * mbed TLS library
 * Dependents: HTTPClient-SSL WS_SERVER
 * aesni.c
 */
00001 /* 00002 * AES-NI support functions 00003 * 00004 * Copyright (C) 2006-2014, ARM Limited, All Rights Reserved 00005 * 00006 * This file is part of mbed TLS (https://tls.mbed.org) 00007 * 00008 * This program is free software; you can redistribute it and/or modify 00009 * it under the terms of the GNU General Public License as published by 00010 * the Free Software Foundation; either version 2 of the License, or 00011 * (at your option) any later version. 00012 * 00013 * This program is distributed in the hope that it will be useful, 00014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00016 * GNU General Public License for more details. 00017 * 00018 * You should have received a copy of the GNU General Public License along 00019 * with this program; if not, write to the Free Software Foundation, Inc., 00020 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00021 */ 00022 00023 /* 00024 * [AES-WP] http://software.intel.com/en-us/articles/intel-advanced-encryption-standard-aes-instructions-set 00025 * [CLMUL-WP] http://software.intel.com/en-us/articles/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode/ 00026 */ 00027 00028 #if !defined(POLARSSL_CONFIG_FILE) 00029 #include "polarssl/config.h" 00030 #else 00031 #include POLARSSL_CONFIG_FILE 00032 #endif 00033 00034 #if defined(POLARSSL_AESNI_C) 00035 00036 #include "polarssl/aesni.h" 00037 00038 #include <string.h> 00039 00040 #if defined(POLARSSL_HAVE_X86_64) 00041 00042 /* 00043 * AES-NI support detection routine 00044 */ 00045 int aesni_supports( unsigned int what ) 00046 { 00047 static int done = 0; 00048 static unsigned int c = 0; 00049 00050 if( ! 
done ) 00051 { 00052 asm( "movl $1, %%eax \n\t" 00053 "cpuid \n\t" 00054 : "=c" (c) 00055 : 00056 : "eax", "ebx", "edx" ); 00057 done = 1; 00058 } 00059 00060 return( ( c & what ) != 0 ); 00061 } 00062 00063 /* 00064 * Binutils needs to be at least 2.19 to support AES-NI instructions. 00065 * Unfortunately, a lot of users have a lower version now (2014-04). 00066 * Emit bytecode directly in order to support "old" version of gas. 00067 * 00068 * Opcodes from the Intel architecture reference manual, vol. 3. 00069 * We always use registers, so we don't need prefixes for memory operands. 00070 * Operand macros are in gas order (src, dst) as opposed to Intel order 00071 * (dst, src) in order to blend better into the surrounding assembly code. 00072 */ 00073 #define AESDEC ".byte 0x66,0x0F,0x38,0xDE," 00074 #define AESDECLAST ".byte 0x66,0x0F,0x38,0xDF," 00075 #define AESENC ".byte 0x66,0x0F,0x38,0xDC," 00076 #define AESENCLAST ".byte 0x66,0x0F,0x38,0xDD," 00077 #define AESIMC ".byte 0x66,0x0F,0x38,0xDB," 00078 #define AESKEYGENA ".byte 0x66,0x0F,0x3A,0xDF," 00079 #define PCLMULQDQ ".byte 0x66,0x0F,0x3A,0x44," 00080 00081 #define xmm0_xmm0 "0xC0" 00082 #define xmm0_xmm1 "0xC8" 00083 #define xmm0_xmm2 "0xD0" 00084 #define xmm0_xmm3 "0xD8" 00085 #define xmm0_xmm4 "0xE0" 00086 #define xmm1_xmm0 "0xC1" 00087 #define xmm1_xmm2 "0xD1" 00088 00089 /* 00090 * AES-NI AES-ECB block en(de)cryption 00091 */ 00092 int aesni_crypt_ecb( aes_context *ctx, 00093 int mode, 00094 const unsigned char input[16], 00095 unsigned char output[16] ) 00096 { 00097 asm( "movdqu (%3), %%xmm0 \n\t" // load input 00098 "movdqu (%1), %%xmm1 \n\t" // load round key 0 00099 "pxor %%xmm1, %%xmm0 \n\t" // round 0 00100 "addq $16, %1 \n\t" // point to next round key 00101 "subl $1, %0 \n\t" // normal rounds = nr - 1 00102 "test %2, %2 \n\t" // mode? 
00103 "jz 2f \n\t" // 0 = decrypt 00104 00105 "1: \n\t" // encryption loop 00106 "movdqu (%1), %%xmm1 \n\t" // load round key 00107 AESENC xmm1_xmm0 "\n\t" // do round 00108 "addq $16, %1 \n\t" // point to next round key 00109 "subl $1, %0 \n\t" // loop 00110 "jnz 1b \n\t" 00111 "movdqu (%1), %%xmm1 \n\t" // load round key 00112 AESENCLAST xmm1_xmm0 "\n\t" // last round 00113 "jmp 3f \n\t" 00114 00115 "2: \n\t" // decryption loop 00116 "movdqu (%1), %%xmm1 \n\t" 00117 AESDEC xmm1_xmm0 "\n\t" // do round 00118 "addq $16, %1 \n\t" 00119 "subl $1, %0 \n\t" 00120 "jnz 2b \n\t" 00121 "movdqu (%1), %%xmm1 \n\t" // load round key 00122 AESDECLAST xmm1_xmm0 "\n\t" // last round 00123 00124 "3: \n\t" 00125 "movdqu %%xmm0, (%4) \n\t" // export output 00126 : 00127 : "r" (ctx->nr ), "r" (ctx->rk ), "r" (mode), "r" (input), "r" (output) 00128 : "memory", "cc", "xmm0", "xmm1" ); 00129 00130 00131 return( 0 ); 00132 } 00133 00134 /* 00135 * GCM multiplication: c = a times b in GF(2^128) 00136 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5. 00137 */ 00138 void aesni_gcm_mult( unsigned char c[16], 00139 const unsigned char a[16], 00140 const unsigned char b[16] ) 00141 { 00142 unsigned char aa[16], bb[16], cc[16]; 00143 size_t i; 00144 00145 /* The inputs are in big-endian order, so byte-reverse them */ 00146 for( i = 0; i < 16; i++ ) 00147 { 00148 aa[i] = a[15 - i]; 00149 bb[i] = b[15 - i]; 00150 } 00151 00152 asm( "movdqu (%0), %%xmm0 \n\t" // a1:a0 00153 "movdqu (%1), %%xmm1 \n\t" // b1:b0 00154 00155 /* 00156 * Caryless multiplication xmm2:xmm1 = xmm0 * xmm1 00157 * using [CLMUL-WP] algorithm 1 (p. 13). 
00158 */ 00159 "movdqa %%xmm1, %%xmm2 \n\t" // copy of b1:b0 00160 "movdqa %%xmm1, %%xmm3 \n\t" // same 00161 "movdqa %%xmm1, %%xmm4 \n\t" // same 00162 PCLMULQDQ xmm0_xmm1 ",0x00 \n\t" // a0*b0 = c1:c0 00163 PCLMULQDQ xmm0_xmm2 ",0x11 \n\t" // a1*b1 = d1:d0 00164 PCLMULQDQ xmm0_xmm3 ",0x10 \n\t" // a0*b1 = e1:e0 00165 PCLMULQDQ xmm0_xmm4 ",0x01 \n\t" // a1*b0 = f1:f0 00166 "pxor %%xmm3, %%xmm4 \n\t" // e1+f1:e0+f0 00167 "movdqa %%xmm4, %%xmm3 \n\t" // same 00168 "psrldq $8, %%xmm4 \n\t" // 0:e1+f1 00169 "pslldq $8, %%xmm3 \n\t" // e0+f0:0 00170 "pxor %%xmm4, %%xmm2 \n\t" // d1:d0+e1+f1 00171 "pxor %%xmm3, %%xmm1 \n\t" // c1+e0+f1:c0 00172 00173 /* 00174 * Now shift the result one bit to the left, 00175 * taking advantage of [CLMUL-WP] eq 27 (p. 20) 00176 */ 00177 "movdqa %%xmm1, %%xmm3 \n\t" // r1:r0 00178 "movdqa %%xmm2, %%xmm4 \n\t" // r3:r2 00179 "psllq $1, %%xmm1 \n\t" // r1<<1:r0<<1 00180 "psllq $1, %%xmm2 \n\t" // r3<<1:r2<<1 00181 "psrlq $63, %%xmm3 \n\t" // r1>>63:r0>>63 00182 "psrlq $63, %%xmm4 \n\t" // r3>>63:r2>>63 00183 "movdqa %%xmm3, %%xmm5 \n\t" // r1>>63:r0>>63 00184 "pslldq $8, %%xmm3 \n\t" // r0>>63:0 00185 "pslldq $8, %%xmm4 \n\t" // r2>>63:0 00186 "psrldq $8, %%xmm5 \n\t" // 0:r1>>63 00187 "por %%xmm3, %%xmm1 \n\t" // r1<<1|r0>>63:r0<<1 00188 "por %%xmm4, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1 00189 "por %%xmm5, %%xmm2 \n\t" // r3<<1|r2>>62:r2<<1|r1>>63 00190 00191 /* 00192 * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 00193 * using [CLMUL-WP] algorithm 5 (p. 20). 00194 * Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted). 
00195 */ 00196 /* Step 2 (1) */ 00197 "movdqa %%xmm1, %%xmm3 \n\t" // x1:x0 00198 "movdqa %%xmm1, %%xmm4 \n\t" // same 00199 "movdqa %%xmm1, %%xmm5 \n\t" // same 00200 "psllq $63, %%xmm3 \n\t" // x1<<63:x0<<63 = stuff:a 00201 "psllq $62, %%xmm4 \n\t" // x1<<62:x0<<62 = stuff:b 00202 "psllq $57, %%xmm5 \n\t" // x1<<57:x0<<57 = stuff:c 00203 00204 /* Step 2 (2) */ 00205 "pxor %%xmm4, %%xmm3 \n\t" // stuff:a+b 00206 "pxor %%xmm5, %%xmm3 \n\t" // stuff:a+b+c 00207 "pslldq $8, %%xmm3 \n\t" // a+b+c:0 00208 "pxor %%xmm3, %%xmm1 \n\t" // x1+a+b+c:x0 = d:x0 00209 00210 /* Steps 3 and 4 */ 00211 "movdqa %%xmm1,%%xmm0 \n\t" // d:x0 00212 "movdqa %%xmm1,%%xmm4 \n\t" // same 00213 "movdqa %%xmm1,%%xmm5 \n\t" // same 00214 "psrlq $1, %%xmm0 \n\t" // e1:x0>>1 = e1:e0' 00215 "psrlq $2, %%xmm4 \n\t" // f1:x0>>2 = f1:f0' 00216 "psrlq $7, %%xmm5 \n\t" // g1:x0>>7 = g1:g0' 00217 "pxor %%xmm4, %%xmm0 \n\t" // e1+f1:e0'+f0' 00218 "pxor %%xmm5, %%xmm0 \n\t" // e1+f1+g1:e0'+f0'+g0' 00219 // e0'+f0'+g0' is almost e0+f0+g0, ex\tcept for some missing 00220 // bits carried from d. Now get those\t bits back in. 
00221 "movdqa %%xmm1,%%xmm3 \n\t" // d:x0 00222 "movdqa %%xmm1,%%xmm4 \n\t" // same 00223 "movdqa %%xmm1,%%xmm5 \n\t" // same 00224 "psllq $63, %%xmm3 \n\t" // d<<63:stuff 00225 "psllq $62, %%xmm4 \n\t" // d<<62:stuff 00226 "psllq $57, %%xmm5 \n\t" // d<<57:stuff 00227 "pxor %%xmm4, %%xmm3 \n\t" // d<<63+d<<62:stuff 00228 "pxor %%xmm5, %%xmm3 \n\t" // missing bits of d:stuff 00229 "psrldq $8, %%xmm3 \n\t" // 0:missing bits of d 00230 "pxor %%xmm3, %%xmm0 \n\t" // e1+f1+g1:e0+f0+g0 00231 "pxor %%xmm1, %%xmm0 \n\t" // h1:h0 00232 "pxor %%xmm2, %%xmm0 \n\t" // x3+h1:x2+h0 00233 00234 "movdqu %%xmm0, (%2) \n\t" // done 00235 : 00236 : "r" (aa), "r" (bb), "r" (cc) 00237 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); 00238 00239 /* Now byte-reverse the outputs */ 00240 for( i = 0; i < 16; i++ ) 00241 c[i] = cc[15 - i]; 00242 00243 return; 00244 } 00245 00246 /* 00247 * Compute decryption round keys from encryption round keys 00248 */ 00249 void aesni_inverse_key( unsigned char *invkey, 00250 const unsigned char *fwdkey, int nr ) 00251 { 00252 unsigned char *ik = invkey; 00253 const unsigned char *fk = fwdkey + 16 * nr; 00254 00255 memcpy( ik, fk, 16 ); 00256 00257 for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 ) 00258 asm( "movdqu (%0), %%xmm0 \n\t" 00259 AESIMC xmm0_xmm0 "\n\t" 00260 "movdqu %%xmm0, (%1) \n\t" 00261 : 00262 : "r" (fk), "r" (ik) 00263 : "memory", "xmm0" ); 00264 00265 memcpy( ik, fk, 16 ); 00266 } 00267 00268 /* 00269 * Key expansion, 128-bit case 00270 */ 00271 static void aesni_setkey_enc_128( unsigned char *rk, 00272 const unsigned char *key ) 00273 { 00274 asm( "movdqu (%1), %%xmm0 \n\t" // copy the original key 00275 "movdqu %%xmm0, (%0) \n\t" // as round key 0 00276 "jmp 2f \n\t" // skip auxiliary routine 00277 00278 /* 00279 * Finish generating the next round key. 00280 * 00281 * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff 00282 * with X = rot( sub( r3 ) ) ^ RCON. 
00283 * 00284 * On exit, xmm0 is r7:r6:r5:r4 00285 * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3 00286 * and those are written to the round key buffer. 00287 */ 00288 "1: \n\t" 00289 "pshufd $0xff, %%xmm1, %%xmm1 \n\t" // X:X:X:X 00290 "pxor %%xmm0, %%xmm1 \n\t" // X+r3:X+r2:X+r1:r4 00291 "pslldq $4, %%xmm0 \n\t" // r2:r1:r0:0 00292 "pxor %%xmm0, %%xmm1 \n\t" // X+r3+r2:X+r2+r1:r5:r4 00293 "pslldq $4, %%xmm0 \n\t" // etc 00294 "pxor %%xmm0, %%xmm1 \n\t" 00295 "pslldq $4, %%xmm0 \n\t" 00296 "pxor %%xmm1, %%xmm0 \n\t" // update xmm0 for next time! 00297 "add $16, %0 \n\t" // point to next round key 00298 "movdqu %%xmm0, (%0) \n\t" // write it 00299 "ret \n\t" 00300 00301 /* Main "loop" */ 00302 "2: \n\t" 00303 AESKEYGENA xmm0_xmm1 ",0x01 \n\tcall 1b \n\t" 00304 AESKEYGENA xmm0_xmm1 ",0x02 \n\tcall 1b \n\t" 00305 AESKEYGENA xmm0_xmm1 ",0x04 \n\tcall 1b \n\t" 00306 AESKEYGENA xmm0_xmm1 ",0x08 \n\tcall 1b \n\t" 00307 AESKEYGENA xmm0_xmm1 ",0x10 \n\tcall 1b \n\t" 00308 AESKEYGENA xmm0_xmm1 ",0x20 \n\tcall 1b \n\t" 00309 AESKEYGENA xmm0_xmm1 ",0x40 \n\tcall 1b \n\t" 00310 AESKEYGENA xmm0_xmm1 ",0x80 \n\tcall 1b \n\t" 00311 AESKEYGENA xmm0_xmm1 ",0x1B \n\tcall 1b \n\t" 00312 AESKEYGENA xmm0_xmm1 ",0x36 \n\tcall 1b \n\t" 00313 : 00314 : "r" (rk), "r" (key) 00315 : "memory", "cc", "0" ); 00316 } 00317 00318 /* 00319 * Key expansion, 192-bit case 00320 */ 00321 static void aesni_setkey_enc_192( unsigned char *rk, 00322 const unsigned char *key ) 00323 { 00324 asm( "movdqu (%1), %%xmm0 \n\t" // copy original round key 00325 "movdqu %%xmm0, (%0) \n\t" 00326 "add $16, %0 \n\t" 00327 "movq 16(%1), %%xmm1 \n\t" 00328 "movq %%xmm1, (%0) \n\t" 00329 "add $8, %0 \n\t" 00330 "jmp 2f \n\t" // skip auxiliary routine 00331 00332 /* 00333 * Finish generating the next 6 quarter-keys. 00334 * 00335 * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4 00336 * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON. 
00337 * 00338 * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10 00339 * and those are written to the round key buffer. 00340 */ 00341 "1: \n\t" 00342 "pshufd $0x55, %%xmm2, %%xmm2 \n\t" // X:X:X:X 00343 "pxor %%xmm0, %%xmm2 \n\t" // X+r3:X+r2:X+r1:r4 00344 "pslldq $4, %%xmm0 \n\t" // etc 00345 "pxor %%xmm0, %%xmm2 \n\t" 00346 "pslldq $4, %%xmm0 \n\t" 00347 "pxor %%xmm0, %%xmm2 \n\t" 00348 "pslldq $4, %%xmm0 \n\t" 00349 "pxor %%xmm2, %%xmm0 \n\t" // update xmm0 = r9:r8:r7:r6 00350 "movdqu %%xmm0, (%0) \n\t" 00351 "add $16, %0 \n\t" 00352 "pshufd $0xff, %%xmm0, %%xmm2 \n\t" // r9:r9:r9:r9 00353 "pxor %%xmm1, %%xmm2 \n\t" // stuff:stuff:r9+r5:r10 00354 "pslldq $4, %%xmm1 \n\t" // r2:r1:r0:0 00355 "pxor %%xmm2, %%xmm1 \n\t" // xmm1 = stuff:stuff:r11:r10 00356 "movq %%xmm1, (%0) \n\t" 00357 "add $8, %0 \n\t" 00358 "ret \n\t" 00359 00360 "2: \n\t" 00361 AESKEYGENA xmm1_xmm2 ",0x01 \n\tcall 1b \n\t" 00362 AESKEYGENA xmm1_xmm2 ",0x02 \n\tcall 1b \n\t" 00363 AESKEYGENA xmm1_xmm2 ",0x04 \n\tcall 1b \n\t" 00364 AESKEYGENA xmm1_xmm2 ",0x08 \n\tcall 1b \n\t" 00365 AESKEYGENA xmm1_xmm2 ",0x10 \n\tcall 1b \n\t" 00366 AESKEYGENA xmm1_xmm2 ",0x20 \n\tcall 1b \n\t" 00367 AESKEYGENA xmm1_xmm2 ",0x40 \n\tcall 1b \n\t" 00368 AESKEYGENA xmm1_xmm2 ",0x80 \n\tcall 1b \n\t" 00369 00370 : 00371 : "r" (rk), "r" (key) 00372 : "memory", "cc", "0" ); 00373 } 00374 00375 /* 00376 * Key expansion, 256-bit case 00377 */ 00378 static void aesni_setkey_enc_256( unsigned char *rk, 00379 const unsigned char *key ) 00380 { 00381 asm( "movdqu (%1), %%xmm0 \n\t" 00382 "movdqu %%xmm0, (%0) \n\t" 00383 "add $16, %0 \n\t" 00384 "movdqu 16(%1), %%xmm1 \n\t" 00385 "movdqu %%xmm1, (%0) \n\t" 00386 "jmp 2f \n\t" // skip auxiliary routine 00387 00388 /* 00389 * Finish generating the next two round keys. 
00390 * 00391 * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and 00392 * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON 00393 * 00394 * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12 00395 * and those have been written to the output buffer. 00396 */ 00397 "1: \n\t" 00398 "pshufd $0xff, %%xmm2, %%xmm2 \n\t" 00399 "pxor %%xmm0, %%xmm2 \n\t" 00400 "pslldq $4, %%xmm0 \n\t" 00401 "pxor %%xmm0, %%xmm2 \n\t" 00402 "pslldq $4, %%xmm0 \n\t" 00403 "pxor %%xmm0, %%xmm2 \n\t" 00404 "pslldq $4, %%xmm0 \n\t" 00405 "pxor %%xmm2, %%xmm0 \n\t" 00406 "add $16, %0 \n\t" 00407 "movdqu %%xmm0, (%0) \n\t" 00408 00409 /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 ) 00410 * and proceed to generate next round key from there */ 00411 AESKEYGENA xmm0_xmm2 ",0x00 \n\t" 00412 "pshufd $0xaa, %%xmm2, %%xmm2 \n\t" 00413 "pxor %%xmm1, %%xmm2 \n\t" 00414 "pslldq $4, %%xmm1 \n\t" 00415 "pxor %%xmm1, %%xmm2 \n\t" 00416 "pslldq $4, %%xmm1 \n\t" 00417 "pxor %%xmm1, %%xmm2 \n\t" 00418 "pslldq $4, %%xmm1 \n\t" 00419 "pxor %%xmm2, %%xmm1 \n\t" 00420 "add $16, %0 \n\t" 00421 "movdqu %%xmm1, (%0) \n\t" 00422 "ret \n\t" 00423 00424 /* 00425 * Main "loop" - Generating one more key than necessary, 00426 * see definition of aes_context.buf 00427 */ 00428 "2: \n\t" 00429 AESKEYGENA xmm1_xmm2 ",0x01 \n\tcall 1b \n\t" 00430 AESKEYGENA xmm1_xmm2 ",0x02 \n\tcall 1b \n\t" 00431 AESKEYGENA xmm1_xmm2 ",0x04 \n\tcall 1b \n\t" 00432 AESKEYGENA xmm1_xmm2 ",0x08 \n\tcall 1b \n\t" 00433 AESKEYGENA xmm1_xmm2 ",0x10 \n\tcall 1b \n\t" 00434 AESKEYGENA xmm1_xmm2 ",0x20 \n\tcall 1b \n\t" 00435 AESKEYGENA xmm1_xmm2 ",0x40 \n\tcall 1b \n\t" 00436 : 00437 : "r" (rk), "r" (key) 00438 : "memory", "cc", "0" ); 00439 } 00440 00441 /* 00442 * Key expansion, wrapper 00443 */ 00444 int aesni_setkey_enc( unsigned char *rk, 00445 const unsigned char *key, 00446 size_t bits ) 00447 { 00448 switch( bits ) 00449 { 00450 case 128: aesni_setkey_enc_128( rk, key ); break; 00451 case 192: 
aesni_setkey_enc_192( rk, key ); break; 00452 case 256: aesni_setkey_enc_256( rk, key ); break; 00453 default : return( POLARSSL_ERR_AES_INVALID_KEY_LENGTH ); 00454 } 00455 00456 return( 0 ); 00457 } 00458 00459 #endif /* POLARSSL_HAVE_X86_64 */ 00460 00461 #endif /* POLARSSL_AESNI_C */ 00462
/* Generated on Tue Jul 12 2022 13:50:36 by 1.7.2 */