Important changes to repositories hosted on mbed.com
Mbed-hosted Mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
aesni.c
00001 /* 00002 * AES-NI support functions 00003 * 00004 * Copyright (C) 2013, Brainspark B.V. 00005 * 00006 * This file is part of PolarSSL (http://www.polarssl.org) 00007 * Lead Maintainer: Paul Bakker <polarssl_maintainer at polarssl.org> 00008 * 00009 * All rights reserved. 00010 * 00011 * This program is free software; you can redistribute it and/or modify 00012 * it under the terms of the GNU General Public License as published by 00013 * the Free Software Foundation; either version 2 of the License, or 00014 * (at your option) any later version. 00015 * 00016 * This program is distributed in the hope that it will be useful, 00017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00019 * GNU General Public License for more details. 00020 * 00021 * You should have received a copy of the GNU General Public License along 00022 * with this program; if not, write to the Free Software Foundation, Inc., 00023 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 00024 */ 00025 00026 /* 00027 * [AES-WP] http://software.intel.com/en-us/articles/intel-advanced-encryption-standard-aes-instructions-set 00028 * [CLMUL-WP] http://software.intel.com/en-us/articles/intel-carry-less-multiplication-instruction-and-its-usage-for-computing-the-gcm-mode/ 00029 */ 00030 00031 #if !defined(POLARSSL_CONFIG_FILE) 00032 #include "polarssl/config.h" 00033 #else 00034 #include POLARSSL_CONFIG_FILE 00035 #endif 00036 00037 #if defined(POLARSSL_AESNI_C) 00038 00039 #include "polarssl/aesni.h" 00040 #include <stdio.h> 00041 00042 #if defined(POLARSSL_HAVE_X86_64) 00043 00044 /* 00045 * AES-NI support detection routine 00046 */ 00047 int aesni_supports( unsigned int what ) 00048 { 00049 static int done = 0; 00050 static unsigned int c = 0; 00051 00052 if( ! 
done ) 00053 { 00054 asm( "movl $1, %%eax \n" 00055 "cpuid \n" 00056 : "=c" (c) 00057 : 00058 : "eax", "ebx", "edx" ); 00059 done = 1; 00060 } 00061 00062 return( ( c & what ) != 0 ); 00063 } 00064 00065 /* 00066 * Binutils needs to be at least 2.19 to support AES-NI instructions. 00067 * Unfortunately, a lot of users have a lower version now (2014-04). 00068 * Emit bytecode directly in order to support "old" version of gas. 00069 * 00070 * Opcodes from the Intel architecture reference manual, vol. 3. 00071 * We always use registers, so we don't need prefixes for memory operands. 00072 * Operand macros are in gas order (src, dst) as opposed to Intel order 00073 * (dst, src) in order to blend better into the surrounding assembly code. 00074 */ 00075 #define AESDEC ".byte 0x66,0x0F,0x38,0xDE," 00076 #define AESDECLAST ".byte 0x66,0x0F,0x38,0xDF," 00077 #define AESENC ".byte 0x66,0x0F,0x38,0xDC," 00078 #define AESENCLAST ".byte 0x66,0x0F,0x38,0xDD," 00079 #define AESIMC ".byte 0x66,0x0F,0x38,0xDB," 00080 #define AESKEYGENA ".byte 0x66,0x0F,0x3A,0xDF," 00081 #define PCLMULQDQ ".byte 0x66,0x0F,0x3A,0x44," 00082 00083 #define xmm0_xmm0 "0xC0" 00084 #define xmm0_xmm1 "0xC8" 00085 #define xmm0_xmm2 "0xD0" 00086 #define xmm0_xmm3 "0xD8" 00087 #define xmm0_xmm4 "0xE0" 00088 #define xmm1_xmm0 "0xC1" 00089 #define xmm1_xmm2 "0xD1" 00090 00091 /* 00092 * AES-NI AES-ECB block en(de)cryption 00093 */ 00094 int aesni_crypt_ecb( aes_context *ctx, 00095 int mode, 00096 const unsigned char input[16], 00097 unsigned char output[16] ) 00098 { 00099 asm( "movdqu (%3), %%xmm0 \n" // load input 00100 "movdqu (%1), %%xmm1 \n" // load round key 0 00101 "pxor %%xmm1, %%xmm0 \n" // round 0 00102 "addq $16, %1 \n" // point to next round key 00103 "subl $1, %0 \n" // normal rounds = nr - 1 00104 "test %2, %2 \n" // mode? 
00105 "jz 2f \n" // 0 = decrypt 00106 00107 "1: \n" // encryption loop 00108 "movdqu (%1), %%xmm1 \n" // load round key 00109 AESENC xmm1_xmm0 "\n" // do round 00110 "addq $16, %1 \n" // point to next round key 00111 "subl $1, %0 \n" // loop 00112 "jnz 1b \n" 00113 "movdqu (%1), %%xmm1 \n" // load round key 00114 AESENCLAST xmm1_xmm0 "\n" // last round 00115 "jmp 3f \n" 00116 00117 "2: \n" // decryption loop 00118 "movdqu (%1), %%xmm1 \n" 00119 AESDEC xmm1_xmm0 "\n" // do round 00120 "addq $16, %1 \n" 00121 "subl $1, %0 \n" 00122 "jnz 2b \n" 00123 "movdqu (%1), %%xmm1 \n" // load round key 00124 AESDECLAST xmm1_xmm0 "\n" // last round 00125 00126 "3: \n" 00127 "movdqu %%xmm0, (%4) \n" // export output 00128 : 00129 : "r" (ctx->nr ), "r" (ctx->rk ), "r" (mode), "r" (input), "r" (output) 00130 : "memory", "cc", "xmm0", "xmm1" ); 00131 00132 00133 return( 0 ); 00134 } 00135 00136 /* 00137 * GCM multiplication: c = a times b in GF(2^128) 00138 * Based on [CLMUL-WP] algorithms 1 (with equation 27) and 5. 00139 */ 00140 void aesni_gcm_mult( unsigned char c[16], 00141 const unsigned char a[16], 00142 const unsigned char b[16] ) 00143 { 00144 unsigned char aa[16], bb[16], cc[16]; 00145 size_t i; 00146 00147 /* The inputs are in big-endian order, so byte-reverse them */ 00148 for( i = 0; i < 16; i++ ) 00149 { 00150 aa[i] = a[15 - i]; 00151 bb[i] = b[15 - i]; 00152 } 00153 00154 asm( "movdqu (%0), %%xmm0 \n" // a1:a0 00155 "movdqu (%1), %%xmm1 \n" // b1:b0 00156 00157 /* 00158 * Caryless multiplication xmm2:xmm1 = xmm0 * xmm1 00159 * using [CLMUL-WP] algorithm 1 (p. 13). 
00160 */ 00161 "movdqa %%xmm1, %%xmm2 \n" // copy of b1:b0 00162 "movdqa %%xmm1, %%xmm3 \n" // same 00163 "movdqa %%xmm1, %%xmm4 \n" // same 00164 PCLMULQDQ xmm0_xmm1 ",0x00 \n" // a0*b0 = c1:c0 00165 PCLMULQDQ xmm0_xmm2 ",0x11 \n" // a1*b1 = d1:d0 00166 PCLMULQDQ xmm0_xmm3 ",0x10 \n" // a0*b1 = e1:e0 00167 PCLMULQDQ xmm0_xmm4 ",0x01 \n" // a1*b0 = f1:f0 00168 "pxor %%xmm3, %%xmm4 \n" // e1+f1:e0+f0 00169 "movdqa %%xmm4, %%xmm3 \n" // same 00170 "psrldq $8, %%xmm4 \n" // 0:e1+f1 00171 "pslldq $8, %%xmm3 \n" // e0+f0:0 00172 "pxor %%xmm4, %%xmm2 \n" // d1:d0+e1+f1 00173 "pxor %%xmm3, %%xmm1 \n" // c1+e0+f1:c0 00174 00175 /* 00176 * Now shift the result one bit to the left, 00177 * taking advantage of [CLMUL-WP] eq 27 (p. 20) 00178 */ 00179 "movdqa %%xmm1, %%xmm3 \n" // r1:r0 00180 "movdqa %%xmm2, %%xmm4 \n" // r3:r2 00181 "psllq $1, %%xmm1 \n" // r1<<1:r0<<1 00182 "psllq $1, %%xmm2 \n" // r3<<1:r2<<1 00183 "psrlq $63, %%xmm3 \n" // r1>>63:r0>>63 00184 "psrlq $63, %%xmm4 \n" // r3>>63:r2>>63 00185 "movdqa %%xmm3, %%xmm5 \n" // r1>>63:r0>>63 00186 "pslldq $8, %%xmm3 \n" // r0>>63:0 00187 "pslldq $8, %%xmm4 \n" // r2>>63:0 00188 "psrldq $8, %%xmm5 \n" // 0:r1>>63 00189 "por %%xmm3, %%xmm1 \n" // r1<<1|r0>>63:r0<<1 00190 "por %%xmm4, %%xmm2 \n" // r3<<1|r2>>62:r2<<1 00191 "por %%xmm5, %%xmm2 \n" // r3<<1|r2>>62:r2<<1|r1>>63 00192 00193 /* 00194 * Now reduce modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1 00195 * using [CLMUL-WP] algorithm 5 (p. 20). 00196 * Currently xmm2:xmm1 holds x3:x2:x1:x0 (already shifted). 
00197 */ 00198 /* Step 2 (1) */ 00199 "movdqa %%xmm1, %%xmm3 \n" // x1:x0 00200 "movdqa %%xmm1, %%xmm4 \n" // same 00201 "movdqa %%xmm1, %%xmm5 \n" // same 00202 "psllq $63, %%xmm3 \n" // x1<<63:x0<<63 = stuff:a 00203 "psllq $62, %%xmm4 \n" // x1<<62:x0<<62 = stuff:b 00204 "psllq $57, %%xmm5 \n" // x1<<57:x0<<57 = stuff:c 00205 00206 /* Step 2 (2) */ 00207 "pxor %%xmm4, %%xmm3 \n" // stuff:a+b 00208 "pxor %%xmm5, %%xmm3 \n" // stuff:a+b+c 00209 "pslldq $8, %%xmm3 \n" // a+b+c:0 00210 "pxor %%xmm3, %%xmm1 \n" // x1+a+b+c:x0 = d:x0 00211 00212 /* Steps 3 and 4 */ 00213 "movdqa %%xmm1,%%xmm0 \n" // d:x0 00214 "movdqa %%xmm1,%%xmm4 \n" // same 00215 "movdqa %%xmm1,%%xmm5 \n" // same 00216 "psrlq $1, %%xmm0 \n" // e1:x0>>1 = e1:e0' 00217 "psrlq $2, %%xmm4 \n" // f1:x0>>2 = f1:f0' 00218 "psrlq $7, %%xmm5 \n" // g1:x0>>7 = g1:g0' 00219 "pxor %%xmm4, %%xmm0 \n" // e1+f1:e0'+f0' 00220 "pxor %%xmm5, %%xmm0 \n" // e1+f1+g1:e0'+f0'+g0' 00221 // e0'+f0'+g0' is almost e0+f0+g0, except for some missing 00222 // bits carried from d. Now get those bits back in. 
00223 "movdqa %%xmm1,%%xmm3 \n" // d:x0 00224 "movdqa %%xmm1,%%xmm4 \n" // same 00225 "movdqa %%xmm1,%%xmm5 \n" // same 00226 "psllq $63, %%xmm3 \n" // d<<63:stuff 00227 "psllq $62, %%xmm4 \n" // d<<62:stuff 00228 "psllq $57, %%xmm5 \n" // d<<57:stuff 00229 "pxor %%xmm4, %%xmm3 \n" // d<<63+d<<62:stuff 00230 "pxor %%xmm5, %%xmm3 \n" // missing bits of d:stuff 00231 "psrldq $8, %%xmm3 \n" // 0:missing bits of d 00232 "pxor %%xmm3, %%xmm0 \n" // e1+f1+g1:e0+f0+g0 00233 "pxor %%xmm1, %%xmm0 \n" // h1:h0 00234 "pxor %%xmm2, %%xmm0 \n" // x3+h1:x2+h0 00235 00236 "movdqu %%xmm0, (%2) \n" // done 00237 : 00238 : "r" (aa), "r" (bb), "r" (cc) 00239 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); 00240 00241 /* Now byte-reverse the outputs */ 00242 for( i = 0; i < 16; i++ ) 00243 c[i] = cc[15 - i]; 00244 00245 return; 00246 } 00247 00248 /* 00249 * Compute decryption round keys from encryption round keys 00250 */ 00251 void aesni_inverse_key( unsigned char *invkey, 00252 const unsigned char *fwdkey, int nr ) 00253 { 00254 unsigned char *ik = invkey; 00255 const unsigned char *fk = fwdkey + 16 * nr; 00256 00257 memcpy( ik, fk, 16 ); 00258 00259 for( fk -= 16, ik += 16; fk > fwdkey; fk -= 16, ik += 16 ) 00260 asm( "movdqu (%0), %%xmm0 \n" 00261 AESIMC xmm0_xmm0 "\n" 00262 "movdqu %%xmm0, (%1) \n" 00263 : 00264 : "r" (fk), "r" (ik) 00265 : "memory", "xmm0" ); 00266 00267 memcpy( ik, fk, 16 ); 00268 } 00269 00270 /* 00271 * Key expansion, 128-bit case 00272 */ 00273 static void aesni_setkey_enc_128( unsigned char *rk, 00274 const unsigned char *key ) 00275 { 00276 asm( "movdqu (%1), %%xmm0 \n" // copy the original key 00277 "movdqu %%xmm0, (%0) \n" // as round key 0 00278 "jmp 2f \n" // skip auxiliary routine 00279 00280 /* 00281 * Finish generating the next round key. 00282 * 00283 * On entry xmm0 is r3:r2:r1:r0 and xmm1 is X:stuff:stuff:stuff 00284 * with X = rot( sub( r3 ) ) ^ RCON. 
00285 * 00286 * On exit, xmm0 is r7:r6:r5:r4 00287 * with r4 = X + r0, r5 = r4 + r1, r6 = r5 + r2, r7 = r6 + r3 00288 * and those are written to the round key buffer. 00289 */ 00290 "1: \n" 00291 "pshufd $0xff, %%xmm1, %%xmm1 \n" // X:X:X:X 00292 "pxor %%xmm0, %%xmm1 \n" // X+r3:X+r2:X+r1:r4 00293 "pslldq $4, %%xmm0 \n" // r2:r1:r0:0 00294 "pxor %%xmm0, %%xmm1 \n" // X+r3+r2:X+r2+r1:r5:r4 00295 "pslldq $4, %%xmm0 \n" // etc 00296 "pxor %%xmm0, %%xmm1 \n" 00297 "pslldq $4, %%xmm0 \n" 00298 "pxor %%xmm1, %%xmm0 \n" // update xmm0 for next time! 00299 "add $16, %0 \n" // point to next round key 00300 "movdqu %%xmm0, (%0) \n" // write it 00301 "ret \n" 00302 00303 /* Main "loop" */ 00304 "2: \n" 00305 AESKEYGENA xmm0_xmm1 ",0x01 \ncall 1b \n" 00306 AESKEYGENA xmm0_xmm1 ",0x02 \ncall 1b \n" 00307 AESKEYGENA xmm0_xmm1 ",0x04 \ncall 1b \n" 00308 AESKEYGENA xmm0_xmm1 ",0x08 \ncall 1b \n" 00309 AESKEYGENA xmm0_xmm1 ",0x10 \ncall 1b \n" 00310 AESKEYGENA xmm0_xmm1 ",0x20 \ncall 1b \n" 00311 AESKEYGENA xmm0_xmm1 ",0x40 \ncall 1b \n" 00312 AESKEYGENA xmm0_xmm1 ",0x80 \ncall 1b \n" 00313 AESKEYGENA xmm0_xmm1 ",0x1B \ncall 1b \n" 00314 AESKEYGENA xmm0_xmm1 ",0x36 \ncall 1b \n" 00315 : 00316 : "r" (rk), "r" (key) 00317 : "memory", "cc", "0" ); 00318 } 00319 00320 /* 00321 * Key expansion, 192-bit case 00322 */ 00323 static void aesni_setkey_enc_192( unsigned char *rk, 00324 const unsigned char *key ) 00325 { 00326 asm( "movdqu (%1), %%xmm0 \n" // copy original round key 00327 "movdqu %%xmm0, (%0) \n" 00328 "add $16, %0 \n" 00329 "movq 16(%1), %%xmm1 \n" 00330 "movq %%xmm1, (%0) \n" 00331 "add $8, %0 \n" 00332 "jmp 2f \n" // skip auxiliary routine 00333 00334 /* 00335 * Finish generating the next 6 quarter-keys. 00336 * 00337 * On entry xmm0 is r3:r2:r1:r0, xmm1 is stuff:stuff:r5:r4 00338 * and xmm2 is stuff:stuff:X:stuff with X = rot( sub( r3 ) ) ^ RCON. 
00339 * 00340 * On exit, xmm0 is r9:r8:r7:r6 and xmm1 is stuff:stuff:r11:r10 00341 * and those are written to the round key buffer. 00342 */ 00343 "1: \n" 00344 "pshufd $0x55, %%xmm2, %%xmm2 \n" // X:X:X:X 00345 "pxor %%xmm0, %%xmm2 \n" // X+r3:X+r2:X+r1:r4 00346 "pslldq $4, %%xmm0 \n" // etc 00347 "pxor %%xmm0, %%xmm2 \n" 00348 "pslldq $4, %%xmm0 \n" 00349 "pxor %%xmm0, %%xmm2 \n" 00350 "pslldq $4, %%xmm0 \n" 00351 "pxor %%xmm2, %%xmm0 \n" // update xmm0 = r9:r8:r7:r6 00352 "movdqu %%xmm0, (%0) \n" 00353 "add $16, %0 \n" 00354 "pshufd $0xff, %%xmm0, %%xmm2 \n" // r9:r9:r9:r9 00355 "pxor %%xmm1, %%xmm2 \n" // stuff:stuff:r9+r5:r10 00356 "pslldq $4, %%xmm1 \n" // r2:r1:r0:0 00357 "pxor %%xmm2, %%xmm1 \n" // update xmm1 = stuff:stuff:r11:r10 00358 "movq %%xmm1, (%0) \n" 00359 "add $8, %0 \n" 00360 "ret \n" 00361 00362 "2: \n" 00363 AESKEYGENA xmm1_xmm2 ",0x01 \ncall 1b \n" 00364 AESKEYGENA xmm1_xmm2 ",0x02 \ncall 1b \n" 00365 AESKEYGENA xmm1_xmm2 ",0x04 \ncall 1b \n" 00366 AESKEYGENA xmm1_xmm2 ",0x08 \ncall 1b \n" 00367 AESKEYGENA xmm1_xmm2 ",0x10 \ncall 1b \n" 00368 AESKEYGENA xmm1_xmm2 ",0x20 \ncall 1b \n" 00369 AESKEYGENA xmm1_xmm2 ",0x40 \ncall 1b \n" 00370 AESKEYGENA xmm1_xmm2 ",0x80 \ncall 1b \n" 00371 00372 : 00373 : "r" (rk), "r" (key) 00374 : "memory", "cc", "0" ); 00375 } 00376 00377 /* 00378 * Key expansion, 256-bit case 00379 */ 00380 static void aesni_setkey_enc_256( unsigned char *rk, 00381 const unsigned char *key ) 00382 { 00383 asm( "movdqu (%1), %%xmm0 \n" 00384 "movdqu %%xmm0, (%0) \n" 00385 "add $16, %0 \n" 00386 "movdqu 16(%1), %%xmm1 \n" 00387 "movdqu %%xmm1, (%0) \n" 00388 "jmp 2f \n" // skip auxiliary routine 00389 00390 /* 00391 * Finish generating the next two round keys. 00392 * 00393 * On entry xmm0 is r3:r2:r1:r0, xmm1 is r7:r6:r5:r4 and 00394 * xmm2 is X:stuff:stuff:stuff with X = rot( sub( r7 )) ^ RCON 00395 * 00396 * On exit, xmm0 is r11:r10:r9:r8 and xmm1 is r15:r14:r13:r12 00397 * and those have been written to the output buffer. 
00398 */ 00399 "1: \n" 00400 "pshufd $0xff, %%xmm2, %%xmm2 \n" 00401 "pxor %%xmm0, %%xmm2 \n" 00402 "pslldq $4, %%xmm0 \n" 00403 "pxor %%xmm0, %%xmm2 \n" 00404 "pslldq $4, %%xmm0 \n" 00405 "pxor %%xmm0, %%xmm2 \n" 00406 "pslldq $4, %%xmm0 \n" 00407 "pxor %%xmm2, %%xmm0 \n" 00408 "add $16, %0 \n" 00409 "movdqu %%xmm0, (%0) \n" 00410 00411 /* Set xmm2 to stuff:Y:stuff:stuff with Y = subword( r11 ) 00412 * and proceed to generate next round key from there */ 00413 AESKEYGENA xmm0_xmm2 ",0x00 \n" 00414 "pshufd $0xaa, %%xmm2, %%xmm2 \n" 00415 "pxor %%xmm1, %%xmm2 \n" 00416 "pslldq $4, %%xmm1 \n" 00417 "pxor %%xmm1, %%xmm2 \n" 00418 "pslldq $4, %%xmm1 \n" 00419 "pxor %%xmm1, %%xmm2 \n" 00420 "pslldq $4, %%xmm1 \n" 00421 "pxor %%xmm2, %%xmm1 \n" 00422 "add $16, %0 \n" 00423 "movdqu %%xmm1, (%0) \n" 00424 "ret \n" 00425 00426 /* 00427 * Main "loop" - Generating one more key than necessary, 00428 * see definition of aes_context.buf 00429 */ 00430 "2: \n" 00431 AESKEYGENA xmm1_xmm2 ",0x01 \ncall 1b \n" 00432 AESKEYGENA xmm1_xmm2 ",0x02 \ncall 1b \n" 00433 AESKEYGENA xmm1_xmm2 ",0x04 \ncall 1b \n" 00434 AESKEYGENA xmm1_xmm2 ",0x08 \ncall 1b \n" 00435 AESKEYGENA xmm1_xmm2 ",0x10 \ncall 1b \n" 00436 AESKEYGENA xmm1_xmm2 ",0x20 \ncall 1b \n" 00437 AESKEYGENA xmm1_xmm2 ",0x40 \ncall 1b \n" 00438 : 00439 : "r" (rk), "r" (key) 00440 : "memory", "cc", "0" ); 00441 } 00442 00443 /* 00444 * Key expansion, wrapper 00445 */ 00446 int aesni_setkey_enc( unsigned char *rk, 00447 const unsigned char *key, 00448 size_t bits ) 00449 { 00450 switch( bits ) 00451 { 00452 case 128: aesni_setkey_enc_128( rk, key ); break; 00453 case 192: aesni_setkey_enc_192( rk, key ); break; 00454 case 256: aesni_setkey_enc_256( rk, key ); break; 00455 default : return( POLARSSL_ERR_AES_INVALID_KEY_LENGTH ); 00456 } 00457 00458 return( 0 ); 00459 } 00460 00461 #endif /* POLARSSL_HAVE_X86_64 */ 00462 00463 #endif /* POLARSSL_AESNI_C */ 00464 00465
Generated on Tue Jul 12 2022 19:40:14 by
1.7.2