wolfSSL SSL/TLS library, with support up to TLS 1.3

Revision: 16:8e0d178b1d1e
Parent:   15:117db924cf7c
--- a/wolfcrypt/src/aes.c	Sat Aug 18 22:20:43 2018 +0000
+++ b/wolfcrypt/src/aes.c	Thu Jun 04 23:57:22 2020 +0000
@@ -1,6 +1,6 @@
 /* aes.c
  *
- * Copyright (C) 2006-2017 wolfSSL Inc.
+ * Copyright (C) 2006-2020 wolfSSL Inc.
  *
  * This file is part of wolfSSL.
  *
@@ -29,6 +29,8 @@
 
 #if !defined(NO_AES)
 
+/* Tip: Locate the software cipher modes by searching for "Software AES" */
+
 #if defined(HAVE_FIPS) && \
     defined(HAVE_FIPS_VERSION) && (HAVE_FIPS_VERSION >= 2)
 
@@ -44,6 +46,10 @@
 #include <wolfssl/wolfcrypt/aes.h>
 #include <wolfssl/wolfcrypt/cpuid.h>
 
+#ifdef WOLF_CRYPTO_CB
+    #include <wolfssl/wolfcrypt/cryptocb.h>
+#endif
+
 
 /* fips wrapper calls, user can call direct */
 #if defined(HAVE_FIPS) && \
@@ -136,9 +142,9 @@
                                       byte* authTag, word32 authTagSz,
                                       const byte* authIn, word32 authInSz)
         {
-            if (aes == NULL || authTagSz > AES_BLOCK_SIZE
-                                    || authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ ||
-                                    ivSz > AES_BLOCK_SIZE) {
+            if (aes == NULL || authTagSz > AES_BLOCK_SIZE ||
+                        authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ ||
+                        ivSz == 0 || ivSz > AES_BLOCK_SIZE) {
                 return BAD_FUNC_ARG;
             }
 
@@ -154,7 +160,7 @@
             {
                 if (aes == NULL || out == NULL || in == NULL || iv == NULL
                         || authTag == NULL || authTagSz > AES_BLOCK_SIZE ||
-                        ivSz > AES_BLOCK_SIZE) {
+                        ivSz == 0 || ivSz > AES_BLOCK_SIZE) {
                     return BAD_FUNC_ARG;
                 }
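
The tightened checks above now reject a zero-length IV alongside an
out-of-range tag length. A minimal sketch of arguments that satisfy them,
assuming an Aes object `aes` already keyed with wc_AesGcmSetKey and the
conventional 12-byte GCM nonce (buffer contents are illustrative):

    byte iv[12]  = {0};        /* 1 <= ivSz <= AES_BLOCK_SIZE is now enforced */
    byte tag[16];              /* WOLFSSL_MIN_AUTH_TAG_SZ <= authTagSz <= 16  */
    byte pt[16]  = {0}, ct[16];
    int  ret = wc_AesGcmEncrypt(&aes, ct, pt, sizeof(pt), iv, sizeof(iv),
                                tag, sizeof(tag), NULL, 0 /* no AAD */);
    if (ret == BAD_FUNC_ARG) { /* an ivSz of 0 would land here */ }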
 
@@ -227,11 +233,14 @@
         #endif /* HAVE_AES_DECRYPT */
     #endif /* HAVE_AESCCM && HAVE_FIPS_VERSION 2 */
 
-    int  wc_AesInit(Aes* aes, void* h, int i)
+    int wc_AesInit(Aes* aes, void* h, int i)
     {
-        (void)aes;
+        if (aes == NULL)
+            return BAD_FUNC_ARG;
+
         (void)h;
         (void)i;
+
         /* FIPS doesn't support:
             return AesInit(aes, h, i); */
         return 0;
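
wc_AesInit now validates its argument instead of discarding it, so a NULL
Aes pointer fails fast. A hedged sketch of the intended lifecycle around it
(wc_AesFree and INVALID_DEVID come from the same wolfCrypt API; error
handling trimmed for brevity):

    Aes aes;
    if (wc_AesInit(&aes, NULL, INVALID_DEVID) == 0) { /* NULL aes -> BAD_FUNC_ARG */
        /* ... wc_AesSetKey() and encrypt/decrypt calls ... */
        wc_AesFree(&aes);
    }
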
@@ -279,106 +288,64 @@
 
 /* Define AES implementation includes and functions */
 #if defined(STM32_CRYPTO)
-     /* STM32F2/F4 hardware AES support for CBC, CTR modes */
-
-    #ifdef WOLFSSL_STM32L4
-        #define CRYP AES
-    #endif
-
-    /* CRYPT_AES_GCM starts the IV with 2 */
-    #define STM32_GCM_IV_START 2
+     /* STM32F2/F4/F7/L4 hardware AES support for ECB, CBC, CTR and GCM modes */
 
 #if defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESGCM) || defined(HAVE_AESCCM)
+
     static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
     {
         int ret = 0;
     #ifdef WOLFSSL_STM32_CUBEMX
         CRYP_HandleTypeDef hcryp;
-
-        XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-        switch(aes->rounds) {
-            case 10: /* 128-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-                break;
-	#ifdef CRYP_KEYSIZE_192B
-            case 12: /* 192-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-                break;
-	#endif
-            case 14: /* 256-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-                break;
-            default:
-                break;
-        }
-        hcryp.Instance = CRYP;
-        hcryp.Init.DataType = CRYP_DATATYPE_8B;
-        hcryp.Init.pKey = (uint8_t*)aes->key;
-
+    #else
+        CRYP_InitTypeDef cryptInit;
+        CRYP_KeyInitTypeDef keyInit;
+    #endif
+
+    #ifdef WOLFSSL_STM32_CUBEMX
+        ret = wc_Stm32_Aes_Init(aes, &hcryp);
+        if (ret != 0)
+            return ret;
+
+    #ifdef STM32_CRYPTO_AES_ONLY
+        hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT;
+        hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_ECB;
+        hcryp.Init.KeyWriteFlag  = CRYP_KEY_WRITE_ENABLE;
+    #elif defined(STM32_HAL_V2)
+        hcryp.Init.Algorithm  = CRYP_AES_ECB;
+    #endif
         HAL_CRYP_Init(&hcryp);
 
-        if (HAL_CRYP_AESECB_Encrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
-                                    outBlock, STM32_HAL_TIMEOUT) != HAL_OK) {
-            ret = WC_TIMEOUT_E;
-        }
-
-        HAL_CRYP_DeInit(&hcryp);
+    #ifdef STM32_CRYPTO_AES_ONLY
+        ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
+            outBlock, STM32_HAL_TIMEOUT);
+    #elif defined(STM32_HAL_V2)
+        ret = HAL_CRYP_Encrypt(&hcryp, (uint32_t*)inBlock, AES_BLOCK_SIZE,
+            (uint32_t*)outBlock, STM32_HAL_TIMEOUT);
     #else
-        word32 *enc_key;
-        CRYP_InitTypeDef AES_CRYP_InitStructure;
-        CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure;
-
-        enc_key = aes->key;
-
-        /* crypto structure initialization */
-        CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure);
-        CRYP_StructInit(&AES_CRYP_InitStructure);
+        ret = HAL_CRYP_AESECB_Encrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
+            outBlock, STM32_HAL_TIMEOUT);
+    #endif
+        if (ret != HAL_OK) {
+            ret = WC_TIMEOUT_E;
+        }
+        HAL_CRYP_DeInit(&hcryp);
+
+    #else /* STD_PERI_LIB */
+        ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit);
+        if (ret != 0)
+            return ret;
 
         /* reset registers to their default values */
         CRYP_DeInit();
 
-        /* load key into correct registers */
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b;
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3];
-                break;
-
-            case 12: /* 192-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b;
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5];
-                break;
-
-            case 14: /* 256-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b;
-                AES_CRYP_KeyInitStructure.CRYP_Key0Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[6];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7];
-                break;
-
-            default:
-                break;
-        }
-        CRYP_KeyInit(&AES_CRYP_KeyInitStructure);
-
-        /* set direction, mode, and datatype */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-        CRYP_Init(&AES_CRYP_InitStructure);
+        /* setup key */
+        CRYP_KeyInit(&keyInit);
+
+        /* set direction and mode */
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB;
+        CRYP_Init(&cryptInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
@@ -402,6 +369,7 @@
         /* disable crypto processor */
         CRYP_Cmd(DISABLE);
     #endif /* WOLFSSL_STM32_CUBEMX */
+
         return ret;
     }
 #endif /* WOLFSSL_AES_DIRECT || HAVE_AESGCM || HAVE_AESCCM */
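
The per-key-size switch deleted above now lives in a shared helper that the
ECB, CBC and GCM paths all call. A sketch of what wc_Stm32_Aes_Init
presumably does for CubeMX builds, reconstructed from the deleted lines (the
real helper lives in the ST port sources and also covers the HAL V2
differences, so treat this as an assumption):

    int wc_Stm32_Aes_Init(Aes* aes, CRYP_HandleTypeDef* hcryp)
    {
        XMEMSET(hcryp, 0, sizeof(CRYP_HandleTypeDef));
        switch (aes->rounds) {
            case 10: hcryp->Init.KeySize = CRYP_KEYSIZE_128B; break;
        #ifdef CRYP_KEYSIZE_192B
            case 12: hcryp->Init.KeySize = CRYP_KEYSIZE_192B; break;
        #endif
            case 14: hcryp->Init.KeySize = CRYP_KEYSIZE_256B; break;
            default: return BAD_FUNC_ARG;
        }
        hcryp->Instance      = CRYP;
        hcryp->Init.DataType = CRYP_DATATYPE_8B;
        hcryp->Init.pKey     = (uint8_t*)aes->key;
        return 0;
    }
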
@@ -413,103 +381,64 @@
         int ret = 0;
     #ifdef WOLFSSL_STM32_CUBEMX
         CRYP_HandleTypeDef hcryp;
-
-        XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-        switch(aes->rounds) {
-            case 10: /* 128-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-                break;
-	#ifdef CRYP_KEYSIZE_192B
-            case 12: /* 192-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-                break;
-	#endif
-            case 14: /* 256-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-                break;
-            default:
-                break;
-        }
-        hcryp.Instance = CRYP;
-        hcryp.Init.DataType = CRYP_DATATYPE_8B;
-        hcryp.Init.pKey = (uint8_t*)aes->key;
-
+    #else
+        CRYP_InitTypeDef cryptInit;
+        CRYP_KeyInitTypeDef keyInit;
+    #endif
+
+    #ifdef WOLFSSL_STM32_CUBEMX
+        ret = wc_Stm32_Aes_Init(aes, &hcryp);
+        if (ret != 0)
+            return ret;
+
+    #ifdef STM32_CRYPTO_AES_ONLY
+        hcryp.Init.OperatingMode = CRYP_ALGOMODE_DECRYPT;
+        hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_ECB;
+        hcryp.Init.KeyWriteFlag  = CRYP_KEY_WRITE_ENABLE;
+    #elif defined(STM32_HAL_V2)
+        hcryp.Init.Algorithm  = CRYP_AES_ECB;
+    #endif
         HAL_CRYP_Init(&hcryp);
 
-        if (HAL_CRYP_AESECB_Decrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
-                                       outBlock, STM32_HAL_TIMEOUT) != HAL_OK) {
-            ret = WC_TIMEOUT_E;
-        }
-
-        HAL_CRYP_DeInit(&hcryp);
+    #ifdef STM32_CRYPTO_AES_ONLY
+        ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
+            outBlock, STM32_HAL_TIMEOUT);
+    #elif defined(STM32_HAL_V2)
+        ret = HAL_CRYP_Decrypt(&hcryp, (uint32_t*)inBlock, AES_BLOCK_SIZE,
+            (uint32_t*)outBlock, STM32_HAL_TIMEOUT);
     #else
-        word32 *enc_key;
-        CRYP_InitTypeDef AES_CRYP_InitStructure;
-        CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure;
-
-        enc_key = aes->key;
-
-        /* crypto structure initialization */
-        CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure);
-        CRYP_StructInit(&AES_CRYP_InitStructure);
+        ret = HAL_CRYP_AESECB_Decrypt(&hcryp, (uint8_t*)inBlock, AES_BLOCK_SIZE,
+            outBlock, STM32_HAL_TIMEOUT);
+    #endif
+        if (ret != HAL_OK) {
+            ret = WC_TIMEOUT_E;
+        }
+        HAL_CRYP_DeInit(&hcryp);
+
+    #else /* STD_PERI_LIB */
+        ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit);
+        if (ret != 0)
+            return ret;
 
         /* reset registers to their default values */
         CRYP_DeInit();
 
-        /* load key into correct registers */
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b;
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3];
-                break;
-
-            case 12: /* 192-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b;
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5];
-                break;
-
-            case 14: /* 256-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b;
-                AES_CRYP_KeyInitStructure.CRYP_Key0Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[6];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7];
-                break;
-
-            default:
-                break;
-        }
-        CRYP_KeyInit(&AES_CRYP_KeyInitStructure);
-
-        /* set direction, key, and datatype */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-        CRYP_Init(&AES_CRYP_InitStructure);
+        /* set direction and key */
+        CRYP_KeyInit(&keyInit);
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key;
+        CRYP_Init(&cryptInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
 
-        /* wait until decrypt key has been intialized */
+        /* wait until decrypt key has been initialized */
         while (CRYP_GetFlagStatus(CRYP_FLAG_BUSY) != RESET) {}
 
-        /* set direction, mode, and datatype */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-        CRYP_Init(&AES_CRYP_InitStructure);
+        /* set direction and mode */
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_ECB;
+        CRYP_Init(&cryptInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
@@ -533,6 +462,7 @@
         /* disable crypto processor */
         CRYP_Cmd(DISABLE);
     #endif /* WOLFSSL_STM32_CUBEMX */
+
         return ret;
     }
     #endif /* WOLFSSL_AES_DIRECT || HAVE_AESCCM */
@@ -659,6 +589,24 @@
         #error nRF51 AES Hardware does not support decrypt
     #endif /* HAVE_AES_DECRYPT */
 
+#elif defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+    !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+
+    #include "wolfssl/wolfcrypt/port/Espressif/esp32-crypt.h"
+
+    #if defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT)
+    static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+        return wc_esp32AesEncrypt(aes, inBlock, outBlock);
+    }
+    #endif
+
+    #if defined(HAVE_AES_DECRYPT) && defined(WOLFSSL_AES_DIRECT)
+    static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+       return wc_esp32AesDecrypt(aes, inBlock, outBlock);
+    }
+    #endif
 
 #elif defined(WOLFSSL_AESNI)
 
@@ -672,10 +620,12 @@
         #define AESNI_ALIGN 16
     #endif
 
-    #ifndef _MSC_VER
+    #ifdef _MSC_VER
+        #define XASM_LINK(f)
+    #elif defined(__APPLE__)
+        #define XASM_LINK(f) asm("_" f)
+    #else
         #define XASM_LINK(f) asm(f)
-    #else
-        #define XASM_LINK(f)
     #endif /* _MSC_VER */
 
     static int checkAESNI = 0;
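
The reordered XASM_LINK block adds an Apple case: Mach-O prepends an
underscore to C symbol names, so "_f" is the label the assembler emits. The
macro decorates prototypes of the external assembly routines; a sketch of
its use, with the prototype shape inferred from the AES_ECB_encrypt calls
later in this file:

    /* MSVC links by name, ELF uses "f", Mach-O needs "_f" */
    void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
                         unsigned long length, const unsigned char* KS,
                         int nr) XASM_LINK("AES_ECB_encrypt");
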
@@ -811,15 +761,164 @@
         }
     #endif /* HAVE_AES_DECRYPT */
 
-#elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
+#elif (defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)) || \
+      ((defined(WOLFSSL_AFALG) || defined(WOLFSSL_DEVCRYPTO_AES)) && \
+        defined(HAVE_AESCCM))
         static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
         {
             wc_AesEncryptDirect(aes, outBlock, inBlock);
             return 0;
         }
+
+#elif defined(WOLFSSL_AFALG)
+#elif defined(WOLFSSL_DEVCRYPTO_AES)
+
+#elif defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES)
+    #include "hal_data.h"
+
+    #ifndef WOLFSSL_SCE_AES256_HANDLE
+        #define WOLFSSL_SCE_AES256_HANDLE g_sce_aes_256
+    #endif
+
+    #ifndef WOLFSSL_SCE_AES192_HANDLE
+        #define WOLFSSL_SCE_AES192_HANDLE g_sce_aes_192
+    #endif
+
+    #ifndef WOLFSSL_SCE_AES128_HANDLE
+        #define WOLFSSL_SCE_AES128_HANDLE g_sce_aes_128
+    #endif
+
+    static int AES_ECB_encrypt(Aes* aes, const byte* inBlock, byte* outBlock,
+            int sz)
+    {
+        uint32_t ret;
+
+        if (WOLFSSL_SCE_GSCE_HANDLE.p_cfg->endian_flag ==
+                CRYPTO_WORD_ENDIAN_BIG) {
+            ByteReverseWords((word32*)inBlock, (word32*)inBlock, sz);
+        }
+
+        switch (aes->keylen) {
+        #ifdef WOLFSSL_AES_128
+            case AES_128_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES128_HANDLE.p_api->encrypt(
+                        WOLFSSL_SCE_AES128_HANDLE.p_ctrl, aes->key,
+                        NULL, (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+        #ifdef WOLFSSL_AES_192
+            case AES_192_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES192_HANDLE.p_api->encrypt(
+                        WOLFSSL_SCE_AES192_HANDLE.p_ctrl, aes->key,
+                        NULL, (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+        #ifdef WOLFSSL_AES_256
+            case AES_256_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES256_HANDLE.p_api->encrypt(
+                        WOLFSSL_SCE_AES256_HANDLE.p_ctrl, aes->key,
+                        NULL, (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+            default:
+                WOLFSSL_MSG("Unknown key size");
+                return BAD_FUNC_ARG;
+        }
+
+        if (ret != SSP_SUCCESS) {
+           /* revert input */
+            ByteReverseWords((word32*)inBlock, (word32*)inBlock, sz);
+            return WC_HW_E;
+        }
+
+        if (WOLFSSL_SCE_GSCE_HANDLE.p_cfg->endian_flag ==
+                CRYPTO_WORD_ENDIAN_BIG) {
+            ByteReverseWords((word32*)outBlock, (word32*)outBlock, sz);
+            if (inBlock != outBlock) {
+                /* revert input */
+                ByteReverseWords((word32*)inBlock, (word32*)inBlock, sz);
+            }
+        }
+        return 0;
+    }
+
+    #if defined(HAVE_AES_DECRYPT)
+    static int AES_ECB_decrypt(Aes* aes, const byte* inBlock, byte* outBlock,
+            int sz)
+    {
+        uint32_t ret;
+
+        if (WOLFSSL_SCE_GSCE_HANDLE.p_cfg->endian_flag ==
+                CRYPTO_WORD_ENDIAN_BIG) {
+            ByteReverseWords((word32*)inBlock, (word32*)inBlock, sz);
+        }
+
+        switch (aes->keylen) {
+        #ifdef WOLFSSL_AES_128
+            case AES_128_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES128_HANDLE.p_api->decrypt(
+                        WOLFSSL_SCE_AES128_HANDLE.p_ctrl, aes->key, aes->reg,
+                        (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+        #ifdef WOLFSSL_AES_192
+            case AES_192_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES192_HANDLE.p_api->decrypt(
+                        WOLFSSL_SCE_AES192_HANDLE.p_ctrl, aes->key, aes->reg,
+                        (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+        #ifdef WOLFSSL_AES_256
+            case AES_256_KEY_SIZE:
+                ret = WOLFSSL_SCE_AES256_HANDLE.p_api->decrypt(
+                        WOLFSSL_SCE_AES256_HANDLE.p_ctrl, aes->key, aes->reg,
+                        (sz / sizeof(word32)), (word32*)inBlock,
+                        (word32*)outBlock);
+                break;
+        #endif
+            default:
+                WOLFSSL_MSG("Unknown key size");
+                return BAD_FUNC_ARG;
+        }
+        if (ret != SSP_SUCCESS) {
+            return WC_HW_E;
+        }
+
+        if (WOLFSSL_SCE_GSCE_HANDLE.p_cfg->endian_flag ==
+                CRYPTO_WORD_ENDIAN_BIG) {
+            ByteReverseWords((word32*)outBlock, (word32*)outBlock, sz);
+            if (inBlock != outBlock) {
+                /* revert input */
+                ByteReverseWords((word32*)inBlock, (word32*)inBlock, sz);
+            }
+        }
+
+        return 0;
+    }
+
+    #endif
+
+    #if defined(HAVE_AESGCM) || defined(WOLFSSL_AES_DIRECT)
+    static int wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+        return AES_ECB_encrypt(aes, inBlock, outBlock, AES_BLOCK_SIZE);
+    }
+    #endif
+
+    #if defined(HAVE_AES_DECRYPT) && defined(WOLFSSL_AES_DIRECT)
+    static int wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
+    {
+        return AES_ECB_decrypt(aes, inBlock, outBlock, AES_BLOCK_SIZE);
+    }
+    #endif
 #else
 
-    /* using wolfCrypt software AES implementation */
+    /* using wolfCrypt software implementation */
     #define NEED_AES_TABLES
 #endif
 
@@ -834,6 +933,7 @@
     /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
 };
 
+#ifndef WOLFSSL_AES_SMALL_TABLES
 static const word32 Te[4][256] = {
 {
     0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
@@ -1369,8 +1469,12 @@
     0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U,
 }
 };
-
-
+#endif /* HAVE_AES_DECRYPT */
+#endif
+
+#ifdef HAVE_AES_DECRYPT
+#if (defined(HAVE_AES_CBC) && !defined(WOLFSSL_DEVCRYPTO_CBC)) \
+			|| defined(WOLFSSL_AES_DIRECT)
 static const byte Td4[256] =
 {
     0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
@@ -1406,11 +1510,67 @@
     0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
     0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU,
 };
+#endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT */
 #endif /* HAVE_AES_DECRYPT */
 
 #define GETBYTE(x, y) (word32)((byte)((x) >> (8 * (y))))
 
-
+#ifdef WOLFSSL_AES_SMALL_TABLES
+static const byte Tsbox[256] = {
+    0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
+    0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
+    0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
+    0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
+    0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
+    0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
+    0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
+    0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
+    0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
+    0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
+    0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
+    0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
+    0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
+    0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
+    0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
+    0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
+    0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
+    0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
+    0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
+    0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
+    0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
+    0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
+    0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
+    0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
+    0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
+    0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
+    0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
+    0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
+    0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
+    0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
+    0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
+    0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
+};
+
+#define AES_XTIME(x)    ((byte)((byte)((x) << 1) ^ ((0 - ((x) >> 7)) & 0x1b)))
+
+static word32 col_mul(word32 t, int i2, int i3, int ia, int ib)
+{
+    byte t3 = GETBYTE(t, i3);
+    byte tm = AES_XTIME(GETBYTE(t, i2) ^ t3);
+
+    return GETBYTE(t, ia) ^ GETBYTE(t, ib) ^ t3 ^ tm;
+}
+
+static word32 inv_col_mul(word32 t, int i9, int ib, int id, int ie)
+{
+    byte t9 = GETBYTE(t, i9);
+    byte tb = GETBYTE(t, ib);
+    byte td = GETBYTE(t, id);
+    byte te = GETBYTE(t, ie);
+    byte t0 = t9 ^ tb ^ td;
+    return t0 ^ AES_XTIME(AES_XTIME(AES_XTIME(t0 ^ te) ^ td ^ te) ^ tb ^ te);
+}
+#endif
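
AES_XTIME is doubling in GF(2^8) modulo the AES polynomial 0x11b, and
col_mul/inv_col_mul derive the (Inv)MixColumns multiples from it so the
small-tables build needs only the 256-byte S-box. A quick sanity check
against the FIPS-197 worked values (editor's test scaffolding only,
requires <assert.h>):

    assert(AES_XTIME(0x01) == 0x02);             /* no reduction needed     */
    assert(AES_XTIME(0x80) == 0x1b);             /* 0x100 reduced mod 0x11b */
    assert(AES_XTIME(0x57) == 0xae);             /* FIPS-197: {57}*{02}     */
    assert(AES_XTIME(AES_XTIME(0x57)) == 0x47);  /* FIPS-197: {57}*{04}     */
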
 
 #if defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT) || defined(HAVE_AESGCM)
 
@@ -1425,6 +1585,7 @@
 #endif
 
 
+#ifndef WOLFSSL_AES_SMALL_TABLES
 /* load 4 Te Tables into cache by cache line stride */
 static WC_INLINE word32 PreFetchTe(void)
 {
@@ -1439,8 +1600,21 @@
     }
     return x;
 }
-
-
+#else
+/* load sbox into cache by cache line stride */
+static WC_INLINE word32 PreFetchSBox(void)
+{
+    word32 x = 0;
+    int i;
+
+    for (i = 0; i < 256; i += WC_CACHE_LINE_SZ/4) {
+        x &= Tsbox[i];
+    }
+    return x;
+}
+#endif
+
+/* Software AES - ECB Encrypt */
 static void wc_AesEncrypt(Aes* aes, const byte* inBlock, byte* outBlock)
 {
     word32 s0, s1, s2, s3;
@@ -1450,7 +1624,7 @@
 
     if (r > 7 || r == 0) {
         WOLFSSL_MSG("AesEncrypt encountered improper key, set it up");
-        return;  /* stop instead of segfaulting, set up your keys! */
+        return;  /* stop instead of seg-faulting, set up your keys! */
     }
 
 #ifdef WOLFSSL_AESNI
@@ -1475,8 +1649,8 @@
             tmp_align = tmp + (AESNI_ALIGN - ((size_t)tmp % AESNI_ALIGN));
 
             XMEMCPY(tmp_align, inBlock, AES_BLOCK_SIZE);
-            AES_ECB_encrypt(tmp_align, tmp_align, AES_BLOCK_SIZE, (byte*)aes->key,
-                            aes->rounds);
+            AES_ECB_encrypt(tmp_align, tmp_align, AES_BLOCK_SIZE,
+                    (byte*)aes->key, aes->rounds);
             XMEMCPY(outBlock, tmp_align, AES_BLOCK_SIZE);
             XFREE(tmp, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
             return;
@@ -1497,6 +1671,10 @@
         #endif
     }
 #endif
+#if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES)
+    AES_ECB_encrypt(aes, inBlock, outBlock, AES_BLOCK_SIZE);
+    return;
+#endif
 
     /*
      * map byte array block to cipher state
@@ -1514,11 +1692,13 @@
     s3 = ByteReverseWord32(s3);
 #endif
 
+    /* AddRoundKey */
     s0 ^= rk[0];
     s1 ^= rk[1];
     s2 ^= rk[2];
     s3 ^= rk[3];
 
+#ifndef WOLFSSL_AES_SMALL_TABLES
     s0 |= PreFetchTe();
 
     /*
@@ -1527,28 +1707,28 @@
 
     for (;;) {
         t0 =
-            Te[0][GETBYTE(s0, 3)]  ^
-            Te[1][GETBYTE(s1, 2)]  ^
-            Te[2][GETBYTE(s2, 1)]  ^
-            Te[3][GETBYTE(s3, 0)]  ^
+            Te[0][GETBYTE(s0, 3)] ^
+            Te[1][GETBYTE(s1, 2)] ^
+            Te[2][GETBYTE(s2, 1)] ^
+            Te[3][GETBYTE(s3, 0)] ^
             rk[4];
         t1 =
-            Te[0][GETBYTE(s1, 3)]  ^
-            Te[1][GETBYTE(s2, 2)]  ^
-            Te[2][GETBYTE(s3, 1)]  ^
-            Te[3][GETBYTE(s0, 0)]  ^
+            Te[0][GETBYTE(s1, 3)] ^
+            Te[1][GETBYTE(s2, 2)] ^
+            Te[2][GETBYTE(s3, 1)] ^
+            Te[3][GETBYTE(s0, 0)] ^
             rk[5];
         t2 =
             Te[0][GETBYTE(s2, 3)] ^
-            Te[1][GETBYTE(s3, 2)]  ^
-            Te[2][GETBYTE(s0, 1)]  ^
-            Te[3][GETBYTE(s1, 0)]  ^
+            Te[1][GETBYTE(s3, 2)] ^
+            Te[2][GETBYTE(s0, 1)] ^
+            Te[3][GETBYTE(s1, 0)] ^
             rk[6];
         t3 =
             Te[0][GETBYTE(s3, 3)] ^
-            Te[1][GETBYTE(s0, 2)]  ^
-            Te[2][GETBYTE(s1, 1)]  ^
-            Te[3][GETBYTE(s2, 0)]  ^
+            Te[1][GETBYTE(s0, 2)] ^
+            Te[2][GETBYTE(s1, 1)] ^
+            Te[3][GETBYTE(s2, 0)] ^
             rk[7];
 
         rk += 8;
@@ -1611,6 +1791,84 @@
         (Te[0][GETBYTE(t1, 1)] & 0x0000ff00) ^
         (Te[1][GETBYTE(t2, 0)] & 0x000000ff) ^
         rk[3];
+#else
+    s0 |= PreFetchSBox();
+
+    r *= 2;
+    /* Two rounds at a time */
+    for (rk += 4; r > 1; r--, rk += 4) {
+        t0 =
+            ((word32)Tsbox[GETBYTE(s0, 3)] << 24) ^
+            ((word32)Tsbox[GETBYTE(s1, 2)] << 16) ^
+            ((word32)Tsbox[GETBYTE(s2, 1)] <<  8) ^
+            ((word32)Tsbox[GETBYTE(s3, 0)]);
+        t1 =
+            ((word32)Tsbox[GETBYTE(s1, 3)] << 24) ^
+            ((word32)Tsbox[GETBYTE(s2, 2)] << 16) ^
+            ((word32)Tsbox[GETBYTE(s3, 1)] <<  8) ^
+            ((word32)Tsbox[GETBYTE(s0, 0)]);
+        t2 =
+            ((word32)Tsbox[GETBYTE(s2, 3)] << 24) ^
+            ((word32)Tsbox[GETBYTE(s3, 2)] << 16) ^
+            ((word32)Tsbox[GETBYTE(s0, 1)] <<  8) ^
+            ((word32)Tsbox[GETBYTE(s1, 0)]);
+        t3 =
+            ((word32)Tsbox[GETBYTE(s3, 3)] << 24) ^
+            ((word32)Tsbox[GETBYTE(s0, 2)] << 16) ^
+            ((word32)Tsbox[GETBYTE(s1, 1)] <<  8) ^
+            ((word32)Tsbox[GETBYTE(s2, 0)]);
+
+        s0 =
+            (col_mul(t0, 3, 2, 0, 1) << 24) ^
+            (col_mul(t0, 2, 1, 0, 3) << 16) ^
+            (col_mul(t0, 1, 0, 2, 3) <<  8) ^
+            (col_mul(t0, 0, 3, 2, 1)      ) ^
+            rk[0];
+        s1 =
+            (col_mul(t1, 3, 2, 0, 1) << 24) ^
+            (col_mul(t1, 2, 1, 0, 3) << 16) ^
+            (col_mul(t1, 1, 0, 2, 3) <<  8) ^
+            (col_mul(t1, 0, 3, 2, 1)      ) ^
+            rk[1];
+        s2 =
+            (col_mul(t2, 3, 2, 0, 1) << 24) ^
+            (col_mul(t2, 2, 1, 0, 3) << 16) ^
+            (col_mul(t2, 1, 0, 2, 3) <<  8) ^
+            (col_mul(t2, 0, 3, 2, 1)      ) ^
+            rk[2];
+        s3 =
+            (col_mul(t3, 3, 2, 0, 1) << 24) ^
+            (col_mul(t3, 2, 1, 0, 3) << 16) ^
+            (col_mul(t3, 1, 0, 2, 3) <<  8) ^
+            (col_mul(t3, 0, 3, 2, 1)      ) ^
+            rk[3];
+    }
+
+    t0 =
+        ((word32)Tsbox[GETBYTE(s0, 3)] << 24) ^
+        ((word32)Tsbox[GETBYTE(s1, 2)] << 16) ^
+        ((word32)Tsbox[GETBYTE(s2, 1)] <<  8) ^
+        ((word32)Tsbox[GETBYTE(s3, 0)]);
+    t1 =
+        ((word32)Tsbox[GETBYTE(s1, 3)] << 24) ^
+        ((word32)Tsbox[GETBYTE(s2, 2)] << 16) ^
+        ((word32)Tsbox[GETBYTE(s3, 1)] <<  8) ^
+        ((word32)Tsbox[GETBYTE(s0, 0)]);
+    t2 =
+        ((word32)Tsbox[GETBYTE(s2, 3)] << 24) ^
+        ((word32)Tsbox[GETBYTE(s3, 2)] << 16) ^
+        ((word32)Tsbox[GETBYTE(s0, 1)] <<  8) ^
+        ((word32)Tsbox[GETBYTE(s1, 0)]);
+    t3 =
+        ((word32)Tsbox[GETBYTE(s3, 3)] << 24) ^
+        ((word32)Tsbox[GETBYTE(s0, 2)] << 16) ^
+        ((word32)Tsbox[GETBYTE(s1, 1)] <<  8) ^
+        ((word32)Tsbox[GETBYTE(s2, 0)]);
+    s0 = t0 ^ rk[0];
+    s1 = t1 ^ rk[1];
+    s2 = t2 ^ rk[2];
+    s3 = t3 ^ rk[3];
+#endif
 
     /* write out */
 #ifdef LITTLE_ENDIAN_ORDER
@@ -1629,8 +1887,10 @@
 #endif /* HAVE_AES_CBC || WOLFSSL_AES_DIRECT || HAVE_AESGCM */
 
 #if defined(HAVE_AES_DECRYPT)
-#if defined(HAVE_AES_CBC) || defined(WOLFSSL_AES_DIRECT)
-
+#if (defined(HAVE_AES_CBC) && !defined(WOLFSSL_DEVCRYPTO_CBC)) || \
+     defined(WOLFSSL_AES_DIRECT)
+
+#ifndef WOLFSSL_AES_SMALL_TABLES
 /* load 4 Td Tables into cache by cache line stride */
 static WC_INLINE word32 PreFetchTd(void)
 {
@@ -1645,6 +1905,7 @@
     }
     return x;
 }
+#endif
 
 /* load Td Table4 into cache by cache line stride */
 static WC_INLINE word32 PreFetchTd4(void)
@@ -1658,6 +1919,7 @@
     return x;
 }
 
+/* Software AES - ECB Decrypt */
 static void wc_AesDecrypt(Aes* aes, const byte* inBlock, byte* outBlock)
 {
     word32 s0, s1, s2, s3;
@@ -1667,7 +1929,7 @@
     const word32* rk = aes->key;
     if (r > 7 || r == 0) {
         WOLFSSL_MSG("AesDecrypt encountered improper key, set it up");
-        return;  /* stop instead of segfaulting, set up your keys! */
+        return;  /* stop instead of seg-faulting, set up your keys! */
     }
 #ifdef WOLFSSL_AESNI
     if (haveAESNI && aes->use_aesni) {
@@ -1681,7 +1943,8 @@
         #endif
 
         /* if input and output same will overwrite input iv */
-        XMEMCPY(aes->tmp, inBlock, AES_BLOCK_SIZE);
+        if ((const byte*)aes->tmp != inBlock)
+            XMEMCPY(aes->tmp, inBlock, AES_BLOCK_SIZE);
         AES_ECB_decrypt(inBlock, outBlock, AES_BLOCK_SIZE, (byte*)aes->key,
                         aes->rounds);
         return;
@@ -1692,6 +1955,9 @@
         #endif
     }
 #endif /* WOLFSSL_AESNI */
+#if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES)
+    return AES_ECB_decrypt(aes, inBlock, outBlock, AES_BLOCK_SIZE);
+#endif
 
     /*
      * map byte array block to cipher state
@@ -1714,6 +1980,7 @@
     s2 ^= rk[2];
     s3 ^= rk[3];
 
+#ifndef WOLFSSL_AES_SMALL_TABLES
     s0 |= PreFetchTd();
 
     /*
@@ -1807,6 +2074,83 @@
         ((word32)Td4[GETBYTE(t1, 1)] <<  8) ^
         ((word32)Td4[GETBYTE(t0, 0)]) ^
         rk[3];
+#else
+    s0 |= PreFetchTd4();
+
+    r *= 2;
+    for (rk += 4; r > 1; r--, rk += 4) {
+        t0 =
+            ((word32)Td4[GETBYTE(s0, 3)] << 24) ^
+            ((word32)Td4[GETBYTE(s3, 2)] << 16) ^
+            ((word32)Td4[GETBYTE(s2, 1)] <<  8) ^
+            ((word32)Td4[GETBYTE(s1, 0)]) ^
+            rk[0];
+        t1 =
+            ((word32)Td4[GETBYTE(s1, 3)] << 24) ^
+            ((word32)Td4[GETBYTE(s0, 2)] << 16) ^
+            ((word32)Td4[GETBYTE(s3, 1)] <<  8) ^
+            ((word32)Td4[GETBYTE(s2, 0)]) ^
+            rk[1];
+        t2 =
+            ((word32)Td4[GETBYTE(s2, 3)] << 24) ^
+            ((word32)Td4[GETBYTE(s1, 2)] << 16) ^
+            ((word32)Td4[GETBYTE(s0, 1)] <<  8) ^
+            ((word32)Td4[GETBYTE(s3, 0)]) ^
+            rk[2];
+        t3 =
+            ((word32)Td4[GETBYTE(s3, 3)] << 24) ^
+            ((word32)Td4[GETBYTE(s2, 2)] << 16) ^
+            ((word32)Td4[GETBYTE(s1, 1)] <<  8) ^
+            ((word32)Td4[GETBYTE(s0, 0)]) ^
+            rk[3];
+
+        s0 =
+            (inv_col_mul(t0, 0, 2, 1, 3) << 24) ^
+            (inv_col_mul(t0, 3, 1, 0, 2) << 16) ^
+            (inv_col_mul(t0, 2, 0, 3, 1) <<  8) ^
+            (inv_col_mul(t0, 1, 3, 2, 0)      );
+        s1 =
+            (inv_col_mul(t1, 0, 2, 1, 3) << 24) ^
+            (inv_col_mul(t1, 3, 1, 0, 2) << 16) ^
+            (inv_col_mul(t1, 2, 0, 3, 1) <<  8) ^
+            (inv_col_mul(t1, 1, 3, 2, 0)      );
+        s2 =
+            (inv_col_mul(t2, 0, 2, 1, 3) << 24) ^
+            (inv_col_mul(t2, 3, 1, 0, 2) << 16) ^
+            (inv_col_mul(t2, 2, 0, 3, 1) <<  8) ^
+            (inv_col_mul(t2, 1, 3, 2, 0)      );
+        s3 =
+            (inv_col_mul(t3, 0, 2, 1, 3) << 24) ^
+            (inv_col_mul(t3, 3, 1, 0, 2) << 16) ^
+            (inv_col_mul(t3, 2, 0, 3, 1) <<  8) ^
+            (inv_col_mul(t3, 1, 3, 2, 0)      );
+    }
+
+    t0 =
+        ((word32)Td4[GETBYTE(s0, 3)] << 24) ^
+        ((word32)Td4[GETBYTE(s3, 2)] << 16) ^
+        ((word32)Td4[GETBYTE(s2, 1)] <<  8) ^
+        ((word32)Td4[GETBYTE(s1, 0)]);
+    t1 =
+        ((word32)Td4[GETBYTE(s1, 3)] << 24) ^
+        ((word32)Td4[GETBYTE(s0, 2)] << 16) ^
+        ((word32)Td4[GETBYTE(s3, 1)] <<  8) ^
+        ((word32)Td4[GETBYTE(s2, 0)]);
+    t2 =
+        ((word32)Td4[GETBYTE(s2, 3)] << 24) ^
+        ((word32)Td4[GETBYTE(s1, 2)] << 16) ^
+        ((word32)Td4[GETBYTE(s0, 1)] <<  8) ^
+        ((word32)Td4[GETBYTE(s3, 0)]);
+    t3 =
+        ((word32)Td4[GETBYTE(s3, 3)] << 24) ^
+        ((word32)Td4[GETBYTE(s2, 2)] << 16) ^
+        ((word32)Td4[GETBYTE(s1, 1)] <<  8) ^
+        ((word32)Td4[GETBYTE(s0, 0)]);
+    s0 = t0 ^ rk[0];
+    s1 = t1 ^ rk[1];
+    s2 = t2 ^ rk[2];
+    s3 = t3 ^ rk[3];
+#endif
 
     /* write out */
 #ifdef LITTLE_ENDIAN_ORDER
@@ -1834,20 +2178,27 @@
     int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
             const byte* iv, int dir)
     {
-        word32 *rk = aes->key;
+        word32 *rk;
 
         (void)dir;
 
-        if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
+        if (aes == NULL || (keylen != 16 &&
+        #ifdef WOLFSSL_AES_192
+            keylen != 24 &&
+        #endif
+            keylen != 32)) {
             return BAD_FUNC_ARG;
-
+        }
+
+        rk = aes->key;
         aes->keylen = keylen;
         aes->rounds = keylen/4 + 6;
         XMEMCPY(rk, userKey, keylen);
-    #ifndef WOLFSSL_STM32_CUBEMX
+    #if !defined(WOLFSSL_STM32_CUBEMX) || defined(STM32_HAL_V2)
         ByteReverseWords(rk, rk, keylen);
     #endif
-    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(WOLFSSL_AES_OFB)
         aes->left = 0;
     #endif
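
The aes->rounds = keylen/4 + 6 assignment above is FIPS-197's Nr = Nk + 6
with Nk = keylen/4 key words; a worked form for the three legal lengths:

    static word32 aes_rounds(word32 keylen)   /* FIPS-197: Nr = Nk + 6 */
    {
        return keylen/4 + 6;   /* 16 -> 10, 24 -> 12, 32 -> 14 */
    }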
 
@@ -1920,7 +2271,8 @@
         if (iv)
             XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE);
 
-    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(WOLFSSL_AES_OFB)
         aes->left = 0;
     #endif
 
@@ -1930,13 +2282,14 @@
     int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, const byte* iv,
                   int dir)
     {
-        if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
+        if (aes == NULL || !((keylen == 16) || (keylen == 24) || (keylen == 32)))
             return BAD_FUNC_ARG;
 
         aes->rounds = keylen/4 + 6;
         XMEMCPY(aes->key, userKey, keylen);
 
-    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(WOLFSSL_AES_OFB)
         aes->left = 0;
     #endif
 
@@ -1953,34 +2306,65 @@
         const byte* iv, int dir)
     {
         int ret;
-        byte *rk = (byte*)aes->key;
+        byte* rk;
+        byte* tmpKey = (byte*)userKey;
+        int tmpKeyDynamic = 0;
+        word32 alignOffset = 0;
 
         (void)dir;
 
         if (!((keylen == 16) || (keylen == 24) || (keylen == 32)))
             return BAD_FUNC_ARG;
-
+        if (aes == NULL)
+            return BAD_FUNC_ARG;
+
+        rk = (byte*)aes->key;
         if (rk == NULL)
             return BAD_FUNC_ARG;
 
-    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(WOLFSSL_AES_OFB)
         aes->left = 0;
     #endif
 
         aes->rounds = keylen/4 + 6;
 
+    #ifdef FREESCALE_MMCAU_CLASSIC
+        if ((wolfssl_word)userKey % WOLFSSL_MMCAU_ALIGNMENT) {
+        #ifndef NO_WOLFSSL_ALLOC_ALIGN
+            byte* tmp = (byte*)XMALLOC(keylen + WOLFSSL_MMCAU_ALIGNMENT,
+                                       aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+            if (tmp == NULL) {
+                return MEMORY_E;
+            }
+            alignOffset = WOLFSSL_MMCAU_ALIGNMENT -
+                          ((wolfssl_word)tmp % WOLFSSL_MMCAU_ALIGNMENT);
+            tmpKey = tmp + alignOffset;
+            XMEMCPY(tmpKey, userKey, keylen);
+            tmpKeyDynamic = 1;
+        #else
+            WOLFSSL_MSG("Bad cau_aes_set_key alignment");
+            return BAD_ALIGN_E;
+        #endif
+        }
+    #endif
+
         ret = wolfSSL_CryptHwMutexLock();
         if(ret == 0) {
         #ifdef FREESCALE_MMCAU_CLASSIC
-            cau_aes_set_key(userKey, keylen*8, rk);
+            cau_aes_set_key(tmpKey, keylen*8, rk);
         #else
-            MMCAU_AES_SetKey(userKey, keylen, rk);
+            MMCAU_AES_SetKey(tmpKey, keylen, rk);
         #endif
             wolfSSL_CryptHwMutexUnLock();
 
             ret = wc_AesSetIV(aes, iv);
         }
 
+        if (tmpKeyDynamic == 1) {
+            XFREE(tmpKey - alignOffset, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+        }
+
         return ret;
     }
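
The new FREESCALE_MMCAU_CLASSIC path bounces a misaligned key through an
aligned scratch buffer using the usual align-up arithmetic. A worked
instance, assuming WOLFSSL_MMCAU_ALIGNMENT of 4 and a hypothetical XMALLOC
result of 0x2001:

    /* alignOffset = 4 - (0x2001 % 4) = 3, so tmpKey = 0x2004 (aligned).
     * An already-aligned tmp yields alignOffset 4, not 0, which still fits
     * inside the keylen + WOLFSSL_MMCAU_ALIGNMENT allocation, and
     * XFREE(tmpKey - alignOffset, ...) above recovers the malloc'd base. */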
 
@@ -1999,14 +2383,15 @@
         (void)dir;
         (void)iv;
 
-        if (keylen != 16)
+        if (aes == NULL || keylen != 16)
             return BAD_FUNC_ARG;
 
         aes->keylen = keylen;
         aes->rounds = keylen/4 + 6;
         ret = nrf51_aes_set_key(userKey);
 
-    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+    #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+        defined(WOLFSSL_AES_OFB)
         aes->left = 0;
     #endif
 
@@ -2018,11 +2403,122 @@
     {
         return wc_AesSetKey(aes, userKey, keylen, iv, dir);
     }
+#elif defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+    !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+
+    int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen,
+        const byte* iv, int dir)
+    {
+        (void)dir;
+        (void)iv;
+
+        if (aes == NULL || (keylen != 16 && keylen != 24 && keylen != 32)) {
+            return BAD_FUNC_ARG;
+        }
+
+        aes->keylen = keylen;
+        aes->rounds = keylen/4 + 6;
+
+        XMEMCPY(aes->key, userKey, keylen);
+        #if defined(WOLFSSL_AES_COUNTER)
+            aes->left = 0;
+        #endif
+        return wc_AesSetIV(aes, iv);
+    }
+
+    int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+                        const byte* iv, int dir)
+    {
+        return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+    }
+#elif defined(WOLFSSL_CRYPTOCELL) && defined(WOLFSSL_CRYPTOCELL_AES)
+
+    int wc_AesSetKey(Aes* aes, const byte* userKey, word32 keylen, const byte* iv,
+                    int dir)
+    {
+        SaSiError_t ret = SASI_OK;
+        SaSiAesIv_t iv_aes;
+
+        if (aes == NULL ||
+           (keylen != AES_128_KEY_SIZE &&
+            keylen != AES_192_KEY_SIZE &&
+            keylen != AES_256_KEY_SIZE)) {
+            return BAD_FUNC_ARG;
+        }
+    #if defined(AES_MAX_KEY_SIZE)
+        if (keylen > (AES_MAX_KEY_SIZE/8)) {
+            return BAD_FUNC_ARG;
+        }
+    #endif
+        if (dir != AES_ENCRYPTION &&
+            dir != AES_DECRYPTION) {
+            return BAD_FUNC_ARG;
+        }
+
+        if (dir == AES_ENCRYPTION) {
+            aes->ctx.mode = SASI_AES_ENCRYPT;
+            SaSi_AesInit(&aes->ctx.user_ctx,
+                         SASI_AES_ENCRYPT,
+                         SASI_AES_MODE_CBC,
+                         SASI_AES_PADDING_NONE);
+        }
+        else {
+            aes->ctx.mode = SASI_AES_DECRYPT;
+            SaSi_AesInit(&aes->ctx.user_ctx,
+                         SASI_AES_DECRYPT,
+                         SASI_AES_MODE_CBC,
+                         SASI_AES_PADDING_NONE);
+        }
+
+        aes->keylen = keylen;
+        aes->rounds = keylen/4 + 6;
+        XMEMCPY(aes->key, userKey, keylen);
+
+        aes->ctx.key.pKey = (uint8_t*)aes->key;
+        aes->ctx.key.keySize= keylen;
+
+        ret = SaSi_AesSetKey(&aes->ctx.user_ctx,
+                             SASI_AES_USER_KEY,
+                             &aes->ctx.key,
+                             sizeof(aes->ctx.key));
+        if (ret != SASI_OK) {
+            return BAD_FUNC_ARG;
+        }
+
+        ret = wc_AesSetIV(aes, iv);
+
+        if (iv)
+            XMEMCPY(iv_aes, iv, AES_BLOCK_SIZE);
+        else
+            XMEMSET(iv_aes,  0, AES_BLOCK_SIZE);
+
+
+        ret = SaSi_AesSetIv(&aes->ctx.user_ctx, iv_aes);
+        if (ret != SASI_OK) {
+            return ret;
+        }
+       return ret;
+    }
+    #if defined(WOLFSSL_AES_DIRECT)
+        int wc_AesSetKeyDirect(Aes* aes, const byte* userKey, word32 keylen,
+                            const byte* iv, int dir)
+        {
+            return wc_AesSetKey(aes, userKey, keylen, iv, dir);
+        }
+    #endif
 
 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
       /* implemented in wolfcrypt/src/port/caam/caam_aes.c */
 
+#elif defined(WOLFSSL_AFALG)
+    /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+#elif defined(WOLFSSL_DEVCRYPTO_AES)
+    /* implemented in wolfcrypt/src/port/devcrypto/devcrypto_aes.c */
+
 #else
+
+    /* Software AES - SetKey */
     static int wc_AesSetKeyLocal(Aes* aes, const byte* userKey, word32 keylen,
                 const byte* iv, int dir)
     {
@@ -2035,7 +2531,8 @@
         #ifdef WOLFSSL_AESNI
             aes->use_aesni = 0;
         #endif /* WOLFSSL_AESNI */
-        #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER)
+        #if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_COUNTER) || \
+            defined(WOLFSSL_AES_OFB)
             aes->left = 0;
         #endif
 
@@ -2043,12 +2540,13 @@
         aes->rounds = (keylen/4) + 6;
 
         XMEMCPY(rk, userKey, keylen);
-    #if defined(LITTLE_ENDIAN_ORDER) && !defined(WOLFSSL_PIC32MZ_CRYPT)
+    #if defined(LITTLE_ENDIAN_ORDER) && !defined(WOLFSSL_PIC32MZ_CRYPT) && \
+        (!defined(WOLFSSL_ESP32WROOM32_CRYPT) || \
+          defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES))
         ByteReverseWords(rk, rk, keylen);
     #endif
 
 #ifdef NEED_AES_TABLES
-
         switch (keylen) {
     #if defined(AES_MAX_KEY_SIZE) && AES_MAX_KEY_SIZE >= 128 && \
             defined(WOLFSSL_AES_128)
@@ -2057,10 +2555,17 @@
             {
                 temp  = rk[3];
                 rk[4] = rk[0] ^
+            #ifndef WOLFSSL_AES_SMALL_TABLES
                     (Te[2][GETBYTE(temp, 2)] & 0xff000000) ^
                     (Te[3][GETBYTE(temp, 1)] & 0x00ff0000) ^
                     (Te[0][GETBYTE(temp, 0)] & 0x0000ff00) ^
                     (Te[1][GETBYTE(temp, 3)] & 0x000000ff) ^
+            #else
+                    ((word32)Tsbox[GETBYTE(temp, 2)] << 24) ^
+                    ((word32)Tsbox[GETBYTE(temp, 1)] << 16) ^
+                    ((word32)Tsbox[GETBYTE(temp, 0)] <<  8) ^
+                    ((word32)Tsbox[GETBYTE(temp, 3)]) ^
+            #endif
                     rcon[i];
                 rk[5] = rk[1] ^ rk[4];
                 rk[6] = rk[2] ^ rk[5];
@@ -2080,10 +2585,17 @@
             {
                 temp = rk[ 5];
                 rk[ 6] = rk[ 0] ^
+            #ifndef WOLFSSL_AES_SMALL_TABLES
                     (Te[2][GETBYTE(temp, 2)] & 0xff000000) ^
                     (Te[3][GETBYTE(temp, 1)] & 0x00ff0000) ^
                     (Te[0][GETBYTE(temp, 0)] & 0x0000ff00) ^
                     (Te[1][GETBYTE(temp, 3)] & 0x000000ff) ^
+            #else
+                    ((word32)Tsbox[GETBYTE(temp, 2)] << 24) ^
+                    ((word32)Tsbox[GETBYTE(temp, 1)] << 16) ^
+                    ((word32)Tsbox[GETBYTE(temp, 0)] <<  8) ^
+                    ((word32)Tsbox[GETBYTE(temp, 3)]) ^
+            #endif
                     rcon[i];
                 rk[ 7] = rk[ 1] ^ rk[ 6];
                 rk[ 8] = rk[ 2] ^ rk[ 7];
@@ -2104,10 +2616,17 @@
             {
                 temp = rk[ 7];
                 rk[ 8] = rk[ 0] ^
+            #ifndef WOLFSSL_AES_SMALL_TABLES
                     (Te[2][GETBYTE(temp, 2)] & 0xff000000) ^
                     (Te[3][GETBYTE(temp, 1)] & 0x00ff0000) ^
                     (Te[0][GETBYTE(temp, 0)] & 0x0000ff00) ^
                     (Te[1][GETBYTE(temp, 3)] & 0x000000ff) ^
+            #else
+                    ((word32)Tsbox[GETBYTE(temp, 2)] << 24) ^
+                    ((word32)Tsbox[GETBYTE(temp, 1)] << 16) ^
+                    ((word32)Tsbox[GETBYTE(temp, 0)] <<  8) ^
+                    ((word32)Tsbox[GETBYTE(temp, 3)]) ^
+            #endif
                     rcon[i];
                 rk[ 9] = rk[ 1] ^ rk[ 8];
                 rk[10] = rk[ 2] ^ rk[ 9];
@@ -2116,10 +2635,17 @@
                     break;
                 temp = rk[11];
                 rk[12] = rk[ 4] ^
+            #ifndef WOLFSSL_AES_SMALL_TABLES
                     (Te[2][GETBYTE(temp, 3)] & 0xff000000) ^
                     (Te[3][GETBYTE(temp, 2)] & 0x00ff0000) ^
                     (Te[0][GETBYTE(temp, 1)] & 0x0000ff00) ^
                     (Te[1][GETBYTE(temp, 0)] & 0x000000ff);
+            #else
+                    ((word32)Tsbox[GETBYTE(temp, 3)] << 24) ^
+                    ((word32)Tsbox[GETBYTE(temp, 2)] << 16) ^
+                    ((word32)Tsbox[GETBYTE(temp, 1)] <<  8) ^
+                    ((word32)Tsbox[GETBYTE(temp, 0)]);
+            #endif
                 rk[13] = rk[ 5] ^ rk[12];
                 rk[14] = rk[ 6] ^ rk[13];
                 rk[15] = rk[ 7] ^ rk[14];
@@ -2133,7 +2659,7 @@
             return BAD_FUNC_ARG;
         } /* switch */
 
-    #ifdef HAVE_AES_DECRYPT
+    #if defined(HAVE_AES_DECRYPT)
         if (dir == AES_DECRYPTION) {
             unsigned int j;
             rk = aes->key;
@@ -2145,6 +2671,7 @@
                 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
                 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
             }
+        #if !defined(WOLFSSL_AES_SMALL_TABLES)
             /* apply the inverse MixColumn transform to all round keys but the
                first and the last: */
             for (i = 1; i < aes->rounds; i++) {
@@ -2170,12 +2697,21 @@
                     Td[2][Te[1][GETBYTE(rk[3], 1)] & 0xff] ^
                     Td[3][Te[1][GETBYTE(rk[3], 0)] & 0xff];
             }
+        #endif
         }
     #else
         (void)dir;
     #endif /* HAVE_AES_DECRYPT */
+        (void)temp;
 #endif /* NEED_AES_TABLES */
 
+#if defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES)
+        XMEMCPY((byte*)aes->key, userKey, keylen);
+        if (WOLFSSL_SCE_GSCE_HANDLE.p_cfg->endian_flag == CRYPTO_WORD_ENDIAN_BIG) {
+            ByteReverseWords(aes->key, aes->key, 32);
+        }
+#endif
+
         return wc_AesSetIV(aes, iv);
     }
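
On big-endian SCE configurations the block above byte-reverses the stored
key words (note it passes 32, the full key register area). ByteReverseWords
swaps the byte order inside each 32-bit word and takes a byte count; a small
illustration:

    word32 w[2] = { 0x01020304, 0x05060708 };
    ByteReverseWords(w, w, sizeof(w));      /* third argument is in bytes */
    /* now w[0] == 0x04030201 and w[1] == 0x08070605 */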
 
@@ -2217,13 +2753,16 @@
         aes->keylen = keylen;
         aes->rounds = keylen/4 + 6;
 
-    #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
-        if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES) {
-            XMEMCPY(aes->asyncKey, userKey, keylen);
-            if (iv)
-                XMEMCPY(aes->asyncIv, iv, AES_BLOCK_SIZE);
-        }
-    #endif /* WOLFSSL_ASYNC_CRYPT */
+    #if defined(WOLF_CRYPTO_CB) || (defined(WOLFSSL_DEVCRYPTO) && \
+        (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))) || \
+        (defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES))
+        #ifdef WOLF_CRYPTO_CB
+        if (aes->devId != INVALID_DEVID)
+        #endif
+        {
+            XMEMCPY(aes->devKey, userKey, keylen);
+        }
+    #endif
 
     #ifdef WOLFSSL_AESNI
         if (checkAESNI == 0) {
@@ -2231,12 +2770,15 @@
             checkAESNI = 1;
         }
         if (haveAESNI) {
-            #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB)
+            #if defined(WOLFSSL_AES_COUNTER) || defined(WOLFSSL_AES_CFB) || \
+                defined(WOLFSSL_AES_OFB)
                 aes->left = 0;
             #endif /* WOLFSSL_AES_COUNTER */
             aes->use_aesni = 1;
             if (iv)
                 XMEMCPY(aes->reg, iv, AES_BLOCK_SIZE);
+            else
+                XMEMSET(aes->reg, 0, AES_BLOCK_SIZE);
             if (dir == AES_ENCRYPTION)
                 return AES_set_encrypt_key(userKey, keylen * 8, aes);
         #ifdef HAVE_AES_DECRYPT
@@ -2248,6 +2790,10 @@
 
         ret = wc_AesSetKeyLocal(aes, userKey, keylen, iv, dir);
 
+    #if defined(WOLFSSL_DEVCRYPTO) && \
+        (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))
+        aes->ctx.cfd = -1;
+    #endif
     #ifdef WOLFSSL_IMX6_CAAM_BLOB
         ForceZero(local, sizeof(local));
     #endif
@@ -2339,19 +2885,60 @@
     #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
         /* implemented in wolfcrypt/src/port/caam/caam_aes.c */
 
+    #elif defined(WOLFSSL_AFALG)
+        /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+    #elif defined(WOLFSSL_DEVCRYPTO_AES)
+        /* implemented in wolfcrypt/src/port/devcrypto/devcrypto_aes.c */
+
+    #elif defined(STM32_CRYPTO)
+        /* Allow direct access to one block encrypt */
+        void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            if (wolfSSL_CryptHwMutexLock() == 0) {
+                wc_AesEncrypt(aes, in, out);
+                wolfSSL_CryptHwMutexUnLock();
+            }
+        }
+        #ifdef HAVE_AES_DECRYPT
+        /* Allow direct access to one block decrypt */
+        void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            if (wolfSSL_CryptHwMutexLock() == 0) {
+                wc_AesDecrypt(aes, in, out);
+                wolfSSL_CryptHwMutexUnLock();
+            }
+        }
+        #endif /* HAVE_AES_DECRYPT */
+
+    #elif defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+        !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+
+        /* Allow direct access to one block encrypt */
+        void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            wc_AesEncrypt(aes, in, out);
+        }
+        #ifdef HAVE_AES_DECRYPT
+        /* Allow direct access to one block decrypt */
+        void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
+        {
+            wc_AesDecrypt(aes, in, out);
+        }
+        #endif /* HAVE_AES_DECRYPT */
     #else
         /* Allow direct access to one block encrypt */
         void wc_AesEncryptDirect(Aes* aes, byte* out, const byte* in)
         {
             wc_AesEncrypt(aes, in, out);
         }
-    #ifdef HAVE_AES_DECRYPT
+        #ifdef HAVE_AES_DECRYPT
         /* Allow direct access to one block decrypt */
         void wc_AesDecryptDirect(Aes* aes, byte* out, const byte* in)
         {
             wc_AesDecrypt(aes, in, out);
         }
-    #endif /* HAVE_AES_DECRYPT */
+        #endif /* HAVE_AES_DECRYPT */
     #endif /* AES direct block */
 #endif /* WOLFSSL_AES_DIRECT */
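
The direct-access wrappers above expose a single-block ECB primitive; the
new STM32 variants additionally serialize hardware access behind the crypt
mutex. A hedged usage sketch, assuming `aes` was keyed via wc_AesSetKey:

    byte in[AES_BLOCK_SIZE] = {0};        /* exactly one 16-byte block */
    byte out[AES_BLOCK_SIZE];
    wc_AesEncryptDirect(&aes, out, in);   /* no chaining, no padding */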
 
@@ -2367,32 +2954,38 @@
         word32 blocks = (sz / AES_BLOCK_SIZE);
         CRYP_HandleTypeDef hcryp;
 
-        XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-                break;
-	#ifdef CRYP_KEYSIZE_192B
-            case 12: /* 192-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-                break;
-	#endif
-            case 14: /* 256-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-                break;
-            default:
-                break;
-        }
-        hcryp.Instance = CRYP;
-        hcryp.Init.DataType = CRYP_DATATYPE_8B;
-        hcryp.Init.pKey = (uint8_t*)aes->key;
-        hcryp.Init.pInitVect = (uint8_t*)aes->reg;
-
+        ret = wc_Stm32_Aes_Init(aes, &hcryp);
+        if (ret != 0)
+            return ret;
+
+        ret = wolfSSL_CryptHwMutexLock();
+        if (ret != 0) {
+            return ret;
+        }
+
+    #ifdef STM32_CRYPTO_AES_ONLY
+        hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT;
+        hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_CBC;
+        hcryp.Init.KeyWriteFlag  = CRYP_KEY_WRITE_ENABLE;
+    #elif defined(STM32_HAL_V2)
+        hcryp.Init.Algorithm  = CRYP_AES_CBC;
+        ByteReverseWords(aes->reg, aes->reg, AES_BLOCK_SIZE);
+    #endif
+        hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)aes->reg;
         HAL_CRYP_Init(&hcryp);
 
         while (blocks--) {
-            if (HAL_CRYP_AESCBC_Encrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
-                                           out, STM32_HAL_TIMEOUT) != HAL_OK) {
+        #ifdef STM32_CRYPTO_AES_ONLY
+            ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #elif defined(STM32_HAL_V2)
+            ret = HAL_CRYP_Encrypt(&hcryp, (uint32_t*)in, AES_BLOCK_SIZE,
+                (uint32_t*)out, STM32_HAL_TIMEOUT);
+        #else
+            ret = HAL_CRYP_AESCBC_Encrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #endif
+            if (ret != HAL_OK) {
                 ret = WC_TIMEOUT_E;
                 break;
             }
@@ -2407,6 +3000,8 @@
 
         HAL_CRYP_DeInit(&hcryp);
 
+        wolfSSL_CryptHwMutexUnLock();
+
         return ret;
     }
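
wc_AesCbcEncrypt derives blocks = sz / AES_BLOCK_SIZE, so callers must pad
to a block multiple; a trailing partial block is never processed by the loop
above. A minimal sketch, assuming a keyed `aes` with its IV set:

    byte msg[2 * AES_BLOCK_SIZE] = {0};   /* caller pads to a block multiple */
    byte enc[sizeof(msg)];
    int  ret = wc_AesCbcEncrypt(&aes, enc, msg, sizeof(msg));
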
     #ifdef HAVE_AES_DECRYPT
@@ -2416,33 +3011,44 @@
         word32 blocks = (sz / AES_BLOCK_SIZE);
         CRYP_HandleTypeDef hcryp;
 
-        XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-                break;
-	#ifdef CRYP_KEYSIZE_192B
-            case 12: /* 192-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-                break;
-	#endif
-            case 14: /* 256-bit key */
-                hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-                break;
-            default:
-                break;
-        }
-        hcryp.Instance = CRYP;
-        hcryp.Init.DataType = CRYP_DATATYPE_8B;
-        hcryp.Init.pKey = (uint8_t*)aes->key;
-        hcryp.Init.pInitVect = (uint8_t*)aes->reg;
-
+        ret = wc_Stm32_Aes_Init(aes, &hcryp);
+        if (ret != 0)
+            return ret;
+
+        ret = wolfSSL_CryptHwMutexLock();
+        if (ret != 0) {
+            return ret;
+        }
+
+        /* if in and out are the same, decrypt would overwrite the input iv
+           (the last ciphertext block), so save it first */
+        XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+
+    #ifdef STM32_CRYPTO_AES_ONLY
+        hcryp.Init.OperatingMode = CRYP_ALGOMODE_KEYDERIVATION_DECRYPT;
+        hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_CBC;
+        hcryp.Init.KeyWriteFlag  = CRYP_KEY_WRITE_ENABLE;
+    #elif defined(STM32_HAL_V2)
+        hcryp.Init.Algorithm  = CRYP_AES_CBC;
+        ByteReverseWords(aes->reg, aes->reg, AES_BLOCK_SIZE);
+    #endif
+
+        hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)aes->reg;
         HAL_CRYP_Init(&hcryp);
 
         while (blocks--) {
-            if (HAL_CRYP_AESCBC_Decrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
-                                           out, STM32_HAL_TIMEOUT) != HAL_OK) {
+        #ifdef STM32_CRYPTO_AES_ONLY
+            ret = HAL_CRYPEx_AES(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #elif defined(STM32_HAL_V2)
+            ret = HAL_CRYP_Decrypt(&hcryp, (uint32_t*)in, AES_BLOCK_SIZE,
+                (uint32_t*)out, STM32_HAL_TIMEOUT);
+        #else
+            ret = HAL_CRYP_AESCBC_Decrypt(&hcryp, (uint8_t*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #endif
+            if (ret != HAL_OK) {
                 ret = WC_TIMEOUT_E;
+                break;
             }
 
             /* store iv for next call */
@@ -2453,80 +3059,51 @@
         }
 
         HAL_CRYP_DeInit(&hcryp);
+        wolfSSL_CryptHwMutexUnLock();
 
         return ret;
     }
     #endif /* HAVE_AES_DECRYPT */
-#else
+
+#else /* STD_PERI_LIB */
     int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
     {
-        word32 *enc_key, *iv;
+        int ret;
+        word32 *iv;
         word32 blocks = (sz / AES_BLOCK_SIZE);
-        CRYP_InitTypeDef AES_CRYP_InitStructure;
-        CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure;
-        CRYP_IVInitTypeDef AES_CRYP_IVInitStructure;
-
-        enc_key = aes->key;
-        iv = aes->reg;
-
-        /* crypto structure initialization */
-        CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure);
-        CRYP_StructInit(&AES_CRYP_InitStructure);
-        CRYP_IVStructInit(&AES_CRYP_IVInitStructure);
+        CRYP_InitTypeDef cryptInit;
+        CRYP_KeyInitTypeDef keyInit;
+        CRYP_IVInitTypeDef ivInit;
+
+        ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit);
+        if (ret != 0)
+            return ret;
+
+        ret = wolfSSL_CryptHwMutexLock();
+        if (ret != 0) {
+            return ret;
+        }
 
         /* reset registers to their default values */
         CRYP_DeInit();
 
-        /* load key into correct registers */
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b;
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3];
-                break;
-
-            case 12: /* 192-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b;
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5];
-                break;
-
-            case 14: /* 256-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b;
-                AES_CRYP_KeyInitStructure.CRYP_Key0Left  = enc_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[6];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7];
-                break;
-
-            default:
-                break;
-        }
-        CRYP_KeyInit(&AES_CRYP_KeyInitStructure);
+        /* set key */
+        CRYP_KeyInit(&keyInit);
 
         /* set iv */
+        iv = aes->reg;
+        CRYP_IVStructInit(&ivInit);
         ByteReverseWords(iv, iv, AES_BLOCK_SIZE);
-        AES_CRYP_IVInitStructure.CRYP_IV0Left  = iv[0];
-        AES_CRYP_IVInitStructure.CRYP_IV0Right = iv[1];
-        AES_CRYP_IVInitStructure.CRYP_IV1Left  = iv[2];
-        AES_CRYP_IVInitStructure.CRYP_IV1Right = iv[3];
-        CRYP_IVInit(&AES_CRYP_IVInitStructure);
-
-        /* set direction, mode, and datatype */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-        CRYP_Init(&AES_CRYP_InitStructure);
+        ivInit.CRYP_IV0Left  = iv[0];
+        ivInit.CRYP_IV0Right = iv[1];
+        ivInit.CRYP_IV1Left  = iv[2];
+        ivInit.CRYP_IV1Right = iv[3];
+        CRYP_IVInit(&ivInit);
+
+        /* set direction and mode */
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC;
+        CRYP_Init(&cryptInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
@@ -2558,26 +3135,29 @@
 
         /* disable crypto processor */
         CRYP_Cmd(DISABLE);
-
-        return 0;
+        wolfSSL_CryptHwMutexUnLock();
+
+        return ret;
     }
 
     #ifdef HAVE_AES_DECRYPT
     int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
     {
-        word32 *dec_key, *iv;
+        int ret;
+        word32 *iv;
         word32 blocks = (sz / AES_BLOCK_SIZE);
-        CRYP_InitTypeDef AES_CRYP_InitStructure;
-        CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure;
-        CRYP_IVInitTypeDef AES_CRYP_IVInitStructure;
-
-        dec_key = aes->key;
-        iv = aes->reg;
-
-        /* crypto structure initialization */
-        CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure);
-        CRYP_StructInit(&AES_CRYP_InitStructure);
-        CRYP_IVStructInit(&AES_CRYP_IVInitStructure);
+        CRYP_InitTypeDef cryptInit;
+        CRYP_KeyInitTypeDef keyInit;
+        CRYP_IVInitTypeDef ivInit;
+
+        ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit);
+        if (ret != 0)
+            return ret;
+
+        ret = wolfSSL_CryptHwMutexLock();
+        if (ret != 0) {
+            return ret;
+        }
 
         /* if in and out are the same, decrypt would overwrite the input iv
            (the last ciphertext block), so save it first */
         XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
@@ -2585,48 +3165,11 @@
         /* reset registers to their default values */
         CRYP_DeInit();
 
-        /* load key into correct registers */
-        switch (aes->rounds) {
-            case 10: /* 128-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b;
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = dec_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = dec_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = dec_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[3];
-                break;
-
-            case 12: /* 192-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b;
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = dec_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = dec_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = dec_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = dec_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = dec_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[5];
-                break;
-
-            case 14: /* 256-bit key */
-                AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b;
-                AES_CRYP_KeyInitStructure.CRYP_Key0Left  = dec_key[0];
-                AES_CRYP_KeyInitStructure.CRYP_Key0Right = dec_key[1];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Left  = dec_key[2];
-                AES_CRYP_KeyInitStructure.CRYP_Key1Right = dec_key[3];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Left  = dec_key[4];
-                AES_CRYP_KeyInitStructure.CRYP_Key2Right = dec_key[5];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Left  = dec_key[6];
-                AES_CRYP_KeyInitStructure.CRYP_Key3Right = dec_key[7];
-                break;
-
-            default:
-                break;
-        }
-
-        /* set direction, mode, and datatype for key preparation */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_32b;
-        CRYP_Init(&AES_CRYP_InitStructure);
-        CRYP_KeyInit(&AES_CRYP_KeyInitStructure);
+        /* set direction and key */
+        CRYP_KeyInit(&keyInit);
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_Key;
+        CRYP_Init(&cryptInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
@@ -2634,20 +3177,20 @@
         /* wait until key has been prepared */
         while (CRYP_GetFlagStatus(CRYP_FLAG_BUSY) != RESET) {}
 
-        /* set direction, mode, and datatype for decryption */
-        AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
-        AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC;
-        AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-        CRYP_Init(&AES_CRYP_InitStructure);
+        /* set direction and mode */
+        cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Decrypt;
+        cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CBC;
+        CRYP_Init(&cryptInit);
 
         /* set iv */
+        iv = aes->reg;
+        CRYP_IVStructInit(&ivInit);
         ByteReverseWords(iv, iv, AES_BLOCK_SIZE);
-
-        AES_CRYP_IVInitStructure.CRYP_IV0Left  = iv[0];
-        AES_CRYP_IVInitStructure.CRYP_IV0Right = iv[1];
-        AES_CRYP_IVInitStructure.CRYP_IV1Left  = iv[2];
-        AES_CRYP_IVInitStructure.CRYP_IV1Right = iv[3];
-        CRYP_IVInit(&AES_CRYP_IVInitStructure);
+        ivInit.CRYP_IV0Left  = iv[0];
+        ivInit.CRYP_IV0Right = iv[1];
+        ivInit.CRYP_IV1Left  = iv[2];
+        ivInit.CRYP_IV1Right = iv[3];
+        CRYP_IVInit(&ivInit);
 
         /* enable crypto processor */
         CRYP_Cmd(ENABLE);
@@ -2678,8 +3221,9 @@
 
         /* disable crypto processor */
         CRYP_Cmd(DISABLE);
-
-        return 0;
+        wolfSSL_CryptHwMutexUnLock();
+
+        return ret;
     }
     #endif /* HAVE_AES_DECRYPT */
 #endif /* WOLFSSL_STM32_CUBEMX */
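Whichever backend is compiled in, the caller-side contract for CBC is unchanged: sz must be a whole number of blocks, and the IV handed to wc_AesSetKey is advanced on every call. A minimal one-shot sketch with placeholder key and IV:

    #include <wolfssl/wolfcrypt/aes.h>
    #include <wolfssl/wolfcrypt/error-crypt.h>

    /* Sketch: one-shot CBC encrypt; key/iv are placeholders. */
    static int cbc_demo(byte* out, const byte* in, word32 sz)
    {
        Aes  aes;
        byte key[16] = {0};
        byte iv[AES_BLOCK_SIZE] = {0};
        int  ret;

        if ((sz % AES_BLOCK_SIZE) != 0)
            return BAD_FUNC_ARG;    /* CBC handles whole blocks only */

        ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_AesSetKey(&aes, key, sizeof(key), iv, AES_ENCRYPTION);
        if (ret == 0)
            ret = wc_AesCbcEncrypt(&aes, out, in, sz);
        wc_AesFree(&aes);
        return ret;
    }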
@@ -2808,6 +3352,12 @@
 
         status = LTC_AES_EncryptCbc(LTC_BASE, in, out, blocks * AES_BLOCK_SIZE,
             iv, enc_key, keySize);
+
+        /* store iv for next call */
+        if (status == kStatus_Success) {
+            XMEMCPY(iv, out + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+        }
+
         return (status == kStatus_Success) ? 0 : -1;
     }
 
@@ -2818,6 +3368,7 @@
         status_t status;
         byte* iv, *dec_key;
         word32 blocks = (sz / AES_BLOCK_SIZE);
+        byte temp_block[AES_BLOCK_SIZE];
 
         iv      = (byte*)aes->reg;
         dec_key = (byte*)aes->key;
@@ -2827,8 +3378,17 @@
             return status;
         }
 
+        /* get IV for next call */
+        XMEMCPY(temp_block, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
+
         status = LTC_AES_DecryptCbc(LTC_BASE, in, out, blocks * AES_BLOCK_SIZE,
             iv, dec_key, keySize, kLTC_EncryptKey);
+
+        /* store IV for next call */
+        if (status == kStatus_Success) {
+            XMEMCPY(iv, temp_block, AES_BLOCK_SIZE);
+        }
+
         return (status == kStatus_Success) ? 0 : -1;
     }
     #endif /* HAVE_AES_DECRYPT */
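The "store iv for next call" copies added above preserve the streaming property of CBC: after a call returns, aes->reg holds the last ciphertext block, which is the IV for the next call. A sketch of the invariant, assuming aes1 and aes2 start with identical (placeholder) key and IV:

    #include <wolfssl/wolfcrypt/aes.h>

    /* Sketch: two chained calls must equal one call over the same data. */
    static int cbc_chain_demo(Aes* aes1, Aes* aes2, const byte in[32],
                              byte oneShot[32], byte chained[32])
    {
        int ret = wc_AesCbcEncrypt(aes1, oneShot, in, 32);
        if (ret == 0)
            ret = wc_AesCbcEncrypt(aes2, chained, in, 16);
        if (ret == 0)  /* aes2 now carries ciphertext block 1 as next IV */
            ret = wc_AesCbcEncrypt(aes2, chained + 16, in + 16, 16);
        /* on success, XMEMCMP(oneShot, chained, 32) == 0 */
        return ret;
    }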
@@ -2939,12 +3499,38 @@
         return ret;
     }
     #endif /* HAVE_AES_DECRYPT */
-
+#elif defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+    !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+
+    int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        return wc_esp32AesCbcEncrypt(aes, out, in, sz);
+    }
+    int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        return wc_esp32AesCbcDecrypt(aes, out, in, sz);
+    }
+#elif defined(WOLFSSL_CRYPTOCELL) && defined(WOLFSSL_CRYPTOCELL_AES)
+    int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        return SaSi_AesBlock(&aes->ctx.user_ctx, (uint8_t*)in, sz, out);
+    }
+    int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+    {
+        return SaSi_AesBlock(&aes->ctx.user_ctx, (uint8_t*)in, sz, out);
+    }
 #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
       /* implemented in wolfcrypt/src/port/caam/caam_aes.c */
 
+#elif defined(WOLFSSL_AFALG)
+    /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+#elif defined(WOLFSSL_DEVCRYPTO_CBC)
+    /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */
+
 #else
 
+    /* Software AES - CBC Encrypt */
     int wc_AesCbcEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
     {
         word32 blocks = (sz / AES_BLOCK_SIZE);
@@ -2953,6 +3539,14 @@
             return BAD_FUNC_ARG;
         }
 
+    #ifdef WOLF_CRYPTO_CB
+        if (aes->devId != INVALID_DEVID) {
+            int ret = wc_CryptoCb_AesCbcEncrypt(aes, out, in, sz);
+            if (ret != CRYPTOCB_UNAVAILABLE)
+                return ret;
+            /* fall-through when unavailable */
+        }
+    #endif
     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
         /* if async and byte count above threshold */
         if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES &&
@@ -2961,8 +3555,8 @@
             return NitroxAesCbcEncrypt(aes, out, in, sz);
         #elif defined(HAVE_INTEL_QA)
             return IntelQaSymAesCbcEncrypt(&aes->asyncDev, out, in, sz,
-                (const byte*)aes->asyncKey, aes->keylen,
-                (const byte*)aes->asyncIv, AES_BLOCK_SIZE);
+                (const byte*)aes->devKey, aes->keylen,
+                (byte*)aes->reg, AES_BLOCK_SIZE);
         #else /* WOLFSSL_ASYNC_CRYPT_TEST */
             if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_CBC_ENCRYPT)) {
                 WC_ASYNC_TEST* testDev = &aes->asyncDev.test;
@@ -3034,6 +3628,7 @@
     }
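The WOLF_CRYPTO_CB hook added at the top of this function dispatches to a registered device callback and falls back to software when the callback declines. A sketch of the callback side, assuming the wc_CryptoCb_RegisterDevice API from cryptocb.h and an arbitrary example device id of 1:

    #include <wolfssl/wolfcrypt/cryptocb.h>

    /* Sketch: a callback that declines all work, forcing the software
     * fall-through above. Real callbacks inspect info->algo_type. */
    static int myCryptoCb(int devId, wc_CryptoInfo* info, void* ctx)
    {
        (void)devId; (void)info; (void)ctx;
        return CRYPTOCB_UNAVAILABLE;   /* means: use the software path */
    }

    /* registration (example device id 1):
     *   wc_CryptoCb_RegisterDevice(1, myCryptoCb, NULL);
     * then pass 1 as the devId argument to wc_AesInit() */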
 
     #ifdef HAVE_AES_DECRYPT
+    /* Software AES - CBC Decrypt */
     int wc_AesCbcDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
     {
         word32 blocks;
@@ -3043,6 +3638,14 @@
             return BAD_FUNC_ARG;
         }
 
+    #ifdef WOLF_CRYPTO_CB
+        if (aes->devId != INVALID_DEVID) {
+            int ret = wc_CryptoCb_AesCbcDecrypt(aes, out, in, sz);
+            if (ret != CRYPTOCB_UNAVAILABLE)
+                return ret;
+            /* fall-through when unavailable */
+        }
+    #endif
     #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
         /* if async and byte count above threshold */
         if (aes->asyncDev.marker == WOLFSSL_ASYNC_MARKER_AES &&
@@ -3051,8 +3654,8 @@
             return NitroxAesCbcDecrypt(aes, out, in, sz);
         #elif defined(HAVE_INTEL_QA)
             return IntelQaSymAesCbcDecrypt(&aes->asyncDev, out, in, sz,
-                (const byte*)aes->asyncKey, aes->keylen,
-                (const byte*)aes->asyncIv, AES_BLOCK_SIZE);
+                (const byte*)aes->devKey, aes->keylen,
+                (byte*)aes->reg, AES_BLOCK_SIZE);
         #else /* WOLFSSL_ASYNC_CRYPT_TEST */
             if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_CBC_DECRYPT)) {
                 WC_ASYNC_TEST* testDev = &aes->asyncDev.test;
@@ -3101,6 +3704,7 @@
             XMEMCPY(aes->tmp, in, AES_BLOCK_SIZE);
             wc_AesDecrypt(aes, (byte*)aes->tmp, out);
             xorbuf(out, (byte*)aes->reg, AES_BLOCK_SIZE);
+            /* store iv for next call */
             XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
 
             out += AES_BLOCK_SIZE;
@@ -3126,101 +3730,83 @@
             int ret = 0;
         #ifdef WOLFSSL_STM32_CUBEMX
             CRYP_HandleTypeDef hcryp;
-
-            XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-            switch (aes->rounds) {
-                case 10: /* 128-bit key */
-                    hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-                    break;
-	#ifdef CRYP_KEYSIZE_192B
-                case 12: /* 192-bit key */
-                    hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-                    break;
-	#endif
-                case 14: /* 256-bit key */
-                    hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-                    break;
-                default:
-                    break;
+            #ifdef STM32_HAL_V2
+            word32 iv[AES_BLOCK_SIZE/sizeof(word32)];
+            #endif
+        #else
+            word32 *iv;
+            CRYP_InitTypeDef cryptInit;
+            CRYP_KeyInitTypeDef keyInit;
+            CRYP_IVInitTypeDef ivInit;
+        #endif
+
+            ret = wolfSSL_CryptHwMutexLock();
+            if (ret != 0) {
+                return ret;
+            }
+
+        #ifdef WOLFSSL_STM32_CUBEMX
+            ret = wc_Stm32_Aes_Init(aes, &hcryp);
+            if (ret != 0) {
+                wolfSSL_CryptHwMutexUnLock();
+                return ret;
             }
-            hcryp.Instance = CRYP;
-            hcryp.Init.DataType = CRYP_DATATYPE_8B;
-            hcryp.Init.pKey = (byte*)aes->key;
-            hcryp.Init.pInitVect = (byte*)aes->reg;
-
+
+        #ifdef STM32_CRYPTO_AES_ONLY
+            hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT;
+            hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_CTR;
+            hcryp.Init.KeyWriteFlag  = CRYP_KEY_WRITE_ENABLE;
+            hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)aes->reg;
+        #elif defined(STM32_HAL_V2)
+            hcryp.Init.Algorithm  = CRYP_AES_CTR;
+            ByteReverseWords(iv, aes->reg, AES_BLOCK_SIZE);
+            hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)iv;
+        #else
+            hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)aes->reg;
+        #endif
             HAL_CRYP_Init(&hcryp);
 
-            if (HAL_CRYP_AESCTR_Encrypt(&hcryp, (byte*)in, AES_BLOCK_SIZE, out,
-                                                STM32_HAL_TIMEOUT) != HAL_OK) {
-                /* failed */
+        #ifdef STM32_CRYPTO_AES_ONLY
+            ret = HAL_CRYPEx_AES(&hcryp, (byte*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #elif defined(STM32_HAL_V2)
+            ret = HAL_CRYP_Encrypt(&hcryp, (uint32_t*)in, AES_BLOCK_SIZE,
+                (uint32_t*)out, STM32_HAL_TIMEOUT);
+        #else
+            ret = HAL_CRYP_AESCTR_Encrypt(&hcryp, (byte*)in, AES_BLOCK_SIZE,
+                out, STM32_HAL_TIMEOUT);
+        #endif
+            if (ret != HAL_OK) {
                 ret = WC_TIMEOUT_E;
             }
-
             HAL_CRYP_DeInit(&hcryp);
 
         #else /* STD_PERI_LIB */
-            word32 *enc_key, *iv;
-            CRYP_InitTypeDef AES_CRYP_InitStructure;
-            CRYP_KeyInitTypeDef AES_CRYP_KeyInitStructure;
-            CRYP_IVInitTypeDef AES_CRYP_IVInitStructure;
-
-            enc_key = aes->key;
-            iv = aes->reg;
-
-            /* crypto structure initialization */
-            CRYP_KeyStructInit(&AES_CRYP_KeyInitStructure);
-            CRYP_StructInit(&AES_CRYP_InitStructure);
-            CRYP_IVStructInit(&AES_CRYP_IVInitStructure);
+            ret = wc_Stm32_Aes_Init(aes, &cryptInit, &keyInit);
+            if (ret != 0) {
+                wolfSSL_CryptHwMutexUnLock();
+                return ret;
+            }
 
             /* reset registers to their default values */
             CRYP_DeInit();
 
-            /* load key into correct registers */
-            switch (aes->rounds) {
-                case 10: /* 128-bit key */
-                    AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_128b;
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[0];
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[1];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[2];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[3];
-                    break;
-                case 12: /* 192-bit key */
-                    AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_192b;
-                    AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[0];
-                    AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[1];
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[2];
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[3];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[4];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[5];
-                    break;
-                case 14: /* 256-bit key */
-                    AES_CRYP_InitStructure.CRYP_KeySize = CRYP_KeySize_256b;
-                    AES_CRYP_KeyInitStructure.CRYP_Key0Left  = enc_key[0];
-                    AES_CRYP_KeyInitStructure.CRYP_Key0Right = enc_key[1];
-                    AES_CRYP_KeyInitStructure.CRYP_Key1Left  = enc_key[2];
-                    AES_CRYP_KeyInitStructure.CRYP_Key1Right = enc_key[3];
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Left  = enc_key[4];
-                    AES_CRYP_KeyInitStructure.CRYP_Key2Right = enc_key[5];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Left  = enc_key[6];
-                    AES_CRYP_KeyInitStructure.CRYP_Key3Right = enc_key[7];
-                    break;
-                default:
-                    break;
-            }
-            CRYP_KeyInit(&AES_CRYP_KeyInitStructure);
+            /* set key */
+            CRYP_KeyInit(&keyInit);
 
             /* set iv */
-            AES_CRYP_IVInitStructure.CRYP_IV0Left  = ByteReverseWord32(iv[0]);
-            AES_CRYP_IVInitStructure.CRYP_IV0Right = ByteReverseWord32(iv[1]);
-            AES_CRYP_IVInitStructure.CRYP_IV1Left  = ByteReverseWord32(iv[2]);
-            AES_CRYP_IVInitStructure.CRYP_IV1Right = ByteReverseWord32(iv[3]);
-            CRYP_IVInit(&AES_CRYP_IVInitStructure);
-
-            /* set direction, mode, and datatype */
-            AES_CRYP_InitStructure.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
-            AES_CRYP_InitStructure.CRYP_AlgoMode = CRYP_AlgoMode_AES_CTR;
-            AES_CRYP_InitStructure.CRYP_DataType = CRYP_DataType_8b;
-            CRYP_Init(&AES_CRYP_InitStructure);
+            iv = aes->reg;
+            CRYP_IVStructInit(&ivInit);
+            ivInit.CRYP_IV0Left  = ByteReverseWord32(iv[0]);
+            ivInit.CRYP_IV0Right = ByteReverseWord32(iv[1]);
+            ivInit.CRYP_IV1Left  = ByteReverseWord32(iv[2]);
+            ivInit.CRYP_IV1Right = ByteReverseWord32(iv[3]);
+            CRYP_IVInit(&ivInit);
+
+            /* set direction and mode */
+            cryptInit.CRYP_AlgoDir  = CRYP_AlgoDir_Encrypt;
+            cryptInit.CRYP_AlgoMode = CRYP_AlgoMode_AES_CTR;
+            CRYP_Init(&cryptInit);
 
             /* enable crypto processor */
             CRYP_Cmd(ENABLE);
@@ -3245,6 +3831,8 @@
             CRYP_Cmd(DISABLE);
 
         #endif /* WOLFSSL_STM32_CUBEMX */
+
+            wolfSSL_CryptHwMutexUnLock();
             return ret;
         }
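A caller-side note on CTR: the mode only ever runs the forward cipher, so decryption is the same wc_AesCtrEncrypt call with a key schedule set up for AES_ENCRYPTION, and sz need not be block aligned. A minimal sketch with placeholder key and initial counter block:

    #include <wolfssl/wolfcrypt/aes.h>

    /* Sketch: CTR encrypt (and decrypt) of an arbitrary-length buffer. */
    static int ctr_demo(byte* out, const byte* in, word32 sz)
    {
        Aes  aes;
        byte key[16] = {0};
        byte nonce[AES_BLOCK_SIZE] = {0};   /* placeholder counter block */
        int  ret;

        ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_AesSetKey(&aes, key, sizeof(key), nonce, AES_ENCRYPTION);
        if (ret == 0)
            ret = wc_AesCtrEncrypt(&aes, out, in, sz);  /* any sz */
        wc_AesFree(&aes);
        return ret;
    }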
 
@@ -3303,6 +3891,18 @@
     #elif defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
         /* implemented in wolfcrypt/src/port/caam/caam_aes.c */
 
+    #elif defined(WOLFSSL_AFALG)
+        /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+    #elif defined(WOLFSSL_DEVCRYPTO_AES)
+        /* implemented in wolfcrypt/src/port/devcrypto/devcrypto_aes.c */
+
+    #elif defined(WOLFSSL_ESP32WROOM32_CRYPT) && \
+        !defined(NO_WOLFSSL_ESP32WROOM32_CRYPT_AES)
+        /* esp32 doesn't support CTR mode in hw;       */
+        /* use hw aes encryption plus a sw ctr wrapper */
+        #define NEED_AES_CTR_SOFT
+
     #else
 
         /* Use software based AES counter */
@@ -3321,9 +3921,11 @@
             }
         }
 
+        /* Software AES - CTR Encrypt */
         int wc_AesCtrEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
         {
             byte* tmp;
+            byte scratch[AES_BLOCK_SIZE];
 
             if (aes == NULL || out == NULL || in == NULL) {
                 return BAD_FUNC_ARG;
@@ -3342,8 +3944,9 @@
             #ifdef XTRANSFORM_AESCTRBLOCK
                 XTRANSFORM_AESCTRBLOCK(aes, out, in);
             #else
-                wc_AesEncrypt(aes, (byte*)aes->reg, out);
-                xorbuf(out, in, AES_BLOCK_SIZE);
+                wc_AesEncrypt(aes, (byte*)aes->reg, scratch);
+                xorbuf(scratch, in, AES_BLOCK_SIZE);
+                XMEMCPY(out, scratch, AES_BLOCK_SIZE);
             #endif
                 IncrementAesCounter((byte*)aes->reg);
 
@@ -3352,6 +3955,7 @@
                 sz  -= AES_BLOCK_SIZE;
                 aes->left = 0;
             }
+            ForceZero(scratch, AES_BLOCK_SIZE);
 
             /* handle a remaining partial block and store the unused byte count in left */
             if (sz) {
@@ -3412,6 +4016,13 @@
 
 #ifdef WOLFSSL_ARMASM
     /* implementation is located in wolfcrypt/src/port/arm/armv8-aes.c */
+
+#elif defined(WOLFSSL_AFALG)
+    /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+#elif defined(WOLFSSL_DEVCRYPTO_AES)
+    /* implemented in wolfcrypt/src/port/devcrypto/devcrypto_aes.c */
+
 #else /* software + AESNI implementation */
 
 #if !defined(FREESCALE_LTC_AES_GCM)
@@ -3425,6 +4036,18 @@
             return;
     }
 }
+#ifdef STM32_CRYPTO_AES_GCM
+static WC_INLINE void DecrementGcmCounter(byte* inOutCtr)
+{
+    int i;
+
+    /* in network byte order so start at end and work back */
+    for (i = AES_BLOCK_SIZE - 1; i >= AES_BLOCK_SIZE - CTR_SZ; i--) {
+        if (--inOutCtr[i] != 0xFF)  /* we're done unless we underflow */
+            return;
+    }
+}
+#endif /* STM32_CRYPTO_AES_GCM */
 #endif /* !FREESCALE_LTC_AES_GCM */
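For orientation before the GHASH machinery below, the caller-side GCM flow is a key set followed by a one-shot authenticated encrypt; a minimal sketch with placeholder key, 12-byte IV, and AAD (a real application must never reuse an IV under the same key):

    #include <wolfssl/wolfcrypt/aes.h>

    /* Sketch: one-shot AES-GCM encrypt. The 12-byte IV is the fast path
     * the IV-handling code in this file optimizes for. */
    static int gcm_demo(byte* out, const byte* in, word32 sz,
                        byte tag[AES_BLOCK_SIZE])
    {
        Aes  aes;
        byte key[16] = {0};     /* placeholder key */
        byte iv[12]  = {0};     /* placeholder nonce; never reuse */
        byte aad[4]  = {0};     /* placeholder additional data */
        int  ret;

        ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
        if (ret == 0)
            ret = wc_AesGcmSetKey(&aes, key, sizeof(key));
        if (ret == 0)
            ret = wc_AesGcmEncrypt(&aes, out, in, sz, iv, sizeof(iv),
                                   tag, AES_BLOCK_SIZE, aad, sizeof(aad));
        wc_AesFree(&aes);
        return ret;
    }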
 
 #if defined(GCM_SMALL) || defined(GCM_TABLE)
@@ -3491,7 +4114,7 @@
 
 #endif /* GCM_TABLE */
 
-
+/* Software AES - GCM SetKey */
 int wc_AesGcmSetKey(Aes* aes, const byte* key, word32 len)
 {
     int  ret;
@@ -3517,6 +4140,12 @@
     if (!((len == 16) || (len == 24) || (len == 32)))
         return BAD_FUNC_ARG;
 
+#ifdef OPENSSL_EXTRA
+    if (aes != NULL) {
+        XMEMSET(aes->aadH, 0, sizeof(aes->aadH));
+        aes->aadLen = 0;
+    }
+#endif
     XMEMSET(iv, 0, AES_BLOCK_SIZE);
     ret = wc_AesSetKey(aes, key, len, iv, AES_ENCRYPTION);
 
@@ -3537,6 +4166,14 @@
 
 #if defined(WOLFSSL_XILINX_CRYPT)
     wc_AesGcmSetKey_ex(aes, key, len, XSECURE_CSU_AES_KEY_SRC_KUP);
+#elif defined(WOLFSSL_AFALG_XILINX_AES)
+    wc_AesGcmSetKey_ex(aes, key, len, 0);
+#endif
+
+#ifdef WOLF_CRYPTO_CB
+    if (aes->devId != INVALID_DEVID) {
+        XMEMCPY(aes->devKey, key, len);
+    }
 #endif
 
 #ifdef WOLFSSL_IMX6_CAAM_BLOB
@@ -3554,18 +4191,68 @@
     #define HAVE_INTEL_AVX2
 #endif /* USE_INTEL_SPEEDUP */
 
-#ifdef _MSC_VER
-    #define S(w,z) ((char)((unsigned long long)(w) >> (8*(7-(z))) & 0xFF))
-    #define M128_INIT(x,y) { S((x),7), S((x),6), S((x),5), S((x),4), \
-                             S((x),3), S((x),2), S((x),1), S((x),0), \
-                             S((y),7), S((y),6), S((y),5), S((y),4), \
-                             S((y),3), S((y),2), S((y),1), S((y),0) }
-#else
-    #define M128_INIT(x,y) { (x), (y) }
-#endif
-
-static const __m128i MOD2_128 = M128_INIT(0x1,
-                                           (long long int)0xc200000000000000UL);
+#ifndef _MSC_VER
+
+void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
+                     const unsigned char* addt, const unsigned char* ivec,
+                     unsigned char *tag, unsigned int nbytes,
+                     unsigned int abytes, unsigned int ibytes,
+                     unsigned int tbytes, const unsigned char* key, int nr)
+                     XASM_LINK("AES_GCM_encrypt");
+#ifdef HAVE_INTEL_AVX1
+void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out,
+                          const unsigned char* addt, const unsigned char* ivec,
+                          unsigned char *tag, unsigned int nbytes,
+                          unsigned int abytes, unsigned int ibytes,
+                          unsigned int tbytes, const unsigned char* key,
+                          int nr)
+                          XASM_LINK("AES_GCM_encrypt_avx1");
+#ifdef HAVE_INTEL_AVX2
+void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out,
+                          const unsigned char* addt, const unsigned char* ivec,
+                          unsigned char *tag, unsigned int nbytes,
+                          unsigned int abytes, unsigned int ibytes,
+                          unsigned int tbytes, const unsigned char* key,
+                          int nr)
+                          XASM_LINK("AES_GCM_encrypt_avx2");
+#endif /* HAVE_INTEL_AVX2 */
+#endif /* HAVE_INTEL_AVX1 */
+
+#ifdef HAVE_AES_DECRYPT
+void AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
+                     const unsigned char* addt, const unsigned char* ivec,
+                     const unsigned char *tag, int nbytes, int abytes,
+                     int ibytes, int tbytes, const unsigned char* key, int nr,
+                     int* res)
+                     XASM_LINK("AES_GCM_decrypt");
+#ifdef HAVE_INTEL_AVX1
+void AES_GCM_decrypt_avx1(const unsigned char *in, unsigned char *out,
+                          const unsigned char* addt, const unsigned char* ivec,
+                          const unsigned char *tag, int nbytes, int abytes,
+                          int ibytes, int tbytes, const unsigned char* key,
+                          int nr, int* res)
+                          XASM_LINK("AES_GCM_decrypt_avx1");
+#ifdef HAVE_INTEL_AVX2
+void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out,
+                          const unsigned char* addt, const unsigned char* ivec,
+                          const unsigned char *tag, int nbytes, int abytes,
+                          int ibytes, int tbytes, const unsigned char* key,
+                          int nr, int* res)
+                          XASM_LINK("AES_GCM_decrypt_avx2");
+#endif /* HAVE_INTEL_AVX2 */
+#endif /* HAVE_INTEL_AVX1 */
+#endif /* HAVE_AES_DECRYPT */
+
+#else /* _MSC_VER */
+
+#define S(w,z) ((char)((unsigned long long)(w) >> (8*(7-(z))) & 0xFF))
+#define M128_INIT(x,y) { S((x),7), S((x),6), S((x),5), S((x),4), \
+                         S((x),3), S((x),2), S((x),1), S((x),0), \
+                         S((y),7), S((y),6), S((y),5), S((y),4), \
+                         S((y),3), S((y),2), S((y),1), S((y),0) }
+
+static const __m128i MOD2_128 =
+        M128_INIT(0x1, (long long int)0xc200000000000000UL);
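+
+For readers decoding this constant: GHASH arithmetic takes place in $GF(2^{128})$ reduced by
+
+    $g(x) = x^{128} + x^7 + x^2 + x + 1$
+
+with each 16-byte block folded in as $X_i = (X_{i-1} \oplus C_i) \cdot H \bmod g(x)$. The 0xc2...01 bit pattern is, per the Intel carry-less multiplication white paper cited just below, $g(x)$ in the bit-reflected representation that the PCLMULQDQ-based reduction operates in.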
 
 
 /* See Intel® Carry-Less Multiplication Instruction
@@ -3586,3105 +4273,12 @@
 static const __m128i SEVEN = M128_INIT(0x0, 0x7);
 static const __m128i EIGHT = M128_INIT(0x0, 0x8);
 #endif
-static const __m128i BSWAP_EPI64 = M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f);
-static const __m128i BSWAP_MASK  = M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607);
-
-
-#ifndef _MSC_VER
-
-#define _VAR(a) "" #a ""
-#define VAR(a) _VAR(a)
-
-#define HR     %%xmm14
-#define XR     %%xmm15
-#define KR     %%ebx
-#define KR64   %%rbx
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-#define CTR1   128(%%rsp)
-#define TR     144(%%rsp)
-#define HTR    %%rsp
-#define STACK_OFFSET    160
-#else
-#define CTR1   (%%rsp)
-#define TR     16(%%rsp)
-#define STACK_OFFSET    32
-#endif
-
-#define AESENC()                      \
-    "aesenc	%%xmm12, %%xmm4\n\t"  \
-    "aesenc	%%xmm12, %%xmm5\n\t"  \
-    "aesenc	%%xmm12, %%xmm6\n\t"  \
-    "aesenc	%%xmm12, %%xmm7\n\t"  \
-    "aesenc	%%xmm12, %%xmm8\n\t"  \
-    "aesenc	%%xmm12, %%xmm9\n\t"  \
-    "aesenc	%%xmm12, %%xmm10\n\t" \
-    "aesenc	%%xmm12, %%xmm11\n\t"
-
-#define AESENC_SET(o)                        \
-    "movdqa	" #o "(%[KEY]), %%xmm12\n\t" \
-    AESENC()
-
-#define AESENC_CTR()                        \
-    "movdqu	" VAR(CTR1) ", %%xmm4\n\t"  \
-    "movdqa	%[BSWAP_EPI64], %%xmm1\n\t" \
-    "movdqu	%%xmm4, %%xmm0\n\t"         \
-    "pshufb	%%xmm1, %%xmm4\n\t"         \
-    "movdqa	%%xmm0, %%xmm5\n\t"         \
-    "paddd	%[ONE], %%xmm5\n\t"         \
-    "pshufb	%%xmm1, %%xmm5\n\t"         \
-    "movdqa	%%xmm0, %%xmm6\n\t"         \
-    "paddd	%[TWO], %%xmm6\n\t"         \
-    "pshufb	%%xmm1, %%xmm6\n\t"         \
-    "movdqa	%%xmm0, %%xmm7\n\t"         \
-    "paddd	%[THREE], %%xmm7\n\t"       \
-    "pshufb	%%xmm1, %%xmm7\n\t"         \
-    "movdqa	%%xmm0, %%xmm8\n\t"         \
-    "paddd	%[FOUR], %%xmm8\n\t"        \
-    "pshufb	%%xmm1, %%xmm8\n\t"         \
-    "movdqa	%%xmm0, %%xmm9\n\t"         \
-    "paddd	%[FIVE], %%xmm9\n\t"        \
-    "pshufb	%%xmm1, %%xmm9\n\t"         \
-    "movdqa	%%xmm0, %%xmm10\n\t"        \
-    "paddd	%[SIX], %%xmm10\n\t"        \
-    "pshufb	%%xmm1, %%xmm10\n\t"        \
-    "movdqa	%%xmm0, %%xmm11\n\t"        \
-    "paddd	%[SEVEN], %%xmm11\n\t"      \
-    "pshufb	%%xmm1, %%xmm11\n\t"        \
-    "paddd	%[EIGHT], %%xmm0\n\t"
-
-#define AESENC_XOR()                       \
-    "movdqa	(%[KEY]), %%xmm12\n\t"     \
-    "movdqu	%%xmm0, " VAR(CTR1) "\n\t" \
-    "pxor	%%xmm12, %%xmm4\n\t"       \
-    "pxor	%%xmm12, %%xmm5\n\t"       \
-    "pxor	%%xmm12, %%xmm6\n\t"       \
-    "pxor	%%xmm12, %%xmm7\n\t"       \
-    "pxor	%%xmm12, %%xmm8\n\t"       \
-    "pxor	%%xmm12, %%xmm9\n\t"       \
-    "pxor	%%xmm12, %%xmm10\n\t"      \
-    "pxor	%%xmm12, %%xmm11\n\t"
-
-/* Encrypt and carry-less multiply for AVX1. */
-#define AESENC_PCLMUL_1(src, o1, o2, o3)            \
-    "movdqu	" #o3 "(" VAR(HTR) "), %%xmm12\n\t" \
-    "movdqu	" #o2 "(" #src "), %%xmm0\n\t"      \
-    "aesenc	" #o1 "(%[KEY]), %%xmm4\n\t"        \
-    "pshufb	%[BSWAP_MASK], %%xmm0\n\t"          \
-    "pxor	%%xmm2, %%xmm0\n\t"                 \
-    "pshufd	$0x4e, %%xmm12, %%xmm1\n\t"         \
-    "pshufd	$0x4e, %%xmm0, %%xmm14\n\t"         \
-    "pxor	%%xmm12, %%xmm1\n\t"                \
-    "pxor	%%xmm0, %%xmm14\n\t"                \
-    "movdqa	%%xmm0, %%xmm3\n\t"                 \
-    "pclmulqdq	$0x11, %%xmm12, %%xmm3\n\t"         \
-    "aesenc	" #o1 "(%[KEY]), %%xmm5\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm6\n\t"        \
-    "movdqa	%%xmm0, %%xmm2\n\t"                 \
-    "pclmulqdq	$0x00, %%xmm12, %%xmm2\n\t"         \
-    "aesenc	" #o1 "(%[KEY]), %%xmm7\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm8\n\t"        \
-    "pclmulqdq	$0x00, %%xmm14, %%xmm1\n\t"         \
-    "aesenc	" #o1 "(%[KEY]), %%xmm9\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm10\n\t"       \
-    "aesenc	" #o1 "(%[KEY]), %%xmm11\n\t"       \
-    "pxor      %%xmm2, %%xmm1\n\t"                  \
-    "pxor      %%xmm3, %%xmm1\n\t"                  \
-
-#define AESENC_PCLMUL_N(src, o1, o2, o3)            \
-    "movdqu	" #o3 "(" VAR(HTR) "), %%xmm12\n\t" \
-    "movdqu	" #o2 "(" #src" ), %%xmm0\n\t"      \
-    "pshufd	$0x4e, %%xmm12, %%xmm13\n\t"        \
-    "pshufb	%[BSWAP_MASK], %%xmm0\n\t"          \
-    "aesenc	" #o1 "(%[KEY]), %%xmm4\n\t"        \
-    "pxor	%%xmm12, %%xmm13\n\t"               \
-    "pshufd	$0x4e, %%xmm0, %%xmm14\n\t"         \
-    "pxor	%%xmm0, %%xmm14\n\t"                \
-    "movdqa	%%xmm0, %%xmm15\n\t"                \
-    "pclmulqdq	$0x11, %%xmm12, %%xmm15\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm5\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm6\n\t"        \
-    "pclmulqdq	$0x00, %%xmm0, %%xmm12\n\t"         \
-    "aesenc	" #o1 "(%[KEY]), %%xmm7\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm8\n\t"        \
-    "pclmulqdq	$0x00, %%xmm14, %%xmm13\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm9\n\t"        \
-    "aesenc	" #o1 "(%[KEY]), %%xmm10\n\t"       \
-    "aesenc	" #o1 "(%[KEY]), %%xmm11\n\t"       \
-    "pxor      %%xmm12, %%xmm1\n\t"                 \
-    "pxor      %%xmm12, %%xmm2\n\t"                 \
-    "pxor      %%xmm15, %%xmm1\n\t"                 \
-    "pxor      %%xmm15, %%xmm3\n\t"                 \
-    "pxor      %%xmm13, %%xmm1\n\t"                 \
-
-#define AESENC_PCLMUL_L(o)                   \
-    "movdqa	%%xmm1, %%xmm14\n\t"         \
-    "psrldq	$8, %%xmm1\n\t"              \
-    "pslldq	$8, %%xmm14\n\t"             \
-    "aesenc	" #o "(%[KEY]), %%xmm4\n\t"  \
-    "pxor      %%xmm14, %%xmm2\n\t"          \
-    "pxor      %%xmm1, %%xmm3\n\t"           \
-    "movdqa	%%xmm2, %%xmm12\n\t"         \
-    "movdqa	%%xmm2, %%xmm13\n\t"         \
-    "movdqa	%%xmm2, %%xmm14\n\t"         \
-    "aesenc	" #o "(%[KEY]), %%xmm5\n\t"  \
-    "pslld	$31, %%xmm12\n\t"            \
-    "pslld	$30, %%xmm13\n\t"            \
-    "pslld	$25, %%xmm14\n\t"            \
-    "aesenc	" #o "(%[KEY]), %%xmm6\n\t"  \
-    "pxor	%%xmm13, %%xmm12\n\t"        \
-    "pxor	%%xmm14, %%xmm12\n\t"        \
-    "aesenc	" #o "(%[KEY]), %%xmm7\n\t"  \
-    "movdqa	%%xmm12, %%xmm13\n\t"        \
-    "pslldq	$12, %%xmm12\n\t"            \
-    "psrldq	$4, %%xmm13\n\t"             \
-    "aesenc	" #o "(%[KEY]), %%xmm8\n\t"  \
-    "pxor	%%xmm12, %%xmm2\n\t"         \
-    "movdqa	%%xmm2, %%xmm14\n\t"         \
-    "movdqa	%%xmm2, %%xmm1\n\t"          \
-    "movdqa	%%xmm2, %%xmm0\n\t"          \
-    "aesenc	" #o "(%[KEY]), %%xmm9\n\t"  \
-    "psrld	$1, %%xmm14\n\t"             \
-    "psrld	$2, %%xmm1\n\t"              \
-    "psrld	$7, %%xmm0\n\t"              \
-    "aesenc	" #o "(%[KEY]), %%xmm10\n\t" \
-    "pxor	%%xmm1, %%xmm14\n\t"         \
-    "pxor	%%xmm0, %%xmm14\n\t"         \
-    "aesenc	" #o "(%[KEY]), %%xmm11\n\t" \
-    "pxor	%%xmm13, %%xmm14\n\t"        \
-    "pxor	%%xmm14, %%xmm2\n\t"         \
-    "pxor	%%xmm3, %%xmm2\n\t"          \
-
-/* Encrypt and carry-less multiply with last key. */
-#define AESENC_LAST(in, out)                \
-    "aesenclast	%%xmm12, %%xmm4\n\t"        \
-    "aesenclast	%%xmm12, %%xmm5\n\t"        \
-    "movdqu	   (" #in "),%%xmm0\n\t"    \
-    "movdqu	 16(" #in "),%%xmm1\n\t"    \
-    "pxor	%%xmm0, %%xmm4\n\t"         \
-    "pxor	%%xmm1, %%xmm5\n\t"         \
-    "movdqu	%%xmm4,    (" #out ")\n\t"  \
-    "movdqu	%%xmm5,  16(" #out ")\n\t"  \
-    "aesenclast	%%xmm12, %%xmm6\n\t"        \
-    "aesenclast	%%xmm12, %%xmm7\n\t"        \
-    "movdqu	 32(" #in "),%%xmm0\n\t"    \
-    "movdqu	 48(" #in "),%%xmm1\n\t"    \
-    "pxor	%%xmm0, %%xmm6\n\t"         \
-    "pxor	%%xmm1, %%xmm7\n\t"         \
-    "movdqu	%%xmm6,  32(" #out ")\n\t"  \
-    "movdqu	%%xmm7,  48(" #out ")\n\t"  \
-    "aesenclast	%%xmm12, %%xmm8\n\t"        \
-    "aesenclast	%%xmm12, %%xmm9\n\t"        \
-    "movdqu	 64(" #in "),%%xmm0\n\t"    \
-    "movdqu	 80(" #in "),%%xmm1\n\t"    \
-    "pxor	%%xmm0, %%xmm8\n\t"         \
-    "pxor	%%xmm1, %%xmm9\n\t"         \
-    "movdqu	%%xmm8,  64(" #out ")\n\t"  \
-    "movdqu	%%xmm9,  80(" #out ")\n\t"  \
-    "aesenclast	%%xmm12, %%xmm10\n\t"       \
-    "aesenclast	%%xmm12, %%xmm11\n\t"       \
-    "movdqu	 96(" #in "),%%xmm0\n\t"    \
-    "movdqu	112(" #in "),%%xmm1\n\t"    \
-    "pxor	%%xmm0, %%xmm10\n\t"        \
-    "pxor	%%xmm1, %%xmm11\n\t"        \
-    "movdqu	%%xmm10,  96(" #out ")\n\t" \
-    "movdqu	%%xmm11, 112(" #out ")\n\t"
-
-#define _AESENC_AVX(r)                    \
-    "aesenc	16(%[KEY]), " #r "\n\t"   \
-    "aesenc	32(%[KEY]), " #r "\n\t"   \
-    "aesenc	48(%[KEY]), " #r "\n\t"   \
-    "aesenc	64(%[KEY]), " #r "\n\t"   \
-    "aesenc	80(%[KEY]), " #r "\n\t"   \
-    "aesenc	96(%[KEY]), " #r "\n\t"   \
-    "aesenc	112(%[KEY]), " #r "\n\t"  \
-    "aesenc	128(%[KEY]), " #r "\n\t"  \
-    "aesenc	144(%[KEY]), " #r "\n\t"  \
-    "cmpl	$11, %[nr]\n\t"           \
-    "movdqa	160(%[KEY]), %%xmm5\n\t"  \
-    "jl		%=f\n\t"                  \
-    "aesenc	%%xmm5, " #r "\n\t"       \
-    "aesenc	176(%[KEY]), " #r "\n\t"  \
-    "cmpl	$13, %[nr]\n\t"           \
-    "movdqa	192(%[KEY]), %%xmm5\n\t"  \
-    "jl		%=f\n\t"                  \
-    "aesenc	%%xmm5, " #r "\n\t"       \
-    "aesenc	208(%[KEY]), " #r "\n\t"  \
-    "movdqa	224(%[KEY]), %%xmm5\n\t"  \
-    "%=:\n\t"                             \
-    "aesenclast	%%xmm5, " #r "\n\t"
-#define AESENC_AVX(r)                     \
-        _AESENC_AVX(r)
-
-#define AESENC_BLOCK(in, out)               \
-    "movdqu	" VAR(CTR1) ", %%xmm4\n\t"  \
-    "movdqu	%%xmm4, %%xmm5\n\t"         \
-    "pshufb	%[BSWAP_EPI64], %%xmm4\n\t" \
-    "paddd	%[ONE], %%xmm5\n\t"         \
-    "pxor	(%[KEY]), %%xmm4\n\t"       \
-    "movdqu	%%xmm5, " VAR(CTR1) "\n\t"  \
-    AESENC_AVX(%%xmm4)                      \
-    "movdqu	(" #in "), %%xmm5\n\t"      \
-    "pxor	%%xmm5, %%xmm4\n\t"         \
-    "movdqu	%%xmm4, (" #out ")\n\t"     \
-    "pshufb	%[BSWAP_MASK], %%xmm4\n\t"  \
-    "pxor	%%xmm4, " VAR(XR) "\n\t"
-
-#define _AESENC_GFMUL(in, out, H, X)            \
-    "movdqu	" VAR(CTR1) ", %%xmm4\n\t"      \
-    "movdqu	%%xmm4, %%xmm5\n\t"             \
-    "pshufb	%[BSWAP_EPI64], %%xmm4\n\t"     \
-    "paddd	%[ONE], %%xmm5\n\t"             \
-    "pxor	(%[KEY]), %%xmm4\n\t"           \
-    "movdqu	%%xmm5, " VAR(CTR1) "\n\t"      \
-    "movdqa	" #X ", %%xmm6\n\t"             \
-    "pclmulqdq	$0x10, " #H ", %%xmm6\n\t"      \
-    "aesenc	16(%[KEY]), %%xmm4\n\t"         \
-    "aesenc	32(%[KEY]), %%xmm4\n\t"         \
-    "movdqa	" #X ", %%xmm7\n\t"             \
-    "pclmulqdq	$0x01, " #H ", %%xmm7\n\t"      \
-    "aesenc	48(%[KEY]), %%xmm4\n\t"         \
-    "aesenc	64(%[KEY]), %%xmm4\n\t"         \
-    "movdqa	" #X ", %%xmm8\n\t"             \
-    "pclmulqdq	$0x00, " #H ", %%xmm8\n\t"      \
-    "aesenc	80(%[KEY]), %%xmm4\n\t"         \
-    "movdqa	" #X ", %%xmm1\n\t"             \
-    "pclmulqdq	$0x11, " #H ", %%xmm1\n\t"      \
-    "aesenc	96(%[KEY]), %%xmm4\n\t"         \
-    "pxor	%%xmm7, %%xmm6\n\t"             \
-    "movdqa	%%xmm6, %%xmm2\n\t"             \
-    "psrldq	$8, %%xmm6\n\t"                 \
-    "pslldq	$8, %%xmm2\n\t"                 \
-    "aesenc	112(%[KEY]), %%xmm4\n\t"        \
-    "movdqa	%%xmm1, %%xmm3\n\t"             \
-    "pxor	%%xmm8, %%xmm2\n\t"             \
-    "pxor	%%xmm6, %%xmm3\n\t"             \
-    "movdqa	%[MOD2_128], %%xmm0\n\t"        \
-    "movdqa	%%xmm2, %%xmm7\n\t"             \
-    "pclmulqdq	$0x10, %%xmm0, %%xmm7\n\t"      \
-    "aesenc	128(%[KEY]), %%xmm4\n\t"        \
-    "pshufd	$0x4e, %%xmm2, %%xmm6\n\t"      \
-    "pxor	%%xmm7, %%xmm6\n\t"             \
-    "movdqa	%%xmm6, %%xmm7\n\t"             \
-    "pclmulqdq	$0x10, %%xmm0, %%xmm7\n\t"      \
-    "aesenc	144(%[KEY]), %%xmm4\n\t"        \
-    "pshufd	$0x4e, %%xmm6, " VAR(XR) "\n\t" \
-    "pxor	%%xmm7, " VAR(XR) "\n\t"        \
-    "pxor	%%xmm3, " VAR(XR) "\n\t"        \
-    "cmpl	$11, %[nr]\n\t"                 \
-    "movdqu	160(%[KEY]), %%xmm5\n\t"        \
-    "jl		%=f\n\t"                        \
-    "aesenc	%%xmm5, %%xmm4\n\t"             \
-    "aesenc	176(%[KEY]), %%xmm4\n\t"        \
-    "cmpl	$13, %[nr]\n\t"                 \
-    "movdqu	192(%[KEY]), %%xmm5\n\t"        \
-    "jl		%=f\n\t"                        \
-    "aesenc	%%xmm5, %%xmm4\n\t"             \
-    "aesenc	208(%[KEY]), %%xmm4\n\t"        \
-    "movdqa	224(%[KEY]), %%xmm5\n\t"        \
-    "%=:\n\t"                                   \
-    "aesenclast	%%xmm5, %%xmm4\n\t"             \
-    "movdqu	(" #in "), %%xmm5\n\t"          \
-    "pxor	%%xmm5, %%xmm4\n\t"             \
-    "movdqu	%%xmm4, (" #out ")\n\t"
-#define AESENC_GFMUL(in, out, H, X)             \
-       _AESENC_GFMUL(in, out, H, X)
-
-#define _GHASH_GFMUL_AVX(r, r2, a, b)      \
-    "pshufd	$0x4e, "#a", %%xmm1\n\t"   \
-    "pshufd	$0x4e, "#b", %%xmm2\n\t"   \
-    "movdqa	"#b", %%xmm3\n\t"          \
-    "movdqa	"#b", %%xmm0\n\t"          \
-    "pclmulqdq	$0x11, "#a", %%xmm3\n\t"   \
-    "pclmulqdq	$0x00, "#a", %%xmm0\n\t"   \
-    "pxor	"#a", %%xmm1\n\t"          \
-    "pxor	"#b", %%xmm2\n\t"          \
-    "pclmulqdq	$0x00, %%xmm2, %%xmm1\n\t" \
-    "pxor	%%xmm0, %%xmm1\n\t"        \
-    "pxor	%%xmm3, %%xmm1\n\t"        \
-    "movdqa	%%xmm1, %%xmm2\n\t"        \
-    "movdqa	%%xmm0, "#r2"\n\t"         \
-    "movdqa	%%xmm3, " #r "\n\t"        \
-    "pslldq	$8, %%xmm2\n\t"            \
-    "psrldq	$8, %%xmm1\n\t"            \
-    "pxor	%%xmm2, "#r2"\n\t"         \
-    "pxor	%%xmm1, " #r "\n\t"
-#define GHASH_GFMUL_AVX(r, r2, a, b)       \
-       _GHASH_GFMUL_AVX(r, r2, a, b)
-
-#define _GHASH_GFMUL_XOR_AVX(r, r2, a, b)  \
-    "pshufd	$0x4e, "#a", %%xmm1\n\t"   \
-    "pshufd	$0x4e, "#b", %%xmm2\n\t"   \
-    "movdqa	"#b", %%xmm3\n\t"          \
-    "movdqa	"#b", %%xmm0\n\t"          \
-    "pclmulqdq	$0x11, "#a", %%xmm3\n\t"   \
-    "pclmulqdq	$0x00, "#a", %%xmm0\n\t"   \
-    "pxor	"#a", %%xmm1\n\t"          \
-    "pxor	"#b", %%xmm2\n\t"          \
-    "pclmulqdq	$0x00, %%xmm2, %%xmm1\n\t" \
-    "pxor	%%xmm0, %%xmm1\n\t"        \
-    "pxor	%%xmm3, %%xmm1\n\t"        \
-    "movdqa	%%xmm1, %%xmm2\n\t"        \
-    "pxor	%%xmm0, "#r2"\n\t"         \
-    "pxor	%%xmm3, " #r "\n\t"        \
-    "pslldq	$8, %%xmm2\n\t"            \
-    "psrldq	$8, %%xmm1\n\t"            \
-    "pxor	%%xmm2, "#r2"\n\t"         \
-    "pxor	%%xmm1, " #r "\n\t"
-#define GHASH_GFMUL_XOR_AVX(r, r2, a, b)   \
-       _GHASH_GFMUL_XOR_AVX(r, r2, a, b)
-
-#define GHASH_MID_AVX(r, r2)        \
-    "movdqa	"#r2", %%xmm0\n\t"  \
-    "movdqa	" #r ", %%xmm1\n\t" \
-    "psrld	$31, %%xmm0\n\t"    \
-    "psrld	$31, %%xmm1\n\t"    \
-    "pslld	$1, "#r2"\n\t"      \
-    "pslld	$1, " #r "\n\t"     \
-    "movdqa	%%xmm0, %%xmm2\n\t" \
-    "pslldq	$4, %%xmm0\n\t"     \
-    "psrldq	$12, %%xmm2\n\t"    \
-    "pslldq	$4, %%xmm1\n\t"     \
-    "por	%%xmm2, " #r "\n\t" \
-    "por	%%xmm0, "#r2"\n\t"  \
-    "por	%%xmm1, " #r "\n\t"
-
-#define _GHASH_GFMUL_RED_AVX(r, a, b)      \
-    "pshufd	$0x4e, "#a", %%xmm5\n\t"   \
-    "pshufd	$0x4e, "#b", %%xmm6\n\t"   \
-    "movdqa	"#b", %%xmm7\n\t"          \
-    "movdqa	"#b", %%xmm4\n\t"          \
-    "pclmulqdq	$0x11, "#a", %%xmm7\n\t"   \
-    "pclmulqdq	$0x00, "#a", %%xmm4\n\t"   \
-    "pxor	"#a", %%xmm5\n\t"          \
-    "pxor	"#b", %%xmm6\n\t"          \
-    "pclmulqdq	$0x00, %%xmm6, %%xmm5\n\t" \
-    "pxor	%%xmm4, %%xmm5\n\t"        \
-    "pxor	%%xmm7, %%xmm5\n\t"        \
-    "movdqa	%%xmm5, %%xmm6\n\t"        \
-    "movdqa	%%xmm7, " #r "\n\t"        \
-    "pslldq	$8, %%xmm6\n\t"            \
-    "psrldq	$8, %%xmm5\n\t"            \
-    "pxor	%%xmm6, %%xmm4\n\t"        \
-    "pxor	%%xmm5, " #r "\n\t"        \
-    "movdqa	%%xmm4, %%xmm8\n\t"        \
-    "movdqa	%%xmm4, %%xmm9\n\t"        \
-    "movdqa	%%xmm4, %%xmm10\n\t"       \
-    "pslld	$31, %%xmm8\n\t"           \
-    "pslld	$30, %%xmm9\n\t"           \
-    "pslld	$25, %%xmm10\n\t"          \
-    "pxor	%%xmm9, %%xmm8\n\t"        \
-    "pxor	%%xmm10, %%xmm8\n\t"       \
-    "movdqa	%%xmm8, %%xmm9\n\t"        \
-    "psrldq	$4, %%xmm9\n\t"            \
-    "pslldq	$12, %%xmm8\n\t"           \
-    "pxor	%%xmm8, %%xmm4\n\t"        \
-    "movdqa	%%xmm4, %%xmm10\n\t"       \
-    "movdqa	%%xmm4, %%xmm6\n\t"        \
-    "movdqa	%%xmm4, %%xmm5\n\t"        \
-    "psrld	$1, %%xmm10\n\t"           \
-    "psrld	$2, %%xmm6\n\t"            \
-    "psrld	$7, %%xmm5\n\t"            \
-    "pxor	%%xmm6, %%xmm10\n\t"       \
-    "pxor	%%xmm5, %%xmm10\n\t"       \
-    "pxor	%%xmm9, %%xmm10\n\t"       \
-    "pxor	%%xmm4, %%xmm10\n\t"       \
-    "pxor	%%xmm10, " #r "\n\t"
-#define GHASH_GFMUL_RED_AVX(r, a, b)       \
-       _GHASH_GFMUL_RED_AVX(r, a, b)
-
-#define GHASH_RED_AVX(r, r2)           \
-    "movdqa	"#r2", %%xmm0\n\t"     \
-    "movdqa	"#r2", %%xmm1\n\t"     \
-    "movdqa	"#r2", %%xmm2\n\t"     \
-    "pslld	$31, %%xmm0\n\t"       \
-    "pslld	$30, %%xmm1\n\t"       \
-    "pslld	$25, %%xmm2\n\t"       \
-    "pxor	%%xmm1, %%xmm0\n\t"    \
-    "pxor	%%xmm2, %%xmm0\n\t"    \
-    "movdqa	%%xmm0, %%xmm1\n\t"    \
-    "psrldq	$4, %%xmm1\n\t"        \
-    "pslldq	$12, %%xmm0\n\t"       \
-    "pxor	%%xmm0, "#r2"\n\t"     \
-    "movdqa	"#r2", %%xmm2\n\t"     \
-    "movdqa	"#r2", %%xmm3\n\t"     \
-    "movdqa	"#r2", %%xmm0\n\t"     \
-    "psrld	$1, %%xmm2\n\t"        \
-    "psrld	$2, %%xmm3\n\t"        \
-    "psrld	$7, %%xmm0\n\t"        \
-    "pxor	%%xmm3, %%xmm2\n\t"    \
-    "pxor	%%xmm0, %%xmm2\n\t"    \
-    "pxor	%%xmm1, %%xmm2\n\t"    \
-    "pxor	"#r2", %%xmm2\n\t"     \
-    "pxor	%%xmm2, " #r "\n\t"
-
-#define GHASH_GFMUL_RED_XOR_AVX(r, r2, a, b) \
-    GHASH_GFMUL_XOR_AVX(r, r2, a, b)         \
-    GHASH_RED_AVX(r, r2)
-
-#define GHASH_FULL_AVX(r, r2, a, b) \
-    GHASH_GFMUL_AVX(r, r2, a, b)    \
-    GHASH_MID_AVX(r, r2)            \
-    GHASH_RED_AVX(r, r2)
-
-#define CALC_IV_12() \
-    "# Calculate values when IV is 12 bytes\n\t"      \
-    "# Set counter based on IV\n\t"                   \
-    "movl	$0x01000000, %%ecx\n\t"               \
-    "pinsrq	$0, 0(%%rax), %%xmm13\n\t"            \
-    "pinsrd	$2, 8(%%rax), %%xmm13\n\t"            \
-    "pinsrd	$3, %%ecx, %%xmm13\n\t"               \
-    "# H = Encrypt X(=0) and T = Encrypt counter\n\t" \
-    "movdqu	%%xmm13, %%xmm1\n\t"                  \
-    "movdqa	  0(%[KEY]), " VAR(HR) "\n\t"         \
-    "pxor	" VAR(HR) ", %%xmm1\n\t"              \
-    "movdqa	 16(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	 32(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	 48(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	 64(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	 80(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	 96(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	112(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	128(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	144(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "cmpl	$11, %[nr]\n\t"                       \
-    "movdqa	160(%[KEY]), %%xmm12\n\t"             \
-    "jl	31f\n\t"                                      \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqa	176(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "cmpl	$13, %[nr]\n\t"                       \
-    "movdqa	192(%[KEY]), %%xmm12\n\t"             \
-    "jl	31f\n\t"                                      \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqu	208(%[KEY]), %%xmm12\n\t"             \
-    "aesenc	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenc	%%xmm12, %%xmm1\n\t"                  \
-    "movdqu	224(%[KEY]), %%xmm12\n\t"             \
-    "31:\n\t"                                         \
-    "aesenclast	%%xmm12, " VAR(HR) "\n\t"             \
-    "aesenclast	%%xmm12, %%xmm1\n\t"                  \
-    "pshufb	%[BSWAP_MASK], " VAR(HR) "\n\t"       \
-    "movdqu	%%xmm1, " VAR(TR) "\n\t"              \
-    "jmp	39f\n\t"
-
-#define CALC_IV()                                    \
-    "# Calculate values when IV is not 12 bytes\n\t" \
-    "# H = Encrypt X(=0)\n\t"                        \
-    "movdqa	0(%[KEY]), " VAR(HR) "\n\t"          \
-    AESENC_AVX(HR)                                   \
-    "pshufb	%[BSWAP_MASK], " VAR(HR) "\n\t"      \
-    "# Calc counter\n\t"                             \
-    "# Initialization vector\n\t"                    \
-    "cmpl	$0, %%edx\n\t"                       \
-    "movq	$0, %%rcx\n\t"                       \
-    "je	45f\n\t"                                     \
-    "cmpl	$16, %%edx\n\t"                      \
-    "jl	44f\n\t"                                     \
-    "andl	$0xfffffff0, %%edx\n\t"              \
-    "\n"                                             \
-    "43:\n\t"                                        \
-    "movdqu	(%%rax,%%rcx,1), %%xmm4\n\t"         \
-    "pshufb	%[BSWAP_MASK], %%xmm4\n\t"           \
-    "pxor	%%xmm4, %%xmm13\n\t"                 \
-    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR)    \
-    "addl	$16, %%ecx\n\t"                      \
-    "cmpl	%%edx, %%ecx\n\t"                    \
-    "jl	43b\n\t"                                     \
-    "movl	%[ibytes], %%edx\n\t"                \
-    "cmpl	%%edx, %%ecx\n\t"                    \
-    "je	45f\n\t"                                     \
-    "\n"                                             \
-    "44:\n\t"                                        \
-    "subq	$16, %%rsp\n\t"                      \
-    "pxor	%%xmm4, %%xmm4\n\t"                  \
-    "xorl	%%ebx, %%ebx\n\t"                    \
-    "movdqu	%%xmm4, (%%rsp)\n\t"                 \
-    "42:\n\t"                                        \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t"         \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t"         \
-    "incl	%%ecx\n\t"                           \
-    "incl	%%ebx\n\t"                           \
-    "cmpl	%%edx, %%ecx\n\t"                    \
-    "jl	42b\n\t"                                     \
-    "movdqu	(%%rsp), %%xmm4\n\t"                 \
-    "addq	$16, %%rsp\n\t"                      \
-    "pshufb	%[BSWAP_MASK], %%xmm4\n\t"           \
-    "pxor	%%xmm4, %%xmm13\n\t"                 \
-    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR)    \
-    "\n"                                             \
-    "45:\n\t"                                        \
-    "# T = Encrypt counter\n\t"                      \
-    "pxor	%%xmm0, %%xmm0\n\t"                  \
-    "shll	$3, %%edx\n\t"                       \
-    "pinsrq	$0, %%rdx, %%xmm0\n\t"               \
-    "pxor	%%xmm0, %%xmm13\n\t"                 \
-    GHASH_FULL_AVX(%%xmm13, %%xmm12, %%xmm13, HR)    \
-    "pshufb	%[BSWAP_MASK], %%xmm13\n\t"          \
-    "#   Encrypt counter\n\t"                        \
-    "movdqa	0(%[KEY]), %%xmm4\n\t"               \
-    "pxor	%%xmm13, %%xmm4\n\t"                 \
-    AESENC_AVX(%%xmm4)                               \
-    "movdqu	%%xmm4, " VAR(TR) "\n\t"
-
-#define CALC_AAD()                           \
-    "# Additional authentication data\n\t"   \
-    "movl	%[abytes], %%edx\n\t"        \
-    "cmpl	$0, %%edx\n\t"               \
-    "je		25f\n\t"                     \
-    "movq	%[addt], %%rax\n\t"          \
-    "xorl	%%ecx, %%ecx\n\t"            \
-    "cmpl	$16, %%edx\n\t"              \
-    "jl		24f\n\t"                     \
-    "andl	$0xfffffff0, %%edx\n\t"      \
-    "\n"                                     \
-    "23:\n\t"                                \
-    "movdqu	(%%rax,%%rcx,1), %%xmm4\n\t" \
-    "pshufb	%[BSWAP_MASK], %%xmm4\n\t"   \
-    "pxor	%%xmm4, " VAR(XR) "\n\t"     \
-    GHASH_FULL_AVX(XR, %%xmm12, XR, HR)      \
-    "addl	$16, %%ecx\n\t"              \
-    "cmpl	%%edx, %%ecx\n\t"            \
-    "jl		23b\n\t"                     \
-    "movl	%[abytes], %%edx\n\t"        \
-    "cmpl	%%edx, %%ecx\n\t"            \
-    "je		25f\n\t"                     \
-    "\n"                                     \
-    "24:\n\t"                                \
-    "subq	$16, %%rsp\n\t"              \
-    "pxor	%%xmm4, %%xmm4\n\t"          \
-    "xorl	%%ebx, %%ebx\n\t"            \
-    "movdqu	%%xmm4, (%%rsp)\n\t"         \
-    "22:\n\t"                                \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t" \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t" \
-    "incl	%%ecx\n\t"                   \
-    "incl	%%ebx\n\t"                   \
-    "cmpl	%%edx, %%ecx\n\t"            \
-    "jl		22b\n\t"                     \
-    "movdqu	(%%rsp), %%xmm4\n\t"         \
-    "addq	$16, %%rsp\n\t"              \
-    "pshufb	%[BSWAP_MASK], %%xmm4\n\t"   \
-    "pxor	%%xmm4, " VAR(XR) "\n\t"     \
-    GHASH_FULL_AVX(XR, %%xmm12, XR, HR)      \
-    "\n"                                     \
-    "25:\n\t"
-
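-/* AAD is absorbed into the running GHASH state XR one 16-byte block at a
- * time: X = (X xor A_i) * H. Loop 23 handles whole blocks; 24/22 copy a
- * short tail into a zeroed stack buffer so the last block is zero-padded. */
-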
-#define CALC_HT_8_AVX()                            \
-    "movdqa	" VAR(XR) ", %%xmm2\n\t"           \
-    "# H ^ 1\n\t"                                  \
-    "movdqu	" VAR(HR) ", 0(" VAR(HTR) ")\n\t"  \
-    "# H ^ 2\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm0, HR, HR)            \
-    "movdqu	%%xmm0 ,  16(" VAR(HTR) ")\n\t"    \
-    "# H ^ 3\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm1, HR, %%xmm0)        \
-    "movdqu	%%xmm1 ,  32(" VAR(HTR) ")\n\t"    \
-    "# H ^ 4\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm3, %%xmm0, %%xmm0)    \
-    "movdqu	%%xmm3 ,  48(" VAR(HTR) ")\n\t"    \
-    "# H ^ 5\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm0, %%xmm1)   \
-    "movdqu	%%xmm12,  64(" VAR(HTR) ")\n\t"    \
-    "# H ^ 6\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm1)   \
-    "movdqu	%%xmm12,  80(" VAR(HTR) ")\n\t"    \
-    "# H ^ 7\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm1, %%xmm3)   \
-    "movdqu	%%xmm12,  96(" VAR(HTR) ")\n\t"    \
-    "# H ^ 8\n\t"                                  \
-    GHASH_GFMUL_RED_AVX(%%xmm12, %%xmm3, %%xmm3)   \
-    "movdqu	%%xmm12, 112(" VAR(HTR) ")\n\t"
-
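-/* HTR points at a 128-byte table holding H^1..H^8 (16 bytes each). With
- * eight powers available, eight ciphertext blocks fold into XR as
- * (X xor C1)*H^8 xor C2*H^7 xor ... xor C8*H with a single reduction,
- * which is what the unrolled 128-byte loops below rely on. */
-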
-#define AESENC_128_GHASH_AVX(src, o)                 \
-    "leaq	(%[in]," VAR(KR64) ",1), %%rcx\n\t"  \
-    "leaq	(%[out]," VAR(KR64) ",1), %%rdx\n\t" \
-    /* src is either %%rcx or %%rdx */               \
-    AESENC_CTR()                                     \
-    AESENC_XOR()                                     \
-    AESENC_PCLMUL_1(src,  16, o-128, 112)            \
-    AESENC_PCLMUL_N(src,  32, o-112,  96)            \
-    AESENC_PCLMUL_N(src,  48, o -96,  80)            \
-    AESENC_PCLMUL_N(src,  64, o -80,  64)            \
-    AESENC_PCLMUL_N(src,  80, o -64,  48)            \
-    AESENC_PCLMUL_N(src,  96, o -48,  32)            \
-    AESENC_PCLMUL_N(src, 112, o -32,  16)            \
-    AESENC_PCLMUL_N(src, 128, o -16,   0)            \
-    AESENC_PCLMUL_L(144)                             \
-    "cmpl	$11, %[nr]\n\t"                      \
-    "movdqa	160(%[KEY]), %%xmm12\n\t"            \
-    "jl		4f\n\t"                              \
-    AESENC()                                         \
-    AESENC_SET(176)                                  \
-    "cmpl	$13, %[nr]\n\t"                      \
-    "movdqa	192(%[KEY]), %%xmm12\n\t"            \
-    "jl		4f\n\t"                              \
-    AESENC()                                         \
-    AESENC_SET(208)                                  \
-    "movdqa	224(%[KEY]), %%xmm12\n\t"            \
-    "\n"                                             \
-"4:\n\t"                                             \
-    AESENC_LAST(%%rcx, %%rdx)
-
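-/* AESENC_128_GHASH_AVX encrypts the next 128 bytes of counter stream while
- * GHASHing the previous 128 bytes of ciphertext (note the o-128..o-16 source
- * offsets), so the pclmulqdq latency hides behind the AES rounds. */
-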
-#define AESENC_LAST15_ENC_AVX()                       \
-    "movl	%[nbytes], %%ecx\n\t"                 \
-    "movl	%%ecx, %%edx\n\t"                     \
-    "andl	$0x0f, %%ecx\n\t"                     \
-    "jz		55f\n\t"                              \
-    "movdqu	" VAR(CTR1) ", %%xmm13\n\t"           \
-    "pshufb	%[BSWAP_EPI64], %%xmm13\n\t"          \
-    "pxor	0(%[KEY]), %%xmm13\n\t"               \
-    AESENC_AVX(%%xmm13)                               \
-    "subq	$16, %%rsp\n\t"                       \
-    "xorl	%%ecx, %%ecx\n\t"                     \
-    "movdqu	%%xmm13, (%%rsp)\n\t"                 \
-    "\n"                                              \
-    "51:\n\t"                                         \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"  \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"          \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t" \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"          \
-    "incl	" VAR(KR) "\n\t"                      \
-    "incl	%%ecx\n\t"                            \
-    "cmpl	%%edx, " VAR(KR) "\n\t"               \
-    "jl		51b\n\t"                              \
-    "xorq	%%r13, %%r13\n\t"                     \
-    "cmpl	$16, %%ecx\n\t"                       \
-    "je		53f\n\t"                              \
-    "\n"                                              \
-    "52:\n\t"                                         \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"          \
-    "incl	%%ecx\n\t"                            \
-    "cmpl	$16, %%ecx\n\t"                       \
-    "jl		52b\n\t"                              \
-    "53:\n\t"                                         \
-    "movdqu	(%%rsp), %%xmm13\n\t"                 \
-    "addq	$16, %%rsp\n\t"                       \
-    "pshufb	%[BSWAP_MASK], %%xmm13\n\t"           \
-    "pxor	%%xmm13, " VAR(XR) "\n\t"             \
-    GHASH_GFMUL_RED_AVX(XR, HR, XR)
-
-#define AESENC_LAST15_DEC_AVX()                       \
-    "movl	%[nbytes], %%ecx\n\t"                 \
-    "movl	%%ecx, %%edx\n\t"                     \
-    "andl	$0x0f, %%ecx\n\t"                     \
-    "jz		55f\n\t"                              \
-    "movdqu	" VAR(CTR1) ", %%xmm13\n\t"           \
-    "pshufb	%[BSWAP_EPI64], %%xmm13\n\t"          \
-    "pxor	0(%[KEY]), %%xmm13\n\t"               \
-    AESENC_AVX(%%xmm13)                               \
-    "subq	$32, %%rsp\n\t"                       \
-    "xorl	%%ecx, %%ecx\n\t"                     \
-    "movdqu	%%xmm13, (%%rsp)\n\t"                 \
-    "pxor	%%xmm0, %%xmm0\n\t"                   \
-    "movdqu	%%xmm0, 16(%%rsp)\n\t"                \
-    "\n"                                              \
-    "51:\n\t"                                         \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"  \
-    "movb	%%r13b, 16(%%rsp,%%rcx,1)\n\t"        \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"          \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t" \
-    "incl	" VAR(KR) "\n\t"                      \
-    "incl	%%ecx\n\t"                            \
-    "cmpl	%%edx, " VAR(KR) "\n\t"               \
-    "jl		51b\n\t"                              \
-    "53:\n\t"                                         \
-    "movdqu	16(%%rsp), %%xmm13\n\t"               \
-    "addq	$32, %%rsp\n\t"                       \
-    "pshufb	%[BSWAP_MASK], %%xmm13\n\t"           \
-    "pxor	%%xmm13, " VAR(XR) "\n\t"             \
-    GHASH_GFMUL_RED_AVX(XR, HR, XR)
-
-#define CALC_TAG()                              \
-    "movl	%[nbytes], %%edx\n\t"           \
-    "movl	%[abytes], %%ecx\n\t"           \
-    "shlq	$3, %%rdx\n\t"                  \
-    "shlq	$3, %%rcx\n\t"                  \
-    "pinsrq	$0, %%rdx, %%xmm0\n\t"          \
-    "pinsrq	$1, %%rcx, %%xmm0\n\t"          \
-    "pxor	%%xmm0, " VAR(XR) "\n\t"        \
-    GHASH_GFMUL_RED_AVX(XR, HR, XR)             \
-    "pshufb	%[BSWAP_MASK], " VAR(XR) "\n\t" \
-    "movdqu	" VAR(TR) ", %%xmm0\n\t"        \
-    "pxor	" VAR(XR) ", %%xmm0\n\t"        \
-
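-/* Tag finalization: the lengths block (bit lengths of the ciphertext and of
- * the AAD) is XORed into XR, one last GHASH multiply is performed, and the
- * result is byte-swapped and XORed with T = E_K(J0) to give the tag in xmm0. */
-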
-#define STORE_TAG()                           \
-    "cmpl	$16, %[tbytes]\n\t"           \
-    "je		71f\n\t"                      \
-    "xorq	%%rcx, %%rcx\n\t"             \
-    "movdqu	%%xmm0, (%%rsp)\n\t"          \
-    "73:\n\t"                                 \
-    "movzbl	(%%rsp,%%rcx,1), %%r13d\n\t"  \
-    "movb	%%r13b, (%[tag],%%rcx,1)\n\t" \
-    "incl	%%ecx\n\t"                    \
-    "cmpl	%[tbytes], %%ecx\n\t"         \
-    "jne	73b\n\t"                      \
-    "jmp	72f\n\t"                      \
-    "\n"                                      \
-    "71:\n\t"                                 \
-    "movdqu	%%xmm0, (%[tag])\n\t"         \
-    "\n"                                      \
-    "72:\n\t"
-
-#define CMP_TAG()                                          \
-    "cmpl	$16, %[tbytes]\n\t"                        \
-    "je		71f\n\t"                                   \
-    "subq	$16, %%rsp\n\t"                            \
-    "xorq	%%rcx, %%rcx\n\t"                          \
-    "xorq	%%rax, %%rax\n\t"                          \
-    "movdqu	%%xmm0, (%%rsp)\n\t"                       \
-    "\n"                                                   \
-    "73:\n\t"                                              \
-    "movzbl	(%%rsp,%%rcx,1), %%r13d\n\t"               \
-    "xorb	(%[tag],%%rcx,1), %%r13b\n\t"              \
-    "orb	%%r13b, %%al\n\t"                          \
-    "incl	%%ecx\n\t"                                 \
-    "cmpl	%[tbytes], %%ecx\n\t"                      \
-    "jne	73b\n\t"                                   \
-    "cmpb	$0x00, %%al\n\t"                           \
-    "sete	%%al\n\t"                                  \
-    "addq	$16, %%rsp\n\t"                            \
-    "xorq	%%rcx, %%rcx\n\t"                          \
-    "jmp	72f\n\t"                                   \
-    "\n"                                                   \
-    "71:\n\t"                                              \
-    "movdqu	(%[tag]), %%xmm1\n\t"                      \
-    "pcmpeqb	%%xmm1, %%xmm0\n\t"                        \
-    "pmovmskb	%%xmm0, %%edx\n\t"                         \
-    "# %%edx == 0xFFFF then return 1 else => return 0\n\t" \
-    "xorl	%%eax, %%eax\n\t"                          \
-    "cmpl	$0xffff, %%edx\n\t"                        \
-    "sete	%%al\n\t"                                  \
-    "\n"                                                   \
-    "72:\n\t"                                              \
-    "movl	%%eax, (%[res])\n\t"
-
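-/* CMP_TAG compares the computed tag with the caller's without an early exit:
- * for partial tags it ORs together the XOR of every byte pair; for 16-byte
- * tags it uses pcmpeqb/pmovmskb and checks the mask against 0xFFFF. The
- * helper below is a plain-C sketch of the same byte-accumulating pattern;
- * it is illustrative only and not part of the original source. */
-#if 0
-static int ct_tag_eq(const unsigned char* a, const unsigned char* b,
-                     unsigned int len)
-{
-    unsigned char diff = 0;
-    unsigned int i;
-    for (i = 0; i < len; i++)
-        diff |= (unsigned char)(a[i] ^ b[i]);
-    return diff == 0; /* 1 on match, 0 otherwise */
-}
-#endif
-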
-static void AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
-                            const unsigned char* addt,
-                            const unsigned char* ivec, unsigned char *tag,
-                            unsigned int nbytes, unsigned int abytes,
-                            unsigned int ibytes, unsigned int tbytes,
-                            const unsigned char* key, int nr)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register unsigned int ivLen asm("ebx") = ibytes;
-
-    __asm__ __volatile__ (
-        "subq	$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "pxor	%%xmm13, %%xmm13\n\t"
-        "pxor	" VAR(XR) ", " VAR(XR) "\n\t"
-        "movl	%[ibytes], %%edx\n\t"
-        "cmpl	$12, %%edx\n\t"
-        "jne	35f\n\t"
-        CALC_IV_12()
-        "\n"
-        "35:\n\t"
-        CALC_IV()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD()
-
-        "# Calculate counter and H\n\t"
-        "pshufb	%[BSWAP_EPI64], %%xmm13\n\t"
-        "movdqa	" VAR(HR) ", %%xmm5\n\t"
-        "paddd	%[ONE], %%xmm13\n\t"
-        "movdqa	" VAR(HR) ", %%xmm4\n\t"
-        "movdqu	%%xmm13, " VAR(CTR1) "\n\t"
-        "psrlq	$63, %%xmm5\n\t"
-        "psllq	$1, %%xmm4\n\t"
-        "pslldq	$8, %%xmm5\n\t"
-        "por	%%xmm5, %%xmm4\n\t"
-        "pshufd	$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "psrad	$31, " VAR(HR) "\n\t"
-        "pand	%[MOD2_128], " VAR(HR) "\n\t"
-        "pxor	%%xmm4, " VAR(HR) "\n\t"
-
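-        /* The shift/mask sequence above computes H = H * x in GF(2^128)
-         * (left shift by one bit with conditional reduction by MOD2_128);
-         * pre-shifting H this way matches GCM's reflected bit order so the
-         * per-block GFMUL_RED sequences need no extra one-bit shift. */
-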
-        "xorl	" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-        "cmpl	$128, %[nbytes]\n\t"
-        "movl	%[nbytes], %%r13d\n\t"
-        "jl	5f\n\t"
-        "andl	$0xffffff80, %%r13d\n\t"
-
-        CALC_HT_8_AVX()
-
-        "# First 128 bytes of input\n\t"
-        AESENC_CTR()
-        AESENC_XOR()
-        AESENC_SET(16)
-        AESENC_SET(32)
-        AESENC_SET(48)
-        AESENC_SET(64)
-        AESENC_SET(80)
-        AESENC_SET(96)
-        AESENC_SET(112)
-        AESENC_SET(128)
-        AESENC_SET(144)
-        "cmpl	$11, %[nr]\n\t"
-        "movdqa	160(%[KEY]), %%xmm12\n\t"
-        "jl	1f\n\t"
-        AESENC()
-        AESENC_SET(176)
-        "cmpl	$13, %[nr]\n\t"
-        "movdqa	192(%[KEY]), %%xmm12\n\t"
-        "jl	1f\n\t"
-        AESENC()
-        AESENC_SET(208)
-        "movdqa	224(%[KEY]), %%xmm12\n\t"
-        "\n"
-    "1:\n\t"
-        AESENC_LAST(%[in], %[out])
-
-        "cmpl	$128, %%r13d\n\t"
-        "movl	$128, " VAR(KR) "\n\t"
-        "jle	2f\n\t"
-
-        "# More 128 bytes of input\n\t"
-        "\n"
-    "3:\n\t"
-        AESENC_128_GHASH_AVX(%%rdx, 0)
-        "addl	$128, " VAR(KR) "\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jl	3b\n\t"
-        "\n"
-    "2:\n\t"
-        "movdqa	%[BSWAP_MASK], %%xmm13\n\t"
-        "pshufb	%%xmm13, %%xmm4\n\t"
-        "pshufb	%%xmm13, %%xmm5\n\t"
-        "pshufb	%%xmm13, %%xmm6\n\t"
-        "pshufb	%%xmm13, %%xmm7\n\t"
-        "pxor	%%xmm2, %%xmm4\n\t"
-        "pshufb	%%xmm13, %%xmm8\n\t"
-        "pshufb	%%xmm13, %%xmm9\n\t"
-        "pshufb	%%xmm13, %%xmm10\n\t"
-        "pshufb	%%xmm13, %%xmm11\n\t"
-
-        "movdqu	112(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_AVX(XR, %%xmm13, %%xmm4, %%xmm12)
-        "movdqu	 96(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm5, %%xmm12)
-        "movdqu	 80(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm6, %%xmm12)
-        "movdqu	 64(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm7, %%xmm12)
-        "movdqu	 48(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm8, %%xmm12)
-        "movdqu	 32(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm9, %%xmm12)
-        "movdqu	 16(" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_XOR_AVX(XR, %%xmm13, %%xmm10, %%xmm12)
-        "movdqu	   (" VAR(HTR) "), %%xmm12\n\t"
-        GHASH_GFMUL_RED_XOR_AVX(XR, %%xmm13, %%xmm11, %%xmm12)
-
-        "movdqu	0(" VAR(HTR) "), " VAR(HR) "\n\t"
-        "\n"
-    "5:\n\t"
-        "movl	%[nbytes], %%edx\n\t"
-        "cmpl	%%edx, " VAR(KR) "\n\t"
-        "jge	55f\n\t"
-#endif
-
-        "movl	%[nbytes], %%r13d\n\t"
-        "andl	$0xfffffff0, %%r13d\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jge	14f\n\t"
-
-        "leaq	(%[in]," VAR(KR64) ",1), %%rcx\n\t"
-        "leaq	(%[out]," VAR(KR64) ",1), %%rdx\n\t"
-        AESENC_BLOCK(%%rcx, %%rdx)
-        "addl	$16, " VAR(KR) "\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jge	13f\n\t"
-        "\n"
-        "12:\n\t"
-        "leaq	(%[in]," VAR(KR64) ",1), %%rcx\n\t"
-        "leaq	(%[out]," VAR(KR64) ",1), %%rdx\n\t"
-        AESENC_GFMUL(%%rcx, %%rdx, HR, XR)
-        "pshufb	%[BSWAP_MASK], %%xmm4\n\t"
-        "pxor	%%xmm4, " VAR(XR) "\n\t"
-        "addl	$16, " VAR(KR) "\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jl	12b\n\t"
-        "\n"
-        "13:\n\t"
-        GHASH_GFMUL_RED_AVX(XR, HR, XR)
-        "\n"
-        "14:\n\t"
-
-        AESENC_LAST15_ENC_AVX()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG()
-        STORE_TAG()
-        "addq	$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes),
-          [tag] "r" (tag),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "rdx", "r13"
-    );
-}
-
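-/* Throughout these routines nr is the AES round count: 10, 12, or 14 for
- * 128-, 192-, or 256-bit keys. Round keys are 16 bytes apart, so the
- * "cmpl $11 / cmpl $13" ladders select the last round key at offset 160,
- * 192, or 224 from KEY respectively. */
-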
-#ifdef HAVE_INTEL_AVX1
-/* One AES encryption round on the eight blocks, using the round key in xmm12. */
-#define VAESENC()                              \
-    "vaesenc	%%xmm12, %%xmm4, %%xmm4\n\t"   \
-    "vaesenc	%%xmm12, %%xmm5, %%xmm5\n\t"   \
-    "vaesenc	%%xmm12, %%xmm6, %%xmm6\n\t"   \
-    "vaesenc	%%xmm12, %%xmm7, %%xmm7\n\t"   \
-    "vaesenc	%%xmm12, %%xmm8, %%xmm8\n\t"   \
-    "vaesenc	%%xmm12, %%xmm9, %%xmm9\n\t"   \
-    "vaesenc	%%xmm12, %%xmm10, %%xmm10\n\t" \
-    "vaesenc	%%xmm12, %%xmm11, %%xmm11\n\t"
-
-#define VAESENC_SET(o)                         \
-    "vmovdqa	"#o"(%[KEY]), %%xmm12\n\t"     \
-    VAESENC()
-
-#define VAESENC_CTR()                          \
-    "vmovdqu	" VAR(CTR1) ", %%xmm0\n\t"     \
-    "vmovdqa	%[BSWAP_EPI64], %%xmm1\n\t"    \
-    "vpshufb	%%xmm1, %%xmm0, %%xmm4\n\t"    \
-    "vpaddd	%[ONE], %%xmm0, %%xmm5\n\t"    \
-    "vpshufb	%%xmm1, %%xmm5, %%xmm5\n\t"    \
-    "vpaddd	%[TWO], %%xmm0, %%xmm6\n\t"    \
-    "vpshufb	%%xmm1, %%xmm6, %%xmm6\n\t"    \
-    "vpaddd	%[THREE], %%xmm0, %%xmm7\n\t"  \
-    "vpshufb	%%xmm1, %%xmm7, %%xmm7\n\t"    \
-    "vpaddd	%[FOUR], %%xmm0, %%xmm8\n\t"   \
-    "vpshufb	%%xmm1, %%xmm8, %%xmm8\n\t"    \
-    "vpaddd	%[FIVE], %%xmm0, %%xmm9\n\t"   \
-    "vpshufb	%%xmm1, %%xmm9, %%xmm9\n\t"    \
-    "vpaddd	%[SIX], %%xmm0, %%xmm10\n\t"   \
-    "vpshufb	%%xmm1, %%xmm10, %%xmm10\n\t"  \
-    "vpaddd	%[SEVEN], %%xmm0, %%xmm11\n\t" \
-    "vpshufb	%%xmm1, %%xmm11, %%xmm11\n\t"  \
-    "vpaddd	%[EIGHT], %%xmm0, %%xmm0\n\t"
-
-#define VAESENC_XOR()                          \
-    "vmovdqa	(%[KEY]), %%xmm12\n\t"         \
-    "vmovdqu	%%xmm0, " VAR(CTR1) "\n\t"     \
-    "vpxor	%%xmm12, %%xmm4, %%xmm4\n\t"   \
-    "vpxor	%%xmm12, %%xmm5, %%xmm5\n\t"   \
-    "vpxor	%%xmm12, %%xmm6, %%xmm6\n\t"   \
-    "vpxor	%%xmm12, %%xmm7, %%xmm7\n\t"   \
-    "vpxor	%%xmm12, %%xmm8, %%xmm8\n\t"   \
-    "vpxor	%%xmm12, %%xmm9, %%xmm9\n\t"   \
-    "vpxor	%%xmm12, %%xmm10, %%xmm10\n\t" \
-    "vpxor	%%xmm12, %%xmm11, %%xmm11\n\t"
-
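-/* Counter generation for the 8-way pipeline: GCM increments only the low 32
- * bits of the counter, so the next seven counter values are formed with
- * 32-bit adds of the constants ONE..SEVEN, each byte-reversed with vpshufb
- * into the big-endian layout AES consumes; EIGHT advances the saved counter
- * in CTR1, and VAESENC_XOR starts all eight blocks with round key 0. */
-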
-#define VAESENC_128()                     \
-    VAESENC_CTR()                         \
-    VAESENC_XOR()                         \
-    VAESENC_SET(16)                       \
-    VAESENC_SET(32)                       \
-    VAESENC_SET(48)                       \
-    VAESENC_SET(64)                       \
-    VAESENC_SET(80)                       \
-    VAESENC_SET(96)                       \
-    VAESENC_SET(112)                      \
-    VAESENC_SET(128)                      \
-    VAESENC_SET(144)                      \
-    "cmpl	$11, %[nr]\n\t"           \
-    "vmovdqa	160(%[KEY]), %%xmm12\n\t" \
-    "jl	1f\n\t"                           \
-    VAESENC()                             \
-    VAESENC_SET(176)                      \
-    "cmpl	$13, %[nr]\n\t"           \
-    "vmovdqa	192(%[KEY]), %%xmm12\n\t" \
-    "jl	1f\n\t"                           \
-    VAESENC()                             \
-    VAESENC_SET(208)                      \
-    "vmovdqa	224(%[KEY]), %%xmm12\n\t" \
-    "\n"                                  \
-"1:\n\t"                                  \
-    VAESENC_LAST(%[in], %[out])
-
-/* Encrypt and carry-less multiply for AVX1. */
-#define VAESENC_PCLMUL_1(src, o1, o2, o3)              \
-    "vmovdqu	" #o3 "(" VAR(HTR) "), %%xmm12\n\t"    \
-    "vmovdqu	" #o2 "(" #src "), %%xmm0\n\t"         \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm4, %%xmm4\n\t"   \
-    "vpshufb	%[BSWAP_MASK], %%xmm0, %%xmm0\n\t"     \
-    "vpxor	%%xmm2, %%xmm0, %%xmm0\n\t"            \
-    "vpshufd	$0x4e, %%xmm12, %%xmm1\n\t"            \
-    "vpshufd	$0x4e, %%xmm0, %%xmm14\n\t"            \
-    "vpxor	%%xmm12, %%xmm1, %%xmm1\n\t"           \
-    "vpxor	%%xmm0, %%xmm14, %%xmm14\n\t"          \
-    "vpclmulqdq	$0x11, %%xmm12, %%xmm0, %%xmm3\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm5, %%xmm5\n\t"   \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm6, %%xmm6\n\t"   \
-    "vpclmulqdq	$0x00, %%xmm12, %%xmm0, %%xmm2\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm7, %%xmm7\n\t"   \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm8, %%xmm8\n\t"   \
-    "vpclmulqdq	$0x00, %%xmm14, %%xmm1, %%xmm1\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm9, %%xmm9\n\t"   \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm10, %%xmm10\n\t" \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm11, %%xmm11\n\t" \
-    "vpxor      %%xmm2, %%xmm1, %%xmm1\n\t"            \
-    "vpxor      %%xmm3, %%xmm1, %%xmm1\n\t"            \
-
-#define VAESENC_PCLMUL_N(src, o1, o2, o3)               \
-    "vmovdqu	" #o3 "(" VAR(HTR) "), %%xmm12\n\t"     \
-    "vmovdqu	" #o2 "(" #src "), %%xmm0\n\t"          \
-    "vpshufd	$0x4e, %%xmm12, %%xmm13\n\t"            \
-    "vpshufb	%[BSWAP_MASK], %%xmm0, %%xmm0\n\t"      \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpxor	%%xmm12, %%xmm13, %%xmm13\n\t"          \
-    "vpshufd	$0x4e, %%xmm0, %%xmm14\n\t"             \
-    "vpxor	%%xmm0, %%xmm14, %%xmm14\n\t"           \
-    "vpclmulqdq	$0x11, %%xmm12, %%xmm0, %%xmm15\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm5, %%xmm5\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm6, %%xmm6\n\t"    \
-    "vpclmulqdq	$0x00, %%xmm12, %%xmm0, %%xmm12\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm7, %%xmm7\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm8, %%xmm8\n\t"    \
-    "vpclmulqdq	$0x00, %%xmm14, %%xmm13, %%xmm13\n\t"   \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm9, %%xmm9\n\t"    \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm10, %%xmm10\n\t"  \
-    "vaesenc	" #o1 "(%[KEY]), %%xmm11, %%xmm11\n\t"  \
-    "vpxor      %%xmm12, %%xmm1, %%xmm1\n\t"            \
-    "vpxor      %%xmm12, %%xmm2, %%xmm2\n\t"            \
-    "vpxor      %%xmm15, %%xmm1, %%xmm1\n\t"            \
-    "vpxor      %%xmm15, %%xmm3, %%xmm3\n\t"            \
-    "vpxor      %%xmm13, %%xmm1, %%xmm1\n\t"            \
-
-#define VAESENC_PCLMUL_L(o)                         \
-    "vpslldq	$8, %%xmm1, %%xmm14\n\t"            \
-    "vpsrldq	$8, %%xmm1, %%xmm1\n\t"             \
-    "vaesenc	"#o"(%[KEY]), %%xmm4, %%xmm4\n\t"   \
-    "vpxor      %%xmm14, %%xmm2, %%xmm2\n\t"        \
-    "vpxor      %%xmm1, %%xmm3, %%xmm3\n\t"         \
-    "vaesenc	"#o"(%[KEY]), %%xmm5, %%xmm5\n\t"   \
-    "vpslld	$31, %%xmm2, %%xmm12\n\t"           \
-    "vpslld	$30, %%xmm2, %%xmm13\n\t"           \
-    "vpslld	$25, %%xmm2, %%xmm14\n\t"           \
-    "vaesenc	"#o"(%[KEY]), %%xmm6, %%xmm6\n\t"   \
-    "vpxor	%%xmm13, %%xmm12, %%xmm12\n\t"      \
-    "vpxor	%%xmm14, %%xmm12, %%xmm12\n\t"      \
-    "vaesenc	"#o"(%[KEY]), %%xmm7, %%xmm7\n\t"   \
-    "vpsrldq	$4, %%xmm12, %%xmm13\n\t"           \
-    "vpslldq	$12, %%xmm12, %%xmm12\n\t"          \
-    "vaesenc	"#o"(%[KEY]), %%xmm8, %%xmm8\n\t"   \
-    "vpxor	%%xmm12, %%xmm2, %%xmm2\n\t"        \
-    "vpsrld	$1, %%xmm2, %%xmm14\n\t"            \
-    "vaesenc	"#o"(%[KEY]), %%xmm9, %%xmm9\n\t"   \
-    "vpsrld	$2, %%xmm2, %%xmm1\n\t"             \
-    "vpsrld	$7, %%xmm2, %%xmm0\n\t"             \
-    "vaesenc	"#o"(%[KEY]), %%xmm10, %%xmm10\n\t" \
-    "vpxor	%%xmm1, %%xmm14, %%xmm14\n\t"       \
-    "vpxor	%%xmm0, %%xmm14, %%xmm14\n\t"       \
-    "vaesenc	"#o"(%[KEY]), %%xmm11, %%xmm11\n\t" \
-    "vpxor	%%xmm13, %%xmm14, %%xmm14\n\t"      \
-    "vpxor	%%xmm14, %%xmm2, %%xmm2\n\t"        \
-    "vpxor	%%xmm3, %%xmm2, %%xmm2\n\t"         \
-
-
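-/* The PCLMUL macros above use Karatsuba: each 128x128 carry-less product
- * takes three vpclmulqdq (0x11 for the high halves, 0x00 for the low halves,
- * and one for the XORed halves) instead of four, interleaved with vaesenc so
- * the multiplier and the AES unit run in parallel. Per-block partials
- * accumulate in xmm1/xmm2/xmm3 and VAESENC_PCLMUL_L performs the single
- * deferred reduction modulo x^128 + x^7 + x^2 + x + 1 (the 31/30/25 left
- * shifts and 1/2/7 right shifts). */
-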
-/* Apply the last AES round key (vaesenclast) to the eight counter blocks and
- * XOR the resulting keystream with the input. */
-#define VAESENC_LAST(in, out)                          \
-    "vaesenclast	%%xmm12, %%xmm4, %%xmm4\n\t"   \
-    "vaesenclast	%%xmm12, %%xmm5, %%xmm5\n\t"   \
-    "vmovdqu		   (" #in "), %%xmm0\n\t"      \
-    "vmovdqu		 16(" #in "), %%xmm1\n\t"      \
-    "vpxor		%%xmm0, %%xmm4, %%xmm4\n\t"    \
-    "vpxor		%%xmm1, %%xmm5, %%xmm5\n\t"    \
-    "vmovdqu		%%xmm4,    (" #out ")\n\t"     \
-    "vmovdqu		%%xmm5,  16(" #out ")\n\t"     \
-    "vaesenclast	%%xmm12, %%xmm6, %%xmm6\n\t"   \
-    "vaesenclast	%%xmm12, %%xmm7, %%xmm7\n\t"   \
-    "vmovdqu		 32(" #in "), %%xmm0\n\t"      \
-    "vmovdqu		 48(" #in "), %%xmm1\n\t"      \
-    "vpxor		%%xmm0, %%xmm6, %%xmm6\n\t"    \
-    "vpxor		%%xmm1, %%xmm7, %%xmm7\n\t"    \
-    "vmovdqu		%%xmm6,  32(" #out ")\n\t"     \
-    "vmovdqu		%%xmm7,  48(" #out ")\n\t"     \
-    "vaesenclast	%%xmm12, %%xmm8, %%xmm8\n\t"   \
-    "vaesenclast	%%xmm12, %%xmm9, %%xmm9\n\t"   \
-    "vmovdqu		 64(" #in "), %%xmm0\n\t"      \
-    "vmovdqu		 80(" #in "), %%xmm1\n\t"      \
-    "vpxor		%%xmm0, %%xmm8, %%xmm8\n\t"    \
-    "vpxor		%%xmm1, %%xmm9, %%xmm9\n\t"    \
-    "vmovdqu		%%xmm8,  64(" #out ")\n\t"     \
-    "vmovdqu		%%xmm9,  80(" #out ")\n\t"     \
-    "vaesenclast	%%xmm12, %%xmm10, %%xmm10\n\t" \
-    "vaesenclast	%%xmm12, %%xmm11, %%xmm11\n\t" \
-    "vmovdqu		 96(" #in "), %%xmm0\n\t"      \
-    "vmovdqu		112(" #in "), %%xmm1\n\t"      \
-    "vpxor		%%xmm0, %%xmm10, %%xmm10\n\t"  \
-    "vpxor		%%xmm1, %%xmm11, %%xmm11\n\t"  \
-    "vmovdqu		%%xmm10,  96(" #out ")\n\t"    \
-    "vmovdqu		%%xmm11, 112(" #out ")\n\t"
-
-#define VAESENC_BLOCK()                                       \
-    "vmovdqu		" VAR(CTR1) ", %%xmm5\n\t"            \
-    "vpshufb		%[BSWAP_EPI64], %%xmm5, %%xmm4\n\t"   \
-    "vpaddd		%[ONE], %%xmm5, %%xmm5\n\t"           \
-    "vmovdqu		%%xmm5, " VAR(CTR1) "\n\t"            \
-    "vpxor		(%[KEY]), %%xmm4, %%xmm4\n\t"         \
-    "vaesenc		16(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		32(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		48(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		64(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		80(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		96(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		112(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		128(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		144(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "cmpl		$11, %[nr]\n\t"                       \
-    "vmovdqa		160(%[KEY]), %%xmm5\n\t"              \
-    "jl			%=f\n\t"                              \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		176(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "cmpl		$13, %[nr]\n\t"                       \
-    "vmovdqa		192(%[KEY]), %%xmm5\n\t"              \
-    "jl			%=f\n\t"                              \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		208(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vmovdqa		224(%[KEY]), %%xmm5\n\t"              \
-    "%=:\n\t"                                                 \
-    "vaesenclast	%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		(%[in]," VAR(KR64) ",1), %%xmm5\n\t"  \
-    "vpxor		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		%%xmm4, (%[out]," VAR(KR64) ",1)\n\t" \
-    "vpshufb		%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"    \
-    "vpxor		%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"
-
-#define _VAESENC_GFMUL(in, H, X)                              \
-    "vmovdqu		" VAR(CTR1) ", %%xmm5\n\t"            \
-    "vpshufb		%[BSWAP_EPI64], %%xmm5, %%xmm4\n\t"   \
-    "vpaddd		%[ONE], %%xmm5, %%xmm5\n\t"           \
-    "vmovdqu		%%xmm5, " VAR(CTR1) "\n\t"            \
-    "vpxor		(%[KEY]), %%xmm4, %%xmm4\n\t"         \
-    "vpclmulqdq		$0x10, " #H ", " #X ", %%xmm6\n\t"    \
-    "vaesenc		16(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		32(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vpclmulqdq		$0x01, " #H ", " #X ", %%xmm7\n\t"    \
-    "vaesenc		48(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vaesenc		64(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vpclmulqdq		$0x00, " #H ", " #X ", %%xmm8\n\t"    \
-    "vaesenc		80(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vpclmulqdq		$0x11, " #H ", " #X ", %%xmm1\n\t"    \
-    "vaesenc		96(%[KEY]), %%xmm4, %%xmm4\n\t"       \
-    "vpxor		%%xmm7, %%xmm6, %%xmm6\n\t"           \
-    "vpslldq		$8, %%xmm6, %%xmm2\n\t"               \
-    "vpsrldq		$8, %%xmm6, %%xmm6\n\t"               \
-    "vaesenc		112(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vpxor		%%xmm8, %%xmm2, %%xmm2\n\t"           \
-    "vpxor		%%xmm6, %%xmm1, %%xmm3\n\t"           \
-    "vmovdqa		%[MOD2_128], %%xmm0\n\t"              \
-    "vpclmulqdq		$0x10, %%xmm0, %%xmm2, %%xmm7\n\t"    \
-    "vaesenc		128(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vpshufd		$0x4e, %%xmm2, %%xmm6\n\t"            \
-    "vpxor		%%xmm7, %%xmm6, %%xmm6\n\t"           \
-    "vpclmulqdq		$0x10, %%xmm0, %%xmm6, %%xmm7\n\t"    \
-    "vaesenc		144(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vpshufd		$0x4e, %%xmm6, %%xmm6\n\t"            \
-    "vpxor		%%xmm7, %%xmm6, %%xmm6\n\t"           \
-    "vpxor		%%xmm3, %%xmm6, " VAR(XR) "\n\t"      \
-    "cmpl		$11, %[nr]\n\t"                       \
-    "vmovdqa		160(%[KEY]), %%xmm5\n\t"              \
-    "jl			1f\n\t"                               \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		176(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "cmpl		$13, %[nr]\n\t"                       \
-    "vmovdqa		192(%[KEY]), %%xmm5\n\t"              \
-    "jl			1f\n\t"                               \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		208(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vmovdqa		224(%[KEY]), %%xmm5\n\t"              \
-    "1:\n\t"                                                  \
-    "vaesenclast	%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		" #in ", %%xmm0\n\t"                  \
-    "vpxor		%%xmm0, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		%%xmm4, (%[out]," VAR(KR64) ",1)\n\t"
-#define VAESENC_GFMUL(in, H, X)                               \
-       _VAESENC_GFMUL(in, H, X)
-
-
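-/* The _VAESENC_GFMUL/VAESENC_GFMUL indirection is deliberate: the outer
- * macro expands its arguments (e.g. the HR/XR register macros) before the
- * inner macro stringizes them with the # operator. The same pattern is used
- * for the GHASH_* macros below. */
-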
-#define _GHASH_GFMUL_AVX1(r, r2, a, b)             \
-    "vpshufd	$0x4e, "#a", %%xmm1\n\t"           \
-    "vpshufd	$0x4e, "#b", %%xmm2\n\t"           \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm3\n\t"     \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm0\n\t"     \
-    "vpxor	"#a", %%xmm1, %%xmm1\n\t"          \
-    "vpxor	"#b", %%xmm2, %%xmm2\n\t"          \
-    "vpclmulqdq	$0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \
-    "vpxor	%%xmm0, %%xmm1, %%xmm1\n\t"        \
-    "vpxor	%%xmm3, %%xmm1, %%xmm1\n\t"        \
-    "vmovdqa	%%xmm0, "#r2"\n\t"                 \
-    "vmovdqa	%%xmm3, " #r "\n\t"                \
-    "vpslldq	$8, %%xmm1, %%xmm2\n\t"            \
-    "vpsrldq	$8, %%xmm1, %%xmm1\n\t"            \
-    "vpxor	%%xmm2, "#r2", "#r2"\n\t"          \
-    "vpxor	%%xmm1, " #r ", " #r "\n\t"
-#define GHASH_GFMUL_AVX1(r, r2, a, b)              \
-       _GHASH_GFMUL_AVX1(r, r2, a, b)
-
-#define _GHASH_GFMUL_XOR_AVX1(r, r2, a, b)         \
-    "vpshufd	$0x4e, "#a", %%xmm1\n\t"           \
-    "vpshufd	$0x4e, "#b", %%xmm2\n\t"           \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm3\n\t"     \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm0\n\t"     \
-    "vpxor	"#a", %%xmm1, %%xmm1\n\t"          \
-    "vpxor	"#b", %%xmm2, %%xmm2\n\t"          \
-    "vpclmulqdq	$0x00, %%xmm2, %%xmm1, %%xmm1\n\t" \
-    "vpxor	%%xmm0, %%xmm1, %%xmm1\n\t"        \
-    "vpxor	%%xmm3, %%xmm1, %%xmm1\n\t"        \
-    "vpxor	%%xmm0, "#r2", "#r2"\n\t"          \
-    "vpxor	%%xmm3, " #r ", " #r "\n\t"        \
-    "vpslldq	$8, %%xmm1, %%xmm2\n\t"            \
-    "vpsrldq	$8, %%xmm1, %%xmm1\n\t"            \
-    "vpxor	%%xmm2, "#r2", "#r2"\n\t"          \
-    "vpxor	%%xmm1, " #r ", " #r "\n\t"
-#define GHASH_GFMUL_XOR_AVX1(r, r2, a, b)          \
-       _GHASH_GFMUL_XOR_AVX1(r, r2, a, b)
-
-#define GHASH_MID_AVX1(r, r2)               \
-    "vpsrld	$31, "#r2", %%xmm0\n\t"     \
-    "vpsrld	$31, " #r ", %%xmm1\n\t"    \
-    "vpslld	$1, "#r2", "#r2"\n\t"       \
-    "vpslld	$1, " #r ", " #r "\n\t"     \
-    "vpsrldq	$12, %%xmm0, %%xmm2\n\t"    \
-    "vpslldq	$4, %%xmm0, %%xmm0\n\t"     \
-    "vpslldq	$4, %%xmm1, %%xmm1\n\t"     \
-    "vpor	%%xmm2, " #r ", " #r "\n\t" \
-    "vpor	%%xmm0, "#r2", "#r2"\n\t"   \
-    "vpor	%%xmm1, " #r ", " #r "\n\t"
-
-#define _GHASH_GFMUL_RED_AVX1(r, a, b)             \
-    "vpshufd	$0x4e, "#a", %%xmm5\n\t"           \
-    "vpshufd	$0x4e, "#b", %%xmm6\n\t"           \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm7\n\t"     \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm4\n\t"     \
-    "vpxor	"#a", %%xmm5, %%xmm5\n\t"          \
-    "vpxor	"#b", %%xmm6, %%xmm6\n\t"          \
-    "vpclmulqdq	$0x00, %%xmm6, %%xmm5, %%xmm5\n\t" \
-    "vpxor	%%xmm4, %%xmm5, %%xmm5\n\t"        \
-    "vpxor	%%xmm7, %%xmm5, %%xmm5\n\t"        \
-    "vpslldq	$8, %%xmm5, %%xmm6\n\t"            \
-    "vpsrldq	$8, %%xmm5, %%xmm5\n\t"            \
-    "vpxor	%%xmm6, %%xmm4, %%xmm4\n\t"        \
-    "vpxor	%%xmm5, %%xmm7, " #r "\n\t"        \
-    "vpslld	$31, %%xmm4, %%xmm8\n\t"           \
-    "vpslld	$30, %%xmm4, %%xmm9\n\t"           \
-    "vpslld	$25, %%xmm4, %%xmm10\n\t"          \
-    "vpxor	%%xmm9, %%xmm8, %%xmm8\n\t"        \
-    "vpxor	%%xmm10, %%xmm8, %%xmm8\n\t"       \
-    "vpsrldq	$4, %%xmm8, %%xmm9\n\t"            \
-    "vpslldq	$12, %%xmm8, %%xmm8\n\t"           \
-    "vpxor	%%xmm8, %%xmm4, %%xmm4\n\t"        \
-    "vpsrld	$1, %%xmm4, %%xmm10\n\t"           \
-    "vpsrld	$2, %%xmm4, %%xmm6\n\t"            \
-    "vpsrld	$7, %%xmm4, %%xmm5\n\t"            \
-    "vpxor	%%xmm6, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm5, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm9, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm4, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm10, " #r ", " #r "\n\t"
-#define GHASH_GFMUL_RED_AVX1(r, a, b)              \
-       _GHASH_GFMUL_RED_AVX1(r, a, b)
-
-#define _GHASH_GFSQR_RED_AVX1(r, a)                \
-    "vpclmulqdq	$0x00, "#a", "#a", %%xmm4\n\t"     \
-    "vpclmulqdq	$0x11, "#a", "#a", " #r "\n\t"     \
-    "vpslld	$31, %%xmm4, %%xmm8\n\t"           \
-    "vpslld	$30, %%xmm4, %%xmm9\n\t"           \
-    "vpslld	$25, %%xmm4, %%xmm10\n\t"          \
-    "vpxor	%%xmm9, %%xmm8, %%xmm8\n\t"        \
-    "vpxor	%%xmm10, %%xmm8, %%xmm8\n\t"       \
-    "vpsrldq	$4, %%xmm8, %%xmm9\n\t"            \
-    "vpslldq	$12, %%xmm8, %%xmm8\n\t"           \
-    "vpxor	%%xmm8, %%xmm4, %%xmm4\n\t"        \
-    "vpsrld	$1, %%xmm4, %%xmm10\n\t"           \
-    "vpsrld	$2, %%xmm4, %%xmm6\n\t"            \
-    "vpsrld	$7, %%xmm4, %%xmm5\n\t"            \
-    "vpxor	%%xmm6, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm5, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm9, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm4, %%xmm10, %%xmm10\n\t"      \
-    "vpxor	%%xmm10, " #r ", " #r "\n\t"
-#define GHASH_GFSQR_RED_AVX1(r, a)                 \
-       _GHASH_GFSQR_RED_AVX1(r, a)
-
-#define GHASH_RED_AVX1(r, r2)                \
-    "vpslld	$31, "#r2", %%xmm0\n\t"      \
-    "vpslld	$30, "#r2", %%xmm1\n\t"      \
-    "vpslld	$25, "#r2", %%xmm2\n\t"      \
-    "vpxor	%%xmm1, %%xmm0, %%xmm0\n\t"  \
-    "vpxor	%%xmm2, %%xmm0, %%xmm0\n\t"  \
-    "vmovdqa	%%xmm0, %%xmm1\n\t"          \
-    "vpsrldq	$4, %%xmm1, %%xmm1\n\t"      \
-    "vpslldq	$12, %%xmm0, %%xmm0\n\t"     \
-    "vpxor	%%xmm0, "#r2", "#r2"\n\t"    \
-    "vpsrld	$1, "#r2", %%xmm2\n\t"       \
-    "vpsrld	$2, "#r2", %%xmm3\n\t"       \
-    "vpsrld	$7, "#r2", %%xmm0\n\t"       \
-    "vpxor	%%xmm3, %%xmm2, %%xmm2\n\t"  \
-    "vpxor	%%xmm0, %%xmm2, %%xmm2\n\t"  \
-    "vpxor	%%xmm1, %%xmm2, %%xmm2\n\t"  \
-    "vpxor	"#r2", %%xmm2, %%xmm2\n\t"   \
-    "vpxor	%%xmm2, " #r ", " #r "\n\t"
-
-#define GHASH_GFMUL_RED_XOR_AVX1(r, r2, a, b) \
-    GHASH_GFMUL_XOR_AVX1(r, r2, a, b)         \
-    GHASH_RED_AVX1(r, r2)
-
-#define GHASH_FULL_AVX1(r, r2, a, b) \
-    GHASH_GFMUL_AVX1(r, r2, a, b)    \
-    GHASH_MID_AVX1(r, r2)            \
-    GHASH_RED_AVX1(r, r2)
-
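-/* GHASH building blocks: GHASH_GFMUL_* form the 256-bit carry-less product
- * (Karatsuba, three vpclmulqdq), GHASH_MID_AVX1 shifts the product left one
- * bit to account for GCM's reflected bit order, and GHASH_RED_AVX1 folds it
- * modulo x^128 + x^7 + x^2 + x + 1. The bit-at-a-time reference below follows
- * SP 800-38D Algorithm 1 and is useful only for cross-checking; it branches
- * on its inputs, so it must never run on secret data. Illustrative only. */
-#if 0
-static void gf128_mul_ref(const unsigned char X[16], const unsigned char Y[16],
-                          unsigned char Z[16])
-{
-    unsigned char V[16];
-    int i, j, k;
-    for (k = 0; k < 16; k++) { Z[k] = 0; V[k] = Y[k]; }
-    for (i = 0; i < 16; i++) {           /* bits of X, most significant first */
-        for (j = 7; j >= 0; j--) {
-            int carry = V[15] & 1;       /* V's least significant bit */
-            if ((X[i] >> j) & 1) {
-                for (k = 0; k < 16; k++)
-                    Z[k] ^= V[k];
-            }
-            for (k = 15; k > 0; k--)     /* V >>= 1 across the 16 bytes */
-                V[k] = (unsigned char)((V[k] >> 1) | (unsigned char)(V[k-1] << 7));
-            V[0] >>= 1;
-            if (carry)
-                V[0] ^= 0xe1;            /* reduce: x^128 = x^7 + x^2 + x + 1 */
-        }
-    }
-}
-#endif
-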
-#define CALC_IV_12_AVX1()                                            \
-    "# Calculate values when IV is 12 bytes\n\t"                     \
-    "# Set counter based on IV\n\t"                                  \
-    "movl		$0x01000000, %%ecx\n\t"                      \
-    "vpinsrq		$0, 0(%%rax), %%xmm13, %%xmm13\n\t"          \
-    "vpinsrd		$2, 8(%%rax), %%xmm13, %%xmm13\n\t"          \
-    "vpinsrd		$3, %%ecx, %%xmm13, %%xmm13\n\t"             \
-    "# H = Encrypt X(=0) and T = Encrypt counter\n\t"                \
-    "vmovdqa		  0(%[KEY]), " VAR(HR) "\n\t"                \
-    "vpxor		" VAR(HR) ", %%xmm13, %%xmm1\n\t"            \
-    "vmovdqa		 16(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 32(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 48(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 64(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 80(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 96(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		112(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		128(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		144(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "cmpl		$11, %[nr]\n\t"                              \
-    "vmovdqa		160(%[KEY]), %%xmm12\n\t"                    \
-    "jl	31f\n\t"                                                     \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		176(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "cmpl		$13, %[nr]\n\t"                              \
-    "vmovdqa		192(%[KEY]), %%xmm12\n\t"                    \
-    "jl	31f\n\t"                                                     \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		208(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqu		224(%[KEY]), %%xmm12\n\t"                    \
-    "31:\n\t"                                                        \
-    "vaesenclast	%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenclast	%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vpshufb		%[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
-    "vmovdqu		%%xmm1, " VAR(TR) "\n\t"                     \
-    "jmp		39f\n\t"
-
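-/* Fast path for the recommended 96-bit IV: the pre-counter block is simply
- * IV || 0x00000001 (0x01000000 here is that one in big-endian byte order),
- * so no GHASH of the IV is needed. H = E_K(0^128) and T = E_K(J0) are
- * computed together by interleaving the two vaesenc chains, then the code
- * jumps past the generic IV path to label 39. */
-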
-#define CALC_IV_AVX1()                                       \
-    "# Calculate values when IV is not 12 bytes\n\t"         \
-    "# H = Encrypt X(=0)\n\t"                                \
-    "vmovdqa	0(%[KEY]), " VAR(HR) "\n\t"                  \
-    VAESENC_AVX(HR)                                          \
-    "vpshufb	%[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
-    "# Calc counter\n\t"                                     \
-    "# Initialization vector\n\t"                            \
-    "cmpl	$0, %%edx\n\t"                               \
-    "movq	$0, %%rcx\n\t"                               \
-    "je	45f\n\t"                                             \
-    "cmpl	$16, %%edx\n\t"                              \
-    "jl	44f\n\t"                                             \
-    "andl	$0xfffffff0, %%edx\n\t"                      \
-    "\n"                                                     \
-    "43:\n\t"                                                \
-    "vmovdqu	(%%rax,%%rcx,1), %%xmm4\n\t"                 \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"           \
-    "vpxor	%%xmm4, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "addl	$16, %%ecx\n\t"                              \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "jl	43b\n\t"                                             \
-    "movl	%[ibytes], %%edx\n\t"                        \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "je	45f\n\t"                                             \
-    "\n"                                                     \
-    "44:\n\t"                                                \
-    "subq	$16, %%rsp\n\t"                              \
-    "vpxor	%%xmm4, %%xmm4, %%xmm4\n\t"                  \
-    "xorl	%%ebx, %%ebx\n\t"                            \
-    "vmovdqu	%%xmm4, (%%rsp)\n\t"                         \
-    "42:\n\t"                                                \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t"                 \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t"                 \
-    "incl	%%ecx\n\t"                                   \
-    "incl	%%ebx\n\t"                                   \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "jl	42b\n\t"                                             \
-    "vmovdqu	(%%rsp), %%xmm4\n\t"                         \
-    "addq	$16, %%rsp\n\t"                              \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"           \
-    "vpxor	%%xmm4, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "\n"                                                     \
-    "45:\n\t"                                                \
-    "# T = Encrypt counter\n\t"                              \
-    "vpxor	%%xmm0, %%xmm0, %%xmm0\n\t"                  \
-    "shll	$3, %%edx\n\t"                               \
-    "vpinsrq	$0, %%rdx, %%xmm0, %%xmm0\n\t"               \
-    "vpxor	%%xmm0, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX1(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"         \
-    "#   Encrypt counter\n\t"                                \
-    "vmovdqa	0(%[KEY]), %%xmm4\n\t"                       \
-    "vpxor	%%xmm13, %%xmm4, %%xmm4\n\t"                 \
-    VAESENC_AVX(%%xmm4)                                      \
-    "vmovdqu	%%xmm4, " VAR(TR) "\n\t"
-
-#define CALC_AAD_AVX1()                                \
-    "# Additional authentication data\n\t"             \
-    "movl	%[abytes], %%edx\n\t"                  \
-    "cmpl	$0, %%edx\n\t"                         \
-    "je		25f\n\t"                               \
-    "movq	%[addt], %%rax\n\t"                    \
-    "xorl	%%ecx, %%ecx\n\t"                      \
-    "cmpl	$16, %%edx\n\t"                        \
-    "jl		24f\n\t"                               \
-    "andl	$0xfffffff0, %%edx\n\t"                \
-    "\n"                                               \
-    "23:\n\t"                                          \
-    "vmovdqu	(%%rax,%%rcx,1), %%xmm4\n\t"           \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"     \
-    "vpxor	%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_FULL_AVX1(XR, %%xmm12, XR, HR)               \
-    "addl	$16, %%ecx\n\t"                        \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "jl		23b\n\t"                               \
-    "movl	%[abytes], %%edx\n\t"                  \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "je		25f\n\t"                               \
-    "\n"                                               \
-    "24:\n\t"                                          \
-    "subq	$16, %%rsp\n\t"                        \
-    "vpxor	%%xmm4, %%xmm4, %%xmm4\n\t"            \
-    "xorl	%%ebx, %%ebx\n\t"                      \
-    "vmovdqu	%%xmm4, (%%rsp)\n\t"                   \
-    "22:\n\t"                                          \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t"           \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t"           \
-    "incl	%%ecx\n\t"                             \
-    "incl	%%ebx\n\t"                             \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "jl		22b\n\t"                               \
-    "vmovdqu	(%%rsp), %%xmm4\n\t"                   \
-    "addq	$16, %%rsp\n\t"                        \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"     \
-    "vpxor	%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_FULL_AVX1(XR, %%xmm12, XR, HR)               \
-    "\n"                                               \
-    "25:\n\t"
-
-#define CALC_HT_8_AVX1()                          \
-    "vmovdqa	" VAR(XR) ", %%xmm2\n\t"          \
-    "# H ^ 1\n\t"                                 \
-    "vmovdqu	" VAR(HR) ", 0(" VAR(HTR) ")\n\t" \
-    "# H ^ 2\n\t"                                 \
-    GHASH_GFSQR_RED_AVX1(%%xmm0, HR)              \
-    "vmovdqu	%%xmm0 ,  16(" VAR(HTR) ")\n\t"   \
-    "# H ^ 3\n\t"                                 \
-    GHASH_GFMUL_RED_AVX1(%%xmm1, HR, %%xmm0)      \
-    "vmovdqu	%%xmm1 ,  32(" VAR(HTR) ")\n\t"   \
-    "# H ^ 4\n\t"                                 \
-    GHASH_GFSQR_RED_AVX1(%%xmm3, %%xmm0)          \
-    "vmovdqu	%%xmm3 ,  48(" VAR(HTR) ")\n\t"   \
-    "# H ^ 5\n\t"                                 \
-    GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm0, %%xmm1) \
-    "vmovdqu	%%xmm12,  64(" VAR(HTR) ")\n\t"   \
-    "# H ^ 6\n\t"                                 \
-    GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm1)         \
-    "vmovdqu	%%xmm12,  80(" VAR(HTR) ")\n\t"   \
-    "# H ^ 7\n\t"                                 \
-    GHASH_GFMUL_RED_AVX1(%%xmm12, %%xmm1, %%xmm3) \
-    "vmovdqu	%%xmm12,  96(" VAR(HTR) ")\n\t"   \
-    "# H ^ 8\n\t"                                 \
-    GHASH_GFSQR_RED_AVX1(%%xmm12, %%xmm3)         \
-    "vmovdqu	%%xmm12, 112(" VAR(HTR) ")\n\t"
-
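-/* Unlike the AES-NI table builder, the AVX1 version computes the even powers
- * H^2, H^4, H^6 and H^8 with GHASH_GFSQR_RED_AVX1: squaring in GF(2^128)
- * needs only two vpclmulqdq because the cross terms cancel in characteristic
- * 2, so only the odd powers pay for a full multiply. */
-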
-#define VAESENC_128_GHASH_AVX1(src, o)               \
-    "leaq	(%[in]," VAR(KR64) ",1), %%rcx\n\t"  \
-    "leaq	(%[out]," VAR(KR64) ",1), %%rdx\n\t" \
-    /* src is either %%rcx or %%rdx */             \
-    VAESENC_CTR()                                  \
-    VAESENC_XOR()                                  \
-    VAESENC_PCLMUL_1(src,  16, (o-128), 112)       \
-    VAESENC_PCLMUL_N(src,  32, (o-112),  96)       \
-    VAESENC_PCLMUL_N(src,  48, (o- 96),  80)       \
-    VAESENC_PCLMUL_N(src,  64, (o- 80),  64)       \
-    VAESENC_PCLMUL_N(src,  80, (o- 64),  48)       \
-    VAESENC_PCLMUL_N(src,  96, (o- 48),  32)       \
-    VAESENC_PCLMUL_N(src, 112, (o- 32),  16)       \
-    VAESENC_PCLMUL_N(src, 128, (o- 16),   0)       \
-    VAESENC_PCLMUL_L(144)                          \
-    "cmpl	$11, %[nr]\n\t"                    \
-    "vmovdqa	160(%[KEY]), %%xmm12\n\t"          \
-    "jl		4f\n\t"                            \
-    VAESENC()                                      \
-    VAESENC_SET(176)                               \
-    "cmpl	$13, %[nr]\n\t"                    \
-    "vmovdqa	192(%[KEY]), %%xmm12\n\t"          \
-    "jl		4f\n\t"                            \
-    VAESENC()                                      \
-    VAESENC_SET(208)                               \
-    "vmovdqa	224(%[KEY]), %%xmm12\n\t"          \
-    "\n"                                           \
-"4:\n\t"                                           \
-    VAESENC_LAST(%%rcx, %%rdx)
-
-#define _VAESENC_AVX(r)                                  \
-    "vaesenc		16(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		32(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		48(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		64(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		80(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		96(%[KEY]), " #r ", " #r "\n\t"  \
-    "vaesenc		112(%[KEY]), " #r ", " #r "\n\t" \
-    "vaesenc		128(%[KEY]), " #r ", " #r "\n\t" \
-    "vaesenc		144(%[KEY]), " #r ", " #r "\n\t" \
-    "cmpl		$11, %[nr]\n\t"                  \
-    "vmovdqa		160(%[KEY]), %%xmm5\n\t"         \
-    "jl			%=f\n\t"                         \
-    "vaesenc		%%xmm5, " #r ", " #r "\n\t"      \
-    "vaesenc		176(%[KEY]), " #r ", " #r "\n\t" \
-    "cmpl		$13, %[nr]\n\t"                  \
-    "vmovdqa		192(%[KEY]), %%xmm5\n\t"         \
-    "jl			%=f\n\t"                         \
-    "vaesenc		%%xmm5, " #r ", " #r "\n\t"      \
-    "vaesenc		208(%[KEY]), " #r ", " #r "\n\t" \
-    "vmovdqa		224(%[KEY]), %%xmm5\n\t"         \
-    "%=:\n\t"                                            \
-    "vaesenclast	%%xmm5, " #r ", " #r "\n\t"
-#define VAESENC_AVX(r)                                   \
-        _VAESENC_AVX(r)
-
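-/* VAESENC_BLOCK and _VAESENC_AVX use "%=" for their round-count branches:
- * GCC expands %= to a number unique to each asm statement, and the resulting
- * numeric labels are local (jl %=f binds to the next occurrence), so these
- * macros cannot collide with the fixed labels (1:, 31:, ...) used elsewhere. */
-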
-#define AESENC_LAST15_ENC_AVX1()                        \
-    "movl	%[nbytes], %%ecx\n\t"                   \
-    "movl	%%ecx, %%edx\n\t"                       \
-    "andl	$0x0f, %%ecx\n\t"                       \
-    "jz		55f\n\t"                                \
-    "vmovdqu	" VAR(CTR1) ", %%xmm13\n\t"             \
-    "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"   \
-    "vpxor	0(%[KEY]), %%xmm13, %%xmm13\n\t"        \
-    VAESENC_AVX(%%xmm13)                                \
-    "subq	$16, %%rsp\n\t"                         \
-    "xorl	%%ecx, %%ecx\n\t"                       \
-    "vmovdqu	%%xmm13, (%%rsp)\n\t"                   \
-    "\n"                                                \
-    "51:\n\t"                                           \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"    \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"            \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t"   \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"            \
-    "incl	" VAR(KR) "\n\t"                        \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	%%edx, " VAR(KR) "\n\t"                 \
-    "jl		51b\n\t"                                \
-    "xorq	%%r13, %%r13\n\t"                       \
-    "cmpl	$16, %%ecx\n\t"                         \
-    "je		53f\n\t"                                \
-    "\n"                                                \
-    "52:\n\t"                                           \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"            \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	$16, %%ecx\n\t"                         \
-    "jl		52b\n\t"                                \
-    "53:\n\t"                                           \
-    "vmovdqu	(%%rsp), %%xmm13\n\t"                   \
-    "addq	$16, %%rsp\n\t"                         \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"    \
-    "vpxor	%%xmm13, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_GFMUL_RED_AVX1(XR, HR, XR)
-
-#define AESENC_LAST15_DEC_AVX1()                        \
-    "movl	%[nbytes], %%ecx\n\t"                   \
-    "movl	%%ecx, %%edx\n\t"                       \
-    "andl	$0x0f, %%ecx\n\t"                       \
-    "jz		55f\n\t"                                \
-    "vmovdqu	" VAR(CTR1) ", %%xmm13\n\t"             \
-    "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"   \
-    "vpxor	0(%[KEY]), %%xmm13, %%xmm13\n\t"        \
-    VAESENC_AVX(%%xmm13)                                \
-    "subq	$32, %%rsp\n\t"                         \
-    "xorl	%%ecx, %%ecx\n\t"                       \
-    "vmovdqu	%%xmm13, (%%rsp)\n\t"                   \
-    "vpxor	%%xmm0, %%xmm0, %%xmm0\n\t"             \
-    "vmovdqu	%%xmm0, 16(%%rsp)\n\t"                  \
-    "\n"                                                \
-    "51:\n\t"                                           \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"    \
-    "movb	%%r13b, 16(%%rsp,%%rcx,1)\n\t"          \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"            \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t"   \
-    "incl	" VAR(KR) "\n\t"                        \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	%%edx, " VAR(KR) "\n\t"                 \
-    "jl		51b\n\t"                                \
-    "53:\n\t"                                           \
-    "vmovdqu	16(%%rsp), %%xmm13\n\t"                 \
-    "addq	$32, %%rsp\n\t"                         \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"    \
-    "vpxor	%%xmm13, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_GFMUL_RED_AVX1(XR, HR, XR)
-
-#define CALC_TAG_AVX1()                                      \
-    "movl	%[nbytes], %%edx\n\t"                        \
-    "movl	%[abytes], %%ecx\n\t"                        \
-    "shlq	$3, %%rdx\n\t"                               \
-    "shlq	$3, %%rcx\n\t"                               \
-    "vpinsrq	$0, %%rdx, %%xmm0, %%xmm0\n\t"               \
-    "vpinsrq	$1, %%rcx, %%xmm0, %%xmm0\n\t"               \
-    "vpxor	%%xmm0, " VAR(XR) ", " VAR(XR) "\n\t"        \
-    GHASH_GFMUL_RED_AVX1(XR, HR, XR)                         \
-    "vpshufb	%[BSWAP_MASK], " VAR(XR) ", " VAR(XR) "\n\t" \
-    "vpxor	" VAR(TR) ", " VAR(XR) ", %%xmm0\n\t"        \
-
-#define STORE_TAG_AVX()                       \
-    "cmpl	$16, %[tbytes]\n\t"           \
-    "je		71f\n\t"                      \
-    "xorq	%%rcx, %%rcx\n\t"             \
-    "vmovdqu	%%xmm0, (%%rsp)\n\t"          \
-    "73:\n\t"                                 \
-    "movzbl	(%%rsp,%%rcx,1), %%r13d\n\t"  \
-    "movb	%%r13b, (%[tag],%%rcx,1)\n\t" \
-    "incl	%%ecx\n\t"                    \
-    "cmpl	%[tbytes], %%ecx\n\t"         \
-    "jne	73b\n\t"                      \
-    "jmp	72f\n\t"                      \
-    "\n"                                      \
-    "71:\n\t"                                 \
-    "vmovdqu	%%xmm0, (%[tag])\n\t"         \
-    "\n"                                      \
-    "72:\n\t"
-
-#define CMP_TAG_AVX()                                      \
-    "cmpl	$16, %[tbytes]\n\t"                        \
-    "je		71f\n\t"                                   \
-    "subq	$16, %%rsp\n\t"                            \
-    "xorq	%%rcx, %%rcx\n\t"                          \
-    "xorq	%%rax, %%rax\n\t"                          \
-    "vmovdqu	%%xmm0, (%%rsp)\n\t"                       \
-    "\n"                                                   \
-    "73:\n\t"                                              \
-    "movzbl	(%%rsp,%%rcx,1), %%r13d\n\t"               \
-    "xorb	(%[tag],%%rcx,1), %%r13b\n\t"              \
-    "orb	%%r13b, %%al\n\t"                          \
-    "incl	%%ecx\n\t"                                 \
-    "cmpl	%[tbytes], %%ecx\n\t"                      \
-    "jne	73b\n\t"                                   \
-    "cmpb	$0x00, %%al\n\t"                           \
-    "sete	%%al\n\t"                                  \
-    "addq	$16, %%rsp\n\t"                            \
-    "jmp	72f\n\t"                                   \
-    "\n"                                                   \
-    "71:\n\t"                                              \
-    "vmovdqu	(%[tag]), %%xmm1\n\t"                      \
-    "vpcmpeqb	%%xmm1, %%xmm0, %%xmm0\n\t"                \
-    "vpmovmskb	%%xmm0, %%edx\n\t"                         \
-    "# %%edx == 0xFFFF then return 1 else => return 0\n\t" \
-    "xorl	%%eax, %%eax\n\t"                          \
-    "cmpl	$0xffff, %%edx\n\t"                        \
-    "sete	%%al\n\t"                                  \
-    "\n"                                                   \
-    "72:\n\t"                                              \
-    "movl	%%eax, (%[res])\n\t"
-
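For reference, the STORE_TAG_AVX/CMP_TAG_AVX byte loops above exist because
the tag may be shorter than 16 bytes. The comparison reduces to a
constant-time check like this C sketch (function name illustrative, not
from aes.c):

    #include <stddef.h>

    static int gcm_tag_matches(const unsigned char* calc,
                               const unsigned char* tag, size_t tbytes)
    {
        unsigned char diff = 0;
        size_t i;
        /* OR together the XOR of every byte pair so the running time
         * does not depend on where the first mismatch occurs. */
        for (i = 0; i < tbytes; i++)
            diff |= (unsigned char)(calc[i] ^ tag[i]);
        return diff == 0;  /* 1 on match, 0 otherwise, as stored via [res] */
    }
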
-static void AES_GCM_encrypt_avx1(const unsigned char *in, unsigned char *out,
-                                 const unsigned char* addt,
-                                 const unsigned char* ivec, unsigned char *tag,
-                                 unsigned int nbytes, unsigned int abytes,
-                                 unsigned int ibytes, unsigned int tbytes,
-                                 const unsigned char* key, int nr)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register unsigned int ivLen asm("ebx") = ibytes;
-
-    __asm__ __volatile__ (
-        "subq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "vpxor		%%xmm13, %%xmm13, %%xmm13\n\t"
-        "vpxor		" VAR(XR) ", " VAR(XR) ", " VAR(XR) "\n\t"
-        "movl		%[ibytes], %%edx\n\t"
-        "cmpl		$12, %%edx\n\t"
-        "jne		35f\n\t"
-        CALC_IV_12_AVX1()
-        "\n"
-        "35:\n\t"
-        CALC_IV_AVX1()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD_AVX1()
-
-        "# Calculate counter and H\n\t"
-        "vpsrlq		$63, " VAR(HR) ", %%xmm5\n\t"
-        "vpsllq		$1, " VAR(HR) ", %%xmm4\n\t"
-        "vpslldq	$8, %%xmm5, %%xmm5\n\t"
-        "vpor		%%xmm5, %%xmm4, %%xmm4\n\t"
-        "vpshufd	$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpsrad		$31, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"
-        "vpand		%[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpaddd		%[ONE], %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm4, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vmovdqu	%%xmm13, " VAR(CTR1) "\n\t"
-
-        "xorl		" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-        "cmpl	$128, %[nbytes]\n\t"
-        "movl	%[nbytes], %%r13d\n\t"
-        "jl	5f\n\t"
-        "andl	$0xffffff80, %%r13d\n\t"
-
-        CALC_HT_8_AVX1()
-
-        "# First 128 bytes of input\n\t"
-        VAESENC_128()
-
-        "cmpl	$128, %%r13d\n\t"
-        "movl	$128, " VAR(KR) "\n\t"
-        "jle	2f\n\t"
-
-        "# More 128 bytes of input\n\t"
-        "\n"
-    "3:\n\t"
-        VAESENC_128_GHASH_AVX1(%%rdx, 0)
-        "addl	$128, " VAR(KR) "\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jl	3b\n\t"
-        "\n"
-    "2:\n\t"
-        "vmovdqa	%[BSWAP_MASK], %%xmm13\n\t"
-        "vpshufb	%%xmm13, %%xmm4, %%xmm4\n\t"
-        "vpshufb	%%xmm13, %%xmm5, %%xmm5\n\t"
-        "vpshufb	%%xmm13, %%xmm6, %%xmm6\n\t"
-        "vpshufb	%%xmm13, %%xmm7, %%xmm7\n\t"
-        "vpxor		%%xmm2, %%xmm4, %%xmm4\n\t"
-        "vpshufb	%%xmm13, %%xmm8, %%xmm8\n\t"
-        "vpshufb	%%xmm13, %%xmm9, %%xmm9\n\t"
-        "vpshufb	%%xmm13, %%xmm10, %%xmm10\n\t"
-        "vpshufb	%%xmm13, %%xmm11, %%xmm11\n\t"
-
-        "vmovdqu	   (" VAR(HTR) "), %%xmm12\n\t"
-        "vmovdqu	 16(" VAR(HTR) "), %%xmm14\n\t"
-        GHASH_GFMUL_AVX1(XR, %%xmm13, %%xmm11, %%xmm12)
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm10, %%xmm14)
-        "vmovdqu	 32(" VAR(HTR) "), %%xmm12\n\t"
-        "vmovdqu	 48(" VAR(HTR) "), %%xmm14\n\t"
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm9, %%xmm12)
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm8, %%xmm14)
-        "vmovdqu	 64(" VAR(HTR) "), %%xmm12\n\t"
-        "vmovdqu	 80(" VAR(HTR) "), %%xmm14\n\t"
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm7, %%xmm12)
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm6, %%xmm14)
-        "vmovdqu	 96(" VAR(HTR) "), %%xmm12\n\t"
-        "vmovdqu	112(" VAR(HTR) "), %%xmm14\n\t"
-        GHASH_GFMUL_XOR_AVX1(XR, %%xmm13, %%xmm5, %%xmm12)
-        GHASH_GFMUL_RED_XOR_AVX1(XR, %%xmm13, %%xmm4, %%xmm14)
-
-        "vmovdqu	0(" VAR(HTR) "), " VAR(HR) "\n\t"
-        "\n"
-    "5:\n\t"
-        "movl		%[nbytes], %%edx\n\t"
-        "cmpl		%%edx, " VAR(KR) "\n\t"
-        "jge		55f\n\t"
-#endif
-
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xfffffff0, %%r13d\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		14f\n\t"
-
-        VAESENC_BLOCK()
-        "addl		$16, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		13f\n\t"
-        "\n"
-        "12:\n\t"
-        "vmovdqu	(%[in]," VAR(KR64) ",1), %%xmm9\n\t"
-        VAESENC_GFMUL(%%xmm9, HR, XR)
-        "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"
-        "addl		$16, " VAR(KR) "\n\t"
-        "vpxor		%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		12b\n\t"
-        "\n"
-        "13:\n\t"
-        GHASH_GFMUL_RED_AVX1(XR, HR, XR)
-        "\n"
-        "14:\n\t"
-
-        AESENC_LAST15_ENC_AVX1()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG_AVX1()
-        STORE_TAG_AVX()
-        "addq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        "vzeroupper\n\t"
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes),
-          [tag] "r" (tag),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "rdx", "r13"
-    );
-}
-
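The "# Calculate counter and H" sequence above multiplies the hash key H by
x in GF(2^128): shift left one bit and, if a bit fell off the top, XOR in
the reduction constant selected through MOD2_128. In the plain
(non-byte-swapped) convention that step is the classic doubling below; a
sketch, with h[0] holding the high 64 bits:

    #include <stdint.h>

    static void gf128_mul_x(uint64_t h[2])    /* h[0] high, h[1] low */
    {
        uint64_t carry = h[0] >> 63;          /* bit shifted off the top */
        h[0] = (h[0] << 1) | (h[1] >> 63);
        h[1] <<= 1;
        /* x^128 == x^7 + x^2 + x + 1 (mod the GCM polynomial); fold the
         * carry back in without branching on secret data. */
        h[1] ^= (uint64_t)0x87 & (0 - carry);
    }
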
-#ifdef HAVE_INTEL_AVX2
-/* Encrypt and carry-less multiply for AVX2. */
-#define VAESENC_PCLMUL_AVX2_1(src, o1, o2, o3)        \
-    "vmovdqu	" #o2 "(" #src "), %%xmm12\n\t"       \
-    "vmovdqa	" #o1 "(%[KEY]), %%xmm0\n\t"          \
-    "vpshufb	%[BSWAP_MASK], %%xmm12, %%xmm12\n\t"  \
-    "vmovdqu	" #o3 "(" VAR(HTR) "), %%xmm13\n\t"   \
-    "vpxor	%%xmm2, %%xmm12, %%xmm12\n\t"         \
-    "vpclmulqdq	$0x10, %%xmm13, %%xmm12, %%xmm1\n\t"  \
-    "vpclmulqdq	$0x01, %%xmm13, %%xmm12, %%xmm14\n\t" \
-    "vpclmulqdq	$0x00, %%xmm13, %%xmm12, %%xmm2\n\t"  \
-    "vpclmulqdq	$0x11, %%xmm13, %%xmm12, %%xmm3\n\t"  \
-    "vaesenc	%%xmm0, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc	%%xmm0, %%xmm5, %%xmm5\n\t"           \
-    "vaesenc	%%xmm0, %%xmm6, %%xmm6\n\t"           \
-    "vaesenc	%%xmm0, %%xmm7, %%xmm7\n\t"           \
-    "vaesenc	%%xmm0, %%xmm8, %%xmm8\n\t"           \
-    "vaesenc	%%xmm0, %%xmm9, %%xmm9\n\t"           \
-    "vaesenc	%%xmm0, %%xmm10, %%xmm10\n\t"         \
-    "vaesenc	%%xmm0, %%xmm11, %%xmm11\n\t"         \
-
-#define VAESENC_PCLMUL_AVX2_2(src, o1, o2, o3)        \
-    "vmovdqu	" #o2 "(" #src "), %%xmm12\n\t"       \
-    "vmovdqu	" #o3 "(" VAR(HTR) "), %%xmm0\n\t"    \
-    "vpshufb	%[BSWAP_MASK], %%xmm12, %%xmm12\n\t"  \
-    "vpxor	%%xmm14, %%xmm1, %%xmm1\n\t"          \
-    "vpclmulqdq	$0x10, %%xmm0, %%xmm12, %%xmm13\n\t"  \
-    "vpclmulqdq	$0x01, %%xmm0, %%xmm12, %%xmm14\n\t"  \
-    "vpclmulqdq	$0x00, %%xmm0, %%xmm12, %%xmm15\n\t"  \
-    "vpclmulqdq	$0x11, %%xmm0, %%xmm12, %%xmm12\n\t"  \
-    "vmovdqa	" #o1 "(%[KEY]), %%xmm0\n\t"          \
-    "vpxor	%%xmm13, %%xmm1, %%xmm1\n\t"          \
-    "vpxor	%%xmm12, %%xmm3, %%xmm3\n\t"          \
-    "vaesenc	%%xmm0, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc	%%xmm0, %%xmm5, %%xmm5\n\t"           \
-    "vaesenc	%%xmm0, %%xmm6, %%xmm6\n\t"           \
-    "vaesenc	%%xmm0, %%xmm7, %%xmm7\n\t"           \
-    "vaesenc	%%xmm0, %%xmm8, %%xmm8\n\t"           \
-    "vaesenc	%%xmm0, %%xmm9, %%xmm9\n\t"           \
-    "vaesenc	%%xmm0, %%xmm10, %%xmm10\n\t"         \
-    "vaesenc	%%xmm0, %%xmm11, %%xmm11\n\t"         \
-
-#define VAESENC_PCLMUL_AVX2_N(src, o1, o2, o3)        \
-    "vmovdqu	" #o2 "(" #src "), %%xmm12\n\t"       \
-    "vmovdqu	" #o3 "(" VAR(HTR) "), %%xmm0\n\t"    \
-    "vpshufb	%[BSWAP_MASK], %%xmm12, %%xmm12\n\t"  \
-    "vpxor	%%xmm14, %%xmm1, %%xmm1\n\t"          \
-    "vpxor	%%xmm15, %%xmm2, %%xmm2\n\t"          \
-    "vpclmulqdq	$0x10, %%xmm0, %%xmm12, %%xmm13\n\t"  \
-    "vpclmulqdq	$0x01, %%xmm0, %%xmm12, %%xmm14\n\t"  \
-    "vpclmulqdq	$0x00, %%xmm0, %%xmm12, %%xmm15\n\t"  \
-    "vpclmulqdq	$0x11, %%xmm0, %%xmm12, %%xmm12\n\t"  \
-    "vmovdqa	" #o1 "(%[KEY]), %%xmm0\n\t"          \
-    "vpxor	%%xmm13, %%xmm1, %%xmm1\n\t"          \
-    "vpxor	%%xmm12, %%xmm3, %%xmm3\n\t"          \
-    "vaesenc	%%xmm0, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc	%%xmm0, %%xmm5, %%xmm5\n\t"           \
-    "vaesenc	%%xmm0, %%xmm6, %%xmm6\n\t"           \
-    "vaesenc	%%xmm0, %%xmm7, %%xmm7\n\t"           \
-    "vaesenc	%%xmm0, %%xmm8, %%xmm8\n\t"           \
-    "vaesenc	%%xmm0, %%xmm9, %%xmm9\n\t"           \
-    "vaesenc	%%xmm0, %%xmm10, %%xmm10\n\t"         \
-    "vaesenc	%%xmm0, %%xmm11, %%xmm11\n\t"         \
-
-#define VAESENC_PCLMUL_AVX2_L(o)                      \
-    "vpxor	%%xmm14, %%xmm1, %%xmm1\n\t"          \
-    "vpxor	%%xmm15, %%xmm2, %%xmm2\n\t"          \
-    "vpslldq	$8, %%xmm1, %%xmm12\n\t"              \
-    "vpsrldq	$8, %%xmm1, %%xmm1\n\t"               \
-    "vmovdqa	"#o"(%[KEY]), %%xmm15\n\t"            \
-    "vmovdqa	%[MOD2_128], %%xmm0\n\t"              \
-    "vaesenc	%%xmm15, %%xmm4, %%xmm4\n\t"          \
-    "vpxor	%%xmm12, %%xmm2, %%xmm2\n\t"          \
-    "vpxor	%%xmm1, %%xmm3, %%xmm3\n\t"           \
-    "vpclmulqdq	$0x10, %%xmm0, %%xmm2, %%xmm14\n\t"   \
-    "vaesenc	%%xmm15, %%xmm5, %%xmm5\n\t"          \
-    "vaesenc	%%xmm15, %%xmm6, %%xmm6\n\t"          \
-    "vaesenc	%%xmm15, %%xmm7, %%xmm7\n\t"          \
-    "vpshufd	$0x4e, %%xmm2, %%xmm2\n\t"            \
-    "vpxor	%%xmm14, %%xmm2, %%xmm2\n\t"          \
-    "vpclmulqdq	$0x10, %%xmm0, %%xmm2, %%xmm14\n\t"   \
-    "vaesenc	%%xmm15, %%xmm8, %%xmm8\n\t"          \
-    "vaesenc	%%xmm15, %%xmm9, %%xmm9\n\t"          \
-    "vaesenc	%%xmm15, %%xmm10, %%xmm10\n\t"        \
-    "vpshufd	$0x4e, %%xmm2, %%xmm2\n\t"            \
-    "vpxor	%%xmm14, %%xmm2, %%xmm2\n\t"          \
-    "vpxor	%%xmm3, %%xmm2, %%xmm2\n\t"           \
-    "vaesenc	%%xmm15, %%xmm11, %%xmm11\n\t"
-
-#define VAESENC_BLOCK_AVX2()                                  \
-    "vmovdqu		" VAR(CTR1) ", %%xmm5\n\t"            \
-    "vpshufb		%[BSWAP_EPI64], %%xmm5, %%xmm4\n\t"   \
-    "vpaddd		%[ONE], %%xmm5, %%xmm5\n\t"           \
-    "vmovdqu		%%xmm5, " VAR(CTR1) "\n\t"            \
-    "vpxor		   (%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 16(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 32(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 48(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 64(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 80(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		 96(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		112(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		128(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vaesenc		144(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "cmpl		$11, %[nr]\n\t"                       \
-    "vmovdqa		160(%[KEY]), %%xmm5\n\t"              \
-    "jl			%=f\n\t"                              \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		176(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "cmpl		$13, %[nr]\n\t"                       \
-    "vmovdqa		192(%[KEY]), %%xmm5\n\t"              \
-    "jl			%=f\n\t"                              \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vaesenc		208(%[KEY]), %%xmm4, %%xmm4\n\t"      \
-    "vmovdqa		224(%[KEY]), %%xmm5\n\t"              \
-    "%=:\n\t"                                                 \
-    "vaesenclast	%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		(%[in]," VAR(KR64) ",1), %%xmm5\n\t"  \
-    "vpxor		%%xmm5, %%xmm4, %%xmm4\n\t"           \
-    "vmovdqu		%%xmm4, (%[out]," VAR(KR64) ",1)\n\t" \
-    "vpshufb		%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"    \
-    "vpxor		%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"
-
-/* Karatsuba multiplication - slower here than the four-multiply
- * schoolbook version.
- * H01 = H[1] ^ H[0] (top and bottom 64-bit halves XORed together)
- */
-#define _VAESENC_GFMUL_AVX2(in, H, X, ctr1, H01)            \
-    "vpxor		   (%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 16(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 32(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 48(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 64(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 80(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 96(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		112(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		128(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		144(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "cmpl		$11, %[nr]\n\t"                     \
-    "vmovdqa		160(%[KEY]), %%xmm5\n\t"            \
-    "jl			%=f\n\t"                            \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"         \
-    "vaesenc		176(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "cmpl		$13, %[nr]\n\t"                     \
-    "vmovdqa		192(%[KEY]), %%xmm5\n\t"            \
-    "jl			%=f\n\t"                            \
-    "vaesenc		%%xmm5, %%xmm4, %%xmm4\n\t"         \
-    "vaesenc		208(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vmovdqa		224(%[KEY]), %%xmm5\n\t"            \
-    "%=:\n\t"                                               \
-    "vaesenclast	%%xmm5, %%xmm4, %%xmm4\n\t"         \
-    "vmovdqu		" #in ", %%xmm0\n\t"                \
-    "vpxor		%%xmm0, %%xmm4, %%xmm4\n\t"         \
-                                                            \
-    "vpsrldq	$8, " #X ", %%xmm2\n\t"                     \
-    "vpxor	" #X ", %%xmm2, %%xmm2\n\t"                 \
-    "vpclmulqdq	$0x00, " #H ", " #X ", %%xmm5\n\t"          \
-    "vpclmulqdq	$0x11, " #H ", " #X ", %%xmm8\n\t"          \
-    "vpclmulqdq	$0x00, "#H01", %%xmm2, %%xmm7\n\t"          \
-    "vpxor	%%xmm5, %%xmm7, %%xmm7\n\t"                 \
-    "vpxor	%%xmm8, %%xmm7, %%xmm7\n\t"                 \
-    "vpslldq	$8, %%xmm7, %%xmm6\n\t"                     \
-    "vpsrldq	$8, %%xmm7, %%xmm7\n\t"                     \
-    "vpxor	%%xmm7, %%xmm8, %%xmm8\n\t"                 \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"                 \
-                                                            \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t"     \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"                  \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"                 \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t"     \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"                  \
-    "vpxor	%%xmm8, %%xmm6, %%xmm6\n\t"                 \
-    "vpxor	%%xmm5, %%xmm6, " VAR(XR) "\n\t"
-#define VAESENC_GFMUL_AVX2(in, H, X, ctr1, H01)             \
-       _VAESENC_GFMUL_AVX2(in, H, X, ctr1, H01)
-
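With intrinsics, the Karatsuba trick replaces the four PCLMULQDQs of the
schoolbook product with three plus extra XORs. A hedged sketch (names are
illustrative; b01 carries B_hi ^ B_lo in its low lane, matching the H01
comment above):

    #include <emmintrin.h>
    #include <wmmintrin.h>   /* _mm_clmulepi64_si128 (PCLMULQDQ) */

    static void clmul128_karatsuba(__m128i a, __m128i b, __m128i b01,
                                   __m128i* lo, __m128i* hi)
    {
        __m128i t0  = _mm_clmulepi64_si128(a, b, 0x00);  /* A_lo * B_lo */
        __m128i t2  = _mm_clmulepi64_si128(a, b, 0x11);  /* A_hi * B_hi */
        __m128i a01 = _mm_xor_si128(a, _mm_srli_si128(a, 8));
        __m128i t1  = _mm_clmulepi64_si128(a01, b01, 0x00);
        /* middle term = (A_hi^A_lo)(B_hi^B_lo) ^ t0 ^ t2 */
        t1  = _mm_xor_si128(t1, _mm_xor_si128(t0, t2));
        *lo = _mm_xor_si128(t0, _mm_slli_si128(t1, 8));
        *hi = _mm_xor_si128(t2, _mm_srli_si128(t1, 8));
    }
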
-#define _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1)              \
-    "vpclmulqdq	$0x10, " #H ", " #X ", %%xmm7\n\t"          \
-    "vpclmulqdq	$0x01, " #H ", " #X ", %%xmm6\n\t"          \
-    "vpclmulqdq	$0x00, " #H ", " #X ", %%xmm5\n\t"          \
-    "vpclmulqdq	$0x11, " #H ", " #X ", %%xmm8\n\t"          \
-    "vpxor		   (%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 16(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpxor	%%xmm6, %%xmm7, %%xmm7\n\t"                 \
-    "vpslldq	$8, %%xmm7, %%xmm6\n\t"                     \
-    "vpsrldq	$8, %%xmm7, %%xmm7\n\t"                     \
-    "vaesenc		 32(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"                 \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t"     \
-    "vaesenc		 48(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 64(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		 80(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"                  \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"                 \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t"     \
-    "vaesenc		 96(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		112(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vaesenc		128(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"                  \
-    "vaesenc		144(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vpxor	%%xmm7, %%xmm8, %%xmm8\n\t"                 \
-    "vpxor	%%xmm8, %%xmm6, %%xmm6\n\t"                 \
-    "cmpl		$11, %[nr]\n\t"                     \
-    "vmovdqa		160(%[KEY]), %%xmm3\n\t"            \
-    "jl			%=f\n\t"                            \
-    "vaesenc		%%xmm3, %%xmm4, %%xmm4\n\t"         \
-    "vaesenc		176(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "cmpl		$13, %[nr]\n\t"                     \
-    "vmovdqa		192(%[KEY]), %%xmm3\n\t"            \
-    "jl			%=f\n\t"                            \
-    "vaesenc		%%xmm3, %%xmm4, %%xmm4\n\t"         \
-    "vaesenc		208(%[KEY]), %%xmm4, %%xmm4\n\t"    \
-    "vmovdqa		224(%[KEY]), %%xmm3\n\t"            \
-    "%=:\n\t"                                               \
-    "vaesenclast	%%xmm3, %%xmm4, %%xmm4\n\t"         \
-    "vpxor	%%xmm5, %%xmm6, " VAR(XR) "\n\t"            \
-    "vmovdqu		" #in ", %%xmm5\n\t"                \
-    "vpxor		%%xmm5, %%xmm4, %%xmm4\n\t"
-#define VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1)               \
-       _VAESENC_GFMUL_SB_AVX2(in, H, X, ctr1)
-
-
-#define _GHASH_GFMUL_AVX2(r, r2, a, b)         \
-    "vpclmulqdq	$0x10, "#a", "#b", %%xmm2\n\t" \
-    "vpclmulqdq	$0x01, "#a", "#b", %%xmm1\n\t" \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm0\n\t" \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm3\n\t" \
-    "vpxor	%%xmm1, %%xmm2, %%xmm2\n\t"    \
-    "vpslldq	$8, %%xmm2, %%xmm1\n\t"        \
-    "vpsrldq	$8, %%xmm2, %%xmm2\n\t"        \
-    "vpxor	%%xmm1, %%xmm0, "#r2"\n\t"     \
-    "vpxor	%%xmm2, %%xmm3, " #r "\n\t"
-#define GHASH_GFMUL_AVX2(r, r2, a, b)          \
-       _GHASH_GFMUL_AVX2(r, r2, a, b)
-
-#define GHASH_MID_AVX2(r, r2)               \
-    "vpsrld	$31, "#r2", %%xmm0\n\t"     \
-    "vpsrld	$31, " #r ", %%xmm1\n\t"    \
-    "vpslld	$1, "#r2", "#r2"\n\t"       \
-    "vpslld	$1, " #r ", " #r "\n\t"     \
-    "vpsrldq	$12, %%xmm0, %%xmm2\n\t"    \
-    "vpslldq	$4, %%xmm0, %%xmm0\n\t"     \
-    "vpslldq	$4, %%xmm1, %%xmm1\n\t"     \
-    "vpor	%%xmm2, " #r ", " #r "\n\t" \
-    "vpor	%%xmm0, "#r2", "#r2"\n\t"   \
-    "vpor	%%xmm1, " #r ", " #r "\n\t"
-
-#define _GHASH_GFMUL_RED_AVX2(r, a, b)                  \
-    "vpclmulqdq	$0x10, "#a", "#b", %%xmm7\n\t"          \
-    "vpclmulqdq	$0x01, "#a", "#b", %%xmm6\n\t"          \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm5\n\t"          \
-    "vpxor	%%xmm6, %%xmm7, %%xmm7\n\t"             \
-    "vpslldq	$8, %%xmm7, %%xmm6\n\t"                 \
-    "vpsrldq	$8, %%xmm7, %%xmm7\n\t"                 \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"             \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm8\n\t"          \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"             \
-    "vpclmulqdq	$0x10, %[MOD2_128], %%xmm6, %%xmm5\n\t" \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm7, %%xmm8, %%xmm8\n\t"             \
-    "vpxor	%%xmm8, %%xmm6, %%xmm6\n\t"             \
-    "vpxor	%%xmm5, %%xmm6, " #r "\n\t"
-#define GHASH_GFMUL_RED_AVX2(r, a, b)                   \
-       _GHASH_GFMUL_RED_AVX2(r, a, b)
-
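Every GFMUL/RED variant in this file computes the same operation: a
carry-less multiply of two 128-bit values followed by reduction modulo
x^128 + x^7 + x^2 + x + 1. A bit-at-a-time reference, straight from the
GHASH algorithm in NIST SP 800-38D, is handy for cross-checking the vector
paths (slow, test use only):

    #include <stdint.h>
    #include <string.h>

    static void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                              uint8_t Z[16])
    {
        uint8_t V[16], acc[16] = {0};
        int i, j;
        memcpy(V, Y, 16);
        for (i = 0; i < 128; i++) {
            if ((X[i / 8] >> (7 - (i % 8))) & 1) {  /* bit i of X */
                for (j = 0; j < 16; j++)
                    acc[j] ^= V[j];
            }
            /* V = V * x in GCM's reflected bit order: shift right one
             * bit, reduce with R = 0xE1 || 0^120 when a bit drops out. */
            {
                int lsb = V[15] & 1;
                for (j = 15; j > 0; j--)
                    V[j] = (uint8_t)((V[j] >> 1) | (V[j - 1] << 7));
                V[0] >>= 1;
                if (lsb)
                    V[0] ^= 0xe1;
            }
        }
        memcpy(Z, acc, 16);   /* safe even when Z aliases X or Y */
    }
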
-#define _GHASH_GFSQR_RED2_AVX2(r, a, mod128)            \
-    "vpclmulqdq	$0x00, "#a", "#a", %%xmm6\n\t"          \
-    "vpclmulqdq	$0x11, "#a", "#a", %%xmm8\n\t"          \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm6, %%xmm5\n\t"   \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"             \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm6, %%xmm5\n\t"   \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"             \
-    "vpxor	%%xmm6, %%xmm8, " #r "\n\t"
-#define GHASH_GFSQR_RED2_AVX2(r, a, mod128)             \
-       _GHASH_GFSQR_RED2_AVX2(r, a, mod128)
-
-#define _GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128) \
-    "vpclmulqdq	$0x10, "#a", "#b", %%xmm7\n\t"           \
-    "vpclmulqdq	$0x01, "#a", "#b", %%xmm6\n\t"           \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm5\n\t"           \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm8\n\t"           \
-    "vpclmulqdq	$0x00, "#b", "#b", %%xmm9\n\t"           \
-    "vpclmulqdq	$0x11, "#b", "#b", %%xmm10\n\t"          \
-    "vpxor	%%xmm6, %%xmm7, %%xmm7\n\t"              \
-    "vpslldq	$8, %%xmm7, %%xmm6\n\t"                  \
-    "vpsrldq	$8, %%xmm7, %%xmm7\n\t"                  \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"              \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm9, %%xmm4\n\t"    \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm6, %%xmm5\n\t"    \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"               \
-    "vpshufd	$0x4e, %%xmm9, %%xmm9\n\t"               \
-    "vpxor	%%xmm5, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm4, %%xmm9, %%xmm9\n\t"              \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm6, %%xmm5\n\t"    \
-    "vpclmulqdq	$0x10, "#mod128", %%xmm9, %%xmm4\n\t"    \
-    "vpshufd	$0x4e, %%xmm6, %%xmm6\n\t"               \
-    "vpshufd	$0x4e, %%xmm9, %%xmm9\n\t"               \
-    "vpxor	%%xmm7, %%xmm8, %%xmm8\n\t"              \
-    "vpxor	%%xmm4, %%xmm9, %%xmm9\n\t"              \
-    "vpxor	%%xmm8, %%xmm6, %%xmm6\n\t"              \
-    "vpxor	%%xmm10, %%xmm9, "#rs"\n\t"              \
-    "vpxor	%%xmm5, %%xmm6, "#rm"\n\t"
-#define GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128)  \
-       _GHASH_GFMUL_SQR_RED2_AVX2(rm, rs, a, b, mod128)
-
-#define CALC_HT_8_AVX2()                                                \
-    "vmovdqa	%[MOD2_128], %%xmm11\n\t"                               \
-    "vmovdqa	" VAR(XR) ", %%xmm2\n\t"                                \
-    "# H ^ 1 and H ^ 2\n\t"                                             \
-    GHASH_GFSQR_RED2_AVX2(%%xmm0, HR, %%xmm11)                          \
-    "vmovdqu	" VAR(HR) ", 0(" VAR(HTR) ")\n\t"                       \
-    "vmovdqu	%%xmm0 ,  16(" VAR(HTR) ")\n\t"                         \
-    "# H ^ 3 and H ^ 4\n\t"                                             \
-    GHASH_GFMUL_SQR_RED2_AVX2(%%xmm1, %%xmm3, HR, %%xmm0, %%xmm11)      \
-    "vmovdqu	%%xmm1 ,  32(" VAR(HTR) ")\n\t"                         \
-    "vmovdqu	%%xmm3 ,  48(" VAR(HTR) ")\n\t"                         \
-    "# H ^ 5 and H ^ 6\n\t"                                             \
-    GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm0, %%xmm1, %%xmm11) \
-    "vmovdqu	%%xmm12,  64(" VAR(HTR) ")\n\t"                         \
-    "vmovdqu	%%xmm0 ,  80(" VAR(HTR) ")\n\t"                         \
-    "# H ^ 7 and H ^ 8\n\t"                                             \
-    GHASH_GFMUL_SQR_RED2_AVX2(%%xmm12, %%xmm0, %%xmm1, %%xmm3, %%xmm11) \
-    "vmovdqu	%%xmm12,  96(" VAR(HTR) ")\n\t"                         \
-    "vmovdqu	%%xmm0 , 112(" VAR(HTR) ")\n\t"
-
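CALC_HT_8_AVX2 fills the table at HTR with H^1 .. H^8 so the unrolled loop
can fold eight ciphertext blocks with a single reduction:
X = C1*H^8 ^ C2*H^7 ^ ... ^ C8*H^1. Built naively with the reference
multiply sketched above:

    #include <stdint.h>
    #include <string.h>

    /* the reference multiply from the earlier sketch */
    void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                       uint8_t Z[16]);

    static void build_h_table(const uint8_t H[16], uint8_t HT[8][16])
    {
        int i;
        memcpy(HT[0], H, 16);               /* HT[i] holds H^(i+1) */
        for (i = 1; i < 8; i++)
            ghash_mul_ref(HT[i - 1], H, HT[i]);
    }

The assembly reaches the even powers by squaring (the GFSQR macros), which
needs fewer carry-less multiplies than repeated multiplication by H.
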
-#define _GHASH_RED_AVX2(r, r2)                     \
-    "vmovdqa	%[MOD2_128], %%xmm2\n\t"           \
-    "vpclmulqdq	$0x10, %%xmm2, "#r2", %%xmm0\n\t"  \
-    "vpshufd	$0x4e, "#r2", %%xmm1\n\t"          \
-    "vpxor	%%xmm0, %%xmm1, %%xmm1\n\t"        \
-    "vpclmulqdq	$0x10, %%xmm2, %%xmm1, %%xmm0\n\t" \
-    "vpshufd	$0x4e, %%xmm1, %%xmm1\n\t"         \
-    "vpxor	%%xmm0, %%xmm1, %%xmm1\n\t"        \
-    "vpxor	%%xmm1, " #r ", " #r "\n\t"
-#define GHASH_RED_AVX2(r, r2)                      \
-       _GHASH_RED_AVX2(r, r2)
-
-#define GHASH_FULL_AVX2(r, r2, a, b) \
-    GHASH_GFMUL_AVX2(r, r2, a, b)    \
-    GHASH_MID_AVX2(r, r2)            \
-    GHASH_RED_AVX2(r, r2)
-
-#define _GFMUL_3V_AVX2(r, r2, r3, a, b)        \
-    "vpclmulqdq	$0x10, "#a", "#b", "#r3"\n\t"  \
-    "vpclmulqdq	$0x01, "#a", "#b", %%xmm1\n\t" \
-    "vpclmulqdq	$0x00, "#a", "#b", "#r2"\n\t"  \
-    "vpclmulqdq	$0x11, "#a", "#b", " #r "\n\t" \
-    "vpxor	%%xmm1, "#r3", "#r3"\n\t"
-#define GFMUL_3V_AVX2(r, r2, r3, a, b)         \
-       _GFMUL_3V_AVX2(r, r2, r3, a, b)
-
-#define _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b)    \
-    "vpclmulqdq	$0x10, "#a", "#b", %%xmm2\n\t" \
-    "vpclmulqdq	$0x01, "#a", "#b", %%xmm1\n\t" \
-    "vpclmulqdq	$0x00, "#a", "#b", %%xmm0\n\t" \
-    "vpclmulqdq	$0x11, "#a", "#b", %%xmm3\n\t" \
-    "vpxor	%%xmm1, %%xmm2, %%xmm2\n\t"    \
-    "vpxor	%%xmm3, " #r ", " #r "\n\t"    \
-    "vpxor	%%xmm2, "#r3", "#r3"\n\t"      \
-    "vpxor	%%xmm0, "#r2", "#r2"\n\t"
-#define GFMUL_XOR_3V_AVX2(r, r2, r3, a, b)     \
-       _GFMUL_XOR_3V_AVX2(r, r2, r3, a, b)
-
-#define GHASH_GFMUL_RED_8_AVX2()                              \
-    "vmovdqu	   (" VAR(HTR) "), %%xmm12\n\t"               \
-    GFMUL_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm11, %%xmm12)     \
-    "vmovdqu	 16(" VAR(HTR) "), %%xmm12\n\t"               \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm10, %%xmm12) \
-    "vmovdqu	 32(" VAR(HTR) "), %%xmm11\n\t"               \
-    "vmovdqu	 48(" VAR(HTR) "), %%xmm12\n\t"               \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm9, %%xmm11)  \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm8, %%xmm12)  \
-    "vmovdqu	 64(" VAR(HTR) "), %%xmm11\n\t"               \
-    "vmovdqu	 80(" VAR(HTR) "), %%xmm12\n\t"               \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm7, %%xmm11)  \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm6, %%xmm12)  \
-    "vmovdqu	 96(" VAR(HTR) "), %%xmm11\n\t"               \
-    "vmovdqu	112(" VAR(HTR) "), %%xmm12\n\t"               \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm5, %%xmm11)  \
-    GFMUL_XOR_3V_AVX2(XR, %%xmm13, %%xmm14, %%xmm4, %%xmm12)  \
-    "vpslldq	$8, %%xmm14, %%xmm12\n\t"                     \
-    "vpsrldq	$8, %%xmm14, %%xmm14\n\t"                     \
-    "vpxor	%%xmm12, %%xmm13, %%xmm13\n\t"                \
-    "vpxor	%%xmm14, " VAR(XR) ", " VAR(XR) "\n\t"        \
-    GHASH_RED_AVX2(XR, %%xmm13)
-
-#define CALC_IV_12_AVX2()                                            \
-    "# Calculate values when IV is 12 bytes\n\t"                     \
-    "# Set counter based on IV\n\t"                                  \
-    "movl		$0x01000000, %%ecx\n\t"                      \
-    "vpinsrq		$0, 0(%%rax), %%xmm13, %%xmm13\n\t"          \
-    "vpinsrd		$2, 8(%%rax), %%xmm13, %%xmm13\n\t"          \
-    "vpinsrd		$3, %%ecx, %%xmm13, %%xmm13\n\t"             \
-    "# H = Encrypt X(=0) and T = Encrypt counter\n\t"                \
-    "vmovdqa		  0(%[KEY]), " VAR(HR) "\n\t"                \
-    "vmovdqa		 16(%[KEY]), %%xmm12\n\t"                    \
-    "vpxor		" VAR(HR) ", %%xmm13, %%xmm1\n\t"            \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 32(%[KEY]), %%xmm0\n\t"                     \
-    "vmovdqa		 48(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 64(%[KEY]), %%xmm0\n\t"                     \
-    "vmovdqa		 80(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		 96(%[KEY]), %%xmm0\n\t"                     \
-    "vmovdqa		112(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqa		128(%[KEY]), %%xmm0\n\t"                     \
-    "vmovdqa		144(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "cmpl		$11, %[nr]\n\t"                              \
-    "vmovdqa		160(%[KEY]), %%xmm0\n\t"                     \
-    "jl	31f\n\t"                                                     \
-    "vmovdqa		176(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "cmpl		$13, %[nr]\n\t"                              \
-    "vmovdqa		192(%[KEY]), %%xmm0\n\t"                     \
-    "jl	31f\n\t"                                                     \
-    "vmovdqa		208(%[KEY]), %%xmm12\n\t"                    \
-    "vaesenc		%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenc		%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vaesenc		%%xmm12, " VAR(HR) ", " VAR(HR) "\n\t"       \
-    "vaesenc		%%xmm12, %%xmm1, %%xmm1\n\t"                 \
-    "vmovdqu		224(%[KEY]), %%xmm0\n\t"                     \
-    "31:\n\t"                                                        \
-    "vaesenclast	%%xmm0, " VAR(HR) ", " VAR(HR) "\n\t"        \
-    "vaesenclast	%%xmm0, %%xmm1, %%xmm1\n\t"                  \
-    "vpshufb		%[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
-    "vmovdqu		%%xmm1, " VAR(TR) "\n\t"                     \
-
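CALC_IV_12_AVX2 uses the GCM special case for 96-bit nonces: the initial
counter block J0 is simply IV || 0x00000001, so H = E_K(0) and T = E_K(J0)
can share one interleaved pass over the round keys. In plain C (a sketch;
aes_encrypt_block() is a hypothetical single-block helper, not from this
file):

    #include <stdint.h>
    #include <string.h>

    void aes_encrypt_block(const uint8_t* rk, int nr,
                           const uint8_t in[16], uint8_t out[16]);

    static void gcm_init_iv12(const uint8_t* rk, int nr,
                              const uint8_t iv[12],
                              uint8_t H[16], uint8_t T[16])
    {
        uint8_t zero[16] = {0};
        uint8_t j0[16]   = {0};
        memcpy(j0, iv, 12);
        j0[15] = 0x01;                      /* 32-bit counter starts at 1 */
        aes_encrypt_block(rk, nr, zero, H); /* hash key */
        aes_encrypt_block(rk, nr, j0, T);   /* masks the tag at the end */
    }
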
-#define CALC_IV_AVX2()                                       \
-    "# Calculate values when IV is not 12 bytes\n\t"         \
-    "# H = Encrypt X(=0)\n\t"                                \
-    "vmovdqa	0(%[KEY]), " VAR(HR) "\n\t"                  \
-    VAESENC_AVX(HR)                                          \
-    "vpshufb	%[BSWAP_MASK], " VAR(HR) ", " VAR(HR) "\n\t" \
-    "# Calc counter\n\t"                                     \
-    "# Initialization vector\n\t"                            \
-    "cmpl	$0, %%edx\n\t"                               \
-    "movq	$0, %%rcx\n\t"                               \
-    "je	45f\n\t"                                             \
-    "cmpl	$16, %%edx\n\t"                              \
-    "jl	44f\n\t"                                             \
-    "andl	$0xfffffff0, %%edx\n\t"                      \
-    "\n"                                                     \
-    "43:\n\t"                                                \
-    "vmovdqu	(%%rax,%%rcx,1), %%xmm4\n\t"                 \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"           \
-    "vpxor	%%xmm4, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "addl	$16, %%ecx\n\t"                              \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "jl	43b\n\t"                                             \
-    "movl	%[ibytes], %%edx\n\t"                        \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "je	45f\n\t"                                             \
-    "\n"                                                     \
-    "44:\n\t"                                                \
-    "subq	$16, %%rsp\n\t"                              \
-    "vpxor	%%xmm4, %%xmm4, %%xmm4\n\t"                  \
-    "xorl	%%ebx, %%ebx\n\t"                            \
-    "vmovdqu	%%xmm4, (%%rsp)\n\t"                         \
-    "42:\n\t"                                                \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t"                 \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t"                 \
-    "incl	%%ecx\n\t"                                   \
-    "incl	%%ebx\n\t"                                   \
-    "cmpl	%%edx, %%ecx\n\t"                            \
-    "jl	42b\n\t"                                             \
-    "vmovdqu	(%%rsp), %%xmm4\n\t"                         \
-    "addq	$16, %%rsp\n\t"                              \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"           \
-    "vpxor	%%xmm4, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "\n"                                                     \
-    "45:\n\t"                                                \
-    "# T = Encrypt counter\n\t"                              \
-    "vpxor	%%xmm0, %%xmm0, %%xmm0\n\t"                  \
-    "shll	$3, %%edx\n\t"                               \
-    "vpinsrq	$0, %%rdx, %%xmm0, %%xmm0\n\t"               \
-    "vpxor	%%xmm0, %%xmm13, %%xmm13\n\t"                \
-    GHASH_FULL_AVX2(%%xmm13, %%xmm12, %%xmm13, HR)           \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"         \
-    "#   Encrypt counter\n\t"                                \
-    "vmovdqa	0(%[KEY]), %%xmm4\n\t"                       \
-    "vpxor	%%xmm13, %%xmm4, %%xmm4\n\t"                 \
-    VAESENC_AVX(%%xmm4)                                      \
-    "vmovdqu	%%xmm4, " VAR(TR) "\n\t"
-
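For every other IV length, CALC_IV_AVX2 derives J0 by GHASHing the
zero-padded IV followed by a 64-bit length block, as the GCM spec requires.
A sketch reusing the hypothetical helpers above:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                       uint8_t Z[16]);
    void aes_encrypt_block(const uint8_t* rk, int nr,
                           const uint8_t in[16], uint8_t out[16]);

    static void gcm_init_iv_any(const uint8_t* rk, int nr,
                                const uint8_t* iv, size_t ivlen,
                                const uint8_t H[16], uint8_t T[16])
    {
        uint8_t y[16] = {0}, blk[16];
        uint64_t bits = (uint64_t)ivlen * 8;
        size_t i, j, n;

        for (i = 0; i < ivlen; i += 16) {       /* absorb the IV */
            n = ivlen - i < 16 ? ivlen - i : 16;
            memset(blk, 0, 16);
            memcpy(blk, iv + i, n);             /* zero-pad last block */
            for (j = 0; j < 16; j++) y[j] ^= blk[j];
            ghash_mul_ref(y, H, y);
        }
        memset(blk, 0, 16);                     /* 0^64 || len(IV) bits */
        for (j = 0; j < 8; j++)
            blk[15 - j] = (uint8_t)(bits >> (8 * j));
        for (j = 0; j < 16; j++) y[j] ^= blk[j];
        ghash_mul_ref(y, H, y);                 /* y is now J0 */
        aes_encrypt_block(rk, nr, y, T);
    }
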
-#define CALC_AAD_AVX2()                                \
-    "# Additional authentication data\n\t"             \
-    "movl	%[abytes], %%edx\n\t"                  \
-    "cmpl	$0, %%edx\n\t"                         \
-    "je		25f\n\t"                               \
-    "movq	%[addt], %%rax\n\t"                    \
-    "xorl	%%ecx, %%ecx\n\t"                      \
-    "cmpl	$16, %%edx\n\t"                        \
-    "jl		24f\n\t"                               \
-    "andl	$0xfffffff0, %%edx\n\t"                \
-    "\n"                                               \
-    "23:\n\t"                                          \
-    "vmovdqu	(%%rax,%%rcx,1), %%xmm4\n\t"           \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"     \
-    "vpxor	%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_FULL_AVX2(XR, %%xmm12, XR, HR)               \
-    "addl	$16, %%ecx\n\t"                        \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "jl		23b\n\t"                               \
-    "movl	%[abytes], %%edx\n\t"                  \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "je		25f\n\t"                               \
-    "\n"                                               \
-    "24:\n\t"                                          \
-    "subq	$16, %%rsp\n\t"                        \
-    "vpxor	%%xmm4, %%xmm4, %%xmm4\n\t"            \
-    "xorl	%%ebx, %%ebx\n\t"                      \
-    "vmovdqu	%%xmm4, (%%rsp)\n\t"                   \
-    "22:\n\t"                                          \
-    "movzbl	(%%rax,%%rcx,1), %%r13d\n\t"           \
-    "movb	%%r13b, (%%rsp,%%rbx,1)\n\t"           \
-    "incl	%%ecx\n\t"                             \
-    "incl	%%ebx\n\t"                             \
-    "cmpl	%%edx, %%ecx\n\t"                      \
-    "jl		22b\n\t"                               \
-    "vmovdqu	(%%rsp), %%xmm4\n\t"                   \
-    "addq	$16, %%rsp\n\t"                        \
-    "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"     \
-    "vpxor	%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_FULL_AVX2(XR, %%xmm12, XR, HR)               \
-    "\n"                                               \
-    "25:\n\t"
-
-#define VAESENC_128_GHASH_AVX2(src, o)               \
-    "leaq	(%[in]," VAR(KR64) ",1), %%rcx\n\t"  \
-    "leaq	(%[out]," VAR(KR64) ",1), %%rdx\n\t" \
-    /* src is either %%rcx or %%rdx */             \
-    VAESENC_CTR()                                  \
-    VAESENC_XOR()                                  \
-    VAESENC_PCLMUL_AVX2_1(src,  16, (o-128), 112)  \
-    VAESENC_PCLMUL_AVX2_2(src,  32, (o-112),  96)  \
-    VAESENC_PCLMUL_AVX2_N(src,  48, (o- 96),  80)  \
-    VAESENC_PCLMUL_AVX2_N(src,  64, (o- 80),  64)  \
-    VAESENC_PCLMUL_AVX2_N(src,  80, (o- 64),  48)  \
-    VAESENC_PCLMUL_AVX2_N(src,  96, (o- 48),  32)  \
-    VAESENC_PCLMUL_AVX2_N(src, 112, (o- 32),  16)  \
-    VAESENC_PCLMUL_AVX2_N(src, 128, (o- 16),   0)  \
-    VAESENC_PCLMUL_AVX2_L(144)                     \
-    "cmpl	$11, %[nr]\n\t"                    \
-    "vmovdqa	160(%[KEY]), %%xmm12\n\t"          \
-    "jl		4f\n\t"                            \
-    VAESENC()                                      \
-    VAESENC_SET(176)                               \
-    "cmpl	$13, %[nr]\n\t"                    \
-    "vmovdqa	192(%[KEY]), %%xmm12\n\t"          \
-    "jl		4f\n\t"                            \
-    VAESENC()                                      \
-    VAESENC_SET(208)                               \
-    "vmovdqa	224(%[KEY]), %%xmm12\n\t"          \
-    "\n"                                           \
-"4:\n\t"                                           \
-    VAESENC_LAST(%%rcx, %%rdx)
-
-#define AESENC_LAST15_ENC_AVX2()                        \
-    "movl	%[nbytes], %%ecx\n\t"                   \
-    "movl	%%ecx, %%edx\n\t"                       \
-    "andl	$0x0f, %%ecx\n\t"                       \
-    "jz		55f\n\t"                                \
-    "vmovdqu	" VAR(CTR1) ", %%xmm13\n\t"             \
-    "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"   \
-    "vpxor	0(%[KEY]), %%xmm13, %%xmm13\n\t"        \
-    VAESENC_AVX(%%xmm13)                                \
-    "subq	$16, %%rsp\n\t"                         \
-    "xorl	%%ecx, %%ecx\n\t"                       \
-    "vmovdqu	%%xmm13, (%%rsp)\n\t"                   \
-    "\n"                                                \
-    "51:\n\t"                                           \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"    \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"            \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t"   \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"            \
-    "incl	" VAR(KR) "\n\t"                        \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	%%edx, " VAR(KR) "\n\t"                 \
-    "jl		51b\n\t"                                \
-    "xorq	%%r13, %%r13\n\t"                       \
-    "cmpl	$16, %%ecx\n\t"                         \
-    "je		53f\n\t"                                \
-    "\n"                                                \
-    "52:\n\t"                                           \
-    "movb	%%r13b, (%%rsp,%%rcx,1)\n\t"            \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	$16, %%ecx\n\t"                         \
-    "jl		52b\n\t"                                \
-    "53:\n\t"                                           \
-    "vmovdqu	(%%rsp), %%xmm13\n\t"                   \
-    "addq	$16, %%rsp\n\t"                         \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"    \
-    "vpxor	%%xmm13, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_GFMUL_RED_AVX2(XR, HR, XR)                    \
-
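AESENC_LAST15_ENC_AVX2 covers a trailing partial block: one more counter
block is encrypted, only the remaining bytes are XORed out, and the short
ciphertext is zero-padded on the stack before entering GHASH. The byte loop
in C (sketch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* ks = E_K(counter) for the final block; rem < 16 */
    static void gcm_last_partial_enc(const uint8_t ks[16],
                                     const uint8_t* in, uint8_t* out,
                                     size_t rem, uint8_t padded[16])
    {
        size_t i;
        memset(padded, 0, 16);
        for (i = 0; i < rem; i++) {
            out[i]    = (uint8_t)(in[i] ^ ks[i]);
            padded[i] = out[i];   /* zero-padded ciphertext for GHASH */
        }
    }
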
-#define AESENC_LAST15_DEC_AVX2()                        \
-    "movl	%[nbytes], %%ecx\n\t"                   \
-    "movl	%%ecx, %%edx\n\t"                       \
-    "andl	$0x0f, %%ecx\n\t"                       \
-    "jz		55f\n\t"                                \
-    "vmovdqu	" VAR(CTR1) ", %%xmm13\n\t"             \
-    "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"   \
-    "vpxor	0(%[KEY]), %%xmm13, %%xmm13\n\t"        \
-    VAESENC_AVX(%%xmm13)                                \
-    "subq	$32, %%rsp\n\t"                         \
-    "xorl	%%ecx, %%ecx\n\t"                       \
-    "vmovdqu	%%xmm13, (%%rsp)\n\t"                   \
-    "vpxor	%%xmm0, %%xmm0, %%xmm0\n\t"             \
-    "vmovdqu	%%xmm0, 16(%%rsp)\n\t"                  \
-    "\n"                                                \
-    "51:\n\t"                                           \
-    "movzbl	(%[in]," VAR(KR64) ",1), %%r13d\n\t"    \
-    "movb	%%r13b, 16(%%rsp,%%rcx,1)\n\t"          \
-    "xorb	(%%rsp,%%rcx,1), %%r13b\n\t"            \
-    "movb	%%r13b, (%[out]," VAR(KR64) ",1)\n\t"   \
-    "incl	" VAR(KR) "\n\t"                        \
-    "incl	%%ecx\n\t"                              \
-    "cmpl	%%edx, " VAR(KR) "\n\t"                 \
-    "jl		51b\n\t"                                \
-    "53:\n\t"                                           \
-    "vmovdqu	16(%%rsp), %%xmm13\n\t"                 \
-    "addq	$32, %%rsp\n\t"                         \
-    "vpshufb	%[BSWAP_MASK], %%xmm13, %%xmm13\n\t"    \
-    "vpxor	%%xmm13, " VAR(XR) ", " VAR(XR) "\n\t"  \
-    GHASH_GFMUL_RED_AVX2(XR, HR, XR)                    \
-
-#define CALC_TAG_AVX2()                                      \
-    "movl	%[nbytes], %%edx\n\t"                        \
-    "movl	%[abytes], %%ecx\n\t"                        \
-    "shlq	$3, %%rdx\n\t"                               \
-    "shlq	$3, %%rcx\n\t"                               \
-    "vpinsrq	$0, %%rdx, %%xmm0, %%xmm0\n\t"               \
-    "vpinsrq	$1, %%rcx, %%xmm0, %%xmm0\n\t"               \
-    "vpxor	%%xmm0, " VAR(XR) ", " VAR(XR) "\n\t"        \
-    GHASH_GFMUL_RED_AVX2(XR, HR, XR)                         \
-    "vpshufb	%[BSWAP_MASK], " VAR(XR) ", " VAR(XR) "\n\t" \
-    "vpxor	" VAR(TR) ", " VAR(XR) ", %%xmm0\n\t"        \
-
-
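CALC_TAG_AVX2 finishes GHASH by folding in the bit lengths of the AAD and
the ciphertext, then masks the result with the encrypted initial counter T
saved at init. Equivalent C (sketch, reusing ghash_mul_ref()):

    #include <stdint.h>

    void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                       uint8_t Z[16]);

    static void gcm_final_tag(uint8_t X[16], const uint8_t H[16],
                              const uint8_t T[16], uint64_t abytes,
                              uint64_t cbytes, uint8_t tag[16])
    {
        uint8_t len[16];
        uint64_t abits = abytes * 8, cbits = cbytes * 8;
        int i;
        for (i = 0; i < 8; i++) {        /* len(A) || len(C), big-endian */
            len[7 - i]  = (uint8_t)(abits >> (8 * i));
            len[15 - i] = (uint8_t)(cbits >> (8 * i));
        }
        for (i = 0; i < 16; i++) X[i] ^= len[i];
        ghash_mul_ref(X, H, X);
        for (i = 0; i < 16; i++) tag[i] = (uint8_t)(X[i] ^ T[i]);
    }
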
-static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out,
-                                 const unsigned char* addt,
-                                 const unsigned char* ivec, unsigned char *tag,
-                                 unsigned int nbytes, unsigned int abytes,
-                                 unsigned int ibytes, unsigned int tbytes,
-                                 const unsigned char* key, int nr)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register unsigned int ivLen asm("ebx") = ibytes;
-
-    __asm__ __volatile__ (
-        "subq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "vpxor		%%xmm13, %%xmm13, %%xmm13\n\t"
-        "vpxor		" VAR(XR) ", " VAR(XR) ", " VAR(XR) "\n\t"
-        "movl		%[ibytes], %%edx\n\t"
-        "cmpl		$12, %%edx\n\t"
-        "jne		35f\n\t"
-        CALC_IV_12_AVX2()
-        "jmp		39f\n\t"
-        "\n"
-        "35:\n\t"
-        CALC_IV_AVX2()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD_AVX2()
-
-        "# Calculate counter and H\n\t"
-        "vpsrlq		$63, " VAR(HR) ", %%xmm5\n\t"
-        "vpsllq		$1, " VAR(HR) ", %%xmm4\n\t"
-        "vpslldq	$8, %%xmm5, %%xmm5\n\t"
-        "vpor		%%xmm5, %%xmm4, %%xmm4\n\t"
-        "vpshufd	$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpsrad		$31, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"
-        "vpand		%[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpaddd		%[ONE], %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm4, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vmovdqu	%%xmm13, " VAR(CTR1) "\n\t"
-
-        "xorl		" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)
-        "cmpl	$128, %[nbytes]\n\t"
-        "movl	%[nbytes], %%r13d\n\t"
-        "jl	5f\n\t"
-        "andl	$0xffffff80, %%r13d\n\t"
-
-        CALC_HT_8_AVX2()
-
-        "# First 128 bytes of input\n\t"
-        VAESENC_128()
-
-        "cmpl	$128, %%r13d\n\t"
-        "movl	$128, " VAR(KR) "\n\t"
-        "jle	2f\n\t"
-
-        "# More 128 bytes of input\n\t"
-        "\n"
-    "3:\n\t"
-        VAESENC_128_GHASH_AVX2(%%rdx, 0)
-        "addl	$128, " VAR(KR) "\n\t"
-        "cmpl	%%r13d, " VAR(KR) "\n\t"
-        "jl	3b\n\t"
-        "\n"
-    "2:\n\t"
-        "vmovdqa	%[BSWAP_MASK], %%xmm13\n\t"
-        "vpshufb	%%xmm13, %%xmm4, %%xmm4\n\t"
-        "vpshufb	%%xmm13, %%xmm5, %%xmm5\n\t"
-        "vpshufb	%%xmm13, %%xmm6, %%xmm6\n\t"
-        "vpshufb	%%xmm13, %%xmm7, %%xmm7\n\t"
-        "vpshufb	%%xmm13, %%xmm8, %%xmm8\n\t"
-        "vpshufb	%%xmm13, %%xmm9, %%xmm9\n\t"
-        "vpshufb	%%xmm13, %%xmm10, %%xmm10\n\t"
-        "vpshufb	%%xmm13, %%xmm11, %%xmm11\n\t"
-        "vpxor		%%xmm2, %%xmm4, %%xmm4\n\t"
-
-        GHASH_GFMUL_RED_8_AVX2()
-
-        "vmovdqu	0(" VAR(HTR) "), " VAR(HR) "\n\t"
-        "\n"
-    "5:\n\t"
-        "movl		%[nbytes], %%edx\n\t"
-        "cmpl		%%edx, " VAR(KR) "\n\t"
-        "jge		55f\n\t"
-#endif
-
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xfffffff0, %%r13d\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		14f\n\t"
-
-        VAESENC_BLOCK_AVX2()
-        "addl		$16, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		13f\n\t"
-        "vmovdqa	%[MOD2_128], %%xmm0\n\t"
-        "\n"
-        "12:\n\t"
-        "vmovdqu	(%[in]," VAR(KR64) ",1), %%xmm9\n\t"
-        "vmovdqu	" VAR(CTR1) ", %%xmm5\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm5, %%xmm4\n\t"
-        "vpaddd		%[ONE], %%xmm5, %%xmm5\n\t"
-        "vmovdqu	%%xmm5, " VAR(CTR1) "\n\t"
-        VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, XR, CTR1)
-        "vmovdqu	%%xmm4, (%[out]," VAR(KR64) ",1)\n\t"
-        "vpshufb	%[BSWAP_MASK], %%xmm4, %%xmm4\n\t"
-        "addl		$16, " VAR(KR) "\n\t"
-        "vpxor		%%xmm4, " VAR(XR) ", " VAR(XR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		12b\n\t"
-        "\n"
-        "13:\n\t"
-        GHASH_GFMUL_RED_AVX2(XR, HR, XR)
-        "\n"
-        "14:\n\t"
-
-        AESENC_LAST15_ENC_AVX2()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG_AVX2()
-        STORE_TAG_AVX()
-        "addq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        "vzeroupper\n\t"
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tbytes),
-          [tag] "r" (tag),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "rdx", "r13"
-    );
-}
-#endif /* HAVE_INTEL_AVX2 */
-#endif /* HAVE_INTEL_AVX1 */
-
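Putting the pieces together, the full-block path of these encrypt routines
amounts to the loop below. A sketch with the helpers assumed earlier; the
assembly additionally interleaves the AES rounds with the GHASH multiplies
and, when unrolled, handles eight blocks per iteration via the H-power
table. The pshufb byte swaps exist only to bridge little-endian registers
to GHASH's byte order and have no counterpart on byte arrays:

    #include <stddef.h>
    #include <stdint.h>

    void aes_encrypt_block(const uint8_t* rk, int nr,
                           const uint8_t in[16], uint8_t out[16]);
    void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                       uint8_t Z[16]);

    static void gcm_ctr_ghash_enc(const uint8_t* rk, int nr,
                                  uint8_t ctr[16], uint8_t X[16],
                                  const uint8_t H[16], const uint8_t* in,
                                  uint8_t* out, size_t n)
    {
        uint8_t ks[16];
        size_t k, j;
        for (k = 0; k + 16 <= n; k += 16) {
            for (j = 16; j > 12; j--)     /* bump 32-bit big-endian ctr */
                if (++ctr[j - 1] != 0)
                    break;
            aes_encrypt_block(rk, nr, ctr, ks);
            for (j = 0; j < 16; j++)
                out[k + j] = (uint8_t)(in[k + j] ^ ks[j]);
            for (j = 0; j < 16; j++)      /* fold ciphertext into hash */
                X[j] ^= out[k + j];
            ghash_mul_ref(X, H, X);
        }
    }
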
-#ifdef HAVE_AES_DECRYPT
-/* Figure 10. AES-GCM – Decrypt With Single Block GHASH at a Time */
-
-static void AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
-                            const unsigned char* addt,
-                            const unsigned char* ivec, const unsigned char *tag,
-                            int nbytes, int abytes, int ibytes, int tbytes,
-                            const unsigned char* key, int nr, int* res)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register int ivLen asm("ebx") = ibytes;
-    register int tagLen asm("edx") = tbytes;
-
-    __asm__ __volatile__ (
-        "pushq		%%rdx\n\t"
-        "subq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "pxor		%%xmm13, %%xmm13\n\t"
-        "pxor		%%xmm15, %%xmm15\n\t"
-        "movl		%[ibytes], %%edx\n\t"
-        "cmpl		$12, %%edx\n\t"
-        "jne		35f\n\t"
-        CALC_IV_12()
-        "\n"
-        "35:\n\t"
-        CALC_IV()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD()
-
-        "# Calculate counter and H\n\t"
-        "pshufb		%[BSWAP_EPI64], %%xmm13\n\t"
-        "movdqa		" VAR(HR) ", %%xmm5\n\t"
-        "paddd		%[ONE], %%xmm13\n\t"
-        "movdqa		" VAR(HR) ", %%xmm4\n\t"
-        "movdqu		%%xmm13, " VAR(CTR1) "\n\t"
-        "psrlq		$63, %%xmm5\n\t"
-        "psllq		$1, %%xmm4\n\t"
-        "pslldq		$8, %%xmm5\n\t"
-        "por		%%xmm5, %%xmm4\n\t"
-        "pshufd		$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "psrad		$31, " VAR(HR) "\n\t"
-        "pand		%[MOD2_128], " VAR(HR) "\n\t"
-        "pxor		%%xmm4, " VAR(HR) "\n\t"
-
-        "xorl		" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-        "cmpl		$128, %[nbytes]\n\t"
-        "jl		5f\n\t"
-
-        CALC_HT_8_AVX()
-
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xffffff80, %%r13d\n\t"
-        "\n"
-        "2:\n\t"
-        AESENC_128_GHASH_AVX(%%rcx, 128)
-        "addl		$128, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		2b\n\t"
-
-        "movdqa		%%xmm2, " VAR(XR) "\n\t"
-        "movdqu		(%%rsp), " VAR(HR) "\n\t"
-    "5:\n\t"
-        "movl		%[nbytes], %%edx\n\t"
-        "cmpl		%%edx, " VAR(KR) "\n\t"
-        "jge		55f\n\t"
-#endif
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xfffffff0, %%r13d\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		13f\n\t"
-
-        "\n"
-        "12:\n\t"
-        "leaq		(%[in]," VAR(KR64) ",1), %%rcx\n\t"
-        "leaq		(%[out]," VAR(KR64) ",1), %%rdx\n\t"
-        "movdqu		(%%rcx), %%xmm1\n\t"
-        "movdqa		" VAR(HR) ", %%xmm0\n\t"
-        "pshufb		%[BSWAP_MASK], %%xmm1\n\t"
-        "pxor		" VAR(XR) ", %%xmm1\n\t"
-        AESENC_GFMUL(%%rcx, %%rdx, %%xmm0, %%xmm1)
-        "addl		$16, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		12b\n\t"
-        "\n"
-        "13:\n\t"
-
-        AESENC_LAST15_DEC_AVX()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG()
-        "addq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        "popq		%%rdx\n\t"
-        CMP_TAG()
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen),
-          [tag] "r" (tag), [res] "r" (res),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "r13"
-    );
-}
-
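AES_GCM_decrypt above (and the AVX variants below) mirrors the encrypt loop
with two differences: CTR mode still runs the AES encrypt rounds, and GHASH
absorbs the received ciphertext before it is XORed with the key stream.
A sketch:

    #include <stddef.h>
    #include <stdint.h>

    void aes_encrypt_block(const uint8_t* rk, int nr,
                           const uint8_t in[16], uint8_t out[16]);
    void ghash_mul_ref(const uint8_t X[16], const uint8_t Y[16],
                       uint8_t Z[16]);

    static void gcm_ctr_ghash_dec(const uint8_t* rk, int nr,
                                  uint8_t ctr[16], uint8_t X[16],
                                  const uint8_t H[16], const uint8_t* in,
                                  uint8_t* out, size_t n)
    {
        uint8_t ks[16];
        size_t k, j;
        for (k = 0; k + 16 <= n; k += 16) {
            for (j = 0; j < 16; j++)      /* hash the ciphertext first */
                X[j] ^= in[k + j];
            ghash_mul_ref(X, H, X);
            for (j = 16; j > 12; j--)
                if (++ctr[j - 1] != 0)
                    break;
            aes_encrypt_block(rk, nr, ctr, ks);  /* CTR still encrypts */
            for (j = 0; j < 16; j++)
                out[k + j] = (uint8_t)(in[k + j] ^ ks[j]);
        }
    }
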
-#ifdef HAVE_INTEL_AVX1
-static void AES_GCM_decrypt_avx1(const unsigned char *in, unsigned char *out,
-                                 const unsigned char* addt,
-                                 const unsigned char* ivec,
-                                 const unsigned char *tag, int nbytes,
-                                 int abytes, int ibytes, int tbytes,
-                                 const unsigned char* key, int nr, int* res)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register int ivLen asm("ebx") = ibytes;
-    register int tagLen asm("edx") = tbytes;
-
-    __asm__ __volatile__ (
-        "pushq		%%rdx\n\t"
-        "subq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "vpxor		%%xmm13, %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm15, %%xmm15, %%xmm15\n\t"
-        "movl		%[ibytes], %%edx\n\t"
-        "cmpl		$12, %%edx\n\t"
-        "jne		35f\n\t"
-        CALC_IV_12_AVX1()
-        "\n"
-        "35:\n\t"
-        CALC_IV_AVX1()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD_AVX1()
-
-        "# Calculate counter and H\n\t"
-        "vpsrlq		$63, " VAR(HR) ", %%xmm5\n\t"
-        "vpsllq		$1, " VAR(HR) ", %%xmm4\n\t"
-        "vpslldq	$8, %%xmm5, %%xmm5\n\t"
-        "vpor		%%xmm5, %%xmm4, %%xmm4\n\t"
-        "vpshufd	$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpsrad		$31, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"
-        "vpand		%[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpaddd		%[ONE], %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm4, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vmovdqu	%%xmm13, " VAR(CTR1) "\n\t"
-
-        "xorl		" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-        "cmpl		$128, %[nbytes]\n\t"
-        "jl		5f\n\t"
-
-        CALC_HT_8_AVX1()
-
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xffffff80, %%r13d\n\t"
-        "\n"
-        "2:\n\t"
-        VAESENC_128_GHASH_AVX1(%%rcx, 128)
-        "addl		$128, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		2b\n\t"
-
-        "vmovdqa	%%xmm2, " VAR(XR) "\n\t"
-        "vmovdqu	(%%rsp), " VAR(HR) "\n\t"
-    "5:\n\t"
-        "movl		%[nbytes], %%edx\n\t"
-        "cmpl		%%edx, " VAR(KR) "\n\t"
-        "jge		55f\n\t"
-#endif
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xfffffff0, %%r13d\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		13f\n\t"
-
-        "\n"
-        "12:\n\t"
-        "vmovdqu	(%[in]," VAR(KR64) ",1), %%xmm9\n\t"
-        "vmovdqa	" VAR(HR) ", %%xmm0\n\t"
-        "vpshufb	%[BSWAP_MASK], %%xmm9, %%xmm1\n\t"
-        "vpxor		" VAR(XR) ", %%xmm1, %%xmm1\n\t"
-        VAESENC_GFMUL(%%xmm9, %%xmm0, %%xmm1)
-        "addl		$16, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		12b\n\t"
-        "\n"
-        "13:\n\t"
-
-        AESENC_LAST15_DEC_AVX1()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG_AVX1()
-        "addq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        "popq		%%rdx\n\t"
-        CMP_TAG_AVX()
-        "vzeroupper\n\t"
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen),
-          [tag] "r" (tag), [res] "r" (res),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX1_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "r13"
-    );
-}
-
-#ifdef HAVE_INTEL_AVX2
-static void AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out,
-                                 const unsigned char* addt,
-                                 const unsigned char* ivec,
-                                 const unsigned char *tag, int nbytes,
-                                 int abytes, int ibytes, int tbytes,
-                                 const unsigned char* key, int nr, int* res)
-{
-    register const unsigned char* iv asm("rax") = ivec;
-    register int ivLen asm("ebx") = ibytes;
-    register int tagLen asm("edx") = tbytes;
-
-    __asm__ __volatile__ (
-        "pushq		%%rdx\n\t"
-        "subq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        /* Counter is xmm13 */
-        "vpxor		%%xmm13, %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm15, %%xmm15, %%xmm15\n\t"
-        "movl		%[ibytes], %%edx\n\t"
-        "cmpl		$12, %%edx\n\t"
-        "jne		35f\n\t"
-        CALC_IV_12_AVX2()
-        "jmp		39f\n\t"
-        "\n"
-        "35:\n\t"
-        CALC_IV_AVX2()
-        "\n"
-        "39:\n\t"
-
-        CALC_AAD_AVX2()
-
-        "# Calculate counter and H\n\t"
-        "vpsrlq		$63, " VAR(HR) ", %%xmm5\n\t"
-        "vpsllq		$1, " VAR(HR) ", %%xmm4\n\t"
-        "vpslldq	$8, %%xmm5, %%xmm5\n\t"
-        "vpor		%%xmm5, %%xmm4, %%xmm4\n\t"
-        "vpshufd	$0xff, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpsrad		$31, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm13, %%xmm13\n\t"
-        "vpand		%[MOD2_128], " VAR(HR) ", " VAR(HR) "\n\t"
-        "vpaddd		%[ONE], %%xmm13, %%xmm13\n\t"
-        "vpxor		%%xmm4, " VAR(HR) ", " VAR(HR) "\n\t"
-        "vmovdqu	%%xmm13, " VAR(CTR1) "\n\t"
-
-        "xorl		" VAR(KR) ", " VAR(KR) "\n\t"
-
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)
-        "cmpl		$128, %[nbytes]\n\t"
-        "jl		5f\n\t"
-
-        CALC_HT_8_AVX2()
-
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xffffff80, %%r13d\n\t"
-        "\n"
-        "2:\n\t"
-        VAESENC_128_GHASH_AVX2(%%rcx, 128)
-        "addl		$128, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		2b\n\t"
-
-        "vmovdqa	%%xmm2, " VAR(XR) "\n\t"
-        "vmovdqu	(%%rsp), " VAR(HR) "\n\t"
-    "5:\n\t"
-        "movl		%[nbytes], %%edx\n\t"
-        "cmpl		%%edx, " VAR(KR) "\n\t"
-        "jge		55f\n\t"
-#endif
-        "movl		%[nbytes], %%r13d\n\t"
-        "andl		$0xfffffff0, %%r13d\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jge		13f\n\t"
-
-        "vmovdqa	%[MOD2_128], %%xmm0\n\t"
-        "\n"
-        "12:\n\t"
-        "vmovdqu	(%[in]," VAR(KR64) ",1), %%xmm9\n\t"
-        "vmovdqu	" VAR(CTR1) ", %%xmm5\n\t"
-        "vpshufb	%[BSWAP_MASK], %%xmm9, %%xmm1\n\t"
-        "vpshufb	%[BSWAP_EPI64], %%xmm5, %%xmm4\n\t"
-        "vpaddd		%[ONE], %%xmm5, %%xmm5\n\t"
-        "vpxor		" VAR(XR) ", %%xmm1, %%xmm1\n\t"
-        "vmovdqu	%%xmm5, " VAR(CTR1) "\n\t"
-        VAESENC_GFMUL_SB_AVX2(%%xmm9, HR, %%xmm1, CTR1)
-        "vmovdqu	%%xmm4, (%[out]," VAR(KR64) ",1)\n\t"
-        "addl		$16, " VAR(KR) "\n\t"
-        "cmpl		%%r13d, " VAR(KR) "\n\t"
-        "jl		12b\n\t"
-        "\n"
-        "13:\n\t"
-
-        AESENC_LAST15_DEC_AVX2()
-        "\n"
-        "55:\n\t"
-
-        CALC_TAG_AVX2()
-        "addq		$" VAR(STACK_OFFSET) ", %%rsp\n\t"
-        "popq		%%rdx\n\t"
-        CMP_TAG_AVX()
-        "vzeroupper\n\t"
-
-        :
-        : [KEY] "r" (key),
-          [in] "r" (in), [out] "r" (out), [nr] "r" (nr),
-          [nbytes] "r" (nbytes), [abytes] "r" (abytes), [addt] "r" (addt),
-          [ivec] "r" (iv), [ibytes] "r" (ivLen), [tbytes] "r" (tagLen),
-          [tag] "r" (tag), [res] "r" (res),
-          [BSWAP_MASK] "m" (BSWAP_MASK),
-          [BSWAP_EPI64] "m" (BSWAP_EPI64),
-          [ONE] "m" (ONE),
-#if !defined(AES_GCM_AESNI_NO_UNROLL) && !defined(AES_GCM_AVX2_NO_UNROLL)
-          [TWO] "m" (TWO), [THREE] "m" (THREE), [FOUR] "m" (FOUR),
-          [FIVE] "m" (FIVE), [SIX] "m" (SIX), [SEVEN] "m" (SEVEN),
-          [EIGHT] "m" (EIGHT),
-#endif
-          [MOD2_128] "m" (MOD2_128)
-        : "xmm15", "xmm14", "xmm13", "xmm12",
-          "xmm0", "xmm1", "xmm2", "xmm3", "memory",
-          "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11",
-          "rcx", "r13"
-    );
-}
-#endif /* HAVE_INTEL_AVX2 */
-#endif /* HAVE_INTEL_AVX1 */
-#endif /* HAVE_AES_DECRYPT */
-
-#else /* _MSC_VER */
+static const __m128i BSWAP_EPI64 =
+        M128_INIT(0x0001020304050607, 0x08090a0b0c0d0e0f);
+static const __m128i BSWAP_MASK =
+        M128_INIT(0x08090a0b0c0d0e0f, 0x0001020304050607);
+
+
 /* The following are for MSC based builds which do not allow
  * inline assembly. Intrinsic functions are used instead. */
 
@@ -7013,7 +4607,7 @@
     __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 #endif
 
-    if (ibytes == 12)
+    if (ibytes == GCM_NONCE_MID_SZ)
         aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T);
     else
         aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T);
@@ -7451,7 +5045,7 @@
     __m128i tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 #endif /* AES_GCM_AESNI_NO_UNROLL */
 
-    if (ibytes == 12)
+    if (ibytes == GCM_NONCE_MID_SZ)
         aes_gcm_calc_iv_12(KEY, ivec, nr, H, Y, T);
     else
         aes_gcm_calc_iv(KEY, ivec, ibytes, nr, H, Y, T);
@@ -8070,6 +5664,13 @@
             x[1] ^= bigA[1];
             GMULT(x, bigH);
         }
+#ifdef OPENSSL_EXTRA
+        /* store AAD partial tag for next call */
+        aes->aadH[0] = (word32)((x[0] & 0xFFFFFFFF00000000) >> 32);
+        aes->aadH[1] = (word32)(x[0] & 0xFFFFFFFF);
+        aes->aadH[2] = (word32)((x[1] & 0xFFFFFFFF00000000) >> 32);
+        aes->aadH[3] = (word32)(x[1] & 0xFFFFFFFF);
+#endif
     }
 
     /* Hash in C, the Ciphertext */
@@ -8077,6 +5678,13 @@
         word64 bigC[2];
         blocks = cSz / AES_BLOCK_SIZE;
         partial = cSz % AES_BLOCK_SIZE;
+#ifdef OPENSSL_EXTRA
+        /* Start from last AAD partial tag */
+        if (aes->aadLen) {
+            x[0] = ((word64)aes->aadH[0]) << 32 | aes->aadH[1];
+            x[1] = ((word64)aes->aadH[2]) << 32 | aes->aadH[3];
+        }
+#endif
         while (blocks--) {
             XMEMCPY(bigC, c, AES_BLOCK_SIZE);
             #ifdef LITTLE_ENDIAN_ORDER
@@ -8103,7 +5711,10 @@
     {
         word64 len[2];
         len[0] = aSz; len[1] = cSz;
-
+#ifdef OPENSSL_EXTRA
+        if (aes->aadLen)
+            len[0] = (word64)aes->aadLen;
+#endif
         /* Lengths are in bytes. Convert to bits. */
         len[0] *= 8;
         len[1] *= 8;
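For reference, the length block finalized in this hunk packs the AAD and ciphertext sizes as two big-endian 64-bit bit counts. A minimal stand-alone sketch (hypothetical helper, not part of this file):

    /* Build GCM's closing GHASH block: [len(A)]64 || [len(C)]64 in bits,
     * big-endian; aSz/cSz are byte counts as in the code above. */
    static void gcm_len_block(unsigned char blk[16],
                              unsigned long long aSz, unsigned long long cSz)
    {
        int i;
        for (i = 0; i < 8; i++) {
            blk[7 - i]  = (unsigned char)((aSz * 8) >> (8 * i));
            blk[15 - i] = (unsigned char)((cSz * 8) >> (8 * i));
        }
    }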
@@ -8269,7 +5880,7 @@
 #endif /* end GCM_WORD32 */
 
 
-#if !defined(WOLFSSL_XILINX_CRYPT)
+#if !defined(WOLFSSL_XILINX_CRYPT) && !defined(WOLFSSL_AFALG_XILINX_AES)
 #ifdef FREESCALE_LTC_AES_GCM
 int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
                    const byte* iv, word32 ivSz,
@@ -8280,7 +5891,7 @@
     word32 keySize;
 
     /* argument checks */
-    if (aes == NULL || authTagSz > AES_BLOCK_SIZE) {
+    if (aes == NULL || authTagSz > AES_BLOCK_SIZE || ivSz == 0) {
         return BAD_FUNC_ARG;
     }
 
@@ -8298,43 +5909,66 @@
 
     return (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E;
 }
+
 #else
-#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \
-                              defined(WOLFSSL_STM32F7) || \
-                              defined(WOLFSSL_STM32L4))
-
-static WC_INLINE int wc_AesGcmEncrypt_STM32(Aes* aes, byte* out, const byte* in,
-                                         word32 sz, const byte* iv, word32 ivSz,
-                                         byte* authTag, word32 authTagSz,
-                                         const byte* authIn, word32 authInSz)
+
+#ifdef STM32_CRYPTO_AES_GCM
+
+/* this function supports in-place encryption, i.e. in == out */
+static int wc_AesGcmEncrypt_STM32(Aes* aes, byte* out, const byte* in, word32 sz,
+                                  const byte* iv, word32 ivSz,
+                                  byte* authTag, word32 authTagSz,
+                                  const byte* authIn, word32 authInSz)
 {
     int ret;
+#ifdef WOLFSSL_STM32_CUBEMX
+    CRYP_HandleTypeDef hcryp;
+#else
+    word32 keyCopy[AES_256_KEY_SIZE/sizeof(word32)];
+#endif
     word32 keySize;
-    byte initialCounter[AES_BLOCK_SIZE];
-    #ifdef WOLFSSL_STM32_CUBEMX
-        CRYP_HandleTypeDef hcryp;
-    #else
-        byte keyCopy[AES_BLOCK_SIZE * 2];
-    #endif /* WOLFSSL_STM32_CUBEMX */
-    int status = 0;
+    int status = HAL_OK;
+    word32 blocks = sz / AES_BLOCK_SIZE;
+    word32 partial = sz % AES_BLOCK_SIZE;
+    byte tag[AES_BLOCK_SIZE];
+    byte partialBlock[AES_BLOCK_SIZE];
+    byte ctr[AES_BLOCK_SIZE];
     byte* authInPadded = NULL;
-    byte tag[AES_BLOCK_SIZE];
     int authPadSz;
 
     ret = wc_AesGetKeySize(aes, &keySize);
     if (ret != 0)
         return ret;
 
-    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
-    XMEMCPY(initialCounter, iv, ivSz);
-    initialCounter[AES_BLOCK_SIZE - 1] = STM32_GCM_IV_START;
-
-    /* pad authIn if it is not a block multiple */
-    if ((authInSz % AES_BLOCK_SIZE) != 0) {
+#ifdef WOLFSSL_STM32_CUBEMX
+    ret = wc_Stm32_Aes_Init(aes, &hcryp);
+    if (ret != 0)
+        return ret;
+#endif
+
+    ret = wolfSSL_CryptHwMutexLock();
+    if (ret != 0) {
+        return ret;
+    }
+
+    XMEMSET(ctr, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(ctr, iv, ivSz);
+        ctr[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, ctr, AES_BLOCK_SIZE);
+    }
+    /* Hardware requires counter + 1 */
+    IncrementGcmCounter(ctr);
+
+    if (authInSz == 0 || (authInSz % AES_BLOCK_SIZE) != 0) {
+        /* Need to pad the AAD to a full block with zeros. */
         authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE;
-        /* Need to pad the AAD to a full block with zeros. */
-        authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+        authInPadded = (byte*)XMALLOC(authPadSz, aes->heap,
+            DYNAMIC_TYPE_TMP_BUFFER);
         if (authInPadded == NULL) {
+            wolfSSL_CryptHwMutexUnLock();
             return MEMORY_E;
         }
         XMEMSET(authInPadded, 0, authPadSz);
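A note on the counter setup in this hunk, per NIST SP 800-38D (names below are from this file):

    /*   J0 = IV || 0^31 || 1                                  (12-byte IV)
     *   J0 = GHASH_H(IV zero-padded to a block || [0]64 || [len(IV)]64)
     *                                                         (otherwise)
     * The ST hardware starts the payload keystream at inc32(J0), which is
     * why IncrementGcmCounter(ctr) runs before the payload phase and why
     * the software tag fallback later decrements it again. */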
@@ -8344,32 +5978,12 @@
         authInPadded = (byte*)authIn;
     }
 
-
 #ifdef WOLFSSL_STM32_CUBEMX
-    XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-    switch (keySize) {
-        case 16: /* 128-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-            break;
-#ifdef CRYP_KEYSIZE_192B
-        case 24: /* 192-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-            break;
-#endif
-    	case 32: /* 256-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-            break;
-        default:
-            break;
-    }
-    hcryp.Instance = CRYP;
-    hcryp.Init.DataType = CRYP_DATATYPE_8B;
-    hcryp.Init.pKey = (byte*)aes->key;
-    hcryp.Init.pInitVect = initialCounter;
-    hcryp.Init.Header = authInPadded;
+    hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)ctr;
+    hcryp.Init.Header = (STM_CRYPT_TYPE*)authInPadded;
     hcryp.Init.HeaderSize = authInSz;
 
-#ifdef WOLFSSL_STM32L4
+#ifdef STM32_CRYPTO_AES_ONLY
     /* Set the CRYP parameters */
     hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_GCM_GMAC;
     hcryp.Init.OperatingMode = CRYP_ALGOMODE_ENCRYPT;
@@ -8382,24 +5996,59 @@
         /* GCM header phase */
         hcryp.Init.GCMCMACPhase  = CRYP_HEADER_PHASE;
         status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT);
-        if (status == HAL_OK) {
-            /* GCM payload phase */
-            hcryp.Init.GCMCMACPhase  = CRYP_PAYLOAD_PHASE;
-            status = HAL_CRYPEx_AES_Auth(&hcryp, (byte*)in, sz, out, STM32_HAL_TIMEOUT);
-            if (status == HAL_OK) {
-                /* GCM final phase */
-                hcryp.Init.GCMCMACPhase  = CRYP_FINAL_PHASE;
-                status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, tag, STM32_HAL_TIMEOUT);
-            }
-        }
+    }
+    if (status == HAL_OK) {
+        /* GCM payload phase - blocks */
+        hcryp.Init.GCMCMACPhase  = CRYP_PAYLOAD_PHASE;
+        if (blocks) {
+            status = HAL_CRYPEx_AES_Auth(&hcryp, (byte*)in,
+                (blocks * AES_BLOCK_SIZE), out, STM32_HAL_TIMEOUT);
+        }
+    }
+    if (status == HAL_OK && (partial != 0 || blocks == 0)) {
+        /* GCM payload phase - partial remainder */
+        XMEMSET(partialBlock, 0, sizeof(partialBlock));
+        XMEMCPY(partialBlock, in + (blocks * AES_BLOCK_SIZE), partial);
+        status = HAL_CRYPEx_AES_Auth(&hcryp, partialBlock, partial,
+            partialBlock, STM32_HAL_TIMEOUT);
+        XMEMCPY(out + (blocks * AES_BLOCK_SIZE), partialBlock, partial);
+    }
+    if (status == HAL_OK) {
+        /* GCM final phase */
+        hcryp.Init.GCMCMACPhase  = CRYP_FINAL_PHASE;
+        status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, tag, STM32_HAL_TIMEOUT);
+    }
+#elif defined(STM32_HAL_V2)
+    hcryp.Init.Algorithm  = CRYP_AES_GCM;
+    ByteReverseWords((word32*)partialBlock, (word32*)ctr, AES_BLOCK_SIZE);
+    hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)partialBlock;
+    HAL_CRYP_Init(&hcryp);
+
+    /* GCM payload phase - can handle partial blocks */
+    status = HAL_CRYP_Encrypt(&hcryp, (uint32_t*)in,
+        (blocks * AES_BLOCK_SIZE) + partial, (uint32_t*)out, STM32_HAL_TIMEOUT);
+    if (status == HAL_OK) {
+        /* Compute the authTag */
+        status = HAL_CRYPEx_AESGCM_GenerateAuthTAG(&hcryp, (uint32_t*)tag,
+            STM32_HAL_TIMEOUT);
     }
 #else
     HAL_CRYP_Init(&hcryp);
-
-    status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, (byte*)in, sz,
-                                       out, STM32_HAL_TIMEOUT);
-    /* Compute the authTag */
+    if (blocks) {
+        /* GCM payload phase - blocks */
+        status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, (byte*)in,
+            (blocks * AES_BLOCK_SIZE), out, STM32_HAL_TIMEOUT);
+    }
+    if (status == HAL_OK && (partial != 0 || blocks == 0)) {
+        /* GCM payload phase - partial remainder */
+        XMEMSET(partialBlock, 0, sizeof(partialBlock));
+        XMEMCPY(partialBlock, in + (blocks * AES_BLOCK_SIZE), partial);
+        status = HAL_CRYPEx_AESGCM_Encrypt(&hcryp, partialBlock, partial,
+            partialBlock, STM32_HAL_TIMEOUT);
+        XMEMCPY(out + (blocks * AES_BLOCK_SIZE), partialBlock, partial);
+    }
     if (status == HAL_OK) {
+        /* Compute the authTag */
         status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT);
     }
 #endif
@@ -8407,29 +6056,46 @@
     if (status != HAL_OK)
         ret = AES_GCM_AUTH_E;
     HAL_CRYP_DeInit(&hcryp);
-#else
-    ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize);
-    status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)initialCounter,
-                         (uint8_t*)keyCopy,     keySize * 8,
-                         (uint8_t*)in,          sz,
-                         (uint8_t*)authInPadded,authInSz,
-                         (uint8_t*)out,         tag);
+
+#else /* STD_PERI_LIB */
+    ByteReverseWords(keyCopy, (word32*)aes->key, keySize);
+    status = CRYP_AES_GCM(MODE_ENCRYPT, (uint8_t*)ctr,
+                         (uint8_t*)keyCopy,      keySize * 8,
+                         (uint8_t*)in,           sz,
+                         (uint8_t*)authInPadded, authInSz,
+                         (uint8_t*)out,          tag);
     if (status != SUCCESS)
         ret = AES_GCM_AUTH_E;
 #endif /* WOLFSSL_STM32_CUBEMX */
 
-    /* authTag may be shorter than AES_BLOCK_SZ, store separately */
-    if (ret == 0)
-    	XMEMCPY(authTag, tag, authTagSz);
-
-    /* We only allocate extra memory if authInPadded is not a multiple of AES_BLOCK_SZ */
-    if (authInPadded != NULL && authInSz != authPadSz) {
+    if (ret == 0) {
+        /* return authTag */
+        if (authTag) {
+            /* STM32 GCM hardware cannot compute the auth tag correctly
+                for a partial block or an IV that is not 12 bytes, so
+                use software here */
+            if (sz == 0 || partial != 0 || ivSz != GCM_NONCE_MID_SZ) {
+                DecrementGcmCounter(ctr); /* hardware requires +1, so subtract it */
+                GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+                wc_AesEncrypt(aes, ctr, tag);
+                xorbuf(authTag, tag, authTagSz);
+            }
+            else {
+                XMEMCPY(authTag, tag, authTagSz);
+            }
+        }
+    }
+
+    /* Free the padded AAD buffer if one was allocated */
+    if (authInPadded != authIn) {
         XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
     }
 
+    wolfSSL_CryptHwMutexUnLock();
+
     return ret;
 }
-#endif /* STM32_CRYPTO */
+
+#endif /* STM32_CRYPTO_AES_GCM */
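The software tag fallback above is the standard GCM tag equation; a sketch using this file's helpers (J0 stands for the counter before the +1, a name assumed for illustration):

    byte S[AES_BLOCK_SIZE];       /* S = GHASH(H, A, C)          */
    byte EKJ0[AES_BLOCK_SIZE];    /* E(K, J0)                    */
    GHASH(aes, authIn, authInSz, out, sz, S, sizeof(S));
    wc_AesEncrypt(aes, J0, EKJ0);
    xorbuf(S, EKJ0, authTagSz);   /* T = MSB_t(E(K, J0) XOR S)   */
    XMEMCPY(authTag, S, authTagSz);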
 
 #ifdef WOLFSSL_AESNI
 int AES_GCM_encrypt_C(Aes* aes, byte* out, const byte* in, word32 sz,
@@ -8453,21 +6119,31 @@
     byte initialCounter[AES_BLOCK_SIZE];
     byte *ctr;
     byte scratch[AES_BLOCK_SIZE];
-
+#ifdef OPENSSL_EXTRA
+    word32 aadTemp;
+#endif
     ctr = counter;
     XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
+    XMEMSET(scratch, 0, AES_BLOCK_SIZE);
     if (ivSz == GCM_NONCE_MID_SZ) {
         XMEMCPY(initialCounter, iv, ivSz);
         initialCounter[AES_BLOCK_SIZE - 1] = 1;
     }
     else {
+#ifdef OPENSSL_EXTRA
+        aadTemp = aes->aadLen;
+        aes->aadLen = 0;
+#endif
         GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+#ifdef OPENSSL_EXTRA
+        aes->aadLen = aadTemp;
+#endif
     }
     XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
 
 #ifdef WOLFSSL_PIC32MZ_CRYPT
     if (blocks) {
-        /* use intitial IV for PIC32 HW, but don't use it below */
+        /* use initial IV for HW, but don't use it below */
         XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE);
 
         ret = wc_Pic32AesCrypt(
@@ -8483,7 +6159,7 @@
 #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT)
     /* some hardware acceleration can gain performance from doing AES encryption
      * of the whole buffer at once */
-    if (c != p) { /* can not handle inline encryption */
+    if (c != p && blocks > 0) { /* can not handle inline encryption */
         while (blocks--) {
             IncrementGcmCounter(ctr);
             XMEMCPY(c, ctr, AES_BLOCK_SIZE);
@@ -8497,11 +6173,11 @@
         p += AES_BLOCK_SIZE * blocks;
     }
     else
-#endif /* HAVE_AES_ECB */
+#endif /* HAVE_AES_ECB && !WOLFSSL_PIC32MZ_CRYPT */
 
     while (blocks--) {
         IncrementGcmCounter(ctr);
-    #ifndef WOLFSSL_PIC32MZ_CRYPT
+    #if !defined(WOLFSSL_PIC32MZ_CRYPT)
         wc_AesEncrypt(aes, ctr, scratch);
         xorbuf(scratch, p, AES_BLOCK_SIZE);
         XMEMCPY(c, scratch, AES_BLOCK_SIZE);
@@ -8516,21 +6192,28 @@
         xorbuf(scratch, p, partial);
         XMEMCPY(c, scratch, partial);
     }
-
-    GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
-    wc_AesEncrypt(aes, initialCounter, scratch);
-    xorbuf(authTag, scratch, authTagSz);
+    if (authTag) {
+        GHASH(aes, authIn, authInSz, out, sz, authTag, authTagSz);
+        wc_AesEncrypt(aes, initialCounter, scratch);
+        xorbuf(authTag, scratch, authTagSz);
+#ifdef OPENSSL_EXTRA
+        if (!in && !sz)
+            /* store AAD size for next call */
+            aes->aadLen = authInSz;
+#endif
+    }
 
     return ret;
 }
 
+/* Software AES - GCM Encrypt */
 int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
                    const byte* iv, word32 ivSz,
                    byte* authTag, word32 authTagSz,
                    const byte* authIn, word32 authInSz)
 {
     /* argument checks */
-    if (aes == NULL || authTagSz > AES_BLOCK_SIZE) {
+    if (aes == NULL || authTagSz > AES_BLOCK_SIZE || ivSz == 0) {
         return BAD_FUNC_ARG;
     }
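A caller-side usage sketch for this API; key, iv, plain, cipher, aad and their sizes are assumed buffers:

    Aes enc;
    byte tag[AES_BLOCK_SIZE];
    int ret = wc_AesInit(&enc, NULL, INVALID_DEVID);
    if (ret == 0)
        ret = wc_AesGcmSetKey(&enc, key, 16);
    if (ret == 0)
        ret = wc_AesGcmEncrypt(&enc, cipher, plain, plainSz,
                               iv, GCM_NONCE_MID_SZ,
                               tag, sizeof(tag), aad, aadSz);
    wc_AesFree(&enc);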
 
@@ -8539,27 +6222,13 @@
         return BAD_FUNC_ARG;
     }
 
-#if defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \
-                              defined(WOLFSSL_STM32F7) || \
-                              defined(WOLFSSL_STM32L4))
-
-    /* additional argument checks - STM32 HW only supports 12 byte IV */
-    if (ivSz != GCM_NONCE_MID_SZ) {
-        return BAD_FUNC_ARG;
-    }
-
-    /* STM32 HW AES-GCM requires / assumes inputs are a multiple of block size.
-     * We can avoid this by zero padding (authIn) AAD, but zero-padded plaintext
-     * will be encrypted and output incorrectly, causing a bad authTag.
-     * We will use HW accelerated AES-GCM if plain%AES_BLOCK_SZ==0.
-     * Otherwise, we will use accelerated AES_CTR for encrypt, and then
-     * perform GHASH in software.
-     * See NIST SP 800-38D */
-
-    /* Plain text is a multiple of block size, so use HW-Accelerated AES_GCM */
-    if (sz % AES_BLOCK_SIZE == 0) {
-        return wc_AesGcmEncrypt_STM32(aes, out, in, sz, iv, ivSz,
-                                      authTag, authTagSz, authIn, authInSz);
+#ifdef WOLF_CRYPTO_CB
+    if (aes->devId != INVALID_DEVID) {
+        int ret = wc_CryptoCb_AesGcmEncrypt(aes, out, in, sz, iv, ivSz,
+            authTag, authTagSz, authIn, authInSz);
+        if (ret != CRYPTOCB_UNAVAILABLE)
+            return ret;
+        /* fall-through when unavailable */
     }
 #endif
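How the devId fall-through above is typically driven, assuming the crypto callback API from wolfssl/wolfcrypt/cryptocb.h; the id value and the myCryptoCb handler are assumptions:

    #define MY_DEV_ID 1
    /* myCryptoCb returns CRYPTOCB_UNAVAILABLE for anything it does not
     * handle, which drops execution into the software path below. */
    wc_CryptoCb_RegisterDevice(MY_DEV_ID, myCryptoCb, NULL);
    wc_AesInit(&aes, NULL, MY_DEV_ID);   /* sets aes->devId */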
 
@@ -8572,13 +6241,13 @@
         #ifdef HAVE_CAVIUM_V
         if (authInSz == 20) { /* Nitrox V GCM only works with 20-byte AAD */
             return NitroxAesGcmEncrypt(aes, out, in, sz,
-                (const byte*)aes->asyncKey, aes->keylen, iv, ivSz,
+                (const byte*)aes->devKey, aes->keylen, iv, ivSz,
                 authTag, authTagSz, authIn, authInSz);
         }
         #endif
     #elif defined(HAVE_INTEL_QA)
         return IntelQaSymAesGcmEncrypt(&aes->asyncDev, out, in, sz,
-            (const byte*)aes->asyncKey, aes->keylen, iv, ivSz,
+            (const byte*)aes->devKey, aes->keylen, iv, ivSz,
             authTag, authTagSz, authIn, authInSz);
     #else /* WOLFSSL_ASYNC_CRYPT_TEST */
         if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_ENCRYPT)) {
@@ -8599,7 +6268,17 @@
     }
 #endif /* WOLFSSL_ASYNC_CRYPT */
 
-    /* Software AES-GCM */
+#ifdef STM32_CRYPTO_AES_GCM
+    /* The STM32 standard peripheral library API does not support partial blocks */
+    #ifdef STD_PERI_LIB
+    if ((sz % AES_BLOCK_SIZE) == 0)
+    #endif
+    {
+        return wc_AesGcmEncrypt_STM32(
+            aes, out, in, sz, iv, ivSz,
+            authTag, authTagSz, authIn, authInSz);
+    }
+#endif /* STM32_CRYPTO_AES_GCM */
 
 #ifdef WOLFSSL_AESNI
     #ifdef HAVE_INTEL_AVX2
@@ -8633,6 +6312,8 @@
 #endif
 
 
+
+/* AES GCM Decrypt */
 #if defined(HAVE_AES_DECRYPT) || defined(HAVE_AESGCM_DECRYPT)
 #ifdef FREESCALE_LTC_AES_GCM
 int  wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
@@ -8645,8 +6326,12 @@
     status_t status;
 
     /* argument checks */
-    if (aes == NULL || out == NULL || in == NULL || iv == NULL ||
-        authTag == NULL || authTagSz > AES_BLOCK_SIZE) {
+    /* If the sz is non-zero, both in and out must be set. If sz is 0,
+     * in and out are don't cares, as this is the GMAC case. */
+    if (aes == NULL || iv == NULL || (sz != 0 && (in == NULL || out == NULL)) ||
+        authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0 ||
+        ivSz == 0) {
+
         return BAD_FUNC_ARG;
     }
 
@@ -8660,72 +6345,66 @@
 
     return (status == kStatus_Success) ? 0 : AES_GCM_AUTH_E;
 }
-#elif defined(STM32_CRYPTO) && (defined(WOLFSSL_STM32F4) || \
-                                defined(WOLFSSL_STM32F7) || \
-                                defined(WOLFSSL_STM32L4))
-int  wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
-                   const byte* iv, word32 ivSz,
-                   const byte* authTag, word32 authTagSz,
-                   const byte* authIn, word32 authInSz)
+
+#else
+
+#ifdef STM32_CRYPTO_AES_GCM
+/* this function supports in-place decryption, i.e. in == out */
+static int wc_AesGcmDecrypt_STM32(Aes* aes, byte* out,
+                                  const byte* in, word32 sz,
+                                  const byte* iv, word32 ivSz,
+                                  const byte* authTag, word32 authTagSz,
+                                  const byte* authIn, word32 authInSz)
 {
     int ret;
+#ifdef WOLFSSL_STM32_CUBEMX
+    CRYP_HandleTypeDef hcryp;
+#else
+    word32 keyCopy[AES_256_KEY_SIZE/sizeof(word32)];
+#endif
     word32 keySize;
-    #ifdef WOLFSSL_STM32_CUBEMX
-        CRYP_HandleTypeDef hcryp;
-    #else
-        byte keyCopy[AES_BLOCK_SIZE * 2];
-    #endif /* WOLFSSL_STM32_CUBEMX */
-    int  status;
-    int  inPadSz, authPadSz;
+    int status = HAL_OK;
+    word32 blocks = sz / AES_BLOCK_SIZE;
+    word32 partial = sz % AES_BLOCK_SIZE;
     byte tag[AES_BLOCK_SIZE];
-    byte *inPadded = NULL;
-    byte *authInPadded = NULL;
-    byte initialCounter[AES_BLOCK_SIZE];
-
-    /* argument checks */
-    if (aes == NULL || out == NULL || in == NULL || iv == NULL ||
-        authTag == NULL || authTagSz > AES_BLOCK_SIZE) {
-        return BAD_FUNC_ARG;
-    }
+    byte partialBlock[AES_BLOCK_SIZE];
+    byte ctr[AES_BLOCK_SIZE];
+    byte* authInPadded = NULL;
+    int authPadSz;
 
     ret = wc_AesGetKeySize(aes, &keySize);
+    if (ret != 0)
+        return ret;
+
+#ifdef WOLFSSL_STM32_CUBEMX
+    ret = wc_Stm32_Aes_Init(aes, &hcryp);
+    if (ret != 0)
+        return ret;
+#endif
+
+    ret = wolfSSL_CryptHwMutexLock();
     if (ret != 0) {
         return ret;
     }
 
-    /* additional argument checks - STM32 HW only supports 12 byte IV */
-    if (ivSz != GCM_NONCE_MID_SZ) {
-        return BAD_FUNC_ARG;
-    }
-
-    XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
-    XMEMCPY(initialCounter, iv, ivSz);
-    initialCounter[AES_BLOCK_SIZE - 1] = STM32_GCM_IV_START;
-
-    /* Need to pad the AAD and input cipher text to a full block size since
-     * CRYP_AES_GCM will assume these are a multiple of AES_BLOCK_SIZE.
-     * It is okay to pad with zeros because GCM does this before GHASH already.
-     * See NIST SP 800-38D */
-
-    if ((sz % AES_BLOCK_SIZE) > 0) {
-        inPadSz = ((sz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE;
-        inPadded = XMALLOC(inPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
-        if (inPadded == NULL) {
-            return MEMORY_E;
-        }
-        XMEMSET(inPadded, 0, inPadSz);
-        XMEMCPY(inPadded, in, sz);
-    } else {
-        inPadSz = sz;
-        inPadded = (byte*)in;
-    }
-
-    if ((authInSz % AES_BLOCK_SIZE) > 0) {
+    XMEMSET(ctr, 0, AES_BLOCK_SIZE);
+    if (ivSz == GCM_NONCE_MID_SZ) {
+        XMEMCPY(ctr, iv, ivSz);
+        ctr[AES_BLOCK_SIZE - 1] = 1;
+    }
+    else {
+        GHASH(aes, NULL, 0, iv, ivSz, ctr, AES_BLOCK_SIZE);
+    }
+    /* Hardware requires counter + 1 */
+    IncrementGcmCounter(ctr);
+
+    if (authInSz == 0 || (authInSz % AES_BLOCK_SIZE) != 0) {
+        /* Need to pad the AAD to a full block with zeros. */
         authPadSz = ((authInSz / AES_BLOCK_SIZE) + 1) * AES_BLOCK_SIZE;
-        authInPadded = XMALLOC(authPadSz, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+        authInPadded = (byte*)XMALLOC(authPadSz, aes->heap,
+            DYNAMIC_TYPE_TMP_BUFFER);
         if (authInPadded == NULL) {
-            if (inPadded != NULL && inPadSz != sz)
-                XFREE(inPadded , aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+            wolfSSL_CryptHwMutexUnLock();
             return MEMORY_E;
         }
         XMEMSET(authInPadded, 0, authPadSz);
@@ -8736,30 +6415,11 @@
     }
 
 #ifdef WOLFSSL_STM32_CUBEMX
-    XMEMSET(&hcryp, 0, sizeof(CRYP_HandleTypeDef));
-    switch(keySize) {
-        case 16: /* 128-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_128B;
-            break;
-#ifdef CRYP_KEYSIZE_192B
-        case 24: /* 192-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_192B;
-            break;
-#endif
-        case 32: /* 256-bit key */
-            hcryp.Init.KeySize = CRYP_KEYSIZE_256B;
-            break;
-        default:
-            break;
-    }
-    hcryp.Instance = CRYP;
-    hcryp.Init.DataType = CRYP_DATATYPE_8B;
-    hcryp.Init.pKey = (byte*)aes->key;
-    hcryp.Init.pInitVect = initialCounter;
-    hcryp.Init.Header = authInPadded;
+    hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)ctr;
+    hcryp.Init.Header = (STM_CRYPT_TYPE*)authInPadded;
     hcryp.Init.HeaderSize = authInSz;
 
-#ifdef WOLFSSL_STM32L4
+#ifdef STM32_CRYPTO_AES_ONLY
     /* Set the CRYP parameters */
     hcryp.Init.ChainingMode  = CRYP_CHAINMODE_AES_GCM_GMAC;
     hcryp.Init.OperatingMode = CRYP_ALGOMODE_DECRYPT;
@@ -8770,29 +6430,61 @@
     status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT);
     if (status == HAL_OK) {
         /* GCM header phase */
-        hcryp.Init.GCMCMACPhase  = CRYP_HEADER_PHASE;
+        hcryp.Init.GCMCMACPhase = CRYP_HEADER_PHASE;
         status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, 0, NULL, STM32_HAL_TIMEOUT);
-        if (status == HAL_OK) {
-            /* GCM payload phase */
-            hcryp.Init.GCMCMACPhase  = CRYP_PAYLOAD_PHASE;
-            status = HAL_CRYPEx_AES_Auth(&hcryp, (byte*)inPadded, sz, inPadded,
-                STM32_HAL_TIMEOUT);
-            if (status == HAL_OK) {
-                /* GCM final phase */
-                hcryp.Init.GCMCMACPhase  = CRYP_FINAL_PHASE;
-                status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, tag,
-                    STM32_HAL_TIMEOUT);
-            }
-        }
+    }
+    if (status == HAL_OK) {
+        /* GCM payload phase - blocks */
+        hcryp.Init.GCMCMACPhase  = CRYP_PAYLOAD_PHASE;
+        if (blocks) {
+            status = HAL_CRYPEx_AES_Auth(&hcryp, (byte*)in,
+                (blocks * AES_BLOCK_SIZE), out, STM32_HAL_TIMEOUT);
+        }
+    }
+    if (status == HAL_OK && (partial != 0 || blocks == 0)) {
+        /* GCM payload phase - partial remainder */
+        XMEMSET(partialBlock, 0, sizeof(partialBlock));
+        XMEMCPY(partialBlock, in + (blocks * AES_BLOCK_SIZE), partial);
+        status = HAL_CRYPEx_AES_Auth(&hcryp, partialBlock, partial,
+            partialBlock, STM32_HAL_TIMEOUT);
+        XMEMCPY(out + (blocks * AES_BLOCK_SIZE), partialBlock, partial);
+    }
+    if (status == HAL_OK) {
+        /* GCM final phase */
+        hcryp.Init.GCMCMACPhase = CRYP_FINAL_PHASE;
+        status = HAL_CRYPEx_AES_Auth(&hcryp, NULL, sz, tag, STM32_HAL_TIMEOUT);
+    }
+#elif defined(STM32_HAL_V2)
+    hcryp.Init.Algorithm = CRYP_AES_GCM;
+    ByteReverseWords((word32*)partialBlock, (word32*)ctr, AES_BLOCK_SIZE);
+    hcryp.Init.pInitVect = (STM_CRYPT_TYPE*)partialBlock;
+    HAL_CRYP_Init(&hcryp);
+
+    /* GCM payload phase - can handle partial blocks */
+    status = HAL_CRYP_Decrypt(&hcryp, (uint32_t*)in,
+        (blocks * AES_BLOCK_SIZE) + partial, (uint32_t*)out, STM32_HAL_TIMEOUT);
+    if (status == HAL_OK) {
+        /* Compute the authTag */
+        status = HAL_CRYPEx_AESGCM_GenerateAuthTAG(&hcryp, (uint32_t*)tag,
+            STM32_HAL_TIMEOUT);
     }
 #else
     HAL_CRYP_Init(&hcryp);
-    /* Use inPadded for output buffer instead of
-    * out so that we don't overflow our size. */
-    status = HAL_CRYPEx_AESGCM_Decrypt(&hcryp, (byte*)inPadded,
-                                    sz, inPadded, STM32_HAL_TIMEOUT);
-    /* Compute the authTag */
+    if (blocks) {
+        /* GCM payload phase - blocks */
+        status = HAL_CRYPEx_AESGCM_Decrypt(&hcryp, (byte*)in,
+            (blocks * AES_BLOCK_SIZE), out, STM32_HAL_TIMEOUT);
+    }
+    if (status == HAL_OK && (partial != 0 || blocks == 0)) {
+        /* GCM payload phase - partial remainder */
+        XMEMSET(partialBlock, 0, sizeof(partialBlock));
+        XMEMCPY(partialBlock, in + (blocks * AES_BLOCK_SIZE), partial);
+        status = HAL_CRYPEx_AESGCM_Decrypt(&hcryp, partialBlock, partial,
+            partialBlock, STM32_HAL_TIMEOUT);
+        XMEMCPY(out + (blocks * AES_BLOCK_SIZE), partialBlock, partial);
+    }
     if (status == HAL_OK) {
+        /* Compute the authTag */
         status = HAL_CRYPEx_AESGCM_Finish(&hcryp, sz, tag, STM32_HAL_TIMEOUT);
     }
 #endif
@@ -8801,37 +6493,46 @@
         ret = AES_GCM_AUTH_E;
 
     HAL_CRYP_DeInit(&hcryp);
-#else
-    ByteReverseWords((word32*)keyCopy, (word32*)aes->key, keySize);
+
+#else /* STD_PERI_LIB */
+    ByteReverseWords(keyCopy, (word32*)aes->key, keySize);
 
     /* Input size and auth size need to be the actual sizes, even though
      * they are not block aligned, because this length (in bits) is used
-     * in the final GHASH. Use inPadded for output buffer instead of
-     * out so that we don't overflow our size.                         */
-    status = CRYP_AES_GCM(MODE_DECRYPT, (uint8_t*)initialCounter,
-                         (uint8_t*)keyCopy,     keySize * 8,
-                         (uint8_t*)inPadded,    sz,
-                         (uint8_t*)authInPadded,authInSz,
-                         (uint8_t*)inPadded,    tag);
+     * in the final GHASH. */
+    status = CRYP_AES_GCM(MODE_DECRYPT, (uint8_t*)ctr,
+                         (uint8_t*)keyCopy,      keySize * 8,
+                         (uint8_t*)in,           sz,
+                         (uint8_t*)authInPadded, authInSz,
+                         (uint8_t*)out,          tag);
     if (status != SUCCESS)
         ret = AES_GCM_AUTH_E;
 #endif /* WOLFSSL_STM32_CUBEMX */
 
-    if (ret == 0 && ConstantCompare(authTag, tag, authTagSz) == 0) {
-        /* Only keep the decrypted data if authTag success. */
-        XMEMCPY(out, inPadded, sz);
-        ret = 0; /* success */
-    }
-
-    /* only allocate padding buffers if the inputs are not a multiple of block sz */
-    if (inPadded != NULL && inPadSz != sz)
-        XFREE(inPadded , aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
-    if (authInPadded != NULL && authPadSz != authInSz)
+    /* STM32 GCM hardware requires data and a 12-byte IV; otherwise the
+       auth tag must be computed in software */
+    if (sz == 0 || ivSz != GCM_NONCE_MID_SZ) {
+        DecrementGcmCounter(ctr); /* hardware requires +1, so subtract it */
+        GHASH(aes, authIn, authInSz, in, sz, tag, sizeof(tag));
+        wc_AesEncrypt(aes, ctr, partialBlock);
+        xorbuf(tag, partialBlock, sizeof(tag));
+    }
+
+    if (ConstantCompare(authTag, tag, authTagSz) != 0) {
+        ret = AES_GCM_AUTH_E;
+    }
+
+    /* Free the padded AAD buffer if one was allocated */
+    if (authInPadded != authIn) {
         XFREE(authInPadded, aes->heap, DYNAMIC_TYPE_TMP_BUFFER);
+    }
+
+    wolfSSL_CryptHwMutexUnLock();
 
     return ret;
 }
-#else
+
+#endif /* STM32_CRYPTO_AES_GCM */
+
 #ifdef WOLFSSL_AESNI
 int AES_GCM_decrypt_C(Aes* aes, byte* out, const byte* in, word32 sz,
                       const byte* iv, word32 ivSz,
@@ -8856,15 +6557,24 @@
     byte scratch[AES_BLOCK_SIZE];
     byte Tprime[AES_BLOCK_SIZE];
     byte EKY0[AES_BLOCK_SIZE];
+#ifdef OPENSSL_EXTRA
+    word32 aadTemp;
+#endif
     ctr = counter;
-
     XMEMSET(initialCounter, 0, AES_BLOCK_SIZE);
     if (ivSz == GCM_NONCE_MID_SZ) {
         XMEMCPY(initialCounter, iv, ivSz);
         initialCounter[AES_BLOCK_SIZE - 1] = 1;
     }
     else {
+#ifdef OPENSSL_EXTRA
+        aadTemp = aes->aadLen;
+        aes->aadLen = 0;
+#endif
         GHASH(aes, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE);
+#ifdef OPENSSL_EXTRA
+        aes->aadLen = aadTemp;
+#endif
     }
     XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE);
 
@@ -8873,13 +6583,20 @@
     wc_AesEncrypt(aes, ctr, EKY0);
     xorbuf(Tprime, EKY0, sizeof(Tprime));
 
+#ifdef OPENSSL_EXTRA
+    if (!out) {
+        /* authenticated, non-confidential data */
+        /* store AAD size for next call */
+        aes->aadLen = authInSz;
+    }
+#endif
     if (ConstantCompare(authTag, Tprime, authTagSz) != 0) {
         return AES_GCM_AUTH_E;
     }
 
-#ifdef WOLFSSL_PIC32MZ_CRYPT
+#if defined(WOLFSSL_PIC32MZ_CRYPT)
     if (blocks) {
-        /* use intitial IV for PIC32 HW, but don't use it below */
+        /* use initial IV for HW, but don't use it below */
         XMEMCPY(aes->reg, ctr, AES_BLOCK_SIZE);
 
         ret = wc_Pic32AesCrypt(
@@ -8895,7 +6612,7 @@
 #if defined(HAVE_AES_ECB) && !defined(WOLFSSL_PIC32MZ_CRYPT)
     /* some hardware acceleration can gain performance from doing AES encryption
      * of the whole buffer at once */
-    if (c != p) { /* can not handle inline decryption */
+    if (c != p && blocks > 0) { /* can not handle inline decryption */
         while (blocks--) {
             IncrementGcmCounter(ctr);
             XMEMCPY(p, ctr, AES_BLOCK_SIZE);
@@ -8904,15 +6621,16 @@
 
         /* reset number of blocks and then do encryption */
         blocks = sz / AES_BLOCK_SIZE;
+
         wc_AesEcbEncrypt(aes, out, out, AES_BLOCK_SIZE * blocks);
         xorbuf(out, c, AES_BLOCK_SIZE * blocks);
         c += AES_BLOCK_SIZE * blocks;
     }
     else
-#endif /* HAVE_AES_ECB */
+#endif /* HAVE_AES_ECB && !WOLFSSL_PIC32MZ_CRYPT */
     while (blocks--) {
         IncrementGcmCounter(ctr);
-    #ifndef WOLFSSL_PIC32MZ_CRYPT
+    #if !defined(WOLFSSL_PIC32MZ_CRYPT)
         wc_AesEncrypt(aes, ctr, scratch);
         xorbuf(scratch, c, AES_BLOCK_SIZE);
         XMEMCPY(p, scratch, AES_BLOCK_SIZE);
@@ -8931,24 +6649,36 @@
     return ret;
 }
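Worth noting about the decrypt path that ends here:

    /* Tprime = GHASH(H, A, C) XOR E(K, J0) is compared against authTag in
     * constant time BEFORE any plaintext is produced, so a forged tag
     * never releases decrypted data. */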
 
+/* Software AES - GCM Decrypt */
 int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
                      const byte* iv, word32 ivSz,
                      const byte* authTag, word32 authTagSz,
                      const byte* authIn, word32 authInSz)
 {
 #ifdef WOLFSSL_AESNI
-    int res;
+    int res = AES_GCM_AUTH_E;
 #endif
 
     /* argument checks */
     /* If the sz is non-zero, both in and out must be set. If sz is 0,
      * in and out are don't cares, as this is the GMAC case. */
     if (aes == NULL || iv == NULL || (sz != 0 && (in == NULL || out == NULL)) ||
-        authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0) {
+        authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0 ||
+        ivSz == 0) {
 
         return BAD_FUNC_ARG;
     }
 
+#ifdef WOLF_CRYPTO_CB
+    if (aes->devId != INVALID_DEVID) {
+        int ret = wc_CryptoCb_AesGcmDecrypt(aes, out, in, sz, iv, ivSz,
+            authTag, authTagSz, authIn, authInSz);
+        if (ret != CRYPTOCB_UNAVAILABLE)
+            return ret;
+        /* fall-through when unavailable */
+    }
+#endif
+
 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
     /* if async and byte count above threshold */
     /* only 12-byte IV is supported in HW */
@@ -8958,13 +6688,13 @@
         #ifdef HAVE_CAVIUM_V
         if (authInSz == 20) { /* Nitrox V GCM only works with 20-byte AAD */
             return NitroxAesGcmDecrypt(aes, out, in, sz,
-                (const byte*)aes->asyncKey, aes->keylen, iv, ivSz,
+                (const byte*)aes->devKey, aes->keylen, iv, ivSz,
                 authTag, authTagSz, authIn, authInSz);
         }
         #endif
     #elif defined(HAVE_INTEL_QA)
         return IntelQaSymAesGcmDecrypt(&aes->asyncDev, out, in, sz,
-            (const byte*)aes->asyncKey, aes->keylen, iv, ivSz,
+            (const byte*)aes->devKey, aes->keylen, iv, ivSz,
             authTag, authTagSz, authIn, authInSz);
     #else /* WOLFSSL_ASYNC_CRYPT_TEST */
         if (wc_AsyncTestInit(&aes->asyncDev, ASYNC_TEST_AES_GCM_DECRYPT)) {
@@ -8985,7 +6715,17 @@
     }
 #endif /* WOLFSSL_ASYNC_CRYPT */
 
-    /* software AES GCM */
+#ifdef STM32_CRYPTO_AES_GCM
+    /* The STM32 standard peripheral library API does not support partial blocks */
+    #ifdef STD_PERI_LIB
+    if ((sz % AES_BLOCK_SIZE) == 0)
+    #endif
+    {
+        return wc_AesGcmDecrypt_STM32(
+            aes, out, in, sz, iv, ivSz,
+            authTag, authTagSz, authIn, authInSz);
+    }
+#endif /* STM32_CRYPTO_AES_GCM */
 
 #ifdef WOLFSSL_AESNI
     #ifdef HAVE_INTEL_AVX2
@@ -9024,7 +6764,7 @@
 }
 #endif
 #endif /* HAVE_AES_DECRYPT || HAVE_AESGCM_DECRYPT */
-#endif /* (WOLFSSL_XILINX_CRYPT) */
+#endif /* WOLFSSL_XILINX_CRYPT */
 #endif /* end of block for AESGCM implementation selection */
 
 
@@ -9124,7 +6864,8 @@
                                (byte*)aes->reg, ivOutSz,
                                authTag, authTagSz,
                                authIn, authInSz);
-        IncCtr((byte*)aes->reg, ivOutSz);
+        if (ret == 0)
+            IncCtr((byte*)aes->reg, ivOutSz);
     }
 
     return ret;
@@ -9135,21 +6876,24 @@
             byte* authTag, word32 authTagSz, WC_RNG* rng)
 {
     Aes aes;
-    int ret = 0;
+    int ret;
 
     if (key == NULL || iv == NULL || (authIn == NULL && authInSz != 0) ||
         authTag == NULL || authTagSz == 0 || rng == NULL) {
 
-        ret = BAD_FUNC_ARG;
-    }
-
-    if (ret == 0)
+        return BAD_FUNC_ARG;
+    }
+
+    ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
+    if (ret == 0) {
         ret = wc_AesGcmSetKey(&aes, key, keySz);
-    if (ret == 0)
-        ret = wc_AesGcmSetIV(&aes, ivSz, NULL, 0, rng);
-    if (ret == 0)
-        ret = wc_AesGcmEncrypt_ex(&aes, NULL, NULL, 0, iv, ivSz,
+        if (ret == 0)
+            ret = wc_AesGcmSetIV(&aes, ivSz, NULL, 0, rng);
+        if (ret == 0)
+            ret = wc_AesGcmEncrypt_ex(&aes, NULL, NULL, 0, iv, ivSz,
                                   authTag, authTagSz, authIn, authInSz);
+        wc_AesFree(&aes);
+    }
     ForceZero(&aes, sizeof(aes));
 
     return ret;
@@ -9160,22 +6904,36 @@
                   const byte* authIn, word32 authInSz,
                   const byte* authTag, word32 authTagSz)
 {
+    int ret;
+#ifndef NO_AES_DECRYPT
     Aes aes;
-    int ret = 0;
 
     if (key == NULL || iv == NULL || (authIn == NULL && authInSz != 0) ||
         authTag == NULL || authTagSz == 0 || authTagSz > AES_BLOCK_SIZE) {
 
-        ret = BAD_FUNC_ARG;
-    }
-
-    if (ret == 0)
+        return BAD_FUNC_ARG;
+    }
+
+    ret = wc_AesInit(&aes, NULL, INVALID_DEVID);
+    if (ret == 0) {
         ret = wc_AesGcmSetKey(&aes, key, keySz);
-    if (ret == 0)
-        ret = wc_AesGcmDecrypt(&aes, NULL, NULL, 0, iv, ivSz,
+        if (ret == 0)
+            ret = wc_AesGcmDecrypt(&aes, NULL, NULL, 0, iv, ivSz,
                                   authTag, authTagSz, authIn, authInSz);
+        wc_AesFree(&aes);
+    }
     ForceZero(&aes, sizeof(aes));
-
+#else
+    (void)key;
+    (void)keySz;
+    (void)iv;
+    (void)ivSz;
+    (void)authIn;
+    (void)authInSz;
+    (void)authTag;
+    (void)authTagSz;
+    ret = NOT_COMPILED_IN;
+#endif
     return ret;
 }
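A usage sketch for the two GMAC helpers above; key, aad and their sizes are assumed buffers:

    WC_RNG rng;
    byte iv[GCM_NONCE_MID_SZ];
    byte tag[16];
    int ret = wc_InitRng(&rng);
    if (ret == 0)   /* wc_Gmac() writes the generated IV into iv */
        ret = wc_Gmac(key, 16, iv, sizeof(iv), aad, aadSz,
                      tag, sizeof(tag), &rng);
    if (ret == 0)
        ret = wc_GmacVerify(key, 16, iv, sizeof(iv), aad, aadSz,
                            tag, sizeof(tag));
    wc_FreeRng(&rng);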
 
@@ -9286,10 +7044,9 @@
 }
 #endif /* HAVE_AES_DECRYPT */
 
-
-/* software AES CCM */
 #else
 
+/* Software CCM */
 static void roll_x(Aes* aes, const byte* in, word32 inSz, byte* out)
 {
     /* process the bulk of the data */
@@ -9362,14 +7119,60 @@
     }
 }
 
+#ifdef WOLFSSL_AESNI
+static WC_INLINE void AesCcmCtrIncSet4(byte* B, word32 lenSz)
+{
+    word32 i;
+
+    /* B+1 = B */
+    XMEMCPY(B + AES_BLOCK_SIZE * 1, B, AES_BLOCK_SIZE);
+    /* B+2,B+3 = B,B+1 */
+    XMEMCPY(B + AES_BLOCK_SIZE * 2, B, AES_BLOCK_SIZE * 2);
+
+    /* block 1 = block 0 + 1 */
+    for (i = 0; i < lenSz; i++) {
+        if (++B[AES_BLOCK_SIZE * 2 - 1 - i] != 0) break;
+    }
+    /* block 2 = block 0 + 2 */
+    B[AES_BLOCK_SIZE * 3 - 1] += 2;
+    if (B[AES_BLOCK_SIZE * 3 - 1] < 2) {
+        for (i = 1; i < lenSz; i++) {
+            if (++B[AES_BLOCK_SIZE * 3 - 1 - i] != 0) break;
+        }
+    }
+    /* block 3 = block 0 + 3 */
+    B[AES_BLOCK_SIZE * 4 - 1] += 3;
+    if (B[AES_BLOCK_SIZE * 4 - 1] < 3) {
+        for (i = 1; i < lenSz; i++) {
+            if (++B[AES_BLOCK_SIZE * 4 - 1 - i] != 0) break;
+        }
+    }
+}
+
+static WC_INLINE void AesCcmCtrInc4(byte* B, word32 lenSz)
+{
+    word32 i;
+
+    B[AES_BLOCK_SIZE - 1] += 4;
+    if (B[AES_BLOCK_SIZE - 1] < 4) {
+        for (i = 1; i < lenSz; i++) {
+            if (++B[AES_BLOCK_SIZE - 1 - i] != 0) break;
+        }
+    }
+}
+#endif
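In short, the batching these helpers enable:

    /* After AesCcmCtrIncSet4() the four consecutive blocks in B hold the
     * counters n, n+1, n+2, n+3 (n = block 0's current value), so one
     * AES_ECB_encrypt() call yields four CTR keystream blocks;
     * AesCcmCtrInc4() then advances block 0 by 4 for the next batch. */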
+
+/* Software AES - CCM Encrypt */
 /* return 0 on success */
 int wc_AesCcmEncrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
                    const byte* nonce, word32 nonceSz,
                    byte* authTag, word32 authTagSz,
                    const byte* authIn, word32 authInSz)
 {
+#ifndef WOLFSSL_AESNI
     byte A[AES_BLOCK_SIZE];
     byte B[AES_BLOCK_SIZE];
+#else
+    ALIGN128 byte A[AES_BLOCK_SIZE * 4];
+    ALIGN128 byte B[AES_BLOCK_SIZE * 4];
+#endif
     byte lenSz;
     word32 i;
     byte mask = 0xFF;
@@ -9381,6 +7184,7 @@
             authTagSz > AES_BLOCK_SIZE)
         return BAD_FUNC_ARG;
 
+    XMEMSET(A, 0, sizeof(A));
     XMEMCPY(B+1, nonce, nonceSz);
     lenSz = AES_BLOCK_SIZE - 1 - (byte)nonceSz;
     B[0] = (authInSz > 0 ? 64 : 0)
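A worked example of the CCM flags byte whose expression begins on the last line above (RFC 3610): with a 13-byte nonce, a 16-byte tag and non-empty AAD,

    /*   lenSz = 16 - 1 - 13 = 2
     *   B[0]  = 64 + 8 * ((16 - 2) / 2) + (2 - 1)
     *         = 64 + 56 + 1 = 0x79                */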
@@ -9407,6 +7211,26 @@
     xorbuf(authTag, A, authTagSz);
 
     B[15] = 1;
+#ifdef WOLFSSL_AESNI
+    if (haveAESNI && aes->use_aesni) {
+        while (inSz >= AES_BLOCK_SIZE * 4) {
+            AesCcmCtrIncSet4(B, lenSz);
+
+            AES_ECB_encrypt(B, A, AES_BLOCK_SIZE * 4, (byte*)aes->key,
+                            aes->rounds);
+            xorbuf(A, in, AES_BLOCK_SIZE * 4);
+            XMEMCPY(out, A, AES_BLOCK_SIZE * 4);
+
+            inSz -= AES_BLOCK_SIZE * 4;
+            in += AES_BLOCK_SIZE * 4;
+            out += AES_BLOCK_SIZE * 4;
+
+            if (inSz < AES_BLOCK_SIZE * 4) {
+                AesCcmCtrInc4(B, lenSz);
+            }
+        }
+    }
+#endif
     while (inSz >= AES_BLOCK_SIZE) {
         wc_AesEncrypt(aes, B, A);
         xorbuf(A, in, AES_BLOCK_SIZE);
@@ -9430,13 +7254,19 @@
 }
 
 #ifdef HAVE_AES_DECRYPT
+/* Software AES - CCM Decrypt */
 int  wc_AesCcmDecrypt(Aes* aes, byte* out, const byte* in, word32 inSz,
                    const byte* nonce, word32 nonceSz,
                    const byte* authTag, word32 authTagSz,
                    const byte* authIn, word32 authInSz)
 {
+#ifndef WOLFSSL_AESNI
     byte A[AES_BLOCK_SIZE];
     byte B[AES_BLOCK_SIZE];
+#else
+    ALIGN128 byte B[AES_BLOCK_SIZE * 4];
+    ALIGN128 byte A[AES_BLOCK_SIZE * 4];
+#endif
     byte* o;
     byte lenSz;
     word32 i, oSz;
@@ -9460,6 +7290,26 @@
         B[AES_BLOCK_SIZE - 1 - i] = 0;
     B[15] = 1;
 
+#ifdef WOLFSSL_AESNI
+    if (haveAESNI && aes->use_aesni) {
+        while (oSz >= AES_BLOCK_SIZE * 4) {
+            AesCcmCtrIncSet4(B, lenSz);
+
+            AES_ECB_encrypt(B, A, AES_BLOCK_SIZE * 4, (byte*)aes->key,
+                            aes->rounds);
+            xorbuf(A, in, AES_BLOCK_SIZE * 4);
+            XMEMCPY(o, A, AES_BLOCK_SIZE * 4);
+
+            oSz -= AES_BLOCK_SIZE * 4;
+            in += AES_BLOCK_SIZE * 4;
+            o += AES_BLOCK_SIZE * 4;
+
+            if (oSz < AES_BLOCK_SIZE * 4) {
+                AesCcmCtrInc4(B, lenSz);
+            }
+        }
+    }
+#endif
     while (oSz >= AES_BLOCK_SIZE) {
         wc_AesEncrypt(aes, B, A);
         xorbuf(A, in, AES_BLOCK_SIZE);
@@ -9521,7 +7371,7 @@
 }
 
 #endif /* HAVE_AES_DECRYPT */
-#endif /* software AES CCM */
+#endif /* software CCM */
 
 /* abstract functions that call lower level AESCCM functions */
 #ifndef WC_NO_RNG
@@ -9579,8 +7429,10 @@
                                (byte*)aes->reg, aes->nonceSz,
                                authTag, authTagSz,
                                authIn, authInSz);
-        XMEMCPY(ivOut, aes->reg, aes->nonceSz);
-        IncCtr((byte*)aes->reg, aes->nonceSz);
+        if (ret == 0) {
+            XMEMCPY(ivOut, aes->reg, aes->nonceSz);
+            IncCtr((byte*)aes->reg, aes->nonceSz);
+        }
     }
 
     return ret;
@@ -9601,15 +7453,57 @@
 
     aes->heap = heap;
 
+#ifdef WOLF_CRYPTO_CB
+    aes->devId = devId;
+    aes->devCtx = NULL;
+#else
+    (void)devId;
+#endif
 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
     ret = wolfAsync_DevCtxInit(&aes->asyncDev, WOLFSSL_ASYNC_MARKER_AES,
                                                         aes->heap, devId);
-#else
-    (void)devId;
 #endif /* WOLFSSL_ASYNC_CRYPT */
 
+#ifdef WOLFSSL_AFALG
+    aes->alFd = -1;
+    aes->rdFd = -1;
+#endif
+#if defined(WOLFSSL_DEVCRYPTO) && \
+   (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))
+    aes->ctx.cfd = -1;
+#endif
+#if defined(WOLFSSL_CRYPTOCELL) && defined(WOLFSSL_CRYPTOCELL_AES)
+    XMEMSET(&aes->ctx, 0, sizeof(aes->ctx));
+#endif
+#ifdef HAVE_AESGCM
+#ifdef OPENSSL_EXTRA
+    XMEMSET(aes->aadH, 0, sizeof(aes->aadH));
+    aes->aadLen = 0;
+#endif
+#endif
+    return ret;
+}
+
+#ifdef HAVE_PKCS11
+int  wc_AesInit_Id(Aes* aes, unsigned char* id, int len, void* heap, int devId)
+{
+    int ret = 0;
+
+    if (aes == NULL)
+        ret = BAD_FUNC_ARG;
+    if (ret == 0 && (len < 0 || len > AES_MAX_ID_LEN))
+        ret = BUFFER_E;
+
+    if (ret == 0)
+        ret  = wc_AesInit(aes, heap, devId);
+    if (ret == 0) {
+        XMEMCPY(aes->id, id, len);
+        aes->idLen = len;
+    }
+
     return ret;
 }
+#endif
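A usage sketch for PKCS#11 builds; id, idLen and myPkcs11DevId are assumptions:

    /* Bind the Aes object to a token-resident key by its id; subsequent
     * wc_Aes* calls are then routed through the registered device. */
    ret = wc_AesInit_Id(&aes, id, idLen, NULL, myPkcs11DevId);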
 
 /* Free Aes from use with async hardware */
 void wc_AesFree(Aes* aes)
@@ -9620,6 +7514,23 @@
 #if defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES)
     wolfAsync_DevCtxFree(&aes->asyncDev, WOLFSSL_ASYNC_MARKER_AES);
 #endif /* WOLFSSL_ASYNC_CRYPT */
+#if defined(WOLFSSL_AFALG) || defined(WOLFSSL_AFALG_XILINX_AES)
+    if (aes->rdFd > 0) { /* negative is error case */
+        close(aes->rdFd);
+    }
+    if (aes->alFd > 0) {
+        close(aes->alFd);
+    }
+#endif /* WOLFSSL_AFALG */
+#if defined(WOLFSSL_DEVCRYPTO) && \
+    (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))
+    wc_DevCryptoFree(&aes->ctx);
+#endif
+#if defined(WOLF_CRYPTO_CB) || (defined(WOLFSSL_DEVCRYPTO) && \
+    (defined(WOLFSSL_DEVCRYPTO_AES) || defined(WOLFSSL_DEVCRYPTO_CBC))) || \
+    (defined(WOLFSSL_ASYNC_CRYPT) && defined(WC_ASYNC_ENABLE_AES))
+    ForceZero((byte*)aes->devKey, AES_MAX_KEY_SIZE/WOLFSSL_BIT_SIZE);
+#endif
 }
 
 
@@ -9630,23 +7541,26 @@
     if (aes == NULL || keySize == NULL) {
         return BAD_FUNC_ARG;
     }
-
+#if defined(WOLFSSL_CRYPTOCELL) && defined(WOLFSSL_CRYPTOCELL_AES)
+    *keySize = aes->ctx.key.keySize;
+    return ret;
+#endif
     switch (aes->rounds) {
-    #ifdef WOLFSSL_AES_128
+#ifdef WOLFSSL_AES_128
     case 10:
         *keySize = 16;
         break;
-    #endif
-    #ifdef WOLFSSL_AES_192
+#endif
+#ifdef WOLFSSL_AES_192
     case 12:
         *keySize = 24;
         break;
-    #endif
-    #ifdef WOLFSSL_AES_256
+#endif
+#ifdef WOLFSSL_AES_256
     case 14:
         *keySize = 32;
         break;
-    #endif
+#endif
     default:
         *keySize = 0;
         ret = BAD_FUNC_ARG;
@@ -9660,9 +7574,36 @@
 #ifdef HAVE_AES_ECB
 #if defined(WOLFSSL_IMX6_CAAM) && !defined(NO_IMX6_CAAM_AES)
     /* implemented in wolfcrypt/src/port/caam/caam_aes.c */
+
+#elif defined(WOLFSSL_AFALG)
+    /* implemented in wolfcrypt/src/port/af_alg/afalg_aes.c */
+
+#elif defined(WOLFSSL_DEVCRYPTO_AES)
+    /* implemented in wolfcrypt/src/port/devcrypt/devcrypto_aes.c */
+
+#elif defined(WOLFSSL_SCE) && !defined(WOLFSSL_SCE_NO_AES)
+
+/* Hardware AES - SCE ECB */
+int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    if ((in == NULL) || (out == NULL) || (aes == NULL))
+        return BAD_FUNC_ARG;
+
+    return AES_ECB_encrypt(aes, in, out, sz);
+}
+
+
+int wc_AesEcbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    if ((in == NULL) || (out == NULL) || (aes == NULL))
+        return BAD_FUNC_ARG;
+
+    return AES_ECB_decrypt(aes, in, out, sz);
+}
+
 #else
 
-/* software implementation */
+/* Software AES - ECB */
 int wc_AesEcbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
 {
     word32 blocks = sz / AES_BLOCK_SIZE;
@@ -9698,44 +7639,66 @@
 #endif
 #endif /* HAVE_AES_ECB */
 
-#ifdef WOLFSSL_AES_CFB
-/* CFB 128
+#if defined(WOLFSSL_AES_CFB) || defined(WOLFSSL_AES_OFB)
+/* Feedback AES mode
  *
  * aes structure holding key to use for encryption
  * out buffer to hold result of encryption (must be at least as large as input
  *     buffer)
  * in  buffer to encrypt
  * sz  size of input buffer
+ * mode flag to specify AES mode
  *
  * returns 0 on success and negative error values on failure
  */
-int wc_AesCfbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+/* Software AES - CFB/OFB Encrypt */
+static int wc_AesFeedbackEncrypt(Aes* aes, byte* out, const byte* in,
+        word32 sz, byte mode)
 {
     byte*  tmp = NULL;
+#ifdef WOLFSSL_AES_CFB
     byte*  reg = NULL;
-
-    WOLFSSL_ENTER("wc_AesCfbEncrypt");
+#endif
 
     if (aes == NULL || out == NULL || in == NULL) {
         return BAD_FUNC_ARG;
     }
 
+#ifdef WOLFSSL_AES_CFB
     if (aes->left && sz) {
         reg = (byte*)aes->reg + AES_BLOCK_SIZE - aes->left;
     }
+#endif
 
     /* consume any unused bytes left in aes->tmp */
     tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
     while (aes->left && sz) {
-        *(out++) = *(reg++) = *(in++) ^ *(tmp++);
+        *(out) = *(in++) ^ *(tmp++);
+    #ifdef WOLFSSL_AES_CFB
+        if (mode == AES_CFB_MODE) {
+            *(reg++) = *out;
+        }
+    #endif
+        out++;
         aes->left--;
         sz--;
     }
 
     while (sz >= AES_BLOCK_SIZE) {
-        wc_AesEncryptDirect(aes, out, (byte*)aes->reg);
-        xorbuf(out, in, AES_BLOCK_SIZE);
-        XMEMCPY(aes->reg, out, AES_BLOCK_SIZE);
+        /* Using aes->tmp here for inline case i.e. in=out */
+        wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
+    #ifdef WOLFSSL_AES_OFB
+        if (mode == AES_OFB_MODE) {
+            XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+        }
+    #endif
+        xorbuf((byte*)aes->tmp, in, AES_BLOCK_SIZE);
+    #ifdef WOLFSSL_AES_CFB
+        if (mode == AES_CFB_MODE) {
+            XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+        }
+    #endif
+        XMEMCPY(out, aes->tmp, AES_BLOCK_SIZE);
         out += AES_BLOCK_SIZE;
         in  += AES_BLOCK_SIZE;
         sz  -= AES_BLOCK_SIZE;
@@ -9747,10 +7710,23 @@
         wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
         aes->left = AES_BLOCK_SIZE;
         tmp = (byte*)aes->tmp;
+    #ifdef WOLFSSL_AES_OFB
+        if (mode == AES_OFB_MODE) {
+            XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+        }
+    #endif
+    #ifdef WOLFSSL_AES_CFB
         reg = (byte*)aes->reg;
+    #endif
 
         while (sz--) {
-            *(out++) = *(reg++) = *(in++) ^ *(tmp++);
+            *(out) = *(in++) ^ *(tmp++);
+        #ifdef WOLFSSL_AES_CFB
+            if (mode == AES_CFB_MODE) {
+                *(reg++) = *out;
+            }
+        #endif
+            out++;
             aes->left--;
         }
     }
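The two feedback modes merged above differ only in what refills the shift register R (E = one AES block encryption):

    /*   CFB: O = E(R); C = P XOR O; R = C   (ciphertext fed back)
     *   OFB: O = E(R); C = P XOR O; R = O   (keystream fed back)
     * which is exactly what the AES_CFB_MODE/AES_OFB_MODE branches select. */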
@@ -9770,21 +7746,23 @@
  *
  * returns 0 on success and negative error values on failure
  */
-int wc_AesCfbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+/* Software AES - CFB/OFB Decrypt */
+static int wc_AesFeedbackDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
+        byte mode)
 {
     byte*  tmp;
 
-    WOLFSSL_ENTER("wc_AesCfbDecrypt");
-
     if (aes == NULL || out == NULL || in == NULL) {
         return BAD_FUNC_ARG;
     }
 
+    #ifdef WOLFSSL_AES_CFB
     /* check if more input needs copied over to aes->reg */
-    if (aes->left && sz) {
+    if (aes->left && sz && mode == AES_CFB_MODE) {
         int size = min(aes->left, sz);
         XMEMCPY((byte*)aes->reg + AES_BLOCK_SIZE - aes->left, in, size);
     }
+    #endif
 
     /* consume any unused bytes left in aes->tmp */
     tmp = (byte*)aes->tmp + AES_BLOCK_SIZE - aes->left;
@@ -9795,9 +7773,20 @@
     }
 
     while (sz > AES_BLOCK_SIZE) {
-        wc_AesEncryptDirect(aes, out, (byte*)aes->reg);
-        xorbuf(out, in, AES_BLOCK_SIZE);
-        XMEMCPY(aes->reg, in, AES_BLOCK_SIZE);
+        /* Using aes->tmp here for inline case i.e. in=out */
+        wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
+    #ifdef WOLFSSL_AES_OFB
+        if (mode == AES_OFB_MODE) {
+            XMEMCPY((byte*)aes->reg, (byte*)aes->tmp, AES_BLOCK_SIZE);
+        }
+    #endif
+        xorbuf((byte*)aes->tmp, in, AES_BLOCK_SIZE);
+    #ifdef WOLFSSL_AES_CFB
+        if (mode == AES_CFB_MODE) {
+            XMEMCPY(aes->reg, in, AES_BLOCK_SIZE);
+        }
+    #endif
+        XMEMCPY(out, (byte*)aes->tmp, AES_BLOCK_SIZE);
         out += AES_BLOCK_SIZE;
         in  += AES_BLOCK_SIZE;
         sz  -= AES_BLOCK_SIZE;
@@ -9807,7 +7796,17 @@
     /* decrypt left over data */
     if (sz) {
         wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
-        XMEMCPY(aes->reg, in, sz);
+    #ifdef WOLFSSL_AES_CFB
+        if (mode == AES_CFB_MODE) {
+            XMEMCPY(aes->reg, in, sz);
+        }
+    #endif
+    #ifdef WOLFSSL_AES_OFB
+        if (mode == AES_OFB_MODE) {
+            XMEMCPY(aes->reg, aes->tmp, AES_BLOCK_SIZE);
+        }
+    #endif
+
         aes->left = AES_BLOCK_SIZE;
         tmp = (byte*)aes->tmp;
 
@@ -9822,6 +7821,282 @@
 #endif /* HAVE_AES_DECRYPT */
 #endif /* WOLFSSL_AES_CFB || WOLFSSL_AES_OFB */
 
+#ifdef WOLFSSL_AES_CFB
+/* CFB 128
+ *
+ * aes structure holding key to use for encryption
+ * out buffer to hold result of encryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to encrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative error values on failure
+ */
+/* Software AES - CFB Encrypt */
+int wc_AesCfbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackEncrypt(aes, out, in, sz, AES_CFB_MODE);
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/* CFB 128
+ *
+ * aes structure holding key to use for decryption
+ * out buffer to hold result of decryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to decrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative error values on failure
+ */
+/* Software AES - CFB Decrypt */
+int wc_AesCfbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackDecrypt(aes, out, in, sz, AES_CFB_MODE);
+}
+#endif /* HAVE_AES_DECRYPT */
+
+
+/* shift the whole AES_BLOCK_SIZE array left by 8 or 1 bits */
+static void shiftLeftArray(byte* ary, byte shift)
+{
+    int i;
+
+    if (shift == WOLFSSL_BIT_SIZE) {
+        /* shifting over by 8 bits */
+        for (i = 0; i < AES_BLOCK_SIZE - 1; i++) {
+            ary[i] = ary[i+1];
+        }
+        ary[i] = 0;
+    }
+    else {
+        byte carry = 0;
+
+        /* shifting over by 7 or less bits */
+        for (i = 0; i < AES_BLOCK_SIZE - 1; i++) {
+            carry = ary[i+1] & (0xFF << (WOLFSSL_BIT_SIZE - shift));
+            carry >>= (WOLFSSL_BIT_SIZE - shift);
+            ary[i] = (ary[i] << shift) + carry;
+        }
+        ary[i] = ary[i] << shift;
+    }
+}
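A worked example of the byte-granular shift:

    /* With shift == 1 and ary = { 0x80, 0x81, 0x00, ... },
     * shiftLeftArray() yields { 0x01, 0x02, 0x00, ... }: each byte keeps
     * its low 7 bits shifted up and takes the next byte's MSB as its LSB. */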
+
+
+/* returns 0 on success and negative values on failure */
+static int wc_AesFeedbackCFB8(Aes* aes, byte* out, const byte* in,
+        word32 sz, byte dir)
+{
+    byte *pt;
+
+    if (aes == NULL || out == NULL || in == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (sz == 0) {
+        return 0;
+    }
+
+    while (sz > 0) {
+        wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
+        if (dir == AES_DECRYPTION) {
+            pt = (byte*)aes->reg;
+
+            /* LSB + CAT */
+            shiftLeftArray(pt, WOLFSSL_BIT_SIZE);
+            pt[AES_BLOCK_SIZE - 1] = in[0];
+        }
+
+        /* MSB + XOR */
+        out[0] = aes->tmp[0] ^ in[0];
+        if (dir == AES_ENCRYPTION) {
+            pt = (byte*)aes->reg;
+
+            /* LSB + CAT: shift the register left one byte and feed the new
+             * ciphertext byte back into the low end */
+            shiftLeftArray(pt, WOLFSSL_BIT_SIZE);
+            pt[AES_BLOCK_SIZE - 1] = out[0];
+        }
+
+        out += 1;
+        in  += 1;
+        sz  -= 1;
+    }
+
+    return 0;
+}
+
+
+/* returns 0 on success and negative values on failure */
+static int wc_AesFeedbackCFB1(Aes* aes, byte* out, const byte* in,
+        word32 sz, byte dir)
+{
+    byte tmp;
+    byte cur = 0; /* accumulates the output byte so in-place (in == out) use works */
+    byte* pt;
+    int bit = 7;
+
+    if (aes == NULL || out == NULL || in == NULL) {
+        return BAD_FUNC_ARG;
+    }
+
+    if (sz == 0) {
+        return 0;
+    }
+
+    while (sz > 0) {
+        wc_AesEncryptDirect(aes, (byte*)aes->tmp, (byte*)aes->reg);
+        if (dir == AES_DECRYPTION) {
+            pt = (byte*)aes->reg;
+
+            /* LSB + CAT: shift the register left one bit and concatenate
+             * the incoming ciphertext bit at the low end */
+            tmp = (0x01 << bit) & in[0];
+            tmp = tmp >> bit;
+            tmp &= 0x01;
+            shiftLeftArray((byte*)aes->reg, 1);
+            pt[AES_BLOCK_SIZE - 1] |= tmp;
+        }
+
+        /* MSB + XOR: XOR the current input bit with the top bit of the
+         * keystream block */
+        tmp = (0x01 << bit) & in[0];
+        pt = (byte*)aes->tmp;
+        tmp = (pt[0] >> 7) ^ (tmp >> bit);
+        tmp &= 0x01;
+        cur |= (tmp << bit);
+
+        if (dir == AES_ENCRYPTION) {
+            pt = (byte*)aes->reg;
+
+            /* LSB + CAT: feed the ciphertext bit just produced back into
+             * the low end of the register */
+            shiftLeftArray((byte*)aes->reg, 1);
+            pt[AES_BLOCK_SIZE - 1] |= tmp;
+        }
+
+        bit--;
+        if (bit < 0) {
+            out[0] = cur;
+            out += 1;
+            in  += 1;
+            sz  -= 1;
+            bit = 7;
+            cur = 0;
+        }
+        else {
+            sz -= 1;
+        }
+    }
+
+    /* write out any partial byte left over when sz was not a multiple of 8;
+     * bit == 7 here means the final byte was already written out in full */
+    if (bit >= 0 && bit < 7) {
+        out[0] = cur;
+    }
+
+    return 0;
+}
+
+
+/* CFB 1
+ *
+ * aes structure holding key to use for encryption
+ * out buffer to hold result of encryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to encrypt (bits packed to the left, i.e. the 3-bit value 101
+ *     is stored as 0xA0)
+ * sz  size of input buffer in bits, not bytes (pass 8 to process one full
+ *     byte of input)
+ *
+ * returns 0 on success and negative values on failure
+ */
+int wc_AesCfb1Encrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackCFB1(aes, out, in, sz, AES_ENCRYPTION);
+}
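+/* Usage sketch (illustrative only, not part of this file): encrypting five
+ * bits with CFB-1. Input bits are packed from the most significant bit down
+ * and sz counts bits, not bytes; the all-zero key/IV are dummy placeholders.
+ *
+ *   Aes enc;
+ *   byte key[16] = { 0 };
+ *   byte iv[16]  = { 0 };
+ *   byte in[1]   = { 0xA8 };   // the 5 bits 10101, packed to the left
+ *   byte out[1];
+ *
+ *   wc_AesSetKey(&enc, key, sizeof(key), iv, AES_ENCRYPTION);
+ *   wc_AesCfb1Encrypt(&enc, out, in, 5);   // sz is in bits
+ */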
+
+
+/* CFB 8
+ *
+ * aes structure holding key to use for encryption
+ * out buffer to hold result of encryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to encrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative values on failure
+ */
+int wc_AesCfb8Encrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackCFB8(aes, out, in, sz, AES_ENCRYPTION);
+}
+#ifdef HAVE_AES_DECRYPT
+
+/* CFB 1
+ *
+ * aes structure holding key to use for decryption
+ * out buffer to hold result of decryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to decrypt (bits packed to the left)
+ * sz  size of input buffer in bits, not bytes (pass 8 to process one full
+ *     byte of input)
+ *
+ * returns 0 on success and negative values on failure
+ */
+int wc_AesCfb1Decrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackCFB1(aes, out, in, sz, AES_DECRYPTION);
+}
+
+
+/* CFB 8
+ *
+ * aes structure holding key to use for decryption
+ * out buffer to hold result of decryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to decrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative values on failure
+ */
+int wc_AesCfb8Decrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackCFB8(aes, out, in, sz, AES_DECRYPTION);
+}
+#endif /* HAVE_AES_DECRYPT */
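+/* Usage sketch (illustrative only): a CFB-8 round trip, one byte of input
+ * per block-cipher call and no padding; both directions key the schedule
+ * with AES_ENCRYPTION. The all-zero key/IV are dummy placeholders.
+ *
+ *   Aes enc, dec;
+ *   byte key[16] = { 0 }, iv[16] = { 0 };
+ *   byte msg[]   = "byte-at-a-time mode";
+ *   byte ct[sizeof(msg)], pt[sizeof(msg)];
+ *
+ *   wc_AesSetKey(&enc, key, sizeof(key), iv, AES_ENCRYPTION);
+ *   wc_AesSetKey(&dec, key, sizeof(key), iv, AES_ENCRYPTION);
+ *   wc_AesCfb8Encrypt(&enc, ct, msg, sizeof(msg));
+ *   wc_AesCfb8Decrypt(&dec, pt, ct, sizeof(ct));   // pt now equals msg
+ */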
+#endif /* WOLFSSL_AES_CFB */
+
+#ifdef WOLFSSL_AES_OFB
+/* OFB
+ *
+ * aes structure holding key to use for encryption
+ * out buffer to hold result of encryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to encrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative error values on failure
+ */
+/* Software AES - OFB Encrypt */
+int wc_AesOfbEncrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackEncrypt(aes, out, in, sz, AES_OFB_MODE);
+}
+
+
+#ifdef HAVE_AES_DECRYPT
+/* OFB
+ *
+ * aes structure holding key to use for decryption
+ * out buffer to hold result of decryption (must be at least as large as input
+ *     buffer)
+ * in  buffer to decrypt
+ * sz  size of input buffer
+ *
+ * returns 0 on success and negative error values on failure
+ */
+/* Software AES - OFB Decrypt */
+int wc_AesOfbDecrypt(Aes* aes, byte* out, const byte* in, word32 sz)
+{
+    return wc_AesFeedbackDecrypt(aes, out, in, sz, AES_OFB_MODE);
+}
+#endif /* HAVE_AES_DECRYPT */
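+/* Usage sketch (illustrative only): because OFB encryption and decryption
+ * are the same XOR-with-keystream operation, wc_AesOfbDecrypt simply reruns
+ * the keystream. The all-zero key/IV are dummy placeholders.
+ *
+ *   Aes ofb;
+ *   byte key[16] = { 0 }, iv[16] = { 0 };
+ *   byte msg[]   = "keystream is independent of the data";
+ *   byte ct[sizeof(msg)];
+ *
+ *   wc_AesSetKey(&ofb, key, sizeof(key), iv, AES_ENCRYPTION);
+ *   wc_AesOfbEncrypt(&ofb, ct, msg, sizeof(msg));
+ *   // reset the IV (re-key or use a second Aes) before wc_AesOfbDecrypt
+ */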
+#endif /* WOLFSSL_AES_OFB */
+
 
 #ifdef HAVE_AES_KEYWRAP
 
@@ -10148,7 +8423,7 @@
         word32 j;
         byte carry = 0;
 
-        /* multiply by shift left and propogate carry */
+        /* multiply by shift left and propagate carry */
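+        /* i.e. double the XTS tweak in GF(2^128): each byte shifts left one
+         * bit, the high bit carries into the next byte, and a final carry
+         * out of the block is reduced with the XTS polynomial 0x87 */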
         for (j = 0; j < AES_BLOCK_SIZE && outSz > 0; j++, outSz--) {
             byte tmpC;
 
@@ -10186,6 +8461,7 @@
  *
  * returns 0 on success
  */
+/* Software AES - XTS Encrypt */
 int wc_AesXtsEncrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
         const byte* i, word32 iSz)
 {
@@ -10238,7 +8514,7 @@
     #endif
             xorbuf(out, tmp, AES_BLOCK_SIZE);
 
-            /* multiply by shift left and propogate carry */
+            /* multiply by shift left and propagate carry */
             for (j = 0; j < AES_BLOCK_SIZE; j++) {
                 byte tmpC;
 
@@ -10293,6 +8569,7 @@
  *
  * returns 0 on success
  */
+/* Software AES - XTS Decrypt */
 int wc_AesXtsDecrypt(XtsAes* xaes, byte* out, const byte* in, word32 sz,
         const byte* i, word32 iSz)
 {
@@ -10352,7 +8629,7 @@
     #endif
             xorbuf(out, tmp, AES_BLOCK_SIZE);
 
-            /* multiply by shift left and propogate carry */
+            /* multiply by shift left and propagate carry */
             for (j = 0; j < AES_BLOCK_SIZE; j++) {
                 byte tmpC;
 
@@ -10376,7 +8653,7 @@
             byte buf[AES_BLOCK_SIZE];
             byte tmp2[AES_BLOCK_SIZE];
 
-            /* multiply by shift left and propogate carry */
+            /* multiply by shift left and propagate carry */
             for (j = 0; j < AES_BLOCK_SIZE; j++) {
                 byte tmpC;