krb5 commit: Add Intel AESNI assembly files

Greg Hudson ghudson at MIT.EDU
Fri May 24 14:26:12 EDT 2013


https://github.com/krb5/krb5/commit/7809ae6c7d9d737e1a7becc0851148c73c095c4b
commit 7809ae6c7d9d737e1a7becc0851148c73c095c4b
Author: Greg Hudson <ghudson at mit.edu>
Date:   Wed Apr 24 13:04:19 2013 -0400

    Add Intel AESNI assembly files
    
    Add assembly files from the Intel AESNI Sample Library, version 1.2,
    which implement AES encryption using AES-NI instructions.  Trailing
    whitespace was removed.

 NOTICE                               |   38 +
 doc/notice.rst                       |   38 +
 src/lib/crypto/builtin/aes/iaesx64.s | 2081 ++++++++++++++++++++++++++++++++
 src/lib/crypto/builtin/aes/iaesx86.s | 2199 ++++++++++++++++++++++++++++++++++
 4 files changed, 4356 insertions(+), 0 deletions(-)
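
As a rough orientation for readers who have not used the AES-NI instructions
before: the per-block flow that a routine such as iEnc128 implements below
(XOR with round key 0, nine AESENC rounds, one AESENCLAST) corresponds to the
following C sketch using the compiler intrinsics.  This is illustrative only
and is not part of the commit; the function name and round-key array
parameter are assumptions, and krb5 calls the assembly entry points rather
than intrinsics.

    #include <stdint.h>
    #include <immintrin.h>   /* AES-NI intrinsics; build with -maes */

    /* Encrypt one 16-byte block with an already-expanded AES-128 key
     * schedule (11 round keys), mirroring the single-block loop in
     * iEnc128: whitening XOR, nine AESENC rounds, final AESENCLAST. */
    static void aes128_encrypt_block(const uint8_t in[16], uint8_t out[16],
                                     const __m128i rk[11])
    {
        __m128i b = _mm_loadu_si128((const __m128i *)in);
        b = _mm_xor_si128(b, rk[0]);            /* round 0 (key whitening) */
        for (int i = 1; i < 10; i++)
            b = _mm_aesenc_si128(b, rk[i]);     /* rounds 1-9 */
        b = _mm_aesenclast_si128(b, rk[10]);    /* round 10, no MixColumns */
        _mm_storeu_si128((__m128i *)out, b);
    }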

diff --git a/NOTICE b/NOTICE
index ac08bba..08712a3 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1173,3 +1173,41 @@ The bundled libev source code is subject to the following license:
    the GPL in this and the other files of this package. If you do not
    delete the provisions above, a recipient may use your version of
    this file under either the BSD or the GPL.
+
+======================================================================
+
+Files copied from the Intel AESNI Sample Library are subject to the
+following license:
+
+   Copyright (C) 2010, Intel Corporation
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+      * Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+
+      * Redistributions in binary form must reproduce the above
+        copyright notice, this list of conditions and the following
+        disclaimer in the documentation and/or other materials
+        provided with the distribution.
+
+      * Neither the name of Intel Corporation nor the names of its
+        contributors may be used to endorse or promote products
+        derived from this software without specific prior written
+        permission.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+   FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+   COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+   INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+   SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+   STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+   OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/doc/notice.rst b/doc/notice.rst
index 0215841..33ba998 100644
--- a/doc/notice.rst
+++ b/doc/notice.rst
@@ -1120,3 +1120,41 @@ The bundled libev source code is subject to the following license:
     required by the GPL in this and the other files of this package. If you do
     not delete the provisions above, a recipient may use your version of this
     file under either the BSD or the GPL.
+
+-------------------
+
+Files copied from the Intel AESNI Sample Library are subject to the
+following license:
+
+    Copyright |copy| 2010, Intel Corporation
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+        * Redistributions of source code must retain the above
+          copyright notice, this list of conditions and the following
+          disclaimer.
+        * Redistributions in binary form must reproduce the above
+          copyright notice, this list of conditions and the following
+          disclaimer in the documentation and/or other materials
+          provided with the distribution.
+        * Neither the name of Intel Corporation nor the names of its
+          contributors may be used to endorse or promote products
+          derived from this software without specific prior written
+          permission.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+    CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+    DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+    BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+    TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+    ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+    TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+    THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+    SUCH DAMAGE.
diff --git a/src/lib/crypto/builtin/aes/iaesx64.s b/src/lib/crypto/builtin/aes/iaesx64.s
new file mode 100644
index 0000000..1012e36
--- /dev/null
+++ b/src/lib/crypto/builtin/aes/iaesx64.s
@@ -0,0 +1,2081 @@
+[bits 64]
+[CPU intelnop]
+
+; Copyright (c) 2010, Intel Corporation
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+;     * Redistributions of source code must retain the above copyright notice,
+;       this list of conditions and the following disclaimer.
+;     * Redistributions in binary form must reproduce the above copyright notice,
+;       this list of conditions and the following disclaimer in the documentation
+;       and/or other materials provided with the distribution.
+;     * Neither the name of Intel Corporation nor the names of its contributors
+;       may be used to endorse or promote products derived from this software
+;       without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%macro linux_setup 0
+%ifdef __linux__
+	mov rcx, rdi
+	mov rdx, rsi
+%endif
+%endmacro
+
+%macro inversekey 1
+	movdqu  xmm1,%1
+	aesimc	xmm0,xmm1
+	movdqu	%1,xmm0
+%endmacro
+
+%macro aesdeclast1 1
+	aesdeclast	xmm0,%1
+%endmacro
+
+%macro aesenclast1 1
+	aesenclast	xmm0,%1
+%endmacro
+
+%macro aesdec1 1
+	aesdec	xmm0,%1
+%endmacro
+
+%macro aesenc1 1
+	aesenc	xmm0,%1
+%endmacro
+
+
+%macro aesdeclast1_u 1
+	movdqu xmm4,%1
+	aesdeclast	xmm0,xmm4
+%endmacro
+
+%macro aesenclast1_u 1
+	movdqu xmm4,%1
+	aesenclast	xmm0,xmm4
+%endmacro
+
+%macro aesdec1_u 1
+	movdqu xmm4,%1
+	aesdec	xmm0,xmm4
+%endmacro
+
+%macro aesenc1_u 1
+	movdqu xmm4,%1
+	aesenc	xmm0,xmm4
+%endmacro
+
+%macro aesdec4 1
+	movdqa	xmm4,%1
+
+	aesdec	xmm0,xmm4
+	aesdec	xmm1,xmm4
+	aesdec	xmm2,xmm4
+	aesdec	xmm3,xmm4
+
+%endmacro
+
+%macro aesdeclast4 1
+	movdqa	xmm4,%1
+
+	aesdeclast	xmm0,xmm4
+	aesdeclast	xmm1,xmm4
+	aesdeclast	xmm2,xmm4
+	aesdeclast	xmm3,xmm4
+
+%endmacro
+
+
+%macro aesenc4 1
+	movdqa	xmm4,%1
+
+	aesenc	xmm0,xmm4
+	aesenc	xmm1,xmm4
+	aesenc	xmm2,xmm4
+	aesenc	xmm3,xmm4
+
+%endmacro
+
+%macro aesenclast4 1
+	movdqa	xmm4,%1
+
+	aesenclast	xmm0,xmm4
+	aesenclast	xmm1,xmm4
+	aesenclast	xmm2,xmm4
+	aesenclast	xmm3,xmm4
+
+%endmacro
+
+
+%macro load_and_inc4 1
+	movdqa	xmm4,%1
+	movdqa	xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	movdqa  xmm1,xmm5
+	paddd	xmm1,[counter_add_one wrt rip]
+	pshufb	xmm1, xmm6 ; byte swap counter back
+	movdqa  xmm2,xmm5
+	paddd	xmm2,[counter_add_two wrt rip]
+	pshufb	xmm2, xmm6 ; byte swap counter back
+	movdqa  xmm3,xmm5
+	paddd	xmm3,[counter_add_three wrt rip]
+	pshufb	xmm3, xmm6 ; byte swap counter back
+	pxor	xmm0,xmm4
+	paddd	xmm5,[counter_add_four wrt rip]
+	pxor	xmm1,xmm4
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm4
+%endmacro
+
+%macro xor_with_input4 1
+	movdqu xmm4,[%1]
+	pxor xmm0,xmm4
+	movdqu xmm4,[%1+16]
+	pxor xmm1,xmm4
+	movdqu xmm4,[%1+32]
+	pxor xmm2,xmm4
+	movdqu xmm4,[%1+48]
+	pxor xmm3,xmm4
+%endmacro
+
+
+
+%macro load_and_xor4 2
+	movdqa	xmm4,%2
+	movdqu	xmm0,[%1 + 0*16]
+	pxor	xmm0,xmm4
+	movdqu	xmm1,[%1 + 1*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm2,[%1 + 2*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm3,[%1 + 3*16]
+	pxor	xmm3,xmm4
+%endmacro
+
+%macro store4 1
+	movdqu [%1 + 0*16],xmm0
+	movdqu [%1 + 1*16],xmm1
+	movdqu [%1 + 2*16],xmm2
+	movdqu [%1 + 3*16],xmm3
+%endmacro
+
+%macro copy_round_keys 3
+	movdqu xmm4,[%2 + ((%3)*16)]
+	movdqa [%1 + ((%3)*16)],xmm4
+%endmacro
+
+
+%macro key_expansion_1_192 1
+		;; Assumes the xmm3 includes all zeros at this point.
+        pshufd xmm2, xmm2, 11111111b
+        shufps xmm3, xmm1, 00010000b
+        pxor xmm1, xmm3
+        shufps xmm3, xmm1, 10001100b
+        pxor xmm1, xmm3
+		pxor xmm1, xmm2
+		movdqu [rdx+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192 1
+		movdqa xmm5, xmm4
+		pslldq xmm5, 4
+		shufps xmm6, xmm1, 11110000b
+		pxor xmm6, xmm5
+		pxor xmm4, xmm6
+		pshufd xmm7, xmm4, 00001110b
+		movdqu [rdx+%1], xmm7
+%endmacro
+
+
+section .data
+align 16
+shuffle_mask:
+DD 0FFFFFFFFh
+DD 03020100h
+DD 07060504h
+DD 0B0A0908h
+
+byte_swap_16:
+DDQ 0x000102030405060708090A0B0C0D0E0F
+
+align 16
+counter_add_one:
+DD 1
+DD 0
+DD 0
+DD 0
+
+counter_add_two:
+DD 2
+DD 0
+DD 0
+DD 0
+
+counter_add_three:
+DD 3
+DD 0
+DD 0
+DD 0
+
+counter_add_four:
+DD 4
+DD 0
+DD 0
+DD 0
+
+
+
+section .text
+
+align 16
+key_expansion256:
+
+    pshufd xmm2, xmm2, 011111111b
+
+    movdqa xmm4, xmm1
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pxor xmm1, xmm2
+
+    movdqu [rdx], xmm1
+    add rdx, 0x10
+
+    aeskeygenassist xmm4, xmm1, 0
+    pshufd xmm2, xmm4, 010101010b
+
+    movdqa xmm4, xmm3
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pxor xmm3, xmm2
+
+    movdqu [rdx], xmm3
+    add rdx, 0x10
+
+    ret
+
+
+
+align 16
+key_expansion128:
+    pshufd xmm2, xmm2, 0xFF;
+    movdqa xmm3, xmm1
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pxor xmm1, xmm2
+
+    ; storing the result in the key schedule array
+    movdqu [rdx], xmm1
+    add rdx, 0x10
+    ret
+
+
+
+
+
+
+align 16
+global iEncExpandKey128
+iEncExpandKey128:
+
+		linux_setup
+
+        movdqu xmm1, [rcx]    ; loading the key
+
+        movdqu [rdx], xmm1
+
+        movdqa xmm5, [shuffle_mask wrt rip]
+
+        add rdx,16
+
+        aeskeygenassist xmm2, xmm1, 0x1     ; Generating round key 1
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x2     ; Generating round key 2
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x4     ; Generating round key 3
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x8     ; Generating round key 4
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x10    ; Generating round key 5
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x20    ; Generating round key 6
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x40    ; Generating round key 7
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x80    ; Generating round key 8
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x1b    ; Generating round key 9
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x36    ; Generating round key 10
+        call key_expansion128
+
+		ret
+
+
+
+align 16
+global iEncExpandKey192
+iEncExpandKey192:
+
+		linux_setup
+		sub rsp,64+8
+		movdqa	[rsp],xmm6
+		movdqa	[rsp+16],xmm7
+
+
+        movq xmm7, [rcx+16]	; loading the AES key
+        movq [rdx+16], xmm7  ; Storing key in memory where all key expansion will be stored
+        pshufd xmm4, xmm7, 01001111b
+        movdqu xmm1, [rcx]	; loading the AES key
+        movdqu [rdx], xmm1  ; Storing key in memory where all key expansion
+
+        pxor xmm3, xmm3		; Set xmm3 to be all zeros. Required for the key_expansion.
+        pxor xmm6, xmm6		; Set xmm6 to be all zeros. Required for the key_expansion.
+
+        aeskeygenassist xmm2, xmm4, 0x1     ; Complete round key 1 and generate round key 2
+        key_expansion_1_192 24
+		key_expansion_2_192 40
+
+        aeskeygenassist xmm2, xmm4, 0x2     ; Generate round key 3 and part of round key 4
+        key_expansion_1_192 48
+		key_expansion_2_192 64
+
+        aeskeygenassist xmm2, xmm4, 0x4     ; Complete round key 4 and generate round key 5
+        key_expansion_1_192 72
+		key_expansion_2_192 88
+
+        aeskeygenassist xmm2, xmm4, 0x8     ; Generate round key 6 and part of round key 7
+        key_expansion_1_192 96
+		key_expansion_2_192 112
+
+        aeskeygenassist xmm2, xmm4, 0x10     ; Complete round key 7 and generate round key 8
+        key_expansion_1_192 120
+		key_expansion_2_192 136
+
+        aeskeygenassist xmm2, xmm4, 0x20     ; Generate round key 9 and part of round key 10
+        key_expansion_1_192 144
+		key_expansion_2_192 160
+
+        aeskeygenassist xmm2, xmm4, 0x40     ; Complete round key 10 and generate round key 11
+        key_expansion_1_192 168
+		key_expansion_2_192 184
+
+        aeskeygenassist xmm2, xmm4, 0x80     ; Generate round key 12
+        key_expansion_1_192 192
+
+
+		movdqa	xmm6,[rsp]
+		movdqa	xmm7,[rsp+16]
+		add rsp,64+8
+
+		ret
+
+
+
+
+align 16
+global iDecExpandKey128
+iDecExpandKey128:
+
+	linux_setup
+	push rcx
+	push rdx
+	sub rsp,16+8
+
+	call iEncExpandKey128
+
+	add rsp,16+8
+	pop rdx
+	pop rcx
+
+	inversekey [rdx + 1*16]
+	inversekey [rdx + 2*16]
+	inversekey [rdx + 3*16]
+	inversekey [rdx + 4*16]
+	inversekey [rdx + 5*16]
+	inversekey [rdx + 6*16]
+	inversekey [rdx + 7*16]
+	inversekey [rdx + 8*16]
+	inversekey [rdx + 9*16]
+
+	ret
+
+
+align 16
+global iDecExpandKey192
+iDecExpandKey192:
+
+	linux_setup
+	push rcx
+	push rdx
+	sub rsp,16+8
+
+	call iEncExpandKey192
+
+	add rsp,16+8
+	pop rdx
+	pop rcx
+
+
+	inversekey [rdx + 1*16]
+	inversekey [rdx + 2*16]
+	inversekey [rdx + 3*16]
+	inversekey [rdx + 4*16]
+	inversekey [rdx + 5*16]
+	inversekey [rdx + 6*16]
+	inversekey [rdx + 7*16]
+	inversekey [rdx + 8*16]
+	inversekey [rdx + 9*16]
+	inversekey [rdx + 10*16]
+	inversekey [rdx + 11*16]
+
+	ret
+
+
+
+align 16
+global iDecExpandKey256
+iDecExpandKey256:
+
+	linux_setup
+	push rcx
+	push rdx
+	sub rsp,16+8
+
+	call iEncExpandKey256
+
+	add rsp,16+8
+	pop rdx
+	pop rcx
+
+	inversekey [rdx + 1*16]
+	inversekey [rdx + 2*16]
+	inversekey [rdx + 3*16]
+	inversekey [rdx + 4*16]
+	inversekey [rdx + 5*16]
+	inversekey [rdx + 6*16]
+	inversekey [rdx + 7*16]
+	inversekey [rdx + 8*16]
+	inversekey [rdx + 9*16]
+	inversekey [rdx + 10*16]
+	inversekey [rdx + 11*16]
+	inversekey [rdx + 12*16]
+	inversekey [rdx + 13*16]
+
+	ret
+
+
+
+
+align 16
+global iEncExpandKey256
+iEncExpandKey256:
+
+	linux_setup
+
+    movdqu xmm1, [rcx]    ; loading the key
+    movdqu xmm3, [rcx+16]
+    movdqu [rdx], xmm1  ; Storing key in memory where all key schedule will be stored
+    movdqu [rdx+16], xmm3
+
+    add rdx,32
+
+    movdqa xmm5, [shuffle_mask wrt rip]  ; this mask is used by key_expansion
+
+    aeskeygenassist xmm2, xmm3, 0x1     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x2     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x4     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x8     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x10    ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x20    ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x40    ;
+;    call key_expansion256
+
+    pshufd xmm2, xmm2, 011111111b
+
+    movdqa xmm4, xmm1
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pxor xmm1, xmm2
+
+    movdqu [rdx], xmm1
+
+
+	ret
+
+
+
+
+
+
+align 16
+global iDec128
+iDec128:
+
+	linux_setup
+	sub rsp,16*16+8
+
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_dec128
+
+	cmp eax,4
+	jl	lp128decsingle
+
+	test	rcx,0xf
+	jz		lp128decfour
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	mov rcx,rsp
+
+
+
+align 16
+lp128decfour:
+
+	test eax,eax
+	jz end_dec128
+
+	cmp eax,4
+	jl	lp128decsingle
+
+	load_and_xor4 rdx, [rcx+10*16]
+	add rdx,16*4
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	sub eax,4
+	store4 r8+rdx-(16*4)
+	jmp lp128decfour
+
+
+	align 16
+lp128decsingle:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+10*16]
+	pxor xmm0, xmm4
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+	add rdx, 16
+	movdqu  [r8 + rdx - 16], xmm0
+	dec eax
+	jnz lp128decsingle
+
+end_dec128:
+
+	add rsp,16*16+8
+	ret
+
+
+align 16
+global iDec128_CBC
+iDec128_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu	xmm5,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_dec128_CBC
+
+	cmp eax,4
+	jl	lp128decsingle_CBC
+
+	test	rcx,0xf
+	jz		lp128decfour_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	mov rcx,rsp
+
+
+align 16
+lp128decfour_CBC:
+
+	test eax,eax
+	jz end_dec128_CBC
+
+	cmp eax,4
+	jl	lp128decsingle_CBC
+
+	load_and_xor4 rdx, [rcx+10*16]
+	add rdx,16*4
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[rdx - 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[rdx - 16*4 + 3*16]
+
+	sub eax,4
+	store4 r8+rdx-(16*4)
+	jmp lp128decfour_CBC
+
+
+	align 16
+lp128decsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqa	xmm1,xmm0
+	movdqu xmm4,[rcx+10*16]
+	pxor xmm0, xmm4
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+	add rdx, 16
+	movdqu  [r8 + rdx - 16], xmm0
+	dec eax
+	jnz lp128decsingle_CBC
+
+end_dec128_CBC:
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm5
+	add rsp,16*16+8
+	ret
+
+
+align 16
+global iDec192_CBC
+iDec192_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu	xmm5,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_dec192_CBC
+
+	cmp eax,4
+	jl	lp192decsingle_CBC
+
+	test	rcx,0xf
+	jz		lp192decfour_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	mov rcx,rsp
+
+
+align 16
+lp192decfour_CBC:
+
+	test eax,eax
+	jz end_dec192_CBC
+
+	cmp eax,4
+	jl	lp192decsingle_CBC
+
+	load_and_xor4 rdx, [rcx+12*16]
+	add rdx,16*4
+	aesdec4 [rcx+11*16]
+	aesdec4 [rcx+10*16]
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[rdx - 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[rdx - 16*4 + 3*16]
+
+	sub eax,4
+	store4 r8+rdx-(16*4)
+	jmp lp192decfour_CBC
+
+
+	align 16
+lp192decsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+12*16]
+	movdqa	xmm1,xmm0
+	pxor xmm0, xmm4
+	aesdec1_u [rcx+11*16]
+	aesdec1_u [rcx+10*16]
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+	add rdx, 16
+	movdqu  [r8 + rdx - 16], xmm0
+	dec eax
+	jnz lp192decsingle_CBC
+
+end_dec192_CBC:
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm5
+	add rsp,16*16+8
+	ret
+
+
+
+
+align 16
+global iDec256_CBC
+iDec256_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu	xmm5,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_dec256_CBC
+
+	cmp eax,4
+	jl	lp256decsingle_CBC
+
+	test	rcx,0xf
+	jz		lp256decfour_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	copy_round_keys rsp,rcx,13
+	copy_round_keys rsp,rcx,14
+	mov rcx,rsp
+
+align 16
+lp256decfour_CBC:
+
+	test eax,eax
+	jz end_dec256_CBC
+
+	cmp eax,4
+	jl	lp256decsingle_CBC
+
+	load_and_xor4 rdx, [rcx+14*16]
+	add rdx,16*4
+	aesdec4 [rcx+13*16]
+	aesdec4 [rcx+12*16]
+	aesdec4 [rcx+11*16]
+	aesdec4 [rcx+10*16]
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[rdx - 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[rdx - 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[rdx - 16*4 + 3*16]
+
+	sub eax,4
+	store4 r8+rdx-(16*4)
+	jmp lp256decfour_CBC
+
+
+	align 16
+lp256decsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+14*16]
+	movdqa	xmm1,xmm0
+	pxor xmm0, xmm4
+	aesdec1_u [rcx+13*16]
+	aesdec1_u [rcx+12*16]
+	aesdec1_u [rcx+11*16]
+	aesdec1_u [rcx+10*16]
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+	add rdx, 16
+	movdqu  [r8 + rdx - 16], xmm0
+	dec eax
+	jnz lp256decsingle_CBC
+
+end_dec256_CBC:
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm5
+	add rsp,16*16+8
+	ret
+
+
+
+
+
+align 16
+global iDec192
+iDec192:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_dec192
+
+	cmp eax,4
+	jl	lp192decsingle
+
+	test	rcx,0xf
+	jz		lp192decfour
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	mov rcx,rsp
+
+align 16
+lp192decfour:
+
+	test eax,eax
+	jz end_dec192
+
+	cmp eax,4
+	jl	lp192decsingle
+
+	load_and_xor4 rdx, [rcx+12*16]
+	add rdx,16*4
+	aesdec4 [rcx+11*16]
+	aesdec4 [rcx+10*16]
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	sub eax,4
+	store4 r8+rdx-(16*4)
+	jmp lp192decfour
+
+
+	align 16
+lp192decsingle:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+12*16]
+	pxor xmm0, xmm4
+	aesdec1_u [rcx+11*16]
+	aesdec1_u [rcx+10*16]
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+	add rdx, 16
+	movdqu  [r8 + rdx - 16], xmm0
+	dec eax
+	jnz lp192decsingle
+
+end_dec192:
+
+	add rsp,16*16+8
+	ret
+
+
+
+
+align 16
+global iDec256
+iDec256:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_dec256
+
+	cmp eax,4
+	jl lp256dec
+
+	test	rcx,0xf
+	jz		lp256dec4
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	copy_round_keys rsp,rcx,13
+	copy_round_keys rsp,rcx,14
+	mov rcx,rsp
+
+
+	align 16
+lp256dec4:
+	test eax,eax
+	jz end_dec256
+
+	cmp eax,4
+	jl lp256dec
+
+	load_and_xor4 rdx,[rcx+14*16]
+	add rdx, 4*16
+	aesdec4 [rcx+13*16]
+	aesdec4 [rcx+12*16]
+	aesdec4 [rcx+11*16]
+	aesdec4 [rcx+10*16]
+	aesdec4 [rcx+9*16]
+	aesdec4 [rcx+8*16]
+	aesdec4 [rcx+7*16]
+	aesdec4 [rcx+6*16]
+	aesdec4 [rcx+5*16]
+	aesdec4 [rcx+4*16]
+	aesdec4 [rcx+3*16]
+	aesdec4 [rcx+2*16]
+	aesdec4 [rcx+1*16]
+	aesdeclast4 [rcx+0*16]
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lp256dec4
+
+	align 16
+lp256dec:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+14*16]
+	add rdx, 16
+	pxor xmm0, xmm4                    ; Round 0 (only xor)
+	aesdec1_u [rcx+13*16]
+	aesdec1_u [rcx+12*16]
+	aesdec1_u [rcx+11*16]
+	aesdec1_u [rcx+10*16]
+	aesdec1_u [rcx+9*16]
+	aesdec1_u [rcx+8*16]
+	aesdec1_u [rcx+7*16]
+	aesdec1_u [rcx+6*16]
+	aesdec1_u [rcx+5*16]
+	aesdec1_u [rcx+4*16]
+	aesdec1_u [rcx+3*16]
+	aesdec1_u [rcx+2*16]
+	aesdec1_u [rcx+1*16]
+	aesdeclast1_u [rcx+0*16]
+
+		; Store output decrypted data into PLAINTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp256dec
+
+end_dec256:
+
+	add rsp,16*16+8
+	ret
+
+
+
+
+
+
+align 16
+global iEnc128
+iEnc128:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_enc128
+
+	cmp eax,4
+	jl lp128encsingle
+
+	test	rcx,0xf
+	jz		lpenc128four
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	mov rcx,rsp
+
+
+	align 16
+
+lpenc128four:
+
+	test eax,eax
+	jz end_enc128
+
+	cmp eax,4
+	jl lp128encsingle
+
+	load_and_xor4 rdx,[rcx+0*16]
+	add rdx,4*16
+	aesenc4	[rcx+1*16]
+	aesenc4	[rcx+2*16]
+	aesenc4	[rcx+3*16]
+	aesenc4	[rcx+4*16]
+	aesenc4	[rcx+5*16]
+	aesenc4	[rcx+6*16]
+	aesenc4	[rcx+7*16]
+	aesenc4	[rcx+8*16]
+	aesenc4	[rcx+9*16]
+	aesenclast4	[rcx+10*16]
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lpenc128four
+
+	align 16
+lp128encsingle:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenclast1_u [rcx+10*16]
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp128encsingle
+
+end_enc128:
+
+	add rsp,16*16+8
+	ret
+
+
+align 16
+global iEnc128_CTR
+iEnc128_CTR:
+
+	linux_setup
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm5,[rax]
+
+
+	sub rsp,16*16+8+16
+
+	movdqa [rsp+16*16], xmm6
+	movdqa xmm6, [byte_swap_16 wrt rip]
+	pshufb xmm5, xmm6 ; byte swap counter
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_encctr128
+
+	cmp eax,4
+	jl lp128encctrsingle
+
+	test	rcx,0xf
+	jz		lpencctr128four
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	mov rcx,rsp
+
+
+	align 16
+
+lpencctr128four:
+
+	test eax,eax
+	jz end_encctr128
+
+	cmp eax,4
+	jl lp128encctrsingle
+
+	load_and_inc4 [rcx+0*16]
+	add rdx,4*16
+	aesenc4	[rcx+1*16]
+	aesenc4	[rcx+2*16]
+	aesenc4	[rcx+3*16]
+	aesenc4	[rcx+4*16]
+	aesenc4	[rcx+5*16]
+	aesenc4	[rcx+6*16]
+	aesenc4	[rcx+7*16]
+	aesenc4	[rcx+8*16]
+	aesenc4	[rcx+9*16]
+	aesenclast4	[rcx+10*16]
+	xor_with_input4 rdx-(4*16)
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lpencctr128four
+
+	align 16
+lp128encctrsingle:
+
+	movdqa xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	paddd	xmm5,[counter_add_one wrt rip]
+	add rdx, 16
+	movdqu xmm4,[rcx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenclast1_u [rcx+10*16]
+	movdqu xmm4, [rdx-16]
+	pxor  xmm0,xmm4
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp128encctrsingle
+
+end_encctr128:
+
+	mov	   r9,[r9+24]
+
+	pshufb xmm5, xmm6 ; byte swap counter
+	movdqu [r9],xmm5
+	movdqa xmm6, [rsp+16*16]
+	add rsp,16*16+8+16
+	ret
+
+
+
+align 16
+global iEnc192_CTR
+iEnc192_CTR:
+
+	linux_setup
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm5,[rax]
+
+
+	sub rsp,16*16+8+16
+
+	movdqa [rsp+16*16], xmm6
+	movdqa xmm6, [byte_swap_16 wrt rip]
+	pshufb xmm5, xmm6 ; byte swap counter
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_encctr192
+
+	cmp eax,4
+	jl lp192encctrsingle
+
+	test	rcx,0xf
+	jz		lpencctr192four
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	mov rcx,rsp
+
+
+	align 16
+
+lpencctr192four:
+
+	test eax,eax
+	jz end_encctr192
+
+	cmp eax,4
+	jl lp192encctrsingle
+
+	load_and_inc4 [rcx+0*16]
+	add rdx,4*16
+	aesenc4	[rcx+1*16]
+	aesenc4	[rcx+2*16]
+	aesenc4	[rcx+3*16]
+	aesenc4	[rcx+4*16]
+	aesenc4	[rcx+5*16]
+	aesenc4	[rcx+6*16]
+	aesenc4	[rcx+7*16]
+	aesenc4	[rcx+8*16]
+	aesenc4	[rcx+9*16]
+	aesenc4	[rcx+10*16]
+	aesenc4	[rcx+11*16]
+	aesenclast4	[rcx+12*16]
+	xor_with_input4 rdx-(4*16)
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lpencctr192four
+
+	align 16
+lp192encctrsingle:
+
+	movdqa xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	movdqu xmm4,[rcx+0*16]
+	paddd	xmm5,[counter_add_one wrt rip]
+	add rdx, 16
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenc1_u [rcx+10*16]
+	aesenc1_u [rcx+11*16]
+	aesenclast1_u [rcx+12*16]
+	movdqu xmm4, [rdx-16]
+	pxor  xmm0,xmm4
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp192encctrsingle
+
+end_encctr192:
+
+	mov	   r9,[r9+24]
+	pshufb xmm5, xmm6 ; byte swap counter
+	movdqu [r9],xmm5
+	movdqa xmm6, [rsp+16*16]
+	add rsp,16*16+8+16
+	ret
+
+
+align 16
+global iEnc256_CTR
+iEnc256_CTR:
+
+	linux_setup
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm5,[rax]
+
+
+	sub rsp,16*16+8+16
+
+	movdqa [rsp+16*16], xmm6
+	movdqa xmm6, [byte_swap_16 wrt rip]
+	pshufb xmm5, xmm6 ; byte swap counter
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_encctr256
+
+	cmp eax,4
+	jl lp256encctrsingle
+
+	test	rcx,0xf
+	jz		lpencctr256four
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	copy_round_keys rsp,rcx,13
+	copy_round_keys rsp,rcx,14
+	mov rcx,rsp
+
+
+	align 16
+
+lpencctr256four:
+
+	test eax,eax
+	jz end_encctr256
+
+	cmp eax,4
+	jl lp256encctrsingle
+
+	load_and_inc4 [rcx+0*16]
+	add rdx,4*16
+	aesenc4	[rcx+1*16]
+	aesenc4	[rcx+2*16]
+	aesenc4	[rcx+3*16]
+	aesenc4	[rcx+4*16]
+	aesenc4	[rcx+5*16]
+	aesenc4	[rcx+6*16]
+	aesenc4	[rcx+7*16]
+	aesenc4	[rcx+8*16]
+	aesenc4	[rcx+9*16]
+	aesenc4	[rcx+10*16]
+	aesenc4	[rcx+11*16]
+	aesenc4	[rcx+12*16]
+	aesenc4	[rcx+13*16]
+	aesenclast4	[rcx+14*16]
+	xor_with_input4 rdx-(4*16)
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lpencctr256four
+
+	align 16
+lp256encctrsingle:
+
+	movdqa xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	movdqu xmm4,[rcx+0*16]
+	paddd	xmm5,[counter_add_one wrt rip]
+	add rdx, 16
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenc1_u [rcx+10*16]
+	aesenc1_u [rcx+11*16]
+	aesenc1_u [rcx+12*16]
+	aesenc1_u [rcx+13*16]
+	aesenclast1_u [rcx+14*16]
+	movdqu xmm4, [rdx-16]
+	pxor  xmm0,xmm4
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp256encctrsingle
+
+end_encctr256:
+
+	mov	   r9,[r9+24]
+	pshufb xmm5, xmm6 ; byte swap counter
+	movdqu [r9],xmm5
+	movdqa xmm6, [rsp+16*16]
+	add rsp,16*16+8+16
+	ret
+
+
+
+
+
+
+
+align 16
+global iEnc128_CBC
+iEnc128_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm1,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test	rcx,0xf
+	jz		lp128encsingle_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	mov rcx,rsp
+
+
+	align 16
+
+lp128encsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4,[rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm1
+	pxor xmm0, xmm4
+	aesenc1 [rcx+1*16]
+	aesenc1 [rcx+2*16]
+	aesenc1 [rcx+3*16]
+	aesenc1 [rcx+4*16]
+	aesenc1 [rcx+5*16]
+	aesenc1 [rcx+6*16]
+	aesenc1 [rcx+7*16]
+	aesenc1 [rcx+8*16]
+	aesenc1 [rcx+9*16]
+	aesenclast1 [rcx+10*16]
+	movdqa xmm1,xmm0
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp128encsingle_CBC
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm1
+	add rsp,16*16+8
+	ret
+
+
+align 16
+global iEnc192_CBC
+iEnc192_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm1,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test	rcx,0xf
+	jz		lp192encsingle_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	mov rcx,rsp
+
+
+
+	align 16
+
+lp192encsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4, [rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm1
+	pxor xmm0, xmm4
+	aesenc1 [rcx+1*16]
+	aesenc1 [rcx+2*16]
+	aesenc1 [rcx+3*16]
+	aesenc1 [rcx+4*16]
+	aesenc1 [rcx+5*16]
+	aesenc1 [rcx+6*16]
+	aesenc1 [rcx+7*16]
+	aesenc1 [rcx+8*16]
+	aesenc1 [rcx+9*16]
+	aesenc1 [rcx+10*16]
+	aesenc1 [rcx+11*16]
+	aesenclast1 [rcx+12*16]
+	movdqa xmm1,xmm0
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp192encsingle_CBC
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm1
+
+	add rsp,16*16+8
+	ret
+
+
+align 16
+global iEnc256_CBC
+iEnc256_CBC:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm1,[rax]
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test	rcx,0xf
+	jz		lp256encsingle_CBC
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	copy_round_keys rsp,rcx,13
+	copy_round_keys rsp,rcx,14
+	mov rcx,rsp
+
+	align 16
+
+lp256encsingle_CBC:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4, [rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm1
+	pxor xmm0, xmm4
+	aesenc1 [rcx+1*16]
+	aesenc1 [rcx+2*16]
+	aesenc1 [rcx+3*16]
+	aesenc1 [rcx+4*16]
+	aesenc1 [rcx+5*16]
+	aesenc1 [rcx+6*16]
+	aesenc1 [rcx+7*16]
+	aesenc1 [rcx+8*16]
+	aesenc1 [rcx+9*16]
+	aesenc1 [rcx+10*16]
+	aesenc1 [rcx+11*16]
+	aesenc1 [rcx+12*16]
+	aesenc1 [rcx+13*16]
+	aesenclast1 [rcx+14*16]
+	movdqa xmm1,xmm0
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp256encsingle_CBC
+
+	mov	   r9,[r9+24]
+	movdqu [r9],xmm1
+	add rsp,16*16+8
+	ret
+
+
+
+
+align 16
+global iEnc192
+iEnc192:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+	test eax,eax
+	jz end_enc192
+
+	cmp eax,4
+	jl lp192encsingle
+
+	test	rcx,0xf
+	jz		lpenc192four
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	mov rcx,rsp
+
+
+	align 16
+
+lpenc192four:
+
+	test eax,eax
+	jz end_enc192
+
+	cmp eax,4
+	jl lp192encsingle
+
+	load_and_xor4 rdx,[rcx+0*16]
+	add rdx,4*16
+	aesenc4	[rcx+1*16]
+	aesenc4	[rcx+2*16]
+	aesenc4	[rcx+3*16]
+	aesenc4	[rcx+4*16]
+	aesenc4	[rcx+5*16]
+	aesenc4	[rcx+6*16]
+	aesenc4	[rcx+7*16]
+	aesenc4	[rcx+8*16]
+	aesenc4	[rcx+9*16]
+	aesenc4	[rcx+10*16]
+	aesenc4	[rcx+11*16]
+	aesenclast4	[rcx+12*16]
+
+	store4 r8+rdx-16*4
+	sub eax,4
+	jmp lpenc192four
+
+	align 16
+lp192encsingle:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4, [rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenc1_u [rcx+10*16]
+	aesenc1_u [rcx+11*16]
+	aesenclast1_u [rcx+12*16]
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp192encsingle
+
+end_enc192:
+
+	add rsp,16*16+8
+	ret
+
+
+
+
+
+
+align 16
+global iEnc256
+iEnc256:
+
+	linux_setup
+	sub rsp,16*16+8
+
+	mov eax,[rcx+32] ; numblocks
+	mov rdx,[rcx]
+	mov r8,[rcx+8]
+	mov rcx,[rcx+16]
+
+	sub r8,rdx
+
+
+	test eax,eax
+	jz end_enc256
+
+	cmp eax,4
+	jl lp256enc
+
+	test	rcx,0xf
+	jz		lp256enc4
+
+	copy_round_keys rsp,rcx,0
+	copy_round_keys rsp,rcx,1
+	copy_round_keys rsp,rcx,2
+	copy_round_keys rsp,rcx,3
+	copy_round_keys rsp,rcx,4
+	copy_round_keys rsp,rcx,5
+	copy_round_keys rsp,rcx,6
+	copy_round_keys rsp,rcx,7
+	copy_round_keys rsp,rcx,8
+	copy_round_keys rsp,rcx,9
+	copy_round_keys rsp,rcx,10
+	copy_round_keys rsp,rcx,11
+	copy_round_keys rsp,rcx,12
+	copy_round_keys rsp,rcx,13
+	copy_round_keys rsp,rcx,14
+	mov rcx,rsp
+
+
+	align 16
+
+lp256enc4:
+	test eax,eax
+	jz end_enc256
+
+	cmp eax,4
+	jl lp256enc
+
+
+	load_and_xor4 rdx,[rcx+0*16]
+	add rdx, 16*4
+	aesenc4 [rcx+1*16]
+	aesenc4 [rcx+2*16]
+	aesenc4 [rcx+3*16]
+	aesenc4 [rcx+4*16]
+	aesenc4 [rcx+5*16]
+	aesenc4 [rcx+6*16]
+	aesenc4 [rcx+7*16]
+	aesenc4 [rcx+8*16]
+	aesenc4 [rcx+9*16]
+	aesenc4 [rcx+10*16]
+	aesenc4 [rcx+11*16]
+	aesenc4 [rcx+12*16]
+	aesenc4 [rcx+13*16]
+	aesenclast4 [rcx+14*16]
+
+	store4  r8+rdx-16*4
+	sub eax,4
+	jmp lp256enc4
+
+	align 16
+lp256enc:
+
+	movdqu xmm0, [rdx]
+	movdqu xmm4, [rcx+0*16]
+	add rdx, 16
+	pxor xmm0, xmm4
+	aesenc1_u [rcx+1*16]
+	aesenc1_u [rcx+2*16]
+	aesenc1_u [rcx+3*16]
+	aesenc1_u [rcx+4*16]
+	aesenc1_u [rcx+5*16]
+	aesenc1_u [rcx+6*16]
+	aesenc1_u [rcx+7*16]
+	aesenc1_u [rcx+8*16]
+	aesenc1_u [rcx+9*16]
+	aesenc1_u [rcx+10*16]
+	aesenc1_u [rcx+11*16]
+	aesenc1_u [rcx+12*16]
+	aesenc1_u [rcx+13*16]
+	aesenclast1_u [rcx+14*16]
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [r8+rdx-16], xmm0
+	dec eax
+	jnz lp256enc
+
+end_enc256:
+
+	add rsp,16*16+8
+	ret
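
For orientation before the 32-bit variant below: the memory offsets read by
the 64-bit bulk routines above ([rcx], [rcx+8], [rcx+16], [rcx+24], [rcx+32]
in iEnc128_CBC and friends) suggest that each takes a single pointer to a
small parameter block; the 32-bit file that follows reads the same layout
with 4-byte pointers ([ecx], [ecx+4], [ecx+8], [ecx+12], [ecx+16]).  A
hypothetical C view of that block, for illustration only; the field names
and prototypes here are invented and are not part of this commit:

    /* Hypothetical parameter block implied by the offsets used above;
     * field names are made up.  The bulk routines read 'nblocks' 16-byte
     * blocks from 'in', write the result to 'out', and the CBC/CTR
     * variants load the IV/counter through 'iv' and store the updated
     * value back there on return. */
    typedef struct {
        const unsigned char *in;      /* +0  (+0 on x86)  source buffer    */
        unsigned char       *out;     /* +8  (+4 on x86)  destination      */
        const unsigned char *expkey;  /* +16 (+8 on x86)  key schedule     */
        unsigned char       *iv;      /* +24 (+12 on x86) IV or counter    */
        unsigned int         nblocks; /* +32 (+16 on x86) 16-byte blocks   */
    } iaes_params;

    /* Key expansion takes the raw key and the schedule buffer directly. */
    void iEncExpandKey128(const void *key, void *expanded_key);
    void iEnc128_CBC(iaes_params *data);
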
diff --git a/src/lib/crypto/builtin/aes/iaesx86.s b/src/lib/crypto/builtin/aes/iaesx86.s
new file mode 100644
index 0000000..c65921b
--- /dev/null
+++ b/src/lib/crypto/builtin/aes/iaesx86.s
@@ -0,0 +1,2199 @@
+[bits 32]
+[CPU intelnop]
+
+; Copyright (c) 2010, Intel Corporation
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+;     * Redistributions of source code must retain the above copyright notice,
+;       this list of conditions and the following disclaimer.
+;     * Redistributions in binary form must reproduce the above copyright notice,
+;       this list of conditions and the following disclaimer in the documentation
+;       and/or other materials provided with the distribution.
+;     * Neither the name of Intel Corporation nor the names of its contributors
+;       may be used to endorse or promote products derived from this software
+;       without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+; IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+; INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+; BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+%macro inversekey 1
+	movdqu  xmm1,%1
+	aesimc	xmm0,xmm1
+	movdqu	%1,xmm0
+%endmacro
+
+
+%macro aesdec4 1
+	movdqa	xmm4,%1
+
+	aesdec	xmm0,xmm4
+	aesdec	xmm1,xmm4
+	aesdec	xmm2,xmm4
+	aesdec	xmm3,xmm4
+
+%endmacro
+
+
+%macro aesdeclast4 1
+	movdqa	xmm4,%1
+
+	aesdeclast	xmm0,xmm4
+	aesdeclast	xmm1,xmm4
+	aesdeclast	xmm2,xmm4
+	aesdeclast	xmm3,xmm4
+
+%endmacro
+
+
+%macro aesenc4 1
+	movdqa	xmm4,%1
+
+	aesenc	xmm0,xmm4
+	aesenc	xmm1,xmm4
+	aesenc	xmm2,xmm4
+	aesenc	xmm3,xmm4
+
+%endmacro
+
+%macro aesenclast4 1
+	movdqa	xmm4,%1
+
+	aesenclast	xmm0,xmm4
+	aesenclast	xmm1,xmm4
+	aesenclast	xmm2,xmm4
+	aesenclast	xmm3,xmm4
+
+%endmacro
+
+
+%macro aesdeclast1 1
+	aesdeclast	xmm0,%1
+%endmacro
+
+%macro aesenclast1 1
+	aesenclast	xmm0,%1
+%endmacro
+
+%macro aesdec1 1
+	aesdec	xmm0,%1
+%endmacro
+
+;abab
+%macro aesenc1 1
+	aesenc	xmm0,%1
+%endmacro
+
+
+%macro aesdeclast1_u 1
+	movdqu xmm4,%1
+	aesdeclast	xmm0,xmm4
+%endmacro
+
+%macro aesenclast1_u 1
+	movdqu xmm4,%1
+	aesenclast	xmm0,xmm4
+%endmacro
+
+%macro aesdec1_u 1
+	movdqu xmm4,%1
+	aesdec	xmm0,xmm4
+%endmacro
+
+%macro aesenc1_u 1
+	movdqu xmm4,%1
+	aesenc	xmm0,xmm4
+%endmacro
+
+
+%macro load_and_xor4 2
+	movdqa	xmm4,%2
+	movdqu	xmm0,[%1 + 0*16]
+	pxor	xmm0,xmm4
+	movdqu	xmm1,[%1 + 1*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm2,[%1 + 2*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm3,[%1 + 3*16]
+	pxor	xmm3,xmm4
+%endmacro
+
+
+%macro load_and_inc4 1
+	movdqa	xmm4,%1
+	movdqa	xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	movdqa  xmm1,xmm5
+	paddd	xmm1,[counter_add_one]
+	pshufb	xmm1, xmm6 ; byte swap counter back
+	movdqa  xmm2,xmm5
+	paddd	xmm2,[counter_add_two]
+	pshufb	xmm2, xmm6 ; byte swap counter back
+	movdqa  xmm3,xmm5
+	paddd	xmm3,[counter_add_three]
+	pshufb	xmm3, xmm6 ; byte swap counter back
+	pxor	xmm0,xmm4
+	paddd	xmm5,[counter_add_four]
+	pxor	xmm1,xmm4
+	pxor	xmm2,xmm4
+	pxor	xmm3,xmm4
+%endmacro
+
+%macro xor_with_input4 1
+	movdqu xmm4,[%1]
+	pxor xmm0,xmm4
+	movdqu xmm4,[%1+16]
+	pxor xmm1,xmm4
+	movdqu xmm4,[%1+32]
+	pxor xmm2,xmm4
+	movdqu xmm4,[%1+48]
+	pxor xmm3,xmm4
+%endmacro
+
+%macro store4 1
+	movdqu [%1 + 0*16],xmm0
+	movdqu [%1 + 1*16],xmm1
+	movdqu [%1 + 2*16],xmm2
+	movdqu [%1 + 3*16],xmm3
+%endmacro
+
+
+%macro copy_round_keys 3
+	movdqu xmm4,[%2 + ((%3)*16)]
+	movdqa [%1 + ((%3)*16)],xmm4
+%endmacro
+
+;abab
+%macro copy_round_keyx 3
+	movdqu xmm4,[%2 + ((%3)*16)]
+	movdqa %1,xmm4
+%endmacro
+
+
+
+%macro key_expansion_1_192 1
+		;; Assumes the xmm3 includes all zeros at this point.
+        pshufd xmm2, xmm2, 11111111b
+        shufps xmm3, xmm1, 00010000b
+        pxor xmm1, xmm3
+        shufps xmm3, xmm1, 10001100b
+        pxor xmm1, xmm3
+		pxor xmm1, xmm2
+		movdqu [edx+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192 1
+		movdqa xmm5, xmm4
+		pslldq xmm5, 4
+		shufps xmm6, xmm1, 11110000b
+		pxor xmm6, xmm5
+		pxor xmm4, xmm6
+		pshufd xmm7, xmm4, 00001110b
+		movdqu [edx+%1], xmm7
+%endmacro
+
+
+
+
+
+section .data
+align 16
+shuffle_mask:
+DD 0FFFFFFFFh
+DD 03020100h
+DD 07060504h
+DD 0B0A0908h
+
+byte_swap_16:
+DDQ 0x000102030405060708090A0B0C0D0E0F
+
+align 16
+counter_add_one:
+DD 1
+DD 0
+DD 0
+DD 0
+
+counter_add_two:
+DD 2
+DD 0
+DD 0
+DD 0
+
+counter_add_three:
+DD 3
+DD 0
+DD 0
+DD 0
+
+counter_add_four:
+DD 4
+DD 0
+DD 0
+DD 0
+
+
+section .text
+
+
+
+align 16
+key_expansion256:
+
+    pshufd xmm2, xmm2, 011111111b
+
+    movdqu xmm4, xmm1
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pxor xmm1, xmm2
+
+    movdqu [edx], xmm1
+    add edx, 0x10
+
+    aeskeygenassist xmm4, xmm1, 0
+    pshufd xmm2, xmm4, 010101010b
+
+    movdqu xmm4, xmm3
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm3, xmm4
+    pxor xmm3, xmm2
+
+    movdqu [edx], xmm3
+    add edx, 0x10
+
+    ret
+
+
+
+align 16
+key_expansion128:
+    pshufd xmm2, xmm2, 0xFF;
+    movdqu xmm3, xmm1
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pshufb xmm3, xmm5
+    pxor xmm1, xmm3
+    pxor xmm1, xmm2
+
+    ; storing the result in the key schedule array
+    movdqu [edx], xmm1
+    add edx, 0x10
+    ret
+
+
+
+align 16
+global _iEncExpandKey128
+_iEncExpandKey128:
+
+	mov ecx,[esp-4+8]		;input
+	mov edx,[esp-4+12]		;ctx
+
+        movdqu xmm1, [ecx]    ; loading the key
+
+        movdqu [edx], xmm1
+
+        movdqa xmm5, [shuffle_mask]
+
+        add edx,16
+
+        aeskeygenassist xmm2, xmm1, 0x1     ; Generating round key 1
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x2     ; Generating round key 2
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x4     ; Generating round key 3
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x8     ; Generating round key 4
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x10    ; Generating round key 5
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x20    ; Generating round key 6
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x40    ; Generating round key 7
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x80    ; Generating round key 8
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x1b    ; Generating round key 9
+        call key_expansion128
+        aeskeygenassist xmm2, xmm1, 0x36    ; Generating round key 10
+        call key_expansion128
+
+	ret
+
+
+align 16
+global _iEncExpandKey192
+_iEncExpandKey192:
+
+	mov ecx,[esp-4+8]		;input
+	mov edx,[esp-4+12]		;ctx
+
+        movq xmm7, [ecx+16]	; loading the AES key
+        movq [edx+16], xmm7  ; Storing key in memory where all key expansion will be stored
+        pshufd xmm4, xmm7, 01001111b
+        movdqu xmm1, [ecx]	; loading the AES key
+        movdqu [edx], xmm1  ; Storing key in memory where all key expansion
+
+        pxor xmm3, xmm3		; Set xmm3 to be all zeros. Required for the key_expansion.
+        pxor xmm6, xmm6		; Set xmm6 to be all zeros. Required for the key_expansion.
+
+        aeskeygenassist xmm2, xmm4, 0x1     ; Complete round key 1 and generate round key 2
+        key_expansion_1_192 24
+	key_expansion_2_192 40
+
+        aeskeygenassist xmm2, xmm4, 0x2     ; Generate round key 3 and part of round key 4
+        key_expansion_1_192 48
+	key_expansion_2_192 64
+
+        aeskeygenassist xmm2, xmm4, 0x4     ; Complete round key 4 and generate round key 5
+        key_expansion_1_192 72
+	key_expansion_2_192 88
+
+        aeskeygenassist xmm2, xmm4, 0x8     ; Generate round key 6 and part of round key 7
+        key_expansion_1_192 96
+	key_expansion_2_192 112
+
+        aeskeygenassist xmm2, xmm4, 0x10     ; Complete round key 7 and generate round key 8
+        key_expansion_1_192 120
+	key_expansion_2_192 136
+
+        aeskeygenassist xmm2, xmm4, 0x20     ; Generate round key 9 and part of round key 10
+        key_expansion_1_192 144
+	key_expansion_2_192 160
+
+        aeskeygenassist xmm2, xmm4, 0x40     ; Complete round key 10 and generate round key 11
+        key_expansion_1_192 168
+	key_expansion_2_192 184
+
+        aeskeygenassist xmm2, xmm4, 0x80     ; Generate round key 12
+        key_expansion_1_192 192
+
+	ret
+
+
+
+
+
+
+align 16
+global _iDecExpandKey128
+_iDecExpandKey128:
+	push DWORD [esp+8]
+	push DWORD [esp+8]
+
+	call _iEncExpandKey128
+	add esp,8
+
+	mov edx,[esp-4+12]		;ctx
+
+	inversekey	[edx + 1*16]
+	inversekey	[edx + 2*16]
+	inversekey	[edx + 3*16]
+	inversekey	[edx + 4*16]
+	inversekey	[edx + 5*16]
+	inversekey	[edx + 6*16]
+	inversekey	[edx + 7*16]
+	inversekey	[edx + 8*16]
+	inversekey	[edx + 9*16]
+
+	ret
+
+
+
+
+align 16
+global _iDecExpandKey192
+_iDecExpandKey192:
+	push DWORD [esp+8]
+	push DWORD [esp+8]
+
+	call _iEncExpandKey192
+	add esp,8
+
+	mov edx,[esp-4+12]		;ctx
+
+	inversekey	[edx + 1*16]
+	inversekey	[edx + 2*16]
+	inversekey	[edx + 3*16]
+	inversekey	[edx + 4*16]
+	inversekey	[edx + 5*16]
+	inversekey	[edx + 6*16]
+	inversekey	[edx + 7*16]
+	inversekey	[edx + 8*16]
+	inversekey	[edx + 9*16]
+	inversekey	[edx + 10*16]
+	inversekey	[edx + 11*16]
+
+	ret
+
+
+
+
+align 16
+global _iDecExpandKey256
+_iDecExpandKey256:
+	push DWORD [esp+8]
+	push DWORD [esp+8]
+
+	call _iEncExpandKey256
+	add esp, 8
+
+	mov edx, [esp-4+12]		;expanded key
+
+	inversekey	[edx + 1*16]
+	inversekey	[edx + 2*16]
+	inversekey	[edx + 3*16]
+	inversekey	[edx + 4*16]
+	inversekey	[edx + 5*16]
+	inversekey	[edx + 6*16]
+	inversekey	[edx + 7*16]
+	inversekey	[edx + 8*16]
+	inversekey	[edx + 9*16]
+	inversekey	[edx + 10*16]
+	inversekey	[edx + 11*16]
+	inversekey	[edx + 12*16]
+	inversekey	[edx + 13*16]
+
+	ret
+
+
+
+
+align 16
+global _iEncExpandKey256
+_iEncExpandKey256:
+	mov ecx, [esp-4+8]		;input
+	mov edx, [esp-4+12]		;expanded key
+
+
+    movdqu xmm1, [ecx]    ; loading the key
+    movdqu xmm3, [ecx+16]
+    movdqu [edx], xmm1  ; Storing key in memory where all key schedule will be stored
+    movdqu [edx+16], xmm3
+
+    add edx,32
+
+    movdqa xmm5, [shuffle_mask]  ; this mask is used by key_expansion
+
+    aeskeygenassist xmm2, xmm3, 0x1     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x2     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x4     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x8     ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x10    ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x20    ;
+    call key_expansion256
+    aeskeygenassist xmm2, xmm3, 0x40    ;
+;    call key_expansion256
+
+    pshufd xmm2, xmm2, 011111111b
+
+    movdqu xmm4, xmm1
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pshufb xmm4, xmm5
+    pxor xmm1, xmm4
+    pxor xmm1, xmm2
+
+    movdqu [edx], xmm1
+
+
+	ret
+
+
+
+
+
+
+align 16
+global _iDec128
+_iDec128:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_dec128
+
+	cmp eax,4
+	jl	lp128decsingle
+
+	test	ecx,0xf
+	jz		lp128decfour
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	mov ecx,esp
+
+
+align 16
+lp128decfour:
+
+	test eax,eax
+	jz end_dec128
+
+	cmp eax,4
+	jl	lp128decsingle
+
+	load_and_xor4 esi, [ecx+10*16]
+	add esi,16*4
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	sub eax,4
+	store4 esi+edi-(16*4)
+	jmp lp128decfour
+
+
+	align 16
+lp128decsingle:
+
+	movdqu xmm0, [esi]
+	movdqu xmm4,[ecx+10*16]
+	pxor xmm0, xmm4
+	aesdec1_u  [ecx+9*16]
+	aesdec1_u  [ecx+8*16]
+	aesdec1_u  [ecx+7*16]
+	aesdec1_u  [ecx+6*16]
+	aesdec1_u  [ecx+5*16]
+	aesdec1_u  [ecx+4*16]
+	aesdec1_u  [ecx+3*16]
+	aesdec1_u  [ecx+2*16]
+	aesdec1_u  [ecx+1*16]
+	aesdeclast1_u [ecx+0*16]
+
+	add esi, 16
+	movdqu  [edi+esi - 16], xmm0
+	dec eax
+	jnz lp128decsingle
+
+end_dec128:
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret
+
+
+
+align 16
+global _iDec128_CBC
+_iDec128_CBC:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+12]
+	movdqu xmm5,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_dec128_CBC
+
+	cmp eax,4
+	jl	lp128decsingle_CBC
+
+	test	ecx,0xf
+	jz		lp128decfour_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	mov ecx,esp
+
+
+align 16
+lp128decfour_CBC:
+
+	test eax,eax
+	jz end_dec128_CBC
+
+	cmp eax,4
+	jl	lp128decsingle_CBC
+
+	load_and_xor4 esi, [ecx+10*16]
+	add esi,16*4
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[esi- 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[esi- 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[esi- 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[esi- 16*4 + 3*16]
+
+	sub eax,4
+	store4 esi+edi-(16*4)
+	jmp lp128decfour_CBC
+
+
+	align 16
+lp128decsingle_CBC:
+
+	movdqu xmm0, [esi]
+	movdqa xmm1,xmm0
+	movdqu xmm4,[ecx+10*16]
+	pxor xmm0, xmm4
+	aesdec1_u  [ecx+9*16]
+	aesdec1_u  [ecx+8*16]
+	aesdec1_u  [ecx+7*16]
+	aesdec1_u  [ecx+6*16]
+	aesdec1_u  [ecx+5*16]
+	aesdec1_u  [ecx+4*16]
+	aesdec1_u  [ecx+3*16]
+	aesdec1_u  [ecx+2*16]
+	aesdec1_u  [ecx+1*16]
+	aesdeclast1_u [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+
+	add esi, 16
+	movdqu  [edi+esi - 16], xmm0
+	dec eax
+	jnz lp128decsingle_CBC
+
+end_dec128_CBC:
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]   ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last iv for chaining
+
+	ret
+
+
+
+
+
+
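+; AES-192 ECB decryption: same structure as _iDec128 above, but with a
+; 13-entry key schedule (rounds 0-12) and eleven aesdec rounds plus a final
+; aesdeclast per block.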
+align 16
+global _iDec192
+_iDec192:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_dec192
+
+	cmp eax,4
+	jl	lp192decsingle
+
+	test	ecx,0xf
+	jz		lp192decfour
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	mov ecx,esp
+
+
+align 16
+lp192decfour:
+
+	test eax,eax
+	jz end_dec192
+
+	cmp eax,4
+	jl	lp192decsingle
+
+	load_and_xor4 esi, [ecx+12*16]
+	add esi,16*4
+	aesdec4 [ecx+11*16]
+	aesdec4 [ecx+10*16]
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	sub eax,4
+	store4 esi+edi-(16*4)
+	jmp lp192decfour
+
+
+	align 16
+lp192decsingle:
+
+	movdqu xmm0, [esi]
+	movdqu xmm4,[ecx+12*16]
+	pxor xmm0, xmm4
+	aesdec1_u [ecx+11*16]
+	aesdec1_u  [ecx+10*16]
+	aesdec1_u  [ecx+9*16]
+	aesdec1_u  [ecx+8*16]
+	aesdec1_u  [ecx+7*16]
+	aesdec1_u  [ecx+6*16]
+	aesdec1_u  [ecx+5*16]
+	aesdec1_u  [ecx+4*16]
+	aesdec1_u  [ecx+3*16]
+	aesdec1_u  [ecx+2*16]
+	aesdec1_u  [ecx+1*16]
+	aesdeclast1_u  [ecx+0*16]
+
+	add esi, 16
+	movdqu  [edi+esi - 16], xmm0
+	dec eax
+	jnz lp192decsingle
+
+end_dec192:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret
+
+
+align 16
+global _iDec192_CBC
+_iDec192_CBC:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+12]
+	movdqu xmm5,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_dec192_CBC
+
+	cmp eax,4
+	jl	lp192decsingle_CBC
+
+	test	ecx,0xf
+	jz		lp192decfour_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	mov ecx,esp
+
+align 16
+lp192decfour_CBC:
+
+	test eax,eax
+	jz end_dec192_CBC
+
+	cmp eax,4
+	jl	lp192decsingle_CBC
+
+	load_and_xor4 esi, [ecx+12*16]
+	add esi,16*4
+	aesdec4 [ecx+11*16]
+	aesdec4 [ecx+10*16]
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[esi- 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[esi- 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[esi- 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[esi- 16*4 + 3*16]
+
+	sub eax,4
+	store4 esi+edi-(16*4)
+	jmp lp192decfour_CBC
+
+
+	align 16
+lp192decsingle_CBC:
+
+	movdqu xmm0, [esi]
+	movdqu xmm4,[ecx+12*16]
+	movdqa xmm1,xmm0
+	pxor xmm0, xmm4
+	aesdec1_u [ecx+11*16]
+	aesdec1_u [ecx+10*16]
+	aesdec1_u [ecx+9*16]
+	aesdec1_u [ecx+8*16]
+	aesdec1_u [ecx+7*16]
+	aesdec1_u [ecx+6*16]
+	aesdec1_u [ecx+5*16]
+	aesdec1_u [ecx+4*16]
+	aesdec1_u [ecx+3*16]
+	aesdec1_u [ecx+2*16]
+	aesdec1_u [ecx+1*16]
+	aesdeclast1_u [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+
+	add esi, 16
+	movdqu  [edi+esi - 16], xmm0
+	dec eax
+	jnz lp192decsingle_CBC
+
+end_dec192_CBC:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last iv for chaining
+
+	ret
+
+
+
+
+
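+; AES-256 ECB decryption: same structure again, with 15 round keys (0-14)
+; and thirteen aesdec rounds plus a final aesdeclast per block.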
+align 16
+global _iDec256
+_iDec256:
+	mov ecx, [esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+
+	test eax,eax
+	jz end_dec256
+
+	cmp eax,4
+	jl lp256dec
+
+	test	ecx,0xf
+	jz	lp256dec4
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
+	mov ecx,esp
+
+	align 16
+lp256dec4:
+	test eax,eax
+	jz end_dec256
+
+	cmp eax,4
+	jl lp256dec
+
+	load_and_xor4 esi,[ecx+14*16]
+	add esi, 4*16
+	aesdec4 [ecx+13*16]
+	aesdec4 [ecx+12*16]
+	aesdec4 [ecx+11*16]
+	aesdec4 [ecx+10*16]
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lp256dec4
+
+	align 16
+lp256dec:
+
+	movdqu xmm0, [esi]
+	movdqu xmm4,[ecx+14*16]
+	add esi, 16
+	pxor xmm0, xmm4                     ; Round 0 (only xor)
+	aesdec1_u  [ecx+13*16]
+	aesdec1_u  [ecx+12*16]
+	aesdec1_u  [ecx+11*16]
+	aesdec1_u  [ecx+10*16]
+	aesdec1_u  [ecx+9*16]
+	aesdec1_u  [ecx+8*16]
+	aesdec1_u  [ecx+7*16]
+	aesdec1_u  [ecx+6*16]
+	aesdec1_u  [ecx+5*16]
+	aesdec1_u  [ecx+4*16]
+	aesdec1_u  [ecx+3*16]
+	aesdec1_u  [ecx+2*16]
+	aesdec1_u  [ecx+1*16]
+	aesdeclast1_u  [ecx+0*16]
+
+		; Store output decrypted data into the PLAINTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp256dec
+
+end_dec256:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret
+
+
+
+
+align 16
+global _iDec256_CBC
+_iDec256_CBC:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+12]
+	movdqu xmm5,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_dec256_CBC
+
+	cmp eax,4
+	jl	lp256decsingle_CBC
+
+	test	ecx,0xf
+	jz	lp256decfour_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
+	mov ecx,esp
+
+align 16
+lp256decfour_CBC:
+
+	test eax,eax
+	jz end_dec256_CBC
+
+	cmp eax,4
+	jl	lp256decsingle_CBC
+
+	load_and_xor4 esi, [ecx+14*16]
+	add esi,16*4
+	aesdec4 [ecx+13*16]
+	aesdec4 [ecx+12*16]
+	aesdec4 [ecx+11*16]
+	aesdec4 [ecx+10*16]
+	aesdec4 [ecx+9*16]
+	aesdec4 [ecx+8*16]
+	aesdec4 [ecx+7*16]
+	aesdec4 [ecx+6*16]
+	aesdec4 [ecx+5*16]
+	aesdec4 [ecx+4*16]
+	aesdec4 [ecx+3*16]
+	aesdec4 [ecx+2*16]
+	aesdec4 [ecx+1*16]
+	aesdeclast4 [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[esi- 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[esi- 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[esi- 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[esi- 16*4 + 3*16]
+
+	sub eax,4
+	store4 esi+edi-(16*4)
+	jmp lp256decfour_CBC
+
+
+	align 16
+lp256decsingle_CBC:
+
+	movdqu xmm0, [esi]
+	movdqa xmm1,xmm0
+	movdqu xmm4, [ecx+14*16]
+	pxor xmm0, xmm4
+	aesdec1_u  [ecx+13*16]
+	aesdec1_u  [ecx+12*16]
+	aesdec1_u  [ecx+11*16]
+	aesdec1_u  [ecx+10*16]
+	aesdec1_u  [ecx+9*16]
+	aesdec1_u  [ecx+8*16]
+	aesdec1_u  [ecx+7*16]
+	aesdec1_u  [ecx+6*16]
+	aesdec1_u  [ecx+5*16]
+	aesdec1_u  [ecx+4*16]
+	aesdec1_u  [ecx+3*16]
+	aesdec1_u  [ecx+2*16]
+	aesdec1_u  [ecx+1*16]
+	aesdeclast1_u  [ecx+0*16]
+
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+
+	add esi, 16
+	movdqu  [edi+esi - 16], xmm0
+	dec eax
+	jnz lp256decsingle_CBC
+
+end_dec256_CBC:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last iv for chaining
+
+	ret
+
+
+
+
+
+
+
+
+
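+; ECB encryption.  Same parameter block and alignment handling as the
+; decryption routines above, but the round keys are applied in forward
+; order: XOR with round key 0, aesenc with rounds 1..N-1, and aesenclast
+; with round N (N = 10, 12 or 14 for 128-, 192- and 256-bit keys).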
+align 16
+global _iEnc128
+_iEnc128:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_enc128
+
+	cmp eax,4
+	jl lp128encsingle
+
+	test	ecx,0xf
+	jz		lpenc128four
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	mov ecx,esp
+
+
+	align 16
+
+lpenc128four:
+
+	test eax,eax
+	jz end_enc128
+
+	cmp eax,4
+	jl lp128encsingle
+
+	load_and_xor4 esi,[ecx+0*16]
+	add esi,4*16
+	aesenc4	[ecx+1*16]
+	aesenc4	[ecx+2*16]
+	aesenc4	[ecx+3*16]
+	aesenc4	[ecx+4*16]
+	aesenc4	[ecx+5*16]
+	aesenc4	[ecx+6*16]
+	aesenc4	[ecx+7*16]
+	aesenc4	[ecx+8*16]
+	aesenc4	[ecx+9*16]
+	aesenclast4	[ecx+10*16]
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lpenc128four
+
+	align 16
+lp128encsingle:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u  [ecx+1*16]
+	aesenc1_u  [ecx+2*16]
+	aesenc1_u  [ecx+3*16]
+	aesenc1_u  [ecx+4*16]
+	aesenc1_u  [ecx+5*16]
+	aesenc1_u  [ecx+6*16]
+	aesenc1_u  [ecx+7*16]
+	aesenc1_u  [ecx+8*16]
+	aesenc1_u  [ecx+9*16]
+	aesenclast1_u  [ecx+10*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp128encsingle
+
+end_enc128:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret
+
+
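+; CTR mode.  The counter block is loaded through the IV pointer and
+; byte-swapped with the byte_swap_16 mask so that paddd (with the
+; counter_add_one constant, presumably adding one to the low 32-bit lane)
+; can increment it; each block's counter is swapped back to its original
+; byte order before being encrypted, and the resulting keystream is XORed
+; with the input.  load_and_inc4 and xor_with_input4 (macros defined earlier
+; in this file) do the same for four counters at a time.  On return the
+; counter is swapped back and stored through the IV pointer so the next call
+; can continue the sequence.  The 192- and 256-bit CTR routines below differ
+; only in the number of rounds.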
+align 16
+global _iEnc128_CTR
+_iEnc128_CTR:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+    movdqu xmm5,[eax]	;initial counter
+	movdqa xmm6, [byte_swap_16]
+	pshufb xmm5, xmm6 ; byte swap counter
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_encctr128
+
+	cmp eax,4
+	jl lp128encctrsingle
+
+	test	ecx,0xf
+	jz		lpencctr128four
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	mov ecx,esp
+
+
+	align 16
+
+lpencctr128four:
+
+	test eax,eax
+	jz end_encctr128
+
+	cmp eax,4
+	jl lp128encctrsingle
+
+	load_and_inc4 [ecx+0*16]
+	add esi,4*16
+	aesenc4	[ecx+1*16]
+	aesenc4	[ecx+2*16]
+	aesenc4	[ecx+3*16]
+	aesenc4	[ecx+4*16]
+	aesenc4	[ecx+5*16]
+	aesenc4	[ecx+6*16]
+	aesenc4	[ecx+7*16]
+	aesenc4	[ecx+8*16]
+	aesenc4	[ecx+9*16]
+	aesenclast4	[ecx+10*16]
+	xor_with_input4 esi-(4*16)
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lpencctr128four
+
+	align 16
+lp128encctrsingle:
+
+	movdqa	xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	paddd	xmm5,[counter_add_one]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u [ecx+1*16]
+	aesenc1_u [ecx+2*16]
+	aesenc1_u [ecx+3*16]
+	aesenc1_u [ecx+4*16]
+	aesenc1_u [ecx+5*16]
+	aesenc1_u [ecx+6*16]
+	aesenc1_u [ecx+7*16]
+	aesenc1_u [ecx+8*16]
+	aesenc1_u [ecx+9*16]
+	aesenclast1_u [ecx+10*16]
+	movdqu xmm4, [esi-16]
+	pxor	xmm0,xmm4
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp128encctrsingle
+
+end_encctr128:
+	pshufb xmm5, xmm6 ; byte swap counter
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last counter for chaining
+
+	ret
+
+
+align 16
+global _iEnc192_CTR
+_iEnc192_CTR:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+	movdqu xmm5,[eax]	;initial counter
+	movdqa xmm6, [byte_swap_16]
+	pshufb xmm5, xmm6 ; byte swap counter
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_encctr192
+
+	cmp eax,4
+	jl lp192encctrsingle
+
+	test	ecx,0xf
+	jz lpencctr192four
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	mov ecx,esp
+
+
+	align 16
+
+lpencctr192four:
+
+	test eax,eax
+	jz end_encctr192
+
+	cmp eax,4
+	jl lp192encctrsingle
+
+	load_and_inc4 [ecx+0*16]
+	add esi,4*16
+	aesenc4	[ecx+1*16]
+	aesenc4	[ecx+2*16]
+	aesenc4	[ecx+3*16]
+	aesenc4	[ecx+4*16]
+	aesenc4	[ecx+5*16]
+	aesenc4	[ecx+6*16]
+	aesenc4	[ecx+7*16]
+	aesenc4	[ecx+8*16]
+	aesenc4	[ecx+9*16]
+	aesenc4	[ecx+10*16]
+	aesenc4	[ecx+11*16]
+	aesenclast4	[ecx+12*16]
+	xor_with_input4 esi-(4*16)
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lpencctr192four
+
+	align 16
+lp192encctrsingle:
+
+	movdqa	xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	paddd	xmm5,[counter_add_one]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u  [ecx+1*16]
+	aesenc1_u  [ecx+2*16]
+	aesenc1_u  [ecx+3*16]
+	aesenc1_u  [ecx+4*16]
+	aesenc1_u  [ecx+5*16]
+	aesenc1_u  [ecx+6*16]
+	aesenc1_u  [ecx+7*16]
+	aesenc1_u  [ecx+8*16]
+	aesenc1_u  [ecx+9*16]
+	aesenc1_u  [ecx+10*16]
+	aesenc1_u  [ecx+11*16]
+	aesenclast1_u  [ecx+12*16]
+	movdqu xmm4, [esi-16]
+	pxor	xmm0,xmm4
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp192encctrsingle
+
+end_encctr192:
+
+	pshufb xmm5, xmm6 ; byte swap counter
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last counter for chaining
+
+	ret
+
+
+align 16
+global _iEnc256_CTR
+_iEnc256_CTR:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+	movdqu xmm5,[eax]	;initial counter
+	movdqa xmm6, [byte_swap_16]
+	pshufb xmm5, xmm6 ; byte swap counter
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_encctr256
+
+	cmp eax,4
+	jl lp256encctrsingle
+
+	test	ecx,0xf
+	jz	lpencctr256four
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
+	mov ecx,esp
+
+
+	align 16
+
+lpencctr256four:
+
+	test eax,eax
+	jz end_encctr256
+
+	cmp eax,4
+	jl lp256encctrsingle
+
+	load_and_inc4 [ecx+0*16]
+	add esi,4*16
+	aesenc4	[ecx+1*16]
+	aesenc4	[ecx+2*16]
+	aesenc4	[ecx+3*16]
+	aesenc4	[ecx+4*16]
+	aesenc4	[ecx+5*16]
+	aesenc4	[ecx+6*16]
+	aesenc4	[ecx+7*16]
+	aesenc4	[ecx+8*16]
+	aesenc4	[ecx+9*16]
+	aesenc4	[ecx+10*16]
+	aesenc4	[ecx+11*16]
+	aesenc4	[ecx+12*16]
+	aesenc4	[ecx+13*16]
+	aesenclast4	[ecx+14*16]
+	xor_with_input4 esi-(4*16)
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lpencctr256four
+
+	align 16
+
+lp256encctrsingle:
+
+	movdqa	xmm0,xmm5
+	pshufb	xmm0, xmm6 ; byte swap counter back
+	paddd	xmm5,[counter_add_one]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u  [ecx+1*16]
+	aesenc1_u  [ecx+2*16]
+	aesenc1_u  [ecx+3*16]
+	aesenc1_u  [ecx+4*16]
+	aesenc1_u  [ecx+5*16]
+	aesenc1_u  [ecx+6*16]
+	aesenc1_u  [ecx+7*16]
+	aesenc1_u  [ecx+8*16]
+	aesenc1_u  [ecx+9*16]
+	aesenc1_u  [ecx+10*16]
+	aesenc1_u  [ecx+11*16]
+	aesenc1_u  [ecx+12*16]
+	aesenc1_u  [ecx+13*16]
+	aesenclast1_u  [ecx+14*16]
+	movdqu xmm4, [esi-16]
+	pxor	xmm0,xmm4
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp256encctrsingle
+
+end_encctr256:
+
+	pshufb xmm5, xmm6 ; byte swap counter
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last counter for chaining
+
+	ret
+
+
+
+
+
+
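+; CBC encryption.  xmm1 holds the IV and then the previous ciphertext block,
+; which is XORed into each plaintext block before the rounds.  Because every
+; block depends on the previous ciphertext there is no four-block path, only
+; the single-block loop; the round macros used here (aesenc1/aesenclast1)
+; appear to be the aligned variants, which is safe because an unaligned key
+; schedule is first copied to the aligned stack area.  Note that these
+; routines, unlike the others, do not check for a zero block count before
+; entering the loop.  The last ciphertext block is written back through the
+; IV pointer for chaining; the 192- and 256-bit versions are analogous.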
+align 16
+global _iEnc128_CBC
+_iEnc128_CBC:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+	movdqu xmm1,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+	sub edi,esi
+
+	test	ecx,0xf
+	jz		lp128encsingle_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	mov ecx,esp
+
+	align 16
+
+lp128encsingle_CBC:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	pxor xmm0, xmm1
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1  [ecx+1*16]
+	aesenc1  [ecx+2*16]
+	aesenc1  [ecx+3*16]
+	aesenc1  [ecx+4*16]
+	aesenc1  [ecx+5*16]
+	aesenc1  [ecx+6*16]
+	aesenc1  [ecx+7*16]
+	aesenc1  [ecx+8*16]
+	aesenc1  [ecx+9*16]
+	aesenclast1  [ecx+10*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	movdqa xmm1,xmm0
+	dec eax
+	jnz lp128encsingle_CBC
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm1 ; store last iv for chaining
+
+	ret
+
+
+align 16
+global _iEnc192_CBC
+_iEnc192_CBC:
+	mov ecx,[esp-4+8]  ; first arg
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+	movdqu xmm1,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+	sub edi,esi
+
+	test	ecx,0xf
+	jz		lp192encsingle_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	mov ecx,esp
+
+	align 16
+
+lp192encsingle_CBC:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	pxor xmm0, xmm1
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1  [ecx+1*16]
+	aesenc1  [ecx+2*16]
+	aesenc1  [ecx+3*16]
+	aesenc1  [ecx+4*16]
+	aesenc1  [ecx+5*16]
+	aesenc1  [ecx+6*16]
+	aesenc1  [ecx+7*16]
+	aesenc1  [ecx+8*16]
+	aesenc1  [ecx+9*16]
+	aesenc1  [ecx+10*16]
+	aesenc1  [ecx+11*16]
+	aesenclast1  [ecx+12*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	movdqa xmm1,xmm0
+	dec eax
+	jnz lp192encsingle_CBC
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm1 ; store last iv for chaining
+
+	ret
+
+align 16
+global _iEnc256_CBC
+_iEnc256_CBC:
+	mov ecx,[esp-4+8]  ; first arg
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov	eax,[ecx+12]
+	movdqu xmm1,[eax]	;iv
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+	sub edi,esi
+
+	test	ecx,0xf
+	jz		lp256encsingle_CBC
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
+	mov ecx,esp
+
+	align 16
+
+lp256encsingle_CBC:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	pxor xmm0, xmm1
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1 [ecx+1*16]
+	aesenc1 [ecx+2*16]
+	aesenc1 [ecx+3*16]
+	aesenc1 [ecx+4*16]
+	aesenc1 [ecx+5*16]
+	aesenc1 [ecx+6*16]
+	aesenc1 [ecx+7*16]
+	aesenc1 [ecx+8*16]
+	aesenc1 [ecx+9*16]
+	aesenc1 [ecx+10*16]
+	aesenc1 [ecx+11*16]
+	aesenc1 [ecx+12*16]
+	aesenc1 [ecx+13*16]
+	aesenclast1 [ecx+14*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	movdqa xmm1,xmm0
+	dec eax
+	jnz lp256encsingle_CBC
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+	mov ecx,[esp-4+8]
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm1 ; store last iv for chaining
+
+	ret
+
+
+
+
+
+align 16
+global _iEnc192
+_iEnc192:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_enc192
+
+	cmp eax,4
+	jl lp192encsingle
+
+	test	ecx,0xf
+	jz		lpenc192four
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	mov ecx,esp
+
+	align 16
+
+lpenc192four:
+
+	test eax,eax
+	jz end_enc192
+
+	cmp eax,4
+	jl lp192encsingle
+
+	load_and_xor4 esi,[ecx+0*16]
+	add esi,4*16
+	aesenc4	[ecx+1*16]
+	aesenc4	[ecx+2*16]
+	aesenc4	[ecx+3*16]
+	aesenc4	[ecx+4*16]
+	aesenc4	[ecx+5*16]
+	aesenc4	[ecx+6*16]
+	aesenc4	[ecx+7*16]
+	aesenc4	[ecx+8*16]
+	aesenc4	[ecx+9*16]
+	aesenc4	[ecx+10*16]
+	aesenc4	[ecx+11*16]
+	aesenclast4	[ecx+12*16]
+
+	store4 esi+edi-16*4
+	sub eax,4
+	jmp lpenc192four
+
+	align 16
+lp192encsingle:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u [ecx+1*16]
+	aesenc1_u [ecx+2*16]
+	aesenc1_u [ecx+3*16]
+	aesenc1_u [ecx+4*16]
+	aesenc1_u [ecx+5*16]
+	aesenc1_u [ecx+6*16]
+	aesenc1_u [ecx+7*16]
+	aesenc1_u [ecx+8*16]
+	aesenc1_u [ecx+9*16]
+	aesenc1_u [ecx+10*16]
+	aesenc1_u [ecx+11*16]
+	aesenclast1_u [ecx+12*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp192encsingle
+
+end_enc192:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret
+
+
+
+
+align 16
+global _iEnc256
+_iEnc256:
+	mov ecx,[esp-4+8]
+
+	push esi
+	push edi
+	push ebp
+	mov ebp,esp
+
+	sub esp,16*16
+	and esp,0xfffffff0
+
+	mov eax,[ecx+16] ; numblocks
+	mov esi,[ecx]
+	mov edi,[ecx+4]
+	mov ecx,[ecx+8]
+
+	sub edi,esi
+
+	test eax,eax
+	jz end_enc256
+
+	cmp eax,4
+	jl lp256enc
+
+	test	ecx,0xf
+	jz	lp256enc4
+
+	copy_round_keys esp,ecx,0
+	copy_round_keys esp,ecx,1
+	copy_round_keys esp,ecx,2
+	copy_round_keys esp,ecx,3
+	copy_round_keys esp,ecx,4
+	copy_round_keys esp,ecx,5
+	copy_round_keys esp,ecx,6
+	copy_round_keys esp,ecx,7
+	copy_round_keys esp,ecx,8
+	copy_round_keys esp,ecx,9
+	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
+	mov ecx,esp
+
+
+
+	align 16
+
+lp256enc4:
+	test eax,eax
+	jz end_enc256
+
+	cmp eax,4
+	jl lp256enc
+
+
+	load_and_xor4 esi,[ecx+0*16]
+	add esi, 16*4
+	aesenc4 [ecx+1*16]
+	aesenc4 [ecx+2*16]
+	aesenc4 [ecx+3*16]
+	aesenc4 [ecx+4*16]
+	aesenc4 [ecx+5*16]
+	aesenc4 [ecx+6*16]
+	aesenc4 [ecx+7*16]
+	aesenc4 [ecx+8*16]
+	aesenc4 [ecx+9*16]
+	aesenc4 [ecx+10*16]
+	aesenc4 [ecx+11*16]
+	aesenc4 [ecx+12*16]
+	aesenc4 [ecx+13*16]
+	aesenclast4 [ecx+14*16]
+
+	store4  esi+edi-16*4
+	sub eax,4
+	jmp lp256enc4
+
+	align 16
+lp256enc:
+
+	movdqu xmm0, [esi]
+	add esi, 16
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1_u [ecx+1*16]
+	aesenc1_u [ecx+2*16]
+	aesenc1_u [ecx+3*16]
+	aesenc1_u [ecx+4*16]
+	aesenc1_u [ecx+5*16]
+	aesenc1_u [ecx+6*16]
+	aesenc1_u [ecx+7*16]
+	aesenc1_u [ecx+8*16]
+	aesenc1_u [ecx+9*16]
+	aesenc1_u [ecx+10*16]
+	aesenc1_u [ecx+11*16]
+	aesenc1_u [ecx+12*16]
+	aesenc1_u [ecx+13*16]
+	aesenclast1_u [ecx+14*16]
+
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	dec eax
+	jnz lp256enc
+
+end_enc256:
+
+
+	mov esp,ebp
+	pop ebp
+	pop edi
+	pop esi
+
+	ret

