krb5 commit: Adjust AESNI sources for krb5 tree

Greg Hudson ghudson at MIT.EDU
Fri May 24 14:26:13 EDT 2013


https://github.com/krb5/krb5/commit/0231309631acb59cc8b22227ca461005f38cc668
commit 0231309631acb59cc8b22227ca461005f38cc668
Author: Greg Hudson <ghudson at mit.edu>
Date:   Sat May 4 19:09:38 2013 -0400

    Adjust AESNI sources for krb5 tree
    
    Remove functions we don't need.  Add macros to redefine functions with
    an appropriate namespace prefix.

 src/lib/crypto/builtin/aes/iaesx64.s | 1277 +---------------------------
 src/lib/crypto/builtin/aes/iaesx86.s | 1594 +++-------------------------------
 2 files changed, 150 insertions(+), 2721 deletions(-)

diff --git a/src/lib/crypto/builtin/aes/iaesx64.s b/src/lib/crypto/builtin/aes/iaesx64.s
index 1012e36..1c091c1 100644
--- a/src/lib/crypto/builtin/aes/iaesx64.s
+++ b/src/lib/crypto/builtin/aes/iaesx64.s
@@ -27,6 +27,15 @@
 ; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 ; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%define iEncExpandKey128 k5_iEncExpandKey128
+%define iEncExpandKey256 k5_iEncExpandKey256
+%define iDecExpandKey128 k5_iDecExpandKey128
+%define iDecExpandKey256 k5_iDecExpandKey256
+%define iEnc128_CBC      k5_iEnc128_CBC
+%define iEnc256_CBC      k5_iEnc256_CBC
+%define iDec128_CBC      k5_iDec128_CBC
+%define iDec256_CBC      k5_iDec256_CBC
+
 %macro linux_setup 0
 %ifdef __linux__
 	mov rcx, rdi
@@ -338,66 +347,6 @@ iEncExpandKey128:
 
 
 align 16
-global iEncExpandKey192
-iEncExpandKey192:
-
-		linux_setup
-		sub rsp,64+8
-		movdqa	[rsp],xmm6
-		movdqa	[rsp+16],xmm7
-
-
-        movq xmm7, [rcx+16]	; loading the AES key
-        movq [rdx+16], xmm7  ; Storing key in memory where all key expansion
-        pshufd xmm4, xmm7, 01001111b
-        movdqu xmm1, [rcx]	; loading the AES key
-        movdqu [rdx], xmm1  ; Storing key in memory where all key expansion
-
-        pxor xmm3, xmm3		; Set xmm3 to be all zeros. Required for the key_expansion.
-        pxor xmm6, xmm6		; Set xmm3 to be all zeros. Required for the key_expansion.
-
-        aeskeygenassist xmm2, xmm4, 0x1     ; Complete round key 1 and generate round key 2
-        key_expansion_1_192 24
-		key_expansion_2_192 40
-
-        aeskeygenassist xmm2, xmm4, 0x2     ; Generate round key 3 and part of round key 4
-        key_expansion_1_192 48
-		key_expansion_2_192 64
-
-        aeskeygenassist xmm2, xmm4, 0x4     ; Complete round key 4 and generate round key 5
-        key_expansion_1_192 72
-		key_expansion_2_192 88
-
-        aeskeygenassist xmm2, xmm4, 0x8     ; Generate round key 6 and part of round key 7
-        key_expansion_1_192 96
-		key_expansion_2_192 112
-
-        aeskeygenassist xmm2, xmm4, 0x10     ; Complete round key 7 and generate round key 8
-        key_expansion_1_192 120
-		key_expansion_2_192 136
-
-        aeskeygenassist xmm2, xmm4, 0x20     ; Generate round key 9 and part of round key 10
-        key_expansion_1_192 144
-		key_expansion_2_192 160
-
-        aeskeygenassist xmm2, xmm4, 0x40     ; Complete round key 10 and generate round key 11
-        key_expansion_1_192 168
-		key_expansion_2_192 184
-
-        aeskeygenassist xmm2, xmm4, 0x80     ; Generate round key 12
-        key_expansion_1_192 192
-
-
-		movdqa	xmm6,[rsp]
-		movdqa	xmm7,[rsp+16]
-		add rsp,64+8
-
-		ret
-
-
-
-
-align 16
 global iDecExpandKey128
 iDecExpandKey128:
 
@@ -425,37 +374,6 @@ iDecExpandKey128:
 	ret
 
 
-align 16
-global iDecExpandKey192
-iDecExpandKey192:
-
-	linux_setup
-	push rcx
-	push rdx
-	sub rsp,16+8
-
-	call iEncExpandKey192
-
-	add rsp,16+8
-	pop rdx
-	pop rcx
-
-
-	inversekey [rdx + 1*16]
-	inversekey [rdx + 2*16]
-	inversekey [rdx + 3*16]
-	inversekey [rdx + 4*16]
-	inversekey [rdx + 5*16]
-	inversekey [rdx + 6*16]
-	inversekey [rdx + 7*16]
-	inversekey [rdx + 8*16]
-	inversekey [rdx + 9*16]
-	inversekey [rdx + 10*16]
-	inversekey [rdx + 11*16]
-
-	ret
-
-
 
 align 16
 global iDecExpandKey256
@@ -539,103 +457,6 @@ iEncExpandKey256:
 
 
 
-
-
-
-align 16
-global iDec128
-iDec128:
-
-	linux_setup
-	sub rsp,16*16+8
-
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-	test eax,eax
-	jz end_dec128
-
-	cmp eax,4
-	jl	lp128decsingle
-
-	test	rcx,0xf
-	jz		lp128decfour
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	mov rcx,rsp
-
-
-
-align 16
-lp128decfour:
-
-	test eax,eax
-	jz end_dec128
-
-	cmp eax,4
-	jl	lp128decsingle
-
-	load_and_xor4 rdx, [rcx+10*16]
-	add rdx,16*4
-	aesdec4 [rcx+9*16]
-	aesdec4 [rcx+8*16]
-	aesdec4 [rcx+7*16]
-	aesdec4 [rcx+6*16]
-	aesdec4 [rcx+5*16]
-	aesdec4 [rcx+4*16]
-	aesdec4 [rcx+3*16]
-	aesdec4 [rcx+2*16]
-	aesdec4 [rcx+1*16]
-	aesdeclast4 [rcx+0*16]
-
-	sub eax,4
-	store4 r8+rdx-(16*4)
-	jmp lp128decfour
-
-
-	align 16
-lp128decsingle:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4,[rcx+10*16]
-	pxor xmm0, xmm4
-	aesdec1_u [rcx+9*16]
-	aesdec1_u [rcx+8*16]
-	aesdec1_u [rcx+7*16]
-	aesdec1_u [rcx+6*16]
-	aesdec1_u [rcx+5*16]
-	aesdec1_u [rcx+4*16]
-	aesdec1_u [rcx+3*16]
-	aesdec1_u [rcx+2*16]
-	aesdec1_u [rcx+1*16]
-	aesdeclast1_u [rcx+0*16]
-
-	add rdx, 16
-	movdqu  [r8 + rdx - 16], xmm0
-	dec eax
-	jnz lp128decsingle
-
-end_dec128:
-
-	add rsp,16*16+8
-	ret
-
-
 align 16
 global iDec128_CBC
 iDec128_CBC:
@@ -748,124 +569,6 @@ end_dec128_CBC:
 	ret
 
 
-align 16
-global iDec192_CBC
-iDec192_CBC:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu	xmm5,[rax]
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-
-	sub r8,rdx
-
-	test eax,eax
-	jz end_dec192_CBC
-
-	cmp eax,4
-	jl	lp192decsingle_CBC
-
-	test	rcx,0xf
-	jz		lp192decfour_CBC
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	mov rcx,rsp
-
-
-align 16
-lp192decfour_CBC:
-
-	test eax,eax
-	jz end_dec192_CBC
-
-	cmp eax,4
-	jl	lp192decsingle_CBC
-
-	load_and_xor4 rdx, [rcx+12*16]
-	add rdx,16*4
-	aesdec4 [rcx+11*16]
-	aesdec4 [rcx+10*16]
-	aesdec4 [rcx+9*16]
-	aesdec4 [rcx+8*16]
-	aesdec4 [rcx+7*16]
-	aesdec4 [rcx+6*16]
-	aesdec4 [rcx+5*16]
-	aesdec4 [rcx+4*16]
-	aesdec4 [rcx+3*16]
-	aesdec4 [rcx+2*16]
-	aesdec4 [rcx+1*16]
-	aesdeclast4 [rcx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqu	xmm4,[rdx - 16*4 + 0*16]
-	pxor	xmm1,xmm4
-	movdqu	xmm4,[rdx - 16*4 + 1*16]
-	pxor	xmm2,xmm4
-	movdqu	xmm4,[rdx - 16*4 + 2*16]
-	pxor	xmm3,xmm4
-	movdqu	xmm5,[rdx - 16*4 + 3*16]
-
-	sub eax,4
-	store4 r8+rdx-(16*4)
-	jmp lp192decfour_CBC
-
-
-	align 16
-lp192decsingle_CBC:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4,[rcx+12*16]
-	movdqa	xmm1,xmm0
-	pxor xmm0, xmm4
-	aesdec1_u [rcx+11*16]
-	aesdec1_u [rcx+10*16]
-	aesdec1_u [rcx+9*16]
-	aesdec1_u [rcx+8*16]
-	aesdec1_u [rcx+7*16]
-	aesdec1_u [rcx+6*16]
-	aesdec1_u [rcx+5*16]
-	aesdec1_u [rcx+4*16]
-	aesdec1_u [rcx+3*16]
-	aesdec1_u [rcx+2*16]
-	aesdec1_u [rcx+1*16]
-	aesdeclast1_u [rcx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqa	xmm5,xmm1
-	add rdx, 16
-	movdqu  [r8 + rdx - 16], xmm0
-	dec eax
-	jnz lp192decsingle_CBC
-
-end_dec192_CBC:
-
-	mov	   r9,[r9+24]
-	movdqu [r9],xmm5
-	add rsp,16*16+8
-	ret
-
-
-
 
 align 16
 global iDec256_CBC
@@ -990,15 +693,17 @@ end_dec256_CBC:
 
 
 
-
-
 align 16
-global iDec192
-iDec192:
+global iEnc128_CBC
+iEnc128_CBC:
 
 	linux_setup
 	sub rsp,16*16+8
 
+	mov r9,rcx
+	mov rax,[rcx+24]
+	movdqu xmm1,[rax]
+
 	mov eax,[rcx+32] ; numblocks
 	mov rdx,[rcx]
 	mov r8,[rcx+8]
@@ -1006,677 +711,9 @@ iDec192:
 
 	sub r8,rdx
 
-	test eax,eax
-	jz end_dec192
-
-	cmp eax,4
-	jl	lp192decsingle
 
 	test	rcx,0xf
-	jz		lp192decfour
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	mov rcx,rsp
-
-align 16
-lp192decfour:
-
-	test eax,eax
-	jz end_dec192
-
-	cmp eax,4
-	jl	lp192decsingle
-
-	load_and_xor4 rdx, [rcx+12*16]
-	add rdx,16*4
-	aesdec4 [rcx+11*16]
-	aesdec4 [rcx+10*16]
-	aesdec4 [rcx+9*16]
-	aesdec4 [rcx+8*16]
-	aesdec4 [rcx+7*16]
-	aesdec4 [rcx+6*16]
-	aesdec4 [rcx+5*16]
-	aesdec4 [rcx+4*16]
-	aesdec4 [rcx+3*16]
-	aesdec4 [rcx+2*16]
-	aesdec4 [rcx+1*16]
-	aesdeclast4 [rcx+0*16]
-
-	sub eax,4
-	store4 r8+rdx-(16*4)
-	jmp lp192decfour
-
-
-	align 16
-lp192decsingle:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4,[rcx+12*16]
-	pxor xmm0, xmm4
-	aesdec1_u [rcx+11*16]
-	aesdec1_u [rcx+10*16]
-	aesdec1_u [rcx+9*16]
-	aesdec1_u [rcx+8*16]
-	aesdec1_u [rcx+7*16]
-	aesdec1_u [rcx+6*16]
-	aesdec1_u [rcx+5*16]
-	aesdec1_u [rcx+4*16]
-	aesdec1_u [rcx+3*16]
-	aesdec1_u [rcx+2*16]
-	aesdec1_u [rcx+1*16]
-	aesdeclast1_u [rcx+0*16]
-
-	add rdx, 16
-	movdqu  [r8 + rdx - 16], xmm0
-	dec eax
-	jnz lp192decsingle
-
-end_dec192:
-
-	add rsp,16*16+8
-	ret
-
-
-
-
-align 16
-global iDec256
-iDec256:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test eax,eax
-	jz end_dec256
-
-	cmp eax,4
-	jl lp256dec
-
-	test	rcx,0xf
-	jz		lp256dec4
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	copy_round_keys rsp,rcx,13
-	copy_round_keys rsp,rcx,14
-	mov rcx,rsp
-
-
-	align 16
-lp256dec4:
-	test eax,eax
-	jz end_dec256
-
-	cmp eax,4
-	jl lp256dec
-
-	load_and_xor4 rdx,[rcx+14*16]
-	add rdx, 4*16
-	aesdec4 [rcx+13*16]
-	aesdec4 [rcx+12*16]
-	aesdec4 [rcx+11*16]
-	aesdec4 [rcx+10*16]
-	aesdec4 [rcx+9*16]
-	aesdec4 [rcx+8*16]
-	aesdec4 [rcx+7*16]
-	aesdec4 [rcx+6*16]
-	aesdec4 [rcx+5*16]
-	aesdec4 [rcx+4*16]
-	aesdec4 [rcx+3*16]
-	aesdec4 [rcx+2*16]
-	aesdec4 [rcx+1*16]
-	aesdeclast4 [rcx+0*16]
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lp256dec4
-
-	align 16
-lp256dec:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4,[rcx+14*16]
-	add rdx, 16
-	pxor xmm0, xmm4                    ; Round 0 (only xor)
-	aesdec1_u [rcx+13*16]
-	aesdec1_u [rcx+12*16]
-	aesdec1_u [rcx+11*16]
-	aesdec1_u [rcx+10*16]
-	aesdec1_u [rcx+9*16]
-	aesdec1_u [rcx+8*16]
-	aesdec1_u [rcx+7*16]
-	aesdec1_u [rcx+6*16]
-	aesdec1_u [rcx+5*16]
-	aesdec1_u [rcx+4*16]
-	aesdec1_u [rcx+3*16]
-	aesdec1_u [rcx+2*16]
-	aesdec1_u [rcx+1*16]
-	aesdeclast1_u [rcx+0*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp256dec
-
-end_dec256:
-
-	add rsp,16*16+8
-	ret
-
-
-
-
-
-
-align 16
-global iEnc128
-iEnc128:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test eax,eax
-	jz end_enc128
-
-	cmp eax,4
-	jl lp128encsingle
-
-	test	rcx,0xf
-	jz		lpenc128four
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	mov rcx,rsp
-
-
-	align 16
-
-lpenc128four:
-
-	test eax,eax
-	jz end_enc128
-
-	cmp eax,4
-	jl lp128encsingle
-
-	load_and_xor4 rdx,[rcx+0*16]
-	add rdx,4*16
-	aesenc4	[rcx+1*16]
-	aesenc4	[rcx+2*16]
-	aesenc4	[rcx+3*16]
-	aesenc4	[rcx+4*16]
-	aesenc4	[rcx+5*16]
-	aesenc4	[rcx+6*16]
-	aesenc4	[rcx+7*16]
-	aesenc4	[rcx+8*16]
-	aesenc4	[rcx+9*16]
-	aesenclast4	[rcx+10*16]
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lpenc128four
-
-	align 16
-lp128encsingle:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4,[rcx+0*16]
-	add rdx, 16
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenclast1_u [rcx+10*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp128encsingle
-
-end_enc128:
-
-	add rsp,16*16+8
-	ret
-
-
-align 16
-global iEnc128_CTR
-iEnc128_CTR:
-
-	linux_setup
-
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu xmm5,[rax]
-
-
-	sub rsp,16*16+8+16
-
-	movdqa [rsp+16*16], xmm6
-	movdqa xmm6, [byte_swap_16 wrt rip]
-	pshufb xmm5, xmm6 ; byte swap counter
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-	test eax,eax
-	jz end_encctr128
-
-	cmp eax,4
-	jl lp128encctrsingle
-
-	test	rcx,0xf
-	jz		lpencctr128four
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	mov rcx,rsp
-
-
-	align 16
-
-lpencctr128four:
-
-	test eax,eax
-	jz end_encctr128
-
-	cmp eax,4
-	jl lp128encctrsingle
-
-	load_and_inc4 [rcx+0*16]
-	add rdx,4*16
-	aesenc4	[rcx+1*16]
-	aesenc4	[rcx+2*16]
-	aesenc4	[rcx+3*16]
-	aesenc4	[rcx+4*16]
-	aesenc4	[rcx+5*16]
-	aesenc4	[rcx+6*16]
-	aesenc4	[rcx+7*16]
-	aesenc4	[rcx+8*16]
-	aesenc4	[rcx+9*16]
-	aesenclast4	[rcx+10*16]
-	xor_with_input4 rdx-(4*16)
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lpencctr128four
-
-	align 16
-lp128encctrsingle:
-
-	movdqa xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	paddd	xmm5,[counter_add_one wrt rip]
-	add rdx, 16
-	movdqu xmm4,[rcx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenclast1_u [rcx+10*16]
-	movdqu xmm4, [rdx-16]
-	pxor  xmm0,xmm4
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp128encctrsingle
-
-end_encctr128:
-
-	mov	   r9,[r9+24]
-
-	pshufb xmm5, xmm6 ; byte swap counter
-	movdqu [r9],xmm5
-	movdqa xmm6, [rsp+16*16]
-	add rsp,16*16+8+16
-	ret
-
-
-
-align 16
-global iEnc192_CTR
-iEnc192_CTR:
-
-	linux_setup
-
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu xmm5,[rax]
-
-
-	sub rsp,16*16+8+16
-
-	movdqa [rsp+16*16], xmm6
-	movdqa xmm6, [byte_swap_16 wrt rip]
-	pshufb xmm5, xmm6 ; byte swap counter
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test eax,eax
-	jz end_encctr192
-
-	cmp eax,4
-	jl lp192encctrsingle
-
-	test	rcx,0xf
-	jz		lpencctr192four
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	mov rcx,rsp
-
-
-	align 16
-
-lpencctr192four:
-
-	test eax,eax
-	jz end_encctr192
-
-	cmp eax,4
-	jl lp192encctrsingle
-
-	load_and_inc4 [rcx+0*16]
-	add rdx,4*16
-	aesenc4	[rcx+1*16]
-	aesenc4	[rcx+2*16]
-	aesenc4	[rcx+3*16]
-	aesenc4	[rcx+4*16]
-	aesenc4	[rcx+5*16]
-	aesenc4	[rcx+6*16]
-	aesenc4	[rcx+7*16]
-	aesenc4	[rcx+8*16]
-	aesenc4	[rcx+9*16]
-	aesenc4	[rcx+10*16]
-	aesenc4	[rcx+11*16]
-	aesenclast4	[rcx+12*16]
-	xor_with_input4 rdx-(4*16)
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lpencctr192four
-
-	align 16
-lp192encctrsingle:
-
-	movdqa xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	movdqu xmm4,[rcx+0*16]
-	paddd	xmm5,[counter_add_one wrt rip]
-	add rdx, 16
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenc1_u [rcx+10*16]
-	aesenc1_u [rcx+11*16]
-	aesenclast1_u [rcx+12*16]
-	movdqu xmm4, [rdx-16]
-	pxor  xmm0,xmm4
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp192encctrsingle
-
-end_encctr192:
-
-	mov	   r9,[r9+24]
-	pshufb xmm5, xmm6 ; byte swap counter
-	movdqu [r9],xmm5
-	movdqa xmm6, [rsp+16*16]
-	add rsp,16*16+8+16
-	ret
-
-
-align 16
-global iEnc256_CTR
-iEnc256_CTR:
-
-	linux_setup
-
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu xmm5,[rax]
-
-
-	sub rsp,16*16+8+16
-
-	movdqa [rsp+16*16], xmm6
-	movdqa xmm6, [byte_swap_16 wrt rip]
-	pshufb xmm5, xmm6 ; byte swap counter
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test eax,eax
-	jz end_encctr256
-
-	cmp eax,4
-	jl lp256encctrsingle
-
-	test	rcx,0xf
-	jz		lpencctr256four
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	copy_round_keys rsp,rcx,13
-	copy_round_keys rsp,rcx,14
-	mov rcx,rsp
-
-
-	align 16
-
-lpencctr256four:
-
-	test eax,eax
-	jz end_encctr256
-
-	cmp eax,4
-	jl lp256encctrsingle
-
-	load_and_inc4 [rcx+0*16]
-	add rdx,4*16
-	aesenc4	[rcx+1*16]
-	aesenc4	[rcx+2*16]
-	aesenc4	[rcx+3*16]
-	aesenc4	[rcx+4*16]
-	aesenc4	[rcx+5*16]
-	aesenc4	[rcx+6*16]
-	aesenc4	[rcx+7*16]
-	aesenc4	[rcx+8*16]
-	aesenc4	[rcx+9*16]
-	aesenc4	[rcx+10*16]
-	aesenc4	[rcx+11*16]
-	aesenc4	[rcx+12*16]
-	aesenc4	[rcx+13*16]
-	aesenclast4	[rcx+14*16]
-	xor_with_input4 rdx-(4*16)
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lpencctr256four
-
-	align 16
-lp256encctrsingle:
-
-	movdqa xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	movdqu xmm4,[rcx+0*16]
-	paddd	xmm5,[counter_add_one wrt rip]
-	add rdx, 16
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenc1_u [rcx+10*16]
-	aesenc1_u [rcx+11*16]
-	aesenc1_u [rcx+12*16]
-	aesenc1_u [rcx+13*16]
-	aesenclast1_u [rcx+14*16]
-	movdqu xmm4, [rdx-16]
-	pxor  xmm0,xmm4
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp256encctrsingle
-
-end_encctr256:
-
-	mov	   r9,[r9+24]
-	pshufb xmm5, xmm6 ; byte swap counter
-	movdqu [r9],xmm5
-	movdqa xmm6, [rsp+16*16]
-	add rsp,16*16+8+16
-	ret
-
-
-
-
-
-
-
-align 16
-global iEnc128_CBC
-iEnc128_CBC:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu xmm1,[rax]
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test	rcx,0xf
-	jz		lp128encsingle_CBC
+	jz		lp128encsingle_CBC
 
 	copy_round_keys rsp,rcx,0
 	copy_round_keys rsp,rcx,1
@@ -1724,77 +761,6 @@ lp128encsingle_CBC:
 	ret
 
 
-align 16
-global iEnc192_CBC
-iEnc192_CBC:
-
-	linux_setup
-	sub rsp,16*16+8
-	mov r9,rcx
-	mov rax,[rcx+24]
-	movdqu xmm1,[rax]
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-	test	rcx,0xf
-	jz		lp192encsingle_CBC
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	mov rcx,rsp
-
-
-
-	align 16
-
-lp192encsingle_CBC:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4, [rcx+0*16]
-	add rdx, 16
-	pxor xmm0, xmm1
-	pxor xmm0, xmm4
-	aesenc1 [rcx+1*16]
-	aesenc1 [rcx+2*16]
-	aesenc1 [rcx+3*16]
-	aesenc1 [rcx+4*16]
-	aesenc1 [rcx+5*16]
-	aesenc1 [rcx+6*16]
-	aesenc1 [rcx+7*16]
-	aesenc1 [rcx+8*16]
-	aesenc1 [rcx+9*16]
-	aesenc1 [rcx+10*16]
-	aesenc1 [rcx+11*16]
-	aesenclast1 [rcx+12*16]
-	movdqa xmm1,xmm0
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp192encsingle_CBC
-
-	mov	   r9,[r9+24]
-	movdqu [r9],xmm1
-
-	add rsp,16*16+8
-	ret
-
 
 align 16
 global iEnc256_CBC
@@ -1868,214 +834,3 @@ lp256encsingle_CBC:
 	movdqu [r9],xmm1
 	add rsp,16*16+8
 	ret
-
-
-
-
-align 16
-global iEnc192
-iEnc192:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-	test eax,eax
-	jz end_enc192
-
-	cmp eax,4
-	jl lp192encsingle
-
-	test	rcx,0xf
-	jz		lpenc192four
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	mov rcx,rsp
-
-
-	align 16
-
-lpenc192four:
-
-	test eax,eax
-	jz end_enc192
-
-	cmp eax,4
-	jl lp192encsingle
-
-	load_and_xor4 rdx,[rcx+0*16]
-	add rdx,4*16
-	aesenc4	[rcx+1*16]
-	aesenc4	[rcx+2*16]
-	aesenc4	[rcx+3*16]
-	aesenc4	[rcx+4*16]
-	aesenc4	[rcx+5*16]
-	aesenc4	[rcx+6*16]
-	aesenc4	[rcx+7*16]
-	aesenc4	[rcx+8*16]
-	aesenc4	[rcx+9*16]
-	aesenc4	[rcx+10*16]
-	aesenc4	[rcx+11*16]
-	aesenclast4	[rcx+12*16]
-
-	store4 r8+rdx-16*4
-	sub eax,4
-	jmp lpenc192four
-
-	align 16
-lp192encsingle:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4, [rcx+0*16]
-	add rdx, 16
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenc1_u [rcx+10*16]
-	aesenc1_u [rcx+11*16]
-	aesenclast1_u [rcx+12*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp192encsingle
-
-end_enc192:
-
-	add rsp,16*16+8
-	ret
-
-
-
-
-
-
-align 16
-global iEnc256
-iEnc256:
-
-	linux_setup
-	sub rsp,16*16+8
-
-	mov eax,[rcx+32] ; numblocks
-	mov rdx,[rcx]
-	mov r8,[rcx+8]
-	mov rcx,[rcx+16]
-
-	sub r8,rdx
-
-
-	test eax,eax
-	jz end_enc256
-
-	cmp eax,4
-	jl lp256enc
-
-	test	rcx,0xf
-	jz		lp256enc4
-
-	copy_round_keys rsp,rcx,0
-	copy_round_keys rsp,rcx,1
-	copy_round_keys rsp,rcx,2
-	copy_round_keys rsp,rcx,3
-	copy_round_keys rsp,rcx,4
-	copy_round_keys rsp,rcx,5
-	copy_round_keys rsp,rcx,6
-	copy_round_keys rsp,rcx,7
-	copy_round_keys rsp,rcx,8
-	copy_round_keys rsp,rcx,9
-	copy_round_keys rsp,rcx,10
-	copy_round_keys rsp,rcx,11
-	copy_round_keys rsp,rcx,12
-	copy_round_keys rsp,rcx,13
-	copy_round_keys rsp,rcx,14
-	mov rcx,rsp
-
-
-	align 16
-
-lp256enc4:
-	test eax,eax
-	jz end_enc256
-
-	cmp eax,4
-	jl lp256enc
-
-
-	load_and_xor4 rdx,[rcx+0*16]
-	add rdx, 16*4
-	aesenc4 [rcx+1*16]
-	aesenc4 [rcx+2*16]
-	aesenc4 [rcx+3*16]
-	aesenc4 [rcx+4*16]
-	aesenc4 [rcx+5*16]
-	aesenc4 [rcx+6*16]
-	aesenc4 [rcx+7*16]
-	aesenc4 [rcx+8*16]
-	aesenc4 [rcx+9*16]
-	aesenc4 [rcx+10*16]
-	aesenc4 [rcx+11*16]
-	aesenc4 [rcx+12*16]
-	aesenc4 [rcx+13*16]
-	aesenclast4 [rcx+14*16]
-
-	store4  r8+rdx-16*4
-	sub eax,4
-	jmp lp256enc4
-
-	align 16
-lp256enc:
-
-	movdqu xmm0, [rdx]
-	movdqu xmm4, [rcx+0*16]
-	add rdx, 16
-	pxor xmm0, xmm4
-	aesenc1_u [rcx+1*16]
-	aesenc1_u [rcx+2*16]
-	aesenc1_u [rcx+3*16]
-	aesenc1_u [rcx+4*16]
-	aesenc1_u [rcx+5*16]
-	aesenc1_u [rcx+6*16]
-	aesenc1_u [rcx+7*16]
-	aesenc1_u [rcx+8*16]
-	aesenc1_u [rcx+9*16]
-	aesenc1_u [rcx+10*16]
-	aesenc1_u [rcx+11*16]
-	aesenc1_u [rcx+12*16]
-	aesenc1_u [rcx+13*16]
-	aesenclast1_u [rcx+14*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [r8+rdx-16], xmm0
-	dec eax
-	jnz lp256enc
-
-end_enc256:
-
-	add rsp,16*16+8
-	ret
diff --git a/src/lib/crypto/builtin/aes/iaesx86.s b/src/lib/crypto/builtin/aes/iaesx86.s
index c65921b..b667acd 100644
--- a/src/lib/crypto/builtin/aes/iaesx86.s
+++ b/src/lib/crypto/builtin/aes/iaesx86.s
@@ -27,6 +27,14 @@
 ; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 ; ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+%define _iEncExpandKey128 k5_iEncExpandKey128
+%define _iEncExpandKey256 k5_iEncExpandKey256
+%define _iDecExpandKey128 k5_iDecExpandKey128
+%define _iDecExpandKey256 k5_iDecExpandKey256
+%define _iEnc128_CBC      k5_iEnc128_CBC
+%define _iEnc256_CBC      k5_iEnc256_CBC
+%define _iDec128_CBC      k5_iDec128_CBC
+%define _iDec256_CBC      k5_iDec256_CBC
 
 %macro inversekey 1
 	movdqu  xmm1,%1
@@ -343,59 +351,6 @@ _iEncExpandKey128:
 	ret
 
 
-align 16
-global _iEncExpandKey192
-_iEncExpandKey192:
-
-	mov ecx,[esp-4+8]		;input
-	mov edx,[esp-4+12]		;ctx
-
-        movq xmm7, [ecx+16]	; loading the AES key
-        movq [edx+16], xmm7  ; Storing key in memory where all key expansion
-        pshufd xmm4, xmm7, 01001111b
-        movdqu xmm1, [ecx]	; loading the AES key
-        movdqu [edx], xmm1  ; Storing key in memory where all key expansion
-
-        pxor xmm3, xmm3		; Set xmm3 to be all zeros. Required for the key_expansion.
-        pxor xmm6, xmm6		; Set xmm3 to be all zeros. Required for the key_expansion.
-
-        aeskeygenassist xmm2, xmm4, 0x1     ; Complete round key 1 and generate round key 2
-        key_expansion_1_192 24
-	key_expansion_2_192 40
-
-        aeskeygenassist xmm2, xmm4, 0x2     ; Generate round key 3 and part of round key 4
-        key_expansion_1_192 48
-	key_expansion_2_192 64
-
-        aeskeygenassist xmm2, xmm4, 0x4     ; Complete round key 4 and generate round key 5
-        key_expansion_1_192 72
-	key_expansion_2_192 88
-
-        aeskeygenassist xmm2, xmm4, 0x8     ; Generate round key 6 and part of round key 7
-        key_expansion_1_192 96
-	key_expansion_2_192 112
-
-        aeskeygenassist xmm2, xmm4, 0x10     ; Complete round key 7 and generate round key 8
-        key_expansion_1_192 120
-	key_expansion_2_192 136
-
-        aeskeygenassist xmm2, xmm4, 0x20     ; Generate round key 9 and part of round key 10
-        key_expansion_1_192 144
-	key_expansion_2_192 160
-
-        aeskeygenassist xmm2, xmm4, 0x40     ; Complete round key 10 and generate round key 11
-        key_expansion_1_192 168
-	key_expansion_2_192 184
-
-        aeskeygenassist xmm2, xmm4, 0x80     ; Generate round key 12
-        key_expansion_1_192 192
-
-	ret
-
-
-
-
-
 
 align 16
 global _iDecExpandKey128
@@ -422,35 +377,6 @@ _iDecExpandKey128:
 
 
 
-
-align 16
-global _iDecExpandKey192
-_iDecExpandKey192:
-	push DWORD [esp+8]
-	push DWORD [esp+8]
-
-	call _iEncExpandKey192
-	add esp,8
-
-	mov edx,[esp-4+12]		;ctx
-
-	inversekey	[edx + 1*16]
-	inversekey	[edx + 2*16]
-	inversekey	[edx + 3*16]
-	inversekey	[edx + 4*16]
-	inversekey	[edx + 5*16]
-	inversekey	[edx + 6*16]
-	inversekey	[edx + 7*16]
-	inversekey	[edx + 8*16]
-	inversekey	[edx + 9*16]
-	inversekey	[edx + 10*16]
-	inversekey	[edx + 11*16]
-
-	ret
-
-
-
-
 align 16
 global _iDecExpandKey256
 _iDecExpandKey256:
@@ -530,22 +456,21 @@ _iEncExpandKey256:
 
 
 
-
-
-
 align 16
-global _iDec128
-_iDec128:
+global _iDec128_CBC
+_iDec128_CBC:
 	mov ecx,[esp-4+8]
 
 	push esi
 	push edi
 	push ebp
 	mov ebp,esp
-
 	sub esp,16*16
 	and esp,0xfffffff0
 
+	mov eax,[ecx+12]
+	movdqu xmm5,[eax]	;iv
+
 	mov eax,[ecx+16] ; numblocks
 	mov esi,[ecx]
 	mov edi,[ecx+4]
@@ -554,13 +479,13 @@ _iDec128:
 	sub edi,esi
 
 	test eax,eax
-	jz end_dec128
+	jz end_dec128_CBC
 
 	cmp eax,4
-	jl	lp128decsingle
+	jl	lp128decsingle_CBC
 
 	test	ecx,0xf
-	jz		lp128decfour
+	jz		lp128decfour_CBC
 
 	copy_round_keys esp,ecx,0
 	copy_round_keys esp,ecx,1
@@ -577,13 +502,13 @@ _iDec128:
 
 
 align 16
-lp128decfour:
+lp128decfour_CBC:
 
 	test eax,eax
-	jz end_dec128
+	jz end_dec128_CBC
 
 	cmp eax,4
-	jl	lp128decsingle
+	jl	lp128decsingle_CBC
 
 	load_and_xor4 esi, [ecx+10*16]
 	add esi,16*4
@@ -598,15 +523,25 @@ lp128decfour:
 	aesdec4 [ecx+1*16]
 	aesdeclast4 [ecx+0*16]
 
+	pxor	xmm0,xmm5
+	movdqu	xmm4,[esi- 16*4 + 0*16]
+	pxor	xmm1,xmm4
+	movdqu	xmm4,[esi- 16*4 + 1*16]
+	pxor	xmm2,xmm4
+	movdqu	xmm4,[esi- 16*4 + 2*16]
+	pxor	xmm3,xmm4
+	movdqu	xmm5,[esi- 16*4 + 3*16]
+
 	sub eax,4
 	store4 esi+edi-(16*4)
-	jmp lp128decfour
+	jmp lp128decfour_CBC
 
 
 	align 16
-lp128decsingle:
+lp128decsingle_CBC:
 
 	movdqu xmm0, [esi]
+	movdqa xmm1,xmm0
 	movdqu xmm4,[ecx+10*16]
 	pxor xmm0, xmm4
 	aesdec1_u  [ecx+9*16]
@@ -620,31 +555,39 @@ lp128decsingle:
 	aesdec1_u  [ecx+1*16]
 	aesdeclast1_u [ecx+0*16]
 
+	pxor	xmm0,xmm5
+	movdqa	xmm5,xmm1
+
 	add esi, 16
 	movdqu  [edi+esi - 16], xmm0
 	dec eax
-	jnz lp128decsingle
+	jnz lp128decsingle_CBC
 
-end_dec128:
+end_dec128_CBC:
 
 	mov esp,ebp
 	pop ebp
 	pop edi
 	pop esi
 
+	mov ecx,[esp-4+8]   ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm5 ; store last iv for chaining
+
 	ret
 
 
 
 align 16
-global _iDec128_CBC
-_iDec128_CBC:
+global _iDec256_CBC
+_iDec256_CBC:
 	mov ecx,[esp-4+8]
 
 	push esi
 	push edi
 	push ebp
 	mov ebp,esp
+
 	sub esp,16*16
 	and esp,0xfffffff0
 
@@ -659,13 +602,13 @@ _iDec128_CBC:
 	sub edi,esi
 
 	test eax,eax
-	jz end_dec128_CBC
+	jz end_dec256_CBC
 
 	cmp eax,4
-	jl	lp128decsingle_CBC
+	jl	lp256decsingle_CBC
 
 	test	ecx,0xf
-	jz		lp128decfour_CBC
+	jz	lp256decfour_CBC
 
 	copy_round_keys esp,ecx,0
 	copy_round_keys esp,ecx,1
@@ -678,20 +621,27 @@ _iDec128_CBC:
 	copy_round_keys esp,ecx,8
 	copy_round_keys esp,ecx,9
 	copy_round_keys esp,ecx,10
+	copy_round_keys esp,ecx,11
+	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
 	mov ecx,esp
 
-
 align 16
-lp128decfour_CBC:
+lp256decfour_CBC:
 
 	test eax,eax
-	jz end_dec128_CBC
+	jz end_dec256_CBC
 
 	cmp eax,4
-	jl	lp128decsingle_CBC
+	jl	lp256decsingle_CBC
 
-	load_and_xor4 esi, [ecx+10*16]
+	load_and_xor4 esi, [ecx+14*16]
 	add esi,16*4
+	aesdec4 [ecx+13*16]
+	aesdec4 [ecx+12*16]
+	aesdec4 [ecx+11*16]
+	aesdec4 [ecx+10*16]
 	aesdec4 [ecx+9*16]
 	aesdec4 [ecx+8*16]
 	aesdec4 [ecx+7*16]
@@ -714,16 +664,20 @@ lp128decfour_CBC:
 
 	sub eax,4
 	store4 esi+edi-(16*4)
-	jmp lp128decfour_CBC
+	jmp lp256decfour_CBC
 
 
 	align 16
-lp128decsingle_CBC:
+lp256decsingle_CBC:
 
 	movdqu xmm0, [esi]
 	movdqa xmm1,xmm0
-	movdqu xmm4,[ecx+10*16]
+	movdqu xmm4, [ecx+14*16]
 	pxor xmm0, xmm4
+	aesdec1_u  [ecx+13*16]
+	aesdec1_u  [ecx+12*16]
+	aesdec1_u  [ecx+11*16]
+	aesdec1_u  [ecx+10*16]
 	aesdec1_u  [ecx+9*16]
 	aesdec1_u  [ecx+8*16]
 	aesdec1_u  [ecx+7*16]
@@ -733,7 +687,7 @@ lp128decsingle_CBC:
 	aesdec1_u  [ecx+3*16]
 	aesdec1_u  [ecx+2*16]
 	aesdec1_u  [ecx+1*16]
-	aesdeclast1_u [ecx+0*16]
+	aesdeclast1_u  [ecx+0*16]
 
 	pxor	xmm0,xmm5
 	movdqa	xmm5,xmm1
@@ -741,16 +695,17 @@ lp128decsingle_CBC:
 	add esi, 16
 	movdqu  [edi+esi - 16], xmm0
 	dec eax
-	jnz lp128decsingle_CBC
+	jnz lp256decsingle_CBC
+
+end_dec256_CBC:
 
-end_dec128_CBC:
 
 	mov esp,ebp
 	pop ebp
 	pop edi
 	pop esi
 
-	mov ecx,[esp-4+8]   ; first arg
+	mov ecx,[esp-4+8]  ; first arg
 	mov ecx,[ecx+12]
 	movdqu	[ecx],xmm5 ; store last iv for chaining
 
@@ -758,12 +713,9 @@ end_dec128_CBC:
 
 
 
-
-
-
 align 16
-global _iDec192
-_iDec192:
+global _iEnc128_CBC
+_iEnc128_CBC:
 	mov ecx,[esp-4+8]
 
 	push esi
@@ -774,21 +726,17 @@ _iDec192:
 	sub esp,16*16
 	and esp,0xfffffff0
 
+	mov	eax,[ecx+12]
+	movdqu xmm1,[eax]	;iv
+
 	mov eax,[ecx+16] ; numblocks
 	mov esi,[ecx]
 	mov edi,[ecx+4]
 	mov ecx,[ecx+8]
-
 	sub edi,esi
 
-	test eax,eax
-	jz end_dec192
-
-	cmp eax,4
-	jl	lp192decsingle
-
 	test	ecx,0xf
-	jz		lp192decfour
+	jz		lp128encsingle_CBC
 
 	copy_round_keys esp,ecx,0
 	copy_round_keys esp,ecx,1
@@ -801,79 +749,50 @@ _iDec192:
 	copy_round_keys esp,ecx,8
 	copy_round_keys esp,ecx,9
 	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
 	mov ecx,esp
 
-
-align 16
-lp192decfour:
-
-	test eax,eax
-	jz end_dec192
-
-	cmp eax,4
-	jl	lp192decsingle
-
-	load_and_xor4 esi, [ecx+12*16]
-	add esi,16*4
-	aesdec4 [ecx+11*16]
-	aesdec4 [ecx+10*16]
-	aesdec4 [ecx+9*16]
-	aesdec4 [ecx+8*16]
-	aesdec4 [ecx+7*16]
-	aesdec4 [ecx+6*16]
-	aesdec4 [ecx+5*16]
-	aesdec4 [ecx+4*16]
-	aesdec4 [ecx+3*16]
-	aesdec4 [ecx+2*16]
-	aesdec4 [ecx+1*16]
-	aesdeclast4 [ecx+0*16]
-
-	sub eax,4
-	store4 esi+edi-(16*4)
-	jmp lp192decfour
-
-
 	align 16
-lp192decsingle:
 
-	movdqu xmm0, [esi]
-	movdqu xmm4,[ecx+12*16]
-	pxor xmm0, xmm4
-	aesdec1_u [ecx+11*16]
-	aesdec1_u  [ecx+10*16]
-	aesdec1_u  [ecx+9*16]
-	aesdec1_u  [ecx+8*16]
-	aesdec1_u  [ecx+7*16]
-	aesdec1_u  [ecx+6*16]
-	aesdec1_u  [ecx+5*16]
-	aesdec1_u  [ecx+4*16]
-	aesdec1_u  [ecx+3*16]
-	aesdec1_u  [ecx+2*16]
-	aesdec1_u  [ecx+1*16]
-	aesdeclast1_u  [ecx+0*16]
+lp128encsingle_CBC:
 
+	movdqu xmm0, [esi]
 	add esi, 16
-	movdqu  [edi+esi - 16], xmm0
+	pxor xmm0, xmm1
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1  [ecx+1*16]
+	aesenc1  [ecx+2*16]
+	aesenc1  [ecx+3*16]
+	aesenc1  [ecx+4*16]
+	aesenc1  [ecx+5*16]
+	aesenc1  [ecx+6*16]
+	aesenc1  [ecx+7*16]
+	aesenc1  [ecx+8*16]
+	aesenc1  [ecx+9*16]
+	aesenclast1  [ecx+10*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	movdqa xmm1,xmm0
 	dec eax
-	jnz lp192decsingle
-
-end_dec192:
+	jnz lp128encsingle_CBC
 
 
 	mov esp,ebp
 	pop ebp
 	pop edi
 	pop esi
+	mov ecx,[esp-4+8]  ; first arg
+	mov ecx,[ecx+12]
+	movdqu	[ecx],xmm1 ; store last iv for chaining
 
 	ret
 
 
+
 align 16
-global _iDec192_CBC
-_iDec192_CBC:
-	mov ecx,[esp-4+8]
+global _iEnc256_CBC
+_iEnc256_CBC:
+	mov ecx,[esp-4+8]  ; first arg
 
 	push esi
 	push edi
@@ -883,24 +802,17 @@ _iDec192_CBC:
 	sub esp,16*16
 	and esp,0xfffffff0
 
-	mov eax,[ecx+12]
-	movdqu xmm5,[eax]	;iv
+	mov	eax,[ecx+12]
+	movdqu xmm1,[eax]	;iv
 
 	mov eax,[ecx+16] ; numblocks
 	mov esi,[ecx]
 	mov edi,[ecx+4]
 	mov ecx,[ecx+8]
-
 	sub edi,esi
 
-	test eax,eax
-	jz end_dec192_CBC
-
-	cmp eax,4
-	jl	lp192decsingle_CBC
-
 	test	ecx,0xf
-	jz		lp192decfour_CBC
+	jz		lp256encsingle_CBC
 
 	copy_round_keys esp,ecx,0
 	copy_round_keys esp,ecx,1
@@ -915,1047 +827,39 @@ _iDec192_CBC:
 	copy_round_keys esp,ecx,10
 	copy_round_keys esp,ecx,11
 	copy_round_keys esp,ecx,12
+	copy_round_keys esp,ecx,13
+	copy_round_keys esp,ecx,14
 	mov ecx,esp
 
-align 16
-lp192decfour_CBC:
+	align 16
 
-	test eax,eax
-	jz end_dec192_CBC
+lp256encsingle_CBC:
 
-	cmp eax,4
-	jl	lp192decsingle_CBC
-
-	load_and_xor4 esi, [ecx+12*16]
-	add esi,16*4
-	aesdec4 [ecx+11*16]
-	aesdec4 [ecx+10*16]
-	aesdec4 [ecx+9*16]
-	aesdec4 [ecx+8*16]
-	aesdec4 [ecx+7*16]
-	aesdec4 [ecx+6*16]
-	aesdec4 [ecx+5*16]
-	aesdec4 [ecx+4*16]
-	aesdec4 [ecx+3*16]
-	aesdec4 [ecx+2*16]
-	aesdec4 [ecx+1*16]
-	aesdeclast4 [ecx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqu	xmm4,[esi- 16*4 + 0*16]
-	pxor	xmm1,xmm4
-	movdqu	xmm4,[esi- 16*4 + 1*16]
-	pxor	xmm2,xmm4
-	movdqu	xmm4,[esi- 16*4 + 2*16]
-	pxor	xmm3,xmm4
-	movdqu	xmm5,[esi- 16*4 + 3*16]
-
-	sub eax,4
-	store4 esi+edi-(16*4)
-	jmp lp192decfour_CBC
-
-
-	align 16
-lp192decsingle_CBC:
-
-	movdqu xmm0, [esi]
-	movdqu xmm4,[ecx+12*16]
-	movdqa xmm1,xmm0
-	pxor xmm0, xmm4
-	aesdec1_u [ecx+11*16]
-	aesdec1_u [ecx+10*16]
-	aesdec1_u [ecx+9*16]
-	aesdec1_u [ecx+8*16]
-	aesdec1_u [ecx+7*16]
-	aesdec1_u [ecx+6*16]
-	aesdec1_u [ecx+5*16]
-	aesdec1_u [ecx+4*16]
-	aesdec1_u [ecx+3*16]
-	aesdec1_u [ecx+2*16]
-	aesdec1_u [ecx+1*16]
-	aesdeclast1_u [ecx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqa	xmm5,xmm1
-
-	add esi, 16
-	movdqu  [edi+esi - 16], xmm0
-	dec eax
-	jnz lp192decsingle_CBC
-
-end_dec192_CBC:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	mov ecx,[esp-4+8]
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm5 ; store last iv for chaining
-
-	ret
-
-
-
-
-
-align 16
-global _iDec256
-_iDec256:
-	mov ecx, [esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-
-	test eax,eax
-	jz end_dec256
-
-	cmp eax,4
-	jl lp256dec
-
-	test	ecx,0xf
-	jz	lp256dec4
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	copy_round_keys esp,ecx,13
-	copy_round_keys esp,ecx,14
-	mov ecx,esp
-
-	align 16
-lp256dec4:
-	test eax,eax
-	jz end_dec256
-
-	cmp eax,4
-	jl lp256dec
-
-	load_and_xor4 esi,[ecx+14*16]
-	add esi, 4*16
-	aesdec4 [ecx+13*16]
-	aesdec4 [ecx+12*16]
-	aesdec4 [ecx+11*16]
-	aesdec4 [ecx+10*16]
-	aesdec4 [ecx+9*16]
-	aesdec4 [ecx+8*16]
-	aesdec4 [ecx+7*16]
-	aesdec4 [ecx+6*16]
-	aesdec4 [ecx+5*16]
-	aesdec4 [ecx+4*16]
-	aesdec4 [ecx+3*16]
-	aesdec4 [ecx+2*16]
-	aesdec4 [ecx+1*16]
-	aesdeclast4 [ecx+0*16]
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lp256dec4
-
-	align 16
-lp256dec:
-
-	movdqu xmm0, [esi]
-	movdqu xmm4,[ecx+14*16]
-	add esi, 16
-	pxor xmm0, xmm4                     ; Round 0 (only xor)
-	aesdec1_u  [ecx+13*16]
-	aesdec1_u  [ecx+12*16]
-	aesdec1_u  [ecx+11*16]
-	aesdec1_u  [ecx+10*16]
-	aesdec1_u  [ecx+9*16]
-	aesdec1_u  [ecx+8*16]
-	aesdec1_u  [ecx+7*16]
-	aesdec1_u  [ecx+6*16]
-	aesdec1_u  [ecx+5*16]
-	aesdec1_u  [ecx+4*16]
-	aesdec1_u  [ecx+3*16]
-	aesdec1_u  [ecx+2*16]
-	aesdec1_u  [ecx+1*16]
-	aesdeclast1_u  [ecx+0*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp256dec
-
-end_dec256:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	ret
-
-
-
-
-align 16
-global _iDec256_CBC
-_iDec256_CBC:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov eax,[ecx+12]
-	movdqu xmm5,[eax]	;iv
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_dec256_CBC
-
-	cmp eax,4
-	jl	lp256decsingle_CBC
-
-	test	ecx,0xf
-	jz	lp256decfour_CBC
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	copy_round_keys esp,ecx,13
-	copy_round_keys esp,ecx,14
-	mov ecx,esp
-
-align 16
-lp256decfour_CBC:
-
-	test eax,eax
-	jz end_dec256_CBC
-
-	cmp eax,4
-	jl	lp256decsingle_CBC
-
-	load_and_xor4 esi, [ecx+14*16]
-	add esi,16*4
-	aesdec4 [ecx+13*16]
-	aesdec4 [ecx+12*16]
-	aesdec4 [ecx+11*16]
-	aesdec4 [ecx+10*16]
-	aesdec4 [ecx+9*16]
-	aesdec4 [ecx+8*16]
-	aesdec4 [ecx+7*16]
-	aesdec4 [ecx+6*16]
-	aesdec4 [ecx+5*16]
-	aesdec4 [ecx+4*16]
-	aesdec4 [ecx+3*16]
-	aesdec4 [ecx+2*16]
-	aesdec4 [ecx+1*16]
-	aesdeclast4 [ecx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqu	xmm4,[esi- 16*4 + 0*16]
-	pxor	xmm1,xmm4
-	movdqu	xmm4,[esi- 16*4 + 1*16]
-	pxor	xmm2,xmm4
-	movdqu	xmm4,[esi- 16*4 + 2*16]
-	pxor	xmm3,xmm4
-	movdqu	xmm5,[esi- 16*4 + 3*16]
-
-	sub eax,4
-	store4 esi+edi-(16*4)
-	jmp lp256decfour_CBC
-
-
-	align 16
-lp256decsingle_CBC:
-
-	movdqu xmm0, [esi]
-	movdqa xmm1,xmm0
-	movdqu xmm4, [ecx+14*16]
-	pxor xmm0, xmm4
-	aesdec1_u  [ecx+13*16]
-	aesdec1_u  [ecx+12*16]
-	aesdec1_u  [ecx+11*16]
-	aesdec1_u  [ecx+10*16]
-	aesdec1_u  [ecx+9*16]
-	aesdec1_u  [ecx+8*16]
-	aesdec1_u  [ecx+7*16]
-	aesdec1_u  [ecx+6*16]
-	aesdec1_u  [ecx+5*16]
-	aesdec1_u  [ecx+4*16]
-	aesdec1_u  [ecx+3*16]
-	aesdec1_u  [ecx+2*16]
-	aesdec1_u  [ecx+1*16]
-	aesdeclast1_u  [ecx+0*16]
-
-	pxor	xmm0,xmm5
-	movdqa	xmm5,xmm1
-
-	add esi, 16
-	movdqu  [edi+esi - 16], xmm0
-	dec eax
-	jnz lp256decsingle_CBC
-
-end_dec256_CBC:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm5 ; store last iv for chaining
-
-	ret
-
-
-
-
-
-
-
-
-
-align 16
-global _iEnc128
-_iEnc128:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_enc128
-
-	cmp eax,4
-	jl lp128encsingle
-
-	test	ecx,0xf
-	jz		lpenc128four
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	mov ecx,esp
-
-
-	align 16
-
-lpenc128four:
-
-	test eax,eax
-	jz end_enc128
-
-	cmp eax,4
-	jl lp128encsingle
-
-	load_and_xor4 esi,[ecx+0*16]
-	add esi,4*16
-	aesenc4	[ecx+1*16]
-	aesenc4	[ecx+2*16]
-	aesenc4	[ecx+3*16]
-	aesenc4	[ecx+4*16]
-	aesenc4	[ecx+5*16]
-	aesenc4	[ecx+6*16]
-	aesenc4	[ecx+7*16]
-	aesenc4	[ecx+8*16]
-	aesenc4	[ecx+9*16]
-	aesenclast4	[ecx+10*16]
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lpenc128four
-
-	align 16
-lp128encsingle:
-
-	movdqu xmm0, [esi]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u  [ecx+1*16]
-	aesenc1_u  [ecx+2*16]
-	aesenc1_u  [ecx+3*16]
-	aesenc1_u  [ecx+4*16]
-	aesenc1_u  [ecx+5*16]
-	aesenc1_u  [ecx+6*16]
-	aesenc1_u  [ecx+7*16]
-	aesenc1_u  [ecx+8*16]
-	aesenc1_u  [ecx+9*16]
-	aesenclast1_u  [ecx+10*16]
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp128encsingle
-
-end_enc128:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	ret
-
-
-align 16
-global _iEnc128_CTR
-_iEnc128_CTR:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-    movdqu xmm5,[eax]	;initial counter
-	movdqa xmm6, [byte_swap_16]
-	pshufb xmm5, xmm6 ; byte swap counter
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_encctr128
-
-	cmp eax,4
-	jl lp128encctrsingle
-
-	test	ecx,0xf
-	jz		lpencctr128four
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	mov ecx,esp
-
-
-	align 16
-
-lpencctr128four:
-
-	test eax,eax
-	jz end_encctr128
-
-	cmp eax,4
-	jl lp128encctrsingle
-
-	load_and_inc4 [ecx+0*16]
-	add esi,4*16
-	aesenc4	[ecx+1*16]
-	aesenc4	[ecx+2*16]
-	aesenc4	[ecx+3*16]
-	aesenc4	[ecx+4*16]
-	aesenc4	[ecx+5*16]
-	aesenc4	[ecx+6*16]
-	aesenc4	[ecx+7*16]
-	aesenc4	[ecx+8*16]
-	aesenc4	[ecx+9*16]
-	aesenclast4	[ecx+10*16]
-	xor_with_input4 esi-(4*16)
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lpencctr128four
-
-	align 16
-lp128encctrsingle:
-
-	movdqa	xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	paddd	xmm5,[counter_add_one]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u [ecx+1*16]
-	aesenc1_u [ecx+2*16]
-	aesenc1_u [ecx+3*16]
-	aesenc1_u [ecx+4*16]
-	aesenc1_u [ecx+5*16]
-	aesenc1_u [ecx+6*16]
-	aesenc1_u [ecx+7*16]
-	aesenc1_u [ecx+8*16]
-	aesenc1_u [ecx+9*16]
-	aesenclast1_u [ecx+10*16]
-	movdqu xmm4, [esi-16]
-	pxor	xmm0,xmm4
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp128encctrsingle
-
-end_encctr128:
-	pshufb xmm5, xmm6 ; byte swap counter
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm5 ; store last counter for chaining
-
-	ret
-
-
-align 16
-global _iEnc192_CTR
-_iEnc192_CTR:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-	movdqu xmm5,[eax]	;initial counter
-	movdqa xmm6, [byte_swap_16]
-	pshufb xmm5, xmm6 ; byte swap counter
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_encctr192
-
-	cmp eax,4
-	jl lp192encctrsingle
-
-	test	ecx,0xf
-	jz lpencctr192four
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	mov ecx,esp
-
-
-	align 16
-
-lpencctr192four:
-
-	test eax,eax
-	jz end_encctr192
-
-	cmp eax,4
-	jl lp192encctrsingle
-
-	load_and_inc4 [ecx+0*16]
-	add esi,4*16
-	aesenc4	[ecx+1*16]
-	aesenc4	[ecx+2*16]
-	aesenc4	[ecx+3*16]
-	aesenc4	[ecx+4*16]
-	aesenc4	[ecx+5*16]
-	aesenc4	[ecx+6*16]
-	aesenc4	[ecx+7*16]
-	aesenc4	[ecx+8*16]
-	aesenc4	[ecx+9*16]
-	aesenc4	[ecx+10*16]
-	aesenc4	[ecx+11*16]
-	aesenclast4	[ecx+12*16]
-	xor_with_input4 esi-(4*16)
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lpencctr192four
-
-	align 16
-lp192encctrsingle:
-
-	movdqa	xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	paddd	xmm5,[counter_add_one]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u  [ecx+1*16]
-	aesenc1_u  [ecx+2*16]
-	aesenc1_u  [ecx+3*16]
-	aesenc1_u  [ecx+4*16]
-	aesenc1_u  [ecx+5*16]
-	aesenc1_u  [ecx+6*16]
-	aesenc1_u  [ecx+7*16]
-	aesenc1_u  [ecx+8*16]
-	aesenc1_u  [ecx+9*16]
-	aesenc1_u  [ecx+10*16]
-	aesenc1_u  [ecx+11*16]
-	aesenclast1_u  [ecx+12*16]
-	movdqu xmm4, [esi-16]
-	pxor	xmm0,xmm4
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp192encctrsingle
-
-end_encctr192:
-
-	pshufb xmm5, xmm6 ; byte swap counter
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm5 ; store last counter for chaining
-
-	ret
-
-
-align 16
-global _iEnc256_CTR
-_iEnc256_CTR:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-	movdqu xmm5,[eax]	;initial counter
-	movdqa xmm6, [byte_swap_16]
-	pshufb xmm5, xmm6 ; byte swap counter
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_encctr256
-
-	cmp eax,4
-	jl lp256encctrsingle
-
-	test	ecx,0xf
-	jz	lpencctr256four
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	copy_round_keys esp,ecx,13
-	copy_round_keys esp,ecx,14
-	mov ecx,esp
-
-
-	align 16
-
-lpencctr256four:
-
-	test eax,eax
-	jz end_encctr256
-
-	cmp eax,4
-	jl lp256encctrsingle
-
-	load_and_inc4 [ecx+0*16]
-	add esi,4*16
-	aesenc4	[ecx+1*16]
-	aesenc4	[ecx+2*16]
-	aesenc4	[ecx+3*16]
-	aesenc4	[ecx+4*16]
-	aesenc4	[ecx+5*16]
-	aesenc4	[ecx+6*16]
-	aesenc4	[ecx+7*16]
-	aesenc4	[ecx+8*16]
-	aesenc4	[ecx+9*16]
-	aesenc4	[ecx+10*16]
-	aesenc4	[ecx+11*16]
-	aesenc4	[ecx+12*16]
-	aesenc4	[ecx+13*16]
-	aesenclast4	[ecx+14*16]
-	xor_with_input4 esi-(4*16)
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lpencctr256four
-
-	align 16
-
-lp256encctrsingle:
-
-	movdqa	xmm0,xmm5
-	pshufb	xmm0, xmm6 ; byte swap counter back
-	paddd	xmm5,[counter_add_one]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u  [ecx+1*16]
-	aesenc1_u  [ecx+2*16]
-	aesenc1_u  [ecx+3*16]
-	aesenc1_u  [ecx+4*16]
-	aesenc1_u  [ecx+5*16]
-	aesenc1_u  [ecx+6*16]
-	aesenc1_u  [ecx+7*16]
-	aesenc1_u  [ecx+8*16]
-	aesenc1_u  [ecx+9*16]
-	aesenc1_u  [ecx+10*16]
-	aesenc1_u  [ecx+11*16]
-	aesenc1_u  [ecx+12*16]
-	aesenc1_u  [ecx+13*16]
-	aesenclast1_u  [ecx+14*16]
-	movdqu xmm4, [esi-16]
-	pxor	xmm0,xmm4
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp256encctrsingle
-
-end_encctr256:
-
-	pshufb xmm5, xmm6 ; byte swap counter
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm5 ; store last counter for chaining
-
-	ret
-
-
-
-
-
-
-align 16
-global _iEnc128_CBC
-_iEnc128_CBC:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-	movdqu xmm1,[eax]	;iv
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-	sub edi,esi
-
-	test	ecx,0xf
-	jz		lp128encsingle_CBC
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	mov ecx,esp
-
-	align 16
-
-lp128encsingle_CBC:
-
-	movdqu xmm0, [esi]
-	add esi, 16
-	pxor xmm0, xmm1
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1  [ecx+1*16]
-	aesenc1  [ecx+2*16]
-	aesenc1  [ecx+3*16]
-	aesenc1  [ecx+4*16]
-	aesenc1  [ecx+5*16]
-	aesenc1  [ecx+6*16]
-	aesenc1  [ecx+7*16]
-	aesenc1  [ecx+8*16]
-	aesenc1  [ecx+9*16]
-	aesenclast1  [ecx+10*16]
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	movdqa xmm1,xmm0
-	dec eax
-	jnz lp128encsingle_CBC
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm1 ; store last iv for chaining
-
-	ret
-
-
-align 16
-global _iEnc192_CBC
-_iEnc192_CBC:
-	mov ecx,[esp-4+8]  ; first arg
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-	movdqu xmm1,[eax]	;iv
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-	sub edi,esi
-
-	test	ecx,0xf
-	jz		lp192encsingle_CBC
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	mov ecx,esp
-
-	align 16
-
-lp192encsingle_CBC:
-
-	movdqu xmm0, [esi]
-	add esi, 16
-	pxor xmm0, xmm1
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1  [ecx+1*16]
-	aesenc1  [ecx+2*16]
-	aesenc1  [ecx+3*16]
-	aesenc1  [ecx+4*16]
-	aesenc1  [ecx+5*16]
-	aesenc1  [ecx+6*16]
-	aesenc1  [ecx+7*16]
-	aesenc1  [ecx+8*16]
-	aesenc1  [ecx+9*16]
-	aesenc1  [ecx+10*16]
-	aesenc1  [ecx+11*16]
-	aesenclast1  [ecx+12*16]
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	movdqa xmm1,xmm0
-	dec eax
-	jnz lp192encsingle_CBC
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-	mov ecx,[esp-4+8]  ; first arg
-	mov ecx,[ecx+12]
-	movdqu	[ecx],xmm1 ; store last iv for chaining
-
-	ret
-
-align 16
-global _iEnc256_CBC
-_iEnc256_CBC:
-	mov ecx,[esp-4+8]  ; first arg
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov	eax,[ecx+12]
-	movdqu xmm1,[eax]	;iv
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-	sub edi,esi
-
-	test	ecx,0xf
-	jz		lp256encsingle_CBC
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	copy_round_keys esp,ecx,13
-	copy_round_keys esp,ecx,14
-	mov ecx,esp
-
-	align 16
-
-lp256encsingle_CBC:
-
-;abab
-	movdqu xmm0, [esi]
-	add esi, 16
-	pxor xmm0, xmm1
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1 [ecx+1*16]
-	aesenc1 [ecx+2*16]
-	aesenc1 [ecx+3*16]
-	aesenc1 [ecx+4*16]
-	aesenc1 [ecx+5*16]
-	aesenc1 [ecx+6*16]
-	aesenc1 [ecx+7*16]
-	aesenc1 [ecx+8*16]
-	aesenc1 [ecx+9*16]
-	aesenc1 [ecx+10*16]
-	aesenc1 [ecx+11*16]
-	aesenc1 [ecx+12*16]
-	aesenc1 [ecx+13*16]
-	aesenclast1 [ecx+14*16]
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	movdqa xmm1,xmm0
-	dec eax
-	jnz lp256encsingle_CBC
+;abab
+	movdqu xmm0, [esi]
+	add esi, 16
+	pxor xmm0, xmm1
+	movdqu xmm4,[ecx+0*16]
+	pxor xmm0, xmm4
+	aesenc1 [ecx+1*16]
+	aesenc1 [ecx+2*16]
+	aesenc1 [ecx+3*16]
+	aesenc1 [ecx+4*16]
+	aesenc1 [ecx+5*16]
+	aesenc1 [ecx+6*16]
+	aesenc1 [ecx+7*16]
+	aesenc1 [ecx+8*16]
+	aesenc1 [ecx+9*16]
+	aesenc1 [ecx+10*16]
+	aesenc1 [ecx+11*16]
+	aesenc1 [ecx+12*16]
+	aesenc1 [ecx+13*16]
+	aesenclast1 [ecx+14*16]
+		; Store output encrypted data into CIPHERTEXT array
+	movdqu  [esi+edi-16], xmm0
+	movdqa xmm1,xmm0
+	dec eax
+	jnz lp256encsingle_CBC
 
 
 	mov esp,ebp
@@ -1967,233 +871,3 @@ lp256encsingle_CBC:
 	movdqu	[ecx],xmm1 ; store last iv for chaining
 
 	ret
-
-
-
-
-
-align 16
-global _iEnc192
-_iEnc192:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_enc192
-
-	cmp eax,4
-	jl lp192encsingle
-
-	test	ecx,0xf
-	jz		lpenc192four
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	mov ecx,esp
-
-	align 16
-
-lpenc192four:
-
-	test eax,eax
-	jz end_enc192
-
-	cmp eax,4
-	jl lp192encsingle
-
-	load_and_xor4 esi,[ecx+0*16]
-	add esi,4*16
-	aesenc4	[ecx+1*16]
-	aesenc4	[ecx+2*16]
-	aesenc4	[ecx+3*16]
-	aesenc4	[ecx+4*16]
-	aesenc4	[ecx+5*16]
-	aesenc4	[ecx+6*16]
-	aesenc4	[ecx+7*16]
-	aesenc4	[ecx+8*16]
-	aesenc4	[ecx+9*16]
-	aesenc4	[ecx+10*16]
-	aesenc4	[ecx+11*16]
-	aesenclast4	[ecx+12*16]
-
-	store4 esi+edi-16*4
-	sub eax,4
-	jmp lpenc192four
-
-	align 16
-lp192encsingle:
-
-	movdqu xmm0, [esi]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u [ecx+1*16]
-	aesenc1_u [ecx+2*16]
-	aesenc1_u [ecx+3*16]
-	aesenc1_u [ecx+4*16]
-	aesenc1_u [ecx+5*16]
-	aesenc1_u [ecx+6*16]
-	aesenc1_u [ecx+7*16]
-	aesenc1_u [ecx+8*16]
-	aesenc1_u [ecx+9*16]
-	aesenc1_u [ecx+10*16]
-	aesenc1_u [ecx+11*16]
-	aesenclast1_u [ecx+12*16]
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp192encsingle
-
-end_enc192:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	ret
-
-
-
-
-align 16
-global _iEnc256
-_iEnc256:
-	mov ecx,[esp-4+8]
-
-	push esi
-	push edi
-	push ebp
-	mov ebp,esp
-
-	sub esp,16*16
-	and esp,0xfffffff0
-
-	mov eax,[ecx+16] ; numblocks
-	mov esi,[ecx]
-	mov edi,[ecx+4]
-	mov ecx,[ecx+8]
-
-	sub edi,esi
-
-	test eax,eax
-	jz end_enc256
-
-	cmp eax,4
-	jl lp256enc
-
-	test	ecx,0xf
-	jz	lp256enc4
-
-	copy_round_keys esp,ecx,0
-	copy_round_keys esp,ecx,1
-	copy_round_keys esp,ecx,2
-	copy_round_keys esp,ecx,3
-	copy_round_keys esp,ecx,4
-	copy_round_keys esp,ecx,5
-	copy_round_keys esp,ecx,6
-	copy_round_keys esp,ecx,7
-	copy_round_keys esp,ecx,8
-	copy_round_keys esp,ecx,9
-	copy_round_keys esp,ecx,10
-	copy_round_keys esp,ecx,11
-	copy_round_keys esp,ecx,12
-	copy_round_keys esp,ecx,13
-	copy_round_keys esp,ecx,14
-	mov ecx,esp
-
-
-
-	align 16
-
-lp256enc4:
-	test eax,eax
-	jz end_enc256
-
-	cmp eax,4
-	jl lp256enc
-
-
-	load_and_xor4 esi,[ecx+0*16]
-	add esi, 16*4
-	aesenc4 [ecx+1*16]
-	aesenc4 [ecx+2*16]
-	aesenc4 [ecx+3*16]
-	aesenc4 [ecx+4*16]
-	aesenc4 [ecx+5*16]
-	aesenc4 [ecx+6*16]
-	aesenc4 [ecx+7*16]
-	aesenc4 [ecx+8*16]
-	aesenc4 [ecx+9*16]
-	aesenc4 [ecx+10*16]
-	aesenc4 [ecx+11*16]
-	aesenc4 [ecx+12*16]
-	aesenc4 [ecx+13*16]
-	aesenclast4 [ecx+14*16]
-
-	store4  esi+edi-16*4
-	sub eax,4
-	jmp lp256enc4
-
-	align 16
-lp256enc:
-
-	movdqu xmm0, [esi]
-	add esi, 16
-	movdqu xmm4,[ecx+0*16]
-	pxor xmm0, xmm4
-	aesenc1_u [ecx+1*16]
-	aesenc1_u [ecx+2*16]
-	aesenc1_u [ecx+3*16]
-	aesenc1_u [ecx+4*16]
-	aesenc1_u [ecx+5*16]
-	aesenc1_u [ecx+6*16]
-	aesenc1_u [ecx+7*16]
-	aesenc1_u [ecx+8*16]
-	aesenc1_u [ecx+9*16]
-	aesenc1_u [ecx+10*16]
-	aesenc1_u [ecx+11*16]
-	aesenc1_u [ecx+12*16]
-	aesenc1_u [ecx+13*16]
-	aesenclast1_u [ecx+14*16]
-
-		; Store output encrypted data into CIPHERTEXT array
-	movdqu  [esi+edi-16], xmm0
-	dec eax
-	jnz lp256enc
-
-end_enc256:
-
-
-	mov esp,ebp
-	pop ebp
-	pop edi
-	pop esi
-
-	ret


More information about the cvs-krb5 mailing list