android_kernel_xiaomi_sm8350/arch/x86_64/crypto/twofish-x86_64-asm.S

/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
***************************************************************************/

.file "twofish-x86_64-asm.S"
.text

#include <asm/asm-offsets.h>

/*
 * Byte offsets of the four 32-bit block words a, b, c, d within a group
 * of four consecutive words.  The whitening macros add one of these to
 * w (or w+16).  The code visible in this file only passes a_offset and
 * c_offset, because each whitening xor is 64 bits wide and therefore
 * covers two adjacent words at once.
 */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/*
 * Layout of the crypto context, as byte offsets from the context base
 * (held in %r11 by the code below).  Each S-box table is 256 words of
 * 4 bytes (1024 bytes).  The round macros address S0 with a bare
 * (%r11,%rdi,4), i.e. displacement 0, instead of spelling out s0.
 */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening words: w+0..15 input, w+16..31 output */
#define k	4128	/* round subkey words; a round reads k+round and k+4+round */

/*
 * Register aliases that allow macro substitution.
 *
 * The round macros receive an alias stem (R0..R3) and paste a suffix onto
 * it with ##:  D selects the 32-bit view, B the low byte and H the second
 * byte (bits 8-15).  Token pasting works on the unexpanded argument, so
 * every stem+suffix combination needs its own #define here.
 * Only %rax, %rbx, %rcx and %rdx are used because they are the registers
 * that have an addressable second byte (%ah, %bh, %ch, %dh).
 */

#define R0     %rax
#define R0D    %eax
#define R0B    %al
#define R0H    %ah

#define R1     %rbx
#define R1D    %ebx
#define R1B    %bl
#define R1H    %bh

#define R2     %rcx
#define R2D    %ecx
#define R2B    %cl
#define R2H    %ch

#define R3     %rdx
#define R3D    %edx
#define R3B    %dl
#define R3H    %dh


/*
 * Input whitening: xor the whitening words found at w + off inside the
 * context addressed by ctx into reg.  The operand size follows reg; the
 * callers in this file pass 64-bit registers, so two adjacent whitening
 * words are applied per invocation.
 */
#define input_whitening(reg,ctx,off)\
	xor	w+off(ctx),	reg;

/*
 * Output whitening: xor the whitening words found at w + 16 + off inside
 * the context addressed by ctx into reg, i.e. the second group of four
 * whitening words.  The operand size follows reg; the callers in this
 * file pass 64-bit registers, so two adjacent whitening words are applied
 * per invocation.
 */
#define output_whitening(reg,ctx,off)\
	xor	w+16+off(ctx),	reg;


/*
 * encrypt_round(a,b,c,d,round): one encryption round.
 *
 * a, b, c, d are register alias stems (R0..R3); the suffixes B, H and D
 * are pasted on to reach the low byte, the second byte and the 32-bit
 * view of each register.  round is the byte offset of this round's two
 * subkey words relative to k.  %r11 must hold the context base.
 *
 * State on entry:
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 *
 * operations on a and b are interleaved to increase performance:
 *   %r9d collects four table words selected by the bytes of a, using
 *        tables s2, s3, then (after ror $16) S0 and s1;
 *   %r8d collects four table words selected by the bytes of b, using
 *        tables s1, s2, then (after ror $16) s3 and S0.
 * Each movzx copies the index byte into %edi before the following ror
 * changes the register, so the instruction order must not be altered.
 *
 * The results are then combined as
 *   %r9d = %r9d + %r8d              then  + word at k+round
 *   %r8d = %r8d + (that first sum)  then  + word at k+4+round
 *   c    = (c ^ %r9d) rol 15
 *   d    = d ^ %r8d
 *
 * State on exit:
 *   a  un-rotated (the ror $16 undoes the entry rotation)
 *   b  rotated right by 16+15 = 31 bits, which equals rol $1
 *   c  rol $15 of the xored value, which equals ror $1 followed by rol $16
 *   d  plain xored value
 * so calling the macro again with the pairs swapped (c,d,a,b), as the
 * caller does, meets the entry conventions above.
 *
 * Clobbers %edi, %r8d, %r9d and the flags.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;

/*
 * encrypt_last_round(a,b,c,d,round): final encryption round.
 *
 * Same table lookups and subkey additions as encrypt_round, but the
 * registers are left in the form needed for the output whitening instead
 * of the form needed by a following round.  %r11 must hold the context
 * base; round is the byte offset of the two subkey words relative to k.
 *
 * State on entry:
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 *
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening:
 *   %r10 first receives b shifted into its upper 32 bits (taken before b
 *   is rotated); once a has been un-rotated by its ror $16, the full
 *   64-bit a register is xored in, giving %r10 = b:a.  This relies on the
 *   upper half of the a register being zero, which the 32-bit ror
 *   guarantees.
 *
 * State on exit:
 *   %r10  b in bits 63..32, un-rotated a in bits 31..0
 *   a     un-rotated
 *   b     left rotated right by 16 (its value was saved in %r10 first)
 *   c     (c ^ %r9d) ror 1, no extra 16-bit rotation
 *   d     d ^ %r8d
 *
 * Unlike encrypt_round, the expansion does not end in a ';'.
 * Clobbers %edi, %r8d, %r9d, %r10 and the flags.
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * decrypt_round(a,b,c,d,round): one decryption round.
 *
 * a, b, c, d are register alias stems (R0..R3); the suffixes B, H and D
 * are pasted on to reach the low byte, the second byte and the 32-bit
 * view of each register.  round is the byte offset of this round's two
 * subkey words relative to k.  %r11 must hold the context base.
 *
 * State on entry:
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 *
 * operations on a and b are interleaved to increase performance:
 *   %r9d collects four table words selected by the bytes of a, using
 *        tables S0, s1, then (after ror $16) s2 and s3;
 *   %r8d collects four table words selected by the bytes of b, using
 *        tables s3, S0, then (after ror $16) s1 and s2.
 * Each movzx copies the index byte into %edi before the following ror
 * changes the register, so the instruction order must not be altered.
 *
 * The results are then combined as
 *   %r9d = %r9d + %r8d              then  + word at k+round
 *   %r8d = %r8d + (that first sum)  then  + word at k+4+round
 *   c    = c ^ %r9d
 *   d    = (d ^ %r8d) rol 15
 *
 * State on exit:
 *   a  rotated right by 16+15 = 31 bits, which equals rol $1
 *   b  un-rotated (the ror $16 undoes the entry rotation)
 *   c  plain xored value
 *   d  rol $15 of the xored value, which equals ror $1 followed by rol $16
 * so calling the macro again with the pairs swapped (c,d,a,b), as the
 * caller does, meets the entry conventions above.
 *
 * Clobbers %edi, %r8d, %r9d and the flags.
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * decrypt_last_round(a,b,c,d,round): final decryption round.
 *
 * Same table lookups and subkey additions as decrypt_round, but the
 * registers are left in the form needed for the final whitening instead
 * of the form needed by a following round.  %r11 must hold the context
 * base; round is the byte offset of the two subkey words relative to k.
 *
 * State on entry:
 * a input register containing a
 * b input register containing b (rotated 16, exactly as for
 *   decrypt_round: the first two lookups on b use s3 and S0)
 * c input register containing c (already rol $1)
 * d input register containing d
 *
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening:
 *   after b has been un-rotated by its ror $16 and before a is rotated,
 *   %r10 is built as b shifted into the upper 32 bits xor the full 64-bit
 *   a register, giving %r10 = b:a.  The index byte a ## H is copied into
 *   %edi before that, so the later ror $16 on a does not disturb it.
 *
 * State on exit:
 *   %r10  un-rotated b in bits 63..32, a in bits 31..0
 *   a     left rotated right by 16 (its value was saved in %r10 first)
 *   b     un-rotated
 *   c     c ^ %r9d
 *   d     (d ^ %r8d) ror 1, no extra 16-bit rotation
 *
 * Clobbers %edi, %r8d, %r9d, %r10 and the flags.
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

.align 8
.global twofish_enc_blk
.global twofish_dec_blk

/*
 * twofish_enc_blk: encrypt one 16-byte block.
 *
 * In:    %rdi = crypto tfm address (crypto_tfm_ctx_offset is added to
 *               reach the context)
 *        %rsi = output address, 16 bytes are stored
 *        %rdx = input address, 16 bytes are loaded
 * Out:   %rax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before returning
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
	.type	twofish_enc_blk, @function
twofish_enc_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
	/*
	 * The ctx address is moved to %r11 so that %rdi is free to serve as
	 * the table index: the round macros movzx from %ah/%bh/%ch/%dh, and
	 * such an instruction cannot name a register that needs a REX prefix.
	 */
	mov	%rdi,		%r11

	/*
	 * R1 = words a (low half) and b (high half), R3 = words c and d.
	 * R3 is %rdx, the input pointer, so it must be overwritten last.
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)

	/* unpack into R0 = a rol 16, R1 = b, R2 = c, R3 = d rol 1 */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; the register pairs swap roles after every round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/* %r10 was packed by encrypt_last_round from its a and b inputs */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack the last round's c (R0, low half) and d (R1, high half) */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	/* a 32-bit write clears the upper half, so %rax = 1 */
	movl	$1,	%eax
	ret
	.size	twofish_enc_blk, .-twofish_enc_blk

/*
 * twofish_dec_blk: decrypt one 16-byte block.
 *
 * In:    %rdi = crypto tfm address (crypto_tfm_ctx_offset is added to
 *               reach the context)
 *        %rsi = output address, 16 bytes are stored
 *        %rdx = input address, 16 bytes are loaded
 * Out:   %rax = 1
 * Saved: %rbx (R1) is pushed on entry and popped before returning
 * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
 */
	.align	8	/* same entry alignment as twofish_enc_blk */
	.type	twofish_dec_blk, @function
twofish_dec_blk:
	pushq	R1

	/* %rdi contains the crypto tfm address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx address */
	/*
	 * The ctx address is moved to %r11 so that %rdi is free to serve as
	 * the table index: the round macros movzx from %ah/%bh/%ch/%dh, and
	 * such an instruction cannot name a register that needs a REX prefix.
	 */
	mov	%rdi,		%r11

	/*
	 * R1 = words a (low half) and b (high half), R3 = words c and d.
	 * R3 is %rdx, the input pointer, so it must be overwritten last.
	 * Decryption starts by removing the output whitening.
	 */
	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)

	/* unpack into R0 = a, R1 = b rol 16, R2 = c rol 1, R3 = d */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	/*
	 * 16 rounds with the subkey offsets in descending order; the
	 * register pairs swap roles after every round.
	 */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %r10 was packed by decrypt_last_round from its a and b inputs */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack the last round's c (R0, low half) and d (R1, high half) */
	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	/* a 32-bit write clears the upper half, so %rax = 1 */
	movl	$1,	%eax
	ret
	.size	twofish_dec_blk, .-twofish_dec_blk
[CRYPTO] twofish: x86-64 assembly version The patch passed the tcrypt tests and automated filesystem tests. This rewrite resulted in some nice performance increase over my last patch. Short summary of the tcrypt benchmarks: Twofish Assembler vs. Twofish C (256bit 8kb block CBC) encrypt: -27% Cycles decrypt: -23% Cycles Twofish Assembler vs. AES Assembler (128bit 8kb block CBC) encrypt: +18% Cycles decrypt: +15% Cycles Twofish Assembler vs. AES Assembler (256bit 8kb block CBC) encrypt: -9% Cycles decrypt: -8% Cycles Full Output: http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-x86_64.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-x86_64.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-x86_64.txt Here is another bonnie++ benchmark with encrypted filesystems. Most runs maxed out the hd. It should give some idea what the module can do for encrypted filesystem performance even though you can't see the full numbers. http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060610_130806_x86_64.html Signed-off-by: Joachim Fritschi <jfritschi@freenet.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> 2006-06-20 07:12:02 -04:00			`/***************************************************************************`
			`* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *`
			`* *`
			`* This program is free software; you can redistribute it and/or modify *`
			`* it under the terms of the GNU General Public License as published by *`
			`* the Free Software Foundation; either version 2 of the License, or *`
			`* (at your option) any later version. *`
			`* *`
			`* This program is distributed in the hope that it will be useful, *`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of *`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *`
			`* GNU General Public License for more details. *`
			`* *`
			`* You should have received a copy of the GNU General Public License *`
			`* along with this program; if not, write to the *`
			`* Free Software Foundation, Inc., *`
			`* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *`
			`***************************************************************************/`

			`.file "twofish-x86_64-asm.S"`
			`.text`

			`#include <asm/asm-offsets.h>`

			`#define a_offset 0`
			`#define b_offset 4`
			`#define c_offset 8`
			`#define d_offset 12`

			`/* Structure of the crypto context struct*/`

			`#define s0 0 /* S0 Array 256 Words each */`
			`#define s1 1024 /* S1 Array */`
			`#define s2 2048 /* S2 Array */`
			`#define s3 3072 /* S3 Array */`
			`#define w 4096 /* 8 whitening keys (word) */`
			`#define k 4128 /* key 1-32 ( word ) */`

			`/* define a few register aliases to allow macro substitution */`

			`#define R0 %rax`
			`#define R0D %eax`
			`#define R0B %al`
			`#define R0H %ah`

			`#define R1 %rbx`
			`#define R1D %ebx`
			`#define R1B %bl`
			`#define R1H %bh`

			`#define R2 %rcx`
			`#define R2D %ecx`
			`#define R2B %cl`
			`#define R2H %ch`

			`#define R3 %rdx`
			`#define R3D %edx`
			`#define R3B %dl`
			`#define R3H %dh`


			`/* performs input whitening */`
			`#define input_whitening(src,context,offset)\`
			`xor w+offset(context), src;`

			`/* performs input whitening */`
			`#define output_whitening(src,context,offset)\`
			`xor w+16+offset(context), src;`


			`/*`
			`* a input register containing a (rotated 16)`
			`* b input register containing b`
			`* c input register containing c`
			`* d input register containing d (already rol $1)`
			`* operations on a and b are interleaved to increase performance`
			`*/`
			`#define encrypt_round(a,b,c,d,round)\`
			`movzx b ## B, %edi;\`
			`mov s1(%r11,%rdi,4),%r8d;\`
			`movzx a ## B, %edi;\`
			`mov s2(%r11,%rdi,4),%r9d;\`
			`movzx b ## H, %edi;\`
			`ror $16, b ## D;\`
			`xor s2(%r11,%rdi,4),%r8d;\`
			`movzx a ## H, %edi;\`
			`ror $16, a ## D;\`
			`xor s3(%r11,%rdi,4),%r9d;\`
			`movzx b ## B, %edi;\`
			`xor s3(%r11,%rdi,4),%r8d;\`
			`movzx a ## B, %edi;\`
			`xor (%r11,%rdi,4), %r9d;\`
			`movzx b ## H, %edi;\`
			`ror $15, b ## D;\`
			`xor (%r11,%rdi,4), %r8d;\`
			`movzx a ## H, %edi;\`
			`xor s1(%r11,%rdi,4),%r9d;\`
			`add %r8d, %r9d;\`
			`add %r9d, %r8d;\`
			`add k+round(%r11), %r9d;\`
			`xor %r9d, c ## D;\`
			`rol $15, c ## D;\`
			`add k+4+round(%r11),%r8d;\`
			`xor %r8d, d ## D;`

			`/*`
			`* a input register containing a(rotated 16)`
			`* b input register containing b`
			`* c input register containing c`
			`* d input register containing d (already rol $1)`
			`* operations on a and b are interleaved to increase performance`
			`* during the round a and b are prepared for the output whitening`
			`*/`
			`#define encrypt_last_round(a,b,c,d,round)\`
			`mov b ## D, %r10d;\`
			`shl $32, %r10;\`
			`movzx b ## B, %edi;\`
			`mov s1(%r11,%rdi,4),%r8d;\`
			`movzx a ## B, %edi;\`
			`mov s2(%r11,%rdi,4),%r9d;\`
			`movzx b ## H, %edi;\`
			`ror $16, b ## D;\`
			`xor s2(%r11,%rdi,4),%r8d;\`
			`movzx a ## H, %edi;\`
			`ror $16, a ## D;\`
			`xor s3(%r11,%rdi,4),%r9d;\`
			`movzx b ## B, %edi;\`
			`xor s3(%r11,%rdi,4),%r8d;\`
			`movzx a ## B, %edi;\`
			`xor (%r11,%rdi,4), %r9d;\`
			`xor a, %r10;\`
			`movzx b ## H, %edi;\`
			`xor (%r11,%rdi,4), %r8d;\`
			`movzx a ## H, %edi;\`
			`xor s1(%r11,%rdi,4),%r9d;\`
			`add %r8d, %r9d;\`
			`add %r9d, %r8d;\`
			`add k+round(%r11), %r9d;\`
			`xor %r9d, c ## D;\`
			`ror $1, c ## D;\`
			`add k+4+round(%r11),%r8d;\`
			`xor %r8d, d ## D`

			`/*`
			`* a input register containing a`
			`* b input register containing b (rotated 16)`
			`* c input register containing c (already rol $1)`
			`* d input register containing d`
			`* operations on a and b are interleaved to increase performance`
			`*/`
			`#define decrypt_round(a,b,c,d,round)\`
			`movzx a ## B, %edi;\`
			`mov (%r11,%rdi,4), %r9d;\`
			`movzx b ## B, %edi;\`
			`mov s3(%r11,%rdi,4),%r8d;\`
			`movzx a ## H, %edi;\`
			`ror $16, a ## D;\`
			`xor s1(%r11,%rdi,4),%r9d;\`
			`movzx b ## H, %edi;\`
			`ror $16, b ## D;\`
			`xor (%r11,%rdi,4), %r8d;\`
			`movzx a ## B, %edi;\`
			`xor s2(%r11,%rdi,4),%r9d;\`
			`movzx b ## B, %edi;\`
			`xor s1(%r11,%rdi,4),%r8d;\`
			`movzx a ## H, %edi;\`
			`ror $15, a ## D;\`
			`xor s3(%r11,%rdi,4),%r9d;\`
			`movzx b ## H, %edi;\`
			`xor s2(%r11,%rdi,4),%r8d;\`
			`add %r8d, %r9d;\`
			`add %r9d, %r8d;\`
			`add k+round(%r11), %r9d;\`
			`xor %r9d, c ## D;\`
			`add k+4+round(%r11),%r8d;\`
			`xor %r8d, d ## D;\`
			`rol $15, d ## D;`

			`/*`
			`* a input register containing a`
			`* b input register containing b`
			`* c input register containing c (already rol $1)`
			`* d input register containing d`
			`* operations on a and b are interleaved to increase performance`
			`* during the round a and b are prepared for the output whitening`
			`*/`
			`#define decrypt_last_round(a,b,c,d,round)\`
			`movzx a ## B, %edi;\`
			`mov (%r11,%rdi,4), %r9d;\`
			`movzx b ## B, %edi;\`
			`mov s3(%r11,%rdi,4),%r8d;\`
			`movzx b ## H, %edi;\`
			`ror $16, b ## D;\`
			`xor (%r11,%rdi,4), %r8d;\`
			`movzx a ## H, %edi;\`
			`mov b ## D, %r10d;\`
			`shl $32, %r10;\`
			`xor a, %r10;\`
			`ror $16, a ## D;\`
			`xor s1(%r11,%rdi,4),%r9d;\`
			`movzx b ## B, %edi;\`
			`xor s1(%r11,%rdi,4),%r8d;\`
			`movzx a ## B, %edi;\`
			`xor s2(%r11,%rdi,4),%r9d;\`
			`movzx b ## H, %edi;\`
			`xor s2(%r11,%rdi,4),%r8d;\`
			`movzx a ## H, %edi;\`
			`xor s3(%r11,%rdi,4),%r9d;\`
			`add %r8d, %r9d;\`
			`add %r9d, %r8d;\`
			`add k+round(%r11), %r9d;\`
			`xor %r9d, c ## D;\`
			`add k+4+round(%r11),%r8d;\`
			`xor %r8d, d ## D;\`
			`ror $1, d ## D;`

			`.align 8`
			`.global twofish_enc_blk`
			`.global twofish_dec_blk`

			`twofish_enc_blk:`
			`pushq R1`

			`/* %rdi contains the crypto tfm adress */`
			`/* %rsi contains the output adress */`
			`/* %rdx contains the input adress */`
			`add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */`
			`/* ctx adress is moved to free one non-rex register`
			`as target for the 8bit high operations */`
			`mov %rdi, %r11`

			`movq (R3), R1`
			`movq 8(R3), R3`
			`input_whitening(R1,%r11,a_offset)`
			`input_whitening(R3,%r11,c_offset)`
			`mov R1D, R0D`
			`rol $16, R0D`
			`shr $32, R1`
			`mov R3D, R2D`
			`shr $32, R3`
			`rol $1, R3D`

			`encrypt_round(R0,R1,R2,R3,0);`
			`encrypt_round(R2,R3,R0,R1,8);`
			`encrypt_round(R0,R1,R2,R3,2*8);`
			`encrypt_round(R2,R3,R0,R1,3*8);`
			`encrypt_round(R0,R1,R2,R3,4*8);`
			`encrypt_round(R2,R3,R0,R1,5*8);`
			`encrypt_round(R0,R1,R2,R3,6*8);`
			`encrypt_round(R2,R3,R0,R1,7*8);`
			`encrypt_round(R0,R1,R2,R3,8*8);`
			`encrypt_round(R2,R3,R0,R1,9*8);`
			`encrypt_round(R0,R1,R2,R3,10*8);`
			`encrypt_round(R2,R3,R0,R1,11*8);`
			`encrypt_round(R0,R1,R2,R3,12*8);`
			`encrypt_round(R2,R3,R0,R1,13*8);`
			`encrypt_round(R0,R1,R2,R3,14*8);`
			`encrypt_last_round(R2,R3,R0,R1,15*8);`


			`output_whitening(%r10,%r11,a_offset)`
			`movq %r10, (%rsi)`

			`shl $32, R1`
			`xor R0, R1`

			`output_whitening(R1,%r11,c_offset)`
			`movq R1, 8(%rsi)`

			`popq R1`
			`movq $1,%rax`
			`ret`

			`twofish_dec_blk:`
			`pushq R1`

			`/* %rdi contains the crypto tfm adress */`
			`/* %rsi contains the output adress */`
			`/* %rdx contains the input adress */`
			`add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */`
			`/* ctx adress is moved to free one non-rex register`
			`as target for the 8bit high operations */`
			`mov %rdi, %r11`

			`movq (R3), R1`
			`movq 8(R3), R3`
			`output_whitening(R1,%r11,a_offset)`
			`output_whitening(R3,%r11,c_offset)`
			`mov R1D, R0D`
			`shr $32, R1`
			`rol $16, R1D`
			`mov R3D, R2D`
			`shr $32, R3`
			`rol $1, R2D`

			`decrypt_round(R0,R1,R2,R3,15*8);`
			`decrypt_round(R2,R3,R0,R1,14*8);`
			`decrypt_round(R0,R1,R2,R3,13*8);`
			`decrypt_round(R2,R3,R0,R1,12*8);`
			`decrypt_round(R0,R1,R2,R3,11*8);`
			`decrypt_round(R2,R3,R0,R1,10*8);`
			`decrypt_round(R0,R1,R2,R3,9*8);`
			`decrypt_round(R2,R3,R0,R1,8*8);`
			`decrypt_round(R0,R1,R2,R3,7*8);`
			`decrypt_round(R2,R3,R0,R1,6*8);`
			`decrypt_round(R0,R1,R2,R3,5*8);`
			`decrypt_round(R2,R3,R0,R1,4*8);`
			`decrypt_round(R0,R1,R2,R3,3*8);`
			`decrypt_round(R2,R3,R0,R1,2*8);`
			`decrypt_round(R0,R1,R2,R3,1*8);`
			`decrypt_last_round(R2,R3,R0,R1,0);`

			`input_whitening(%r10,%r11,a_offset)`
			`movq %r10, (%rsi)`

			`shl $32, R1`
			`xor R0, R1`

			`input_whitening(R1,%r11,c_offset)`
			`movq R1, 8(%rsi)`

			`popq R1`
			`movq $1,%rax`
			`ret`