#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 using the hardware SM4 instructions on aarch64
# Oct 2021
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator relative to this script: first in the
# script's own directory, then in ../../perlasm/ below it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped into arm-xlate.pl, which
# is started with the current perl binary and given the flavour and the
# output file name as arguments.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Common prefix of every symbol emitted below.
$prefix="sm4_v8";
# v0-v7: the registers the en/decryption routines load the round keys into.
my @rks=map("v$_",(0..7));

# rev32($dst, $src)
#
# Appends a "rev32 $dst.16b,$src.16b" instruction to $code, wrapped in
# "#ifndef __ARMEB__" so that the byte swap inside each 32-bit word is only
# assembled when __ARMEB__ is not defined.
#
#   $dst, $src - vector register names without arrangement, e.g. "v16"
#
# The sub is declared without a prototype: it takes two arguments, so the
# former empty prototype "()" was misleading and only worked because every
# caller used the prototype-bypassing "&rev32(...)" form.
sub rev32 {
    my ($dst, $src) = @_;
    $code .= <<___;
#ifndef __ARMEB__
	rev32	$dst.16b,$src.16b
#endif
___
}

# enc_blk($data)
#
# Appends the single-block SM4 transform for vector register $data to $code:
# one "sm4e" per round-key register in @rks[0..7], followed by rev64 + ext #8,
# which together reverse the order of the four 32-bit words of $data.
#
#   $data - vector register name without arrangement, e.g. "v16"
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was misleading and relied on "&enc_blk(...)" calls.
sub enc_blk {
    my ($data) = @_;

    # Apply the round-key registers strictly in order 0..7.
    foreach my $rk (@rks[0..7]) {
        $code .= "\tsm4e\t${data}.4s,${rk}.4s\n";
    }
    $code .= <<___;
	rev64	$data.4S,$data.4S
	ext	$data.16b,$data.16b,$data.16b,#8
___
}

# enc_4blks($data0, $data1, $data2, $data3)
#
# Appends the SM4 transform for four independent blocks to $code.  For each
# of the round-key registers @rks[0..6] the four "sm4e" instructions (one per
# data register) are emitted as a group followed by an empty line.  For
# @rks[7] the "sm4e" of each block is interleaved with the rev64 + ext #8
# word reversal of the previous block.
#
#   $data0..$data3 - vector register names without arrangement, e.g. "v16"
#
# Declared without a prototype: the sub takes four arguments, so the former
# empty prototype "()" was misleading and relied on "&enc_4blks(...)" calls.
sub enc_4blks {
    my ($data0, $data1, $data2, $data3) = @_;

    # Round-key registers 0..6: identical groups, generated instead of
    # being spelled out seven times.
    foreach my $rk (@rks[0..6]) {
        foreach my $data ($data0, $data1, $data2, $data3) {
            $code .= "\tsm4e\t${data}.4s,${rk}.4s\n";
        }
        $code .= "\n";
    }

    # Last round-key register: keep the exact hand-scheduled interleaving.
    $code .= <<___;
	sm4e	$data0.4s,$rks[7].4s
	rev64	$data0.4S,$data0.4S
	sm4e	$data1.4s,$rks[7].4s
	ext	$data0.16b,$data0.16b,$data0.16b,#8
	rev64	$data1.4S,$data1.4S
	sm4e	$data2.4s,$rks[7].4s
	ext	$data1.16b,$data1.16b,$data1.16b,#8
	rev64	$data2.4S,$data2.4S
	sm4e	$data3.4s,$rks[7].4s
	ext	$data2.16b,$data2.16b,$data2.16b,#8
	rev64	$data3.4S,$data3.4S
	ext	$data3.16b,$data3.16b,$data3.16b,#8
___
}

# Start the generated assembly: include arm_arch.h, tell the assembler to
# accept armv8-a+crypto, and switch to the text section.
$code=<<___;
#include "arm_arch.h"
.arch	armv8-a+crypto
.text
___

# Constant tables referenced by the key-setup routines below:
#   .Lck - 32 words, loaded into the registers used as the third operand
#          of sm4ekey
#   .Lfk - 4 words, XORed into the user key before the first sm4ekey
{{{
$code.=<<___;
.align	6
.Lck:
	.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
	.long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
___
}}}

{{{
# Emit ${prefix}_set_encrypt_key.
#
# Register use in the generated code:
#   x0 ($key)   - address the 128-bit user key is loaded from
#   x1 ($keys)  - address the eight round-key vectors (128 bytes) are stored to
#   x2 ($tmp)   - scratch pointer into the .Lfk / .Lck tables
#   v0-v7       - round keys $key0..$key7
#   v16-v23     - the eight .Lck constant vectors
#   v24         - the .Lfk constant vector
my ($key,$keys)=("x0","x1");
my ($tmp)=("x2");
my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
my ($fkconst) = ("v24");
$code.=<<___;
.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$key0.4s},[$key]
	adr	$tmp,.Lfk
	ld1	{$fkconst.4s},[$tmp]
	adr	$tmp,.Lck
	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
___
	# Byte-swap each word of the user key unless __ARMEB__ is defined.
	&rev32($key0, $key0);
# XOR in .Lfk, then derive each round-key vector from the previous one with
# sm4ekey; the keys are stored in generation order ($key0 first).
$code.=<<___;
	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
	eor	$key0.16b,$key0.16b,$fkconst.16b;
	sm4ekey	$key0.4S,$key0.4S,$const0.4S
	sm4ekey	$key1.4S,$key0.4S,$const1.4S
	sm4ekey	$key2.4S,$key1.4S,$const2.4S
	sm4ekey	$key3.4S,$key2.4S,$const3.4S
	sm4ekey	$key4.4S,$key3.4S,$const4.4S
	st1	{$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
	sm4ekey	$key5.4S,$key4.4S,$const5.4S
	sm4ekey	$key6.4S,$key5.4S,$const6.4S
	sm4ekey	$key7.4S,$key6.4S,$const7.4S
	st1	{$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
# Emit ${prefix}_set_decrypt_key.
#
# Same register roles as the encrypt-key routine (x0 user key, x1 output,
# x2 scratch, v16-v23 .Lck, v24 .Lfk), but note the reversed mapping of the
# key names: $key7 is v0 ... $key0 is v7.  The round keys are generated in
# the order $key0..$key7, the four words of each vector are reversed with
# rev64 + ext #8, and the final st1 pair writes v0..v7, i.e. $key7 first and
# $key0 last - the reverse of the generation order.
my ($key,$keys)=("x0","x1");
my ($tmp)=("x2");
my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
my ($fkconst) = ("v24");
$code.=<<___;
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
	AARCH64_VALID_CALL_TARGET
	ld1	{$key0.4s},[$key]
	adr	$tmp,.Lfk
	ld1	{$fkconst.4s},[$tmp]
	adr	$tmp, .Lck
	ld1	{$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
___
	# Byte-swap each word of the user key unless __ARMEB__ is defined.
	&rev32($key0, $key0);
# Each vector is word-reversed only after the sm4ekey that consumes it as
# input has been issued.
$code.=<<___;
	ld1	{$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
	eor	$key0.16b, $key0.16b,$fkconst.16b;
	sm4ekey	$key0.4S,$key0.4S,$const0.4S
	sm4ekey	$key1.4S,$key0.4S,$const1.4S
	sm4ekey	$key2.4S,$key1.4S,$const2.4S
	rev64	$key0.4s,$key0.4s
	rev64	$key1.4s,$key1.4s
	ext	$key0.16b,$key0.16b,$key0.16b,#8
	ext	$key1.16b,$key1.16b,$key1.16b,#8
	sm4ekey	$key3.4S,$key2.4S,$const3.4S
	sm4ekey	$key4.4S,$key3.4S,$const4.4S
	rev64	$key2.4s,$key2.4s
	rev64	$key3.4s,$key3.4s
	ext	$key2.16b,$key2.16b,$key2.16b,#8
	ext	$key3.16b,$key3.16b,$key3.16b,#8
	sm4ekey	$key5.4S,$key4.4S,$const5.4S
	sm4ekey	$key6.4S,$key5.4S,$const6.4S
	rev64	$key4.4s,$key4.4s
	rev64	$key5.4s,$key5.4s
	ext	$key4.16b,$key4.16b,$key4.16b,#8
	ext	$key5.16b,$key5.16b,$key5.16b,#8
	sm4ekey	$key7.4S,$key6.4S,$const7.4S
	rev64	$key6.4s, $key6.4s
	rev64	$key7.4s, $key7.4s
	ext	$key6.16b,$key6.16b,$key6.16b,#8
	ext	$key7.16b,$key7.16b,$key7.16b,#8
	st1	{$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
	st1	{$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
	ret
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

{{{
# gen_block($which)
#
# Emits the single-block routine ${prefix}_${which}crypt, where $which is
# "en" or "de".  Both routines have the same body; they differ only in name.
# Generated code: load one block from [x0] and the eight round-key vectors
# from [x2], byte-swap (unless __ARMEB__), run enc_blk, byte-swap back and
# store the block to [x1].
#
# Declared without a prototype (the sub takes one argument, so the former
# "()" was misleading), and the argument is no longer called $dir so that it
# does not shadow the file-level $dir used to locate arm-xlate.pl.
sub gen_block {
    my ($which) = @_;
    my ($inp,$out,$rk)=map("x$_",(0..2));
    my ($data)=("v16");
    $code.=<<___;
.globl	${prefix}_${which}crypt
.type	${prefix}_${which}crypt,%function
.align	5
${prefix}_${which}crypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{$data.4s},[$inp]
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
___
    &rev32($data,$data);
    &enc_blk($data);
    &rev32($data,$data);
    $code.=<<___;
	st1	{$data.4s},[$out]
	ret
.size	${prefix}_${which}crypt,.-${prefix}_${which}crypt
___
}

&gen_block("en");
&gen_block("de");
}}}

{{{
# Emit ${prefix}_ecb_encrypt.
#
# Register use in the generated code: x0 input, x1 output, x2 length in
# bytes (compared against 64/128 and reduced by 16/64/128), x3 round keys.
# w4 is named here as $enc but is not referenced by the generated code.
# Data blocks live in v16-v23.  The routine handles 8 blocks per iteration
# while at least 128 bytes remain, then 4 blocks, then single blocks.
my ($inp,$out,$len,$rk)=("x0","x1","x2","x3");
my ($enc) = ("w4");
my @dat=map { "v$_" } (16..23);
my $single=$dat[0];
$code.=<<___;
.globl	${prefix}_ecb_encrypt
.type	${prefix}_ecb_encrypt,%function
.align	5
${prefix}_ecb_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
1:
	cmp	$len,#64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
	cmp	$len,#128
	b.lt	2f
	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
	// 8 blocks
___
# 8-block path: swap all eight inputs, transform two groups of four, then
# swap the results back (the last two after the first store is emitted).
foreach my $reg (@dat[0..7]) {
	&rev32($reg,$reg);
}
&enc_4blks(@dat[0..3]);
&enc_4blks(@dat[4..7]);
foreach my $reg (@dat[0..5]) {
	&rev32($reg,$reg);
}
$code.=<<___;
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
___
foreach my $reg (@dat[6,7]) {
	&rev32($reg,$reg);
}
$code.=<<___;
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,#128
	b.gt	1b
	ret
	// 4 blocks
2:
___
# 4-block path.
foreach my $reg (@dat[0..3]) {
	&rev32($reg,$reg);
}
&enc_4blks(@dat[0..3]);
foreach my $reg (@dat[0..3]) {
	&rev32($reg,$reg);
}
$code.=<<___;
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.gt	1b
1:
	subs	$len,$len,#16
	b.lt	1f
	ld1	{@dat[0].4s},[$inp],#16
___
# Single-block tail.
&rev32($single,$single);
&enc_blk($single);
&rev32($single,$single);
$code.=<<___;
	st1	{@dat[0].4s},[$out],#16
	b.ne	1b
1:
	ret
.size	${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

{{{
# Emit ${prefix}_cbc_encrypt.
#
# Register use in the generated code:
#   x0 input, x1 output, x2 length in bytes, x3 round keys, x4 IV pointer,
#   w5 direction flag (0 branches to the .Ldec decryption path).
#   v16-v23 (@dat) hold the blocks being transformed, v24-v31 (@in) keep a
#   copy of the ciphertext blocks during decryption, v8 ($ivec) is the
#   running IV.  d8/d9 are saved on the stack at entry and restored before
#   returning, and the final IV is stored back through x4.
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($enc) = ("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec) = ("v8");
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	stp	d8,d9,[sp, #-16]!

	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
	ld1	{$ivec.4s},[$ivp]
	cmp	$enc,#0
	b.eq	.Ldec
1:
	cmp	$len, #64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
___
	# Encryption, 4 blocks per iteration.  The blocks are chained, so each
	# one is transformed with enc_blk only after the previous result has
	# been XORed in; the XOR is done on the byte-swapped form, and each
	# result is swapped back after it has been consumed by the next block.
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
	&enc_blk(@dat[0]);
$code.=<<___;
	eor	@dat[1].16b,@dat[1].16b,@dat[0].16b
___
	&enc_blk(@dat[1]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	@dat[2].16b,@dat[2].16b,@dat[1].16b
___
	&enc_blk(@dat[2]);
	&rev32(@dat[1],@dat[1]);
$code.=<<___;
	eor	@dat[3].16b,@dat[3].16b,@dat[2].16b
___
	&enc_blk(@dat[3]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
$code.=<<___;
	mov	$ivec.16b,@dat[3].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.ne	1b
1:
	subs	$len,$len,#16
	b.lt	3f
	ld1	{@dat[0].4s},[$inp],#16
	eor	$ivec.16b,$ivec.16b,@dat[0].16b
___
	# Encryption, single-block tail: the IV register itself is transformed
	# and stored, so it already holds the next IV.
	&rev32($ivec,$ivec);
	&enc_blk($ivec);
	&rev32($ivec,$ivec);
$code.=<<___;
	st1	{$ivec.16b},[$out],#16
	b.ne	1b
	b	3f
.Ldec:
1:
	cmp	$len, #64
	b.lt	1f
	ld1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
	cmp	$len,#128
	b.lt	2f
	// 8 blocks mode
	ld1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
___
	# Decryption, 8 blocks: the same input is loaded twice (@dat to be
	# transformed, @in as the untouched ciphertext used for chaining).
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],$dat[3]);
	&rev32(@dat[4],@dat[4]);
	&rev32(@dat[5],@dat[5]);
	&rev32(@dat[6],@dat[6]);
	&rev32(@dat[7],$dat[7]);
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
	&rev32(@dat[4],@dat[4]);
	&rev32(@dat[5],@dat[5]);
	&rev32(@dat[6],@dat[6]);
	&rev32(@dat[7],@dat[7]);
# XOR block 0 with the IV and block i with ciphertext i-1; the last
# ciphertext block becomes the new IV.
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
	mov	$ivec.16b,@in[7].16b
	eor	@dat[3].16b,$dat[3].16b,@in[2].16b
	eor	@dat[4].16b,$dat[4].16b,@in[3].16b
	eor	@dat[5].16b,$dat[5].16b,@in[4].16b
	eor	@dat[6].16b,$dat[6].16b,@in[5].16b
	eor	@dat[7].16b,$dat[7].16b,@in[6].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,128
	b.gt	1b
	b	3f
	// 4 blocks mode
2:
___
	# Decryption, 4 blocks.
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],$dat[3]);
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	eor	@dat[1].16b,@dat[1].16b,@in[0].16b
	mov	$ivec.16b,@in[3].16b
	eor	@dat[2].16b,@dat[2].16b,@in[1].16b
	eor	@dat[3].16b,$dat[3].16b,@in[2].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#64
	b.gt	1b
1:
	subs	$len,$len,#16
	b.lt	3f
	ld1	{@dat[0].4s},[$inp],#16
	mov	@in[0].16b,@dat[0].16b
___
	# Decryption, single-block tail: the ciphertext copy in @in[0] becomes
	# the next IV after the block has been XORed with the current one.
	&rev32(@dat[0],@dat[0]);
	&enc_blk(@dat[0]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,$ivec.16b
	mov	$ivec.16b,@in[0].16b
	st1	{@dat[0].16b},[$out],#16
	b.ne	1b
3:
	// save back IV
	st1	{$ivec.16b},[$ivp]
	ldp	d8,d9,[sp],#16
	ret
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
# Emit ${prefix}_ctr32_encrypt_blocks.
#
# Register use in the generated code:
#   x0 input, x1 output, x2 length counted in 16-byte blocks (compared
#   against 4/8 and reduced by 1/4/8), x3 round keys, x4 IV pointer.
#   v8 ($ivec) holds the counter block, byte-swapped once at entry (unless
#   __ARMEB__); w5 ($ctr) holds its lane 3 and is incremented by 1 per block.
#   v16-v23 (@dat) receive the counter blocks, v24-v31 (@in) the input data.
#   d8/d9 are saved at entry and restored before returning.  Nothing is
#   stored back through x4.
my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
my ($ctr)=("w5");
my @dat=map("v$_",(16..23));
my @in=map("v$_",(24..31));
my ($ivec)=("v8");
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	stp	d8,d9,[sp, #-16]!

	ld1	{$ivec.4s},[$ivp]
	ld1	{@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
	ld1	{@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
___
	# The counter block is swapped here once; the copies made from it
	# below are therefore passed to enc_blk/enc_4blks without a further
	# rev32, and only the results are swapped back.
	&rev32($ivec,$ivec);
$code.=<<___;
	mov	$ctr,$ivec.s[3]
1:
	cmp	$len,#4
	b.lt	1f
	ld1	{@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
	mov	@dat[0].16b,$ivec.16b
	mov	@dat[1].16b,$ivec.16b
	mov	@dat[2].16b,$ivec.16b
	mov	@dat[3].16b,$ivec.16b
	add	$ctr,$ctr,#1
	mov	$dat[1].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[2].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[3].s[3],$ctr
	cmp	$len,#8
	b.lt	2f
	ld1	{@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
	mov	@dat[4].16b,$ivec.16b
	mov	@dat[5].16b,$ivec.16b
	mov	@dat[6].16b,$ivec.16b
	mov	@dat[7].16b,$ivec.16b
	add	$ctr,$ctr,#1
	mov	$dat[4].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[5].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[6].s[3],$ctr
	add	$ctr,$ctr,#1
	mov	@dat[7].s[3],$ctr
___
	# 8-block path: transform the eight counter blocks, swap the results
	# back and XOR them with the input.
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
	&rev32(@dat[4],@dat[4]);
	&rev32(@dat[5],@dat[5]);
	&rev32(@dat[6],@dat[6]);
	&rev32(@dat[7],@dat[7]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
	eor	@dat[4].16b,@dat[4].16b,@in[4].16b
	eor	@dat[5].16b,@dat[5].16b,@in[5].16b
	eor	@dat[6].16b,@dat[6].16b,@in[6].16b
	eor	@dat[7].16b,@dat[7].16b,@in[7].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	st1	{@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
	subs	$len,$len,#8
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
2:
___
	# 4-block path.
	&enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
	&rev32(@dat[0],@dat[0]);
	&rev32(@dat[1],@dat[1]);
	&rev32(@dat[2],@dat[2]);
	&rev32(@dat[3],@dat[3]);
$code.=<<___;
	eor	@dat[0].16b,@dat[0].16b,@in[0].16b
	eor	@dat[1].16b,@dat[1].16b,@in[1].16b
	eor	@dat[2].16b,@dat[2].16b,@in[2].16b
	eor	@dat[3].16b,@dat[3].16b,@in[3].16b
	st1	{@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
	subs	$len,$len,#4
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
1:
	subs	$len,$len,#1
	b.lt	3f
	mov	$dat[0].16b,$ivec.16b
	ld1	{@in[0].4s},[$inp],#16
___
	# Single-block tail.
	&enc_blk(@dat[0]);
	&rev32(@dat[0],@dat[0]);
$code.=<<___;
	eor	$dat[0].16b,$dat[0].16b,@in[0].16b
	st1	{$dat[0].4s},[$out],#16
	b.eq	3f
	add	$ctr,$ctr,#1
	mov	$ivec.s[3],$ctr
	b	1b
3:
	ldp	d8,d9,[sp],#16
	ret
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
########################################
# unsm4($mnemonic, $arg)
#
# Encodes an "sm4e" or "sm4ekey" instruction as a raw ".inst" word so that
# the output does not depend on the assembler knowing these mnemonics.
#
#   $mnemonic - "sm4e" or "sm4ekey"
#   $arg      - operand text, e.g. "v16.4s,v0.4s" or "v1.4S,v0.4S,v17.4S"
#
# The word is: base opcode | first register | second register << 5 |
# third register << 16 (0 when there is no third operand).  The original
# instruction text is appended as a "//" comment.
#
# Returns the ".inst" line.  If the mnemonic is not in the opcode table or
# the operands cannot be parsed, the instruction is returned unchanged
# ("$mnemonic\t$arg") so that it reaches the assembler, instead of being
# replaced by an empty string or encoded with opcode 0.
{   my  %opcode = (
        "sm4e"          => 0xcec08400,
        "sm4ekey"       => 0xce60c800);

    sub unsm4 {
        my ($mnemonic,$arg)=@_;

        return "$mnemonic\t$arg" unless exists $opcode{$mnemonic};

        # List assignment in scalar context is 0 when the match fails.
        my ($rd,$rn,$rm) =
            $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/
            or return "$mnemonic\t$arg";

        # Two-operand form: the third register field stays 0.
        $rm = 0 unless defined $rm;

        return sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
                        $mnemonic,$arg;
    }
}

# Copy this script's leading comment block into the generated output,
# rewriting the leading "#" of each line to "//".  The "#!" line is skipped,
# empty lines are copied as they are, and copying stops at the first line
# that is neither a comment nor empty.
#
# The script is opened with the three-argument form so that its path is
# never interpreted as a mode or a command.  A failure to open it is
# reported but not fatal, since only the header comment would be missing.
if (open(my $self, '<', $0)) {
        while(<$self>) {
                next if (/^#!/);
                last if (!s/^#/\/\// and !/^$/);
                print;
        }
        close $self;
} else {
        warn "can't open $0 to copy the header comment: $!";
}

# Post-process the accumulated assembly one line at a time and print it.
# STDOUT was aliased to the arm-xlate.pl pipe at the top of the script.
foreach(split("\n",$code)) {
	# Replace every `...` span with the result of evaluating it as Perl.
	s/\`([^\`]*)\`/eval($1)/ge;

	# Replace each sm4* instruction whose operands start with a q/v
	# register by whatever unsm4() returns for it.
	s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
	print $_,"\n";
}

# Closing the pipe is checked so that a failure reported at close time is
# not lost.
close STDOUT or die "error closing STDOUT: $!";
