vcpkg/ports/openssl/asm-comments.patch
Kai Pastor e2f6384a79
[openssl] Update to 3.2.0 (#35734)
Co-authored-by: مهدي شينون (Mehdi Chinoune) <mehdi.chinoune@hotmail.com>
2023-12-19 16:31:02 -08:00

705 lines
16 KiB
Diff

diff --git a/crypto/ec/asm/ecp_sm2p256-armv8.pl b/crypto/ec/asm/ecp_sm2p256-armv8.pl
index 5095086..62fadc0 100644
--- a/crypto/ec/asm/ecp_sm2p256-armv8.pl
+++ b/crypto/ec/asm/ecp_sm2p256-armv8.pl
@@ -28,44 +28,44 @@ my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..19));
sub bn_mod_add() {
my $mod = shift;
$code.=<<___;
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Addition
+ // Addition
adds $s0,$s0,$s4
adcs $s1,$s1,$s5
adcs $s2,$s2,$s6
adcs $s3,$s3,$s7
adc $t4,xzr,xzr
- # Load polynomial
+ // Load polynomial
adr x2,$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Backup Addition
+ // Backup Addition
mov $t0,$s0
mov $t1,$s1
mov $t2,$s2
mov $t3,$s3
- # Sub polynomial
+ // Sub polynomial
subs $t0,$t0,$s4
sbcs $t1,$t1,$s5
sbcs $t2,$t2,$s6
sbcs $t3,$t3,$s7
sbcs $t4,$t4,xzr
- # Select based on carry
+ // Select based on carry
csel $s0,$s0,$t0,cc
csel $s1,$s1,$t1,cc
csel $s2,$s2,$t2,cc
csel $s3,$s3,$t3,cc
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
___
@@ -74,44 +74,44 @@ ___
sub bn_mod_sub() {
my $mod = shift;
$code.=<<___;
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Subtraction
+ // Subtraction
subs $s0,$s0,$s4
sbcs $s1,$s1,$s5
sbcs $s2,$s2,$s6
sbcs $s3,$s3,$s7
sbc $t4,xzr,xzr
- # Load polynomial
+ // Load polynomial
adr x2,$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Backup subtraction
+ // Backup subtraction
mov $t0,$s0
mov $t1,$s1
mov $t2,$s2
mov $t3,$s3
- # Add polynomial
+ // Add polynomial
adds $t0,$t0,$s4
adcs $t1,$t1,$s5
adcs $t2,$t2,$s6
adcs $t3,$t3,$s7
tst $t4,$t4
- # Select based on carry
+ // Select based on carry
csel $s0,$s0,$t0,eq
csel $s1,$s1,$t1,eq
csel $s2,$s2,$t2,eq
csel $s3,$s3,$t3,eq
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
___
@@ -120,38 +120,38 @@ ___
sub bn_mod_div_by_2() {
my $mod = shift;
$code.=<<___;
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
- # Save the least significant bit
+ // Save the least significant bit
mov $t0,$s0
- # Right shift 1
+ // Right shift 1
extr $s0,$s1,$s0,#1
extr $s1,$s2,$s1,#1
extr $s2,$s3,$s2,#1
lsr $s3,$s3,#1
- # Load mod
+ // Load mod
adr x2,$mod
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Parity check
+ // Parity check
tst $t0,#1
csel $s4,xzr,$s4,eq
csel $s5,xzr,$s5,eq
csel $s6,xzr,$s6,eq
csel $s7,xzr,$s7,eq
- # Add
+ // Add
adds $s0,$s0,$s4
adcs $s1,$s1,$s5
adcs $s2,$s2,$s6
adc $s3,$s3,$s7
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
___
@@ -183,17 +183,17 @@ $code.=<<___;
.align 5
bn_rshift1:
AARCH64_VALID_CALL_TARGET
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x0]
ldp $s2,$s3,[x0,#16]
- # Right shift
+ // Right shift
extr $s0,$s1,$s0,#1
extr $s1,$s2,$s1,#1
extr $s2,$s3,$s2,#1
lsr $s3,$s3,#1
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
@@ -206,19 +206,19 @@ bn_rshift1:
.align 5
bn_sub:
AARCH64_VALID_CALL_TARGET
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
- # Subtraction
+ // Subtraction
subs $s0,$s0,$s4
sbcs $s1,$s1,$s5
sbcs $s2,$s2,$s6
sbc $s3,$s3,$s7
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
@@ -255,11 +255,11 @@ $code.=<<___;
.align 5
ecp_sm2p256_mul_by_3:
AARCH64_VALID_CALL_TARGET
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
- # 2*a
+ // 2*a
adds $s0,$s0,$s0
adcs $s1,$s1,$s1
adcs $s2,$s2,$s2
@@ -271,7 +271,7 @@ ecp_sm2p256_mul_by_3:
mov $t2,$s2
mov $t3,$s3
- # Sub polynomial
+ // Sub polynomial
adr x2,.Lpoly
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
@@ -287,7 +287,7 @@ ecp_sm2p256_mul_by_3:
csel $s3,$s3,$t3,cs
eor $t4,$t4,$t4
- # 3*a
+ // 3*a
ldp $s4,$s5,[x1]
ldp $s6,$s7,[x1,#16]
adds $s0,$s0,$s4
@@ -301,7 +301,7 @@ ecp_sm2p256_mul_by_3:
mov $t2,$s2
mov $t3,$s3
- # Sub polynomial
+ // Sub polynomial
adr x2,.Lpoly
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
@@ -316,7 +316,7 @@ ecp_sm2p256_mul_by_3:
csel $s2,$s2,$t2,cs
csel $s3,$s3,$t3,cs
- # Store results
+ // Store results
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
@@ -360,45 +360,45 @@ $code.=<<___;
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord
.macro RDC
- # a = | s7 | ... | s0 |, where si are 64-bit quantities
- # = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
- # | s7 | s6 | s5 | s4 |
- # | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
- # | s3 | s2 | s1 | s0 |
- # | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
- # =================================================
- # | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
- # | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
- # | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
- # | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
- # | a12 | 0 | s7 | a13 | 0 | s6 | (+)
- # | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
- # | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
- # | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
- # | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
- # | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
- # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
- # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
- # | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
- # | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
- # | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
- # | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
- # | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
- # | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
- # | V[3] | V[2] | V[1] | V[0] |
-
- # 1. 64-bit addition
- # t2=s6+s7+s7
+ // a = | s7 | ... | s0 |, where si are 64-bit quantities
+ // = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
+ // | s7 | s6 | s5 | s4 |
+ // | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 |
+ // | s3 | s2 | s1 | s0 |
+ // | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 |
+ // =================================================
+ // | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+)
+ // | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+)
+ // | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+)
+ // | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+)
+ // | a12 | 0 | s7 | a13 | 0 | s6 | (+)
+ // | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+)
+ // | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+)
+ // | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+)
+ // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+)
+ // | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-)
+ // | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-)
+ // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
+ // | V[3] | V[2] | V[1] | V[0] |
+
+ // 1. 64-bit addition
+ // t2=s6+s7+s7
adds $t2,$s6,$s7
adcs $t1,xzr,xzr
adds $t2,$t2,$s7
adcs $t1,$t1,xzr
- # t3=s4+s5+t2
+ // t3=s4+s5+t2
adds $t3,$s4,$t2
adcs $t4,$t1,xzr
adds $t3,$t3,$s5
adcs $t4,$t4,xzr
- # sum
+ // sum
adds $s0,$s0,$t3
adcs $s1,$s1,$t4
adcs $s2,$s2,$t2
@@ -410,7 +410,7 @@ $code.=<<___;
stp $s0,$s1,[sp,#32]
stp $s2,$s3,[sp,#48]
- # 2. 64-bit to 32-bit spread
+ // 2. 64-bit to 32-bit spread
mov $t1,#0xffffffff
mov $s0,$s4
mov $s1,$s5
@@ -425,7 +425,7 @@ $code.=<<___;
lsr $s6,$s6,#32 // a13
lsr $s7,$s7,#32 // a15
- # 3. 32-bit addition
+ // 3. 32-bit addition
add $t1,$a14,$a12 // t1 <- a12 + a14
add $t2,$a15,$a13 // t2 <- a13 + a15
add $t3,$a8,$a9 // t3 <- a8 + a9
@@ -446,53 +446,53 @@ $code.=<<___;
add $a11,$a11,$t2 // a11 <- a9 + a11 + 2*(a13 + a15)
add $t1,$t1,$t4 // t1 <- a10 + a12 + 2*a14
- # U[0] s5 a9 + a11 + 2*(a13 + a15)
- # U[1] t1 a10 + a12 + 2*a14
- # U[2] -t3 a8 + a9 + a13 + a14
- # U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15
- # U[4] s4 a9 + a13 + a15
- # U[5] t4 a10 + a14
- # U[6] s7 a11 + a15
- # U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
+ // U[0] s5 a9 + a11 + 2*(a13 + a15)
+ // U[1] t1 a10 + a12 + 2*a14
+ // U[2] -t3 a8 + a9 + a13 + a14
+ // U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15
+ // U[4] s4 a9 + a13 + a15
+ // U[5] t4 a10 + a14
+ // U[6] s7 a11 + a15
+ // U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
- # 4. 32-bit to 64-bit
+ // 4. 32-bit to 64-bit
lsl $s0,$t1,#32
extr $t1,$s2,$t1,#32
extr $s2,$t4,$s2,#32
extr $t4,$s1,$t4,#32
lsr $s1,$s1,#32
- # 5. 64-bit addition
+ // 5. 64-bit addition
adds $s5,$s5,$s0
adcs $t1,$t1,xzr
adcs $s4,$s4,$s2
adcs $s7,$s7,$t4
adcs $t0,$t0,$s1
- # V[0] s5
- # V[1] t1
- # V[2] s4
- # V[3] s7
- # carry t0
- # sub t3
+ // V[0] s5
+ // V[1] t1
+ // V[2] s4
+ // V[3] s7
+ // carry t0
+ // sub t3
- # 5. Process s0-s3
+ // 5. Process s0-s3
ldp $s0,$s1,[sp,#32]
ldp $s2,$s3,[sp,#48]
- # add with V0-V3
+ // add with V0-V3
adds $s0,$s0,$s5
adcs $s1,$s1,$t1
adcs $s2,$s2,$s4
adcs $s3,$s3,$s7
adcs $t0,$t0,xzr
- # sub with t3
+ // sub with t3
subs $s1,$s1,$t3
sbcs $s2,$s2,xzr
sbcs $s3,$s3,xzr
sbcs $t0,$t0,xzr
- # 6. MOD
- # First Mod
+ // 6. MOD
+ // First Mod
lsl $t1,$t0,#32
subs $t2,$t1,$t0
@@ -501,8 +501,8 @@ $code.=<<___;
adcs $s2,$s2,xzr
adcs $s3,$s3,$t1
- # Last Mod
- # return y - p if y > p else y
+ // Last Mod
+ // return y - p if y > p else y
mov $s4,$s0
mov $s5,$s1
mov $s6,$s2
@@ -533,44 +533,44 @@ $code.=<<___;
.align 5
ecp_sm2p256_mul:
AARCH64_SIGN_LINK_REGISTER
- # Store scalar registers
+ // Store scalar registers
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp x16,x17,[sp,#16]
stp x18,x19,[sp,#64]
- # Load inputs
+ // Load inputs
ldp $s0,$s1,[x1]
ldp $s2,$s3,[x1,#16]
ldp $s4,$s5,[x2]
ldp $s6,$s7,[x2,#16]
-### multiplication ###
- # ========================
- # s3 s2 s1 s0
- # * s7 s6 s5 s4
- # ------------------------
- # + s0 s0 s0 s0
- # * * * *
- # s7 s6 s5 s4
- # s1 s1 s1 s1
- # * * * *
- # s7 s6 s5 s4
- # s2 s2 s2 s2
- # * * * *
- # s7 s6 s5 s4
- # s3 s3 s3 s3
- # * * * *
- # s7 s6 s5 s4
- # ------------------------
- # s7 s6 s5 s4 s3 s2 s1 s0
- # ========================
-
-### s0*s4 ###
+// ### multiplication ###
+ // ========================
+ // s3 s2 s1 s0
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s0 s0 s0 s0
+ // * * * *
+ // s7 s6 s5 s4
+ // s1 s1 s1 s1
+ // * * * *
+ // s7 s6 s5 s4
+ // s2 s2 s2 s2
+ // * * * *
+ // s7 s6 s5 s4
+ // s3 s3 s3 s3
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s0*s4 ###
mul $t5,$s0,$s4
umulh $t2,$s0,$s4
-### s1*s4 + s0*s5 ###
+// ### s1*s4 + s0*s5 ###
mul $t0,$s1,$s4
umulh $t1,$s1,$s4
adds $t2,$t2,$t0
@@ -582,7 +582,7 @@ ecp_sm2p256_mul:
adcs $t3,$t3,$t1
adcs $t4,xzr,xzr
-### s2*s4 + s1*s5 + s0*s6 ###
+// ### s2*s4 + s1*s5 + s0*s6 ###
mul $t0,$s2,$s4
umulh $t1,$s2,$s4
adds $t3,$t3,$t0
@@ -600,7 +600,7 @@ ecp_sm2p256_mul:
adcs $t4,$t4,$t1
adcs $t6,$t6,xzr
-### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
+// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
mul $t0,$s3,$s4
umulh $t1,$s3,$s4
adds $t4,$t4,$t0
@@ -625,7 +625,7 @@ ecp_sm2p256_mul:
adcs $t6,$t6,$t1
adcs $t7,$t7,xzr
-### s3*s5 + s2*s6 + s1*s7 ###
+// ### s3*s5 + s2*s6 + s1*s7 ###
mul $t0,$s3,$s5
umulh $t1,$s3,$s5
adds $t6,$t6,$t0
@@ -644,7 +644,7 @@ ecp_sm2p256_mul:
adcs $t7,$t7,$t1
adcs $t8,$t8,xzr
-### s3*s6 + s2*s7 ###
+// ### s3*s6 + s2*s7 ###
mul $t0,$s3,$s6
umulh $t1,$s3,$s6
adds $t7,$t7,$t0
@@ -657,7 +657,7 @@ ecp_sm2p256_mul:
adcs $t8,$t8,$t1
adcs $t6,$t6,xzr
-### s3*s7 ###
+// ### s3*s7 ###
mul $t0,$s3,$s7
umulh $t1,$s3,$s7
adds $s6,$t8,$t0
@@ -668,15 +668,15 @@ ecp_sm2p256_mul:
mov $s2,$t3
mov $s3,$t4
- # result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+ // result of mul: s7 s6 s5 s4 s3 s2 s1 s0
-### Reduction ###
+// ### Reduction ###
RDC
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
- # Restore scalar registers
+ // Restore scalar registers
ldp x16,x17,[sp,#16]
ldp x18,x19,[sp,#64]
ldp x29,x30,[sp],#80
@@ -692,48 +692,48 @@ ecp_sm2p256_mul:
ecp_sm2p256_sqr:
AARCH64_SIGN_LINK_REGISTER
- # Store scalar registers
+ // Store scalar registers
stp x29,x30,[sp,#-80]!
add x29,sp,#0
stp x16,x17,[sp,#16]
stp x18,x19,[sp,#64]
- # Load inputs
+ // Load inputs
ldp $s4,$s5,[x1]
ldp $s6,$s7,[x1,#16]
-### square ###
- # ========================
- # s7 s6 s5 s4
- # * s7 s6 s5 s4
- # ------------------------
- # + s4 s4 s4 s4
- # * * * *
- # s7 s6 s5 s4
- # s5 s5 s5 s5
- # * * * *
- # s7 s6 s5 s4
- # s6 s6 s6 s6
- # * * * *
- # s7 s6 s5 s4
- # s7 s7 s7 s7
- # * * * *
- # s7 s6 s5 s4
- # ------------------------
- # s7 s6 s5 s4 s3 s2 s1 s0
- # ========================
-
-### s4*s5 ###
+// ### square ###
+ // ========================
+ // s7 s6 s5 s4
+ // * s7 s6 s5 s4
+ // ------------------------
+ // + s4 s4 s4 s4
+ // * * * *
+ // s7 s6 s5 s4
+ // s5 s5 s5 s5
+ // * * * *
+ // s7 s6 s5 s4
+ // s6 s6 s6 s6
+ // * * * *
+ // s7 s6 s5 s4
+ // s7 s7 s7 s7
+ // * * * *
+ // s7 s6 s5 s4
+ // ------------------------
+ // s7 s6 s5 s4 s3 s2 s1 s0
+ // ========================
+
+// ### s4*s5 ###
mul $s1,$s4,$s5
umulh $s2,$s4,$s5
-### s4*s6 ###
+// ### s4*s6 ###
mul $t0,$s6,$s4
umulh $s3,$s6,$s4
adds $s2,$s2,$t0
adcs $s3,$s3,xzr
-### s4*s7 + s5*s6 ###
+// ### s4*s7 + s5*s6 ###
mul $t0,$s7,$s4
umulh $t1,$s7,$s4
adds $s3,$s3,$t0
@@ -745,19 +745,19 @@ ecp_sm2p256_sqr:
adcs $s0,$s0,$t1
adcs $t2,xzr,xzr
-### s5*s7 ###
+// ### s5*s7 ###
mul $t0,$s7,$s5
umulh $t1,$s7,$s5
adds $s0,$s0,$t0
adcs $t2,$t2,$t1
-### s6*s7 ###
+// ### s6*s7 ###
mul $t0,$s7,$s6
umulh $t1,$s7,$s6
adds $t2,$t2,$t0
adcs $t3,$t1,xzr
-### 2*(t3,t2,s0,s3,s2,s1) ###
+// ### 2*(t3,t2,s0,s3,s2,s1) ###
adds $s1,$s1,$s1
adcs $s2,$s2,$s2
adcs $s3,$s3,$s3
@@ -766,19 +766,19 @@ ecp_sm2p256_sqr:
adcs $t3,$t3,$t3
adcs $t4,xzr,xzr
-### s4*s4 ###
+// ### s4*s4 ###
mul $t5,$s4,$s4
umulh $t6,$s4,$s4
-### s5*s5 ###
+// ### s5*s5 ###
mul $s4,$s5,$s5
umulh $s5,$s5,$s5
-### s6*s6 ###
+// ### s6*s6 ###
mul $t0,$s6,$s6
umulh $t1,$s6,$s6
-### s7*s7 ###
+// ### s7*s7 ###
mul $t7,$s7,$s7
umulh $t8,$s7,$s7
@@ -796,15 +796,15 @@ ecp_sm2p256_sqr:
mov $s6,$t3
mov $s7,$t4
- # result of mul: s7 s6 s5 s4 s3 s2 s1 s0
+ // result of mul: s7 s6 s5 s4 s3 s2 s1 s0
-### Reduction ###
+// ### Reduction ###
RDC
stp $s0,$s1,[x0]
stp $s2,$s3,[x0,#16]
- # Restore scalar registers
+ // Restore scalar registers
ldp x16,x17,[sp,#16]
ldp x18,x19,[sp,#64]
ldp x29,x30,[sp],#80