diff --git a/crypto/ec/asm/ecp_sm2p256-armv8.pl b/crypto/ec/asm/ecp_sm2p256-armv8.pl index 5095086..62fadc0 100644 --- a/crypto/ec/asm/ecp_sm2p256-armv8.pl +++ b/crypto/ec/asm/ecp_sm2p256-armv8.pl @@ -28,44 +28,44 @@ my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..19)); sub bn_mod_add() { my $mod = shift; $code.=<<___; - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Addition + // Addition adds $s0,$s0,$s4 adcs $s1,$s1,$s5 adcs $s2,$s2,$s6 adcs $s3,$s3,$s7 adc $t4,xzr,xzr - # Load polynomial + // Load polynomial adr x2,$mod ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Backup Addition + // Backup Addition mov $t0,$s0 mov $t1,$s1 mov $t2,$s2 mov $t3,$s3 - # Sub polynomial + // Sub polynomial subs $t0,$t0,$s4 sbcs $t1,$t1,$s5 sbcs $t2,$t2,$s6 sbcs $t3,$t3,$s7 sbcs $t4,$t4,xzr - # Select based on carry + // Select based on carry csel $s0,$s0,$t0,cc csel $s1,$s1,$t1,cc csel $s2,$s2,$t2,cc csel $s3,$s3,$t3,cc - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] ___ @@ -74,44 +74,44 @@ ___ sub bn_mod_sub() { my $mod = shift; $code.=<<___; - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Subtraction + // Subtraction subs $s0,$s0,$s4 sbcs $s1,$s1,$s5 sbcs $s2,$s2,$s6 sbcs $s3,$s3,$s7 sbc $t4,xzr,xzr - # Load polynomial + // Load polynomial adr x2,$mod ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Backup subtraction + // Backup subtraction mov $t0,$s0 mov $t1,$s1 mov $t2,$s2 mov $t3,$s3 - # Add polynomial + // Add polynomial adds $t0,$t0,$s4 adcs $t1,$t1,$s5 adcs $t2,$t2,$s6 adcs $t3,$t3,$s7 tst $t4,$t4 - # Select based on carry + // Select based on carry csel $s0,$s0,$t0,eq csel $s1,$s1,$t1,eq csel $s2,$s2,$t2,eq csel $s3,$s3,$t3,eq - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] ___ @@ -120,38 +120,38 @@ ___ sub bn_mod_div_by_2() { my $mod = shift; $code.=<<___; - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] - # Save the least significant bit + // Save the least significant bit mov $t0,$s0 - # Right shift 1 + // Right shift 1 extr $s0,$s1,$s0,#1 extr $s1,$s2,$s1,#1 extr $s2,$s3,$s2,#1 lsr $s3,$s3,#1 - # Load mod + // Load mod adr x2,$mod ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Parity check + // Parity check tst $t0,#1 csel $s4,xzr,$s4,eq csel $s5,xzr,$s5,eq csel $s6,xzr,$s6,eq csel $s7,xzr,$s7,eq - # Add + // Add adds $s0,$s0,$s4 adcs $s1,$s1,$s5 adcs $s2,$s2,$s6 adc $s3,$s3,$s7 - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] ___ @@ -183,17 +183,17 @@ $code.=<<___; .align 5 bn_rshift1: AARCH64_VALID_CALL_TARGET - # Load inputs + // Load inputs ldp $s0,$s1,[x0] ldp $s2,$s3,[x0,#16] - # Right shift + // Right shift extr $s0,$s1,$s0,#1 extr $s1,$s2,$s1,#1 extr $s2,$s3,$s2,#1 lsr $s3,$s3,#1 - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] @@ -206,19 +206,19 @@ bn_rshift1: .align 5 bn_sub: AARCH64_VALID_CALL_TARGET - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] - # Subtraction + // Subtraction subs $s0,$s0,$s4 sbcs $s1,$s1,$s5 sbcs $s2,$s2,$s6 sbc $s3,$s3,$s7 - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] @@ -255,11 +255,11 @@ $code.=<<___; .align 5 ecp_sm2p256_mul_by_3: AARCH64_VALID_CALL_TARGET - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] - # 2*a + // 2*a adds $s0,$s0,$s0 adcs $s1,$s1,$s1 adcs $s2,$s2,$s2 @@ -271,7 +271,7 @@ ecp_sm2p256_mul_by_3: mov $t2,$s2 mov $t3,$s3 - # Sub polynomial + // Sub polynomial adr x2,.Lpoly ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] @@ -287,7 +287,7 @@ ecp_sm2p256_mul_by_3: csel $s3,$s3,$t3,cs eor $t4,$t4,$t4 - # 3*a + // 3*a ldp $s4,$s5,[x1] ldp $s6,$s7,[x1,#16] adds $s0,$s0,$s4 @@ -301,7 +301,7 @@ ecp_sm2p256_mul_by_3: mov $t2,$s2 mov $t3,$s3 - # Sub polynomial + // Sub polynomial adr x2,.Lpoly ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] @@ -316,7 +316,7 @@ ecp_sm2p256_mul_by_3: csel $s2,$s2,$t2,cs csel $s3,$s3,$t3,cs - # Store results + // Store results stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] @@ -360,45 +360,45 @@ $code.=<<___; .size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord .macro RDC - # a = | s7 | ... | s0 |, where si are 64-bit quantities - # = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities - # | s7 | s6 | s5 | s4 | - # | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 | - # | s3 | s2 | s1 | s0 | - # | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 | - # ================================================= - # | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+) - # | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+) - # | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+) - # | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+) - # | a12 | 0 | s7 | a13 | 0 | s6 | (+) - # | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+) - # | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+) - # | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+) - # | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+) - # | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) - # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) - # | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+) - # | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+) - # | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-) - # | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-) - # | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-) - # | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-) - # | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]| - # | V[3] | V[2] | V[1] | V[0] | - - # 1. 64-bit addition - # t2=s6+s7+s7 + // a = | s7 | ... | s0 |, where si are 64-bit quantities + // = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities + // | s7 | s6 | s5 | s4 | + // | a15 | a14 | a13 | a12 | a11 | a10 | a9 | a8 | + // | s3 | s2 | s1 | s0 | + // | a7 | a6 | a5 | a4 | a3 | a2 | a1 | a0 | + // ================================================= + // | a8 | a11 | a10 | a9 | a8 | 0 | s4 | (+) + // | a9 | a15 | s6 | a11 | 0 | a10 | a9 | (+) + // | a10 | 0 | a14 | a13 | a12 | 0 | s5 | (+) + // | a11 | 0 | s7 | a13 | 0 | a12 | a11 | (+) + // | a12 | 0 | s7 | a13 | 0 | s6 | (+) + // | a12 | 0 | 0 | a15 | a14 | 0 | a14 | a13 | (+) + // | a13 | 0 | 0 | 0 | a15 | 0 | a14 | a13 | (+) + // | a13 | 0 | 0 | 0 | 0 | 0 | s7 | (+) + // | a14 | 0 | 0 | 0 | 0 | 0 | s7 | (+) + // | a14 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) + // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | a15 | (+) + // | a15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | (+) + // | s7 | 0 | 0 | 0 | 0 | 0 | 0 | (+) + // | 0 | 0 | 0 | 0 | 0 | a8 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a9 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a13 | 0 | 0 | (-) + // | 0 | 0 | 0 | 0 | 0 | a14 | 0 | 0 | (-) + // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]| + // | V[3] | V[2] | V[1] | V[0] | + + // 1. 64-bit addition + // t2=s6+s7+s7 adds $t2,$s6,$s7 adcs $t1,xzr,xzr adds $t2,$t2,$s7 adcs $t1,$t1,xzr - # t3=s4+s5+t2 + // t3=s4+s5+t2 adds $t3,$s4,$t2 adcs $t4,$t1,xzr adds $t3,$t3,$s5 adcs $t4,$t4,xzr - # sum + // sum adds $s0,$s0,$t3 adcs $s1,$s1,$t4 adcs $s2,$s2,$t2 @@ -410,7 +410,7 @@ $code.=<<___; stp $s0,$s1,[sp,#32] stp $s2,$s3,[sp,#48] - # 2. 64-bit to 32-bit spread + // 2. 64-bit to 32-bit spread mov $t1,#0xffffffff mov $s0,$s4 mov $s1,$s5 @@ -425,7 +425,7 @@ $code.=<<___; lsr $s6,$s6,#32 // a13 lsr $s7,$s7,#32 // a15 - # 3. 32-bit addition + // 3. 32-bit addition add $t1,$a14,$a12 // t1 <- a12 + a14 add $t2,$a15,$a13 // t2 <- a13 + a15 add $t3,$a8,$a9 // t3 <- a8 + a9 @@ -446,53 +446,53 @@ $code.=<<___; add $a11,$a11,$t2 // a11 <- a9 + a11 + 2*(a13 + a15) add $t1,$t1,$t4 // t1 <- a10 + a12 + 2*a14 - # U[0] s5 a9 + a11 + 2*(a13 + a15) - # U[1] t1 a10 + a12 + 2*a14 - # U[2] -t3 a8 + a9 + a13 + a14 - # U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15 - # U[4] s4 a9 + a13 + a15 - # U[5] t4 a10 + a14 - # U[6] s7 a11 + a15 - # U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15) + // U[0] s5 a9 + a11 + 2*(a13 + a15) + // U[1] t1 a10 + a12 + 2*a14 + // U[2] -t3 a8 + a9 + a13 + a14 + // U[3] s2 a8 + a11 + a12 + 2*a13 + a14 + a15 + // U[4] s4 a9 + a13 + a15 + // U[5] t4 a10 + a14 + // U[6] s7 a11 + a15 + // U[7] s1 a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15) - # 4. 32-bit to 64-bit + // 4. 32-bit to 64-bit lsl $s0,$t1,#32 extr $t1,$s2,$t1,#32 extr $s2,$t4,$s2,#32 extr $t4,$s1,$t4,#32 lsr $s1,$s1,#32 - # 5. 64-bit addition + // 5. 64-bit addition adds $s5,$s5,$s0 adcs $t1,$t1,xzr adcs $s4,$s4,$s2 adcs $s7,$s7,$t4 adcs $t0,$t0,$s1 - # V[0] s5 - # V[1] t1 - # V[2] s4 - # V[3] s7 - # carry t0 - # sub t3 + // V[0] s5 + // V[1] t1 + // V[2] s4 + // V[3] s7 + // carry t0 + // sub t3 - # 5. Process s0-s3 + // 5. Process s0-s3 ldp $s0,$s1,[sp,#32] ldp $s2,$s3,[sp,#48] - # add with V0-V3 + // add with V0-V3 adds $s0,$s0,$s5 adcs $s1,$s1,$t1 adcs $s2,$s2,$s4 adcs $s3,$s3,$s7 adcs $t0,$t0,xzr - # sub with t3 + // sub with t3 subs $s1,$s1,$t3 sbcs $s2,$s2,xzr sbcs $s3,$s3,xzr sbcs $t0,$t0,xzr - # 6. MOD - # First Mod + // 6. MOD + // First Mod lsl $t1,$t0,#32 subs $t2,$t1,$t0 @@ -501,8 +501,8 @@ $code.=<<___; adcs $s2,$s2,xzr adcs $s3,$s3,$t1 - # Last Mod - # return y - p if y > p else y + // Last Mod + // return y - p if y > p else y mov $s4,$s0 mov $s5,$s1 mov $s6,$s2 @@ -533,44 +533,44 @@ $code.=<<___; .align 5 ecp_sm2p256_mul: AARCH64_SIGN_LINK_REGISTER - # Store scalar registers + // Store scalar registers stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x16,x17,[sp,#16] stp x18,x19,[sp,#64] - # Load inputs + // Load inputs ldp $s0,$s1,[x1] ldp $s2,$s3,[x1,#16] ldp $s4,$s5,[x2] ldp $s6,$s7,[x2,#16] -### multiplication ### - # ======================== - # s3 s2 s1 s0 - # * s7 s6 s5 s4 - # ------------------------ - # + s0 s0 s0 s0 - # * * * * - # s7 s6 s5 s4 - # s1 s1 s1 s1 - # * * * * - # s7 s6 s5 s4 - # s2 s2 s2 s2 - # * * * * - # s7 s6 s5 s4 - # s3 s3 s3 s3 - # * * * * - # s7 s6 s5 s4 - # ------------------------ - # s7 s6 s5 s4 s3 s2 s1 s0 - # ======================== - -### s0*s4 ### +// ### multiplication ### + // ======================== + // s3 s2 s1 s0 + // * s7 s6 s5 s4 + // ------------------------ + // + s0 s0 s0 s0 + // * * * * + // s7 s6 s5 s4 + // s1 s1 s1 s1 + // * * * * + // s7 s6 s5 s4 + // s2 s2 s2 s2 + // * * * * + // s7 s6 s5 s4 + // s3 s3 s3 s3 + // * * * * + // s7 s6 s5 s4 + // ------------------------ + // s7 s6 s5 s4 s3 s2 s1 s0 + // ======================== + +// ### s0*s4 ### mul $t5,$s0,$s4 umulh $t2,$s0,$s4 -### s1*s4 + s0*s5 ### +// ### s1*s4 + s0*s5 ### mul $t0,$s1,$s4 umulh $t1,$s1,$s4 adds $t2,$t2,$t0 @@ -582,7 +582,7 @@ ecp_sm2p256_mul: adcs $t3,$t3,$t1 adcs $t4,xzr,xzr -### s2*s4 + s1*s5 + s0*s6 ### +// ### s2*s4 + s1*s5 + s0*s6 ### mul $t0,$s2,$s4 umulh $t1,$s2,$s4 adds $t3,$t3,$t0 @@ -600,7 +600,7 @@ ecp_sm2p256_mul: adcs $t4,$t4,$t1 adcs $t6,$t6,xzr -### s3*s4 + s2*s5 + s1*s6 + s0*s7 ### +// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ### mul $t0,$s3,$s4 umulh $t1,$s3,$s4 adds $t4,$t4,$t0 @@ -625,7 +625,7 @@ ecp_sm2p256_mul: adcs $t6,$t6,$t1 adcs $t7,$t7,xzr -### s3*s5 + s2*s6 + s1*s7 ### +// ### s3*s5 + s2*s6 + s1*s7 ### mul $t0,$s3,$s5 umulh $t1,$s3,$s5 adds $t6,$t6,$t0 @@ -644,7 +644,7 @@ ecp_sm2p256_mul: adcs $t7,$t7,$t1 adcs $t8,$t8,xzr -### s3*s6 + s2*s7 ### +// ### s3*s6 + s2*s7 ### mul $t0,$s3,$s6 umulh $t1,$s3,$s6 adds $t7,$t7,$t0 @@ -657,7 +657,7 @@ ecp_sm2p256_mul: adcs $t8,$t8,$t1 adcs $t6,$t6,xzr -### s3*s7 ### +// ### s3*s7 ### mul $t0,$s3,$s7 umulh $t1,$s3,$s7 adds $s6,$t8,$t0 @@ -668,15 +668,15 @@ ecp_sm2p256_mul: mov $s2,$t3 mov $s3,$t4 - # result of mul: s7 s6 s5 s4 s3 s2 s1 s0 + // result of mul: s7 s6 s5 s4 s3 s2 s1 s0 -### Reduction ### +// ### Reduction ### RDC stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] - # Restore scalar registers + // Restore scalar registers ldp x16,x17,[sp,#16] ldp x18,x19,[sp,#64] ldp x29,x30,[sp],#80 @@ -692,48 +692,48 @@ ecp_sm2p256_mul: ecp_sm2p256_sqr: AARCH64_SIGN_LINK_REGISTER - # Store scalar registers + // Store scalar registers stp x29,x30,[sp,#-80]! add x29,sp,#0 stp x16,x17,[sp,#16] stp x18,x19,[sp,#64] - # Load inputs + // Load inputs ldp $s4,$s5,[x1] ldp $s6,$s7,[x1,#16] -### square ### - # ======================== - # s7 s6 s5 s4 - # * s7 s6 s5 s4 - # ------------------------ - # + s4 s4 s4 s4 - # * * * * - # s7 s6 s5 s4 - # s5 s5 s5 s5 - # * * * * - # s7 s6 s5 s4 - # s6 s6 s6 s6 - # * * * * - # s7 s6 s5 s4 - # s7 s7 s7 s7 - # * * * * - # s7 s6 s5 s4 - # ------------------------ - # s7 s6 s5 s4 s3 s2 s1 s0 - # ======================== - -### s4*s5 ### +// ### square ### + // ======================== + // s7 s6 s5 s4 + // * s7 s6 s5 s4 + // ------------------------ + // + s4 s4 s4 s4 + // * * * * + // s7 s6 s5 s4 + // s5 s5 s5 s5 + // * * * * + // s7 s6 s5 s4 + // s6 s6 s6 s6 + // * * * * + // s7 s6 s5 s4 + // s7 s7 s7 s7 + // * * * * + // s7 s6 s5 s4 + // ------------------------ + // s7 s6 s5 s4 s3 s2 s1 s0 + // ======================== + +// ### s4*s5 ### mul $s1,$s4,$s5 umulh $s2,$s4,$s5 -### s4*s6 ### +// ### s4*s6 ### mul $t0,$s6,$s4 umulh $s3,$s6,$s4 adds $s2,$s2,$t0 adcs $s3,$s3,xzr -### s4*s7 + s5*s6 ### +// ### s4*s7 + s5*s6 ### mul $t0,$s7,$s4 umulh $t1,$s7,$s4 adds $s3,$s3,$t0 @@ -745,19 +745,19 @@ ecp_sm2p256_sqr: adcs $s0,$s0,$t1 adcs $t2,xzr,xzr -### s5*s7 ### +// ### s5*s7 ### mul $t0,$s7,$s5 umulh $t1,$s7,$s5 adds $s0,$s0,$t0 adcs $t2,$t2,$t1 -### s6*s7 ### +// ### s6*s7 ### mul $t0,$s7,$s6 umulh $t1,$s7,$s6 adds $t2,$t2,$t0 adcs $t3,$t1,xzr -### 2*(t3,t2,s0,s3,s2,s1) ### +// ### 2*(t3,t2,s0,s3,s2,s1) ### adds $s1,$s1,$s1 adcs $s2,$s2,$s2 adcs $s3,$s3,$s3 @@ -766,19 +766,19 @@ ecp_sm2p256_sqr: adcs $t3,$t3,$t3 adcs $t4,xzr,xzr -### s4*s4 ### +// ### s4*s4 ### mul $t5,$s4,$s4 umulh $t6,$s4,$s4 -### s5*s5 ### +// ### s5*s5 ### mul $s4,$s5,$s5 umulh $s5,$s5,$s5 -### s6*s6 ### +// ### s6*s6 ### mul $t0,$s6,$s6 umulh $t1,$s6,$s6 -### s7*s7 ### +// ### s7*s7 ### mul $t7,$s7,$s7 umulh $t8,$s7,$s7 @@ -796,15 +796,15 @@ ecp_sm2p256_sqr: mov $s6,$t3 mov $s7,$t4 - # result of mul: s7 s6 s5 s4 s3 s2 s1 s0 + // result of mul: s7 s6 s5 s4 s3 s2 s1 s0 -### Reduction ### +// ### Reduction ### RDC stp $s0,$s1,[x0] stp $s2,$s3,[x0,#16] - # Restore scalar registers + // Restore scalar registers ldp x16,x17,[sp,#16] ldp x18,x19,[sp,#64] ldp x29,x30,[sp],#80