-rw-r--r-- 71818 lib25519-20260614/crypto_nP/montgomery25519/arm64-neonplusuma10l/mladder.S raw
#include "crypto_asm_hidden.h"
// linker define mladder

/* Assembly for Montgomery ladder */

/*
 * mladder (IMPL1) -- AArch64 scalar + NEON Montgomery ladder.
 *
 * Register arguments, as used by the code below:
 *   x0  output pointer. Receives 80 bytes: X2 as ten 32-bit limbs at
 *       offsets 0..39, then Z2 as ten 32-bit limbs at offsets 40..79.
 *   x1  pointer to the 32-byte input point (read only). It is split into
 *       ten limbs of alternating 26/25 bits; bit 255 is dropped.
 *   x2  pointer to the 32-byte scalar (read only). A clamped copy is kept
 *       on the stack: bits 0..2 cleared, bit 254 set.
 *
 * The loop body runs 255 times, for scalar bit indices 254 down to 0.
 *
 * Preserved: x19-x30 and d8-d15 are saved on entry and restored on exit.
 * NOTE(review): x18 is used as a scratch register; AAPCS64 lets a platform
 * reserve it (e.g. Apple, Windows) -- confirm for each target.
 * NOTE(review): nothing in the frame (clamped scalar copy, intermediates)
 * is cleared before returning.
 *
 * Stack frame (560 bytes):
 *   sp+0    0x03fffffe07fffffe  limbs (2k, 2k+1) of 2p, k = 1..4
 *   sp+8    0x03fffffe07ffffda  limbs (0, 1) of 2p
 *   sp+96   w: current scalar bit index
 *   sp+100  w: shifted scalar word whose bit 0 is the most recent bit
 *   sp+104  clamped scalar copy (32 bytes)
 *   sp+144  saved output pointer
 *   sp+152  byte 0xda (patches the low byte of the 2p vector constant)
 *   sp+160  selected T2 (5 x 8 bytes)
 *   sp+208  T1^2 (5 x 8 bytes)
 *   sp+256  X3 produced by the vector squaring (5 x 8 bytes)
 *   sp+304  T2^2 (5 x 8 bytes)
 *   sp+352  X1 limbs (10 x 4 bytes)
 *   sp+392  saved x19-x30, d8-d15 (160 bytes)
 *
 * Loop-invariant registers: w30 = 19, v31.2s = 19, v30.2d = 2^26 - 1.
 */

#define IMPL1

#ifdef IMPL1

.p2align 4
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(mladder)
.globl _CRYPTO_SHARED_NAMESPACE(mladder)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(mladder)
.globl CRYPTO_SHARED_NAMESPACE(mladder)
_CRYPTO_SHARED_NAMESPACE(mladder):
CRYPTO_SHARED_NAMESPACE(mladder):

    // prologue: allocate the frame and save callee-saved registers
    sub sp, sp, #560
    add x10, sp, #392
    stp x19, x20, [x10, #0]
    stp x21, x22, [x10, #16]
    stp x23, x24, [x10, #32]
    stp x25, x26, [x10, #48]
    stp x27, x28, [x10, #64]
    stp x29, x30, [x10, #80]
    stp d8, d9, [x10, #96]
    stp d10, d11, [x10, #112]
    stp d12, d13, [x10, #128]
    stp d14, d15, [x10, #144]

    // clamp scalar (into the stack copy at sp+104; the caller's buffer is not written)
    ldr x3, [x2, #0]
    and x3, x3, #0xfffffffffffffff8
    str x3, [sp, #104]
    ldr x3, [x2, #8]
    str x3, [sp, #112]
    ldr x3, [x2, #16]
    str x3, [sp, #120]
    ldr x4, [x2, #24]
    orr x4, x4, #0x4000000000000000
    str x4, [sp, #128]

    // x0 is reused as scratch below, so keep the output pointer in the frame
    str x0, [sp, #144]

    // load point
    ldp x4, x5, [x1, #0]
    ldp x6, x7, [x1, #16]

    // X1 (ten limbs: 26,25,26,25,26,25,26,25,26,25 bits; bit 255 is dropped)
    and w8, w4, #0x3ffffff
    ubfx x9, x4, #26, #25
    lsr x10, x4, #51
    orr w10, w10, w5, lsl #13
    and w10, w10, #0x3ffffff
    ubfx x11, x5, #13, #25
    lsr x12, x5, #38
    and w13, w6, #0x1ffffff
    ubfx x14, x6, #25, #26
    lsr x15, x6, #51
    orr w15, w15, w7, lsl #13
    and w15, w15, #0x1ffffff
    ubfx x16, x7, #12, #26
    ubfx x17, x7, #38, #25
    add x0, sp, #352
    stp w8, w9, [x0, #0]
    stp w10, w11, [x0, #8]
    stp w12, w13, [x0, #16]
    stp w14, w15, [x0, #24]
    stp w16, w17, [x0, #32]

    mov x20, #1

    // X2 ← 1
    mov v11.d[0], x20
    mov v13.d[0], xzr
    mov v15.d[0], xzr
    mov v17.d[0], xzr
    mov v19.d[0], xzr

    // Z2 ← 0
    mov v10.d[0], xzr
    mov v12.d[0], xzr
    mov v14.d[0], xzr
    mov v16.d[0], xzr
    mov v18.d[0], xzr

    // X3 ← X1
    mov v21.s[0], w8
    mov v21.s[1], w9
    mov v23.s[0], w10
    mov v23.s[1], w11
    mov v25.s[0], w12
    mov v25.s[1], w13
    mov v27.s[0], w14
    mov v27.s[1], w15
    mov v29.s[0], w16
    mov v29.s[1], w17

    // Z3 ← 1
    mov v20.d[0], x20
    mov v22.d[0], xzr
    mov v24.d[0], xzr
    mov v26.d[0], xzr
    mov v28.d[0], xzr

    // loop-invariant constants: w30 = 19, v31.2s = 19, v30.2d = 2^26 - 1
    mov w30, #19
    dup v31.2s, w30
    mov w29, #0x3ffffff
    dup v30.2d, x29

    // 2p in limb form: x1 = limbs (0,1), x2 = limbs (2k,2k+1) for k = 1..4
    movz x1, #0xffda
    movk x1, #0x07ff, lsl 16
    movk x1, #0xfffe, lsl 32
    movk x1, #0x03ff, lsl 48
    movz x2, #0xfffe
    movk x2, #0x07ff, lsl 16
    movk x2, #0xfffe, lsl 32
    movk x2, #0x03ff, lsl 48
    stp x2, x1, [sp, #0]

    // loop state: start at bit index 254
    mov w0, #254
    str w0, [sp, #96]
    mov w0, #0xda
    strb w0, [sp, #152]
    // byte 31 of the clamped scalar; after the shift, bit 0 of w1 is scalar bit 254
    ldrb w1, [sp, #135]
    lsr w1, w1, #6
    str w1, [sp, #100]

    // Montgomery ladder loop
.L0:
    // Z flag = (select bit == 0). It must stay live until the fcsel groups
    // below; only SIMD adds/subs, which do not write NZCV, are in between.
    tst w1, #1

    ldr d9, [sp, #0]
    ldr d8, [sp, #8]

    // T1 = X2 + Z2, T2 = X2 - Z2
    // (2p is added before each subtraction so no limb goes negative)
    add v0.2s, v11.2s, v10.2s
    add v1.2s, v13.2s, v12.2s
    add v2.2s, v15.2s, v14.2s
    add v3.2s, v17.2s, v16.2s
    add v4.2s, v19.2s, v18.2s
    add v11.2s, v8.2s, v11.2s
    add v13.2s, v9.2s, v13.2s
    add v15.2s, v9.2s, v15.2s
    add v17.2s, v9.2s, v17.2s
    add v19.2s, v9.2s, v19.2s
    sub v11.2s, v11.2s, v10.2s
    sub v13.2s, v13.2s, v12.2s
    sub v15.2s, v15.2s, v14.2s
    sub v17.2s, v17.2s, v16.2s
    sub v19.2s, v19.2s, v18.2s

    // T4 = X3 - Z3, T3 = X3 + Z3
    add v10.2s, v8.2s, v21.2s
    add v12.2s, v9.2s, v23.2s
    add v14.2s, v9.2s, v25.2s
    add v16.2s, v9.2s, v27.2s
    add v18.2s, v9.2s, v29.2s
    add v5.2s, v21.2s, v20.2s
    add v6.2s, v23.2s, v22.2s
    add v7.2s, v25.2s, v24.2s
    add v8.2s, v27.2s, v26.2s
    add v9.2s, v29.2s, v28.2s
    sub v21.2s, v10.2s, v20.2s
    sub v23.2s, v12.2s, v22.2s
    sub v25.2s, v14.2s, v24.2s
    sub v27.2s, v16.2s, v26.2s
    sub v29.2s, v18.2s, v28.2s

    // CSelect(T1,T3,b)
    fcsel d10, d0, d5, eq
    fcsel d12, d1, d6, eq
    fcsel d14, d2, d7, eq
    fcsel d16, d3, d8, eq
    fcsel d18, d4, d9, eq

    // save T1 resulted from CSelect
    mov x10, v10.d[0]
    mov x12, v12.d[0]
    mov x14, v14.d[0]
    mov x16, v16.d[0]
    mov x18, v18.d[0]

    // CSelect(T2,T4,b)
    fcsel d20, d11, d21, eq
    fcsel d22, d13, d23, eq
    fcsel d24, d15, d25, eq
    fcsel d26, d17, d27, eq
    fcsel d28, d19, d29, eq

    // save T2 resulted from CSelect
    stp d20, d22, [sp, #160]
    stp d24, d26, [sp, #176]
    str d28, [sp, #192]

    // <T1,T2> = <X2 + Z2,X2 - Z2> other half
    trn1 v10.2s, v0.2s, v11.2s
    trn2 v11.2s, v0.2s, v11.2s
    trn1 v12.2s, v1.2s, v13.2s
    trn2 v13.2s, v1.2s, v13.2s
    trn1 v14.2s, v2.2s, v15.2s
    trn2 v15.2s, v2.2s, v15.2s
    trn1 v16.2s, v3.2s, v17.2s
    trn2 v17.2s, v3.2s, v17.2s
    trn1 v18.2s, v4.2s, v19.2s
    trn2 v19.2s, v4.2s, v19.2s

    // <T4,T3> = <X3 - Z3,X3 + Z3>
    trn1 v20.2s, v21.2s, v5.2s
    trn2 v21.2s, v21.2s, v5.2s
    trn1 v22.2s, v23.2s, v6.2s
    trn2 v23.2s, v23.2s, v6.2s
    trn1 v24.2s, v25.2s, v7.2s
    trn2 v25.2s, v25.2s, v7.2s
    trn1 v26.2s, v27.2s, v8.2s
    trn2 v27.2s, v27.2s, v8.2s
    trn1 v28.2s, v29.2s, v9.2s
    trn2 v29.2s, v29.2s, v9.2s

    // T1 ← T1^2
    // x10,x12,..,x18 each hold two packed limbs; split out the odd limbs
    lsr x11, x10, #32
    lsr x13, x12, #32
    lsr x15, x14, #32
    lsr x17, x16, #32
    lsr x19, x18, #32
    mul w20, w16, w30
    mul w21, w18, w30
    add x25, x30, x30                   // w25 = 38
    mul w22, w15, w25
    mul w23, w17, w25
    mul w24, w19, w25
    umull x0, w10, w10
    add x25, x10, x10
    umull x1, w25, w11
    umull x2, w25, w12
    umull x3, w25, w13
    umull x4, w25, w14
    umull x5, w25, w15
    umull x6, w25, w16
    umull x7, w25, w17
    umull x8, w25, w18
    umaddl x4, w12, w12, x4
    umaddl x8, w14, w14, x8
    add x10, x15, x15
    umaddl x1, w20, w10, x1
    umaddl x2, w20, w16, x2
    add x9, x17, x17
    umaddl x3, w21, w10, x3
    umaddl x5, w21, w9, x5
    umaddl x6, w21, w18, x6
    umaddl x0, w22, w15, x0
    umaddl x1, w23, w14, x1
    umaddl x2, w23, w10, x2
    umaddl x3, w23, w16, x3
    umaddl x4, w23, w17, x4
    umaddl x1, w24, w12, x1
    umaddl x3, w24, w14, x3
    umaddl x4, w24, w10, x4
    umaddl x5, w24, w16, x5
    umaddl x6, w24, w9, x6
    umaddl x7, w24, w18, x7
    umaddl x8, w24, w19, x8
    add x26, x11, x11
    umaddl x0, w26, w24, x0
    umaddl x2, w26, w11, x2
    umaddl x3, w26, w12, x3
    umaddl x5, w26, w14, x5
    umaddl x6, w26, w10, x6
    umaddl x7, w26, w16, x7
    umaddl x8, w26, w9, x8
    umull x9, w25, w19
    umaddl x9, w26, w18, x9
    add x27, x12, x12
    umaddl x0, w27, w21, x0
    umaddl x5, w27, w13, x5
    umaddl x8, w27, w16, x8
    umaddl x6, w27, w14, x6
    umaddl x7, w27, w15, x7
    umaddl x9, w27, w17, x9
    add x28, x13, x13
    umaddl x4, w26, w28, x4
    umaddl x0, w28, w23, x0
    umaddl x1, w28, w21, x1
    umaddl x2, w28, w24, x2
    umaddl x8, w28, w10, x8
    umaddl x6, w28, w13, x6
    umaddl x7, w28, w14, x7
    umaddl x9, w28, w16, x9
    add x29, x14, x14
    umaddl x0, w29, w20, x0
    umaddl x2, w29, w21, x2
    umaddl x9, w29, w15, x9
    add x18, x16, x16
    umaddl x4, w18, w21, x4

    // carry chain; bfi packs each odd limb (low 25 bits) above its even neighbour
    add x1, x1, x0, lsr #26
    and x0, x0, #0x3ffffff
    add x2, x2, x1, lsr #25
    and x1, x1, #0x1ffffff
    add x3, x3, x2, lsr #26
    and x2, x2, #0x3ffffff
    add x4, x4, x3, lsr #25
    bfi x2, x3, #32, #25
    add x5, x5, x4, lsr #26
    and x4, x4, #0x3ffffff
    add x6, x6, x5, lsr #25
    bfi x4, x5, #32, #25
    add x7, x7, x6, lsr #26
    and x6, x6, #0x3ffffff
    add x8, x8, x7, lsr #25
    bfi x6, x7, #32, #25
    add x9, x9, x8, lsr #26
    and x8, x8, #0x3ffffff
    // fold the carry out of limb 9 back into limb 0 times 19 (= 1 + 2 + 16)
    bic x10, x9, #0x1ffffff
    add x0, x0, x10, lsr #25
    add x0, x0, x10, lsr #24
    add x0, x0, x10, lsr #21
    bfi x8, x9, #32, #25
    add x1, x1, x0, lsr #26
    and x0, x0, #0x3ffffff
    bfi x0, x1, #32, #26
    stp x0, x2, [sp, #208]
    stp x4, x6, [sp, #224]
    str x8, [sp, #240]

    // T2 ← T2^2
    ldp x10, x12, [sp, #160]
    ldp x14, x16, [sp, #176]
    ldr x18, [sp, #192]
    lsr x11, x10, #32
    lsr x13, x12, #32
    lsr x15, x14, #32
    lsr x17, x16, #32
    lsr x19, x18, #32
    mul w20, w16, w30
    mul w21, w18, w30
    add x25, x30, x30                   // w25 = 38
    mul w22, w15, w25
    mul w23, w17, w25
    mul w24, w19, w25
    umull x0, w10, w10
    add x25, x10, x10
    umull x1, w25, w11
    umull x2, w25, w12
    umull x3, w25, w13
    umull x4, w25, w14
    umull x5, w25, w15
    umull x6, w25, w16
    umull x7, w25, w17
    umull x8, w25, w18
    umaddl x4, w12, w12, x4
    umaddl x8, w14, w14, x8
    add x10, x15, x15
    umaddl x1, w20, w10, x1
    umaddl x2, w20, w16, x2
    add x9, x17, x17
    umaddl x3, w21, w10, x3
    umaddl x5, w21, w9, x5
    umaddl x6, w21, w18, x6
    umaddl x0, w22, w15, x0
    umaddl x1, w23, w14, x1
    umaddl x2, w23, w10, x2
    umaddl x3, w23, w16, x3
    umaddl x4, w23, w17, x4
    umaddl x1, w24, w12, x1
    umaddl x3, w24, w14, x3
    umaddl x4, w24, w10, x4
    umaddl x5, w24, w16, x5
    umaddl x6, w24, w9, x6
    umaddl x7, w24, w18, x7
    umaddl x8, w24, w19, x8
    add x26, x11, x11
    umaddl x0, w26, w24, x0
    umaddl x2, w26, w11, x2
    umaddl x3, w26, w12, x3
    umaddl x5, w26, w14, x5
    umaddl x6, w26, w10, x6
    umaddl x7, w26, w16, x7
    umaddl x8, w26, w9, x8
    umull x9, w25, w19
    umaddl x9, w26, w18, x9
    add x27, x12, x12
    umaddl x0, w27, w21, x0
    umaddl x5, w27, w13, x5
    umaddl x8, w27, w16, x8
    umaddl x6, w27, w14, x6
    umaddl x7, w27, w15, x7
    umaddl x9, w27, w17, x9
    add x28, x13, x13
    umaddl x4, w26, w28, x4
    umaddl x0, w28, w23, x0
    umaddl x1, w28, w21, x1
    umaddl x2, w28, w24, x2
    umaddl x8, w28, w10, x8
    umaddl x6, w28, w13, x6
    umaddl x7, w28, w14, x7
    umaddl x9, w28, w16, x9
    add x29, x14, x14
    umaddl x0, w29, w20, x0
    umaddl x2, w29, w21, x2
    umaddl x9, w29, w15, x9
    add x18, x16, x16
    umaddl x4, w18, w21, x4

    // carry chain; unlike T1^2 the odd limbs are also masked in place because
    // x1,x3,x5,x7,x9 are reused as 64-bit addends in "T2 ← aZ2 + T2" below
    add x1, x1, x0, lsr #26
    and x0, x0, #0x3ffffff
    add x2, x2, x1, lsr #25
    and x1, x1, #0x1ffffff
    add x3, x3, x2, lsr #26
    and x2, x2, #0x3ffffff
    add x4, x4, x3, lsr #25
    and x3, x3, #0x1ffffff
    bfi x2, x3, #32, #25
    add x5, x5, x4, lsr #26
    and x4, x4, #0x3ffffff
    add x6, x6, x5, lsr #25
    and x5, x5, #0x1ffffff
    bfi x4, x5, #32, #25
    add x7, x7, x6, lsr #26
    and x6, x6, #0x3ffffff
    add x8, x8, x7, lsr #25
    and x7, x7, #0x1ffffff
    bfi x6, x7, #32, #25
    add x9, x9, x8, lsr #26
    and x8, x8, #0x3ffffff
    bic x10, x9, #0x1ffffff
    add x0, x0, x10, lsr #25
    add x0, x0, x10, lsr #24
    add x0, x0, x10, lsr #21
    and x9, x9, #0x1ffffff
    bfi x8, x9, #32, #25
    add x1, x1, x0, lsr #26
    and x0, x0, #0x3ffffff
    bfi x0, x1, #32, #26
    stp x0, x2, [sp, #304]
    stp x4, x6, [sp, #320]
    str x8, [sp, #336]

    // Z2 ← T1 - T2
    // (operates on packed limb pairs; 2p is added first so neither half borrows)
    ldp x11, x13, [sp, #208]
    ldp x15, x17, [sp, #224]
    ldr x19, [sp, #240]
    ldp x22, x21, [sp, #0]
    add x13, x13, x22
    add x15, x15, x22
    add x17, x17, x22
    add x19, x19, x22
    add x11, x11, x21
    sub x10, x11, x0
    sub x12, x13, x2
    sub x14, x15, x4
    sub x16, x17, x6
    sub x18, x19, x8
    lsr x11, x10, #32
    lsr x13, x12, #32
    lsr x15, x14, #32
    lsr x17, x16, #32
    lsr x19, x18, #32

    // T2 ← aZ2 + T2
    // x20 = 0x1db42 = 121666; "mov wN, wN" drops the packed odd limb from xN
    movz x20, #0xdb42
    movk x20, #0x0001, lsl 16
    mov w0, w0
    umaddl x0, w10, w20, x0
    umaddl x21, w11, w20, x1
    mov w2, w2
    umaddl x22, w12, w20, x2
    umaddl x23, w13, w20, x3
    mov w4, w4
    umaddl x24, w14, w20, x4
    umaddl x25, w15, w20, x5
    mov w6, w6
    umaddl x26, w16, w20, x6
    umaddl x27, w17, w20, x7
    mov w8, w8
    umaddl x28, w18, w20, x8
    umaddl x29, w19, w20, x9
    add x21, x21, x0, lsr #26
    and x20, x0, #0x3ffffff
    add x22, x22, x21, lsr #25
    and x21, x21, #0x1ffffff
    add x23, x23, x22, lsr #26
    and x22, x22, #0x3ffffff
    add x24, x24, x23, lsr #25
    and x23, x23, #0x1ffffff
    add x25, x25, x24, lsr #26
    and x24, x24, #0x3ffffff
    add x26, x26, x25, lsr #25
    and x25, x25, #0x1ffffff
    add x27, x27, x26, lsr #26
    and x26, x26, #0x3ffffff
    add x28, x28, x27, lsr #25
    and x27, x27, #0x1ffffff
    add x29, x29, x28, lsr #26
    and x28, x28, #0x3ffffff
    bic x7, x29, #0x1ffffff
    add x20, x20, x7, lsr #25
    add x20, x20, x7, lsr #24
    add x20, x20, x7, lsr #21
    and x29, x29, #0x1ffffff

    // Z2 ← Z2 · T2
    umull x0, w10, w20
    umull x1, w10, w21
    umull x2, w10, w22
    umull x3, w10, w23
    umull x4, w10, w24
    umull x5, w10, w25
    umull x6, w10, w26
    umull x7, w10, w27
    umull x8, w10, w28
    umull x9, w10, w29
    umaddl x1, w11, w20, x1
    umaddl x3, w11, w22, x3
    umaddl x5, w11, w24, x5
    umaddl x7, w11, w26, x7
    umaddl x9, w11, w28, x9
    umaddl x2, w12, w20, x2
    umaddl x3, w12, w21, x3
    umaddl x4, w12, w22, x4
    umaddl x5, w12, w23, x5
    umaddl x6, w12, w24, x6
    umaddl x7, w12, w25, x7
    umaddl x8, w12, w26, x8
    umaddl x9, w12, w27, x9
    umaddl x3, w13, w20, x3
    umaddl x5, w13, w22, x5
    umaddl x7, w13, w24, x7
    umaddl x9, w13, w26, x9
    umaddl x4, w14, w20, x4
    umaddl x5, w14, w21, x5
    umaddl x6, w14, w22, x6
    umaddl x7, w14, w23, x7
    umaddl x8, w14, w24, x8
    umaddl x9, w14, w25, x9
    umaddl x5, w15, w20, x5
    umaddl x7, w15, w22, x7
    umaddl x9, w15, w24, x9
    umaddl x6, w16, w20, x6
    umaddl x7, w16, w21, x7
    umaddl x8, w16, w22, x8
    umaddl x9, w16, w23, x9
    umaddl x7, w17, w20, x7
    umaddl x9, w17, w22, x9
    umaddl x8, w18, w20, x8
    umaddl x9, w18, w21, x9
    umaddl x9, w19, w20, x9
    mul w22, w22, w30
    mul w24, w24, w30
    mul w26, w26, w30
    mul w28, w28, w30
    umaddl x0, w12, w28, x0
    umaddl x1, w13, w28, x1
    umaddl x0, w14, w26, x0
    umaddl x2, w14, w28, x2
    umaddl x1, w15, w26, x1
    umaddl x3, w15, w28, x3
    umaddl x0, w16, w24, x0
    umaddl x2, w16, w26, x2
    umaddl x4, w16, w28, x4
    umaddl x1, w17, w24, x1
    umaddl x3, w17, w26, x3
    umaddl x5, w17, w28, x5
    umaddl x0, w18, w22, x0
    umaddl x2, w18, w24, x2
    umaddl x6, w18, w28, x6
    umaddl x4, w18, w26, x4
    umaddl x1, w19, w22, x1
    umaddl x3, w19, w24, x3
    umaddl x5, w19, w26, x5
    umaddl x7, w19, w28, x7
    add x11, x11, x11
    umaddl x2, w11, w21, x2
    umaddl x4, w11, w23, x4
    umaddl x6, w11, w25, x6
    umaddl x8, w11, w27, x8
    add x13, x13, x13
    umaddl x4, w13, w21, x4
    umaddl x6, w13, w23, x6
    umaddl x8, w13, w25, x8
    add x15, x15, x15
    umaddl x6, w15, w21, x6
    umaddl x8, w15, w23, x8
    add x17, x17, x17
    umaddl x8, w17, w21, x8
    mul w21, w21, w30
    mul w23, w23, w30
    mul w25, w25, w30
    mul w27, w27, w30
    mul w29, w29, w30
    umaddl x0, w11, w29, x0
    umaddl x1, w12, w29, x1
    umaddl x0, w13, w27, x0
    umaddl x2, w13, w29, x2
    umaddl x1, w14, w27, x1
    umaddl x3, w14, w29, x3
    umaddl x0, w15, w25, x0
    umaddl x2, w15, w27, x2
    umaddl x4, w15, w29, x4
    umaddl x1, w16, w25, x1
    umaddl x3, w16, w27, x3
    umaddl x5, w16, w29, x5
    umaddl x0, w17, w23, x0
    umaddl x2, w17, w25, x2
    umaddl x4, w17, w27, x4
    umaddl x6, w17, w29, x6
    umaddl x1, w18, w23, x1
    umaddl x3, w18, w25, x3
    umaddl x5, w18, w27, x5
    umaddl x7, w18, w29, x7
    add x19, x19, x19
    umaddl x0, w19, w21, x0
    umaddl x2, w19, w23, x2
    umaddl x4, w19, w25, x4
    umaddl x6, w19, w27, x6
    umaddl x8, w19, w29, x8

    // carry chain; the packed result lands in x20,x22,x24,x26,x28 (new Z2)
    add x1, x1, x0, lsr #26
    and x0, x0, #0x3ffffff
    add x2, x2, x1, lsr #25
    and x1, x1, #0x1ffffff
    add x3, x3, x2, lsr #26
    and x22, x2, #0x3ffffff
    add x4, x4, x3, lsr #25
    bfi x22, x3, #32, #25
    add x5, x5, x4, lsr #26
    and x24, x4, #0x3ffffff
    add x6, x6, x5, lsr #25
    bfi x24, x5, #32, #25
    add x7, x7, x6, lsr #26
    and x26, x6, #0x3ffffff
    add x8, x8, x7, lsr #25
    bfi x26, x7, #32, #25
    add x9, x9, x8, lsr #26
    and x28, x8, #0x3ffffff
    bic x10, x9, #0x1ffffff
    add x0, x0, x10, lsr #25
    add x0, x0, x10, lsr #24
    add x0, x0, x10, lsr #21
    bfi x28, x9, #32, #25
    add x1, x1, x0, lsr #26
    and x20, x0, #0x3ffffff
    bfi x20, x1, #32, #26

    // fetch the next scalar bit
    //   w2 = current index, w3 = previous shifted word (bit 0 = previous bit)
    //   subs sets N once the index drops below 0; nothing below writes NZCV
    //   again before the bpl at the bottom of the loop.
    //   At index -1 the load reads the word at sp+100 (inside the frame); that
    //   value is not used because the loop exits.
    ldr x2, [sp, #96]
    lsr x3, x2, #32
    add x4, sp, #104
    subs w0, w2, #1
    asr w1, w0, #5
    ldr w1, [x4, w1, SXTW #2]
    and w4, w0, #0x1f
    lsr w1, w1, w4
    stp w0, w1, [sp, #96]

    // <T5,T6> ← Mul(<T1,T2>,<T4,T3>)
    umull v0.2d, v10.2s, v20.2s
    umull v1.2d, v10.2s, v21.2s
    umlal v1.2d, v11.2s, v20.2s
    umull v2.2d, v10.2s, v22.2s
    umlal v2.2d, v12.2s, v20.2s
    umull v3.2d, v10.2s, v23.2s
    umlal v3.2d, v11.2s, v22.2s
    umlal v3.2d, v12.2s, v21.2s
    umlal v3.2d, v13.2s, v20.2s
    umull v4.2d, v10.2s, v24.2s
    umlal v4.2d, v12.2s, v22.2s
    umlal v4.2d, v14.2s, v20.2s
    umull v5.2d, v10.2s, v25.2s
    umlal v5.2d, v11.2s, v24.2s
    umlal v5.2d, v12.2s, v23.2s
    umlal v5.2d, v13.2s, v22.2s
    umlal v5.2d, v14.2s, v21.2s
    umlal v5.2d, v15.2s, v20.2s
    umull v6.2d, v10.2s, v26.2s
    umlal v6.2d, v12.2s, v24.2s
    umlal v6.2d, v14.2s, v22.2s
    umlal v6.2d, v16.2s, v20.2s
    umull v7.2d, v10.2s, v27.2s
    umlal v7.2d, v11.2s, v26.2s
    umlal v7.2d, v12.2s, v25.2s
    umlal v7.2d, v13.2s, v24.2s
    umlal v7.2d, v14.2s, v23.2s
    umlal v7.2d, v15.2s, v22.2s
    umlal v7.2d, v16.2s, v21.2s
    umlal v7.2d, v17.2s, v20.2s
    umull v8.2d, v10.2s, v28.2s
    umlal v8.2d, v12.2s, v26.2s
    umlal v8.2d, v14.2s, v24.2s
    umlal v8.2d, v16.2s, v22.2s
    umlal v8.2d, v18.2s, v20.2s
    umull v9.2d, v10.2s, v29.2s
    umlal v9.2d, v11.2s, v28.2s
    umlal v9.2d, v12.2s, v27.2s
    umlal v9.2d, v13.2s, v26.2s
    umlal v9.2d, v14.2s, v25.2s
    umlal v9.2d, v15.2s, v24.2s
    umlal v9.2d, v16.2s, v23.2s
    umlal v9.2d, v17.2s, v22.2s
    umlal v9.2d, v18.2s, v21.2s
    umlal v9.2d, v19.2s, v20.2s
    mul v22.2s, v22.2s, v31.2s
    mul v24.2s, v24.2s, v31.2s
    mul v26.2s, v26.2s, v31.2s
    mul v28.2s, v28.2s, v31.2s
    umlal v0.2d, v12.2s, v28.2s
    umlal v0.2d, v14.2s, v26.2s
    umlal v0.2d, v16.2s, v24.2s
    umlal v0.2d, v18.2s, v22.2s
    umlal v1.2d, v13.2s, v28.2s
    umlal v1.2d, v15.2s, v26.2s
    umlal v1.2d, v17.2s, v24.2s
    umlal v1.2d, v19.2s, v22.2s
    umlal v2.2d, v14.2s, v28.2s
    umlal v2.2d, v16.2s, v26.2s
    umlal v2.2d, v18.2s, v24.2s
    umlal v3.2d, v15.2s, v28.2s
    umlal v3.2d, v17.2s, v26.2s
    umlal v3.2d, v19.2s, v24.2s
    umlal v4.2d, v16.2s, v28.2s
    umlal v4.2d, v18.2s, v26.2s
    umlal v5.2d, v17.2s, v28.2s
    umlal v5.2d, v19.2s, v26.2s
    umlal v6.2d, v18.2s, v28.2s
    umlal v7.2d, v19.2s, v28.2s
    shl v11.2s, v11.2s, #1
    shl v13.2s, v13.2s, #1
    shl v15.2s, v15.2s, #1
    shl v17.2s, v17.2s, #1
    shl v19.2s, v19.2s, #1
    umlal v2.2d, v11.2s, v21.2s
    umlal v4.2d, v11.2s, v23.2s
    umlal v4.2d, v13.2s, v21.2s
    umlal v6.2d, v11.2s, v25.2s
    umlal v6.2d, v13.2s, v23.2s
    umlal v6.2d, v15.2s, v21.2s
    umlal v8.2d, v11.2s, v27.2s
    umlal v8.2d, v13.2s, v25.2s
    umlal v8.2d, v15.2s, v23.2s
    umlal v8.2d, v17.2s, v21.2s
    mul v21.2s, v21.2s, v31.2s
    mul v23.2s, v23.2s, v31.2s
    mul v25.2s, v25.2s, v31.2s
    mul v27.2s, v27.2s, v31.2s
    mul v29.2s, v29.2s, v31.2s
    umlal v0.2d, v11.2s, v29.2s
    umlal v0.2d, v13.2s, v27.2s
    umlal v0.2d, v15.2s, v25.2s
    umlal v0.2d, v17.2s, v23.2s
    umlal v0.2d, v19.2s, v21.2s
    umlal v1.2d, v12.2s, v29.2s
    umlal v1.2d, v14.2s, v27.2s
    umlal v1.2d, v16.2s, v25.2s
    umlal v1.2d, v18.2s, v23.2s
    umlal v2.2d, v13.2s, v29.2s
    umlal v2.2d, v15.2s, v27.2s
    umlal v2.2d, v17.2s, v25.2s
    umlal v2.2d, v19.2s, v23.2s
    umlal v3.2d, v14.2s, v29.2s
    umlal v3.2d, v16.2s, v27.2s
    umlal v3.2d, v18.2s, v25.2s
    umlal v4.2d, v15.2s, v29.2s
    umlal v4.2d, v17.2s, v27.2s
    umlal v4.2d, v19.2s, v25.2s
    umlal v5.2d, v16.2s, v29.2s
    umlal v5.2d, v18.2s, v27.2s
    umlal v6.2d, v17.2s, v29.2s
    umlal v6.2d, v19.2s, v27.2s
    umlal v7.2d, v18.2s, v29.2s
    umlal v8.2d, v19.2s, v29.2s

    // carry chain; v25 = 2^25 - 1 (mask for the odd limbs)
    ushr v25.2d, v30.2d, #1
    usra v1.2d, v0.2d, #26
    and v0.16b, v0.16b, v30.16b
    usra v2.2d, v1.2d, #25
    and v1.16b, v1.16b, v25.16b
    usra v3.2d, v2.2d, #26
    and v2.16b, v2.16b, v30.16b
    usra v4.2d, v3.2d, #25
    and v3.16b, v3.16b, v25.16b
    usra v5.2d, v4.2d, #26
    and v4.16b, v4.16b, v30.16b
    usra v6.2d, v5.2d, #25
    and v5.16b, v5.16b, v25.16b
    usra v7.2d, v6.2d, #26
    and v6.16b, v6.16b, v30.16b
    usra v8.2d, v7.2d, #25
    and v7.16b, v7.16b, v25.16b
    usra v9.2d, v8.2d, #26
    and v8.16b, v8.16b, v30.16b
    bic v10.16b, v9.16b, v25.16b
    usra v0.2d, v10.2d, #25
    usra v0.2d, v10.2d, #24
    usra v0.2d, v10.2d, #21
    and v9.16b, v9.16b, v25.16b
    usra v1.2d, v0.2d, #26
    and v0.16b, v0.16b, v30.16b

    // X3 ← T5 + T6, Z3 ← T5 - T6
    uzp1 v0.4s, v0.4s, v1.4s
    uzp1 v1.4s, v2.4s, v3.4s
    uzp1 v2.4s, v0.4s, v1.4s
    uzp2 v3.4s, v0.4s, v1.4s
    uzp1 v4.4s, v4.4s, v5.4s
    uzp1 v5.4s, v6.4s, v7.4s
    uzp1 v6.4s, v4.4s, v5.4s
    uzp2 v7.4s, v4.4s, v5.4s
    trn1 v8.4s, v8.4s, v9.4s
    mov v9.d[0], v8.d[1]

    // X3 ← T5 + T6
    add v1.4s, v2.4s, v3.4s
    add v4.4s, v6.4s, v7.4s
    add v5.4s, v8.4s, v9.4s

    // Z3 ← T5 - T6
    // v10 = 2p limb pairs from sp+0; for limbs 0..3 the low byte is patched
    // to 0xda so that limb 0 becomes 0x07ffffda
    ld1r {v10.2d}, [sp]
    add v6.4s, v6.4s, v10.4s
    add v8.4s, v8.4s, v10.4s
    ldr b11, [sp, #152]
    mov v10.b[0], v11.b[0]
    add v2.4s, v2.4s, v10.4s
    sub v2.4s, v2.4s, v3.4s
    sub v6.4s, v6.4s, v7.4s
    sub v8.4s, v8.4s, v9.4s

    // <Z3,X3> ← <T5 - T6,T5 + T6>
    zip1 v10.4s, v2.4s, v1.4s
    zip2 v12.4s, v2.4s, v1.4s
    zip1 v14.4s, v6.4s, v4.4s
    zip2 v16.4s, v6.4s, v4.4s
    zip1 v18.4s, v8.4s, v5.4s
    mov v11.d[0], v10.d[1]
    mov v13.d[0], v12.d[1]
    mov v15.d[0], v14.d[1]
    mov v17.d[0], v16.d[1]
    mov v19.d[0], v18.d[1]

    // <Z3,X3> ← Sqr(<Z3,X3>)
    shl v0.2d, v31.2d, #1               // 38 in each 32-bit lane used below
    mul v20.2s, v16.2s, v31.2s
    mul v21.2s, v18.2s, v31.2s
    mul v22.2s, v15.2s, v0.2s
    mul v23.2s, v17.2s, v0.2s
    mul v24.2s, v19.2s, v0.2s
    shl v25.2s, v10.2s, #1
    shl v26.2s, v11.2s, #1
    shl v27.2s, v12.2s, #1
    shl v28.2s, v13.2s, #1
    shl v29.2s, v14.2s, #1
    shl v7.2s, v15.2s, #1
    shl v8.2s, v16.2s, #1
    shl v9.2s, v17.2s, #1
    umull v0.2d, v10.2s, v10.2s
    umlal v0.2d, v26.2s, v24.2s
    umlal v0.2d, v27.2s, v21.2s
    umlal v0.2d, v28.2s, v23.2s
    umlal v0.2d, v29.2s, v20.2s
    umlal v0.2d, v22.2s, v15.2s
    umull v1.2d, v25.2s, v11.2s
    umlal v1.2d, v24.2s, v12.2s
    umlal v1.2d, v28.2s, v21.2s
    umlal v1.2d, v23.2s, v14.2s
    umlal v1.2d, v20.2s, v7.2s
    umull v2.2d, v25.2s, v12.2s
    umlal v2.2d, v26.2s, v11.2s
    umlal v2.2d, v28.2s, v24.2s
    umlal v2.2d, v29.2s, v21.2s
    umlal v2.2d, v23.2s, v7.2s
    umlal v2.2d, v20.2s, v16.2s
    umull v3.2d, v25.2s, v13.2s
    umlal v3.2d, v26.2s, v12.2s
    umlal v3.2d, v24.2s, v14.2s
    umlal v3.2d, v21.2s, v7.2s
    umlal v3.2d, v23.2s, v16.2s
    umull v4.2d, v25.2s, v14.2s
    umlal v4.2d, v26.2s, v28.2s
    umlal v4.2d, v12.2s, v12.2s
    umlal v4.2d, v24.2s, v7.2s
    umlal v4.2d, v8.2s, v21.2s
    umlal v4.2d, v23.2s, v17.2s
    umull v5.2d, v25.2s, v15.2s
    umlal v5.2d, v26.2s, v14.2s
    umlal v5.2d, v27.2s, v13.2s
    umlal v5.2d, v24.2s, v16.2s
    umlal v5.2d, v21.2s, v9.2s
    umull v6.2d, v25.2s, v16.2s
    umlal v6.2d, v26.2s, v7.2s
    umlal v6.2d, v27.2s, v14.2s
    umlal v6.2d, v28.2s, v13.2s
    umlal v6.2d, v24.2s, v9.2s
    umlal v6.2d, v21.2s, v18.2s
    // v8, v7, v9 are overwritten in this order because each is still needed
    // as a doubled-limb input until its own accumulator is started
    umull v8.2d, v25.2s, v18.2s
    umlal v8.2d, v26.2s, v9.2s
    umlal v8.2d, v27.2s, v16.2s
    umlal v8.2d, v28.2s, v7.2s
    umlal v8.2d, v14.2s, v14.2s
    umlal v8.2d, v24.2s, v19.2s
    umull v7.2d, v25.2s, v17.2s
    umlal v7.2d, v26.2s, v16.2s
    umlal v7.2d, v27.2s, v15.2s
    umlal v7.2d, v28.2s, v14.2s
    umlal v7.2d, v24.2s, v18.2s
    umull v9.2d, v25.2s, v19.2s
    umlal v9.2d, v26.2s, v18.2s
    umlal v9.2d, v27.2s, v17.2s
    umlal v9.2d, v28.2s, v16.2s
    umlal v9.2d, v29.2s, v15.2s

    // carry chain; v12 = 2^25 - 1, results land in v20..v29
    ushr v12.2d, v30.2d, #1
    usra v1.2d, v0.2d, #26
    and v0.16b, v0.16b, v30.16b
    usra v2.2d, v1.2d, #25
    and v21.16b, v1.16b, v12.16b
    usra v3.2d, v2.2d, #26
    and v22.16b, v2.16b, v30.16b
    usra v4.2d, v3.2d, #25
    and v23.16b, v3.16b, v12.16b
    usra v5.2d, v4.2d, #26
    and v24.16b, v4.16b, v30.16b
    usra v6.2d, v5.2d, #25
    and v25.16b, v5.16b, v12.16b
    usra v7.2d, v6.2d, #26
    and v26.16b, v6.16b, v30.16b
    usra v8.2d, v7.2d, #25
    and v27.16b, v7.16b, v12.16b
    usra v9.2d, v8.2d, #26
    and v28.16b, v8.16b, v30.16b
    bic v10.16b, v9.16b, v12.16b
    usra v0.2d, v10.2d, #25
    usra v0.2d, v10.2d, #24
    usra v0.2d, v10.2d, #21
    and v29.16b, v9.16b, v12.16b
    usra v21.2d, v0.2d, #26
    and v20.16b, v0.16b, v30.16b

    // pack the upper 64-bit lane of the square and park it at sp+256;
    // it is reloaded as X3 at the bottom of the loop
    zip2 v0.4s, v20.4s, v21.4s
    zip2 v1.4s, v22.4s, v23.4s
    zip2 v2.4s, v24.4s, v25.4s
    zip2 v3.4s, v26.4s, v27.4s
    zip2 v4.4s, v28.4s, v29.4s
    stp d0, d1, [sp, #256]
    stp d2, d3, [sp, #272]
    str d4, [sp, #288]

    // inputs <Z3,T1> and <X1,T2>
    // x10 → T1^2 (sp+208), x11 → X1 (sp+352), x12 → T2^2 (sp+304)
    add x10, sp, #208
    add x11, sp, #352
    add x12, sp, #304
    ld2 {v20.s, v21.s}[1], [x10], #8
    ld2 {v10.s, v11.s}[0], [x11], #8
    ld2 {v10.s, v11.s}[1], [x12], #8
    ld2 {v22.s, v23.s}[1], [x10], #8
    ld2 {v12.s, v13.s}[0], [x11], #8
    ld2 {v12.s, v13.s}[1], [x12], #8
    ld2 {v24.s, v25.s}[1], [x10], #8
    ld2 {v14.s, v15.s}[0], [x11], #8
    ld2 {v14.s, v15.s}[1], [x12], #8
    ld2 {v26.s, v27.s}[1], [x10], #8
    ld2 {v16.s, v17.s}[0], [x11], #8
    ld2 {v16.s, v17.s}[1], [x12], #8
    ld2 {v28.s, v29.s}[1], [x10], #8
    ld2 {v18.s, v19.s}[0], [x11], #8
    ld2 {v18.s, v19.s}[1], [x12], #8

    // <Z3,X2> ← Mul(<Z3,T1>,<X1,T2>)
    umull v0.2d, v10.2s, v20.2s
    umull v1.2d, v10.2s, v21.2s
    umlal v1.2d, v11.2s, v20.2s
    umull v2.2d, v10.2s, v22.2s
    umlal v2.2d, v12.2s, v20.2s
    umull v3.2d, v10.2s, v23.2s
    umlal v3.2d, v11.2s, v22.2s
    umlal v3.2d, v12.2s, v21.2s
    umlal v3.2d, v13.2s, v20.2s
    umull v4.2d, v10.2s, v24.2s
    umlal v4.2d, v12.2s, v22.2s
    umlal v4.2d, v14.2s, v20.2s
    umull v5.2d, v10.2s, v25.2s
    umlal v5.2d, v11.2s, v24.2s
    umlal v5.2d, v12.2s, v23.2s
    umlal v5.2d, v13.2s, v22.2s
    umlal v5.2d, v14.2s, v21.2s
    umlal v5.2d, v15.2s, v20.2s
    umull v6.2d, v10.2s, v26.2s
    umlal v6.2d, v12.2s, v24.2s
    umlal v6.2d, v14.2s, v22.2s
    umlal v6.2d, v16.2s, v20.2s
    umull v7.2d, v10.2s, v27.2s
    umlal v7.2d, v11.2s, v26.2s
    umlal v7.2d, v12.2s, v25.2s
    umlal v7.2d, v13.2s, v24.2s
    umlal v7.2d, v14.2s, v23.2s
    umlal v7.2d, v15.2s, v22.2s
    umlal v7.2d, v16.2s, v21.2s
    umlal v7.2d, v17.2s, v20.2s
    umull v8.2d, v10.2s, v28.2s
    umlal v8.2d, v12.2s, v26.2s
    umlal v8.2d, v14.2s, v24.2s
    umlal v8.2d, v16.2s, v22.2s
    umlal v8.2d, v18.2s, v20.2s
    umull v9.2d, v10.2s, v29.2s
    umlal v9.2d, v11.2s, v28.2s
    umlal v9.2d, v12.2s, v27.2s
    umlal v9.2d, v13.2s, v26.2s
    umlal v9.2d, v14.2s, v25.2s
    umlal v9.2d, v15.2s, v24.2s
    umlal v9.2d, v16.2s, v23.2s
    umlal v9.2d, v17.2s, v22.2s
    umlal v9.2d, v18.2s, v21.2s
    umlal v9.2d, v19.2s, v20.2s
    mul v22.2s, v22.2s, v31.2s
    mul v24.2s, v24.2s, v31.2s
    mul v26.2s, v26.2s, v31.2s
    mul v28.2s, v28.2s, v31.2s
    umlal v0.2d, v12.2s, v28.2s
    umlal v0.2d, v14.2s, v26.2s
    umlal v0.2d, v16.2s, v24.2s
    umlal v0.2d, v18.2s, v22.2s
    umlal v1.2d, v13.2s, v28.2s
    umlal v1.2d, v15.2s, v26.2s
    umlal v1.2d, v17.2s, v24.2s
    umlal v1.2d, v19.2s, v22.2s
    umlal v2.2d, v14.2s, v28.2s
    umlal v2.2d, v16.2s, v26.2s
    umlal v2.2d, v18.2s, v24.2s
    umlal v3.2d, v15.2s, v28.2s
    umlal v3.2d, v17.2s, v26.2s
    umlal v3.2d, v19.2s, v24.2s
    umlal v4.2d, v16.2s, v28.2s
    umlal v4.2d, v18.2s, v26.2s
    umlal v5.2d, v17.2s, v28.2s
    umlal v5.2d, v19.2s, v26.2s
    umlal v6.2d, v18.2s, v28.2s
    umlal v7.2d, v19.2s, v28.2s
    shl v11.2s, v11.2s, #1
    shl v13.2s, v13.2s, #1
    shl v15.2s, v15.2s, #1
    shl v17.2s, v17.2s, #1
    shl v19.2s, v19.2s, #1
    umlal v2.2d, v11.2s, v21.2s
    umlal v4.2d, v11.2s, v23.2s
    umlal v4.2d, v13.2s, v21.2s
    umlal v6.2d, v11.2s, v25.2s
    umlal v6.2d, v13.2s, v23.2s
    umlal v6.2d, v15.2s, v21.2s
    umlal v8.2d, v11.2s, v27.2s
    umlal v8.2d, v13.2s, v25.2s
    umlal v8.2d, v15.2s, v23.2s
    umlal v8.2d, v17.2s, v21.2s
    mul v21.2s, v21.2s, v31.2s
    mul v23.2s, v23.2s, v31.2s
    mul v25.2s, v25.2s, v31.2s
    mul v27.2s, v27.2s, v31.2s
    mul v29.2s, v29.2s, v31.2s
    umlal v0.2d, v11.2s, v29.2s
    umlal v0.2d, v13.2s, v27.2s
    umlal v0.2d, v15.2s, v25.2s
    umlal v0.2d, v17.2s, v23.2s
    umlal v0.2d, v19.2s, v21.2s
    umlal v1.2d, v12.2s, v29.2s
    umlal v1.2d, v14.2s, v27.2s
    umlal v1.2d, v16.2s, v25.2s
    umlal v1.2d, v18.2s, v23.2s
    umlal v2.2d, v13.2s, v29.2s
    umlal v2.2d, v15.2s, v27.2s
    umlal v2.2d, v17.2s, v25.2s
    umlal v2.2d, v19.2s, v23.2s
    umlal v3.2d, v14.2s, v29.2s
    umlal v3.2d, v16.2s, v27.2s
    umlal v3.2d, v18.2s, v25.2s
    umlal v4.2d, v15.2s, v29.2s
    umlal v4.2d, v17.2s, v27.2s
    umlal v4.2d, v19.2s, v25.2s
    umlal v5.2d, v16.2s, v29.2s
    umlal v5.2d, v18.2s, v27.2s
    umlal v6.2d, v17.2s, v29.2s
    umlal v6.2d, v19.2s, v27.2s
    umlal v7.2d, v18.2s, v29.2s
    umlal v8.2d, v19.2s, v29.2s

    // carry chain; v25 = 2^25 - 1
    ushr v25.2d, v30.2d, #1
    usra v1.2d, v0.2d, #26
    and v0.16b, v0.16b, v30.16b
    usra v2.2d, v1.2d, #25
    and v1.16b, v1.16b, v25.16b
    usra v3.2d, v2.2d, #26
    and v2.16b, v2.16b, v30.16b
    usra v4.2d, v3.2d, #25
    and v3.16b, v3.16b, v25.16b
    usra v5.2d, v4.2d, #26
    and v4.16b, v4.16b, v30.16b
    usra v6.2d, v5.2d, #25
    and v5.16b, v5.16b, v25.16b
    usra v7.2d, v6.2d, #26
    and v6.16b, v6.16b, v30.16b
    usra v8.2d, v7.2d, #25
    and v7.16b, v7.16b, v25.16b
    usra v9.2d, v8.2d, #26
    and v8.16b, v8.16b, v30.16b
    bic v10.16b, v9.16b, v25.16b
    usra v0.2d, v10.2d, #25
    usra v0.2d, v10.2d, #24
    usra v0.2d, v10.2d, #21
    and v9.16b, v9.16b, v25.16b
    usra v1.2d, v0.2d, #26
    and v0.16b, v0.16b, v30.16b

    // Z3
    zip1 v20.2s, v0.2s, v1.2s
    zip1 v22.2s, v2.2s, v3.2s
    zip1 v24.2s, v4.2s, v5.2s
    zip1 v26.2s, v6.2s, v7.2s
    zip1 v28.2s, v8.2s, v9.2s

    // X2
    zip2 v11.4s, v0.4s, v1.4s
    zip2 v13.4s, v2.4s, v3.4s
    zip2 v15.4s, v4.4s, v5.4s
    zip2 v17.4s, v6.4s, v7.4s
    zip2 v19.4s, v8.4s, v9.4s

    // bit 0 of w1 = new bit XOR previous bit; tested by tst at .L0.
    // eor (not eors) is used so the N flag from the subs above survives.
    eor w1, w1, w3

    // Z2
    mov v10.d[0], x20
    mov v12.d[0], x22
    mov v14.d[0], x24
    mov v16.d[0], x26
    mov v18.d[0], x28

    // X3
    ldp d21, d23, [sp, #256]
    ldp d25, d27, [sp, #272]
    ldr d29, [sp, #288]

    // N was set by "subs w0, w2, #1": keep looping while the bit index >= 0
    bpl .L0

    ldr x0, [sp, #144]

    // X2
    stp d11, d13, [x0, #0]
    stp d15, d17, [x0, #16]
    str d19, [x0, #32]

    // Z2
    stp d10, d12, [x0, #40]
    stp d14, d16, [x0, #56]
    str d18, [x0, #72]

    // epilogue: restore callee-saved registers and release the frame
    add x10, sp, #392
    ldp d14, d15, [x10, #144]
    ldp d12, d13, [x10, #128]
    ldp d10, d11, [x10, #112]
    ldp d8, d9, [x10, #96]
    ldp x29, x30, [x10, #80]
    ldp x27, x28, [x10, #64]
    ldp x25, x26, [x10, #48]
    ldp x23, x24, [x10, #32]
    ldp x21, x22, [x10, #16]
    ldp x19, x20, [x10, #0]
    add sp, sp, #560
    ret

#endif

#ifdef IMPL2 .p2align 4 ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(mladder) .globl _CRYPTO_SHARED_NAMESPACE(mladder) ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(mladder) .globl CRYPTO_SHARED_NAMESPACE(mladder) _CRYPTO_SHARED_NAMESPACE(mladder): CRYPTO_SHARED_NAMESPACE(mladder): sub sp, sp, #448 stp x19, x20, [sp, #16] stp x21, x22, [sp, #32] stp x23, x24, [sp, #48] stp x25, x26, [sp, #64] stp x27, x28, [sp, #80] stp x29, x30, [sp, #96] stp d8, d9, [sp, #112] stp d10, d11, [sp, #128] stp d12, d13, [sp, #144] stp d14, d15, [sp, #160] // clamp scalar ldr x3, [x2, #0] and x3, x3, #0xfffffffffffffff8 str x3, [x2, #0] ldr x4, [x2, #24] orr x4, x4, #0x4000000000000000 str x4, [x2, #24] stp x0, x2, [sp, #176] // load point ldp x4, x5, [x1, #0] ldp x6, x7, [x1, #16] // X1 and w8, w4, #0x3ffffff ubfx x9, x4, #26, #25 lsr x10, x4, #51 orr w10, w10, w5, lsl #13 and w10, w10, #0x3ffffff ubfx x11, x5, #13, #25 lsr x12, x5, #38 and w13, w6, #0x1ffffff ubfx x14, x6, #25, #26 lsr x15, x6, #51 orr w15, w15, w7, lsl #13 and w15, w15, #0x1ffffff ubfx x16, x7, #12, #26 ubfx x17, x7, #38, #25 add x0, sp, #400 stp w8, w9, [x0, #0] stp w10, w11, [x0, #8] stp w12, w13, [x0, #16] stp w14, w15, [x0, #24] stp w16, w17, [x0, #32] mov x20, #1 // X2 ← 1 mov v11.d[0], x20 mov v13.d[0], xzr mov v15.d[0], xzr mov v17.d[0], xzr mov
v19.d[0], xzr // Z2 ← 0 mov v10.d[0], xzr mov v12.d[0], xzr mov v14.d[0], xzr mov v16.d[0], xzr mov v18.d[0], xzr // X3 ← X1 mov v21.s[0], w8 mov v21.s[1], w9 mov v23.s[0], w10 mov v23.s[1], w11 mov v25.s[0], w12 mov v25.s[1], w13 mov v27.s[0], w14 mov v27.s[1], w15 mov v29.s[0], w16 mov v29.s[1], w17 // Z3 ← 1 mov v20.d[0], x20 mov v22.d[0], xzr mov v24.d[0], xzr mov v26.d[0], xzr mov v28.d[0], xzr mov w30, #19 dup v31.2s, w30 mov w29, #0x3ffffff dup v30.2d, x29 mov x5, #254 stp x5, xzr, [sp, #192] movz x1, #0xffda movk x1, #0x07ff, lsl 16 movk x1, #0xfffe, lsl 32 movk x1, #0x03ff, lsl 48 movz x2, #0xfffe movk x2, #0x07ff, lsl 16 movk x2, #0xfffe, lsl 32 movk x2, #0x03ff, lsl 48 stp x2, x1, [sp, #0] mov w1, #0xda strb w1, [sp, #440] // Montgomery ladder loop .L1: ldr d8, [sp, #8] ldr d9, [sp, #0] // T1 = X2 + Z2, T2 = X2 - Z2 add v0.2s, v11.2s, v10.2s add v1.2s, v13.2s, v12.2s add v2.2s, v15.2s, v14.2s add v3.2s, v17.2s, v16.2s add v4.2s, v19.2s, v18.2s add v11.2s, v8.2s, v11.2s add v13.2s, v9.2s, v13.2s add v15.2s, v9.2s, v15.2s add v17.2s, v9.2s, v17.2s add v19.2s, v9.2s, v19.2s sub v11.2s, v11.2s, v10.2s sub v13.2s, v13.2s, v12.2s sub v15.2s, v15.2s, v14.2s sub v17.2s, v17.2s, v16.2s sub v19.2s, v19.2s, v18.2s // T4 = X3 - Z3, T3 = X3 + Z3 add v10.2s, v8.2s, v21.2s add v12.2s, v9.2s, v23.2s add v14.2s, v9.2s, v25.2s add v16.2s, v9.2s, v27.2s add v18.2s, v9.2s, v29.2s add v5.2s, v21.2s, v20.2s add v6.2s, v23.2s, v22.2s add v7.2s, v25.2s, v24.2s add v8.2s, v27.2s, v26.2s add v9.2s, v29.2s, v28.2s sub v21.2s, v10.2s, v20.2s sub v23.2s, v12.2s, v22.2s sub v25.2s, v14.2s, v24.2s sub v27.2s, v16.2s, v26.2s sub v29.2s, v18.2s, v28.2s // get current scalar bit ldr x2, [sp, #184] ldp x5, x6, [sp, #192] bic x3, x5, #0x3f lsr x3, x3, #3 ldr x4, [x2, x3] lsr x4, x4, x5 and w4, w4, #1 // compare current with previous scalar bit cmp w4, w6 // CSelect(T1,T3,b) fcsel d10, d5, d0, ne fcsel d12, d6, d1, ne fcsel d14, d7, d2, ne fcsel d16, d8, d3, ne fcsel d18, d9, d4, ne // save 
T1 resulted from CSelect mov x10, v10.d[0] mov x12, v12.d[0] mov x14, v14.d[0] mov x16, v16.d[0] mov x18, v18.d[0] lsr x11, x10, #32 lsr x13, x12, #32 lsr x15, x14, #32 lsr x17, x16, #32 lsr x19, x18, #32 // CSelect(T2,T4,b) fcsel d20, d21, d11, ne fcsel d22, d23, d13, ne fcsel d24, d25, d15, ne fcsel d26, d27, d17, ne fcsel d28, d29, d19, ne // save T2 resulted from CSelect stp d20, d22, [sp, #208] stp d24, d26, [sp, #224] str d28, [sp, #240] // update previous scalar bit subs w5, w5, #1 stp x5, x4, [sp, #192] // <T1,T2> = <X2 + Z2,X2 - Z2> trn1 v10.2s, v0.2s, v11.2s trn2 v11.2s, v0.2s, v11.2s trn1 v12.2s, v1.2s, v13.2s trn2 v13.2s, v1.2s, v13.2s trn1 v14.2s, v2.2s, v15.2s trn2 v15.2s, v2.2s, v15.2s trn1 v16.2s, v3.2s, v17.2s trn2 v17.2s, v3.2s, v17.2s trn1 v18.2s, v4.2s, v19.2s trn2 v19.2s, v4.2s, v19.2s // <T4,T3> = <X3 - Z3,X3 + Z3> trn1 v20.2s, v21.2s, v5.2s trn2 v21.2s, v21.2s, v5.2s trn1 v22.2s, v23.2s, v6.2s trn2 v23.2s, v23.2s, v6.2s trn1 v24.2s, v25.2s, v7.2s trn2 v25.2s, v25.2s, v7.2s trn1 v26.2s, v27.2s, v8.2s trn2 v27.2s, v27.2s, v8.2s trn1 v28.2s, v29.2s, v9.2s trn2 v29.2s, v29.2s, v9.2s // <T5,T6> ← Mul(<T1,T2>,<T4,T3>) umull v0.2d, v10.2s, v20.2s umull v1.2d, v10.2s, v21.2s umlal v1.2d, v11.2s, v20.2s umull v2.2d, v10.2s, v22.2s umlal v2.2d, v12.2s, v20.2s umull v3.2d, v10.2s, v23.2s umlal v3.2d, v11.2s, v22.2s umlal v3.2d, v12.2s, v21.2s umlal v3.2d, v13.2s, v20.2s umull v4.2d, v10.2s, v24.2s umlal v4.2d, v12.2s, v22.2s umlal v4.2d, v14.2s, v20.2s umull v5.2d, v10.2s, v25.2s umlal v5.2d, v11.2s, v24.2s umlal v5.2d, v12.2s, v23.2s umlal v5.2d, v13.2s, v22.2s umlal v5.2d, v14.2s, v21.2s umlal v5.2d, v15.2s, v20.2s umull v6.2d, v10.2s, v26.2s umlal v6.2d, v12.2s, v24.2s umlal v6.2d, v14.2s, v22.2s umlal v6.2d, v16.2s, v20.2s umull v7.2d, v10.2s, v27.2s umlal v7.2d, v11.2s, v26.2s umlal v7.2d, v12.2s, v25.2s umlal v7.2d, v13.2s, v24.2s umlal v7.2d, v14.2s, v23.2s umlal v7.2d, v15.2s, v22.2s umlal v7.2d, v16.2s, v21.2s umlal v7.2d, v17.2s, v20.2s umull 
v8.2d, v10.2s, v28.2s umlal v8.2d, v12.2s, v26.2s umlal v8.2d, v14.2s, v24.2s umlal v8.2d, v16.2s, v22.2s umlal v8.2d, v18.2s, v20.2s umull v9.2d, v10.2s, v29.2s umlal v9.2d, v11.2s, v28.2s umlal v9.2d, v12.2s, v27.2s umlal v9.2d, v13.2s, v26.2s umlal v9.2d, v14.2s, v25.2s umlal v9.2d, v15.2s, v24.2s umlal v9.2d, v16.2s, v23.2s umlal v9.2d, v17.2s, v22.2s umlal v9.2d, v18.2s, v21.2s umlal v9.2d, v19.2s, v20.2s mul v22.2s, v22.2s, v31.2s mul v24.2s, v24.2s, v31.2s mul v26.2s, v26.2s, v31.2s mul v28.2s, v28.2s, v31.2s umlal v0.2d, v12.2s, v28.2s umlal v0.2d, v14.2s, v26.2s umlal v0.2d, v16.2s, v24.2s umlal v0.2d, v18.2s, v22.2s umlal v1.2d, v13.2s, v28.2s umlal v1.2d, v15.2s, v26.2s umlal v1.2d, v17.2s, v24.2s umlal v1.2d, v19.2s, v22.2s umlal v2.2d, v14.2s, v28.2s umlal v2.2d, v16.2s, v26.2s umlal v2.2d, v18.2s, v24.2s umlal v3.2d, v15.2s, v28.2s umlal v3.2d, v17.2s, v26.2s umlal v3.2d, v19.2s, v24.2s umlal v4.2d, v16.2s, v28.2s umlal v4.2d, v18.2s, v26.2s umlal v5.2d, v17.2s, v28.2s umlal v5.2d, v19.2s, v26.2s umlal v6.2d, v18.2s, v28.2s umlal v7.2d, v19.2s, v28.2s shl v11.2s, v11.2s, #1 shl v13.2s, v13.2s, #1 shl v15.2s, v15.2s, #1 shl v17.2s, v17.2s, #1 shl v19.2s, v19.2s, #1 umlal v2.2d, v11.2s, v21.2s umlal v4.2d, v11.2s, v23.2s umlal v4.2d, v13.2s, v21.2s umlal v6.2d, v11.2s, v25.2s umlal v6.2d, v13.2s, v23.2s umlal v6.2d, v15.2s, v21.2s umlal v8.2d, v11.2s, v27.2s umlal v8.2d, v13.2s, v25.2s umlal v8.2d, v15.2s, v23.2s umlal v8.2d, v17.2s, v21.2s mul v21.2s, v21.2s, v31.2s mul v23.2s, v23.2s, v31.2s mul v25.2s, v25.2s, v31.2s mul v27.2s, v27.2s, v31.2s mul v29.2s, v29.2s, v31.2s umlal v0.2d, v11.2s, v29.2s umlal v0.2d, v13.2s, v27.2s umlal v0.2d, v15.2s, v25.2s umlal v0.2d, v17.2s, v23.2s umlal v0.2d, v19.2s, v21.2s umlal v1.2d, v12.2s, v29.2s umlal v1.2d, v14.2s, v27.2s umlal v1.2d, v16.2s, v25.2s umlal v1.2d, v18.2s, v23.2s umlal v2.2d, v13.2s, v29.2s umlal v2.2d, v15.2s, v27.2s umlal v2.2d, v17.2s, v25.2s umlal v2.2d, v19.2s, v23.2s umlal v3.2d, v14.2s, 
v29.2s umlal v3.2d, v16.2s, v27.2s umlal v3.2d, v18.2s, v25.2s umlal v4.2d, v15.2s, v29.2s umlal v4.2d, v17.2s, v27.2s umlal v4.2d, v19.2s, v25.2s umlal v5.2d, v16.2s, v29.2s umlal v5.2d, v18.2s, v27.2s umlal v6.2d, v17.2s, v29.2s umlal v6.2d, v19.2s, v27.2s umlal v7.2d, v18.2s, v29.2s umlal v8.2d, v19.2s, v29.2s ushr v25.2d, v30.2d, #1 usra v6.2d, v5.2d, #25 and v5.16b, v5.16b, v25.16b usra v1.2d, v0.2d, #26 and v0.16b, v0.16b, v30.16b usra v7.2d, v6.2d, #26 and v6.16b, v6.16b, v30.16b usra v2.2d, v1.2d, #25 and v1.16b, v1.16b, v25.16b usra v8.2d, v7.2d, #25 and v7.16b, v7.16b, v25.16b usra v3.2d, v2.2d, #26 and v2.16b, v2.16b, v30.16b usra v9.2d, v8.2d, #26 and v8.16b, v8.16b, v30.16b usra v4.2d, v3.2d, #25 and v3.16b, v3.16b, v25.16b bic v10.16b, v9.16b, v25.16b usra v0.2d, v10.2d, #25 usra v0.2d, v10.2d, #24 usra v0.2d, v10.2d, #21 and v9.16b, v9.16b, v25.16b usra v5.2d, v4.2d, #26 and v4.16b, v4.16b, v30.16b usra v1.2d, v0.2d, #26 and v0.16b, v0.16b, v30.16b usra v6.2d, v5.2d, #25 and v5.16b, v5.16b, v25.16b // T1 ← T1^2 mul w20, w16, w30 mul w21, w18, w30 add w25, w30, w30 mul w22, w15, w25 mul w23, w17, w25 mul w24, w19, w25 umull x0, w10, w10 add w25, w10, w10 umull x1, w25, w11 umull x2, w25, w12 umull x3, w25, w13 umull x4, w25, w14 umull x5, w25, w15 umull x6, w25, w16 umull x7, w25, w17 umull x8, w25, w18 umaddl x4, w12, w12, x4 umaddl x8, w14, w14, x8 add w10, w15, w15 umaddl x1, w20, w10, x1 umaddl x2, w20, w16, x2 add w9, w17, w17 umaddl x3, w21, w10, x3 umaddl x5, w21, w9, x5 umaddl x6, w21, w18, x6 umaddl x0, w22, w15, x0 umaddl x1, w23, w14, x1 umaddl x2, w23, w10, x2 umaddl x3, w23, w16, x3 umaddl x4, w23, w17, x4 umaddl x1, w24, w12, x1 umaddl x3, w24, w14, x3 umaddl x4, w24, w10, x4 umaddl x5, w24, w16, x5 umaddl x6, w24, w9, x6 umaddl x7, w24, w18, x7 umaddl x8, w24, w19, x8 add w26, w11, w11 umaddl x0, w26, w24, x0 umaddl x2, w26, w11, x2 umaddl x3, w26, w12, x3 umaddl x5, w26, w14, x5 umaddl x6, w26, w10, x6 umaddl x7, w26, w16, x7 umaddl 
x8, w26, w9, x8 umull x9, w25, w19 umaddl x9, w26, w18, x9 add w27, w12, w12 umaddl x0, w27, w21, x0 umaddl x5, w27, w13, x5 umaddl x8, w27, w16, x8 umaddl x6, w27, w14, x6 umaddl x7, w27, w15, x7 umaddl x9, w27, w17, x9 add w28, w13, w13 umaddl x4, w26, w28, x4 umaddl x0, w28, w23, x0 umaddl x1, w28, w21, x1 umaddl x2, w28, w24, x2 umaddl x8, w28, w10, x8 umaddl x6, w28, w13, x6 umaddl x7, w28, w14, x7 umaddl x9, w28, w16, x9 add w29, w14, w14 umaddl x0, w29, w20, x0 umaddl x2, w29, w21, x2 umaddl x9, w29, w15, x9 add w18, w16, w16 umaddl x4, w18, w21, x4 add x6, x6, x5, lsr #25 and x5, x5, #0x1ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff add x7, x7, x6, lsr #26 and x6, x6, #0x3ffffff add x2, x2, x1, lsr #25 and x1, x1, #0x1ffffff add x8, x8, x7, lsr #25 and x7, x7, #0x1ffffff add x3, x3, x2, lsr #26 and x2, x2, #0x3ffffff add x9, x9, x8, lsr #26 and x8, x8, #0x3ffffff add x4, x4, x3, lsr #25 and x3, x3, #0x1ffffff bfi x2, x3, #32, #25 bic x10, x9, #0x1ffffff add x0, x0, x10, lsr #25 add x0, x0, x10, lsr #24 add x0, x0, x10, lsr #21 and x9, x9, #0x1ffffff bfi x8, x9, #32, #25 add x5, x5, x4, lsr #26 and x4, x4, #0x3ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff bfi x0, x1, #32, #26 add x6, x6, x5, lsr #25 bfi x6, x7, #32, #25 and x5, x5, #0x1ffffff bfi x4, x5, #32, #25 stp x0, x2, [sp, #256] stp x4, x6, [sp, #272] str x8, [sp, #288] // X3 ← T5 + T6, Z3 ← T5 - T6 uzp1 v0.4s, v0.4s, v1.4s uzp1 v1.4s, v2.4s, v3.4s uzp1 v2.4s, v0.4s, v1.4s uzp2 v3.4s, v0.4s, v1.4s uzp1 v4.4s, v4.4s, v5.4s uzp1 v5.4s, v6.4s, v7.4s uzp1 v6.4s, v4.4s, v5.4s uzp2 v7.4s, v4.4s, v5.4s trn1 v8.4s, v8.4s, v9.4s mov v9.d[0], v8.d[1] // X3 ← T5 + T6 add v1.4s, v2.4s, v3.4s add v4.4s, v6.4s, v7.4s add v5.4s, v8.4s, v9.4s // Z3 ← T5 - T6 ld1r {v10.2d}, [sp] add v6.4s, v6.4s, v10.4s add v8.4s, v8.4s, v10.4s ldr b11, [sp, #440] mov v10.b[0], v11.b[0] add v2.4s, v2.4s, v10.4s sub v2.4s, v2.4s, v3.4s sub v6.4s, v6.4s, v7.4s sub v8.4s, v8.4s, v9.4s // <Z3,X3> ← <T5 - T6,T5 + T6> zip1 
v10.4s, v2.4s, v1.4s zip2 v12.4s, v2.4s, v1.4s zip1 v14.4s, v6.4s, v4.4s zip2 v16.4s, v6.4s, v4.4s zip1 v18.4s, v8.4s, v5.4s mov v11.d[0], v10.d[1] mov v13.d[0], v12.d[1] mov v15.d[0], v14.d[1] mov v17.d[0], v16.d[1] mov v19.d[0], v18.d[1] // <Z3,X3> ← Sqr(<Z3,X3>) shl v0.2d, v31.2d, #1 mul v20.2s, v16.2s, v31.2s mul v21.2s, v18.2s, v31.2s mul v22.2s, v15.2s, v0.2s mul v23.2s, v17.2s, v0.2s mul v24.2s, v19.2s, v0.2s shl v25.2s, v10.2s, #1 shl v26.2s, v11.2s, #1 shl v27.2s, v12.2s, #1 shl v28.2s, v13.2s, #1 shl v29.2s, v14.2s, #1 shl v7.2s, v15.2s, #1 shl v8.2s, v16.2s, #1 shl v9.2s, v17.2s, #1 umull v0.2d, v10.2s, v10.2s umlal v0.2d, v26.2s, v24.2s umlal v0.2d, v27.2s, v21.2s umlal v0.2d, v28.2s, v23.2s umlal v0.2d, v29.2s, v20.2s umlal v0.2d, v22.2s, v15.2s umull v1.2d, v25.2s, v11.2s umlal v1.2d, v24.2s, v12.2s umlal v1.2d, v28.2s, v21.2s umlal v1.2d, v23.2s, v14.2s umlal v1.2d, v20.2s, v7.2s umull v2.2d, v25.2s, v12.2s umlal v2.2d, v26.2s, v11.2s umlal v2.2d, v28.2s, v24.2s umlal v2.2d, v29.2s, v21.2s umlal v2.2d, v23.2s, v7.2s umlal v2.2d, v20.2s, v16.2s umull v3.2d, v25.2s, v13.2s umlal v3.2d, v26.2s, v12.2s umlal v3.2d, v24.2s, v14.2s umlal v3.2d, v21.2s, v7.2s umlal v3.2d, v23.2s, v16.2s umull v4.2d, v25.2s, v14.2s umlal v4.2d, v26.2s, v28.2s umlal v4.2d, v12.2s, v12.2s umlal v4.2d, v24.2s, v7.2s umlal v4.2d, v8.2s, v21.2s umlal v4.2d, v23.2s, v17.2s umull v5.2d, v25.2s, v15.2s umlal v5.2d, v26.2s, v14.2s umlal v5.2d, v27.2s, v13.2s umlal v5.2d, v24.2s, v16.2s umlal v5.2d, v21.2s, v9.2s umull v6.2d, v25.2s, v16.2s umlal v6.2d, v26.2s, v7.2s umlal v6.2d, v27.2s, v14.2s umlal v6.2d, v28.2s, v13.2s umlal v6.2d, v24.2s, v9.2s umlal v6.2d, v21.2s, v18.2s umull v8.2d, v25.2s, v18.2s umlal v8.2d, v26.2s, v9.2s umlal v8.2d, v27.2s, v16.2s umlal v8.2d, v28.2s, v7.2s umlal v8.2d, v14.2s, v14.2s umlal v8.2d, v24.2s, v19.2s umull v7.2d, v25.2s, v17.2s umlal v7.2d, v26.2s, v16.2s umlal v7.2d, v27.2s, v15.2s umlal v7.2d, v28.2s, v14.2s umlal v7.2d, v24.2s, v18.2s umull 
v9.2d, v25.2s, v19.2s umlal v9.2d, v26.2s, v18.2s umlal v9.2d, v27.2s, v17.2s umlal v9.2d, v28.2s, v16.2s umlal v9.2d, v29.2s, v15.2s ushr v12.2d, v30.2d, #1 usra v6.2d, v5.2d, #25 and v25.16b, v5.16b, v12.16b usra v1.2d, v0.2d, #26 and v20.16b, v0.16b, v30.16b usra v7.2d, v6.2d, #26 and v26.16b, v6.16b, v30.16b usra v2.2d, v1.2d, #25 and v21.16b, v1.16b, v12.16b usra v8.2d, v7.2d, #25 and v27.16b, v7.16b, v12.16b usra v3.2d, v2.2d, #26 and v22.16b, v2.16b, v30.16b usra v9.2d, v8.2d, #26 and v28.16b, v8.16b, v30.16b usra v4.2d, v3.2d, #25 and v23.16b, v3.16b, v12.16b bic v10.16b, v9.16b, v12.16b usra v20.2d, v10.2d, #25 usra v20.2d, v10.2d, #24 usra v20.2d, v10.2d, #21 and v29.16b, v9.16b, v12.16b usra v25.2d, v4.2d, #26 and v24.16b, v4.16b, v30.16b usra v21.2d, v20.2d, #26 and v20.16b, v20.16b, v30.16b usra v26.2d, v25.2d, #25 and v25.16b, v25.16b, v12.16b zip2 v0.4s, v20.4s, v21.4s zip2 v1.4s, v22.4s, v23.4s zip2 v2.4s, v24.4s, v25.4s zip2 v3.4s, v26.4s, v27.4s zip2 v4.4s, v28.4s, v29.4s stp d0, d1, [sp, #304] stp d2, d3, [sp, #320] str d4, [sp, #336] // T2 ← T2^2 ldp w10, w11, [sp, #208] ldp w12, w13, [sp, #216] ldp w14, w15, [sp, #224] ldp w16, w17, [sp, #232] ldp w18, w19, [sp, #240] mul w20, w16, w30 mul w21, w18, w30 add w25, w30, w30 mul w22, w15, w25 mul w23, w17, w25 mul w24, w19, w25 umull x0, w10, w10 add w25, w10, w10 umull x1, w25, w11 umull x2, w25, w12 umull x3, w25, w13 umull x4, w25, w14 umull x5, w25, w15 umull x6, w25, w16 umull x7, w25, w17 umull x8, w25, w18 umaddl x4, w12, w12, x4 umaddl x8, w14, w14, x8 add w10, w15, w15 umaddl x1, w20, w10, x1 umaddl x2, w20, w16, x2 add w9, w17, w17 umaddl x3, w21, w10, x3 umaddl x5, w21, w9, x5 umaddl x6, w21, w18, x6 umaddl x0, w22, w15, x0 umaddl x1, w23, w14, x1 umaddl x2, w23, w10, x2 umaddl x3, w23, w16, x3 umaddl x4, w23, w17, x4 umaddl x1, w24, w12, x1 umaddl x3, w24, w14, x3 umaddl x4, w24, w10, x4 umaddl x5, w24, w16, x5 umaddl x6, w24, w9, x6 umaddl x7, w24, w18, x7 umaddl x8, w24, w19, x8 add 
w26, w11, w11 umaddl x0, w26, w24, x0 umaddl x2, w26, w11, x2 umaddl x3, w26, w12, x3 umaddl x5, w26, w14, x5 umaddl x6, w26, w10, x6 umaddl x7, w26, w16, x7 umaddl x8, w26, w9, x8 umull x9, w25, w19 umaddl x9, w26, w18, x9 add w27, w12, w12 umaddl x0, w27, w21, x0 umaddl x5, w27, w13, x5 umaddl x8, w27, w16, x8 umaddl x6, w27, w14, x6 umaddl x7, w27, w15, x7 umaddl x9, w27, w17, x9 add w28, w13, w13 umaddl x4, w26, w28, x4 umaddl x0, w28, w23, x0 umaddl x1, w28, w21, x1 umaddl x2, w28, w24, x2 umaddl x8, w28, w10, x8 umaddl x6, w28, w13, x6 umaddl x7, w28, w14, x7 umaddl x9, w28, w16, x9 add w29, w14, w14 umaddl x0, w29, w20, x0 umaddl x2, w29, w21, x2 umaddl x9, w29, w15, x9 add w18, w16, w16 umaddl x4, w18, w21, x4 add x6, x6, x5, lsr #25 and x5, x5, #0x1ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff add x7, x7, x6, lsr #26 and x6, x6, #0x3ffffff add x2, x2, x1, lsr #25 and x1, x1, #0x1ffffff add x8, x8, x7, lsr #25 and x7, x7, #0x1ffffff add x3, x3, x2, lsr #26 and x2, x2, #0x3ffffff add x9, x9, x8, lsr #26 and x8, x8, #0x3ffffff add x4, x4, x3, lsr #25 and x3, x3, #0x1ffffff bfi x2, x3, #32, #25 bic x10, x9, #0x1ffffff add x0, x0, x10, lsr #25 add x0, x0, x10, lsr #24 add x0, x0, x10, lsr #21 and x9, x9, #0x1ffffff bfi x8, x9, #32, #25 add x5, x5, x4, lsr #26 and x4, x4, #0x3ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff bfi x0, x1, #32, #26 add x6, x6, x5, lsr #25 bfi x6, x7, #32, #25 and x5, x5, #0x1ffffff bfi x4, x5, #32, #25 stp x0, x2, [sp, #352] stp x4, x6, [sp, #368] str x8, [sp, #384] // Z2 ← T1 - T2 ldp x11, x13, [sp, #256] ldp x15, x17, [sp, #272] ldr x19, [sp, #288] ldp x22, x21, [sp, #0] add x11, x11, x21 add x13, x13, x22 add x15, x15, x22 add x17, x17, x22 add x19, x19, x22 sub x10, x11, x0 sub x12, x13, x2 sub x14, x15, x4 sub x16, x17, x6 sub x18, x19, x8 lsr x11, x10, #32 lsr x13, x12, #32 lsr x15, x14, #32 lsr x17, x16, #32 lsr x19, x18, #32 // T2 ← aZ2 + T2 movz x20, #0xdb42 movk x20, #0x0001, lsl 16 mov w0, w0 umaddl x0, w10, 
w20, x0 umaddl x21, w11, w20, x1 mov w2, w2 umaddl x22, w12, w20, x2 umaddl x23, w13, w20, x3 mov w4, w4 umaddl x24, w14, w20, x4 umaddl x25, w15, w20, x5 mov w6, w6 umaddl x26, w16, w20, x6 umaddl x27, w17, w20, x7 mov w8, w8 umaddl x28, w18, w20, x8 umaddl x29, w19, w20, x9 add x26, x26, x25, lsr #25 and x25, x25, #0x1ffffff add x21, x21, x0, lsr #26 and x20, x0, #0x3ffffff add x27, x27, x26, lsr #26 and x26, x26, #0x3ffffff add x22, x22, x21, lsr #25 and x21, x21, #0x1ffffff add x28, x28, x27, lsr #25 and x27, x27, #0x1ffffff add x23, x23, x22, lsr #26 and x22, x22, #0x3ffffff add x29, x29, x28, lsr #26 and x28, x28, #0x3ffffff add x24, x24, x23, lsr #25 and x23, x23, #0x1ffffff bic x7, x29, #0x1ffffff add x20, x20, x7, lsr #25 add x20, x20, x7, lsr #24 add x20, x20, x7, lsr #21 and x29, x29, #0x1ffffff add x25, x25, x24, lsr #26 and x24, x24, #0x3ffffff add x21, x21, x20, lsr #26 and x20, x20, #0x3ffffff add x26, x26, x25, lsr #25 and x25, x25, #0x1ffffff // Z2 ← Z2 · T2 umull x0, w10, w20 umull x1, w10, w21 umull x2, w10, w22 umull x3, w10, w23 umull x4, w10, w24 umull x5, w10, w25 umull x6, w10, w26 umull x7, w10, w27 umull x8, w10, w28 umull x9, w10, w29 umaddl x1, w11, w20, x1 umaddl x3, w11, w22, x3 umaddl x5, w11, w24, x5 umaddl x7, w11, w26, x7 umaddl x9, w11, w28, x9 umaddl x2, w12, w20, x2 umaddl x3, w12, w21, x3 umaddl x4, w12, w22, x4 umaddl x5, w12, w23, x5 umaddl x6, w12, w24, x6 umaddl x7, w12, w25, x7 umaddl x8, w12, w26, x8 umaddl x9, w12, w27, x9 umaddl x3, w13, w20, x3 umaddl x5, w13, w22, x5 umaddl x7, w13, w24, x7 umaddl x9, w13, w26, x9 umaddl x4, w14, w20, x4 umaddl x5, w14, w21, x5 umaddl x6, w14, w22, x6 umaddl x7, w14, w23, x7 umaddl x8, w14, w24, x8 umaddl x9, w14, w25, x9 umaddl x5, w15, w20, x5 umaddl x7, w15, w22, x7 umaddl x9, w15, w24, x9 umaddl x6, w16, w20, x6 umaddl x7, w16, w21, x7 umaddl x8, w16, w22, x8 umaddl x9, w16, w23, x9 umaddl x7, w17, w20, x7 umaddl x9, w17, w22, x9 umaddl x8, w18, w20, x8 umaddl x9, w18, w21, x9 
umaddl x9, w19, w20, x9 mul w22, w22, w30 mul w24, w24, w30 mul w26, w26, w30 mul w28, w28, w30 umaddl x0, w12, w28, x0 umaddl x1, w13, w28, x1 umaddl x0, w14, w26, x0 umaddl x2, w14, w28, x2 umaddl x1, w15, w26, x1 umaddl x3, w15, w28, x3 umaddl x0, w16, w24, x0 umaddl x2, w16, w26, x2 umaddl x4, w16, w28, x4 umaddl x1, w17, w24, x1 umaddl x3, w17, w26, x3 umaddl x5, w17, w28, x5 umaddl x0, w18, w22, x0 umaddl x2, w18, w24, x2 umaddl x6, w18, w28, x6 umaddl x4, w18, w26, x4 umaddl x1, w19, w22, x1 umaddl x3, w19, w24, x3 umaddl x5, w19, w26, x5 umaddl x7, w19, w28, x7 add w11, w11, w11 umaddl x2, w11, w21, x2 umaddl x4, w11, w23, x4 umaddl x6, w11, w25, x6 umaddl x8, w11, w27, x8 add w13, w13, w13 umaddl x4, w13, w21, x4 umaddl x6, w13, w23, x6 umaddl x8, w13, w25, x8 add w15, w15, w15 umaddl x6, w15, w21, x6 umaddl x8, w15, w23, x8 add w17, w17, w17 umaddl x8, w17, w21, x8 mul w21, w21, w30 mul w23, w23, w30 mul w25, w25, w30 mul w27, w27, w30 mul w29, w29, w30 umaddl x0, w11, w29, x0 umaddl x1, w12, w29, x1 umaddl x0, w13, w27, x0 umaddl x2, w13, w29, x2 umaddl x1, w14, w27, x1 umaddl x3, w14, w29, x3 umaddl x0, w15, w25, x0 umaddl x2, w15, w27, x2 umaddl x4, w15, w29, x4 umaddl x1, w16, w25, x1 umaddl x3, w16, w27, x3 umaddl x5, w16, w29, x5 umaddl x0, w17, w23, x0 umaddl x2, w17, w25, x2 umaddl x4, w17, w27, x4 umaddl x6, w17, w29, x6 umaddl x1, w18, w23, x1 umaddl x3, w18, w25, x3 umaddl x5, w18, w27, x5 umaddl x7, w18, w29, x7 add w19, w19, w19 umaddl x0, w19, w21, x0 umaddl x2, w19, w23, x2 umaddl x4, w19, w25, x4 umaddl x6, w19, w27, x6 umaddl x8, w19, w29, x8 add x6, x6, x5, lsr #25 and x5, x5, #0x1ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff add x7, x7, x6, lsr #26 and x6, x6, #0x3ffffff add x2, x2, x1, lsr #25 and x1, x1, #0x1ffffff add x8, x8, x7, lsr #25 and x7, x7, #0x1ffffff add x3, x3, x2, lsr #26 and x2, x2, #0x3ffffff add x9, x9, x8, lsr #26 and x8, x8, #0x3ffffff add x4, x4, x3, lsr #25 and x3, x3, #0x1ffffff bfi x2, x3, #32, #25 bic 
x10, x9, #0x1ffffff add x0, x0, x10, lsr #25 add x0, x0, x10, lsr #24 add x0, x0, x10, lsr #21 and x9, x9, #0x1ffffff bfi x8, x9, #32, #25 add x5, x5, x4, lsr #26 and x4, x4, #0x3ffffff add x1, x1, x0, lsr #26 and x0, x0, #0x3ffffff bfi x0, x1, #32, #26 add x6, x6, x5, lsr #25 bfi x6, x7, #32, #25 and x5, x5, #0x1ffffff bfi x4, x5, #32, #25 // inputs <Z3,T1> and <X1,T2> add x10, sp, #256 add x11, sp, #400 add x12, sp, #352 ld2 {v20.s, v21.s}[1], [x10], #8 ld2 {v10.s, v11.s}[0], [x11], #8 ld2 {v10.s, v11.s}[1], [x12], #8 ld2 {v22.s, v23.s}[1], [x10], #8 ld2 {v12.s, v13.s}[0], [x11], #8 ld2 {v12.s, v13.s}[1], [x12], #8 ld2 {v24.s, v25.s}[1], [x10], #8 ld2 {v14.s, v15.s}[0], [x11], #8 ld2 {v14.s, v15.s}[1], [x12], #8 ld2 {v26.s, v27.s}[1], [x10], #8 ld2 {v16.s, v17.s}[0], [x11], #8 ld2 {v16.s, v17.s}[1], [x12], #8 ld2 {v28.s, v29.s}[1], [x10], #8 ld2 {v18.s, v19.s}[0], [x11], #8 ld2 {v18.s, v19.s}[1], [x12], #8 // <Z3,X2> ← Mul(<Z3,T1>,<X1,T2>) umull v0.2d, v10.2s, v20.2s umull v1.2d, v10.2s, v21.2s umlal v1.2d, v11.2s, v20.2s umull v2.2d, v10.2s, v22.2s umlal v2.2d, v12.2s, v20.2s umull v3.2d, v10.2s, v23.2s umlal v3.2d, v11.2s, v22.2s umlal v3.2d, v12.2s, v21.2s umlal v3.2d, v13.2s, v20.2s umull v4.2d, v10.2s, v24.2s umlal v4.2d, v12.2s, v22.2s umlal v4.2d, v14.2s, v20.2s umull v5.2d, v10.2s, v25.2s umlal v5.2d, v11.2s, v24.2s umlal v5.2d, v12.2s, v23.2s umlal v5.2d, v13.2s, v22.2s umlal v5.2d, v14.2s, v21.2s umlal v5.2d, v15.2s, v20.2s umull v6.2d, v10.2s, v26.2s umlal v6.2d, v12.2s, v24.2s umlal v6.2d, v14.2s, v22.2s umlal v6.2d, v16.2s, v20.2s umull v7.2d, v10.2s, v27.2s umlal v7.2d, v11.2s, v26.2s umlal v7.2d, v12.2s, v25.2s umlal v7.2d, v13.2s, v24.2s umlal v7.2d, v14.2s, v23.2s umlal v7.2d, v15.2s, v22.2s umlal v7.2d, v16.2s, v21.2s umlal v7.2d, v17.2s, v20.2s umull v8.2d, v10.2s, v28.2s umlal v8.2d, v12.2s, v26.2s umlal v8.2d, v14.2s, v24.2s umlal v8.2d, v16.2s, v22.2s umlal v8.2d, v18.2s, v20.2s umull v9.2d, v10.2s, v29.2s umlal v9.2d, v11.2s, v28.2s umlal 
v9.2d, v12.2s, v27.2s umlal v9.2d, v13.2s, v26.2s umlal v9.2d, v14.2s, v25.2s umlal v9.2d, v15.2s, v24.2s umlal v9.2d, v16.2s, v23.2s umlal v9.2d, v17.2s, v22.2s umlal v9.2d, v18.2s, v21.2s umlal v9.2d, v19.2s, v20.2s mul v22.2s, v22.2s, v31.2s mul v24.2s, v24.2s, v31.2s mul v26.2s, v26.2s, v31.2s mul v28.2s, v28.2s, v31.2s umlal v0.2d, v12.2s, v28.2s umlal v0.2d, v14.2s, v26.2s umlal v0.2d, v16.2s, v24.2s umlal v0.2d, v18.2s, v22.2s umlal v1.2d, v13.2s, v28.2s umlal v1.2d, v15.2s, v26.2s umlal v1.2d, v17.2s, v24.2s umlal v1.2d, v19.2s, v22.2s umlal v2.2d, v14.2s, v28.2s umlal v2.2d, v16.2s, v26.2s umlal v2.2d, v18.2s, v24.2s umlal v3.2d, v15.2s, v28.2s umlal v3.2d, v17.2s, v26.2s umlal v3.2d, v19.2s, v24.2s umlal v4.2d, v16.2s, v28.2s umlal v4.2d, v18.2s, v26.2s umlal v5.2d, v17.2s, v28.2s umlal v5.2d, v19.2s, v26.2s umlal v6.2d, v18.2s, v28.2s umlal v7.2d, v19.2s, v28.2s shl v11.2s, v11.2s, #1 shl v13.2s, v13.2s, #1 shl v15.2s, v15.2s, #1 shl v17.2s, v17.2s, #1 shl v19.2s, v19.2s, #1 umlal v2.2d, v11.2s, v21.2s umlal v4.2d, v11.2s, v23.2s umlal v4.2d, v13.2s, v21.2s umlal v6.2d, v11.2s, v25.2s umlal v6.2d, v13.2s, v23.2s umlal v6.2d, v15.2s, v21.2s umlal v8.2d, v11.2s, v27.2s umlal v8.2d, v13.2s, v25.2s umlal v8.2d, v15.2s, v23.2s umlal v8.2d, v17.2s, v21.2s mul v21.2s, v21.2s, v31.2s mul v23.2s, v23.2s, v31.2s mul v25.2s, v25.2s, v31.2s mul v27.2s, v27.2s, v31.2s mul v29.2s, v29.2s, v31.2s umlal v0.2d, v11.2s, v29.2s umlal v0.2d, v13.2s, v27.2s umlal v0.2d, v15.2s, v25.2s umlal v0.2d, v17.2s, v23.2s umlal v0.2d, v19.2s, v21.2s umlal v1.2d, v12.2s, v29.2s umlal v1.2d, v14.2s, v27.2s umlal v1.2d, v16.2s, v25.2s umlal v1.2d, v18.2s, v23.2s umlal v2.2d, v13.2s, v29.2s umlal v2.2d, v15.2s, v27.2s umlal v2.2d, v17.2s, v25.2s umlal v2.2d, v19.2s, v23.2s umlal v3.2d, v14.2s, v29.2s umlal v3.2d, v16.2s, v27.2s umlal v3.2d, v18.2s, v25.2s umlal v4.2d, v15.2s, v29.2s umlal v4.2d, v17.2s, v27.2s umlal v4.2d, v19.2s, v25.2s umlal v5.2d, v16.2s, v29.2s umlal v5.2d, v18.2s, 
v27.2s umlal v6.2d, v17.2s, v29.2s umlal v6.2d, v19.2s, v27.2s umlal v7.2d, v18.2s, v29.2s umlal v8.2d, v19.2s, v29.2s ushr v15.2d, v30.2d, #1 usra v6.2d, v5.2d, #25 and v5.16b, v5.16b, v15.16b usra v1.2d, v0.2d, #26 and v0.16b, v0.16b, v30.16b usra v7.2d, v6.2d, #26 and v6.16b, v6.16b, v30.16b usra v2.2d, v1.2d, #25 and v1.16b, v1.16b, v15.16b usra v8.2d, v7.2d, #25 and v7.16b, v7.16b, v15.16b usra v3.2d, v2.2d, #26 and v2.16b, v2.16b, v30.16b usra v9.2d, v8.2d, #26 and v8.16b, v8.16b, v30.16b usra v4.2d, v3.2d, #25 and v3.16b, v3.16b, v15.16b bic v10.16b, v9.16b, v15.16b usra v0.2d, v10.2d, #25 usra v0.2d, v10.2d, #24 usra v0.2d, v10.2d, #21 and v9.16b, v9.16b, v15.16b usra v5.2d, v4.2d, #26 and v4.16b, v4.16b, v30.16b usra v1.2d, v0.2d, #26 and v0.16b, v0.16b, v30.16b usra v6.2d, v5.2d, #25 and v5.16b, v5.16b, v15.16b // Z3 mov v20.s[0], v0.s[0] mov v20.s[1], v1.s[0] mov v22.s[0], v2.s[0] mov v22.s[1], v3.s[0] mov v24.s[0], v4.s[0] mov v24.s[1], v5.s[0] mov v26.s[0], v6.s[0] mov v26.s[1], v7.s[0] mov v28.s[0], v8.s[0] mov v28.s[1], v9.s[0] // X2 mov v11.s[0], v0.s[2] mov v11.s[1], v1.s[2] mov v13.s[0], v2.s[2] mov v13.s[1], v3.s[2] mov v15.s[0], v4.s[2] mov v15.s[1], v5.s[2] mov v17.s[0], v6.s[2] mov v17.s[1], v7.s[2] mov v19.s[0], v8.s[2] mov v19.s[1], v9.s[2] // Z2 mov v10.d[0], x0 mov v12.d[0], x2 mov v14.d[0], x4 mov v16.d[0], x6 mov v18.d[0], x8 // X3 ldp d21, d23, [sp, #304] ldp d25, d27, [sp, #320] ldr d29, [sp, #336] bpl .L1 //ldr w5, [sp, #192] //cmp w5, wzr //bge .L0 ldr x0, [sp, #176] // X2 mov w10, v11.s[0] mov w11, v11.s[1] mov w12, v13.s[0] mov w13, v13.s[1] mov w14, v15.s[0] mov w15, v15.s[1] mov w16, v17.s[0] mov w17, v17.s[1] mov w18, v19.s[0] mov w19, v19.s[1] stp w10, w11, [x0, #0] stp w12, w13, [x0, #8] stp w14, w15, [x0, #16] stp w16, w17, [x0, #24] stp w18, w19, [x0, #32] // Z2 mov w10, v10.s[0] mov w11, v10.s[1] mov w12, v12.s[0] mov w13, v12.s[1] mov w14, v14.s[0] mov w15, v14.s[1] mov w16, v16.s[0] mov w17, v16.s[1] mov w18, v18.s[0] mov 
w19, v18.s[1] stp w10, w11, [x0, #40] stp w12, w13, [x0, #48] stp w14, w15, [x0, #56] stp w16, w17, [x0, #64] stp w18, w19, [x0, #72] ldp d14, d15, [sp, #160] ldp d12, d13, [sp, #144] ldp d10, d11, [sp, #128] ldp d8, d9, [sp, #112] ldp x29, x30, [sp, #96] ldp x27, x28, [sp, #80] ldp x25, x26, [sp, #64] ldp x23, x24, [sp, #48] ldp x21, x22, [sp, #32] ldp x19, x20, [sp, #16] add sp, sp, #448 ret #endif .section .note.GNU-stack,"",@progbits