#include "crypto_asm_hidden.h"
// linker define mladder

/*
   Assembly for Montgomery ladder.
   The code has been optimized using Slothy.
   https://github.com/slothy-optimizer/slothy

   Target: AArch64 (A64 + Advanced SIMD), GNU as syntax, preprocessed by cpp.

   Interface, as read off the loads and stores below (the C prototype is
   not visible in this file):
     x0  out     80 bytes are written on exit: X2 as ten 32-bit limbs at
                 offset 0, Z2 as ten 32-bit limbs at offset 40.
     x1  point   32 bytes are read as four 64-bit words. Bit 255 is dropped
                 while unpacking.
     x2  scalar  32 bytes are read as four 64-bit words. A clamped copy is
                 kept on the stack; the caller buffer is not written.

   Field elements use ten limbs of 26,25,26,25,26,25,26,25,26,25 bits.
   Two consecutive limbs share the low 64 bits of one vector register
   (even limb in lane 0, odd limb in lane 1).

   Register roles on entry to the loop at .L0:
     X2 = v11,v13,v15,v17,v19      Z2 = v10,v12,v14,v16,v18
     X3 = v21,v23,v25,v27,v29      Z3 = v20,v22,v24,v26,v28
     v30 = 0x3ffffff in each 64-bit lane (26-bit mask)
     v31 = 19 in each 32-bit lane of the low half, w30 = 19
     w1  = swap word, only bit 0 is tested

   Stack frame (560 bytes, sp stays 16-byte aligned):
     [sp,#0]    0x03fffffe07fffffe  limb pair (2*(2^26-1), 2*(2^25-1))
     [sp,#8]    0x03fffffe07ffffda  limb pair (2*(2^26-19), 2*(2^25-1))
                These are the limbs of 2p with p = 2^255-19; they are added
                before each vector subtraction.
     [sp,#96]   32-bit index of the scalar bit being processed
     [sp,#100]  32-bit word whose bit 0 is the previously used scalar bit
     [sp,#104]  clamped scalar, 32 bytes
     [sp,#144]  saved out pointer
     [sp,#152]  byte 0xda, patched into lane byte 0 of the 2p constant
     [sp,#160]  40 bytes: five fcsel results handed from NEON to x registers
     [sp,#208]  40 bytes: scalar results, reloaded into NEON lane 1 via ld2
     [sp,#256]  40 bytes: NEON values reloaded into v21,v23,v25,v27,v29
                at the end of each iteration
     [sp,#304]  40 bytes: scalar results, reloaded into NEON lane 1 via ld2
     [sp,#352]  X1 limbs, 40 bytes, reloaded into NEON lane 0 via ld2
     [sp,#392]  callee-saved x19-x30 and d8-d15, 160 bytes

   x29 and x30 are used as general purpose registers inside the routine;
   both are saved in the prologue and restored before ret.

   NOTE(review): x18 is used as a scratch register throughout the loop.
   AAPCS64 designates x18 as the platform register, and some platforms
   reserve it. Confirm that every build target allows x18 as scratch.

   NOTE(review): the clamped scalar copy and the intermediate values are
   left on the stack when the routine returns. Confirm that this matches
   the project policy for secret data.
*/

.p2align 4
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(mladder)
.globl _CRYPTO_SHARED_NAMESPACE(mladder)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(mladder)
.globl CRYPTO_SHARED_NAMESPACE(mladder)
_CRYPTO_SHARED_NAMESPACE(mladder):
CRYPTO_SHARED_NAMESPACE(mladder):

    // Prologue: allocate the frame and save callee-saved registers at sp+392.
    sub sp, sp, #560
    add x10, sp, #392
    stp x19, x20, [x10, #0]
    stp x21, x22, [x10, #16]
    stp x23, x24, [x10, #32]
    stp x25, x26, [x10, #48]
    stp x27, x28, [x10, #64]
    stp x29, x30, [x10, #80]
    stp d8, d9, [x10, #96]
    stp d10, d11, [x10, #112]
    stp d12, d13, [x10, #128]
    stp d14, d15, [x10, #144]

    // clamp scalar
    // Bits 0..2 are cleared and bit 254 is set in the stack copy.
    // Bit 255 is copied unchanged; the ladder starts at bit 254 and only
    // bit 0 of the swap word is tested, so bit 255 is never consumed.
    ldr x3, [x2, #0]
    and x3, x3, #0xfffffffffffffff8
    str x3, [sp, #104]
    ldr x3, [x2, #8]
    str x3, [sp, #112]
    ldr x3, [x2, #16]
    str x3, [sp, #120]
    ldr x4, [x2, #24]
    orr x4, x4, #0x4000000000000000
    str x4, [sp, #128]

    // Keep the out pointer; x0 is reused as scratch below.
    str x0, [sp, #144]

    // load point
    ldp x4, x5, [x1, #0]
    ldp x6, x7, [x1, #16]

    // X1: split the low 255 bits into ten limbs (26/25 bits alternating).
    and w8, w4, #0x3ffffff
    ubfx x9, x4, #26, #25
    lsr x10, x4, #51
    orr w10, w10, w5, lsl #13
    and w10, w10, #0x3ffffff
    ubfx x11, x5, #13, #25
    lsr x12, x5, #38
    and w13, w6, #0x1ffffff
    ubfx x14, x6, #25, #26
    lsr x15, x6, #51
    orr w15, w15, w7, lsl #13
    and w15, w15, #0x1ffffff
    ubfx x16, x7, #12, #26
    ubfx x17, x7, #38, #25

    // Keep X1 on the stack; the loop reloads it with ld2.
    add x0, sp, #352
    stp w8, w9, [x0, #0]
    stp w10, w11, [x0, #8]
    stp w12, w13, [x0, #16]
    stp w14, w15, [x0, #24]
    stp w16, w17, [x0, #32]

    mov x20, #1

    // X2 = 1
    mov v11.d[0], x20
    mov v13.d[0], xzr
    mov v15.d[0], xzr
    mov v17.d[0], xzr
    mov v19.d[0], xzr

    // Z2 = 0
    mov v10.d[0], xzr
    mov v12.d[0], xzr
    mov v14.d[0], xzr
    mov v16.d[0], xzr
    mov v18.d[0], xzr

    // X3 = X1
    mov v21.s[0], w8
    mov v21.s[1], w9
    mov v23.s[0], w10
    mov v23.s[1], w11
    mov v25.s[0], w12
    mov v25.s[1], w13
    mov v27.s[0], w14
    mov v27.s[1], w15
    mov v29.s[0], w16
    mov v29.s[1], w17

    // Z3 = 1
    mov v20.d[0], x20
    mov v22.d[0], xzr
    mov v24.d[0], xzr
    mov v26.d[0], xzr
    mov v28.d[0], xzr

    // Loop constants: 19 (scalar and vector) and the 26-bit mask.
    mov w30, #19
    dup v31.2s, w30
    mov w29, #0x3ffffff
    dup v30.2d, x29

    // Limb pairs of 2p, stored at [sp,#0] and [sp,#8].
    movz x1, #0xffda
    movk x1, #0x07ff, lsl 16
    movk x1, #0xfffe, lsl 32
    movk x1, #0x03ff, lsl 48
    movz x2, #0xfffe
    movk x2, #0x07ff, lsl 16
    movk x2, #0xfffe, lsl 32
    movk x2, #0x03ff, lsl 48
    stp x2, x1, [sp, #0]

    // Ladder state: start at bit 254.
    mov w0, #254
    str w0, [sp, #96]
    mov w0, #0xda
    strb w0, [sp, #152]

    // Initial swap word: top scalar byte shifted so that bit 254 is bit 0.
    ldrb w1, [sp, #135]
    lsr w1, w1, #6
    str w1, [sp, #100]

    // Montgomery ladder loop
    // One pass per scalar bit, from bit 254 down to bit 0.
.L0:
    /* slothy optimized code starts */
    // Scalar and NEON instructions are interleaved by the optimizer; the
    // instruction order below is kept exactly as generated.
    // tst sets Z from bit 0 of the swap word. Every fcsel below consumes
    // that flag before any other flag-setting instruction executes.
    ldr d0, [sp, #0]
    tst w1, #1
    add v6.2s, v21.2s, v20.2s
    ldr d2, [sp, #8]
    add v1.2s, v11.2s, v10.2s
    add v9.2s, v29.2s, v28.2s
    add v29.2s, v0.2s, v29.2s
    add v8.2s, v0.2s, v23.2s
    add v5.2s, v13.2s, v12.2s
    add v4.2s, v2.2s, v21.2s
    sub v29.2s, v29.2s, v28.2s
    sub v8.2s, v8.2s, v22.2s
    add v3.2s, v2.2s, v11.2s
    add v28.2s, v0.2s, v27.2s
    add v11.2s, v15.2s, v14.2s
    add v7.2s, v0.2s, v15.2s
    sub v28.2s, v28.2s, v26.2s
    add v2.2s, v23.2s, v22.2s
    sub v15.2s, v3.2s, v10.2s
    add v3.2s, v0.2s, v17.2s
    sub v23.2s, v4.2s, v20.2s
    add v21.2s, v0.2s, v25.2s
    add v20.2s, v27.2s, v26.2s
    sub v27.2s, v3.2s, v16.2s
    fcsel d10, d1, d6, eq
    sub v4.2s, v21.2s, v24.2s
    fcsel d26, d5, d2, eq
    add v16.2s, v17.2s, v16.2s
    fcsel d22, d15, d23, eq
    mov x10, v10.d[0]
    mov x4, v26.d[0]
    add v17.2s, v0.2s, v13.2s
    fcsel d3, d16, d20, eq
    add v26.2s, v19.2s, v18.2s
    add v0.2s, v0.2s, v19.2s
    sub v19.2s, v17.2s, v12.2s
    mov x20, v3.d[0]
    fcsel d3, d27, d28, eq
    trn1 v13.2s, v28.2s, v20.2s
    fcsel d21, d26, d9, eq
    trn1 v10.2s, v29.2s, v9.2s
    fcsel d12, d19, d8, eq
    sub v17.2s, v0.2s, v18.2s
    sub v0.2s, v7.2s, v14.2s
    mov x25, v21.d[0]
    mul v7.2s, v13.2s, v31.2s
    stp d22, d12, [sp, #160]
    trn1 v12.2s, v5.2s, v19.2s
    trn2 v19.2s, v5.2s, v19.2s
    trn2 v5.2s, v1.2s, v15.2s
    fcsel d14, d17, d29, eq
    trn2 v21.2s, v16.2s, v27.2s
    mul v22.2s, v10.2s, v31.2s
    fcsel d18, d0, d4, eq
    trn2 v29.2s, v29.2s, v9.2s
    trn2 v9.2s, v28.2s, v20.2s
    trn1 v20.2s, v16.2s, v27.2s
    str d14, [sp, #192]
    stp d18, d3, [sp, #176]
    add v3.2s, v25.2s, v24.2s
    trn1 v14.2s, v8.2s, v2.2s
    trn1 v25.2s, v1.2s, v15.2s
    trn1 v15.2s, v11.2s, v0.2s
    lsr x8, x10, #32
    mul v24.2s, v29.2s, v31.2s
    fcsel d28, d11, d3, eq
    trn2 v1.2s, v8.2s, v2.2s
    umull v2.2d, v25.2s, v10.2s
    lsr x18, x20, #32
    umull v27.2d, v25.2s, v1.2s
    add x14, x10, x10
    umull v8.2d, v25.2s, v14.2s
    lsr x29, x25, #32
    mov x0, v28.d[0]
    trn2 v28.2s, v11.2s, v0.2s
    umlal v2.2d, v12.2s, v13.2s
    add x15, x30, x30
    umull v0.2d, v25.2s, v29.2s
    mul w24, w25, w30
    umlal v27.2d, v5.2s, v14.2s
    umull x7, w14, w20
    trn1 v18.2s, v4.2s, v3.2s
    trn1 v11.2s, v26.2s, v17.2s
    trn2 v16.2s, v23.2s, v6.2s
    umull x9, w14, w18
    trn2 v17.2s, v26.2s, v17.2s
    umlal v2.2d, v15.2s, v18.2s
    mul w3, w29, w15
    trn1 v6.2s, v23.2s, v6.2s
    trn2 v23.2s, v4.2s, v3.2s
    umull v26.2d, v25.2s, v13.2s
    umull x27, w14, w4
    umlal v27.2d, v12.2s, v16.2s
    mul w11, w20, w30
    umlal v0.2d, v5.2s, v10.2s
    umull x13, w14, w25
    umull v10.2d, v25.2s, v9.2s
    mul w1, w18, w15
    umull v3.2d, v25.2s, v6.2s
    umull x6, w14, w0
    umull v4.2d, v25.2s, v23.2s
    add x12, x18, x18
    umlal v0.2d, v12.2s, v9.2s
    lsr x19, x4, #32
    umlal v0.2d, v19.2s, v13.2s
    umaddl x6, w4, w4, x6
    umlal v10.2d, v5.2s, v13.2s
    umaddl x22, w0, w0, x13
    umlal v10.2d, v12.2s, v23.2s
    umaddl x2, w3, w25, x9
    umlal v26.2d, v12.2s, v18.2s
    umaddl x5, w1, w18, x6
    umull v13.2d, v25.2s, v16.2s
    lsr x23, x0, #32
    umlal v4.2d, v5.2s, v18.2s
    add x6, x23, x23
    umull v29.2d, v25.2s, v18.2s
    umull x9, w14, w8
    umlal v26.2d, v15.2s, v14.2s
    umaddl x13, w11, w20, x27
    mul v25.2s, v18.2s, v31.2s
    add x27, x8, x8
    umlal v2.2d, v20.2s, v14.2s
    umaddl x26, w24, w25, x7
    umlal v4.2d, v12.2s, v1.2s
    umaddl x9, w11, w6, x9
    umlal v3.2d, v12.2s, v22.2s
    umaddl x17, w1, w6, x13
    umlal v3.2d, v15.2s, v7.2s
    umull x16, w10, w10
    umlal v3.2d, v20.2s, v25.2s
    umlal v2.2d, v11.2s, v6.2s
    umaddl x21, w3, w29, x22
    umlal v27.2d, v19.2s, v6.2s
    mul w7, w23, w15
    umlal v0.2d, v15.2s, v23.2s
    umaddl x9, w1, w0, x9
    umlal v0.2d, v28.2s, v18.2s
    umaddl x22, w27, w8, x17
    umlal v10.2d, v19.2s, v18.2s
    umull x29, w14, w29
    umlal v4.2d, v19.2s, v14.2s
    umaddl x13, w7, w23, x16
    umlal v26.2d, v20.2s, v6.2s
    add x8, x4, x4
    umlal v13.2d, v5.2s, v6.2s
    umull x7, w14, w19
    umlal v10.2d, v15.2s, v1.2s
    add x28, x19, x19
    umlal v4.2d, v15.2s, v16.2s
    umaddl x15, w27, w3, x13
    shl v5.2s, v5.2s, #1
    umaddl x13, w24, w6, x7
    umlal v8.2d, v12.2s, v6.2s
    umaddl x7, w3, w12, x26
    umlal v29.2d, v12.2s, v14.2s
    umaddl x16, w3, w4, x9
    umlal v10.2d, v28.2s, v14.2s
    umaddl x13, w1, w20, x13
    umlal v27.2d, v28.2s, v22.2s
    umaddl x9, w27, w6, x7
    umlal v0.2d, v20.2s, v1.2s
    umaddl x17, w28, w24, x16
    umlal v0.2d, v21.2s, v14.2s
    umaddl x16, w3, w0, x13
    umlal v4.2d, v28.2s, v6.2s
    umull x13, w14, w23
    umlal v13.2d, v19.2s, v22.2s
    umaddl x15, w8, w24, x15
    umaddl x29, w27, w25, x29
    mul v18.2s, v14.2s, v31.2s
    umlal v0.2d, v11.2s, v16.2s
    umaddl x13, w24, w12, x13
    umlal v0.2d, v17.2s, v6.2s
    add x26, x0, x0
    umlal v29.2d, v15.2s, v6.2s
    umaddl x10, w8, w0, x9
    umlal v10.2d, v20.2s, v16.2s
    umaddl x13, w3, w20, x13
    shl v19.2s, v19.2s, #1
    umlal v27.2d, v21.2s, v7.2s
    add x25, x30, x30
    umlal v10.2d, v21.2s, v6.2s
    umaddl x7, w3, w6, x5
    umlal v13.2d, v28.2s, v7.2s
    umaddl x29, w8, w18, x29
    shl v6.2s, v28.2s, #1
    umaddl x5, w27, w0, x13
    umlal v8.2d, v15.2s, v22.2s
    umaddl x9, w27, w28, x7
    umlal v8.2d, v20.2s, v7.2s
    umaddl x7, w28, w3, x22
    umaddl x13, w27, w4, x16
    add x3, x20, x20
    umlal v3.2d, v11.2s, v18.2s
    ldr x4, [sp, #192]
    umlal v27.2d, v17.2s, v25.2s
    umaddl x18, w26, w24, x7
    umlal v4.2d, v21.2s, v22.2s
    umaddl x7, w27, w20, x2
    umlal v13.2d, v21.2s, v25.2s
    umaddl x3, w3, w24, x9
    umlal v13.2d, v17.2s, v18.2s
    ldp x2, x16, [sp, #160]
    umlal v27.2d, v15.2s, v24.2s
    umaddl x14, w8, w23, x7
    umlal v26.2d, v11.2s, v22.2s
    umaddl x7, w28, w1, x15
    umlal v29.2d, v20.2s, v22.2s
    umaddl x10, w28, w19, x10
    umlal v8.2d, v11.2s, v25.2s
    umull x22, w2, w2
    shl v25.2s, v17.2s, #1
    umaddl x1, w26, w11, x7
    umlal v2.2d, v5.2s, v9.2s
    umaddl x21, w27, w12, x21
    umlal v10.2d, v17.2s, v22.2s
    umaddl x12, w28, w0, x14
    add x15, x17, x1, lsr #26
    mul v14.2s, v23.2s, v31.2s
    umaddl x14, w8, w20, x21
    mul v9.2s, v9.2s, v31.2s
    umlal v13.2d, v12.2s, v24.2s
    add x11, x18, x15, lsr #25
    umlal v3.2d, v5.2s, v24.2s
    umaddl x18, w8, w19, x5
    mul v18.2s, v16.2s, v31.2s
    add x19, x13, x11, lsr #26
    umlal v8.2d, v5.2s, v16.2s
    add x8, x16, x16
    umlal v8.2d, v19.2s, v24.2s
    add x7, x3, x19, lsr #25
    umlal v3.2d, v19.2s, v9.2s
    and x1, x1, #0x3ffffff
    umlal v2.2d, v19.2s, v23.2s
    umaddl x13, w28, w20, x29
    shl v22.2s, v21.2s, #1
    add x21, x18, x7, lsr #26
    umlal v8.2d, v6.2s, v9.2s
    and x18, x7, #0x3ffffff
    umlal v8.2d, v22.2s, v14.2s
    add x29, x10, x21, lsr #25
    umlal v13.2d, v15.2s, v9.2s
    ldp x0, x24, [sp, #176]
    umaddl x7, w28, w6, x14
    mul v15.2s, v1.2s, v31.2s
    umlal v4.2d, v17.2s, v7.2s
    umaddl x13, w26, w23, x13
    umlal v2.2d, v6.2s, v1.2s
    add x14, x12, x29, lsr #26
    umlal v2.2d, v22.2s, v16.2s
    lsr x23, x0, #32
    umlal v8.2d, v25.2s, v15.2s
    lsr x26, x24, #32
    umlal v13.2d, v20.2s, v14.2s
    lsr x20, x2, #32
    umlal v13.2d, v11.2s, v15.2s
    add x12, x7, x14, lsr #25
    umlal v3.2d, v6.2s, v14.2s
    and x3, x29, #0x3ffffff
    umlal v3.2d, v22.2s, v15.2s
    add x27, x13, x12, lsr #26
    umlal v3.2d, v25.2s, v18.2s
    mul w10, w24, w30
    umlal v29.2d, v11.2s, v7.2s
    mul w6, w23, w25
    umlal v26.2d, v5.2s, v23.2s
    bfi x18, x21, #32, #25
    umlal v27.2d, v20.2s, v9.2s
    bic x9, x27, #0x1ffffff
    umlal v27.2d, v11.2s, v14.2s
    add x13, x2, x2
    umlal v29.2d, v5.2s, v1.2s
    lsr x5, x16, #32
    umaddl x22, w6, w23, x22
    umlal v29.2d, v19.2s, v16.2s
    lsr x21, x4, #32
    usra v13.2d, v3.2d, #26
    umull x6, w13, w5
    bfi x3, x14, #32, #25
    mul w17, w4, w30
    umlal v26.2d, v19.2s, v1.2s
    add x29, x1, x9, lsr 25
    add x1, x23, x23
    mul w2, w26, w25
    umlal v10.2d, v11.2s, v24.2s
    umaddl x6, w17, w1, x6
    umlal v2.2d, v25.2s, v24.2s
    and x11, x11, #0x3ffffff
    umlal v26.2d, v6.2s, v16.2s
    add x28, x20, x20
    usra v8.2d, v13.2d, #25
    umaddl x7, w2, w24, x6
    umlal v29.2d, v6.2s, v24.2s
    umull x14, w13, w16
    umlal v29.2d, v22.2s, v9.2s
    umull x6, w13, w26
    umlal v29.2d, v25.2s, v14.2s
    mul w25, w21, w25
    usra v27.2d, v8.2d, #26
    umaddl x14, w10, w24, x14
    umlal v4.2d, v20.2s, v24.2s
    stp x18, x3, [sp, #224]
    umlal v4.2d, v11.2s, v9.2s
    add x3, x29, x9, lsr #24
    usra v29.2d, v27.2d, #25
    umaddl x22, w28, w25, x22
    umlal v26.2d, v22.2s, v24.2s
    add x29, x5, x5
    umlal v26.2d, v25.2s, v9.2s
    and x18, x12, #0x3ffffff
    usra v4.2d, v29.2d, #26
    bfi x18, x27, #32, #25
    // v20 = 2p limb pair from [sp,#0] in both 64-bit lanes; byte 0 of the
    // low lane is replaced by 0xda further down (mov v20.b[0], v23.b[0]).
    ld1r {v20.2d}, [sp]
    add x12, x3, x9, lsr #21
    ushr v18.2d, v30.2d, #1
    bfi x11, x19, #32, #25
    usra v26.2d, v4.2d, #25
    umaddl x9, w25, w4, x6
    and v5.16b, v29.16b, v30.16b
    and v1.16b, v27.16b, v18.16b
    umaddl x14, w2, w1, x14
    and v28.16b, v13.16b, v18.16b
    and x6, x15, #0x1ffffff
    usra v10.2d, v26.2d, #26
    umaddl x15, w28, w24, x9
    and v15.16b, v4.16b, v18.16b
    add x9, x6, x12, lsr #26
    and v26.16b, v26.16b, v30.16b
    and x3, x12, #0x3ffffff
    usra v2.2d, v10.2d, #25
    bfi x3, x9, #32, #26
    shl v14.2d, v31.2d, #1
    umull x6, w13, w20
    uzp1 v5.4s, v5.4s, v15.4s
    umaddl x9, w8, w17, x22
    usra v0.2d, v2.2d, #26
    stp x3, x11, [sp, #208]
    and v17.16b, v10.16b, v18.16b
    umaddl x12, w10, w1, x6
    and v10.16b, v3.16b, v30.16b
    str x18, [sp, #240]
    bic v16.16b, v0.16b, v18.16b
    umull x6, w13, w21
    and v15.16b, v8.16b, v30.16b
    umaddl x12, w2, w0, x12
    usra v10.2d, v16.2d, #25
    umaddl x11, w28, w20, x14
    ushr v9.2d, v30.2d, #1
    add x18, x0, x0
    uzp1 v8.4s, v15.4s, v1.4s
    umaddl x6, w28, w4, x6
    usra v10.2d, v16.2d, #24
    umaddl x19, w25, w0, x7
    and v15.16b, v2.16b, v30.16b
    umull x7, w13, w0
    and v11.16b, v0.16b, v18.16b
    umaddl x12, w25, w16, x12
    usra v10.2d, v16.2d, #21
    umaddl x6, w8, w26, x6
    uzp1 v17.4s, v26.4s, v17.4s
    umaddl x7, w16, w16, x7
    trn1 v3.4s, v15.4s, v11.4s
    umaddl x27, w28, w16, x19
    umaddl x16, w29, w24, x6
    ldr b23, [sp, #152]
    uzp1 v1.4s, v5.4s, v17.4s
    and v15.16b, v10.16b, v30.16b
    umull x19, w13, w23
    uzp2 v2.4s, v5.4s, v17.4s
    umull x6, w13, w4
    add v16.4s, v1.4s, v20.4s
    umaddl x14, w29, w2, x9
    umull x9, w13, w24
    mov v11.d[0], v3.d[1]
    usra v28.2d, v10.2d, #26
    umaddl x22, w0, w0, x6
    add v4.4s, v3.4s, v20.4s
    umaddl x16, w18, w23, x16
    add v18.4s, v1.4s, v2.4s
    add x20, x26, x26
    uzp1 v15.4s, v15.4s, v28.4s
    umaddl x6, w2, w26, x7
    sub v10.4s, v16.4s, v2.4s
    umaddl x7, w29, w17, x12
    umaddl x19, w17, w20, x19
    mov v20.b[0], v23.b[0]
    uzp1 v1.4s, v15.4s, v8.4s
    umaddl x13, w25, w1, x6
    sub v4.4s, v4.4s, v11.4s
    umaddl x12, w18, w10, x14
    uzp2 v17.4s, v15.4s, v8.4s
    umaddl x6, w17, w4, x9
    add v29.4s, v1.4s, v20.4s
    umaddl x14, w28, w29, x13
    add v15.4s, v1.4s, v17.4s
    umaddl x3, w29, w25, x11
    zip1 v16.4s, v10.4s, v18.4s
    add x9, x7, x12, lsr #26
    sub v2.4s, v29.4s, v17.4s
    umaddl x13, w25, w20, x6
    shl v8.2s, v16.2s, #1
    mov v6.d[0], v16.d[1]
    zip2 v1.4s, v10.4s, v18.4s
    add x6, x24, x24
    zip2 v13.4s, v2.4s, v15.4s
    umaddl x7, w28, w1, x13
    zip1 v17.4s, v2.4s, v15.4s
    umaddl x26, w6, w17, x14
    umaddl x2, w8, w23, x15
    mul v5.2s, v6.2s, v14.2s
    mov v23.d[0], v17.d[1]
    mov v19.d[0], v13.d[1]
    add v15.4s, v3.4s, v11.4s
    shl v21.2s, v19.2s, #1
    and x10, x9, #0x1ffffff
    umull v26.2d, v17.2s, v17.2s
    umaddl x13, w25, w24, x19
    mov v22.d[0], v1.d[1]
    shl v3.2s, v17.2s, #1
    zip1 v2.4s, v4.4s, v15.4s
    umaddl x22, w25, w21, x22
    umull v0.2d, v3.2s, v16.2s
    umaddl x6, w28, w0, x13
    umaddl x14, w18, w17, x3
    shl v24.2s, v13.2s, #1
    mul v27.2s, v2.2s, v31.2s
    umaddl x13, w8, w0, x7
    shl v28.2s, v22.2s, #1
    mov v25.d[0], v2.d[1]
    ldp x17, x11, [sp, #0]
    shl v7.2s, v23.2s, #1
    mul v12.2s, v22.2s, v14.2s
    add x3, x14, x9, lsr #25
    umull v29.2d, v3.2s, v1.2s
    umaddl x7, w28, w20, x22
    shl v10.2s, v1.2s, #1
    umaddl x21, w29, w5, x13
    shl v11.2s, v6.2s, #1
    ldp x9, x23, [sp, #224]
    umull v17.2d, v3.2s, v25.2s
    add x13, x27, x3, lsr #26
    umlal v29.2d, v7.2s, v11.2s
    and x18, x3, #0x3ffffff
    umlal v29.2d, v24.2s, v16.2s
    umaddl x14, w8, w24, x7
    umull v4.2d, v3.2s, v6.2s
    ldp x7, x24, [sp, #208]
    umull v15.2d, v3.2s, v19.2s
    and x22, x12, #0x3ffffff
    umlal v17.2d, v7.2s, v2.2s
    and x20, x13, #0x1ffffff
    umull v18.2d, v3.2s, v23.2s
    umaddl x6, w8, w5, x6
    umlal v29.2d, v21.2s, v19.2s
    add x3, x26, x13, lsr #25
    umlal v4.2d, v7.2s, v16.2s
    add x5, x7, x11
    umlal v4.2d, v24.2s, v19.2s
    umaddl x7, w29, w0, x2
    umlal v0.2d, v7.2s, v21.2s
    add x13, x6, x3, lsr #26
    mul v20.2s, v25.2s, v14.2s
    add x15, x9, x17
    mul v14.2s, v1.2s, v31.2s
    add x9, x21, x13, lsr #25
    and x2, x13, #0x1ffffff
    umlal v15.2d, v7.2s, v13.2s
    umaddl x6, w29, w1, x14
    umlal v0.2d, v13.2s, v13.2s
    and x25, x3, #0x3ffffff
    umull v19.2d, v3.2s, v13.2s
    bfi x18, x20, #32, #25
    umlal v4.2d, v20.2s, v1.2s
    add x27, x24, x17
    umlal v26.2d, v7.2s, v20.2s
    sub x24, x27, x18
    umlal v26.2d, v24.2s, v27.2s
    bfi x25, x2, #32, #25
    umlal v26.2d, v21.2s, v12.2s
    add x0, x7, x9, lsr #26
    umlal v18.2d, v20.2s, v13.2s
    mov w12, w25
    umlal v18.2d, v21.2s, v27.2s
    and x19, x0, #0x1ffffff
    umlal v15.2d, v20.2s, v16.2s
    ldr x3, [sp, #240]
    umlal v0.2d, v20.2s, v11.2s
    add x26, x6, x0, lsr #25
    umlal v19.2d, v7.2s, v23.2s
    sub x4, x15, x25
    umlal v18.2d, v12.2s, v16.2s
    add x29, x16, x26, lsr #26
    umlal v15.2d, v27.2s, v11.2s
    lsr x7, x4, #32
    umlal v26.2d, v8.2s, v14.2s
    bic x13, x29, #0x1ffffff
    umlal v0.2d, v10.2s, v27.2s
    add x6, x22, x13, lsr #25
    umlal v0.2d, v12.2s, v22.2s
    and x1, x9, #0x3ffffff
    umull v10.2d, v3.2s, v22.2s
    add x6, x6, x13, lsr #24
    umlal v26.2d, v5.2s, v6.2s
    and x11, x29, #0x1ffffff
    umlal v19.2d, v21.2s, v20.2s
    add x6, x6, x13, lsr #21
    umlal v17.2d, v24.2s, v22.2s
    mov w22, w18
    umlal v10.2d, v7.2s, v1.2s
    add x0, x3, x17
    umlal v10.2d, v24.2s, v6.2s
    and x13, x26, #0x3ffffff
    umlal v10.2d, v21.2s, v16.2s
    bfi x13, x11, #32, #25
    umlal v10.2d, v20.2s, v2.2s
    add x27, x23, x17
    and v23.16b, v26.16b, v30.16b
    sub x23, x0, x13
    umull v22.2d, v3.2s, v2.2s
    add x9, x10, x6, lsr #26
    umlal v29.2d, v20.2s, v28.2s
    // x8 = 0x1db42 = 121666, the multiplier used by the umaddl group below.
    movz x8, #0xdb42
    movk x8, #0x0001, lsl 16
    umlal v4.2d, v27.2s, v28.2s
    and x16, x6, #0x3ffffff
    umlal v18.2d, v14.2s, v11.2s
    bfi x16, x9, #32, #26
    umlal v19.2d, v8.2s, v27.2s
    umaddl x21, w7, w8, x2
    umlal v19.2d, v12.2s, v11.2s
    mov w29, w16
    umlal v19.2d, v14.2s, v1.2s
    sub x5, x5, x16
    usra v18.2d, v26.2d, #26
    umaddl x17, w5, w8, x29
    umlal v22.2d, v7.2s, v28.2s
    umaddl x15, w4, w8, x12
    umlal v15.2d, v12.2s, v1.2s
    bfi x1, x19, #32, #25
    usra v19.2d, v18.2d, #25
    mov w6, w13
    umlal v17.2d, v21.2s, v1.2s
    sub x0, x27, x1
    umlal v17.2d, v8.2s, v6.2s
    lsr x27, x0, #32
    usra v15.2d, v19.2d, #26
    mov w10, w1
    umlal v22.2d, v24.2s, v1.2s
    umaddl x2, w27, w8, x19
    and v24.16b, v18.16b, v9.16b
    umaddl x29, w0, w8, x10
    usra v0.2d, v15.2d, #25
    umaddl x28, w23, w8, x6
    umlal v29.2d, v27.2s, v2.2s
    stp x25, x1, [sp, #320]
    umlal v22.2d, v21.2s, v11.2s
    and v1.16b, v15.16b, v9.16b
    lsr x25, x23, #32
    usra v4.2d, v0.2d, #26
    stp x16, x18, [sp, #304]
    and v7.16b, v0.16b, v30.16b
    lsr x16, x24, #32
    and v0.16b, v19.16b, v30.16b
    umaddl x11, w25, w8, x11
    usra v29.2d, v4.2d, #25
    umaddl x14, w16, w8, x20
    umlal v22.2d, v16.2s, v16.2s
    lsr x10, x5, #32
    umlal v22.2d, v20.2s, v25.2s
    umaddl x3, w24, w8, x22
    usra v10.2d, v29.2d, #26
    umaddl x6, w10, w8, x9
    and v27.16b, v29.16b, v30.16b
    str x13, [sp, #336]
    and v8.16b, v4.16b, v9.16b
    // x13 walks the 40 bytes at sp+304, x6 the X1 limbs at sp+352 and
    // x8 (set a few lines below) the 40 bytes at sp+208; each ld2 loads
    // one limb pair into a single lane and advances its pointer by 8.
    add x13, sp, #304
    usra v22.2d, v10.2d, #25
    add x18, x6, x17, lsr #26
    zip2 v20.4s, v7.4s, v8.4s
    and x1, x17, #0x3ffffff
    and v28.16b, v10.16b, v9.16b
    add x20, x3, x18, lsr #25
    usra v17.2d, v22.2d, #26
    add x6, sp, #352
    and v5.16b, v22.16b, v30.16b
    add x8, x14, x20, lsr #26
    ld2 {v21.S, v22.S}[1], [x13], #8
    and x20, x20, #0x3ffffff
    bic v2.16b, v17.16b, v9.16b
    add x19, x15, x8, lsr #25
    and v6.16b, v17.16b, v9.16b
    and x14, x8, #0x1ffffff
    usra v23.2d, v2.2d, #25
    add x8, sp, #208
    add x3, x21, x19, lsr #26
    ld2 {v12.S, v13.S}[1], [x13], #8
    umull x21, w5, w20
    ld2 {v21.S, v22.S}[0], [x6], #8
    usra v23.2d, v2.2d, #24
    and x12, x18, #0x1ffffff
    ld2 {v14.S, v15.S}[1], [x13], #8
    and x26, x19, #0x3ffffff
    ld2 {v12.S, v13.S}[0], [x6], #8
    zip2 v25.4s, v27.4s, v28.4s
    add x15, x29, x3, lsr #25
    usra v23.2d, v2.2d, #21
    and x29, x3, #0x1ffffff
    add x3, x2, x15, lsr #26
    ld2 {v16.S, v17.S}[1], [x13], #8
    ld2 {v14.S, v15.S}[0], [x6], #8
    umull x19, w5, w29
    usra v24.2d, v23.2d, #26
    add x2, x28, x3, lsr #25
    and v23.16b, v23.16b, v30.16b
    umull x18, w5, w14
    add x22, x11, x2, lsr #26
    ld2 {v18.S, v19.S}[1], [x13], #8
    zip2 v26.4s, v23.4s, v24.4s
    and x11, x3, #0x1ffffff
    ld2 {v23.S, v24.S}[1], [x8], #8
    bic x3, x22, #0x1ffffff
    zip2 v10.4s, v0.4s, v1.4s
    add x28, x1, x3, lsr #25
    umaddl x19, w10, w26, x19
    ld2 {v16.S, v17.S}[0], [x6], #8
    add x9, x28, x3, lsr #24
    ld2 {v0.S, v1.S}[1], [x8], #8
    zip2 v2.4s, v5.4s, v6.4s
    umull x13, w5, w12
    ld2 {v18.S, v19.S}[0], [x6], #8
    add x6, x9, x3, lsr #21
    ld2 {v7.S, v8.S}[1], [x8], #8
    and x28, x2, #0x3ffffff
    umull v4.2d, v21.2s, v24.2s
    umaddl x13, w10, w6, x13
    umull v9.2d, v21.2s, v0.2s
    mul w17, w28, w30
    ld2 {v27.S, v28.S}[1], [x8], #8
    umull x3, w5, w28
    umull v11.2d, v21.2s, v8.2s
    and x9, x22, #0x1ffffff
    umlal v4.2d, v22.2s, v23.2s
    umaddl x1, w16, w17, x13
    umull v3.2d, v21.2s, v27.2s
    and x2, x15, #0x3ffffff
    umlal v9.2d, v12.2s, v23.2s
    mul w15, w2, w30
    stp d26, d10, [sp, #256]
    stp d20, d25, [sp, #272]
    umull v26.2d, v21.2s, v7.2s
    umull v29.2d, v21.2s, v28.2s
    umull x22, w5, w11
    umlal v11.2d, v22.2s, v7.2s
    umaddl x1, w7, w15, x1
    umlal v11.2d, v12.2s, v1.2s
    umaddl x19, w24, w14, x19
    umlal v3.2d, v12.2s, v7.2s
    umaddl x21, w24, w6, x21
    umlal v3.2d, v14.2s, v0.2s
    umull x13, w5, w9
    umlal v3.2d, v16.2s, v23.2s
    umaddl x22, w10, w2, x22
    umull v25.2d, v21.2s, v1.2s
    umaddl x21, w4, w17, x21
    umlal v11.2d, v13.2s, v0.2s
    umaddl x13, w10, w28, x13
    umlal v11.2d, v14.2s, v24.2s
    umaddl x28, w10, w20, x18
    umull v20.2d, v21.2s, v23.2s
    umaddl x3, w24, w2, x3
    umlal v25.2d, v22.2s, v0.2s
    umaddl x13, w24, w11, x13
    str d2, [sp, #288]
    shl v2.2s, v22.2s, #1
    umlal v29.2d, v22.2s, v27.2s
    mul w9, w9, w30
    umlal v29.2d, v12.2s, v8.2s
    umaddl x3, w4, w26, x3
    umlal v29.2d, v13.2s, v7.2s
    umaddl x22, w24, w29, x22
    ld2 {v5.S, v6.S}[1], [x8], #8
    umaddl x13, w16, w2, x13
    umlal v11.2d, v15.2s, v23.2s
    umaddl x3, w0, w20, x3
    umlal v25.2d, v12.2s, v24.2s
    umaddl x8, w24, w12, x28
    umlal v29.2d, v14.2s, v1.2s
    umaddl x22, w16, w26, x22
    umlal v29.2d, v15.2s, v0.2s
    add x10, x10, x10
    umull v10.2d, v21.2s, v5.2s
    umaddl x13, w4, w29, x13
    umull v21.2d, v21.2s, v6.2s
    umull x28, w5, w2
    mul v6.2s, v6.2s, v31.2s
    umaddl x8, w16, w6, x8
    umlal v29.2d, v16.2s, v24.2s
    umlal v25.2d, v13.2s, v23.2s
    umaddl x13, w7, w26, x13
    umlal v29.2d, v17.2s, v23.2s
    umaddl x18, w24, w26, x28
    umlal v10.2d, v12.2s, v27.2s
    umaddl x8, w7, w17, x8
    umlal v10.2d, v14.2s, v7.2s
    umaddl x28, w0, w14, x13
    umlal v21.2d, v22.2s, v5.2s
    umaddl x21, w0, w15, x21
    mul w13, w26, w30
    mul v22.2s, v27.2s, v31.2s
    umull x26, w5, w26
    mul v5.2s, v5.2s, v31.2s
    umlal v10.2d, v16.2s, v0.2s
    umaddl x2, w16, w20, x19
    umlal v21.2d, v12.2s, v28.2s
    umaddl x21, w23, w13, x21
    umlal v21.2d, v13.2s, v27.2s
    umaddl x19, w23, w6, x3
    umlal v25.2d, v15.2s, v5.2s
    umaddl x3, w24, w20, x26
    umlal v26.2d, v12.2s, v0.2s
    umaddl x28, w27, w20, x28
    mul v27.2s, v7.2s, v31.2s
    umaddl x21, w10, w12, x21
    umlal v21.2d, v14.2s, v8.2s
    umaddl x19, w10, w11, x19
    umlal v21.2d, v15.2s, v7.2s
    umaddl x22, w4, w14, x22
    umlal v25.2d, v17.2s, v22.2s
    umaddl x18, w4, w20, x18
    umlal v25.2d, v19.2s, v27.2s
    add x26, x16, x16
    umlal v25.2d, v14.2s, v6.2s
    umaddl x16, w26, w9, x21
    umlal v26.2d, v14.2s, v23.2s
    umaddl x21, w7, w20, x22
    umlal v26.2d, v16.2s, v5.2s
    mul w20, w20, w30
    umlal v26.2d, v18.2s, v22.2s
    umaddl x1, w27, w13, x1
    umlal v3.2d, v18.2s, v5.2s
    mul w11, w11, w30
    umlal v4.2d, v13.2s, v5.2s
    umaddl x8, w27, w15, x8
    umlal v4.2d, v15.2s, v22.2s
    umlal v10.2d, v18.2s, v23.2s
    umull x22, w5, w6
    umlal v10.2d, v2.2s, v28.2s
    umaddl x19, w26, w29, x19
    shl v13.2s, v13.2s, #1
    umaddl x1, w25, w20, x1
    umlal v3.2d, v2.2s, v8.2s
    umaddl x18, w0, w6, x18
    umlal v3.2d, v13.2s, v1.2s
    umaddl x5, w4, w6, x3
    umlal v26.2d, v2.2s, v1.2s
    umaddl x22, w24, w17, x22
    umaddl x3, w4, w15, x22
    mul v28.2s, v28.2s, v31.2s
    umaddl x24, w24, w9, x1
    mul v7.2s, v1.2s, v31.2s
    umlal v4.2d, v17.2s, v27.2s
    add x1, x7, x7
    umlal v26.2d, v13.2s, v24.2s
    umaddl x22, w23, w17, x18
    umlal v11.2d, v17.2s, v5.2s
    umaddl x18, w0, w13, x3
    umaddl x3, w23, w12, x28
    shl v15.2s, v15.2s, #1
    umlal v29.2d, v19.2s, v5.2s
    umaddl x28, w4, w12, x2
    umlal v20.2d, v12.2s, v5.2s
    umaddl x18, w23, w20, x18
    umlal v3.2d, v15.2s, v24.2s
    umaddl x2, w0, w12, x21
    umlal v9.2d, v14.2s, v5.2s
    umaddl x13, w25, w13, x8
    umlal v9.2d, v16.2s, v22.2s
    umaddl x21, w0, w17, x5
    umlal v11.2d, v19.2s, v22.2s
    umaddl x5, w1, w11, x16
    umaddl x28, w7, w6, x28
    mul v5.2s, v0.2s, v31.2s
    umlal v20.2d, v14.2s, v22.2s
    umaddl x16, w23, w15, x21
    umaddl x21, w10, w29, x22
    mul v22.2s, v8.2s, v31.2s
    umlal v11.2d, v16.2s, v6.2s
    umaddl x20, w25, w6, x3
    umlal v4.2d, v19.2s, v5.2s
    umaddl x7, w27, w17, x28
    umlal v4.2d, v12.2s, v6.2s
    umaddl x28, w1, w14, x19
    add x3, x27, x27
    umlal v9.2d, v18.2s, v27.2s
    umaddl x13, w4, w9, x13
    umlal v9.2d, v2.2s, v24.2s
    umaddl x22, w10, w14, x16
    umlal v20.2d, v16.2s, v27.2s
    umaddl x7, w25, w15, x7
    umlal v20.2d, v18.2s, v5.2s
    mul w8, w29, w30
    umlal v20.2d, v2.2s, v6.2s
    umaddl x15, w4, w11, x24
    umaddl x4, w26, w14, x21
    mul v5.2s, v24.2s, v31.2s
    umlal v25.2d, v16.2s, v28.2s
    umaddl x29, w27, w6, x2
    umlal v25.2d, v18.2s, v22.2s
    umaddl x19, w0, w9, x7
    umlal v20.2d, v13.2s, v28.2s
    umaddl x7, w0, w11, x13
    umlal v4.2d, v14.2s, v28.2s
    umaddl x6, w10, w9, x18
    umlal v21.2d, v16.2s, v1.2s
    mul w24, w14, w30
    umlal v10.2d, v13.2s, v8.2s
    umaddl x10, w23, w8, x7
    umlal v10.2d, v15.2s, v1.2s
    umaddl x6, w26, w11, x6
    umlal v4.2d, v16.2s, v22.2s
    umaddl x7, w3, w8, x5
    umlal v4.2d, v18.2s, v7.2s
    umaddl x16, w0, w8, x15
    umlal v11.2d, v18.2s, v28.2s
    umaddl x13, w1, w8, x6
    ushr v8.2d, v30.2d, #1
    add x15, x25, x25
    umlal v9.2d, v13.2s, v6.2s
    umaddl x25, w25, w17, x29
    umlal v9.2d, v15.2s, v28.2s
    umaddl x17, w3, w24, x13
    umlal v20.2d, v15.2s, v22.2s
    umaddl x0, w23, w24, x16
    shl v2.2s, v19.2s, #1
    umlal v26.2d, v15.2s, v6.2s
    umaddl x2, w26, w12, x22
    shl v27.2s, v17.2s, #1
    umlal v9.2d, v27.2s, v22.2s
    mul w13, w12, w30
    umlal v20.2d, v27.2s, v7.2s
    umaddl x6, w1, w12, x4
    umlal v20.2d, v2.2s, v5.2s
    umaddl x22, w23, w11, x19
    umlal v26.2d, v27.2s, v28.2s
    umaddl x16, w1, w9, x2
    umlal v21.2d, v17.2s, v0.2s
    umaddl x19, w3, w12, x28
    umlal v21.2d, v18.2s, v24.2s
    umaddl x29, w3, w9, x6
    umlal v21.2d, v19.2s, v23.2s
    // x14 = { previous bit word : bit index } from [sp,#96].
    ldr x14, [sp, #96]
    umlal v9.2d, v2.2s, v7.2s
    umaddl x17, w15, w13, x17
    usra v4.2d, v20.2d, #26
    umaddl x29, w15, w11, x29
    umlal v10.2d, v27.2s, v24.2s
    umaddl x5, w3, w11, x16
    umlal v29.2d, v18.2s, v6.2s
    add x21, x0, x17, lsr #26
    usra v9.2d, v4.2d, #25
    // Next bit index. N is set here for the bpl at the end of the loop;
    // no instruction between this subs and that bpl writes the flags.
    subs w0, w14, #1
    umlal v3.2d, v27.2s, v6.2s
    umaddl x4, w15, w8, x5
    umlal v26.2d, v2.2s, v22.2s
    umaddl x7, w15, w24, x7
    usra v25.2d, v9.2d, #26
    add x13, sp, #104
    and v7.16b, v9.16b, v30.16b
    asr w16, w0, #5
    umlal v3.2d, v2.2s, v28.2s
    add x24, x7, x21, lsr #25
    usra v26.2d, v25.2d, #25
    // 32-bit scalar word containing the next bit. On the final pass the
    // index is -1, which reads the word at [sp,#100]; that value is not
    // used because the loop exits.
    ldr w6, [x13, w16, sxtw #2]
    and v13.16b, v25.16b, v8.16b
    add x12, x10, x24, lsr #26
    umlal v10.2d, v2.2s, v6.2s
    lsr x13, x14, #32
    usra v11.2d, v26.2d, #26
    add x14, x4, x12, lsr #25
    and v23.16b, v26.16b, v30.16b
    and w27, w0, #0x1f
    and v2.16b, v4.16b, v8.16b
    lsr w16, w6, w27
    usra v3.2d, v11.2d, #25
    add x3, x22, x14, lsr #26
    and v25.16b, v20.16b, v30.16b
    umaddl x28, w23, w9, x25
    and v15.16b, v11.16b, v8.16b
    add x29, x29, x3, lsr #25
    usra v29.2d, v3.2d, #26
    umaddl x6, w15, w9, x19
    and v5.16b, v3.16b, v30.16b
    add x22, x28, x29, lsr #26
    zip1 v24.2s, v23.2s, v15.2s
    zip1 v22.2s, v7.2s, v13.2s
    usra v10.2d, v29.2d, #25
    add x9, x6, x22, lsr #25
    and v6.16b, v29.16b, v8.16b
    // Swap word for the next pass: bit 0 = next bit xor previous bit.
    eor w1, w16, w13
    ldr d29, [sp, #288]
    add x5, x20, x9, lsr #26
    usra v21.2d, v10.2d, #26
    and x28, x17, #0x3ffffff
    and v0.16b, v10.16b, v30.16b
    bic x6, x5, #0x1ffffff
    zip2 v17.4s, v5.4s, v6.4s
    add x7, x28, x6, lsr #25
    bic v10.16b, v21.16b, v8.16b
    and x9, x9, #0x3ffffff
    and v1.16b, v21.16b, v8.16b
    add x17, x7, x6, lsr #24
    usra v25.2d, v10.2d, #25
    bfi x9, x5, #32, #25
    zip2 v19.4s, v0.4s, v1.4s
    add x6, x17, x6, lsr #21
    zip2 v15.4s, v23.4s, v15.4s
    and x7, x21, #0x1ffffff
    usra v25.2d, v10.2d, #24
    add x13, x7, x6, lsr #26
    ldp d21, d23, [sp, #256]
    and x6, x6, #0x3ffffff
    bfi x6, x13, #32, #26
    and x19, x24, #0x3ffffff
    usra v25.2d, v10.2d, #21
    bfi x19, x12, #32, #25
    mov v10.d[0], x6
    mov v18.d[0], x9
    mov v12.d[0], x19
    zip1 v26.2s, v5.2s, v6.2s
    usra v2.2d, v25.2d, #26
    and x6, x14, #0x3ffffff
    and v25.16b, v25.16b, v30.16b
    bfi x6, x3, #32, #25
    zip2 v13.4s, v7.4s, v13.4s
    and x7, x29, #0x3ffffff
    zip2 v11.4s, v25.4s, v2.4s
    bfi x7, x22, #32, #25
    zip1 v20.2s, v25.2s, v2.2s
    mov v14.d[0], x6
    mov v16.d[0], x7
    zip1 v28.2s, v0.2s, v1.2s
    ldp d25, d27, [sp, #272]
    // Save the bit index and the word holding the bit just read.
    stp w0, w16, [sp, #96]
    /* slothy optimized code ends */
    // Repeat while the bit index is non-negative (255 passes in total).
    bpl .L0

    // Write the result through the saved out pointer.
    ldr x0, [sp, #144]

    // X2
    stp d11, d13, [x0, #0]
    stp d15, d17, [x0, #16]
    str d19, [x0, #32]

    // Z2
    stp d10, d12, [x0, #40]
    stp d14, d16, [x0, #56]
    str d18, [x0, #72]

    // Epilogue: restore callee-saved registers and release the frame.
    add x10, sp, #392
    ldp d14, d15, [x10, #144]
    ldp d12, d13, [x10, #128]
    ldp d10, d11, [x10, #112]
    ldp d8, d9, [x10, #96]
    ldp x29, x30, [x10, #80]
    ldp x27, x28, [x10, #64]
    ldp x25, x26, [x10, #48]
    ldp x23, x24, [x10, #32]
    ldp x21, x22, [x10, #16]
    ldp x19, x20, [x10, #0]
    add sp, sp, #560
    ret

.section .note.GNU-stack,"",@progbits