-rw-r--r-- 37208 lib25519-20260614/crypto_nP/montgomery25519/arm64-neonplusuma9l-opt/mladder.S raw
#include "crypto_asm_hidden.h"
// linker define mladder
/* Assembly for Montgomery ladder.
The code has been optimized using Slothy.
https://github.com/slothy-optimizer/slothy
*/
.p2align 4
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(mladder)
.globl _CRYPTO_SHARED_NAMESPACE(mladder)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(mladder)
.globl CRYPTO_SHARED_NAMESPACE(mladder)
// mladder: Montgomery ladder on 9 limbs of 29 bits (AArch64, scalar + NEON).
//
// Arguments, as used by the code that follows:
//   x0  output buffer; 80 bytes are written after the loop
//       (ten 32-bit words for X2, then ten 32-bit words for Z2)
//   x1  32-byte point, read with two ldp
//   x2  32-byte scalar, copied to the stack and clamped there
//
// Stack frame: 576 bytes.
//   sp+0   .. sp+15   bias constants used before subtractions
//   sp+96  / sp+100   loop counter / previous scalar bit word
//   sp+104 .. sp+135  clamped scalar
//   sp+144            saved output pointer
//   sp+152, sp+160    further constants
//   sp+168 .. sp+351  spill slots used inside the loop
//   sp+360 .. sp+399  X1 limbs
//   sp+408 .. sp+567  callee-saved registers (this block)
//
// NOTE(review): the loop body uses x18 as an ordinary temporary. AAPCS64
// reserves x18 as the platform register; confirm that every target this
// file is built for allows that.
_CRYPTO_SHARED_NAMESPACE(mladder):
CRYPTO_SHARED_NAMESPACE(mladder):
sub sp, sp, #576
// The save area starts at sp+408. A scratch base register is used because
// the d-register pairs sit beyond the stp immediate range from sp.
add x9, sp, #408
stp d14, d15, [x9, #144]
stp d12, d13, [x9, #128]
stp d10, d11, [x9, #112]
stp d8, d9, [x9, #96]
stp x29, x30, [x9, #80]
stp x27, x28, [x9, #64]
stp x25, x26, [x9, #48]
stp x23, x24, [x9, #32]
stp x21, x22, [x9, #16]
stp x19, x20, [x9]
// clamp scalar
// Copy the 32-byte scalar at x2 to sp+104 .. sp+135, clearing bits 0..2 and
// setting bit 254 on the way. Bit 255 is copied unchanged; the ladder
// counter starts at 254, so that bit is never selected by the loop.
ldp x3, x4, [x2]
and x3, x3, #0xfffffffffffffff8
stp x3, x4, [sp, #104]
ldp x5, x6, [x2, #16]
orr x6, x6, #0x4000000000000000
stp x5, x6, [sp, #120]
// keep the output pointer; x0 is reused as a temporary from here on
str x0, [sp, #144]
// load point
// x4..x7 = the four 64-bit words of the point at x1
ldp x4, x5, [x1]
ldp x6, x7, [x1, #16]
// X1
// Split the 256-bit value into limbs of 29 bits: w8..w15 = limbs 0..7,
// w16 = limb 8 (23 bits, point bits 232..254). Bit 255 is not used.
// Limbs that lie inside a single 64-bit word:
and w8, w4, #0x1fffffff
ubfx x9, x4, #29, #29
ubfx x11, x5, #23, #29
ubfx x13, x6, #17, #29
ubfx x15, x7, #11, #29
ubfx x16, x7, #40, #23
// Limbs that straddle two words: take the 64 bits starting at the limb
// position from the register pair, then keep the low 29 bits.
extr x10, x5, x4, #58
extr x12, x6, x5, #52
extr x14, x7, x6, #46
and w10, w10, #0x1fffffff
and w12, w12, #0x1fffffff
and w14, w14, #0x1fffffff
// Store the limbs as ten 32-bit words at sp+360 (the last word is zero).
// w8..w16 are also read by the state initialisation that follows.
add x3, sp, #360
stp w8, w9, [x3]
stp w10, w11, [x3, #8]
stp w12, w13, [x3, #16]
stp w14, w15, [x3, #24]
stp w16, wzr, [x3, #32]
// Initial ladder state. Each vector register holds two 32-bit limbs in
// lanes s[0] and s[1]; only the low 64 bits of each register are written.
//   X2 in v11, v13, v15, v17, v19      Z2 in v10, v12, v14, v16, v18
//   X3 in v21, v23, v25, v27, v29      Z3 in v20, v22, v24, v26, v28
// Expects w8..w16 = X1 limbs 0..8 from the point unpacking above.
mov x3, #1
// (X2, Z2) ← (1, 0)
ins v11.d[0], x3
ins v10.d[0], xzr
ins v13.d[0], xzr
ins v12.d[0], xzr
ins v15.d[0], xzr
ins v14.d[0], xzr
ins v17.d[0], xzr
ins v16.d[0], xzr
ins v19.d[0], xzr
ins v18.d[0], xzr
// Z3 ← 1
ins v20.d[0], x3
ins v22.d[0], xzr
ins v24.d[0], xzr
ins v26.d[0], xzr
ins v28.d[0], xzr
// X3 ← X1
ins v21.s[0], w8
ins v21.s[1], w9
ins v23.s[0], w10
ins v23.s[1], w11
ins v25.s[0], w12
ins v25.s[1], w13
ins v27.s[0], w14
ins v27.s[1], w15
ins v29.s[0], w16
ins v29.s[1], wzr
// Constants and loop control.
// w30 = 1216 and x29 = 0x1fffffff (29-bit limb mask) stay in these
// registers; the loop reads both and never writes them.
mov w30, #1216
mov w29, #0x1fffffff
dup v31.2s, w30
dup v30.2d, x29
// Bias words added to limbs before a subtraction:
//   sp+0   : 0x3ffffffe, 0x3ffffffe
//   sp+8   : 0x3fffffda, 0x3ffffffe
//   sp+160 : 0x00fffffe, 0
// These equal twice the limbs of 2^255 - 19 in radix 2^29
// (2*(2^29-19), 2*(2^29-1), 2*(2^23-1)).
mov x2, #0x3ffffffe3ffffffe
sub x1, x2, #0x24
mov x3, #0xfffffe
stp x2, x1, [sp]
str x3, [sp, #160]
// sp+152: the byte 0xda, low byte of 0x3fffffda
mov w2, #0xda
strb w2, [sp, #152]
// sp+96 = bit counter (starts at 254); sp+100 = previous bit word.
// w1 = top scalar byte >> 6, so bit 0 of w1 is scalar bit 254. Only
// bit 0 of w1 is tested at the top of the loop.
mov w0, #254
ldrb w1, [sp, #135]
lsr w1, w1, #6
stp w0, w1, [sp, #96]
// Montgomery ladder loop
// One pass per scalar bit. The instruction order below was produced by
// an optimizer (see the markers), so scalar and NEON instructions that
// belong together are interleaved and not adjacent.
//
// State on entry to each pass:
//   w1            bit 0 selects the operands (tested by tst below)
//   v10 .. v29    X2/Z2/X3/Z3 limbs, two 32-bit limbs per register
//   w30 = 1216, x29 = 0x1fffffff (set before the loop, only read here)
//
// This first part forms limb-wise sums and differences of the state
// registers, selects between them with fcsel, and starts the scalar
// (umull/umaddl) and NEON (umull/umlal) multiply chains.
.L0:
/* slothy optimized code starts */
// d6, d0 and (further down) d9 are the bias constants stored at sp+0,
// sp+160 and sp+8 before the loop. Each difference below is computed as
// (a + bias) - b.
ldr d6, [sp, #0]
ldr d0, [sp, #160]
add v1.2s, v15.2s, v14.2s
// NOTE(review): x9 is written again by a later umull before anything
// reads it, so this address computation looks dead. sp+576 is the value
// sp had on entry, i.e. the first byte above this frame.
add x9, sp, #576
add v8.2s, v27.2s, v26.2s
add v4.2s, v6.2s, v27.2s
add v3.2s, v29.2s, v28.2s
add v31.2s, v6.2s, v15.2s
add v5.2s, v6.2s, v23.2s
add v7.2s, v0.2s, v29.2s
add v15.2s, v21.2s, v20.2s
sub v2.2s, v31.2s, v14.2s
add v14.2s, v13.2s, v12.2s
sub v4.2s, v4.2s, v26.2s
ldr d9, [sp, #8]
sub v26.2s, v7.2s, v28.2s
add v28.2s, v23.2s, v22.2s
add v23.2s, v6.2s, v17.2s
// Set the flags once from bit 0 of w1. No instruction between here and
// the last fcsel of this part writes the flags. Every fcsel takes its
// first source when the bit is 0 (eq) and its second source otherwise.
tst w1, #1
add v13.2s, v6.2s, v13.2s
add v29.2s, v9.2s, v21.2s
add v21.2s, v0.2s, v19.2s
fcsel d31, d14, d28, eq
add v0.2s, v9.2s, v11.2s
sub v21.2s, v21.2s, v18.2s
add v19.2s, v19.2s, v18.2s
sub v18.2s, v5.2s, v22.2s
add v22.2s, v25.2s, v24.2s
// selected limb pairs are moved to general registers for the scalar chain
mov x4, v31.d[0]
trn2 v5.2s, v1.2s, v2.2s
fcsel d9, d19, d3, eq
sub v27.2s, v29.2s, v20.2s
fcsel d29, d1, d22, eq
// x19 = 2 * x4; the other "add xN, xM, xM" below double operands the same
// way. Together with products such as w7*w7 this looks like a squaring
// with doubled cross terms - TODO confirm.
add x19, x4, x4
add v6.2s, v6.2s, v25.2s
trn1 v7.2s, v4.2s, v8.2s
mov x20, v29.d[0]
trn1 v25.2s, v27.2s, v15.2s
mov x7, v9.d[0]
trn1 v19.2s, v19.2s, v21.2s
lsr x6, x20, #32
sub v20.2s, v13.2s, v12.2s
add x23, x6, x6
trn1 v9.2s, v26.2s, v3.2s
add v12.2s, v11.2s, v10.2s
lsr x2, x4, #32
add x0, x2, x2
add v31.2s, v17.2s, v16.2s
umull x16, w19, w7
sub v16.2s, v23.2s, v16.2s
fcsel d29, d12, d15, eq
umull x11, w0, w7
umull v23.2d, v5.2s, v9.2s
add x26, x20, x20
mov x13, v29.d[0]
sub v17.2s, v0.2s, v10.2s
umull x28, w26, w7
sub v10.2s, v6.2s, v24.2s
add x25, x13, x13
trn1 v6.2s, v31.2s, v16.2s
umull x9, w25, w20
lsr x15, x13, #32
umull x24, w25, w7
add x18, x15, x15
fcsel d3, d16, d4, eq
// w1 is reused as a temporary from here on; the flags still hold the
// result of the tst above
umull x1, w18, w7
fcsel d13, d31, d8, eq
umaddl x8, w18, w2, x9
fcsel d11, d20, d18, eq
trn2 v4.2s, v4.2s, v8.2s
mov x12, v13.d[0]
umull x21, w23, w7
trn2 v13.2s, v31.2s, v16.2s
trn2 v0.2s, v18.2s, v28.2s
fcsel d24, d17, d27, eq
lsr x17, x12, #32
trn1 v29.2s, v12.2s, v17.2s
add x27, x12, x12
fcsel d31, d21, d26, eq
umaddl x9, w19, w17, x1
trn1 v21.2s, v1.2s, v2.2s
umaddl x11, w26, w17, x11
umaddl x1, w0, w17, x16
add x22, x17, x17
fcsel d2, d2, d10, eq
umaddl x9, w0, w12, x9
umaddl x3, w23, w12, x11
trn2 v16.2s, v10.2s, v22.2s
umlal v23.2d, v6.2s, v4.2s
umaddl x1, w26, w12, x1
umlal v23.2d, v13.2s, v7.2s
umaddl x26, w26, w6, x9
// spill two selected limb pairs; they are reloaded into general
// registers further down in the loop
stp d24, d11, [sp, #168]
// Loop body, continued. Two independent multiply chains run interleaved:
//   - scalar: umull/umaddl on w registers, followed by carry propagation
//     with "and #0x1fffffff" (keep 29 bits) and "add ..., lsr #29"
//     (carry into the next column);
//   - NEON: umull/umlal on .2s lanes into 64-bit accumulators.
// The scalar result of this part is packed two limbs per 64-bit word and
// stored at sp+216 .. sp+255.
trn1 v10.2s, v10.2s, v22.2s
trn2 v11.2s, v14.2s, v20.2s
umull v22.2d, v29.2s, v9.2s
umaddl x16, w18, w17, x24
umull v24.2d, v11.2s, v9.2s
umull x24, w7, w7
umull v8.2d, v29.2s, v7.2s
umaddl x9, w23, w17, x28
trn1 v1.2s, v18.2s, v28.2s
trn2 v28.2s, v27.2s, v15.2s
umull v15.2d, v29.2s, v10.2s
umaddl x1, w6, w6, x1
umaddl x28, w27, w17, x21
// low 29 bits of a column sum; the bits above are added to the next
// column by the "lsr #29" operand a few lines below
and x21, x26, #0x1fffffff
umaddl x11, w12, w12, x9
trn2 v18.2s, v12.2s, v17.2s
umlal v24.2d, v21.2s, v4.2s
umull x9, w27, w7
umull v17.2d, v18.2s, v9.2s
add x14, x1, x26, lsr #29
// spill; reloaded into x5/x21 by the ldp from sp+184 below
stp d2, d3, [sp, #184]
trn1 v12.2s, v14.2s, v20.2s
umull v3.2d, v29.2s, v16.2s
add x23, x3, x14, lsr #29
umlal v15.2d, v18.2s, v0.2s
and x1, x14, #0x1fffffff
umlal v17.2d, v12.2s, v4.2s
add x26, x11, x23, lsr #29
umlal v17.2d, v11.2s, v7.2s
umaddl x8, w4, w4, x8
umlal v17.2d, v21.2s, v16.2s
umull x13, w13, w13
umlal v17.2d, v5.2s, v10.2s
// multiply a high limb by w30 = 1216 (= 19 * 2^6) before it is added to a
// low column; 9 limbs of 29 bits span 261 = 255 + 6 bits
umull x1, w1, w30
umlal v17.2d, v6.2s, v0.2s
umull x11, w25, w15
umlal v17.2d, v13.2s, v1.2s
umull x21, w21, w30
umlal v17.2d, v19.2s, v28.2s
add x5, x28, x26, lsr #29
umlal v15.2d, v12.2s, v1.2s
add x14, x11, x1
umlal v15.2d, v11.2s, v28.2s
umaddl x28, w17, w17, x9
umlal v15.2d, v21.2s, v25.2s
and x9, x26, #0x1fffffff
umlal v3.2d, v18.2s, v10.2s
add x27, x13, x21
umlal v3.2d, v12.2s, v0.2s
umull x1, w25, w17
umlal v24.2d, v5.2s, v7.2s
umull v14.2d, v12.2s, v9.2s
umlal v8.2d, v18.2s, v16.2s
umaddl x21, w18, w12, x1
umlal v8.2d, v12.2s, v10.2s
umull x26, w25, w2
umlal v24.2d, v6.2s, v16.2s
umull x11, w25, w4
umull v2.2d, v29.2s, v0.2s
umull x17, w25, w6
umlal v14.2d, v11.2s, v4.2s
umaddl x1, w19, w6, x21
umlal v14.2d, v21.2s, v7.2s
umaddl x15, w15, w15, x11
umlal v14.2d, v5.2s, v16.2s
add x13, x14, x27, lsr #29
umlal v2.2d, v18.2s, v1.2s
and x11, x5, #0x1fffffff
// v26 = 0x1fffffff in both 64-bit lanes: limb mask for the NEON chain
dup v26.2d, x29
str d31, [sp, #200]
umull v31.2d, v29.2s, v4.2s
add x28, x28, x5, lsr #29
umlal v8.2d, v11.2s, v0.2s
ldp x5, x21, [sp, #184]
umlal v2.2d, v12.2s, v28.2s
and x14, x23, #0x1fffffff
umlal v2.2d, v11.2s, v25.2s
umull x14, w14, w30
umlal v31.2d, v18.2s, v7.2s
umaddl x23, w18, w20, x17
umlal v31.2d, v12.2s, v16.2s
add x3, x5, x5
umlal v31.2d, v11.2s, v10.2s
add x14, x15, x14
umlal v31.2d, v21.2s, v0.2s
and x15, x28, #0x1fffffff
umlal v31.2d, v5.2s, v1.2s
umull x17, w15, w30
umlal v31.2d, v6.2s, v28.2s
umull x15, w11, w30
umlal v8.2d, v21.2s, v1.2s
umull x7, w22, w7
umlal v24.2d, v13.2s, v10.2s
umull x9, w9, w30
umlal v24.2d, v19.2s, v0.2s
umaddl x11, w18, w4, x26
umlal v22.2d, v18.2s, v4.2s
umaddl x26, w19, w12, x16
umlal v22.2d, v12.2s, v7.2s
add x16, x7, x28, lsr #29
umlal v22.2d, v11.2s, v16.2s
add x7, x11, x9
umull x11, w25, w12
umlal v22.2d, v21.2s, v10.2s
add x28, x24, x16, lsr #29
umlal v3.2d, v11.2s, v1.2s
and x16, x16, #0x1fffffff
umlal v3.2d, v21.2s, v28.2s
umaddl x22, w0, w6, x26
umull v21.2d, v21.2s, v9.2s
umaddl x23, w19, w2, x23
umlal v23.2d, v19.2s, v16.2s
umull x9, w16, w30
umlal v14.2d, v6.2s, v10.2s
// reload of the value spilled to sp+200 above
ldr x26, [sp, #200]
umull v11.2d, v29.2s, v25.2s
lsr x24, x28, #29
umlal v21.2d, v5.2s, v4.2s
umull x4, w24, w30
umlal v21.2d, v6.2s, v7.2s
umaddl x1, w0, w20, x1
umlal v21.2d, v13.2s, v16.2s
// reload of the pair spilled to sp+168 earlier in the loop
ldp x16, x12, [sp, #168]
umlal v14.2d, v13.2s, v0.2s
and x25, x27, #0x1fffffff
umlal v14.2d, v19.2s, v1.2s
lsr x27, x21, #32
umaddl x6, w18, w6, x11
add x24, x14, x13, lsr #29
umaddl x18, w20, w20, x22
lsr x22, x16, #32
umlal v8.2d, v5.2s, v28.2s
add x0, x7, x24, lsr #29
// NEON carry step: add the accumulator shifted right by 29 into the next
usra v14.2d, v17.2d, #29
and x11, x24, #0x1fffffff
add x17, x23, x17
umaddl x24, w19, w20, x6
umlal v21.2d, v19.2s, v10.2s
add x23, x8, x15
usra v24.2d, v14.2d, #29
and x8, x28, #0x1fffffff
add x14, x23, x0, lsr #29
umaddl x28, w2, w2, x24
umull v10.2d, v6.2s, v9.2s
umull x23, w8, w30
add x15, x17, x14, lsr #29
add x24, x22, x22
add x17, x18, x4
add x28, x28, x9
usra v21.2d, v24.2d, #29
add x8, x28, x15, lsr #29
add x28, x1, x23
// pack two 29-bit limbs into one register: low limb in bits 0..28, next
// limb inserted at bit 32
bfi x11, x0, #32, #29
umlal v22.2d, v5.2s, v0.2s
add x19, x28, x8, lsr #29
// v30 = 1216 in both 32-bit lanes, multiplier for the NEON reduction
dup v30.2s, w30
and x10, x14, #0x1fffffff
bfi x10, x15, #32, #29
and x15, x8, #0x1fffffff
bfi x15, x19, #32, #29
umlal v3.2d, v5.2s, v25.2s
add x18, x16, x16
// scalar result, packed limbs stored at sp+232 and sp+240
stp x10, x15, [sp, #232]
umull x10, w3, w26
usra v23.2d, v21.2d, #29
add x17, x17, x19, lsr #29
umlal v10.2d, v13.2s, v4.2s
lsr x20, x5, #32
umlal v10.2d, v19.2s, v7.2s
// Top limb: x9 = everything above the low 23 bits of x17. The three adds
// below (lsr #23, #22, #19) add 1 + 2 + 16 = 19 times that overflow to
// the lowest limb.
bic x9, x17, #0x7fffff
umull v0.2d, v29.2s, v28.2s
add x28, x25, x9, lsr #23
umull v7.2d, v29.2s, v1.2s
add x0, x20, x20
add x4, x28, x9, lsr #22
umull x2, w18, w21
usra v10.2d, v23.2d, #29
umull x19, w24, w26
and v27.16b, v23.16b, v26.16b
umaddl x23, w0, w27, x10
umlal v22.2d, v6.2s, v1.2s
add x28, x4, x9, lsr #19
and x10, x13, #0x1fffffff
add x7, x12, x12
and v21.16b, v21.16b, v26.16b
add x4, x10, x28, lsr #29
xtn v16.2s, v21.2d
umaddl x15, w21, w21, x23
and x23, x28, #0x1fffffff
umull x28, w18, w27
and v23.16b, v17.16b, v26.16b
umull x9, w18, w5
umull x1, w18, w26
xtn v5.2s, v23.2d
and v23.16b, v14.16b, v26.16b
umaddl x14, w24, w21, x28
umull v17.2d, v13.2s, v9.2s
// the second limb is inserted 30 bits wide here: it is not masked to 29
// bits after receiving the carry from the lowest limb
bfi x23, x4, #32, #30
umull v20.2d, v5.2s, v30.2s
umaddl x6, w24, w27, x1
// scalar result, packed limbs stored at sp+216 and sp+224
stp x23, x11, [sp, #216]
umaddl x4, w7, w20, x14
lsr x23, x12, #32
umull x25, w16, w16
and v29.16b, v10.16b, v26.16b
umull x1, w7, w26
add x16, x23, x23
umaddl x14, w7, w27, x19
umaddl x11, w16, w21, x14
xtn v14.2s, v27.2d
umlal v22.2d, v13.2s, v28.2s
umaddl x1, w16, w27, x1
umlal v31.2d, v13.2s, v25.2s
umull x13, w18, w20
umlal v8.2d, v6.2s, v25.2s
umaddl x10, w3, w20, x11
umull v16.2d, v16.2s, v30.2s
umaddl x1, w3, w21, x1
umaddl x19, w20, w20, x1
// low 23 bits of the top limb, stored as the last word of the scalar
// result at sp+248
and x11, x17, #0x7fffff
umaddl x8, w24, w23, x9
and x1, x10, #0x1fffffff
str x11, [sp, #248]
// Loop body, continued. The scalar chain finishes a second product and
// reduces it; the NEON chain reduces its accumulators, combines them with
// add/sub and starts the next round of umull/umlal. Scalar results of
// this part are stored packed at sp+312 .. sp+351.
umull x14, w1, w30
add x17, x19, x10, lsr #29
umull x9, w18, w22
umlal v17.2d, v19.2s, v4.2s
umull x10, w18, w23
umlal v7.2d, v18.2s, v28.2s
and x1, x17, #0x1fffffff
umull x1, w1, w30
add x19, x25, x14
add v16.2d, v2.2d, v16.2d
umull x28, w16, w26
usra v17.2d, v10.2d, #29
umaddl x13, w24, w5, x13
umlal v0.2d, v18.2s, v25.2s
add x1, x9, x1
umlal v22.2d, v19.2s, v25.2s
umaddl x11, w24, w12, x10
umaddl x14, w24, w20, x2
xtn v29.2s, v29.2d
xtn v18.2s, v23.2d
umaddl x25, w7, w21, x6
umlal v7.2d, v12.2s, v25.2s
// x24 = 0x1fffffff >> 6 = 0x7fffff, the 23-bit mask for the top limb;
// copied into both lanes of v30 further down
lsr x24, x29, #6
umull v21.2d, v18.2s, v30.2s
umull x9, w0, w26
and v18.16b, v17.16b, v26.16b
umaddl x2, w3, w27, x28
umaddl x3, w12, w12, x8
xtn v23.2s, v18.2d
and v18.16b, v24.16b, v26.16b
add x28, x21, x21
add v21.2d, v0.2d, v21.2d
umaddl x0, w0, w21, x2
add x1, x1, x19, lsr #29
// NOTE(review): x6 is set to sp+576 and then loaded from that address
// (two lines below), but x6 is overwritten by movz/movk before anything
// reads it. sp+576 is the first byte above this frame; the load looks
// dead.
add x6, sp, #576
// b10 = the byte 0xda stored at sp+152 before the loop
ldr b10, [sp, #152]
umull x2, w26, w26
and x8, x1, #0x1fffffff
and x19, x19, #0x1fffffff
ldr x6, [x6, #0]
umull v6.2d, v19.2s, v9.2s
umaddl x21, w28, w27, x9
umaddl x10, w7, w5, x14
xtn v1.2s, v18.2d
umull v23.2d, v23.2s, v30.2s
add x9, x0, x17, lsr #29
add x0, x27, x27
// reload of the packed scalar result stored at sp+216 / sp+224
ldp x14, x17, [sp, #216]
usra v6.2d, v17.2d, #29
add x15, x15, x9, lsr #29
umull v5.2d, v1.2s, v30.2s
umull x28, w28, w26
add v4.2d, v8.2d, v23.2d
add x21, x21, x15, lsr #29
ushr v9.2d, v6.2d, #29
umaddl x10, w23, w23, x10
umaddl x27, w27, w27, x28
// bias constant for the top limb (sp+160), scalar and NEON copies
ldr x28, [sp, #160]
xtn v17.2s, v9.2d
umaddl x20, w16, w20, x25
umull v9.2d, v29.2s, v30.2s
umull x26, w0, w26
umull v19.2d, v17.2s, v30.2s
add x25, x27, x21, lsr #29
umaddl x20, w5, w5, x20
ldr d8, [sp, #160]
umull v28.2d, v14.2s, v30.2s
add x27, x26, x25, lsr #29
// v14 = the 64-bit bias word at sp+0 replicated into both lanes
ld1r {v14.2d}, [sp]
and x25, x25, #0x1fffffff
umull x26, w25, w30
add x25, x2, x27, lsr #29
add v12.2d, v11.2d, v20.2d
and x0, x27, #0x1fffffff
lsr x27, x25, #29
umull x0, w0, w30
add v24.2d, v7.2d, v5.2d
umull x18, w18, w12
usra v21.2d, v12.2d, #29
umaddl x2, w7, w23, x13
add v29.2d, v22.2d, v19.2d
umull x23, w27, w30
and v18.16b, v12.16b, v26.16b
umaddl x22, w22, w22, x18
usra v24.2d, v21.2d, #29
add x10, x10, x0
and v5.16b, v6.16b, v26.16b
add x13, x2, x26
and v7.16b, v21.16b, v26.16b
and x12, x9, #0x1fffffff
add v1.2d, v15.2d, v28.2d
umull x18, w12, w30
usra v16.2d, v24.2d, #29
and x7, x15, #0x1fffffff
and v2.16b, v24.16b, v26.16b
and x21, x21, #0x1fffffff
add v9.2d, v3.2d, v9.2d
umull x26, w21, w30
usra v1.2d, v16.2d, #29
umull x2, w7, w30
and v24.16b, v16.16b, v26.16b
add x21, x22, x18
xtn v5.2s, v5.2d
add x12, x21, x1, lsr #29
usra v9.2d, v1.2d, #29
add x9, x11, x2
umull v21.2d, v5.2s, v30.2s
add x9, x9, x12, lsr #29
// v30 lanes <- 0x7fffff (lane 0 here, lane 1 a few lines below)
mov v30.d[0], x24
add x18, x3, x26
usra v4.2d, v9.2d, #29
add x27, x18, x9, lsr #29
uzp1 v15.4s, v2.4s, v24.4s
and x1, x25, #0x1fffffff
add v31.2d, v31.2d, v21.2d
umull x15, w1, w30
and v21.16b, v9.16b, v26.16b
add x21, x13, x27, lsr #29
and v6.16b, v4.16b, v26.16b
add x11, x20, x23
usra v31.2d, v4.2d, #29
add x22, x10, x21, lsr #29
and v24.16b, v1.16b, v26.16b
add x1, x4, x15
add x25, x1, x22, lsr #29
mov v30.d[1], x24
and v2.16b, v31.16b, v26.16b
// x13, x3 = the two bias words stored at sp+0 and sp+8
ldp x13, x3, [sp, #0]
usra v29.2d, v31.2d, #29
and x1, x25, #0x1fffffff
uzp1 v2.4s, v6.4s, v2.4s
umaddl x26, w16, w5, x1
uzp1 v24.4s, v24.4s, v21.4s
add x5, x11, x25, lsr #29
// NEON top limb: v5 = bits above the low 23, v21 = low 23 bits. The three
// usra below (#23, #22, #19) add 19 times the overflow to v18.
bic v5.16b, v29.16b, v30.16b
and x24, x21, #0x1fffffff
and v21.16b, v29.16b, v30.16b
uzp2 v25.4s, v24.4s, v2.4s
add x11, x17, x13
trn1 v21.4s, v21.4s, v19.4s
and x23, x27, #0x1fffffff
usra v18.2d, v5.2d, #23
and x17, x12, #0x1fffffff
mov v19.d[0], v21.d[1]
dup v30.2s, w30
add v8.4s, v21.4s, v8.4s
and x2, x9, #0x1fffffff
uzp1 v12.4s, v24.4s, v2.4s
bfi x17, x2, #32, #29
usra v18.2d, v5.2d, #22
add x0, x14, x3
add v31.4s, v12.4s, v25.4s
bfi x23, x24, #32, #29
add v2.4s, v12.4s, v14.4s
add x14, x5, x26, lsr #29
usra v18.2d, v5.2d, #19
// x6 = 0x1db42 = 121666, used as a multiplier by the umaddl below
movz x6, #0xdb42
movk x6, #0x0001, lsl 16
sub v8.4s, v8.4s, v19.4s
mov w27, w23
sub v6.4s, v2.4s, v25.4s
// scalar top limb: same times-19 fold as above, on x14
bic x21, x14, #0x7fffff
add v2.4s, v21.4s, v19.4s
add x1, x19, x21, lsr #23
usra v7.2d, v18.2d, #29
// reload of the packed scalar result stored at sp+232 / sp+240
ldp x3, x9, [sp, #232]
zip1 v1.4s, v6.4s, v31.4s
and x7, x22, #0x1fffffff
zip1 v8.4s, v8.4s, v2.4s
add x1, x1, x21, lsr #22
// replace the low byte of v14 with 0xda, so that lane s[0] becomes
// 0x3fffffda while the other 32-bit lanes stay 0x3ffffffe
mov v14.b[0], v10.b[0]
mov v23.d[0], v1.d[1]
zip2 v0.4s, v6.4s, v31.4s
add x15, x9, x13
add v2.2d, v23.2d, v23.2d
ldr x10, [sp, #248]
add v5.2d, v0.2d, v0.2d
and x22, x26, #0x1fffffff
and v25.16b, v18.16b, v26.16b
sub x25, x11, x17
dup v26.2d, x29
mov v13.d[0], v0.d[1]
uzp1 v22.4s, v25.4s, v7.4s
add x16, x10, x28
add x9, x1, x21, lsr #19
umull v3.2d, v2.2s, v8.2s
add x1, x3, x13
add v12.2d, v1.2d, v1.2d
umull v19.2d, v5.2s, v8.2s
uzp2 v29.4s, v22.4s, v15.4s
umull v11.2d, v8.2s, v8.2s
add x21, x8, x9, lsr #29
uzp1 v25.4s, v22.4s, v15.4s
and x4, x9, #0x1fffffff
add v21.2d, v13.2d, v13.2d
mov w26, w17
add v18.4s, v25.4s, v14.4s
and x12, x14, #0x7fffff
umull v17.2d, v12.2s, v8.2s
umull v20.2d, v21.2s, v8.2s
bfi x4, x21, #32, #30
sub v31.4s, v18.4s, v29.4s
sub x3, x1, x23
add v6.4s, v25.4s, v29.4s
sub x5, x0, x4
umlal v3.2d, v5.2s, v13.2s
lsr x18, x5, #32
umlal v17.2d, v2.2s, v13.2s
umaddl x11, w25, w6, x26
umlal v17.2d, v0.2s, v0.2s
umaddl x9, w18, w6, x21
zip1 v18.4s, v31.4s, v6.4s
// second scalar result, packed limbs stored at sp+312 and sp+320
stp x4, x17, [sp, #312]
zip2 v4.4s, v31.4s, v6.4s
mov w1, w4
umull v27.2d, v18.2s, v18.2s
umaddl x0, w5, w6, x1
add v15.2d, v18.2d, v18.2d
lsr x10, x25, #32
mov v10.d[0], v18.d[1]
mov v28.d[0], v4.d[1]
add v22.2d, v4.2d, v4.2d
add x21, x9, x0, lsr #29
add v29.2d, v28.2d, v28.2d
umaddl x1, w10, w6, x2
umull v9.2d, v15.2s, v28.2s
umaddl x28, w3, w6, x27
umull v7.2d, v15.2s, v10.2s
add x8, x11, x21, lsr #29
add v25.2d, v10.2d, v10.2d
bfi x7, x22, #32, #29
umull v14.2d, v15.2s, v1.2s
add x2, x1, x8, lsr #29
umull v24.2d, v15.2s, v0.2s
sub x27, x15, x7
umlal v9.2d, v25.2s, v4.2s
lsr x13, x3, #32
umull v18.2d, v29.2s, v8.2s
add x9, x28, x2, lsr #29
umull v16.2d, v15.2s, v8.2s
umull v6.2d, v15.2s, v4.2s
umull v31.2d, v15.2s, v23.2s
umlal v14.2d, v25.2s, v28.2s
// packed limbs stored at sp+328 and sp+336
stp x23, x7, [sp, #328]
umlal v14.2d, v4.2s, v4.2s
umlal v6.2d, v10.2s, v10.2s
umlal v24.2d, v25.2s, v23.2s
umaddl x28, w13, w6, x24
umull v21.2d, v22.2s, v8.2s
umlal v31.2d, v25.2s, v1.2s
// low 23 bits of the top limb, stored at sp+344
str x12, [sp, #344]
// Loop body, continued. The NEON chain finishes and reduces another set
// of accumulators, spills results to sp+264 .. sp+303, then reloads its
// operands lane by lane from the stack (ld2) and starts the final
// multiplication. The scalar chain continues its own multiply and carry
// propagation in between.
umlal v31.2d, v22.2s, v28.2s
and x11, x9, #0x1fffffff
umlal v19.2d, v13.2s, v13.2s
and x14, x8, #0x1fffffff
umlal v24.2d, v22.2s, v1.2s
and x0, x0, #0x1fffffff
umlal v21.2d, v29.2s, v13.2s
and x20, x21, #0x1fffffff
umlal v21.2d, v12.2s, v0.2s
umull x26, w5, w20
umlal v21.2d, v23.2s, v23.2s
sub x1, x16, x12
umlal v16.2d, v25.2s, v13.2s
mov w23, w7
umlal v16.2d, v22.2s, v0.2s
umaddl x7, w27, w6, x23
umlal v18.2d, v12.2s, v13.2s
lsr x4, x27, #32
umlal v18.2d, v2.2s, v0.2s
add x19, x28, x9, lsr #29
umull v5.2d, v25.2s, v8.2s
umaddl x22, w4, w6, x22
umlal v16.2d, v29.2s, v23.2s
add x7, x7, x19, lsr #29
umlal v16.2d, v1.2s, v1.2s
umaddl x17, w1, w6, x12
umull v2.2d, v15.2s, v13.2s
add x24, x22, x7, lsr #29
umlal v5.2d, v22.2s, v13.2s
and x16, x7, #0x1fffffff
umlal v5.2d, v29.2s, v0.2s
add x22, x17, x24, lsr #29
umlal v5.2d, v12.2s, v23.2s
and x15, x2, #0x1fffffff
umlal v2.2d, v25.2s, v0.2s
// x2 = low 23 bits of the top limb; the overflow above bit 22 is taken by
// the bic below and folded in with lsr #23 / #22 / #19 (times 19)
and x2, x22, #0x7fffff
umlal v2.2d, v22.2s, v23.2s
umull x7, w25, w2
umlal v24.2d, v28.2s, v28.2s
umull x12, w5, w2
usra v21.2d, v5.2d, #29
and x17, x24, #0x1fffffff
and v8.16b, v5.16b, v26.16b
umaddl x7, w10, w17, x7
xtn v5.2s, v8.2d
bic x24, x22, #0x7fffff
usra v18.2d, v21.2d, #29
umull v10.2d, v5.2s, v30.2s
add x28, x0, x24, lsr #23
and v12.16b, v21.16b, v26.16b
xtn v28.2s, v12.2d
usra v17.2d, v18.2d, #29
umaddl x7, w3, w16, x7
umull v28.2d, v28.2s, v30.2s
and x19, x19, #0x1fffffff
and v5.16b, v18.16b, v26.16b
umull x21, w18, w2
add x8, x28, x24, lsr #22
xtn v18.2s, v5.2d
usra v3.2d, v17.2d, #29
umaddl x22, w13, w19, x7
add v22.2d, v7.2d, v28.2d
umaddl x9, w25, w17, x21
and v17.16b, v17.16b, v26.16b
umull x6, w5, w14
add v4.2d, v27.2d, v10.2d
umull x23, w10, w2
umull v25.2d, v18.2s, v30.2s
umaddl x0, w10, w16, x9
usra v19.2d, v3.2d, #29
umaddl x9, w27, w11, x22
usra v22.2d, v4.2d, #29
umaddl x28, w3, w17, x23
and v23.16b, v3.16b, v26.16b
umaddl x21, w3, w19, x0
and v18.16b, v19.16b, v26.16b
umaddl x0, w4, w15, x9
umaddl x9, w13, w16, x28
xtn v15.2s, v17.2d
add v6.2d, v6.2d, v25.2d
umaddl x21, w13, w11, x21
umaddl x21, w27, w15, x21
xtn v5.2s, v23.2d
umull v21.2d, v15.2s, v30.2s
umaddl x9, w27, w19, x9
umaddl x7, w1, w14, x0
xtn v18.2s, v18.2d
usra v20.2d, v19.2d, #29
umaddl x0, w4, w14, x21
umull v28.2d, v5.2s, v30.2s
umaddl x22, w4, w11, x9
usra v6.2d, v22.2d, #29
add v19.2d, v14.2d, v28.2d
add x21, x8, x24, lsr #19
umull v18.2d, v18.2s, v30.2s
umaddl x9, w1, w20, x0
and v23.16b, v20.16b, v26.16b
umull x23, w5, w21
usra v11.2d, v20.2d, #29
xtn v12.2s, v23.2d
umaddl x26, w18, w21, x26
add v21.2d, v9.2d, v21.2d
and v8.16b, v11.16b, v26.16b
xtn v27.2s, v8.2d
umaddl x0, w1, w15, x22
usra v21.2d, v6.2d, #29
add x24, x7, x9, lsr #29
umull v23.2d, v12.2s, v30.2s
umull v3.2d, v27.2s, v30.2s
add x8, x0, x24, lsr #29
usra v19.2d, v21.2d, #29
umaddl x0, w18, w20, x6
add v5.2d, v31.2d, v18.2d
add v15.2d, v24.2d, v23.2d
ushr v7.2d, v11.2d, #29
usra v5.2d, v19.2d, #29
add v3.2d, v2.2d, v3.2d
and x24, x24, #0x1fffffff
xtn v9.2s, v7.2d
umull x22, w24, w30
usra v15.2d, v5.2d, #29
umull x7, w5, w17
umull v31.2d, v9.2s, v30.2s
umull x28, w5, w16
and v14.16b, v22.16b, v26.16b
umull x6, w5, w15
usra v3.2d, v15.2d, #29
add x22, x26, x22
and v0.16b, v6.16b, v26.16b
umull x24, w5, w19
add v25.2d, v16.2d, v31.2d
umull x26, w5, w11
and v11.16b, v3.16b, v26.16b
umaddl x5, w18, w16, x7
umlal v11.2d, v29.2s, v1.2s
umaddl x7, w18, w11, x24
and v1.16b, v21.16b, v26.16b
and x24, x9, #0x1fffffff
usra v25.2d, v3.2d, #29
umaddl x9, w25, w21, x0
and v10.16b, v4.16b, v26.16b
// v30 lanes <- 0x1fffffff >> 6 = 0x7fffff (23-bit top-limb mask)
lsr x0, x29, #6
mov v30.d[0], x0
and v6.16b, v19.16b, v26.16b
and v20.16b, v15.16b, v26.16b
usra v25.2d, v11.2d, #29
mov v30.d[1], x0
// x0 walks the five 64-bit words at sp+312 .. sp+351. Each ld2 below puts
// two consecutive 32-bit words into lane 1 of a register pair and
// advances x0 by 8.
add x0, sp, #312
and v7.16b, v5.16b, v26.16b
ld2 {v16.S, v17.S}[1], [x0], #8
// NEON top limb fold: bits above the low 23, added times 19 via the
// usra #23 / #22 / #19 below
bic v2.16b, v25.16b, v30.16b
and v21.16b, v11.16b, v26.16b
usra v10.2d, v2.2d, #23
umaddl x26, w18, w15, x26
ld2 {v23.S, v24.S}[1], [x0], #8
zip2 v8.4s, v6.4s, v7.4s
umaddl x12, w18, w17, x12
usra v10.2d, v2.2d, #22
ld2 {v27.S, v28.S}[1], [x0], #8
zip2 v5.4s, v0.4s, v1.4s
usra v10.2d, v2.2d, #19
ld2 {v11.S, v12.S}[1], [x0], #8
umaddl x12, w25, w16, x12
zip2 v3.4s, v20.4s, v21.4s
umaddl x28, w18, w19, x28
usra v14.2d, v10.2d, #29
umaddl x18, w18, w14, x6
and v13.16b, v10.16b, v26.16b
// x6 walks the five 64-bit words at sp+216 .. sp+255 in the same way
add x6, sp, #216
ld2 {v18.S, v19.S}[1], [x0], #8
zip2 v10.4s, v13.4s, v14.4s
umaddl x0, w10, w19, x12
ld2 {v13.S, v14.S}[1], [x6], #8
// spill NEON results; they are reloaded into d21/d23 and d25/d27 near
// the end of the loop
stp d10, d5, [sp, #264]
stp d8, d3, [sp, #280]
and v25.16b, v25.16b, v30.16b
// x12 walks the X1 limbs stored at sp+360 .. sp+399 before the loop;
// these go into lane 0 of the same register pairs
add x12, sp, #360
ld2 {v0.S, v1.S}[1], [x6], #8
ld2 {v16.S, v17.S}[0], [x12], #8
zip2 v4.4s, v25.4s, v26.4s
ld2 {v6.S, v7.S}[1], [x6], #8
ld2 {v23.S, v24.S}[0], [x12], #8
umull v2.2d, v16.2s, v0.2s
umull v29.2d, v16.2s, v6.2s
ld2 {v27.S, v28.S}[0], [x12], #8
ld2 {v20.S, v21.S}[1], [x6], #8
umlal v2.2d, v17.2s, v14.2s
ld2 {v11.S, v12.S}[0], [x12], #8
ld2 {v25.S, v26.S}[1], [x6], #8
umull v9.2d, v16.2s, v20.2s
umull v8.2d, v16.2s, v21.2s
umull v5.2d, v27.2s, v25.2s
umull v22.2d, v16.2s, v25.2s
umlal v9.2d, v17.2s, v7.2s
umlal v8.2d, v17.2s, v20.2s
umlal v5.2d, v28.2s, v21.2s
umlal v22.2d, v17.2s, v21.2s
umlal v22.2d, v23.2s, v20.2s
umull v31.2d, v17.2s, v25.2s
umlal v9.2d, v23.2s, v6.2s
umlal v8.2d, v23.2s, v7.2s
umlal v22.2d, v24.2s, v7.2s
ld2 {v18.S, v19.S}[0], [x12], #8
umull v19.2d, v16.2s, v7.2s
umlal v31.2d, v23.2s, v21.2s
umull v15.2d, v23.2s, v25.2s
umlal v8.2d, v24.2s, v6.2s
umlal v9.2d, v24.2s, v1.2s
umlal v31.2d, v24.2s, v20.2s
umull v10.2d, v28.2s, v25.2s
umlal v22.2d, v27.2s, v6.2s
umlal v29.2d, v17.2s, v1.2s
umull v3.2d, v24.2s, v25.2s
umlal v19.2d, v17.2s, v6.2s
umlal v10.2d, v11.2s, v21.2s
umlal v29.2d, v23.2s, v0.2s
umlal v31.2d, v27.2s, v7.2s
umlal v31.2d, v28.2s, v6.2s
umlal v31.2d, v11.2s, v1.2s
umlal v3.2d, v27.2s, v21.2s
umlal v19.2d, v23.2s, v1.2s
umlal v19.2d, v24.2s, v0.2s
umlal v31.2d, v12.2s, v0.2s
umlal v3.2d, v28.2s, v20.2s
umlal v5.2d, v11.2s, v20.2s
umlal v5.2d, v12.2s, v7.2s
umlal v31.2d, v18.2s, v14.2s
umlal v3.2d, v11.2s, v7.2s
umlal v15.2d, v24.2s, v21.2s
umlal v15.2d, v27.2s, v20.2s
umlal v15.2d, v28.2s, v7.2s
umlal v15.2d, v11.2s, v6.2s
umlal v15.2d, v12.2s, v1.2s
umaddl x12, w25, w14, x26
umlal v15.2d, v18.2s, v0.2s
umull x24, w24, w30
umlal v10.2d, v12.2s, v20.2s
umull x6, w3, w2
umlal v10.2d, v18.2s, v7.2s
umaddl x26, w25, w19, x5
umlal v29.2d, v24.2s, v14.2s
and x5, x8, #0x1fffffff
umlal v3.2d, v12.2s, v6.2s
umull x5, w5, w30
umlal v8.2d, v27.2s, v1.2s
add x24, x23, x24
usra v15.2d, v31.2d, #29
// v26 was just used as a multiplicand; restore it to the 29-bit mask
dup v26.2d, x29
// last spill of this group; reloaded into d29 near the end of the loop
str d4, [sp, #296]
// Loop body, final part. Both chains finish their last product and are
// reduced; the results are placed back in v10 .. v29 for the next pass.
// The scalar bit counter is then decremented and the next select bit is
// computed into w1.
umlal v2.2d, v23.2s, v13.2s
umlal v5.2d, v18.2s, v6.2s
umaddl x23, w13, w17, x6
umlal v19.2d, v27.2s, v14.2s
add x6, x9, x5
umlal v9.2d, v27.2s, v0.2s
umull x5, w13, w2
umlal v9.2d, v28.2s, v14.2s
umaddl x9, w27, w16, x23
umlal v29.2d, v27.2s, v13.2s
umaddl x28, w25, w11, x28
umull v27.2d, v12.2s, v25.2s
umaddl x5, w27, w17, x5
umlal v8.2d, v28.2s, v0.2s
umaddl x9, w4, w19, x9
umlal v8.2d, v11.2s, v14.2s
umaddl x28, w10, w15, x28
umlal v19.2d, v28.2s, v13.2s
umaddl x23, w4, w16, x5
umlal v27.2d, v18.2s, v21.2s
add x22, x22, x24, lsr #29
umull v7.2d, v18.2s, v25.2s
umaddl x28, w3, w14, x28
umull v25.2d, v11.2s, v25.2s
umaddl x5, w1, w19, x23
umlal v22.2d, v28.2s, v1.2s
umaddl x23, w10, w11, x26
umlal v22.2d, v11.2s, v0.2s
umaddl x9, w1, w11, x9
umlal v3.2d, v18.2s, v1.2s
umaddl x26, w3, w11, x0
umull v28.2d, v16.2s, v1.2s
umaddl x19, w10, w20, x12
umlal v25.2d, v12.2s, v21.2s
// NOTE(review): x12 is set to sp+576 and x0 is loaded from that address
// (four lines below), but both registers are overwritten before anything
// reads them. sp+576 is the first byte above this frame; the load looks
// dead.
add x12, sp, #576
umlal v22.2d, v12.2s, v14.2s
and x24, x24, #0x1fffffff
usra v3.2d, v15.2d, #29
ldr x0, [x12, #0]
umull v1.2d, v16.2s, v14.2s
add x11, x9, x8, lsr #29
umlal v28.2d, v17.2s, v0.2s
umaddl x19, w3, w21, x19
usra v5.2d, v3.2d, #29
umull x8, w4, w2
// v30 = 1216 in both 32-bit lanes again for the reduction multiplies
dup v30.2s, w30
umlal v25.2d, v18.2s, v20.2s
umaddl x18, w25, w20, x18
add x0, x5, x11, lsr #29
and v21.16b, v15.16b, v26.16b
umaddl x9, w13, w15, x26
umaddl x26, w27, w14, x9
xtn v4.2s, v21.2d
usra v10.2d, v5.2d, #29
umaddl x12, w25, w15, x7
and v20.16b, v31.16b, v26.16b
and x25, x11, #0x1fffffff
and v5.16b, v5.16b, v26.16b
umaddl x26, w4, w20, x26
umlal v1.2d, v17.2s, v13.2s
umull x9, w1, w2
usra v25.2d, v10.2d, #29
umull x2, w27, w2
umlal v8.2d, v12.2s, v13.2s
umaddl x18, w10, w21, x18
umlal v28.2d, v23.2s, v14.2s
umaddl x7, w10, w14, x12
usra v27.2d, v25.2d, #29
umaddl x2, w4, w17, x2
and v23.16b, v10.16b, v26.16b
umull x10, w25, w30
umaddl x12, w1, w17, x8
xtn v21.2s, v23.2d
umaddl x11, w1, w16, x2
xtn v6.2s, v20.2d
umull v31.2d, v21.2s, v30.2s
umaddl x16, w13, w20, x28
umull v17.2d, v16.2s, v13.2s
add x8, x18, x10
umull v14.2d, v4.2s, v30.2s
umaddl x10, w1, w21, x26
umull v23.2d, v6.2s, v30.2s
umaddl x26, w27, w21, x16
usra v7.2d, v27.2d, #29
and x5, x0, #0x1fffffff
and v15.16b, v25.16b, v26.16b
umull x5, w5, w30
add v6.2d, v1.2d, v14.2d
umaddl x17, w3, w15, x23
add v4.2d, v17.2d, v23.2d
// x16: low 32 bits = bit counter (sp+96), high 32 bits = the scalar word
// value stored at sp+100 by the previous pass (or by the setup code)
ldr x16, [sp, #96]
and v17.16b, v7.16b, v26.16b
add x23, x6, x22, lsr #29
and v21.16b, v27.16b, v26.16b
umaddl x1, w13, w14, x17
and v12.16b, v3.16b, v26.16b
add x28, x11, x0, lsr #29
xtn v16.2s, v12.2d
add x6, x8, x23, lsr #29
usra v6.2d, v4.2d, #29
and v0.16b, v6.16b, v26.16b
add v23.2d, v29.2d, v31.2d
umlal v9.2d, v11.2s, v13.2s
umaddl x27, w27, w20, x1
umull v10.2d, v16.2s, v30.2s
add x2, x12, x28, lsr #29
umaddl x7, w3, w20, x7
xtn v20.2s, v21.2d
xtn v3.2s, v15.2d
umlal v28.2d, v24.2s, v13.2s
umull v14.2d, v3.2s, v30.2s
and x25, x2, #0x1fffffff
and v4.16b, v4.16b, v26.16b
umaddl x17, w4, w21, x27
add v1.2d, v2.2d, v10.2d
add x5, x19, x5
xtn v2.2s, v5.2d
add x15, x5, x6, lsr #29
umull v5.2d, v20.2s, v30.2s
add x0, x9, x2, lsr #29
umull v12.2d, v2.2s, v30.2s
umaddl x13, w13, w21, x7
usra v1.2d, v6.2d, #29
and x9, x0, #0x1fffffff
// reload of the NEON results spilled to sp+280 / sp+288 earlier in this
// pass; v25/v27 are X3 limb registers in the layout set up before the loop
ldp d25, d27, [sp, #280]
umull x18, w9, w30
and x1, x28, #0x1fffffff
lsr x27, x0, #29
add v28.2d, v28.2d, v12.2d
umull x11, w1, w30
ushr v2.2d, v7.2d, #29
umull x1, w27, w30
and v20.16b, v1.16b, v26.16b
umull x21, w25, w30
usra v28.2d, v1.2d, #29
add x2, x13, x11
add v21.2d, v19.2d, v14.2d
add x4, x2, x15, lsr #29
add v31.2d, v9.2d, v5.2d
add x11, x26, x21
usra v23.2d, v28.2d, #29
add x21, x11, x4, lsr #29
add x9, x17, x18
xtn v24.2s, v2.2d
add x3, x9, x21, lsr #29
xtn v7.2s, v17.2d
usra v21.2d, v23.2d, #29
add x1, x10, x1
umull v16.2d, v7.2s, v30.2s
add x1, x1, x3, lsr #29
and x21, x21, #0x1fffffff
ldr d29, [sp, #296]
and v1.16b, v28.16b, v26.16b
// scalar top limb: overflow above the low 23 bits, added times 19 to the
// lowest limb by the adds with lsr #23 / #22 / #19 below
bic x11, x1, #0x7fffff
umull v11.2d, v24.2s, v30.2s
bfi x21, x3, #32, #29
umlal v22.2d, v18.2s, v13.2s
add x9, x24, x11, lsr #23
usra v31.2d, v21.2d, #29
// v30 lanes <- 0x7fffff for the NEON top-limb fold below
lsr x10, x29, #6
add v28.2d, v8.2d, v16.2d
add x9, x9, x11, lsr #22
// scalar result limbs are moved into v16, v18, v12, v14, v10 (the Z2
// registers in the layout set up before the loop), two limbs per register
mov v16.d[0], x21
mov v30.d[0], x10
add v2.2d, v22.2d, v11.2d
usra v28.2d, v31.2d, #29
add x9, x9, x11, lsr #19
zip2 v13.4s, v20.4s, v1.4s
and x1, x1, #0x7fffff
mov v30.d[1], x10
mov v18.d[0], x1
usra v2.2d, v28.2d, #29
and x11, x22, #0x1fffffff
and v5.16b, v28.16b, v26.16b
// w0 = counter - 1. The flags set here are consumed by the bpl at the end
// of the loop; nothing in between writes the flags.
subs w0, w16, #1
and v31.16b, v31.16b, v26.16b
and x25, x15, #0x1fffffff
bic v10.16b, v2.16b, v30.16b
// x28 = address of the clamped scalar copy
add x28, sp, #104
and v8.16b, v2.16b, v30.16b
// w13 = index of the 32-bit scalar word holding bit w0. When w0 is -1
// (last pass) the index is -1 and the load below reads sp+100, which is
// still inside this frame; the loop exits without using that value.
asr w13, w0, #5
usra v4.2d, v10.2d, #23
ldr w28, [x28, w13, SXTW #2]
and v6.16b, v23.16b, v26.16b
and x13, x23, #0x1fffffff
and v23.16b, v21.16b, v26.16b
bfi x25, x4, #32, #29
zip2 v19.4s, v8.4s, v11.4s
bfi x13, x6, #32, #29
usra v4.2d, v10.2d, #22
// w26 = word stored at sp+100 by the previous pass
lsr x26, x16, #32
zip2 v15.4s, v6.4s, v23.4s
add x10, x11, x9, lsr #29
zip2 v17.4s, v31.4s, v5.4s
and x9, x9, #0x1fffffff
usra v4.2d, v10.2d, #19
bfi x9, x10, #32, #30
zip1 v24.2s, v6.2s, v23.2s
mov v12.d[0], x13
mov v14.d[0], x25
mov v10.d[0], x9
usra v0.2d, v4.2d, #29
// w21 = scalar word shifted so that the current bit is in bit 0; the
// bits above it are left in place and ignored by the tst at the loop top
and w17, w0, #0x1f
and v2.16b, v4.16b, v26.16b
lsr w21, w28, w17
zip1 v26.2s, v31.2s, v5.2s
zip1 v22.2s, v20.2s, v1.2s
zip1 v28.2s, v8.2s, v11.2s
zip1 v20.2s, v2.2s, v0.2s
zip2 v11.4s, v2.4s, v0.4s
// save the new counter (sp+96) and the shifted word (sp+100)
stp w0, w21, [sp, #96]
ldp d21, d23, [sp, #264]
// bit 0 of w1 = current scalar bit xor previous scalar bit
eor w1, w21, w26
/* slothy optimized code ends */
// repeat while the decremented counter is not negative (bits 254 .. 0)
bpl .L0
// Ladder finished: write X2 and Z2 to the caller's buffer, restore the
// callee-saved registers and return.
// x0 = output pointer saved at sp+144 on entry
ldr x0, [sp, #144]
// X2
// limbs 0..8 are taken from v11.s[0], v11.s[1], v13.s[0], ..., v19.s[0]
umov w10, v11.s[0]
umov w11, v11.s[1]
umov w12, v13.s[0]
umov w13, v13.s[1]
umov w14, v15.s[0]
umov w15, v15.s[1]
umov w16, v17.s[0]
umov w17, v17.s[1]
umov w9, v19.s[0]
// Word order in memory: limb 3, 7, 4, 8, 5, 0, 6, 1, 2, then a zero word.
// NOTE(review): presumably this is the layout the caller expects - confirm
// against the code that consumes the buffer.
stp w13, w17, [x0]
stp w14, w9, [x0, #8]
stp w15, w10, [x0, #16]
stp w16, w11, [x0, #24]
stp w12, wzr, [x0, #32]
// Z2
// same extraction and word order, from v10 .. v18, written at offset 40
umov w10, v10.s[0]
umov w11, v10.s[1]
umov w12, v12.s[0]
umov w13, v12.s[1]
umov w14, v14.s[0]
umov w15, v14.s[1]
umov w16, v16.s[0]
umov w17, v16.s[1]
umov w9, v18.s[0]
stp w13, w17, [x0, #40]
stp w14, w9, [x0, #48]
stp w15, w10, [x0, #56]
stp w16, w11, [x0, #64]
stp w12, wzr, [x0, #72]
// restore callee-saved registers from the save area at sp+408
add x9, sp, #408
ldp x19, x20, [x9]
ldp x21, x22, [x9, #16]
ldp x23, x24, [x9, #32]
ldp x25, x26, [x9, #48]
ldp x27, x28, [x9, #64]
ldp x29, x30, [x9, #80]
ldp d8, d9, [x9, #96]
ldp d10, d11, [x9, #112]
ldp d12, d13, [x9, #128]
ldp d14, d15, [x9, #144]
add sp, sp, #576
ret
.section .note.GNU-stack,"",@progbits