// Source: lib25519-20230630/crypto_nP/montgomery25519/amd64-maa4/mladder.S
#include "crypto_asm_hidden.h"
// linker define mladder
// linker use mask63
/* Assembly for Montgomery ladder. */
#define mask63 CRYPTO_SHARED_NAMESPACE(mask63)
// mladder: Montgomery ladder over 256 scalar bit positions (byte 31 bit 6
// first, byte 0 bit 0 last).
// In:   rdi = output buffer, receives X2 at bytes 0..31 and Z2 at bytes 32..63
//       rsi = input x-coordinate XP, four 64-bit limbs
//       rdx = scalar, read one byte at a time from offset 31 down to 0
// Frame (offsets from the realigned rsp):
//       0 caller rsp, 8..48 callee-saved registers, 56 rdi, 64 rdx,
//       72 X1, 104 X2, 136 Z2, 168 X3, 200 Z3,
//       232 bit index (byte), 234 previous bit (byte), 236 current scalar
//       byte, 240 byte index (qword), 248 T1, 280 T2, 312 T3, 344 T4
.p2align 5
ASM_HIDDEN _CRYPTO_SHARED_NAMESPACE(mladder)
.globl _CRYPTO_SHARED_NAMESPACE(mladder)
ASM_HIDDEN CRYPTO_SHARED_NAMESPACE(mladder)
.globl CRYPTO_SHARED_NAMESPACE(mladder)
_CRYPTO_SHARED_NAMESPACE(mladder):
CRYPTO_SHARED_NAMESPACE(mladder):
// carve out the frame; the caller rsp travels in r11 until it is spilled
movq %rsp,%r11
andq $-32,%rsp
subq $376,%rsp
movq %r11,0(%rsp)
// spill callee-saved registers
movq %rbx,40(%rsp)
movq %rbp,48(%rsp)
movq %r12,8(%rsp)
movq %r13,16(%rsp)
movq %r14,24(%rsp)
movq %r15,32(%rsp)
// spill the output pointer and the scalar pointer
movq %rdi,56(%rsp)
movq %rdx,64(%rsp)
// fetch the four limbs of XP once
movq 0(%rsi),%r8
movq 8(%rsi),%r9
movq 16(%rsi),%r10
movq 24(%rsi),%r11
// X1 ← XP
movq %r8,72(%rsp)
movq %r9,80(%rsp)
movq %r10,88(%rsp)
movq %r11,96(%rsp)
// X3 ← XP
movq %r8,168(%rsp)
movq %r9,176(%rsp)
movq %r10,184(%rsp)
movq %r11,192(%rsp)
// X2 ← 1, Z3 ← 1 (low limbs)
movq $1,104(%rsp)
movq $1,200(%rsp)
// X2, Z3 upper limbs ← 0
movq $0,112(%rsp)
movq $0,120(%rsp)
movq $0,128(%rsp)
movq $0,208(%rsp)
movq $0,216(%rsp)
movq $0,224(%rsp)
// Z2 ← 0
movq $0,136(%rsp)
movq $0,144(%rsp)
movq $0,152(%rsp)
movq $0,160(%rsp)
// loop state: bit index 6, previous bit 0, byte index 31
movb $6,232(%rsp)
movb $0,234(%rsp)
movq $31,240(%rsp)
// rax ← scalar base pointer, consumed at .L1
movq %rdx,%rax
// Montgomery ladder loop
// Outer loop, entered once per scalar byte. rax holds the scalar base pointer
// on entry; 240(%rsp) holds the current byte index, which starts at 31 and
// counts down.
.L1:
addq 240(%rsp),%rax
movb 0(%rax),%r14b
// keep the current scalar byte at 236(%rsp) for the per-bit steps below
movb %r14b,236(%rsp)
// Inner loop, entered once per scalar bit. All state is reloaded from the
// stack frame, so no register is live across the back edge.
.L2:
/*
* Montgomery ladder step
*
* Reduction ideas for addition and subtraction are taken from the 64-bit implementation
* "amd64-64" of the work "https://link.springer.com/article/10.1007/s13389-012-0027-1"
*
* T1 ← X2 + Z2
* T2 ← X2 - Z2
* T3 ← X3 + Z3
* T4 ← X3 - Z3
* Z3 ← T2 · T3
* X3 ← T1 · T4
*
* bit ← n[i]
* select ← bit ⊕ prevbit
* prevbit ← bit
* CSelect(T1,T3,select): if (select == 1) {T1 = T3}
* CSelect(T2,T4,select): if (select == 1) {T2 = T4}
*
* T2 ← T2^2
* T1 ← T1^2
* T3 ← X3 + Z3
* Z3 ← X3 - Z3
* Z3 ← Z3^2
* X3 ← T3^2
* T3 ← T1 - T2
* T4 ← ((A + 2)/4) · T3
* T4 ← T4 + T2
* X2 ← T1 · T2
* Z2 ← T3 · T4
* Z3 ← Z3 · X1
*
*/
// Frame slots used below: X2 at 104, Z2 at 136, T1 at 248, T2 at 280.
// X2
movq 104(%rsp),%r8
movq 112(%rsp),%r9
movq 120(%rsp),%r10
movq 128(%rsp),%r11
// copy X2
movq %r8,%rax
movq %r9,%rbx
movq %r10,%rbp
movq %r11,%rsi
// T1 ← X2 + Z2
// 256-bit add; CF after the last adcq is the carry out of the top limb
addq 136(%rsp),%r8
adcq 144(%rsp),%r9
adcq 152(%rsp),%r10
adcq 160(%rsp),%r11
// rcx ← 38 if the add carried, else 0 (the movq's leave CF untouched)
movq $0,%rdi
movq $38,%rcx
cmovae %rdi,%rcx
// fold the carry back in: 2^256 = 38 mod 2^255-19
addq %rcx,%r8
adcq %rdi,%r9
adcq %rdi,%r10
adcq %rdi,%r11
// if the fold itself carried, add 38 once more to the low limb
cmovc %rcx,%rdi
addq %rdi,%r8
movq %r8,248(%rsp)
movq %r9,256(%rsp)
movq %r10,264(%rsp)
movq %r11,272(%rsp)
// T2 ← X2 - Z2
// 256-bit subtract; CF after the last sbbq is the borrow out of the top limb
subq 136(%rsp),%rax
sbbq 144(%rsp),%rbx
sbbq 152(%rsp),%rbp
sbbq 160(%rsp),%rsi
// rcx ← 38 if the subtract borrowed, else 0
movq $0,%rdi
movq $38,%rcx
cmovae %rdi,%rcx
// compensate the borrow by subtracting 38
subq %rcx,%rax
sbbq %rdi,%rbx
sbbq %rdi,%rbp
sbbq %rdi,%rsi
// if that borrowed again, subtract 38 once more from the low limb
cmovc %rcx,%rdi
subq %rdi,%rax
movq %rax,280(%rsp)
movq %rbx,288(%rsp)
movq %rbp,296(%rsp)
movq %rsi,304(%rsp)
// Frame slots used below: X3 at 168, Z3 at 200, T3 at 312, T4 at 344.
// X3
movq 168(%rsp),%r8
movq 176(%rsp),%r9
movq 184(%rsp),%r10
movq 192(%rsp),%r11
// copy X3
movq %r8,%rax
movq %r9,%rbx
movq %r10,%rbp
movq %r11,%rsi
// T3 ← X3 + Z3
// 256-bit add; CF after the last adcq is the carry out of the top limb
addq 200(%rsp),%r8
adcq 208(%rsp),%r9
adcq 216(%rsp),%r10
adcq 224(%rsp),%r11
// rcx ← 38 if the add carried, else 0 (the movq's leave CF untouched)
movq $0,%rdi
movq $38,%rcx
cmovae %rdi,%rcx
// fold the carry back in: 2^256 = 38 mod 2^255-19
addq %rcx,%r8
adcq %rdi,%r9
adcq %rdi,%r10
adcq %rdi,%r11
// if the fold itself carried, add 38 once more to the low limb
cmovc %rcx,%rdi
addq %rdi,%r8
movq %r8,312(%rsp)
movq %r9,320(%rsp)
movq %r10,328(%rsp)
movq %r11,336(%rsp)
// T4 ← X3 - Z3
// 256-bit subtract; CF after the last sbbq is the borrow out of the top limb
subq 200(%rsp),%rax
sbbq 208(%rsp),%rbx
sbbq 216(%rsp),%rbp
sbbq 224(%rsp),%rsi
// rcx ← 38 if the subtract borrowed, else 0
movq $0,%rdi
movq $38,%rcx
cmovae %rdi,%rcx
// compensate the borrow by subtracting 38
subq %rcx,%rax
sbbq %rdi,%rbx
sbbq %rdi,%rbp
sbbq %rdi,%rsi
// if that borrowed again, subtract 38 once more from the low limb
cmovc %rcx,%rdi
subq %rdi,%rax
movq %rax,344(%rsp)
movq %rbx,352(%rsp)
movq %rbp,360(%rsp)
movq %rsi,368(%rsp)
// Z3 ← T2 · T3
// Schoolbook 4x4-limb product with a = T2 at 280..304(%rsp) and
// b = T3 at 312..336(%rsp). Each column sum is kept in a (low:carry) register
// pair: r8:r9, r10:r11, r12:r13, r14:r15. Columns of weight 2^256 and above
// are multiplied by 38 and then reused as the start of the column 256 bits
// lower (2^256 = 38 mod 2^255-19).
// column 2^256: a1·b3 + a2·b2 + a3·b1, low halves → r8:r9, high halves → r10:r11
movq 288(%rsp),%rax
mulq 336(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
movq 296(%rsp),%rax
mulq 328(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 304(%rsp),%rax
mulq 320(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: a2·b3 + a3·b2, low halves → r10:r11, high halves → r12:r13
movq 296(%rsp),%rax
mulq 336(%rsp)
addq %rax,%r10
adcq $0,%r11
movq %rdx,%r12
xorq %r13,%r13
movq 304(%rsp),%rax
mulq 328(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// column 2^384: a3·b3, low half → r12:r13
movq 304(%rsp),%rax
mulq 336(%rsp)
addq %rax,%r12
adcq $0,%r13
// high half of a3·b3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: a0·b3 + a1·b2 + a2·b1 + a3·b0, low → r14:r15, high → r8:r9
movq 280(%rsp),%rax
mulq 336(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 288(%rsp),%rax
mulq 328(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 296(%rsp),%rax
mulq 320(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 304(%rsp),%rax
mulq 312(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·b0, low → r8:r9, high → r10:r11
movq 280(%rsp),%rax
mulq 312(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: a0·b1 + a1·b0, low → r10:r11, high → r12:r13
movq 280(%rsp),%rax
mulq 320(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq 288(%rsp),%rax
mulq 312(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: a0·b2 + a1·b1 + a2·b0, low → r12:r13, high → r14:r15
movq 280(%rsp),%rax
mulq 328(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 288(%rsp),%rax
mulq 320(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 296(%rsp),%rax
mulq 312(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward (r15 shifted left, top bit of r14
// shifted in); r14 is then masked with mask63 (defined outside this file,
// presumably 2^63-1) and 19 · r15 is added at the bottom (2^255 = 19 mod p)
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// result limbs r8,r10,r12,r14 → Z3 slot
movq %r8,200(%rsp)
movq %r10,208(%rsp)
movq %r12,216(%rsp)
movq %r14,224(%rsp)
// X3 ← T1 · T4
// Schoolbook 4x4-limb product with a = T1 at 248..272(%rsp) and
// b = T4 at 344..368(%rsp). Column sums live in the pairs r8:r9, r10:r11,
// r12:r13, r14:r15; columns of weight 2^256 and above are multiplied by 38
// and reused 256 bits lower (2^256 = 38 mod 2^255-19).
// column 2^256: a1·b3 + a2·b2 + a3·b1, low halves → r8:r9, high halves → r10:r11
movq 256(%rsp),%rax
mulq 368(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
movq 264(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 272(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: a2·b3 + a3·b2, low halves → r10:r11, high halves → r12:r13
movq 264(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r10
adcq $0,%r11
movq %rdx,%r12
xorq %r13,%r13
movq 272(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// column 2^384: a3·b3, low half → r12:r13
movq 272(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r12
adcq $0,%r13
// high half of a3·b3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: a0·b3 + a1·b2 + a2·b1 + a3·b0, low → r14:r15, high → r8:r9
movq 248(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 256(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 264(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 272(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·b0, low → r8:r9, high → r10:r11
movq 248(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: a0·b1 + a1·b0, low → r10:r11, high → r12:r13
movq 248(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq 256(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: a0·b2 + a1·b1 + a2·b0, low → r12:r13, high → r14:r15
movq 248(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 256(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 264(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// result limbs r8,r10,r12,r14 → X3 slot
movq %r8,168(%rsp)
movq %r10,176(%rsp)
movq %r12,184(%rsp)
movq %r14,192(%rsp)
// bit ← (current scalar byte >> bit index) & 1
// 232(%rsp) = bit index, 236(%rsp) = current scalar byte, 234(%rsp) = prevbit
movb 232(%rsp),%cl
movb 236(%rsp),%bl
shrb %cl,%bl
andb $1,%bl
// select ← bit ⊕ prevbit (in bl); prevbit ← bit
movb %bl,%cl
xorb 234(%rsp),%bl
movb %cl,234(%rsp)
// ZF ← (select == 1). Only movq and cmove follow up to the end of the second
// CSelect, and neither writes flags, so every cmove below sees this ZF.
cmpb $1,%bl
// CSelect(T1,T3,select)
// branch-free: both operands are always loaded, cmove picks T3 when select == 1
movq 248(%rsp),%r8
movq 256(%rsp),%r9
movq 264(%rsp),%r10
movq 272(%rsp),%r11
movq 312(%rsp),%r12
movq 320(%rsp),%r13
movq 328(%rsp),%r14
movq 336(%rsp),%r15
cmove %r12,%r8
cmove %r13,%r9
cmove %r14,%r10
cmove %r15,%r11
movq %r8,248(%rsp)
movq %r9,256(%rsp)
movq %r10,264(%rsp)
movq %r11,272(%rsp)
// CSelect(T2,T4,select)
// the selected value is not written back; it stays in rbx,rbp,rcx,rsi, which
// the T2 squaring that follows reads directly
movq 280(%rsp),%rbx
movq 288(%rsp),%rbp
movq 296(%rsp),%rcx
movq 304(%rsp),%rsi
movq 344(%rsp),%r12
movq 352(%rsp),%r13
movq 360(%rsp),%r14
movq 368(%rsp),%r15
cmove %r12,%rbx
cmove %r13,%rbp
cmove %r14,%rcx
cmove %r15,%rsi
// T2 ← T2^2
// Squaring of the four limbs a0=rbx, a1=rbp, a2=rcx, a3=rsi. Each
// off-diagonal product ai·aj (i != j) is computed once and accumulated twice.
// Column sums live in the pairs r8:r9, r10:r11, r12:r13, r14:r15; columns of
// weight 2^256 and above are multiplied by 38 and reused 256 bits lower.
// column 2^384: a3·a3, low half → r12:r13
movq %rsi,%rax
mulq %rsi
movq %rax,%r12
movq $0,%r13
// high half of a3·a3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// column 2^256: 2·a1·a3 + a2·a2, low → r8:r9, high → r10:r11
movq %rbp,%rax
mulq %rsi
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq %rcx,%rax
mulq %rcx
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: 2·a2·a3, low → r10:r11, high → r12:r13
movq %rcx,%rax
mulq %rsi
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: 2·a0·a3 + 2·a1·a2, low → r14:r15, high → r8:r9
movq %rbx,%rax
mulq %rsi
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq %rbp,%rax
mulq %rcx
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·a0, low → r8:r9, high → r10:r11
movq %rbx,%rax
mulq %rbx
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: 2·a0·a1, low → r10:r11, high → r12:r13
movq %rbx,%rax
mulq %rbp
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: 2·a0·a2 + a1·a1, low → r12:r13, high → r14:r15
movq %rbx,%rax
mulq %rcx
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq %rbp,%rax
mulq %rbp
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward, times 19; r14 is masked with mask63
// (defined outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
imul $19,%r15,%r15
andq mask63(%rip),%r14
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// result limbs r8,r10,r12,r14 → T2 slot
movq %r8,280(%rsp)
movq %r10,288(%rsp)
movq %r12,296(%rsp)
movq %r14,304(%rsp)
// T1 ← T1^2
// Squaring of T1 read from 248..272(%rsp) (a0..a3). Each off-diagonal product
// ai·aj (i != j) is computed once and accumulated twice. Column sums live in
// the pairs r8:r9, r10:r11, r12:r13, r14:r15; columns of weight 2^256 and
// above are multiplied by 38 and reused 256 bits lower.
// column 2^384: a3·a3, low half → r12:r13
movq 272(%rsp),%rax
mulq 272(%rsp)
movq %rax,%r12
xorq %r13,%r13
// high half of a3·a3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// column 2^256: 2·a1·a3 + a2·a2, low → r8:r9, high → r10:r11
movq 256(%rsp),%rax
mulq 272(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 264(%rsp),%rax
mulq 264(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: 2·a2·a3, low → r10:r11, high → r12:r13
movq 264(%rsp),%rax
mulq 272(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: 2·a0·a3 + 2·a1·a2, low → r14:r15, high → r8:r9
movq 248(%rsp),%rax
mulq 272(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 256(%rsp),%rax
mulq 264(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·a0, low → r8:r9, high → r10:r11
movq 248(%rsp),%rax
mulq 248(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: 2·a0·a1, low → r10:r11, high → r12:r13
movq 248(%rsp),%rax
mulq 256(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: 2·a0·a2 + a1·a1, low → r12:r13, high → r14:r15
movq 248(%rsp),%rax
mulq 264(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 256(%rsp),%rax
mulq 256(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// result limbs r8,r10,r12,r14 → T1 slot
movq %r8,248(%rsp)
movq %r10,256(%rsp)
movq %r12,264(%rsp)
movq %r14,272(%rsp)
// T3 ← X3 + Z3
// Note: this sum is written back into the X3 slot (168..192(%rsp)), not the
// T3 slot at 312; the X3 squaring further down reads it from there.
movq 168(%rsp),%r8
movq 176(%rsp),%r9
movq 184(%rsp),%r10
movq 192(%rsp),%r11
// keep a copy of X3 in rbx,rbp,rcx,rsi for the subtraction
movq %r8,%rbx
movq %r9,%rbp
movq %r10,%rcx
movq %r11,%rsi
addq 200(%rsp),%r8
adcq 208(%rsp),%r9
adcq 216(%rsp),%r10
adcq 224(%rsp),%r11
// rdx ← 38 if the add carried, else 0 (the movq's leave CF untouched)
movq $0,%rax
movq $38,%rdx
cmovae %rax,%rdx
// fold the carry back in: 2^256 = 38 mod 2^255-19
addq %rdx,%r8
adcq %rax,%r9
adcq %rax,%r10
adcq %rax,%r11
// if the fold itself carried, add 38 once more to the low limb
cmovc %rdx,%rax
addq %rax,%r8
movq %r8,168(%rsp)
movq %r9,176(%rsp)
movq %r10,184(%rsp)
movq %r11,192(%rsp)
// Z3 ← X3 - Z3
// the difference is not stored; it stays in rbx,rbp,rcx,rsi for the squaring
// that follows
subq 200(%rsp),%rbx
sbbq 208(%rsp),%rbp
sbbq 216(%rsp),%rcx
sbbq 224(%rsp),%rsi
// rdx ← 38 if the subtract borrowed, else 0
movq $0,%rax
movq $38,%rdx
cmovae %rax,%rdx
// compensate the borrow by subtracting 38
subq %rdx,%rbx
sbbq %rax,%rbp
sbbq %rax,%rcx
sbbq %rax,%rsi
// if that borrowed again, subtract 38 once more from the low limb
cmovc %rdx,%rax
subq %rax,%rbx
// Z3 ← Z3^2
// Squaring of the four limbs a0=rbx, a1=rbp, a2=rcx, a3=rsi left by the
// preceding subtraction. Each off-diagonal product ai·aj (i != j) is computed
// once and accumulated twice. Column sums live in the pairs r8:r9, r10:r11,
// r12:r13, r14:r15; columns of weight 2^256 and above are multiplied by 38
// and reused 256 bits lower.
// column 2^384: a3·a3, low half → r12:r13
movq %rsi,%rax
mulq %rsi
movq %rax,%r12
xorq %r13,%r13
// high half of a3·a3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// column 2^256: 2·a1·a3 + a2·a2, low → r8:r9, high → r10:r11
movq %rbp,%rax
mulq %rsi
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq %rcx,%rax
mulq %rcx
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: 2·a2·a3, low → r10:r11, high → r12:r13
movq %rcx,%rax
mulq %rsi
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: 2·a0·a3 + 2·a1·a2, low → r14:r15, high → r8:r9
movq %rbx,%rax
mulq %rsi
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq %rbp,%rax
mulq %rcx
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·a0, low → r8:r9, high → r10:r11
movq %rbx,%rax
mulq %rbx
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: 2·a0·a1, low → r10:r11, high → r12:r13
movq %rbx,%rax
mulq %rbp
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: 2·a0·a2 + a1·a1, low → r12:r13, high → r14:r15
movq %rbx,%rax
mulq %rcx
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq %rbp,%rax
mulq %rbp
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward, times 19; r14 is masked with mask63
// (defined outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
imul $19,%r15,%r15
andq mask63(%rip),%r14
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// result limbs r8,r10,r12,r14 → Z3 slot
movq %r8,200(%rsp)
movq %r10,208(%rsp)
movq %r12,216(%rsp)
movq %r14,224(%rsp)
// X3 ← T3^2
// The operand is read from 168..192(%rsp) (a0..a3), the slot that the
// preceding addition wrote its sum to. Each off-diagonal product ai·aj
// (i != j) is computed once and accumulated twice. Column sums live in the
// pairs r8:r9, r10:r11, r12:r13, r14:r15; columns of weight 2^256 and above
// are multiplied by 38 and reused 256 bits lower.
// column 2^384: a3·a3, low half → r12:r13
movq 192(%rsp),%rax
mulq 192(%rsp)
movq %rax,%r12
xorq %r13,%r13
// high half of a3·a3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// column 2^256: 2·a1·a3 + a2·a2, low → r8:r9, high → r10:r11
movq 176(%rsp),%rax
mulq 192(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 184(%rsp),%rax
mulq 184(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: 2·a2·a3, low → r10:r11, high → r12:r13
movq 184(%rsp),%rax
mulq 192(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: 2·a0·a3 + 2·a1·a2, low → r14:r15, high → r8:r9
movq 168(%rsp),%rax
mulq 192(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 176(%rsp),%rax
mulq 184(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·a0, low → r8:r9, high → r10:r11
movq 168(%rsp),%rax
mulq 168(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: 2·a0·a1, low → r10:r11, high → r12:r13
movq 168(%rsp),%rax
mulq 176(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: 2·a0·a2 + a1·a1, low → r12:r13, high → r14:r15
movq 168(%rsp),%rax
mulq 184(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 176(%rsp),%rax
mulq 176(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// update X3
movq %r8,168(%rsp)
movq %r10,176(%rsp)
movq %r12,184(%rsp)
movq %r14,192(%rsp)
// T3 ← T1 - T2
// T1 is read from 248..272(%rsp), T2 from 280..304(%rsp); the difference is
// stored at 312..336(%rsp) and also stays in rbx,rbp,rcx,rsi for the scaling
// that follows.
movq 248(%rsp),%rbx
movq 256(%rsp),%rbp
movq 264(%rsp),%rcx
movq 272(%rsp),%rsi
subq 280(%rsp),%rbx
sbbq 288(%rsp),%rbp
sbbq 296(%rsp),%rcx
sbbq 304(%rsp),%rsi
// rdx ← 38 if the subtract borrowed, else 0 (the movq's leave CF untouched)
movq $0,%rax
movq $38,%rdx
cmovae %rax,%rdx
// compensate the borrow by subtracting 38
subq %rdx,%rbx
sbbq %rax,%rbp
sbbq %rax,%rcx
sbbq %rax,%rsi
// if that borrowed again, subtract 38 once more from the low limb
cmovc %rdx,%rax
subq %rax,%rbx
movq %rbx,312(%rsp)
movq %rbp,320(%rsp)
movq %rcx,328(%rsp)
movq %rsi,336(%rsp)
// T4 ← ((A + 2)/4) · T3
// 121666 times the limbs rbx,rbp,rcx,rsi, accumulated into the five words
// r8,r9,r10,r11,r12. Each movq $0 sits after the adcq of the previous limb,
// so it does not disturb a carry that is still pending.
movq $121666,%rax
mulq %rbx
movq %rax,%r8
movq %rdx,%r9
movq $0,%r10
movq $121666,%rax
mulq %rbp
addq %rax,%r9
adcq %rdx,%r10
movq $0,%r11
movq $121666,%rax
mulq %rcx
addq %rax,%r10
adcq %rdx,%r11
movq $0,%r12
movq $121666,%rax
mulq %rsi
addq %rax,%r11
adcq %rdx,%r12
// r12 ← everything from bit 255 upward; r11 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r12 is added at the bottom
shld $1,%r11,%r12
andq mask63(%rip),%r11
imul $19,%r12,%r12
addq %r12,%r8
adcq $0,%r9
adcq $0,%r10
adcq $0,%r11
// T4 ← T4 + T2
addq 280(%rsp),%r8
adcq 288(%rsp),%r9
adcq 296(%rsp),%r10
adcq 304(%rsp),%r11
// rdx ← 38 if the add carried, else 0
movq $0,%rax
movq $38,%rdx
cmovae %rax,%rdx
// fold the carry back in: 2^256 = 38 mod 2^255-19
addq %rdx,%r8
adcq %rax,%r9
adcq %rax,%r10
adcq %rax,%r11
// if the fold itself carried, add 38 once more to the low limb
cmovc %rdx,%rax
addq %rax,%r8
movq %r8,344(%rsp)
movq %r9,352(%rsp)
movq %r10,360(%rsp)
movq %r11,368(%rsp)
// X2 ← T1 · T2
// Schoolbook 4x4-limb product with a = T1 at 248..272(%rsp) and
// b = T2 at 280..304(%rsp). Column sums live in the pairs r8:r9, r10:r11,
// r12:r13, r14:r15; columns of weight 2^256 and above are multiplied by 38
// and reused 256 bits lower (2^256 = 38 mod 2^255-19).
// column 2^256: a1·b3 + a2·b2 + a3·b1, low halves → r8:r9, high halves → r10:r11
movq 256(%rsp),%rax
mulq 304(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
movq 264(%rsp),%rax
mulq 296(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 272(%rsp),%rax
mulq 288(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: a2·b3 + a3·b2, low halves → r10:r11, high halves → r12:r13
movq 264(%rsp),%rax
mulq 304(%rsp)
addq %rax,%r10
adcq $0,%r11
movq %rdx,%r12
xorq %r13,%r13
movq 272(%rsp),%rax
mulq 296(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// column 2^384: a3·b3, low half → r12:r13
movq 272(%rsp),%rax
mulq 304(%rsp)
addq %rax,%r12
adcq $0,%r13
// high half of a3·b3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: a0·b3 + a1·b2 + a2·b1 + a3·b0, low → r14:r15, high → r8:r9
movq 248(%rsp),%rax
mulq 304(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 256(%rsp),%rax
mulq 296(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 264(%rsp),%rax
mulq 288(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 272(%rsp),%rax
mulq 280(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·b0, low → r8:r9, high → r10:r11
movq 248(%rsp),%rax
mulq 280(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: a0·b1 + a1·b0, low → r10:r11, high → r12:r13
movq 248(%rsp),%rax
mulq 288(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq 256(%rsp),%rax
mulq 280(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: a0·b2 + a1·b1 + a2·b0, low → r12:r13, high → r14:r15
movq 248(%rsp),%rax
mulq 296(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 256(%rsp),%rax
mulq 288(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 264(%rsp),%rax
mulq 280(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// update X2
movq %r8,104(%rsp)
movq %r10,112(%rsp)
movq %r12,120(%rsp)
movq %r14,128(%rsp)
// Z2 ← T3 · T4
// Schoolbook 4x4-limb product with a = T3 at 312..336(%rsp) and
// b = T4 at 344..368(%rsp). Column sums live in the pairs r8:r9, r10:r11,
// r12:r13, r14:r15; columns of weight 2^256 and above are multiplied by 38
// and reused 256 bits lower (2^256 = 38 mod 2^255-19).
// column 2^256: a1·b3 + a2·b2 + a3·b1, low halves → r8:r9, high halves → r10:r11
movq 320(%rsp),%rax
mulq 368(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
movq 328(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 336(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: a2·b3 + a3·b2, low halves → r10:r11, high halves → r12:r13
movq 328(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r10
adcq $0,%r11
movq %rdx,%r12
xorq %r13,%r13
movq 336(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// column 2^384: a3·b3, low half → r12:r13
movq 336(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r12
adcq $0,%r13
// high half of a3·b3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: a0·b3 + a1·b2 + a2·b1 + a3·b0, low → r14:r15, high → r8:r9
movq 312(%rsp),%rax
mulq 368(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 320(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 328(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 336(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·b0, low → r8:r9, high → r10:r11
movq 312(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: a0·b1 + a1·b0, low → r10:r11, high → r12:r13
movq 312(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq 320(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: a0·b2 + a1·b1 + a2·b0, low → r12:r13, high → r14:r15
movq 312(%rsp),%rax
mulq 360(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 320(%rsp),%rax
mulq 352(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 328(%rsp),%rax
mulq 344(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// update Z2
movq %r8,136(%rsp)
movq %r10,144(%rsp)
movq %r12,152(%rsp)
movq %r14,160(%rsp)
// Z3 ← Z3 · X1
// Schoolbook 4x4-limb product with a = X1 at 72..96(%rsp) and
// b = Z3 at 200..224(%rsp). Column sums live in the pairs r8:r9, r10:r11,
// r12:r13, r14:r15; columns of weight 2^256 and above are multiplied by 38
// and reused 256 bits lower (2^256 = 38 mod 2^255-19).
// column 2^256: a1·b3 + a2·b2 + a3·b1, low halves → r8:r9, high halves → r10:r11
movq 80(%rsp),%rax
mulq 224(%rsp)
movq %rax,%r8
xorq %r9,%r9
movq %rdx,%r10
xorq %r11,%r11
movq 88(%rsp),%rax
mulq 216(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
movq 96(%rsp),%rax
mulq 208(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^320: a2·b3 + a3·b2, low halves → r10:r11, high halves → r12:r13
movq 88(%rsp),%rax
mulq 224(%rsp)
addq %rax,%r10
adcq $0,%r11
movq %rdx,%r12
xorq %r13,%r13
movq 96(%rsp),%rax
mulq 216(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// r10:r11 ← 38 · (r10:r11): column 2^320 becomes the start of column 2^64
movq $38,%rax
mulq %r10
imul $38,%r11,%r11
movq %rax,%r10
addq %rdx,%r11
// column 2^384: a3·b3, low half → r12:r13
movq 96(%rsp),%rax
mulq 224(%rsp)
addq %rax,%r12
adcq $0,%r13
// high half of a3·b3 (weight 2^448) times 38 starts column 2^192 in r14:r15
movq $38,%rax
mulq %rdx
movq %rax,%r14
movq %rdx,%r15
// r12:r13 ← 38 · (r12:r13): column 2^384 becomes the start of column 2^128
movq $38,%rax
mulq %r12
imul $38,%r13,%r13
movq %rax,%r12
addq %rdx,%r13
// column 2^192: a0·b3 + a1·b2 + a2·b1 + a3·b0, low → r14:r15, high → r8:r9
movq 72(%rsp),%rax
mulq 224(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 80(%rsp),%rax
mulq 216(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 88(%rsp),%rax
mulq 208(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
movq 96(%rsp),%rax
mulq 200(%rsp)
addq %rax,%r14
adcq $0,%r15
addq %rdx,%r8
adcq $0,%r9
// r8:r9 ← 38 · (r8:r9): column 2^256 becomes the start of column 2^0
movq $38,%rax
mulq %r8
imul $38,%r9,%r9
movq %rax,%r8
addq %rdx,%r9
// column 2^0: a0·b0, low → r8:r9, high → r10:r11
movq 72(%rsp),%rax
mulq 200(%rsp)
addq %rax,%r8
adcq $0,%r9
addq %rdx,%r10
adcq $0,%r11
// column 2^64: a0·b1 + a1·b0, low → r10:r11, high → r12:r13
movq 72(%rsp),%rax
mulq 208(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
movq 80(%rsp),%rax
mulq 200(%rsp)
addq %rax,%r10
adcq $0,%r11
addq %rdx,%r12
adcq $0,%r13
// column 2^128: a0·b2 + a1·b1 + a2·b0, low → r12:r13, high → r14:r15
movq 72(%rsp),%rax
mulq 216(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 80(%rsp),%rax
mulq 208(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
movq 88(%rsp),%rax
mulq 200(%rsp)
addq %rax,%r12
adcq $0,%r13
addq %rdx,%r14
adcq $0,%r15
// push each carry word into the next column: r9 → r10, r11 → r12, r13 → r14
addq %r9,%r10
adcq $0,%r11
addq %r11,%r12
adcq $0,%r13
addq %r13,%r14
adcq $0,%r15
// r15 ← everything from bit 255 upward; r14 is masked with mask63 (defined
// outside this file, presumably 2^63-1); 19 · r15 is added at the bottom
shld $1,%r14,%r15
andq mask63(%rip),%r14
imul $19,%r15,%r15
addq %r15,%r8
adcq $0,%r10
adcq $0,%r12
adcq $0,%r14
// update Z3
movq %r8,200(%rsp)
movq %r10,208(%rsp)
movq %r12,216(%rsp)
movq %r14,224(%rsp)
// Inner loop control: decrement the bit index at 232(%rsp) and run another
// ladder step while it is still >= 0 (signed compare, so it stops at -1).
movb 232(%rsp),%cl
subb $1,%cl
movb %cl,232(%rsp)
cmpb $0,%cl
jge .L2
// Outer loop control: restart at bit 7, reload the scalar base pointer into
// rax (.L1 adds the byte index to it), decrement the byte index at 240(%rsp)
// and continue while it is still >= 0 (signed compare).
movb $7,232(%rsp)
movq 64(%rsp),%rax
movq 240(%rsp),%r15
subq $1,%r15
movq %r15,240(%rsp)
cmpq $0,%r15
jge .L1
// Ladder finished: copy X2 and Z2 out to the caller buffer, then unwind.
// rdi ← output pointer spilled at function entry
movq 56(%rsp),%rdi
// gather the final X2 (104..128) and Z2 (136..160) limbs
movq 104(%rsp),%r8
movq 112(%rsp),%r9
movq 120(%rsp),%r10
movq 128(%rsp),%r11
movq 136(%rsp),%rax
movq 144(%rsp),%rcx
movq 152(%rsp),%rdx
movq 160(%rsp),%rsi
// store final value of X2 at output bytes 0..31
movq %r8,0(%rdi)
movq %r9,8(%rdi)
movq %r10,16(%rdi)
movq %r11,24(%rdi)
// store final value of Z2 at output bytes 32..63
movq %rax,32(%rdi)
movq %rcx,40(%rdi)
movq %rdx,48(%rdi)
movq %rsi,56(%rdi)
// reload callee-saved registers
movq 40(%rsp),%rbx
movq 48(%rsp),%rbp
movq 8(%rsp),%r12
movq 16(%rsp),%r13
movq 24(%rsp),%r14
movq 32(%rsp),%r15
// switch back to the caller stack pointer saved at 0(%rsp)
movq 0(%rsp),%r11
movq %r11,%rsp
ret