AMD64 Multiprecision Arithmetic
Eric Bainville - Dec 2006
Multiply and accumulate
This function multiplies the number represented by vector (X,N) by a constant K, then adds or subtracts it to/from the number (Z,N).
The pattern below is extracted from P. Gaudry's code (see Introduction for a reference), and runs at the same speed: 3.13 (25/8) cycles/word. The computation must be pipelined to reach this speed: output word i is computed after input word i+1 has been multiplied. This compensates for the large latency of the mul instruction (4/5 cycles for RAX/RDX).
; Multiply and accumulate: (Z,N) <- (Z,N) OP K * (X,N), 8 words per pass.
;
; Placeholders (bound by the surrounding code, not shown here):
;   X, Z        pointers to the source / destination vectors of 64-bit words
;   N           word count; converted below to a count of 8-word blocks
;   K           multiplier, operand of mul
;   AUX0, AUX1  scratch registers, alternating as running low/high word
;   OP          add or sub
; Every mul overwrites RAX and RDX, so none of the placeholders may be
; bound to either of them.
;
; Only the N/8 full blocks are processed; the N mod 8 trailing words are
; not touched here (presumably handled by the caller - verify).
;
; Scheduling: the OP that stores output word i is issued after the mul of
; input word i+1, to hide the latency of mul. Do not reorder the loop body.
; The "mov reg, 0" instructions must stay mov (not xor): mov leaves CF
; intact, and the CF produced by each OP is consumed by the next adc.
        xor     AUX0, AUX0              ; AUX0: input carry in 0..K (0 on entry)
        shr     N, 3                    ; N = number of 8-word blocks, ZF set if 0
        jz      .done                   ; no full block: without this, dec N
                                        ; would wrap and the loop would overrun
        align   16
.a:
        lea     Z, [Z + 64]             ; advance Z early (lea keeps flags);
                                        ; stores below use negative offsets
        mov     RAX, [X]
        mul     K                       ; RDX:RAX = X[0] * K
        mov     AUX1, 0
        add     AUX0, RAX               ; carry-in + low word
        adc     AUX1, RDX               ; high word + carry
        mov     RAX, [X + 8]
        mul     K                       ; word 1 multiplied before word 0 is stored
        OP      [Z - 64], AUX0          ; CF = carry/borrow out of word 0
        mov     AUX0, 0                 ; mov, not xor: CF must survive
        adc     AUX1, RAX               ; fold that CF into the next word
        adc     AUX0, RDX
        mov     RAX, [X + 16]
        mul     K
        OP      [Z - 56], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX
        mov     RAX, [X + 24]
        mul     K
        OP      [Z - 48], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX
        mov     RAX, [X + 32]
        mul     K
        OP      [Z - 40], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX
        mov     RAX, [X + 40]
        mul     K
        OP      [Z - 32], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX
        mov     RAX, [X + 48]
        mul     K
        OP      [Z - 24], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX
        mov     RAX, [X + 56]
        mul     K
        OP      [Z - 16], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX
        lea     X, [X + 64]             ; next input block (flags untouched)
        OP      [Z - 8], AUX1
        adc     AUX0, 0                 ; absorb the last CF into the carry word,
                                        ; so no flag is live across the next mul
        dec     N
        jnz     .a
.done:
; AUX0: output carry in 0..K
| AMD64 Multiprecision : Scaling | Top of Page | AMD64 Multiprecision : Left and right shifts |
