AMD64 Multiprecision Arithmetic

Eric Bainville - Dec 2006

Multiply and accumulate

This function multiplies the number represented by the vector (X,N) by a constant K, then adds the product to, or subtracts it from, the number (Z,N).

The pattern below is extracted from P. Gaudry's code (see Introduction for a reference), and runs at the same speed of 3.13 (25/8) cycles/word. The computation must be pipelined to reach this speed: output word i is computed after input word i+1 has been multiplied. This compensates for the large latency of the mul instruction (4/5 cycles for RAX/RDX).

        ; Multiply-and-accumulate kernel:  Z <- Z OP K * X,
        ; eight 64-bit words per loop iteration.
        ;
        ; X, Z, N, K, AUX0, AUX1 and OP are symbolic names defined outside
        ; this fragment (presumably register aliases, and add or sub for OP
        ; -- confirm against the enclosing file).
        ;   X          : pointer to the source words, advanced 64 bytes/iteration
        ;   Z          : pointer to the destination words, advanced 64 bytes/iteration
        ;   N          : word count on entry; turned into a count of 8-word groups
        ;   K          : 64-bit multiplier; it is the operand of mul, so it must
        ;                not live in RAX or RDX (both are overwritten here)
        ;   AUX0, AUX1 : alternate between "word to store" and "next partial sum"
        ; Clobbers RAX, RDX and the flags.
        ;
        ; Only whole groups of 8 words are processed; the N mod 8 trailing
        ; words are not touched by this loop.  A group count of zero must not
        ; enter the loop (dec N would wrap around), so it is skipped.

        xor     AUX0, AUX0              ; carry-in = 0; done first because xor rewrites ZF
        shr     N, 3                    ; N = number of 8-word groups, ZF set when there are none
        jz      .a_done                 ; nothing to process: Z untouched, AUX0 = 0
        ; AUX0: input carry in 0..K
        align   16
.a:
        ; --- word 0: start the chain from the incoming carry ---
        lea     Z, [Z + 64]             ; advance Z now; stores below address [Z - 64] .. [Z - 8]
        mov     RAX, [X]
        mul     K                       ; RDX:RAX = X[0] * K
        mov     AUX1, 0
        add     AUX0, RAX               ; AUX0 = carry-in + low word
        adc     AUX1, RDX               ; AUX1 = high word + carry of the add

        ; --- word 1 multiplied, then word 0 stored ---
        mov     RAX, [X + 8]
        mul     K                       ; product for word 1 is issued before word 0 is stored
        OP      [Z - 64], AUX0          ; accumulate word 0 into Z; CF = carry/borrow out
        mov     AUX0, 0                 ; mov, not xor: CF from OP must reach the adc below
        adc     AUX1, RAX               ; AUX1 = pending high word + low word + CF
        adc     AUX0, RDX               ; AUX0 = high word + carry

        ; --- word 2 multiplied, then word 1 stored (AUX0/AUX1 swap roles) ---
        mov     RAX, [X + 16]
        mul     K
        OP      [Z - 56], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX

        ; --- word 3 multiplied, then word 2 stored ---
        mov     RAX, [X + 24]
        mul     K
        OP      [Z - 48], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX

        ; --- word 4 multiplied, then word 3 stored ---
        mov     RAX, [X + 32]
        mul     K
        OP      [Z - 40], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX

        ; --- word 5 multiplied, then word 4 stored ---
        mov     RAX, [X + 40]
        mul     K
        OP      [Z - 32], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX

        ; --- word 6 multiplied, then word 5 stored ---
        mov     RAX, [X + 48]
        mul     K
        OP      [Z - 24], AUX1
        mov     AUX1, 0
        adc     AUX0, RAX
        adc     AUX1, RDX

        ; --- word 7 multiplied, then word 6 stored ---
        mov     RAX, [X + 56]
        mul     K
        OP      [Z - 16], AUX0
        mov     AUX0, 0
        adc     AUX1, RAX
        adc     AUX0, RDX

        ; --- store word 7 and close the group ---
        lea     X, [X + 64]             ; lea leaves the flags alone
        OP      [Z - 8], AUX1
        adc     AUX0, 0                 ; fold the last carry/borrow into the carry word

        dec     N                       ; one 8-word group done
        jnz     .a
.a_done:
        ; AUX0: output carry in 0..K