segment .text
global corr
;
; corr - correlation coefficient of two arrays of doubles
;
;   result = (n*sum_xy - sum_x*sum_y)
;            / sqrt((n*sum_xx - sum_x*sum_x) * (n*sum_yy - sum_y*sum_y))
;
; Syntax: NASM/YASM (Intel operand order).  Relies on the `alias` and
; `fpalias` register-naming macros, which are defined outside this file.
;
; In (register assignment matches the Microsoft x64 convention):
;   rcx: x array - must be 16-byte aligned (read with movapd)
;   rdx: y array - must be 16-byte aligned (read with movapd)
;   r8 : n, element count, treated as a signed 64-bit integer
; Out:
;   xmm0 (low double): correlation.  The final division is not guarded,
;   so a zero denominator (e.g. n <= 0) produces the IEEE result of that
;   division (NaN or infinity).
; Clobbers: r9, r10, xmm0-xmm5, flags.  xmm6-xmm15 are saved and restored.
; Stack:    CORR_FRAME_BYTES bytes, used only to preserve xmm6-xmm15.
;
; NOTE(review): because rsp is adjusted, Windows no longer regards this
; as a leaf function; if exceptions must unwind through corr, unwind
; data has to be supplied by the build - confirm.
;
; Register roles:
;   r9 : byte offset into both arrays
;   r10: elements still to process
;   xmm0-xmm4  : 2 partial sums each of x, y, xx, yy, xy (first pair)
;   xmm8-xmm12 : 2 partial sums each of x, y, xx, yy, xy (second pair)
;   xmm5-xmm7  : x, y, xy values of the first pair (x, y later squared)
;   xmm13-xmm15: x, y, xy values of the second pair (x, y later squared)
;   xmm15 is reused for n (as a double) after the loops
%define CORR_ELEM_BYTES   8     ; one double
%define CORR_PAIR_BYTES   16    ; one packed pair of doubles
%define CORR_BLOCK_ELEMS  4     ; elements consumed per main-loop pass
%define CORR_BLOCK_BYTES  32    ; bytes consumed per main-loop pass
%define CORR_FRAME_BYTES  168   ; 10 xmm saves (160) + 8 to realign rsp to 16
corr:
        alias   X, rcx
        alias   Y, rdx
        alias   I, r10
        alias   N, r8
        fpalias Sum_x, 0
        fpalias Sum_y, 1
        fpalias Sum_xx, 2
        fpalias Sum_yy, 3
        fpalias Sum_xy, 4
        fpalias Xvals, 5
        fpalias Yvals, 6
        fpalias XYvals, 7
        fpalias Sum_x2, 8
        fpalias Sum_y2, 9
        fpalias Sum_xx2, 10
        fpalias Sum_yy2, 11
        fpalias Sum_xy2, 12
        fpalias Xvals2, 13
        fpalias Yvals2, 14
        fpalias XYvals2, 15

        ; xmm6-xmm15 are callee-saved in the Microsoft x64 convention and
        ; all of them are used below.  On entry rsp is 8 mod 16, so the
        ; frame size leaves rsp 16-byte aligned for the movaps stores.
        sub     rsp, CORR_FRAME_BYTES
        movaps  [rsp + 0*16], xmm6
        movaps  [rsp + 1*16], xmm7
        movaps  [rsp + 2*16], xmm8
        movaps  [rsp + 3*16], xmm9
        movaps  [rsp + 4*16], xmm10
        movaps  [rsp + 5*16], xmm11
        movaps  [rsp + 6*16], xmm12
        movaps  [rsp + 7*16], xmm13
        movaps  [rsp + 8*16], xmm14
        movaps  [rsp + 9*16], xmm15

        xor     r9d, r9d                ; byte offset = 0
        mov     qI, qN                  ; elements remaining = n

        ; Zero every accumulator with xorpd.  (x - x is NaN when the
        ; register happens to hold NaN or infinity on entry.)
        xorpd   xSum_x, xSum_x
        xorpd   xSum_y, xSum_y
        xorpd   xSum_xx, xSum_xx
        xorpd   xSum_yy, xSum_yy
        xorpd   xSum_xy, xSum_xy
        xorpd   xSum_x2, xSum_x2
        xorpd   xSum_y2, xSum_y2
        xorpd   xSum_xx2, xSum_xx2
        xorpd   xSum_yy2, xSum_yy2
        xorpd   xSum_xy2, xSum_xy2

        cmp     qI, CORR_BLOCK_ELEMS
        jl      .tail                   ; fewer than 4 elements (or n <= 0)

        ; Main loop: 4 elements per pass, two independent accumulator sets.
.more:
        movapd  xXvals, [qX + r9]       ; x[i], x[i+1]
        movapd  xYvals, [qY + r9]       ; y[i], y[i+1]
        movapd  xXYvals, xXvals
        mulpd   xXYvals, xYvals         ; xy
        addpd   xSum_x, xXvals          ; sum_x
        addpd   xSum_y, xYvals          ; sum_y
        mulpd   xXvals, xXvals          ; xx
        mulpd   xYvals, xYvals          ; yy
        addpd   xSum_xx, xXvals         ; sum_xx
        addpd   xSum_yy, xYvals         ; sum_yy
        addpd   xSum_xy, xXYvals        ; sum_xy

        movapd  xXvals2, [qX + r9 + CORR_PAIR_BYTES]    ; x[i+2], x[i+3]
        movapd  xYvals2, [qY + r9 + CORR_PAIR_BYTES]    ; y[i+2], y[i+3]
        movapd  xXYvals2, xXvals2
        mulpd   xXYvals2, xYvals2       ; xy
        addpd   xSum_x2, xXvals2        ; sum_x
        addpd   xSum_y2, xYvals2        ; sum_y
        mulpd   xXvals2, xXvals2        ; xx
        mulpd   xYvals2, xYvals2        ; yy
        addpd   xSum_xx2, xXvals2       ; sum_xx
        addpd   xSum_yy2, xYvals2       ; sum_yy
        addpd   xSum_xy2, xXYvals2      ; sum_xy

        add     r9, CORR_BLOCK_BYTES
        sub     qI, CORR_BLOCK_ELEMS
        cmp     qI, CORR_BLOCK_ELEMS
        jge     .more                   ; signed: stop once fewer than 4 remain

        ; Tail: the last 0-3 elements, one at a time, accumulated into the
        ; low lane of the first accumulator set (scalar ops leave the
        ; high lane untouched).
.tail:
        test    qI, qI
        jle     .reduce                 ; nothing left (or n was <= 0)
.tail_more:
        movsd   xXvals, [qX + r9]       ; x[i]
        movsd   xYvals, [qY + r9]       ; y[i]
        movapd  xXYvals, xXvals
        mulsd   xXYvals, xYvals         ; xy
        addsd   xSum_x, xXvals          ; sum_x
        addsd   xSum_y, xYvals          ; sum_y
        mulsd   xXvals, xXvals          ; xx
        mulsd   xYvals, xYvals          ; yy
        addsd   xSum_xx, xXvals         ; sum_xx
        addsd   xSum_yy, xYvals         ; sum_yy
        addsd   xSum_xy, xXYvals        ; sum_xy
        add     r9, CORR_ELEM_BYTES
        sub     qI, 1
        jnz     .tail_more

        ; Fold the two accumulator sets, then the two lanes of each sum.
.reduce:
        addpd   xSum_x, xSum_x2
        addpd   xSum_y, xSum_y2
        addpd   xSum_xx, xSum_xx2
        addpd   xSum_yy, xSum_yy2
        addpd   xSum_xy, xSum_xy2
        haddpd  xSum_x, xSum_x          ; sum_x
        haddpd  xSum_y, xSum_y          ; sum_y
        haddpd  xSum_xx, xSum_xx        ; sum_xx
        haddpd  xSum_yy, xSum_yy        ; sum_yy
        haddpd  xSum_xy, xSum_xy        ; sum_xy

        ; Denominator: sqrt((n*sum_xx - sum_x^2) * (n*sum_yy - sum_y^2))
        movsd   xYvals, xSum_x          ; sum_x
        movsd   xXYvals, xSum_y         ; sum_y
        fpalias N, 15
        cvtsi2sd xN, qN                 ; n as a double
        mulsd   xYvals, xYvals          ; sum_x*sum_x
        mulsd   xXYvals, xXYvals        ; sum_y*sum_y
        fpalias NSum_xx, 2
        fpalias NSum_yy, 3
        mulsd   xNSum_xx, xN            ; n*sum_xx
        mulsd   xNSum_yy, xN            ; n*sum_yy
        subsd   xNSum_xx, xYvals        ; n*sum_xx - sum_x*sum_x
        subsd   xNSum_yy, xXYvals       ; n*sum_yy - sum_y*sum_y
        fpalias Denom, 2
        mulsd   xDenom, xNSum_yy        ; denom1*denom2
        sqrtsd  xDenom, xDenom          ; denom

        ; Numerator: n*sum_xy - sum_x*sum_y
        fpalias NSum_xy, 4
        mulsd   xNSum_xy, xN            ; n*sum_xy
        mulsd   xSum_x, xSum_y          ; sum_x*sum_y
        subsd   xNSum_xy, xSum_x        ; n*sum_xy - sum_x*sum_y
        divsd   xNSum_xy, xDenom        ; correlation
        movsd   xmm0, xNSum_xy          ; return value goes in xmm0

        ; Restore the callee-saved registers and release the frame.
        movaps  xmm6,  [rsp + 0*16]
        movaps  xmm7,  [rsp + 1*16]
        movaps  xmm8,  [rsp + 2*16]
        movaps  xmm9,  [rsp + 3*16]
        movaps  xmm10, [rsp + 4*16]
        movaps  xmm11, [rsp + 5*16]
        movaps  xmm12, [rsp + 6*16]
        movaps  xmm13, [rsp + 7*16]
        movaps  xmm14, [rsp + 8*16]
        movaps  xmm15, [rsp + 9*16]
        add     rsp, CORR_FRAME_BYTES
        ret