segment .text
cname corr
global corr
;-----------------------------------------------------------------------
; corr - Pearson correlation coefficient of two arrays of doubles.
;
; C-equivalent: double corr(const double *x, const double *y, long n);
; Argument registers follow the System V AMD64 order
; (rdi, rsi, rdx, rcx, r8, r9).
;
; In:    rdi = x array (doubles, no alignment requirement)
;        rsi = y array (doubles, no alignment requirement)
;        rdx = n, number of elements in each array (signed)
; Out:   xmm0 (low double) =
;          (n*sum_xy - sum_x*sum_y) /
;          sqrt((n*sum_xx - sum_x^2) * (n*sum_yy - sum_y^2))
;        The result is NaN when n <= 0 (0/0); a zero denominator is
;        not special-cased, so constant input also yields NaN or Inf.
; Clobb: rcx, r8, xmm0-xmm15, flags
; Needs: SSE3 (haddpd)
;
; Register roles:
;   rcx         loop counter (elements left in the current loop)
;   r8          byte offset into both arrays
;   xmm0/xmm8   2 parts each of sum_x
;   xmm1/xmm9   2 parts each of sum_y
;   xmm2/xmm10  2 parts each of sum_xx
;   xmm3/xmm11  2 parts each of sum_yy
;   xmm4/xmm12  2 parts each of sum_xy
;   xmm5/xmm13  2 x values - later squared
;   xmm6/xmm14  2 y values - later squared
;   xmm7/xmm15  2 xy values
;-----------------------------------------------------------------------
corr:
        ; Zero every accumulator. xorpd is used rather than subpd because
        ; x - x is NaN when the stale register content is NaN or +-Inf,
        ; which would poison all ten sums.
        xorpd   xmm0, xmm0
        movapd  xmm1, xmm0
        movapd  xmm2, xmm0
        movapd  xmm3, xmm0
        movapd  xmm4, xmm0
        movapd  xmm8, xmm0
        movapd  xmm9, xmm0
        movapd  xmm10, xmm0
        movapd  xmm11, xmm0
        movapd  xmm12, xmm0
        xor     r8d, r8d                ; byte offset = 0

        test    rdx, rdx
        jle     .reduce                 ; n <= 0: no data, result is 0/0 = NaN

        mov     rcx, rdx
        and     rcx, -4                 ; elements covered by the 4-wide loop
        jz      .tail                   ; n < 4: only the scalar tail runs

        ; Main loop: 4 elements per iteration as two independent 2-wide
        ; streams, so the two sets of accumulators do not serialize.
.more:
        movupd  xmm5, [rdi+r8]          ; x[i], x[i+1]
        movupd  xmm6, [rsi+r8]          ; y[i], y[i+1]
        movapd  xmm7, xmm5              ; copy of x
        mulpd   xmm7, xmm6              ; xy
        addpd   xmm0, xmm5              ; sum_x
        addpd   xmm1, xmm6              ; sum_y
        mulpd   xmm5, xmm5              ; xx
        mulpd   xmm6, xmm6              ; yy
        addpd   xmm2, xmm5              ; sum_xx
        addpd   xmm3, xmm6              ; sum_yy
        addpd   xmm4, xmm7              ; sum_xy
        movupd  xmm13, [rdi+r8+16]      ; x[i+2], x[i+3]
        movupd  xmm14, [rsi+r8+16]      ; y[i+2], y[i+3]
        movapd  xmm15, xmm13            ; copy of x
        mulpd   xmm15, xmm14            ; xy
        addpd   xmm8, xmm13             ; sum_x
        addpd   xmm9, xmm14             ; sum_y
        mulpd   xmm13, xmm13            ; xx
        mulpd   xmm14, xmm14            ; yy
        addpd   xmm10, xmm13            ; sum_xx
        addpd   xmm11, xmm14            ; sum_yy
        addpd   xmm12, xmm15            ; sum_xy
        add     r8, 32                  ; 4 doubles consumed
        sub     rcx, 4
        jnz     .more

        ; Scalar tail: the n % 4 elements the unrolled loop did not cover.
        ; They are folded into the low lane of the first accumulator set;
        ; addsd leaves the high lane untouched.
.tail:
        mov     ecx, edx
        and     ecx, 3                  ; rcx = n % 4 (n > 0 here)
        jz      .reduce
.tail_more:
        movsd   xmm5, [rdi+r8]          ; x[i]
        movsd   xmm6, [rsi+r8]          ; y[i]
        movapd  xmm7, xmm5              ; copy of x
        mulsd   xmm7, xmm6              ; xy
        addsd   xmm0, xmm5              ; sum_x
        addsd   xmm1, xmm6              ; sum_y
        mulsd   xmm5, xmm5              ; xx
        mulsd   xmm6, xmm6              ; yy
        addsd   xmm2, xmm5              ; sum_xx
        addsd   xmm3, xmm6              ; sum_yy
        addsd   xmm4, xmm7              ; sum_xy
        add     r8, 8                   ; 1 double consumed
        sub     rcx, 1
        jnz     .tail_more

        ; Merge the two accumulator sets, then the two lanes of each sum.
.reduce:
        addpd   xmm0, xmm8
        addpd   xmm1, xmm9
        addpd   xmm2, xmm10
        addpd   xmm3, xmm11
        addpd   xmm4, xmm12
        haddpd  xmm0, xmm0              ; sum_x
        haddpd  xmm1, xmm1              ; sum_y
        haddpd  xmm2, xmm2              ; sum_xx
        haddpd  xmm3, xmm3              ; sum_yy
        haddpd  xmm4, xmm4              ; sum_xy

        ; Scalar finish on the low lanes.
        movsd   xmm6, xmm0              ; sum_x
        movsd   xmm7, xmm1              ; sum_y
        cvtsi2sd xmm8, rdx              ; n as double
        mulsd   xmm6, xmm6              ; sum_x*sum_x
        mulsd   xmm7, xmm7              ; sum_y*sum_y
        mulsd   xmm2, xmm8              ; n*sum_xx
        mulsd   xmm3, xmm8              ; n*sum_yy
        subsd   xmm2, xmm6              ; n*sum_xx-sum_x*sum_x
        subsd   xmm3, xmm7              ; n*sum_yy-sum_y*sum_y
        mulsd   xmm2, xmm3              ; denom*denom
        sqrtsd  xmm2, xmm2              ; denom
        mulsd   xmm4, xmm8              ; n*sum_xy
        mulsd   xmm0, xmm1              ; sum_x*sum_y
        subsd   xmm4, xmm0              ; n*sum_xy-sum_x*sum_y
        divsd   xmm4, xmm2              ; correlation
        movsd   xmm0, xmm4              ; return value goes in xmm0
        ret