; xmm0: input data
; move xmm0 to xmm1
; shift left
; move xmm0 to xmm2
; shift right
;
; use punpcklbw to convert low bytes to words
; use pslldq to shift left 2 bytes
; use psrldq to shift right 2 bytes
; use pmullw, paddw
; use cvtdq2ps to convert dwords to float
; use punpcklwd to convert low words to dwords
; use punpckhwd to convert high words to dwords
;rdi, rsi, rdx, rcx, r8, r9
;xmm0-7
;caller cleans up stack
;stack aligned on 16 bytes so that local data for functions can be properly
;aligned for SSE, ...
;callee must preserve rbx, rbp, r12-r15
; multipush op1, op2, ...
; Emits one `push` per operand, in the order the operands are written.
; Pair with multipop (same operand list) to restore them.
%macro multipush 1-*
  %rep %0
        push    %1
    %rotate 1                           ; next operand becomes %1
  %endrep
%endmacro
; multipop op1, op2, ...
; Emits one `pop` per operand, walking the operand list from last to
; first, so it undoes a multipush that was given the same list.
%macro multipop 1-*
  %rep %0
    %rotate -1                          ; previous operand becomes %1
        pop     %1
  %endrep
%endmacro
; sobel ( input, output, rows, cols );
; unsigned char input[rows][cols]
; float output[rows][cols]
; boundary of the output array will be unfilled
;
; Assemble-time dimensions of the test image that main hands to sobel.
section .data
rows    equ     20000                   ; image height passed to sobel
cols    equ     20000                   ; image width passed to sobel
dsize   equ     rows*cols               ; number of pixels

; Uninitialized image buffers.
section .bss
data:   resb    dsize                   ; input: one byte per pixel
result: resd    dsize                   ; output: one dword (float) per pixel

section .text
        global  sobel
        global  main
;-----------------------------------------------------------------------
; void sobel(const unsigned char *input, float *output, long rows, long cols)
; ABI:   System V AMD64
; In:    rdi = input   rows*cols bytes, row-major, one byte per pixel
;        rsi = output  rows*cols floats, row-major
;        rdx = rows
;        rcx = cols
; Out:   output[r][c] = sqrt(Gx*Gx + Gy*Gy) for 1 <= r <= rows-2 and
;        1 <= c <= cols-2, where for the 3x3 neighbourhood of the pixel
;            Gx = (right column) - (left column),  row weights 1,2,1
;            Gy = (bottom row)   - (top row),      column weights 1,2,1
;        The one-pixel border of output is never written.
;        Returns without touching output when rows < 3 or cols < 3.
; Clobb: rax, rdx, rsi, rdi, r8-r11, xmm0-xmm5, xmm9-xmm12, xmm15, flags
;        (rbx, rbp, r12-r15 are used but saved and restored)
;
; Register roles inside the loops:
;   r8 / r9 / r10  first byte of input row r-1 / r / r+1
;   rsi            first float of output row r
;   rdx            cols (input row stride in bytes)
;   r11            cols*4 (output row stride in bytes)
;   rax            interior rows still to process
;   rbx            current column
;   r12            cols-1  (one past the last interior column)
;   r13            cols-15 (largest column at which a 16-byte load that
;                  starts at column rbx-1 still ends inside the row)
;   xmm15          all zeroes, used to widen bytes to words
;   xmm11 / xmm12  Gx words for pixels 0-7 / 8-15 of the current group
;   xmm9  / xmm10  Gy words for pixels 0-7 / 8-15 of the current group
;-----------------------------------------------------------------------
sobel:
        multipush rbx, rbp, r12, r13, r14, r15
        cmp     rdx, 3
        jl      .noworktodo             ; fewer than 3 rows: no interior
        cmp     rcx, 3
        jl      .noworktodo             ; fewer than 3 columns: no interior

        lea     rax, [rdx-2]            ; interior row count
        mov     rdx, rcx                ; rdx = cols from here on
        lea     r11, [rcx*4]            ; bytes per output row
        lea     r12, [rcx-1]
        lea     r13, [rcx-15]
        mov     r8, rdi                 ; row 0
        lea     r9, [rdi+rdx]           ; row 1 (first row computed)
        lea     r10, [r9+rdx]           ; row 2
        add     rsi, r11                ; output row 1
        pxor    xmm15, xmm15

.more_rows:
        mov     ebx, 1                  ; first interior column

; ---- SIMD path: 14 pixels per iteration ------------------------------
; A 16-byte load starting at column rbx-1 supplies left/middle/right
; neighbours for the 14 pixels at columns rbx .. rbx+13.
.more_cols:
        cmp     rbx, r13
        jg      .tail_cols              ; load would run past the row end

        pxor    xmm9, xmm9              ; fresh Gx/Gy tallies per group
        pxor    xmm10, xmm10
        pxor    xmm11, xmm11
        pxor    xmm12, xmm12

        ; row r-1: Gx += right - left, Gy -= left + 2*middle + right
        movdqu  xmm0, [r8+rbx-1]        ; lane k = left neighbour of pixel k
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        psrldq  xmm1, 1                 ; lane k = pixel directly above
        psrldq  xmm2, 2                 ; lane k = right neighbour
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15           ; pixels 0-7 widened to words
        punpcklbw xmm4, xmm15
        punpcklbw xmm5, xmm15
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        psubw   xmm9, xmm3
        psubw   xmm9, xmm4
        psubw   xmm9, xmm4
        psubw   xmm9, xmm5
        punpckhbw xmm0, xmm15           ; pixels 8-15 (only 8-13 are valid)
        punpckhbw xmm1, xmm15
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        psubw   xmm10, xmm0
        psubw   xmm10, xmm1
        psubw   xmm10, xmm1
        psubw   xmm10, xmm2

        ; row r: Gx += 2*(right - left); this row does not enter Gy
        movdqu  xmm0, [r9+rbx-1]
        movdqa  xmm2, xmm0
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15
        punpcklbw xmm5, xmm15
        psubw   xmm11, xmm3
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        paddw   xmm11, xmm5
        punpckhbw xmm0, xmm15
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        paddw   xmm12, xmm2

        ; row r+1: Gx += right - left, Gy += left + 2*middle + right
        movdqu  xmm0, [r10+rbx-1]
        movdqa  xmm1, xmm0
        movdqa  xmm2, xmm0
        psrldq  xmm1, 1
        psrldq  xmm2, 2
        movdqa  xmm3, xmm0
        movdqa  xmm4, xmm1
        movdqa  xmm5, xmm2
        punpcklbw xmm3, xmm15
        punpcklbw xmm4, xmm15
        punpcklbw xmm5, xmm15
        psubw   xmm11, xmm3
        paddw   xmm11, xmm5
        paddw   xmm9, xmm3
        paddw   xmm9, xmm4
        paddw   xmm9, xmm4
        paddw   xmm9, xmm5
        punpckhbw xmm0, xmm15
        punpckhbw xmm1, xmm15
        punpckhbw xmm2, xmm15
        psubw   xmm12, xmm0
        paddw   xmm12, xmm2
        paddw   xmm10, xmm0
        paddw   xmm10, xmm1
        paddw   xmm10, xmm1
        paddw   xmm10, xmm2

        ; Interleave Gx and Gy words so that pmaddwd of a register with
        ; itself yields Gx*Gx + Gy*Gy as a 32-bit value per pixel. Each
        ; of |Gx|, |Gy| is at most 4*255, so the squares need more than
        ; 16 bits and must not be formed with pmullw.
        movdqa  xmm0, xmm11
        punpcklwd xmm0, xmm9            ; pixels 0-3
        punpckhwd xmm11, xmm9           ; pixels 4-7
        movdqa  xmm2, xmm12
        punpcklwd xmm2, xmm10           ; pixels 8-11
        punpckhwd xmm12, xmm10          ; pixels 12-15 (12,13 valid)
        pmaddwd xmm0, xmm0
        pmaddwd xmm11, xmm11
        pmaddwd xmm2, xmm2
        pmaddwd xmm12, xmm12
        cvtdq2ps xmm0, xmm0
        cvtdq2ps xmm1, xmm11
        cvtdq2ps xmm2, xmm2
        cvtdq2ps xmm3, xmm12
        sqrtps  xmm0, xmm0
        sqrtps  xmm1, xmm1
        sqrtps  xmm2, xmm2
        sqrtps  xmm3, xmm3
        movups  [rsi+rbx*4], xmm0
        movups  [rsi+rbx*4+16], xmm1
        movups  [rsi+rbx*4+32], xmm2
        movlps  [rsi+rbx*4+48], xmm3    ; low 2 floats only: pixels 12,13

        add     rbx, 14
        jmp     .more_cols

; ---- scalar path: remaining interior columns, one pixel at a time ----
; Used for the last (up to 13) columns of a row and for the whole row
; when cols < 16, so no load or store ever leaves the arrays.
; r14d = Gx, r15d = Gy, edi / ebp = scratch pixel values.
.tail_cols:
        cmp     rbx, r12
        jge     .row_done

        movzx   edi, byte [r8+rbx-1]    ; top-left
        movzx   ebp, byte [r8+rbx+1]    ; top-right
        mov     r14d, ebp
        sub     r14d, edi               ; Gx  = tr - tl
        lea     r15d, [rdi+rbp]         ; top = tl + tr
        movzx   edi, byte [r8+rbx]      ; top-middle
        lea     r15d, [r15+rdi*2]       ; top += 2*tm
        neg     r15d                    ; Gy  = -top

        movzx   edi, byte [r9+rbx+1]    ; middle-right
        lea     r14d, [r14+rdi*2]       ; Gx += 2*mr
        movzx   edi, byte [r9+rbx-1]    ; middle-left
        add     edi, edi
        sub     r14d, edi               ; Gx -= 2*ml

        movzx   edi, byte [r10+rbx-1]   ; bottom-left
        movzx   ebp, byte [r10+rbx+1]   ; bottom-right
        sub     r14d, edi               ; Gx -= bl
        add     r14d, ebp               ; Gx += br
        add     r15d, edi               ; Gy += bl
        add     r15d, ebp               ; Gy += br
        movzx   edi, byte [r10+rbx]     ; bottom-middle
        lea     r15d, [r15+rdi*2]       ; Gy += 2*bm

        imul    r14d, r14d
        imul    r15d, r15d
        add     r14d, r15d              ; Gx*Gx + Gy*Gy
        cvtsi2ss xmm0, r14d
        sqrtss  xmm0, xmm0
        movss   [rsi+rbx*4], xmm0

        inc     rbx
        jmp     .tail_cols

.row_done:
        add     r8, rdx                 ; slide the 3-row window down
        add     r9, rdx
        add     r10, rdx
        add     rsi, r11
        dec     rax
        jnz     .more_rows

.noworktodo:
        multipop rbx, rbp, r12, r13, r14, r15
        ret
;-----------------------------------------------------------------------
; int main(void)
; ABI:   System V AMD64
; Fills data[] with the repeating byte pattern 0,1,...,255, runs
; sobel(data, result, rows, cols) once, and returns 0.
; Clobb: rax, rcx, rdx, rsi, rdi, flags, plus whatever sobel clobbers
;-----------------------------------------------------------------------
main:
        push    rbp                     ; also brings rsp to 16-byte alignment
        mov     rbp, rsp

        mov     rcx, dsize              ; bytes left to fill
        lea     rdi, [rel data]
        mov     al, 255                 ; first inc wraps this to 0
.more:
        inc     al
        stosb                           ; *rdi++ = al (ABI guarantees DF = 0)
        dec     rcx
        jnz     .more

        lea     rdi, [rel data]         ; input
        lea     rsi, [rel result]       ; output
        mov     rdx, rows
        mov     rcx, cols
        call    sobel

        xor     eax, eax                ; exit status 0
        leave
        ret