.text
.intel_syntax noprefix
.file "encoder.c"
.globl qoi_write_32 # -- Begin function qoi_write_32
.p2align 4, 0x90
.type qoi_write_32,@function
#-----------------------------------------------------------------------
# qoi_write_32 -- store a 32-bit value as four bytes, most significant
# byte first, at consecutive positions taken from a 32-bit cursor.
#
# In:    rdi = base of the destination byte buffer
#        rsi = pointer to the signed 32-bit cursor (read, then +1, four times)
#        edx = value to store
# Out:   no meaningful return value is produced
# Clobb: rax, r8, r9, flags
#
# The cursor is re-read from memory before every byte. The byte store
# goes through rdi and could overlap the cursor, so the reload keeps the
# exact memory-visible ordering: cursor store first, then the byte store.
#-----------------------------------------------------------------------
qoi_write_32: # @qoi_write_32
# %bb.0:
	push rbp
	mov rbp, rsp
	and rsp, -8

	# byte 0: bits 31..24
	mov r9d, edx
	shr r9d, 24
	movsxd rax, dword ptr [rsi]
	lea r8d, [rax + 1]
	mov dword ptr [rsi], r8d
	mov byte ptr [rdi + rax], r9b

	# byte 1: bits 23..16
	mov r9d, edx
	shr r9d, 16
	movsxd rax, dword ptr [rsi]
	lea r8d, [rax + 1]
	mov dword ptr [rsi], r8d
	mov byte ptr [rdi + rax], r9b

	# byte 2: bits 15..8
	mov r9d, edx
	shr r9d, 8
	movsxd rax, dword ptr [rsi]
	lea r8d, [rax + 1]
	mov dword ptr [rsi], r8d
	mov byte ptr [rdi + rax], r9b

	# byte 3: bits 7..0
	movsxd rax, dword ptr [rsi]
	lea r8d, [rax + 1]
	mov dword ptr [rsi], r8d
	mov byte ptr [rdi + rax], dl

	mov rsp, rbp
	pop rbp
	ret
.Lfunc_end0:
.size qoi_write_32, .Lfunc_end0-qoi_write_32
# -- End function
.globl qoi_read_32 # -- Begin function qoi_read_32
.p2align 4, 0x90
.type qoi_read_32,@function
#-----------------------------------------------------------------------
# qoi_read_32 -- read four consecutive bytes starting at the position
# held in a 32-bit cursor and assemble them most-significant-byte first.
#
# In:    rdi = base of the source byte buffer
#        rsi = pointer to the signed 32-bit cursor (ends up advanced by 4)
# Out:   eax = b0 << 24 | b1 << 16 | b2 << 8 | b3
# Clobb: rcx, rdx, r8, flags
#
# The cursor is written back after every single step (+1, +2, +3, +4),
# always before the corresponding byte load. That ordering is kept so
# the result is the same even if the buffer overlaps the cursor.
#-----------------------------------------------------------------------
qoi_read_32: # @qoi_read_32
# %bb.0:
	push rbp
	mov rbp, rsp
	and rsp, -8

	# rdx = starting index (sign-extended cursor)
	movsxd rdx, dword ptr [rsi]

	# b0 -> bits 31..24
	lea ecx, [rdx + 1]
	mov dword ptr [rsi], ecx
	movzx eax, byte ptr [rdi + rdx]
	shl eax, 24

	# b1 -> bits 23..16
	lea ecx, [rdx + 2]
	mov dword ptr [rsi], ecx
	movzx r8d, byte ptr [rdi + rdx + 1]
	shl r8d, 16
	or eax, r8d

	# b2 -> bits 15..8
	lea ecx, [rdx + 3]
	mov dword ptr [rsi], ecx
	movzx r8d, byte ptr [rdi + rdx + 2]
	shl r8d, 8
	or eax, r8d

	# b3 -> bits 7..0
	lea ecx, [rdx + 4]
	mov dword ptr [rsi], ecx
	movzx r8d, byte ptr [rdi + rdx + 3]
	or eax, r8d

	mov rsp, rbp
	pop rbp
	ret
.Lfunc_end1:
.size qoi_read_32, .Lfunc_end1-qoi_read_32
# -- End function
.globl pixel_cpy # -- Begin function pixel_cpy
.p2align 4, 0x90
.type pixel_cpy,@function
#-----------------------------------------------------------------------
# pixel_cpy -- copy rdx bytes from [rsi] to [rdi], lowest address first.
#
# In:    rdi = destination, rsi = source, rdx = byte count
# Out:   eax = 1 on every path
# Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags
#
# Strategy:
#   * count == 0                      -> return immediately
#   * count < 32 or ranges overlap    -> byte-at-a-time copy
#   * otherwise                       -> 16-byte unaligned vector copy of
#     the largest multiple of 32 bytes, then the byte copy for the tail
#-----------------------------------------------------------------------
pixel_cpy: # @pixel_cpy
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8
# Nothing to copy when the count is zero.
test rdx, rdx
je .LBB2_20
# %bb.1:
# Fewer than 32 bytes: not worth the vector path.
cmp rdx, 32
jb .LBB2_13
# %bb.2:
# Overlap check: the vector path is taken only if the source range ends
# at or before the destination start ...
lea rax, [rsi + rdx]
cmp rax, rdi
jbe .LBB2_4
# %bb.3:
# ... or the destination range ends at or before the source start.
lea rax, [rdi + rdx]
cmp rax, rsi
jbe .LBB2_4
# Scalar path. r8 = count - 1 (used below to test "count >= 8"),
# r9 = count & 7 = number of bytes peeled off one at a time first.
.LBB2_13:
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
je .LBB2_17
.LBB2_14:
xor ecx, ecx
.p2align 4, 0x90
# Peel loop: copy r9 single bytes; rcx counts from 0 up to r9.
.LBB2_15: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + rcx]
mov byte ptr [rdi + rcx], al
add rcx, 1
cmp r9, rcx
jne .LBB2_15
# %bb.16:
# Account for the peeled bytes: rdx is now a multiple of 8.
sub rdx, rcx
add rsi, rcx
add rdi, rcx
.LBB2_17:
# r8 < 7 means the count seen by the scalar path was below 8, so the
# peel loop already copied everything.
cmp r8, 7
jb .LBB2_20
# %bb.18:
xor eax, eax
.p2align 4, 0x90
# Unrolled loop: eight byte copies per iteration until rax reaches rdx.
.LBB2_19: # =>This Inner Loop Header: Depth=1
movzx ecx, byte ptr [rsi + rax]
mov byte ptr [rdi + rax], cl
movzx ecx, byte ptr [rsi + rax + 1]
mov byte ptr [rdi + rax + 1], cl
movzx ecx, byte ptr [rsi + rax + 2]
mov byte ptr [rdi + rax + 2], cl
movzx ecx, byte ptr [rsi + rax + 3]
mov byte ptr [rdi + rax + 3], cl
movzx ecx, byte ptr [rsi + rax + 4]
mov byte ptr [rdi + rax + 4], cl
movzx ecx, byte ptr [rsi + rax + 5]
mov byte ptr [rdi + rax + 5], cl
movzx ecx, byte ptr [rsi + rax + 6]
mov byte ptr [rdi + rax + 6], cl
movzx ecx, byte ptr [rsi + rax + 7]
mov byte ptr [rdi + rax + 7], cl
add rax, 8
cmp rdx, rax
jne .LBB2_19
jmp .LBB2_20
# Vector path (count >= 32, ranges do not overlap).
#   r8  = count rounded down to a multiple of 32 (bytes copied here)
#   rcx = number of 32-byte chunks = (r8 - 32) / 32 + 1
#   r9  = chunks left over after grouping them four at a time
.LBB2_4:
mov r8, rdx
and r8, -32
lea rax, [r8 - 32]
mov rcx, rax
shr rcx, 5
add rcx, 1
mov r9d, ecx
and r9d, 3
# r8 - 32 >= 96 means there are at least four chunks for the 128-byte loop.
cmp rax, 96
jae .LBB2_6
# %bb.5:
xor eax, eax
jmp .LBB2_8
.LBB2_6:
# rcx = chunk count rounded down to a multiple of 4; rax = byte offset.
sub rcx, r9
xor eax, eax
.p2align 4, 0x90
# Main vector loop: 128 bytes (four 32-byte chunks) per iteration.
.LBB2_7: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [rsi + rax]
movups xmm1, xmmword ptr [rsi + rax + 16]
movups xmmword ptr [rdi + rax], xmm0
movups xmmword ptr [rdi + rax + 16], xmm1
movups xmm0, xmmword ptr [rsi + rax + 32]
movups xmm1, xmmword ptr [rsi + rax + 48]
movups xmmword ptr [rdi + rax + 32], xmm0
movups xmmword ptr [rdi + rax + 48], xmm1
movups xmm0, xmmword ptr [rsi + rax + 64]
movups xmm1, xmmword ptr [rsi + rax + 80]
movups xmmword ptr [rdi + rax + 64], xmm0
movups xmmword ptr [rdi + rax + 80], xmm1
movups xmm0, xmmword ptr [rsi + rax + 96]
movups xmm1, xmmword ptr [rsi + rax + 112]
movups xmmword ptr [rdi + rax + 96], xmm0
movups xmmword ptr [rdi + rax + 112], xmm1
# Advances the offset by 128 (expressed as subtracting -128).
sub rax, -128
add rcx, -4
jne .LBB2_7
.LBB2_8:
# Copy the 0..3 remaining 32-byte chunks, if any.
test r9, r9
je .LBB2_11
# %bb.9:
# Bias the offset by 16 so the loop addresses [off-16] and [off];
# negate r9 so it can count up to zero.
add rax, 16
neg r9
.p2align 4, 0x90
.LBB2_10: # =>This Inner Loop Header: Depth=1
movups xmm0, xmmword ptr [rsi + rax - 16]
movups xmm1, xmmword ptr [rsi + rax]
movups xmmword ptr [rdi + rax - 16], xmm0
movups xmmword ptr [rdi + rax], xmm1
add rax, 32
inc r9
jne .LBB2_10
.LBB2_11:
# If the count was not a multiple of 32, finish the tail on the scalar path.
cmp r8, rdx
jne .LBB2_12
# Common exit: return 1.
.LBB2_20:
mov eax, 1
mov rsp, rbp
pop rbp
ret
# Tail after the vector copy: rdx = count & 31 (non-zero here), pointers
# advanced past the vector-copied bytes, then r8/r9 are set up exactly
# as at .LBB2_13 before re-entering the scalar loops.
.LBB2_12:
and edx, 31
add rsi, r8
add rdi, r8
lea r8, [rdx - 1]
mov r9, rdx
and r9, 7
jne .LBB2_14
jmp .LBB2_17
.Lfunc_end2:
.size pixel_cpy, .Lfunc_end2-pixel_cpy
# -- End function
.globl qoi_pixel_encoder # -- Begin function qoi_pixel_encoder
.p2align 4, 0x90
.type qoi_pixel_encoder,@function
#-----------------------------------------------------------------------
# qoi_pixel_encoder -- emit the encoded byte(s) for one 4-byte pixel
# into an output buffer. Returns 1 in eax on every path.
#
# Arguments as the code reads them (register/stack layout consistent
# with the System V AMD64 convention):
#   rdi            base of the output byte buffer
#   rsi            pointer to signed 32-bit output indices; every output
#                  byte is stored at rdi + sign_extend(dword read via rsi)
#   rdx            pointer to signed 32-bit run counter(s)
#   ecx, r9d       tested together as  ecx == r9d - 1
#   r8d, [rbp+16]  tested together as  r8d == [rbp+16] - 1
#                  (both true -> "last" flag; presumably x/y vs. width/
#                  height -- TODO confirm against the C source)
#   [rbp+24]       pointer to the 4-byte reference pixel  (kept in r10;
#                  presumably the previous pixel)
#   [rbp+32]       pointer to the 4-byte pixel to encode  (kept in r11)
#   [rbp+40]       base of a table of pointers, read at byte offset
#                  hash * 32
#
# Output tags visible in the code: 0x40 / 0x60 (run, one / two bytes),
# a bare hash byte (table hit), 0x80 (1-byte diff), 0xC0 (2-byte diff),
# 0xE0 (3-byte diff), 0xF0 | mask (changed channels stored verbatim).
#
# NOTE(review): several things below look like C operator-precedence or
# comparison slips in the source this was compiled from; they are
# documented as observed, not corrected here -- confirm against the C:
#   * "same pixel" is decided by comparing the POINTERS r11 and r10, not
#     the pixel bytes.
#   * rsi itself is advanced by 4 per emitted byte while the dwords it
#     points to are never modified (looks like `*p++` vs `(*p)++`); the
#     advanced rsi is not visible to the caller.
#   * the run counter is accessed at rdx + 4 * (r11 == r10) and is never
#     incremented in this function (looks like `*run++`).
#-----------------------------------------------------------------------
qoi_pixel_encoder: # @qoi_pixel_encoder
# %bb.0:
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
and rsp, -8
# ebx = 7th argument, r10 = reference pixel pointer, r11 = pixel pointer.
mov ebx, dword ptr [rbp + 16]
mov r10, qword ptr [rbp + 24]
mov r11, qword ptr [rbp + 32]
# rax = 1 if the two pixel pointers are equal, else 0.
xor eax, eax
cmp r11, r10
sete al
# cl = 1 iff (ecx == r9d - 1) and (r8d == [rbp+16] - 1): the "last" flag.
add r9d, -1
xor r9d, ecx
add ebx, -1
xor ebx, r8d
or ebx, r9d
sete cl
# Load the four pixel bytes: r9b = px[0], r13b = px[1], r14b = px[2],
# r8b = px[3]. They are only used for the hash below.
mov r9b, byte ptr [r11]
mov r13b, byte ptr [r11 + 1]
mov r14b, byte ptr [r11 + 2]
mov r8b, byte ptr [r11 + 3]
# ebx = run counter, read at index 0 or 1 depending on pointer equality.
mov ebx, dword ptr [rdx + 4*rax]
# Decide whether to flush the run:
#   pointers differ             -> flush
#   pointers equal and "last"   -> flush
#   pointers equal, run == 8224 -> flush
#   otherwise                   -> skip to .LBB3_7
# NOTE(review): there is no "run > 0" guard; with differing pointers the
# flush executes even when the counter is 0 -- confirm this is intended.
cmp r11, r10
jne .LBB3_3
# %bb.1:
test cl, cl
jne .LBB3_3
# %bb.2:
cmp ebx, 8224
jne .LBB3_7
.LBB3_3:
# rax = byte offset of the run counter that was read (0 or 4).
shl rax, 2
# Signed compare: counters up to 32 use the one-byte form.
cmp ebx, 32
jg .LBB3_5
# %bb.4:
# One-byte run: bl = (run - 1) | 0x40, written through index slot rsi[0];
# r12 = number of index slots consumed.
add bl, -1
or bl, 64
mov r12d, 1
mov r15, rsi
jmp .LBB3_6
.LBB3_5:
# Two-byte run: store run - 33 back to the counter, emit
# ((run - 33) >> 8) | 0x60 through rsi[0], then the low byte of the
# counter (re-read from memory after the byte store) through rsi[1].
add ebx, -33
mov dword ptr [rdx + rax], ebx
shr ebx, 8
or bl, 96
lea r15, [rsi + 4]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], bl
mov bl, byte ptr [rdx + rax]
mov r12d, 2
.LBB3_6:
# Advance rsi past the consumed index slots, emit the final run byte
# through the slot r15 points at, and reset the run counter to 0.
lea rsi, [rsi + 4*r12]
movsxd rcx, dword ptr [r15]
mov byte ptr [rdi + rcx], bl
mov dword ptr [rdx + rax], 0
.LBB3_7:
# Equal pointers: nothing else to emit for this pixel.
cmp r11, r10
je .LBB3_25
# %bb.8:
# r13b = px[0] ^ px[1] ^ px[2] ^ px[3] (hash); look up the pointer stored
# at table + hash * 32.
mov rax, qword ptr [rbp + 40]
xor r13b, r9b
xor r13b, r14b
xor r13b, r8b
movzx ecx, r13b
shl rcx, 5
mov rax, qword ptr [rax + rcx]
# Table hit (the stored pointer equals the pixel pointer): emit the hash.
cmp rax, r11
je .LBB3_26
# %bb.9:
# Miss: copy the four pixel bytes to the location the table entry points at.
mov cl, byte ptr [r11]
mov byte ptr [rax], cl
mov cl, byte ptr [r11 + 1]
mov byte ptr [rax + 1], cl
mov cl, byte ptr [r11 + 2]
mov byte ptr [rax + 2], cl
mov cl, byte ptr [r11 + 3]
mov byte ptr [rax + 3], cl
# Per-channel differences, each byte sign-extended before subtracting:
#   ecx = d0 = px[0] - ref[0]     r9d = d1 = px[1] - ref[1]
#   edx = d2 = px[2] - ref[2]     r8d = d3 = px[3] - ref[3]
# (edx overwrites the run-counter pointer, which is no longer needed.)
movsx ecx, byte ptr [r11]
movsx eax, byte ptr [r10]
sub ecx, eax
movsx r9d, byte ptr [r11 + 1]
movsx eax, byte ptr [r10 + 1]
sub r9d, eax
movsx edx, byte ptr [r11 + 2]
movsx eax, byte ptr [r10 + 2]
sub edx, eax
movsx r8d, byte ptr [r11 + 3]
movsx eax, byte ptr [r10 + 3]
sub r8d, eax
# Biased copies: r15d = d0 + 16, r14d = d2 + 16, r10d = d3 + 16 (r10 stops
# being the reference pointer here). OR-ing all four biased values and
# comparing with 32 tests "every difference lies in [-16, 15]".
lea r15d, [rcx + 16]
lea eax, [r9 + 16]
or eax, r15d
lea r14d, [rdx + 16]
lea r10d, [r8 + 16]
mov ebx, r14d
or ebx, r10d
or ebx, eax
cmp ebx, 32
jae .LBB3_16
# %bb.10:
# Try the 1-byte diff: needs d0, d1, d2 in [-2, 1] and d3 == 0.
# r11d = d2 + 2 (r11 stops being the pixel pointer on this path).
lea r11d, [rdx + 2]
cmp r11d, 3
ja .LBB3_13
# %bb.11:
lea eax, [rcx + 2]
lea ebx, [r9 + 2]
or ebx, eax
and ebx, -4
or ebx, r8d
jne .LBB3_13
# %bb.12:
# 1-byte diff: 0x80 | (d0 + 2) << 4 | (d1 + 2) << 2 | (d2 + 2).
shl ecx, 4
add ecx, 32
lea eax, [4*r9 + 8]
or eax, ecx
or eax, r11d
or al, -128
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_26:
# Table hit: the emitted byte is the hash itself.
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r13b
jmp .LBB3_25
.LBB3_16:
# Some difference is outside [-16, 15]: emit 0xF0 | mask, where mask has
# bit 3 = d0 != 0, bit 2 = d1 != 0, bit 1 = d2 != 0, bit 0 = d3 != 0,
# followed by the raw pixel byte of every channel whose bit is set.
test ecx, ecx
setne al
shl al, 3
test r9d, r9d
setne bl
shl bl, 2
or bl, al
test edx, edx
setne al
add al, al
or al, bl
test r8d, r8d
setne bl
or bl, al
or bl, -16
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], bl
test ecx, ecx
je .LBB3_17
# %bb.18:
# Channel 0 changed: emit px[0] through the next index slot; rsi now
# points two slots past where it was on entry to this path.
mov al, byte ptr [r11]
movsxd rcx, dword ptr [rsi + 4]
add rsi, 8
mov byte ptr [rdi + rcx], al
test r9d, r9d
je .LBB3_21
.LBB3_20:
# Channel 1 changed: emit px[1].
mov al, byte ptr [r11 + 1]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_21:
test edx, edx
je .LBB3_23
# %bb.22:
# Channel 2 changed: emit px[2].
mov al, byte ptr [r11 + 2]
movsxd rcx, dword ptr [rsi]
add rsi, 4
mov byte ptr [rdi + rcx], al
.LBB3_23:
test r8d, r8d
je .LBB3_25
# %bb.24:
# Channel 3 changed: emit px[3].
mov al, byte ptr [r11 + 3]
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
jmp .LBB3_25
.LBB3_13:
# Try the 2-byte diff: needs d1, d2 in [-8, 7] and d3 == 0 (d0 is already
# known to be in [-16, 15]). edx becomes d2 + 8.
lea eax, [r9 + 8]
add edx, 8
or eax, edx
and eax, -16
or eax, r8d
je .LBB3_14
# %bb.15:
# 3-byte diff, written through index slots rsi[0], rsi[1], rsi[2]:
#   byte 0 = 0xE0 | (d0 + 16) >> 1
#   byte 1 = (d0 + 16) << 7 | (d1 + 16) << 2 | (d2 + 16) >> 3
#   byte 2 = (d2 + 16) << 7 | (d3 + 16)
mov eax, r15d
shr al
or al, -32
movsxd rcx, dword ptr [rsi]
mov byte ptr [rdi + rcx], al
shl r15d, 7
lea eax, [4*r9 + 64]
or eax, r15d
mov ecx, r14d
shr ecx, 3
or ecx, eax
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], cl
# NOTE(review): d2 + 16 is a 5-bit value; byte 1 holds its bits 4..3 and
# this shift by 7 keeps only bit 0 in the stored byte, so bits 2..1 are
# not written anywhere -- confirm the shift amount against the C source
# and the matching decoder.
shl r14d, 7
or r10d, r14d
movsxd rax, dword ptr [rsi + 8]
mov byte ptr [rdi + rax], r10b
jmp .LBB3_25
.LBB3_17:
# Channel 0 unchanged: only the tag byte consumed an index slot.
add rsi, 4
test r9d, r9d
jne .LBB3_20
jmp .LBB3_21
.LBB3_14:
# 2-byte diff, written through index slots rsi[0] and rsi[1]:
#   byte 0 = 0xC0 | (d0 + 16)
#   byte 1 = (d1 + 8) << 4 | (d2 + 8)
# (sub r9d, -128 adds 128, i.e. the +8 bias already shifted left by 4.)
or r15b, -64
movsxd rax, dword ptr [rsi]
mov byte ptr [rdi + rax], r15b
shl r9d, 4
sub r9d, -128
or edx, r9d
movsxd rax, dword ptr [rsi + 4]
mov byte ptr [rdi + rax], dl
.LBB3_25:
# Common exit: return 1. rbp - 40 is the address of the saved rbx (five
# 8-byte pushes below the saved rbp), which undoes the stack realignment.
mov eax, 1
lea rsp, [rbp - 40]
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
.Lfunc_end3:
.size qoi_pixel_encoder, .Lfunc_end3-qoi_pixel_encoder
# -- End function
.ident "clang version 10.0.0-4ubuntu1 "
.section ".note.GNU-stack","",@progbits
.addrsig
Hi, I am testing c2goasm. For a simple function it works, but for a bigger one it seems to take forever. I wonder whether there is a problem in my config or a bug?
build command:
clang -S -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti $1
Generated Clang ASM
C code