Skip to content
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
This repository was archived by the owner on Dec 1, 2021. It is now read-only.

Took forever to generate the goasm #30

@ii64

Description

@ii64

Hi, I am testing c2goasm. It works for a simple function, but for a bigger one it seems to take forever to generate the Go assembly. Is there a problem in my configuration, or is this a bug?

image
image

build command: clang -S -O3 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti $1

Generated CLang ASM
	.text
	.intel_syntax noprefix
	.file	"encoder.c"
	.globl	qoi_write_32            # -- Begin function qoi_write_32
	.p2align	4, 0x90
	.type	qoi_write_32,@function
# qoi_write_32
#   Stores the 32-bit value in edx as four bytes, most-significant byte first,
#   into the buffer at rdi, using the int at [rsi] as the write index.
#   In:    rdi = byte buffer base, rsi = pointer to int index, edx = value
#          (register use is consistent with the System V AMD64 convention;
#          the C prototype is not part of this listing -- confirm)
#   Out:   no return value is set up; the int at [rsi] ends up 4 higher
#   Clobb: rax, rcx, r8, flags
#   The index is reloaded from [rsi] before every byte store and written back
#   incremented, so each byte goes through a full load/increment/store cycle.
qoi_write_32:                           # @qoi_write_32
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8	# realign stack to 8 bytes (build used -mstackrealign)
	mov	eax, edx
	shr	eax, 24	# al = bits 31..24 of the value
	movsxd	r8, dword ptr [rsi]	# r8 = current index, sign-extended
	lea	ecx, [r8 + 1]
	mov	dword ptr [rsi], ecx	# index += 1
	mov	byte ptr [rdi + r8], al	# buffer[old index] = top byte
	mov	eax, edx
	shr	eax, 16	# al = bits 23..16
	movsxd	r8, dword ptr [rsi]
	lea	ecx, [r8 + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + r8], al
	movsxd	rax, dword ptr [rsi]
	lea	ecx, [rax + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + rax], dh	# bits 15..8 are stored straight from dh
	movsxd	rax, dword ptr [rsi]
	lea	ecx, [rax + 1]
	mov	dword ptr [rsi], ecx
	mov	byte ptr [rdi + rax], dl	# bits 7..0
	mov	rsp, rbp
	pop	rbp
	ret
.Lfunc_end0:
	.size	qoi_write_32, .Lfunc_end0-qoi_write_32
                                        # -- End function
	.globl	qoi_read_32             # -- Begin function qoi_read_32
	.p2align	4, 0x90
	.type	qoi_read_32,@function
# qoi_read_32
#   Reads four consecutive bytes from the buffer at rdi, starting at the int
#   index stored at [rsi], and combines them most-significant byte first.
#   In:    rdi = byte buffer base, rsi = pointer to int index
#          (register use is consistent with the System V AMD64 convention;
#          the C prototype is not part of this listing -- confirm)
#   Out:   eax = b0 << 24 | b1 << 16 | b2 << 8 | b3; the int at [rsi] ends up
#          4 higher than on entry
#   Clobb: rcx, rdx, r8, r9, flags
#   The index is loaded once into rcx; the four intermediate values
#   (index + 1 .. index + 4) are each written back to [rsi] in turn.
qoi_read_32:                            # @qoi_read_32
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8	# realign stack to 8 bytes (build used -mstackrealign)
	movsxd	rcx, dword ptr [rsi]	# rcx = starting index, sign-extended
	lea	rax, [rcx + 1]
	mov	dword ptr [rsi], eax
	movzx	r8d, byte ptr [rdi + rcx]	# r8d = b0
	lea	rax, [rcx + 2]
	mov	dword ptr [rsi], eax
	movzx	r9d, byte ptr [rdi + rcx + 1]	# r9d = b1
	lea	rax, [rcx + 3]
	mov	dword ptr [rsi], eax
	movzx	eax, byte ptr [rdi + rcx + 2]	# eax = b2
	lea	edx, [rcx + 4]
	mov	dword ptr [rsi], edx	# final index = start + 4
	movzx	ecx, byte ptr [rdi + rcx + 3]	# ecx = b3
	shl	r8d, 24
	shl	r9d, 16
	or	r9d, r8d
	shl	eax, 8
	or	eax, r9d
	or	eax, ecx	# eax = assembled 32-bit result
	mov	rsp, rbp
	pop	rbp
	ret
.Lfunc_end1:
	.size	qoi_read_32, .Lfunc_end1-qoi_read_32
                                        # -- End function
	.globl	pixel_cpy               # -- Begin function pixel_cpy
	.p2align	4, 0x90
	.type	pixel_cpy,@function
# pixel_cpy
#   Forward byte copy of rdx bytes from [rsi] to [rdi].
#   In:    rdi = dst, rsi = src, rdx = byte count
#   Out:   eax = 1 on every path (note: the C listing in this document
#          declares pixel_cpy as void, so this listing appears to come from a
#          slightly different revision of the source -- confirm)
#   Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, xmm0, xmm1, flags
#   Strategy:
#     * count == 0                       -> return immediately
#     * count < 32, or the two ranges overlap
#                                        -> scalar path: (count & 7) single
#                                           bytes, then 8 bytes per iteration
#     * count >= 32 and ranges disjoint  -> 16-byte unaligned SSE moves,
#                                           128 bytes per iteration, then
#                                           32 bytes per iteration, then the
#                                           scalar path for the last
#                                           (count & 31) bytes
pixel_cpy:                              # @pixel_cpy
# %bb.0:
	push	rbp
	mov	rbp, rsp
	and	rsp, -8	# realign stack to 8 bytes (build used -mstackrealign)
	test	rdx, rdx
	je	.LBB2_20	# nothing to copy
# %bb.1:
	cmp	rdx, 32
	jb	.LBB2_13	# fewer than 32 bytes: scalar path
# %bb.2:
	lea	rax, [rsi + rdx]
	cmp	rax, rdi
	jbe	.LBB2_4	# src range ends at or before dst: disjoint, use SSE path
# %bb.3:
	lea	rax, [rdi + rdx]
	cmp	rax, rsi
	jbe	.LBB2_4	# dst range ends at or before src: disjoint, use SSE path
# Scalar path (also reached when the ranges overlap).
# r8 = count - 1 (used below to test "count >= 8"), r9 = count & 7.
.LBB2_13:
	lea	r8, [rdx - 1]
	mov	r9, rdx
	and	r9, 7
	je	.LBB2_17	# count is a multiple of 8: skip the single-byte loop
.LBB2_14:
	xor	ecx, ecx
	.p2align	4, 0x90
# Copy the first (count & 7) bytes one at a time.
.LBB2_15:                               # =>This Inner Loop Header: Depth=1
	movzx	eax, byte ptr [rsi + rcx]
	mov	byte ptr [rdi + rcx], al
	add	rcx, 1
	cmp	r9, rcx
	jne	.LBB2_15
# %bb.16:
	sub	rdx, rcx	# rdx = bytes still to copy (now a multiple of 8)
	add	rsi, rcx
	add	rdi, rcx
.LBB2_17:
	cmp	r8, 7
	jb	.LBB2_20	# original count was below 8: nothing left
# %bb.18:
	xor	eax, eax
	.p2align	4, 0x90
# Copy the remainder eight bytes per iteration until rax == rdx.
.LBB2_19:                               # =>This Inner Loop Header: Depth=1
	movzx	ecx, byte ptr [rsi + rax]
	mov	byte ptr [rdi + rax], cl
	movzx	ecx, byte ptr [rsi + rax + 1]
	mov	byte ptr [rdi + rax + 1], cl
	movzx	ecx, byte ptr [rsi + rax + 2]
	mov	byte ptr [rdi + rax + 2], cl
	movzx	ecx, byte ptr [rsi + rax + 3]
	mov	byte ptr [rdi + rax + 3], cl
	movzx	ecx, byte ptr [rsi + rax + 4]
	mov	byte ptr [rdi + rax + 4], cl
	movzx	ecx, byte ptr [rsi + rax + 5]
	mov	byte ptr [rdi + rax + 5], cl
	movzx	ecx, byte ptr [rsi + rax + 6]
	mov	byte ptr [rdi + rax + 6], cl
	movzx	ecx, byte ptr [rsi + rax + 7]
	mov	byte ptr [rdi + rax + 7], cl
	add	rax, 8
	cmp	rdx, rax
	jne	.LBB2_19
	jmp	.LBB2_20
# SSE path. r8 = count rounded down to a multiple of 32,
# rcx = number of 32-byte chunks, r9 = chunks & 3.
.LBB2_4:
	mov	r8, rdx
	and	r8, -32
	lea	rax, [r8 - 32]
	mov	rcx, rax
	shr	rcx, 5
	add	rcx, 1
	mov	r9d, ecx
	and	r9d, 3
	cmp	rax, 96
	jae	.LBB2_6	# at least four chunks: run the 128-byte loop
# %bb.5:
	xor	eax, eax
	jmp	.LBB2_8
.LBB2_6:
	sub	rcx, r9	# rcx = chunk count rounded down to a multiple of 4
	xor	eax, eax
	.p2align	4, 0x90
# Four 32-byte chunks (128 bytes) per iteration; rax is the byte offset.
.LBB2_7:                                # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rsi + rax]
	movups	xmm1, xmmword ptr [rsi + rax + 16]
	movups	xmmword ptr [rdi + rax], xmm0
	movups	xmmword ptr [rdi + rax + 16], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 32]
	movups	xmm1, xmmword ptr [rsi + rax + 48]
	movups	xmmword ptr [rdi + rax + 32], xmm0
	movups	xmmword ptr [rdi + rax + 48], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 64]
	movups	xmm1, xmmword ptr [rsi + rax + 80]
	movups	xmmword ptr [rdi + rax + 64], xmm0
	movups	xmmword ptr [rdi + rax + 80], xmm1
	movups	xmm0, xmmword ptr [rsi + rax + 96]
	movups	xmm1, xmmword ptr [rsi + rax + 112]
	movups	xmmword ptr [rdi + rax + 96], xmm0
	movups	xmmword ptr [rdi + rax + 112], xmm1
	sub	rax, -128	# rax += 128
	add	rcx, -4
	jne	.LBB2_7
.LBB2_8:
	test	r9, r9
	je	.LBB2_11	# no leftover 32-byte chunks
# %bb.9:
	add	rax, 16	# bias the offset so the loop addresses rax-16 and rax
	neg	r9	# count upward from -(chunks & 3) to zero
	.p2align	4, 0x90
# One 32-byte chunk per iteration for the (chunks & 3) leftover chunks.
.LBB2_10:                               # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rsi + rax - 16]
	movups	xmm1, xmmword ptr [rsi + rax]
	movups	xmmword ptr [rdi + rax - 16], xmm0
	movups	xmmword ptr [rdi + rax], xmm1
	add	rax, 32
	inc	r9
	jne	.LBB2_10
.LBB2_11:
	cmp	r8, rdx
	jne	.LBB2_12	# count was not a multiple of 32: copy the tail
# Common exit.
.LBB2_20:
	mov	eax, 1
	mov	rsp, rbp
	pop	rbp
	ret
# Tail after the SSE path: advance both pointers past the vectorised part and
# re-enter the scalar path with rdx = count & 31.
.LBB2_12:
	and	edx, 31
	add	rsi, r8
	add	rdi, r8
	lea	r8, [rdx - 1]
	mov	r9, rdx
	and	r9, 7
	jne	.LBB2_14
	jmp	.LBB2_17
.Lfunc_end2:
	.size	pixel_cpy, .Lfunc_end2-pixel_cpy
                                        # -- End function
	.globl	qoi_pixel_encoder       # -- Begin function qoi_pixel_encoder
	.p2align	4, 0x90
	.type	qoi_pixel_encoder,@function
# qoi_pixel_encoder
#   Compiled form of the C function of the same name shown in this document.
#   In (System V AMD64 order, matching the ten C parameters):
#     rdi = data, rsi = cur, rdx = run, ecx = x, r8d = y, r9d = maxX,
#     [rbp+16] = maxY, [rbp+24] = px_prev, [rbp+32] = px, [rbp+40] = index
#   Out:   eax = 1 on every path
#   Saves/restores: rbx, r12, r13, r14, r15 (pushed in the prologue)
#   Things worth knowing when reading the body:
#     * cur (rsi) and run (rdx) are stepped as POINTERS (4 bytes per step);
#       the ints they point to are read, and *run is stored to, but no
#       instruction writes an incremented value back through rsi. This is how
#       the compiler translated the C expressions `*cur++` and `*run++`.
#     * pixel_cpy(..., 4) has been inlined as four byte moves (%bb.9).
#     * The tag constants appear as immediates: 64, 96, -128 (0x80),
#       -64 (0xC0), -32 (0xE0), -16 (0xF0). They presumably correspond to the
#       QOI_RUN_8 .. QOI_COLOR macros from qoi.h, which is not shown -- confirm.
qoi_pixel_encoder:                      # @qoi_pixel_encoder
# %bb.0:
	push	rbp
	mov	rbp, rsp
	push	r15
	push	r14
	push	r13
	push	r12
	push	rbx
	and	rsp, -8	# realign stack to 8 bytes (build used -mstackrealign)
	mov	ebx, dword ptr [rbp + 16]	# ebx = maxY
	mov	r10, qword ptr [rbp + 24]	# r10 = px_prev
	mov	r11, qword ptr [rbp + 32]	# r11 = px
	xor	eax, eax
	cmp	r11, r10
	sete	al	# rax = (px == px_prev): how far the run pointer was stepped
	add	r9d, -1
	xor	r9d, ecx	# (maxX - 1) ^ x
	add	ebx, -1
	xor	ebx, r8d	# (maxY - 1) ^ y
	or	ebx, r9d
	sete	cl	# cl = last_pixel (both differences are zero)
	mov	r9b, byte ptr [r11]	# r9b  = px[0]
	mov	r13b, byte ptr [r11 + 1]	# r13b = px[1]
	mov	r14b, byte ptr [r11 + 2]	# r14b = px[2]
	mov	r8b, byte ptr [r11 + 3]	# r8b  = px[3]
	mov	ebx, dword ptr [rdx + 4*rax]	# ebx = int read through the (possibly stepped) run pointer
	cmp	r11, r10
	jne	.LBB3_3	# px != px_prev: flush the run
# %bb.1:
	test	cl, cl
	jne	.LBB3_3	# last pixel: flush the run
# %bb.2:
	cmp	ebx, 8224	# 0x2020
	jne	.LBB3_7	# none of the flush conditions hold
# Run flush. rax becomes the byte offset of the run pointer step.
.LBB3_3:
	shl	rax, 2
	cmp	ebx, 32
	jg	.LBB3_5	# run value >= 33: two-byte form
# %bb.4:
	add	bl, -1
	or	bl, 64	# one-byte form: 64 | (run - 1)
	mov	r12d, 1	# one output byte in total
	mov	r15, rsi	# its index is read from the current cur slot
	jmp	.LBB3_6
.LBB3_5:
	add	ebx, -33
	mov	dword ptr [rdx + rax], ebx	# run value -= 33, stored back
	shr	ebx, 8
	or	bl, 96	# first byte: 96 | (run >> 8)
	lea	r15, [rsi + 4]	# second byte's index comes from the next cur slot
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], bl
	mov	bl, byte ptr [rdx + rax]	# second byte: low 8 bits of the run value
	mov	r12d, 2	# two output bytes in total
.LBB3_6:
	lea	rsi, [rsi + 4*r12]	# step the cur pointer past the slots just used
	movsxd	rcx, dword ptr [r15]
	mov	byte ptr [rdi + rcx], bl
	mov	dword ptr [rdx + rax], 0	# run value = 0
.LBB3_7:
	cmp	r11, r10
	je	.LBB3_25	# px == px_prev: nothing more to emit
# %bb.8:
	mov	rax, qword ptr [rbp + 40]	# rax = index
	xor	r13b, r9b
	xor	r13b, r14b
	xor	r13b, r8b	# r13b = px[0] ^ px[1] ^ px[2] ^ px[3]
	movzx	ecx, r13b
	shl	rcx, 5	# byte offset = hash * 4 entries * 8 bytes per pointer
	mov	rax, qword ptr [rax + rcx]	# rax = index[hash * 4]
	cmp	rax, r11
	je	.LBB3_26	# stored pointer equals px: emit the index form
# %bb.9:
# Inlined pixel_cpy: copy the four bytes at px to the address held in rax.
	mov	cl, byte ptr [r11]
	mov	byte ptr [rax], cl
	mov	cl, byte ptr [r11 + 1]
	mov	byte ptr [rax + 1], cl
	mov	cl, byte ptr [r11 + 2]
	mov	byte ptr [rax + 2], cl
	mov	cl, byte ptr [r11 + 3]
	mov	byte ptr [rax + 3], cl
# Signed per-channel deltas: ecx = vr, r9d = vg, edx = vb, r8d = va.
	movsx	ecx, byte ptr [r11]
	movsx	eax, byte ptr [r10]
	sub	ecx, eax
	movsx	r9d, byte ptr [r11 + 1]
	movsx	eax, byte ptr [r10 + 1]
	sub	r9d, eax
	movsx	edx, byte ptr [r11 + 2]
	movsx	eax, byte ptr [r10 + 2]
	sub	edx, eax
	movsx	r8d, byte ptr [r11 + 3]
	movsx	eax, byte ptr [r10 + 3]
	sub	r8d, eax
# Range test for all four deltas at once: each (v + 16) must be in 0..31,
# i.e. the OR of the four biased values must be below 32 (unsigned).
	lea	r15d, [rcx + 16]	# r15d = vr + 16
	lea	eax, [r9 + 16]
	or	eax, r15d
	lea	r14d, [rdx + 16]	# r14d = vb + 16
	lea	r10d, [r8 + 16]	# r10d = va + 16 (px_prev pointer no longer needed)
	mov	ebx, r14d
	or	ebx, r10d
	or	ebx, eax
	cmp	ebx, 32
	jae	.LBB3_16	# some delta is outside -16..15: full-colour form
# %bb.10:
	lea	r11d, [rdx + 2]	# r11d = vb + 2 (px pointer no longer needed on this path)
	cmp	r11d, 3
	ja	.LBB3_13	# vb outside -2..1
# %bb.11:
	lea	eax, [rcx + 2]
	lea	ebx, [r9 + 2]
	or	ebx, eax
	and	ebx, -4	# non-zero if vr + 2 or vg + 2 is outside 0..3
	or	ebx, r8d	# ... or if va != 0
	jne	.LBB3_13
# %bb.12:
# One-byte difference form: 0x80 | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2).
	shl	ecx, 4
	add	ecx, 32
	lea	eax, [4*r9 + 8]
	or	eax, ecx
	or	eax, r11d
	or	al, -128
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	jmp	.LBB3_25
# Index form: the stored byte is the hash byte itself (no tag bits are OR-ed
# in here).
.LBB3_26:
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], r13b
	jmp	.LBB3_25
# Full-colour form: header byte 0xF0 | (vr != 0) << 3 | (vg != 0) << 2 |
# (vb != 0) << 1 | (va != 0), followed by one byte per changed channel.
.LBB3_16:
	test	ecx, ecx
	setne	al
	shl	al, 3
	test	r9d, r9d
	setne	bl
	shl	bl, 2
	or	bl, al
	test	edx, edx
	setne	al
	add	al, al
	or	al, bl
	test	r8d, r8d
	setne	bl
	or	bl, al
	or	bl, -16
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], bl
	test	ecx, ecx
	je	.LBB3_17
# %bb.18:
	mov	al, byte ptr [r11]	# px[0]
	movsxd	rcx, dword ptr [rsi + 4]	# index from the slot after the header's
	add	rsi, 8	# step past the header slot and this one
	mov	byte ptr [rdi + rcx], al
	test	r9d, r9d
	je	.LBB3_21
.LBB3_20:
	mov	al, byte ptr [r11 + 1]	# px[1]
	movsxd	rcx, dword ptr [rsi]
	add	rsi, 4
	mov	byte ptr [rdi + rcx], al
.LBB3_21:
	test	edx, edx
	je	.LBB3_23
# %bb.22:
	mov	al, byte ptr [r11 + 2]	# px[2]
	movsxd	rcx, dword ptr [rsi]
	add	rsi, 4
	mov	byte ptr [rdi + rcx], al
.LBB3_23:
	test	r8d, r8d
	je	.LBB3_25
# %bb.24:
	mov	al, byte ptr [r11 + 3]	# px[3]
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	jmp	.LBB3_25
# Choose between the two-byte and three-byte difference forms:
# two-byte form needs vg + 8 and vb + 8 in 0..15 and va == 0.
.LBB3_13:
	lea	eax, [r9 + 8]
	add	edx, 8	# edx = vb + 8
	or	eax, edx
	and	eax, -16
	or	eax, r8d
	je	.LBB3_14
# %bb.15:
# Three-byte difference form.
	mov	eax, r15d
	shr	al	# (vr + 16) >> 1
	or	al, -32	# byte 0 = 0xE0 | (vr + 16) >> 1
	movsxd	rcx, dword ptr [rsi]
	mov	byte ptr [rdi + rcx], al
	shl	r15d, 7
	lea	eax, [4*r9 + 64]	# (vg + 16) << 2
	or	eax, r15d
	mov	ecx, r14d
	shr	ecx, 3
	or	ecx, eax	# byte 1 = (vr + 16) << 7 | (vg + 16) << 2 | (vb + 16) >> 3
	movsxd	rax, dword ptr [rsi + 4]
	mov	byte ptr [rdi + rax], cl
	shl	r14d, 7
	or	r10d, r14d	# byte 2 = (vb + 16) << 7 | (va + 16)
	movsxd	rax, dword ptr [rsi + 8]
	mov	byte ptr [rdi + rax], r10b
	jmp	.LBB3_25
# Full-colour form with vr == 0: only the header slot is skipped.
.LBB3_17:
	add	rsi, 4
	test	r9d, r9d
	jne	.LBB3_20
	jmp	.LBB3_21
# Two-byte difference form.
.LBB3_14:
	or	r15b, -64	# byte 0 = 0xC0 | (vr + 16)
	movsxd	rax, dword ptr [rsi]
	mov	byte ptr [rdi + rax], r15b
	shl	r9d, 4
	sub	r9d, -128	# (vg << 4) + 128 == (vg + 8) << 4
	or	edx, r9d	# byte 1 = (vg + 8) << 4 | (vb + 8)
	movsxd	rax, dword ptr [rsi + 4]
	mov	byte ptr [rdi + rax], dl
# Common exit: return 1 and restore the callee-saved registers.
.LBB3_25:
	mov	eax, 1
	lea	rsp, [rbp - 40]	# undo the realignment; rsp now points at the saved rbx
	pop	rbx
	pop	r12
	pop	r13
	pop	r14
	pop	r15
	pop	rbp
	ret
.Lfunc_end3:
	.size	qoi_pixel_encoder, .Lfunc_end3-qoi_pixel_encoder
                                        # -- End function
	.ident	"clang version 10.0.0-4ubuntu1 "
	.section	".note.GNU-stack","",@progbits
	.addrsig
C code
#ifndef QOI_ENCODER_
#define QOI_ENCODER_

#include <stddef.h>
#include "qoi.h"

/* Copy the first sz bytes of src into dst, in ascending address order. */
void pixel_cpy(char *dst, char *src, size_t sz)
{
    for (size_t i = 0; i < sz; ++i)
    {
        dst[i] = src[i];
    }
}

/*
 * Encode one pixel into the output byte stream.
 *
 * data     output buffer; bytes are written at data[*cur]
 * cur      write position in data; advanced by one for every byte emitted
 * run      length of the current run of repeated pixels; incremented when
 *          px == px_prev and reset to 0 whenever the run is flushed
 * x, y     coordinates of this pixel
 * maxX,    image dimensions; the pixel at (maxX - 1, maxY - 1) is treated as
 * maxY     the last pixel and forces a pending run to be flushed
 * px_prev  previous pixel (4 bytes)
 * px       current pixel (4 bytes)
 * index    table of recently seen pixels, addressed as index[hash * 4]
 *
 * Always returns 1.
 *
 * The caller is responsible for remembering px as the next call's px_prev:
 * px_prev is passed by value, so this function cannot update it.
 *
 * NOTE(review): `px == px_prev` and `index[...] == px` compare addresses, not
 * the four channel values. Whether that is intended depends on how the caller
 * passes pixels -- confirm.
 * NOTE(review): the hash is used unreduced as a table position; the compiled
 * listing shows no `% 64`, so unless QOI_COLOR_HASH (defined in qoi.h)
 * bounds it, positions beyond a 64-entry table are possible -- confirm.
 */
int qoi_pixel_encoder(
    char *data, int *cur, int *run,
    const int x, const int y,
    const int maxX, const int maxY,
    const char *px_prev, char *px,
    char **index) // [64][4]
{
    qoi_rgba_t px_ = {.rgba = {
                          .r = px[0],
                          .g = px[1],
                          .b = px[2],
                          .a = px[3],
                      }};

    if (px == px_prev)
    {
        /* Parenthesised on purpose: `*run++` would advance the pointer and
         * leave the run length untouched. */
        (*run)++;
    }

    int last_pixel = x == maxX - 1 && y == (maxY - 1);
    /* Flush only when a run is actually pending. `&&` binds tighter than
     * `||`, so without the inner parentheses a zero-length run would be
     * flushed on every pixel change. */
    if (*run > 0 && (*run == 0x2020 || px != px_prev || last_pixel))
    {
        if (*run < 33)
        {
            data[(*cur)++] = QOI_RUN_8 | (*run - 1);
        }
        else
        {
            *run -= 33;
            data[(*cur)++] = QOI_RUN_16 | (*run >> 8);
            data[(*cur)++] = *run & 0xFF;
        }
        *run = 0;
    }

    if (px != px_prev)
    {
        int index_pos = QOI_COLOR_HASH(px_);
        if (index[index_pos * 4] == px)
        {
            data[(*cur)++] = QOI_INDEX | index_pos;
        }
        else
        {
            pixel_cpy(index[index_pos * 4], px, 4);
            int vr = px[0] - px_prev[0];
            int vg = px[1] - px_prev[1];
            int vb = px[2] - px_prev[2];
            int va = px[3] - px_prev[3];

            if (
                vr > -17 && vr < 16 &&
                vg > -17 && vg < 16 &&
                vb > -17 && vb < 16 &&
                va > -17 && va < 16)
            {
                if (
                    va == 0 &&
                    vr > -3 && vr < 2 &&
                    vg > -3 && vg < 2 &&
                    vb > -3 && vb < 2)
                {
                    /* 2 bits per colour channel. */
                    data[(*cur)++] = QOI_DIFF_8 | (vr + 2) << 4 | (vg + 2) << 2 | (vb + 2);
                }
                else if (
                    va == 0 &&
                    vr > -17 && vr < 16 &&
                    vg > -9 && vg < 8 &&
                    vb > -9 && vb < 8)
                {
                    /* 5 bits red, 4 bits green, 4 bits blue. */
                    data[(*cur)++] = QOI_DIFF_16 | (vr + 16);
                    data[(*cur)++] = (vg + 8) << 4 | (vb + 8);
                }
                else
                {
                    /* 5 bits per channel packed across three bytes:
                     *   tttt rrrr | r ggggg bb | bbb aaaaa
                     * The low three bits of (vb + 16) go in the top of the
                     * last byte, hence the shift by 5 (a shift by 7 would
                     * drop two of them). */
                    data[(*cur)++] = QOI_DIFF_24 | (vr + 16) >> 1;
                    data[(*cur)++] = (vr + 16) << 7 | (vg + 16) << 2 | (vb + 16) >> 3;
                    data[(*cur)++] = (vb + 16) << 5 | (va + 16);
                }
            }
            else
            {
                /* Header flags say which channels follow as raw bytes. */
                data[(*cur)++] = QOI_COLOR | (vr ? 8 : 0) | (vg ? 4 : 0) | (vb ? 2 : 0) | (va ? 1 : 0);
                if (vr)
                {
                    data[(*cur)++] = px[0];
                }
                if (vg)
                {
                    data[(*cur)++] = px[1];
                }
                if (vb)
                {
                    data[(*cur)++] = px[2];
                }
                if (va)
                {
                    data[(*cur)++] = px[3];
                }
            }
        }
    }
    return 1;
}
#endif

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions