/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ Permission to use, copy, modify, and/or distribute this software for        │
│ any purpose with or without fee is hereby granted, provided that the        │
│ above copyright notice and this permission notice appear in all copies.     │
│                                                                              │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL               │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED               │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE            │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL        │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR       │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER              │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR            │
│ PERFORMANCE OF THIS SOFTWARE.                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.h"
/ Computes inverse discrete cosine transform.
/
/ Transforms one dequantized 8x8 block of 16-bit DCT coefficients into
/ an 8x8 block of unsigned pixel bytes.
/
/ @param %rdi points to the destination pixel block
/ @param %esi is the destination stride in bytes (sign-extended)
/ @param %rdx points to the 8x8 block of 16-bit coefficients
/ @note used to decode jpeg
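/
/ A minimal C-level sketch of the entry point, assuming the stb_image
/ signature (System V ABI: out in %rdi, out_stride in %esi, data in
/ %rdx); pixels, stride, x, y, and coeffs below are illustrative names:
/
/     void stbi__idct_simd(unsigned char *out, int out_stride,
/                          short data[64]);
/
/     stbi__idct_simd(pixels + y * stride + x, stride, coeffs);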
.p2align 4
stbi__idct_simd$sse:
push %rbp
mov %rsp,%rbp
movslq %esi,%rsi
lea (%rdi,%rsi),%rax
sub $96,%rsp
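/ First pass: a 1-D IDCT down each of the eight columns of the 8x8
/ coefficient block at %rdx (16 bytes per row). Rows 0 and 4 form the
/ even part directly; rows 2/6 and 1/3/5/7 are rotated via pmaddwd with
/ the 4096-scaled cosine constants at .LC0-.LC7. %rax already points at
/ the second output row (%rdi + stride), and the 96-byte stack frame
/ holds spilled intermediates; the scheduling below interleaves the
/ even and odd parts.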
movdqa 32(%rdx),%xmm0
movdqa 112(%rdx),%xmm9
movdqa 48(%rdx),%xmm1
movdqa 80(%rdx),%xmm7
movdqa %xmm0,%xmm2
punpcklwd 96(%rdx),%xmm2
punpckhwd 96(%rdx),%xmm0
movdqa %xmm9,%xmm8
movdqa 16(%rdx),%xmm5
movdqa %xmm2,%xmm3
movdqa %xmm2,%xmm6
movdqa %xmm0,%xmm2
pmaddwd .LC1(%rip),%xmm3
movdqa %xmm0,%xmm4
pmaddwd .LC1(%rip),%xmm2
pmaddwd .LC0(%rip),%xmm4
punpckhwd %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
movaps %xmm3,-48(%rbp)
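/ Even part: row 0 plus/minus row 4, widened to 32 bits with twelve
/ fraction bits (interleave with zero, then arithmetic shift right by 4).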
movdqa (%rdx),%xmm3
movaps %xmm2,-64(%rbp)
movdqa 64(%rdx),%xmm2
movdqa %xmm3,%xmm0
movaps %xmm4,-32(%rbp)
paddw %xmm2,%xmm0
psubw %xmm2,%xmm3
movaps %xmm6,-16(%rbp)
movdqa %xmm0,%xmm4
pxor %xmm0,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm0,%xmm12
movdqa %xmm0,%xmm2
punpcklwd %xmm4,%xmm11
punpckhwd %xmm3,%xmm12
punpcklwd %xmm3,%xmm2
movdqa %xmm11,%xmm13
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm3
punpckhwd %xmm4,%xmm11
movdqa %xmm8,%xmm12
movdqa %xmm8,%xmm4
movdqa %xmm11,%xmm14
movdqa %xmm7,%xmm8
movdqa %xmm9,%xmm11
punpckhwd %xmm5,%xmm8
psrad $4,%xmm3
punpcklwd %xmm1,%xmm11
psrad $4,%xmm13
psrad $4,%xmm14
movdqa %xmm11,%xmm15
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm6
paddw %xmm7,%xmm1
pmaddwd .LC3(%rip),%xmm15
movaps %xmm14,-96(%rbp)
movdqa %xmm8,%xmm14
movdqa %xmm5,%xmm8
pmaddwd .LC2(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm12
paddw %xmm9,%xmm8
psrad $4,%xmm2
pmaddwd .LC3(%rip),%xmm4
pmaddwd .LC5(%rip),%xmm6
pmaddwd .LC4(%rip),%xmm14
movdqa %xmm4,%xmm10
movdqa %xmm7,%xmm4
movdqa %xmm8,%xmm7
punpcklwd %xmm5,%xmm4
punpcklwd %xmm1,%xmm7
punpckhwd %xmm1,%xmm8
movdqa %xmm4,%xmm13
movdqa %xmm7,%xmm9
pmaddwd .LC5(%rip),%xmm4
pmaddwd .LC6(%rip),%xmm9
movdqa %xmm8,%xmm5
movdqa %xmm7,%xmm1
pmaddwd .LC7(%rip),%xmm8
pmaddwd .LC6(%rip),%xmm5
movdqa %xmm15,%xmm7
paddd %xmm9,%xmm11
paddd %xmm9,%xmm4
movdqa .LC8(%rip),%xmm9
paddd %xmm8,%xmm14
paddd %xmm10,%xmm8
movdqa -96(%rbp),%xmm10
paddd -64(%rbp),%xmm10
pmaddwd .LC7(%rip),%xmm1
pmaddwd .LC4(%rip),%xmm13
paddd %xmm5,%xmm12
paddd %xmm5,%xmm6
paddd %xmm9,%xmm10
movdqa -80(%rbp),%xmm5
paddd -48(%rbp),%xmm5
paddd %xmm1,%xmm13
paddd %xmm1,%xmm7
movdqa %xmm10,%xmm1
psubd %xmm6,%xmm10
paddd %xmm9,%xmm5
paddd %xmm6,%xmm1
psrad $10,%xmm10
movdqa -16(%rbp),%xmm6
movdqa %xmm1,%xmm15
movdqa %xmm5,%xmm1
psubd %xmm4,%xmm5
psrad $10,%xmm5
paddd %xmm4,%xmm1
paddd %xmm2,%xmm6
packssdw %xmm10,%xmm5
movdqa -32(%rbp),%xmm10
paddd %xmm9,%xmm6
paddd %xmm9,%xmm2
psrad $10,%xmm15
psrad $10,%xmm1
psubd -16(%rbp),%xmm2
paddd %xmm3,%xmm10
paddd %xmm9,%xmm3
packssdw %xmm15,%xmm1
paddd %xmm9,%xmm10
psubd -32(%rbp),%xmm3
movdqa %xmm10,%xmm4
psubd %xmm8,%xmm10
paddd %xmm8,%xmm4
psrad $10,%xmm10
movdqa %xmm4,%xmm15
movdqa %xmm6,%xmm4
psubd %xmm7,%xmm6
psrad $10,%xmm6
psrad $10,%xmm15
paddd %xmm7,%xmm4
movdqa %xmm3,%xmm7
psubd %xmm14,%xmm3
packssdw %xmm10,%xmm6
psrad $10,%xmm3
psrad $10,%xmm4
paddd %xmm14,%xmm7
movdqa %xmm7,%xmm8
movdqa %xmm2,%xmm7
psubd %xmm13,%xmm2
paddd %xmm13,%xmm7
psrad $10,%xmm8
packssdw %xmm15,%xmm4
psrad $10,%xmm7
psrad $10,%xmm2
packssdw %xmm8,%xmm7
movdqa -80(%rbp),%xmm8
packssdw %xmm3,%xmm2
paddd %xmm9,%xmm8
paddd -96(%rbp),%xmm9
psubd -48(%rbp),%xmm8
psubd -64(%rbp),%xmm9
movdqa %xmm8,%xmm3
movdqa %xmm9,%xmm10
psubd %xmm11,%xmm8
paddd %xmm12,%xmm10
paddd %xmm11,%xmm3
psrad $10,%xmm8
psrad $10,%xmm10
psrad $10,%xmm3
psubd %xmm12,%xmm9
psrad $10,%xmm9
packssdw %xmm10,%xmm3
movdqa %xmm1,%xmm10
packssdw %xmm9,%xmm8
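/ Transpose the 8x8 block of 16-bit pass-one results so the second pass
/ can run along the other dimension.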
movdqa %xmm7,%xmm9
punpckhwd %xmm6,%xmm7
punpcklwd %xmm6,%xmm9
punpcklwd %xmm8,%xmm10
punpckhwd %xmm8,%xmm1
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm5,%xmm3
punpcklwd %xmm5,%xmm6
punpcklwd %xmm2,%xmm8
movdqa %xmm3,%xmm5
punpckhwd %xmm2,%xmm4
movdqa %xmm8,%xmm3
movdqa %xmm10,%xmm2
punpckhwd %xmm6,%xmm8
punpcklwd %xmm6,%xmm3
punpcklwd %xmm9,%xmm2
movdqa %xmm8,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm9,%xmm10
punpcklwd %xmm5,%xmm8
punpckhwd %xmm5,%xmm4
movdqa %xmm2,%xmm5
punpcklwd %xmm3,%xmm5
punpckhwd %xmm3,%xmm2
movdqa %xmm1,%xmm15
movdqa %xmm10,%xmm3
punpckhwd %xmm7,%xmm1
punpckhwd %xmm6,%xmm10
punpcklwd %xmm6,%xmm3
movdqa %xmm1,%xmm6
punpckhwd %xmm4,%xmm1
punpcklwd %xmm4,%xmm6
movdqa %xmm3,%xmm4
punpcklwd %xmm7,%xmm15
punpcklwd %xmm6,%xmm4
punpckhwd %xmm6,%xmm3
movdqa %xmm15,%xmm7
movdqa %xmm4,%xmm6
punpcklwd %xmm8,%xmm7
movdqa %xmm3,%xmm11
movdqa %xmm4,%xmm12
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm3
paddw %xmm7,%xmm3
movdqa %xmm1,%xmm9
punpckhwd %xmm8,%xmm15
punpcklwd %xmm10,%xmm9
psubw %xmm7,%xmm5
movdqa %xmm15,%xmm7
movdqa %xmm9,%xmm14
punpcklwd %xmm2,%xmm7
movdqa %xmm1,%xmm8
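/ Second pass: the same 1-D IDCT applied to the transposed data; its
/ opening adds and rotations are interleaved by the scheduler with the
/ tail of the transpose above.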
pmaddwd .LC0(%rip),%xmm6
punpckhwd %xmm10,%xmm8
paddw %xmm15,%xmm10
movaps %xmm6,-16(%rbp)
pmaddwd .LC1(%rip),%xmm4
movdqa %xmm0,%xmm6
pmaddwd .LC0(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm14
pmaddwd .LC1(%rip),%xmm12
pmaddwd .LC3(%rip),%xmm9
movaps %xmm4,-64(%rbp)
movdqa %xmm3,%xmm4
movdqa %xmm0,%xmm3
punpckhwd %xmm4,%xmm6
punpcklwd %xmm4,%xmm3
movdqa %xmm0,%xmm4
movaps %xmm11,-32(%rbp)
movdqa %xmm6,%xmm13
movdqa %xmm15,%xmm6
punpcklwd %xmm5,%xmm4
movaps %xmm12,-48(%rbp)
punpckhwd %xmm2,%xmm6
paddw %xmm1,%xmm2
punpckhwd %xmm5,%xmm0
movdqa %xmm14,%xmm11
movdqa %xmm2,%xmm5
movdqa %xmm7,%xmm14
punpckhwd %xmm10,%xmm2
psrad $4,%xmm13
punpcklwd %xmm10,%xmm5
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm12
movdqa %xmm5,%xmm10
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC6(%rip),%xmm10
movdqa %xmm2,%xmm15
pmaddwd .LC7(%rip),%xmm5
pmaddwd .LC3(%rip),%xmm8
pmaddwd .LC5(%rip),%xmm7
movdqa %xmm14,%xmm13
movdqa %xmm6,%xmm14
paddd %xmm5,%xmm13
paddd %xmm5,%xmm9
pmaddwd .LC5(%rip),%xmm6
psrad $4,%xmm3
pmaddwd .LC6(%rip),%xmm15
paddd %xmm10,%xmm7
paddd %xmm10,%xmm11
psrad $4,%xmm4
pmaddwd .LC2(%rip),%xmm12
psrad $4,%xmm0
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC7(%rip),%xmm2
movdqa -80(%rbp),%xmm5
paddd %xmm15,%xmm12
paddd -64(%rbp),%xmm5
paddd %xmm2,%xmm14
paddd %xmm8,%xmm2
movdqa -48(%rbp),%xmm8
paddd %xmm6,%xmm15
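/ .LC9 = (128<<17) + 65536: rounding bias for the final arithmetic right
/ shift by 17 plus the +128 level shift back into unsigned pixel range.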
movdqa .LC9(%rip),%xmm6
paddd %xmm3,%xmm8
paddd %xmm6,%xmm8
paddd %xmm6,%xmm5
movdqa %xmm5,%xmm10
movdqa %xmm8,%xmm1
psubd %xmm15,%xmm5
psubd %xmm7,%xmm8
psrad $17,%xmm5
paddd %xmm7,%xmm1
movdqa -32(%rbp),%xmm7
psrad $17,%xmm8
paddd %xmm15,%xmm10
paddd %xmm6,%xmm3
packssdw %xmm5,%xmm8
movdqa -16(%rbp),%xmm5
paddd %xmm0,%xmm7
paddd %xmm6,%xmm0
paddd %xmm6,%xmm7
psrad $17,%xmm10
psubd -32(%rbp),%xmm0
paddd %xmm4,%xmm5
psrad $17,%xmm1
movdqa %xmm7,%xmm15
paddd %xmm6,%xmm5
packssdw %xmm10,%xmm1
psubd %xmm2,%xmm7
movdqa %xmm5,%xmm10
paddd %xmm6,%xmm4
psubd %xmm9,%xmm5
psubd -16(%rbp),%xmm4
psrad $17,%xmm7
paddd %xmm2,%xmm15
psrad $17,%xmm5
psubd -48(%rbp),%xmm3
paddd -80(%rbp),%xmm6
packssdw %xmm7,%xmm5
movdqa %xmm4,%xmm2
movdqa %xmm0,%xmm7
psubd -64(%rbp),%xmm6
paddd %xmm14,%xmm7
psrad $17,%xmm15
paddd %xmm13,%xmm2
psubd %xmm14,%xmm0
psrad $17,%xmm7
psubd %xmm13,%xmm4
psrad $17,%xmm0
paddd %xmm9,%xmm10
psrad $17,%xmm2
psrad $17,%xmm4
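/ Saturate to unsigned bytes (packuswb clamps to 0..255) and transpose
/ the 8x8 byte block back into row order.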
packuswb %xmm8,%xmm5
packssdw %xmm0,%xmm4
packssdw %xmm7,%xmm2
movdqa %xmm3,%xmm0
movdqa %xmm6,%xmm7
psrad $17,%xmm10
paddd %xmm11,%xmm0
paddd %xmm12,%xmm7
psubd %xmm12,%xmm6
packssdw %xmm15,%xmm10
psubd %xmm11,%xmm3
psrad $17,%xmm7
packuswb %xmm10,%xmm1
psrad $17,%xmm0
psrad $17,%xmm6
psrad $17,%xmm3
packssdw %xmm7,%xmm0
packssdw %xmm6,%xmm3
packuswb %xmm0,%xmm2
movdqa %xmm1,%xmm0
packuswb %xmm4,%xmm3
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpcklbw %xmm3,%xmm0
punpcklbw %xmm5,%xmm4
punpckhbw %xmm3,%xmm1
movdqa %xmm2,%xmm3
movdqa %xmm0,%xmm2
movdqa %xmm1,%xmm5
punpcklbw %xmm4,%xmm2
punpckhbw %xmm4,%xmm0
punpcklbw %xmm3,%xmm5
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpckhbw %xmm3,%xmm1
punpcklbw %xmm5,%xmm4
movdqa %xmm0,%xmm3
punpckhbw %xmm1,%xmm0
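/ Store the eight 8-pixel output rows at successive multiples of the
/ stride.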
movq %xmm4,(%rdi)
pshufd $78,%xmm4,%xmm4
punpcklbw %xmm1,%xmm3
movq %xmm4,(%rax)
add %rsi,%rax
movq %xmm2,(%rax)
add %rsi,%rax
pshufd $78,%xmm2,%xmm2
movq %xmm2,(%rax)
add %rsi,%rax
movq %xmm3,(%rax)
add %rsi,%rax
pshufd $78,%xmm3,%xmm3
movq %xmm3,(%rax)
movq %xmm0,(%rax,%rsi)
pshufd $78,%xmm0,%xmm0
movq %xmm0,(%rax,%rsi,2)
leave
ret
.endfn stbi__idct_simd$sse,globl
.rodata.cst16
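/ 16-bit cosine rotation constants in 4096-scaled fixed point, as used
/ by stb_image's stbi__idct_simd, followed by the per-pass rounding
/ biases (.LC8 for the >>10 column pass, .LC9 for the >>17 row pass).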
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
.LC8: .long 0x200,0x200,0x200,0x200
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000