cosmopolitan/third_party/stb/idct-sse.S


/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi

Copyright 2020 Justine Alexandra Roberts Tunney

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
/	Computes inverse discrete cosine transform.
/
/	@param	%rdi points to the 8x8 output pixel block (unsigned bytes)
/	@param	%esi is the output stride in bytes
/	@param	%rdx points to the 8x8 block of int16 dct coefficients
/	@note	used to decode jpeg
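/
/	This is a scheduled translation of the SSE2 intrinsics path of
/	stbi__idct_simd() in stb_image. As a rough C sketch of the
/	structure (not the literal source):
/
/		void stbi__idct_simd(unsigned char *out, int out_stride,
/		                     short data[64]) {
/			// pass 1: 1-D idct down the eight columns
/			// transpose the 8x8 block of 16-bit results
/			// pass 2: same 1-D idct across the rows, +128
/			// clamp to 0..255, store 8 bytes per row at
/			// out + n*out_stride for n = 0..7
/		}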
.p2align 4
stbi__idct_simd$sse:
push %rbp
mov %rsp,%rbp
movslq %esi,%rsi
lea (%rdi,%rsi),%rax
sub $96,%rsp
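/	pass 1: one dimensional idct down the eight columns; coefficient
/	row n of the block is at n*16(%rdx); rows 2 and 6 feed the even
/	rotation, rows 1, 3, 5 and 7 the odd half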
movdqa 32(%rdx),%xmm0
movdqa 112(%rdx),%xmm9
movdqa 48(%rdx),%xmm1
movdqa 80(%rdx),%xmm7
movdqa %xmm0,%xmm2
punpcklwd 96(%rdx),%xmm2
punpckhwd 96(%rdx),%xmm0
movdqa %xmm9,%xmm8
movdqa 16(%rdx),%xmm5
movdqa %xmm2,%xmm3
movdqa %xmm2,%xmm6
movdqa %xmm0,%xmm2
pmaddwd .LC1(%rip),%xmm3
movdqa %xmm0,%xmm4
pmaddwd .LC1(%rip),%xmm2
pmaddwd .LC0(%rip),%xmm4
punpckhwd %xmm1,%xmm8
pmaddwd .LC0(%rip),%xmm6
movaps %xmm3,-48(%rbp)
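/	even part continues: sum and difference of rows 0 and 4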
movdqa (%rdx),%xmm3
movaps %xmm2,-64(%rbp)
movdqa 64(%rdx),%xmm2
movdqa %xmm3,%xmm0
movaps %xmm4,-32(%rbp)
paddw %xmm2,%xmm0
psubw %xmm2,%xmm3
movaps %xmm6,-16(%rbp)
movdqa %xmm0,%xmm4
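/	widen words to dwords scaled by 4096: interleave zeros below
/	each 16-bit value, then arithmetic shift right by 4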
pxor %xmm0,%xmm0
movdqa %xmm0,%xmm11
movdqa %xmm0,%xmm12
movdqa %xmm0,%xmm2
punpcklwd %xmm4,%xmm11
punpckhwd %xmm3,%xmm12
punpcklwd %xmm3,%xmm2
movdqa %xmm11,%xmm13
movdqa %xmm0,%xmm11
movdqa %xmm12,%xmm3
punpckhwd %xmm4,%xmm11
movdqa %xmm8,%xmm12
movdqa %xmm8,%xmm4
movdqa %xmm11,%xmm14
movdqa %xmm7,%xmm8
movdqa %xmm9,%xmm11
punpckhwd %xmm5,%xmm8
psrad $4,%xmm3
punpcklwd %xmm1,%xmm11
psrad $4,%xmm13
psrad $4,%xmm14
movdqa %xmm11,%xmm15
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm6
paddw %xmm7,%xmm1
pmaddwd .LC3(%rip),%xmm15
movaps %xmm14,-96(%rbp)
movdqa %xmm8,%xmm14
movdqa %xmm5,%xmm8
pmaddwd .LC2(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm12
paddw %xmm9,%xmm8
psrad $4,%xmm2
pmaddwd .LC3(%rip),%xmm4
pmaddwd .LC5(%rip),%xmm6
pmaddwd .LC4(%rip),%xmm14
movdqa %xmm4,%xmm10
movdqa %xmm7,%xmm4
movdqa %xmm8,%xmm7
punpcklwd %xmm5,%xmm4
punpcklwd %xmm1,%xmm7
punpckhwd %xmm1,%xmm8
movdqa %xmm4,%xmm13
movdqa %xmm7,%xmm9
pmaddwd .LC5(%rip),%xmm4
pmaddwd .LC6(%rip),%xmm9
movdqa %xmm8,%xmm5
movdqa %xmm7,%xmm1
pmaddwd .LC7(%rip),%xmm8
pmaddwd .LC6(%rip),%xmm5
movdqa %xmm15,%xmm7
paddd %xmm9,%xmm11
paddd %xmm9,%xmm4
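/	.LC8 = 512 is the rounding bias for the pass 1 descale (>>10)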
movdqa .LC8(%rip),%xmm9
paddd %xmm8,%xmm14
paddd %xmm10,%xmm8
movdqa -96(%rbp),%xmm10
paddd -64(%rbp),%xmm10
pmaddwd .LC7(%rip),%xmm1
pmaddwd .LC4(%rip),%xmm13
paddd %xmm5,%xmm12
paddd %xmm5,%xmm6
paddd %xmm9,%xmm10
movdqa -80(%rbp),%xmm5
paddd -48(%rbp),%xmm5
paddd %xmm1,%xmm13
paddd %xmm1,%xmm7
movdqa %xmm10,%xmm1
psubd %xmm6,%xmm10
paddd %xmm9,%xmm5
paddd %xmm6,%xmm1
psrad $10,%xmm10
movdqa -16(%rbp),%xmm6
movdqa %xmm1,%xmm15
movdqa %xmm5,%xmm1
psubd %xmm4,%xmm5
psrad $10,%xmm5
paddd %xmm4,%xmm1
paddd %xmm2,%xmm6
packssdw %xmm10,%xmm5
movdqa -32(%rbp),%xmm10
paddd %xmm9,%xmm6
paddd %xmm9,%xmm2
psrad $10,%xmm15
psrad $10,%xmm1
psubd -16(%rbp),%xmm2
paddd %xmm3,%xmm10
paddd %xmm9,%xmm3
packssdw %xmm15,%xmm1
paddd %xmm9,%xmm10
psubd -32(%rbp),%xmm3
movdqa %xmm10,%xmm4
psubd %xmm8,%xmm10
paddd %xmm8,%xmm4
psrad $10,%xmm10
movdqa %xmm4,%xmm15
movdqa %xmm6,%xmm4
psubd %xmm7,%xmm6
psrad $10,%xmm6
psrad $10,%xmm15
paddd %xmm7,%xmm4
movdqa %xmm3,%xmm7
psubd %xmm14,%xmm3
packssdw %xmm10,%xmm6
psrad $10,%xmm3
psrad $10,%xmm4
paddd %xmm14,%xmm7
movdqa %xmm7,%xmm8
movdqa %xmm2,%xmm7
psubd %xmm13,%xmm2
paddd %xmm13,%xmm7
psrad $10,%xmm8
packssdw %xmm15,%xmm4
psrad $10,%xmm7
psrad $10,%xmm2
packssdw %xmm8,%xmm7
movdqa -80(%rbp),%xmm8
packssdw %xmm3,%xmm2
paddd %xmm9,%xmm8
paddd -96(%rbp),%xmm9
psubd -48(%rbp),%xmm8
psubd -64(%rbp),%xmm9
movdqa %xmm8,%xmm3
movdqa %xmm9,%xmm10
psubd %xmm11,%xmm8
paddd %xmm12,%xmm10
paddd %xmm11,%xmm3
psrad $10,%xmm8
psrad $10,%xmm10
psrad $10,%xmm3
psubd %xmm12,%xmm9
psrad $10,%xmm9
packssdw %xmm10,%xmm3
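/	transpose the 8x8 block of 16-bit intermediates with word and
/	dword interleaves so pass 2 can run along the rows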
movdqa %xmm1,%xmm10
packssdw %xmm9,%xmm8
movdqa %xmm7,%xmm9
punpckhwd %xmm6,%xmm7
punpcklwd %xmm6,%xmm9
punpcklwd %xmm8,%xmm10
punpckhwd %xmm8,%xmm1
movdqa %xmm3,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm5,%xmm3
punpcklwd %xmm5,%xmm6
punpcklwd %xmm2,%xmm8
movdqa %xmm3,%xmm5
punpckhwd %xmm2,%xmm4
movdqa %xmm8,%xmm3
movdqa %xmm10,%xmm2
punpckhwd %xmm6,%xmm8
punpcklwd %xmm6,%xmm3
punpcklwd %xmm9,%xmm2
movdqa %xmm8,%xmm6
movdqa %xmm4,%xmm8
punpckhwd %xmm9,%xmm10
punpcklwd %xmm5,%xmm8
punpckhwd %xmm5,%xmm4
movdqa %xmm2,%xmm5
punpcklwd %xmm3,%xmm5
punpckhwd %xmm3,%xmm2
movdqa %xmm1,%xmm15
movdqa %xmm10,%xmm3
punpckhwd %xmm7,%xmm1
punpckhwd %xmm6,%xmm10
punpcklwd %xmm6,%xmm3
movdqa %xmm1,%xmm6
punpckhwd %xmm4,%xmm1
punpcklwd %xmm4,%xmm6
movdqa %xmm3,%xmm4
punpcklwd %xmm7,%xmm15
punpcklwd %xmm6,%xmm4
punpckhwd %xmm6,%xmm3
movdqa %xmm15,%xmm7
movdqa %xmm4,%xmm6
punpcklwd %xmm8,%xmm7
movdqa %xmm3,%xmm11
movdqa %xmm4,%xmm12
movdqa %xmm3,%xmm4
movdqa %xmm5,%xmm3
paddw %xmm7,%xmm3
movdqa %xmm1,%xmm9
punpckhwd %xmm8,%xmm15
punpcklwd %xmm10,%xmm9
psubw %xmm7,%xmm5
movdqa %xmm15,%xmm7
movdqa %xmm9,%xmm14
punpcklwd %xmm2,%xmm7
movdqa %xmm1,%xmm8
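/	pass 2 begins here, interleaved by scheduling with the tail of
/	the transpose; same butterfly as pass 1 but descaled by >>17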
pmaddwd .LC0(%rip),%xmm6
punpckhwd %xmm10,%xmm8
paddw %xmm15,%xmm10
movaps %xmm6,-16(%rbp)
pmaddwd .LC1(%rip),%xmm4
movdqa %xmm0,%xmm6
pmaddwd .LC0(%rip),%xmm11
pmaddwd .LC2(%rip),%xmm14
pmaddwd .LC1(%rip),%xmm12
pmaddwd .LC3(%rip),%xmm9
movaps %xmm4,-64(%rbp)
movdqa %xmm3,%xmm4
movdqa %xmm0,%xmm3
punpckhwd %xmm4,%xmm6
punpcklwd %xmm4,%xmm3
movdqa %xmm0,%xmm4
movaps %xmm11,-32(%rbp)
movdqa %xmm6,%xmm13
movdqa %xmm15,%xmm6
punpcklwd %xmm5,%xmm4
movaps %xmm12,-48(%rbp)
punpckhwd %xmm2,%xmm6
paddw %xmm1,%xmm2
punpckhwd %xmm5,%xmm0
movdqa %xmm14,%xmm11
movdqa %xmm2,%xmm5
movdqa %xmm7,%xmm14
punpckhwd %xmm10,%xmm2
psrad $4,%xmm13
punpcklwd %xmm10,%xmm5
movaps %xmm13,-80(%rbp)
movdqa %xmm8,%xmm12
movdqa %xmm5,%xmm10
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC6(%rip),%xmm10
movdqa %xmm2,%xmm15
pmaddwd .LC7(%rip),%xmm5
pmaddwd .LC3(%rip),%xmm8
pmaddwd .LC5(%rip),%xmm7
movdqa %xmm14,%xmm13
movdqa %xmm6,%xmm14
paddd %xmm5,%xmm13
paddd %xmm5,%xmm9
pmaddwd .LC5(%rip),%xmm6
psrad $4,%xmm3
pmaddwd .LC6(%rip),%xmm15
paddd %xmm10,%xmm7
paddd %xmm10,%xmm11
psrad $4,%xmm4
pmaddwd .LC2(%rip),%xmm12
psrad $4,%xmm0
pmaddwd .LC4(%rip),%xmm14
pmaddwd .LC7(%rip),%xmm2
movdqa -80(%rbp),%xmm5
paddd %xmm15,%xmm12
paddd -64(%rbp),%xmm5
paddd %xmm2,%xmm14
paddd %xmm8,%xmm2
movdqa -48(%rbp),%xmm8
paddd %xmm6,%xmm15
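/	.LC9 = 65536 + (128<<17): rounding for the >>17 descale merged
/	with the +128 level shift that recenters the pixels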
movdqa .LC9(%rip),%xmm6
paddd %xmm3,%xmm8
paddd %xmm6,%xmm8
paddd %xmm6,%xmm5
movdqa %xmm5,%xmm10
movdqa %xmm8,%xmm1
psubd %xmm15,%xmm5
psubd %xmm7,%xmm8
psrad $17,%xmm5
paddd %xmm7,%xmm1
movdqa -32(%rbp),%xmm7
psrad $17,%xmm8
paddd %xmm15,%xmm10
paddd %xmm6,%xmm3
packssdw %xmm5,%xmm8
movdqa -16(%rbp),%xmm5
paddd %xmm0,%xmm7
paddd %xmm6,%xmm0
paddd %xmm6,%xmm7
psrad $17,%xmm10
psubd -32(%rbp),%xmm0
paddd %xmm4,%xmm5
psrad $17,%xmm1
movdqa %xmm7,%xmm15
paddd %xmm6,%xmm5
packssdw %xmm10,%xmm1
psubd %xmm2,%xmm7
movdqa %xmm5,%xmm10
paddd %xmm6,%xmm4
psubd %xmm9,%xmm5
psubd -16(%rbp),%xmm4
psrad $17,%xmm7
paddd %xmm2,%xmm15
psrad $17,%xmm5
psubd -48(%rbp),%xmm3
paddd -80(%rbp),%xmm6
packssdw %xmm7,%xmm5
movdqa %xmm4,%xmm2
movdqa %xmm0,%xmm7
psubd -64(%rbp),%xmm6
paddd %xmm14,%xmm7
psrad $17,%xmm15
paddd %xmm13,%xmm2
psubd %xmm14,%xmm0
psrad $17,%xmm7
psubd %xmm13,%xmm4
psrad $17,%xmm0
paddd %xmm9,%xmm10
psrad $17,%xmm2
psrad $17,%xmm4
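/	saturate to unsigned bytes (packuswb clamps to 0..255) and
/	interleave the bytes back into row order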
packuswb %xmm8,%xmm5
packssdw %xmm0,%xmm4
packssdw %xmm7,%xmm2
movdqa %xmm3,%xmm0
movdqa %xmm6,%xmm7
psrad $17,%xmm10
paddd %xmm11,%xmm0
paddd %xmm12,%xmm7
psubd %xmm12,%xmm6
packssdw %xmm15,%xmm10
psubd %xmm11,%xmm3
psrad $17,%xmm7
packuswb %xmm10,%xmm1
psrad $17,%xmm0
psrad $17,%xmm6
psrad $17,%xmm3
packssdw %xmm7,%xmm0
packssdw %xmm6,%xmm3
packuswb %xmm0,%xmm2
movdqa %xmm1,%xmm0
packuswb %xmm4,%xmm3
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpcklbw %xmm3,%xmm0
punpcklbw %xmm5,%xmm4
punpckhbw %xmm3,%xmm1
movdqa %xmm2,%xmm3
movdqa %xmm0,%xmm2
movdqa %xmm1,%xmm5
punpcklbw %xmm4,%xmm2
punpckhbw %xmm4,%xmm0
punpcklbw %xmm3,%xmm5
movdqa %xmm2,%xmm4
punpckhbw %xmm5,%xmm2
punpckhbw %xmm3,%xmm1
punpcklbw %xmm5,%xmm4
movdqa %xmm0,%xmm3
punpckhbw %xmm1,%xmm0
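/	write the eight 8-pixel rows at %rdi, advancing by the stride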
movq %xmm4,(%rdi)
pshufd $78,%xmm4,%xmm4
punpcklbw %xmm1,%xmm3
movq %xmm4,(%rax)
add %rsi,%rax
movq %xmm2,(%rax)
add %rsi,%rax
pshufd $78,%xmm2,%xmm2
movq %xmm2,(%rax)
add %rsi,%rax
movq %xmm3,(%rax)
add %rsi,%rax
pshufd $78,%xmm3,%xmm3
movq %xmm3,(%rax)
movq %xmm0,(%rax,%rsi)
pshufd $78,%xmm0,%xmm0
movq %xmm0,(%rax,%rsi,2)
leave
ret
.endfn stbi__idct_simd$sse,globl
.rodata.cst16
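/	pmaddwd rotation constants: each word pair is round(x*4096) of
/	the aan idct cosine terms, e.g. 2217 ≈ .5411961*4096 and
/	4816 ≈ 1.175875602*4096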
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
.LC8: .long 0x200,0x200,0x200,0x200
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000