427 lines
10 KiB
ArmAsm
427 lines
10 KiB
ArmAsm
/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
|
|
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│
|
|
╞══════════════════════════════════════════════════════════════════════════════╡
|
|
│ Copyright 2020 Justine Alexandra Roberts Tunney │
|
|
│ │
|
|
│ Permission to use, copy, modify, and/or distribute this software for │
|
|
│ any purpose with or without fee is hereby granted, provided that the │
|
|
│ above copyright notice and this permission notice appear in all copies. │
|
|
│ │
|
|
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
|
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
|
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
|
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
|
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
|
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
|
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
|
│ PERFORMANCE OF THIS SOFTWARE. │
|
|
╚─────────────────────────────────────────────────────────────────────────────*/
|
|
#include "libc/macros.h"
|
|
|
|
/ Computes inverse discrete cosine transform.
|
|
/
|
|
/ @note used to decode jpeg
|
|
.p2align 4
|
|
/* stbi__idct_simd$sse: 8x8 inverse DCT using SSE2 integer instructions.
 *
 * Register inputs, as read by the code below:
 *   %rdi  destination; eight 8-byte rows are stored at %rdi + k*stride
 *         for k = 0..7
 *   %esi  stride between destination rows, a signed 32-bit value
 *         (sign-extended with movslq)
 *   %rdx  source; eight 16-byte vectors of 16-bit words at offsets
 *         0,16,...,112, accessed with movdqa and legacy-SSE memory
 *         operands, so the pointer must be 16-byte aligned
 * NOTE(review): presumably the C prototype is stb_image's
 * stbi__idct_simd(out, out_stride, data) -- confirm against the caller.
 *
 * Clobbers %rax, %rsi and %xmm0-%xmm15; uses 96 bytes of stack below
 * %rbp for spilled vectors. The value left in %rax is a row address,
 * not a meaningful result.
 *
 * Structure: the same butterfly network runs twice. The first run adds
 * .LC8 and shifts right by 10; its 16-bit results go through an 8x8
 * word transpose; the second run adds .LC9 and shifts right by 17. The
 * final values are saturated to unsigned bytes, byte-transposed and
 * stored. Instructions of neighbouring phases are interleaved, so the
 * section comments mark where a phase begins rather than a hard border.
 * In the comments rowN is the vector loaded from N*16(%rdx) (first run)
 * or row N of the transposed intermediate (second run); "lo"/"hi" are
 * the halves produced by punpcklwd/punpckhwd.
 */
stbi__idct_simd$sse:
|
|
/* Frame setup. The movaps spills below need %rbp to be 16-byte aligned;
   this assumes the caller kept the stack 16-byte aligned at the call. */
push %rbp
|
|
mov %rsp,%rbp
|
|
/* %rsi = sign-extended stride, %rax = address of output row 1 */
movslq %esi,%rsi
|
|
lea (%rdi,%rsi),%rax
|
|
sub $96,%rsp
|
|
/* ---- First run ---- */
/* xmm0 = row2, xmm9 = row7, xmm1 = row3, xmm7 = row5 */
movdqa 32(%rdx),%xmm0
|
|
movdqa 112(%rdx),%xmm9
|
|
movdqa 48(%rdx),%xmm1
|
|
movdqa 80(%rdx),%xmm7
|
|
movdqa %xmm0,%xmm2
|
|
/* Interleave row2 with row6 so that pmaddwd yields, per 32-bit lane,
   c0*row2 + c1*row6 for a constant pair (c0,c1). */
punpcklwd 96(%rdx),%xmm2
|
|
punpckhwd 96(%rdx),%xmm0
|
|
movdqa %xmm9,%xmm8
|
|
/* xmm5 = row1 */
movdqa 16(%rdx),%xmm5
|
|
movdqa %xmm2,%xmm3
|
|
movdqa %xmm2,%xmm6
|
|
movdqa %xmm0,%xmm2
|
|
/* Even part, rotated pair: the .LC1 products are spilled to -48 (lo) and
   -64 (hi), the .LC0 products to -16 (lo) and -32 (hi). */
pmaddwd .LC1(%rip),%xmm3
|
|
movdqa %xmm0,%xmm4
|
|
pmaddwd .LC1(%rip),%xmm2
|
|
pmaddwd .LC0(%rip),%xmm4
|
|
/* xmm8 = high interleave of row7 with row3 (odd part, used later) */
punpckhwd %xmm1,%xmm8
|
|
pmaddwd .LC0(%rip),%xmm6
|
|
movaps %xmm3,-48(%rbp)
|
|
/* xmm0 = row0 + row4, xmm3 = row0 - row4 (16-bit adds) */
movdqa (%rdx),%xmm3
|
|
movaps %xmm2,-64(%rbp)
|
|
movdqa 64(%rdx),%xmm2
|
|
movdqa %xmm3,%xmm0
|
|
movaps %xmm4,-32(%rbp)
|
|
paddw %xmm2,%xmm0
|
|
psubw %xmm2,%xmm3
|
|
movaps %xmm6,-16(%rbp)
|
|
movdqa %xmm0,%xmm4
|
|
/* xmm0 = 0 and stays zero until the second run has used it. Unpacking a
   word above a zero word gives value<<16 per dword; the later psrad $4
   leaves value<<12 as a signed 32-bit number. */
pxor %xmm0,%xmm0
|
|
movdqa %xmm0,%xmm11
|
|
movdqa %xmm0,%xmm12
|
|
movdqa %xmm0,%xmm2
|
|
punpcklwd %xmm4,%xmm11
|
|
punpckhwd %xmm3,%xmm12
|
|
punpcklwd %xmm3,%xmm2
|
|
movdqa %xmm11,%xmm13
|
|
movdqa %xmm0,%xmm11
|
|
movdqa %xmm12,%xmm3
|
|
punpckhwd %xmm4,%xmm11
|
|
/* Odd part: row7 is paired with row3 (.LC2/.LC3), row5 with row1
   (.LC4/.LC5), and row1+row7 with row3+row5 (.LC6/.LC7). */
movdqa %xmm8,%xmm12
|
|
movdqa %xmm8,%xmm4
|
|
movdqa %xmm11,%xmm14
|
|
movdqa %xmm7,%xmm8
|
|
movdqa %xmm9,%xmm11
|
|
punpckhwd %xmm5,%xmm8
|
|
psrad $4,%xmm3
|
|
punpcklwd %xmm1,%xmm11
|
|
psrad $4,%xmm13
|
|
psrad $4,%xmm14
|
|
movdqa %xmm11,%xmm15
|
|
/* widened (row0+row4): lo lanes spilled to -80, hi lanes to -96 */
movaps %xmm13,-80(%rbp)
|
|
movdqa %xmm8,%xmm6
|
|
/* xmm1 = row3 + row5 */
paddw %xmm7,%xmm1
|
|
pmaddwd .LC3(%rip),%xmm15
|
|
movaps %xmm14,-96(%rbp)
|
|
movdqa %xmm8,%xmm14
|
|
movdqa %xmm5,%xmm8
|
|
pmaddwd .LC2(%rip),%xmm11
|
|
pmaddwd .LC2(%rip),%xmm12
|
|
/* xmm8 = row1 + row7 */
paddw %xmm9,%xmm8
|
|
psrad $4,%xmm2
|
|
pmaddwd .LC3(%rip),%xmm4
|
|
pmaddwd .LC5(%rip),%xmm6
|
|
pmaddwd .LC4(%rip),%xmm14
|
|
movdqa %xmm4,%xmm10
|
|
movdqa %xmm7,%xmm4
|
|
movdqa %xmm8,%xmm7
|
|
punpcklwd %xmm5,%xmm4
|
|
punpcklwd %xmm1,%xmm7
|
|
punpckhwd %xmm1,%xmm8
|
|
movdqa %xmm4,%xmm13
|
|
movdqa %xmm7,%xmm9
|
|
pmaddwd .LC5(%rip),%xmm4
|
|
pmaddwd .LC6(%rip),%xmm9
|
|
movdqa %xmm8,%xmm5
|
|
movdqa %xmm7,%xmm1
|
|
pmaddwd .LC7(%rip),%xmm8
|
|
pmaddwd .LC6(%rip),%xmm5
|
|
movdqa %xmm15,%xmm7
|
|
/* Combine the odd products: each .LC6/.LC7 product is added to two of
   the .LC2-.LC5 products. */
paddd %xmm9,%xmm11
|
|
paddd %xmm9,%xmm4
|
|
/* xmm9 = .LC8, the constant added before every psrad $10 of this run */
movdqa .LC8(%rip),%xmm9
|
|
paddd %xmm8,%xmm14
|
|
paddd %xmm10,%xmm8
|
|
movdqa -96(%rbp),%xmm10
|
|
paddd -64(%rbp),%xmm10
|
|
pmaddwd .LC7(%rip),%xmm1
|
|
pmaddwd .LC4(%rip),%xmm13
|
|
paddd %xmm5,%xmm12
|
|
paddd %xmm5,%xmm6
|
|
paddd %xmm9,%xmm10
|
|
movdqa -80(%rbp),%xmm5
|
|
paddd -48(%rbp),%xmm5
|
|
paddd %xmm1,%xmm13
|
|
paddd %xmm1,%xmm7
|
|
/* Butterflies: (even term + .LC8) plus/minus odd term, arithmetic shift
   right by 10, then packssdw joins lo and hi lanes into one 16-bit row. */
movdqa %xmm10,%xmm1
|
|
psubd %xmm6,%xmm10
|
|
paddd %xmm9,%xmm5
|
|
paddd %xmm6,%xmm1
|
|
psrad $10,%xmm10
|
|
movdqa -16(%rbp),%xmm6
|
|
movdqa %xmm1,%xmm15
|
|
movdqa %xmm5,%xmm1
|
|
psubd %xmm4,%xmm5
|
|
psrad $10,%xmm5
|
|
paddd %xmm4,%xmm1
|
|
paddd %xmm2,%xmm6
|
|
/* xmm5 = output row 7 of the first run */
packssdw %xmm10,%xmm5
|
|
movdqa -32(%rbp),%xmm10
|
|
paddd %xmm9,%xmm6
|
|
paddd %xmm9,%xmm2
|
|
psrad $10,%xmm15
|
|
psrad $10,%xmm1
|
|
psubd -16(%rbp),%xmm2
|
|
paddd %xmm3,%xmm10
|
|
paddd %xmm9,%xmm3
|
|
/* xmm1 = output row 0 */
packssdw %xmm15,%xmm1
|
|
paddd %xmm9,%xmm10
|
|
psubd -32(%rbp),%xmm3
|
|
movdqa %xmm10,%xmm4
|
|
psubd %xmm8,%xmm10
|
|
paddd %xmm8,%xmm4
|
|
psrad $10,%xmm10
|
|
movdqa %xmm4,%xmm15
|
|
movdqa %xmm6,%xmm4
|
|
psubd %xmm7,%xmm6
|
|
psrad $10,%xmm6
|
|
psrad $10,%xmm15
|
|
paddd %xmm7,%xmm4
|
|
movdqa %xmm3,%xmm7
|
|
psubd %xmm14,%xmm3
|
|
/* xmm6 = output row 6 */
packssdw %xmm10,%xmm6
|
|
psrad $10,%xmm3
|
|
psrad $10,%xmm4
|
|
paddd %xmm14,%xmm7
|
|
movdqa %xmm7,%xmm8
|
|
movdqa %xmm2,%xmm7
|
|
psubd %xmm13,%xmm2
|
|
paddd %xmm13,%xmm7
|
|
psrad $10,%xmm8
|
|
/* xmm4 = output row 1 */
packssdw %xmm15,%xmm4
|
|
psrad $10,%xmm7
|
|
psrad $10,%xmm2
|
|
/* xmm7 = output row 2 */
packssdw %xmm8,%xmm7
|
|
movdqa -80(%rbp),%xmm8
|
|
/* xmm2 = output row 5 */
packssdw %xmm3,%xmm2
|
|
paddd %xmm9,%xmm8
|
|
paddd -96(%rbp),%xmm9
|
|
psubd -48(%rbp),%xmm8
|
|
psubd -64(%rbp),%xmm9
|
|
movdqa %xmm8,%xmm3
|
|
movdqa %xmm9,%xmm10
|
|
psubd %xmm11,%xmm8
|
|
paddd %xmm12,%xmm10
|
|
paddd %xmm11,%xmm3
|
|
psrad $10,%xmm8
|
|
psrad $10,%xmm10
|
|
psrad $10,%xmm3
|
|
psubd %xmm12,%xmm9
|
|
psrad $10,%xmm9
|
|
/* xmm3 = output row 3 */
packssdw %xmm10,%xmm3
|
|
/* ---- 8x8 transpose of 16-bit elements: three rounds of
   punpcklwd/punpckhwd (the last pack of the first run is scheduled
   inside it) ---- */
movdqa %xmm1,%xmm10
|
|
/* xmm8 = output row 4. First-run rows now live in:
   row0 xmm1/xmm10, row1 xmm4, row2 xmm7, row3 xmm3,
   row4 xmm8, row5 xmm2, row6 xmm6, row7 xmm5. */
packssdw %xmm9,%xmm8
|
|
/* round 1: pairs (0,4) (1,5) (2,6) (3,7) */
movdqa %xmm7,%xmm9
|
|
punpckhwd %xmm6,%xmm7
|
|
punpcklwd %xmm6,%xmm9
|
|
punpcklwd %xmm8,%xmm10
|
|
punpckhwd %xmm8,%xmm1
|
|
movdqa %xmm3,%xmm6
|
|
movdqa %xmm4,%xmm8
|
|
punpckhwd %xmm5,%xmm3
|
|
punpcklwd %xmm5,%xmm6
|
|
punpcklwd %xmm2,%xmm8
|
|
movdqa %xmm3,%xmm5
|
|
punpckhwd %xmm2,%xmm4
|
|
/* round 2: pairs (0,2) (1,3) (4,6) (5,7) of the round-1 results */
movdqa %xmm8,%xmm3
|
|
movdqa %xmm10,%xmm2
|
|
punpckhwd %xmm6,%xmm8
|
|
punpcklwd %xmm6,%xmm3
|
|
punpcklwd %xmm9,%xmm2
|
|
movdqa %xmm8,%xmm6
|
|
movdqa %xmm4,%xmm8
|
|
punpckhwd %xmm9,%xmm10
|
|
punpcklwd %xmm5,%xmm8
|
|
punpckhwd %xmm5,%xmm4
|
|
/* round 3: pairs (0,1) (2,3) (4,5) (6,7) of the round-2 results */
movdqa %xmm2,%xmm5
|
|
punpcklwd %xmm3,%xmm5
|
|
punpckhwd %xmm3,%xmm2
|
|
movdqa %xmm1,%xmm15
|
|
movdqa %xmm10,%xmm3
|
|
punpckhwd %xmm7,%xmm1
|
|
punpckhwd %xmm6,%xmm10
|
|
punpcklwd %xmm6,%xmm3
|
|
movdqa %xmm1,%xmm6
|
|
punpckhwd %xmm4,%xmm1
|
|
punpcklwd %xmm4,%xmm6
|
|
movdqa %xmm3,%xmm4
|
|
punpcklwd %xmm7,%xmm15
|
|
/* ---- Second run: same network on the transposed rows. Its first
   interleaves (row2 with row6) start here while the remaining transpose
   interleaves for rows 4 and 5 (xmm7/xmm15 with xmm8) are still
   scheduled below. ---- */
punpcklwd %xmm6,%xmm4
|
|
punpckhwd %xmm6,%xmm3
|
|
movdqa %xmm15,%xmm7
|
|
movdqa %xmm4,%xmm6
|
|
punpcklwd %xmm8,%xmm7
|
|
movdqa %xmm3,%xmm11
|
|
movdqa %xmm4,%xmm12
|
|
movdqa %xmm3,%xmm4
|
|
/* xmm3 = row0 + row4, xmm5 = row0 - row4 (set a few lines below) */
movdqa %xmm5,%xmm3
|
|
paddw %xmm7,%xmm3
|
|
movdqa %xmm1,%xmm9
|
|
punpckhwd %xmm8,%xmm15
|
|
punpcklwd %xmm10,%xmm9
|
|
psubw %xmm7,%xmm5
|
|
movdqa %xmm15,%xmm7
|
|
movdqa %xmm9,%xmm14
|
|
punpcklwd %xmm2,%xmm7
|
|
movdqa %xmm1,%xmm8
|
|
pmaddwd .LC0(%rip),%xmm6
|
|
punpckhwd %xmm10,%xmm8
|
|
/* xmm10 = row3 + row5 */
paddw %xmm15,%xmm10
|
|
/* Spill slots are reused as in the first run: .LC0 products at -16/-32,
   .LC1 products at -48/-64. */
movaps %xmm6,-16(%rbp)
|
|
pmaddwd .LC1(%rip),%xmm4
|
|
/* xmm0 is still the zero vector from the first run; it is used for
   widening again and is overwritten by the last widening unpack. */
movdqa %xmm0,%xmm6
|
|
pmaddwd .LC0(%rip),%xmm11
|
|
pmaddwd .LC2(%rip),%xmm14
|
|
pmaddwd .LC1(%rip),%xmm12
|
|
pmaddwd .LC3(%rip),%xmm9
|
|
movaps %xmm4,-64(%rbp)
|
|
movdqa %xmm3,%xmm4
|
|
movdqa %xmm0,%xmm3
|
|
punpckhwd %xmm4,%xmm6
|
|
punpcklwd %xmm4,%xmm3
|
|
movdqa %xmm0,%xmm4
|
|
movaps %xmm11,-32(%rbp)
|
|
movdqa %xmm6,%xmm13
|
|
movdqa %xmm15,%xmm6
|
|
punpcklwd %xmm5,%xmm4
|
|
movaps %xmm12,-48(%rbp)
|
|
punpckhwd %xmm2,%xmm6
|
|
/* xmm2 = row1 + row7 */
paddw %xmm1,%xmm2
|
|
punpckhwd %xmm5,%xmm0
|
|
movdqa %xmm14,%xmm11
|
|
movdqa %xmm2,%xmm5
|
|
movdqa %xmm7,%xmm14
|
|
punpckhwd %xmm10,%xmm2
|
|
psrad $4,%xmm13
|
|
punpcklwd %xmm10,%xmm5
|
|
/* widened (row0+row4) hi lanes spilled to -80; the lo lanes stay in
   xmm3 and -96(%rbp) is not used in this run */
movaps %xmm13,-80(%rbp)
|
|
movdqa %xmm8,%xmm12
|
|
movdqa %xmm5,%xmm10
|
|
pmaddwd .LC4(%rip),%xmm14
|
|
pmaddwd .LC6(%rip),%xmm10
|
|
movdqa %xmm2,%xmm15
|
|
pmaddwd .LC7(%rip),%xmm5
|
|
pmaddwd .LC3(%rip),%xmm8
|
|
pmaddwd .LC5(%rip),%xmm7
|
|
movdqa %xmm14,%xmm13
|
|
movdqa %xmm6,%xmm14
|
|
paddd %xmm5,%xmm13
|
|
paddd %xmm5,%xmm9
|
|
pmaddwd .LC5(%rip),%xmm6
|
|
psrad $4,%xmm3
|
|
pmaddwd .LC6(%rip),%xmm15
|
|
paddd %xmm10,%xmm7
|
|
paddd %xmm10,%xmm11
|
|
psrad $4,%xmm4
|
|
pmaddwd .LC2(%rip),%xmm12
|
|
psrad $4,%xmm0
|
|
pmaddwd .LC4(%rip),%xmm14
|
|
pmaddwd .LC7(%rip),%xmm2
|
|
movdqa -80(%rbp),%xmm5
|
|
paddd %xmm15,%xmm12
|
|
paddd -64(%rbp),%xmm5
|
|
paddd %xmm2,%xmm14
|
|
paddd %xmm8,%xmm2
|
|
movdqa -48(%rbp),%xmm8
|
|
paddd %xmm6,%xmm15
|
|
/* xmm6 = .LC9, the constant added before every psrad $17 of this run */
movdqa .LC9(%rip),%xmm6
|
|
paddd %xmm3,%xmm8
|
|
paddd %xmm6,%xmm8
|
|
paddd %xmm6,%xmm5
|
|
/* Butterflies: (even term + .LC9) plus/minus odd term, arithmetic shift
   right by 17, packssdw to 16-bit rows. */
movdqa %xmm5,%xmm10
|
|
movdqa %xmm8,%xmm1
|
|
psubd %xmm15,%xmm5
|
|
psubd %xmm7,%xmm8
|
|
psrad $17,%xmm5
|
|
paddd %xmm7,%xmm1
|
|
movdqa -32(%rbp),%xmm7
|
|
psrad $17,%xmm8
|
|
paddd %xmm15,%xmm10
|
|
paddd %xmm6,%xmm3
|
|
/* xmm8 = output row 7 */
packssdw %xmm5,%xmm8
|
|
movdqa -16(%rbp),%xmm5
|
|
paddd %xmm0,%xmm7
|
|
paddd %xmm6,%xmm0
|
|
paddd %xmm6,%xmm7
|
|
psrad $17,%xmm10
|
|
psubd -32(%rbp),%xmm0
|
|
paddd %xmm4,%xmm5
|
|
psrad $17,%xmm1
|
|
movdqa %xmm7,%xmm15
|
|
paddd %xmm6,%xmm5
|
|
/* xmm1 = output row 0 */
packssdw %xmm10,%xmm1
|
|
psubd %xmm2,%xmm7
|
|
movdqa %xmm5,%xmm10
|
|
paddd %xmm6,%xmm4
|
|
psubd %xmm9,%xmm5
|
|
psubd -16(%rbp),%xmm4
|
|
psrad $17,%xmm7
|
|
paddd %xmm2,%xmm15
|
|
psrad $17,%xmm5
|
|
psubd -48(%rbp),%xmm3
|
|
paddd -80(%rbp),%xmm6
|
|
/* xmm5 = output row 6 */
packssdw %xmm7,%xmm5
|
|
movdqa %xmm4,%xmm2
|
|
movdqa %xmm0,%xmm7
|
|
psubd -64(%rbp),%xmm6
|
|
paddd %xmm14,%xmm7
|
|
psrad $17,%xmm15
|
|
paddd %xmm13,%xmm2
|
|
psubd %xmm14,%xmm0
|
|
psrad $17,%xmm7
|
|
psubd %xmm13,%xmm4
|
|
psrad $17,%xmm0
|
|
paddd %xmm9,%xmm10
|
|
psrad $17,%xmm2
|
|
psrad $17,%xmm4
|
|
/* packuswb saturates signed 16-bit values to unsigned bytes and puts
   two rows in one register: xmm5 = rows 6|7 */
packuswb %xmm8,%xmm5
|
|
/* xmm4 = output row 5 */
packssdw %xmm0,%xmm4
|
|
/* xmm2 = output row 2 */
packssdw %xmm7,%xmm2
|
|
movdqa %xmm3,%xmm0
|
|
movdqa %xmm6,%xmm7
|
|
psrad $17,%xmm10
|
|
paddd %xmm11,%xmm0
|
|
paddd %xmm12,%xmm7
|
|
psubd %xmm12,%xmm6
|
|
/* xmm10 = output row 1 */
packssdw %xmm15,%xmm10
|
|
psubd %xmm11,%xmm3
|
|
psrad $17,%xmm7
|
|
/* xmm1 = rows 0|1 as bytes */
packuswb %xmm10,%xmm1
|
|
psrad $17,%xmm0
|
|
psrad $17,%xmm6
|
|
psrad $17,%xmm3
|
|
/* xmm0 = output row 3, xmm3 = output row 4 */
packssdw %xmm7,%xmm0
|
|
packssdw %xmm6,%xmm3
|
|
/* xmm2 = rows 2|3 as bytes */
packuswb %xmm0,%xmm2
|
|
movdqa %xmm1,%xmm0
|
|
/* xmm3 = rows 4|5 as bytes */
packuswb %xmm4,%xmm3
|
|
movdqa %xmm2,%xmm4
|
|
/* ---- Byte transpose: three rounds of punpcklbw/punpckhbw over the
   four packed registers ---- */
punpckhbw %xmm5,%xmm2
|
|
punpcklbw %xmm3,%xmm0
|
|
punpcklbw %xmm5,%xmm4
|
|
punpckhbw %xmm3,%xmm1
|
|
movdqa %xmm2,%xmm3
|
|
movdqa %xmm0,%xmm2
|
|
movdqa %xmm1,%xmm5
|
|
punpcklbw %xmm4,%xmm2
|
|
punpckhbw %xmm4,%xmm0
|
|
punpcklbw %xmm3,%xmm5
|
|
movdqa %xmm2,%xmm4
|
|
punpckhbw %xmm5,%xmm2
|
|
punpckhbw %xmm3,%xmm1
|
|
punpcklbw %xmm5,%xmm4
|
|
movdqa %xmm0,%xmm3
|
|
punpckhbw %xmm1,%xmm0
|
|
/* ---- Store. Each register holds two output rows; movq writes the low
   8 bytes and pshufd $78 swaps the two 64-bit halves to expose the
   other row. %rax walks down the destination one stride at a time. ---- */
/* row 0 */
movq %xmm4,(%rdi)
|
|
pshufd $78,%xmm4,%xmm4
|
|
punpcklbw %xmm1,%xmm3
|
|
/* row 1 (%rax = %rdi + stride) */
movq %xmm4,(%rax)
|
|
add %rsi,%rax
|
|
/* row 2 */
movq %xmm2,(%rax)
|
|
add %rsi,%rax
|
|
pshufd $78,%xmm2,%xmm2
|
|
/* row 3 */
movq %xmm2,(%rax)
|
|
add %rsi,%rax
|
|
/* row 4 */
movq %xmm3,(%rax)
|
|
add %rsi,%rax
|
|
pshufd $78,%xmm3,%xmm3
|
|
/* row 5 (%rax = %rdi + 5*stride from here on) */
movq %xmm3,(%rax)
|
|
/* row 6 */
movq %xmm0,(%rax,%rsi)
|
|
pshufd $78,%xmm0,%xmm0
|
|
/* row 7 */
movq %xmm0,(%rax,%rsi,2)
|
|
/* release the spill area and restore %rbp */
leave
|
|
ret
|
|
.endfn stbi__idct_simd$sse,globl
|
|
|
|
/* Constants for stbi__idct_simd$sse. They are read with movdqa and as
 * legacy-SSE memory operands, which require 16-byte alignment.
 * NOTE(review): .rodata.cst16 is presumably a macro from libc/macros.h
 * that selects a 16-byte-aligned read-only section -- confirm.
 *
 * .LC0-.LC7 each repeat one pair of 16-bit multipliers (a,b) four times.
 * The function applies them with pmaddwd to two interleaved rows (x,y),
 * so every 32-bit lane becomes a*x + b*y.
 * NOTE(review): the values look like trigonometric factors scaled by
 * 4096 (e.g. 2217 ~ 0.5411961*4096), as in stb_image -- confirm. */
.rodata.cst16
|
|
/* .LC0/.LC1: applied to row2 interleaved with row6 (even part) */
.LC0: .value 2217,-5350,2217,-5350,2217,-5350,2217,-5350
|
|
.LC1: .value 5352,2217,5352,2217,5352,2217,5352,2217
|
|
/* .LC2/.LC3: applied to row7 interleaved with row3 */
.LC2: .value -6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
|
|
.LC3: .value -8034,4552,-8034,4552,-8034,4552,-8034,4552
|
|
/* .LC4/.LC5: applied to row5 interleaved with row1 */
.LC4: .value 6813,-1597,6813,-1597,6813,-1597,6813,-1597
|
|
.LC5: .value -1597,4552,-1597,4552,-1597,4552,-1597,4552
|
|
/* .LC6/.LC7: applied to (row1+row7) interleaved with (row3+row5) */
.LC6: .value 1131,4816,1131,4816,1131,4816,1131,4816
|
|
.LC7: .value 4816,-5681,4816,-5681,4816,-5681,4816,-5681
|
|
/* .LC8: added before each psrad $10 of the first run; 0x200 = 1<<9,
   i.e. half of 1<<10, so the shift rounds to nearest */
.LC8: .long 0x200,0x200,0x200,0x200
|
|
/* .LC9: added before each psrad $17 of the second run;
   0x1010000 = (1<<16) + (128<<17): half of 1<<17 for rounding, plus a
   term that contributes +128 to every result after the shift */
.LC9: .long 0x1010000,0x1010000,0x1010000,0x1010000
|