/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│ │vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi│ ╞══════════════════════════════════════════════════════════════════════════════╡ │ Copyright 2020 Justine Alexandra Roberts Tunney │ │ │ │ Permission to use, copy, modify, and/or distribute this software for │ │ any purpose with or without fee is hereby granted, provided that the │ │ above copyright notice and this permission notice appear in all copies. │ │ │ │ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │ │ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │ │ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │ │ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │ │ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │ │ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │ │ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │ │ PERFORMANCE OF THIS SOFTWARE. │ ╚─────────────────────────────────────────────────────────────────────────────*/ #include "libc/macros.h" / Computes inverse discrete cosine transform. 
/
/	@note	used to decode jpeg
/*
 * Register usage as established by the code below (arguments arrive in
 * the first three System V AMD64 integer argument registers):
 *
 *   %rdi  destination; eight rows of 8 bytes are stored, row k at
 *         %rdi + k * stride for k = 0..7
 *   %esi  row stride in bytes (sign-extended to 64 bits before use)
 *   %rdx  source; eight rows of eight signed 16-bit values, row k at
 *         16*k(%rdx); read with movdqa / SSE memory operands, so the
 *         block must be 16-byte aligned
 *
 * Nothing is returned in a register. %rax, %rsi and %xmm0-%xmm15 are
 * overwritten (%xmm6-%xmm15 are not saved, which the System V ABI
 * permits). 96 bytes of stack below %rbp hold spilled vectors; the
 * movaps spills need %rbp to be 16-byte aligned, which holds when the
 * caller follows the usual 16-byte stack alignment at the call site.
 *
 * Shape of the computation: two passes of the same arithmetic. Pass 1
 * rounds with .LC8 and shifts right by 10, keeping 16-bit results;
 * pass 2 rounds with .LC9 and shifts right by 17, then saturates to
 * unsigned bytes. Word interleaves sit between the passes and byte
 * interleaves sit before the stores.
 *
 * NOTE(review): the scheduling and constants look like compiler output
 * for the SSE2 path of stb_image's stbi__idct_simd() - confirm against
 * the C source before hand-editing any instruction.
 */
	.p2align 4
stbi__idct_simd$sse:
	push	%rbp
	mov	%rsp,%rbp
	/* widen the int stride; %rax = address of output row 1 */
	movslq	%esi,%rsi
	lea	(%rdi,%rsi),%rax
	/* six 16-byte spill slots: -16(%rbp) down to -96(%rbp) */
	sub	$96,%rsp
	/* pass 1: load input rows 2, 7, 3, 5 (rows 6, 1, 0, 4 follow) */
	movdqa	32(%rdx),%xmm0
	movdqa	112(%rdx),%xmm9
	movdqa	48(%rdx),%xmm1
	movdqa	80(%rdx),%xmm7
	movdqa	%xmm0,%xmm2
	/* pair rows 2 and 6 word by word so pmaddwd can apply a */
	/* two-coefficient rotation (.LC0 / .LC1) to each pair */
	punpcklwd	96(%rdx),%xmm2
	punpckhwd	96(%rdx),%xmm0
	movdqa	%xmm9,%xmm8
	movdqa	16(%rdx),%xmm5
	movdqa	%xmm2,%xmm3
	movdqa	%xmm2,%xmm6
	movdqa	%xmm0,%xmm2
	pmaddwd	.LC1(%rip),%xmm3
	movdqa	%xmm0,%xmm4
	pmaddwd	.LC1(%rip),%xmm2
	pmaddwd	.LC0(%rip),%xmm4
	punpckhwd	%xmm1,%xmm8
	pmaddwd	.LC0(%rip),%xmm6
	movaps	%xmm3,-48(%rbp)
	movdqa	(%rdx),%xmm3
	movaps	%xmm2,-64(%rbp)
	movdqa	64(%rdx),%xmm2
	movdqa	%xmm3,%xmm0
	movaps	%xmm4,-32(%rbp)
	/* row0 + row4 and row0 - row4 as 16-bit lanes */
	paddw	%xmm2,%xmm0
	psubw	%xmm2,%xmm3
	movaps	%xmm6,-16(%rbp)
	movdqa	%xmm0,%xmm4
	/* %xmm0 = 0: unpacking a word above a zero word and then */
	/* psrad $4 yields that word times 4096 as a signed dword */
	pxor	%xmm0,%xmm0
	movdqa	%xmm0,%xmm11
	movdqa	%xmm0,%xmm12
	movdqa	%xmm0,%xmm2
	punpcklwd	%xmm4,%xmm11
	punpckhwd	%xmm3,%xmm12
	punpcklwd	%xmm3,%xmm2
	movdqa	%xmm11,%xmm13
	movdqa	%xmm0,%xmm11
	movdqa	%xmm12,%xmm3
	punpckhwd	%xmm4,%xmm11
	movdqa	%xmm8,%xmm12
	movdqa	%xmm8,%xmm4
	movdqa	%xmm11,%xmm14
	movdqa	%xmm7,%xmm8
	movdqa	%xmm9,%xmm11
	punpckhwd	%xmm5,%xmm8
	psrad	$4,%xmm3
	punpcklwd	%xmm1,%xmm11
	psrad	$4,%xmm13
	psrad	$4,%xmm14
	movdqa	%xmm11,%xmm15
	movaps	%xmm13,-80(%rbp)
	movdqa	%xmm8,%xmm6
	/* row3 + row5 (and row1 + row7 just below) feed the */
	/* .LC6 / .LC7 rotation */
	paddw	%xmm7,%xmm1
	pmaddwd	.LC3(%rip),%xmm15
	movaps	%xmm14,-96(%rbp)
	movdqa	%xmm8,%xmm14
	movdqa	%xmm5,%xmm8
	pmaddwd	.LC2(%rip),%xmm11
	pmaddwd	.LC2(%rip),%xmm12
	paddw	%xmm9,%xmm8
	psrad	$4,%xmm2
	pmaddwd	.LC3(%rip),%xmm4
	pmaddwd	.LC5(%rip),%xmm6
	pmaddwd	.LC4(%rip),%xmm14
	movdqa	%xmm4,%xmm10
	movdqa	%xmm7,%xmm4
	movdqa	%xmm8,%xmm7
	punpcklwd	%xmm5,%xmm4
	punpcklwd	%xmm1,%xmm7
	punpckhwd	%xmm1,%xmm8
	movdqa	%xmm4,%xmm13
	movdqa	%xmm7,%xmm9
	pmaddwd	.LC5(%rip),%xmm4
	pmaddwd	.LC6(%rip),%xmm9
	movdqa	%xmm8,%xmm5
	movdqa	%xmm7,%xmm1
	pmaddwd	.LC7(%rip),%xmm8
	pmaddwd	.LC6(%rip),%xmm5
	movdqa	%xmm15,%xmm7
	paddd	%xmm9,%xmm11
	paddd	%xmm9,%xmm4
	/* %xmm9 = pass-1 rounding bias 0x200 (half of 1 << 10), */
	/* added to every value that is then shifted right by 10 */
	movdqa	.LC8(%rip),%xmm9
	paddd	%xmm8,%xmm14
	paddd	%xmm10,%xmm8
	movdqa	-96(%rbp),%xmm10
	paddd	-64(%rbp),%xmm10
	pmaddwd	.LC7(%rip),%xmm1
	pmaddwd	.LC4(%rip),%xmm13
	paddd	%xmm5,%xmm12
	paddd	%xmm5,%xmm6
	paddd	%xmm9,%xmm10
	movdqa	-80(%rbp),%xmm5
	paddd	-48(%rbp),%xmm5
	paddd	%xmm1,%xmm13
	paddd	%xmm1,%xmm7
	/* butterflies: each sum/difference pair is shifted right */
	/* by 10 and repacked to signed 16-bit with packssdw */
	movdqa	%xmm10,%xmm1
	psubd	%xmm6,%xmm10
	paddd	%xmm9,%xmm5
	paddd	%xmm6,%xmm1
	psrad	$10,%xmm10
	movdqa	-16(%rbp),%xmm6
	movdqa	%xmm1,%xmm15
	movdqa	%xmm5,%xmm1
	psubd	%xmm4,%xmm5
	psrad	$10,%xmm5
	paddd	%xmm4,%xmm1
	paddd	%xmm2,%xmm6
	packssdw	%xmm10,%xmm5
	movdqa	-32(%rbp),%xmm10
	paddd	%xmm9,%xmm6
	paddd	%xmm9,%xmm2
	psrad	$10,%xmm15
	psrad	$10,%xmm1
	psubd	-16(%rbp),%xmm2
	paddd	%xmm3,%xmm10
	paddd	%xmm9,%xmm3
	packssdw	%xmm15,%xmm1
	paddd	%xmm9,%xmm10
	psubd	-32(%rbp),%xmm3
	movdqa	%xmm10,%xmm4
	psubd	%xmm8,%xmm10
	paddd	%xmm8,%xmm4
	psrad	$10,%xmm10
	movdqa	%xmm4,%xmm15
	movdqa	%xmm6,%xmm4
	psubd	%xmm7,%xmm6
	psrad	$10,%xmm6
	psrad	$10,%xmm15
	paddd	%xmm7,%xmm4
	movdqa	%xmm3,%xmm7
	psubd	%xmm14,%xmm3
	packssdw	%xmm10,%xmm6
	psrad	$10,%xmm3
	psrad	$10,%xmm4
	paddd	%xmm14,%xmm7
	movdqa	%xmm7,%xmm8
	movdqa	%xmm2,%xmm7
	psubd	%xmm13,%xmm2
	paddd	%xmm13,%xmm7
	psrad	$10,%xmm8
	packssdw	%xmm15,%xmm4
	psrad	$10,%xmm7
	psrad	$10,%xmm2
	packssdw	%xmm8,%xmm7
	movdqa	-80(%rbp),%xmm8
	packssdw	%xmm3,%xmm2
	paddd	%xmm9,%xmm8
	paddd	-96(%rbp),%xmm9
	psubd	-48(%rbp),%xmm8
	psubd	-64(%rbp),%xmm9
	movdqa	%xmm8,%xmm3
	movdqa	%xmm9,%xmm10
	psubd	%xmm11,%xmm8
	paddd	%xmm12,%xmm10
	paddd	%xmm11,%xmm3
	psrad	$10,%xmm8
	psrad	$10,%xmm10
	psrad	$10,%xmm3
	psubd	%xmm12,%xmm9
	psrad	$10,%xmm9
	packssdw	%xmm10,%xmm3
	movdqa	%xmm1,%xmm10
	packssdw	%xmm9,%xmm8
	/* three rounds of word interleaves over the eight 16-bit */
	/* rows; NOTE(review): this is the usual 8x8 transpose */
	/* pattern, so pass 2 works along the other axis - confirm */
	movdqa	%xmm7,%xmm9
	punpckhwd	%xmm6,%xmm7
	punpcklwd	%xmm6,%xmm9
	punpcklwd	%xmm8,%xmm10
	punpckhwd	%xmm8,%xmm1
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm8
	punpckhwd	%xmm5,%xmm3
	punpcklwd	%xmm5,%xmm6
	punpcklwd	%xmm2,%xmm8
	movdqa	%xmm3,%xmm5
	punpckhwd	%xmm2,%xmm4
	movdqa	%xmm8,%xmm3
	movdqa	%xmm10,%xmm2
	punpckhwd	%xmm6,%xmm8
	punpcklwd	%xmm6,%xmm3
	punpcklwd	%xmm9,%xmm2
	movdqa	%xmm8,%xmm6
	movdqa	%xmm4,%xmm8
	punpckhwd	%xmm9,%xmm10
	punpcklwd	%xmm5,%xmm8
	punpckhwd	%xmm5,%xmm4
	movdqa	%xmm2,%xmm5
	punpcklwd	%xmm3,%xmm5
	punpckhwd	%xmm3,%xmm2
	movdqa	%xmm1,%xmm15
	movdqa	%xmm10,%xmm3
	punpckhwd	%xmm7,%xmm1
	punpckhwd	%xmm6,%xmm10
	punpcklwd	%xmm6,%xmm3
	movdqa	%xmm1,%xmm6
	punpckhwd	%xmm4,%xmm1
	punpcklwd	%xmm4,%xmm6
	movdqa	%xmm3,%xmm4
	punpcklwd	%xmm7,%xmm15
	punpcklwd	%xmm6,%xmm4
	punpckhwd	%xmm6,%xmm3
	movdqa	%xmm15,%xmm7
	movdqa	%xmm4,%xmm6
	punpcklwd	%xmm8,%xmm7
	movdqa	%xmm3,%xmm11
	movdqa	%xmm4,%xmm12
	movdqa	%xmm3,%xmm4
	/* from about here on, pass 2 arithmetic is interleaved */
	/* with the remaining word shuffles; it repeats the pass 1 */
	/* sequence with the same .LC0-.LC7 coefficients */
	movdqa	%xmm5,%xmm3
	paddw	%xmm7,%xmm3
	movdqa	%xmm1,%xmm9
	punpckhwd	%xmm8,%xmm15
	punpcklwd	%xmm10,%xmm9
	psubw	%xmm7,%xmm5
	movdqa	%xmm15,%xmm7
	movdqa	%xmm9,%xmm14
	punpcklwd	%xmm2,%xmm7
	movdqa	%xmm1,%xmm8
	pmaddwd	.LC0(%rip),%xmm6
	punpckhwd	%xmm10,%xmm8
	paddw	%xmm15,%xmm10
	movaps	%xmm6,-16(%rbp)
	pmaddwd	.LC1(%rip),%xmm4
	/* %xmm0 has not been written since the pxor in pass 1, */
	/* so it still supplies the zero words for widening */
	movdqa	%xmm0,%xmm6
	pmaddwd	.LC0(%rip),%xmm11
	pmaddwd	.LC2(%rip),%xmm14
	pmaddwd	.LC1(%rip),%xmm12
	pmaddwd	.LC3(%rip),%xmm9
	movaps	%xmm4,-64(%rbp)
	movdqa	%xmm3,%xmm4
	movdqa	%xmm0,%xmm3
	punpckhwd	%xmm4,%xmm6
	punpcklwd	%xmm4,%xmm3
	movdqa	%xmm0,%xmm4
	movaps	%xmm11,-32(%rbp)
	movdqa	%xmm6,%xmm13
	movdqa	%xmm15,%xmm6
	punpcklwd	%xmm5,%xmm4
	movaps	%xmm12,-48(%rbp)
	punpckhwd	%xmm2,%xmm6
	paddw	%xmm1,%xmm2
	/* last use of the zero register: %xmm0 becomes data here */
	punpckhwd	%xmm5,%xmm0
	movdqa	%xmm14,%xmm11
	movdqa	%xmm2,%xmm5
	movdqa	%xmm7,%xmm14
	punpckhwd	%xmm10,%xmm2
	psrad	$4,%xmm13
	punpcklwd	%xmm10,%xmm5
	movaps	%xmm13,-80(%rbp)
	movdqa	%xmm8,%xmm12
	movdqa	%xmm5,%xmm10
	pmaddwd	.LC4(%rip),%xmm14
	pmaddwd	.LC6(%rip),%xmm10
	movdqa	%xmm2,%xmm15
	pmaddwd	.LC7(%rip),%xmm5
	pmaddwd	.LC3(%rip),%xmm8
	pmaddwd	.LC5(%rip),%xmm7
	movdqa	%xmm14,%xmm13
	movdqa	%xmm6,%xmm14
	paddd	%xmm5,%xmm13
	paddd	%xmm5,%xmm9
	pmaddwd	.LC5(%rip),%xmm6
	psrad	$4,%xmm3
	pmaddwd	.LC6(%rip),%xmm15
	paddd	%xmm10,%xmm7
	paddd	%xmm10,%xmm11
	psrad	$4,%xmm4
	pmaddwd	.LC2(%rip),%xmm12
	psrad	$4,%xmm0
	pmaddwd	.LC4(%rip),%xmm14
	pmaddwd	.LC7(%rip),%xmm2
	movdqa	-80(%rbp),%xmm5
	paddd	%xmm15,%xmm12
	paddd	-64(%rbp),%xmm5
	paddd	%xmm2,%xmm14
	paddd	%xmm8,%xmm2
	movdqa	-48(%rbp),%xmm8
	paddd	%xmm6,%xmm15
	/* %xmm6 = pass-2 bias 0x1010000 = (1 << 16) + (128 << 17): */
	/* rounds the following psrad $17 and adds 128 to each */
	/* shifted result */
	movdqa	.LC9(%rip),%xmm6
	paddd	%xmm3,%xmm8
	paddd	%xmm6,%xmm8
	paddd	%xmm6,%xmm5
	movdqa	%xmm5,%xmm10
	movdqa	%xmm8,%xmm1
	psubd	%xmm15,%xmm5
	psubd	%xmm7,%xmm8
	psrad	$17,%xmm5
	paddd	%xmm7,%xmm1
	movdqa	-32(%rbp),%xmm7
	psrad	$17,%xmm8
	paddd	%xmm15,%xmm10
	paddd	%xmm6,%xmm3
	packssdw	%xmm5,%xmm8
	movdqa	-16(%rbp),%xmm5
	paddd	%xmm0,%xmm7
	paddd	%xmm6,%xmm0
	paddd	%xmm6,%xmm7
	psrad	$17,%xmm10
	psubd	-32(%rbp),%xmm0
	paddd	%xmm4,%xmm5
	psrad	$17,%xmm1
	movdqa	%xmm7,%xmm15
	paddd	%xmm6,%xmm5
	packssdw	%xmm10,%xmm1
	psubd	%xmm2,%xmm7
	movdqa	%xmm5,%xmm10
	paddd	%xmm6,%xmm4
	psubd	%xmm9,%xmm5
	psubd	-16(%rbp),%xmm4
	psrad	$17,%xmm7
	paddd	%xmm2,%xmm15
	psrad	$17,%xmm5
	psubd	-48(%rbp),%xmm3
	paddd	-80(%rbp),%xmm6
	packssdw	%xmm7,%xmm5
	movdqa	%xmm4,%xmm2
	movdqa	%xmm0,%xmm7
	psubd	-64(%rbp),%xmm6
	paddd	%xmm14,%xmm7
	psrad	$17,%xmm15
	paddd	%xmm13,%xmm2
	psubd	%xmm14,%xmm0
	psrad	$17,%xmm7
	psubd	%xmm13,%xmm4
	psrad	$17,%xmm0
	paddd	%xmm9,%xmm10
	psrad	$17,%xmm2
	psrad	$17,%xmm4
	/* packuswb saturates signed words to unsigned bytes; each */
	/* of the four packuswb results holds two 8-byte rows */
	packuswb	%xmm8,%xmm5
	packssdw	%xmm0,%xmm4
	packssdw	%xmm7,%xmm2
	movdqa	%xmm3,%xmm0
	movdqa	%xmm6,%xmm7
	psrad	$17,%xmm10
	paddd	%xmm11,%xmm0
	paddd	%xmm12,%xmm7
	psubd	%xmm12,%xmm6
	packssdw	%xmm15,%xmm10
	psubd	%xmm11,%xmm3
	psrad	$17,%xmm7
	packuswb	%xmm10,%xmm1
	psrad	$17,%xmm0
	psrad	$17,%xmm6
	psrad	$17,%xmm3
	packssdw	%xmm7,%xmm0
	packssdw	%xmm6,%xmm3
	packuswb	%xmm0,%xmm2
	movdqa	%xmm1,%xmm0
	packuswb	%xmm4,%xmm3
	/* three rounds of byte interleaves rearrange the 64 bytes */
	/* so that every 64-bit half of a register is one output */
	/* row (see the stores below) */
	movdqa	%xmm2,%xmm4
	punpckhbw	%xmm5,%xmm2
	punpcklbw	%xmm3,%xmm0
	punpcklbw	%xmm5,%xmm4
	punpckhbw	%xmm3,%xmm1
	movdqa	%xmm2,%xmm3
	movdqa	%xmm0,%xmm2
	movdqa	%xmm1,%xmm5
	punpcklbw	%xmm4,%xmm2
	punpckhbw	%xmm4,%xmm0
	punpcklbw	%xmm3,%xmm5
	movdqa	%xmm2,%xmm4
	punpckhbw	%xmm5,%xmm2
	punpckhbw	%xmm3,%xmm1
	punpcklbw	%xmm5,%xmm4
	movdqa	%xmm0,%xmm3
	punpckhbw	%xmm1,%xmm0
	/* store rows 0..7: movq writes the low 8 bytes, pshufd $78 */
	/* swaps the two 64-bit halves to expose the next row; %rax */
	/* starts at row 1 and advances by the stride */
	movq	%xmm4,(%rdi)
	pshufd	$78,%xmm4,%xmm4
	punpcklbw	%xmm1,%xmm3
	movq	%xmm4,(%rax)
	add	%rsi,%rax
	movq	%xmm2,(%rax)
	add	%rsi,%rax
	pshufd	$78,%xmm2,%xmm2
	movq	%xmm2,(%rax)
	add	%rsi,%rax
	movq	%xmm3,(%rax)
	add	%rsi,%rax
	pshufd	$78,%xmm3,%xmm3
	movq	%xmm3,(%rax)
	/* %rax is at row 5 here: rows 6 and 7 use indexed addressing */
	movq	%xmm0,(%rax,%rsi)
	pshufd	$78,%xmm0,%xmm0
	movq	%xmm0,(%rax,%rsi,2)
	/* leave releases the 96-byte scratch area and restores %rbp */
	leave
	ret
	.endfn	stbi__idct_simd$sse,globl

/*
 * Constant pool. Every entry is 16 bytes and is used directly as an SSE
 * memory operand (pmaddwd / movdqa), so each must stay 16-byte aligned.
 * NOTE(review): .rodata.cst16 is presumably a section macro from
 * libc/macros.h that provides this alignment - confirm.
 *
 * .LC0-.LC7 are coefficient pairs for pmaddwd, each pair repeated four
 * times so that one instruction handles four interleaved word pairs.
 * NOTE(review): the values look like 12-bit fixed point (for example
 * 2217 ~ 0.5412 * 4096 and 4816 ~ 1.1759 * 4096) - confirm against the
 * C source of the transform.
 */
	.rodata.cst16
.LC0:	.value	2217,-5350,2217,-5350,2217,-5350,2217,-5350
.LC1:	.value	5352,2217,5352,2217,5352,2217,5352,2217
.LC2:	.value	-6811,-8034,-6811,-8034,-6811,-8034,-6811,-8034
.LC3:	.value	-8034,4552,-8034,4552,-8034,4552,-8034,4552
.LC4:	.value	6813,-1597,6813,-1597,6813,-1597,6813,-1597
.LC5:	.value	-1597,4552,-1597,4552,-1597,4552,-1597,4552
.LC6:	.value	1131,4816,1131,4816,1131,4816,1131,4816
.LC7:	.value	4816,-5681,4816,-5681,4816,-5681,4816,-5681
/* pass-1 bias: 0x200 = 1 << 9, half of the 1 << 10 divisor */
.LC8:	.long	0x200,0x200,0x200,0x200
/* pass-2 bias: 0x1010000 = (1 << 16) + (128 << 17) */
.LC9:	.long	0x1010000,0x1010000,0x1010000,0x1010000