/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8     -*-│
│vi: set et ft=asm ts=8 tw=8 fenc=utf-8                                     :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney                              │
│                                                                              │
│ This program is free software; you can redistribute it and/or modify         │
│ it under the terms of the GNU General Public License as published by         │
│ the Free Software Foundation; version 2 of the License.                      │
│                                                                              │
│ This program is distributed in the hope that it will be useful, but          │
│ WITHOUT ANY WARRANTY; without even the implied warranty of                   │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU             │
│ General Public License for more details.                                     │
│                                                                              │
│ You should have received a copy of the GNU General Public License            │
│ along with this program; if not, write to the Free Software                  │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA                │
│ 02110-1301 USA                                                               │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/macros.h"

/*
 * stbi__YCbCr_to_RGB_row$sse2
 *
 * Converts planar luma and chroma bytes into packed 4-byte pixels
 * laid out as R, G, B, 255.  Eight pixels are handled per loop
 * iteration with SSE2 integer arithmetic.
 *
 * Syntax: AT&T (GNU as), run through the C preprocessor.
 * The argument registers follow System V AMD64 order.
 * NOTE(review): the C prototype is not visible in this file; the
 * roles below are read off the instructions and constants.
 *
 * In:    rdi = output bytes, 4 per pixel (unaligned stores are used)
 *        rsi = luma plane, 1 byte per pixel
 *        rdx = chroma plane scaled by the blue coefficient (Cb)
 *        rcx = chroma plane scaled by the red coefficient (Cr)
 *        r8d = pixel count, compared as a signed 32-bit integer
 * Out:   eax = number of pixels converted.  Always a multiple of 8,
 *        and 0 when the count is below 8.  Pixels past that index
 *        are not touched here (presumably finished by the caller).
 * Clobb: rax, r9, xmm0 through xmm10, flags
 *
 * Fixed-point scheme (4 fractional bits):
 *   luma word   = ((y << 8) | 128) >> 4           = 16 y + 8
 *   chroma word = (c ^ 0x80) << 8, read as signed = 256 (c - 128)
 *   pmulhw by k keeps the high 16 bits            = k (c - 128) / 256
 * With k = coefficient times 4096 each product is 16 times the colour
 * contribution, so sum >> 4 yields the channel, and packuswb then
 * saturates it to 0..255.
 */
	.align	16
stbi__YCbCr_to_RGB_row$sse2:
	.leafprologue
	.profilable
	xor	%eax,%eax		# i = 0, doubles as the return value
	cmp	$8,%r8d
	jl	1f			# fewer than 8 pixels, nothing to do
	movdqa	2f(%rip),%xmm2		# 0x80 in each of the low 8 bytes
	movdqa	3f(%rip),%xmm8		# Cr to R coefficient
	movdqa	4f(%rip),%xmm9		# Cb to G coefficient
	movdqa	5f(%rip),%xmm10		# Cb to B coefficient
	movdqa	6f(%rip),%xmm4		# Cr to G coefficient
	movdqa	7f(%rip),%xmm5		# 255 in every word, becomes alpha
	.align	16
0:	movq	(%rsi,%rax),%xmm6	# 8 luma bytes
	movq	(%rcx,%rax),%xmm7	# 8 Cr bytes
	movq	(%rdx,%rax),%xmm1	# 8 Cb bytes
	movdqa	%xmm2,%xmm0
	punpcklbw %xmm6,%xmm0		# words = y << 8 | 128
	pxor	%xmm2,%xmm7		# flip top bit, Cr becomes signed
	pxor	%xmm6,%xmm6
	punpcklbw %xmm7,%xmm6		# xmm6 = signed Cr << 8
	pxor	%xmm2,%xmm1		# flip top bit, Cb becomes signed
	pxor	%xmm3,%xmm3
	punpcklbw %xmm1,%xmm3		# xmm3 = signed Cb << 8
	psrlw	$4,%xmm0		# xmm0 = 16 y + 8, the 8 rounds
	movdqa	%xmm6,%xmm7
	pmulhw	%xmm8,%xmm7		# xmm7 = red term from Cr
	movdqa	%xmm3,%xmm1
	pmulhw	%xmm9,%xmm1		# xmm1 = green term from Cb
	pmulhw	%xmm10,%xmm3		# xmm3 = blue term from Cb
	pmulhw	%xmm4,%xmm6		# xmm6 = green term from Cr
	paddw	%xmm1,%xmm6		# xmm6 = both green terms
	paddw	%xmm0,%xmm7		# R with 4 fraction bits
	paddw	%xmm0,%xmm3		# B with 4 fraction bits
	paddw	%xmm0,%xmm6		# G with 4 fraction bits
	psraw	$4,%xmm7
	psraw	$4,%xmm3
	packuswb %xmm3,%xmm7		# xmm7 = R0..R7 B0..B7, saturated
	psraw	$4,%xmm6
	packuswb %xmm5,%xmm6		# xmm6 = G0..G7 then eight 255s
	movdqa	%xmm7,%xmm0
	punpcklbw %xmm6,%xmm0		# xmm0 = R0 G0 R1 G1 .. R7 G7
	punpckhbw %xmm6,%xmm7		# xmm7 = B0 255 B1 255 .. B7 255
	movdqa	%xmm0,%xmm1
	punpcklwd %xmm7,%xmm1		# xmm1 = pixels 0..3 as R G B 255
	punpckhwd %xmm7,%xmm0		# xmm0 = pixels 4..7 as R G B 255
	movdqu	%xmm1,(%rdi,%rax,4)
	movdqu	%xmm0,16(%rdi,%rax,4)
	add	$8,%rax			# i += 8
	lea	7(%rax),%r9d
	cmp	%r8d,%r9d
	jl	0b			# loop while i + 7 < count
1:	.leafepilogue
	.endfn	stbi__YCbCr_to_RGB_row$sse2,globl

/*
 * Constants for the loop above.  Label 2 only needs its low 8 bytes,
 * since every use reads the low quadword.  Labels 3 to 6 are the
 * coefficients times 4096 as signed words; -1410 and -2925 assemble
 * to the same bits as the unsigned spellings 64126 and 62611.
 */
	.rodata.cst16
2:	.byte	128,128,128,128,128,128,128,128
	.zero	8
3:	.short	5743,5743,5743,5743,5743,5743,5743,5743
4:	.short	-1410,-1410,-1410,-1410,-1410,-1410,-1410,-1410
5:	.short	7258,7258,7258,7258,7258,7258,7258,7258
6:	.short	-2925,-2925,-2925,-2925,-2925,-2925,-2925,-2925
7:	.short	255,255,255,255,255,255,255,255

/*
 * The assembler stops reading at .end, so the alternative coefficient
 * table kept below is a note only and is never assembled.  It reuses
 * labels 3 to 7 and must replace the table above, not be added to it.
 */
	.end

/ These should be better but need to get them to work
3:	.short	11485,11485,11485,11485,11485,11485,11485,11485		# J′R m=13 99.964387%
4:	.short	-11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277	# J′G m=15 99.935941%
5:	.short	14516,14516,14516,14516,14516,14516,14516,14516		# J′B m=13 99.947219%
6:	.short	-23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401	# J′G m=15 99.935941%
7:	.short	255,255,255,255,255,255,255,255