cosmopolitan/third_party/stb/ycbcr-sse2.S

95 lines
3.7 KiB
ArmAsm
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify │
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License. │
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of │
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software │
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
.align 16
stbi__YCbCr_to_RGB_row$sse2:
.leafprologue
.profilable
xor %eax,%eax
cmp $8,%r8d
jl 1f
xor %eax,%eax
movdqa 2f(%rip),%xmm2
movdqa 3f(%rip),%xmm8
movdqa 4f(%rip),%xmm9
movdqa 5f(%rip),%xmm10
movdqa 6f(%rip),%xmm4
movdqa 7f(%rip),%xmm5
.align 16
0: movq (%rsi,%rax),%xmm6
movq (%rcx,%rax),%xmm7
movq (%rdx,%rax),%xmm1
movdqa %xmm2,%xmm0
punpcklbw %xmm6,%xmm0
pxor %xmm2,%xmm7
pxor %xmm6,%xmm6
punpcklbw %xmm7,%xmm6
pxor %xmm2,%xmm1
pxor %xmm3,%xmm3
punpcklbw %xmm1,%xmm3
psrlw $4,%xmm0
movdqa %xmm6,%xmm7
pmulhw %xmm8,%xmm7
movdqa %xmm3,%xmm1
pmulhw %xmm9,%xmm1
pmulhw %xmm10,%xmm3
pmulhw %xmm4,%xmm6
paddw %xmm1,%xmm6
paddw %xmm0,%xmm7
paddw %xmm0,%xmm3
paddw %xmm0,%xmm6
psraw $4,%xmm7
psraw $4,%xmm3
packuswb %xmm3,%xmm7
psraw $4,%xmm6
packuswb %xmm5,%xmm6
movdqa %xmm7,%xmm0
punpcklbw %xmm6,%xmm0
punpckhbw %xmm6,%xmm7
movdqa %xmm0,%xmm1
punpcklwd %xmm7,%xmm1
punpckhwd %xmm7,%xmm0
movdqu %xmm1,(%rdi,%rax,4)
movdqu %xmm0,16(%rdi,%rax,4)
add $8,%rax
lea 7(%rax),%r9d
cmp %r8d,%r9d
jl 0b
1: .leafepilogue
.endfn stbi__YCbCr_to_RGB_row$sse2,globl
.rodata.cst16
2: .byte 128,128,128,128,128,128,128,128
.zero 8
3: .short 5743,5743,5743,5743,5743,5743,5743,5743
4: .short 64126,64126,64126,64126,64126,64126,64126,64126
5: .short 7258,7258,7258,7258,7258,7258,7258,7258
6: .short 62611,62611,62611,62611,62611,62611,62611,62611
7: .short 255,255,255,255,255,255,255,255
.end
/ These should be better but need to get them to work
3: .short 11485,11485,11485,11485,11485,11485,11485,11485 # JR m=13 99.964387%
4: .short -11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277 # JG m=15 99.935941%
5: .short 14516,14516,14516,14516,14516,14516,14516,14516 # JB m=13 99.947219%
6: .short -23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401 # JG m=15 99.935941%
7: .short 255,255,255,255,255,255,255,255