cosmopolitan/third_party/stb/ycbcr-sse2.S

95 lines
3.7 KiB
ArmAsm
Raw Normal View History

/*-*- mode:unix-assembly; indent-tabs-mode:t; tab-width:8; coding:utf-8 -*-│
vi: set et ft=asm ts=8 tw=8 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify │
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License. │
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of │
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software │
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/macros.h"
//	Converts planar Y, Cb, Cr bytes into interleaved four-byte pixels
//	(R, G, B, 255) with SSE2, eight pixels per loop iteration.
//
//	Only whole groups of eight are handled: the loop runs while
//	i + 7 < count, so no input byte at index >= count is read. The
//	number of pixels converted is returned. Presumably the caller
//	finishes the remainder itself (verify against the call site).
//
//	Arithmetic is 16-bit fixed point with four fraction bits:
//
//	    y  = Y * 16 + 8		(the 8 is one half, for rounding)
//	    cb = (Cb - 128) * 256	(likewise cr)
//	    R  = (y + mulhi(cr, 5743)) >> 4
//	    G  = (y + mulhi(cb, -1410) + mulhi(cr, -2925)) >> 4
//	    B  = (y + mulhi(cb, 7258)) >> 4
//
//	where mulhi is the signed high half of a 16x16 product (pmulhw)
//	and each result is saturated to 0..255 by packuswb.
//
//	@param	rdi	output, 4 bytes stored per pixel, need not be aligned
//	@param	rsi	Y input bytes
//	@param	rdx	Cb input bytes
//	@param	rcx	Cr input bytes
//	@param	r8d	pixel count, compared as a signed 32-bit value
//	@return	eax	number of pixels converted, always a multiple of 8
//	@clobbers	r9d, xmm0-xmm10, flags
//	@note	local labels 2 through 7 are loaded with movdqa, so the
//		constant pool must be 16-byte aligned
//	@note	NOTE(review): r9 is overwritten without being read; if the
//		C prototype passes a sixth argument (a pixel stride in
//		stb_image) it is ignored and 4 is assumed - confirm
//	@note	.leafprologue, .profilable, .leafepilogue and .endfn are
//		macros, presumably from libc/macros.h (not shown here)
	.align	16
stbi__YCbCr_to_RGB_row$sse2:
	.leafprologue
	.profilable
	xor	%eax,%eax		# i = 0, also the result when count < 8
	cmp	$8,%r8d
	jl	1f			# signed, so negative counts are skipped
	movdqa	2f(%rip),%xmm2		# 0x80 in each of the low eight bytes
	movdqa	3f(%rip),%xmm8		# Cr to R multiplier
	movdqa	4f(%rip),%xmm9		# Cb to G multiplier
	movdqa	5f(%rip),%xmm10		# Cb to B multiplier
	movdqa	6f(%rip),%xmm4		# Cr to G multiplier
	movdqa	7f(%rip),%xmm5		# words of 255, source of the 4th byte
	.align	16
0:	movq	(%rsi,%rax),%xmm6	# eight Y bytes
	movq	(%rcx,%rax),%xmm7	# eight Cr bytes
	movq	(%rdx,%rax),%xmm1	# eight Cb bytes
	movdqa	%xmm2,%xmm0
	punpcklbw	%xmm6,%xmm0	# words = Y << 8 | 0x80
	pxor	%xmm2,%xmm7		# Cr - 128 as signed bytes
	pxor	%xmm6,%xmm6
	punpcklbw	%xmm7,%xmm6	# words = (Cr - 128) << 8
	pxor	%xmm2,%xmm1		# Cb - 128 as signed bytes
	pxor	%xmm3,%xmm3
	punpcklbw	%xmm1,%xmm3	# words = (Cb - 128) << 8
	psrlw	$4,%xmm0		# y = Y * 16 + 8
	movdqa	%xmm6,%xmm7
	pmulhw	%xmm8,%xmm7		# red offset from Cr
	movdqa	%xmm3,%xmm1
	pmulhw	%xmm9,%xmm1		# green offset from Cb
	pmulhw	%xmm10,%xmm3		# blue offset from Cb
	pmulhw	%xmm4,%xmm6		# green offset from Cr
	paddw	%xmm1,%xmm6		# both green offsets combined
	paddw	%xmm0,%xmm7		# R in 1/16 units
	paddw	%xmm0,%xmm3		# B in 1/16 units
	paddw	%xmm0,%xmm6		# G in 1/16 units
	psraw	$4,%xmm7
	psraw	$4,%xmm3
	packuswb	%xmm3,%xmm7	# R0..R7 then B0..B7, saturated
	psraw	$4,%xmm6
	packuswb	%xmm5,%xmm6	# G0..G7 then eight 255 bytes
	movdqa	%xmm7,%xmm0
	punpcklbw	%xmm6,%xmm0	# R0 G0 R1 G1 ... R7 G7
	punpckhbw	%xmm6,%xmm7	# B0 255 B1 255 ... B7 255
	movdqa	%xmm0,%xmm1
	punpcklwd	%xmm7,%xmm1	# pixels 0..3 as R G B 255
	punpckhwd	%xmm7,%xmm0	# pixels 4..7 as R G B 255
	movdqu	%xmm1,(%rdi,%rax,4)
	movdqu	%xmm0,16(%rdi,%rax,4)
	add	$8,%rax
	lea	7(%rax),%r9d
	cmp	%r8d,%r9d
	jl	0b			# loop while i + 7 < count
1:	.leafepilogue
	.endfn	stbi__YCbCr_to_RGB_row$sse2,globl
//	Constant pool for stbi__YCbCr_to_RGB_row$sse2. Every entry is 16
//	bytes and is reached from the code through its numeric local label.
//	Multipliers are scaled by 4096, negative ones are written signed.
	.rodata.cst16
2:	.fill	8,1,128			# 0x80 bias for the eight loaded bytes
	.fill	8,1,0			# upper eight bytes stay zero
3:	.fill	8,2,5743		# about  1.402 * 4096
4:	.fill	8,2,-1410		# about -0.344 * 4096, same bits as 64126
5:	.fill	8,2,7258		# about  1.772 * 4096
6:	.fill	8,2,-2925		# about -0.714 * 4096, same bits as 62611
7:	.fill	8,2,255			# packed into the fourth byte of each pixel
	.end
/ Alternative multipliers kept for reference. They follow the .end
/ directive, so the assembler does not process them.
/ These should be better but still need to be made to work.
/ NOTE(review): m presumably is the number of fraction bits, for example
/ 11485 / 8192 = 1.40198 and -11277 / 32768 = -0.34415. The code above
/ adds all four products at one common scale, which may be why mixed
/ scales do not work as drop-in replacements - confirm.
3: .short 11485,11485,11485,11485,11485,11485,11485,11485 # JR m=13 99.964387%
4: .short -11277,-11277,-11277,-11277,-11277,-11277,-11277,-11277 # JG m=15 99.935941%
5: .short 14516,14516,14516,14516,14516,14516,14516,14516 # JB m=13 99.947219%
6: .short -23401,-23401,-23401,-23401,-23401,-23401,-23401,-23401 # JG m=15 99.935941%
7: .short 255,255,255,255,255,255,255,255