cosmopolitan/dsp/scale/cdecimate2xuint8x8.c

100 lines
4.5 KiB
C
Raw Normal View History

2020-06-15 14:18:57 +00:00
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2020 Justine Alexandra Roberts Tunney
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
*/
#include "libc/assert.h"
#include "libc/intrin/packuswb.h"
#include "libc/intrin/paddw.h"
#include "libc/intrin/palignr.h"
#include "libc/intrin/pmaddubsw.h"
#include "libc/intrin/psraw.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
#define TAPS 8
#define RATIO 2
#define OFFSET 3
#define STRIDE 8
#define SPREAD (STRIDE * RATIO + TAPS - OFFSET)
#define OVERLAP (SPREAD - STRIDE * RATIO)
#define LOOKBEHIND OFFSET
#define LOOKAHEAD (SPREAD - LOOKBEHIND)
#define SCALE 5
#define ROUND (1 << (SCALE - 1))
/**
* Performs 2D Motion Picture Convolution Acceleration by Leveraging SSSE3.
*
* @note H/T John Costella, Jean-Baptiste Joseph Fourier
* @note RIP Huixiang Chen
*/
void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
const signed char K[8]) {
short kRound[8] = {ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND, ROUND};
signed char kMadd1[16] = {K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1],
K[0], K[1], K[0], K[1], K[0], K[1], K[0], K[1]};
signed char kMadd2[16] = {K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3],
K[2], K[3], K[2], K[3], K[2], K[3], K[2], K[3]};
signed char kMadd3[16] = {K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5],
K[4], K[5], K[4], K[5], K[4], K[5], K[4], K[5]};
signed char kMadd4[16] = {K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7],
K[6], K[7], K[6], K[7], K[6], K[7], K[6], K[7]};
unsigned char in1[16], in2[16], in3[16], in4[32];
unsigned char bv0[16], bv1[16], bv2[16], bv3[16];
short wv0[8], wv1[8], wv2[8], wv3[8];
unsigned long i, j, v, w, o;
if (n >= STRIDE) {
i = 0;
w = (n + RATIO / 2) / RATIO;
memset(in1, A[0], sizeof(in1));
memset(in2, A[n - 1], 16);
memcpy(in2, A, MIN(16, n));
for (; i < w; i += STRIDE) {
j = i * RATIO + 16;
if (j + 16 <= n) {
memcpy(in3, &A[j], 16);
} else {
memset(in3, A[n - 1], 16);
if (j < n) {
memcpy(in3, &A[j], n - j);
}
}
palignr(bv0, in2, in1, 13);
palignr(bv1, in2, in1, 15);
palignr(bv2, in3, in2, 1);
palignr(bv3, in3, in2, 3);
pmaddubsw(wv0, bv0, kMadd1);
pmaddubsw(wv1, bv1, kMadd2);
pmaddubsw(wv2, bv2, kMadd3);
pmaddubsw(wv3, bv3, kMadd4);
paddw(wv0, wv0, kRound);
paddw(wv0, wv0, wv1);
paddw(wv0, wv0, wv2);
paddw(wv0, wv0, wv3);
psraw(wv0, wv0, SCALE);
packuswb(bv2, wv0, wv0);
memcpy(&A[i], bv2, STRIDE);
memcpy(in1, in2, 16);
memcpy(in2, in3, 16);
}
}
return A;
}