cosmopolitan/dsp/scale/cdecimate2xuint8x8.c

100 lines
4.5 KiB
C

/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2020 Justine Alexandra Roberts Tunney │
│ │
│ This program is free software; you can redistribute it and/or modify │
│ it under the terms of the GNU General Public License as published by │
│ the Free Software Foundation; version 2 of the License. │
│ │
│ This program is distributed in the hope that it will be useful, but │
│ WITHOUT ANY WARRANTY; without even the implied warranty of │
│ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU │
│ General Public License for more details. │
│ │
│ You should have received a copy of the GNU General Public License │
│ along with this program; if not, write to the Free Software │
│ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA │
│ 02110-1301 USA │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "libc/intrin/packuswb.h"
#include "libc/intrin/paddw.h"
#include "libc/intrin/palignr.h"
#include "libc/intrin/pmaddubsw.h"
#include "libc/intrin/psraw.h"
#include "libc/log/check.h"
#include "libc/log/log.h"
#include "libc/nexgen32e/x86feature.h"
#include "libc/str/str.h"
/* Kernel length; matches the K[8] parameter of cDecimate2xUint8x8().
   Only used here to derive SPREAD. */
#define TAPS 8
/* Decimation ratio: input samples consumed per output sample. */
#define RATIO 2
/* Only used here to derive SPREAD and LOOKBEHIND. The function's first
   window starts 3 samples before the output position (palignr shift
   13 = 16 - 3), which is presumably what this denotes. */
#define OFFSET 3
/* Output samples stored per loop iteration. */
#define STRIDE 8
/* Evaluates to 21 (8 * 2 + 8 - 3); not referenced by the code visible
   here. */
#define SPREAD (STRIDE * RATIO + TAPS - OFFSET)
/* Evaluates to 5 (21 - 16); not referenced by the code visible here. */
#define OVERLAP (SPREAD - STRIDE * RATIO)
/* Evaluates to 3; not referenced by the code visible here. */
#define LOOKBEHIND OFFSET
/* Evaluates to 18 (21 - 3); not referenced by the code visible here. */
#define LOOKAHEAD (SPREAD - LOOKBEHIND)
/* Right-shift count applied to each accumulated sum. */
#define SCALE 5
/* Bias added before the shift (half of 1 << SCALE, i.e. 16). */
#define ROUND (1 << (SCALE - 1))
/**
 * Halves a byte array in place using an 8-tap convolution kernel.
 *
 * Output sample m is built from the eight inputs A[2m-3] … A[2m+4],
 * weighted by K[0] … K[7], biased by ROUND, shifted right by SCALE bits
 * and packed back into a byte. Positions before A[0] read as A[0] and
 * positions past A[n-1] read as A[n-1]. Results overwrite the front of
 * A in groups of STRIDE bytes, so the final group may store up to seven
 * bytes beyond the (n + 1) / 2 meaningful outputs.
 *
 * Nothing is done when n is less than STRIDE.
 *
 * @param n is the number of samples in A
 * @param A is the sample buffer, which is read and overwritten in place
 * @param K holds the eight signed kernel weights
 * @return A
 * @note the arithmetic above assumes the libc/intrin helpers behave
 *     like the x86 instructions they are named after
 * @note H/T John Costella, Jean-Baptiste Joseph Fourier
 * @note RIP Huixiang Chen
 */
void *cDecimate2xUint8x8(unsigned long n, unsigned char A[n],
                         const signed char K[8]) {
  short bias[8];
  signed char pairs[4][16];
  unsigned char prev[16], cur[16], next[16];
  unsigned char win0[16], win1[16], win2[16], win3[16], packed[16];
  short sum[8], part1[8], part2[8], part3[8];
  unsigned long k, p, out, outputs, src;

  if (n < STRIDE) {
    return A;
  }

  /* Row p repeats the weight pair (K[2p], K[2p+1]) across all 16 lanes. */
  for (p = 0; p < 4; ++p) {
    for (k = 0; k < 16; k += 2) {
      pairs[p][k] = K[2 * p];
      pairs[p][k + 1] = K[2 * p + 1];
    }
  }
  for (k = 0; k < 8; ++k) {
    bias[k] = ROUND;
  }

  outputs = (n + RATIO / 2) / RATIO;

  /* Prime the sliding blocks: `prev` stands in for the samples before
     A[0], `cur` holds the first 16 samples padded with the last one. */
  memset(prev, A[0], sizeof(prev));
  memset(cur, A[n - 1], sizeof(cur));
  memcpy(cur, A, MIN(sizeof(cur), n));

  for (out = 0; out < outputs; out += STRIDE) {
    /* Fetch the 16 samples following `cur`, padding with A[n-1] once
       the block would run past the end of the input. */
    src = out * RATIO + sizeof(next);
    if (src + sizeof(next) > n) {
      memset(next, A[n - 1], sizeof(next));
      if (src < n) {
        memcpy(next, A + src, n - src);
      }
    } else {
      memcpy(next, A + src, sizeof(next));
    }

    /* Four byte windows beginning 3 and 1 samples before `cur`, and
       1 and 3 samples after its start. */
    palignr(win0, cur, prev, 13);
    palignr(win1, cur, prev, 15);
    palignr(win2, next, cur, 1);
    palignr(win3, next, cur, 3);

    /* Each multiply-add covers two adjacent taps for 8 outputs. */
    pmaddubsw(sum, win0, pairs[0]);
    pmaddubsw(part1, win1, pairs[1]);
    pmaddubsw(part2, win2, pairs[2]);
    pmaddubsw(part3, win3, pairs[3]);

    /* Accumulate bias and partial sums, then scale back down. */
    paddw(sum, sum, bias);
    paddw(sum, sum, part1);
    paddw(sum, sum, part2);
    paddw(sum, sum, part3);
    psraw(sum, sum, SCALE);

    /* Only the low STRIDE bytes of the packed result are stored. */
    packuswb(packed, sum, sum);
    memcpy(A + out, packed, STRIDE);

    /* Slide the blocks forward by 16 input samples. */
    memcpy(prev, cur, sizeof(prev));
    memcpy(cur, next, sizeof(cur));
  }

  return A;
}