cosmopolitan/dsp/core/kss8.h

44 lines
2.4 KiB
C

#ifndef COSMOPOLITAN_DSP_CORE_KSS8_H_
#define COSMOPOLITAN_DSP_CORE_KSS8_H_
#include "libc/limits.h"
#include "libc/macros.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
/**
* Performs 16-bit scaled rounded saturated madd w/ eight coefficients or fewer.
*
* (Σᵢ₌₀₋₈𝑘ᵢ𝑥ᵢ + 2ᵐ⁻¹)/2ᵐ
*
* @note compiler struggles with this
*/
#define KSS8(M, K1, K2, K3, K4, K5, K6, K7, K8, X1, X2, X3, X4, X5, X6, X7, \
X8) \
({ \
short x1, x2, x3, x4, x5, x6, x7, x8; \
x1 = X1, x2 = X2, x3 = X3, x4 = X4; \
x5 = X5, x6 = X6, x7 = X7, x8 = X8; \
x1 = MIN(SHRT_MAX, MAX(SHRT_MIN, x1 * K1)); \
x2 = MIN(SHRT_MAX, MAX(SHRT_MIN, x2 * K2)); \
x3 = MIN(SHRT_MAX, MAX(SHRT_MIN, x3 * K3)); \
x4 = MIN(SHRT_MAX, MAX(SHRT_MIN, x4 * K4)); \
x5 = MIN(SHRT_MAX, MAX(SHRT_MIN, x5 * K5)); \
x6 = MIN(SHRT_MAX, MAX(SHRT_MIN, x6 * K6)); \
x7 = MIN(SHRT_MAX, MAX(SHRT_MIN, x7 * K7)); \
x8 = MIN(SHRT_MAX, MAX(SHRT_MIN, x8 * K8)); \
x1 = MIN(SHRT_MAX, MAX(SHRT_MIN, x1 + x2)); \
x3 = MIN(SHRT_MAX, MAX(SHRT_MIN, x3 + x4)); \
x5 = MIN(SHRT_MAX, MAX(SHRT_MIN, x5 + x6)); \
x7 = MIN(SHRT_MAX, MAX(SHRT_MIN, x7 + x8)); \
x1 = MIN(SHRT_MAX, MAX(SHRT_MIN, x1 + x3)); \
x5 = MIN(SHRT_MAX, MAX(SHRT_MIN, x5 + x7)); \
x1 = MIN(SHRT_MAX, MAX(SHRT_MIN, x1 + x5)); \
if (M) { \
x1 += 1 << MAX(0, M - 1); \
x1 >>= M; \
} \
x1; \
})
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_DSP_CORE_KSS8_H_ */