#ifndef COSMOPOLITAN_TOOL_VIZ_LIB_CONVOLVE_H_
#define COSMOPOLITAN_TOOL_VIZ_LIB_CONVOLVE_H_
#include "libc/bits/xmmintrin.internal.h"
#include "libc/str/str.h"
#include "tool/viz/lib/graphic.h"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_

/**
 * Convolves image with a square kernel, in place.
 *
 * Each kernel coefficient is divided by C1 and the kernel is mirrored
 * along both axes before use. For every window position the products
 * of the image samples and the mirrored coefficients are accumulated,
 * with C2 added once per tap (i.e. KW*KW times per output sample), and
 * the sum is stored at the window's top-left corner offset by KW/2 in
 * both dimensions. Results are staged in a scratch buffer obtained via
 * resizegraphic() and the whole buffer is then copied back over img.
 *
 * NOTE(review): samples that no window writes take whatever
 * resizegraphic() left in the scratch buffer -- presumably zeros;
 * confirm against its implementation.
 *
 * NOTE(review): windows are evaluated for y < yn - KW and x < xn - KW,
 * so the last position at which the kernel still fits is skipped;
 * confirm this is intended before changing it.
 *
 * @param yn is number of image rows
 * @param xn is number of image columns
 * @param img is image, overwritten with the result
 * @param KW is kernel width and height; nothing is done if not positive
 * @param kernel holds KW x KW coefficients
 * @param C1 is divisor applied to every coefficient
 * @param C2 is bias added to the accumulator for every tap
 */
forceinline void convolve(unsigned yn, unsigned xn, __m128 img[yn][xn], int KW,
                          const float kernel[KW][KW], float C1, float C2) {
  /* TODO(jart): nontemporal herringbone strategy */
  if (KW <= 0) return; /* non-positive VLA bound below would be undefined */
  float f;
  struct Graphic g;
  unsigned y, x, i, j, kw, ylim, xlim;
  __v4sf p, kflip[KW][KW], (*tmp)[yn][xn];
  kw = KW;
  /* yn - KW is evaluated in unsigned arithmetic and would wrap around to
     a huge bound when the image is smaller than the kernel, so clamp */
  ylim = yn > kw ? yn - kw : 0;
  xlim = xn > kw ? xn - kw : 0;
  /* scale by 1/C1 and mirror the kernel, broadcasting to all four lanes */
  for (i = 0; i < kw; ++i) {
    for (j = 0; j < kw; ++j) {
      f = kernel[i][j] / C1;
      kflip[kw - i - 1][kw - j - 1] = (__v4sf){f, f, f, f};
    }
  }
  memset(&g, 0, sizeof(g));
  resizegraphic(&g, yn, xn);
  tmp = g.b.p;
  if (tmp) { /* skip the work if no scratch buffer was obtained */
    for (y = 0; y < ylim; ++y) {
      for (x = 0; x < xlim; ++x) {
        memset(&p, 0, sizeof(p));
        for (i = 0; i < kw; ++i) {
          for (j = 0; j < kw; ++j) {
            p += img[y + i][x + j] * kflip[i][j] + C2;
          }
        }
        /* result lands at the centre of the window */
        memcpy(&(*tmp)[y + kw / 2][x + kw / 2], &p, sizeof(p));
      }
    }
    /* size_t operand first so the byte count isn't computed in 32 bits */
    memcpy(img, tmp, sizeof(img[0][0]) * yn * xn);
  }
  bfree(&g.b);
}

COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_TOOL_VIZ_LIB_CONVOLVE_H_ */