cosmopolitan/third_party/avir/avir.h

/* clang-format off */
//$ nobt
//$ nocpp

/**
 * @file avir.h
 *
 * @brief The "main" inclusion file with all required classes and functions.
 *
 * This is the "main" inclusion file for the "AVIR" image resizer. This
 * inclusion file contains implementation of the AVIR image resizing algorithm
 * in its entirety. Also includes several classes and functions that can be
 * useful elsewhere.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * @mainpage
 *
 * @section intro_sec Introduction
 *
 * Description is available at https://github.com/avaneev/avir
 *
 * AVIR is devoted to women. Your digital photos can look good at any size!
 *
 * @section license License
 *
 * AVIR License Agreement
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Please credit the author of this library in your documentation in the
 * following way: "AVIR image resizing algorithm designed by Aleksey Vaneev"
 *
 * @version 2.4
 */

#ifndef AVIR_CIMAGERESIZER_INCLUDED
#define AVIR_CIMAGERESIZER_INCLUDED

#include "third_party/avir/notice.h"
#include "libc/bits/xmmintrin.internal.h"
#include "libc/str/str.h"
#include "libc/mem/mem.h"
#include "libc/bits/bits.h"
#include "libc/math.h"

namespace avir {

/**
 * AVIR version string.
 */

#define AVIR_VERSION "2.4"

/**
 * The "pi" constant, written with enough digits to fill a 53-bit floating
 * point mantissa. Per the original note, this macro is undefined at the end
 * of the file.
 */

#define AVIR_PI 3.1415926535897932

/**
 * The "pi divided by 2" constant, written with enough digits to fill a
 * 53-bit floating point mantissa. Per the original note, this macro is
 * undefined at the end of the file.
 */

#define AVIR_PId2 1.5707963267948966

/**
 * Rounds a value to the nearest integer by adding 0.5 to its magnitude and
 * truncating through an (int) typecast. The result is slightly biased, and
 * the function is not suitable for magnitudes of 2^31 and above.
 *
 * @param d Value to round.
 * @return Rounded value, of the same type as "d".
 */

template <class T>
inline T round(const T d) {
  const T half = (T)0.5;

  if (d < 0.0) {
    // Negative input: round the magnitude, then restore the sign.
    const int magnitude = (int)(half - d);
    return (-(T)magnitude);
  }

  const int nearest = (int)(d + half);
  return ((T)nearest);
}

/**
 * Limits ("clamps") a value to the closed range [minv; maxv]. The lower
 * bound is checked first.
 *
 * @param Value Value to clamp.
 * @param minv Minimal allowed value.
 * @param maxv Maximal allowed value.
 * @return "minv" if Value is below it, "maxv" if Value is above it, otherwise
 * Value itself.
 */

template <class T>
inline T clamp(const T& Value, const T minv, const T maxv) {
  if (Value < minv) {
    return (minv);
  }

  return (Value > maxv ? maxv : Value);
}

/**
 * Approximates x raised to the power 2.4, for sRGB gamma correction. The
 * approximation is a 4th-order polynomial combined with one rational term;
 * intermediate calculations are done in "double".
 *
 * @param x Argument, in the range 0.09 to 1.
 * @return Approximate value of x^2.4.
 */

template <class T>
inline T pow24_sRGB(const T x) {
  const double sq = x * x;
  const double cube = sq * x;
  const double quart = sq * sq;

  const double rational =
      0.0125559718896615 / (0.12758338921578 + 0.290283465468235 * x);

  // Terms are accumulated in a fixed order so that rounding stays the same.
  double acc = 0.0985766365536824 + 0.839474952656502 * sq;
  acc = acc + 0.363287814061725 * cube;
  acc = acc - rational;
  acc = acc - 0.231757513261358 * x;
  acc = acc - 0.0395365717969074 * quart;

  return ((T)acc);
}

/**
 * Approximates x raised to the power 1/2.4, for sRGB gamma correction. The
 * approximation combines x with its square root, 4th root and 8th root
 * (obtained via successive "sqrt" calls).
 *
 * @param x Argument, in the range 0.003 to 1.
 * @return Approximate value of x^(1/2.4).
 */

template <class T>
inline T pow24i_sRGB(const T x) {
  const double root2 = sqrt(x);
  const double root4 = sqrt(root2);
  const double root8 = sqrt(root4);

  const double bracket = 0.659628181609715 * root8 - 0.0380957908841466 -
                         0.0706476137208521 * root2;

  // Terms are accumulated in a fixed order so that rounding stays the same.
  double acc = 0.000213364515060263 + 0.0149409239419218 * x;
  acc = acc + 0.433973412731747 * root2;
  acc = acc + root4 * bracket;

  return ((T)acc);
}

/**
 * Approximately converts an sRGB gamma-encoded value to linear scale. Values
 * up to 0.04045 are divided by 12.92; larger values go through the
 * pow24_sRGB() approximation.
 *
 * @param s sRGB gamma value, in the range 0 to 1.
 * @return Linearized value, approximated.
 */

template <class T>
inline T convertSRGB2Lin(const T s) {
  const T offset = (T)0.055;
  const bool inLinearSegment = (s <= (T)0.04045);

  return (inLinearSegment ? s / (T)12.92
                          : pow24_sRGB((s + offset) / ((T)1 + offset)));
}

/**
 * Approximately converts a linear-scale value to its sRGB gamma-encoded
 * form. Values up to 0.0031308 are multiplied by 12.92; larger values go
 * through the pow24i_sRGB() approximation.
 *
 * @param s Linear gamma value, in the range 0 to 1.
 * @return sRGB gamma value, approximated.
 */

template <class T>
inline T convertLin2SRGB(const T s) {
  const T offset = (T)0.055;
  const bool inLinearSegment = (s <= (T)0.0031308);

  return (inLinearSegment ? (T)12.92 * s
                          : ((T)1 + offset) * pow24i_sRGB(s) - offset);
}

/**
 * Copies "l" elements from an array of type T1 into an array of type T2,
 * converting each element via typecast. Elements are processed in ascending
 * order, so when the two arrays overlap, "op" should be lower than "ip".
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer.
 * @param l The number of elements to copy. Nothing is done if l <= 0.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template <class T1, class T2>
inline void copyArray(const T1* ip, T2* op, int l, const int ipinc = 1,
                      const int opinc = 1) {
  for (; l > 0; l--, ip += ipinc, op += opinc) {
    *op = (T2)*ip;
  }
}

/**
 * Adds "l" elements of array "ip" to the corresponding elements of array
 * "op" (op[ i ] += ip[ i ]).
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer, accumulated in place.
 * @param l The number of elements to add. Nothing is done if l <= 0.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template <class T1, class T2>
inline void addArray(const T1* ip, T2* op, int l, const int ipinc = 1,
                     const int opinc = 1) {
  for (; l > 0; l--, ip += ipinc, op += opinc) {
    *op += *ip;
  }
}

/**
 * Writes the same short run of source elements into the destination buffer
 * "l" times in a row. Typically used to replicate pixels at the start or end
 * of an image's scanline. Run lengths 1 to 4 have dedicated unrolled paths;
 * any other length uses a generic inner loop.
 *
 * @param ip Source array.
 * @param ipl Source array length (usually 1..4, but can be any number).
 * @param[out] op Destination buffer.
 * @param l Number of times the source array should be replicated (the
 * destination buffer should be able to hold ipl * l number of elements).
 * @param opinc Destination buffer position increment after each replica.
 * This value should be equal to at least ipl.
 */

template <class T1, class T2>
inline void replicateArray(const T1* const ip, const int ipl, T2* op, int l,
                           const int opinc) {
  switch (ipl) {
    case 1:
      for (; l > 0; l--, op += opinc) {
        op[0] = ip[0];
      }
      break;

    case 2:
      for (; l > 0; l--, op += opinc) {
        op[0] = ip[0];
        op[1] = ip[1];
      }
      break;

    case 3:
      for (; l > 0; l--, op += opinc) {
        op[0] = ip[0];
        op[1] = ip[1];
        op[2] = ip[2];
      }
      break;

    case 4:
      for (; l > 0; l--, op += opinc) {
        op[0] = ip[0];
        op[1] = ip[1];
        op[2] = ip[2];
        op[3] = ip[3];
      }
      break;

    default:
      // Generic path for any other run length (including ipl <= 0, which
      // copies nothing).
      for (; l > 0; l--, op += opinc) {
        for (int k = 0; k < ipl; k++) {
          op[k] = ip[k];
        }
      }
      break;
  }
}

/**
 * Calculates the complex frequency response of a FIR filter at the given
 * circular frequency. Phase can be calculated as atan2( im, re ). Instead of
 * calling "cos" and "sin" for every tap, the function runs two recursive
 * oscillators that are seeded with a few initial "cos"/"sin" values.
 *
 * @param flt FIR filter's coefficients.
 * @param fltlen Number of coefficients (taps) in the filter.
 * @param th Circular frequency [0; pi].
 * @param[out] re0 Resulting real part of the complex frequency response.
 * @param[out] im0 Resulting imaginary part of the complex frequency response.
 * @param fltlat Filter's latency in samples (taps).
 */

template <class T>
inline void calcFIRFilterResponse(const T* flt, int fltlen, const double th,
                                  double& re0, double& im0,
                                  const int fltlat = 0) {
  const double sincr = 2.0 * cos(th);

  // Oscillator values for the first tap; exact constants at zero latency.
  double cosCur = 1.0;
  double sinCur = 0.0;

  if (fltlat != 0) {
    cosCur = cos(-fltlat * th);
    sinCur = sin(-fltlat * th);
  }

  // Oscillator values one step before the first tap.
  double cosPrev = cos(-(fltlat + 1) * th);
  double sinPrev = sin(-(fltlat + 1) * th);

  double sumRe = 0.0;
  double sumIm = 0.0;

  for (int k = 0; k < fltlen; k++) {
    sumRe += cosCur * flt[k];
    sumIm += sinCur * flt[k];

    // Advance both oscillators: next = sincr * current - previous.
    const double cosNext = sincr * cosCur - cosPrev;
    cosPrev = cosCur;
    cosCur = cosNext;

    const double sinNext = sincr * sinCur - sinPrev;
    sinPrev = sinCur;
    sinCur = sinNext;
  }

  re0 = sumRe;
  im0 = sumIm;
}

/**
 * Scales a FIR filter in place so that the sum of its coefficients (its
 * frequency response at DC) becomes equal to DCGain.
 *
 * @param[in,out] p Filter coefficients.
 * @param l Filter length.
 * @param DCGain Filter's gain at DC.
 * @param pstep "p" array step.
 */

template <class T>
inline void normalizeFIRFilter(T* const p, const int l, const double DCGain,
                               const int pstep = 1) {
  // First pass: accumulate the current DC gain.
  double sum = 0.0;

  for (int k = 0; k < l; k++) {
    sum += p[k * pstep];
  }

  // Second pass: apply the correction multiplier to every coefficient.
  const double scale = DCGain / sum;

  for (int k = 0; k < l; k++) {
    T& coeff = p[k * pstep];
    coeff = (T)(coeff * scale);
  }
}

/**
 * @brief Memory buffer class for element array storage, with capacity
 * tracking.
 *
 * Allows easier handling of memory blocks allocation and automatic
 * deallocation for arrays (buffers) consisting of elements of specified
 * class. Tracks buffer's capacity in "int" variable; unsuitable for
 * allocation of very large memory blocks (with more than 2 billion elements).
 *
 * This class manages memory space only - it does not perform element class
 * construction (initialization) operations. Buffer's required memory address
 * alignment specification is supported.
 *
 * Uses standard library to allocate and deallocate memory.
 *
 * @tparam T Buffer element's type.
 * @tparam capint Buffer capacity's type to use. Use size_t for large buffers.
 */

template <class T, typename capint = int>
class CBuffer {
 public:
  /**
   * Default constructor creates an empty buffer that owns no memory.
   */

  CBuffer() : Data(NULL), DataAligned(NULL), Capacity(0), Alignment(0) {}

  /**
   * Constructor creates the buffer with the specified capacity. Elements
   * are left uninitialized.
   *
   * @param aCapacity Buffer's capacity.
   * @param aAlignment Buffer's required memory address alignment,
   * power-of-2 values only. 0 - use stdlib's default alignment.
   */

  CBuffer(const capint aCapacity, const int aAlignment = 0) {
    allocinit(aCapacity, aAlignment);
  }

  /**
   * Copy constructor allocates a buffer with the capacity and alignment of
   * the source, and copies source's elements bytewise.
   *
   * @param Source Buffer to copy.
   */

  CBuffer(const CBuffer& Source) {
    allocinit(Source.Capacity, Source.Alignment);
    copyElements(Source.DataAligned, Capacity);
  }

  ~CBuffer() { freeData(); }

  /**
   * Assignment operator reallocates *this buffer with the capacity and
   * alignment of the source, and copies source's elements bytewise.
   * Assigning a buffer to itself leaves it unchanged.
   *
   * @param Source Buffer to copy.
   * @return Reference to *this.
   */

  CBuffer& operator=(const CBuffer& Source) {
    // Without this check the buffer would be freed before being read.
    if (this == &Source) {
      return (*this);
    }

    alloc(Source.Capacity, Source.Alignment);
    copyElements(Source.DataAligned, Capacity);
    return (*this);
  }

  /**
   * Function allocates memory so that the specified number of elements
   * can be stored in *this buffer object. Any previously allocated buffer
   * is freed first; its contents are not retained.
   *
   * @param aCapacity Storage for this number of elements to allocate.
   * @param aAlignment Buffer's required memory address alignment,
   * power-of-2 values only. 0 - use stdlib's default alignment.
   */

  void alloc(const capint aCapacity, const int aAlignment = 0) {
    freeData();
    allocinit(aCapacity, aAlignment);
  }

  /**
   * Function deallocates any previously allocated buffer and returns *this
   * object to the empty state.
   */

  void free() {
    freeData();
    Data = NULL;
    DataAligned = NULL;
    Capacity = 0;
    Alignment = 0;
  }

  /**
   * @return The capacity of the element buffer.
   */

  capint getCapacity() const { return (Capacity); }

  /**
   * Function "forces" *this buffer to have an arbitrary capacity. Calling
   * this function invalidates all further operations except deleting *this
   * object. This function should not be usually used at all. Function can
   * be used to "model" certain buffer capacity without calling a costly
   * memory allocation function.
   *
   * @param NewCapacity A new "forced" capacity.
   */

  void forceCapacity(const capint NewCapacity) { Capacity = NewCapacity; }

  /**
   * Function reallocates *this buffer to a larger size so that it will be
   * able to hold the specified number of elements. Downsizing is not
   * performed. Alignment is not changed.
   *
   * @param NewCapacity New (increased) capacity.
   * @param DoDataCopy "True" if data in the buffer should be retained.
   */

  void increaseCapacity(const capint NewCapacity,
                        const bool DoDataCopy = true) {
    if (NewCapacity < Capacity) {
      return;
    }

    if (DoDataCopy) {
      // The old block must stay alive until its contents are copied over.
      const capint PrevCapacity = Capacity;
      T* const PrevData = Data;
      T* const PrevDataAligned = DataAligned;

      allocinit(NewCapacity, Alignment);
      copyElements(PrevDataAligned, PrevCapacity);

      ::free(PrevData);
    } else {
      ::free(Data);
      allocinit(NewCapacity, Alignment);
    }
  }

  /**
   * Function "truncates" (reduces) capacity of the buffer without
   * reallocating it. Alignment is not changed. Nothing is done if
   * NewCapacity is not below the current capacity.
   *
   * @param NewCapacity New required capacity.
   */

  void truncateCapacity(const capint NewCapacity) {
    if (NewCapacity >= Capacity) {
      return;
    }

    Capacity = NewCapacity;
  }

  /**
   * Function increases capacity so that the specified number of
   * elements can be stored. This function increases the previous capacity
   * value by third the current capacity value (plus 1) until space for the
   * required number of elements is available. Existing data is retained.
   * Alignment is not changed.
   *
   * @param ReqCapacity Required capacity.
   */

  void updateCapacity(const capint ReqCapacity) {
    if (ReqCapacity <= Capacity) {
      return;
    }

    capint NewCapacity = Capacity;

    while (NewCapacity < ReqCapacity) {
      NewCapacity += NewCapacity / 3 + 1;
    }

    increaseCapacity(NewCapacity);
  }

  /**
   * @return Pointer to the first (address-aligned) element of the buffer.
   */

  operator T*() const { return (DataAligned); }

 private:
  T* Data;          ///< Element buffer pointer, as returned by malloc().
                    ///<
  T* DataAligned;   ///< Memory address-aligned element buffer pointer.
                    ///<
  capint Capacity;  ///< Element buffer capacity.
                    ///<
  int Alignment;    ///< Memory address alignment in use. 0 - use stdlib's
                    ///< default alignment.
                    ///<

  /**
   * Internal element buffer allocation function used during object
   * construction. Does not free any previously held buffer. The result of
   * malloc() is not checked.
   *
   * @param aCapacity Storage for this number of elements to allocate.
   * @param aAlignment Buffer's required memory address alignment. 0 - use
   * stdlib's default alignment.
   */

  void allocinit(const capint aCapacity, const int aAlignment) {
    if (aAlignment == 0) {
      Data = (T*)::malloc(aCapacity * sizeof(T));
      DataAligned = Data;
      Alignment = 0;
    } else {
      // Over-allocate by aAlignment bytes so that alignptr() can always move
      // the pointer forward to an aligned address inside the block.
      Data = (T*)::malloc(aCapacity * sizeof(T) + aAlignment);
      DataAligned = alignptr(Data, aAlignment);
      Alignment = aAlignment;
    }

    Capacity = aCapacity;
  }

  /**
   * Function copies the specified number of elements bytewise to the start
   * of the aligned buffer. Nothing is done for an empty copy, so memcpy()
   * is never called with a NULL pointer, which the C standard leaves
   * undefined even for a zero length.
   *
   * @param Src Source elements.
   * @param Count The number of elements to copy.
   */

  void copyElements(const T* const Src, const capint Count) {
    if (Count > 0) {
      memcpy(DataAligned, Src, Count * sizeof(T));
    }
  }

  /**
   * Function frees a previously allocated Data buffer. Member variables are
   * left unchanged.
   */

  void freeData() { ::free(Data); }

  /**
   * Function modifies the specified pointer so that it becomes memory
   * address-aligned.
   *
   * @param ptr Pointer to align.
   * @param align Alignment in bytes to apply.
   * @return Pointer aligned to align bytes. Works with power-of-2
   * alignments only. If no alignment is necessary, "align" bytes will be
   * added to the pointer value.
   */

  template <class Tp>
  inline Tp alignptr(const Tp ptr, const uintptr_t align) {
    return ((Tp)((uintptr_t)ptr + align - ((uintptr_t)ptr & (align - 1))));
  }
};

/**
 * Shortens a symmetric-odd FIR filter by dropping leading coefficients
 * whose absolute value is below the threshold, together with the same
 * number of trailing coefficients. The center tap (at index FltLatency) is
 * never removed.
 *
 * Synthetic test shows that filter gets optimized in 2..3% of cases and in
 * each such case optimization reduces filter length by 6..8%. Optimization,
 * however, may skew the results of algorithm modeling and complexity
 * calculation leading to a choice of a less optimal algorithm.
 *
 * @param[in,out] Flt Buffer that contains filter being optimized.
 * @param[in,out] FltLatency Variable that holds the current latency of the
 * filter. May be adjusted on function return.
 * @param Threshold Threshold level.
 */

template <class T>
inline void optimizeFIRFilter(CBuffer<T>& Flt, int& FltLatency,
                              T const Threshold = (T)0.00001) {
  // Count leading taps that can be dropped, stopping at the center tap.
  int lead = 0;

  while (lead < FltLatency && !(fabs(Flt[lead]) >= Threshold)) {
    lead++;
  }

  if (lead <= 0) {
    return;
  }

  // Remove "lead" taps from both ends to keep the filter symmetric.
  const int NewCapacity = Flt.getCapacity() - lead * 2;
  copyArray(&Flt[lead], &Flt[0], NewCapacity);
  Flt.truncateCapacity(NewCapacity);
  FltLatency -= lead;
}

/**
 * @brief Array of structured objects.
 *
 * Implements allocation of a linear array of objects of class T (which are
 * initialized), addressable via operator[]. Each object is created via the
 * "operator new". New object insertions are quick since implementation uses
 * prior space allocation (capacity), thus not requiring frequent memory block
 * reallocations.
 *
 * @tparam T Array element's type.
 */

template <class T>
class CStructArray {
 public:
  CStructArray() : ItemCount(0) {}

  /**
   * Copy constructor creates a copy of every item of the source array, via
   * T's copy constructor.
   *
   * @param Source Array to copy.
   */

  CStructArray(const CStructArray& Source)
      : ItemCount(0), Items(Source.getItemCount()) {
    while (ItemCount < Source.getItemCount()) {
      Items[ItemCount] = new T(Source[ItemCount]);
      ItemCount++;
    }
  }

  ~CStructArray() { clear(); }

  /**
   * Assignment operator destroys all items of *this array and then creates
   * a copy of every item of the source array. Assigning an array to itself
   * leaves it unchanged.
   *
   * @param Source Array to copy.
   * @return Reference to *this.
   */

  CStructArray& operator=(const CStructArray& Source) {
    // Without this check clear() would destroy the items about to be copied.
    if (this == &Source) {
      return (*this);
    }

    clear();

    const int NewCount = Source.ItemCount;
    Items.updateCapacity(NewCount);

    while (ItemCount < NewCount) {
      Items[ItemCount] = new T(Source[ItemCount]);
      ItemCount++;
    }

    return (*this);
  }

  /**
   * @param Index Item index. Not range-checked.
   * @return Reference to the item at the specified index.
   */

  T& operator[](const int Index) { return (*Items[Index]); }

  const T& operator[](const int Index) const { return (*Items[Index]); }

  /**
   * Function creates a new object of type T with the default constructor
   * and adds this object to the array.
   *
   * @return Reference to a newly added object.
   */

  T& add() {
    if (ItemCount == Items.getCapacity()) {
      // Grow the pointer buffer by about half to amortize reallocations.
      Items.increaseCapacity(ItemCount * 3 / 2 + 1);
    }

    Items[ItemCount] = new T();
    ItemCount++;

    return ((*this)[ItemCount - 1]);
  }

  /**
   * Function changes number of allocated items. New items are created with
   * the default constructor. If NewCount is below the current item count,
   * items that are above NewCount range will be destructed.
   *
   * @param NewCount New requested item count.
   */

  void setItemCount(const int NewCount) {
    if (NewCount > ItemCount) {
      Items.increaseCapacity(NewCount);

      while (ItemCount < NewCount) {
        Items[ItemCount] = new T();
        ItemCount++;
      }
    } else {
      while (ItemCount > NewCount) {
        ItemCount--;
        delete Items[ItemCount];
      }
    }
  }

  /**
   * Function erases all items of *this array. The pointer buffer's capacity
   * is not changed.
   */

  void clear() {
    while (ItemCount > 0) {
      ItemCount--;
      delete Items[ItemCount];
    }
  }

  /**
   * @return The number of allocated items.
   */

  int getItemCount() const { return (ItemCount); }

 private:
  int ItemCount;      ///< The number of items available in the array.
                      ///<
  CBuffer<T*> Items;  ///< Buffer of pointers to individually allocated
                      ///< items.
                      ///<
};

/**
 * @brief Sine signal generator class.
 *
 * Class implements sine signal generator without biasing, with
 * constructor-based initialization only. After construction, each value is
 * produced by a two-term recursive oscillator rather than by a call to the
 * "sin" function.
 */

class CSineGen {
 public:
  /**
   * Constructor initializes *this sine signal generator.
   *
   * @param si Sine function increment, in radians.
   * @param ph Starting phase, in radians. Add 0.5 * AVIR_PI for cosine
   * function.
   */

  CSineGen(const double si, const double ph)
      : svalue1(sin(ph)), svalue2(sin(ph - si)), sincr(2.0 * cos(si)) {}

  /**
   * @return The next value of the sine function, without biasing. The first
   * call returns the sine of the starting phase.
   */

  double generate() {
    const double current = svalue1;
    const double upcoming = sincr * current - svalue2;

    svalue2 = current;
    svalue1 = upcoming;

    return (current);
  }

 private:
  double svalue1;  ///< Current sine value.
                   ///<
  double svalue2;  ///< Previous sine value.
                   ///<
  double sincr;    ///< Oscillator multiplier, equals 2 * cos( si ).
                   ///<
};

/**
 * @brief Peaked Cosine window function generator class.
 *
 * Class implements Peaked Cosine window function generator. Generates the
 * right-handed half of the window function, one coefficient per call,
 * starting at the window's center. The Alpha parameter of this window
 * function offers the control of the balance between the early and later
 * taps of the filter. E.g. at Alpha=1 both early and later taps are
 * attenuated, but at Alpha=4 mostly later taps are attenuated. This offers a
 * great control over ringing artifacts produced by a low-pass filter in image
 * processing, without compromising achieved image sharpness.
 */

class CDSPWindowGenPeakedCosine {
 public:
  /**
   * Constructor initializes *this window function generator.
   *
   * @param aAlpha Alpha parameter, affects the peak shape (peak
   * augmentation) of the window function. Should be >= 1.0.
   * @param aLen2 Half filter's length (non-truncated).
   */

  CDSPWindowGenPeakedCosine(const double aAlpha, const double aLen2)
      : Alpha(aAlpha),
        Len2(aLen2),
        wn(0),
        w1(AVIR_PId2 / Len2, AVIR_PI * 0.5) {}

  /**
   * @return The next Peaked Cosine window function coefficient: the cosine
   * value multiplied by 1 - ( position / Len2 ) ^ Alpha.
   */

  double generate() {
    const double relpos = wn / Len2;
    wn++;

    const double peak = 1.0 - pow(relpos, Alpha);

    return (w1.generate() * peak);
  }

 private:
  double Alpha;  ///< Alpha parameter, affects the peak shape of window.
                 ///<
  double Len2;   ///< Half length of the window function.
                 ///<
  int wn;        ///< Window function integer position. 0 - center of the
                 ///< window function.
                 ///<
  CSineGen w1;   ///< Sine-wave generator, started at phase pi/2 so that it
                 ///< produces a cosine.
                 ///<
};

/**
 * @brief FIR filter-based equalizer generator.
 *
 * Class implements an object used to generate symmetric-odd FIR filters with
 * the specified frequency response (aka paragraphic equalizer). The
 * calculated filter is windowed by the Peaked Cosine window function.
 *
 * In image processing, due to short length of filters being used (6-8 taps)
 * the resulting frequency response of the filter is approximate and may be
 * mathematically imperfect, but still adequate to the visual requirements.
 *
 * On a side note, this equalizer generator can be successfully used for audio
 * signal equalization as well: for example, it is used in almost the same
 * form in Voxengo Marvel GEQ equalizer plug-in.
 *
 * Filter generation is based on decomposition of frequency range into
 * spectral bands, with each band represented by linear and ramp "kernels".
 * When the filter is built, these kernels are combined together with
 * different weights that approximate the required frequency response.
 */

class CDSPFIREQ {
 public:
  /**
   * Function initializes *this object with the required parameters. The
   * gain of frequencies beyond the MinFreq..MaxFreq range is controlled by
   * the first and the last band's gain.
   *
   * This function precomputes the per-band kernels (Kernels1, Kernels2) and
   * the normalized band frequencies (CenterFreqs) that buildFilter() later
   * combines. It must be called before buildFilter().
   *
   * Note that both band spacing formulas divide by (BandCount - 1), and the
   * logarithmic formula also divides by MinFreq.
   *
   * @param SampleRate Processing sample rate (use 2 for image processing).
   * @param aFilterLength Required filter length in samples (taps). The
   * actual filter length is truncated to an integer value.
   * @param aBandCount Number of band crossover points required to control,
   * including bands at MinFreq and MaxFreq.
   * @param MinFreq Minimal frequency that should be controlled.
   * @param MaxFreq Maximal frequency that should be controlled.
   * @param IsLogBands "True" if the bands should be spaced logarithmically.
   * @param WFAlpha Peaked Cosine window function's Alpha parameter.
   */

  void init(const double SampleRate, const double aFilterLength,
            const int aBandCount, const double MinFreq, const double MaxFreq,
            const bool IsLogBands, const double WFAlpha) {
    FilterLength = aFilterLength;
    BandCount = aBandCount;

    CenterFreqs.alloc(BandCount);

    // z - half length of the filter including the center tap; zi - "z"
    // rounded up to an even number, used as the per-band kernel stride.
    z = (int)ceil(FilterLength * 0.5);
    zi = z + (z & 1);
    z2 = z * 2;

    // Temporary buffers shared by all fillBandKernel() calls below.
    CBuffer<double> oscbuf(z2);
    initOscBuf(oscbuf);

    CBuffer<double> winbuf(z);
    initWinBuf(winbuf, WFAlpha);

    // Reserve one extra kernel slot when the band between 0 and MinFreq is
    // present; this covers the largest number of kernels written below.
    UseFirstVirtBand = (MinFreq > 0.0);
    const int k = zi * (BandCount + (UseFirstVirtBand ? 1 : 0));
    Kernels1.alloc(k);
    Kernels2.alloc(k);

    double m;   // Frequency step multiplier.
    double mo;  // Frequency step offset (addition).

    if (IsLogBands) {
      m = exp(log(MaxFreq / MinFreq) / (BandCount - 1));
      mo = 0.0;
    } else {
      m = 1.0;
      mo = (MaxFreq - MinFreq) / (BandCount - 1);
    }

    double f = MinFreq;
    double x1 = 0.0;
    double x2;
    int si;

    if (UseFirstVirtBand) {
      si = 0;
    } else {
      // MinFreq is 0: band 0 sits at DC and needs no kernel of its own, so
      // kernel generation starts from band 1.
      si = 1;
      CenterFreqs[0] = 0.0;
      f = f * m + mo;
    }

    double* kernbuf1 = &Kernels1[0];
    double* kernbuf2 = &Kernels2[0];
    int i;

    for (i = si; i < BandCount; i++) {
      // Normalize the band frequency by half the sample rate.
      x2 = f * 2.0 / SampleRate;
      CenterFreqs[i] = x2;

      fillBandKernel(x1, x2, kernbuf1, kernbuf2, oscbuf, winbuf);

      kernbuf1 += zi;
      kernbuf2 += zi;
      x1 = x2;
      f = f * m + mo;
    }

    if (x1 < 1.0) {
      // The last controlled band ends below the normalized frequency 1.0:
      // add a kernel for the remaining range.
      UseLastVirtBand = true;
      fillBandKernel(x1, 1.0, kernbuf1, kernbuf2, oscbuf, winbuf);
    } else {
      UseLastVirtBand = false;
    }
  }

  /**
   * @return Filter's length, in samples (taps).
   */

  int getFilterLength() const { return (z2 - 1); }

  /**
   * @return Filter's latency (group delay), in samples (taps).
   */

  int getFilterLatency() const { return (z - 1); }

  /**
   * Function creates symmetric-odd FIR filter with the specified gain
   * levels at band crossover points.
   *
   * The first "z" taps (up to and including the center tap at index z - 1)
   * are computed from the kernels prepared by init(); the remaining z - 1
   * taps are a mirrored copy of the first ones.
   *
   * @param BandGains Array of linear gain levels, count=BandCount specified
   * in the init() function.
   * @param[out] Filter Output filter buffer, length = getFilterLength().
   */

  void buildFilter(const double* const BandGains, double* const Filter) {
    const double* kernbuf1 = &Kernels1[0];
    const double* kernbuf2 = &Kernels2[0];
    double x1 = 0.0;
    double y1 = BandGains[0];
    double x2;
    double y2;

    int i;
    int si;

    if (UseFirstVirtBand) {
      // The band between 0 and the first frequency uses a flat gain equal
      // to the first band's gain.
      si = 1;
      x2 = CenterFreqs[0];
      y2 = y1;
    } else {
      si = 2;
      x2 = CenterFreqs[1];
      y2 = BandGains[1];
    }

    // The first kernel overwrites the output, the following ones accumulate.
    copyBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - x2 * y1);

    kernbuf1 += zi;
    kernbuf2 += zi;
    x1 = x2;
    y1 = y2;

    for (i = si; i < BandCount; i++) {
      x2 = CenterFreqs[i];
      y2 = BandGains[i];

      addBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - x2 * y1);

      kernbuf1 += zi;
      kernbuf2 += zi;
      x1 = x2;
      y1 = y2;
    }

    if (UseLastVirtBand) {
      // Same weights as above with the right corner frequency x2 = 1.0,
      // hence "x1 * y2 - y1"; y1 equals y2 at this point.
      addBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - y1);
    }

    // Mirror the first half around the center tap (index z - 1).
    for (i = 0; i < z - 1; i++) {
      Filter[z + i] = Filter[z - 2 - i];
    }
  }

  /**
   * Function calculates filter's length (in samples) and latency depending
   * on the required non-truncated filter length. Uses the same formulas as
   * getFilterLength() and getFilterLatency(), without requiring an
   * initialized object.
   *
   * @param aFilterLength Required filter length in samples (non-truncated).
   * @param[out] Latency Resulting latency (group delay) of the filter,
   * in samples (taps).
   * @return Filter length in samples (taps).
   */

  static int calcFilterLength(const double aFilterLength, int& Latency) {
    const int l = (int)ceil(aFilterLength * 0.5);
    Latency = l - 1;

    return (l * 2 - 1);
  }

 private:
  double FilterLength;  ///< Length of filter.
                        ///<
  int z;                ///< Equals (int) ceil( FilterLength * 0.5 ).
                        ///<
  int zi;  ///< Equals "z" if z is even, or z + 1 if z is odd. Used as a
           ///< Kernels1 and Kernels2 size multiplier and kernel buffer
           ///< increment to make sure each kernel buffer is 16-byte aligned.
           ///<
  int z2;  ///< Equals z * 2.
           ///<
  int BandCount;                ///< Number of controllable bands.
                                ///<
  CBuffer<double> CenterFreqs;  ///< Center frequencies for all bands,
                                ///< normalized to 0.0-1.0 range.
                                ///<
  CBuffer<double> Kernels1;     ///< Half-length kernel buffers for each
                                ///< spectral band (linear part).
                                ///<
  CBuffer<double> Kernels2;     ///< Half-length kernel buffers for each
                                ///< spectral band (ramp part).
                                ///<
  bool UseFirstVirtBand;        ///< "True" if the first virtual band
                          ///< (between 0.0 and MinFreq) should be used. The
                          ///< first virtual band won't be used if MinFreq
                          ///< equals 0.0.
                          ///<
  bool UseLastVirtBand;  ///< "True" if the last virtual band (between
                         ///< MaxFreq and SampleRate * 0.5) should be used. The
                         ///< last virtual band won't be used if MaxFreq * 2.0
                         ///< equals SampleRate.
                         ///<

  /**
   * Function initializes the "oscbuf" used in the fillBandKernel()
   * function. The buffer is filled with (0.0, 1.0) pairs, which are the
   * sine and cosine values for the zero frequency that the first band
   * starts from.
   *
   * @param oscbuf Oscillator buffer, length = z * 2.
   */

  void initOscBuf(double* oscbuf) const {
    int i = z;

    while (i > 0) {
      oscbuf[0] = 0.0;
      oscbuf[1] = 1.0;
      oscbuf += 2;
      i--;
    }
  }

  /**
   * Function initializes window function buffer. This function generates
   * Peaked Cosine window function. The generator yields coefficients from
   * the window's center outwards; they are stored in reverse so that the
   * center coefficient ends up at index z - 1.
   *
   * @param winbuf Windowing buffer, length = z.
   * @param Alpha Peaked Cosine alpha parameter.
   */

  void initWinBuf(double* winbuf, const double Alpha) const {
    CDSPWindowGenPeakedCosine wf(Alpha, FilterLength * 0.5);
    int i;

    for (i = 1; i <= z; i++) {
      winbuf[z - i] = wf.generate();
    }
  }

  /**
   * Function fills first half of symmetric-odd FIR kernel for the band.
   * This function should be called successively for adjacent bands.
   * Previous band's x2 should be equal to current band's x1. A band kernel
   * consists of 2 elements: linear kernel and ramp kernel.
   *
   * On entry "oscbuf" holds the sine/cosine pairs of the previous call's x2
   * (i.e. this call's x1); on return it holds the pairs for this call's x2.
   *
   * @param x1 Band's left corner frequency (0..1).
   * @param x2 Band's right corner frequency (0..1).
   * @param kernbuf1 Band kernel buffer 1 (linear part), length = z.
   * @param kernbuf2 Band kernel buffer 2 (ramp part), length = z.
   * @param oscbuf Oscillation buffer. Before the first call of the
   * fillBandKernel() should be initialized with the call of the
   * initOscBuf() function.
   * @param winbuf Buffer that contains windowing function.
   */

  void fillBandKernel(const double x1, const double x2, double* kernbuf1,
                      double* kernbuf2, double* oscbuf,
                      const double* const winbuf) {
    const double s2_incr = AVIR_PI * x2;
    const double s2_coeff = 2.0 * cos(s2_incr);

    // Seed the sine and cosine (sine shifted by pi / 2) oscillators of the
    // x2 frequency: the "current" values are for position -z + 1, and
    // oscbuf[ 0 ], oscbuf[ 1 ] receive the "previous" values for position -z.
    double s2_value1 = sin(s2_incr * (-z + 1));
    double c2_value1 = sin(s2_incr * (-z + 1) + AVIR_PI * 0.5);
    oscbuf[0] = sin(s2_incr * -z);
    oscbuf[1] = sin(s2_incr * -z + AVIR_PI * 0.5);

    int ks;

    for (ks = 1; ks < z; ks++) {
      // Read the x1 oscillator values left by the previous call before
      // overwriting them with the x2 values for the next call.
      const int ks2 = ks * 2;
      const double s1_value1 = oscbuf[ks2];
      const double c1_value1 = oscbuf[ks2 + 1];
      oscbuf[ks2] = s2_value1;
      oscbuf[ks2 + 1] = c2_value1;

      const double x = AVIR_PI * (ks - z);
      const double v0 = winbuf[ks - 1] / ((x1 - x2) * x);

      kernbuf1[ks - 1] =
          (x2 * s2_value1 - x1 * s1_value1 + (c2_value1 - c1_value1) / x) * v0;

      kernbuf2[ks - 1] = (s2_value1 - s1_value1) * v0;

      // Advance the x2 oscillators; the values stored at the previous
      // position serve as the "previous" terms of the recurrence.
      s2_value1 = s2_coeff * s2_value1 - oscbuf[ks2 - 2];
      c2_value1 = s2_coeff * c2_value1 - oscbuf[ks2 - 1];
    }

    // Center tap, where the general formula above would divide by x = 0.
    kernbuf1[z - 1] = (x2 * x2 - x1 * x1) / (x1 - x2) * 0.5;
    kernbuf2[z - 1] = -1.0;
  }

  /**
   * Function copies band kernel's elements to the output buffer, replacing
   * its first "z" values.
   *
   * @param outbuf Output buffer.
   * @param kernbuf1 Kernel buffer 1 (linear part).
   * @param kernbuf2 Kernel buffer 2 (ramp part).
   * @param c Multiplier for linear kernel element.
   * @param d Multiplier for ramp kernel element.
   */

  void copyBandKernel(double* outbuf, const double* const kernbuf1,
                      const double* const kernbuf2, const double c,
                      const double d) const {
    int ks;

    for (ks = 0; ks < z; ks++) {
      outbuf[ks] = c * kernbuf1[ks] + d * kernbuf2[ks];
    }
  }

  /**
   * Function adds band kernel's elements to the first "z" values of the
   * output buffer.
   *
   * @param outbuf Output buffer.
   * @param kernbuf1 Kernel buffer 1 (linear part).
   * @param kernbuf2 Kernel buffer 2 (ramp part).
   * @param c Multiplier for linear kernel element.
   * @param d Multiplier for ramp kernel element.
   */

  void addBandKernel(double* outbuf, const double* const kernbuf1,
                     const double* const kernbuf2, const double c,
                     const double d) const {
    int ks;

    for (ks = 0; ks < z; ks++) {
      outbuf[ks] += c * kernbuf1[ks] + d * kernbuf2[ks];
    }
  }
};

/**
 * @brief Low-pass filter windowed by Peaked Cosine window function.
 *
 * This class implements calculation of linear-phase symmetric-odd FIR
 * low-pass filter windowed by the Peaked Cosine window function, for image
 * processing applications.
 */

class CDSPPeakedCosineLPF {
 public:
  int fl2;        ///< Number of taps on each side of the central (peak) tap.
                  ///< This value can also be used as the filter's latency
                  ///< in samples (taps).
                  ///<
  int FilterLen;  ///< Filter's total length in samples (taps), equals
                  ///< fl2 * 2 + 1.
                  ///<

  /**
   * Constructor initializes *this object.
   *
   * @param aLen2 Half-length (non-truncated) of low-pass filter, in samples
   * (taps). The truncated half-length "fl2" is derived as ceil(aLen2) - 1.
   * @param aFreq2 Low-pass filter's corner frequency [0; pi].
   * @param aAlpha Peaked Cosine window function Alpha parameter.
   */

  CDSPPeakedCosineLPF(const double aLen2, const double aFreq2,
                      const double aAlpha)
      : fl2((int)ceil(aLen2) - 1),
        FilterLen(fl2 + fl2 + 1),
        Len2(aLen2),
        Freq2(aFreq2),
        Alpha(aAlpha) {}

  /**
   * Function generates a linear-phase low-pass filter windowed by Peaked
   * Cosine window function. The filter is written symmetrically around
   * its central tap, and is then scaled so that the sum of all stored taps
   * equals DCGain.
   *
   * @param[out] op Output buffer, length = FilterLen (fl2 * 2 + 1).
   * @param DCGain Required gain at DC. The resulting filter will be
   * normalized to achieve this DC gain.
   * @tparam T Output buffer's element type. The normalization sum is
   * accumulated from the values as stored in this type.
   */

  template <class T>
  void generateLPF(T* op, const double DCGain) {
    CDSPWindowGenPeakedCosine wf(Alpha, Len2);
    CSineGen f2(Freq2, 0.0);

    T* const mid = op + fl2;  // Central (peak) tap.

    // One oscillator sample is consumed and discarded: the central tap is
    // derived from Freq2 directly. (Presumably the discarded sample is the
    // zero-phase one - confirm against CSineGen.)

    f2.generate();

    mid[0] = (T)(Freq2 * wf.generate() / AVIR_PI);
    double s = mid[0];
    int t;

    for (t = 1; t <= fl2; t++) {
      const double v = f2.generate() * wf.generate() / t / AVIR_PI;

      mid[t] = (T)v;
      mid[-t] = (T)v;
      s += mid[t] + mid[-t];
    }

    // At this point "t - 1" taps were written on each side of the central
    // tap, so the filter starts "t - 1" elements before it.

    T* const first = mid - (t - 1);
    const double g = DCGain / s;

    for (t = 0; t < FilterLen; t++) {
      first[t] = (T)(first[t] * g);
    }
  }

 private:
  double Len2;   ///< Half-length (non-truncated) of low-pass filter, in
                 ///< samples (taps).
                 ///<
  double Freq2;  ///< Low-pass filter's corner frequency.
                 ///<
  double Alpha;  ///< Peaked Cosine window function Alpha parameter.
                 ///<
};

/**
 * @brief Buffer class for parametrized low-pass filter.
 *
 * This class extends the CBuffer< double > class by adding several variables
 * that define a symmetric-odd FIR low-pass filter windowed by Peaked Cosine
 * window function. This class can be used to compare filters without
 * comparing their buffer contents.
 */

class CFltBuffer : public CBuffer<double> {
 public:
  double Len2;    ///< Half-length (non-truncated) of low-pass filters, in
                  ///< samples (taps).
                  ///<
  double Freq;    ///< Low-pass filter's corner frequency.
                  ///<
  double Alpha;   ///< Peaked Cosine window function Alpha parameter.
                  ///<
  double DCGain;  ///< DC gain applied to the filter.
                  ///<

  /**
   * Constructor creates an empty buffer with all filter parameters set to
   * zero.
   */

  CFltBuffer()
      : CBuffer<double>(), Len2(0.0), Freq(0.0), Alpha(0.0), DCGain(0.0) {}

  /**
   * Operator compares the filter parameters only (Len2, Freq, Alpha and
   * DCGain); buffer contents are not examined.
   *
   * @param b2 Filter buffer to compare *this object to.
   * @return Operator returns "true" if both filters have same parameters.
   */

  bool operator==(const CFltBuffer& b2) const {
    if (Len2 != b2.Len2 || Freq != b2.Freq) {
      return (false);
    }

    return (Alpha == b2.Alpha && DCGain == b2.DCGain);
  }
};

/**
 * @brief Sinc function-based fractional delay filter bank.
 *
 * Class implements storage and initialization of a bank of sinc
 * function-based fractional delay filters, expressed as 1st order polynomial
 * interpolation coefficients. The filters are produced from a single "long"
 * windowed low-pass filter. Also supports 0th-order ("nearest neighbor")
 * interpolation.
 *
 * This class also supports multiplication of each fractional delay filter by
 * an external filter (usually a low-pass filter).
 *
 * @tparam fptype Specifies storage type of the filter coefficients bank. The
 * filters are initially calculated using the "double" precision.
 */

template <class fptype>
class CDSPFracFilterBankLin {
 public:
  // Puts every scalar member into a defined "not initialized yet" state, so
  // that comparing or querying a freshly constructed bank never reads an
  // indeterminate value. Order(-1) does not match the documented orders
  // (0 or 1), which makes the first init() call perform a full setup.
  CDSPFracFilterBankLin()
      : WFLen2(0.0),
        WFFreq(0.0),
        WFAlpha(0.0),
        FracCount(0),
        Order(-1),
        Alignment(0),
        SrcFilterLen(0),
        FilterLen(0),
        FilterSize(0),
        IsInitRequired(true),
        IsSrcTableBuilt(false) {}

  /**
   * Function copies a limited set of parameters of the source filter bank.
   * The actual filters are not copied. Such copying is used during the
   * filtering steps "modeling" stage. A further init() function call is
   * required.
   *
   * @param s Source filter bank.
   */

  void copyInitParams(const CDSPFracFilterBankLin& s) {
    // Window function parameters.

    WFLen2 = s.WFLen2;
    WFFreq = s.WFFreq;
    WFAlpha = s.WFAlpha;

    // Bank geometry.

    FracCount = s.FracCount;
    Order = s.Order;
    Alignment = s.Alignment;
    SrcFilterLen = s.SrcFilterLen;
    FilterLen = s.FilterLen;
    FilterSize = s.FilterSize;
    ExtFilter = s.ExtFilter;

    // The tables themselves are not copied, they have to be rebuilt.

    IsSrcTableBuilt = false;

    TableFillFlags.alloc(s.TableFillFlags.getCapacity());

    // Copy table fill flags, but shifted so that further initialization
    // is still possible (such feature should not be used, though): the
    // source's bits 0 and 1 land in bits 2 and 3, leaving bits 0 and 1
    // clear.

    const int FlagCount = TableFillFlags.getCapacity();
    int idx = 0;

    while (idx < FlagCount) {
      TableFillFlags[idx] = (uint8_t)(s.TableFillFlags[idx] << 2);
      idx++;
    }
  }

  /**
   * Operator compares *this filter bank and another filter bank and returns
   * "true" if their parameters are equal. Alignment is not taken into
   * account.
   *
   * @param s Filter bank to compare to.
   * @return "True" if compared banks have equal parameters.
   */

  bool operator==(const CDSPFracFilterBankLin& s) const {
    // Checked in the same sequence as a single "&&" chain would be: the
    // first mismatch stops the comparison.

    if (Order != s.Order) {
      return (false);
    }

    if (WFLen2 != s.WFLen2 || WFFreq != s.WFFreq || WFAlpha != s.WFAlpha) {
      return (false);
    }

    if (FracCount != s.FracCount) {
      return (false);
    }

    return (ExtFilter == s.ExtFilter);
  }

  /**
   * Function initializes (builds) the filter bank based on the supplied
   * parameters. If the supplied parameters are equal to previously defined
   * parameters, function does nothing (alignment is assumed to be never
   * changing between the init() function calls).
   *
   * @param ReqFracCount Required number of fractional delays in the filter
   * bank. The minimal value is 2.
   * @param ReqOrder Required order of the interpolation polynomial
   * (0 or 1).
   * @param BaseLen Low-pass filter's base length, in samples (taps).
   * Affects the actual length of the filter and its overall steepness.
   * @param Cutoff Low-pass filter's normalized cutoff frequency [0; 1].
   * @param aWFAlpha Peaked Cosine window function's Alpha parameter.
   * @param aExtFilter External filter to apply to each fractional delay
   * filter.
   * @param aAlignment Memory alignment of the filter bank, power-of-2
   * value. 0 - use default stdlib alignment.
   * @param FltLenAlign Filter's length alignment, power-of-2 value.
   */

  void init(const int ReqFracCount, const int ReqOrder, const double BaseLen,
            const double Cutoff, const double aWFAlpha,
            const CFltBuffer& aExtFilter, const int aAlignment = 0,
            const int FltLenAlign = 1) {
    double NewWFLen2 = 0.5 * BaseLen * ReqFracCount;
    double NewWFFreq = AVIR_PI * Cutoff / ReqFracCount;
    double NewWFAlpha = aWFAlpha;

    if (ReqOrder == Order && NewWFLen2 == WFLen2 && NewWFFreq == WFFreq &&
        NewWFAlpha == WFAlpha && ReqFracCount == FracCount &&
        aExtFilter == ExtFilter) {
      IsInitRequired = false;
      return;
    }

    WFLen2 = NewWFLen2;
    WFFreq = NewWFFreq;
    WFAlpha = NewWFAlpha;
    FracCount = ReqFracCount;
    Order = ReqOrder;
    Alignment = aAlignment;
    ExtFilter = aExtFilter;

    CDSPPeakedCosineLPF p(WFLen2, WFFreq, WFAlpha);
    SrcFilterLen = (p.fl2 / ReqFracCount + 1) * 2;

    const int ElementSize = ReqOrder + 1;
    FilterLen = SrcFilterLen;

    if (ExtFilter.getCapacity() > 0) {
      FilterLen += ExtFilter.getCapacity() - 1;
    }

    FilterLen = (FilterLen + FltLenAlign - 1) & ~(FltLenAlign - 1);
    FilterSize = FilterLen * ElementSize;
    IsSrcTableBuilt = false;
    IsInitRequired = true;
  }

  /**
   * @return The length of each fractional delay filter, in samples (taps).
   * Always an even value.
   */

  int getFilterLen() const {
    return FilterLen;  // As computed by the latest init() call.
  }

  /**
   * @return The number of fractional filters in use by *this bank.
   */

  int getFracCount() const {
    return FracCount;  // As requested in the latest init() call.
  }

  /**
   * @return The order of the interpolation polynomial.
   */

  int getOrder() const {
    return Order;  // -1 in a default-constructed bank.
  }

  /**
   * Function returns the pointer to the specified interpolation table
   * filter.
   *
   * @param i Filter (fractional delay) index, in the range 0 to
   * ReqFracCount - 1, inclusive.
   * @return Pointer to filter. Higher order polynomial coefficients are
   * stored after the previous order coefficients, separated by FilterLen
   * elements.
   */

  const fptype* getFilter(const int i) {
    // The source table (and the Table storage) is built on first use.

    if (!IsSrcTableBuilt) {
      buildSrcTable();
    }

    fptype* const Flt = &Table[i * FilterSize];

    // Bit 1 set means this filter was fully prepared already.

    if ((TableFillFlags[i] & 2) != 0) {
      return (Flt);
    }

    createFilter(i);
    TableFillFlags[i] |= 2;

    if (Order <= 0) {
      return (Flt);
    }

    // Create higher-order interpolation coefficients (linear
    // interpolation): the difference between the next filter and this one,
    // stored FilterLen elements after this filter's own coefficients.

    createFilter(i + 1);

    const fptype* const NextFlt = Flt + FilterSize;
    fptype* const Delta = Flt + FilterLen;

    for (int j = 0; j < FilterLen; j++) {
      Delta[j] = NextFlt[j] - Flt[j];
    }

    return (Flt);
  }

  /**
   * Function makes sure all fractional delay filters were created.
   */

  void createAllFilters() {
    // getFilter() creates whatever is still missing for the given index;
    // its return value is not needed here.

    int i = 0;

    while (i < FracCount) {
      getFilter(i);
      i++;
    }
  }

  /**
   * Function returns an approximate initialization complexity, expressed in
   * the number of multiply-add operations. This includes fractional delay
   * filters calculation and multiplication by an external filter. This
   * function can only be called after the init() function.
   *
   * @param FracUseMap Fractional delays use map, each element corresponds
   * to a single fractional delay, will be compared to the internal table
   * fill flags. This map should include 0 and 1 values only.
   * @return The complexity of the initialization, expressed in the number
   * of multiply-add operations.
   */

  int calcInitComplexity(const CBuffer<uint8_t>& FracUseMap) const {
    const int FltInitCost = 65;  // Cost to initialize a single sample
                                 // of the fractional delay filter.
    const int FltUseCost =
        FilterLen * Order +
        SrcFilterLen * ExtFilter.getCapacity();  // Cost to use a single
                                                 // fractional delay filter.
    int ic;
    int i;

    // In both branches a use map element is treated as a boolean (zero or
    // non-zero). It is never used as an array index, so an out-of-contract
    // value above 1 cannot cause an out-of-range read.

    if (IsInitRequired) {
      // The source table has to be built in full, plus the cost of every
      // filter that is marked as used.

      ic = FracCount * SrcFilterLen * FltInitCost;

      for (i = 0; i < FracCount; i++) {
        if (FracUseMap[i] != 0) {
          ic += FltUseCost;
        }
      }
    } else {
      // Only the used filters whose fill flags are all clear add cost.

      ic = 0;

      for (i = 0; i < FracCount; i++) {
        if (FracUseMap[i] != 0 && TableFillFlags[i] == 0) {
          ic += FltUseCost;
        }
      }
    }

    return (ic);
  }

 private:
  static const int InterpPoints = 2;  ///< The maximal number of points the
                                      ///< interpolation is based on.
                                      ///<
  double WFLen2;                      ///< Window function's Len2 parameter.
                                      ///<
  double WFFreq;                      ///< Window function's Freq parameter.
                                      ///<
  double WFAlpha;                     ///< Window function's Alpha parameter.
                                      ///<
  int FracCount;     ///< The required number of fractional delay filters.
                     ///<
  int Order;         ///< The order of the interpolation polynomial. Equals
                     ///< -1 in a default-constructed bank.
                     ///<
  int Alignment;     ///< The required filter table alignment.
                     ///<
  int SrcFilterLen;  ///< Length of the "source" filters. This is always an
                     ///< even value.
                     ///<
  int FilterLen;     ///< Specifies the number of samples (taps) each
                     ///< fractional delay filter has: SrcFilterLen, extended
                     ///< by ExtFilter's capacity minus 1 if ExtFilter is not
                     ///< empty, then rounded up as requested by the
                     ///< FltLenAlign.
                     ///<
  int FilterSize;       ///< The size of a single filter element, equals
                        ///< FilterLen * ElementSize, where ElementSize is
                        ///< Order + 1.
                        ///<
  bool IsInitRequired;  ///< "True" if SrcTable filter table initialization
                        ///< is required. This value is available only after
                        ///< the call to the init() function.
                        ///<
  CBuffer<fptype> Table;  ///< Interpolation table, allocated in the
                          ///< buildSrcTable() function with the size of
                          ///< (FracCount + 1) * FilterSize elements.
                          ///<
  CBuffer<uint8_t>
      TableFillFlags;    ///< Contains FracCount + 1 elements. Bit 0 of
                         ///< every element is 1 if Table already contains
                         ///< the filter from SrcTable filtered by ExtFilter
                         ///< (set in the createFilter() function). Bit 1 of
                         ///< every element is set in the getFilter()
                         ///< function once the filter was prepared there,
                         ///< including higher order coefficients if
                         ///< Order > 0.
                         ///<
  CFltBuffer ExtFilter;  ///< External filter that should be applied to every
                         ///< fractional delay filter. Can be empty. Half of
                         ///< this filter's capacity is used as latency (group
                         ///< delay) value of the filter.
                         ///<
  CBuffer<double> SrcTable;  ///< Source table of delay filters, contains
                             ///< FracCount + 1 filters of SrcFilterLen
                             ///< elements each. This table is used to fill
                             ///< the Table with the actual filters, filtered
                             ///< by an external filter.
                             ///<
  bool IsSrcTableBuilt;      ///< "True" if the SrcTable was built already.
                             ///< This variable is set to "false" in the
                             ///< init() and copyInitParams() functions.
                             ///<

  /**
   * Function builds source table used in the createFilter() function.
   */

  void buildSrcTable() {
    IsSrcTableBuilt = true;
    IsInitRequired = false;

    CDSPPeakedCosineLPF p(WFLen2, WFFreq, WFAlpha);

    const int BufLen = SrcFilterLen * FracCount + InterpPoints - 1;
    const int BufOffs = InterpPoints / 2 - 1;
    const int BufCenter = SrcFilterLen * FracCount / 2 + BufOffs;

    CBuffer<double> Buf(BufLen);
    memset(Buf, 0, (BufCenter - p.fl2) * sizeof(double));
    int i = BufLen - BufCenter - p.fl2 - 1;
    memset(&Buf[BufLen - i], 0, i * sizeof(double));

    p.generateLPF(&Buf[BufCenter - p.fl2], FracCount);

    SrcTable.alloc((FracCount + 1) * SrcFilterLen);
    TableFillFlags.alloc(FracCount + 1);
    int j;
    double* op0 = SrcTable;

    for (i = FracCount; i >= 0; i--) {
      TableFillFlags[i] = 0;
      double* p = Buf + BufOffs + i;

      for (j = 0; j < SrcFilterLen; j++) {
        op0[0] = p[0];
        op0++;
        p += FracCount;
      }
    }

    Table.alloc((FracCount + 1) * FilterSize, Alignment);
  }

  /**
   * Function creates the specified filter in the Table by copying it from
   * the SrcTable and filtering by ExtFilter. Function does nothing if
   * filter was already created.
   *
   * @param k Filter index to create, in the range 0 to FracCount,
   * inclusive.
   */

  void createFilter(const int k) {
    if (TableFillFlags[k] != 0) {
      return;
    }

    TableFillFlags[k] |= 1;
    const int ExtFilterLatency = ExtFilter.getCapacity() / 2;
    const int ResLatency = ExtFilterLatency + SrcFilterLen / 2;
    int ResLen = SrcFilterLen;

    if (ExtFilter.getCapacity() > 0) {
      ResLen += ExtFilter.getCapacity() - 1;
    }

    const int ResOffs = FilterLen / 2 - ResLatency;
    fptype* op = &Table[k * FilterSize];
    int i;

    for (i = 0; i < ResOffs; i++) {
      op[i] = 0.0;
    }

    for (i = ResOffs + ResLen; i < FilterLen; i++) {
      op[i] = 0.0;
    }

    op += ResOffs;
    const double* const srcflt = &SrcTable[k * SrcFilterLen];

    if (ExtFilter.getCapacity() == 0) {
      for (i = 0; i < ResLen; i++) {
        op[i] = (fptype)srcflt[i];
      }

      return;
    }

    // Perform convolution of extflt and srcflt.

    const double* const extflt = &ExtFilter[0];
    int j;

    for (j = 0; j < ResLen; j++) {
      int k = 0;
      int l = j - ExtFilter.getCapacity() + 1;
      int r = l + ExtFilter.getCapacity();

      if (l < 0) {
        k -= l;
        l = 0;
      }

      if (r > SrcFilterLen) {
        r = SrcFilterLen;
      }

      const double* const extfltb = extflt + k;
      const double* const srcfltb = srcflt + l;
      double s = 0.0;
      l = r - l;

      for (i = 0; i < l; i++) {
        s += extfltb[i] * srcfltb[i];
      }

      op[j] = (fptype)s;
    }
  }
};

/**
 * @brief Thread pool for multi-threaded image resizing operation.
 *
 * This base class is used to organize a multi-threaded image resizing
 * operation. The thread pool should consist of threads that initially wait
 * for a signal. Upon receiving a signal (via the startAllWorkloads()
 * function) each previously added thread should execute its workload's
 * process() function once, and return to the wait signal state again. The
 * thread pool should be also able to efficiently wait for all workloads to
 * finish via the waitAllWorkloadsToFinish() function.
 *
 * The image resizing algorithm makes calls to functions of this class.
 */

class CImageResizerThreadPool {
 public:
  CImageResizerThreadPool() {}

  virtual ~CImageResizerThreadPool() {}

  /**
   * @brief Thread pool's workload object class.
   *
   * Base class for the objects that perform the actual work spread over
   * several threads.
   */

  class CWorkload {
   public:
    virtual ~CWorkload() {}

    /**
     * Function is called from the thread when thread pool's
     * startAllWorkloads() function is called.
     */

    virtual void process() = 0;
  };

  /**
   * @return The suggested number of workloads (and their associated
   * threads) to add, 1 or more. An implementation may derive this value
   * from the number of physical and virtual cores present in the system,
   * and from other considerations. This base implementation returns 1.
   */

  virtual int getSuggestedWorkloadCount() const { return 1; }

  /**
   * Function adds a new workload (and possibly thread) to the thread pool.
   * The caller decides how many parallel workloads (and threads) it
   * requires, never exceeding the value returned by the
   * getSuggestedWorkloadCount() function. How many workloads are
   * associated with a single thread is implementation-specific, but for
   * efficiency reasons each workload should have its own thread.
   *
   * Workload objects are added only once: the same set of objects is
   * processed on every startAllWorkloads() function call. Between the
   * calls the caller changes the state of the workload objects.
   *
   * This base implementation does nothing.
   *
   * @param Workload Workload object whose process() function will be called
   * from within the thread when the startAllWorkloads() function is called.
   */

  virtual void addWorkload(CWorkload* const Workload) { (void)Workload; }

  /**
   * Function starts all workloads associated with threads previously added
   * via the addWorkload() function. An implementation is assumed to
   * perform the necessary "memory barrier" (or "cache sync") kind of
   * operation here, so that all threads catch up the prior changes made to
   * the workload objects during their wait state.
   *
   * This base implementation does nothing.
   */

  virtual void startAllWorkloads() {}

  /**
   * Function waits for all workloads to finish. This base implementation
   * does nothing.
   */

  virtual void waitAllWorkloadsToFinish() {}

  /**
   * Function removes all workloads previously added via the addWorkload()
   * function. This function gets called only after the
   * waitAllWorkloadsToFinish() function call. This base implementation
   * does nothing.
   */

  virtual void removeAllWorkloads() {}
};

/**
 * @brief Resizing algorithm parameters structure.
 *
 * This structure holds all selectable parameters used by the resizing
 * algorithm at various stages, for both downsizing and upsizing. No other
 * parameters exist that can optimize the performance of the resizing
 * algorithm. Filter length parameters can take fractional values.
 *
 * Besides quality, these parameters (except Alpha parameters) directly
 * affect the computational cost of the resizing algorithm. It is possible to
 * trade the visual quality for computational cost.
 *
 * Anti-alias filtering during downsizing can be defined as a considerable
 * reduction of contrast of smallest features of an image. Unfortunately, such
 * de-contrasting partially affects features of all sizes thus producing a
 * non-linearity of frequency response. All pre-defined parameter sets are
 * described by 3 values separated by slashes. The first value is the
 * de-contrasting factor of small features (which are being removed) while
 * the second value is the de-contrasting factor of large features (which
 * should remain intact), with value of 1 equating to "no contrast change".
 * The third value is the optimization score (see below), with value of 0
 * equating to the "perfect" linearity of frequency response.
 *
 * The pre-defined parameter sets offered by this library were auto-optimized
 * for the given LPFltBaseLen, IntFltLen and CorrFltAlpha values. The
 * optimization goal was to minimize the score: the sum of squares of the
 * difference between original and processed images (which was not actually
 * resized, k=1). The original image was a 0.5 megapixel uniformly-distributed
 * white-noise image with pixel intensities in the 0-1 range. Such goal
 * converges very well and produces filtering system with the flattest
 * frequency response possible for the given constraints. With this goal,
 * increasing the LPFltBaseLen value reduces the general amount of aliasing
 * artifacts.
 */

struct CImageResizerParams {
  double CorrFltAlpha;  ///< Alpha parameter of the Peaked Cosine window
                        ///< function used on the correction filter. The
                        ///< "usable" values are in the narrow range 1.0 to
                        ///< 1.5.
                        ///<
  double CorrFltLen;    ///< Correction filter's length in samples (taps). The
                        ///< "usable" range is narrow, 5.5 to 8, as to
                        ///< minimize the "overcorrection" which is
                        ///< mathematically precise, but visually
                        ///< unacceptable.
                        ///<
  double IntFltAlpha;   ///< Alpha parameter of the Peaked Cosine window
                        ///< function used on the interpolation low-pass
                        ///< filter. The "usable" values are in the range 1.5
                        ///< to 2.5.
                        ///<
  double IntFltCutoff;  ///< Interpolation low-pass filter's cutoff frequency
                        ///< (normalized, [0; 1]). The "usable" range is 0.6
                        ///< to 0.8.
                        ///<
  double IntFltLen;     ///< Interpolation low-pass filter's length in samples
                        ///< (taps). The length value should be at least 18
                        ///< or otherwise a "dark grid" artifact will be
                        ///< introduced if a further sharpening is applied.
                        ///< IntFltLen together with other IntFlt parameters
                        ///< should be tuned in a way that produces the
                        ///< flattest frequency response in 0-0.5 normalized
                        ///< frequency range (this range is due to 2X
                        ///< upsampling).
                        ///<
  double LPFltAlpha;    ///< Alpha parameter of the Peaked Cosine window
                        ///< function used on the low-pass filter. The
                        ///< "usable" values are in the range 1.5 to 6.5.
                        ///<
  double LPFltBaseLen;  ///< Base length of the low-pass (aka anti-aliasing
                        ///< or reconstruction) filter, in samples (taps),
                        ///< further adjusted by the actual cutoff frequency,
                        ///< upsampling and downsampling factors. The "usable"
                        ///< range is between 6 and 9.
                        ///<
  double LPFltCutoffMult;  ///< Low-pass filter's cutoff frequency
                           ///< multiplier. This value can be both below and
                           ///< above 1.0 as low-pass filters are inserted on
                           ///< downsampling and upsampling steps and always
                           ///< have corner frequency equal to or below 0.5pi.
                           ///< This multiplier shifts low-pass filter's corner
                           ///< frequency towards lower (if below 1.0) or higher
                           ///< (if above 1.0) frequencies. This multiplier can
                           ///< be way below 1.0 since any additional
                           ///< high-frequency damping will be partially
                           ///< corrected by the correction filter. The "usable"
                           ///< range is 0.3 to 1.0.
                           ///<

  /**
   * Constructor assigns the internal half-band filter parameters, and
   * gives every selectable parameter a defined starting value so that a
   * directly constructed object never holds indeterminate values. The
   * starting values repeat the ones assigned by the
   * CImageResizerParamsDef structure; the derived parameter sets overwrite
   * them in their own constructors.
   */

  CImageResizerParams()
      : CorrFltAlpha(1.0),
        CorrFltLen(6.30770),
        IntFltAlpha(2.27825),
        IntFltCutoff(0.75493),
        IntFltLen(18.0),
        LPFltAlpha(3.40127),
        LPFltBaseLen(7.78),
        LPFltCutoffMult(0.78797),
        HBFltAlpha(1.75395),
        HBFltCutoff(0.40356),
        HBFltLen(22.00000) {}

  double HBFltAlpha;   ///< Half-band filter's Alpha. Assigned internally.
                       ///<
  double HBFltCutoff;  ///< Half-band filter's cutoff point [0; 1]. Assigned
                       ///< internally.
                       ///<
  double HBFltLen;     ///< Length of the half-band low-pass filter. Assigned
                       ///< internally. Internally used to perform 2X or
                       ///< higher downsampling. These filter parameters
                       ///< should be treated as "technical" and do not
                       ///< require adjustment as they were tuned to suit all
                       ///< combinations of other parameters. This half-band
                       ///< filter provides a wide transition band (for
                       ///< minimal ringing artifacts) and a high stop-band
                       ///< attenuation (for minimal aliasing).
                       ///<
};

/**
 * @brief The default set of resizing algorithm parameters
 * (10.01/1.029/0.019169).
 *
 * This is the default set of resizing parameters that was designed to deliver
 * a sharp image while still providing a low amount of ringing artifacts, and
 * having a reasonable computational cost.
 */

struct CImageResizerParamsDef : public CImageResizerParams {
  CImageResizerParamsDef() {
    // Reference figures for this set, kept verbatim:
    // 10.01/1.88/1.029(522.43)/0.019169:258648,446808

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 6.30770;

    // Interpolation low-pass filter.
    IntFltAlpha = 2.27825;
    IntFltCutoff = 0.75493;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 3.40127;
    LPFltBaseLen = 7.78;
    LPFltCutoffMult = 0.78797;
  }
};

/**
 * @brief Set of resizing algorithm parameters for ultra-low-ringing
 * performance (7.69/1.069/0.000245).
 *
 * This set of resizing algorithm parameters offers the lowest amount of
 * ringing this library is capable of providing while still offering a decent
 * quality. Low ringing is attained at the expense of higher aliasing
 * artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsULR : public CImageResizerParams {
  CImageResizerParamsULR() {
    // Reference figures for this set, kept verbatim:
    // 7.69/1.97/1.069(31445.45)/0.000245:258627,436845

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 5.83280;

    // Interpolation low-pass filter.
    IntFltAlpha = 2.11453;
    IntFltCutoff = 0.73986;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 1.73455;
    LPFltBaseLen = 6.40;
    LPFltCutoffMult = 0.61314;
  }
};

/**
 * @brief Set of resizing algorithm parameters for low-ringing performance
 * (7.86/1.065/0.000106).
 *
 * This set of resizing algorithm parameters offers a very low-ringing
 * performance at the expense of higher aliasing artifacts and a slightly
 * reduced contrast.
 */

struct CImageResizerParamsLR : public CImageResizerParams {
  CImageResizerParamsLR() {
    // Reference figures for this set, kept verbatim:
    // 7.86/1.96/1.065(73865.02)/0.000106:258636,437381

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 5.87671;

    // Interpolation low-pass filter.
    IntFltAlpha = 2.25322;
    IntFltCutoff = 0.74090;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 1.79306;
    LPFltBaseLen = 7.00;
    LPFltCutoffMult = 0.68881;
  }
};

/**
 * @brief Set of resizing algorithm parameters for lower-ringing performance
 * (8.86/1.046/0.010168).
 *
 * This set of resizing algorithm parameters offers a lower-ringing
 * performance in comparison to the default setting, at the expense of higher
 * aliasing artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsLow : public CImageResizerParams {
  CImageResizerParamsLow() {
    // Reference figures for this set, kept verbatim:
    // 8.86/1.92/1.046(871.54)/0.010168:258647,442252

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 6.09757;

    // Interpolation low-pass filter.
    IntFltAlpha = 2.36704;
    IntFltCutoff = 0.74674;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 2.19427;
    LPFltBaseLen = 7.66;
    LPFltCutoffMult = 0.75380;
  }
};

/**
 * @brief Set of resizing algorithm parameters for low-aliasing
 * resizing (11.81/1.012/0.038379).
 *
 * This set of resizing algorithm parameters offers a considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This is an intermediate setting between the default and Ultra
 * parameters.
 */

struct CImageResizerParamsHigh : public CImageResizerParams {
  CImageResizerParamsHigh() {
    // Reference figures for this set, kept verbatim:
    // 11.81/1.83/1.012(307.84)/0.038379:258660,452719

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 6.80909;

    // Interpolation low-pass filter.
    IntFltAlpha = 2.44917;
    IntFltCutoff = 0.75856;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 4.39527;
    LPFltBaseLen = 8.18;
    LPFltCutoffMult = 0.79172;
  }
};

/**
 * @brief Set of resizing algorithm parameters for ultra low-aliasing
 * resizing (13.65/1.001/0.000483).
 *
 * This set of resizing algorithm parameters offers a very considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This set of parameters is computationally expensive and may
 * produce ringing artifacts on sharp features.
 */

struct CImageResizerParamsUltra : public CImageResizerParams {
  CImageResizerParamsUltra() {
    // Reference figures for this set, kept verbatim:
    // 13.65/1.79/1.001(28288.41)/0.000483:258658,457974

    // Correction filter.
    CorrFltAlpha = 1.0;
    CorrFltLen = 7.48060;

    // Interpolation low-pass filter.
    IntFltAlpha = 1.93750;
    IntFltCutoff = 0.75462;
    IntFltLen = 18.0;

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltAlpha = 5.55209;
    LPFltBaseLen = 8.34;
    LPFltCutoffMult = 0.78002;
  }
};

/**
 * @brief Image resizing variables class.
 *
 * This is a utility "catch all" class that defines various variables used
 * during image resizing. Several variables that are explicitly initialized in
 * this class' constructor are also used as additional "input" variables to
 * the image resizing function. These variables will not be changed by the
 * avir::CImageResizer<>::resizeImage() function.
 */

class CImageResizerVars {
 public:
  int ElCount;    ///< The number of "fptype" elements used to store 1 pixel.
                  ///<
  int ElCountIO;  ///< The number of source and destination image's elements
                  ///< used to store 1 pixel.
                  ///<
  int fppack;     ///< The number of atomic types stored in a single "fptype"
                  ///< element.
                  ///<
  int fpalign;    ///< Suggested alignment size in bytes. This is not a
                  ///< required alignment, because image resizing algorithm
                  ///< cannot be made to have a strictly aligned data access
                  ///< in all cases (e.g. de-interleaved interpolation cannot
                  ///< perform aligned accesses).
                  ///<
  int elalign;   ///< Length alignment of arrays of elements. This applies to
                 ///< filters and intermediate buffers: this constant forces
                 ///< filters and scanlines to have a length which is a
                 ///< multiple of this value, for more efficient SIMD
                 ///< implementation.
                 ///<
  int packmode;  ///< 0 if interleaved packing, 1 if de-interleaved.
                 ///<
  int BufLen[2];   ///< Intermediate buffers' lengths in "fptype" elements.
                   ///<
  int BufOffs[2];  ///< Offsets into the intermediate buffers, used to
                   ///< provide prefix elements required during processing so
                   ///< that no "out of range" access happens. This offset is
                   ///< a multiple of ElCount if pixels are stored in
                   ///< interleaved form.
                   ///<
  double k;  ///< Resizing step coefficient, updated to reflect the actually
             ///< used coefficient during resizing.
             ///<
  double o;  ///< Starting pixel offset inside the source image, updated to
             ///< reflect the actually used offset during resizing.
             ///<
  int ResizeStep;       ///< Index of the resizing step in the latest filtering
                        ///< steps array.
                        ///<
  double InGammaMult;   ///< Input gamma multiplier, used to convert input
                        ///< data to 0 to 1 range. 0.0 if no gamma is in use.
                        ///<
  double OutGammaMult;  ///< Output gamma multiplier, used to convert data to
                        ///< 0 to 255/65535 range. 0.0 if no gamma is in use.
                        ///<

  // The members below are the ones given default values by the constructor;
  // per the class description they double as caller-supplied "input"
  // settings of the resizing function.

  double ox;  ///< Start X pixel offset within source image (can be
              ///< negative). Positive offset moves image to the left.
              ///<
  double oy;  ///< Start Y pixel offset within source image (can be
              ///< negative). Positive offset moves image to the top.
              ///<
  CImageResizerThreadPool*
      ThreadPool;     ///< Thread pool to be used by the image resizing
                      ///< function. Set to NULL to use single-threaded
                      ///< processing.
                      ///<
  bool UseSRGBGamma;  ///< Perform sRGB gamma linearization (correction).
                      ///<
  int BuildMode;      ///< The build mode to use, for debugging purposes. Set
                      ///< to -1 to select a minimal-complexity mode
                      ///< automatically. All build modes deliver similar
                      ///< results with minor deviations.
                      ///<
  int RndSeed;  ///< Random seed parameter. This parameter may be incremented
                ///< after each random generator initialization. The use of
                ///< this variable depends on the ditherer implementation.
                ///<

  // Default constructor: zero offsets, no thread pool, no sRGB gamma,
  // automatic build mode (-1) and a zero random seed. Every member not named
  // in the initializer list (ElCount ... OutGammaMult) is left uninitialized
  // here and must be assigned before it is read.
  CImageResizerVars()
      : ox(0.0),
        oy(0.0),
        ThreadPool(NULL),
        UseSRGBGamma(false),
        BuildMode(-1),
        RndSeed(0) {}
};

/**
 * @brief Image resizer's filtering step class.
 *
 * Class defines data to perform a single filtering step over a whole
 * horizontal or vertical scanline. Resizing consists of 1 or more steps that
 * may be performed before the actual resizing takes place. Filtering may also
 * follow a resizing step. Each step must ensure that scanline data contains
 * enough pixels to perform the next step (which may be resizing) without
 * exceeding scanline's bounds.
 *
 * A derived class must implement several "const" and "static" functions that
 * are used to perform the actual filtering in interleaved or de-interleaved
 * mode.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template <class fptype, class fptypeatom>
class CImageResizerFilterStep {
 public:
  bool IsUpsample;  ///< "True" for an upsampling step, "false" for a
                    ///< downsampling step. Should be "false" when
                    ///< ResampleFactor is 0.
                    ///<
  int ResampleFactor;  ///< Resampling factor (>=1); 0 marks a resizing step.
                       ///< Should be >1 when IsUpsample is "true".
                       ///<
  CBuffer<fptype> Flt;  ///< The filter applied by this step.
                        ///<
  CFltBuffer FltOrig;  ///< The filter as originally designed. May be left
                       ///< unassigned. It is assigned by filters preceding
                       ///< the resizing step when such a filter is to be
                       ///< embedded into the interpolation filter as an
                       ///< "external" filter. When IsUpsample is "true" and
                       ///< this buffer is not empty, the upsampling step
                       ///< itself applies no filtering to the upsampled
                       ///< input scanline.
                       ///<
  double DCGain;  ///< DC gain that was applied to the filter. Undefined when
                  ///< ResampleFactor is 0.
                  ///<
  int FltLatency;  ///< Latency (group delay, shift) of the filter, in pixels.
                   ///<
  const CImageResizerVars* Vars;  ///< Image resizing-related variables.
                                  ///<
  int InLen;  ///< Length of the input scanline, in pixels.
              ///<
  int InBuf;  ///< Index of the input buffer, 0 or 1.
              ///<
  int InPrefix;  ///< Number of input prefix pixels required. They are filled
                 ///< with the first pixel of the source scanline. When
                 ///< IsUpsample is "true", this is the number of extra times
                 ///< the first pixel is filtered before the scanline is
                 ///< processed; OutPrefix reflects this number as well.
                 ///<
  int InSuffix;  ///< Number of input suffix pixels required. They are filled
                 ///< with the last pixel of the source scanline. When
                 ///< IsUpsample is "true", this is the number of extra times
                 ///< the last pixel is filtered before the scanline is
                 ///< processed; OutSuffix reflects this number as well.
                 ///<
  int InElIncr;  ///< Pixel element increment inside the input buffer, used
                 ///< by de-interleaved processing, where each image channel
                 ///< is stored independently, InElIncr elements apart.
                 ///<
  int OutLen;  ///< Length of the resulting scanline.
               ///<
  int OutBuf;  ///< Index of the output buffer: 0 or 1; 2 for the last step.
               ///<
  int OutPrefix;  ///< Number of output prefix pixels required. They are not
                  ///< pre-filled with any values. Valid only when IsUpsample
                  ///< is "true".
                  ///<
  int OutSuffix;  ///< Number of output suffix pixels required. They are not
                  ///< pre-filled with any values. Valid only when IsUpsample
                  ///< is "true".
                  ///<
  int OutElIncr;  ///< Pixel element increment inside the output buffer, used
                  ///< by de-interleaved processing. Equals the InElIncr of
                  ///< the next step.
                  ///<
  CBuffer<fptype> PrefixDC;  ///< DC component fluctuations added at the start
                             ///< of the resulting scanline; used when
                             ///< IsUpsample is "true".
                             ///<
  CBuffer<fptype> SuffixDC;  ///< DC component fluctuations added at the end
                             ///< of the resulting scanline; used when
                             ///< IsUpsample is "true".
                             ///<
  int EdgePixelCount;  ///< Number of edge pixels added. It shifts the initial
                       ///< position within the input scanline that is used
                       ///< to produce edge pixels. Used, and has to be
                       ///< defined, when IsUpsample=false and
                       ///< ResampleFactor>0. Assigning it also requires
                       ///< updating InPrefix, OutLen and Vars.o.
                       ///<
  static const int EdgePixelCountDef =
      3;  ///< Default number of pixels additionally produced at scanline
          ///< edges during filtering; needed to reduce edge artifacts.
          ///<

  /**
   * @brief Resizing position structure.
   *
   * Describes one resizing position together with the fractional delay
   * filter that belongs to it.
   */

  struct CResizePos {
    int SrcPosInt;  ///< Position within the source scanline.
                    ///<
    int fti;  ///< Index of the fractional delay filter.
              ///<
    const fptype* ftp;  ///< Pointer to the fractional delay filter.
                        ///<
    fptypeatom x;  ///< Interpolation coefficient between delay filters.
                   ///<
    int SrcOffs;  ///< Offset within the source scanline.
                  ///<
  };

  /**
   * @brief Resizing positions buffer class.
   *
   * A buffer of resizing positions bundled with the stepping variables the
   * positions were produced for.
   */

  class CRPosBuf : public CBuffer<CResizePos> {
   public:
    double k;  ///< Resizing step.
               ///<
    double o;  ///< Resizing offset.
               ///<
    int FracCount;  ///< Number of fractional delay filters in the filter
                    ///< bank that is used together with this buffer.
                    ///<
  };

  /**
   * @brief Resizing positions buffer array class.
   *
   * An array of CRPosBuf objects plus a lookup function that finds, or
   * creates, the buffer matching a requested resizing stepping.
   */

  class CRPosBufArray : public CStructArray<CRPosBuf> {
   public:
    using CStructArray<CRPosBuf>::add;
    using CStructArray<CRPosBuf>::getItemCount;

    /**
     * Function looks up the resizing positions buffer that was registered
     * for exactly the given stepping. When none matches, a new buffer is
     * appended and tagged with the requested values.
     *
     * @param k Resizing step.
     * @param o Resizing offset.
     * @param FracCount The number of fractional delay filters in a filter
     * bank used together with this buffer.
     * @return Reference to the CRPosBuf object.
     */

    CRPosBuf& getRPosBuf(const double k, const double o, const int FracCount) {
      for (int Index = 0; Index < getItemCount(); Index++) {
        CRPosBuf& Candidate = (*this)[Index];

        // All three keys have to match exactly (bitwise-equal doubles).
        if (!(Candidate.k == k)) {
          continue;
        }

        if (!(Candidate.o == o)) {
          continue;
        }

        if (Candidate.FracCount != FracCount) {
          continue;
        }

        return (Candidate);
      }

      // No match: append a fresh entry. Only its keys are set here.
      CRPosBuf& Created = add();
      Created.k = k;
      Created.o = o;
      Created.FracCount = FracCount;

      return (Created);
    }
  };

  CRPosBuf* RPosBuf;  ///< Resizing positions buffer; used when
                      ///< ResampleFactor is 0 (resizing step).
                      ///<
  CDSPFracFilterBankLin<fptype>* FltBank;  ///< Filter bank used by *this
                                           ///< resizing step.
                                           ///<
};

/**
 * @brief Interleaved filtering steps implementation class.
 *
 * This class implements scanline filtering functions in interleaved mode.
 * This means that each pixel is processed independently, not in groups.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template <class fptype, class fptypeatom>
class CImageResizerFilterStepINL
    : public CImageResizerFilterStep<fptype, fptypeatom> {
 public:
  using CImageResizerFilterStep<fptype, fptypeatom>::IsUpsample;
  using CImageResizerFilterStep<fptype, fptypeatom>::ResampleFactor;
  using CImageResizerFilterStep<fptype, fptypeatom>::Flt;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltOrig;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltLatency;
  using CImageResizerFilterStep<fptype, fptypeatom>::Vars;
  using CImageResizerFilterStep<fptype, fptypeatom>::InLen;
  using CImageResizerFilterStep<fptype, fptypeatom>::InPrefix;
  using CImageResizerFilterStep<fptype, fptypeatom>::InSuffix;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutLen;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutPrefix;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutSuffix;
  using CImageResizerFilterStep<fptype, fptypeatom>::PrefixDC;
  using CImageResizerFilterStep<fptype, fptypeatom>::SuffixDC;
  using CImageResizerFilterStep<fptype, fptypeatom>::RPosBuf;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltBank;
  using CImageResizerFilterStep<fptype, fptypeatom>::EdgePixelCount;

  /**
   * Function performs "packing" of a scanline and type conversion.
   * Scanline, depending on the "fptype" can be potentially stored as a
   * packed SIMD values having a certain atomic type. If required, the sRGB
   * gamma correction is applied.
   *
   * @param ip Input scanline.
   * @param op0 Output scanline.
   * @param l0 The number of pixels to "pack".
   */

  template <class Tin>
  void packScanline(const Tin* ip, fptype* const op0, const int l0) const {
    const int ElCount = Vars->ElCount;
    const int ElCountIO = Vars->ElCountIO;
    fptype* op = op0;
    int l = l0;

    if (!Vars->UseSRGBGamma) {
      if (ElCountIO == 1) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          op += ElCount;
          ip++;
          l--;
        }
      } else if (ElCountIO == 4) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          v[2] = (fptypeatom)ip[2];
          v[3] = (fptypeatom)ip[3];
          op += ElCount;
          ip += 4;
          l--;
        }
      } else if (ElCountIO == 3) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          v[2] = (fptypeatom)ip[2];
          op += ElCount;
          ip += 3;
          l--;
        }
      } else if (ElCountIO == 2) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          op += ElCount;
          ip += 2;
          l--;
        }
      }
    } else {
      const fptypeatom gm = (fptypeatom)Vars->InGammaMult;

      if (ElCountIO == 1) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          op += ElCount;
          ip++;
          l--;
        }
      } else if (ElCountIO == 4) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          v[2] = convertSRGB2Lin((fptypeatom)ip[2] * gm);
          v[3] = convertSRGB2Lin((fptypeatom)ip[3] * gm);
          op += ElCount;
          ip += 4;
          l--;
        }
      } else if (ElCountIO == 3) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          v[2] = convertSRGB2Lin((fptypeatom)ip[2] * gm);
          op += ElCount;
          ip += 3;
          l--;
        }
      } else if (ElCountIO == 2) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          op += ElCount;
          ip += 2;
          l--;
        }
      }
    }

    const int ZeroCount = ElCount * Vars->fppack - ElCountIO;
    op = op0;
    l = l0;

    if (ZeroCount == 1) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    } else if (ZeroCount == 2) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        v[1] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    } else if (ZeroCount == 3) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        v[1] = (fptypeatom)0;
        v[2] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    }
  }

  /**
   * Function applies Linear to sRGB gamma correction to the specified
   * scanline.
   *
   * @param p Scanline.
   * @param l The number of pixels to de-linearize.
   * @param Vars0 Image resizing-related variables.
   */

  static void applySRGBGamma(fptype* p, int l, const CImageResizerVars& Vars0) {
    const int PixelStride = Vars0.ElCount;
    const fptypeatom gm = (fptypeatom)Vars0.OutGammaMult;

    // Each channel is converted in place and then scaled by the output
    // gamma multiplier. Element counts other than 1..4 are left untouched.
    switch (Vars0.ElCountIO) {
      case 1:
        for (; l > 0; l--) {
          fptypeatom* const ch = (fptypeatom*)p;
          ch[0] = convertLin2SRGB(ch[0]) * gm;
          p += PixelStride;
        }
        break;

      case 2:
        for (; l > 0; l--) {
          fptypeatom* const ch = (fptypeatom*)p;
          ch[0] = convertLin2SRGB(ch[0]) * gm;
          ch[1] = convertLin2SRGB(ch[1]) * gm;
          p += PixelStride;
        }
        break;

      case 3:
        for (; l > 0; l--) {
          fptypeatom* const ch = (fptypeatom*)p;
          ch[0] = convertLin2SRGB(ch[0]) * gm;
          ch[1] = convertLin2SRGB(ch[1]) * gm;
          ch[2] = convertLin2SRGB(ch[2]) * gm;
          p += PixelStride;
        }
        break;

      case 4:
        for (; l > 0; l--) {
          fptypeatom* const ch = (fptypeatom*)p;
          ch[0] = convertLin2SRGB(ch[0]) * gm;
          ch[1] = convertLin2SRGB(ch[1]) * gm;
          ch[2] = convertLin2SRGB(ch[2]) * gm;
          ch[3] = convertLin2SRGB(ch[3]) * gm;
          p += PixelStride;
        }
        break;

      default:
        break;
    }
  }

  /**
   * Function converts vertical scanline to horizontal scanline. This
   * function is called by the image resizer when image is resized
   * vertically. This means that the vertical scanline is stored in the
   * same format produced by the packScanline() and maintained by other
   * filtering functions.
   *
   * @param ip Input vertical scanline.
   * @param op Output buffer (temporary buffer used during resizing).
   * @param SrcLen The number of pixels in the input scanline, also used to
   * calculate input buffer increment.
   * @param SrcIncr Input buffer increment to the next vertical pixel.
   */

  void convertVtoH(const fptype* ip, fptype* op, const int SrcLen,
                   const int SrcIncr) const {
    int left;

    // Gather one pixel per step: the source advances by SrcIncr, the
    // destination is written contiguously. Element counts other than 1..4
    // copy nothing.
    switch (Vars->ElCount) {
      case 1:
        for (left = SrcLen; left > 0; left--) {
          op[0] = ip[0];
          op += 1;
          ip += SrcIncr;
        }
        break;

      case 2:
        for (left = SrcLen; left > 0; left--) {
          op[0] = ip[0];
          op[1] = ip[1];
          op += 2;
          ip += SrcIncr;
        }
        break;

      case 3:
        for (left = SrcLen; left > 0; left--) {
          op[0] = ip[0];
          op[1] = ip[1];
          op[2] = ip[2];
          op += 3;
          ip += SrcIncr;
        }
        break;

      case 4:
        for (left = SrcLen; left > 0; left--) {
          op[0] = ip[0];
          op[1] = ip[1];
          op[2] = ip[2];
          op[3] = ip[3];
          op += 4;
          ip += SrcIncr;
        }
        break;

      default:
        break;
    }
  }

  /**
   * Function performs "unpacking" of a scanline and type conversion
   * (truncation is used when floating point is converted to integer).
   * Scanline, depending on the "fptype" can be potentially stored as a
   * packed SIMD values having a certain atomic type. The unpacking function
   * assumes that scanline is stored in the style produced by the
   * packScanline() function.
   *
   * @param ip Input scanline.
   * @param op Output scanline.
   * @param l The number of pixels to "unpack".
   * @param Vars0 Image resizing-related variables.
   */

  template <class Tout>
  static void unpackScanline(const fptype* ip, Tout* op, int l,
                             const CImageResizerVars& Vars0) {
    const int ElCount = Vars0.ElCount;
    const int ElCountIO = Vars0.ElCountIO;

    if (ElCountIO == 1) {
      while (l > 0) {
        const fptypeatom* v = (const fptypeatom*)ip;
        op[0] = (Tout)v[0];
        ip += ElCount;
        op++;
        l--;
      }
    } else if (ElCountIO == 4) {
      while (l > 0) {
        const fptypeatom* v = (const fptypeatom*)ip;
        op[0] = (Tout)v[0];
        op[1] = (Tout)v[1];
        op[2] = (Tout)v[2];
        op[3] = (Tout)v[3];
        ip += ElCount;
        op += 4;
        l--;
      }
    } else if (ElCountIO == 3) {
      while (l > 0) {
        const fptypeatom* v = (const fptypeatom*)ip;
        op[0] = (Tout)v[0];
        op[1] = (Tout)v[1];
        op[2] = (Tout)v[2];
        ip += ElCount;
        op += 3;
        l--;
      }
    } else if (ElCountIO == 2) {
      while (l > 0) {
        const fptypeatom* v = (const fptypeatom*)ip;
        op[0] = (Tout)v[0];
        op[1] = (Tout)v[1];
        ip += ElCount;
        op += 2;
        l--;
      }
    }
  }

  /**
   * Function prepares input scanline buffer for *this filtering step.
   * Left- and right-most pixels are replicated to make sure no buffer
   * overrun happens. Such approach also allows to bypass any pointer
   * range checks.
   *
   * @param Src Source buffer.
   */

  void prepareInBuf(fptype* Src) const {
    // Upsampling steps are not padded here.
    if (IsUpsample) {
      return;
    }

    // Nothing to replicate when neither prefix nor suffix is requested.
    if (InPrefix + InSuffix == 0) {
      return;
    }

    const int PixelSize = Vars->ElCount;
    fptype* const FirstPixel = Src;
    fptype* const LastPixel = Src + (InLen - 1) * PixelSize;

    // The first pixel is replicated backwards in front of the scanline...
    replicateArray(FirstPixel, PixelSize, FirstPixel - PixelSize, InPrefix,
                   -PixelSize);

    // ...and the last pixel forwards behind it.
    replicateArray(LastPixel, PixelSize, LastPixel + PixelSize, InSuffix,
                   PixelSize);
  }

  /**
   * Function performs scanline upsampling with filtering.
   *
   * @param Src Source scanline buffer (length = this -> InLen). Source
   * scanline increment will be equal to ElCount.
   * @param Dst Destination scanline buffer.
   */

  void doUpsample(const fptype* const Src, fptype* const Dst) const {
    const int ElCount = Vars->ElCount;
    fptype* op0 = &Dst[-OutPrefix * ElCount];
    memset(&op0->value, 0, (OutPrefix + OutLen + OutSuffix) * ElCount * sizeof(op0->value));

    const fptype* ip = Src;
    const int opstep = ElCount * ResampleFactor;
    int l;

    if (FltOrig.getCapacity() > 0) {
      // Do not perform filtering, only upsample.

      op0 += (OutPrefix % ResampleFactor) * ElCount;
      l = OutPrefix / ResampleFactor;

      if (ElCount == 1) {
        while (l > 0) {
          op0[0] = ip[0];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 4) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 3) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 2) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          l--;
        }
      }

      return;
    }

    const fptype* const f = Flt;
    const int flen = Flt.getCapacity();
    fptype* op;
    int i;

    if (ElCount == 1) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 4) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 3) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 2) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        op0 += opstep;
        l--;
      }
    }

    op = op0;
    const fptype* dc = SuffixDC;
    l = SuffixDC.getCapacity();

    if (ElCount == 1) {
      for (i = 0; i < l; i++) {
        op[i] += ip[0] * dc[i];
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        op[3] += ip[3] * dc[0];
        dc++;
        op += 4;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        dc++;
        op += 3;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        dc++;
        op += 2;
        l--;
      }
    }

    ip = Src;
    op = Dst - InPrefix * opstep;
    dc = PrefixDC;
    l = PrefixDC.getCapacity();

    if (ElCount == 1) {
      for (i = 0; i < l; i++) {
        op[i] += ip[0] * dc[i];
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        op[3] += ip[3] * dc[0];
        dc++;
        op += 4;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        dc++;
        op += 3;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        dc++;
        op += 2;
        l--;
      }
    }
  }

  /**
   * Function performs scanline filtering with optional downsampling.
   * Function makes use of the symmetry of the filter.
   *
   * @param Src Source scanline buffer (length = this -> InLen). Source
   * scanline increment will be equal to ElCount.
   * @param Dst Destination scanline buffer.
   * @param DstIncr Destination scanline buffer increment, used for
   * horizontal or vertical scanline stepping.
   */

  void doFilter(const fptype* const Src, fptype* Dst, const int DstIncr) const {
    const int ElCount = Vars->ElCount;
    const fptype* const f = &Flt[FltLatency];
    const int flen = FltLatency + 1;
    const int ipstep = ElCount * ResampleFactor;
    const fptype* ip = Src - EdgePixelCount * ipstep;
    const fptype* ip1;
    const fptype* ip2;
    int l = OutLen;
    int i;

    if (ElCount == 1) {
      while (l > 0) {
        fptype s = f[0] * ip[0];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1++;
          ip2--;
          s += f[i] * (ip1[0] + ip2[0]);
        }

        Dst[0] = s;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        fptype s3 = f[0] * ip[2];
        fptype s4 = f[0] * ip[3];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 4;
          ip2 -= 4;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
          s3 += f[i] * (ip1[2] + ip2[2]);
          s4 += f[i] * (ip1[3] + ip2[3]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst[2] = s3;
        Dst[3] = s4;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        fptype s3 = f[0] * ip[2];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 3;
          ip2 -= 3;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
          s3 += f[i] * (ip1[2] + ip2[2]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst[2] = s3;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 2;
          ip2 -= 2;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    }
  }

  /**
   * Function performs resizing of a single scanline. This function does
   * not "know" about the length of the source scanline buffer. This buffer
   * should be padded with enough pixels so that ( SrcPos - FilterLenD2 ) is
   * always >= 0 and ( SrcPos + ( DstLineLen - 1 ) * k + FilterLenD2 + 1 )
   * does not exceed source scanline's buffer length. SrcLine's increment is
   * assumed to be equal to ElCount.
   *
   * @param SrcLine Source scanline buffer.
   * @param DstLine Destination (resized) scanline buffer.
   * @param DstLineIncr Destination scanline position increment, used for
   * horizontal or vertical scanline stepping.
   * @param xx Temporary buffer, of size FltBank -> getFilterLen(), must be
   * aligned by fpclass :: fpalign.
   */

  void doResize(const fptype* SrcLine, fptype* DstLine, const int DstLineIncr,
                fptype* const) const {
    // Produces OutLen output pixels. For every precomputed resize position
    // in RPosBuf, IntFltLen consecutive source pixels starting at
    // SrcLine + rpos->SrcOffs are multiplied by that position's filter taps
    // and accumulated; the sums are written to DstLine, which is then
    // advanced by DstLineIncr. The trailing unnamed parameter (the temporary
    // buffer) is not used by this implementation.
    const int IntFltLen = FltBank->getFilterLen();
    const int ElCount = Vars->ElCount;
    const typename CImageResizerFilterStep<fptype, fptypeatom>::CResizePos*
        rpos = &(*RPosBuf)[0];

    // One-past-the-end sentinel for the resize position array.
    const typename CImageResizerFilterStep<
        fptype, fptypeatom>::CResizePos* const rpose = rpos + OutLen;

// Loop prologue for filter banks of order 1: opens the per-output-pixel loop
// and fetches the position multiplier "x", the base filter "ftp", the second
// coefficient set "ftp2" stored IntFltLen elements after "ftp", and the
// first source element. The loop is closed by AVIR_RESIZE_PART2.
#define AVIR_RESIZE_PART1                        \
  while (rpos < rpose) {                         \
    const fptype x = (fptype)rpos->x;            \
    const fptype* const ftp = rpos->ftp;         \
    const fptype* const ftp2 = ftp + IntFltLen;  \
    const fptype* Src = SrcLine + rpos->SrcOffs; \
    int i;

// Loop prologue for the remaining filter bank orders: same as above, but
// without "x" and "ftp2" since the filter taps are used as they are.
#define AVIR_RESIZE_PART1nx                      \
  while (rpos < rpose) {                         \
    const fptype* const ftp = rpos->ftp;         \
    const fptype* Src = SrcLine + rpos->SrcOffs; \
    int i;

// Loop epilogue: steps to the next destination pixel and the next resize
// position, then closes the "while" opened by one of the prologues.
#define AVIR_RESIZE_PART2 \
  DstLine += DstLineIncr; \
  rpos++;                 \
  }

    if (FltBank->getOrder() == 1) {
      // Order 1: every effective tap is evaluated on the fly as
      // ftp[i] + ftp2[i] * x (presumably a linear interpolation between
      // tabulated fractional-delay filters - confirm against the filter
      // bank implementation).
      if (ElCount == 1) {
        AVIR_RESIZE_PART1

        fptype sum = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          sum += (ftp[i] + ftp2[i] * x) * Src[i];
        }

        DstLine[0] = sum;

        AVIR_RESIZE_PART2
      } else if (ElCount == 4) {
        AVIR_RESIZE_PART1

        fptype sum[4];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;
        sum[3] = 0.0;

        // Interleaved pixels: the same tap weights all 4 elements.
        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          sum[3] += xx * Src[3];
          Src += 4;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];
        DstLine[3] = sum[3];

        AVIR_RESIZE_PART2
      } else if (ElCount == 3) {
        AVIR_RESIZE_PART1

        fptype sum[3];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          Src += 3;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];

        AVIR_RESIZE_PART2
      } else if (ElCount == 2) {
        AVIR_RESIZE_PART1

        fptype sum[2];
        sum[0] = 0.0;
        sum[1] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          Src += 2;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];

        AVIR_RESIZE_PART2
      }
    } else {
      // Any other order: the taps pointed to by rpos->ftp are applied
      // directly, without the "x"-dependent correction term.
      if (ElCount == 1) {
        AVIR_RESIZE_PART1nx

            fptype sum = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          sum += ftp[i] * Src[i];
        }

        DstLine[0] = sum;

        AVIR_RESIZE_PART2
      } else if (ElCount == 4) {
        AVIR_RESIZE_PART1nx

            fptype sum[4];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;
        sum[3] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          sum[3] += xx * Src[3];
          Src += 4;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];
        DstLine[3] = sum[3];

        AVIR_RESIZE_PART2
      } else if (ElCount == 3) {
        AVIR_RESIZE_PART1nx

            fptype sum[3];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          Src += 3;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];

        AVIR_RESIZE_PART2
      } else if (ElCount == 2) {
        AVIR_RESIZE_PART1nx

            fptype sum[2];
        sum[0] = 0.0;
        sum[1] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          Src += 2;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];

        AVIR_RESIZE_PART2
      }
    }
  }
#undef AVIR_RESIZE_PART2
#undef AVIR_RESIZE_PART1nx
#undef AVIR_RESIZE_PART1
};

/**
 * @brief Image resizer's default dithering class.
 *
 * This class defines an object that performs rounding, clipping and dithering
 * operations over horizontal scanline pixels before scanline is stored in the
 * output buffer.
 *
 * The ditherer should expect the same storage order of the pixels in a
 * scanline as used in the "filtering step" class. So, a separate ditherer
 * class should be defined for each scanline pixel storage style. The default
 * ditherer implements a simple rounding without dithering: it can be used for
 * an efficient dithering method which can be multi-threaded.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template <class fptype>
class CImageResizerDithererDefINL {
 public:
  /**
   * Function initializes the ditherer object.
   *
   * @param aLen Scanline length in pixels to process.
   * @param aVars Image resizing-related variables.
   * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
   * truncation.
   * @param aPkOut Peak output value allowed.
   */

  void init(const int aLen, const CImageResizerVars& aVars, const double aTrMul,
            const double aPkOut) {
    Len = aLen;
    Vars = &aVars;
    LenE = aLen * Vars->ElCount;
    TrMul0 = aTrMul;
    PkOut0 = aPkOut;
  }

  /**
   * @return "True" if dithering is recursive relative to scanlines meaning
   * multi-threaded execution is not supported by this dithering method.
   */

  static bool isRecursive() { return (false); }

  /**
   * Function performs rounding and clipping operations.
   *
   * @param ResScanline The buffer containing the final scanline.
   */

  void dither(fptype* const ResScanline) const {
    const fptype c0 = 0.0;
    const fptype PkOut = (fptype)PkOut0;
    int j;

    if (TrMul0 == 1.0) {
      // Optimization - do not perform bit depth truncation.

      for (j = 0; j < LenE; j++) {
        ResScanline[j] = clamp(round(ResScanline[j]), c0, PkOut);
      }
    } else {
      const fptype TrMul = (fptype)TrMul0;

      for (j = 0; j < LenE; j++) {
        const fptype z0 = round(ResScanline[j] / TrMul) * TrMul;
        ResScanline[j] = clamp(z0, c0, PkOut);
      }
    }
  }

 protected:
  int Len;                        ///< Scanline's length in pixels.
                                  ///<
  const CImageResizerVars* Vars;  ///< Image resizing-related variables.
                                  ///<
  int LenE;                       ///< = LenE * ElCount.
                                  ///<
  double TrMul0;                  ///< Bit-depth truncation multiplier.
                                  ///<
  double PkOut0;                  ///< Peak output value allowed.
                                  ///<
};

/**
 * @brief Image resizer's error-diffusion dithering class, interleaved mode.
 *
 * This ditherer implements error-diffusion dithering which looks good, and
 * whose results are compressed by PNG well. This implementation uses
 * weighting coefficients obtained via machine optimization and visual
 * evaluation.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template <class fptype>
class CImageResizerDithererErrdINL
    : public CImageResizerDithererDefINL<fptype> {
 public:
  /**
   * Function initializes the ditherer object and allocates a zeroed error
   * diffusion buffer which is one pixel (ElCount elements) longer than the
   * scanline.
   *
   * @param aLen Scanline length in pixels to process.
   * @param aVars Image resizing-related variables.
   * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
   * truncation.
   * @param aPkOut Peak output value allowed.
   */

  void init(const int aLen, const CImageResizerVars& aVars, const double aTrMul,
            const double aPkOut) {
    CImageResizerDithererDefINL<fptype>::init(aLen, aVars, aTrMul, aPkOut);

    const int PadLen = Vars->ElCount;
    const int TotalLen = LenE + PadLen;

    ResScanlineDith0.alloc(TotalLen, sizeof(fptype));
    ResScanlineDith = ResScanlineDith0 + PadLen;

    int n = 0;

    while (n < TotalLen) {
      ResScanlineDith0[n] = 0.0;
      n++;
    }
  }

  /**
   * @return Always "true": the error of a scanline is carried over to the
   * next one, so scanlines have to be processed sequentially.
   */

  static bool isRecursive() { return true; }

  /**
   * Function performs rounding, clipping and error diffusion over the
   * scanline, in-place. The rounding error of each element is distributed
   * between the same-channel element of the next pixel of this scanline and
   * three neighbouring elements of the error buffer which is added to the
   * scanline passed on the next call.
   *
   * @param ResScanline The buffer containing the final scanline, LenE
   * elements long.
   */

  void dither(fptype* const ResScanline) {
    const int Stride = Vars->ElCount;
    const fptype Lo = 0.0;
    const fptype Hi = (fptype)PkOut0;
    const fptype Step = (fptype)TrMul0;
    int pos;

    // Apply the error accumulated during the previous call and reset it.

    for (pos = 0; pos < LenE; pos++) {
      ResScanline[pos] += ResScanlineDith[pos];
      ResScanlineDith[pos] = 0.0;
    }

    // All pixels except the last one: the error is diffused both forward
    // within this scanline and into the error buffer. Negative error buffer
    // indices produced by the first pixel land in the leading padding.

    const int InnerEnd = LenE - Stride;
    pos = 0;

    while (pos < InnerEnd) {
      const fptype Quant = round(ResScanline[pos] / Step) * Step;
      const fptype Err = ResScanline[pos] - Quant;
      ResScanline[pos] = clamp(Quant, Lo, Hi);

      ResScanline[pos + Stride] += Err * (fptype)0.364842;
      ResScanlineDith[pos - Stride] += Err * (fptype)0.207305;
      ResScanlineDith[pos] += Err * (fptype)0.364842;
      ResScanlineDith[pos + Stride] += Err * (fptype)0.063011;
      pos++;
    }

    // The last pixel has no right-hand neighbour, so only the two error
    // buffer elements that exist receive a share of its error.

    for (; pos < LenE; pos++) {
      const fptype Quant = round(ResScanline[pos] / Step) * Step;
      const fptype Err = ResScanline[pos] - Quant;
      ResScanline[pos] = clamp(Quant, Lo, Hi);

      ResScanlineDith[pos - Stride] += Err * (fptype)0.207305;
      ResScanlineDith[pos] += Err * (fptype)0.364842;
    }
  }

 protected:
  using CImageResizerDithererDefINL<fptype>::Len;
  using CImageResizerDithererDefINL<fptype>::Vars;
  using CImageResizerDithererDefINL<fptype>::LenE;
  using CImageResizerDithererDefINL<fptype>::TrMul0;
  using CImageResizerDithererDefINL<fptype>::PkOut0;

  CBuffer<fptype> ResScanlineDith0;  ///< Error diffusion buffer storage,
                                     ///< LenE + ElCount elements long.
                                     ///<
  fptype* ResScanlineDith;  ///< Error diffusion buffer pointer which skips
                            ///< the first ElCount elements of the storage.
                            ///<
};

/**
 * @brief Floating-point processing definition and abstraction class.
 *
 * This class defines several constants and typedefs that point to classes
 * that should be used by the image resizing algorithm. Such "definition
 * class" can be used to define alternative scanline processing algorithms
 * (e.g. SIMD) and image scanline packing styles used during processing. This
 * class also offers an abstraction layer for dithering, rounding and
 * clamping (saturation) operation.
 *
 * The fpclass_def class can be used to define processing using both SIMD and
 * non-SIMD types, but using algorithms that operate on interleaved pixels
 * and are not SIMD-optimized themselves.
 *
 * @tparam afptype Floating point type to use for storing intermediate data
 * and variables. For variables that are not used in intensive calculations
 * the "double" type is always used. On the latest Intel processors (like
 * i7-4770K) there is almost no performance difference between "double" and
 * "float". Image quality differences between "double" and "float" are not
 * apparent on 8-bit images. At the same time, "float" uses half the amount
 * of working memory that "double" uses. SIMD types can be used. The
 * functions round() and clamp() in the "avir" or other visible namespace
 * should be available for the specified type. SIMD types allow to perform
 * resizing of images with more than 4 channels, to be exact 4 * SIMD element
 * number (e.g. 16 for float4), without modification of the image resizing
 * algorithm required.
 * @tparam afptypeatom The atomic type the "afptype" consists of.
 * @tparam adith Ditherer class to use during processing.
 */

template <class afptype, class afptypeatom = afptype,
          class adith = CImageResizerDithererDefINL<afptype> >
class fpclass_def {
 public:
  /**
   * Floating-point type to use during processing.
   */

  typedef afptype fptype;

  /**
   * Atomic type "fptype" consists of.
   */

  typedef afptypeatom fptypeatom;

  /**
   * The number of atomic types stored in a single "fptype" element.
   */

  static const int fppack = sizeof(afptype) / sizeof(afptypeatom);

  /**
   * Suggested alignment size in bytes. This is not a required alignment,
   * because image resizing algorithm cannot be made to have a strictly
   * aligned data access at all steps (e.g. interpolation cannot perform
   * aligned accesses).
   */

  static const int fpalign = sizeof(afptype);

  /**
   * Length alignment of arrays of elements. This applies to filters and
   * intermediate buffers: this constant forces filters and scanlines to
   * have a length which is a multiple of this value, for more efficient
   * SIMD implementation.
   */

  static const int elalign = 1;

  /**
   * 0 if interleaved packing, 1 if de-interleaved.
   */

  static const int packmode = 0;

  /**
   * Filtering step class to use during processing.
   */

  typedef CImageResizerFilterStepINL<afptype, afptypeatom> CFilterStep;

  /**
   * Ditherer class to use during processing.
   */

  typedef adith CDitherer;
};

/**
 * @brief Image resizer class.
 *
 * The object of this class can be used to resize 1-4 channel images to any
 * required size. Resizing is performed by utilizing interpolated sinc
 * fractional delay filters plus (if necessary) a cascade of built-in
 * sinc function-based 2X upsampling or 2X downsampling stages, followed by a
 * correction filtering.
 *
 * Object of this class can be allocated on stack.
 *
 * @tparam fpclass Floating-point processing definition class to use. See
 * avir::fpclass_def for more details.
 */

template <class fpclass = fpclass_def<float> >
class CImageResizer {
 public:
  /**
   * Constructor initializes the resizer.
   *
   * @param aResBitDepth Required bit depth of resulting image (1-16). If
   * integer value output is used (e.g. uint8_t), the bit depth also affects
   * rounding: for example, if aResBitDepth=6 and "Tout" is uint8_t, the
   * result will be rounded to 6 most significant bits (2 least significant
   * bits truncated, with dithering applied).
   * @param aSrcBitDepth Source image's real bit-depth. Set to 0 to use
   * aResBitDepth.
   * @param aParams Resizing algorithm's parameters to use. Leave out for
   * default values. Can be useful when performing automatic optimization of
   * parameters.
   */

  CImageResizer(const int aResBitDepth = 8, const int aSrcBitDepth = 0,
                const CImageResizerParams& aParams = CImageResizerParamsDef())
      : Params(aParams),
        // A zero source bit depth means "same as the resulting bit depth".
        SrcBitDepth(aSrcBitDepth != 0 ? aSrcBitDepth : aResBitDepth),
        ResBitDepth(aResBitDepth) {
    // Build the fixed filter bank once, with unity cutoff multiplier and
    // an empty external filter, and create all of its filters up front.

    initFilterBank(FixedFilterBank, 1.0, false, CFltBuffer());
    FixedFilterBank.createAllFilters();
  }

  /**
   * Function resizes image.
   *
   * @param SrcBuf Source image buffer.
   * @param SrcWidth Source image width.
   * @param SrcHeight Source image height.
   * @param SrcScanlineSize Physical size of source scanline in elements
   * (not bytes). If this value is below 1, SrcWidth * ElCountIO will be
   * used as the physical source scanline size.
   * @param[out] NewBuf Buffer to accept the resized image. Can be equal to
   * SrcBuf if the size of the resized image is smaller or equal to source
   * image in size.
   * @param NewWidth New image width.
   * @param NewHeight New image height.
   * @param ElCountIO The number of elements (channels) used to store each
   * source and destination pixel (1-4).
   * @param k Resizing step (one output pixel corresponds to "k" input
   * pixels). A downsizing factor if > 1.0; upsizing factor if <= 1.0.
   * Multiply by -1 if you would like to bypass "ox" and "oy" adjustment
   * which is done by default to produce a centered image. If step value
   * equals 0, the step value will be chosen automatically and independently
   * for horizontal and vertical resizing.
   * @param[in,out] aVars Pointer to variables structure to be passed to the
   * image resizing function. Can be NULL. Only variables that are
   * initialized in default constructor of this structure are accepted by
   * this function. These variables will not be changed by this function.
   * All other variables can be modified by this function. The access to
   * this object is not thread-safe, each concurrent instance of this
   * function should use a separate aVars object.
   * @tparam Tin Input buffer element's type. Can be uint8_t (0-255 value
   * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
   * double (0.0-1.0 value range). Larger integer types are treated as
   * uint16_t. Signed integer types are unsupported.
   * @tparam Tout Output buffer element's type. Can be uint8_t (0-255 value
   * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
   * double (0.0-1.0 value range). Larger integer types are treated as
   * uint16_t. Signed integer types are unsupported.
   */

  template <class Tin, class Tout>
  void resizeImage(const Tin* const SrcBuf, const int SrcWidth,
                   const int SrcHeight, int SrcScanlineSize, Tout* const NewBuf,
                   const int NewWidth, const int NewHeight, const int ElCountIO,
                   const double k,
                   CImageResizerVars* const aVars = NULL) const {
    // Overview: the image is first resized horizontally, scanline by
    // scanline, into the temporary FltBuf; FltBuf is then resized
    // vertically, column by column, either directly into NewBuf (float
    // output of matching element size) or into ResBuf, which is finally
    // unpacked (and, for integer output, dithered) into NewBuf.

    if (NewWidth == 0 || NewHeight == 0) {
      // Nothing to output; NewBuf is not touched at all.

      return;
    }

    if (SrcWidth == 0 || SrcHeight == 0) {
      // Empty source: produce a zeroed image. The output buffer holds
      // ElCountIO elements per pixel, so all of them have to be cleared.

      memset(NewBuf, 0,
             (size_t)NewWidth * NewHeight * ElCountIO * sizeof(Tout));

      return;
    }

    CImageResizerVars DefVars;
    CImageResizerVars& Vars = (aVars == NULL ? DefVars : *aVars);

    CImageResizerThreadPool DefThreadPool;
    CImageResizerThreadPool& ThreadPool =
        (Vars.ThreadPool == NULL ? DefThreadPool : *Vars.ThreadPool);

    // Define resizing steps, also optionally modify offsets so that
    // resizing produces a "centered" image.

    double kx;
    double ky;
    double ox = Vars.ox;
    double oy = Vars.oy;

    if (k == 0.0) {
      // Automatic step selection, independent for each dimension. Here
      // NewWidth > SrcWidth >= 1 implies NewWidth >= 2, so the divisor is
      // never zero (the same holds for the height).

      if (NewWidth > SrcWidth) {
        kx = (double)(SrcWidth - 1) / (NewWidth - 1);
      } else {
        kx = (double)SrcWidth / NewWidth;
        ox += (kx - 1.0) * 0.5;
      }

      if (NewHeight > SrcHeight) {
        ky = (double)(SrcHeight - 1) / (NewHeight - 1);
      } else {
        ky = (double)SrcHeight / NewHeight;
        oy += (ky - 1.0) * 0.5;
      }
    } else if (k > 0.0) {
      kx = k;
      ky = k;

      if (k > 1.0) {
        const double ko = (k - 1.0) * 0.5;
        ox += ko;
        oy += ko;
      }
    } else {
      // Negative step: use its magnitude and bypass offset adjustment.

      kx = -k;
      ky = -k;
    }

    // Evaluate pre-multipliers used on the output stage.

    // A type is considered floating-point if it can hold the value 0.4
    // (integer types truncate it to 0).

    const bool IsInFloat = ((Tin)0.4 != 0);
    const bool IsOutFloat = ((Tout)0.4 != 0);
    double OutMul;  // Output multiplier.

    if (Vars.UseSRGBGamma) {
      if (IsInFloat) {
        Vars.InGammaMult = 1.0;
      } else {
        Vars.InGammaMult = 1.0 / (sizeof(Tin) == 1 ? 255.0 : 65535.0);
      }

      if (IsOutFloat) {
        Vars.OutGammaMult = 1.0;
      } else {
        Vars.OutGammaMult = (sizeof(Tout) == 1 ? 255.0 : 65535.0);
      }

      OutMul = 1.0;
    } else {
      if (IsOutFloat) {
        OutMul = 1.0;
      } else {
        OutMul = (sizeof(Tout) == 1 ? 255.0 : 65535.0);
      }

      if (!IsInFloat) {
        OutMul /= (sizeof(Tin) == 1 ? 255.0 : 65535.0);
      }
    }

    // Fill widely-used variables.

    // Number of "fptype" elements per pixel (ElCountIO rounded up to a
    // whole number of packs).

    const int ElCount = (ElCountIO + fpclass ::fppack - 1) / fpclass ::fppack;

    const int NewWidthE = NewWidth * ElCount;

    if (SrcScanlineSize < 1) {
      SrcScanlineSize = SrcWidth * ElCountIO;
    }

    Vars.ElCount = ElCount;
    Vars.ElCountIO = ElCountIO;
    Vars.fppack = fpclass ::fppack;
    Vars.fpalign = fpclass ::fpalign;
    Vars.elalign = fpclass ::elalign;
    Vars.packmode = fpclass ::packmode;

    // Horizontal scanline filtering and resizing.

    CDSPFracFilterBankLin<fptype> FltBank;
    CFilterSteps FltSteps;
    typename CFilterStep ::CRPosBufArray RPosBufArray;
    CBuffer<uint8_t> UsedFracMap;

    // Perform the filtering steps modeling at various modes, find the
    // most efficient mode for both horizontal and vertical resizing.

    int UseBuildMode = 1;
    const int BuildModeCount = (FixedFilterBank.getOrder() == 0 ? 4 : 2);

    int m;

    if (Vars.BuildMode >= 0) {
      UseBuildMode = Vars.BuildMode;
    } else {
      int BestScore = 0x7FFFFFFF;

      for (m = 0; m < BuildModeCount; m++) {
        CDSPFracFilterBankLin<fptype> TmpBank;
        CFilterSteps TmpSteps;
        Vars.k = kx;
        Vars.o = ox;
        buildFilterSteps(TmpSteps, Vars, TmpBank, OutMul, m, true);
        updateFilterStepBuffers(TmpSteps, Vars, RPosBufArray, SrcWidth,
                                NewWidth);

        fillUsedFracMap(TmpSteps[Vars.ResizeStep], UsedFracMap);
        const int c = calcComplexity(TmpSteps, Vars, UsedFracMap, SrcHeight);

        if (c < BestScore) {
          UseBuildMode = m;
          BestScore = c;
        }
      }
    }

    // Perform the actual filtering steps building.

    Vars.k = kx;
    Vars.o = ox;
    buildFilterSteps(FltSteps, Vars, FltBank, OutMul, UseBuildMode, false);

    updateFilterStepBuffers(FltSteps, Vars, RPosBufArray, SrcWidth, NewWidth);

    updateBufLenAndRPosPtrs(FltSteps, Vars, NewWidth);

    const int ThreadCount = ThreadPool.getSuggestedWorkloadCount();
    // Includes the current thread.

    CStructArray<CThreadData<Tin, Tout> > td;
    td.setItemCount(ThreadCount);
    int i;

    for (i = 0; i < ThreadCount; i++) {
      // Workload 0 is executed by the calling thread itself, so only the
      // remaining ones are handed over to the thread pool.

      if (i > 0) {
        ThreadPool.addWorkload(&td[i]);
      }

      td[i].init(i, ThreadCount, FltSteps, Vars);

      td[i].initScanlineQueue(td[i].sopResizeH, SrcHeight, SrcWidth);
    }

    CBuffer<fptype, size_t> FltBuf(
        (size_t)NewWidthE * SrcHeight,
        fpclass ::fpalign);  // Temporary buffer that receives
                             // horizontally-filtered and resized image.

    // Distribute source scanlines between workloads in round-robin order.

    for (i = 0; i < SrcHeight; i++) {
      td[i % ThreadCount].addScanlineToQueue(
          (void*)&SrcBuf[(size_t)i * SrcScanlineSize],
          &FltBuf[(size_t)i * NewWidthE]);
    }

    ThreadPool.startAllWorkloads();
    td[0].processScanlineQueue();
    ThreadPool.waitAllWorkloadsToFinish();

    // Vertical scanline filtering and resizing, reuse previously defined
    // filtering steps if possible.

    const int PrevUseBuildMode = UseBuildMode;

    if (Vars.BuildMode >= 0) {
      UseBuildMode = Vars.BuildMode;
    } else {
      CImageResizerVars TmpVars(Vars);
      int BestScore = 0x7FFFFFFF;

      for (m = 0; m < BuildModeCount; m++) {
        CDSPFracFilterBankLin<fptype> TmpBank;
        TmpBank.copyInitParams(FltBank);
        CFilterSteps TmpSteps;
        TmpVars.k = ky;
        TmpVars.o = oy;
        buildFilterSteps(TmpSteps, TmpVars, TmpBank, 1.0, m, true);
        updateFilterStepBuffers(TmpSteps, TmpVars, RPosBufArray, SrcHeight,
                                NewHeight);

        fillUsedFracMap(TmpSteps[TmpVars.ResizeStep], UsedFracMap);

        const int c = calcComplexity(TmpSteps, TmpVars, UsedFracMap, NewWidth);

        if (c < BestScore) {
          UseBuildMode = m;
          BestScore = c;
        }
      }
    }

    Vars.k = ky;
    Vars.o = oy;

    if (UseBuildMode == PrevUseBuildMode && ky == kx) {
      // Same steps as in the horizontal pass: keep them, only undo the
      // output multiplier that was already applied horizontally.

      if (OutMul != 1.0) {
        modifyCorrFilterDCGain(FltSteps, 1.0 / OutMul);
      }
    } else {
      buildFilterSteps(FltSteps, Vars, FltBank, 1.0, UseBuildMode, false);
    }

    updateFilterStepBuffers(FltSteps, Vars, RPosBufArray, SrcHeight, NewHeight);

    updateBufLenAndRPosPtrs(FltSteps, Vars, NewWidth);

    if (IsOutFloat && sizeof(FltBuf[0]) == sizeof(Tout) &&
        fpclass ::packmode == 0) {
      // In-place output.

      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopResizeV, NewWidth, SrcHeight,
                                NewWidthE, NewWidthE);
      }

      for (i = 0; i < NewWidth; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &FltBuf[(size_t)i * ElCount],
            (fptype*)&NewBuf[(size_t)i * ElCount]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
      ThreadPool.removeAllWorkloads();

      return;
    }

    CBuffer<fptype, size_t> ResBuf((size_t)NewWidthE * NewHeight,
                                   fpclass ::fpalign);

    for (i = 0; i < ThreadCount; i++) {
      td[i].initScanlineQueue(td[i].sopResizeV, NewWidth, SrcHeight, NewWidthE,
                              NewWidthE);
    }

    // Distance between the starts of adjacent columns, in "fptype" elements.

    const int im = (fpclass ::packmode == 0 ? ElCount : 1);

    for (i = 0; i < NewWidth; i++) {
      td[i % ThreadCount].addScanlineToQueue(&FltBuf[(size_t)i * im],
                                             &ResBuf[(size_t)i * im]);
    }

    ThreadPool.startAllWorkloads();
    td[0].processScanlineQueue();
    ThreadPool.waitAllWorkloadsToFinish();

    if (IsOutFloat) {
      // Perform output, but skip dithering.

      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopUnpackH, NewHeight, NewWidth);
      }

      for (i = 0; i < NewHeight; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &ResBuf[(size_t)i * NewWidthE],
            &NewBuf[(size_t)i * NewWidth * ElCountIO]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
      ThreadPool.removeAllWorkloads();

      return;
    }

    // Perform output with dithering (for integer output only).

    int TruncBits;  // The number of lower bits to truncate and dither.
    int OutRange;   // Output range.

    if (sizeof(Tout) == 1) {
      TruncBits = 8 - ResBitDepth;
      OutRange = 255;
    } else {
      TruncBits = 16 - ResBitDepth;
      OutRange = 65535;
    }

    const double PkOut = OutRange;
    const double TrMul =
        (TruncBits > 0 ? PkOut / (OutRange >> TruncBits) : 1.0);

    if (CDitherer ::isRecursive()) {
      // A recursive ditherer carries state from one scanline to the next,
      // hence scanlines are processed sequentially on the calling thread.

      td[0].getDitherer().init(NewWidth, Vars, TrMul, PkOut);

      if (Vars.UseSRGBGamma) {
        for (i = 0; i < NewHeight; i++) {
          fptype* const ResScanline = &ResBuf[(size_t)i * NewWidthE];

          CFilterStep ::applySRGBGamma(ResScanline, NewWidth, Vars);

          td[0].getDitherer().dither(ResScanline);

          CFilterStep ::unpackScanline(
              ResScanline, &NewBuf[(size_t)i * NewWidth * ElCountIO], NewWidth,
              Vars);
        }
      } else {
        for (i = 0; i < NewHeight; i++) {
          fptype* const ResScanline = &ResBuf[(size_t)i * NewWidthE];

          td[0].getDitherer().dither(ResScanline);

          CFilterStep ::unpackScanline(
              ResScanline, &NewBuf[(size_t)i * NewWidth * ElCountIO], NewWidth,
              Vars);
        }
      }
    } else {
      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopDitherAndUnpackH, NewHeight, NewWidth);

        td[i].getDitherer().init(NewWidth, Vars, TrMul, PkOut);
      }

      for (i = 0; i < NewHeight; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &ResBuf[(size_t)i * NewWidthE],
            &NewBuf[(size_t)i * NewWidth * ElCountIO]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
    }

    ThreadPool.removeAllWorkloads();
  }

 private:
  typedef typename fpclass ::fptype fptype;  ///< Floating-point type to use
                                             ///< during processing, taken
                                             ///< from "fpclass".
                                             ///<
  typedef typename fpclass ::CFilterStep
      CFilterStep;  ///< Filtering step
                    ///< class to use during processing, taken from
                    ///< "fpclass".
                    ///<
  typedef typename fpclass ::CDitherer CDitherer;  ///< Ditherer class to
                                                   ///< use during processing,
                                                   ///< taken from "fpclass".
                                                   ///<
  CImageResizerParams Params;  ///< Algorithm's parameters currently in use.
                               ///<
  int SrcBitDepth;             ///< Bit resolution of the source image. Equals
                               ///< ResBitDepth if 0 was passed to the
                               ///< constructor.
                               ///<
  int ResBitDepth;             ///< Bit resolution of the resulting image.
                               ///<
  CDSPFracFilterBankLin<fptype>
      FixedFilterBank;  ///< Fractional delay
                        ///< filter bank with fixed characteristics, mainly for
                        ///< upsizing cases. Initialized once by the
                        ///< constructor.
                        ///<

  /**
   * @brief Filtering steps array.
   *
   * The object of this class stores filtering steps together, as an array
   * of CFilterStep objects.
   */

  typedef CStructArray<CFilterStep> CFilterSteps;

  /**
   * Function initializes the filter bank in the specified resizing step
   * according to the source and resulting image bit depths.
   *
   * @param FltBank Filter bank to initialize.
   * @param CutoffMult Cutoff multiplier, 0 to 1. 1 corresponds to 0.5pi
   * cutoff point.
   * @param ForceHiOrder "True" if a high-order interpolation should be
   * forced which requires considerably less resources for initialization.
   * @param ExtFilter External filter to apply to interpolation filter.
   */

  void initFilterBank(CDSPFracFilterBankLin<fptype>& FltBank,
                      const double CutoffMult, const bool ForceHiOrder,
                      const CFltBuffer& ExtFilter) const {
    // The larger of the source and resulting bit depths defines the
    // signal-to-noise ratio the interpolation has to provide.

    int MaxBitDepth = SrcBitDepth;

    if (ResBitDepth > SrcBitDepth) {
      MaxBitDepth = ResBitDepth;
    }

    const double SNR = -6.02 * (MaxBitDepth + 3);

    // High-order (order 1, -146 dB max) interpolation is used when forced
    // or when more than 8 bits are processed; otherwise order 0 (-72 dB
    // max) is used.

    const bool IsHiOrder = (ForceHiOrder || MaxBitDepth > 8);
    const int UseOrder = (IsHiOrder ? 1 : 0);

    // Estimate the number of fractional delay filters sampled by the
    // filter bank. This number affects the signal-to-noise ratio at
    // interpolation stage. Theoretically, at UseOrder==1, 8-bit image
    // resizing requires 66.2 dB SNR or 11. 16-bit resizing requires
    // 114.4 dB SNR or 150. At UseOrder==0 the required number of filters
    // is exponentially higher.

    const double FracEst =
        (IsHiOrder ? 0.23134052 * exp(-0.058062929 * SNR)
                   : 0.33287686 * exp(-0.11334583 * SNR));

    int FracCount = (int)ceil(FracEst);

    if (FracCount < 2) {
      FracCount = 2;
    }

    FltBank.init(FracCount, UseOrder, Params.IntFltLen / CutoffMult,
                 Params.IntFltCutoff * CutoffMult, Params.IntFltAlpha,
                 ExtFilter, fpclass ::fpalign, fpclass ::elalign);
  }

  /**
   * Function allocates filter buffer taking "fpclass" alignments into
   * account. The allocated buffer may be larger than the requested size: in
   * this case the additional elements will be zeroed by this function.
   *
   * @param Flt Filter buffer.
   * @param ReqCapacity The required filter buffer's capacity.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter allocation.
   * @param FltExt If non-NULL this variable will receive the number of
   * elements the filter was extended by.
   */

  static void allocFilter(CBuffer<fptype>& Flt, const int ReqCapacity,
                          const bool IsModel = false,
                          int* const FltExt = NULL) {
    // Round the requested capacity up to a multiple of "elalign"; the
    // bit-mask form relies on "elalign" being a power of 2.

    const int AlignMask = fpclass ::elalign - 1;
    const int PaddedCapacity = (ReqCapacity + AlignMask) & ~AlignMask;
    const int PadCount = PaddedCapacity - ReqCapacity;

    if (FltExt != NULL) {
      *FltExt = PadCount;
    }

    if (!IsModel) {
      Flt.alloc(PaddedCapacity, fpclass ::fpalign);

      // Zero the elements the filter was extended by.

      for (int n = ReqCapacity; n < PaddedCapacity; n++) {
        Flt[n] = 0.0;
      }

      return;
    }

    // Modeling only: register the capacity without an actual allocation.

    Flt.forceCapacity(PaddedCapacity);
  }

  /**
   * Function assigns filter parameters to the specified filtering step
   * object.
   *
   * @param fs Filtering step to assign parameters to. This step cannot be
   * the last step if a ResampleFactor greater than 1 was specified.
   * @param IsUpsample "True" if upsampling step. Should be set to "false"
   * if FltCutoff is negative.
   * @param ResampleFactor Resampling factor of this filter (>=1).
   * @param FltCutoff Filter cutoff point. This value will be divided by the
   * ResampleFactor if IsUpsample equals "true". If zero value was
   * specified, the "half-band" predefined filter will be created. In this
   * case the ResampleFactor will modify the filter cutoff point.
   * @param DCGain DC gain to apply to the filter. Assigned to filtering
   * step's DCGain variable.
   * @param UseFltOrig "True" if the originally-designed filter should be
   * left in filtering step's FltOrig buffer. Otherwise it will be freed.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  void assignFilterParams(CFilterStep& fs, const bool IsUpsample,
                          const int ResampleFactor, const double FltCutoff,
                          const double DCGain, const bool UseFltOrig,
                          const bool IsModel) const {
    // Low-pass design parameters passed to the CDSPPeakedCosineLPF
    // constructor below and mirrored into fs.FltOrig.
    double FltAlpha;
    double Len2;
    double Freq;

    if (FltCutoff == 0.0) {
      // A zero cutoff selects the predefined "half-band" design: the HB*
      // parameters are used, with ResampleFactor scaling both the filter's
      // length and its cutoff frequency.
      const double m = 2.0 / ResampleFactor;
      FltAlpha = Params.HBFltAlpha;
      Len2 = 0.5 * Params.HBFltLen / m;
      Freq = AVIR_PI * Params.HBFltCutoff * m;
    } else {
      // General low-pass design: the length is inversely proportional to
      // the cutoff, the frequency is directly proportional to it.
      FltAlpha = Params.LPFltAlpha;
      Len2 = 0.25 * Params.LPFltBaseLen / FltCutoff;
      Freq = AVIR_PI * Params.LPFltCutoffMult * FltCutoff;
    }

    if (IsUpsample) {
      // An upsampling filter is ResampleFactor times longer, has a
      // ResampleFactor times lower cutoff, and a ResampleFactor times
      // higher DC gain.
      // NOTE(review): the gain boost presumably compensates for samples
      // inserted during upsampling - confirm against the upsampling routine.
      Len2 *= ResampleFactor;
      Freq /= ResampleFactor;
      fs.DCGain = DCGain * ResampleFactor;
    } else {
      fs.DCGain = DCGain;
    }

    // Store the design parameters alongside the original filter buffer.
    fs.FltOrig.Len2 = Len2;
    fs.FltOrig.Freq = Freq;
    fs.FltOrig.Alpha = FltAlpha;
    fs.FltOrig.DCGain = fs.DCGain;

    // Filter designer object: supplies the filter's length (FilterLen) and
    // the value used as the step's latency (fl2).
    CDSPPeakedCosineLPF w(Len2, Freq, FltAlpha);

    fs.IsUpsample = IsUpsample;
    fs.ResampleFactor = ResampleFactor;
    fs.FltLatency = w.fl2;

    int FltExt;  // Filter's extension due to fpclass :: elalign.

    if (IsModel) {
      // Modeling: only the capacity of fs.Flt is set, no filter is built.
      allocFilter(fs.Flt, w.FilterLen, true, &FltExt);

      if (UseFltOrig) {
        // Allocate a real buffer even in modeling mode since this
        // filter may be copied by the filter bank.

        fs.FltOrig.alloc(w.FilterLen);
        memset(&fs.FltOrig[0], 0, w.FilterLen * sizeof(fs.FltOrig[0]));
      }
    } else {
      // Build the filter in FltOrig first, then copy it into the (possibly
      // alignment-extended) fs.Flt buffer. The capacity of FltOrig is
      // re-read after optimizeFIRFilter() rather than taken from
      // w.FilterLen.
      fs.FltOrig.alloc(w.FilterLen);

      w.generateLPF(&fs.FltOrig[0], 1.0);
      optimizeFIRFilter(fs.FltOrig, fs.FltLatency);
      normalizeFIRFilter(&fs.FltOrig[0], fs.FltOrig.getCapacity(), fs.DCGain);

      allocFilter(fs.Flt, fs.FltOrig.getCapacity(), false, &FltExt);
      copyArray(&fs.FltOrig[0], &fs.Flt[0], fs.FltOrig.getCapacity());

      if (!UseFltOrig) {
        fs.FltOrig.free();
      }
    }

    if (IsUpsample) {
      // Number of filter elements located after position
      // "FltLatency + ResampleFactor", excluding the alignment extension.
      int l = fs.Flt.getCapacity() - fs.FltLatency - ResampleFactor - FltExt;

      allocFilter(fs.PrefixDC, l, IsModel);
      allocFilter(fs.SuffixDC, fs.FltLatency, IsModel);

      if (IsModel) {
        // Capacities are now known; tail contents are not needed.
        return;
      }

      // Create prefix and suffix "tails" used during upsampling.

      // PrefixDC starts as a copy of the filter's part beginning at
      // "FltLatency + ResampleFactor". Further parts of the filter, each
      // starting ResampleFactor elements later and being ResampleFactor
      // elements shorter, are then combined into it via addArray()
      // (presumably an element-wise accumulation into the destination).
      const fptype* ip = &fs.Flt[fs.FltLatency + ResampleFactor];
      copyArray(ip, &fs.PrefixDC[0], l);

      while (true) {
        ip += ResampleFactor;
        l -= ResampleFactor;

        if (l <= 0) {
          break;
        }

        addArray(ip, &fs.PrefixDC[0], l);
      }

      // SuffixDC starts as a copy of the first FltLatency filter elements.
      // The beginning of the filter is then combined into it repeatedly, at
      // destination offsets advancing by ResampleFactor and with the length
      // shrinking by the same amount.
      l = fs.FltLatency;
      fptype* op = &fs.SuffixDC[0];
      copyArray(&fs.Flt[0], op, l);

      while (true) {
        op += ResampleFactor;
        l -= ResampleFactor;

        if (l <= 0) {
          break;
        }

        addArray(&fs.Flt[0], op, l);
      }
    } else if (!UseFltOrig) {
      // A non-upsampling step that does not keep its original filter is
      // assigned the default number of additional edge pixels.
      fs.EdgePixelCount = fs.EdgePixelCountDef;
    }
  }

  /**
   * Function adds a correction filter that tries to achieve a linear
   * frequency response at all frequencies. The actual resulting response
   * may feature a slight damping of the highest frequencies since a
   * suitably short correction filter cannot fix steep high-frequency
   * damping.
   *
   * This function assumes that the resizing step is currently the last
   * step, even if it was not inserted yet: this allows placement of the
   * correction filter both before and after the resizing step.
   *
   * @param Steps Filtering steps.
   * @param bw Resulting bandwidth relative to the original bandwidth (which
   * is 1.0), usually 1/k. Should be <= 1.0.
   * @param IsPreCorrection "True" if the filtering step was already created
   * and it is first in the Steps array. "True" also adds edge pixels to
   * reduce edge artifacts.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  void addCorrectionFilter(CFilterSteps& Steps, const double bw,
                           const bool IsPreCorrection,
                           const bool IsModel) const {
    // With pre-correction the step object already exists at index 0;
    // otherwise a new step is appended to the array.
    CFilterStep& fs = (IsPreCorrection ? Steps[0] : Steps.add());
    fs.IsUpsample = false;
    fs.ResampleFactor = 1;
    fs.DCGain = 1.0;

    if (IsPreCorrection) {
      fs.EdgePixelCount = fs.EdgePixelCountDef;
    } else {
      fs.EdgePixelCount = 0;
    }

    if (IsModel) {
      // Modeling requires the filter's length only (fs.FltLatency is
      // passed to calcFilterLength(), presumably to receive the latency).
      const int ModelLen =
          CDSPFIREQ ::calcFilterLength(Params.CorrFltLen, fs.FltLatency);

      allocFilter(fs.Flt, ModelLen, true);
      return;
    }

    const int BinCount = 65;  // Frequency response bins to control.
    const int LastBin = BinCount - 1;

    // Adjustment introduced by all steps at all frequencies of interest,
    // initially neutral.
    CBuffer<double> Bins(BinCount);
    int b;

    for (b = 0; b < BinCount; b++) {
      Bins[b] = 1.0;
    }

    double curbw = 1.0;  // Bandwidth of the filter at the current step.
    double re;
    double im;

    // Pre-correction: skip this (first) step and visit all others.
    // Post-correction: skip this (last) step and visit all preceding ones.
    const int FirstStep = (IsPreCorrection ? 1 : 0);
    const int EndStep = Steps.getItemCount() - (IsPreCorrection ? 0 : 1);

    for (int si = FirstStep; si < EndStep; si++) {
      const CFilterStep& cur = Steps[si];

      if (cur.IsUpsample) {
        curbw *= cur.ResampleFactor;

        // An upsampling step that still holds its original filter is not
        // accounted for here.
        if (cur.FltOrig.getCapacity() > 0) {
          continue;
        }
      }

      const double dcg = 1.0 / cur.DCGain;  // DC gain correction.
      const fptype* Taps;
      int TapCount;

      if (cur.ResampleFactor != 0) {
        Taps = &cur.Flt[0];
        TapCount = cur.Flt.getCapacity();
      } else {
        // Resizing step: use filter 0 of its filter bank.
        Taps = cur.FltBank->getFilter(0);
        TapCount = cur.FltBank->getFilterLen();
      }

      // Divide each bin by the magnitude response of this step's filter
      // (with DC gain corrected), evaluated within the bounds of the
      // bandwidth of interest.

      for (b = 0; b < BinCount; b++) {
        const double th = AVIR_PI * bw / curbw * b / LastBin;

        calcFIRFilterResponse(Taps, TapCount, th, re, im);

        Bins[b] /= sqrt(re * re + im * im) * dcg;
      }

      if (!cur.IsUpsample && cur.ResampleFactor > 1) {
        curbw /= cur.ResampleFactor;
      }
    }

    // Design the correction filter from the accumulated bin gains.

    CDSPFIREQ EQ;
    EQ.init(bw * 2.0, Params.CorrFltLen, BinCount, 0.0, bw, false,
            Params.CorrFltAlpha);

    fs.FltLatency = EQ.getFilterLatency();

    // The filter's capacity is re-read on every use below, never cached
    // across the optimizeFIRFilter() call.
    CBuffer<double> Filter(EQ.getFilterLength());
    EQ.buildFilter(Bins, &Filter[0]);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);
    optimizeFIRFilter(Filter, fs.FltLatency);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);

    allocFilter(fs.Flt, Filter.getCapacity());
    copyArray(&Filter[0], &fs.Flt[0], Filter.getCapacity());

    // Debugging aid (disabled): prints the theoretically achieved final
    // frequency response at various feature sizes (from DC to 1 pixel).
    // Values above 255 mean features become brighter, values below 255
    // mean features become dimmer.

    /*
    const double sbw = ( bw > 1.0 ? 1.0 / bw : 1.0 );

    for( b = 0; b < BinCount; b++ )
    {
            const double th = AVIR_PI * sbw * b / LastBin;

            calcFIRFilterResponse( &fs.Flt[ 0 ], fs.Flt.getCapacity(),
                    th, re, im );

            printf( "%f\n", sqrt( re * re + im * im ) / Bins[ b ] * 255 );
    }

    printf( "***\n" );
    */
  }

  /**
   * Function adds a sharpening filter if the image is being upsized. Such
   * sharpening makes it possible to spot insufficient stop-band attenuation
   * of the interpolation filter: if attenuation is too weak, a "dark grid"
   * and other artifacts may become visible.
   *
   * It is assumed that 40 decibel stop-band attenuation should be
   * considered a required minimum: this allows application of (deliberately
   * strong) 64X sharpening without spotting any artifacts.
   *
   * @param Steps Filtering steps.
   * @param bw Resulting bandwidth relative to the original bandwidth (which
   * is 1.0), usually 1/k.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  static void addSharpenTest(CFilterSteps& Steps, const double bw,
                             const bool IsModel) {
    // The sharpening test applies only when "bw" exceeds 1.0.
    if (bw <= 1.0) {
      return;
    }

    const double FltLen = 10.0 * bw;

    CFilterStep& Sharp = Steps.add();
    Sharp.IsUpsample = false;
    Sharp.ResampleFactor = 1;
    Sharp.DCGain = 1.0;
    Sharp.EdgePixelCount = 0;

    if (IsModel) {
      // Modeling requires the filter's length only.
      const int ModelLen =
          CDSPFIREQ ::calcFilterLength(FltLen, Sharp.FltLatency);

      allocFilter(Sharp.Flt, ModelLen, true);
      return;
    }

    const int BinCount = 200;
    CBuffer<double> Bins(BinCount);

    // Index of the first boosted bin, limited to the bin count.
    int Thresh = (int)round(BinCount / bw * 1.75);

    if (Thresh > BinCount) {
      Thresh = BinCount;
    }

    // Unity gain below the threshold, 256X gain at and above it.
    for (int b = 0; b < BinCount; b++) {
      Bins[b] = (b < Thresh ? 1.0 : 256.0);
    }

    CDSPFIREQ EQ;
    EQ.init(bw * 2.0, FltLen, BinCount, 0.0, bw, false, 1.7);

    Sharp.FltLatency = EQ.getFilterLatency();

    // The filter's capacity is re-read on every use below, never cached
    // across the optimizeFIRFilter() call.
    CBuffer<double> Filter(EQ.getFilterLength());
    EQ.buildFilter(Bins, &Filter[0]);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);
    optimizeFIRFilter(Filter, Sharp.FltLatency);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);

    allocFilter(Sharp.Flt, Filter.getCapacity());
    copyArray(&Filter[0], &Sharp.Flt[0], Filter.getCapacity());

    // Debugging aid (disabled): prints the magnitude response of the
    // resulting filter.

    /*
    for( int b = 0; b < BinCount; b++ )
    {
            const double th = AVIR_PI * b / ( BinCount - 1 );
            double re;
            double im;

            calcFIRFilterResponse( &Sharp.Flt[ 0 ],
                    Sharp.Flt.getCapacity(), th, re, im );

            printf( "%f\n", sqrt( re * re + im * im ));
    }

    printf( "***\n" );
    */
  }

  /**
   * Function builds sequence of filtering steps depending on the specified
   * resizing coefficient. The last steps included are always the resizing
   * step then (possibly) the correction step.
   *
   * @param Steps Array that receives filtering steps.
   * @param[out] Vars Variables object.
   * @param FltBank Filter bank to initialize and use.
   * @param DCGain The overall DC gain to apply. This DC gain is applied to
   * the first filtering step only (upsampling or filtering step).
   * @param ModeFlags Build mode flags to use. This is a bitmap of switches
   * that enable or disable certain algorithm features.
   * @param IsModel "True" if filtering steps modeling is performed without
   * the actual filter allocation and building.
   */

  void buildFilterSteps(CFilterSteps& Steps, CImageResizerVars& Vars,
                        CDSPFracFilterBankLin<fptype>& FltBank,
                        const double DCGain, const int ModeFlags,
                        const bool IsModel) const {
    Steps.clear();

    // Decode the ModeFlags bitmap (bits 0, 1 and 2).
    const bool DoFltAndIntCombo =
        ((ModeFlags & 1) != 0);  // Do filter
                                 // and interpolator combining.
    const bool ForceHiOrderInt =
        ((ModeFlags & 2) != 0);  // Force use
                                 // of a higher-order interpolation.
    const bool UseHalfband = ((ModeFlags & 4) != 0);  // Use half-band
                                                      // filter.

    const double bw = 1.0 / Vars.k;  // Resulting bandwidth.
    // 2X upsampling is used whenever floor(k) is below 2; a factor of 1
    // means "no upsampling" and selects the downsampling path below.
    const int UpsampleFactor = ((int)floor(Vars.k) < 2 ? 2 : 1);
    double IntCutoffMult;     // Interpolation filter cutoff multiplier.
    CFilterStep* ReuseStep;   // If not NULL, resizing step should use
                              // this step object instead of creating a new one.
    CFilterStep* ExtFltStep;  // Use FltOrig of this step as the external
                              // filter to be applied to the interpolator.
    bool IsPreCorrection;     // "True" if the correction filter is applied
                              // first.
    double FltCutoff;         // Cutoff frequency of the first filtering step.
    double corrbw;            ///< Bandwidth at the correction step.

    if (Vars.k <= 1.0) {
      // The correction filter goes first: reserve the step at index 0 for
      // it now, addCorrectionFilter() fills it in at the end.
      IsPreCorrection = true;
      FltCutoff = 1.0;
      corrbw = 1.0;
      Steps.add();
    } else {
      // The correction filter is appended after the resizing step.
      IsPreCorrection = false;
      FltCutoff = bw;
      corrbw = bw;
    }

    // Add 1 upsampling or several downsampling filters.

    if (UpsampleFactor > 1) {
      CFilterStep& fs = Steps.add();
      assignFilterParams(fs, true, UpsampleFactor, FltCutoff, DCGain,
                         DoFltAndIntCombo, IsModel);

      IntCutoffMult = FltCutoff * 2.0 / UpsampleFactor;
      // The resizing step is always a separate step here; with filter and
      // interpolator combining the upsampling step provides the external
      // filter (its FltOrig).
      ReuseStep = NULL;
      ExtFltStep = (DoFltAndIntCombo ? &fs : NULL);
    } else {
      int DownsampleFactor;

      // Add half-band downsampling steps while they are required or
      // requested; every added step multiplies the remaining cutoff by its
      // downsampling factor.
      while (true) {
        DownsampleFactor = (int)floor(0.5 / FltCutoff);
        bool DoHBFltAdd;

        if (DownsampleFactor > 16) {
          // Add half-band filter unconditionally in order to keep
          // filter lengths lower for more precise frequency
          // response and less edge artifacts.

          DoHBFltAdd = true;
          DownsampleFactor = 16;
        } else {
          DoHBFltAdd = (UseHalfband && DownsampleFactor > 1);
        }

        if (DoHBFltAdd) {
          // A zero cutoff requests the predefined half-band filter.
          assignFilterParams(Steps.add(), false, DownsampleFactor, 0.0, 1.0,
                             false, IsModel);

          FltCutoff *= DownsampleFactor;
        } else {
          // The final low-pass step uses a factor of at least 1.
          if (DownsampleFactor < 1) {
            DownsampleFactor = 1;
          }

          break;
        }
      }

      // The final low-pass (and possibly downsampling) step; the overall
      // DC gain is applied here.
      CFilterStep& fs = Steps.add();
      assignFilterParams(fs, false, DownsampleFactor, FltCutoff, DCGain,
                         DoFltAndIntCombo, IsModel);

      IntCutoffMult = FltCutoff / 0.5;

      if (DoFltAndIntCombo) {
        // The low-pass step itself becomes the resizing step, and its
        // FltOrig is used as the external filter of the filter bank.
        ReuseStep = &fs;
        ExtFltStep = &fs;
      } else {
        // The low-pass step stays separate; the interpolator's cutoff
        // multiplier is scaled by the step's downsampling factor.
        IntCutoffMult *= DownsampleFactor;
        ReuseStep = NULL;
        ExtFltStep = NULL;
      }
    }

    // Insert resizing and correction steps.

    CFilterStep& fs = (ReuseStep == NULL ? Steps.add() : *ReuseStep);

    // A ResampleFactor of 0 marks the resizing (interpolation) step.
    Vars.ResizeStep = Steps.getItemCount() - 1;
    fs.IsUpsample = false;
    fs.ResampleFactor = 0;
    fs.DCGain = (ExtFltStep == NULL ? 1.0 : ExtFltStep->DCGain);

    // Without an external filter step the resizing step's own FltOrig
    // object is passed instead.
    initFilterBank(FltBank, IntCutoffMult, ForceHiOrderInt,
                   (ExtFltStep == NULL ? fs.FltOrig : ExtFltStep->FltOrig));

    // Use FixedFilterBank when the initialized bank compares equal to it,
    // otherwise use the caller's bank.
    // NOTE(review): the cast presumably removes the constness that
    // FixedFilterBank has inside this const function - confirm.
    if (FltBank == FixedFilterBank) {
      fs.FltBank = (CDSPFracFilterBankLin<fptype>*)&FixedFilterBank;
    } else {
      fs.FltBank = &FltBank;
    }

    addCorrectionFilter(Steps, corrbw, IsPreCorrection, IsModel);

    // addSharpenTest( Steps, bw, IsModel );
  }

  /**
   * Function extends *this upsampling step so that it produces more
   * upsampled pixels that cover the prefix and suffix needs of the next
   * step. After the call to this function the InPrefix and InSuffix
   * variables of the next step will be set to zero.
   *
   * @param fs Upsampling filtering step.
   * @param NextStep The next step structure.
   */

  static void extendUpsample(CFilterStep& fs, CFilterStep& NextStep) {
    // Input pixel counts needed to cover the next step's prefix and suffix
    // (rounded up to whole input pixels).
    const int PrefixPixels =
        (NextStep.InPrefix + fs.ResampleFactor - 1) / fs.ResampleFactor;

    const int SuffixPixels =
        (NextStep.InSuffix + fs.ResampleFactor - 1) / fs.ResampleFactor;

    fs.InPrefix = PrefixPixels;
    fs.InSuffix = SuffixPixels;

    // Every additional input pixel yields ResampleFactor output pixels.
    fs.OutPrefix += PrefixPixels * fs.ResampleFactor;
    fs.OutSuffix += SuffixPixels * fs.ResampleFactor;

    // The next step's needs are now covered by this step's output.
    NextStep.InPrefix = 0;
    NextStep.InSuffix = 0;
  }

  /**
   * Function fills resizing step's RPosBuf array, excluding the actual
   * "ftp" pointers and "SrcOffs" offsets.
   *
   * This array should be cleared if the resizing step or offset were
   * changed. Otherwise this function only fills the elements required to
   * cover resizing step's OutLen.
   *
   * This function is called by the updateFilterStepBuffers() function.
   *
   * @param fs Resizing step.
   * @param Vars Variables object.
   */

  static void fillRPosBuf(CFilterStep& fs, const CImageResizerVars& Vars) {
    // Elements below the buffer's previous capacity are treated as already
    // filled; only the elements from there up to OutLen are written.
    const int FilledCount = fs.RPosBuf->getCapacity();

    if (FilledCount < fs.OutLen) {
      fs.RPosBuf->increaseCapacity(fs.OutLen);
    }

    const int FracCount = fs.FltBank->getFracCount();
    const double Offset = Vars.o;
    const double Step = Vars.k;

    for (int i = FilledCount; i < fs.OutLen; i++) {
      typename CFilterStep ::CResizePos& Pos = (*fs.RPosBuf)[i];

      // Source position of the output pixel "i", split into the integer
      // part and the fractional part scaled by the number of fractional
      // delay filters.
      const double SrcPos = Offset + Step * i;
      const int SrcPosInt = (int)floor(SrcPos);
      const double ScaledFrac = (SrcPos - SrcPosInt) * FracCount;
      const int FilterIndex = (int)ScaledFrac;

      Pos.x = (typename fpclass ::fptypeatom)(ScaledFrac - FilterIndex);
      Pos.fti = FilterIndex;
      Pos.SrcPosInt = SrcPosInt;
    }
  }

  /**
   * Function updates filtering step buffer lengths depending on the
   * specified source and new scanline lengths. This function should be
   * called after the buildFilterSteps() function.
   *
   * @param Steps Filtering steps whose buffer-related variables are updated.
   * @param[out] Vars Variables object, will receive buffer size and length.
   * This function expects "k" and "o" variable values that will be
   * adjusted by this function.
   * @param RPosBufArray Resizing position buffers array, used to obtain
   * buffer to initialize and use (will be reused if it is already fully or
   * partially filled).
   * @param SrcLen Source scanline's length in pixels.
   * @param NewLen New scanline's length in pixels.
   */

  static void updateFilterStepBuffers(
      CFilterSteps& Steps, CImageResizerVars& Vars,
      typename CFilterStep ::CRPosBufArray& RPosBufArray, int SrcLen,
      const int NewLen) {
    int upstep = -1;  // Index of the upsampling step, -1 if there is none.
    int InBuf = 0;    // Input buffer index of the current step.
    int i;

    // Walk the steps in processing order; each step's output length
    // becomes the next step's input length.
    for (i = 0; i < Steps.getItemCount(); i++) {
      CFilterStep& fs = Steps[i];

      fs.Vars = &Vars;
      fs.InLen = SrcLen;
      fs.InBuf = InBuf;
      fs.OutBuf = (InBuf + 1) & 1;  // Alternates between buffers 0 and 1.

      if (fs.IsUpsample) {
        // Upsampling step: the resizing step and offset are scaled up by
        // the resampling factor.
        upstep = i;
        Vars.k *= fs.ResampleFactor;
        Vars.o *= fs.ResampleFactor;
        fs.InPrefix = 0;
        fs.InSuffix = 0;
        fs.OutLen = fs.InLen * fs.ResampleFactor;
        fs.OutPrefix = fs.FltLatency;
        fs.OutSuffix = fs.Flt.getCapacity() - fs.FltLatency - fs.ResampleFactor;

        // Make sure the output region is long enough to also hold the
        // SuffixDC tail placed after the upsampled pixels.
        int l0 = fs.OutPrefix + fs.OutLen + fs.OutSuffix;
        int l = fs.InLen * fs.ResampleFactor + fs.SuffixDC.getCapacity();

        if (l > l0) {
          fs.OutSuffix += l - l0;
        }

        // Likewise, make sure OutLen + OutSuffix is not shorter than the
        // PrefixDC tail.
        l0 = fs.OutLen + fs.OutSuffix;

        if (fs.PrefixDC.getCapacity() > l0) {
          fs.OutSuffix += fs.PrefixDC.getCapacity() - l0;
        }
      } else if (fs.ResampleFactor == 0) {
        // Resizing step: derive the number of source pixels required
        // before position 0 and past the input's end from the filter
        // bank's filter length and from the first and last source
        // positions.
        const int FilterLenD2 = fs.FltBank->getFilterLen() / 2;
        const int FilterLenD21 = FilterLenD2 - 1;

        const int ResizeLPix = (int)floor(Vars.o) - FilterLenD21;
        fs.InPrefix = (ResizeLPix < 0 ? -ResizeLPix : 0);
        const int ResizeRPix =
            (int)floor(Vars.o + (NewLen - 1) * Vars.k) + FilterLenD2 + 1;

        fs.InSuffix = (ResizeRPix > fs.InLen ? ResizeRPix - fs.InLen : 0);

        fs.OutLen = NewLen;
        // Obtain the position buffer associated with these "k", "o" and
        // fractional filter count values, then fill its missing elements.
        fs.RPosBuf = &RPosBufArray.getRPosBuf(Vars.k, Vars.o,
                                              fs.FltBank->getFracCount());

        fillRPosBuf(fs, Vars);
      } else {
        // Filtering (and possibly downsampling) step: the resizing step
        // and offset are scaled down by the resampling factor, and the
        // offset is shifted by the additional edge pixels.
        Vars.k /= fs.ResampleFactor;
        Vars.o /= fs.ResampleFactor;
        Vars.o += fs.EdgePixelCount;

        fs.InPrefix = fs.FltLatency;
        fs.InSuffix = fs.Flt.getCapacity() - fs.FltLatency - 1;

        // Additionally extend OutLen to produce more precise edge
        // pixels.

        fs.OutLen = (fs.InLen + fs.ResampleFactor - 1) / fs.ResampleFactor +
                    fs.EdgePixelCount;

        // Input pixels required past the input's end by the last output
        // pixel counted so far.
        fs.InSuffix += (fs.OutLen - 1) * fs.ResampleFactor + 1 - fs.InLen;

        // Edge pixels are also added at the beginning: more input prefix
        // is required, and OutLen grows by EdgePixelCount once more.
        fs.InPrefix += fs.EdgePixelCount * fs.ResampleFactor;
        fs.OutLen += fs.EdgePixelCount;
      }

      InBuf = fs.OutBuf;
      SrcLen = fs.OutLen;
    }

    // Output buffer index 2 marks the last step (the one that writes the
    // final result; see updateBufLenAndRPosPtrs()).
    Steps[Steps.getItemCount() - 1].OutBuf = 2;

    // Let the upsampling step produce the prefix and suffix pixels needed
    // by the step that follows it.
    if (upstep != -1) {
      extendUpsample(Steps[upstep], Steps[upstep + 1]);
    }
  }

  /**
   * Function calculates an optimal intermediate buffer length that will
   * cover all needs of the specified filtering steps. This function should
   * be called after the updateFilterStepBuffers() function.
   *
   * Function also updates resizing step's RPosBuf pointers to the filter
   * bank and SrcOffs values.
   *
   * @param Steps Filtering steps.
   * @param[out] Vars Variables object, will receive buffer size and length.
   * @param ResElIncr Resulting (final) element increment, used to produce
   * de-interleaved result. For horizontal processing this value is equal
   * to last step's OutLen, for vertical processing this value is equal to
   * resulting image's width.
   */

  static void updateBufLenAndRPosPtrs(CFilterSteps& Steps,
                                      CImageResizerVars& Vars,
                                      const int ResElIncr) {
    // Largest prefix and largest "length + suffix" required of each of the
    // two intermediate buffers.
    int PrefixNeed[2] = {0, 0};
    int LenNeed[2] = {0, 0};
    const int StepCount = Steps.getItemCount();
    int i;

    for (i = 0; i < StepCount; i++) {
      CFilterStep& fs = Steps[i];

      // Input side requirements.
      const int ib = fs.InBuf;
      const int InSpan = fs.InLen + fs.InSuffix;

      if (PrefixNeed[ib] < fs.InPrefix) {
        PrefixNeed[ib] = fs.InPrefix;
      }

      if (LenNeed[ib] < InSpan) {
        LenNeed[ib] = InSpan;
      }

      fs.InElIncr = fs.InPrefix + InSpan;

      // The last step does not write to an intermediate buffer.
      if (fs.OutBuf == 2) {
        break;
      }

      // Output side requirements: only upsampling steps use an output
      // prefix and suffix.
      const int ob = fs.OutBuf;
      const int OutSpan = fs.OutLen + (fs.IsUpsample ? fs.OutSuffix : 0);

      if (fs.IsUpsample && PrefixNeed[ob] < fs.OutPrefix) {
        PrefixNeed[ob] = fs.OutPrefix;
      }

      if (LenNeed[ob] < OutSpan) {
        LenNeed[ob] = OutSpan;
      }
    }

    // Update OutElIncr values of all steps: a step's output increment
    // equals the next step's input increment. An upsampling step raises
    // the next step's input increment to its own total output span first,
    // if that span is larger.

    for (i = 0; i < StepCount; i++) {
      CFilterStep& fs = Steps[i];

      if (fs.OutBuf == 2) {
        fs.OutElIncr = ResElIncr;
        break;
      }

      CFilterStep& Next = Steps[i + 1];

      if (fs.IsUpsample) {
        const int Produced = fs.OutPrefix + fs.OutLen + fs.OutSuffix;

        if (Produced > Next.InElIncr) {
          Next.InElIncr = Produced;
        }
      }

      fs.OutElIncr = Next.InElIncr;
    }

    // Update temporary buffers' lengths and offsets. The length is always
    // scaled by the element count, the offset only if packmode equals 0.

    for (i = 0; i < 2; i++) {
      const int OffsMult = (Vars.packmode == 0 ? Vars.ElCount : 1);

      Vars.BufOffs[i] = PrefixNeed[i] * OffsMult;
      Vars.BufLen[i] = (PrefixNeed[i] + LenNeed[i]) * Vars.ElCount;
    }

    // Update RPosBuf filter pointers and source offsets of the resizing
    // step.

    CFilterStep& rs = Steps[Vars.ResizeStep];
    const int em = (fpclass ::packmode == 0 ? Vars.ElCount : 1);
    const int FilterLenD21 = rs.FltBank->getFilterLen() / 2 - 1;

    for (i = 0; i < rs.OutLen; i++) {
      typename CFilterStep ::CResizePos& Pos = (*rs.RPosBuf)[i];

      Pos.ftp = rs.FltBank->getFilter(Pos.fti);
      Pos.SrcOffs = (Pos.SrcPosInt - FilterLenD21) * em;
    }
  }

  /**
   * Function modifies the overall (DC) gain of the correction filter in the
   * pre-built filtering steps array.
   *
   * @param Steps Filtering steps.
   * @param m Multiplier to apply to the correction filter.
   */

  void modifyCorrFilterDCGain(CFilterSteps& Steps, const double m) const {
    // The correction filter is taken from the last step when that step is
    // a plain filtering step (not an upsampling step, resampling factor of
    // 1); otherwise it is taken from the first step.
    CFilterStep& Last = Steps[Steps.getItemCount() - 1];
    const bool IsLastCorr = (!Last.IsUpsample && Last.ResampleFactor == 1);
    CBuffer<fptype>& Corr = (IsLastCorr ? Last.Flt : Steps[0].Flt);

    // Scale every filter element, performing the multiplication in double
    // precision.
    const int Count = Corr.getCapacity();

    for (int i = 0; i < Count; i++) {
      Corr[i] = (fptype)((double)Corr[i] * m);
    }
  }

  /**
   * Function builds a map of used fractional delay filters based on the
   * resizing positions buffer.
   *
   * @param fs Resizing step.
   * @param[out] UsedFracMap Map of used fractional delay filters.
   */

  static void fillUsedFracMap(const CFilterStep& fs,
                              CBuffer<uint8_t>& UsedFracMap) {
    const int FracCount = fs.FltBank->getFracCount();
    UsedFracMap.increaseCapacity(FracCount, false);
    memset(&UsedFracMap[0], 0, FracCount * sizeof(UsedFracMap[0]));

    typename CFilterStep ::CResizePos* rpos = &(*fs.RPosBuf)[0];
    int i;

    for (i = 0; i < fs.OutLen; i++) {
      UsedFracMap[rpos->fti] |= 1;
      rpos++;
    }
  }

  /**
   * Function calculates the overall filtering steps complexity per
   * scanline. Each complexity unit corresponds to a single multiply-add
   * operation. Data copy and pointer math operations are not included in
   * this calculation, it is assumed that they correlate to the multiply-add
   * operations. Calculation also does not include final rounding, dithering
   * and clamping operations since they cannot be optimized out anyway.
   *
   * Calculation of the CRPosBuf buffer is not included since it cannot be
   * avoided.
   *
   * This function should be called after the updateFilterStepBuffers()
   * function.
   *
   * @param Steps Filtering steps array.
   * @param Vars Variables object.
   * @param UsedFracMap The map of used fractional delay filters.
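   * @param ScanlineCount Scanline count. Must be non-zero: the one-time
   * filter creation complexity is divided by this value.
   * @return Per-scanline complexity plus the one-time filter creation
   * complexity divided by ScanlineCount.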
   */

  static int calcComplexity(const CFilterSteps& Steps,
                            const CImageResizerVars& Vars,
                            const CBuffer<uint8_t>& UsedFracMap,
                            const int ScanlineCount) {
    int fcnum;    // Filter complexity multiplier numerator.
    int fcdenom;  // Filter complexity multiplier denominator.

    if (Vars.packmode != 0) {
      fcnum = 1;
      fcdenom = 1;
    } else {
      // In interleaved processing mode, filters require 1 less
      // multiplication per 2 multiply-add instructions.

      fcnum = 3;
      fcdenom = 4;
    }

    int s = 0;   // Complexity per one scanline.
    int s2 = 0;  // Complexity per all scanlines.
    int i;

    for (i = 0; i < Steps.getItemCount(); i++) {
      const CFilterStep& fs = Steps[i];

      s2 += 65 * fs.Flt.getCapacity();  // Filter creation complexity.

      if (fs.IsUpsample) {
        if (fs.FltOrig.getCapacity() > 0) {
          continue;
        }

        s += (fs.Flt.getCapacity() * (fs.InPrefix + fs.InLen + fs.InSuffix) +
              fs.SuffixDC.getCapacity() + fs.PrefixDC.getCapacity()) *
             Vars.ElCount;
      } else if (fs.ResampleFactor == 0) {
        s += fs.FltBank->getFilterLen() *
             (fs.FltBank->getOrder() + Vars.ElCount) * fs.OutLen;

        s2 += fs.FltBank->calcInitComplexity(UsedFracMap);
      } else {
        s += fs.Flt.getCapacity() * Vars.ElCount * fs.OutLen * fcnum / fcdenom;
      }
    }

    return (s + s2 / ScanlineCount);
  }

  /**
   * @brief Thread-isolated data used for scanline processing.
   *
   * This structure holds data necessary for image's horizontal or vertical
   * scanline processing, including scanline processing queue.
   *
   * @tparam Tin Source element data type. Intermediate buffers store data
   * in floating point format.
   * @tparam Tout Destination element data type. Intermediate buffers store
   * data in floating point format.
   */

  template <class Tin, class Tout>
  class CThreadData : public CImageResizerThreadPool ::CWorkload {
   public:
    virtual void process() { processScanlineQueue(); }

    /**
     * This enumeration lists possible scanline operations.
     */

    enum EScanlineOperation {
      sopResizeH,           ///< Resize horizontal scanline.
                            ///<
      sopResizeV,           ///< Resize vertical scanline.
                            ///<
      sopDitherAndUnpackH,  ///< Dither and unpack horizontal scanline.
                            ///<
      sopUnpackH            ///< Unpack horizontal scanline.
                            ///<
    };

    /**
     * Function initializes *this thread data object and assigns certain
     * variables provided by the higher level code.
     *
     * @param aThreadIndex Index of this thread data (0-based).
     * @param aThreadCount Total number of threads used during processing.
     * @param aSteps Filtering steps.
     * @param aVars Image resizer variables.
     */

    void init(const int aThreadIndex, const int aThreadCount,
              const CFilterSteps& aSteps, const CImageResizerVars& aVars) {
      ThreadIndex = aThreadIndex;
      ThreadCount = aThreadCount;
      Steps = &aSteps;
      Vars = &aVars;
    }

    /**
     * Function initializes scanline processing queue, and updates
     * capacities of intermediate buffers.
     *
     * @param aOp Operation to perform over scanline.
     * @param TotalLines The total number of scanlines that will be
     * processed by all threads.
     * @param aSrcLen Source scanline length in pixels.
     * @param aSrcIncr Source scanline buffer increment. Ignored in
     * horizontal scanline processing.
     * @param aResIncr Resulting scanline buffer increment. Ignored in
     * horizontal scanline processing.
     */

    void initScanlineQueue(const EScanlineOperation aOp, const int TotalLines,
                           const int aSrcLen, const int aSrcIncr = 0,
                           const int aResIncr = 0) {
      const int l = Vars->BufLen[0] + Vars->BufLen[1];

      if (Bufs.getCapacity() < l) {
        Bufs.alloc(l, fpclass ::fpalign);
      }

      BufPtrs[0] = Bufs + Vars->BufOffs[0];
      BufPtrs[1] = Bufs + Vars->BufLen[0] + Vars->BufOffs[1];

      int j;
      int ml = 0;

      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];

        if (fs.ResampleFactor == 0 && ml < fs.FltBank->getFilterLen()) {
          ml = fs.FltBank->getFilterLen();
        }
      }

      TmpFltBuf.alloc(ml, fpclass ::fpalign);
      ScanlineOp = aOp;
      SrcLen = aSrcLen;
      SrcIncr = aSrcIncr;
      ResIncr = aResIncr;
      QueueLen = 0;
      Queue.increaseCapacity((TotalLines + ThreadCount - 1) / ThreadCount,
                             false);
    }

    /**
     * Function adds a scanline to the queue buffer. The
     * initScanlineQueue() function should be called before calling this
     * function. The number of calls to this add function should not
     * exceed the TotalLines spread over all threads.
     *
     * @param SrcBuf Source scanline buffer.
     * @param ResBuf Resulting scanline buffer.
     */

    void addScanlineToQueue(void* const SrcBuf, void* const ResBuf) {
      Queue[QueueLen].SrcBuf = SrcBuf;
      Queue[QueueLen].ResBuf = ResBuf;
      QueueLen++;
    }

    /**
     * Function processes all queued scanlines.
     */

    void processScanlineQueue() {
      int i;

      switch (ScanlineOp) {
        case sopResizeH: {
          for (i = 0; i < QueueLen; i++) {
            resizeScanlineH((Tin*)Queue[i].SrcBuf, (fptype*)Queue[i].ResBuf);
          }

          break;
        }

        case sopResizeV: {
          for (i = 0; i < QueueLen; i++) {
            resizeScanlineV((fptype*)Queue[i].SrcBuf, (fptype*)Queue[i].ResBuf);
          }

          break;
        }

        case sopDitherAndUnpackH: {
          if (Vars->UseSRGBGamma) {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::applySRGBGamma((fptype*)Queue[i].SrcBuf, SrcLen,
                                           *Vars);

              Ditherer.dither((fptype*)Queue[i].SrcBuf);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          } else {
            for (i = 0; i < QueueLen; i++) {
              Ditherer.dither((fptype*)Queue[i].SrcBuf);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          }

          break;
        }

        case sopUnpackH: {
          if (Vars->UseSRGBGamma) {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::applySRGBGamma((fptype*)Queue[i].SrcBuf, SrcLen,
                                           *Vars);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          } else {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          }

          break;
        }
      }
    }

    /**
     * Function returns ditherer object associated with *this thread data
     * object.
     */

    CDitherer& getDitherer() {
      // Gives the caller mutable access to the member ditherer object.
      return Ditherer;
    }

   private:
    int ThreadIndex;                ///< Thread index.
                                    ///<
    int ThreadCount;                ///< Thread count; used as the divisor
                                    ///< when sizing the scanline queue.
                                    ///<
    const CFilterSteps* Steps;      ///< Filtering steps, executed in order
                                    ///< for every resized scanline.
                                    ///<
    const CImageResizerVars* Vars;  ///< Image resizer variables (read-only).
                                    ///<
    CBuffer<fptype> Bufs;           ///< Single allocation that holds both
                                    ///< flip-flop intermediate buffers.
                                    ///<
    fptype* BufPtrs[3];  ///< Buffer pointers indexed by a filtering step's
                         ///< InBuf and OutBuf values. Elements 0 and 1 point
                         ///< into Bufs, element 2 is set to the result
                         ///< buffer of the scanline being resized.
                         ///<
    CBuffer<fptype>
        TmpFltBuf;  ///< Temporary buffer passed to the doResize() function,
                    ///< aligned by fpclass :: fpalign. Sized to the longest
                    ///< filter among steps whose ResampleFactor is 0.
                    ///<
    EScanlineOperation ScanlineOp;  ///< Operation that
                                    ///< processScanlineQueue() performs over
                                    ///< each queued scanline.
                                    ///<
    int SrcLen;   ///< Source scanline length in the last queue.
                  ///<
    int SrcIncr;  ///< Source scanline buffer increment in the last queue;
                  ///< passed to convertVtoH() by resizeScanlineV().
                  ///<
    int ResIncr;  ///< Resulting scanline buffer increment in the last
                  ///< queue; used by resizeScanlineV() for steps that
                  ///< write to buffer 2.
                  ///<
    CDitherer Ditherer;  ///< Ditherer object to use.
                         ///<

    /**
     * @brief Scanline processing queue item.
     *
     * Scanline processing queue item.
     */

    struct CQueueItem {
      void* SrcBuf;  ///< Source scanline buffer; processScanlineQueue()
                     ///< typecasts it to Tin* (horizontal resizing) or to
                     ///< fptype* (all other operations).
                     ///<
      void* ResBuf;  ///< Resulting scanline buffer; processScanlineQueue()
                     ///< typecasts it to fptype* (resizing operations) or
                     ///< to Tout* (unpacking operations).
                     ///<
    };

    CBuffer<CQueueItem> Queue;  ///< Scanline processing queue.
                                ///<
    int QueueLen;               ///< Number of items currently stored in
                                ///< Queue; incremented by
                                ///< addScanlineToQueue().
                                ///<

    /**
     * Function resizes a single horizontal scanline.
     *
     * @param SrcBuf Source scanline buffer. Can be either horizontal or
     * vertical.
     * @param ResBuf Resulting scanline buffer.
     */

    void resizeScanlineH(const Tin* const SrcBuf, fptype* const ResBuf) {
      (*Steps)[0].packScanline(SrcBuf, BufPtrs[0], SrcLen);
      BufPtrs[2] = ResBuf;
      int j;

      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];
        fs.prepareInBuf(BufPtrs[fs.InBuf]);
        const int DstIncr = (Vars->packmode == 0 ? Vars->ElCount : 1);

        if (fs.ResampleFactor != 0) {
          if (fs.IsUpsample) {
            fs.doUpsample(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf]);
          } else {
            fs.doFilter(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr);
          }
        } else {
          fs.doResize(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr,
                      TmpFltBuf);
        }
      }
    }

    /**
     * Function resizes a single vertical scanline.
     *
     * @param SrcBuf Source scanline buffer. Can be either horizontal or
     * vertical.
     * @param ResBuf Resulting scanline buffer.
     */

    void resizeScanlineV(const fptype* const SrcBuf, fptype* const ResBuf) {
      (*Steps)[0].convertVtoH(SrcBuf, BufPtrs[0], SrcLen, SrcIncr);

      BufPtrs[2] = ResBuf;
      int j;

      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];
        fs.prepareInBuf(BufPtrs[fs.InBuf]);
        const int DstIncr =
            (fs.OutBuf == 2 ? ResIncr
                            : (Vars->packmode == 0 ? Vars->ElCount : 1));

        if (fs.ResampleFactor != 0) {
          if (fs.IsUpsample) {
            fs.doUpsample(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf]);
          } else {
            fs.doFilter(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr);
          }
        } else {
          fs.doResize(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr,
                      TmpFltBuf);
        }
      }
    }
  };
};

#undef AVIR_PI
#undef AVIR_PId2

}  // namespace avir

#endif  // AVIR_CIMAGERESIZER_INCLUDED
//$ nobt
//$ nocpp

/**
 * @file avir.h
 *
 * @brief The "main" inclusion file with all required classes and functions.
 *
 * This is the "main" inclusion file for the "AVIR" image resizer. This
 * inclusion file contains implementation of the AVIR image resizing algorithm
 * in its entirety. Also includes several classes and functions that can be
 * useful elsewhere.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * @mainpage
 *
 * @section intro_sec Introduction
 *
 * Description is available at https://github.com/avaneev/avir
 *
 * AVIR is devoted to women. Your digital photos can look good at any size!
 *
 * @section license License
 *
 * AVIR License Agreement
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Please credit the author of this library in your documentation in the
 * following way: "AVIR image resizing algorithm designed by Aleksey Vaneev"
 *
 * @version 2.4
 */

#ifndef AVIR_CIMAGERESIZER_INCLUDED
#define AVIR_CIMAGERESIZER_INCLUDED

#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

namespace avir {

/**
 * The macro defines AVIR version string.
 */

#define AVIR_VERSION "2.4"

/**
 * The macro equals to "pi" constant, fills 53-bit floating point mantissa.
 * Undefined at the end of file.
 */

#define AVIR_PI 3.1415926535897932

/**
 * The macro equals to "pi divided by 2" constant, fills 53-bit floating
 * point mantissa. Undefined at the end of file.
 */

#define AVIR_PId2 1.5707963267948966

/**
 * Rounding function, based on the (int) typecast. Biased result. Not suitable
 * for numbers >= 2^31.
 *
 * @param d Value to round.
 * @return Rounded value. Some bias may be introduced.
 */

template <class T>
inline T round(const T d) {
  // The (int) typecast truncates toward zero, so half is added to the
  // magnitude first; half-way cases therefore round away from zero.
  if (d < 0.0) {
    return (-(T)(int)((T)0.5 - d));
  }

  return ((T)(int)(d + (T)0.5));
}

/**
 * Template function "clamps" (clips) the specified value so that it is not
 * lesser than "minv", and not greater than "maxv".
 *
 * @param Value Value to clamp.
 * @param minv Minimal allowed value.
 * @param maxv Maximal allowed value.
 * @return The clamped value.
 */

template <class T>
inline T clamp(const T& Value, const T minv, const T maxv) {
  // The lower bound is tested first, so it takes precedence if the bounds
  // are ever passed in reversed order.
  return (Value < minv ? minv : (Value > maxv ? maxv : Value));
}

/**
 * Power 2.4 approximation function, designed for sRGB gamma correction.
 *
 * @param x Argument, in the range 0.09 to 1.
 * @return Value raised into power 2.4, approximate.
 */

template <class T>
inline T pow24_sRGB(const T x) {
  // The powers of x are stored as double, independent of T.
  const double x2 = x * x;
  const double x3 = x2 * x;
  const double x4 = x2 * x2;

  // Sum of a polynomial in x (up to the 4th power) and a single rational
  // term in x. The result is typecast back to T once, at the very end.
  return ((T)(0.0985766365536824 + 0.839474952656502 * x2 +
              0.363287814061725 * x3 -
              0.0125559718896615 / (0.12758338921578 + 0.290283465468235 * x) -
              0.231757513261358 * x - 0.0395365717969074 * x4));
}

/**
 * Power 1/2.4 approximation function, designed for sRGB gamma correction.
 *
 * @param x Argument, in the range 0.003 to 1.
 * @return Value raised into power 1/2.4, approximate.
 */

template <class T>
inline T pow24i_sRGB(const T x) {
  // Successive square roots: sx = x^(1/2), ssx = x^(1/4), sssx = x^(1/8).
  const double sx = sqrt(x);
  const double ssx = sqrt(sx);
  const double sssx = sqrt(ssx);

  // Weighted sum of x and products of the roots above. The result is
  // typecast back to T once, at the very end.
  return ((T)(0.000213364515060263 + 0.0149409239419218 * x +
              0.433973412731747 * sx +
              ssx * (0.659628181609715 * sssx - 0.0380957908841466 -
                     0.0706476137208521 * sx)));
}

/**
 * Function approximately linearizes the sRGB gamma value.
 *
 * @param s sRGB gamma value, in the range 0 to 1.
 * @return Linearized sRGB gamma value, approximated.
 */

template <class T>
inline T convertSRGB2Lin(const T s) {
  // Values at or below the threshold lie on the linear segment.
  if (s <= (T)0.04045) {
    return (s / (T)12.92);
  }

  // Remaining values go through the power 2.4 approximation.
  const T offs = (T)0.055;

  return (pow24_sRGB((s + offs) / ((T)1 + offs)));
}

/**
 * Function approximately de-linearizes the linear gamma value.
 *
 * @param s Linear gamma value, in the range 0 to 1.
 * @return sRGB gamma value, approximated.
 */

template <class T>
inline T convertLin2SRGB(const T s) {
  // Values at or below the threshold lie on the linear segment.
  if (s <= (T)0.0031308) {
    return ((T)12.92 * s);
  }

  // Remaining values go through the power 1/2.4 approximation.
  const T offs = (T)0.055;

  return (((T)1 + offs) * pow24i_sRGB(s) - offs);
}

/**
 * Function converts (via typecast) specified array of type T1 values of
 * length l into array of type T2 values. If T1 is the same as T2, copy
 * operation is performed. When copying data at overlapping address spaces,
 * "op" should be lower than "ip".
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer.
 * @param l The number of elements to copy.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template <class T1, class T2>
inline void copyArray(const T1* ip, T2* op, int l, const int ipinc = 1,
                      const int opinc = 1) {
  // Elements are processed front to back, one typecast per element; a
  // non-positive "l" copies nothing.
  for (; l > 0; l--) {
    *op = (T2)*ip;
    ip += ipinc;
    op += opinc;
  }
}

/**
 * Function adds values located in array "ip" to array "op".
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer.
 * @param l The number of elements to add.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template <class T1, class T2>
inline void addArray(const T1* ip, T2* op, int l, const int ipinc = 1,
                     const int opinc = 1) {
  // Accumulates element by element, front to back; a non-positive "l"
  // leaves the output untouched.
  for (; l > 0; l--) {
    *op += *ip;
    ip += ipinc;
    op += opinc;
  }
}

/**
 * Function that replicates a set of adjacent elements several times in a row.
 * This operation is usually used to replicate pixels at the start or end of
 * image's scanline.
 *
 * @param ip Source array.
 * @param ipl Source array length (usually 1..4, but can be any number).
 * @param[out] op Destination buffer.
 * @param l Number of times the source array should be replicated (the
 * destination buffer should be able to hold ipl * l number of elements).
 * @param opinc Destination buffer position increment after replicating the
 * source array. This value should be equal to at least ipl.
 */

template <class T1, class T2>
inline void replicateArray(const T1* const ip, const int ipl, T2* op, int l,
                           const int opinc) {
  // Source lengths 1 to 4 use unrolled copies; every other length falls
  // back to the generic element loop.
  switch (ipl) {
    case 1:
      for (; l > 0; l--) {
        op[0] = ip[0];
        op += opinc;
      }

      break;

    case 2:
      for (; l > 0; l--) {
        op[0] = ip[0];
        op[1] = ip[1];
        op += opinc;
      }

      break;

    case 3:
      for (; l > 0; l--) {
        op[0] = ip[0];
        op[1] = ip[1];
        op[2] = ip[2];
        op += opinc;
      }

      break;

    case 4:
      for (; l > 0; l--) {
        op[0] = ip[0];
        op[1] = ip[1];
        op[2] = ip[2];
        op[3] = ip[3];
        op += opinc;
      }

      break;

    default:
      for (; l > 0; l--) {
        for (int k = 0; k < ipl; k++) {
          op[k] = ip[k];
        }

        op += opinc;
      }

      break;
  }
}

/**
 * Function calculates frequency response of the specified FIR filter at the
 * specified circular frequency. Phase can be calculated as atan2( im, re ).
 * Function uses computationally-efficient oscillators instead of "cos" and
 * "sin" functions.
 *
 * @param flt FIR filter's coefficients.
 * @param fltlen Number of coefficients (taps) in the filter.
 * @param th Circular frequency [0; pi].
 * @param[out] re0 Resulting real part of the complex frequency response.
 * @param[out] im0 Resulting imaginary part of the complex frequency response.
 * @param fltlat Filter's latency in samples (taps).
 */

template <class T>
inline void calcFIRFilterResponse(const T* flt, int fltlen, const double th,
                                  double& re0, double& im0,
                                  const int fltlat = 0) {
  // Recurrence multiplier shared by the cosine and sine oscillators.
  const double sincr = 2.0 * cos(th);

  // Oscillator state for the first tap.
  double ccur;
  double scur;

  if (fltlat != 0) {
    ccur = cos(-fltlat * th);
    scur = sin(-fltlat * th);
  } else {
    ccur = 1.0;
    scur = 0.0;
  }

  // Oscillator state one tap before the first one.
  double cprev = cos(-(fltlat + 1) * th);
  double sprev = sin(-(fltlat + 1) * th);

  double re = 0.0;
  double im = 0.0;

  for (int k = 0; k < fltlen; k++) {
    re += ccur * flt[k];
    im += scur * flt[k];

    // Advance both oscillators by one tap.
    const double cnext = sincr * ccur - cprev;
    cprev = ccur;
    ccur = cnext;

    const double snext = sincr * scur - sprev;
    sprev = scur;
    scur = snext;
  }

  re0 = re;
  im0 = im;
}

/**
 * Function normalizes FIR filter so that its frequency response at DC is
 * equal to DCGain.
 *
 * @param[in,out] p Filter coefficients.
 * @param l Filter length.
 * @param DCGain Filter's gain at DC.
 * @param pstep "p" array step.
 */

template <class T>
inline void normalizeFIRFilter(T* const p, const int l, const double DCGain,
                               const int pstep = 1) {
  // First pass: sum of all coefficients, accumulated in double.
  double sum = 0.0;
  const T* rp = p;

  for (int k = l; k > 0; k--) {
    sum += *rp;
    rp += pstep;
  }

  // Second pass: scale every coefficient by the same factor. No check is
  // made for a zero sum.
  const double scale = DCGain / sum;
  T* wp = p;

  for (int k = l; k > 0; k--) {
    *wp = (T)(*wp * scale);
    wp += pstep;
  }
}

/**
 * @brief Memory buffer class for element array storage, with capacity
 * tracking.
 *
 * Allows easier handling of memory blocks allocation and automatic
 * deallocation for arrays (buffers) consisting of elements of specified
 * class. Tracks buffer's capacity in "int" variable; unsuitable for
 * allocation of very large memory blocks (with more than 2 billion elements).
 *
 * This class manages memory space only - it does not perform element class
 * construction (initialization) operations. Buffer's required memory address
 * alignment specification is supported.
 *
 * Uses standard library to allocate and deallocate memory.
 *
 * @tparam T Buffer element's type.
 * @tparam capint Buffer capacity's type to use. Use size_t for large buffers.
 */

template <class T, typename capint = int>
class CBuffer {
 public:
  /**
   * Default constructor creates an empty buffer that owns no memory.
   */

  CBuffer() : Data(NULL), DataAligned(NULL), Capacity(0), Alignment(0) {}

  /**
   * Constructor creates the buffer with the specified capacity.
   *
   * @param aCapacity Buffer's capacity.
   * @param aAlignment Buffer's required memory address alignment,
   * power-of-2 values only. 0 - use stdlib's default alignment.
   */

  CBuffer(const capint aCapacity, const int aAlignment = 0) {
    allocinit(aCapacity, aAlignment);
  }

  /**
   * Copy constructor allocates a buffer with the capacity and alignment of
   * the source buffer and performs a byte-wise copy of its elements.
   *
   * @param Source Source buffer.
   */

  CBuffer(const CBuffer& Source) {
    allocinit(Source.Capacity, Source.Alignment);
    copyElements(Source.DataAligned, Capacity);
  }

  ~CBuffer() { freeData(); }

  /**
   * Assignment operator reallocates *this buffer with the capacity and
   * alignment of the source buffer and performs a byte-wise copy of its
   * elements. Self-assignment leaves the buffer untouched.
   *
   * @param Source Source buffer.
   * @return Reference to *this buffer.
   */

  CBuffer& operator=(const CBuffer& Source) {
    // Without this check alloc() would release the very storage that is
    // about to be copied from.
    if (this != &Source) {
      alloc(Source.Capacity, Source.Alignment);
      copyElements(Source.DataAligned, Capacity);
    }

    return (*this);
  }

  /**
   * Function allocates memory so that the specified number of elements
   * can be stored in *this buffer object. Any previously held data is
   * released and not retained.
   *
   * @param aCapacity Storage for this number of elements to allocate.
   * @param aAlignment Buffer's required memory address alignment,
   * power-of-2 values only. 0 - use stdlib's default alignment.
   */

  void alloc(const capint aCapacity, const int aAlignment = 0) {
    freeData();
    allocinit(aCapacity, aAlignment);
  }

  /**
   * Function deallocates any previously allocated buffer and returns *this
   * object to the empty state.
   */

  void free() {
    freeData();
    Data = NULL;
    DataAligned = NULL;
    Capacity = 0;
    Alignment = 0;
  }

  /**
   * @return The capacity of the element buffer.
   */

  capint getCapacity() const { return (Capacity); }

  /**
   * Function "forces" *this buffer to have an arbitrary capacity. Calling
   * this function invalidates all further operations except deleting *this
   * object. This function should not be usually used at all. Function can
   * be used to "model" certain buffer capacity without calling a costly
   * memory allocation function.
   *
   * @param NewCapacity A new "forced" capacity.
   */

  void forceCapacity(const capint NewCapacity) { Capacity = NewCapacity; }

  /**
   * Function reallocates *this buffer to a larger size so that it will be
   * able to hold the specified number of elements. Downsizing is not
   * performed. Alignment is not changed.
   *
   * @param NewCapacity New (increased) capacity.
   * @param DoDataCopy "True" if data in the buffer should be retained.
   */

  void increaseCapacity(const capint NewCapacity,
                        const bool DoDataCopy = true) {
    if (NewCapacity < Capacity) {
      return;
    }

    if (DoDataCopy) {
      // The previous block has to stay alive until its contents are copied.
      const capint PrevCapacity = Capacity;
      T* const PrevData = Data;
      T* const PrevDataAligned = DataAligned;

      allocinit(NewCapacity, Alignment);
      copyElements(PrevDataAligned, PrevCapacity);

      ::free(PrevData);
    } else {
      ::free(Data);
      allocinit(NewCapacity, Alignment);
    }
  }

  /**
   * Function "truncates" (reduces) capacity of the buffer without
   * reallocating it. Alignment is not changed.
   *
   * @param NewCapacity New required capacity.
   */

  void truncateCapacity(const capint NewCapacity) {
    if (NewCapacity >= Capacity) {
      return;
    }

    Capacity = NewCapacity;
  }

  /**
   * Function increases capacity so that the specified number of
   * elements can be stored. This function increases the previous capacity
   * value by third the current capacity value until space for the required
   * number of elements is available. Alignment is not changed.
   *
   * @param ReqCapacity Required capacity.
   */

  void updateCapacity(const capint ReqCapacity) {
    if (ReqCapacity <= Capacity) {
      return;
    }

    capint NewCapacity = Capacity;

    // The "+ 1" guarantees progress when the capacity is below 3.
    while (NewCapacity < ReqCapacity) {
      NewCapacity += NewCapacity / 3 + 1;
    }

    increaseCapacity(NewCapacity);
  }

  /**
   * @return Pointer to the first (address-aligned) element of the buffer.
   */

  operator T*() const { return (DataAligned); }

 private:
  T* Data;          ///< Element buffer pointer, as returned by malloc().
                    ///<
  T* DataAligned;   ///< Memory address-aligned element buffer pointer.
                    ///<
  capint Capacity;  ///< Element buffer capacity.
                    ///<
  int Alignment;    ///< Memory address alignment in use. 0 - use stdlib's
                    ///< default alignment.
                    ///<

  /**
   * Internal element buffer allocation function used during object
   * construction. Does not release a previously allocated buffer. The
   * result of malloc() is not checked.
   *
   * @param aCapacity Storage for this number of elements to allocate.
   * @param aAlignment Buffer's required memory address alignment. 0 - use
   * stdlib's default alignment.
   */

  void allocinit(const capint aCapacity, const int aAlignment) {
    if (aAlignment == 0) {
      Data = (T*)::malloc(aCapacity * sizeof(T));
      DataAligned = Data;
      Alignment = 0;
    } else {
      // Over-allocate by "aAlignment" bytes so that the aligned pointer
      // always stays within the allocated block.
      Data = (T*)::malloc(aCapacity * sizeof(T) + aAlignment);
      DataAligned = alignptr(Data, aAlignment);
      Alignment = aAlignment;
    }

    Capacity = aCapacity;
  }

  /**
   * Function performs a byte-wise copy of elements into the start of *this
   * buffer. Nothing is done for an element count of zero, since in that
   * case the pointers involved may be NULL and must not reach memcpy().
   *
   * @param Src Source elements.
   * @param Count The number of elements to copy.
   */

  void copyElements(const T* const Src, const capint Count) {
    if (Count > 0) {
      memcpy(DataAligned, Src, Count * sizeof(T));
    }
  }

  /**
   * Function frees a previously allocated Data buffer. Member variables
   * are left unchanged.
   */

  void freeData() { ::free(Data); }

  /**
   * Function modifies the specified pointer so that it becomes memory
   * address-aligned.
   *
   * @param ptr Pointer to align.
   * @param align Alignment in bytes to apply.
   * @return Pointer aligned to align bytes. Works with power-of-2
   * alignments only. If no alignment is necessary, "align" bytes will be
   * added to the pointer value.
   */

  template <class Tp>
  inline Tp alignptr(const Tp ptr, const uintptr_t align) {
    return ((Tp)((uintptr_t)ptr + align - ((uintptr_t)ptr & (align - 1))));
  }
};

/**
 * Function optimizes the length of the symmetric-odd FIR filter by removing
 * left- and rightmost elements that are below specific threshold.
 *
 * Synthetic test shows that filter gets optimized in 2..3% of cases and in
 * each such case optimization reduces filter length by 6..8%. Optimization,
 * however, may skew the results of algorithm modeling and complexity
 * calculation leading to a choice of a less optimal algorithm.
 *
 * @param[in,out] Flt Buffer that contains filter being optimized.
 * @param[in,out] FltLatency Variable that holds the current latency of the
 * filter. May be adjusted on function return.
 * @param Threshold Threshold level.
 */

template <class T>
inline void optimizeFIRFilter(CBuffer<T>& Flt, int& FltLatency,
                              T const Threshold = (T)0.00001) {
  // Count the leading taps that do not reach the threshold, never stepping
  // past the tap located at FltLatency.
  int Skip = 0;

  while (Skip < FltLatency && !(fabs(Flt[Skip]) >= Threshold)) {
    Skip++;
  }

  if (Skip > 0) {
    // The same number of taps is dropped from both ends of the filter:
    // the remaining taps are moved to the start of the buffer and the
    // capacity is reduced accordingly.
    const int NewCapacity = Flt.getCapacity() - Skip * 2;
    copyArray(&Flt[Skip], &Flt[0], NewCapacity);
    Flt.truncateCapacity(NewCapacity);
    FltLatency -= Skip;
  }
}

/**
 * @brief Array of structured objects.
 *
 * Implements allocation of a linear array of objects of class T (which are
 * initialized), addressable via operator[]. Each object is created via the
 * "operator new". New object insertions are quick since implementation uses
 * prior space allocation (capacity), thus not requiring frequent memory block
 * reallocations.
 *
 * @tparam T Array element's type.
 */

template <class T>
class CStructArray {
 public:
  /**
   * Default constructor creates an empty array.
   */

  CStructArray() : ItemCount(0) {}

  /**
   * Copy constructor creates a copy of every item of the source array via
   * the item's copy constructor.
   *
   * @param Source Source array.
   */

  CStructArray(const CStructArray& Source)
      : ItemCount(0), Items(Source.getItemCount()) {
    while (ItemCount < Source.getItemCount()) {
      Items[ItemCount] = new T(Source[ItemCount]);
      ItemCount++;
    }
  }

  ~CStructArray() { clear(); }

  /**
   * Assignment operator destructs all items of *this array and then creates
   * a copy of every item of the source array via the item's copy
   * constructor. Self-assignment leaves the array untouched.
   *
   * @param Source Source array.
   * @return Reference to *this array.
   */

  CStructArray& operator=(const CStructArray& Source) {
    // Without this check clear() would destruct the very items that are
    // about to be copied, leaving the array empty.
    if (this == &Source) {
      return (*this);
    }

    clear();

    const int NewCount = Source.ItemCount;
    Items.updateCapacity(NewCount);

    while (ItemCount < NewCount) {
      Items[ItemCount] = new T(Source[ItemCount]);
      ItemCount++;
    }

    return (*this);
  }

  /**
   * @param Index Item index. No range checking is performed.
   * @return Reference to the item at the specified index.
   */

  T& operator[](const int Index) { return (*Items[Index]); }

  const T& operator[](const int Index) const { return (*Items[Index]); }

  /**
   * Function creates a new object of type T with the default constructor
   * and adds this object to the array.
   *
   * @return Reference to a newly added object.
   */

  T& add() {
    // Grow the pointer buffer by about a half when it is full.
    if (ItemCount == Items.getCapacity()) {
      Items.increaseCapacity(ItemCount * 3 / 2 + 1);
    }

    Items[ItemCount] = new T();
    ItemCount++;

    return ((*this)[ItemCount - 1]);
  }

  /**
   * Function changes number of allocated items. New items are created with
   * the default constructor. If NewCount is below the current item count,
   * items that are above NewCount range will be destructed.
   *
   * @param NewCount New requested item count.
   */

  void setItemCount(const int NewCount) {
    if (NewCount > ItemCount) {
      Items.increaseCapacity(NewCount);

      while (ItemCount < NewCount) {
        Items[ItemCount] = new T();
        ItemCount++;
      }
    } else {
      // Destruct the surplus items, last item first.
      while (ItemCount > NewCount) {
        ItemCount--;
        delete Items[ItemCount];
      }
    }
  }

  /**
   * Function erases all items of *this array. The capacity of the pointer
   * buffer is left unchanged.
   */

  void clear() {
    while (ItemCount > 0) {
      ItemCount--;
      delete Items[ItemCount];
    }
  }

  /**
   * @return The number of allocated items.
   */

  int getItemCount() const { return (ItemCount); }

 private:
  int ItemCount;      ///< The number of items available in the array.
                      ///<
  CBuffer<T*> Items;  ///< Buffer of pointers to the items; each item is
                      ///< created with "operator new".
                      ///<
};

/**
 * @brief Sine signal generator class.
 *
 * Class implements sine signal generator without biasing, with
 * constructor-based initialization only. This generator uses an oscillator
 * instead of the "sin" function.
 */

class CSineGen {
 public:
  /**
   * Constructor sets up the oscillator state of *this generator.
   *
   * @param si Sine function increment, in radians.
   * @param ph Starting phase, in radians. Add 0.5 * AVIR_PI for cosine
   * function.
   */

  CSineGen(const double si, const double ph) {
    svalue1 = sin(ph);
    svalue2 = sin(ph - si);
    sincr = 2.0 * cos(si);
  }

  /**
   * Function returns the current oscillator value and then advances the
   * oscillator by one step.
   *
   * @return The next value of the sine function, without biasing.
   */

  double generate() {
    const double cur = svalue1;
    const double next = sincr * cur - svalue2;

    svalue2 = cur;
    svalue1 = next;

    return (cur);
  }

 private:
  double svalue1;  ///< Value that the next generate() call returns.
                   ///<
  double svalue2;  ///< Value one step before svalue1.
                   ///<
  double sincr;    ///< Recurrence multiplier, equal to 2 * cos( si ).
                   ///<
};

/**
 * @brief Peaked Cosine window function generator class.
 *
 * Class implements Peaked Cosine window function generator. Generates the
 * right-handed half of the window function. The Alpha parameter of this
 * window function offers the control of the balance between the early and
 * later taps of the filter. E.g. at Alpha=1 both early and later taps are
 * attenuated, but at Alpha=4 mostly later taps are attenuated. This offers a
 * great control over ringing artifacts produced by a low-pass filter in image
 * processing, without compromising achieved image sharpness.
 */

class CDSPWindowGenPeakedCosine {
 public:
  /**
   * Constructor sets up *this window function generator at the center
   * position of the window.
   *
   * @param aAlpha Alpha parameter, affects the peak shape (peak
   * augmentation) of the window function. Should be >= 1.0.
   * @param aLen2 Half filter's length (non-truncated).
   */

  CDSPWindowGenPeakedCosine(const double aAlpha, const double aLen2)
      : Alpha(aAlpha),
        Len2(aLen2),
        wn(0),
        w1(AVIR_PId2 / aLen2, AVIR_PI * 0.5) {}

  /**
   * Function calculates the coefficient at the current window position and
   * then moves one position to the right.
   *
   * @return The next Peaked Cosine window function coefficient.
   */

  double generate() {
    // Position relative to the half length, taken before the position is
    // advanced.
    const double ratio = wn / Len2;
    wn++;

    const double peak = 1.0 - pow(ratio, Alpha);

    return (w1.generate() * peak);
  }

 private:
  double Alpha;  ///< Alpha parameter, affects the peak shape of window.
                 ///<
  double Len2;   ///< Half length of the window function.
                 ///<
  int wn;        ///< Window function integer position. 0 - center of the
                 ///< window function.
                 ///<
  CSineGen w1;   ///< Sine-wave generator, started with a phase of
                 ///< AVIR_PI * 0.5.
                 ///<
};

/**
 * @brief FIR filter-based equalizer generator.
 *
 * Class implements an object used to generate symmetric-odd FIR filters with
 * the specified frequency response (aka paragraphic equalizer). The
 * calculated filter is windowed by the Peaked Cosine window function.
 *
 * In image processing, due to short length of filters being used (6-8 taps)
 * the resulting frequency response of the filter is approximate and may be
 * mathematically imperfect, but still adequate to the visual requirements.
 *
 * On a side note, this equalizer generator can be successfully used for audio
 * signal equalization as well: for example, it is used in almost the same
 * form in Voxengo Marvel GEQ equalizer plug-in.
 *
 * Filter generation is based on decomposition of frequency range into
 * spectral bands, with each band represented by linear and ramp "kernels".
 * When the filter is built, these kernels are combined together with
 * different weights that approximate the required frequency response.
 */

class CDSPFIREQ {
 public:
  /**
   * Function initializes *this object with the required parameters. The
   * gain of frequencies beyond the MinFreq..MaxFreq range are controlled by
   * the first and the last band's gain.
   *
   * @param SampleRate Processing sample rate (use 2 for image processing).
   * @param aFilterLength Required filter length in samples (taps). The
   * actual filter length is truncated to an integer value.
   * @param aBandCount Number of band crossover points required to control,
   * including bands at MinFreq and MaxFreq.
   * @param MinFreq Minimal frequency that should be controlled.
   * @param MaxFreq Maximal frequency that should be controlled.
   * @param IsLogBands "True" if the bands should be spaced logarithmically.
   * @param WFAlpha Peaked Cosine window function's Alpha parameter.
   */

  void init(const double SampleRate, const double aFilterLength,
            const int aBandCount, const double MinFreq, const double MaxFreq,
            const bool IsLogBands, const double WFAlpha) {
    FilterLength = aFilterLength;
    BandCount = aBandCount;
    CenterFreqs.alloc(BandCount);

    // Half-length in taps, the same value rounded up to an even number (used
    // as the per-kernel stride), and the doubled half-length.
    z = (int)ceil(FilterLength * 0.5);
    zi = z + (z & 1);
    z2 = z * 2;

    // Scratch buffers shared by all fillBandKernel() calls below.
    CBuffer<double> oscbuf(z2);
    initOscBuf(oscbuf);

    CBuffer<double> winbuf(z);
    initWinBuf(winbuf, WFAlpha);

    // A kernel slot is reserved for every band, plus one more when a virtual
    // band between 0 and MinFreq has to be represented.
    UseFirstVirtBand = (MinFreq > 0.0);
    const int KernelCount = BandCount + (UseFirstVirtBand ? 1 : 0);
    Kernels1.alloc(zi * KernelCount);
    Kernels2.alloc(zi * KernelCount);

    // Consecutive crossover frequencies follow f' = f * StepMul + StepAdd.
    // Note that both forms divide by (BandCount - 1).
    const double StepMul =
        (IsLogBands ? exp(log(MaxFreq / MinFreq) / (BandCount - 1)) : 1.0);
    const double StepAdd =
        (IsLogBands ? 0.0 : (MaxFreq - MinFreq) / (BandCount - 1));

    double Freq = MinFreq;
    int FirstBand = 0;

    if (!UseFirstVirtBand) {
      // The first band starts at frequency 0, so it gets no kernel of its
      // own: record its position and move on to the second crossover.
      CenterFreqs[0] = 0.0;
      Freq = Freq * StepMul + StepAdd;
      FirstBand = 1;
    }

    double* lin = &Kernels1[0];
    double* ramp = &Kernels2[0];
    double LeftEdge = 0.0;

    for (int band = FirstBand; band < BandCount; band++) {
      // Normalize the crossover frequency so that SampleRate / 2 maps to 1.
      const double RightEdge = Freq * 2.0 / SampleRate;
      CenterFreqs[band] = RightEdge;

      fillBandKernel(LeftEdge, RightEdge, lin, ramp, oscbuf, winbuf);

      lin += zi;
      ramp += zi;
      LeftEdge = RightEdge;
      Freq = Freq * StepMul + StepAdd;
    }

    // A trailing virtual band is needed when the last crossover lies below
    // the normalized frequency 1.
    UseLastVirtBand = (LeftEdge < 1.0);

    if (UseLastVirtBand) {
      fillBandKernel(LeftEdge, 1.0, lin, ramp, oscbuf, winbuf);
    }
  }

  /**
   * @return Filter's length, in samples (taps).
   */

  int getFilterLength() const {
    // Two half-lengths sharing a single center tap.
    return z2 - 1;
  }

  /**
   * @return Filter's latency (group delay), in samples (taps).
   */

  int getFilterLatency() const {
    // Index of the center tap within the filter.
    return z - 1;
  }

  /**
   * Function creates symmetric-odd FIR filter with the specified gain
   * levels at band crossover points.
   *
   * @param BandGains Array of linear gain levels, count=BandCount specified
   * in the init() function.
   * @param[out] Filter Output filter buffer, length = getFilterLength().
   */

  void buildFilter(const double* const BandGains, double* const Filter) {
    const double* kernbuf1 = &Kernels1[0];
    const double* kernbuf2 = &Kernels2[0];
    double x1 = 0.0;
    double y1 = BandGains[0];
    double x2;
    double y2;

    int i;
    int si;

    if (UseFirstVirtBand) {
      si = 1;
      x2 = CenterFreqs[0];
      y2 = y1;
    } else {
      si = 2;
      x2 = CenterFreqs[1];
      y2 = BandGains[1];
    }

    copyBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - x2 * y1);

    kernbuf1 += zi;
    kernbuf2 += zi;
    x1 = x2;
    y1 = y2;

    for (i = si; i < BandCount; i++) {
      x2 = CenterFreqs[i];
      y2 = BandGains[i];

      addBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - x2 * y1);

      kernbuf1 += zi;
      kernbuf2 += zi;
      x1 = x2;
      y1 = y2;
    }

    if (UseLastVirtBand) {
      addBandKernel(Filter, kernbuf1, kernbuf2, y1 - y2, x1 * y2 - y1);
    }

    for (i = 0; i < z - 1; i++) {
      Filter[z + i] = Filter[z - 2 - i];
    }
  }

  /**
   * Function calculates filter's length (in samples) and latency depending
   * on the required non-truncated filter length.
   *
   * @param aFilterLength Required filter length in samples (non-truncated).
   * @param[out] Latency Resulting latency (group delay) of the filter,
   * in samples (taps).
   * @return Filter length in samples (taps).
   */

  static int calcFilterLength(const double aFilterLength, int& Latency) {
    // Half-length in taps, including the center tap.
    const int HalfLen = (int)ceil(aFilterLength * 0.5);

    Latency = HalfLen - 1;

    return HalfLen * 2 - 1;
  }

 private:
  double FilterLength;  ///< Length of filter.
                        ///<
  int z;                ///< Equals (int) ceil( FilterLength * 0.5 ).
                        ///<
  int zi;  ///< Equals "z" if z is even, or z + 1 if z is odd. Used as a
           ///< Kernels1 and Kernels2 size multiplier and kernel buffer
           ///< increment to make sure each kernel buffer is 16-byte aligned.
           ///<
  int z2;  ///< Equals z * 2.
           ///<
  int BandCount;                ///< Number of controllable bands.
                                ///<
  CBuffer<double> CenterFreqs;  ///< Center frequencies for all bands,
                                ///< normalized to 0.0-1.0 range.
                                ///<
  CBuffer<double> Kernels1;     ///< Half-length kernel buffers for each
                                ///< spectral band (linear part).
                                ///<
  CBuffer<double> Kernels2;     ///< Half-length kernel buffers for each
                                ///< spectral band (ramp part).
                                ///<
  bool UseFirstVirtBand;        ///< "True" if the first virtual band
                          ///< (between 0.0 and MinFreq) should be used. The
                          ///< first virtual band won't be used if MinFreq
                          ///< equals 0.0.
                          ///<
  bool UseLastVirtBand;  ///< "True" if the last virtual band (between
                         ///< MaxFreq and SampleRate * 0.5) should be used. The
                         ///< last virtual band won't be used if MaxFreq * 2.0
                         ///< equals SampleRate.
                         ///<

  /**
   * Function initializes the "oscbuf" used in the fillBandKernel()
   * function.
   *
   * @param oscbuf Oscillator buffer, length = z * 2.
   */

  void initOscBuf(double* oscbuf) const {
    // The buffer holds z pairs; every pair starts out as (0.0, 1.0).
    for (int pair = 0; pair < z; pair++) {
      oscbuf[pair * 2] = 0.0;
      oscbuf[pair * 2 + 1] = 1.0;
    }
  }

  /**
   * Function initializes window function buffer. This function generates
   * Peaked Cosine window function.
   *
   * @param winbuf Windowing buffer.
   * @param Alpha Peaked Cosine alpha parameter.
   */

  void initWinBuf(double* winbuf, const double Alpha) const {
    CDSPWindowGenPeakedCosine wf(Alpha, FilterLength * 0.5);

    // The generator's first value goes to the last slot (z - 1), and the
    // following values are stored at decreasing indices.
    for (int pos = z - 1; pos >= 0; pos--) {
      winbuf[pos] = wf.generate();
    }
  }

  /**
   * Function fills first half of symmetric-odd FIR kernel for the band.
   * This function should be called successively for adjacent bands.
   * Previous band's x2 should be equal to current band's x1. A band kernel
   * consists of 2 elements: linear kernel and ramp kernel.
   *
   * @param x1 Band's left corner frequency (0..1).
   * @param x2 Band's right corner frequency (0..1).
   * @param kernbuf1 Band kernel buffer 1 (linear part), length = z.
   * @param kernbuf2 Band kernel buffer 2 (ramp part), length = z.
   * @param oscbuf Oscillation buffer. Before the first call of the
   * fillBandKernel() should be initialized with the call of the
   * initOscBuf() function.
   * @param winbuf Buffer that contains windowing function.
   */

  void fillBandKernel(const double x1, const double x2, double* kernbuf1,
                      double* kernbuf2, double* oscbuf,
                      const double* const winbuf) {
    // Angular step for the band's right corner, and the multiplier of the
    // two-term recurrence v[n + 1] = s2_coeff * v[n] - v[n - 1] used at the
    // bottom of the loop to advance both oscillator values without calling
    // sin() again.
    const double s2_incr = AVIR_PI * x2;
    const double s2_coeff = 2.0 * cos(s2_incr);

    // Seed the recurrence: the "current" values are taken at tap offset
    // (-z + 1), the "previous" values at offset -z go to oscbuf[0..1]. The
    // second value of each pair is the same sine shifted by AVIR_PI * 0.5.
    double s2_value1 = sin(s2_incr * (-z + 1));
    double c2_value1 = sin(s2_incr * (-z + 1) + AVIR_PI * 0.5);
    oscbuf[0] = sin(s2_incr * -z);
    oscbuf[1] = sin(s2_incr * -z + AVIR_PI * 0.5);

    int ks;

    for (ks = 1; ks < z; ks++) {
      const int ks2 = ks * 2;
      // Pick up the pair that was left in this slot by the previous call (or
      // by initOscBuf() before the first call) -- it is used as the x1-side
      // pair -- and then overwrite the slot with this band's x2-side pair so
      // that the next band's call can pick it up in turn.
      const double s1_value1 = oscbuf[ks2];
      const double c1_value1 = oscbuf[ks2 + 1];
      oscbuf[ks2] = s2_value1;
      oscbuf[ks2 + 1] = c2_value1;

      // Tap position relative to the center tap, scaled by AVIR_PI (always
      // negative inside this loop), and the common factor that applies the
      // window value and the band width.
      const double x = AVIR_PI * (ks - z);
      const double v0 = winbuf[ks - 1] / ((x1 - x2) * x);

      // Linear part of the band kernel.
      kernbuf1[ks - 1] =
          (x2 * s2_value1 - x1 * s1_value1 + (c2_value1 - c1_value1) / x) * v0;

      // Ramp part of the band kernel.
      kernbuf2[ks - 1] = (s2_value1 - s1_value1) * v0;

      // Advance both oscillator values by one tap; oscbuf[ks2 - 2] and
      // oscbuf[ks2 - 1] hold this band's values for the preceding tap.
      s2_value1 = s2_coeff * s2_value1 - oscbuf[ks2 - 2];
      c2_value1 = s2_coeff * c2_value1 - oscbuf[ks2 - 1];
    }

    // Center tap (index z - 1): here x would be 0 and the loop's formula would
    // divide by zero, so the values are written in closed form instead.
    kernbuf1[z - 1] = (x2 * x2 - x1 * x1) / (x1 - x2) * 0.5;
    kernbuf2[z - 1] = -1.0;
  }

  /**
   * Function copies band kernel's elements to the output buffer.
   *
   * @param outbuf Output buffer.
   * @param kernbuf1 Kernel buffer 1 (linear part).
   * @param kernbuf2 Kernel buffer 2 (ramp part).
   * @param c Multiplier for linear kernel element.
   * @param d Multiplier for ramp kernel element.
   */

  void copyBandKernel(double* outbuf, const double* const kernbuf1,
                      const double* const kernbuf2, const double c,
                      const double d) const {
    // Overwrites the first z output taps with the weighted sum of the linear
    // and the ramp kernels.
    int tap = 0;

    while (tap < z) {
      outbuf[tap] = c * kernbuf1[tap] + d * kernbuf2[tap];
      tap++;
    }
  }

  /**
   * Function adds band kernel's elements to the output buffer.
   *
   * @param outbuf Output buffer.
   * @param kernbuf1 Kernel buffer 1 (linear part).
   * @param kernbuf2 Kernel buffer 2 (ramp part).
   * @param c Multiplier for linear kernel element.
   * @param d Multiplier for ramp kernel element.
   */

  void addBandKernel(double* outbuf, const double* const kernbuf1,
                     const double* const kernbuf2, const double c,
                     const double d) const {
    // Accumulates the weighted sum of the linear and the ramp kernels into
    // the first z output taps.
    int tap = 0;

    while (tap < z) {
      outbuf[tap] += c * kernbuf1[tap] + d * kernbuf2[tap];
      tap++;
    }
  }
};

/**
 * @brief Low-pass filter windowed by Peaked Cosine window function.
 *
 * This class implements calculation of linear-phase symmetric-odd FIR
 * low-pass filter windowed by the Peaked Cosine window function, for image
 * processing applications.
 */

class CDSPPeakedCosineLPF {
 public:
  int fl2;  ///< Half filter's length, excluding the peak value. This value
            ///< can be also used as filter's latency in samples (taps).
            ///<
  int FilterLen;  ///< Filter's length in samples (taps).
                  ///<

  /**
   * Constructor initializes *this object.
   *
   * @param aLen2 Half-length (non-truncated) of low-pass filter, in samples
   * (taps).
   * @param aFreq2 Low-pass filter's corner frequency [0; pi].
   * @param aAlpha Peaked Cosine window function Alpha parameter.
   */

  CDSPPeakedCosineLPF(const double aLen2, const double aFreq2,
                      const double aAlpha)
      : fl2((int)ceil(aLen2) - 1),
        FilterLen(2 * fl2 + 1),
        Len2(aLen2),
        Freq2(aFreq2),
        Alpha(aAlpha) {
  }

  /**
   * Function generates a linear-phase low-pass filter windowed by Peaked
   * Cosine window function. The center tap is written first, then the taps
   * on both sides of it are filled symmetrically, and finally the whole
   * filter is scaled so that its taps sum to DCGain.
   *
   * @param[out] op Output buffer, length = FilterLen (fl2 * 2 + 1).
   * @param DCGain Required gain at DC. The resulting filter will be
   * normalized to achieve this DC gain.
   */

  template <class T>
  void generateLPF(T* op, const double DCGain) {
    CDSPWindowGenPeakedCosine wf(Alpha, Len2);
    CSineGen f2(Freq2, 0.0);

    T* const Center = op + fl2;

    // Discard the sine generator's first output (presumably the zero-phase
    // sample); the center tap is computed from Freq2 directly.
    f2.generate();

    *Center = (T)(Freq2 * wf.generate() / AVIR_PI);
    double s = *Center;

    for (int t = 1; t <= fl2; t++) {
      const double v = f2.generate() * wf.generate() / t / AVIR_PI;

      Center[t] = (T)v;
      Center[-t] = (T)v;

      // Sum the stored (already converted to T) values so the normalization
      // matches what is actually in the buffer.
      s += Center[t] + Center[-t];
    }

    s = DCGain / s;

    for (int i = 0; i < FilterLen; i++) {
      op[i] = (T)(op[i] * s);
    }
  }

 private:
  double Len2;   ///< Half-length (non-truncated) of low-pass filter, in
                 ///< samples (taps).
                 ///<
  double Freq2;  ///< Low-pass filter's corner frequency.
                 ///<
  double Alpha;  ///< Peaked Cosine window function Alpha parameter.
                 ///<
};

/**
 * @brief Buffer class for parametrized low-pass filter.
 *
 * This class extends the CBuffer< double > class by adding several variables
 * that define a symmetric-odd FIR low-pass filter windowed by Peaked Cosine
 * window function. This class can be used to compare filters without
 * comparing their buffer contents.
 */

class CFltBuffer : public CBuffer<double> {
 public:
  double Len2;    ///< Half-length (non-truncated) of low-pass filters, in
                  ///< samples (taps).
                  ///<
  double Freq;    ///< Low-pass filter's corner frequency.
                  ///<
  double Alpha;   ///< Peaked Cosine window function Alpha parameter.
                  ///<
  double DCGain;  ///< DC gain applied to the filter.
                  ///<

  /**
   * Constructor creates an empty buffer with all filter parameters zeroed.
   */

  CFltBuffer()
      : CBuffer<double>(),
        Len2(0.0),
        Freq(0.0),
        Alpha(0.0),
        DCGain(0.0) {
  }

  /**
   * Operator compares the four filter parameters only; buffer contents are
   * not examined.
   *
   * @param b2 Filter buffer to compare *this object to.
   * @return Operator returns "true" if both filters have same parameters.
   */

  bool operator==(const CFltBuffer& b2) const {
    if (Len2 != b2.Len2) {
      return false;
    }

    if (Freq != b2.Freq) {
      return false;
    }

    if (Alpha != b2.Alpha) {
      return false;
    }

    return DCGain == b2.DCGain;
  }
};

/**
 * @brief Sinc function-based fractional delay filter bank.
 *
 * Class implements storage and initialization of a bank of sinc
 * function-based fractional delay filters, expressed as 1st order polynomial
 * interpolation coefficients. The filters are produced from a single "long"
 * windowed low-pass filter. Also supports 0th-order ("nearest neighbor")
 * interpolation.
 *
 * This class also supports multiplication of each fractional delay filter by
 * an external filter (usually a low-pass filter).
 *
 * @tparam fptype Specifies storage type of the filter coefficients bank. The
 * filters are initially calculated using the "double" precision.
 */

template <class fptype>
class CDSPFracFilterBankLin {
 public:
  // Order starts at -1, a value that init() does not document as a valid
  // order (0 or 1), so the first init() call never matches the stored
  // parameters.
  CDSPFracFilterBankLin()
      : Order(-1) {
  }

  /**
   * Function copies a limited set of parameters of the source
   * filter bank. The actual filters are not copied. Such copying is used
   * during filtering steps "modeling" stage. A further init() function
   * call is required.
   *
   * @param s Source filter bank.
   */

  void copyInitParams(const CDSPFracFilterBankLin& s) {
    // Low-pass/window design parameters.
    WFLen2 = s.WFLen2;
    WFFreq = s.WFFreq;
    WFAlpha = s.WFAlpha;

    // Bank geometry.
    FracCount = s.FracCount;
    Order = s.Order;
    Alignment = s.Alignment;
    SrcFilterLen = s.SrcFilterLen;
    FilterLen = s.FilterLen;
    FilterSize = s.FilterSize;

    // The filter tables themselves are not copied, so the source table has
    // to be rebuilt before *this bank can produce filters.
    IsSrcTableBuilt = false;
    ExtFilter = s.ExtFilter;

    TableFillFlags.alloc(s.TableFillFlags.getCapacity());
    const int FlagCount = TableFillFlags.getCapacity();

    // Copy table fill flags, but shifted so that further initialization
    // is still possible (such feature should not be used, though).

    for (int idx = 0; idx < FlagCount; idx++) {
      TableFillFlags[idx] = (uint8_t)(s.TableFillFlags[idx] << 2);
    }
  }

  /**
   * Operator compares *this filter bank and another filter bank and returns
   * "true" if their parameters are equal. Alignment is not taken into
   * account.
   *
   * @param s Filter bank to compare to.
   * @return "True" if compared banks have equal parameters.
   */

  bool operator==(const CDSPFracFilterBankLin& s) const {
    // The fields are tested in a fixed order, and the first mismatch ends
    // the comparison.
    if (Order != s.Order) {
      return false;
    }

    if (WFLen2 != s.WFLen2) {
      return false;
    }

    if (WFFreq != s.WFFreq) {
      return false;
    }

    if (WFAlpha != s.WFAlpha) {
      return false;
    }

    if (FracCount != s.FracCount) {
      return false;
    }

    return ExtFilter == s.ExtFilter;
  }

  /**
   * Function initializes (builds) the filter bank based on the supplied
   * parameters. If the supplied parameters are equal to previously defined
   * parameters, function does nothing (alignment is assumed to be never
   * changing between the init() function calls).
   *
   * @param ReqFracCount Required number of fractional delays in the filter
   * bank. The minimal value is 2.
   * @param ReqOrder Required order of the interpolation polynomial
   * (0 or 1).
   * @param BaseLen Low-pass filter's base length, in samples (taps).
   * Affects the actual length of the filter and its overall steepness.
   * @param Cutoff Low-pass filter's normalized cutoff frequency [0; 1].
   * @param aWFAlpha Peaked Cosine window function's Alpha parameter.
   * @param aExtFilter External filter to apply to each fractional delay
   * filter.
   * @param aAlignment Memory alignment of the filter bank, power-of-2
   * value. 0 - use default stdlib alignment.
   * @param FltLenAlign Filter's length alignment, power-of-2 value.
   */

  void init(const int ReqFracCount, const int ReqOrder, const double BaseLen,
            const double Cutoff, const double aWFAlpha,
            const CFltBuffer& aExtFilter, const int aAlignment = 0,
            const int FltLenAlign = 1) {
    // Parameters of the single "long" low-pass filter the bank is cut from.
    const double NewWFLen2 = 0.5 * BaseLen * ReqFracCount;
    const double NewWFFreq = AVIR_PI * Cutoff / ReqFracCount;

    // ReqOrder is compared first on purpose: the remaining terms are only
    // evaluated when the order already matches the stored one.
    const bool IsSameBank =
        (ReqOrder == Order && NewWFLen2 == WFLen2 && NewWFFreq == WFFreq &&
         aWFAlpha == WFAlpha && ReqFracCount == FracCount &&
         aExtFilter == ExtFilter);

    if (IsSameBank) {
      // Nothing changed: keep the existing tables.
      IsInitRequired = false;
      return;
    }

    WFLen2 = NewWFLen2;
    WFFreq = NewWFFreq;
    WFAlpha = aWFAlpha;
    FracCount = ReqFracCount;
    Order = ReqOrder;
    Alignment = aAlignment;
    ExtFilter = aExtFilter;

    // Only the designer's half-length is needed here; the filter itself is
    // generated later, in buildSrcTable().
    CDSPPeakedCosineLPF p(WFLen2, WFFreq, WFAlpha);
    SrcFilterLen = (p.fl2 / ReqFracCount + 1) * 2;

    // Applying the external filter lengthens each filter by its length - 1.
    int Len = SrcFilterLen;

    if (ExtFilter.getCapacity() > 0) {
      Len += ExtFilter.getCapacity() - 1;
    }

    // Round the length up to a multiple of FltLenAlign (a power of 2).
    FilterLen = (Len + FltLenAlign - 1) & ~(FltLenAlign - 1);

    // Each table entry stores (ReqOrder + 1) coefficient sets.
    FilterSize = FilterLen * (ReqOrder + 1);

    IsSrcTableBuilt = false;
    IsInitRequired = true;
  }

  /**
   * @return The length of each fractional delay filter, in samples (taps).
   * Always an even value.
   */

  int getFilterLen() const {
    return FilterLen;
  }

  /**
   * @return The number of fractional filters in use by *this bank.
   */

  int getFracCount() const {
    return FracCount;
  }

  /**
   * @return The order of the interpolation polynomial.
   */

  int getOrder() const {
    return Order;
  }

  /**
   * Function returns the pointer to the specified interpolation table
   * filter.
   *
   * @param i Filter (fractional delay) index, in the range 0 to
   * ReqFracCount - 1, inclusive.
   * @return Pointer to filter. Higher order polynomial coefficients are
   * stored after the previous order coefficients, separated by FilterLen
   * elements.
   */

  const fptype* getFilter(const int i) {
    // The source table is built on first use.
    if (!IsSrcTableBuilt) {
      buildSrcTable();
    }

    fptype* const Res = &Table[i * FilterSize];

    // Bit 1 set: this entry, including its higher-order part, is complete.
    if ((TableFillFlags[i] & 2) != 0) {
      return Res;
    }

    createFilter(i);
    TableFillFlags[i] |= 2;

    if (Order > 0) {
      // The first-order coefficients are the difference between the next
      // fractional delay filter and this one, so the neighbour must exist.
      createFilter(i + 1);

      const fptype* const Next = Res + FilterSize;
      fptype* const Delta = Res + FilterLen;

      // Create higher-order interpolation coefficients (linear
      // interpolation).

      for (int j = 0; j < FilterLen; j++) {
        Delta[j] = Next[j] - Res[j];
      }
    }

    return Res;
  }

  /**
   * Function makes sure all fractional delay filters were created.
   */

  void createAllFilters() {
    // getFilter() creates an entry on demand; its result is not needed here.
    for (int idx = 0; idx < FracCount; ++idx) {
      getFilter(idx);
    }
  }

  /**
   * Function returns an approximate initialization complexity, expressed in
   * the number of multiply-add operations. This includes fractional delay
   * filters calculation and multiplication by an external filter. This
   * function can only be called after the init() function.
   *
   * @param FracUseMap Fractional delays use map, each element corresponds
   * to a single fractional delay, will be compared to the internal table
   * fill flags. This map should include 0 and 1 values only.
   * @return The complexity of the initialization, expressed in the number
   * of multiply-add operations.
   */

  int calcInitComplexity(const CBuffer<uint8_t>& FracUseMap) const {
    // Cost to initialize a single sample of the fractional delay filter.
    const int FltInitCost = 65;

    // Cost to use a single fractional delay filter.
    const int FltUseCost =
        FilterLen * Order + SrcFilterLen * ExtFilter.getCapacity();

    // When the source table still has to be built, every source sample is
    // charged up front; otherwise only the per-filter costs remain.
    int ic = (IsInitRequired ? FracCount * SrcFilterLen * FltInitCost : 0);

    for (int idx = 0; idx < FracCount; idx++) {
      // A used fractional delay is charged unless its table entry already
      // has fill flags set. The fill flags are consulted only when no
      // (re)initialization is pending.
      if (FracUseMap[idx] != 0 &&
          (IsInitRequired || TableFillFlags[idx] == 0)) {
        ic += FltUseCost;
      }
    }

    return ic;
  }

 private:
  static const int InterpPoints = 2;  ///< The maximal number of points the
                                      ///< interpolation is based on.
                                      ///<
  double WFLen2;                      ///< Window function's Len2 parameter.
                                      ///<
  double WFFreq;                      ///< Window function's Freq parameter.
                                      ///<
  double WFAlpha;                     ///< Window function's Alpha parameter.
                                      ///<
  int FracCount;     ///< The required number of fractional delay filters.
                     ///<
  int Order;         ///< The order of the interpolation polynomial.
                     ///<
  int Alignment;     ///< The required filter table alignment.
                     ///<
  int SrcFilterLen;  ///< Length of the "source" filters. This is always an
                     ///< even value.
                     ///<
  int FilterLen;     ///< Specifies the number of samples (taps) each fractional
                  ///< delay filter has. This is always an even value, adjusted
                  ///< by the FltLenAlign.
                  ///<
  int FilterSize;       ///< The size of a single filter element, equals
                        ///< FilterLen * ElementSize.
                        ///<
  bool IsInitRequired;  ///< "True" if SrcTable filter table initialization
                        ///< is required. This value is available only after the
                        ///< call to the init() function.
                        ///<
  CBuffer<fptype> Table;  ///< Interpolation table, size equals to
                          ///< ReqFracCount * FilterLen * ElementSize.
                          ///<
  CBuffer<uint8_t>
      TableFillFlags;    ///< Contains ReqFracCount + 1
                         ///< elements. Bit 0 of every element is 1 if Table
                         ///< already contains the filter from SrcTable filtered
                         ///< by ExtFilter. Bit 1 of every element means higher
                         ///< order coefficients were filled for the filter.
                         ///<
  CFltBuffer ExtFilter;  ///< External filter that should be applied to every
                         ///< fractional delay filter. Can be empty. Half of
                         ///< this filter's capacity is used as latency (group
                         ///< delay) value of the filter.
                         ///<
  CBuffer<double> SrcTable;  ///< Source table of delay filters, contains
                             ///< ReqFracCount + 1 elements. This table is used
                             ///< to fill the Table with the actual filters,
                             ///< filtered by an external filter.
                             ///<
  bool IsSrcTableBuilt;      ///< "True" if the SrcTable was built already. This
                         ///< variable is set to "false" in the init() function.
                         ///<

  /**
   * Function builds source table used in the createFilter() function.
   */

  void buildSrcTable() {
    IsSrcTableBuilt = true;
    IsInitRequired = false;

    CDSPPeakedCosineLPF p(WFLen2, WFFreq, WFAlpha);

    // Working buffer for the single "long" low-pass filter. The filter is
    // centered at BufCenter and occupies p.fl2 samples on each side of it.
    const int BufLen = SrcFilterLen * FracCount + InterpPoints - 1;
    const int BufOffs = InterpPoints / 2 - 1;
    const int BufCenter = SrcFilterLen * FracCount / 2 + BufOffs;

    // Zero the margins before and after the region generateLPF() fills.
    CBuffer<double> Buf(BufLen);
    memset(Buf, 0, (BufCenter - p.fl2) * sizeof(double));
    int i = BufLen - BufCenter - p.fl2 - 1;
    memset(&Buf[BufLen - i], 0, i * sizeof(double));

    // The DC gain equals FracCount, the decimation step used below.
    p.generateLPF(&Buf[BufCenter - p.fl2], FracCount);

    SrcTable.alloc((FracCount + 1) * SrcFilterLen);
    TableFillFlags.alloc(FracCount + 1);
    int j;
    double* op0 = SrcTable;

    // Split the long filter into FracCount + 1 source filters by taking
    // every FracCount-th sample. The starting offset runs from FracCount
    // down to 0 while the output position only moves forward, so the first
    // filter stored in SrcTable is the one with the largest offset.
    for (i = FracCount; i >= 0; i--) {
      TableFillFlags[i] = 0;
      const double* src = Buf + BufOffs + i;

      for (j = 0; j < SrcFilterLen; j++) {
        op0[0] = src[0];
        op0++;
        src += FracCount;
      }
    }

    // One extra entry beyond FracCount is allocated: getFilter() reads entry
    // i + 1 when building first-order coefficients.
    Table.alloc((FracCount + 1) * FilterSize, Alignment);
  }

  /**
   * Function creates the specified filter in the Table by copying it from
   * the SrcTable and filtering by ExtFilter. Function does nothing if
   * filter was already created.
   *
   * @param k Filter index to create, in the range 0 to FracCount,
   * inclusive.
   */

  void createFilter(const int k) {
    if (TableFillFlags[k] != 0) {
      return;
    }

    TableFillFlags[k] |= 1;
    const int ExtFilterLatency = ExtFilter.getCapacity() / 2;
    const int ResLatency = ExtFilterLatency + SrcFilterLen / 2;
    int ResLen = SrcFilterLen;

    if (ExtFilter.getCapacity() > 0) {
      ResLen += ExtFilter.getCapacity() - 1;
    }

    const int ResOffs = FilterLen / 2 - ResLatency;
    fptype* op = &Table[k * FilterSize];
    int i;

    for (i = 0; i < ResOffs; i++) {
      op[i] = 0.0;
    }

    for (i = ResOffs + ResLen; i < FilterLen; i++) {
      op[i] = 0.0;
    }

    op += ResOffs;
    const double* const srcflt = &SrcTable[k * SrcFilterLen];

    if (ExtFilter.getCapacity() == 0) {
      for (i = 0; i < ResLen; i++) {
        op[i] = (fptype)srcflt[i];
      }

      return;
    }

    // Perform convolution of extflt and srcflt.

    const double* const extflt = &ExtFilter[0];
    int j;

    for (j = 0; j < ResLen; j++) {
      int k = 0;
      int l = j - ExtFilter.getCapacity() + 1;
      int r = l + ExtFilter.getCapacity();

      if (l < 0) {
        k -= l;
        l = 0;
      }

      if (r > SrcFilterLen) {
        r = SrcFilterLen;
      }

      const double* const extfltb = extflt + k;
      const double* const srcfltb = srcflt + l;
      double s = 0.0;
      l = r - l;

      for (i = 0; i < l; i++) {
        s += extfltb[i] * srcfltb[i];
      }

      op[j] = (fptype)s;
    }
  }
};

/**
 * @brief Thread pool for multi-threaded image resizing operation.
 *
 * This base class is used to organize a multi-threaded image resizing
 * operation. The thread pool should consist of threads that initially wait
 * for a signal. Upon receiving a signal (via the startAllWorkloads()
 * function) each previously added thread should execute its workload's
 * process() function once, and return to the wait signal state again. The
 * thread pool should be also able to efficiently wait for all workloads to
 * finish via the waitAllWorkloadsToFinish() function.
 *
 * The image resizing algorithm makes calls to functions of this class.
 */

class CImageResizerThreadPool {
 public:
  CImageResizerThreadPool() {
  }

  virtual ~CImageResizerThreadPool() {
  }

  /**
   * @brief Thread pool's workload object class.
   *
   * Base class for the objects that carry out the actual work, which is
   * spread over several threads.
   */

  class CWorkload {
   public:
    virtual ~CWorkload() {
    }

    /**
     * Function is called from within the thread when the thread pool's
     * startAllWorkloads() function is called.
     */

    virtual void process() = 0;
  };

  /**
   * @return The suggested number of workloads (and their associated
   * threads) to add. The minimal value this function can return is 1. The
   * usual value may depend on the number of physical and virtual cores
   * present in the system, and on other considerations. This base
   * implementation always returns 1.
   */

  virtual int getSuggestedWorkloadCount() const {
    return 1;
  }

  /**
   * Function adds a new workload (and possibly thread) to the thread pool.
   * The caller decides how many parallel workloads (and threads) it
   * requires, but this number will not exceed the value returned by the
   * getSuggestedWorkloadCount() function. It is implementation-specific how
   * many workloads to associate with a single thread. But for efficiency
   * reasons each workload should be associated with its own thread.
   *
   * Note that the same set of workload objects will be processed each time
   * the startAllWorkloads() function is called. This means that workload
   * objects are added only once. The caller changes the state of the
   * workload objects and then calls the startAllWorkloads() function to
   * process them.
   *
   * This base implementation ignores the workload.
   *
   * @param Workload Workload object whose process() function will be called
   * from within the thread when the startAllWorkloads() function is called.
   */

  virtual void addWorkload(CWorkload* const Workload) {
    (void)Workload;
  }

  /**
   * Function starts all workloads associated with threads previously added
   * via the addWorkload() function. It is assumed that this function
   * performs the necessary "memory barrier" (or "cache sync") kind of
   * operation so that all threads catch up the prior changes made to the
   * workload objects during their wait state. This base implementation does
   * nothing.
   */

  virtual void startAllWorkloads() {
  }

  /**
   * Function waits for all workloads to finish. This base implementation
   * returns immediately.
   */

  virtual void waitAllWorkloadsToFinish() {
  }

  /**
   * Function removes all workloads previously added via the addWorkload()
   * function. This function gets called only after the
   * waitAllWorkloadsToFinish() function call. This base implementation does
   * nothing.
   */

  virtual void removeAllWorkloads() {
  }
};

/**
 * @brief Resizing algorithm parameters structure.
 *
 * This structure holds all selectable parameters used by the resizing
 * algorithm at various stages, for both downsizing and upsizing. No other
 * parameters exist that can optimize the performance of the resizing
 * algorithm. Filter length parameters can take fractional values.
 *
 * Beside quality, these parameters (except Alpha parameters) directly affect
 * the computative cost of the resizing algorithm. It is possible to trade
 * the visual quality for computative cost.
 *
 * Anti-alias filtering during downsizing can be defined as a considerable
 * reduction of contrast of smallest features of an image. Unfortunately, such
 * de-contrasting partially affects features of all sizes thus producing a
 * non-linearity of frequency response. All pre-defined parameter sets are
 * described by 3 values separated by slashes. The first value is the
 * de-contrasting factor of small features (which are being removed) while
 * the second value is the de-contrasting factor of large features (which
 * should remain intact), with value of 1 equating to "no contrast change".
 * The third value is the optimization score (see below), with value of 0
 * equating to the "perfect" linearity of frequency response.
 *
 * The pre-defined parameter sets offered by this library were auto-optimized
 * for the given LPFltBaseLen, IntFltLen and CorrFltAlpha values. The
 * optimization goal was to minimize the score: the sum of squares of the
 * difference between original and processed images (which was not actually
 * resized, k=1). The original image was a 0.5 megapixel uniformly-distributed
 * white-noise image with pixel intensities in the 0-1 range. Such goal
 * converges very well and produces filtering system with the flattest
 * frequency response possible for the given constraints. With this goal,
 * increasing the LPFltBaseLen value reduces the general amount of aliasing
 * artifacts.
 */

struct CImageResizerParams {
  double CorrFltAlpha;  ///< Alpha parameter of the Peaked Cosine window
                        ///< function applied to the correction filter.
                        ///< "Usable" values lie in the narrow range 1.0 to
                        ///< 1.5.
                        ///<
  double CorrFltLen;    ///< Length of the correction filter, in samples
                        ///< (taps). The "usable" range is narrow, 5.5 to 8,
                        ///< so as to minimize "overcorrection", which is
                        ///< mathematically precise but visually
                        ///< unacceptable.
                        ///<
  double IntFltAlpha;   ///< Alpha parameter of the Peaked Cosine window
                        ///< function applied to the interpolation low-pass
                        ///< filter. "Usable" values lie in the range 1.5 to
                        ///< 2.5.
                        ///<
  double IntFltCutoff;  ///< Cutoff frequency of the interpolation low-pass
                        ///< filter (normalized, [0; 1]). The "usable" range
                        ///< is 0.6 to 0.8.
                        ///<
  double IntFltLen;     ///< Length of the interpolation low-pass filter, in
                        ///< samples (taps). Should be at least 18, otherwise
                        ///< a "dark grid" artifact is introduced when
                        ///< further sharpening is applied. IntFltLen and the
                        ///< other IntFlt parameters should be tuned together
                        ///< so that the frequency response is as flat as
                        ///< possible over the 0-0.5 normalized frequency
                        ///< range (this range is due to 2X upsampling).
                        ///<
  double LPFltAlpha;    ///< Alpha parameter of the Peaked Cosine window
                        ///< function applied to the low-pass filter.
                        ///< "Usable" values lie in the range 1.5 to 6.5.
                        ///<
  double LPFltBaseLen;  ///< Base length of the low-pass (aka anti-aliasing
                        ///< or reconstruction) filter, in samples (taps).
                        ///< It is further adjusted by the actual cutoff
                        ///< frequency and by the upsampling and
                        ///< downsampling factors. The "usable" range is
                        ///< between 6 and 9.
                        ///<
  double LPFltCutoffMult;  ///< Cutoff frequency multiplier of the low-pass
                           ///< filter. May be below or above 1.0, because
                           ///< low-pass filters are inserted on
                           ///< downsampling and upsampling steps and
                           ///< always have a corner frequency equal to or
                           ///< below 0.5pi. The multiplier shifts the
                           ///< corner frequency towards lower (if below
                           ///< 1.0) or higher (if above 1.0) frequencies.
                           ///< It can be way below 1.0 since any
                           ///< additional high-frequency damping is
                           ///< partially compensated by the correction
                           ///< filter. The "usable" range is 0.3 to 1.0.
                           ///<

  double HBFltAlpha;   ///< Alpha of the half-band filter. Assigned
                       ///< internally.
                       ///<
  double HBFltCutoff;  ///< Cutoff point of the half-band filter, [0; 1].
                       ///< Assigned internally.
                       ///<
  double HBFltLen;     ///< Length of the half-band low-pass filter.
                       ///< Assigned internally. Used internally to perform
                       ///< 2X or higher downsampling. The half-band filter
                       ///< parameters should be treated as "technical" and
                       ///< do not require adjustment, as they were tuned
                       ///< to suit all combinations of the other
                       ///< parameters. This half-band filter provides a
                       ///< wide transition band (for minimal ringing
                       ///< artifacts) and a high stop-band attenuation
                       ///< (for minimal aliasing).
                       ///<

  // Only the half-band filter parameters receive values here. Every other
  // member is left unassigned by this constructor and has to be set by the
  // user of the structure (the preset structures do so in their own
  // constructors).
  CImageResizerParams()
      : HBFltAlpha(1.75395),
        HBFltCutoff(0.40356),
        HBFltLen(22.0) {}
};

/**
 * @brief The default set of resizing algorithm parameters
 * (10.01/1.029/0.019169).
 *
 * This is the default set of resizing parameters that was designed to deliver
 * a sharp image while still providing a low amount of ringing artifacts, and
 * having a reasonable computational cost.
 */

struct CImageResizerParamsDef : public CImageResizerParams {
  CImageResizerParamsDef() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 10.01/1.88/1.029(522.43)/0.019169:258648,446808

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 7.78;
    LPFltCutoffMult = 0.78797;
    LPFltAlpha = 3.40127;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.75493;
    IntFltAlpha = 2.27825;

    // Correction filter.
    CorrFltLen = 6.30770;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Set of resizing algorithm parameters for ultra-low-ringing
 * performance (7.69/1.069/0.000245).
 *
 * This set of resizing algorithm parameters offers the lowest amount of
 * ringing this library is capable of providing while still offering a decent
 * quality. Low ringing is attained at the expense of higher aliasing
 * artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsULR : public CImageResizerParams {
  CImageResizerParamsULR() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 7.69/1.97/1.069(31445.45)/0.000245:258627,436845

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 6.40;
    LPFltCutoffMult = 0.61314;
    LPFltAlpha = 1.73455;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.73986;
    IntFltAlpha = 2.11453;

    // Correction filter.
    CorrFltLen = 5.83280;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Set of resizing algorithm parameters for low-ringing performance
 * (7.86/1.065/0.000106).
 *
 * This set of resizing algorithm parameters offers a very low-ringing
 * performance at the expense of higher aliasing artifacts and a slightly
 * reduced contrast.
 */

struct CImageResizerParamsLR : public CImageResizerParams {
  CImageResizerParamsLR() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 7.86/1.96/1.065(73865.02)/0.000106:258636,437381

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 7.00;
    LPFltCutoffMult = 0.68881;
    LPFltAlpha = 1.79306;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.74090;
    IntFltAlpha = 2.25322;

    // Correction filter.
    CorrFltLen = 5.87671;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Set of resizing algorithm parameters for lower-ringing performance
 * (8.86/1.046/0.010168).
 *
 * This set of resizing algorithm parameters offers a lower-ringing
 * performance in comparison to the default setting, at the expense of higher
 * aliasing artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsLow : public CImageResizerParams {
  CImageResizerParamsLow() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 8.86/1.92/1.046(871.54)/0.010168:258647,442252

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 7.66;
    LPFltCutoffMult = 0.75380;
    LPFltAlpha = 2.19427;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.74674;
    IntFltAlpha = 2.36704;

    // Correction filter.
    CorrFltLen = 6.09757;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Set of resizing algorithm parameters for low-aliasing
 * resizing (11.81/1.012/0.038379).
 *
 * This set of resizing algorithm parameters offers a considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This is an intermediate setting between the default and Ultra
 * parameters.
 */

struct CImageResizerParamsHigh : public CImageResizerParams {
  CImageResizerParamsHigh() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 11.81/1.83/1.012(307.84)/0.038379:258660,452719

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 8.18;
    LPFltCutoffMult = 0.79172;
    LPFltAlpha = 4.39527;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.75856;
    IntFltAlpha = 2.44917;

    // Correction filter.
    CorrFltLen = 6.80909;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Set of resizing algorithm parameters for ultra low-aliasing
 * resizing (13.65/1.001/0.000483).
 *
 * This set of resizing algorithm parameters offers a very considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This set of parameters is computationally expensive and may
 * produce ringing artifacts on sharp features.
 */

struct CImageResizerParamsUltra : public CImageResizerParams {
  CImageResizerParamsUltra() {
    // Tuning record for this set, kept verbatim from the original source
    // (presumably optimizer output -- TODO confirm the field meanings):
    // 13.65/1.79/1.001(28288.41)/0.000483:258658,457974

    // Low-pass (anti-aliasing / reconstruction) filter.
    LPFltBaseLen = 8.34;
    LPFltCutoffMult = 0.78002;
    LPFltAlpha = 5.55209;

    // Interpolation low-pass filter.
    IntFltLen = 18.0;
    IntFltCutoff = 0.75462;
    IntFltAlpha = 1.93750;

    // Correction filter.
    CorrFltLen = 7.48060;
    CorrFltAlpha = 1.0;
  }
};

/**
 * @brief Image resizing variables class.
 *
 * This is a utility "catch all" class that defines various variables used
 * during image resizing. Several variables that are explicitly initialized in
 * this class' constructor are also used as additional "input" variables to
 * the image resizing function. These variables will not be changed by the
 * avir::CImageResizer<>::resizeImage() function.
 */

class CImageResizerVars {
 public:
  int ElCount;      ///< Number of "fptype" elements that store 1 pixel.
                    ///<
  int ElCountIO;    ///< Number of elements that store 1 pixel in the source
                    ///< and destination images.
                    ///<
  int fppack;       ///< Number of atomic types held by a single "fptype"
                    ///< element.
                    ///<
  int fpalign;      ///< Suggested alignment size, in bytes. The alignment is
                    ///< not mandatory, because the image resizing algorithm
                    ///< cannot be made to use strictly aligned data access
                    ///< in all cases (e.g. de-interleaved interpolation
                    ///< cannot perform aligned accesses).
                    ///<
  int elalign;      ///< Length alignment of arrays of elements. Applies to
                    ///< filters and intermediate buffers: this constant
                    ///< forces filters and scanlines to have a length that
                    ///< is a multiple of this value, for a more efficient
                    ///< SIMD implementation.
                    ///<
  int packmode;     ///< 0 if interleaved packing, 1 if de-interleaved.
                    ///<
  int BufLen[2];    ///< Lengths of the intermediate buffers, in "fptype"
                    ///< elements.
                    ///<
  int BufOffs[2];   ///< Offsets into the intermediate buffers. They provide
                    ///< the prefix elements required during processing so
                    ///< that no "out of range" access happens. The offset
                    ///< is a multiple of ElCount if pixels are stored in
                    ///< interleaved form.
                    ///<
  double k;         ///< Resizing step coefficient, updated to reflect the
                    ///< coefficient actually used during resizing.
                    ///<
  double o;         ///< Starting pixel offset inside the source image,
                    ///< updated to reflect the offset actually used during
                    ///< resizing.
                    ///<
  int ResizeStep;   ///< Index of the resizing step in the latest filtering
                    ///< steps array.
                    ///<
  double InGammaMult;   ///< Input gamma multiplier, used to convert input
                        ///< data to the 0 to 1 range. 0.0 if no gamma is
                        ///< in use.
                        ///<
  double OutGammaMult;  ///< Output gamma multiplier, used to convert data
                        ///< to the 0 to 255/65535 range. 0.0 if no gamma
                        ///< is in use.
                        ///<

  // The members below are the "input" variables: they are the only ones the
  // constructor initializes.

  double ox;  ///< Start X pixel offset within the source image (can be
              ///< negative). A positive offset moves the image to the left.
              ///<
  double oy;  ///< Start Y pixel offset within the source image (can be
              ///< negative). A positive offset moves the image to the top.
              ///<
  CImageResizerThreadPool* ThreadPool;  ///< Thread pool to be used by the
                                        ///< image resizing function. Set
                                        ///< to NULL to use single-threaded
                                        ///< processing.
                                        ///<
  bool UseSRGBGamma;  ///< Perform sRGB gamma linearization (correction).
                      ///<
  int BuildMode;      ///< The build mode to use, for debugging purposes.
                      ///< Set to -1 to select a minimal-complexity mode
                      ///< automatically. All build modes deliver similar
                      ///< results with minor deviations.
                      ///<
  int RndSeed;        ///< Random seed parameter. It may be incremented
                      ///< after each random generator initialization. The
                      ///< use of this variable depends on the ditherer
                      ///< implementation.
                      ///<

  // Defaults: no source offset, single-threaded processing, no sRGB gamma,
  // automatic build mode, zero random seed. All other members are left
  // unassigned by this constructor.
  CImageResizerVars()
      : ox(0.0), oy(0.0), ThreadPool(NULL), UseSRGBGamma(false),
        BuildMode(-1), RndSeed(0) {}
};

/**
 * @brief Image resizer's filtering step class.
 *
 * Class defines data to perform a single filtering step over a whole
 * horizontal or vertical scanline. Resizing consists of 1 or more steps that
 * may be performed before the actual resizing takes place. Filtering may also
 * follow a resizing step. Each step must ensure that scanline data contains
 * enough pixels to perform the next step (which may be resizing) without
 * exceeding scanline's bounds.
 *
 * A derived class must implement several "const" and "static" functions that
 * are used to perform the actual filtering in interleaved or de-interleaved
 * mode.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template <class fptype, class fptypeatom>
class CImageResizerFilterStep {
 public:
  bool IsUpsample;     ///< "True" if this step is an upsampling step,
                       ///< "false" if it is a downsampling step. Should be
                       ///< set to "false" if ResampleFactor equals 0.
                       ///<
  int ResampleFactor;  ///< Resample factor (>=1). If 0, this is a resizing
                       ///< step. Should be >1 if IsUpsample equals "true".
                       ///<
  CBuffer<fptype> Flt;  ///< Filter to use at this step.
                        ///<
  CFltBuffer FltOrig;   ///< Originally-designed filter. This buffer may be
                        ///< left unassigned. It is assigned by filters that
                        ///< precede the resizing step if such a filter is
                        ///< planned to be embedded into the interpolation
                        ///< filter as an "external" filter. If
                        ///< IsUpsample=true and this buffer is not empty,
                        ///< the upsampling step itself applies no filtering
                        ///< to the upsampled input scanline.
                        ///<
  double DCGain;   ///< DC gain which was applied to the filter. Not defined
                   ///< if ResampleFactor = 0.
                   ///<
  int FltLatency;  ///< Filter's latency (group delay, shift), in pixels.
                   ///<
  const CImageResizerVars* Vars;  ///< Image resizing-related variables.
                                  ///<
  int InLen;     ///< Input scanline's length, in pixels.
                 ///<
  int InBuf;     ///< Input buffer index, 0 or 1.
                 ///<
  int InPrefix;  ///< Required input prefix pixels. These prefix pixels are
                 ///< filled with the source scanline's first pixel value.
                 ///< If IsUpsample is "true", this is the additional
                 ///< number of times the first pixel is filtered before
                 ///< the scanline is processed; this number is also
                 ///< reflected in OutPrefix.
                 ///<
  int InSuffix;  ///< Required input suffix pixels. These suffix pixels are
                 ///< filled with the source scanline's last pixel value.
                 ///< If IsUpsample is "true", this is the additional
                 ///< number of times the last pixel is filtered before the
                 ///< scanline is processed; this number is also reflected
                 ///< in OutSuffix.
                 ///<
  int InElIncr;  ///< Pixel element increment within the input buffer, used
                 ///< during de-interleaved processing: in this case each
                 ///< channel of the image is stored independently,
                 ///< InElIncr elements apart.
                 ///<
  int OutLen;    ///< Length of the resulting scanline.
                 ///<
  int OutBuf;    ///< Output buffer index. 0 or 1; 2 for the last step.
                 ///<
  int OutPrefix;  ///< Required output prefix pixels. These prefix pixels
                  ///< are not pre-filled with any values. The value is
                  ///< valid only if IsUpsample equals "true".
                  ///<
  int OutSuffix;  ///< Required output suffix pixels. These suffix pixels
                  ///< are not pre-filled with any values. The value is
                  ///< valid only if IsUpsample equals "true".
                  ///<
  int OutElIncr;  ///< Pixel element increment within the output buffer,
                  ///< used during de-interleaved processing. Equals the
                  ///< InElIncr of the next step.
                  ///<
  CBuffer<fptype> PrefixDC;  ///< DC component fluctuations added at the
                             ///< start of the resulting scanline, used when
                             ///< IsUpsample equals "true".
                             ///<
  CBuffer<fptype> SuffixDC;  ///< DC component fluctuations added at the end
                             ///< of the resulting scanline, used when
                             ///< IsUpsample equals "true".
                             ///<
  int EdgePixelCount;  ///< The number of edge pixels added. Affects the
                       ///< initial position within the input scanline that
                       ///< is used to produce edge pixels. This variable is
                       ///< used, and should be defined, when
                       ///< IsUpsample=false and ResampleFactor>0. When
                       ///< assigning it, the InPrefix, OutLen and Vars.o
                       ///< variables have to be updated as well.
                       ///<
  static const int EdgePixelCountDef = 3;  ///< The default number of pixels
                                           ///< additionally produced at
                                           ///< scanline edges during
                                           ///< filtering. Required to
                                           ///< reduce edge artifacts.
                                           ///<

  /**
   * @brief Resizing position structure.
   *
   * Structure holds a resizing position and the pointer to the fractional
   * delay filter associated with it.
   */

  struct CResizePos {
    int SrcPosInt;      ///< Source scanline position.
                        ///<
    int fti;            ///< Fractional delay filter index.
                        ///<
    const fptype* ftp;  ///< Fractional delay filter pointer.
                        ///<
    fptypeatom x;       ///< Interpolation coefficient between delay filters.
                        ///<
    int SrcOffs;        ///< Source scanline offset.
                        ///<
  };

  /**
   * @brief Resizing positions buffer class.
   *
   * This class combines a buffer of resizing positions with the variables
   * that define the resizing stepping the buffer belongs to.
   */

  class CRPosBuf : public CBuffer<CResizePos> {
   public:
    double k;       ///< Resizing step.
                    ///<
    double o;       ///< Resizing offset.
                    ///<
    int FracCount;  ///< The number of fractional delay filters in the filter
                    ///< bank used together with this buffer.
                    ///<
  };

  /**
   * @brief Resizing positions buffer array class.
   *
   * This class combines a structure array of CRPosBuf objects with the
   * function that locates, or creates, the buffer that has the required
   * resizing stepping.
   */

  class CRPosBufArray : public CStructArray<CRPosBuf> {
   public:
    using CStructArray<CRPosBuf>::add;
    using CStructArray<CRPosBuf>::getItemCount;

    /**
     * Function looks up the resizing positions buffer that was registered
     * with exactly the given stepping values. If the array holds no such
     * buffer, a new one is added and tagged with these values.
     *
     * @param k Resizing step.
     * @param o Resizing offset.
     * @param FracCount The number of fractional delay filters in a filter
     * bank used together with this buffer.
     * @return Reference to the CRPosBuf object.
     */

    CRPosBuf& getRPosBuf(const double k, const double o, const int FracCount) {
      for (int Index = 0; Index < getItemCount(); Index++) {
        CRPosBuf& Cand = (*this)[Index];

        // The stepping values act as a lookup key, hence the exact
        // (bit-for-bit) floating point comparison.
        if (Cand.k != k || Cand.o != o || Cand.FracCount != FracCount) {
          continue;
        }

        return Cand;
      }

      // Nothing matched: append a new buffer object and tag it with the
      // requested stepping. Only the key fields are assigned here.
      CRPosBuf& Fresh = add();
      Fresh.k = k;
      Fresh.o = o;
      Fresh.FracCount = FracCount;

      return Fresh;
    }
  };

  CRPosBuf* RPosBuf;  ///< Resizing positions buffer. Used when
                      ///< ResampleFactor equals 0 (resizing step).
                      ///<
  CDSPFracFilterBankLin<fptype>* FltBank;  ///< Filter bank in use by *this
                                           ///< resizing step.
                                           ///<
};

/**
 * @brief Interleaved filtering steps implementation class.
 *
 * This class implements scanline filtering functions in interleaved mode.
 * This means that each pixel is processed independently, not in groups.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template <class fptype, class fptypeatom>
class CImageResizerFilterStepINL
    : public CImageResizerFilterStep<fptype, fptypeatom> {
 public:
  using CImageResizerFilterStep<fptype, fptypeatom>::IsUpsample;
  using CImageResizerFilterStep<fptype, fptypeatom>::ResampleFactor;
  using CImageResizerFilterStep<fptype, fptypeatom>::Flt;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltOrig;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltLatency;
  using CImageResizerFilterStep<fptype, fptypeatom>::Vars;
  using CImageResizerFilterStep<fptype, fptypeatom>::InLen;
  using CImageResizerFilterStep<fptype, fptypeatom>::InPrefix;
  using CImageResizerFilterStep<fptype, fptypeatom>::InSuffix;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutLen;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutPrefix;
  using CImageResizerFilterStep<fptype, fptypeatom>::OutSuffix;
  using CImageResizerFilterStep<fptype, fptypeatom>::PrefixDC;
  using CImageResizerFilterStep<fptype, fptypeatom>::SuffixDC;
  using CImageResizerFilterStep<fptype, fptypeatom>::RPosBuf;
  using CImageResizerFilterStep<fptype, fptypeatom>::FltBank;
  using CImageResizerFilterStep<fptype, fptypeatom>::EdgePixelCount;

  /**
   * Function performs "packing" of a scanline and type conversion.
   * Scanline, depending on the "fptype" can be potentially stored as a
   * packed SIMD values having a certain atomic type. If required, the sRGB
   * gamma correction is applied.
   *
   * @param ip Input scanline.
   * @param op0 Output scanline.
   * @param l0 The number of pixels to "pack".
   */

  template <class Tin>
  void packScanline(const Tin* ip, fptype* const op0, const int l0) const {
    const int ElCount = Vars->ElCount;
    const int ElCountIO = Vars->ElCountIO;
    fptype* op = op0;
    int l = l0;

    if (!Vars->UseSRGBGamma) {
      if (ElCountIO == 1) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          op += ElCount;
          ip++;
          l--;
        }
      } else if (ElCountIO == 4) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          v[2] = (fptypeatom)ip[2];
          v[3] = (fptypeatom)ip[3];
          op += ElCount;
          ip += 4;
          l--;
        }
      } else if (ElCountIO == 3) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          v[2] = (fptypeatom)ip[2];
          op += ElCount;
          ip += 3;
          l--;
        }
      } else if (ElCountIO == 2) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = (fptypeatom)ip[0];
          v[1] = (fptypeatom)ip[1];
          op += ElCount;
          ip += 2;
          l--;
        }
      }
    } else {
      const fptypeatom gm = (fptypeatom)Vars->InGammaMult;

      if (ElCountIO == 1) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          op += ElCount;
          ip++;
          l--;
        }
      } else if (ElCountIO == 4) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          v[2] = convertSRGB2Lin((fptypeatom)ip[2] * gm);
          v[3] = convertSRGB2Lin((fptypeatom)ip[3] * gm);
          op += ElCount;
          ip += 4;
          l--;
        }
      } else if (ElCountIO == 3) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          v[2] = convertSRGB2Lin((fptypeatom)ip[2] * gm);
          op += ElCount;
          ip += 3;
          l--;
        }
      } else if (ElCountIO == 2) {
        while (l > 0) {
          fptypeatom* v = (fptypeatom*)op;
          v[0] = convertSRGB2Lin((fptypeatom)ip[0] * gm);
          v[1] = convertSRGB2Lin((fptypeatom)ip[1] * gm);
          op += ElCount;
          ip += 2;
          l--;
        }
      }
    }

    const int ZeroCount = ElCount * Vars->fppack - ElCountIO;
    op = op0;
    l = l0;

    if (ZeroCount == 1) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    } else if (ZeroCount == 2) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        v[1] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    } else if (ZeroCount == 3) {
      while (l > 0) {
        fptypeatom* v = (fptypeatom*)op + ElCountIO;
        v[0] = (fptypeatom)0;
        v[1] = (fptypeatom)0;
        v[2] = (fptypeatom)0;
        op += ElCount;
        l--;
      }
    }
  }

  /**
   * Function applies Linear to sRGB gamma correction to the specified
   * scanline.
   *
   * @param p Scanline.
   * @param l The number of pixels to de-linearize.
   * @param Vars0 Image resizing-related variables.
   */

  static void applySRGBGamma(fptype* p, int l, const CImageResizerVars& Vars0) {
    // Pixels are Stride "fptype" elements apart; the channel values of a
    // pixel are its leading atoms.
    const int Stride = Vars0.ElCount;
    const fptypeatom gm = (fptypeatom)Vars0.OutGammaMult;

    // Every channel value is passed through convertLin2SRGB() and then
    // scaled by the output gamma multiplier, in place. Nothing is done for
    // channel counts other than 1 to 4.
    switch (Vars0.ElCountIO) {
      case 1:
        for (; l > 0; l--) {
          fptypeatom* const v = (fptypeatom*)p;
          v[0] = convertLin2SRGB(v[0]) * gm;
          p += Stride;
        }
        break;

      case 2:
        for (; l > 0; l--) {
          fptypeatom* const v = (fptypeatom*)p;
          v[0] = convertLin2SRGB(v[0]) * gm;
          v[1] = convertLin2SRGB(v[1]) * gm;
          p += Stride;
        }
        break;

      case 3:
        for (; l > 0; l--) {
          fptypeatom* const v = (fptypeatom*)p;
          v[0] = convertLin2SRGB(v[0]) * gm;
          v[1] = convertLin2SRGB(v[1]) * gm;
          v[2] = convertLin2SRGB(v[2]) * gm;
          p += Stride;
        }
        break;

      case 4:
        for (; l > 0; l--) {
          fptypeatom* const v = (fptypeatom*)p;
          v[0] = convertLin2SRGB(v[0]) * gm;
          v[1] = convertLin2SRGB(v[1]) * gm;
          v[2] = convertLin2SRGB(v[2]) * gm;
          v[3] = convertLin2SRGB(v[3]) * gm;
          p += Stride;
        }
        break;
    }
  }

  /**
   * Function converts vertical scanline to horizontal scanline. This
   * function is called by the image resizer when image is resized
   * vertically. This means that the vertical scanline is stored in the
   * same format produced by the packScanline() and maintained by other
   * filtering functions.
   *
   * @param ip Input vertical scanline.
   * @param op Output buffer (temporary buffer used during resizing).
   * @param SrcLen The number of pixels in the input scanline, also used to
   * calculate input buffer increment.
   * @param SrcIncr Input buffer increment to the next vertical pixel.
   */

  void convertVtoH(const fptype* ip, fptype* op, const int SrcLen,
                   const int SrcIncr) const {
    // Gathers SrcLen pixels that are SrcIncr elements apart in the input
    // into a contiguous run in the output. The number of elements copied
    // per pixel is Vars->ElCount; nothing is copied for element counts
    // other than 1 to 4.
    int Left = SrcLen;

    switch (Vars->ElCount) {
      case 1:
        while (Left > 0) {
          op[0] = ip[0];
          op++;
          ip += SrcIncr;
          Left--;
        }
        break;

      case 2:
        while (Left > 0) {
          op[0] = ip[0];
          op[1] = ip[1];
          op += 2;
          ip += SrcIncr;
          Left--;
        }
        break;

      case 3:
        while (Left > 0) {
          op[0] = ip[0];
          op[1] = ip[1];
          op[2] = ip[2];
          op += 3;
          ip += SrcIncr;
          Left--;
        }
        break;

      case 4:
        while (Left > 0) {
          op[0] = ip[0];
          op[1] = ip[1];
          op[2] = ip[2];
          op[3] = ip[3];
          op += 4;
          ip += SrcIncr;
          Left--;
        }
        break;
    }
  }

  /**
   * Function performs "unpacking" of a scanline and type conversion
   * (truncation is used when floating point is converted to integer).
   * Scanline, depending on the "fptype" can be potentially stored as a
   * packed SIMD values having a certain atomic type. The unpacking function
   * assumes that scanline is stored in the style produced by the
   * packScanline() function.
   *
   * @param ip Input scanline.
   * @param op Output scanline.
   * @param l The number of pixels to "unpack".
   * @param Vars0 Image resizing-related variables.
   */

  template <class Tout>
  static void unpackScanline(const fptype* ip, Tout* op, int l,
                             const CImageResizerVars& Vars0) {
    // Input pixels are Stride "fptype" elements apart; the output receives
    // the channel values of each pixel back to back.
    const int Stride = Vars0.ElCount;

    // Each channel value is converted with a plain cast to Tout. Nothing is
    // written for channel counts other than 1 to 4.
    switch (Vars0.ElCountIO) {
      case 1:
        for (; l > 0; l--) {
          const fptypeatom* const v = (const fptypeatom*)ip;
          op[0] = (Tout)v[0];
          op++;
          ip += Stride;
        }
        break;

      case 2:
        for (; l > 0; l--) {
          const fptypeatom* const v = (const fptypeatom*)ip;
          op[0] = (Tout)v[0];
          op[1] = (Tout)v[1];
          op += 2;
          ip += Stride;
        }
        break;

      case 3:
        for (; l > 0; l--) {
          const fptypeatom* const v = (const fptypeatom*)ip;
          op[0] = (Tout)v[0];
          op[1] = (Tout)v[1];
          op[2] = (Tout)v[2];
          op += 3;
          ip += Stride;
        }
        break;

      case 4:
        for (; l > 0; l--) {
          const fptypeatom* const v = (const fptypeatom*)ip;
          op[0] = (Tout)v[0];
          op[1] = (Tout)v[1];
          op[2] = (Tout)v[2];
          op[3] = (Tout)v[3];
          op += 4;
          ip += Stride;
        }
        break;
    }
  }

  /**
   * Function prepares input scanline buffer for *this filtering step.
   * Left- and right-most pixels are replicated to make sure no buffer
   * overrun happens. This approach also makes it possible to bypass any
   * pointer range checks.
   *
   * @param Src Source buffer.
   */

  void prepareInBuf(fptype* Src) const {
    // Padding is needed only for non-upsampling steps that request at least
    // one prefix or suffix pixel.
    if (!IsUpsample && InPrefix + InSuffix != 0) {
      const int Stride = Vars->ElCount;

      // The first pixel is replicated InPrefix times, going backwards from
      // the position right before the scanline.
      replicateArray(Src, Stride, Src - Stride, InPrefix, -Stride);

      // The last pixel is replicated InSuffix times, going forwards from
      // the position right after the scanline.
      fptype* Tail = Src + (InLen - 1) * Stride;
      replicateArray(Tail, Stride, Tail + Stride, InSuffix, Stride);
    }
  }

  /**
   * Function performs scanline upsampling with filtering.
   *
   * @param Src Source scanline buffer (length = this -> InLen). Source
   * scanline increment will be equal to ElCount.
   * @param Dst Destination scanline buffer.
   */

  void doUpsample(const fptype* const Src, fptype* const Dst) const {
    // Upsamples one scanline: every source pixel is either copied (no-filter
    // path) or spread through the filter taps (filter path) at output
    // positions spaced ResampleFactor pixels apart. Only ElCount values of
    // 1 to 4 are handled; for any other value the output is left zeroed.
    const int ElCount = Vars->ElCount;
    // Zero the whole output span, including the OutPrefix and OutSuffix
    // margins: the code below either writes sparsely or accumulates with
    // "+=", so every element must start from zero.
    fptype* op0 = &Dst[-OutPrefix * ElCount];
    memset(op0, 0, (OutPrefix + OutLen + OutSuffix) * ElCount * sizeof(fptype));

    const fptype* ip = Src;
    // Distance, in elements, between the output positions of two successive
    // source pixels.
    const int opstep = ElCount * ResampleFactor;
    int l;

    // This path is selected when FltOrig is non-empty.
    if (FltOrig.getCapacity() > 0) {
      // Do not perform filtering, only upsample.

      // Skip the part of the prefix that is not a whole multiple of
      // ResampleFactor; "l" is the number of whole steps inside the prefix.
      op0 += (OutPrefix % ResampleFactor) * ElCount;
      l = OutPrefix / ResampleFactor;

      // Each channel-count branch below runs the same three phases:
      //   1. "l" copies of the first source pixel ("ip" is not advanced);
      //   2. InLen - 1 source pixels, advancing "ip" after each one;
      //   3. OutSuffix / ResampleFactor + 1 copies of the pixel "ip" then
      //      points to (the loop condition is "l >= 0").
      if (ElCount == 1) {
        while (l > 0) {
          op0[0] = ip[0];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 4) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0[3] = ip[3];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 3) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0[2] = ip[2];
          op0 += opstep;
          l--;
        }
      } else if (ElCount == 2) {
        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          l--;
        }

        l = InLen - 1;

        while (l > 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          ip += ElCount;
          l--;
        }

        l = OutSuffix / ResampleFactor;

        while (l >= 0) {
          op0[0] = ip[0];
          op0[1] = ip[1];
          op0 += opstep;
          l--;
        }
      }

      return;
    }

    // Filter path: each source pixel adds "flen" scaled filter taps to the
    // output, starting at "op0", which then advances by "opstep".
    const fptype* const f = Flt;
    const int flen = Flt.getCapacity();
    fptype* op;
    int i;

    // Each channel-count branch below runs the same three phases:
    //   1. InPrefix passes using the first source pixel ("ip" not advanced);
    //   2. InLen - 1 passes, advancing "ip" after each one;
    //   3. InSuffix + 1 passes using the pixel "ip" then points to (the loop
    //      condition is "l >= 0").
    // "op0" and "ip" are deliberately left at their final positions: the
    // SuffixDC pass that follows the branches reads both.
    if (ElCount == 1) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[i] += f[i] * ip[0];
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 4) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op[3] += f[i] * ip[3];
          op += 4;
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 3) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op[2] += f[i] * ip[2];
          op += 3;
        }

        op0 += opstep;
        l--;
      }
    } else if (ElCount == 2) {
      l = InPrefix;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        op0 += opstep;
        l--;
      }

      l = InLen - 1;

      while (l > 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        ip += ElCount;
        op0 += opstep;
        l--;
      }

      l = InSuffix;

      while (l >= 0) {
        op = op0;

        for (i = 0; i < flen; i++) {
          op[0] += f[i] * ip[0];
          op[1] += f[i] * ip[1];
          op += 2;
        }

        op0 += opstep;
        l--;
      }
    }

    // Suffix correction: starting at the position where the filter passes
    // stopped ("op0"), add the SuffixDC coefficients scaled by the pixel
    // "ip" still points to.
    op = op0;
    const fptype* dc = SuffixDC;
    l = SuffixDC.getCapacity();

    if (ElCount == 1) {
      for (i = 0; i < l; i++) {
        op[i] += ip[0] * dc[i];
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        op[3] += ip[3] * dc[0];
        dc++;
        op += 4;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        dc++;
        op += 3;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        dc++;
        op += 2;
        l--;
      }
    }

    // Prefix correction: starting InPrefix output steps before Dst, add the
    // PrefixDC coefficients scaled by the first source pixel.
    ip = Src;
    op = Dst - InPrefix * opstep;
    dc = PrefixDC;
    l = PrefixDC.getCapacity();

    if (ElCount == 1) {
      for (i = 0; i < l; i++) {
        op[i] += ip[0] * dc[i];
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        op[3] += ip[3] * dc[0];
        dc++;
        op += 4;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        op[2] += ip[2] * dc[0];
        dc++;
        op += 3;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        op[0] += ip[0] * dc[0];
        op[1] += ip[1] * dc[0];
        dc++;
        op += 2;
        l--;
      }
    }
  }

  /**
   * Function performs scanline filtering with optional downsampling.
   * Function makes use of the symmetry of the filter.
   *
   * @param Src Source scanline buffer (length = this -> InLen). Source
   * scanline increment will be equal to ElCount.
   * @param Dst Destination scanline buffer.
   * @param DstIncr Destination scanline buffer increment, used for
   * horizontal or vertical scanline stepping.
   */

  void doFilter(const fptype* const Src, fptype* Dst, const int DstIncr) const {
    const int ElCount = Vars->ElCount;
    const fptype* const f = &Flt[FltLatency];
    const int flen = FltLatency + 1;
    const int ipstep = ElCount * ResampleFactor;
    const fptype* ip = Src - EdgePixelCount * ipstep;
    const fptype* ip1;
    const fptype* ip2;
    int l = OutLen;
    int i;

    if (ElCount == 1) {
      while (l > 0) {
        fptype s = f[0] * ip[0];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1++;
          ip2--;
          s += f[i] * (ip1[0] + ip2[0]);
        }

        Dst[0] = s;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 4) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        fptype s3 = f[0] * ip[2];
        fptype s4 = f[0] * ip[3];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 4;
          ip2 -= 4;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
          s3 += f[i] * (ip1[2] + ip2[2]);
          s4 += f[i] * (ip1[3] + ip2[3]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst[2] = s3;
        Dst[3] = s4;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 3) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        fptype s3 = f[0] * ip[2];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 3;
          ip2 -= 3;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
          s3 += f[i] * (ip1[2] + ip2[2]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst[2] = s3;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    } else if (ElCount == 2) {
      while (l > 0) {
        fptype s1 = f[0] * ip[0];
        fptype s2 = f[0] * ip[1];
        ip1 = ip;
        ip2 = ip;

        for (i = 1; i < flen; i++) {
          ip1 += 2;
          ip2 -= 2;
          s1 += f[i] * (ip1[0] + ip2[0]);
          s2 += f[i] * (ip1[1] + ip2[1]);
        }

        Dst[0] = s1;
        Dst[1] = s2;
        Dst += DstIncr;
        ip += ipstep;
        l--;
      }
    }
  }

  /**
   * Function performs resizing of a single scanline. This function does
   * not "know" about the length of the source scanline buffer. This buffer
   * should be padded with enough pixels so that ( SrcPos - FilterLenD2 ) is
   * always >= 0 and ( SrcPos + ( DstLineLen - 1 ) * k + FilterLenD2 + 1 )
   * does not exceed source scanline's buffer length. SrcLine's increment is
   * assumed to be equal to ElCount.
   *
   * @param SrcLine Source scanline buffer.
   * @param DstLine Destination (resized) scanline buffer.
   * @param DstLineIncr Destination scanline position increment, used for
   * horizontal or vertical scanline stepping.
   * @param xx Temporary buffer, of size FltBank -> getFilterLen(), must be
   * aligned by fpclass :: fpalign.
   */

  void doResize(const fptype* SrcLine, fptype* DstLine, const int DstLineIncr,
                fptype* const) const {
    const int IntFltLen = FltBank->getFilterLen();
    const int ElCount = Vars->ElCount;
    const typename CImageResizerFilterStep<fptype, fptypeatom>::CResizePos*
        rpos = &(*RPosBuf)[0];

    const typename CImageResizerFilterStep<
        fptype, fptypeatom>::CResizePos* const rpose = rpos + OutLen;

#define AVIR_RESIZE_PART1                        \
  while (rpos < rpose) {                         \
    const fptype x = (fptype)rpos->x;            \
    const fptype* const ftp = rpos->ftp;         \
    const fptype* const ftp2 = ftp + IntFltLen;  \
    const fptype* Src = SrcLine + rpos->SrcOffs; \
    int i;

#define AVIR_RESIZE_PART1nx                      \
  while (rpos < rpose) {                         \
    const fptype* const ftp = rpos->ftp;         \
    const fptype* Src = SrcLine + rpos->SrcOffs; \
    int i;

#define AVIR_RESIZE_PART2 \
  DstLine += DstLineIncr; \
  rpos++;                 \
  }

    if (FltBank->getOrder() == 1) {
      if (ElCount == 1) {
        AVIR_RESIZE_PART1

        fptype sum = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          sum += (ftp[i] + ftp2[i] * x) * Src[i];
        }

        DstLine[0] = sum;

        AVIR_RESIZE_PART2
      } else if (ElCount == 4) {
        AVIR_RESIZE_PART1

        fptype sum[4];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;
        sum[3] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          sum[3] += xx * Src[3];
          Src += 4;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];
        DstLine[3] = sum[3];

        AVIR_RESIZE_PART2
      } else if (ElCount == 3) {
        AVIR_RESIZE_PART1

        fptype sum[3];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          Src += 3;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];

        AVIR_RESIZE_PART2
      } else if (ElCount == 2) {
        AVIR_RESIZE_PART1

        fptype sum[2];
        sum[0] = 0.0;
        sum[1] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i] + ftp2[i] * x;
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          Src += 2;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];

        AVIR_RESIZE_PART2
      }
    } else {
      if (ElCount == 1) {
        AVIR_RESIZE_PART1nx

            fptype sum = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          sum += ftp[i] * Src[i];
        }

        DstLine[0] = sum;

        AVIR_RESIZE_PART2
      } else if (ElCount == 4) {
        AVIR_RESIZE_PART1nx

            fptype sum[4];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;
        sum[3] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          sum[3] += xx * Src[3];
          Src += 4;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];
        DstLine[3] = sum[3];

        AVIR_RESIZE_PART2
      } else if (ElCount == 3) {
        AVIR_RESIZE_PART1nx

            fptype sum[3];
        sum[0] = 0.0;
        sum[1] = 0.0;
        sum[2] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          sum[2] += xx * Src[2];
          Src += 3;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];
        DstLine[2] = sum[2];

        AVIR_RESIZE_PART2
      } else if (ElCount == 2) {
        AVIR_RESIZE_PART1nx

            fptype sum[2];
        sum[0] = 0.0;
        sum[1] = 0.0;

        for (i = 0; i < IntFltLen; i++) {
          const fptype xx = ftp[i];
          sum[0] += xx * Src[0];
          sum[1] += xx * Src[1];
          Src += 2;
        }

        DstLine[0] = sum[0];
        DstLine[1] = sum[1];

        AVIR_RESIZE_PART2
      }
    }
  }
#undef AVIR_RESIZE_PART2
#undef AVIR_RESIZE_PART1nx
#undef AVIR_RESIZE_PART1
};

/**
 * @brief Image resizer's default dithering class.
 *
 * This class defines an object that performs rounding, clipping and dithering
 * operations over horizontal scanline pixels before scanline is stored in the
 * output buffer.
 *
 * The ditherer should expect the same storage order of the pixels in a
 * scanline as used in the "filtering step" class. So, a separate ditherer
 * class should be defined for each scanline pixel storage style. The default
 * ditherer implements a simple rounding without dithering: it can be used for
 * an efficient dithering method which can be multi-threaded.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template <class fptype>
class CImageResizerDithererDefINL {
 public:
  /**
   * Function initializes the ditherer object.
   *
   * @param aLen Scanline length in pixels to process.
   * @param aVars Image resizing-related variables.
   * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
   * truncation.
   * @param aPkOut Peak output value allowed.
   */

  void init(const int aLen, const CImageResizerVars& aVars, const double aTrMul,
            const double aPkOut) {
    Len = aLen;
    Vars = &aVars;
    LenE = aLen * Vars->ElCount;
    TrMul0 = aTrMul;
    PkOut0 = aPkOut;
  }

  /**
   * @return "True" if dithering is recursive relative to scanlines meaning
   * multi-threaded execution is not supported by this dithering method.
   */

  static bool isRecursive() { return (false); }

  /**
   * Function performs rounding and clipping operations.
   *
   * @param ResScanline The buffer containing the final scanline.
   */

  void dither(fptype* const ResScanline) const {
    const fptype c0 = 0.0;
    const fptype PkOut = (fptype)PkOut0;
    int j;

    if (TrMul0 == 1.0) {
      // Optimization - do not perform bit depth truncation.

      for (j = 0; j < LenE; j++) {
        ResScanline[j] = clamp(round(ResScanline[j]), c0, PkOut);
      }
    } else {
      const fptype TrMul = (fptype)TrMul0;

      for (j = 0; j < LenE; j++) {
        const fptype z0 = round(ResScanline[j] / TrMul) * TrMul;
        ResScanline[j] = clamp(z0, c0, PkOut);
      }
    }
  }

 protected:
  int Len;                        ///< Scanline's length in pixels.
                                  ///<
  const CImageResizerVars* Vars;  ///< Image resizing-related variables.
                                  ///<
  int LenE;                       ///< = LenE * ElCount.
                                  ///<
  double TrMul0;                  ///< Bit-depth truncation multiplier.
                                  ///<
  double PkOut0;                  ///< Peak output value allowed.
                                  ///<
};

/**
 * @brief Image resizer's error-diffusion dithering class, interleaved mode.
 *
 * This ditherer implements error-diffusion dithering which looks good, and
 * whose results are compressed by PNG well. This implementation uses
 * weighting coefficients obtained via machine optimization and visual
 * evaluation.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template <class fptype>
class CImageResizerDithererErrdINL
    : public CImageResizerDithererDefINL<fptype> {
 public:
  /**
   * Function initializes the ditherer object and allocates a zeroed error
   * diffusion buffer which is one pixel longer than the scanline.
   *
   * @param aLen Scanline length in pixels to process.
   * @param aVars Image resizing-related variables.
   * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
   * truncation.
   * @param aPkOut Peak output value allowed.
   */

  void init(const int aLen, const CImageResizerVars& aVars, const double aTrMul,
            const double aPkOut) {
    CImageResizerDithererDefINL<fptype>::init(aLen, aVars, aTrMul, aPkOut);

    const int PixelSize = Vars->ElCount;
    const int BufLen = LenE + PixelSize;
    int idx;

    // The leading pixel absorbs the error written "one pixel to the left"
    // of the first scanline pixel, so dither() needs no range checks there.
    ResScanlineDith0.alloc(BufLen, sizeof(fptype));
    ResScanlineDith = ResScanlineDith0 + PixelSize;

    for (idx = 0; idx < BufLen; idx++) {
      ResScanlineDith0[idx] = 0.0;
    }
  }

  /**
   * @return "True": the error of one scanline is carried over to the next
   * one, so scanlines have to be dithered sequentially.
   */

  static bool isRecursive() { return true; }

  /**
   * Function performs rounding, clipping and error diffusion, in place.
   *
   * @param ResScanline The buffer containing the final scanline; LenE
   * elements are processed.
   */

  void dither(fptype* const ResScanline) {
    const int PixelSize = Vars->ElCount;
    const fptype Lowest = 0.0;
    const fptype Quant = (fptype)TrMul0;
    const fptype Highest = (fptype)PkOut0;
    const fptype WeightNear = (fptype)0.364842;  // Right and below.
    const fptype WeightBelowLeft = (fptype)0.207305;
    const fptype WeightBelowRight = (fptype)0.063011;
    const int LastPixelStart = LenE - PixelSize;
    int idx;

    // Apply the error stored while dithering the previous scanline, then
    // clear the store so that it can collect this scanline's error.
    for (idx = 0; idx < LenE; idx++) {
      ResScanline[idx] += ResScanlineDith[idx];
      ResScanlineDith[idx] = 0.0;
    }

    // All pixels except the last one: the error goes to the pixel on the
    // right and to three positions of the stored (next) scanline.
    for (idx = 0; idx < LastPixelStart; idx++) {
      const fptype Rounded = round(ResScanline[idx] / Quant) * Quant;
      const fptype Err = ResScanline[idx] - Rounded;
      ResScanline[idx] = clamp(Rounded, Lowest, Highest);

      ResScanline[idx + PixelSize] += Err * WeightNear;
      ResScanlineDith[idx - PixelSize] += Err * WeightBelowLeft;
      ResScanlineDith[idx] += Err * WeightNear;
      ResScanlineDith[idx + PixelSize] += Err * WeightBelowRight;
    }

    // The last pixel has no right-hand neighbour: only the two stored
    // positions that exist receive the error.
    for (; idx < LenE; idx++) {
      const fptype Rounded = round(ResScanline[idx] / Quant) * Quant;
      const fptype Err = ResScanline[idx] - Rounded;
      ResScanline[idx] = clamp(Rounded, Lowest, Highest);

      ResScanlineDith[idx - PixelSize] += Err * WeightBelowLeft;
      ResScanlineDith[idx] += Err * WeightNear;
    }
  }

 protected:
  using CImageResizerDithererDefINL<fptype>::Len;
  using CImageResizerDithererDefINL<fptype>::Vars;
  using CImageResizerDithererDefINL<fptype>::LenE;
  using CImageResizerDithererDefINL<fptype>::TrMul0;
  using CImageResizerDithererDefINL<fptype>::PkOut0;

  CBuffer<fptype> ResScanlineDith0;  ///< Error diffusion buffer.
                                     ///<
  fptype* ResScanlineDith;  ///< Error diffusion buffer pointer which skips
                            ///< the first ElCount elements.
                            ///<
};

/**
 * @brief Floating-point processing definition and abstraction class.
 *
 * This class defines several constants and typedefs that point to classes
 * that should be used by the image resizing algorithm. Such "definition
 * class" can be used to define alternative scanline processing algorithms
 * (e.g. SIMD) and image scanline packing styles used during processing. This
 * class also offers an abstraction layer for dithering, rounding and
 * clamping (saturation) operation.
 *
 * The fpclass_def class can be used to define processing using both SIMD and
 * non-SIMD types, but using algorithms that operate on interleaved pixels
 * and are not SIMD-optimized themselves.
 *
 * @tparam afptype Floating point type to use for storing intermediate data
 * and variables. For variables that are not used in intensive calculations
 * the "double" type is always used. On the latest Intel processors (like
 * i7-4770K) there is almost no performance difference between "double" and
 * "float". Image quality differences between "double" and "float" are not
 * apparent on 8-bit images. At the same time the "float" uses half amount of
 * working memory the "double" type uses. SIMD types can be used. The
 * functions round() and clamp() in the "avir" or other visible namespace
 * should be available for the specified type. SIMD types allow to perform
 * resizing of images with more than 4 channels, to be exact 4 * SIMD element
 * number (e.g. 16 for float4), without modification of the image resizing
 * algorithm required.
 * @tparam afptypeatom The atomic type the "afptype" consists of.
 * @tparam adith Ditherer class to use during processing.
 */

template <class afptype, class afptypeatom = afptype,
          class adith = CImageResizerDithererDefINL<afptype> >
class fpclass_def {
 public:
  /// Floating-point type to use during processing.
  typedef afptype fptype;

  /// Atomic type "fptype" consists of.
  typedef afptypeatom fptypeatom;

  /// The number of atomic types stored in a single "fptype" element.
  static const int fppack = sizeof(fptype) / sizeof(fptypeatom);

  /// Suggested alignment size in bytes. This is not a required alignment,
  /// because image resizing algorithm cannot be made to have a strictly
  /// aligned data access at all steps (e.g. interpolation cannot perform
  /// aligned accesses).
  static const int fpalign = sizeof(fptype);

  /// Length alignment of arrays of elements. This applies to filters and
  /// intermediate buffers: this constant forces filters and scanlines to
  /// have a length which is a multiple of this value, for more efficient
  /// SIMD implementation.
  static const int elalign = 1;

  /// 0 if interleaved packing, 1 if de-interleaved.
  static const int packmode = 0;

  /// Filtering step class to use during processing.
  typedef CImageResizerFilterStepINL<fptype, fptypeatom> CFilterStep;

  /// Ditherer class to use during processing.
  typedef adith CDitherer;
};

/**
 * @brief Image resizer class.
 *
 * The object of this class can be used to resize 1-4 channel images to any
 * required size. Resizing is performed by utilizing interpolated sinc
 * fractional delay filters plus (if necessary) a cascade of built-in
 * sinc function-based 2X upsampling or 2X downsampling stages, followed by a
 * correction filtering.
 *
 * Object of this class can be allocated on stack.
 *
 * @tparam fpclass Floating-point processing definition class to use. See
 * avir::fpclass_def for more details.
 */

template <class fpclass = fpclass_def<float> >
class CImageResizer {
 public:
  /**
   * Constructor. Stores the bit depths and algorithm parameters, then
   * initializes the object's fixed filter bank (cutoff multiplier 1.0,
   * no external filter) and creates all of its filters.
   *
   * @param aResBitDepth Required bit depth of resulting image (1-16). If
   * integer value output is used (e.g. uint8_t), the bit depth also affects
   * rounding: for example, if aResBitDepth=6 and "Tout" is uint8_t, the
   * result will be rounded to 6 most significant bits (2 least significant
   * bits truncated, with dithering applied).
   * @param aSrcBitDepth Source image's real bit-depth. A value of 0 means
   * "same as aResBitDepth".
   * @param aParams Resizing algorithm's parameters to use. Leave out for
   * default values. Can be useful when performing automatic optimization of
   * parameters.
   */

  CImageResizer(const int aResBitDepth = 8, const int aSrcBitDepth = 0,
                const CImageResizerParams& aParams = CImageResizerParamsDef())
      : Params(aParams),
        SrcBitDepth(aSrcBitDepth != 0 ? aSrcBitDepth : aResBitDepth),
        ResBitDepth(aResBitDepth) {
    // Both bit depths are set at this point, which initFilterBank()
    // relies on.

    initFilterBank(FixedFilterBank, 1.0, false, CFltBuffer());
    FixedFilterBank.createAllFilters();
  }

  /**
   * Function resizes image.
   *
   * The image is first resized horizontally into a temporary floating-point
   * buffer, then vertically; finally the result is unpacked to the output
   * buffer (with dithering when the output type is an integer type).
   *
   * If the source image is empty (SrcWidth or SrcHeight equals 0) the whole
   * output image (NewWidth * NewHeight * ElCountIO elements) is zero-filled.
   * If the output image is empty, the function returns without doing
   * anything.
   *
   * @param SrcBuf Source image buffer.
   * @param SrcWidth Source image width.
   * @param SrcHeight Source image height.
   * @param SrcScanlineSize Physical size of source scanline in elements
   * (not bytes). If this value is below 1, SrcWidth * ElCountIO will be
   * used as the physical source scanline size.
   * @param[out] NewBuf Buffer to accept the resized image. Can be equal to
   * SrcBuf if the size of the resized image is smaller or equal to source
   * image in size. Output scanlines are written without padding, each
   * occupying NewWidth * ElCountIO elements.
   * @param NewWidth New image width.
   * @param NewHeight New image height.
   * @param ElCountIO The number of elements (channels) used to store each
   * source and destination pixel (1-4).
   * @param k Resizing step (one output pixel corresponds to "k" input
   * pixels). A downsizing factor if > 1.0; upsizing factor if <= 1.0.
   * Multiply by -1 if you would like to bypass "ox" and "oy" adjustment
   * which is done by default to produce a centered image. If step value
   * equals 0, the step value will be chosen automatically and independently
   * for horizontal and vertical resizing.
   * @param[in,out] aVars Pointer to variables structure to be passed to the
   * image resizing function. Can be NULL. Only variables that are
   * initialized in default constructor of this structure are accepted by
   * this function. These variables will not be changed by this function.
   * All other variables can be modified by this function. The access to
   * this object is not thread-safe, each concurrent instance of this
   * function should use a separate aVars object.
   * @tparam Tin Input buffer element's type. Can be uint8_t (0-255 value
   * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
   * double (0.0-1.0 value range). Larger integer types are treated as
   * uint16_t. Signed integer types are unsupported.
   * @tparam Tout Output buffer element's type. Can be uint8_t (0-255 value
   * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
   * double (0.0-1.0 value range). Larger integer types are treated as
   * uint16_t. Signed integer types are unsupported.
   */

  template <class Tin, class Tout>
  void resizeImage(const Tin* const SrcBuf, const int SrcWidth,
                   const int SrcHeight, int SrcScanlineSize, Tout* const NewBuf,
                   const int NewWidth, const int NewHeight, const int ElCountIO,
                   const double k,
                   CImageResizerVars* const aVars = NULL) const {
    if (SrcWidth == 0 || SrcHeight == 0) {
      // Nothing to resample: clear the complete output image. Every output
      // pixel consists of ElCountIO elements, so the channel count has to
      // be a part of the byte count.

      memset(NewBuf, 0,
             (size_t)NewWidth * NewHeight * ElCountIO * sizeof(Tout));

      return;
    } else if (NewWidth == 0 || NewHeight == 0) {
      return;
    }

    CImageResizerVars DefVars;
    CImageResizerVars& Vars = (aVars == NULL ? DefVars : *aVars);

    CImageResizerThreadPool DefThreadPool;
    CImageResizerThreadPool& ThreadPool =
        (Vars.ThreadPool == NULL ? DefThreadPool : *Vars.ThreadPool);

    // Define resizing steps, also optionally modify offsets so that
    // resizing produces a "centered" image.

    double kx;
    double ky;
    double ox = Vars.ox;
    double oy = Vars.oy;

    if (k == 0.0) {
      // Automatic step selection, independent for each dimension.

      if (NewWidth > SrcWidth) {
        kx = (double)(SrcWidth - 1) / (NewWidth - 1);
      } else {
        kx = (double)SrcWidth / NewWidth;
        ox += (kx - 1.0) * 0.5;
      }

      if (NewHeight > SrcHeight) {
        ky = (double)(SrcHeight - 1) / (NewHeight - 1);
      } else {
        ky = (double)SrcHeight / NewHeight;
        oy += (ky - 1.0) * 0.5;
      }
    } else if (k > 0.0) {
      kx = k;
      ky = k;

      if (k > 1.0) {
        const double ko = (k - 1.0) * 0.5;
        ox += ko;
        oy += ko;
      }
    } else {
      // Negative step: use its magnitude, bypass offset adjustment.

      kx = -k;
      ky = -k;
    }

    // Evaluate pre-multipliers used on the output stage.

    // For an integer type, 0.4 converts to 0; for a floating-point type it
    // stays non-zero.

    const bool IsInFloat = ((Tin)0.4 != 0);
    const bool IsOutFloat = ((Tout)0.4 != 0);
    double OutMul;  // Output multiplier.

    if (Vars.UseSRGBGamma) {
      if (IsInFloat) {
        Vars.InGammaMult = 1.0;
      } else {
        Vars.InGammaMult = 1.0 / (sizeof(Tin) == 1 ? 255.0 : 65535.0);
      }

      if (IsOutFloat) {
        Vars.OutGammaMult = 1.0;
      } else {
        Vars.OutGammaMult = (sizeof(Tout) == 1 ? 255.0 : 65535.0);
      }

      OutMul = 1.0;
    } else {
      if (IsOutFloat) {
        OutMul = 1.0;
      } else {
        OutMul = (sizeof(Tout) == 1 ? 255.0 : 65535.0);
      }

      if (!IsInFloat) {
        OutMul /= (sizeof(Tin) == 1 ? 255.0 : 65535.0);
      }
    }

    // Fill widely-used variables.

    const int ElCount = (ElCountIO + fpclass ::fppack - 1) / fpclass ::fppack;

    const int NewWidthE = NewWidth * ElCount;

    if (SrcScanlineSize < 1) {
      SrcScanlineSize = SrcWidth * ElCountIO;
    }

    Vars.ElCount = ElCount;
    Vars.ElCountIO = ElCountIO;
    Vars.fppack = fpclass ::fppack;
    Vars.fpalign = fpclass ::fpalign;
    Vars.elalign = fpclass ::elalign;
    Vars.packmode = fpclass ::packmode;

    // Horizontal scanline filtering and resizing.

    CDSPFracFilterBankLin<fptype> FltBank;
    CFilterSteps FltSteps;
    typename CFilterStep ::CRPosBufArray RPosBufArray;
    CBuffer<uint8_t> UsedFracMap;

    // Perform the filtering steps modeling at various modes, find the
    // most efficient mode for both horizontal and vertical resizing.

    int UseBuildMode = 1;
    const int BuildModeCount = (FixedFilterBank.getOrder() == 0 ? 4 : 2);

    int m;

    if (Vars.BuildMode >= 0) {
      UseBuildMode = Vars.BuildMode;
    } else {
      int BestScore = 0x7FFFFFFF;

      for (m = 0; m < BuildModeCount; m++) {
        CDSPFracFilterBankLin<fptype> TmpBank;
        CFilterSteps TmpSteps;
        Vars.k = kx;
        Vars.o = ox;
        buildFilterSteps(TmpSteps, Vars, TmpBank, OutMul, m, true);
        updateFilterStepBuffers(TmpSteps, Vars, RPosBufArray, SrcWidth,
                                NewWidth);

        fillUsedFracMap(TmpSteps[Vars.ResizeStep], UsedFracMap);
        const int c = calcComplexity(TmpSteps, Vars, UsedFracMap, SrcHeight);

        if (c < BestScore) {
          UseBuildMode = m;
          BestScore = c;
        }
      }
    }

    // Perform the actual filtering steps building.

    Vars.k = kx;
    Vars.o = ox;
    buildFilterSteps(FltSteps, Vars, FltBank, OutMul, UseBuildMode, false);

    updateFilterStepBuffers(FltSteps, Vars, RPosBufArray, SrcWidth, NewWidth);

    updateBufLenAndRPosPtrs(FltSteps, Vars, NewWidth);

    const int ThreadCount = ThreadPool.getSuggestedWorkloadCount();
    // Includes the current thread.

    CStructArray<CThreadData<Tin, Tout> > td;
    td.setItemCount(ThreadCount);
    int i;

    // Workload 0 is processed by the calling thread, hence it is not added
    // to the thread pool.

    for (i = 0; i < ThreadCount; i++) {
      if (i > 0) {
        ThreadPool.addWorkload(&td[i]);
      }

      td[i].init(i, ThreadCount, FltSteps, Vars);

      td[i].initScanlineQueue(td[i].sopResizeH, SrcHeight, SrcWidth);
    }

    CBuffer<fptype, size_t> FltBuf(
        (size_t)NewWidthE * SrcHeight,
        fpclass ::fpalign);  // Temporary buffer that receives
                             // horizontally-filtered and resized image.

    // Source scanlines are distributed between workloads in a round-robin
    // manner.

    for (i = 0; i < SrcHeight; i++) {
      td[i % ThreadCount].addScanlineToQueue(
          (void*)&SrcBuf[(size_t)i * SrcScanlineSize],
          &FltBuf[(size_t)i * NewWidthE]);
    }

    ThreadPool.startAllWorkloads();
    td[0].processScanlineQueue();
    ThreadPool.waitAllWorkloadsToFinish();

    // Vertical scanline filtering and resizing, reuse previously defined
    // filtering steps if possible.

    const int PrevUseBuildMode = UseBuildMode;

    if (Vars.BuildMode >= 0) {
      UseBuildMode = Vars.BuildMode;
    } else {
      CImageResizerVars TmpVars(Vars);
      int BestScore = 0x7FFFFFFF;

      for (m = 0; m < BuildModeCount; m++) {
        CDSPFracFilterBankLin<fptype> TmpBank;
        TmpBank.copyInitParams(FltBank);
        CFilterSteps TmpSteps;
        TmpVars.k = ky;
        TmpVars.o = oy;
        buildFilterSteps(TmpSteps, TmpVars, TmpBank, 1.0, m, true);
        updateFilterStepBuffers(TmpSteps, TmpVars, RPosBufArray, SrcHeight,
                                NewHeight);

        fillUsedFracMap(TmpSteps[TmpVars.ResizeStep], UsedFracMap);

        const int c = calcComplexity(TmpSteps, TmpVars, UsedFracMap, NewWidth);

        if (c < BestScore) {
          UseBuildMode = m;
          BestScore = c;
        }
      }
    }

    Vars.k = ky;
    Vars.o = oy;

    if (UseBuildMode == PrevUseBuildMode && ky == kx) {
      // The horizontal steps can be reused, but the output multiplier was
      // already applied during the horizontal pass, so it is cancelled.

      if (OutMul != 1.0) {
        modifyCorrFilterDCGain(FltSteps, 1.0 / OutMul);
      }
    } else {
      buildFilterSteps(FltSteps, Vars, FltBank, 1.0, UseBuildMode, false);
    }

    updateFilterStepBuffers(FltSteps, Vars, RPosBufArray, SrcHeight, NewHeight);

    updateBufLenAndRPosPtrs(FltSteps, Vars, NewWidth);

    if (IsOutFloat && sizeof(FltBuf[0]) == sizeof(Tout) &&
        fpclass ::packmode == 0) {
      // In-place output.

      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopResizeV, NewWidth, SrcHeight,
                                NewWidthE, NewWidthE);
      }

      for (i = 0; i < NewWidth; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &FltBuf[(size_t)i * ElCount],
            (fptype*)&NewBuf[(size_t)i * ElCount]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
      ThreadPool.removeAllWorkloads();

      return;
    }

    CBuffer<fptype, size_t> ResBuf((size_t)NewWidthE * NewHeight,
                                   fpclass ::fpalign);

    for (i = 0; i < ThreadCount; i++) {
      td[i].initScanlineQueue(td[i].sopResizeV, NewWidth, SrcHeight, NewWidthE,
                              NewWidthE);
    }

    const int im = (fpclass ::packmode == 0 ? ElCount : 1);

    for (i = 0; i < NewWidth; i++) {
      td[i % ThreadCount].addScanlineToQueue(&FltBuf[(size_t)i * im],
                                             &ResBuf[(size_t)i * im]);
    }

    ThreadPool.startAllWorkloads();
    td[0].processScanlineQueue();
    ThreadPool.waitAllWorkloadsToFinish();

    if (IsOutFloat) {
      // Perform output, but skip dithering.

      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopUnpackH, NewHeight, NewWidth);
      }

      for (i = 0; i < NewHeight; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &ResBuf[(size_t)i * NewWidthE],
            &NewBuf[(size_t)i * NewWidth * ElCountIO]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
      ThreadPool.removeAllWorkloads();

      return;
    }

    // Perform output with dithering (for integer output only).

    int TruncBits;  // The number of lower bits to truncate and dither.
    int OutRange;   // Output range.

    if (sizeof(Tout) == 1) {
      TruncBits = 8 - ResBitDepth;
      OutRange = 255;
    } else {
      TruncBits = 16 - ResBitDepth;
      OutRange = 65535;
    }

    const double PkOut = OutRange;
    const double TrMul =
        (TruncBits > 0 ? PkOut / (OutRange >> TruncBits) : 1.0);

    if (CDitherer ::isRecursive()) {
      // A recursive ditherer is run by the calling thread only, over
      // scanlines in their natural order.

      td[0].getDitherer().init(NewWidth, Vars, TrMul, PkOut);

      if (Vars.UseSRGBGamma) {
        for (i = 0; i < NewHeight; i++) {
          fptype* const ResScanline = &ResBuf[(size_t)i * NewWidthE];

          CFilterStep ::applySRGBGamma(ResScanline, NewWidth, Vars);

          td[0].getDitherer().dither(ResScanline);

          CFilterStep ::unpackScanline(
              ResScanline, &NewBuf[(size_t)i * NewWidth * ElCountIO], NewWidth,
              Vars);
        }
      } else {
        for (i = 0; i < NewHeight; i++) {
          fptype* const ResScanline = &ResBuf[(size_t)i * NewWidthE];

          td[0].getDitherer().dither(ResScanline);

          CFilterStep ::unpackScanline(
              ResScanline, &NewBuf[(size_t)i * NewWidth * ElCountIO], NewWidth,
              Vars);
        }
      }
    } else {
      for (i = 0; i < ThreadCount; i++) {
        td[i].initScanlineQueue(td[i].sopDitherAndUnpackH, NewHeight, NewWidth);

        td[i].getDitherer().init(NewWidth, Vars, TrMul, PkOut);
      }

      for (i = 0; i < NewHeight; i++) {
        td[i % ThreadCount].addScanlineToQueue(
            &ResBuf[(size_t)i * NewWidthE],
            &NewBuf[(size_t)i * NewWidth * ElCountIO]);
      }

      ThreadPool.startAllWorkloads();
      td[0].processScanlineQueue();
      ThreadPool.waitAllWorkloadsToFinish();
    }

    ThreadPool.removeAllWorkloads();
  }

 private:
  typedef typename fpclass ::fptype fptype;  ///< Floating-point type to use
                                             ///< during processing, taken
                                             ///< from "fpclass".
                                             ///<
  typedef typename fpclass ::CFilterStep
      CFilterStep;  ///< Filtering step
                    ///< class to use during processing, taken from
                    ///< "fpclass".
                    ///<
  typedef typename fpclass ::CDitherer CDitherer;  ///< Ditherer class to
                                                   ///< use during processing,
                                                   ///< taken from "fpclass".
                                                   ///<
  CImageResizerParams Params;  ///< Algorithm's parameters currently in use,
                               ///< a copy of the object passed to the
                               ///< constructor.
                               ///<
  int SrcBitDepth;             ///< Bit resolution of the source image. Equals
                               ///< ResBitDepth if 0 was passed to the
                               ///< constructor.
                               ///<
  int ResBitDepth;             ///< Bit resolution of the resulting image.
                               ///<
  CDSPFracFilterBankLin<fptype>
      FixedFilterBank;  ///< Fractional delay
                        ///< filter bank with fixed characteristics, mainly for
                        ///< upsizing cases. Initialized in the constructor
                        ///< with a cutoff multiplier of 1.0 and no external
                        ///< filter.
                        ///<

  /**
   * @brief Filtering steps array.
   *
   * An object of this type stores a sequence of filtering steps
   * (CFilterStep objects) together.
   */

  typedef CStructArray<CFilterStep> CFilterSteps;

  /**
   * Function initializes the supplied fractional delay filter bank. The
   * interpolation order and the number of fractional delay filters are
   * derived from the larger of the source and resulting image bit depths.
   *
   * @param FltBank Filter bank to initialize.
   * @param CutoffMult Cutoff multiplier, 0 to 1. 1 corresponds to 0.5pi
   * cutoff point. The interpolation filter's length is divided by this
   * value, its cutoff frequency is multiplied by it.
   * @param ForceHiOrder "True" if a high-order interpolation should be
   * forced which requires considerably less resources for initialization.
   * @param ExtFilter External filter to apply to interpolation filter.
   */

  void initFilterBank(CDSPFracFilterBankLin<fptype>& FltBank,
                      const double CutoffMult, const bool ForceHiOrder,
                      const CFltBuffer& ExtFilter) const {
    // The more demanding of the two bit depths defines the precision.

    int WorkBitDepth = SrcBitDepth;

    if (ResBitDepth > SrcBitDepth) {
      WorkBitDepth = ResBitDepth;
    }

    const double TargetSNR = -6.02 * (WorkBitDepth + 3);

    // Order 1 (-146 dB max) is used when forced or for bit depths above 8,
    // order 0 (-72 dB max) otherwise. The number of fractional delay
    // filters sampled by the bank affects the signal-to-noise ratio at the
    // interpolation stage. Theoretically, at order 1, 8-bit image resizing
    // requires 66.2 dB SNR or 11 filters, 16-bit resizing requires 114.4 dB
    // SNR or 150 filters. At order 0 the required number of filters is
    // exponentially higher.

    const bool IsHiOrder = (ForceHiOrder || WorkBitDepth > 8);
    const int Order = (IsHiOrder ? 1 : 0);
    const double FilterCountEst =
        (IsHiOrder ? 0.23134052 * exp(-0.058062929 * TargetSNR)
                   : 0.33287686 * exp(-0.11334583 * TargetSNR));

    int FilterCount = (int)ceil(FilterCountEst);

    if (FilterCount < 2) {
      FilterCount = 2;
    }

    FltBank.init(FilterCount, Order, Params.IntFltLen / CutoffMult,
                 Params.IntFltCutoff * CutoffMult, Params.IntFltAlpha,
                 ExtFilter, fpclass ::fpalign, fpclass ::elalign);
  }

  /**
   * Function allocates a filter buffer whose capacity is the requested
   * capacity rounded up to a multiple of fpclass :: elalign. Elements beyond
   * the requested capacity are set to zero. The rounding uses bit masking.
   *
   * @param Flt Filter buffer.
   * @param ReqCapacity The required filter buffer's capacity.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter allocation: in this case only the buffer's capacity value
   * is forced, no elements are written.
   * @param FltExt If non-NULL this variable will receive the number of
   * elements the filter was extended by.
   */

  static void allocFilter(CBuffer<fptype>& Flt, const int ReqCapacity,
                          const bool IsModel = false,
                          int* const FltExt = NULL) {
    const int AlignMask = fpclass ::elalign - 1;
    const int PaddedCapacity = (ReqCapacity + AlignMask) & ~AlignMask;

    if (FltExt != NULL) {
      *FltExt = PaddedCapacity - ReqCapacity;
    }

    if (IsModel) {
      Flt.forceCapacity(PaddedCapacity);
    } else {
      Flt.alloc(PaddedCapacity, fpclass ::fpalign);

      // Clear the padding elements, from the last one down to the first.

      for (int i = PaddedCapacity - 1; i >= ReqCapacity; i--) {
        Flt[i] = 0.0;
      }
    }
  }

  /**
   * Function assigns filter parameters to the specified filtering step
   * object: it designs a low-pass filter (a "half-band" or a general one),
   * stores it in the step, and for upsampling steps also builds the
   * PrefixDC and SuffixDC "tail" buffers.
   *
   * @param fs Filtering step to assign parameter to. This step cannot be
   * the last step if ResampleFactor greater than 1 was specified.
   * @param IsUpsample "True" if upsampling step. Should be set to "false"
   * if FltCutoff is negative.
   * @param ResampleFactor Resampling factor of this filter (>=1).
   * @param FltCutoff Filter cutoff point. This value will be divided by the
   * ResampleFactor if IsUpsample equals "true". If zero value was
   * specified, the "half-band" predefined filter will be created. In this
   * case the ResampleFactor will modify the filter cutoff point.
   * @param DCGain DC gain to apply to the filter. Assigned to filtering
   * step's DCGain variable (multiplied by ResampleFactor if IsUpsample
   * equals "true").
   * @param UseFltOrig "True" if the originally-designed filter should be
   * left in filtering step's FltOrig buffer. Otherwise it will be freed.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  void assignFilterParams(CFilterStep& fs, const bool IsUpsample,
                          const int ResampleFactor, const double FltCutoff,
                          const double DCGain, const bool UseFltOrig,
                          const bool IsModel) const {
    double FltAlpha;
    double Len2;
    double Freq;

    if (FltCutoff == 0.0) {
      // "Half-band" filter: parameters come from the HBFlt* values, scaled
      // according to the resampling factor.

      const double m = 2.0 / ResampleFactor;
      FltAlpha = Params.HBFltAlpha;
      Len2 = 0.5 * Params.HBFltLen / m;
      Freq = AVIR_PI * Params.HBFltCutoff * m;
    } else {
      // General low-pass filter: parameters come from the LPFlt* values,
      // the length is inversely proportional to the cutoff point.

      FltAlpha = Params.LPFltAlpha;
      Len2 = 0.25 * Params.LPFltBaseLen / FltCutoff;
      Freq = AVIR_PI * Params.LPFltCutoffMult * FltCutoff;
    }

    if (IsUpsample) {
      // An upsampling filter is ResampleFactor times longer, has a
      // ResampleFactor times lower cutoff frequency, and a ResampleFactor
      // times higher DC gain.

      Len2 *= ResampleFactor;
      Freq /= ResampleFactor;
      fs.DCGain = DCGain * ResampleFactor;
    } else {
      fs.DCGain = DCGain;
    }

    // Store the design parameters together with the original filter.

    fs.FltOrig.Len2 = Len2;
    fs.FltOrig.Freq = Freq;
    fs.FltOrig.Alpha = FltAlpha;
    fs.FltOrig.DCGain = fs.DCGain;

    CDSPPeakedCosineLPF w(Len2, Freq, FltAlpha);

    fs.IsUpsample = IsUpsample;
    fs.ResampleFactor = ResampleFactor;
    fs.FltLatency = w.fl2;

    int FltExt;  // Filter's extension due to fpclass :: elalign. Assigned
                 // by allocFilter() in both branches below.

    if (IsModel) {
      allocFilter(fs.Flt, w.FilterLen, true, &FltExt);

      if (UseFltOrig) {
        // Allocate a real buffer even in modeling mode since this
        // filter may be copied by the filter bank.

        fs.FltOrig.alloc(w.FilterLen);
        memset(&fs.FltOrig[0], 0, w.FilterLen * sizeof(fs.FltOrig[0]));
      }
    } else {
      fs.FltOrig.alloc(w.FilterLen);

      // Design the filter in FltOrig, then copy it into the (possibly
      // longer, zero-padded) Flt buffer. FltOrig's capacity is re-read
      // after optimizeFIRFilter() - presumably that call may change the
      // filter's length and latency (TODO confirm).

      w.generateLPF(&fs.FltOrig[0], 1.0);
      optimizeFIRFilter(fs.FltOrig, fs.FltLatency);
      normalizeFIRFilter(&fs.FltOrig[0], fs.FltOrig.getCapacity(), fs.DCGain);

      allocFilter(fs.Flt, fs.FltOrig.getCapacity(), false, &FltExt);
      copyArray(&fs.FltOrig[0], &fs.Flt[0], fs.FltOrig.getCapacity());

      if (!UseFltOrig) {
        fs.FltOrig.free();
      }
    }

    if (IsUpsample) {
      // Length of the filter's part that follows the latency position plus
      // ResampleFactor, excluding the alignment padding.

      int l = fs.Flt.getCapacity() - fs.FltLatency - ResampleFactor - FltExt;

      allocFilter(fs.PrefixDC, l, IsModel);
      allocFilter(fs.SuffixDC, fs.FltLatency, IsModel);

      if (IsModel) {
        return;
      }

      // Create prefix and suffix "tails" used during upsampling.

      // PrefixDC: sum of the filter's right-hand part and its copies
      // advanced by successive multiples of ResampleFactor.

      const fptype* ip = &fs.Flt[fs.FltLatency + ResampleFactor];
      copyArray(ip, &fs.PrefixDC[0], l);

      while (true) {
        ip += ResampleFactor;
        l -= ResampleFactor;

        if (l <= 0) {
          break;
        }

        addArray(ip, &fs.PrefixDC[0], l);
      }

      // SuffixDC: the filter's first FltLatency elements, with the
      // filter's beginning added repeatedly at output positions advanced by
      // successive multiples of ResampleFactor.

      l = fs.FltLatency;
      fptype* op = &fs.SuffixDC[0];
      copyArray(&fs.Flt[0], op, l);

      while (true) {
        op += ResampleFactor;
        l -= ResampleFactor;

        if (l <= 0) {
          break;
        }

        addArray(&fs.Flt[0], op, l);
      }
    } else if (!UseFltOrig) {
      // A non-upsampling step whose original filter is not kept uses the
      // default edge pixel count.

      fs.EdgePixelCount = fs.EdgePixelCountDef;
    }
  }

  /**
   * Function adds a correction filter that tries to achieve a linear
   * frequency response at all frequencies. The actual resulting response
   * may feature a slight damping of the highest frequencies since a
   * suitably short correction filter cannot fix steep high-frequency
   * damping.
   *
   * This function assumes that the resizing step is currently the last
   * step, even if it was not inserted yet: this allows placement of the
   * correction filter both before and after the resizing step.
   *
   * @param Steps Filtering steps.
   * @param bw Resulting bandwidth relative to the original bandwidth (which
   * is 1.0), usually 1/k. Should be <= 1.0.
   * @param IsPreCorrection "True" if the filtering step was already created
   * and it is first in the Steps array. "True" also adds edge pixels to
   * reduce edge artifacts. If "false", a new step is appended to the Steps
   * array.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  void addCorrectionFilter(CFilterSteps& Steps, const double bw,
                           const bool IsPreCorrection,
                           const bool IsModel) const {
    CFilterStep& fs = (IsPreCorrection ? Steps[0] : Steps.add());
    fs.IsUpsample = false;
    fs.ResampleFactor = 1;
    fs.DCGain = 1.0;
    fs.EdgePixelCount = (IsPreCorrection ? fs.EdgePixelCountDef : 0);

    if (IsModel) {
      // Modeling only: force the filter's capacity, do not design it.
      // fs.FltLatency is passed to calcFilterLength() - presumably it is
      // filled in by that call (TODO confirm).

      allocFilter(
          fs.Flt,
          CDSPFIREQ ::calcFilterLength(Params.CorrFltLen, fs.FltLatency), true);

      return;
    }

    const int BinCount = 65;  // Frequency response bins to control.
    const int BinCount1 = BinCount - 1;
    double curbw = 1.0;  // Bandwidth of the filter at the current step.
    int i;
    int j;
    double re;
    double im;

    CBuffer<double> Bins(BinCount);  // Adjustment introduced by all
                                     // steps at all frequencies of interest.

    for (j = 0; j < BinCount; j++) {
      Bins[j] = 1.0;
    }

    // Visit all steps except the correction step itself: it is the first
    // step in the pre-correction case, and the last (just added) step
    // otherwise.

    const int si = (IsPreCorrection ? 1 : 0);

    for (i = si; i < Steps.getItemCount() - (si ^ 1); i++) {
      // NOTE: this "fs" shadows the correction step reference declared
      // above, within the loop body only.

      const CFilterStep& fs = Steps[i];

      if (fs.IsUpsample) {
        curbw *= fs.ResampleFactor;

        // An upsampling step that keeps its original filter (FltOrig) is
        // not accounted for here.

        if (fs.FltOrig.getCapacity() > 0) {
          continue;
        }
      }

      const double dcg = 1.0 / fs.DCGain;  // DC gain correction.
      const fptype* Flt;
      int FltLen;

      if (fs.ResampleFactor == 0) {
        // Resizing step (ResampleFactor equals 0): use filter 0 of its
        // filter bank.

        Flt = fs.FltBank->getFilter(0);
        FltLen = fs.FltBank->getFilterLen();
      } else {
        Flt = &fs.Flt[0];
        FltLen = fs.Flt.getCapacity();
      }

      // Calculate frequency response adjustment introduced by the
      // filter at this step, within the bounds of bandwidth of
      // interest.

      for (j = 0; j < BinCount; j++) {
        const double th = AVIR_PI * bw / curbw * j / BinCount1;

        calcFIRFilterResponse(Flt, FltLen, th, re, im);

        // Divide by the filter's magnitude response with the DC gain
        // taken out.

        Bins[j] /= sqrt(re * re + im * im) * dcg;
      }

      if (!fs.IsUpsample && fs.ResampleFactor > 1) {
        curbw /= fs.ResampleFactor;
      }
    }

    // Calculate filter.

    CDSPFIREQ EQ;
    EQ.init(bw * 2.0, Params.CorrFltLen, BinCount, 0.0, bw, false,
            Params.CorrFltAlpha);

    fs.FltLatency = EQ.getFilterLatency();

    // The filter is normalized to unity gain both before and after
    // optimizeFIRFilter().

    CBuffer<double> Filter(EQ.getFilterLength());
    EQ.buildFilter(Bins, &Filter[0]);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);
    optimizeFIRFilter(Filter, fs.FltLatency);
    normalizeFIRFilter(&Filter[0], Filter.getCapacity(), 1.0);

    allocFilter(fs.Flt, Filter.getCapacity());
    copyArray(&Filter[0], &fs.Flt[0], Filter.getCapacity());

    // Print a theoretically achieved final frequency response at various
    // feature sizes (from DC to 1 pixel). Values above 255 mean features
    // become brighter, values below 255 mean features become dimmer.

    /*		const double sbw = ( bw > 1.0 ? 1.0 / bw : 1.0 );

                    for( j = 0; j < BinCount; j++ )
                    {
                            const double th = AVIR_PI * sbw * j / BinCount1;

                            calcFIRFilterResponse( &fs.Flt[ 0 ],
       fs.Flt.getCapacity(), th, re, im );

                            printf( "%f\n", sqrt( re * re + im * im ) / Bins[ j
       ] * 255 );
                    }

                    printf( "***\n" );*/
  }

  /**
   * Function appends a test sharpening filter step when the image is being
   * upsized (bw above 1.0); otherwise it does nothing. Such sharpening
   * allows to spot interpolation filter's stop-band attenuation: if
   * attenuation is too weak, a "dark grid" and other artifacts may become
   * visible.
   *
   * It is assumed that 40 decibel stop-band attenuation should be
   * considered a required minimum: this allows application of (deliberately
   * strong) 64X sharpening without spotting any artifacts.
   *
   * @param Steps Filtering steps.
   * @param bw Resulting bandwidth relative to the original bandwidth (which
   * is 1.0), usually 1/k.
   * @param IsModel "True" if filtering steps modeling is performed without
   * actual filter building.
   */

  static void addSharpenTest(CFilterSteps& Steps, const double bw,
                             const bool IsModel) {
    if (bw <= 1.0) {
      return;
    }

    const double FltLen = 10.0 * bw;

    CFilterStep& fs = Steps.add();
    fs.EdgePixelCount = 0;
    fs.DCGain = 1.0;
    fs.ResampleFactor = 1;
    fs.IsUpsample = false;

    if (IsModel) {
      const int ModelLen = CDSPFIREQ ::calcFilterLength(FltLen, fs.FltLatency);
      allocFilter(fs.Flt, ModelLen, true);

      return;
    }

    // Target response: unity gain below the threshold bin, a gain of 256
    // at and above it.

    const int BinCount = 200;
    CBuffer<double> Bins(BinCount);
    int Thresh = (int)round(BinCount / bw * 1.75);

    if (Thresh > BinCount) {
      Thresh = BinCount;
    }

    for (int b = 0; b < BinCount; b++) {
      Bins[b] = (b < Thresh ? 1.0 : 256.0);
    }

    CDSPFIREQ Equalizer;
    Equalizer.init(bw * 2.0, FltLen, BinCount, 0.0, bw, false, 1.7);

    fs.FltLatency = Equalizer.getFilterLatency();

    // Build the filter, normalizing it to unity gain both before and after
    // optimizeFIRFilter().

    CBuffer<double> Kernel(Equalizer.getFilterLength());
    Equalizer.buildFilter(Bins, &Kernel[0]);
    normalizeFIRFilter(&Kernel[0], Kernel.getCapacity(), 1.0);
    optimizeFIRFilter(Kernel, fs.FltLatency);
    normalizeFIRFilter(&Kernel[0], Kernel.getCapacity(), 1.0);

    allocFilter(fs.Flt, Kernel.getCapacity());
    copyArray(&Kernel[0], &fs.Flt[0], Kernel.getCapacity());

    // Debug output of the resulting filter's magnitude response (disabled).

    /*
    for( int b = 0; b < BinCount; b++ )
    {
            const double th = AVIR_PI * b / ( BinCount - 1 );
            double re;
            double im;

            calcFIRFilterResponse( &fs.Flt[ 0 ], fs.Flt.getCapacity(), th,
                    re, im );

            printf( "%f\n", sqrt( re * re + im * im ));
    }

    printf( "***\n" );
    */
  }

  /**
   * Function builds sequence of filtering steps depending on the specified
   * resizing coefficient. The last steps included are always the resizing
   * step then (possibly) the correction step.
   *
   * @param Steps Array that receives filtering steps. Cleared on entry.
   * @param[in,out] Vars Variables object. The "k" variable is read; the
   * "ResizeStep" variable receives the index of the resizing step.
   * @param FltBank Filter bank to initialize and use.
   * @param DCGain The overall DC gain to apply. This DC gain is applied to
   * the first filtering step only (upsampling or filtering step).
   * @param ModeFlags Build mode flags to use. This is a bitmap of switches
   * that enable or disable certain algorithm features: bit 0 (value 1)
   * enables filter and interpolator combining, bit 1 (value 2) forces a
   * higher-order interpolation, bit 2 (value 4) enables half-band filters.
   * @param IsModel "True" if filtering steps modeling is performed without
   * the actual filter allocation and building.
   */

  void buildFilterSteps(CFilterSteps& Steps, CImageResizerVars& Vars,
                        CDSPFracFilterBankLin<fptype>& FltBank,
                        const double DCGain, const int ModeFlags,
                        const bool IsModel) const {
    Steps.clear();

    const bool DoFltAndIntCombo =
        ((ModeFlags & 1) != 0);  // Do filter
                                 // and interpolator combining.
    const bool ForceHiOrderInt =
        ((ModeFlags & 2) != 0);  // Force use
                                 // of a higher-order interpolation.
    const bool UseHalfband = ((ModeFlags & 4) != 0);  // Use half-band
                                                      // filter.

    const double bw = 1.0 / Vars.k;  // Resulting bandwidth.

    // 2X upsampling is used whenever the resizing step is below 2,
    // otherwise downsampling filters are used.

    const int UpsampleFactor = ((int)floor(Vars.k) < 2 ? 2 : 1);
    double IntCutoffMult;     // Interpolation filter cutoff multiplier.
    CFilterStep* ReuseStep;   // If not NULL, resizing step should use
                              // this step object instead of creating a new one.
    CFilterStep* ExtFltStep;  // Use FltOrig of this step as the external
                              // filter to be applied to the interpolator.
    bool IsPreCorrection;     // "True" if the correction filter is applied
                              // first.
    double FltCutoff;         // Cutoff frequency of the first filtering step.
    double corrbw;            // Bandwidth at the correction step.

    if (Vars.k <= 1.0) {
      // Upsizing: reserve the first step for the correction filter, it is
      // filled in by addCorrectionFilter() at the end of this function.

      IsPreCorrection = true;
      FltCutoff = 1.0;
      corrbw = 1.0;
      Steps.add();
    } else {
      IsPreCorrection = false;
      FltCutoff = bw;
      corrbw = bw;
    }

    // Add 1 upsampling or several downsampling filters.

    if (UpsampleFactor > 1) {
      CFilterStep& fs = Steps.add();
      assignFilterParams(fs, true, UpsampleFactor, FltCutoff, DCGain,
                         DoFltAndIntCombo, IsModel);

      IntCutoffMult = FltCutoff * 2.0 / UpsampleFactor;
      ReuseStep = NULL;
      ExtFltStep = (DoFltAndIntCombo ? &fs : NULL);
    } else {
      int DownsampleFactor;

      // Add half-band downsampling steps while they are required, raising
      // the remaining cutoff frequency after each of them.

      while (true) {
        DownsampleFactor = (int)floor(0.5 / FltCutoff);
        bool DoHBFltAdd;

        if (DownsampleFactor > 16) {
          // Add half-band filter unconditionally in order to keep
          // filter lengths lower for more precise frequency
          // response and less edge artifacts.

          DoHBFltAdd = true;
          DownsampleFactor = 16;
        } else {
          DoHBFltAdd = (UseHalfband && DownsampleFactor > 1);
        }

        if (DoHBFltAdd) {
          // Zero cutoff value selects the "half-band" filter.

          assignFilterParams(Steps.add(), false, DownsampleFactor, 0.0, 1.0,
                             false, IsModel);

          FltCutoff *= DownsampleFactor;
        } else {
          if (DownsampleFactor < 1) {
            DownsampleFactor = 1;
          }

          break;
        }
      }

      // The final low-pass filtering step, it receives the overall DC gain.

      CFilterStep& fs = Steps.add();
      assignFilterParams(fs, false, DownsampleFactor, FltCutoff, DCGain,
                         DoFltAndIntCombo, IsModel);

      IntCutoffMult = FltCutoff / 0.5;

      if (DoFltAndIntCombo) {
        // The filtering step is turned into the resizing step itself, its
        // original filter is passed to the interpolator.

        ReuseStep = &fs;
        ExtFltStep = &fs;
      } else {
        IntCutoffMult *= DownsampleFactor;
        ReuseStep = NULL;
        ExtFltStep = NULL;
      }
    }

    // Insert resizing and correction steps.

    CFilterStep& fs = (ReuseStep == NULL ? Steps.add() : *ReuseStep);

    // The resizing step is the last step at this point; it is marked by a
    // zero ResampleFactor.

    Vars.ResizeStep = Steps.getItemCount() - 1;
    fs.IsUpsample = false;
    fs.ResampleFactor = 0;
    fs.DCGain = (ExtFltStep == NULL ? 1.0 : ExtFltStep->DCGain);

    initFilterBank(FltBank, IntCutoffMult, ForceHiOrderInt,
                   (ExtFltStep == NULL ? fs.FltOrig : ExtFltStep->FltOrig));

    // Use the object's fixed filter bank if the newly-initialized bank
    // compares equal to it.

    if (FltBank == FixedFilterBank) {
      fs.FltBank = (CDSPFracFilterBankLin<fptype>*)&FixedFilterBank;
    } else {
      fs.FltBank = &FltBank;
    }

    addCorrectionFilter(Steps, corrbw, IsPreCorrection, IsModel);

    // addSharpenTest( Steps, bw, IsModel );
  }

  /**
   * Function extends *this upsampling step so that it produces more
   * upsampled pixels that cover the prefix and suffix needs of the next
   * step. After the call to this function the InPrefix and InSuffix
   * variables of the next step will be set to zero.
   *
   * @param fs Upsampling filtering step.
   * @param NextStep The next step structure.
   */

  static void extendUpsample(CFilterStep& fs, CFilterStep& NextStep) {
    const int Factor = fs.ResampleFactor;

    // Number of additional input pixels the upsampler has to consume on each
    // side so that, after upsampling by Factor, the next step's prefix and
    // suffix requirements are fully covered (division rounds up).
    const int ExtraPrefix = (NextStep.InPrefix + Factor - 1) / Factor;
    const int ExtraSuffix = (NextStep.InSuffix + Factor - 1) / Factor;

    fs.InPrefix = ExtraPrefix;
    fs.OutPrefix += ExtraPrefix * Factor;

    fs.InSuffix = ExtraSuffix;
    fs.OutSuffix += ExtraSuffix * Factor;

    // The next step no longer needs to synthesize its own edge pixels.
    NextStep.InPrefix = 0;
    NextStep.InSuffix = 0;
  }

  /**
   * Function fills resizing step's RPosBuf array, excluding the actual
   * "ftp" pointers and "SrcOffs" offsets.
   *
   * This array should be cleared if the resizing step or offset were
   * changed. Otherwise this function only fills the elements required to
   * cover resizing step's OutLen.
   *
   * This function is called by the updateFilterStepBuffers() function.
   *
   * @param fs Resizing step.
   * @param Vars Variables object.
   */

  static void fillRPosBuf(CFilterStep& fs, const CImageResizerVars& Vars) {
    // Elements below the previous capacity are treated as already filled;
    // only the [FilledCount, OutLen) range is written below.
    const int FilledCount = fs.RPosBuf->getCapacity();

    if (FilledCount < fs.OutLen) {
      fs.RPosBuf->increaseCapacity(fs.OutLen);
    }

    const int FracCount = fs.FltBank->getFracCount();
    const double o = Vars.o;
    const double k = Vars.k;
    typename CFilterStep ::CResizePos* cur = &(*fs.RPosBuf)[FilledCount];
    int n = FilledCount;

    while (n < fs.OutLen) {
      // Source position of output pixel "n", split into its integer part
      // and a fractional part scaled to the filter bank's resolution.
      const double SrcPos = o + k * n;
      const int SrcPosInt = (int)floor(SrcPos);
      const double FracPos = (SrcPos - SrcPosInt) * FracCount;
      const int FilterIndex = (int)FracPos;

      cur->SrcPosInt = SrcPosInt;
      cur->fti = FilterIndex;
      cur->x = (typename fpclass ::fptypeatom)(FracPos - FilterIndex);

      cur++;
      n++;
    }
  }

  /**
   * Function updates filtering step buffer lengths depending on the
   * specified source and new scanline lengths. This function should be
   * called after the buildFilterSteps() function.
   *
   * @param Steps Array that receives filtering steps.
   * @param[out] Vars Variables object, will receive buffer size and length.
   * This function expects "k" and "o" variable values that will be
   * adjusted by this function.
   * @param RPosBufArray Resizing position buffers array, used to obtain
   * buffer to initialize and use (will be reused if it is already fully or
   * partially filled).
   * @param SrcLen Source scanline's length in pixels.
   * @param NewLen New scanline's length in pixels.
   */

  static void updateFilterStepBuffers(
      CFilterSteps& Steps, CImageResizerVars& Vars,
      typename CFilterStep ::CRPosBufArray& RPosBufArray, int SrcLen,
      const int NewLen) {
    // Walks the steps in order, assigning each step its input/output buffer
    // indices, input/output lengths and prefix/suffix sizes. "SrcLen" and
    // "InBuf" are carried from one step to the next; Vars.k and Vars.o are
    // rescaled as resampling steps are encountered.

    // Index of the last upsampling step seen, or -1 if there was none.
    int upstep = -1;
    int InBuf = 0;
    int i;

    for (i = 0; i < Steps.getItemCount(); i++) {
      CFilterStep& fs = Steps[i];

      fs.Vars = &Vars;
      fs.InLen = SrcLen;
      fs.InBuf = InBuf;
      // Output goes to the other of the two buffers (0 <-> 1).
      fs.OutBuf = (InBuf + 1) & 1;

      if (fs.IsUpsample) {
        // Upsampling step: the resizing step and offset are scaled up by
        // the resampling factor.
        upstep = i;
        Vars.k *= fs.ResampleFactor;
        Vars.o *= fs.ResampleFactor;
        fs.InPrefix = 0;
        fs.InSuffix = 0;
        fs.OutLen = fs.InLen * fs.ResampleFactor;
        fs.OutPrefix = fs.FltLatency;
        fs.OutSuffix = fs.Flt.getCapacity() - fs.FltLatency - fs.ResampleFactor;

        // Make sure the total output span is at least the upsampled length
        // plus the SuffixDC capacity.
        int l0 = fs.OutPrefix + fs.OutLen + fs.OutSuffix;
        int l = fs.InLen * fs.ResampleFactor + fs.SuffixDC.getCapacity();

        if (l > l0) {
          fs.OutSuffix += l - l0;
        }

        // Make sure OutLen + OutSuffix is at least the PrefixDC capacity.
        l0 = fs.OutLen + fs.OutSuffix;

        if (fs.PrefixDC.getCapacity() > l0) {
          fs.OutSuffix += fs.PrefixDC.getCapacity() - l0;
        }
      } else if (fs.ResampleFactor == 0) {
        // Resizing step (ResampleFactor == 0): derive how many pixels are
        // needed to the left of the first and to the right of the last
        // source pixel, based on the filter bank's filter length and the
        // first/last output positions.
        const int FilterLenD2 = fs.FltBank->getFilterLen() / 2;
        const int FilterLenD21 = FilterLenD2 - 1;

        const int ResizeLPix = (int)floor(Vars.o) - FilterLenD21;
        fs.InPrefix = (ResizeLPix < 0 ? -ResizeLPix : 0);
        const int ResizeRPix =
            (int)floor(Vars.o + (NewLen - 1) * Vars.k) + FilterLenD2 + 1;

        fs.InSuffix = (ResizeRPix > fs.InLen ? ResizeRPix - fs.InLen : 0);

        fs.OutLen = NewLen;
        // Obtain the position buffer associated with the current k, o and
        // fractional delay count, then fill any not-yet-filled elements.
        fs.RPosBuf = &RPosBufArray.getRPosBuf(Vars.k, Vars.o,
                                              fs.FltBank->getFracCount());

        fillRPosBuf(fs, Vars);
      } else {
        // Non-upsampling filtering step with a non-zero resampling factor:
        // the resizing step and offset are scaled down by that factor, and
        // the offset is shifted by the extra edge pixels produced below.
        Vars.k /= fs.ResampleFactor;
        Vars.o /= fs.ResampleFactor;
        Vars.o += fs.EdgePixelCount;

        fs.InPrefix = fs.FltLatency;
        fs.InSuffix = fs.Flt.getCapacity() - fs.FltLatency - 1;

        // Additionally extend OutLen to produce more precise edge
        // pixels.

        fs.OutLen = (fs.InLen + fs.ResampleFactor - 1) / fs.ResampleFactor +
                    fs.EdgePixelCount;

        // Extend the suffix so that the last output pixel's input position
        // is covered.
        fs.InSuffix += (fs.OutLen - 1) * fs.ResampleFactor + 1 - fs.InLen;

        // Extra edge pixels on the left side: more input prefix and a
        // correspondingly longer output.
        fs.InPrefix += fs.EdgePixelCount * fs.ResampleFactor;
        fs.OutLen += fs.EdgePixelCount;
      }

      // The next step reads what this step has written.
      InBuf = fs.OutBuf;
      SrcLen = fs.OutLen;
    }

    // The last step writes to buffer index 2 instead of a flip-flop buffer.
    Steps[Steps.getItemCount() - 1].OutBuf = 2;

    if (upstep != -1) {
      // Let the upsampling step produce the prefix/suffix pixels required by
      // the step that follows it.
      extendUpsample(Steps[upstep], Steps[upstep + 1]);
    }
  }

  /**
   * Function calculates an optimal intermediate buffer length that will
   * cover all needs of the specified filtering steps. This function should
   * be called after the updateFilterStepBuffers() function.
   *
   * Function also updates resizing step's RPosBuf pointers to the filter
   * bank and SrcOffs values.
   *
   * @param Steps Filtering steps.
   * @param[out] Vars Variables object, will receive buffer size and length.
   * @param ResElIncr Resulting (final) element increment, used to produce
   * de-interleaved result. For horizontal processing this value is equal
   * to last step's OutLen, for vertical processing this value is equal to
   * resulting image's width.
   */

  static void updateBufLenAndRPosPtrs(CFilterSteps& Steps,
                                      CImageResizerVars& Vars,
                                      const int ResElIncr) {
    int PrefixNeed[2] = {0, 0};  // Largest prefix required per buffer.
    int LenNeed[2] = {0, 0};     // Largest length + suffix per buffer.
    int si;

    // Pass 1: collect the largest prefix and length demanded from each of
    // the two intermediate buffers, and set every step's InElIncr.

    for (si = 0; si < Steps.getItemCount(); si++) {
      CFilterStep& cur = Steps[si];
      const int InSpan = cur.InLen + cur.InSuffix;

      if (PrefixNeed[cur.InBuf] < cur.InPrefix) {
        PrefixNeed[cur.InBuf] = cur.InPrefix;
      }

      if (LenNeed[cur.InBuf] < InSpan) {
        LenNeed[cur.InBuf] = InSpan;
      }

      cur.InElIncr = cur.InPrefix + InSpan;

      if (cur.OutBuf == 2) {
        // The final step writes to the result buffer, not to an
        // intermediate one.
        break;
      }

      int OutSpan = cur.OutLen;

      if (cur.IsUpsample) {
        // Only upsampling steps contribute output prefix and suffix.
        if (PrefixNeed[cur.OutBuf] < cur.OutPrefix) {
          PrefixNeed[cur.OutBuf] = cur.OutPrefix;
        }

        OutSpan += cur.OutSuffix;
      }

      if (LenNeed[cur.OutBuf] < OutSpan) {
        LenNeed[cur.OutBuf] = OutSpan;
      }
    }

    // Pass 2: make each step's OutElIncr agree with the next step's
    // InElIncr; an upsampling step may enlarge the next step's InElIncr.

    for (si = 0; si < Steps.getItemCount(); si++) {
      CFilterStep& cur = Steps[si];

      if (cur.OutBuf == 2) {
        cur.OutElIncr = ResElIncr;
        break;
      }

      CFilterStep& next = Steps[si + 1];

      if (cur.IsUpsample) {
        const int UpSpan = cur.OutPrefix + cur.OutLen + cur.OutSuffix;

        if (UpSpan > next.InElIncr) {
          next.InElIncr = UpSpan;
        }
      }

      cur.OutElIncr = next.InElIncr;
    }

    // Pass 3: publish buffer lengths and offsets. Lengths are always scaled
    // by the element count; offsets only when Vars.packmode is 0.

    for (si = 0; si < 2; si++) {
      Vars.BufLen[si] = (PrefixNeed[si] + LenNeed[si]) * Vars.ElCount;
      Vars.BufOffs[si] =
          (Vars.packmode == 0 ? PrefixNeed[si] * Vars.ElCount
                              : PrefixNeed[si]);
    }

    // Pass 4: resolve filter pointers and source offsets of the resizing
    // step's position buffer.

    CFilterStep& rs = Steps[Vars.ResizeStep];
    typename CFilterStep ::CResizePos* const PosBuf = &(*rs.RPosBuf)[0];
    const int ElMul = (fpclass ::packmode == 0 ? Vars.ElCount : 1);
    const int TapOffs = rs.FltBank->getFilterLen() / 2 - 1;

    for (si = 0; si < rs.OutLen; si++) {
      typename CFilterStep ::CResizePos& pos = PosBuf[si];

      pos.ftp = rs.FltBank->getFilter(pos.fti);
      pos.SrcOffs = (pos.SrcPosInt - TapOffs) * ElMul;
    }
  }

  /**
   * Function modifies the overall (DC) gain of the correction filter in the
   * pre-built filtering steps array.
   *
   * @param Steps Filtering steps.
   * @param m Multiplier to apply to the correction filter.
   */

  void modifyCorrFilterDCGain(CFilterSteps& Steps, const double m) const {
    const int LastIndex = Steps.getItemCount() - 1;

    // The filter to scale is taken from the last step when that step is a
    // non-upsampling step with a resampling factor of 1; otherwise it is
    // taken from the first step.
    const bool UseLastStep = (!Steps[LastIndex].IsUpsample &&
                              Steps[LastIndex].ResampleFactor == 1);

    CBuffer<fptype>& Taps = (UseLastStep ? Steps[LastIndex].Flt : Steps[0].Flt);
    const int TapCount = Taps.getCapacity();
    int n;

    // Scale every tap; the multiplication is carried out in "double".
    for (n = 0; n < TapCount; n++) {
      Taps[n] = (fptype)((double)Taps[n] * m);
    }
  }

  /**
   * Function builds a map of used fractional delay filters based on the
   * resizing positions buffer.
   *
   * @param fs Resizing step.
   * @param[out] UsedFracMap Map of used fractional delay filters.
   */

  static void fillUsedFracMap(const CFilterStep& fs,
                              CBuffer<uint8_t>& UsedFracMap) {
    const int FracCount = fs.FltBank->getFracCount();
    UsedFracMap.increaseCapacity(FracCount, false);
    memset(&UsedFracMap[0], 0, FracCount * sizeof(UsedFracMap[0]));

    typename CFilterStep ::CResizePos* rpos = &(*fs.RPosBuf)[0];
    int i;

    for (i = 0; i < fs.OutLen; i++) {
      UsedFracMap[rpos->fti] |= 1;
      rpos++;
    }
  }

  /**
   * Function calculates the overall filtering steps complexity per
   * scanline. Each complexity unit corresponds to a single multiply-add
   * operation. Data copy and pointer math operations are not included in
   * this calculation, it is assumed that they correlate to the multiply-add
   * operations. Calculation also does not include final rounding, dithering
   * and clamping operations since they cannot be optimized out anyway.
   *
   * Calculation of the CRPosBuf buffer is not included since it cannot be
   * avoided.
   *
   * This function should be called after the updateFilterStepBuffers()
   * function.
   *
   * @param Steps Filtering steps array.
   * @param Vars Variables object.
   * @param UsedFracMap The map of used fractional delay filters.
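   * @param ScanlineCount Scanline count.
   * @return Complexity per one scanline plus the one-time setup complexity
   * divided by ScanlineCount.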
   */

  static int calcComplexity(const CFilterSteps& Steps,
                            const CImageResizerVars& Vars,
                            const CBuffer<uint8_t>& UsedFracMap,
                            const int ScanlineCount) {
    int fcnum;    // Filter complexity multiplier numerator.
    int fcdenom;  // Filter complexity multiplier denominator.

    if (Vars.packmode != 0) {
      fcnum = 1;
      fcdenom = 1;
    } else {
      // In interleaved processing mode, filters require 1 less
      // multiplication per 2 multiply-add instructions.

      fcnum = 3;
      fcdenom = 4;
    }

    int s = 0;   // Complexity per one scanline.
    int s2 = 0;  // Complexity per all scanlines.
    int i;

    for (i = 0; i < Steps.getItemCount(); i++) {
      const CFilterStep& fs = Steps[i];

      s2 += 65 * fs.Flt.getCapacity();  // Filter creation complexity.

      if (fs.IsUpsample) {
        if (fs.FltOrig.getCapacity() > 0) {
          continue;
        }

        s += (fs.Flt.getCapacity() * (fs.InPrefix + fs.InLen + fs.InSuffix) +
              fs.SuffixDC.getCapacity() + fs.PrefixDC.getCapacity()) *
             Vars.ElCount;
      } else if (fs.ResampleFactor == 0) {
        s += fs.FltBank->getFilterLen() *
             (fs.FltBank->getOrder() + Vars.ElCount) * fs.OutLen;

        s2 += fs.FltBank->calcInitComplexity(UsedFracMap);
      } else {
        s += fs.Flt.getCapacity() * Vars.ElCount * fs.OutLen * fcnum / fcdenom;
      }
    }

    return (s + s2 / ScanlineCount);
  }

  /**
   * @brief Thread-isolated data used for scanline processing.
   *
   * This structure holds data necessary for image's horizontal or vertical
   * scanline processing, including scanline processing queue.
   *
   * @tparam Tin Source element data type. Intermediate buffers store data
   * in floating point format.
   * @tparam Tout Destination element data type. Intermediate buffers store
   * data in floating point format.
   */

  template <class Tin, class Tout>
  class CThreadData : public CImageResizerThreadPool ::CWorkload {
   public:
    /**
     * Workload entry point: processes every scanline currently queued in
     * *this object.
     */

    virtual void process() { processScanlineQueue(); }

    /**
     * This enumeration lists possible scanline operations.
     */

    enum EScanlineOperation {
      sopResizeH,           ///< Resize horizontal scanline.
                            ///<
      sopResizeV,           ///< Resize vertical scanline.
                            ///<
      sopDitherAndUnpackH,  ///< Dither and unpack horizontal scanline.
                            ///<
      sopUnpackH            ///< Unpack horizontal scanline.
                            ///<
    };

    /**
     * Function initializes *this thread data object and assigns certain
     * variables provided by the higher level code. Only pointers to the
     * "aSteps" and "aVars" objects are stored, so both objects should
     * remain valid while *this object is in use.
     *
     * @param aThreadIndex Index of this thread data (0-based).
     * @param aThreadCount Total number of threads used during processing.
     * @param aSteps Filtering steps.
     * @param aVars Image resizer variables.
     */

    void init(const int aThreadIndex, const int aThreadCount,
              const CFilterSteps& aSteps, const CImageResizerVars& aVars) {
      ThreadIndex = aThreadIndex;
      ThreadCount = aThreadCount;
      Steps = &aSteps;
      Vars = &aVars;
    }

    /**
     * Function initializes scanline processing queue, and updates
     * capacities of intermediate buffers.
     *
     * @param aOp Operation to perform over scanline.
     * @param TotalLines The total number of scanlines that will be
     * processed by all threads.
     * @param aSrcLen Source scanline length in pixels.
     * @param aSrcIncr Source scanline buffer increment. Ignored in
     * horizontal scanline processing.
     * @param aResIncr Resulting scanline buffer increment. Ignored in
     * horizontal scanline processing.
     */

    void initScanlineQueue(const EScanlineOperation aOp, const int TotalLines,
                           const int aSrcLen, const int aSrcIncr = 0,
                           const int aResIncr = 0) {
      // Both flip-flop buffers live in a single allocation: buffer 0 comes
      // first, buffer 1 starts BufLen[0] elements later. The allocation is
      // only redone when the current capacity is insufficient.
      const int l = Vars->BufLen[0] + Vars->BufLen[1];

      if (Bufs.getCapacity() < l) {
        Bufs.alloc(l, fpclass ::fpalign);
      }

      // Each buffer pointer is advanced by the buffer's offset (BufOffs).
      BufPtrs[0] = Bufs + Vars->BufOffs[0];
      BufPtrs[1] = Bufs + Vars->BufLen[0] + Vars->BufOffs[1];

      int j;
      int ml = 0;

      // Find the longest filter bank filter among the resizing steps
      // (steps with ResampleFactor == 0); TmpFltBuf is sized to it.
      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];

        if (fs.ResampleFactor == 0 && ml < fs.FltBank->getFilterLen()) {
          ml = fs.FltBank->getFilterLen();
        }
      }

      TmpFltBuf.alloc(ml, fpclass ::fpalign);
      ScanlineOp = aOp;
      SrcLen = aSrcLen;
      SrcIncr = aSrcIncr;
      ResIncr = aResIncr;
      QueueLen = 0;
      // Reserve room for this thread's share of scanlines (TotalLines
      // divided by ThreadCount, rounded up).
      Queue.increaseCapacity((TotalLines + ThreadCount - 1) / ThreadCount,
                             false);
    }

    /**
     * Function adds a scanline to the queue buffer. The
     * initScanlineQueue() function should be called before calling this
     * function. The number of calls to this add function should not
     * exceed the TotalLines spread over all threads. No capacity check is
     * performed by this function.
     *
     * @param SrcBuf Source scanline buffer.
     * @param ResBuf Resulting scanline buffer.
     */

    void addScanlineToQueue(void* const SrcBuf, void* const ResBuf) {
      Queue[QueueLen].SrcBuf = SrcBuf;
      Queue[QueueLen].ResBuf = ResBuf;
      QueueLen++;
    }

    /**
     * Function processes all queued scanlines, in the order they were
     * added, using the operation selected in the initScanlineQueue()
     * function.
     */

    void processScanlineQueue() {
      int i;

      switch (ScanlineOp) {
        case sopResizeH: {
          // Source is of type Tin, result is stored as fptype.
          for (i = 0; i < QueueLen; i++) {
            resizeScanlineH((Tin*)Queue[i].SrcBuf, (fptype*)Queue[i].ResBuf);
          }

          break;
        }

        case sopResizeV: {
          // Both source and result are of type fptype.
          for (i = 0; i < QueueLen; i++) {
            resizeScanlineV((fptype*)Queue[i].SrcBuf, (fptype*)Queue[i].ResBuf);
          }

          break;
        }

        case sopDitherAndUnpackH: {
          // The gamma check is hoisted out of the loop. Both gamma and
          // dithering are applied to the source buffer before unpacking
          // into the Tout result.
          if (Vars->UseSRGBGamma) {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::applySRGBGamma((fptype*)Queue[i].SrcBuf, SrcLen,
                                           *Vars);

              Ditherer.dither((fptype*)Queue[i].SrcBuf);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          } else {
            for (i = 0; i < QueueLen; i++) {
              Ditherer.dither((fptype*)Queue[i].SrcBuf);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          }

          break;
        }

        case sopUnpackH: {
          // Same as above, but without dithering.
          if (Vars->UseSRGBGamma) {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::applySRGBGamma((fptype*)Queue[i].SrcBuf, SrcLen,
                                           *Vars);

              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          } else {
            for (i = 0; i < QueueLen; i++) {
              CFilterStep ::unpackScanline((fptype*)Queue[i].SrcBuf,
                                           (Tout*)Queue[i].ResBuf, SrcLen,
                                           *Vars);
            }
          }

          break;
        }
      }
    }

    /**
     * Function returns ditherer object associated with *this thread data
     * object.
     */

    CDitherer& getDitherer() { return (Ditherer); }

   private:
    int ThreadIndex;                ///< Thread index.
                                    ///<
    int ThreadCount;                ///< Thread count.
                                    ///<
    const CFilterSteps* Steps;      ///< Filtering steps.
                                    ///<
    const CImageResizerVars* Vars;  ///< Image resizer variables.
                                    ///<
    CBuffer<fptype> Bufs;           ///< Flip-flop intermediate buffers.
                                    ///<
    fptype* BufPtrs[3];  ///< Flip-flop buffer pointers (referenced by
                         ///< filtering step's InBuf and OutBuf indices).
                         ///< Index 2 is set to the resulting scanline
                         ///< buffer by the resizeScanline*() functions.
                         ///<
    CBuffer<fptype>
        TmpFltBuf;  ///< Temporary buffer used in the
                    ///< doResize() function, aligned by fpclass :: fpalign.
                    ///<
    EScanlineOperation ScanlineOp;  ///< Operation to perform over
                                    ///< scanline.
                                    ///<
    int SrcLen;   ///< Source scanline length in the last queue.
                  ///<
    int SrcIncr;  ///< Source scanline buffer increment in the last queue.
                  ///<
    int ResIncr;  ///< Resulting scanline buffer increment in the last
                  ///< queue.
                  ///<
    CDitherer Ditherer;  ///< Ditherer object to use.
                         ///<

    /**
     * @brief Scanline processing queue item.
     *
     * Scanline processing queue item.
     */

    struct CQueueItem {
      void* SrcBuf;  ///< Source scanline buffer, will be typecast to
                     ///< Tin* or fptype*.
                     ///<
      void* ResBuf;  ///< Resulting scanline buffer, will be typecast
                     ///< to Tout* or fptype*.
                     ///<
    };

    CBuffer<CQueueItem> Queue;  ///< Scanline processing queue.
                                ///<
    int QueueLen;               ///< Queue length.
                                ///<

    /**
     * Function resizes a single horizontal scanline.
     *
     * @param SrcBuf Source horizontal scanline buffer, of the source
     * element type.
     * @param ResBuf Resulting scanline buffer.
     */

    void resizeScanlineH(const Tin* const SrcBuf, fptype* const ResBuf) {
      // Pack the source scanline into the first intermediate buffer, and
      // route the final step's output (buffer index 2) to ResBuf.
      (*Steps)[0].packScanline(SrcBuf, BufPtrs[0], SrcLen);
      BufPtrs[2] = ResBuf;
      int j;

      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];
        fs.prepareInBuf(BufPtrs[fs.InBuf]);
        const int DstIncr = (Vars->packmode == 0 ? Vars->ElCount : 1);

        // ResampleFactor == 0 denotes the resizing step; otherwise the step
        // is either an upsampling or a plain filtering step.
        if (fs.ResampleFactor != 0) {
          if (fs.IsUpsample) {
            fs.doUpsample(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf]);
          } else {
            fs.doFilter(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr);
          }
        } else {
          fs.doResize(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr,
                      TmpFltBuf);
        }
      }
    }

    /**
     * Function resizes a single vertical scanline.
     *
     * @param SrcBuf Source vertical scanline buffer, read using the SrcIncr
     * increment.
     * @param ResBuf Resulting scanline buffer, written by the final step
     * using the ResIncr increment.
     */

    void resizeScanlineV(const fptype* const SrcBuf, fptype* const ResBuf) {
      // Gather the vertical scanline into the first intermediate buffer.
      (*Steps)[0].convertVtoH(SrcBuf, BufPtrs[0], SrcLen, SrcIncr);

      BufPtrs[2] = ResBuf;
      int j;

      for (j = 0; j < Steps->getItemCount(); j++) {
        const CFilterStep& fs = (*Steps)[j];
        fs.prepareInBuf(BufPtrs[fs.InBuf]);
        // Only the step that writes to the result buffer (OutBuf == 2)
        // uses the ResIncr increment.
        const int DstIncr =
            (fs.OutBuf == 2 ? ResIncr
                            : (Vars->packmode == 0 ? Vars->ElCount : 1));

        if (fs.ResampleFactor != 0) {
          if (fs.IsUpsample) {
            fs.doUpsample(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf]);
          } else {
            fs.doFilter(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr);
          }
        } else {
          fs.doResize(BufPtrs[fs.InBuf], BufPtrs[fs.OutBuf], DstIncr,
                      TmpFltBuf);
        }
      }
    }
  };
};

#undef AVIR_PI
#undef AVIR_PId2

}  // namespace avir

#endif  // AVIR_CIMAGERESIZER_INCLUDED
/* clang-format off */
//$ nobt
//$ nocpp
#include "libc/calls/calls.h"

/**
 * @file avir.h
 *
 * @brief The "main" inclusion file with all required classes and functions.
 *
 * This is the "main" inclusion file for the "AVIR" image resizer. This
 * inclusion file contains implementation of the AVIR image resizing algorithm
 * in its entirety. Also includes several classes and functions that can be
 * useful elsewhere.
 *
 * AVIR Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * @mainpage
 *
 * @section intro_sec Introduction
 *
 * Description is available at https://github.com/avaneev/avir
 *
 * AVIR is devoted to women. Your digital photos can look good at any size!
 *
 * @section license License
 *
 * AVIR License Agreement
 *
 * The MIT License (MIT)
 *
 * Copyright (c) 2015-2019 Aleksey Vaneev
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Please credit the author of this library in your documentation in the
 * following way: "AVIR image resizing algorithm designed by Aleksey Vaneev"
 *
 * @version 2.4
 */

#ifndef AVIR_CIMAGERESIZER_INCLUDED
#define AVIR_CIMAGERESIZER_INCLUDED

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

namespace avir {

/**
 * The macro defines AVIR version string.
 */

#define AVIR_VERSION "2.4"

/**
 * The macro equals to "pi" constant, fills 53-bit floating point mantissa.
 * Undefined at the end of file.
 */

#define AVIR_PI 3.1415926535897932

/**
 * The macro equals to "pi divided by 2" constant, fills 53-bit floating
 * point mantissa. Undefined at the end of file.
 */

#define AVIR_PId2 1.5707963267948966

/**
 * Rounding function, based on the (int) typecast. Biased result. Not suitable
 * for numbers >= 2^31.
 *
 * @param d Value to round.
 * @return Rounded value. Some bias may be introduced.
 */

template< class T >
inline T round( const T d )
{
	// Negative values are mirrored, rounded half-up via the (int) typecast,
	// and negated again, so that halves round away from zero.

	if( d < 0.0 )
	{
		return( -(T) (int) ( (T) 0.5 - d ));
	}

	return( (T) (int) ( d + (T) 0.5 ));
}

/**
 * Template function "clamps" (clips) the specified value so that it is not
 * lesser than "minv", and not greater than "maxv".
 *
 * @param Value Value to clamp.
 * @param minv Minimal allowed value.
 * @param maxv Maximal allowed value.
 * @return The clamped value.
 */

template< class T >
inline T clamp( const T& Value, const T minv, const T maxv )
{
	// The lower bound is tested first; a value that is neither below "minv"
	// nor above "maxv" is returned as is.

	return( Value < minv ? minv : ( Value > maxv ? maxv : Value ));
}

/**
 * Power 2.4 approximation function, designed for sRGB gamma correction.
 *
 * @param x Argument, in the range 0.09 to 1.
 * @return Value raised into power 2.4, approximate.
 */

template< class T >
inline T pow24_sRGB( const T x )
{
	// Integer powers of the argument used by the approximation; the
	// intermediate values are kept in "double" regardless of T.

	const double sq = x * x;
	const double cube = sq * x;
	const double quart = sq * sq;

	// Polynomial terms combined with a single rational term.

	return( (T) ( 0.0985766365536824 + 0.839474952656502 * sq +
		0.363287814061725 * cube -
		0.0125559718896615 / ( 0.12758338921578 + 0.290283465468235 * x ) -
		0.231757513261358 * x - 0.0395365717969074 * quart ));
}

/**
 * Power 1/2.4 approximation function, designed for sRGB gamma correction.
 *
 * @param x Argument, in the range 0.003 to 1.
 * @return Value raised into power 1/2.4, approximate.
 */

template< class T >
inline T pow24i_sRGB( const T x )
{
	// Successive square roots of the argument: x^(1/2), x^(1/4), x^(1/8).

	const double root2 = sqrt( x );
	const double root4 = sqrt( root2 );
	const double root8 = sqrt( root4 );

	return( (T) ( 0.000213364515060263 + 0.0149409239419218 * x +
		0.433973412731747 * root2 + root4 * ( 0.659628181609715 * root8 -
		0.0380957908841466 - 0.0706476137208521 * root2 )));
}

/**
 * Function approximately linearizes the sRGB gamma value.
 *
 * @param s sRGB gamma value, in the range 0 to 1.
 * @return Linearized sRGB gamma value, approximated.
 */

template< class T >
inline T convertSRGB2Lin( const T s )
{
	const T a = (T) 0.055;

	// Values up to 0.04045 use the linear segment, larger values use the
	// approximated power 2.4 curve.

	return( s <= (T) 0.04045 ? s / (T) 12.92 :
		pow24_sRGB(( s + a ) / ( (T) 1 + a )));
}

/**
 * Function approximately de-linearizes the linear gamma value.
 *
 * @param s Linear gamma value, in the range 0 to 1.
 * @return sRGB gamma value, approximated.
 */

template< class T >
inline T convertLin2SRGB( const T s )
{
	const T a = (T) 0.055;

	// Values up to 0.0031308 use the linear segment, larger values use the
	// approximated power 1/2.4 curve.

	return( s <= (T) 0.0031308 ? (T) 12.92 * s :
		( (T) 1 + a ) * pow24i_sRGB( s ) - a );
}

/**
 * Function converts (via typecast) specified array of type T1 values of
 * length l into array of type T2 values. If T1 is the same as T2, copy
 * operation is performed. When copying data at overlapping address spaces,
 * "op" should be lower than "ip".
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer.
 * @param l The number of elements to copy.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template< class T1, class T2 >
inline void copyArray( const T1* ip, T2* op, int l,
	const int ipinc = 1, const int opinc = 1 )
{
	// Elements are converted one by one in ascending order; nothing is
	// done when "l" is not positive.

	for( ; l > 0; l-- )
	{
		*op = (T2) *ip;
		ip += ipinc;
		op += opinc;
	}
}

/**
 * Function adds values located in array "ip" to array "op".
 *
 * @param ip Input buffer.
 * @param[out] op Output buffer.
 * @param l The number of elements to add.
 * @param ipinc Input buffer pointer increment.
 * @param opinc Output buffer pointer increment.
 */

template< class T1, class T2 >
inline void addArray( const T1* ip, T2* op, int l,
	const int ipinc = 1, const int opinc = 1 )
{
	// Each input element is accumulated into the corresponding output
	// element; nothing is done when "l" is not positive.

	for( ; l > 0; l-- )
	{
		*op += *ip;
		ip += ipinc;
		op += opinc;
	}
}

/**
 * Function that replicates a set of adjacent elements several times in a row.
 * This operation is usually used to replicate pixels at the start or end of
 * image's scanline.
 *
 * @param ip Source array.
 * @param ipl Source array length (usually 1..4, but can be any number).
 * @param[out] op Destination buffer.
 * @param l Number of times the source array should be replicated (the
 * destination buffer should be able to hold ipl * l number of elements).
 * @param opinc Destination buffer position increment after replicating the
 * source array. This value should be equal to at least ipl.
 */

template< class T1, class T2 >
inline void replicateArray( const T1* const ip, const int ipl, T2* op, int l,
	const int opinc )
{
	// Source lengths 1 to 4 are handled by unrolled copies, any other
	// length by a generic element loop.

	switch( ipl )
	{
		case 1:
		{
			for( ; l > 0; l-- )
			{
				op[ 0 ] = ip[ 0 ];
				op += opinc;
			}

			break;
		}

		case 2:
		{
			for( ; l > 0; l-- )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op += opinc;
			}

			break;
		}

		case 3:
		{
			for( ; l > 0; l-- )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op[ 2 ] = ip[ 2 ];
				op += opinc;
			}

			break;
		}

		case 4:
		{
			for( ; l > 0; l-- )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op[ 2 ] = ip[ 2 ];
				op[ 3 ] = ip[ 3 ];
				op += opinc;
			}

			break;
		}

		default:
		{
			for( ; l > 0; l-- )
			{
				int k;

				for( k = 0; k < ipl; k++ )
				{
					op[ k ] = ip[ k ];
				}

				op += opinc;
			}

			break;
		}
	}
}

/**
 * Function calculates frequency response of the specified FIR filter at the
 * specified circular frequency. Phase can be calculated as atan2( im, re ).
 * Function uses computationally-efficient oscillators instead of "cos" and
 * "sin" functions.
 *
 * @param flt FIR filter's coefficients.
 * @param fltlen Number of coefficients (taps) in the filter.
 * @param th Circular frequency [0; pi].
 * @param[out] re0 Resulting real part of the complex frequency response.
 * @param[out] im0 Resulting imaginary part of the complex frequency response.
 * @param fltlat Filter's latency in samples (taps).
 */

template< class T >
inline void calcFIRFilterResponse( const T* flt, int fltlen,
	const double th, double& re0, double& im0, const int fltlat = 0 )
{
	// Multiplier of the two-term recurrence used by both oscillators:
	// f( n + 1 ) = 2 * cos( th ) * f( n ) - f( n - 1 ).

	const double oscmul = 2.0 * cos( th );

	// Oscillator values for the first tap. With zero latency the starting
	// phase is zero, so the values are set directly.

	double ccur = ( fltlat == 0 ? 1.0 : cos( -fltlat * th ));
	double scur = ( fltlat == 0 ? 0.0 : sin( -fltlat * th ));

	// Oscillator values one phase step before the first tap.

	double cprev = cos( -( fltlat + 1 ) * th );
	double sprev = sin( -( fltlat + 1 ) * th );

	double resum = 0.0;
	double imsum = 0.0;
	int i;

	for( i = 0; i < fltlen; i++ )
	{
		// Accumulate this tap's contribution, then advance both
		// oscillators by one phase step.

		resum += ccur * flt[ i ];
		imsum += scur * flt[ i ];

		const double cnext = oscmul * ccur - cprev;
		cprev = ccur;
		ccur = cnext;

		const double snext = oscmul * scur - sprev;
		sprev = scur;
		scur = snext;
	}

	re0 = resum;
	im0 = imsum;
}

/**
 * Function normalizes FIR filter so that its frequency response at DC is
 * equal to DCGain.
 *
 * @param[in,out] p Filter coefficients.
 * @param l Filter length.
 * @param DCGain Filter's gain at DC.
 * @param pstep "p" array step.
 */

template< class T >
inline void normalizeFIRFilter( T* const p, const int l, const double DCGain,
	const int pstep = 1 )
{
	// First pass: sum all "l" coefficients, visiting every pstep-th element
	// of the array.

	double tapsum = 0.0;
	T* cur = p;
	int n;

	for( n = 0; n < l; n++ )
	{
		tapsum += *cur;
		cur += pstep;
	}

	// Second pass: scale every coefficient by the same factor so that the
	// coefficients sum to DCGain.

	const double gain = DCGain / tapsum;
	cur = p;

	for( n = 0; n < l; n++ )
	{
		*cur = (T) ( *cur * gain );
		cur += pstep;
	}
}

/**
 * @brief Memory buffer class for element array storage, with capacity
 * tracking.
 *
 * Allows easier handling of memory blocks allocation and automatic
 * deallocation for arrays (buffers) consisting of elements of specified
 * class. Tracks buffer's capacity in "int" variable; unsuitable for
 * allocation of very large memory blocks (with more than 2 billion elements).
 *
 * This class manages memory space only - it does not perform element class
 * construction (initialization) operations. Buffer's required memory address
 * alignment specification is supported.
 *
 * Uses standard library to allocate and deallocate memory.
 *
 * @tparam T Buffer element's type.
 * @tparam capint Buffer capacity's type to use. Use size_t for large buffers.
 */

template< class T, typename capint = int >
class CBuffer
{
public:
	/**
	 * Default constructor creates an empty buffer: no memory is allocated,
	 * capacity and alignment are set to zero.
	 */

	CBuffer()
		: Data( NULL )
		, DataAligned( NULL )
		, Capacity( 0 )
		, Alignment( 0 )
	{
	}

	/**
	 * Constructor creates the buffer with the specified capacity.
	 *
	 * @param aCapacity Buffer's capacity.
	 * @param aAlignment Buffer's required memory address alignment. 0 - use
	 * stdlib's default alignment.
	 */

	CBuffer( const capint aCapacity, const int aAlignment = 0 )
	{
		allocinit( aCapacity, aAlignment );
	}

	/**
	 * Copy constructor allocates a separate buffer with the capacity and
	 * alignment of the source buffer, and copies the source's elements
	 * bytewise (element copy constructors are not called).
	 *
	 * @param Source Buffer to copy.
	 */

	CBuffer( const CBuffer& Source )
	{
		allocinit( Source.Capacity, Source.Alignment );
		copyElements( Source.DataAligned, Capacity );
	}

	~CBuffer()
	{
		freeData();
	}

	/**
	 * Assignment operator reallocates *this buffer to the capacity and
	 * alignment of the source buffer, and copies the source's elements
	 * bytewise. Self-assignment leaves the buffer untouched.
	 *
	 * @param Source Buffer to copy.
	 * @return Reference to *this buffer.
	 */

	CBuffer& operator = ( const CBuffer& Source )
	{
		// Without this check alloc() would release the very memory block
		// that is about to be copied from.

		if( this != &Source )
		{
			alloc( Source.Capacity, Source.Alignment );
			copyElements( Source.DataAligned, Capacity );
		}

		return( *this );
	}

	/**
	 * Function allocates memory so that the specified number of elements
	 * can be stored in *this buffer object. Any previously allocated memory
	 * is released first; previous contents are not retained.
	 *
	 * @param aCapacity Storage for this number of elements to allocate.
	 * @param aAlignment Buffer's required memory address alignment,
	 * power-of-2 values only. 0 - use stdlib's default alignment.
	 */

	void alloc( const capint aCapacity, const int aAlignment = 0 )
	{
		freeData();
		allocinit( aCapacity, aAlignment );
	}

	/**
	 * Function deallocates any previously allocated buffer.
	 */

	void free()
	{
		freeData();
		Data = NULL;
		DataAligned = NULL;
		Capacity = 0;
		Alignment = 0;
	}

	/**
	 * @return The capacity of the element buffer.
	 */

	capint getCapacity() const
	{
		return( Capacity );
	}

	/**
	 * Function "forces" *this buffer to have an arbitrary capacity. Calling
	 * this function invalidates all further operations except deleting *this
	 * object. This function should not be usually used at all. Function can
	 * be used to "model" certain buffer capacity without calling a costly
	 * memory allocation function.
	 *
	 * @param NewCapacity A new "forced" capacity.
	 */

	void forceCapacity( const capint NewCapacity )
	{
		Capacity = NewCapacity;
	}

	/**
	 * Function reallocates *this buffer to a larger size so that it will be
	 * able to hold the specified number of elements. Downsizing is not
	 * performed. If NewCapacity does not exceed the current capacity,
	 * function does nothing. Alignment is not changed.
	 *
	 * @param NewCapacity New (increased) capacity.
	 * @param DoDataCopy "True" if data in the buffer should be retained.
	 */

	void increaseCapacity( const capint NewCapacity,
		const bool DoDataCopy = true )
	{
		if( NewCapacity <= Capacity )
		{
			return;
		}

		if( DoDataCopy )
		{
			// The previous block has to stay alive until its contents were
			// copied into the newly allocated block.

			const capint PrevCapacity = Capacity;
			T* const PrevData = Data;
			T* const PrevDataAligned = DataAligned;

			allocinit( NewCapacity, Alignment );
			copyElements( PrevDataAligned, PrevCapacity );

			:: free( PrevData );
		}
		else
		{
			freeData();
			allocinit( NewCapacity, Alignment );
		}
	}

	/**
	 * Function "truncates" (reduces) capacity of the buffer without
	 * reallocating it. Alignment is not changed.
	 *
	 * @param NewCapacity New required capacity.
	 */

	void truncateCapacity( const capint NewCapacity )
	{
		if( NewCapacity >= Capacity )
		{
			return;
		}

		Capacity = NewCapacity;
	}

	/**
	 * Function increases capacity so that the specified number of
	 * elements can be stored. This function increases the previous capacity
	 * value by third the current capacity value until space for the required
	 * number of elements is available. Alignment is not changed.
	 *
	 * @param ReqCapacity Required capacity.
	 */

	void updateCapacity( const capint ReqCapacity )
	{
		if( ReqCapacity <= Capacity )
		{
			return;
		}

		capint NewCapacity = Capacity;

		while( NewCapacity < ReqCapacity )
		{
			NewCapacity += NewCapacity / 3 + 1;
		}

		increaseCapacity( NewCapacity );
	}

	/**
	 * @return Pointer to the first (address-aligned) element of the buffer.
	 */

	operator T* () const
	{
		return( DataAligned );
	}

private:
	T* Data; ///< Element buffer pointer.
		///<
	T* DataAligned; ///< Memory address-aligned element buffer pointer.
		///<
	capint Capacity; ///< Element buffer capacity.
		///<
	int Alignment; ///< Memory address alignment in use. 0 - use stdlib's
		///< default alignment.
		///<

	/**
	 * Internal element buffer allocation function used during object
	 * construction. Does not release a previously allocated buffer.
	 *
	 * @param aCapacity Storage for this number of elements to allocate.
	 * @param aAlignment Buffer's required memory address alignment. 0 - use
	 * stdlib's default alignment.
	 */

	void allocinit( const capint aCapacity, const int aAlignment )
	{
		if( aAlignment == 0 )
		{
			Data = (T*) :: malloc( aCapacity * sizeof( T ));
			DataAligned = Data;
			Alignment = 0;
		}
		else
		{
			// Over-allocate by aAlignment bytes so that an aligned address
			// always fits inside the allocated block.

			Data = (T*) :: malloc( aCapacity * sizeof( T ) + aAlignment );
			DataAligned = alignptr( Data, aAlignment );
			Alignment = aAlignment;
		}

		Capacity = aCapacity;
	}

	/**
	 * Function copies elements bytewise to the start of the aligned buffer.
	 * Nothing is copied when Count is not positive, so memcpy() is never
	 * called for an empty buffer whose data pointer may be NULL.
	 *
	 * @param Src Source elements.
	 * @param Count The number of elements to copy.
	 */

	void copyElements( const T* const Src, const capint Count )
	{
		if( Count > 0 )
		{
			memcpy( DataAligned, Src, Count * sizeof( T ));
		}
	}

	/**
	 * Function frees a previously allocated Data buffer.
	 */

	void freeData()
	{
		:: free( Data );
	}

	/**
	 * Function modifies the specified pointer so that it becomes memory
	 * address-aligned.
	 *
	 * @param ptr Pointer to align.
	 * @param align Alignment in bytes to apply.
	 * @return Pointer aligned to align bytes. Works with power-of-2
	 * alignments only. If no alignment is necessary, "align" bytes will be
	 * added to the pointer value.
	 */

	template< class Tp >
	inline Tp alignptr( const Tp ptr, const uintptr_t align )
	{
		return( (Tp) ( (uintptr_t) ptr + align -
			( (uintptr_t) ptr & ( align - 1 ))) );
	}
};

/**
 * Function optimizes the length of the symmetric-odd FIR filter by removing
 * left- and rightmost elements that are below specific threshold.
 *
 * Synthetic test shows that filter gets optimized in 2..3% of cases and in
 * each such case optimization reduces filter length by 6..8%. Optimization,
 * however, may skew the results of algorithm modeling and complexity
 * calculation leading to a choice of a less optimal algorithm.
 *
 * @param[in,out] Flt Buffer that contains filter being optimized.
 * @param[in,out] FltLatency Variable that holds the current latency of the
 * filter. May be adjusted on function return.
 * @param Threshold Threshold level.
 */

template< class T >
inline void optimizeFIRFilter( CBuffer< T >& Flt, int& FltLatency,
	T const Threshold = (T) 0.00001 )
{
	// Count leading taps whose magnitude does not reach the threshold. The
	// scan never goes past the tap at FltLatency.

	int trim = 0;

	while( trim < FltLatency && !( fabs( Flt[ trim ]) >= Threshold ))
	{
		trim++;
	}

	if( trim <= 0 )
	{
		return;
	}

	// Drop "trim" taps from both ends: move the remaining taps to the start
	// of the buffer and reduce the capacity by twice the trimmed count.

	const int NewCapacity = Flt.getCapacity() - trim * 2;

	copyArray( &Flt[ trim ], &Flt[ 0 ], NewCapacity );
	Flt.truncateCapacity( NewCapacity );
	FltLatency -= trim;
}

/**
 * @brief Array of structured objects.
 *
 * Implements allocation of a linear array of objects of class T (which are
 * initialized), addressable via operator[]. Each object is created via the
 * "operator new". New object insertions are quick since implementation uses
 * prior space allocation (capacity), thus not requiring frequent memory block
 * reallocations.
 *
 * @tparam T Array element's type.
 */

template< class T >
class CStructArray
{
public:
	/**
	 * Default constructor creates an empty array.
	 */

	CStructArray()
		: ItemCount( 0 )
	{
	}

	/**
	 * Copy constructor creates a copy of every item of the source array via
	 * the item's copy constructor.
	 *
	 * @param Source Array to copy.
	 */

	CStructArray( const CStructArray& Source )
		: ItemCount( 0 )
		, Items( Source.getItemCount() )
	{
		while( ItemCount < Source.getItemCount() )
		{
			Items[ ItemCount ] = new T( Source[ ItemCount ]);
			ItemCount++;
		}
	}

	~CStructArray()
	{
		clear();
	}

	/**
	 * Assignment operator destroys all items of *this array and then
	 * creates copies of the source array's items. Self-assignment leaves
	 * the array untouched.
	 *
	 * @param Source Array to copy.
	 * @return Reference to *this array.
	 */

	CStructArray& operator = ( const CStructArray& Source )
	{
		// Without this check clear() would destroy the items that are about
		// to be copied, leaving the array empty.

		if( this == &Source )
		{
			return( *this );
		}

		clear();

		const int NewCount = Source.ItemCount;
		Items.updateCapacity( NewCount );

		while( ItemCount < NewCount )
		{
			Items[ ItemCount ] = new T( Source[ ItemCount ]);
			ItemCount++;
		}

		return( *this );
	}

	/**
	 * @param Index Item index. The index is not range-checked.
	 * @return Reference to the item at the specified index.
	 */

	T& operator []( const int Index )
	{
		return( *Items[ Index ]);
	}

	/**
	 * @param Index Item index. The index is not range-checked.
	 * @return Constant reference to the item at the specified index.
	 */

	const T& operator []( const int Index ) const
	{
		return( *Items[ Index ]);
	}

	/**
	 * Function creates a new object of type T with the default constructor
	 * and adds this object to the array.
	 *
	 * @return Reference to a newly added object.
	 */

	T& add()
	{
		// Grow the pointer buffer only when it is completely filled.

		if( ItemCount == Items.getCapacity() )
		{
			Items.increaseCapacity( ItemCount * 3 / 2 + 1 );
		}

		Items[ ItemCount ] = new T();
		ItemCount++;

		return( (*this)[ ItemCount - 1 ]);
	}

	/**
	 * Function changes number of allocated items. New items are created with
	 * the default constructor. If NewCount is below the current item count,
	 * items that are above NewCount range will be destructed.
	 *
	 * @param NewCount New requested item count.
	 */

	void setItemCount( const int NewCount )
	{
		if( NewCount > ItemCount )
		{
			Items.increaseCapacity( NewCount );

			while( ItemCount < NewCount )
			{
				Items[ ItemCount ] = new T();
				ItemCount++;
			}
		}
		else
		{
			while( ItemCount > NewCount )
			{
				ItemCount--;
				delete Items[ ItemCount ];
			}
		}
	}

	/**
	 * Function erases all items of *this array. The capacity of the item
	 * pointer buffer is not changed.
	 */

	void clear()
	{
		while( ItemCount > 0 )
		{
			ItemCount--;
			delete Items[ ItemCount ];
		}
	}

	/**
	 * @return The number of allocated items.
	 */

	int getItemCount() const
	{
		return( ItemCount );
	}

private:
	int ItemCount; ///< The number of items available in the array.
		///<
	CBuffer< T* > Items; ///< Element buffer.
		///<
};

/**
 * @brief Sine signal generator class.
 *
 * Class implements sine signal generator without biasing, with
 * constructor-based initialization only. This generator uses oscillator
 * instead of "sin" function.
 */

class CSineGen
{
public:
	/**
	 * Constructor sets up the oscillator so that the first generate() call
	 * returns sin( ph ).
	 *
	 * @param si Sine function increment, in radians.
	 * @param ph Starting phase, in radians. Add 0.5 * AVIR_PI for cosine
	 * function.
	 */

	CSineGen( const double si, const double ph )
		: Cur( sin( ph ))
		, Prev( sin( ph - si ))
		, Coeff( 2.0 * cos( si ))
	{
	}

	/**
	 * Function returns the current sine value and advances the oscillator
	 * by one increment.
	 *
	 * @return The next value of the sine function, without biasing.
	 */

	double generate()
	{
		const double out = Cur;
		const double next = Coeff * out - Prev;

		Prev = out;
		Cur = next;

		return( out );
	}

private:
	double Cur; ///< Sine value to be returned by the next generate() call.
		///<
	double Prev; ///< Sine value one increment before "Cur".
		///<
	double Coeff; ///< Recurrence multiplier, equals 2.0 * cos( si ).
		///<
};

/**
 * @brief Peaked Cosine window function generator class.
 *
 * Class implements Peaked Cosine window function generator. Generates the
 * right-handed half of the window function. The Alpha parameter of this
 * window function offers the control of the balance between the early and
 * later taps of the filter. E.g. at Alpha=1 both early and later taps are
 * attenuated, but at Alpha=4 mostly later taps are attenuated. This offers a
 * great control over ringing artifacts produced by a low-pass filter in image
 * processing, without compromising achieved image sharpness.
 */

class CDSPWindowGenPeakedCosine
{
public:
	/**
	 * Constructor initializes *this window function generator.
	 *
	 * @param aAlpha Alpha parameter, affects the peak shape (peak
	 * augmentation) of the window function. Should be >= 1.0.
	 * @param aLen2 Half filter's length (non-truncated).
	 */

	CDSPWindowGenPeakedCosine( const double aAlpha, const double aLen2 )
		: Alpha( aAlpha )
		, Len2( aLen2 )
		, wn( 0 )
		, w1( AVIR_PId2 / Len2, AVIR_PI * 0.5 )
	{
	}

	/**
	 * @return The next Peaked Cosine window function coefficient.
	 */

	double generate()
	{
		const double h = pow( wn / Len2, Alpha );
		wn++;

		return( w1.generate() * ( 1.0 - h ));
	}

private:
	double Alpha; ///< Alpha parameter, affects the peak shape of window.
		///<
	double Len2; ///< Half length of the window function.
		///<
	int wn; ///< Window function integer position. 0 - center of the
		///< window function.
		///<
	CSineGen w1; ///< Sine-wave generator.
		///<
};

/**
 * @brief FIR filter-based equalizer generator.
 *
 * Class implements an object used to generate symmetric-odd FIR filters with
 * the specified frequency response (aka paragraphic equalizer). The
 * calculated filter is windowed by the Peaked Cosine window function.
 *
 * In image processing, due to short length of filters being used (6-8 taps)
 * the resulting frequency response of the filter is approximate and may be
 * mathematically imperfect, but still adequate to the visual requirements.
 *
 * On a side note, this equalizer generator can be successfully used for audio
 * signal equalization as well: for example, it is used in almost the same
 * form in Voxengo Marvel GEQ equalizer plug-in.
 *
 * Filter generation is based on decomposition of frequency range into
 * spectral bands, with each band represented by linear and ramp "kernels".
 * When the filter is built, these kernels are combined together with
 * different weights that approximate the required frequency response.
 */

class CDSPFIREQ
{
public:
	/**
	 * Function initializes *this object with the required parameters. The
	 * gain of frequencies beyond the MinFreq..MaxFreq range are controlled by
	 * the first and the last band's gain.
	 *
	 * @param SampleRate Processing sample rate (use 2 for image processing).
	 * @param aFilterLength Required filter length in samples (taps). The
	 * actual filter length is truncated to an integer value.
	 * @param aBandCount Number of band crossover points required to control,
	 * including bands at MinFreq and MaxFreq.
	 * @param MinFreq Minimal frequency that should be controlled.
	 * @param MaxFreq Maximal frequency that should be controlled.
	 * @param IsLogBands "True" if the bands should be spaced logarithmically.
	 * @param WFAlpha Peaked Cosine window function's Alpha parameter.
	 */

	void init( const double SampleRate, const double aFilterLength,
		const int aBandCount, const double MinFreq, const double MaxFreq,
		const bool IsLogBands, const double WFAlpha )
	{
		FilterLength = aFilterLength;
		BandCount = aBandCount;

		CenterFreqs.alloc( BandCount );

		// z - half length of the filter (includes the center tap), zi - "z"
		// rounded up to an even value (per-kernel storage step), z2 - twice
		// the half length.

		z = (int) ceil( FilterLength * 0.5 );
		zi = z + ( z & 1 );
		z2 = z * 2;

		// Temporary tables shared by all fillBandKernel() calls below.

		CBuffer< double > oscbuf( z2 );
		initOscBuf( oscbuf );

		CBuffer< double > winbuf( z );
		initWinBuf( winbuf, WFAlpha );

		// Reserve "zi" elements per kernel: BandCount kernels, plus one more
		// kernel when the first virtual band is in use.

		UseFirstVirtBand = ( MinFreq > 0.0 );
		const int k = zi * ( BandCount + ( UseFirstVirtBand ? 1 : 0 ));
		Kernels1.alloc( k );
		Kernels2.alloc( k );

		double m; // Frequency step multiplier.
		double mo; // Frequency step offset (addition).

		if( IsLogBands )
		{
			// Constant ratio between adjacent bands: BandCount - 1 steps
			// lead from MinFreq to MaxFreq.

			m = exp( log( MaxFreq / MinFreq ) / ( BandCount - 1 ));
			mo = 0.0;
		}
		else
		{
			// Constant difference between adjacent bands.

			m = 1.0;
			mo = ( MaxFreq - MinFreq ) / ( BandCount - 1 );
		}

		double f = MinFreq;
		double x1 = 0.0;
		double x2;
		int si;

		if( UseFirstVirtBand )
		{
			si = 0;
		}
		else
		{
			// MinFreq is not above zero: band 0 is placed at zero frequency
			// and gets no kernel of its own, kernels start at band 1.

			si = 1;
			CenterFreqs[ 0 ] = 0.0;
			f = f * m + mo;
		}

		double* kernbuf1 = &Kernels1[ 0 ];
		double* kernbuf2 = &Kernels2[ 0 ];
		int i;

		for( i = si; i < BandCount; i++ )
		{
			// Band's frequency scaled by half the sample rate; the band's
			// kernel covers the x1..x2 range.

			x2 = f * 2.0 / SampleRate;
			CenterFreqs[ i ] = x2;

			fillBandKernel( x1, x2, kernbuf1, kernbuf2, oscbuf, winbuf );

			kernbuf1 += zi;
			kernbuf2 += zi;
			x1 = x2;
			f = f * m + mo;
		}

		if( x1 < 1.0 )
		{
			// The last band ends below 1.0: one more kernel is filled to
			// cover the remaining x1..1.0 range.

			UseLastVirtBand = true;
			fillBandKernel( x1, 1.0, kernbuf1, kernbuf2, oscbuf, winbuf );
		}
		else
		{
			UseLastVirtBand = false;
		}
	}

	/**
	 * @return Filter's length, in samples (taps).
	 */

	int getFilterLength() const
	{
		return( z2 - 1 );
	}

	/**
	 * @return Filter's latency (group delay), in samples (taps).
	 */

	int getFilterLatency() const
	{
		return( z - 1 );
	}

	/**
	 * Function creates symmetric-odd FIR filter with the specified gain
	 * levels at band crossover points.
	 *
	 * @param BandGains Array of linear gain levels, count=BandCount specified
	 * in the init() function.
	 * @param[out] Filter Output filter buffer, length = getFilterLength().
	 */

	void buildFilter( const double* const BandGains, double* const Filter )
	{
		const double* kernbuf1 = &Kernels1[ 0 ];
		const double* kernbuf2 = &Kernels2[ 0 ];
		double x1 = 0.0;
		double y1 = BandGains[ 0 ];
		double x2;
		double y2;

		int i;
		int si;

		if( UseFirstVirtBand )
		{
			// The first kernel spans 0..CenterFreqs[ 0 ] and uses the first
			// band's gain at both of its ends.

			si = 1;
			x2 = CenterFreqs[ 0 ];
			y2 = y1;
		}
		else
		{
			// The first kernel spans bands 0 and 1.

			si = 2;
			x2 = CenterFreqs[ 1 ];
			y2 = BandGains[ 1 ];
		}

		// The first kernel initializes the first "z" output elements, all
		// further kernels are accumulated on top of them.

		copyBandKernel( Filter, kernbuf1, kernbuf2, y1 - y2,
			x1 * y2 - x2 * y1 );

		kernbuf1 += zi;
		kernbuf2 += zi;
		x1 = x2;
		y1 = y2;

		for( i = si; i < BandCount; i++ )
		{
			x2 = CenterFreqs[ i ];
			y2 = BandGains[ i ];

			addBandKernel( Filter, kernbuf1, kernbuf2, y1 - y2,
				x1 * y2 - x2 * y1 );

			kernbuf1 += zi;
			kernbuf2 += zi;
			x1 = x2;
			y1 = y2;
		}

		if( UseLastVirtBand )
		{
			// At this point y1 equals y2, so the first multiplier is zero.
			// The second multiplier is the same x1 * y2 - x2 * y1 expression
			// as above, with x2 equal to 1.0.

			addBandKernel( Filter, kernbuf1, kernbuf2, y1 - y2,
				x1 * y2 - y1 );
		}

		// Mirror elements 0..z-2 onto elements z..2z-2, around the center
		// tap located at index z - 1.

		for( i = 0; i < z - 1; i++ )
		{
			Filter[ z + i ] = Filter[ z - 2 - i ];
		}
	}

	/**
	 * Function calculates filter's length (in samples) and latency depending
	 * on the required non-truncated filter length.
	 *
	 * @param aFilterLength Required filter length in samples (non-truncated).
	 * @param[out] Latency Resulting latency (group delay) of the filter,
	 * in samples (taps).
	 * @return Filter length in samples (taps).
	 */

	static int calcFilterLength( const double aFilterLength, int& Latency )
	{
		// Same half-length calculation as performed in the init() function.

		const int l = (int) ceil( aFilterLength * 0.5 );
		Latency = l - 1;

		return( l * 2 - 1 );
	}

private:
	double FilterLength; ///< Length of filter.
		///<
	int z; ///< Equals (int) ceil( FilterLength * 0.5 ).
		///<
	int zi; ///< Equals "z" if z is even, or z + 1 if z is odd. Used as a
		///< Kernels1 and Kernels2 size multiplier and kernel buffer increment
		///< to make sure each kernel buffer is 16-byte aligned.
		///<
	int z2; ///< Equals z * 2.
		///<
	int BandCount; ///< Number of controllable bands.
		///<
	CBuffer< double > CenterFreqs; ///< Center frequencies for all bands,
		///< normalized to 0.0-1.0 range.
		///<
	CBuffer< double > Kernels1; ///< Half-length kernel buffers for each
		///< spectral band (linear part).
		///<
	CBuffer< double > Kernels2; ///< Half-length kernel buffers for each
		///< spectral band (ramp part).
		///<
	bool UseFirstVirtBand; ///< "True" if the first virtual band
		///< (between 0.0 and MinFreq) should be used. The first virtual band
		///< won't be used if MinFreq equals 0.0.
		///<
	bool UseLastVirtBand; ///< "True" if the last virtual band (between
		///< MaxFreq and SampleRate * 0.5) should be used. The last virtual
		///< band won't be used if MaxFreq * 2.0 equals SampleRate.
		///<

	/**
	 * Function initializes the "oscbuf" used in the fillBandKernel()
	 * function. The buffer is filled with "z" pairs of (0.0, 1.0) values.
	 *
	 * @param oscbuf Oscillator buffer, length = z * 2.
	 */

	void initOscBuf( double* oscbuf ) const
	{
		int i = z;

		while( i > 0 )
		{
			oscbuf[ 0 ] = 0.0;
			oscbuf[ 1 ] = 1.0;
			oscbuf += 2;
			i--;
		}
	}

	/**
	 * Function initializes window function buffer. This function generates
	 * Peaked Cosine window function.
	 *
	 * @param winbuf Windowing buffer, "z" elements are written.
	 * @param Alpha Peaked Cosine alpha parameter.
	 */

	void initWinBuf( double* winbuf, const double Alpha ) const
	{
		CDSPWindowGenPeakedCosine wf( Alpha, FilterLength * 0.5 );
		int i;

		// Generated values are stored in reverse order: the first generated
		// value goes to the last element, winbuf[ z - 1 ].

		for( i = 1; i <= z; i++ )
		{
			winbuf[ z - i ] = wf.generate();
		}
	}

	/**
	 * Function fills first half of symmetric-odd FIR kernel for the band.
	 * This function should be called successively for adjacent bands.
	 * Previous band's x2 should be equal to current band's x1. A band kernel
	 * consists of 2 elements: linear kernel and ramp kernel.
	 *
	 * @param x1 Band's left corner frequency (0..1).
	 * @param x2 Band's right corner frequency (0..1).
	 * @param kernbuf1 Band kernel buffer 1 (linear part), length = z.
	 * @param kernbuf2 Band kernel buffer 2 (ramp part), length = z.
	 * @param oscbuf Oscillation buffer. Before the first call of the
	 * fillBandKernel() should be initialized with the call of the
	 * initOscBuf() function.
	 * @param winbuf Buffer that contains windowing function.
	 */

	void fillBandKernel( const double x1, const double x2, double* kernbuf1,
		double* kernbuf2, double* oscbuf, const double* const winbuf )
	{
		const double s2_incr = AVIR_PI * x2;
		const double s2_coeff = 2.0 * cos( s2_incr );

		// Sine and cosine (sine shifted by AVIR_PI * 0.5) of the x2
		// oscillator at position -z + 1. The values at position -z are
		// stored in the first oscbuf pair and are read back below as the
		// "previous" values of the recurrence.

		double s2_value1 = sin( s2_incr * ( -z + 1 ));
		double c2_value1 = sin( s2_incr * ( -z + 1 ) + AVIR_PI * 0.5 );
		oscbuf[ 0 ] = sin( s2_incr * -z );
		oscbuf[ 1 ] = sin( s2_incr * -z + AVIR_PI * 0.5 );

		int ks;

		for( ks = 1; ks < z; ks++ )
		{
			// Fetch the pair left in oscbuf by the previous call (or by
			// initOscBuf()) as the "1" values, then replace the pair with
			// this call's "2" values for use by the next call.

			const int ks2 = ks * 2;
			const double s1_value1 = oscbuf[ ks2 ];
			const double c1_value1 = oscbuf[ ks2 + 1 ];
			oscbuf[ ks2 ] = s2_value1;
			oscbuf[ ks2 + 1 ] = c2_value1;

			// "x" is never zero inside this loop since ks < z.

			const double x = AVIR_PI * ( ks - z );
			const double v0 = winbuf[ ks - 1 ] / (( x1 - x2 ) * x );

			kernbuf1[ ks - 1 ] = ( x2 * s2_value1 - x1 * s1_value1 +
				( c2_value1 - c1_value1 ) / x ) * v0;

			kernbuf2[ ks - 1 ] = ( s2_value1 - s1_value1 ) * v0;

			// Advance both "2" oscillators by one position using the
			// values stored in the preceding oscbuf pair.

			s2_value1 = s2_coeff * s2_value1 - oscbuf[ ks2 - 2 ];
			c2_value1 = s2_coeff * c2_value1 - oscbuf[ ks2 - 1 ];
		}

		// The last element (position where "x" would be zero) is assigned
		// directly, without the division by "x".

		kernbuf1[ z - 1 ] = ( x2 * x2 - x1 * x1 ) / ( x1 - x2 ) * 0.5;
		kernbuf2[ z - 1 ] = -1.0;
	}

	/**
	 * Function copies band kernel's elements to the output buffer. The first
	 * "z" elements of the output buffer are overwritten.
	 *
	 * @param outbuf Output buffer.
	 * @param kernbuf1 Kernel buffer 1 (linear part).
	 * @param kernbuf2 Kernel buffer 2 (ramp part).
	 * @param c Multiplier for linear kernel element.
	 * @param d Multiplier for ramp kernel element.
	 */

	void copyBandKernel( double* outbuf, const double* const kernbuf1,
		const double* const kernbuf2, const double c, const double d ) const
	{
		int ks;

		for( ks = 0; ks < z; ks++ )
		{
			outbuf[ ks ] = c * kernbuf1[ ks ] + d * kernbuf2[ ks ];
		}
	}

	/**
	 * Function adds band kernel's elements to the output buffer. The first
	 * "z" elements of the output buffer are updated.
	 *
	 * @param outbuf Output buffer.
	 * @param kernbuf1 Kernel buffer 1 (linear part).
	 * @param kernbuf2 Kernel buffer 2 (ramp part).
	 * @param c Multiplier for linear kernel element.
	 * @param d Multiplier for ramp kernel element.
	 */

	void addBandKernel( double* outbuf, const double* const kernbuf1,
		const double* const kernbuf2, const double c, const double d ) const
	{
		int ks;

		for( ks = 0; ks < z; ks++ )
		{
			outbuf[ ks ] += c * kernbuf1[ ks ] + d * kernbuf2[ ks ];
		}
	}
};

/**
 * @brief Low-pass filter windowed by Peaked Cosine window function.
 *
 * This class implements calculation of linear-phase symmetric-odd FIR
 * low-pass filter windowed by the Peaked Cosine window function, for image
 * processing applications.
 */

class CDSPPeakedCosineLPF
{
public:
	int fl2; ///< Half filter's length, excluding the peak value. This value
		///< can be also used as filter's latency in samples (taps).
		///<
	int FilterLen; ///< Filter's length in samples (taps).
		///<

	/**
	 * Constructor initalizes *this object.
	 *
	 * @param aLen2 Half-length (non-truncated) of low-pass filter, in samples
	 * (taps).
	 * @param aFreq2 Low-pass filter's corner frequency [0; pi].
	 * @param aAlpha Peaked Cosine window function Alpha parameter.
	 */

	CDSPPeakedCosineLPF( const double aLen2, const double aFreq2,
		const double aAlpha )
		: fl2( (int) ceil( aLen2 ) - 1 )
		, FilterLen( fl2 + fl2 + 1 )
		, Len2( aLen2 )
		, Freq2( aFreq2 )
		, Alpha( aAlpha )
	{
	}

	/**
	 * Function generates a linear-phase low-pass filter windowed by Peaked
	 * Cosine window function.
	 *
	 * @param[out] op Output buffer, length = FilterLen (fl2 * 2 + 1).
	 * @param DCGain Required gain at DC. The resulting filter will be
	 * normalized to achieve this DC gain.
	 */

	template< class T >
	void generateLPF( T* op, const double DCGain )
	{
		CDSPWindowGenPeakedCosine wf( Alpha, Len2 );
		CSineGen f2( Freq2, 0.0 );

		op += fl2;
		T* op2 = op;
		f2.generate();
		int t = 1;

		*op = (T) ( Freq2 * wf.generate() / AVIR_PI );
		double s = *op;

		while( t <= fl2 )
		{
			const double v = f2.generate() * wf.generate() / t / AVIR_PI;
			op++;
			op2--;
			*op = (T) v;
			*op2 = (T) v;
			s += *op + *op2;
			t++;
		}

		t = FilterLen;
		s = DCGain / s;

		while( t > 0 )
		{
			*op2 = (T) ( *op2 * s );
			op2++;
			t--;
		}
	}

private:
	double Len2; ///< Half-length (non-truncated) of low-pass filter, in
		///< samples (taps).
		///<
	double Freq2; ///< Low-pass filter's corner frequency.
		///<
	double Alpha; ///< Peaked Cosine window function Alpha parameter.
		///<
};

/**
 * @brief Buffer class for parametrized low-pass filter.
 *
 * This class extends the CBuffer< double > class by adding several variables
 * that define a symmetric-odd FIR low-pass filter windowed by Peaked Cosine
 * window function. This class can be used to compare filters without
 * comparing their buffer contents.
 */

class CFltBuffer : public CBuffer< double >
{
public:
	double Len2; ///< Half-length (non-truncated) of low-pass filters, in
		///< samples (taps).
		///<
	double Freq; ///< Low-pass filter's corner frequency.
		///<
	double Alpha; ///< Peaked Cosine window function Alpha parameter.
		///<
	double DCGain; ///< DC gain applied to the filter.
		///<

	/**
	 * Default constructor creates the object via the base class' default
	 * constructor, with all filter parameters set to zero.
	 */

	CFltBuffer()
		: Len2( 0.0 )
		, Freq( 0.0 )
		, Alpha( 0.0 )
		, DCGain( 0.0 )
	{
	}

	/**
	 * Operator compares filter parameters only, buffer contents are not
	 * examined.
	 *
	 * @param b2 Filter buffer to compare *this object to.
	 * @return Operator returns "true" if both filters have same parameters.
	 */

	bool operator == ( const CFltBuffer& b2 ) const
	{
		if( Len2 != b2.Len2 || Freq != b2.Freq )
		{
			return( false );
		}

		return( Alpha == b2.Alpha && DCGain == b2.DCGain );
	}
};

/**
 * @brief Sinc function-based fractional delay filter bank.
 *
 * Class implements storage and initialization of a bank of sinc
 * function-based fractional delay filters, expressed as 1st order polynomial
 * interpolation coefficients. The filters are produced from a single "long"
 * windowed low-pass filter. Also supports 0th-order ("nearest neighbor")
 * interpolation.
 *
 * This class also supports multiplication of each fractional delay filter by
 * an external filter (usually a low-pass filter).
 *
 * @tparam fptype Specifies storage type of the filter coefficients bank. The
 * filters are initially calculated using the "double" precision.
 */

template< class fptype >
class CDSPFracFilterBankLin
{
public:
	/**
	 * Constructor marks the bank as "never initialized" by setting Order to
	 * -1. Since init() accepts orders 0 and 1 only, the first init() call
	 * can never take its "parameters unchanged" early-return path.
	 */

	CDSPFracFilterBankLin()
		: Order( -1 )
	{
	}

	/**
	 * Function copies a limited set of parameters of the source filter bank.
	 * The actual filters are not copied. Such copying is used during
	 * filtering steps "modeling" stage. A further init() function call is
	 * required.
	 *
	 * @param s Source filter bank.
	 */

	void copyInitParams( const CDSPFracFilterBankLin& s )
	{
		WFLen2 = s.WFLen2;
		WFFreq = s.WFFreq;
		WFAlpha = s.WFAlpha;
		FracCount = s.FracCount;
		Order = s.Order;
		Alignment = s.Alignment;
		SrcFilterLen = s.SrcFilterLen;
		FilterLen = s.FilterLen;
		FilterSize = s.FilterSize;
		IsSrcTableBuilt = false;
		ExtFilter = s.ExtFilter;
		TableFillFlags.alloc( s.TableFillFlags.getCapacity() );
		int i;

		// Copy table fill flags, but shifted so that further initialization
		// is still possible (such feature should not be used, though). The
		// shifted value stays non-zero for filters the source bank already
		// created, which is what calcInitComplexity() tests for.

		for( i = 0; i < TableFillFlags.getCapacity(); i++ )
		{
			TableFillFlags[ i ] = (uint8_t) ( s.TableFillFlags[ i ] << 2 );
		}
	}

	/**
	 * Operator compares *this filter bank and another filter bank and returns
	 * "true" if their parameters are equal. Alignment is not taken into
	 * account.
	 *
	 * @param s Filter bank to compare to.
	 * @return "True" if compared banks have equal parameters.
	 */

	bool operator == ( const CDSPFracFilterBankLin& s ) const
	{
		return( Order == s.Order && WFLen2 == s.WFLen2 &&
			WFFreq == s.WFFreq && WFAlpha == s.WFAlpha &&
			FracCount == s.FracCount && ExtFilter == s.ExtFilter );
	}

	/**
	 * Function initializes (builds) the filter bank based on the supplied
	 * parameters. If the supplied parameters are equal to previously defined
	 * parameters, function does nothing (alignment is assumed to be never
	 * changing between the init() function calls).
	 *
	 * Only the bank's dimensions are calculated here; the source table and
	 * the individual filters are built later, on the first getFilter() call.
	 *
	 * @param ReqFracCount Required number of fractional delays in the filter
	 * bank. The minimal value is 2.
	 * @param ReqOrder Required order of the interpolation polynomial
	 * (0 or 1).
	 * @param BaseLen Low-pass filter's base length, in samples (taps).
	 * Affects the actual length of the filter and its overall steepness.
	 * @param Cutoff Low-pass filter's normalized cutoff frequency [0; 1].
	 * @param aWFAlpha Peaked Cosine window function's Alpha parameter.
	 * @param aExtFilter External filter to apply to each fractional delay
	 * filter.
	 * @param aAlignment Memory alignment of the filter bank, power-of-2
	 * value. 0 - use default stdlib alignment.
	 * @param FltLenAlign Filter's length alignment, power-of-2 value.
	 */

	void init( const int ReqFracCount, const int ReqOrder,
		const double BaseLen, const double Cutoff, const double aWFAlpha,
		const CFltBuffer& aExtFilter, const int aAlignment = 0,
		const int FltLenAlign = 1 )
	{
		double NewWFLen2 = 0.5 * BaseLen * ReqFracCount;
		double NewWFFreq = AVIR_PI * Cutoff / ReqFracCount;
		double NewWFAlpha = aWFAlpha;

		// ReqOrder is compared first: on a never-initialized bank Order is
		// -1, so the remaining (still unassigned) members are not read.

		if( ReqOrder == Order && NewWFLen2 == WFLen2 && NewWFFreq == WFFreq &&
			NewWFAlpha == WFAlpha && ReqFracCount == FracCount &&
			aExtFilter == ExtFilter )
		{
			IsInitRequired = false;
			return;
		}

		WFLen2 = NewWFLen2;
		WFFreq = NewWFFreq;
		WFAlpha = NewWFAlpha;
		FracCount = ReqFracCount;
		Order = ReqOrder;
		Alignment = aAlignment;
		ExtFilter = aExtFilter;

		CDSPPeakedCosineLPF p( WFLen2, WFFreq, WFAlpha );
		SrcFilterLen = ( p.fl2 / ReqFracCount + 1 ) * 2;

		const int ElementSize = ReqOrder + 1;
		FilterLen = SrcFilterLen;

		if( ExtFilter.getCapacity() > 0 )
		{
			FilterLen += ExtFilter.getCapacity() - 1;
		}

		// Round FilterLen up to a multiple of FltLenAlign (power of 2).

		FilterLen = ( FilterLen + FltLenAlign - 1 ) & ~( FltLenAlign - 1 );
		FilterSize = FilterLen * ElementSize;
		IsSrcTableBuilt = false;
		IsInitRequired = true;
	}

	/**
	 * @return The length of each fractional delay filter, in samples (taps).
	 * Always an even value.
	 */

	int getFilterLen() const
	{
		return( FilterLen );
	}

	/**
	 * @return The number of fractional filters in use by *this bank.
	 */

	int getFracCount() const
	{
		return( FracCount );
	}

	/**
	 * @return The order of the interpolation polynomial.
	 */

	int getOrder() const
	{
		return( Order );
	}

	/**
	 * Function returns the pointer to the specified interpolation table
	 * filter. The source table and the requested filter are created on
	 * demand on the first access.
	 *
	 * @param i Filter (fractional delay) index, in the range 0 to
	 * ReqFracCount - 1, inclusive.
	 * @return Pointer to filter. Higher order polynomial coefficients are
	 * stored after previous order coefficients, separated by FilterLen
	 * elements.
	 */

	const fptype* getFilter( const int i )
	{
		if( !IsSrcTableBuilt )
		{
			buildSrcTable();
		}

		fptype* const Res = &Table[ i * FilterSize ];

		if(( TableFillFlags[ i ] & 2 ) == 0 )
		{
			createFilter( i );
			TableFillFlags[ i ] |= 2;

			if( Order > 0 )
			{
				// The Table holds FracCount + 1 filters, so the "next"
				// filter exists even for the last valid index.

				createFilter( i + 1 );
				const fptype* const Res2 = Res + FilterSize;
				fptype* const op = Res + FilterLen;
				int j;

				// Create higher-order interpolation coefficients (linear
				// interpolation).

				for( j = 0; j < FilterLen; j++ )
				{
					op[ j ] = Res2[ j ] - Res[ j ];
				}
			}
		}

		return( Res );
	}

	/**
	 * Function makes sure all fractional delay filters were created.
	 */

	void createAllFilters()
	{
		int i;

		for( i = 0; i < FracCount; i++ )
		{
			getFilter( i );
		}
	}

	/**
	 * Function returns an approximate initialization complexity, expressed in
	 * the number of multiply-add operations. This includes fractional delay
	 * filters calculation and multiplication by an external filter. This
	 * function can only be called after the init() function.
	 *
	 * @param FracUseMap Fractional delays use map, each element corresponds
	 * to a single fractional delay, will be compared to the internal table
	 * fill flags. This map should include 0 and 1 values only.
	 * @return The complexity of the initialization, expressed in the number
	 * of multiply-add operations.
	 */

	int calcInitComplexity( const CBuffer< uint8_t >& FracUseMap ) const
	{
		const int FltInitCost = 65; // Cost to initialize a single sample
			// of the fractional delay filter.
		const int FltUseCost = FilterLen * Order +
			SrcFilterLen * ExtFilter.getCapacity(); // Cost to use a single
			// fractional delay filter.
		const int ucb[ 2 ] = { 0, FltUseCost };
		int ic;
		int i;

		if( IsInitRequired )
		{
			// The whole source table has to be generated, plus every filter
			// marked as used.

			ic = FracCount * SrcFilterLen * FltInitCost;

			for( i = 0; i < FracCount; i++ )
			{
				ic += ucb[ FracUseMap[ i ]];
			}
		}
		else
		{
			// Only used filters that have no fill flags set add cost.

			ic = 0;

			for( i = 0; i < FracCount; i++ )
			{
				if( FracUseMap[ i ] != 0 )
				{
					ic += ucb[ TableFillFlags[ i ] == 0 ? 1 : 0 ];
				}
			}
		}

		return( ic );
	}

private:
	static const int InterpPoints = 2; ///< The maximal number of points the
		///< interpolation is based on.
		///<
	double WFLen2; ///< Window function's Len2 parameter.
		///<
	double WFFreq; ///< Window function's Freq parameter.
		///<
	double WFAlpha; ///< Window function's Alpha parameter.
		///<
	int FracCount; ///< The required number of fractional delay filters.
		///<
	int Order; ///< The order of the interpolation polynomial. Equals -1
		///< until the first init() function call.
		///<
	int Alignment; ///< The required filter table alignment.
		///<
	int SrcFilterLen; ///< Length of the "source" filters. This is always an
		///< even value.
		///<
	int FilterLen; ///< Specifies the number of samples (taps) each fractional
		///< delay filter has. This is always an even value, adjusted by the
		///< FltLenAlign.
		///<
	int FilterSize; ///< The size of a single filter element, equals
		///< FilterLen * ElementSize.
		///<
	bool IsInitRequired; ///< "True" if SrcTable filter table initialization
		///< is required. This value is available only after the call to the
		///< init() function.
		///<
	CBuffer< fptype > Table; ///< Interpolation table, allocated in the
		///< buildSrcTable() function to hold
		///< ( FracCount + 1 ) * FilterSize elements.
		///<
	CBuffer< uint8_t > TableFillFlags; ///< Contains ReqFracCount + 1
		///< elements. Bit 0 of every element is 1 if Table already contains
		///< the filter from SrcTable filtered by ExtFilter. Bit 1 of every
		///< element means higher order coefficients were filled for the
		///< filter.
		///<
	CFltBuffer ExtFilter; ///< External filter that should be applied to every
		///< fractional delay filter. Can be empty. Half of this filter's
		///< capacity is used as latency (group delay) value of the filter.
		///<
	CBuffer< double > SrcTable; ///< Source table of delay filters, contains
		///< ReqFracCount + 1 elements. This table is used to fill the Table
		///< with the actual filters, filtered by an external filter.
		///<
	bool IsSrcTableBuilt; ///< "True" if the SrcTable was built already. This
		///< variable is set to "false" in the init() function.
		///<

	/**
	 * Function builds source table used in the createFilter() function.
	 * It also resets all table fill flags and allocates the Table buffer.
	 */

	void buildSrcTable()
	{
		IsSrcTableBuilt = true;
		IsInitRequired = false;

		CDSPPeakedCosineLPF p( WFLen2, WFFreq, WFAlpha );

		const int BufLen = SrcFilterLen * FracCount + InterpPoints - 1;
		const int BufOffs = InterpPoints / 2 - 1;
		const int BufCenter = SrcFilterLen * FracCount / 2 + BufOffs;

		// Zero the parts of the buffer on both sides of the span the "long"
		// low-pass filter is generated into.

		CBuffer< double > Buf( BufLen );
		memset( Buf, 0, ( BufCenter - p.fl2 ) * sizeof( double ));
		int i = BufLen - BufCenter - p.fl2 - 1;
		memset( &Buf[ BufLen - i ], 0, i * sizeof( double ));

		p.generateLPF( &Buf[ BufCenter - p.fl2 ], FracCount );

		SrcTable.alloc(( FracCount + 1 ) * SrcFilterLen );
		TableFillFlags.alloc( FracCount + 1 );
		int j;
		double* op0 = SrcTable;

		// Split the "long" filter into FracCount + 1 source filters: each
		// one takes every FracCount-th sample of the buffer. The output
		// pointer advances continuously while "i" counts down, so the first
		// SrcTable slot receives the samples that start at offset FracCount.

		for( i = FracCount; i >= 0; i-- )
		{
			TableFillFlags[ i ] = 0;
			double* ip = Buf + BufOffs + i;

			for( j = 0; j < SrcFilterLen; j++ )
			{
				op0[ 0 ] = ip[ 0 ];
				op0++;
				ip += FracCount;
			}
		}

		Table.alloc(( FracCount + 1 ) * FilterSize, Alignment );
	}

	/**
	 * Function creates the specified filter in the Table by copying it from
	 * the SrcTable and filtering by ExtFilter. Function does nothing if
	 * filter was already created.
	 *
	 * @param k Filter index to create, in the range 0 to FracCount,
	 * inclusive.
	 */

	void createFilter( const int k )
	{
		if( TableFillFlags[ k ] != 0 )
		{
			return;
		}

		TableFillFlags[ k ] |= 1;
		const int ExtFilterLatency = ExtFilter.getCapacity() / 2;
		const int ResLatency = ExtFilterLatency + SrcFilterLen / 2;
		int ResLen = SrcFilterLen;

		if( ExtFilter.getCapacity() > 0 )
		{
			ResLen += ExtFilter.getCapacity() - 1;
		}

		// Offset that places the result's latency point at FilterLen / 2;
		// the elements before and after the result are zeroed.

		const int ResOffs = FilterLen / 2 - ResLatency;
		fptype* op = &Table[ k * FilterSize ];
		int i;

		for( i = 0; i < ResOffs; i++ )
		{
			op[ i ] = 0.0;
		}

		for( i = ResOffs + ResLen; i < FilterLen; i++ )
		{
			op[ i ] = 0.0;
		}

		op += ResOffs;
		const double* const srcflt = &SrcTable[ k * SrcFilterLen ];

		if( ExtFilter.getCapacity() == 0 )
		{
			// No external filter: plain copy with type conversion.

			for( i = 0; i < ResLen; i++ )
			{
				op[ i ] = (fptype) srcflt[ i ];
			}

			return;
		}

		// Perform convolution of extflt and srcflt.

		const double* const extflt = &ExtFilter[ 0 ];
		int j;

		for( j = 0; j < ResLen; j++ )
		{
			int eo = 0; // Starting offset within the external filter.
			int l = j - ExtFilter.getCapacity() + 1;
			int r = l + ExtFilter.getCapacity();

			// Clip the [l; r) source range to the bounds of the source
			// filter, skipping external filter taps that fall before it.

			if( l < 0 )
			{
				eo -= l;
				l = 0;
			}

			if( r > SrcFilterLen )
			{
				r = SrcFilterLen;
			}

			const double* const extfltb = extflt + eo;
			const double* const srcfltb = srcflt + l;
			double s = 0.0;
			l = r - l;

			for( i = 0; i < l; i++ )
			{
				s += extfltb[ i ] * srcfltb[ i ];
			}

			op[ j ] = (fptype) s;
		}
	}
};

/**
 * @brief Thread pool for multi-threaded image resizing operation.
 *
 * This base class is used to organize a multi-threaded image resizing
 * operation. The thread pool should consist of threads that initially wait
 * for a signal. Upon receiving a signal (via the startAllWorkloads()
 * function) each previously added thread should execute its workload's
 * process() function once, and return to the wait signal state again. The
 * thread pool should be also able to efficiently wait for all workloads to
 * finish via the waitAllWorkloadsToFinish() function.
 *
 * The image resizing algorithm makes calls to functions of this class.
 */

class CImageResizerThreadPool
{
public:
	/**
	 * @brief Thread pool's workload object class.
	 *
	 * Base class for the objects that carry out the actual work distributed
	 * over several threads.
	 */

	class CWorkload
	{
	public:
		virtual ~CWorkload()
		{
		}

		/**
		 * Function is called from within a thread whenever thread pool's
		 * startAllWorkloads() function is called.
		 */

		virtual void process() = 0;
	};

	CImageResizerThreadPool()
	{
	}

	virtual ~CImageResizerThreadPool()
	{
	}

	/**
	 * @return The suggested number of workloads (and their associated
	 * threads) to add, never below 1. An implementation may base the value
	 * on the number of physical and virtual cores present in the system,
	 * and on other considerations. This base implementation returns 1.
	 */

	virtual int getSuggestedWorkloadCount() const
	{
		return 1;
	}

	/**
	 * Function adds a new workload (and possibly thread) to the thread pool.
	 * The caller chooses how many parallel workloads (and threads) it needs,
	 * without exceeding the value returned by the
	 * getSuggestedWorkloadCount() function. How many workloads are served by
	 * a single thread is implementation-specific, but for efficiency each
	 * workload should have a thread of its own.
	 *
	 * Workload objects are added only once: the same set of objects is
	 * processed on every startAllWorkloads() function call. The caller
	 * updates the state of the workload objects and then calls the
	 * startAllWorkloads() function to process them.
	 *
	 * This base implementation does nothing.
	 *
	 * @param Workload Workload object whose process() function will be called
	 * from within the thread when the startAllWorkloads() function is called.
	 */

	virtual void addWorkload( CWorkload* const Workload )
	{
		(void) Workload; // Intentionally unused in the base class.
	}

	/**
	 * Function starts all workloads associated with threads previously added
	 * via the addWorkload() function. An implementation is assumed to
	 * perform the necessary "memory barrier" (or "cache sync") kind of
	 * operation so that all threads observe the changes made to the workload
	 * objects while the threads were waiting. This base implementation does
	 * nothing.
	 */

	virtual void startAllWorkloads()
	{
	}

	/**
	 * Function waits for all workloads to finish. This base implementation
	 * returns immediately.
	 */

	virtual void waitAllWorkloadsToFinish()
	{
	}

	/**
	 * Function removes all workloads previously added via the addWorkload()
	 * function. This function gets called only after the
	 * waitAllWorkloadsToFinish() function call. This base implementation
	 * does nothing.
	 */

	virtual void removeAllWorkloads()
	{
	}
};

/**
 * @brief Resizing algorithm parameters structure.
 *
 * This structure holds all selectable parameters used by the resizing
 * algorithm at various stages, for both downsizing and upsizing. No other
 * parameters exist that can optimize the performance of the resizing
 * algorithm. Filter length parameters can take fractional values.
 *
 * Besides quality, these parameters (except Alpha parameters) directly affect
 * the computational cost of the resizing algorithm. It is possible to trade
 * the visual quality for computational cost.
 *
 * Anti-alias filtering during downsizing can be defined as a considerable
 * reduction of contrast of smallest features of an image. Unfortunately, such
 * de-contrasting partially affects features of all sizes thus producing a
 * non-linearity of frequency response. All pre-defined parameter sets are
 * described by 3 values separated by slashes. The first value is the
 * de-contrasting factor of small features (which are being removed) while
 * the second value is the de-contrasting factor of large features (which
 * should remain intact), with value of 1 equating to "no contrast change".
 * The third value is the optimization score (see below), with value of 0
 * equating to the "perfect" linearity of frequency response.
 *
 * The pre-defined parameter sets offered by this library were auto-optimized
 * for the given LPFltBaseLen, IntFltLen and CorrFltAlpha values. The
 * optimization goal was to minimize the score: the sum of squares of the
 * difference between original and processed images (which was not actually
 * resized, k=1). The original image was a 0.5 megapixel uniformly-distributed
 * white-noise image with pixel intensities in the 0-1 range. Such goal
 * converges very well and produces filtering system with the flattest
 * frequency response possible for the given constraints. With this goal,
 * increasing the LPFltBaseLen value reduces the general amount of aliasing
 * artifacts.
 */

struct CImageResizerParams
{
	double CorrFltAlpha; ///< Correction filter: Alpha parameter of the Peaked
		///< Cosine window function. The "usable" values are in the narrow
		///< range 1.0 to 1.5.
		///<
	double CorrFltLen; ///< Correction filter: length in samples (taps). The
		///< "usable" range is narrow, 5.5 to 8, as to minimize the
		///< "overcorrection" which is mathematically precise, but visually
		///< unacceptable.
		///<
	double IntFltAlpha; ///< Interpolation low-pass filter: Alpha parameter of
		///< the Peaked Cosine window function. The "usable" values are in
		///< the range 1.5 to 2.5.
		///<
	double IntFltCutoff; ///< Interpolation low-pass filter: cutoff frequency
		///< (normalized, [0; 1]). The "usable" range is 0.6 to 0.8.
		///<
	double IntFltLen; ///< Interpolation low-pass filter: length in samples
		///< (taps). The length value should be at least 18 or otherwise a
		///< "dark grid" artifact will be introduced if a further sharpening
		///< is applied. IntFltLen together with other IntFlt parameters
		///< should be tuned in a way that produces the flattest frequency
		///< response in 0-0.5 normalized frequency range (this range is due
		///< to 2X upsampling).
		///<
	double LPFltAlpha; ///< Low-pass filter: Alpha parameter of the Peaked
		///< Cosine window function. The "usable" values are in the range
		///< 1.5 to 6.5.
		///<
	double LPFltBaseLen; ///< Low-pass (aka anti-aliasing or reconstruction)
		///< filter: base length in samples (taps), further adjusted by the
		///< actual cutoff frequency, upsampling and downsampling factors.
		///< The "usable" range is between 6 and 9.
		///<
	double LPFltCutoffMult; ///< Low-pass filter: cutoff frequency
		///< multiplier. This value can be both below and above 1.0 as
		///< low-pass filters are inserted on downsampling and upsampling
		///< steps and always have corner frequency equal to or below 0.5pi.
		///< This multiplier shifts low-pass filter's corner frequency towards
		///< lower (if below 1.0) or higher (if above 1.0) frequencies. This
		///< multiplier can be way below 1.0 since any additional
		///< high-frequency damping will be partially corrected by the
		///< correction filter. The "usable" range is 0.3 to 1.0.
		///<
	double HBFltAlpha; ///< Half-band filter's Alpha. Assigned internally.
		///<
	double HBFltCutoff; ///< Half-band filter's cutoff point [0; 1]. Assigned
		///< internally.
		///<
	double HBFltLen; ///< Length of the half-band low-pass filter. Assigned
		///< internally. Internally used to perform 2X or higher downsampling.
		///< These filter parameters should be treated as "technical" and do
		///< not require adjustment as they were tuned to suit all
		///< combinations of other parameters. This half-band filter provides
		///< a wide transition band (for minimal ringing artifacts) and a high
		///< stop-band attenuation (for minimal aliasing).
		///<

	/**
	 * Constructor assigns the half-band filter parameters only. All other
	 * parameters are left unassigned here.
	 */

	CImageResizerParams()
		: HBFltAlpha( 1.75395 ), HBFltCutoff( 0.40356 ), HBFltLen( 22.00000 )
	{
	}
};

/**
 * @brief The default set of resizing algorithm parameters
 * (10.01/1.029/0.019169).
 *
 * This is the default set of resizing parameters that was designed to deliver
 * a sharp image while still providing a low amount of ringing artifacts, and
 * having a reasonable computational cost.
 */

struct CImageResizerParamsDef : public CImageResizerParams
{
	/**
	 * Constructor assigns the default preset. The half-band (HBFlt*) values
	 * stay as assigned by the base class constructor.
	 */

	CImageResizerParamsDef()
	{
		// Preset annotation carried over from the original source:
		// 10.01/1.88/1.029(522.43)/0.019169:258648,446808

		// Low-pass filter.

		LPFltBaseLen = 7.78;
		LPFltCutoffMult = 0.78797;
		LPFltAlpha = 3.40127;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.75493;
		IntFltAlpha = 2.27825;

		// Correction filter.

		CorrFltLen = 6.30770;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Set of resizing algorithm parameters for ultra-low-ringing
 * performance (7.69/1.069/0.000245).
 *
 * This set of resizing algorithm parameters offers the lowest amount of
 * ringing this library is capable of providing while still offering a decent
 * quality. Low ringing is attained at the expense of higher aliasing
 * artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsULR : public CImageResizerParams
{
	/**
	 * Constructor assigns the ultra-low-ringing preset. The half-band
	 * (HBFlt*) values stay as assigned by the base class constructor.
	 */

	CImageResizerParamsULR()
	{
		// Preset annotation carried over from the original source:
		// 7.69/1.97/1.069(31445.45)/0.000245:258627,436845

		// Low-pass filter.

		LPFltBaseLen = 6.40;
		LPFltCutoffMult = 0.61314;
		LPFltAlpha = 1.73455;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.73986;
		IntFltAlpha = 2.11453;

		// Correction filter.

		CorrFltLen = 5.83280;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Set of resizing algorithm parameters for low-ringing performance
 * (7.86/1.065/0.000106).
 *
 * This set of resizing algorithm parameters offers a very low-ringing
 * performance at the expense of higher aliasing artifacts and a slightly
 * reduced contrast.
 */

struct CImageResizerParamsLR : public CImageResizerParams
{
	/**
	 * Constructor assigns the low-ringing preset. The half-band (HBFlt*)
	 * values stay as assigned by the base class constructor.
	 */

	CImageResizerParamsLR()
	{
		// Preset annotation carried over from the original source:
		// 7.86/1.96/1.065(73865.02)/0.000106:258636,437381

		// Low-pass filter.

		LPFltBaseLen = 7.00;
		LPFltCutoffMult = 0.68881;
		LPFltAlpha = 1.79306;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.74090;
		IntFltAlpha = 2.25322;

		// Correction filter.

		CorrFltLen = 5.87671;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Set of resizing algorithm parameters for lower-ringing performance
 * (8.86/1.046/0.010168).
 *
 * This set of resizing algorithm parameters offers a lower-ringing
 * performance in comparison to the default setting, at the expense of higher
 * aliasing artifacts and a slightly reduced contrast.
 */

struct CImageResizerParamsLow : public CImageResizerParams
{
	/**
	 * Constructor assigns the lower-ringing preset. The half-band (HBFlt*)
	 * values stay as assigned by the base class constructor.
	 */

	CImageResizerParamsLow()
	{
		// Preset annotation carried over from the original source:
		// 8.86/1.92/1.046(871.54)/0.010168:258647,442252

		// Low-pass filter.

		LPFltBaseLen = 7.66;
		LPFltCutoffMult = 0.75380;
		LPFltAlpha = 2.19427;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.74674;
		IntFltAlpha = 2.36704;

		// Correction filter.

		CorrFltLen = 6.09757;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Set of resizing algorithm parameters for low-aliasing
 * resizing (11.81/1.012/0.038379).
 *
 * This set of resizing algorithm parameters offers a considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This is an intermediate setting between the default and Ultra
 * parameters.
 */

struct CImageResizerParamsHigh : public CImageResizerParams
{
	/**
	 * Constructor assigns the low-aliasing preset. The half-band (HBFlt*)
	 * values stay as assigned by the base class constructor.
	 */

	CImageResizerParamsHigh()
	{
		// Preset annotation carried over from the original source:
		// 11.81/1.83/1.012(307.84)/0.038379:258660,452719

		// Low-pass filter.

		LPFltBaseLen = 8.18;
		LPFltCutoffMult = 0.79172;
		LPFltAlpha = 4.39527;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.75856;
		IntFltAlpha = 2.44917;

		// Correction filter.

		CorrFltLen = 6.80909;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Set of resizing algorithm parameters for ultra low-aliasing
 * resizing (13.65/1.001/0.000483).
 *
 * This set of resizing algorithm parameters offers a very considerable
 * anti-aliasing performance with a good frequency response linearity (and
 * contrast). This set of parameters is computationally expensive and may
 * produce ringing artifacts on sharp features.
 */

struct CImageResizerParamsUltra : public CImageResizerParams
{
	/**
	 * Constructor assigns the ultra low-aliasing preset. The half-band
	 * (HBFlt*) values stay as assigned by the base class constructor.
	 */

	CImageResizerParamsUltra()
	{
		// Preset annotation carried over from the original source:
		// 13.65/1.79/1.001(28288.41)/0.000483:258658,457974

		// Low-pass filter.

		LPFltBaseLen = 8.34;
		LPFltCutoffMult = 0.78002;
		LPFltAlpha = 5.55209;

		// Interpolation low-pass filter.

		IntFltLen = 18.0;
		IntFltCutoff = 0.75462;
		IntFltAlpha = 1.93750;

		// Correction filter.

		CorrFltLen = 7.48060;
		CorrFltAlpha = 1.0;
	}
};

/**
 * @brief Image resizing variables class.
 *
 * This is a utility "catch all" class that defines various variables used
 * during image resizing. Several variables that are explicitly initialized in
 * this class' constructor are also used as additional "input" variables to
 * the image resizing function. These variables will not be changed by the
 * avir::CImageResizer<>::resizeImage() function.
 */

class CImageResizerVars
{
public:
	/**
	 * Constructor assigns defaults to the "input" variables only (ox, oy,
	 * ThreadPool, UseSRGBGamma, BuildMode, RndSeed). All other variables
	 * are left unassigned here.
	 */

	CImageResizerVars()
		: ox( 0.0 ), oy( 0.0 ), ThreadPool( NULL ), UseSRGBGamma( false ),
		BuildMode( -1 ), RndSeed( 0 )
	{
	}

	int ElCount; ///< How many "fptype" elements store 1 pixel.
		///<
	int ElCountIO; ///< How many elements store 1 pixel in the source and
		///< destination images.
		///<
	int fppack; ///< How many atomic types a single "fptype" element holds.
		///<
	int fpalign; ///< Suggested alignment size in bytes. This is not a
		///< required alignment, because image resizing algorithm cannot be
		///< made to have a strictly aligned data access in all cases (e.g.
		///< de-interleaved interpolation cannot perform aligned accesses).
		///<
	int elalign; ///< Length alignment of arrays of elements. Applies to
		///< filters and intermediate buffers: filters and scanlines are
		///< forced to have a length which is a multiple of this value, for
		///< more efficient SIMD implementation.
		///<
	int packmode; ///< 0 if interleaved packing, 1 if de-interleaved.
		///<
	int BufLen[ 2 ]; ///< Intermediate buffers' lengths in "fptype" elements.
		///<
	int BufOffs[ 2 ]; ///< Offsets into the intermediate buffers, used to
		///< provide prefix elements required during processing so that no
		///< "out of range" access happens. This offset is a multiple of
		///< ElCount if pixels are stored in interleaved form.
		///<
	double k; ///< Resizing step coefficient, updated to reflect the actually
		///< used coefficient during resizing.
		///<
	double o; ///< Starting pixel offset inside the source image, updated to
		///< reflect the actually used offset during resizing.
		///<
	int ResizeStep; ///< Index of the resizing step in the latest filtering
		///< steps array.
		///<
	double InGammaMult; ///< Input gamma multiplier, used to convert input
		///< data to 0 to 1 range. 0.0 if no gamma is in use.
		///<
	double OutGammaMult; ///< Output gamma multiplier, used to convert data to
		///< 0 to 255/65535 range. 0.0 if no gamma is in use.
		///<

	double ox; ///< Start X pixel offset within source image (can be
		///< negative). Positive offset moves image to the left. Defaults
		///< to 0.
		///<
	double oy; ///< Start Y pixel offset within source image (can be
		///< negative). Positive offset moves image to the top. Defaults
		///< to 0.
		///<
	CImageResizerThreadPool* ThreadPool; ///< Thread pool to be used by the
		///< image resizing function. Set to NULL (the default) to use
		///< single-threaded processing.
		///<
	bool UseSRGBGamma; ///< Perform sRGB gamma linearization (correction).
		///< Defaults to "false".
		///<
	int BuildMode; ///< The build mode to use, for debugging purposes. Set to
		///< -1 (the default) to select a minimal-complexity mode
		///< automatically. All build modes deliver similar results with
		///< minor deviations.
		///<
	int RndSeed; ///< Random seed parameter, 0 by default. This parameter may
		///< be incremented after each random generator initialization. The
		///< use of this variable depends on the ditherer implementation.
		///<
};

/**
 * @brief Image resizer's filtering step class.
 *
 * Class defines data to perform a single filtering step over a whole
 * horizontal or vertical scanline. Resizing consists of 1 or more steps that
 * may be performed before the actual resizing takes place. Filtering may also
 * follow a resizing step. Each step must ensure that scanline data contains
 * enough pixels to perform the next step (which may be resizing) without
 * exceeding scanline's bounds.
 *
 * A derived class must implement several "const" and "static" functions that
 * are used to perform the actual filtering in interleaved or de-interleaved
 * mode.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template< class fptype, class fptypeatom >
class CImageResizerFilterStep
{
public:
	bool IsUpsample; ///< "True" for an upsampling step, "false" for a
		///< downsampling step. Should be set to "false" if ResampleFactor
		///< equals 0.
		///<
	int ResampleFactor; ///< Resample factor (>=1). If 0, this is a resizing
		///< step. This value should be >1 if IsUpsample equals "true".
		///<
	CBuffer< fptype > Flt; ///< Filter applied at this step.
		///<
	CFltBuffer FltOrig; ///< Originally-designed filter. This buffer may not
		///< be assigned. Assigned by filters that precede the resizing step
		///< if such filter is planned to be embedded into the interpolation
		///< filter as "external" filter. If IsUpsample=true and this filter
		///< buffer is not empty, the upsampling step will not itself apply
		///< any filtering over upsampled input scanline.
		///<
	double DCGain; ///< DC gain which was applied to the filter. Not defined
		///< if ResampleFactor = 0.
		///<
	int FltLatency; ///< Latency (group delay, shift) of the filter, in
		///< pixels.
		///<
	const CImageResizerVars* Vars; ///< Image resizing-related variables.
		///<
	int InLen; ///< Length of the input scanline, in pixels.
		///<
	int InBuf; ///< Input buffer index, 0 or 1.
		///<
	int InPrefix; ///< Required input prefix pixels. These prefix pixels will
		///< be filled with source scanline's first pixel value. If IsUpsample
		///< is "true", this is the additional number of times the first pixel
		///< will be filtered before processing scanline, this number is also
		///< reflected in the OutPrefix.
		///<
	int InSuffix; ///< Required input suffix pixels. These suffix pixels will
		///< be filled with source scanline's last pixel value. If IsUpsample
		///< is "true", this is the additional number of times the last pixel
		///< will be filtered before processing scanline, this number is also
		///< reflected in the OutSuffix.
		///<
	int InElIncr; ///< Pixel element increment within the input buffer, used
		///< during de-interleaved processing: in this case each image's
		///< channel is stored independently, InElIncr elements apart.
		///<
	int OutLen; ///< Length of the resulting scanline.
		///<
	int OutBuf; ///< Output buffer index. 0 or 1; 2 for the last step.
		///<
	int OutPrefix; ///< Required output prefix pixels. These prefix pixels
		///< will not be pre-filled with any values. Value is valid only if
		///< IsUpsample equals "true".
		///<
	int OutSuffix; ///< Required output suffix pixels. These suffix pixels
		///< will not be pre-filled with any values. Value is valid only if
		///< IsUpsample equals "true".
		///<
	int OutElIncr; ///< Pixel element increment within the output buffer, used
		///< during de-interleaved processing. Equals to the InElIncr of the
		///< next step.
		///<
	CBuffer< fptype > PrefixDC; ///< DC component fluctuations added at the
		///< start of the resulting scanline, used when IsUpsample equals
		///< "true".
		///<
	CBuffer< fptype > SuffixDC; ///< DC component fluctuations added at the
		///< end of the resulting scanline, used when IsUpsample equals
		///< "true".
		///<
	int EdgePixelCount; ///< The number of edge pixels added. Affects the
		///< initial position within the input scanline, used to produce edge
		///< pixels. This variable is used and should be defined when
		///< IsUpsample=false and ResampleFactor>0. When assigning this
		///< variable it is also necessary to update InPrefix, OutLen and
		///< Vars.o variables.
		///<
	static const int EdgePixelCountDef = 3; ///< The default number of pixels
		///< additionally produced at scanline edges during filtering. This is
		///< required to reduce edge artifacts.
		///<

	/**
	 * @brief Resizing position structure.
	 *
	 * Holds a single resizing position together with the pointer to its
	 * fractional delay filter.
	 */

	struct CResizePos
	{
		int SrcPosInt; ///< Source scanline position.
			///<
		int fti; ///< Fractional delay filter index.
			///<
		const fptype* ftp; ///< Fractional delay filter pointer.
			///<
		fptypeatom x; ///< Interpolation coefficient between delay filters.
			///<
		int SrcOffs; ///< Source scanline offset.
			///<
	};

	/**
	 * @brief Resizing positions buffer class.
	 *
	 * A buffer of resizing positions accompanied by the variables that
	 * define the resizing stepping the positions correspond to.
	 */

	class CRPosBuf : public CBuffer< CResizePos >
	{
	public:
		double k; ///< Resizing step.
			///<
		double o; ///< Resizing offset.
			///<
		int FracCount; ///< The number of fractional delay filters in a filter
			///< bank used together with this buffer.
			///<
	};

	/**
	 * @brief Resizing positions buffer array class.
	 *
	 * An array of CRPosBuf objects with a function that locates, or creates,
	 * the buffer with the required resizing stepping.
	 */

	class CRPosBufArray : public CStructArray< CRPosBuf >
	{
	public:
		using CStructArray< CRPosBuf > :: add;
		using CStructArray< CRPosBuf > :: getItemCount;

		/**
		 * Function returns the resizing positions buffer with the required
		 * stepping. The existing items are searched first, in index order,
		 * for an exact match of all three values; if none matches, a new
		 * item is added and tagged with these values.
		 *
		 * @param k Resizing step.
		 * @param o Resizing offset.
		 * @param FracCount The number of fractional delay filters in a filter
		 * bank used together with this buffer.
		 * @return Reference to the CRPosBuf object.
		 */

		CRPosBuf& getRPosBuf( const double k, const double o,
			const int FracCount )
		{
			const int Count = getItemCount();
			int Index = 0;

			while( Index < Count )
			{
				CRPosBuf& Cand = (*this)[ Index ];
				Index++;

				if( Cand.k != k || Cand.o != o ||
					Cand.FracCount != FracCount )
				{
					continue;
				}

				return( Cand );
			}

			// No match: append a new buffer tagged with the stepping.

			CRPosBuf& Added = add();
			Added.k = k;
			Added.o = o;
			Added.FracCount = FracCount;

			return( Added );
		}
	};

	CRPosBuf* RPosBuf; ///< Resizing positions buffer. Used when
		///< ResampleFactor equals 0 (resizing step).
		///<
	CDSPFracFilterBankLin< fptype >* FltBank; ///< Filter bank in use by *this
		///< resizing step.
		///<
};

/**
 * @brief Interleaved filtering steps implementation class.
 *
 * This class implements scanline filtering functions in interleaved mode.
 * This means that each pixel is processed independently, not in groups.
 *
 * @tparam fptype Floating point type to use for storing pixel elements. SIMD
 * types can be used: in this case each element may hold a whole pixel.
 * @tparam fptypeatom The atomic type the "fptype" consists of.
 */

template< class fptype, class fptypeatom >
class CImageResizerFilterStepINL :
	public CImageResizerFilterStep< fptype, fptypeatom >
{
public:
	using CImageResizerFilterStep< fptype, fptypeatom > :: IsUpsample;
	using CImageResizerFilterStep< fptype, fptypeatom > :: ResampleFactor;
	using CImageResizerFilterStep< fptype, fptypeatom > :: Flt;
	using CImageResizerFilterStep< fptype, fptypeatom > :: FltOrig;
	using CImageResizerFilterStep< fptype, fptypeatom > :: FltLatency;
	using CImageResizerFilterStep< fptype, fptypeatom > :: Vars;
	using CImageResizerFilterStep< fptype, fptypeatom > :: InLen;
	using CImageResizerFilterStep< fptype, fptypeatom > :: InPrefix;
	using CImageResizerFilterStep< fptype, fptypeatom > :: InSuffix;
	using CImageResizerFilterStep< fptype, fptypeatom > :: OutLen;
	using CImageResizerFilterStep< fptype, fptypeatom > :: OutPrefix;
	using CImageResizerFilterStep< fptype, fptypeatom > :: OutSuffix;
	using CImageResizerFilterStep< fptype, fptypeatom > :: PrefixDC;
	using CImageResizerFilterStep< fptype, fptypeatom > :: SuffixDC;
	using CImageResizerFilterStep< fptype, fptypeatom > :: RPosBuf;
	using CImageResizerFilterStep< fptype, fptypeatom > :: FltBank;
	using CImageResizerFilterStep< fptype, fptypeatom > :: EdgePixelCount;

	/**
	 * Function performs "packing" of a scanline and type conversion.
	 * Scanline, depending on the "fptype" can be potentially stored as a
	 * packed SIMD values having a certain atomic type. If required, the sRGB
	 * gamma correction is applied.
	 *
	 * @param ip Input scanline.
	 * @param op0 Output scanline.
	 * @param l0 The number of pixels to "pack".
	 */

	template< class Tin >
	void packScanline( const Tin* ip, fptype* const op0, const int l0 ) const
	{
		const int ElCount = Vars -> ElCount;
		const int ElCountIO = Vars -> ElCountIO;
		fptype* op = op0;
		int l = l0;

		if( !Vars -> UseSRGBGamma )
		{
			if( ElCountIO == 1 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = (fptypeatom) ip[ 0 ];
					op += ElCount;
					ip++;
					l--;
				}
			}
			else
			if( ElCountIO == 4 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = (fptypeatom) ip[ 0 ];
					v[ 1 ] = (fptypeatom) ip[ 1 ];
					v[ 2 ] = (fptypeatom) ip[ 2 ];
					v[ 3 ] = (fptypeatom) ip[ 3 ];
					op += ElCount;
					ip += 4;
					l--;
				}
			}
			else
			if( ElCountIO == 3 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = (fptypeatom) ip[ 0 ];
					v[ 1 ] = (fptypeatom) ip[ 1 ];
					v[ 2 ] = (fptypeatom) ip[ 2 ];
					op += ElCount;
					ip += 3;
					l--;
				}
			}
			else
			if( ElCountIO == 2 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = (fptypeatom) ip[ 0 ];
					v[ 1 ] = (fptypeatom) ip[ 1 ];
					op += ElCount;
					ip += 2;
					l--;
				}
			}
		}
		else
		{
			const fptypeatom gm = (fptypeatom) Vars -> InGammaMult;

			if( ElCountIO == 1 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = convertSRGB2Lin( (fptypeatom) ip[ 0 ] * gm );
					op += ElCount;
					ip++;
					l--;
				}
			}
			else
			if( ElCountIO == 4 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = convertSRGB2Lin( (fptypeatom) ip[ 0 ] * gm );
					v[ 1 ] = convertSRGB2Lin( (fptypeatom) ip[ 1 ] * gm );
					v[ 2 ] = convertSRGB2Lin( (fptypeatom) ip[ 2 ] * gm );
					v[ 3 ] = convertSRGB2Lin( (fptypeatom) ip[ 3 ] * gm );
					op += ElCount;
					ip += 4;
					l--;
				}
			}
			else
			if( ElCountIO == 3 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = convertSRGB2Lin( (fptypeatom) ip[ 0 ] * gm );
					v[ 1 ] = convertSRGB2Lin( (fptypeatom) ip[ 1 ] * gm );
					v[ 2 ] = convertSRGB2Lin( (fptypeatom) ip[ 2 ] * gm );
					op += ElCount;
					ip += 3;
					l--;
				}
			}
			else
			if( ElCountIO == 2 )
			{
				while( l > 0 )
				{
					fptypeatom* v = (fptypeatom*) op;
					v[ 0 ] = convertSRGB2Lin( (fptypeatom) ip[ 0 ] * gm );
					v[ 1 ] = convertSRGB2Lin( (fptypeatom) ip[ 1 ] * gm );
					op += ElCount;
					ip += 2;
					l--;
				}
			}
		}

		const int ZeroCount = ElCount * Vars -> fppack - ElCountIO;
		op = op0;
		l = l0;

		if( ZeroCount == 1 )
		{
			while( l > 0 )
			{
				fptypeatom* v = (fptypeatom*) op + ElCountIO;
				v[ 0 ] = (fptypeatom) 0;
				op += ElCount;
				l--;
			}
		}
		else
		if( ZeroCount == 2 )
		{
			while( l > 0 )
			{
				fptypeatom* v = (fptypeatom*) op + ElCountIO;
				v[ 0 ] = (fptypeatom) 0;
				v[ 1 ] = (fptypeatom) 0;
				op += ElCount;
				l--;
			}
		}
		else
		if( ZeroCount == 3 )
		{
			while( l > 0 )
			{
				fptypeatom* v = (fptypeatom*) op + ElCountIO;
				v[ 0 ] = (fptypeatom) 0;
				v[ 1 ] = (fptypeatom) 0;
				v[ 2 ] = (fptypeatom) 0;
				op += ElCount;
				l--;
			}
		}
	}

	/**
	 * Function applies Linear to sRGB gamma correction to the specified
	 * scanline.
	 *
	 * @param p Scanline.
	 * @param l The number of pixels to de-linearize.
	 * @param Vars0 Image resizing-related variables.
	 */

	static void applySRGBGamma( fptype* p, int l,
		const CImageResizerVars& Vars0 )
	{
		// Pixels are spaced Stride "fptype" elements apart; the first
		// ElCountIO atomic elements of every pixel are converted in place
		// with convertLin2SRGB() and then scaled by the output gamma
		// multiplier. Only ElCountIO values 1 to 4 are handled.

		const int Stride = Vars0.ElCount;
		const fptypeatom gm = (fptypeatom) Vars0.OutGammaMult;

		switch( Vars0.ElCountIO )
		{
		case 1:
			for( ; l > 0; l-- )
			{
				fptypeatom* const a = (fptypeatom*) p;
				a[ 0 ] = convertLin2SRGB( a[ 0 ]) * gm;
				p += Stride;
			}

			break;

		case 2:
			for( ; l > 0; l-- )
			{
				fptypeatom* const a = (fptypeatom*) p;
				a[ 0 ] = convertLin2SRGB( a[ 0 ]) * gm;
				a[ 1 ] = convertLin2SRGB( a[ 1 ]) * gm;
				p += Stride;
			}

			break;

		case 3:
			for( ; l > 0; l-- )
			{
				fptypeatom* const a = (fptypeatom*) p;
				a[ 0 ] = convertLin2SRGB( a[ 0 ]) * gm;
				a[ 1 ] = convertLin2SRGB( a[ 1 ]) * gm;
				a[ 2 ] = convertLin2SRGB( a[ 2 ]) * gm;
				p += Stride;
			}

			break;

		case 4:
			for( ; l > 0; l-- )
			{
				fptypeatom* const a = (fptypeatom*) p;
				a[ 0 ] = convertLin2SRGB( a[ 0 ]) * gm;
				a[ 1 ] = convertLin2SRGB( a[ 1 ]) * gm;
				a[ 2 ] = convertLin2SRGB( a[ 2 ]) * gm;
				a[ 3 ] = convertLin2SRGB( a[ 3 ]) * gm;
				p += Stride;
			}

			break;
		}
	}

	/**
	 * Function converts vertical scanline to horizontal scanline. This
	 * function is called by the image resizer when image is resized
	 * vertically. This means that the vertical scanline is stored in the
	 * same format produced by the packScanline() and maintained by other
	 * filtering functions.
	 *
	 * @param ip Input vertical scanline.
	 * @param op Output buffer (temporary buffer used during resizing).
	 * @param SrcLen The number of pixels in the input scanline, also used to
	 * calculate input buffer increment.
	 * @param SrcIncr Input buffer increment to the next vertical pixel.
	 */

	void convertVtoH( const fptype* ip, fptype* op, const int SrcLen,
		const int SrcIncr ) const
	{
		// Gathers SrcLen pixels that lie SrcIncr elements apart in the
		// input and stores them back to back in the output. Only element
		// counts 1 to 4 are handled.

		int Left = SrcLen;

		switch( Vars -> ElCount )
		{
		case 1:
			while( Left > 0 )
			{
				*op = *ip;
				op++;
				ip += SrcIncr;
				Left--;
			}

			break;

		case 2:
			while( Left > 0 )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op += 2;
				ip += SrcIncr;
				Left--;
			}

			break;

		case 3:
			while( Left > 0 )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op[ 2 ] = ip[ 2 ];
				op += 3;
				ip += SrcIncr;
				Left--;
			}

			break;

		case 4:
			while( Left > 0 )
			{
				op[ 0 ] = ip[ 0 ];
				op[ 1 ] = ip[ 1 ];
				op[ 2 ] = ip[ 2 ];
				op[ 3 ] = ip[ 3 ];
				op += 4;
				ip += SrcIncr;
				Left--;
			}

			break;
		}
	}

	/**
	 * Function performs "unpacking" of a scanline and type conversion
	 * (truncation is used when floating point is converted to integer).
	 * Scanline, depending on the "fptype" can be potentially stored as a
	 * packed SIMD values having a certain atomic type. The unpacking function
	 * assumes that scanline is stored in the style produced by the
	 * packScanline() function.
	 *
	 * @param ip Input scanline.
	 * @param op Output scanline.
	 * @param l The number of pixels to "unpack".
	 * @param Vars0 Image resizing-related variables.
	 */

	template< class Tout >
	static void unpackScanline( const fptype* ip, Tout* op, int l,
		const CImageResizerVars& Vars0 )
	{
		const int ElCount = Vars0.ElCount;
		const int ElCountIO = Vars0.ElCountIO;
                        /* dprintf(2, "BOOP ElCount=%d ElCountIO=%d l=%d op=%p\n", ElCount, ElCountIO, l, op); */

		if( ElCountIO == 1 )
		{
			while( l > 0 )
			{
				const fptypeatom* v = (const fptypeatom*) ip;
				op[ 0 ] = (Tout) v[ 0 ];
				ip += ElCount;
				op++;
				l--;
			}
		}
		else
		if( ElCountIO == 4 )
		{
			while( l > 0 )
			{
				const fptypeatom* v = (const fptypeatom*) ip;
				op[ 0 ] = (Tout) v[ 0 ];
				op[ 1 ] = (Tout) v[ 1 ];
				op[ 2 ] = (Tout) v[ 2 ];
				op[ 3 ] = (Tout) v[ 3 ];
				ip += ElCount;
				op += 4;
				l--;
			}
		}
		else
		if( ElCountIO == 3 )
		{
			while( l > 0 )
			{
				const fptypeatom* v = (const fptypeatom*) ip;
                                /* DebugBreak(); */
                        /* dprintf(2, "BOOP ElCount=%d ElCountIO=%d l=%d op=%p v=%p v[0]=%d\n", ElCount, ElCountIO, l, op, v, (Tout)v[0]); */
				op[ 0 ] = (Tout) v[ 0 ];
				op[ 1 ] = (Tout) v[ 1 ];
				op[ 2 ] = (Tout) v[ 2 ];
				ip += ElCount;
				op += 3;
				l--;
			}
		}
		else
		if( ElCountIO == 2 )
		{
			while( l > 0 )
			{
				const fptypeatom* v = (const fptypeatom*) ip;
				op[ 0 ] = (Tout) v[ 0 ];
				op[ 1 ] = (Tout) v[ 1 ];
				ip += ElCount;
				op += 2;
				l--;
			}
		}
	}

	/**
	 * Function prepares input scanline buffer for *this filtering step.
	 * Left- and right-most pixels are replicated to make sure no buffer
	 * overrun happens. This approach also makes it possible to bypass any
	 * pointer range checks.
	 *
	 * @param Src Source buffer.
	 */

	void prepareInBuf( fptype* Src ) const
	{
		// Padding is only produced for non-upsampling steps that actually
		// request a prefix and/or a suffix.

		const bool NeedsPadding = ( !IsUpsample &&
			InPrefix + InSuffix != 0 );

		if( NeedsPadding )
		{
			const int ElCount = Vars -> ElCount;

			// Prefix: the first pixel is the source, the destination starts
			// one pixel before it and steps backwards.

			replicateArray( Src, ElCount, Src - ElCount, InPrefix,
				-ElCount );

			// Suffix: the last pixel is the source, the destination starts
			// one pixel after it and steps forwards.

			Src += ( InLen - 1 ) * ElCount;

			replicateArray( Src, ElCount, Src + ElCount, InSuffix,
				ElCount );
		}
	}

	/**
	 * Function performs scanline upsampling with filtering.
	 *
	 * @param Src Source scanline buffer (length = this -> InLen). Source
	 * scanline increment will be equal to ElCount.
	 * @param Dst Destination scanline buffer.
	 */

	void doUpsample( const fptype* const Src, fptype* const Dst ) const
	{
		const int ElCount = Vars -> ElCount;

		// The output area starts OutPrefix pixels before Dst. The whole
		// area (prefix + body + suffix) is cleared first because the
		// filtering code below accumulates into it with "+=".

		fptype* op0 = &Dst[ -OutPrefix * ElCount ];
		memset( op0, 0, ( OutPrefix + OutLen + OutSuffix ) * ElCount *
			sizeof( fptype ));

		const fptype* ip = Src;
		const int opstep = ElCount * ResampleFactor; // Output advance per
			// input pixel, in "fptype" elements.
		int l;

		if( FltOrig.getCapacity() > 0 )
		{
			// Do not perform filtering, only upsample.
			//
			// Source pixels are written to every ResampleFactor-th output
			// pixel, the positions in between keep the zeros stored by the
			// memset() above. Each per-ElCount variant runs three loops:
			// the first source pixel repeated OutPrefix / ResampleFactor
			// times, then InLen - 1 source pixels with the input pointer
			// advancing, then the last source pixel repeated
			// OutSuffix / ResampleFactor + 1 times (note the "l >= 0").

			op0 += ( OutPrefix % ResampleFactor ) * ElCount;
			l = OutPrefix / ResampleFactor;

			if( ElCount == 1 )
			{
				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0 += opstep;
					l--;
				}

				l = InLen - 1;

				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0 += opstep;
					ip += ElCount;
					l--;
				}

				l = OutSuffix / ResampleFactor;

				while( l >= 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0 += opstep;
					l--;
				}
			}
			else
			if( ElCount == 4 )
			{
				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0[ 3 ] = ip[ 3 ];
					op0 += opstep;
					l--;
				}

				l = InLen - 1;

				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0[ 3 ] = ip[ 3 ];
					op0 += opstep;
					ip += ElCount;
					l--;
				}

				l = OutSuffix / ResampleFactor;

				while( l >= 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0[ 3 ] = ip[ 3 ];
					op0 += opstep;
					l--;
				}
			}
			else
			if( ElCount == 3 )
			{
				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0 += opstep;
					l--;
				}

				l = InLen - 1;

				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0 += opstep;
					ip += ElCount;
					l--;
				}

				l = OutSuffix / ResampleFactor;

				while( l >= 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0[ 2 ] = ip[ 2 ];
					op0 += opstep;
					l--;
				}
			}
			else
			if( ElCount == 2 )
			{
				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0 += opstep;
					l--;
				}

				l = InLen - 1;

				while( l > 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0 += opstep;
					ip += ElCount;
					l--;
				}

				l = OutSuffix / ResampleFactor;

				while( l >= 0 )
				{
					op0[ 0 ] = ip[ 0 ];
					op0[ 1 ] = ip[ 1 ];
					op0 += opstep;
					l--;
				}
			}

			return;
		}

		// Filtering path. Every input pixel adds a copy of the filter,
		// scaled by the pixel's value, to the output starting at "op0";
		// "op0" then advances by "opstep". As above, three loops are run
		// per ElCount variant: InPrefix repetitions of the first pixel,
		// InLen - 1 pixels with the input pointer advancing, and
		// InSuffix + 1 repetitions of the last pixel.

		const fptype* const f = Flt;
		const int flen = Flt.getCapacity();
		fptype* op;
		int i;

		if( ElCount == 1 )
		{
			l = InPrefix;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ i ] += f[ i ] * ip[ 0 ];
				}

				op0 += opstep;
				l--;
			}

			l = InLen - 1;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ i ] += f[ i ] * ip[ 0 ];
				}

				ip += ElCount;
				op0 += opstep;
				l--;
			}

			l = InSuffix;

			while( l >= 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ i ] += f[ i ] * ip[ 0 ];
				}

				op0 += opstep;
				l--;
			}
		}
		else
		if( ElCount == 4 )
		{
			l = InPrefix;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op[ 3 ] += f[ i ] * ip[ 3 ];
					op += 4;
				}

				op0 += opstep;
				l--;
			}

			l = InLen - 1;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op[ 3 ] += f[ i ] * ip[ 3 ];
					op += 4;
				}

				ip += ElCount;
				op0 += opstep;
				l--;
			}

			l = InSuffix;

			while( l >= 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op[ 3 ] += f[ i ] * ip[ 3 ];
					op += 4;
				}

				op0 += opstep;
				l--;
			}
		}
		else
		if( ElCount == 3 )
		{
			l = InPrefix;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op += 3;
				}

				op0 += opstep;
				l--;
			}

			l = InLen - 1;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op += 3;
				}

				ip += ElCount;
				op0 += opstep;
				l--;
			}

			l = InSuffix;

			while( l >= 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op[ 2 ] += f[ i ] * ip[ 2 ];
					op += 3;
				}

				op0 += opstep;
				l--;
			}
		}
		else
		if( ElCount == 2 )
		{
			l = InPrefix;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op += 2;
				}

				op0 += opstep;
				l--;
			}

			l = InLen - 1;

			while( l > 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op += 2;
				}

				ip += ElCount;
				op0 += opstep;
				l--;
			}

			l = InSuffix;

			while( l >= 0 )
			{
				op = op0;

				for( i = 0; i < flen; i++ )
				{
					op[ 0 ] += f[ i ] * ip[ 0 ];
					op[ 1 ] += f[ i ] * ip[ 1 ];
					op += 2;
				}

				op0 += opstep;
				l--;
			}
		}

		// Suffix correction: at this point "op0" has been advanced past
		// the last filter start position and "ip" still points to the last
		// source pixel. The SuffixDC coefficients, scaled by that pixel,
		// are added to the output from "op0" onwards.

		op = op0;
		const fptype* dc = SuffixDC;
		l = SuffixDC.getCapacity();

		if( ElCount == 1 )
		{
			for( i = 0; i < l; i++ )
			{
				op[ i ] += ip[ 0 ] * dc[ i ];
			}
		}
		else
		if( ElCount == 4 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				op[ 2 ] += ip[ 2 ] * dc[ 0 ];
				op[ 3 ] += ip[ 3 ] * dc[ 0 ];
				dc++;
				op += 4;
				l--;
			}
		}
		else
		if( ElCount == 3 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				op[ 2 ] += ip[ 2 ] * dc[ 0 ];
				dc++;
				op += 3;
				l--;
			}
		}
		else
		if( ElCount == 2 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				dc++;
				op += 2;
				l--;
			}
		}

		// Prefix correction: the PrefixDC coefficients, scaled by the
		// first source pixel, are added to the output starting
		// InPrefix * opstep elements before Dst.

		ip = Src;
		op = Dst - InPrefix * opstep;
		dc = PrefixDC;
		l = PrefixDC.getCapacity();

		if( ElCount == 1 )
		{
			for( i = 0; i < l; i++ )
			{
				op[ i ] += ip[ 0 ] * dc[ i ];
			}
		}
		else
		if( ElCount == 4 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				op[ 2 ] += ip[ 2 ] * dc[ 0 ];
				op[ 3 ] += ip[ 3 ] * dc[ 0 ];
				dc++;
				op += 4;
				l--;
			}
		}
		else
		if( ElCount == 3 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				op[ 2 ] += ip[ 2 ] * dc[ 0 ];
				dc++;
				op += 3;
				l--;
			}
		}
		else
		if( ElCount == 2 )
		{
			while( l > 0 )
			{
				op[ 0 ] += ip[ 0 ] * dc[ 0 ];
				op[ 1 ] += ip[ 1 ] * dc[ 0 ];
				dc++;
				op += 2;
				l--;
			}
		}
	}

	/**
	 * Function performs scanline filtering with optional downsampling.
	 * Function makes use of the symmetry of the filter.
	 *
	 * @param Src Source scanline buffer (length = this -> InLen). Source
	 * scanline increment will be equal to ElCount.
	 * @param Dst Destination scanline buffer.
	 * @param DstIncr Destination scanline buffer increment, used for
	 * horizontal or vertical scanline stepping.
	 */

	void doFilter( const fptype* const Src, fptype* Dst,
		const int DstIncr ) const
	{
		const int ElCount = Vars -> ElCount;
		const fptype* const f = &Flt[ FltLatency ];
		const int flen = FltLatency + 1;
		const int ipstep = ElCount * ResampleFactor;
		const fptype* ip = Src - EdgePixelCount * ipstep;
		const fptype* ip1;
		const fptype* ip2;
		int l = OutLen;
		int i;

		if( ElCount == 1 )
		{
			while( l > 0 )
			{
				fptype s = f[ 0 ] * ip[ 0 ];
				ip1 = ip;
				ip2 = ip;

				for( i = 1; i < flen; i++ )
				{
					ip1++;
					ip2--;
					s += f[ i ] * ( ip1[ 0 ] + ip2[ 0 ]);
				}

				Dst[ 0 ] = s;
				Dst += DstIncr;
				ip += ipstep;
				l--;
			}
		}
		else
		if( ElCount == 4 )
		{
			while( l > 0 )
			{
				fptype s1 = f[ 0 ] * ip[ 0 ];
				fptype s2 = f[ 0 ] * ip[ 1 ];
				fptype s3 = f[ 0 ] * ip[ 2 ];
				fptype s4 = f[ 0 ] * ip[ 3 ];
				ip1 = ip;
				ip2 = ip;

				for( i = 1; i < flen; i++ )
				{
					ip1 += 4;
					ip2 -= 4;
					s1 += f[ i ] * ( ip1[ 0 ] + ip2[ 0 ]);
					s2 += f[ i ] * ( ip1[ 1 ] + ip2[ 1 ]);
					s3 += f[ i ] * ( ip1[ 2 ] + ip2[ 2 ]);
					s4 += f[ i ] * ( ip1[ 3 ] + ip2[ 3 ]);
				}

				Dst[ 0 ] = s1;
				Dst[ 1 ] = s2;
				Dst[ 2 ] = s3;
				Dst[ 3 ] = s4;
				Dst += DstIncr;
				ip += ipstep;
				l--;
			}
		}
		else
		if( ElCount == 3 )
		{
			while( l > 0 )
			{
				fptype s1 = f[ 0 ] * ip[ 0 ];
				fptype s2 = f[ 0 ] * ip[ 1 ];
				fptype s3 = f[ 0 ] * ip[ 2 ];
				ip1 = ip;
				ip2 = ip;

				for( i = 1; i < flen; i++ )
				{
					ip1 += 3;
					ip2 -= 3;
					s1 += f[ i ] * ( ip1[ 0 ] + ip2[ 0 ]);
					s2 += f[ i ] * ( ip1[ 1 ] + ip2[ 1 ]);
					s3 += f[ i ] * ( ip1[ 2 ] + ip2[ 2 ]);
				}

				Dst[ 0 ] = s1;
				Dst[ 1 ] = s2;
				Dst[ 2 ] = s3;
				Dst += DstIncr;
				ip += ipstep;
				l--;
			}
		}
		else
		if( ElCount == 2 )
		{
			while( l > 0 )
			{
				fptype s1 = f[ 0 ] * ip[ 0 ];
				fptype s2 = f[ 0 ] * ip[ 1 ];
				ip1 = ip;
				ip2 = ip;

				for( i = 1; i < flen; i++ )
				{
					ip1 += 2;
					ip2 -= 2;
					s1 += f[ i ] * ( ip1[ 0 ] + ip2[ 0 ]);
					s2 += f[ i ] * ( ip1[ 1 ] + ip2[ 1 ]);
				}

				Dst[ 0 ] = s1;
				Dst[ 1 ] = s2;
				Dst += DstIncr;
				ip += ipstep;
				l--;
			}
		}
	}

	/**
	 * Function performs resizing of a single scanline. This function does
	 * not "know" about the length of the source scanline buffer. This buffer
	 * should be padded with enough pixels so that ( SrcPos - FilterLenD2 ) is
	 * always >= 0 and ( SrcPos + ( DstLineLen - 1 ) * k + FilterLenD2 + 1 )
	 * does not exceed source scanline's buffer length. SrcLine's increment is
	 * assumed to be equal to ElCount.
	 *
	 * @param SrcLine Source scanline buffer.
	 * @param DstLine Destination (resized) scanline buffer.
	 * @param DstLineIncr Destination scanline position increment, used for
	 * horizontal or vertical scanline stepping.
	 * @param xx Temporary buffer, of size FltBank -> getFilterLen(), must be
	 * aligned by fpclass :: fpalign. This implementation leaves the
	 * parameter unnamed and does not use it.
	 */

	void doResize( const fptype* SrcLine, fptype* DstLine,
		const int DstLineIncr, fptype* const ) const
	{
		const int IntFltLen = FltBank -> getFilterLen();
		const int ElCount = Vars -> ElCount;
		const typename CImageResizerFilterStep< fptype, fptypeatom > ::
			CResizePos* rpos = &(*RPosBuf)[ 0 ];

		const typename CImageResizerFilterStep< fptype, fptypeatom > ::
			CResizePos* const rpose = rpos + OutLen;

#define AVIR_RESIZE_PART1 \
			while( rpos < rpose ) \
			{ \
				const fptype x = (fptype) rpos -> x; \
				const fptype* const ftp = rpos -> ftp; \
				const fptype* const ftp2 = ftp + IntFltLen; \
				const fptype* Src = SrcLine + rpos -> SrcOffs; \
				int i;

#define AVIR_RESIZE_PART1nx \
			while( rpos < rpose ) \
			{ \
				const fptype* const ftp = rpos -> ftp; \
				const fptype* Src = SrcLine + rpos -> SrcOffs; \
				int i;

#define AVIR_RESIZE_PART2 \
				DstLine += DstLineIncr; \
				rpos++; \
			}

		if( FltBank -> getOrder() == 1 )
		{
			if( ElCount == 1 )
			{
				AVIR_RESIZE_PART1

				fptype sum = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					sum += ( ftp[ i ] + ftp2[ i ] * x ) * Src[ i ];
				}

				DstLine[ 0 ] = sum;

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 4 )
			{
				AVIR_RESIZE_PART1

				fptype sum[ 4 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;
				sum[ 2 ] = 0.0;
				sum[ 3 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ] + ftp2[ i ] * x;
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					sum[ 2 ] += xx * Src[ 2 ];
					sum[ 3 ] += xx * Src[ 3 ];
					Src += 4;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];
				DstLine[ 2 ] = sum[ 2 ];
				DstLine[ 3 ] = sum[ 3 ];

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 3 )
			{
				AVIR_RESIZE_PART1

				fptype sum[ 3 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;
				sum[ 2 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ] + ftp2[ i ] * x;
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					sum[ 2 ] += xx * Src[ 2 ];
					Src += 3;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];
				DstLine[ 2 ] = sum[ 2 ];

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 2 )
			{
				AVIR_RESIZE_PART1

				fptype sum[ 2 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ] + ftp2[ i ] * x;
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					Src += 2;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];

				AVIR_RESIZE_PART2
			}
		}
		else
		{
			if( ElCount == 1 )
			{
				AVIR_RESIZE_PART1nx

				fptype sum = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					sum += ftp[ i ] * Src[ i ];
				}

				DstLine[ 0 ] = sum;

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 4 )
			{
				AVIR_RESIZE_PART1nx

				fptype sum[ 4 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;
				sum[ 2 ] = 0.0;
				sum[ 3 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ];
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					sum[ 2 ] += xx * Src[ 2 ];
					sum[ 3 ] += xx * Src[ 3 ];
					Src += 4;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];
				DstLine[ 2 ] = sum[ 2 ];
				DstLine[ 3 ] = sum[ 3 ];

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 3 )
			{
				AVIR_RESIZE_PART1nx

				fptype sum[ 3 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;
				sum[ 2 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ];
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					sum[ 2 ] += xx * Src[ 2 ];
					Src += 3;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];
				DstLine[ 2 ] = sum[ 2 ];

				AVIR_RESIZE_PART2
			}
			else
			if( ElCount == 2 )
			{
				AVIR_RESIZE_PART1nx

				fptype sum[ 2 ];
				sum[ 0 ] = 0.0;
				sum[ 1 ] = 0.0;

				for( i = 0; i < IntFltLen; i++ )
				{
					const fptype xx = ftp[ i ];
					sum[ 0 ] += xx * Src[ 0 ];
					sum[ 1 ] += xx * Src[ 1 ];
					Src += 2;
				}

				DstLine[ 0 ] = sum[ 0 ];
				DstLine[ 1 ] = sum[ 1 ];

				AVIR_RESIZE_PART2
			}
		}
	}
#undef AVIR_RESIZE_PART2
#undef AVIR_RESIZE_PART1nx
#undef AVIR_RESIZE_PART1
};

/**
 * @brief Image resizer's default dithering class.
 *
 * This class defines an object that performs rounding, clipping and dithering
 * operations over horizontal scanline pixels before scanline is stored in the
 * output buffer.
 *
 * The ditherer should expect the same storage order of the pixels in a
 * scanline as used in the "filtering step" class. So, a separate ditherer
 * class should be defined for each scanline pixel storage style. The default
 * ditherer implements a simple rounding without dithering: it can be used for
 * an efficient dithering method which can be multi-threaded.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template< class fptype >
class CImageResizerDithererDefINL
{
public:
	/**
	 * Function initializes the ditherer object.
	 *
	 * @param aLen Scanline length in pixels to process.
	 * @param aVars Image resizing-related variables.
	 * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
	 * truncation.
	 * @param aPkOut Peak output value allowed.
	 */

	void init( const int aLen, const CImageResizerVars& aVars,
		const double aTrMul, const double aPkOut )
	{
		Len = aLen;
		Vars = &aVars;
		LenE = aLen * Vars -> ElCount;
		TrMul0 = aTrMul;
		PkOut0 = aPkOut;
	}

	/**
	 * @return "True" if dithering is recursive relative to scanlines meaning
	 * multi-threaded execution is not supported by this dithering method.
	 */

	static bool isRecursive()
	{
		return( false );
	}

	/**
	 * Function performs rounding and clipping operations.
	 *
	 * @param ResScanline The buffer containing the final scanline.
	 */

	void dither( fptype* const ResScanline ) const
	{
		const fptype c0 = 0.0;
		const fptype PkOut = (fptype) PkOut0;
		int j;

		if( TrMul0 == 1.0 )
		{
			// Optimization - do not perform bit depth truncation.

			for( j = 0; j < LenE; j++ )
			{
				ResScanline[ j ] = clamp( round( ResScanline[ j ]), c0,
					PkOut );
			}
		}
		else
		{
			const fptype TrMul = (fptype) TrMul0;

			for( j = 0; j < LenE; j++ )
			{
				const fptype z0 = round( ResScanline[ j ] / TrMul ) * TrMul;
				ResScanline[ j ] = clamp( z0, c0, PkOut );
			}
		}
	}

protected:
	int Len; ///< Scanline's length in pixels.
		///<
	const CImageResizerVars* Vars; ///< Image resizing-related variables.
		///<
	int LenE; ///< = LenE * ElCount.
		///<
	double TrMul0; ///< Bit-depth truncation multiplier.
		///<
	double PkOut0; ///< Peak output value allowed.
		///<
};

/**
 * @brief Image resizer's error-diffusion dithering class, interleaved mode.
 *
 * This ditherer implements error-diffusion dithering which looks good, and
 * whose results are compressed by PNG well. This implementation uses
 * weighting coefficients obtained via machine optimization and visual
 * evaluation.
 *
 * @tparam fptype Floating point type to use for storing pixel data. SIMD
 * types can be used.
 */

template< class fptype >
class CImageResizerDithererErrdINL :
	public CImageResizerDithererDefINL< fptype >
{
public:
	/**
	 * Function initializes the base ditherer and allocates a zero-filled
	 * error diffusion buffer that is one pixel (ElCount elements) longer
	 * than the scanline.
	 *
	 * @param aLen Scanline length in pixels to process.
	 * @param aVars Image resizing-related variables.
	 * @param aTrMul Bit-depth truncation multiplier. 1 - no additional
	 * truncation.
	 * @param aPkOut Peak output value allowed.
	 */

	void init( const int aLen, const CImageResizerVars& aVars,
		const double aTrMul, const double aPkOut )
	{
		CImageResizerDithererDefINL< fptype > :: init( aLen, aVars, aTrMul,
			aPkOut );

		const int Pad = Vars -> ElCount;
		const int Total = LenE + Pad;

		ResScanlineDith0.alloc( Total, sizeof( fptype ));

		// The working pointer skips the leading pad pixel, which makes the
		// negative "j - ElCount" indices used by dither() valid.

		ResScanlineDith = ResScanlineDith0 + Pad;

		for( int i = 0; i < Total; i++ )
		{
			ResScanlineDith0[ i ] = 0.0;
		}
	}

	/**
	 * @return Always "true": the error stored while processing one scanline
	 * is added to the scanline passed to the next dither() call.
	 */

	static bool isRecursive()
	{
		return( true );
	}

	/**
	 * Function performs rounding, clipping and error diffusion, in place.
	 * The error carried over from the previous call is applied first. The
	 * rounding error of every element is then distributed to the same
	 * element of the next pixel of this scanline and to three neighbouring
	 * positions of the error buffer, which is applied on the next call.
	 *
	 * @param ResScanline The buffer containing the final scanline.
	 */

	void dither( fptype* const ResScanline )
	{
		const int ElCount = Vars -> ElCount;
		const int BodyEnd = LenE - ElCount;
		const fptype c0 = 0.0;
		const fptype TrMul = (fptype) TrMul0;
		const fptype PkOut = (fptype) PkOut0;
		int j = 0;

		// Apply and reset the error accumulated by the previous call.

		while( j < LenE )
		{
			ResScanline[ j ] += ResScanlineDith[ j ];
			ResScanlineDith[ j ] = 0.0;
			j++;
		}

		// All pixels except the last one: the error also goes to the next
		// pixel of the current scanline and to the position after it in
		// the error buffer.

		j = 0;

		while( j < BodyEnd )
		{
			const fptype z0 = round( ResScanline[ j ] / TrMul ) * TrMul;
			const fptype Noise = ResScanline[ j ] - z0;
			ResScanline[ j ] = clamp( z0, c0, PkOut );

			ResScanline[ j + ElCount ] += Noise * (fptype) 0.364842;
			ResScanlineDith[ j - ElCount ] += Noise * (fptype) 0.207305;
			ResScanlineDith[ j ] += Noise * (fptype) 0.364842;
			ResScanlineDith[ j + ElCount ] += Noise * (fptype) 0.063011;
			j++;
		}

		// Last pixel: there is no following pixel, so only the two error
		// buffer positions that exist are updated.

		for( ; j < LenE; j++ )
		{
			const fptype z0 = round( ResScanline[ j ] / TrMul ) * TrMul;
			const fptype Noise = ResScanline[ j ] - z0;
			ResScanline[ j ] = clamp( z0, c0, PkOut );

			ResScanlineDith[ j - ElCount ] += Noise * (fptype) 0.207305;
			ResScanlineDith[ j ] += Noise * (fptype) 0.364842;
		}
	}

protected:
	using CImageResizerDithererDefINL< fptype > :: Len;
	using CImageResizerDithererDefINL< fptype > :: Vars;
	using CImageResizerDithererDefINL< fptype > :: LenE;
	using CImageResizerDithererDefINL< fptype > :: TrMul0;
	using CImageResizerDithererDefINL< fptype > :: PkOut0;

	CBuffer< fptype > ResScanlineDith0; ///< Error diffusion buffer,
		///< LenE + ElCount elements long.
		///<
	fptype* ResScanlineDith; ///< Error diffusion buffer pointer which skips
		///< the first ElCount elements.
		///<
};

/**
 * @brief Floating-point processing definition and abstraction class.
 *
 * This class defines several constants and typedefs that point to classes
 * that should be used by the image resizing algorithm. Such "definition
 * class" can be used to define alternative scanline processing algorithms
 * (e.g. SIMD) and image scanline packing styles used during processing. This
 * class also offers an abstraction layer for dithering, rounding and
 * clamping (saturation) operation.
 *
 * The fpclass_def class can be used to define processing using both SIMD and
 * non-SIMD types, but using algorithms that operate on interleaved pixels
 * and are not SIMD-optimized themselves.
 *
 * @tparam afptype Floating point type to use for storing intermediate data
 * and variables. For variables that are not used in intensive calculations
 * the "double" type is always used. On the latest Intel processors (like
 * i7-4770K) there is almost no performance difference between "double" and
 * "float". Image quality differences between "double" and "float" are not
 * apparent on 8-bit images. At the same time the "float" uses half amount of
 * working memory the "double" type uses. SIMD types can be used. The
 * functions round() and clamp() in the "avir" or other visible namespace
 * should be available for the specified type. SIMD types allow to perform
 * resizing of images with more than 4 channels, to be exact 4 * SIMD element
 * number (e.g. 16 for float4), without modification of the image resizing
 * algorithm required.
 * @tparam afptypeatom The atomic type the "afptype" consists of.
 * @tparam adith Ditherer class to use during processing.
 */

template< class afptype, class afptypeatom = afptype,
	class adith = CImageResizerDithererDefINL< afptype > >
class fpclass_def
{
public:
	// Types selected by this definition class.

	typedef afptype fptype; ///< Type used to store intermediate data during
		///< processing (the "afptype" template argument).
		///<
	typedef afptypeatom fptypeatom; ///< Atomic type the "fptype" is made of
		///< (the "afptypeatom" template argument).
		///<
	typedef CImageResizerFilterStepINL< fptype, fptypeatom > CFilterStep;
		///< Filtering step class the resizer uses with this definition.
		///<
	typedef adith CDitherer; ///< Ditherer class the resizer uses with this
		///< definition (the "adith" template argument).
		///<

	// Packing and alignment constants.

	static const int fppack = sizeof( fptype ) / sizeof( fptypeatom );
		///< How many "fptypeatom" values a single "fptype" element holds.
		///<
	static const int fpalign = sizeof( fptype ); ///< Suggested (not
		///< mandatory) alignment in bytes. It cannot be made mandatory since
		///< not every step of the resizing algorithm can access data in a
		///< strictly aligned way (e.g. interpolation cannot).
		///<
	static const int elalign = 1; ///< Length alignment, in elements, applied
		///< to filters and intermediate buffers: their lengths are forced to
		///< be a multiple of this value, which helps SIMD implementations.
		///<
	static const int packmode = 0; ///< Pixel packing style: 0 means
		///< interleaved, 1 means de-interleaved.
		///<
};

/**
 * @brief Image resizer class.
 *
 * The object of this class can be used to resize 1-4 channel images to any
 * required size. Resizing is performed by utilizing interpolated sinc
 * fractional delay filters plus (if necessary) a cascade of built-in
 * sinc function-based 2X upsampling or 2X downsampling stages, followed by a
 * correction filtering.
 *
 * Object of this class can be allocated on stack.
 *
 * @tparam fpclass Floating-point processing definition class to use. See
 * avir::fpclass_def for more details.
 */

template< class fpclass = fpclass_def< float > >
class CImageResizer
{
public:
	/**
	 * Constructor initializes the resizer.
	 *
	 * @param aResBitDepth Required bit depth of resulting image (1-16). If
	 * integer value output is used (e.g. uint8_t), the bit depth also affects
	 * rounding: for example, if aResBitDepth=6 and "Tout" is uint8_t, the
	 * result will be rounded to 6 most significant bits (2 least significant
	 * bits truncated, with dithering applied).
	 * @param aSrcBitDepth Source image's real bit-depth. Set to 0 to use
	 * aResBitDepth.
	 * @param aParams Resizing algorithm's parameters to use. Leave out for
	 * default values. Can be useful when performing automatic optimization of
	 * parameters.
	 */

	CImageResizer( const int aResBitDepth = 8, const int aSrcBitDepth = 0,
		const CImageResizerParams& aParams = CImageResizerParamsDef() )
		: Params( aParams )
		, SrcBitDepth( aSrcBitDepth == 0 ? aResBitDepth : aSrcBitDepth )
		, ResBitDepth( aResBitDepth )
	{
		// A zero source bit depth means "same as the result bit depth"; this
		// is resolved in the initializer list above.
		//
		// The fixed filter bank is set up once here, with a cutoff
		// multiplier of 1.0, without forcing high-order interpolation and
		// with an empty external filter. createAllFilters() is then called
		// on it right away.

		initFilterBank( FixedFilterBank, 1.0, false, CFltBuffer() );
		FixedFilterBank.createAllFilters();
	}

	/**
	 * Function resizes image.
	 *
	 * @param SrcBuf Source image buffer.
	 * @param SrcWidth Source image width.
	 * @param SrcHeight Source image height.
	 * @param SrcScanlineSize Physical size of source scanline in elements
	 * (not bytes). If this value is below 1, SrcWidth * ElCountIO will be
	 * used as the physical source scanline size.
	 * @param[out] NewBuf Buffer to accept the resized image. Can be equal to
	 * SrcBuf if the size of the resized image is smaller or equal to source
	 * image in size.
	 * @param NewWidth New image width.
	 * @param NewHeight New image height.
	 * @param ElCountIO The number of elements (channels) used to store each
	 * source and destination pixel (1-4).
	 * @param k Resizing step (one output pixel corresponds to "k" input
	 * pixels). A downsizing factor if > 1.0; upsizing factor if <= 1.0.
	 * Multiply by -1 if you would like to bypass "ox" and "oy" adjustment
	 * which is done by default to produce a centered image. If step value
	 * equals 0, the step value will be chosen automatically and independently
	 * for horizontal and vertical resizing.
	 * @param[in,out] aVars Pointer to variables structure to be passed to the
	 * image resizing function. Can be NULL. Only variables that are
	 * initialized in default constructor of this structure are accepted by
	 * this function. These variables will not be changed by this function.
	 * All other variables can be modified by this function. The access to
	 * this object is not thread-safe, each concurrent instance of this
	 * function should use a separate aVars object.
	 * @tparam Tin Input buffer element's type. Can be uint8_t (0-255 value
	 * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
	 * double (0.0-1.0 value range). Larger integer types are treated as
	 * uint16_t. Signed integer types are unsupported.
	 * @tparam Tout Output buffer element's type. Can be uint8_t (0-255 value
	 * range), uint16_t (0-65535 value range), float (0.0-1.0 value range),
	 * double (0.0-1.0 value range). Larger integer types are treated as
	 * uint16_t. Signed integer types are unsupported.
	 */

	template< class Tin, class Tout >
	void resizeImage( const Tin* const SrcBuf, const int SrcWidth,
		const int SrcHeight, int SrcScanlineSize, Tout* const NewBuf,
		const int NewWidth, const int NewHeight, const int ElCountIO,
		const double k, CImageResizerVars* const aVars = NULL ) const
	{
		// Overall flow: (1) choose the resizing steps and offsets, (2) filter
		// and resize every source scanline horizontally into FltBuf,
		// (3) filter and resize every column of FltBuf vertically, (4) unpack
		// the result into NewBuf, with dithering for integer output types.

		if( SrcWidth == 0 || SrcHeight == 0 )
		{
			// Empty source: produce an all-zero output image. The output
			// holds ElCountIO elements per pixel, so the element count has to
			// be part of the cleared size, otherwise only a fraction of a
			// multi-channel output image would be zeroed.

			memset( NewBuf, 0, (size_t) NewWidth * NewHeight * ElCountIO *
				sizeof( Tout ));

			return;
		}
		else
		if( NewWidth == 0 || NewHeight == 0 )
		{
			// Empty destination: nothing to write.

			return;
		}

		// Use caller-supplied variables and thread pool when available,
		// otherwise fall back to local default objects.

		CImageResizerVars DefVars;
		CImageResizerVars& Vars = ( aVars == NULL ? DefVars : *aVars );

		CImageResizerThreadPool DefThreadPool;
		CImageResizerThreadPool& ThreadPool = ( Vars.ThreadPool == NULL ?
			DefThreadPool : *Vars.ThreadPool );

		// Define resizing steps, also optionally modify offsets so that
		// resizing produces a "centered" image.

		double kx;
		double ky;
		double ox = Vars.ox;
		double oy = Vars.oy;

		if( k == 0.0 )
		{
			// Automatic step selection, independent for each dimension.
			// When enlarging, the first and last pixels of source and
			// destination are mapped onto each other; otherwise the plain
			// size ratio is used together with a centering offset.

			if( NewWidth > SrcWidth )
			{
				kx = (double) ( SrcWidth - 1 ) / ( NewWidth - 1 );
			}
			else
			{
				kx = (double) SrcWidth / NewWidth;
				ox += ( kx - 1.0 ) * 0.5;
			}

			if( NewHeight > SrcHeight )
			{
				ky = (double) ( SrcHeight - 1 ) / ( NewHeight - 1 );
			}
			else
			{
				ky = (double) SrcHeight / NewHeight;
				oy += ( ky - 1.0 ) * 0.5;
			}
		}
		else
		if( k > 0.0 )
		{
			// Explicit step; the centering offset is applied when
			// downsizing only.

			kx = k;
			ky = k;

			if( k > 1.0 )
			{
				const double ko = ( k - 1.0 ) * 0.5;
				ox += ko;
				oy += ko;
			}
		}
		else
		{
			// Negative step: use its magnitude and leave the offsets as is.

			kx = -k;
			ky = -k;
		}

		// Evaluate pre-multipliers used on the output stage.

		const bool IsInFloat = ( (Tin) 0.4 != 0 ); // 0.4 converts to 0 for
			// integer types, so this is "true" for floating-point types only.
		const bool IsOutFloat = ( (Tout) 0.4 != 0 );
		double OutMul; // Output multiplier.

		if( Vars.UseSRGBGamma )
		{
			// With gamma correction enabled the range scaling is stored in
			// the gamma multipliers, and the filters use a unity gain.

			if( IsInFloat )
			{
				Vars.InGammaMult = 1.0;
			}
			else
			{
				Vars.InGammaMult =
					1.0 / ( sizeof( Tin ) == 1 ? 255.0 : 65535.0 );
			}

			if( IsOutFloat )
			{
				Vars.OutGammaMult = 1.0;
			}
			else
			{
				Vars.OutGammaMult = ( sizeof( Tout ) == 1 ? 255.0 : 65535.0 );
			}

			OutMul = 1.0;
		}
		else
		{
			// Without gamma correction the input-to-output range conversion
			// is folded into a single multiplier.

			if( IsOutFloat )
			{
				OutMul = 1.0;
			}
			else
			{
				OutMul = ( sizeof( Tout ) == 1 ? 255.0 : 65535.0 );
			}

			if( !IsInFloat )
			{
				OutMul /= ( sizeof( Tin ) == 1 ? 255.0 : 65535.0 );
			}
		}

		// Fill widely-used variables.

		const int ElCount = ( ElCountIO + fpclass :: fppack - 1 ) /
			fpclass :: fppack; // Number of "fptype" elements per pixel.

		const int NewWidthE = NewWidth * ElCount;

		if( SrcScanlineSize < 1 )
		{
			SrcScanlineSize = SrcWidth * ElCountIO;
		}

		Vars.ElCount = ElCount;
		Vars.ElCountIO = ElCountIO;
		Vars.fppack = fpclass :: fppack;
		Vars.fpalign = fpclass :: fpalign;
		Vars.elalign = fpclass :: elalign;
		Vars.packmode = fpclass :: packmode;

		// Horizontal scanline filtering and resizing.

		CDSPFracFilterBankLin< fptype > FltBank;
		CFilterSteps FltSteps;
		typename CFilterStep :: CRPosBufArray RPosBufArray;
		CBuffer< uint8_t > UsedFracMap;

		// Perform the filtering steps modeling at various modes, find the
		// most efficient mode for both horizontal and vertical resizing.

		int UseBuildMode = 1;
		const int BuildModeCount =
			( FixedFilterBank.getOrder() == 0 ? 4 : 2 );

		int m;

		if( Vars.BuildMode >= 0 )
		{
			UseBuildMode = Vars.BuildMode;
		}
		else
		{
			int BestScore = 0x7FFFFFFF;

			for( m = 0; m < BuildModeCount; m++ )
			{
				CDSPFracFilterBankLin< fptype > TmpBank;
				CFilterSteps TmpSteps;
				Vars.k = kx;
				Vars.o = ox;
				buildFilterSteps( TmpSteps, Vars, TmpBank, OutMul, m, true );
				updateFilterStepBuffers( TmpSteps, Vars, RPosBufArray,
					SrcWidth, NewWidth );

				fillUsedFracMap( TmpSteps[ Vars.ResizeStep ], UsedFracMap );
				const int c = calcComplexity( TmpSteps, Vars, UsedFracMap,
					SrcHeight );

				if( c < BestScore )
				{
					UseBuildMode = m;
					BestScore = c;
				}
			}
		}

		// Perform the actual filtering steps building.

		Vars.k = kx;
		Vars.o = ox;
		buildFilterSteps( FltSteps, Vars, FltBank, OutMul, UseBuildMode,
			false );

		updateFilterStepBuffers( FltSteps, Vars, RPosBufArray, SrcWidth,
			NewWidth );

		updateBufLenAndRPosPtrs( FltSteps, Vars, NewWidth );

		const int ThreadCount = ThreadPool.getSuggestedWorkloadCount();
			// Includes the current thread.

		CStructArray< CThreadData< Tin, Tout > > td;
		td.setItemCount( ThreadCount );
		int i;

		for( i = 0; i < ThreadCount; i++ )
		{
			// Workload 0 is processed by the current thread, it is not
			// handed to the thread pool.

			if( i > 0 )
			{
				ThreadPool.addWorkload( &td[ i ]);
			}

			td[ i ].init( i, ThreadCount, FltSteps, Vars );

			td[ i ].initScanlineQueue( td[ i ].sopResizeH, SrcHeight,
				SrcWidth );
		}

		CBuffer< fptype, size_t > FltBuf( (size_t) NewWidthE * SrcHeight,
			fpclass :: fpalign ); // Temporary buffer that receives
			// horizontally-filtered and resized image.

		// Source scanlines are distributed over workloads in a round-robin
		// fashion.

		for( i = 0; i < SrcHeight; i++ )
		{
			td[ i % ThreadCount ].addScanlineToQueue(
				(void*) &SrcBuf[ (size_t) i * SrcScanlineSize ],
				&FltBuf[ (size_t) i * NewWidthE ]);
		}

		ThreadPool.startAllWorkloads();
		td[ 0 ].processScanlineQueue();
		ThreadPool.waitAllWorkloadsToFinish();

		// Vertical scanline filtering and resizing, reuse previously defined
		// filtering steps if possible.

		const int PrevUseBuildMode = UseBuildMode;

		if( Vars.BuildMode >= 0 )
		{
			UseBuildMode = Vars.BuildMode;
		}
		else
		{
			CImageResizerVars TmpVars( Vars );
			int BestScore = 0x7FFFFFFF;

			for( m = 0; m < BuildModeCount; m++ )
			{
				CDSPFracFilterBankLin< fptype > TmpBank;
				TmpBank.copyInitParams( FltBank );
				CFilterSteps TmpSteps;
				TmpVars.k = ky;
				TmpVars.o = oy;
				buildFilterSteps( TmpSteps, TmpVars, TmpBank, 1.0, m, true );
				updateFilterStepBuffers( TmpSteps, TmpVars, RPosBufArray,
					SrcHeight, NewHeight );

				fillUsedFracMap( TmpSteps[ TmpVars.ResizeStep ],
					UsedFracMap );

				const int c = calcComplexity( TmpSteps, TmpVars, UsedFracMap,
					NewWidth );

				if( c < BestScore )
				{
					UseBuildMode = m;
					BestScore = c;
				}
			}
		}

		Vars.k = ky;
		Vars.o = oy;

		if( UseBuildMode == PrevUseBuildMode && ky == kx )
		{
			// The horizontal steps can be reused as is; only the output
			// multiplier that was applied to them has to be undone.

			if( OutMul != 1.0 )
			{
				modifyCorrFilterDCGain( FltSteps, 1.0 / OutMul );
			}
		}
		else
		{
			buildFilterSteps( FltSteps, Vars, FltBank, 1.0, UseBuildMode,
				false );
		}

		updateFilterStepBuffers( FltSteps, Vars, RPosBufArray, SrcHeight,
			NewHeight );

		updateBufLenAndRPosPtrs( FltSteps, Vars, NewWidth );

		if( IsOutFloat && sizeof( FltBuf[ 0 ]) == sizeof( Tout ) &&
			fpclass :: packmode == 0 )
		{
			// In-place output: the vertical pass writes straight into
			// NewBuf since its element size and layout match the
			// intermediate format.

			for( i = 0; i < ThreadCount; i++ )
			{
				td[ i ].initScanlineQueue( td[ i ].sopResizeV, NewWidth,
					SrcHeight, NewWidthE, NewWidthE );
			}

			for( i = 0; i < NewWidth; i++ )
			{
				td[ i % ThreadCount ].addScanlineToQueue(
					&FltBuf[ (size_t) i * ElCount ],
					(fptype*) &NewBuf[ (size_t) i * ElCount ]);
			}

			ThreadPool.startAllWorkloads();
			td[ 0 ].processScanlineQueue();
			ThreadPool.waitAllWorkloadsToFinish();
			ThreadPool.removeAllWorkloads();

			return;
		}

		CBuffer< fptype, size_t > ResBuf( (size_t) NewWidthE * NewHeight,
			fpclass :: fpalign ); // Receives the vertically-resized image
			// in the intermediate format.

		for( i = 0; i < ThreadCount; i++ )
		{
			td[ i ].initScanlineQueue( td[ i ].sopResizeV, NewWidth,
				SrcHeight, NewWidthE, NewWidthE );
		}

		const int im = ( fpclass :: packmode == 0 ? ElCount : 1 ); // Column
			// start increment, in "fptype" elements.

		for( i = 0; i < NewWidth; i++ )
		{
			td[ i % ThreadCount ].addScanlineToQueue(
				&FltBuf[ (size_t) i * im ], &ResBuf[ (size_t) i * im ]);
		}

		ThreadPool.startAllWorkloads();
		td[ 0 ].processScanlineQueue();
		ThreadPool.waitAllWorkloadsToFinish();

		if( IsOutFloat )
		{
			// Perform output, but skip dithering.

			for( i = 0; i < ThreadCount; i++ )
			{
				td[ i ].initScanlineQueue( td[ i ].sopUnpackH,
					NewHeight, NewWidth );
			}

			for( i = 0; i < NewHeight; i++ )
			{
				td[ i % ThreadCount ].addScanlineToQueue(
					&ResBuf[ (size_t) i * NewWidthE ],
					&NewBuf[ (size_t) i * NewWidth * ElCountIO ]);
			}

			ThreadPool.startAllWorkloads();
			td[ 0 ].processScanlineQueue();
			ThreadPool.waitAllWorkloadsToFinish();
			ThreadPool.removeAllWorkloads();

			return;
		}

		// Perform output with dithering (for integer output only).

		int TruncBits; // The number of lower bits to truncate and dither.
		int OutRange; // Output range.

		if( sizeof( Tout ) == 1 )
		{
			TruncBits = 8 - ResBitDepth;
			OutRange = 255;
		}
		else
		{
			TruncBits = 16 - ResBitDepth;
			OutRange = 65535;
		}

		const double PkOut = OutRange;
		const double TrMul = ( TruncBits > 0 ?
			PkOut / ( OutRange >> TruncBits ) : 1.0 );

		if( CDitherer :: isRecursive() )
		{
			// A recursive ditherer is run by the current thread only, over
			// scanlines in top-to-bottom order, using the ditherer object of
			// the first thread data item.

			td[ 0 ].getDitherer().init( NewWidth, Vars, TrMul, PkOut );

			if( Vars.UseSRGBGamma )
			{
				for( i = 0; i < NewHeight; i++ )
				{
					fptype* const ResScanline =
						&ResBuf[ (size_t) i * NewWidthE ];

					CFilterStep :: applySRGBGamma( ResScanline, NewWidth,
						Vars );

					td[ 0 ].getDitherer().dither( ResScanline );

					CFilterStep :: unpackScanline( ResScanline,
						&NewBuf[ (size_t) i * NewWidth * ElCountIO ],
						NewWidth, Vars );
				}
			}
			else
			{
				for( i = 0; i < NewHeight; i++ )
				{
					fptype* const ResScanline =
						&ResBuf[ (size_t) i * NewWidthE ];

					td[ 0 ].getDitherer().dither( ResScanline );

					CFilterStep :: unpackScanline( ResScanline,
						&NewBuf[ (size_t) i * NewWidth * ElCountIO ],
						NewWidth, Vars );
				}
			}
		}
		else
		{
			// A non-recursive ditherer is run via the scanline queues of all
			// workloads.

			for( i = 0; i < ThreadCount; i++ )
			{
				td[ i ].initScanlineQueue( td[ i ].sopDitherAndUnpackH,
					NewHeight, NewWidth );

				td[ i ].getDitherer().init( NewWidth, Vars, TrMul, PkOut );
			}

			for( i = 0; i < NewHeight; i++ )
			{
				td[ i % ThreadCount ].addScanlineToQueue(
					&ResBuf[ (size_t) i * NewWidthE ],
					&NewBuf[ (size_t) i * NewWidth * ElCountIO ]);
			}

			ThreadPool.startAllWorkloads();
			td[ 0 ].processScanlineQueue();
			ThreadPool.waitAllWorkloadsToFinish();
		}

		ThreadPool.removeAllWorkloads();
	}

private:
	typedef typename fpclass :: fptype fptype; ///< Floating-point type to use
		///< during processing, as defined by "fpclass".
		///<
	typedef typename fpclass :: CFilterStep CFilterStep; ///< Filtering step
		///< class to use during processing, as defined by "fpclass".
		///<
	typedef typename fpclass :: CDitherer CDitherer; ///< Ditherer class to
		///< use during processing, as defined by "fpclass".
		///<
	CImageResizerParams Params; ///< Algorithm's parameters currently in use.
		///< This is a copy of the parameters object passed to the
		///< constructor.
		///<
	int SrcBitDepth; ///< Bit resolution of the source image. Equals
		///< ResBitDepth if a zero source bit depth was passed to the
		///< constructor.
		///<
	int ResBitDepth; ///< Bit resolution of the resulting image.
		///<
	CDSPFracFilterBankLin< fptype > FixedFilterBank; ///< Fractional delay
		///< filter bank with fixed characteristics, mainly for upsizing
		///< cases. Initialized by the constructor.
		///<

	/**
	 * @brief Filtering steps array.
	 *
	 * An array type that stores a sequence of filtering step objects
	 * (CFilterStep) together.
	 */

	typedef CStructArray< CFilterStep > CFilterSteps;

	/**
	 * Function initializes the filter bank in the specified resizing step
	 * according to the source and resulting image bit depths.
	 *
	 * @param FltBank Filter bank to initialize.
	 * @param CutoffMult Cutoff multiplier, 0 to 1. 1 corresponds to 0.5pi
	 * cutoff point.
	 * @param ForceHiOrder "True" if a high-order interpolation should be
	 * forced which requires considerably less resources for initialization.
	 * @param ExtFilter External filter to apply to interpolation filter.
	 */

	void initFilterBank( CDSPFracFilterBankLin< fptype >& FltBank,
		const double CutoffMult, const bool ForceHiOrder,
		const CFltBuffer& ExtFilter ) const
	{
		// The larger of the source and result bit depths defines the
		// precision the interpolation has to deliver.

		int MaxBitDepth = SrcBitDepth;

		if( ResBitDepth > SrcBitDepth )
		{
			MaxBitDepth = ResBitDepth;
		}

		const double ReqSNR = -6.02 * ( MaxBitDepth + 3 );

		// Select the interpolation order. Order 1 is good down to -146 dB,
		// order 0 down to -72 dB only.

		const bool IsHiOrder = ( ForceHiOrder || MaxBitDepth > 8 );
		const int Order = ( IsHiOrder ? 1 : 0 );

		// Estimate the number of fractional delay filters the bank should
		// sample; this number affects the signal-to-noise ratio of the
		// interpolation stage. Theoretically, at order 1, 8-bit resizing
		// requires 66.2 dB SNR (11 filters) and 16-bit resizing requires
		// 114.4 dB SNR (150 filters). At order 0 the required number of
		// filters is exponentially higher.

		double CountEst;

		if( IsHiOrder )
		{
			CountEst = 0.23134052 * exp( -0.058062929 * ReqSNR );
		}
		else
		{
			CountEst = 0.33287686 * exp( -0.11334583 * ReqSNR );
		}

		int FilterCount = (int) ceil( CountEst );

		if( FilterCount < 2 )
		{
			FilterCount = 2; // Never use less than 2 filters.
		}

		FltBank.init( FilterCount, Order, Params.IntFltLen / CutoffMult,
			Params.IntFltCutoff * CutoffMult, Params.IntFltAlpha, ExtFilter,
			fpclass :: fpalign, fpclass :: elalign );
	}

	/**
	 * Function allocates filter buffer taking "fpclass" alignments into
	 * account. The allocated buffer may be larger than the requested size: in
	 * this case the additional elements will be zeroed by this function.
	 *
	 * @param Flt Filter buffer.
	 * @param ReqCapacity The required filter buffer's capacity.
	 * @param IsModel "True" if filtering steps modeling is performed without
	 * actual filter allocation.
	 * @param FltExt If non-NULL this variable will receive the number of
	 * elements the filter was extended by.
	 */

	static void allocFilter( CBuffer< fptype >& Flt, const int ReqCapacity,
		const bool IsModel = false, int* const FltExt = NULL )
	{
		// Round the requested capacity up to the "elalign" boundary. The
		// rounding is mask-based, so "elalign" is assumed to be a power of 2.

		const int AlignMask = fpclass :: elalign - 1;
		const int PaddedCapacity = ( ReqCapacity + AlignMask ) & ~AlignMask;
		const int PadCount = PaddedCapacity - ReqCapacity;

		if( FltExt != NULL )
		{
			*FltExt = PadCount;
		}

		if( IsModel )
		{
			// Modeling: set the capacity only, no padding elements are
			// written.

			Flt.forceCapacity( PaddedCapacity );
		}
		else
		{
			Flt.alloc( PaddedCapacity, fpclass :: fpalign );

			// Zero the padding elements that follow the requested part.

			for( int i = PaddedCapacity - 1; i >= ReqCapacity; i-- )
			{
				Flt[ i ] = 0.0;
			}
		}
	}

	/**
	 * Function assigns filter parameters to the specified filtering step
	 * object.
	 *
	 * @param fs Filtering step to assign parameter to. This step cannot be
	 * the last step if ResampleFactor greater than 1 was specified.
	 * @param IsUpsample "True" if upsampling step. Should be set to "false"
	 * if FltCutoff is negative.
	 * @param ResampleFactor Resampling factor of this filter (>=1).
	 * @param FltCutoff Filter cutoff point. This value will be divided by the
	 * ResampleFactor if IsUpsample equals "true". If zero value was
	 * specified, the "half-band" predefined filter will be created. In this
	 * case the ResampleFactor will modify the filter cutoff point.
	 * @param DCGain DC gain to apply to the filter. Assigned to filtering
	 * step's DCGain variable.
	 * @param UseFltOrig "True" if the originally-designed filter should be
	 * left in filtering step's FltOrig buffer. Otherwise it will be freed.
	 * @param IsModel "True" if filtering steps modeling is performed without
	 * actual filter building.
	 */

	void assignFilterParams( CFilterStep& fs, const bool IsUpsample,
		const int ResampleFactor, const double FltCutoff, const double DCGain,
		const bool UseFltOrig, const bool IsModel ) const
	{
		// Overview: derive the low-pass filter design parameters, store them
		// in the step, then either model the buffers (IsModel) or build the
		// actual filter. For upsampling steps the prefix and suffix "tails"
		// are prepared as well.

		double FltAlpha;
		double Len2;
		double Freq;

		if( FltCutoff == 0.0 )
		{
			// Zero cutoff selects the predefined "half-band" filter
			// parameters, adjusted by the resampling factor.

			const double m = 2.0 / ResampleFactor;
			FltAlpha = Params.HBFltAlpha;
			Len2 = 0.5 * Params.HBFltLen / m;
			Freq = AVIR_PI * Params.HBFltCutoff * m;
		}
		else
		{
			// Regular low-pass filter: its length is inversely proportional
			// to the cutoff point.

			FltAlpha = Params.LPFltAlpha;
			Len2 = 0.25 * Params.LPFltBaseLen / FltCutoff;
			Freq = AVIR_PI * Params.LPFltCutoffMult * FltCutoff;
		}

		if( IsUpsample )
		{
			// For upsampling the filter is made ResampleFactor times longer,
			// its frequency is divided by ResampleFactor, and the DC gain is
			// multiplied by ResampleFactor.

			Len2 *= ResampleFactor;
			Freq /= ResampleFactor;
			fs.DCGain = DCGain * ResampleFactor;
		}
		else
		{
			fs.DCGain = DCGain;
		}

		// Keep the design parameters alongside the original filter buffer.

		fs.FltOrig.Len2 = Len2;
		fs.FltOrig.Freq = Freq;
		fs.FltOrig.Alpha = FltAlpha;
		fs.FltOrig.DCGain = fs.DCGain;

		CDSPPeakedCosineLPF w( Len2, Freq, FltAlpha );

		fs.IsUpsample = IsUpsample;
		fs.ResampleFactor = ResampleFactor;
		fs.FltLatency = w.fl2;

		int FltExt; // Filter's extension due to fpclass :: elalign.

		if( IsModel )
		{
			// Modeling: only the capacity of the Flt buffer is set.

			allocFilter( fs.Flt, w.FilterLen, true, &FltExt );

			if( UseFltOrig )
			{
				// Allocate a real buffer even in modeling mode since this
				// filter may be copied by the filter bank.

				fs.FltOrig.alloc( w.FilterLen );
				memset( &fs.FltOrig[ 0 ], 0,
					w.FilterLen * sizeof( fs.FltOrig[ 0 ]));
			}
		}
		else
		{
			// Build the filter in FltOrig, optimize and normalize it to the
			// step's DC gain, then copy it to the (possibly extended) Flt
			// buffer.

			fs.FltOrig.alloc( w.FilterLen );

			w.generateLPF( &fs.FltOrig[ 0 ], 1.0 );
			optimizeFIRFilter( fs.FltOrig, fs.FltLatency );
			normalizeFIRFilter( &fs.FltOrig[ 0 ], fs.FltOrig.getCapacity(),
				fs.DCGain );

			allocFilter( fs.Flt, fs.FltOrig.getCapacity(), false, &FltExt );
			copyArray( &fs.FltOrig[ 0 ], &fs.Flt[ 0 ],
				fs.FltOrig.getCapacity() );

			if( !UseFltOrig )
			{
				fs.FltOrig.free();
			}
		}

		if( IsUpsample )
		{
			// Length of the prefix tail: the part of the filter (without the
			// alignment extension) that follows the first
			// FltLatency + ResampleFactor taps.

			int l = fs.Flt.getCapacity() - fs.FltLatency - ResampleFactor -
				FltExt;

			allocFilter( fs.PrefixDC, l, IsModel );
			allocFilter( fs.SuffixDC, fs.FltLatency, IsModel );

			if( IsModel )
			{
				return;
			}

			// Create prefix and suffix "tails" used during upsampling.

			// The prefix tail starts as a copy of the filter part located
			// after FltLatency + ResampleFactor; then progressively shorter
			// parts of the filter, each starting ResampleFactor taps
			// further, are added on top of it.

			const fptype* ip = &fs.Flt[ fs.FltLatency + ResampleFactor ];
			copyArray( ip, &fs.PrefixDC[ 0 ], l );

			while( true )
			{
				ip += ResampleFactor;
				l -= ResampleFactor;

				if( l <= 0 )
				{
					break;
				}

				addArray( ip, &fs.PrefixDC[ 0 ], l );
			}

			// The suffix tail starts as a copy of the first FltLatency taps
			// of the filter; then progressively shorter runs of the filter's
			// leading taps are added at output positions that advance by
			// ResampleFactor.

			l = fs.FltLatency;
			fptype* op = &fs.SuffixDC[ 0 ];
			copyArray( &fs.Flt[ 0 ], op, l );

			while( true )
			{
				op += ResampleFactor;
				l -= ResampleFactor;

				if( l <= 0 )
				{
					break;
				}

				addArray( &fs.Flt[ 0 ], op, l );
			}
		}
		else
		if( !UseFltOrig )
		{
			// A non-upsampling step whose original filter is not retained
			// gets the default edge pixel count.

			fs.EdgePixelCount = fs.EdgePixelCountDef;
		}
	}

	/**
	 * Function adds a correction filter that tries to achieve a linear
	 * frequency response at all frequencies. The actual resulting response
	 * may feature a slight damping of the highest frequencies since a
	 * suitably short correction filter cannot fix steep high-frequency
	 * damping.
	 *
	 * This function assumes that the resizing step is currently the last
	 * step, even if it was not inserted yet: this allows placement of the
	 * correction filter both before and after the resizing step.
	 *
	 * @param Steps Filtering steps.
	 * @param bw Resulting bandwidth relative to the original bandwidth (which
	 * is 1.0), usually 1/k. Should be <= 1.0.
	 * @param IsPreCorrection "True" if the filtering step was already created
	 * and it is first in the Steps array. "True" also adds edge pixels to
	 * reduce edge artifacts.
	 * @param IsModel "True" if filtering steps modeling is performed without
	 * actual filter building.
	 */

	void addCorrectionFilter( CFilterSteps& Steps, const double bw,
		const bool IsPreCorrection, const bool IsModel ) const
	{
		// The correction filter either occupies the already existing first
		// step (pre-correction), or it is appended as a new last step.

		CFilterStep& CorrStep = ( IsPreCorrection ? Steps[ 0 ] :
			Steps.add() );

		CorrStep.IsUpsample = false;
		CorrStep.ResampleFactor = 1;
		CorrStep.DCGain = 1.0;
		CorrStep.EdgePixelCount = ( IsPreCorrection ?
			CorrStep.EdgePixelCountDef : 0 );

		if( IsModel )
		{
			// Modeling: only the filter's capacity is set. The step's
			// FltLatency is passed to calcFilterLength() as well (presumably
			// it is filled in by that function).

			const int ModelLen = CDSPFIREQ :: calcFilterLength(
				Params.CorrFltLen, CorrStep.FltLatency );

			allocFilter( CorrStep.Flt, ModelLen, true );

			return;
		}

		const int BinCount = 65; // Frequency response bins to control.
		const int BinCount1 = BinCount - 1;
		int j;

		// "Bins" accumulates the inverse of the magnitude response of all
		// other steps at the frequencies of interest; it starts at unity.

		CBuffer< double > Bins( BinCount );

		for( j = 0; j < BinCount; j++ )
		{
			Bins[ j ] = 1.0;
		}

		// Visit every step except the one that holds the correction filter:
		// this is the first step on pre-correction, the last step otherwise.

		const int FirstStep = ( IsPreCorrection ? 1 : 0 );
		const int EndStep = Steps.getItemCount() -
			( IsPreCorrection ? 0 : 1 );

		double curbw = 1.0; // Bandwidth of the filter at the current step.

		for( int i = FirstStep; i < EndStep; i++ )
		{
			const CFilterStep& Step = Steps[ i ];

			if( Step.IsUpsample )
			{
				curbw *= Step.ResampleFactor;

				if( Step.FltOrig.getCapacity() > 0 )
				{
					// An upsampling step that retains its original filter
					// does not contribute to the bins here.

					continue;
				}
			}

			const fptype* Flt;
			int FltLen;

			if( Step.ResampleFactor != 0 )
			{
				Flt = &Step.Flt[ 0 ];
				FltLen = Step.Flt.getCapacity();
			}
			else
			{
				// Zero resampling factor marks the resizing step: use the
				// first filter of its filter bank.

				Flt = Step.FltBank -> getFilter( 0 );
				FltLen = Step.FltBank -> getFilterLen();
			}

			const double dcg = 1.0 / Step.DCGain; // DC gain correction.

			// Divide the bins by the step filter's magnitude response,
			// within the bounds of bandwidth of interest.

			for( j = 0; j < BinCount; j++ )
			{
				const double th = AVIR_PI * bw / curbw * j / BinCount1;
				double re;
				double im;

				calcFIRFilterResponse( Flt, FltLen, th, re, im );

				Bins[ j ] /= sqrt( re * re + im * im ) * dcg;
			}

			if( !Step.IsUpsample && Step.ResampleFactor > 1 )
			{
				curbw /= Step.ResampleFactor;
			}
		}

		// Design the correction filter from the accumulated bins, normalize
		// it to unity gain before and after the optimization, then store it
		// in the step.

		CDSPFIREQ EQ;
		EQ.init( bw * 2.0, Params.CorrFltLen, BinCount, 0.0, bw, false,
			Params.CorrFltAlpha );

		CorrStep.FltLatency = EQ.getFilterLatency();

		CBuffer< double > Filter( EQ.getFilterLength() );
		EQ.buildFilter( Bins, &Filter[ 0 ]);
		normalizeFIRFilter( &Filter[ 0 ], Filter.getCapacity(), 1.0 );
		optimizeFIRFilter( Filter, CorrStep.FltLatency );
		normalizeFIRFilter( &Filter[ 0 ], Filter.getCapacity(), 1.0 );

		allocFilter( CorrStep.Flt, Filter.getCapacity() );
		copyArray( &Filter[ 0 ], &CorrStep.Flt[ 0 ], Filter.getCapacity() );

		// Debug aid (disabled): print a theoretically achieved final
		// frequency response at various feature sizes (from DC to 1 pixel).
		// Values above 255 mean features become brighter, values below 255
		// mean features become dimmer.

/*		const double sbw = ( bw > 1.0 ? 1.0 / bw : 1.0 );

		for( j = 0; j < BinCount; j++ )
		{
			const double th = AVIR_PI * sbw * j / BinCount1;
			double re;
			double im;

			calcFIRFilterResponse( &CorrStep.Flt[ 0 ],
				CorrStep.Flt.getCapacity(), th, re, im );

			printf( "%f\n", sqrt( re * re + im * im ) / Bins[ j ] * 255 );
		}

		printf( "***\n" );*/
	}

	/**
	 * Function adds a sharpening filter if image is being upsized. Such
	 * sharpening allows to spot interpolation filter's stop-band attenuation:
	 * if attenuation is too weak, a "dark grid" and other artifacts may
	 * become visible.
	 *
	 * It is assumed that 40 decibel stop-band attenuation should be
	 * considered a required minimum: this allows application of (deliberately
	 * strong) 64X sharpening without spotting any artifacts.
	 *
	 * @param Steps Filtering steps.
	 * @param bw Resulting bandwidth relative to the original bandwidth (which
	 * is 1.0), usually 1/k.
	 * @param IsModel "True" if filtering steps modeling is performed without
	 * actual filter building.
	 */

	static void addSharpenTest( CFilterSteps& Steps, const double bw,
		const bool IsModel )
	{
		if( bw <= 1.0 )
		{
			// No step is added unless the bandwidth exceeds 1.0.

			return;
		}

		const double FltLen = 10.0 * bw;

		// The sharpening filter is appended as a plain (non-resampling)
		// filtering step with unity gain and no edge pixels.

		CFilterStep& ShStep = Steps.add();
		ShStep.IsUpsample = false;
		ShStep.ResampleFactor = 1;
		ShStep.DCGain = 1.0;
		ShStep.EdgePixelCount = 0;

		if( IsModel )
		{
			// Modeling: only the filter's capacity is set.

			const int ModelLen = CDSPFIREQ :: calcFilterLength( FltLen,
				ShStep.FltLatency );

			allocFilter( ShStep.Flt, ModelLen, true );

			return;
		}

		// Target response: unity gain below the threshold bin, 256X gain
		// from the threshold bin upwards.

		const int BinCount = 200;
		const int RawThresh = (int) round( BinCount / bw * 1.75 );
		const int Thresh = ( RawThresh > BinCount ? BinCount : RawThresh );

		CBuffer< double > Bins( BinCount );
		int j;

		for( j = 0; j < BinCount; j++ )
		{
			Bins[ j ] = ( j < Thresh ? 1.0 : 256.0 );
		}

		// Design the filter, normalize it to unity gain before and after
		// the optimization, then store it in the step.

		CDSPFIREQ EQ;
		EQ.init( bw * 2.0, FltLen, BinCount, 0.0, bw, false, 1.7 );

		ShStep.FltLatency = EQ.getFilterLatency();

		CBuffer< double > Filter( EQ.getFilterLength() );
		EQ.buildFilter( Bins, &Filter[ 0 ]);
		normalizeFIRFilter( &Filter[ 0 ], Filter.getCapacity(), 1.0 );
		optimizeFIRFilter( Filter, ShStep.FltLatency );
		normalizeFIRFilter( &Filter[ 0 ], Filter.getCapacity(), 1.0 );

		allocFilter( ShStep.Flt, Filter.getCapacity() );
		copyArray( &Filter[ 0 ], &ShStep.Flt[ 0 ], Filter.getCapacity() );

		// Debug aid (disabled): print the magnitude response of the
		// resulting filter.

/*		for( j = 0; j < BinCount; j++ )
		{
			const double th = AVIR_PI * j / ( BinCount - 1 );
			double re;
			double im;

			calcFIRFilterResponse( &ShStep.Flt[ 0 ],
				ShStep.Flt.getCapacity(), th, re, im );

			printf( "%f\n", sqrt( re * re + im * im ));
		}

		printf( "***\n" );*/
	}

	/**
	 * Function builds sequence of filtering steps depending on the specified
	 * resizing coefficient. The last steps included are always the resizing
	 * step then (possibly) the correction step.
	 *
	 * @param Steps Array that receives filtering steps.
	 * @param[out] Vars Variables object.
	 * @param FltBank Filter bank to initialize and use.
	 * @param DCGain The overall DC gain to apply. This DC gain is applied to
	 * the first filtering step only (upsampling or filtering step).
	 * @param ModeFlags Build mode flags to use. This is a bitmap of switches
	 * that enable or disable certain algorithm features.
	 * @param IsModel "True" if filtering steps modeling is performed without
	 * the actual filter allocation and building.
	 */

	void buildFilterSteps( CFilterSteps& Steps, CImageResizerVars& Vars,
		CDSPFracFilterBankLin< fptype >& FltBank, const double DCGain,
		const int ModeFlags, const bool IsModel ) const
	{
		// Overview: clear the steps, optionally reserve a leading step for
		// pre-correction, add one upsampling step or a chain of
		// downsampling/filtering steps, then set up the resizing step and
		// finally add the correction filter.

		Steps.clear();

		// Decode the build mode bits.

		const bool DoFltAndIntCombo = (( ModeFlags & 1 ) != 0 ); // Do filter
			// and interpolator combining.
		const bool ForceHiOrderInt = (( ModeFlags & 2 ) != 0 ); // Force use
			// of a higher-order interpolation.
		const bool UseHalfband = (( ModeFlags & 4 ) != 0 ); // Use half-band
			// filter.

		const double bw = 1.0 / Vars.k; // Resulting bandwidth.
		const int UpsampleFactor = ( (int) floor( Vars.k ) < 2 ? 2 : 1 );
			// 2X upsampling is used when the integer part of the step is
			// below 2, otherwise no upsampling is done.
		double IntCutoffMult; // Interpolation filter cutoff multiplier.
		CFilterStep* ReuseStep; // If not NULL, resizing step should use
			// this step object instead of creating a new one.
		CFilterStep* ExtFltStep; // Use FltOrig of this step as the external
			// filter to be applied to the interpolator.
		bool IsPreCorrection; // "True" if the correction filter is applied
			// first.
		double FltCutoff; // Cutoff frequency of the first filtering step.
		double corrbw; // Bandwidth at the correction step.

		if( Vars.k <= 1.0 )
		{
			// The correction filter goes first: reserve the first step for
			// it now, it is filled by the addCorrectionFilter() call below.

			IsPreCorrection = true;
			FltCutoff = 1.0;
			corrbw = 1.0;
			Steps.add();
		}
		else
		{
			IsPreCorrection = false;
			FltCutoff = bw;
			corrbw = bw;
		}

		// Add 1 upsampling or several downsampling filters.

		// NOTE(review): ReuseStep and ExtFltStep point to elements of Steps
		// and are used after further Steps.add() calls; this presumes that
		// CStructArray keeps element addresses stable on add() - confirm.

		if( UpsampleFactor > 1 )
		{
			CFilterStep& fs = Steps.add();
			assignFilterParams( fs, true, UpsampleFactor, FltCutoff, DCGain,
				DoFltAndIntCombo, IsModel );

			IntCutoffMult = FltCutoff * 2.0 / UpsampleFactor;
			ReuseStep = NULL;
			ExtFltStep = ( DoFltAndIntCombo ? &fs : NULL );
		}
		else
		{
			int DownsampleFactor;

			// Each iteration evaluates the downsampling factor for the
			// current cutoff. While a half-band step gets added, the cutoff
			// is scaled up by the factor used and the evaluation repeats;
			// the loop ends on the first iteration that adds no half-band
			// step.

			while( true )
			{
				DownsampleFactor = (int) floor( 0.5 / FltCutoff );
				bool DoHBFltAdd;

				if( DownsampleFactor > 16 )
				{
					// Add half-band filter unconditionally in order to keep
					// filter lengths lower for more precise frequency
					// response and less edge artifacts.

					DoHBFltAdd = true;
					DownsampleFactor = 16;
				}
				else
				{
					DoHBFltAdd = ( UseHalfband && DownsampleFactor > 1 );
				}

				if( DoHBFltAdd )
				{
					// Zero cutoff requests the predefined half-band filter.

					assignFilterParams( Steps.add(), false, DownsampleFactor,
						0.0, 1.0, false, IsModel );

					FltCutoff *= DownsampleFactor;
				}
				else
				{
					if( DownsampleFactor < 1 )
					{
						DownsampleFactor = 1;
					}

					break;
				}
			}

			// The final filtering step of the chain; the overall DC gain is
			// applied here.

			CFilterStep& fs = Steps.add();
			assignFilterParams( fs, false, DownsampleFactor, FltCutoff,
				DCGain, DoFltAndIntCombo, IsModel );

			IntCutoffMult = FltCutoff / 0.5;

			if( DoFltAndIntCombo )
			{
				// The filter is combined with the interpolator: the resizing
				// step takes over this step object.

				ReuseStep = &fs;
				ExtFltStep = &fs;
			}
			else
			{
				IntCutoffMult *= DownsampleFactor;
				ReuseStep = NULL;
				ExtFltStep = NULL;
			}
		}

		// Insert resizing and correction steps.

		CFilterStep& fs = ( ReuseStep == NULL ? Steps.add() : *ReuseStep );

		Vars.ResizeStep = Steps.getItemCount() - 1;
		fs.IsUpsample = false;
		fs.ResampleFactor = 0; // Zero factor marks the resizing step.
		fs.DCGain = ( ExtFltStep == NULL ? 1.0 : ExtFltStep -> DCGain );

		initFilterBank( FltBank, IntCutoffMult, ForceHiOrderInt,
			( ExtFltStep == NULL ? fs.FltOrig : ExtFltStep -> FltOrig ));

		// Use the object's fixed filter bank when the newly initialized
		// bank compares equal to it.

		if( FltBank == FixedFilterBank )
		{
			fs.FltBank = (CDSPFracFilterBankLin< fptype >*) &FixedFilterBank;
		}
		else
		{
			fs.FltBank = &FltBank;
		}

		addCorrectionFilter( Steps, corrbw, IsPreCorrection, IsModel );

		//addSharpenTest( Steps, bw, IsModel );
	}

	/**
	 * Function extends *this upsampling step so that it produces more
	 * upsampled pixels that cover the prefix and suffix needs of the next
	 * step. After the call to this function the InPrefix and InSuffix
	 * variables of the next step will be set to zero.
	 *
	 * @param fs Upsampling filtering step.
	 * @param NextStep The next step structure.
	 */

	static void extendUpsample( CFilterStep& fs, CFilterStep& NextStep )
	{
		const int Factor = fs.ResampleFactor;

		// Leading side: the next step's prefix requirement is converted to a
		// count of additional input pixels of this step (integer division by
		// the resampling factor with the "+ Factor - 1" round-up bias). The
		// output prefix then grows by that count times the factor, and the
		// next step no longer requires a prefix of its own.

		const int ExtraPrefix = ( NextStep.InPrefix + Factor - 1 ) / Factor;

		fs.InPrefix = ExtraPrefix;
		fs.OutPrefix += ExtraPrefix * Factor;
		NextStep.InPrefix = 0;

		// Trailing side: the same conversion applied to the suffix
		// requirement.

		const int ExtraSuffix = ( NextStep.InSuffix + Factor - 1 ) / Factor;

		fs.InSuffix = ExtraSuffix;
		fs.OutSuffix += ExtraSuffix * Factor;
		NextStep.InSuffix = 0;
	}

	/**
	 * Function fills resizing step's RPosBuf array, excluding the actual
	 * "ftp" pointers and "SrcOffs" offsets.
	 *
	 * This array should be cleared if the resizing step or offset were
	 * changed. Otherwise this function only fills the elements required to
	 * cover resizing step's OutLen.
	 *
	 * This function is called by the updateFilterStepBuffers() function.
	 *
	 * @param fs Resizing step.
	 * @param Vars Variables object.
	 */

	static void fillRPosBuf( CFilterStep& fs, const CImageResizerVars& Vars )
	{
		// Elements below the buffer's current capacity are left untouched by
		// this function; only the range [FilledCount; OutLen) is written.

		const int FilledCount = fs.RPosBuf -> getCapacity();

		if( FilledCount < fs.OutLen )
		{
			fs.RPosBuf -> increaseCapacity( fs.OutLen );
		}

		typename CFilterStep :: CResizePos* rp =
			&(*fs.RPosBuf)[ FilledCount ];

		const int FracCount = fs.FltBank -> getFracCount();
		const double Offs = Vars.o;
		const double Step = Vars.k;

		for( int n = FilledCount; n < fs.OutLen; n++, rp++ )
		{
			// Source position of output pixel "n", split into its integer
			// part and a fractional part scaled by the filter bank's
			// fractional delay count.

			const double SrcPos = Offs + Step * n;
			const int WholePos = (int) floor( SrcPos );
			const double FracPos = ( SrcPos - WholePos ) * FracCount;

			// Integer part of the scaled fraction is stored as the filter
			// index, the remainder is stored as "x".

			const int FilterIndex = (int) FracPos;

			rp -> SrcPosInt = WholePos;
			rp -> fti = FilterIndex;
			rp -> x = (typename fpclass :: fptypeatom)
				( FracPos - FilterIndex );
		}
	}

	/**
	 * Function updates filtering step buffer lengths depending on the
	 * specified source and new scanline lengths. This function should be
	 * called after the buildFilterSteps() function.
	 *
	 * @param Steps Filtering steps whose buffer-related fields will be
	 * updated.
	 * @param[in,out] Vars Variables object. This function expects "k" and "o"
	 * variable values that will be adjusted by this function.
	 * @param RPosBufArray Resizing position buffers array, used to obtain
	 * buffer to initialize and use (will be reused if it is already fully or
	 * partially filled).
	 * @param SrcLen Source scanline's length in pixels.
	 * @param NewLen New scanline's length in pixels.
	 */

	static void updateFilterStepBuffers( CFilterSteps& Steps,
		CImageResizerVars& Vars,
		typename CFilterStep :: CRPosBufArray& RPosBufArray, int SrcLen,
		const int NewLen )
	{
		// Walks the steps in order, assigning each step its input length,
		// input/output buffer indices, prefix/suffix requirements and output
		// length. "SrcLen" and "InBuf" are carried from one step to the next:
		// every step's OutLen and OutBuf become the following step's InLen
		// and InBuf. "Vars.k" and "Vars.o" are rescaled cumulatively as
		// upsampling and downsampling steps are passed, so the statement
		// order within the loop matters.

		int upstep = -1; // Index of the last upsampling step seen, or -1.
		int InBuf = 0; // Buffer index the current step reads from.
		int i;

		for( i = 0; i < Steps.getItemCount(); i++ )
		{
			CFilterStep& fs = Steps[ i ];

			fs.Vars = &Vars;
			fs.InLen = SrcLen;
			fs.InBuf = InBuf;
			fs.OutBuf = ( InBuf + 1 ) & 1; // Flip between buffers 0 and 1.

			if( fs.IsUpsample )
			{
				// Upsampling step: the resizing step and offset are scaled
				// up by the resampling factor.

				upstep = i;
				Vars.k *= fs.ResampleFactor;
				Vars.o *= fs.ResampleFactor;
				fs.InPrefix = 0;
				fs.InSuffix = 0;
				fs.OutLen = fs.InLen * fs.ResampleFactor;
				fs.OutPrefix = fs.FltLatency;
				fs.OutSuffix = fs.Flt.getCapacity() - fs.FltLatency -
					fs.ResampleFactor;

				// Grow the output suffix if the total output span is shorter
				// than the upsampled length plus SuffixDC's capacity.

				int l0 = fs.OutPrefix + fs.OutLen + fs.OutSuffix;
				int l = fs.InLen * fs.ResampleFactor +
					fs.SuffixDC.getCapacity();

				if( l > l0 )
				{
					fs.OutSuffix += l - l0;
				}

				// Grow the output suffix again if "OutLen + OutSuffix" is
				// shorter than PrefixDC's capacity.

				l0 = fs.OutLen + fs.OutSuffix;

				if( fs.PrefixDC.getCapacity() > l0 )
				{
					fs.OutSuffix += fs.PrefixDC.getCapacity() - l0;
				}
			}
			else
			if( fs.ResampleFactor == 0 )
			{
				// Resizing step (ResampleFactor equals 0): prefix and suffix
				// are derived from the source positions of the first and the
				// last output pixels and the filter bank's filter length.

				const int FilterLenD2 = fs.FltBank -> getFilterLen() / 2;
				const int FilterLenD21 = FilterLenD2 - 1;

				// Leftmost source index touched by the first output pixel; a
				// negative value is the number of prefix pixels required.

				const int ResizeLPix = (int) floor( Vars.o ) - FilterLenD21;
				fs.InPrefix = ( ResizeLPix < 0 ? -ResizeLPix : 0 );

				// NOTE(review): appears to be one past the rightmost source
				// index touched by the last output pixel (assuming an even
				// filter length) - confirm against the filter bank.

				const int ResizeRPix = (int) floor( Vars.o +
					( NewLen - 1 ) * Vars.k ) + FilterLenD2 + 1;

				fs.InSuffix = ( ResizeRPix > fs.InLen ?
					ResizeRPix - fs.InLen : 0 );

				// The resizing positions buffer is requested from the array
				// using the current "k", "o" and fractional delay count, and
				// is then filled up to OutLen.

				fs.OutLen = NewLen;
				fs.RPosBuf = &RPosBufArray.getRPosBuf( Vars.k, Vars.o,
					fs.FltBank -> getFracCount() );

				fillRPosBuf( fs, Vars );
			}
			else
			{
				// Non-upsampling filtering step with a non-zero resampling
				// factor: the resizing step and offset are scaled down by
				// the factor, and the offset is additionally shifted by
				// EdgePixelCount.

				Vars.k /= fs.ResampleFactor;
				Vars.o /= fs.ResampleFactor;
				Vars.o += fs.EdgePixelCount;

				fs.InPrefix = fs.FltLatency;
				fs.InSuffix = fs.Flt.getCapacity() - fs.FltLatency - 1;

				// Additionally extend OutLen to produce more precise edge
				// pixels.

				fs.OutLen = ( fs.InLen + fs.ResampleFactor - 1 ) /
					fs.ResampleFactor + fs.EdgePixelCount;

				// Suffix additionally covers the input span required by the
				// (already extended) output length beyond InLen.

				fs.InSuffix += ( fs.OutLen - 1 ) * fs.ResampleFactor + 1 -
					fs.InLen;

				// The second EdgePixelCount extension of OutLen is matched
				// by a prefix extension of "EdgePixelCount * ResampleFactor"
				// input pixels.

				fs.InPrefix += fs.EdgePixelCount * fs.ResampleFactor;
				fs.OutLen += fs.EdgePixelCount;
			}

			InBuf = fs.OutBuf;
			SrcLen = fs.OutLen;
		}

		// The last step's output buffer index is overridden with 2, the
		// index other functions test to recognize the final step.

		Steps[ Steps.getItemCount() - 1 ].OutBuf = 2;

		// If an upsampling step exists, extend it so that it also covers the
		// prefix and suffix needs of the step that follows it.

		if( upstep != -1 )
		{
			extendUpsample( Steps[ upstep ], Steps[ upstep + 1 ]);
		}
	}

	/**
	 * Function calculates an optimal intermediate buffer length that will
	 * cover all needs of the specified filtering steps. This function should
	 * be called after the updateFilterStepBuffers() function.
	 *
	 * Function also updates resizing step's RPosBuf pointers to the filter
	 * bank and SrcOffs values.
	 *
	 * @param Steps Filtering steps.
	 * @param[out] Vars Variables object, will receive buffer size and length.
	 * @param ResElIncr Resulting (final) element increment, used to produce
	 * de-interleaved result. For horizontal processing this value is equal
	 * to last step's OutLen, for vertical processing this value is equal to
	 * resulting image's width.
	 */

	static void updateBufLenAndRPosPtrs( CFilterSteps& Steps,
		CImageResizerVars& Vars, const int ResElIncr )
	{
		// Largest prefix and largest "length + suffix" demanded from each of
		// the two intermediate buffers, indexed by buffer number.

		int PrefixReq[ 2 ] = { 0, 0 };
		int LenReq[ 2 ] = { 0, 0 };
		const int StepCount = Steps.getItemCount();
		int n;

		for( n = 0; n < StepCount; n++ )
		{
			CFilterStep& Step = Steps[ n ];

			// Input side requirements.

			const int InSlot = Step.InBuf;
			const int InSpan = Step.InLen + Step.InSuffix;

			if( PrefixReq[ InSlot ] < Step.InPrefix )
			{
				PrefixReq[ InSlot ] = Step.InPrefix;
			}

			if( LenReq[ InSlot ] < InSpan )
			{
				LenReq[ InSlot ] = InSpan;
			}

			Step.InElIncr = Step.InPrefix + InSpan;

			if( Step.OutBuf == 2 )
			{
				// Output buffer index 2 is not one of the two intermediate
				// buffers, so it imposes no requirement here; scanning stops
				// at this step.

				break;
			}

			// Output side requirements. Only upsampling steps contribute an
			// output prefix and suffix; a zero prefix never raises the
			// maximum since it starts at zero and only grows.

			const int OutSlot = Step.OutBuf;
			const int OutPfx = ( Step.IsUpsample ? Step.OutPrefix : 0 );
			const int OutSpan = ( Step.IsUpsample ?
				Step.OutLen + Step.OutSuffix : Step.OutLen );

			if( PrefixReq[ OutSlot ] < OutPfx )
			{
				PrefixReq[ OutSlot ] = OutPfx;
			}

			if( LenReq[ OutSlot ] < OutSpan )
			{
				LenReq[ OutSlot ] = OutSpan;
			}
		}

		// Assign OutElIncr of every step: the step with output buffer index
		// 2 receives ResElIncr, every other step shares the increment with
		// the next step's InElIncr. For an upsampling step, the shared value
		// is first raised to the step's own full output span if necessary.

		for( n = 0; n < StepCount; n++ )
		{
			CFilterStep& Step = Steps[ n ];

			if( Step.OutBuf == 2 )
			{
				Step.OutElIncr = ResElIncr;
				break;
			}

			CFilterStep& Next = Steps[ n + 1 ];

			if( Step.IsUpsample )
			{
				const int FullSpan = Step.OutPrefix + Step.OutLen +
					Step.OutSuffix;

				if( Next.InElIncr < FullSpan )
				{
					Next.InElIncr = FullSpan;
				}
			}

			Step.OutElIncr = Next.InElIncr;
		}

		// Publish buffer lengths and offsets. Lengths are always multiplied
		// by the element count, offsets only when "packmode" equals 0.

		int b;

		for( b = 0; b < 2; b++ )
		{
			Vars.BufOffs[ b ] = ( Vars.packmode == 0 ?
				PrefixReq[ b ] * Vars.ElCount : PrefixReq[ b ]);

			Vars.BufLen[ b ] = ( PrefixReq[ b ] + LenReq[ b ]) *
				Vars.ElCount;
		}

		// Complete the resizing step's position records: filter pointer
		// taken from the filter bank, and source offset derived from the
		// integer source position.

		CFilterStep& ResStep = Steps[ Vars.ResizeStep ];
		typename CFilterStep :: CResizePos* rp = &(*ResStep.RPosBuf)[ 0 ];
		const int ElMult = ( fpclass :: packmode == 0 ? Vars.ElCount : 1 );
		const int LeftTaps = ResStep.FltBank -> getFilterLen() / 2 - 1;

		for( n = 0; n < ResStep.OutLen; n++, rp++ )
		{
			rp -> ftp = ResStep.FltBank -> getFilter( rp -> fti );
			rp -> SrcOffs = ( rp -> SrcPosInt - LeftTaps ) * ElMult;
		}
	}

	/**
	 * Function modifies the overall (DC) gain of the correction filter in the
	 * pre-built filtering steps array.
	 *
	 * @param Steps Filtering steps.
	 * @param m Multiplier to apply to the correction filter.
	 */

	void modifyCorrFilterDCGain( CFilterSteps& Steps, const double m ) const
	{
		// The last step's filter is scaled when that step is a
		// non-upsampling step with a resampling factor of 1; in any other
		// case the first step's filter is scaled instead.

		CFilterStep& Last = Steps[ Steps.getItemCount() - 1 ];
		const bool UseLast = ( !Last.IsUpsample && Last.ResampleFactor == 1 );
		CBuffer< fptype >& Kernel = ( UseLast ? Last.Flt : Steps[ 0 ].Flt );
		const int TapCount = Kernel.getCapacity();

		// Multiply every tap by "m", performing the multiplication in
		// double precision.

		for( int t = 0; t < TapCount; t++ )
		{
			Kernel[ t ] = (fptype) ( (double) Kernel[ t ] * m );
		}
	}

	/**
	 * Function builds a map of used fractional delay filters based on the
	 * resizing positions buffer.
	 *
	 * @param fs Resizing step.
	 * @param[out] UsedFracMap Map of used fractional delay filters.
	 */

	static void fillUsedFracMap( const CFilterStep& fs,
		CBuffer< uint8_t >& UsedFracMap )
	{
		const int FracCount = fs.FltBank -> getFracCount();
		UsedFracMap.increaseCapacity( FracCount, false );
		memset( &UsedFracMap[ 0 ], 0, FracCount * sizeof( UsedFracMap[ 0 ]));

		typename CFilterStep :: CResizePos* rpos = &(*fs.RPosBuf)[ 0 ];
		int i;

		for( i = 0; i < fs.OutLen; i++ )
		{
			UsedFracMap[ rpos -> fti ] |= 1;
			rpos++;
		}
	}

	/**
	 * Function calculates the overall filtering steps complexity per
	 * scanline. Each complexity unit corresponds to a single multiply-add
	 * operation. Data copy and pointer math operations are not included in
	 * this calculation, it is assumed that they correlate to the multiply-add
	 * operations. Calculation also does not include final rounding, dithering
	 * and clamping operations since they cannot be optimized out anyway.
	 *
	 * Calculation of the CRPosBuf buffer is not included since it cannot be
	 * avoided.
	 *
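	 * This function should be called after the updateFilterStepBuffers()
	 * function.
	 *
	 * @param Steps Filtering steps array.
	 * @param Vars Variables object.
	 * @param UsedFracMap The map of used fractional delay filters.
	 * @param ScanlineCount Scanline count.
	 * @return Per-scanline complexity plus the one-time (filter creation and
	 * filter bank initialization) complexity divided by ScanlineCount.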
	 */

	static int calcComplexity( const CFilterSteps& Steps,
		const CImageResizerVars& Vars, const CBuffer< uint8_t >& UsedFracMap,
		const int ScanlineCount )
	{
		int fcnum; // Filter complexity multiplier numerator.
		int fcdenom; // Filter complexity multiplier denominator.

		if( Vars.packmode != 0 )
		{
			fcnum = 1;
			fcdenom = 1;
		}
		else
		{
			// In interleaved processing mode, filters require 1 less
			// multiplication per 2 multiply-add instructions.

			fcnum = 3;
			fcdenom = 4;
		}

		int s = 0; // Complexity per one scanline.
		int s2 = 0; // Complexity per all scanlines.
		int i;

		for( i = 0; i < Steps.getItemCount(); i++ )
		{
			const CFilterStep& fs = Steps[ i ];

			s2 += 65 * fs.Flt.getCapacity(); // Filter creation complexity.

			if( fs.IsUpsample )
			{
				if( fs.FltOrig.getCapacity() > 0 )
				{
					continue;
				}

				s += ( fs.Flt.getCapacity() *
					( fs.InPrefix + fs.InLen + fs.InSuffix ) +
					fs.SuffixDC.getCapacity() + fs.PrefixDC.getCapacity() ) *
					Vars.ElCount;
			}
			else
			if( fs.ResampleFactor == 0 )
			{
				s += fs.FltBank -> getFilterLen() *
					( fs.FltBank -> getOrder() + Vars.ElCount ) * fs.OutLen;

				s2 += fs.FltBank -> calcInitComplexity( UsedFracMap );
			}
			else
			{
				s += fs.Flt.getCapacity() * Vars.ElCount * fs.OutLen *
					fcnum / fcdenom;
			}
		}

		return( s + s2 / ScanlineCount );
	}

	/**
	 * @brief Thread-isolated data used for scanline processing.
	 *
	 * This structure holds data necessary for image's horizontal or vertical
	 * scanline processing, including scanline processing queue.
	 *
	 * @tparam Tin Source element data type. Intermediate buffers store data
	 * in floating point format.
	 * @tparam Tout Destination element data type. Intermediate buffers store
	 * data in floating point format.
	 */

	template< class Tin, class Tout >
	class CThreadData : public CImageResizerThreadPool :: CWorkload
	{
	public:
		/**
		 * Workload entry point: processes every scanline currently held in
		 * the queue.
		 */

		virtual void process()
		{
			processScanlineQueue();
		}

		/**
		 * Operations that can be performed over the queued scanlines.
		 */

		enum EScanlineOperation
		{
			sopResizeH, ///< Resize horizontal scanline.
				///<
			sopResizeV, ///< Resize vertical scanline.
				///<
			sopDitherAndUnpackH, ///< Dither and unpack horizontal scanline.
				///<
			sopUnpackH ///< Unpack horizontal scanline.
				///<
		};

		/**
		 * Function stores the thread layout and the shared, externally-owned
		 * objects this thread data object will refer to. Only pointers to
		 * "aSteps" and "aVars" are kept.
		 *
		 * @param aThreadIndex Index of this thread data (0-based).
		 * @param aThreadCount Total number of threads used during processing.
		 * @param aSteps Filtering steps.
		 * @param aVars Image resizer variables.
		 */

		void init( const int aThreadIndex, const int aThreadCount,
			const CFilterSteps& aSteps, const CImageResizerVars& aVars )
		{
			Steps = &aSteps;
			Vars = &aVars;
			ThreadIndex = aThreadIndex;
			ThreadCount = aThreadCount;
		}

		/**
		 * Function prepares *this object for a new batch of scanlines:
		 * intermediate buffers are sized according to the variables object,
		 * the operation and its parameters are stored, and the queue is
		 * emptied with enough capacity reserved for this thread's share of
		 * scanlines.
		 *
		 * @param aOp Operation to perform over scanline.
		 * @param TotalLines The total number of scanlines that will be
		 * processed by all threads.
		 * @param aSrcLen Source scanline length in pixels.
		 * @param aSrcIncr Source scanline buffer increment. Ignored in
		 * horizontal scanline processing.
		 * @param aResIncr Resulting scanline buffer increment. Ignored in
		 * horizontal scanline processing.
		 */

		void initScanlineQueue( const EScanlineOperation aOp,
			const int TotalLines, const int aSrcLen, const int aSrcIncr = 0,
			const int aResIncr = 0 )
		{
			// Both intermediate buffers live in a single allocation: buffer
			// 0 starts at the beginning, buffer 1 starts BufLen[ 0 ] elements
			// later. Each pointer is additionally advanced by the buffer's
			// own offset.

			const int Len0 = Vars -> BufLen[ 0 ];
			const int TotalLen = Len0 + Vars -> BufLen[ 1 ];

			if( TotalLen > Bufs.getCapacity() )
			{
				Bufs.alloc( TotalLen, fpclass :: fpalign );
			}

			BufPtrs[ 0 ] = Bufs + Vars -> BufOffs[ 0 ];
			BufPtrs[ 1 ] = Bufs + Len0 + Vars -> BufOffs[ 1 ];

			// The temporary filter buffer must hold the longest filter of
			// any resizing step (a step with a zero resampling factor).

			int MaxFilterLen = 0;
			const int StepCount = Steps -> getItemCount();

			for( int n = 0; n < StepCount; n++ )
			{
				const CFilterStep& Step = (*Steps)[ n ];

				if( Step.ResampleFactor != 0 )
				{
					continue;
				}

				const int FilterLen = Step.FltBank -> getFilterLen();

				if( FilterLen > MaxFilterLen )
				{
					MaxFilterLen = FilterLen;
				}
			}

			TmpFltBuf.alloc( MaxFilterLen, fpclass :: fpalign );

			ScanlineOp = aOp;
			SrcLen = aSrcLen;
			SrcIncr = aSrcIncr;
			ResIncr = aResIncr;

			// Reserve the per-thread share of scanlines, rounded up.

			QueueLen = 0;
			Queue.increaseCapacity(( TotalLines + ThreadCount - 1 ) /
				ThreadCount, false );
		}

		/**
		 * Function appends a scanline to the queue. No capacity check is
		 * performed: initScanlineQueue() should be called beforehand, and
		 * the number of calls to this function should not exceed the
		 * TotalLines spread over all threads.
		 *
		 * @param SrcBuf Source scanline buffer.
		 * @param ResBuf Resulting scanline buffer.
		 */

		void addScanlineToQueue( void* const SrcBuf, void* const ResBuf )
		{
			CQueueItem& Item = Queue[ QueueLen ];
			Item.SrcBuf = SrcBuf;
			Item.ResBuf = ResBuf;
			QueueLen++;
		}

		/**
		 * Function applies the operation selected via initScanlineQueue() to
		 * all queued scanlines, in the order they were added.
		 */

		void processScanlineQueue()
		{
			int i;

			switch( ScanlineOp )
			{
				case sopResizeH:
				{
					for( i = 0; i < QueueLen; i++ )
					{
						const CQueueItem& Item = Queue[ i ];

						resizeScanlineH( (Tin*) Item.SrcBuf,
							(fptype*) Item.ResBuf );
					}

					break;
				}

				case sopResizeV:
				{
					for( i = 0; i < QueueLen; i++ )
					{
						const CQueueItem& Item = Queue[ i ];

						resizeScanlineV( (fptype*) Item.SrcBuf,
							(fptype*) Item.ResBuf );
					}

					break;
				}

				case sopDitherAndUnpackH:
				case sopUnpackH:
				{
					// Both operations share the same pipeline: optional sRGB
					// gamma application, dithering (for the "dither"
					// operation only), then unpacking.

					const bool DoGamma = Vars -> UseSRGBGamma;
					const bool DoDither =
						( ScanlineOp == sopDitherAndUnpackH );

					for( i = 0; i < QueueLen; i++ )
					{
						fptype* const Src = (fptype*) Queue[ i ].SrcBuf;

						if( DoGamma )
						{
							CFilterStep :: applySRGBGamma( Src, SrcLen,
								*Vars );
						}

						if( DoDither )
						{
							Ditherer.dither( Src );
						}

						CFilterStep :: unpackScanline( Src,
							(Tout*) Queue[ i ].ResBuf, SrcLen, *Vars );
					}

					break;
				}
			}
		}

		/**
		 * Function returns a reference to the ditherer object owned by *this
		 * thread data object.
		 */

		CDitherer& getDitherer()
		{
			return( Ditherer );
		}

	private:
		int ThreadIndex; ///< Index of this thread data object (0-based).
			///<
		int ThreadCount; ///< Total number of threads.
			///<
		const CFilterSteps* Steps; ///< Filtering steps (not owned).
			///<
		const CImageResizerVars* Vars; ///< Image resizer variables (not
			///< owned).
			///<
		CBuffer< fptype > Bufs; ///< Storage shared by both flip-flop
			///< intermediate buffers.
			///<
		fptype* BufPtrs[ 3 ]; ///< Buffer pointers indexed by filtering
			///< step's InBuf and OutBuf values: 0 and 1 point into Bufs, 2
			///< is set to the resulting scanline buffer.
			///<
		CBuffer< fptype > TmpFltBuf; ///< Temporary buffer passed to the
			///< doResize() function, aligned by fpclass :: fpalign.
			///<
		EScanlineOperation ScanlineOp; ///< Operation to perform over
			///< scanline.
			///<
		int SrcLen; ///< Source scanline length in the last queue.
			///<
		int SrcIncr; ///< Source scanline buffer increment in the last queue.
			///<
		int ResIncr; ///< Resulting scanline buffer increment in the last
			///< queue.
			///<
		CDitherer Ditherer; ///< Ditherer object to use.
			///<

		/**
		 * @brief Scanline processing queue item.
		 *
		 * A pair of untyped source and destination scanline buffers.
		 */

		struct CQueueItem
		{
			void* SrcBuf; ///< Source scanline buffer, will be typecast to
				///< Tin* or fptype*.
				///<
			void* ResBuf; ///< Resulting scanline buffer, will be typecast
				///< to Tout* or fptype*.
				///<
		};

		CBuffer< CQueueItem > Queue; ///< Scanline processing queue.
			///<
		int QueueLen; ///< Number of items currently in the queue.
			///<

		/**
		 * Function executes a single filtering step over the intermediate
		 * buffers selected by the step's InBuf and OutBuf indices. The input
		 * buffer is prepared first; then a step with a zero resampling
		 * factor performs resizing, an upsampling step performs upsampling,
		 * and any other step performs filtering.
		 *
		 * @param fs Filtering step to execute.
		 * @param DstIncr Destination increment passed to the doFilter() and
		 * doResize() functions.
		 */

		void runFilterStep( const CFilterStep& fs, const int DstIncr )
		{
			fptype* const ip = BufPtrs[ fs.InBuf ];
			fptype* const op = BufPtrs[ fs.OutBuf ];

			fs.prepareInBuf( ip );

			if( fs.ResampleFactor == 0 )
			{
				fs.doResize( ip, op, DstIncr, TmpFltBuf );
			}
			else
			if( fs.IsUpsample )
			{
				fs.doUpsample( ip, op );
			}
			else
			{
				fs.doFilter( ip, op, DstIncr );
			}
		}

		/**
		 * Function resizes a single horizontal scanline. The source is
		 * passed through the first step's packScanline() function into the
		 * first intermediate buffer, then all steps are executed in order.
		 *
		 * @param SrcBuf Source horizontal scanline buffer.
		 * @param ResBuf Resulting scanline buffer.
		 */

		void resizeScanlineH( const Tin* const SrcBuf, fptype* const ResBuf )
		{
			(*Steps)[ 0 ].packScanline( SrcBuf, BufPtrs[ 0 ], SrcLen );
			BufPtrs[ 2 ] = ResBuf;

			// In horizontal processing every step, including the final one,
			// uses the same destination increment.

			const int DstIncr =
				( Vars -> packmode == 0 ? Vars -> ElCount : 1 );

			const int StepCount = Steps -> getItemCount();

			for( int n = 0; n < StepCount; n++ )
			{
				runFilterStep( (*Steps)[ n ], DstIncr );
			}
		}

		/**
		 * Function resizes a single vertical scanline. The source is passed
		 * through the first step's convertVtoH() function (using the stored
		 * source increment) into the first intermediate buffer, then all
		 * steps are executed in order.
		 *
		 * @param SrcBuf Source vertical scanline buffer.
		 * @param ResBuf Resulting scanline buffer.
		 */

		void resizeScanlineV( const fptype* const SrcBuf,
			fptype* const ResBuf )
		{
			(*Steps)[ 0 ].convertVtoH( SrcBuf, BufPtrs[ 0 ], SrcLen,
				SrcIncr );

			BufPtrs[ 2 ] = ResBuf;

			// Steps writing to an intermediate buffer use the regular
			// increment; the step writing to the resulting buffer (output
			// buffer index 2) uses the stored resulting increment.

			const int BufIncr =
				( Vars -> packmode == 0 ? Vars -> ElCount : 1 );

			const int StepCount = Steps -> getItemCount();

			for( int n = 0; n < StepCount; n++ )
			{
				const CFilterStep& fs = (*Steps)[ n ];

				runFilterStep( fs, ( fs.OutBuf == 2 ? ResIncr : BufIncr ));
			}
		}
	};
};

#undef AVIR_PI
#undef AVIR_PId2

} // namespace avir

#endif // AVIR_CIMAGERESIZER_INCLUDED