abadams/gist:aa785b8d3a1202c59327387a96387a8b

## gistfile1.txt
template<int _stride>
struct alignas(64) IIR2Pairwise {
    constexpr static int stride = _stride;

    constexpr static int rate = 2;
    constexpr static int fmas_per_output = 2;  // Assume it can be vectorized perfectly

    float_vec_t<stride> state[2] = {};
    const float c[5] = {};

    void run(const float_vec *__restrict__ input, float_vec *__restrict__ output) {
        const auto *in = (const float_vec_t<stride> *)input;
        auto *out = (float_vec_t<stride> *)output;

        /* A second-order IIR with input x and output y, can be represented like
           this (I believe this is known as controller-canonical form):

          s[i] = [0 1] s[i-1] + [0] x[i]
                 [b a]          [1]

          y[i] = [0 1] s[i]

          s[i] is a 2-vector representing the previous two outputs (oldest
          first). In the definition of s[i], the second row is just the
          definition of the IIR (using a and b for alpha and beta), and the
          first row just shuffles the old most-recent output to be the new
          second-most-recent output.

          We can also take two steps at a time. You can just subtitute out s[i -
          1] above and do the matrix math, but I find it easier to think about
          starting from the recurrence;

          y[i] = a y[i-1] + b y[i-2] + x[i]

          The next output is given by:

          y[i+1] = a y[i] + b y[i-1] + x[i+1]

          Eliminating y[i] in the second line gives

          y[i+1] = a (a y[i-1] + b y[i-2] + x[i]) + b y[i-1] + x[i+1]

          Expanding and collecting like terms:

          y[i+1] = (a^2 + b) y[i-1] + ab y[i-2] + a x[i] + x[i+1]

          Represented as before using a state which is the previous two outputs
          (oldest first) this is equivalent to:

          s[i] = [b  a    ] s[i-2] + [1 0] x[i]
                 [ab a^2+b]        + [a 1] x[i+1]

          y[i]   = [1 0] s[i]
          y[i+1]   [0 1]

          This is more stable to accumulated rounding errors than the original,
          because y[i+1] looks back two outputs instead of just one, so the
          critical path length is shorter, and there have been fewer
          opportunities for rounding. But we can make it even more stable with a
          change of basis.

          Define our new state s' as follows:

          s' = [ 1 0] s
               [-1 1]

          Inverting, we get

          s = [1 0] s'
              [1 1]

          So now instead of being the last two outputs, s' represents two
          outputs ago, and the difference of the previous two outputs. For
          low-pass filters, these differences tend to be very very small, so it
          really helps precision to track that tiny number separately in its own
          floating point variable.

          If we replace s with s' in the equations above and multiply out the
          matrices, we get:

          s'[i] = [b        a      ] s'[i-2] + [1   0] x[i]
                  [ab+a^2-a a^2+b-a]           [a-1 1] x[i+1]

          y[i]   = [1 0] s'[i]
          y[i+1]   [1 1]

          This is what is implemented below. c[0] through c[4] are the
          non-trivial coefficients in the matrix above in this order:

          s'[i] = [c[0] c[1]] s'[i-2] + [1   0] x[i]
                  [c[2] c[3]]           [c[4] 1] x[i+1]


          Note that we're now doing 5 fmas and an add to produce two outputs, as
          opposed to the serial method above, which would require 4 fmas to
          produce two outputs. However, in practice, the performance is not
          measurably worse, because eliminating the dependence of y[i+1] on y[i]
          eases up instruction latency issues, which are the limiting factor in
          the serial version.
        */

        for (int i = 0; i < rate * vec_lanes / stride; i += 2) {
            auto v0 = in[i], v1 = in[i + 1];
            auto s0 = state[0], s1 = state[1];

            auto new_s0 = v0;
            new_s0 += c[0] * s0;
            new_s0 += c[1] * s1;
            auto new_s1 = v1;
            new_s1 += c[3] * s1;
            new_s1 += c[2] * s0;
            new_s1 += c[4] * v0;
            s0 = new_s0;
            s1 = new_s1;

            out[i] = s0;
            out[i + 1] = s0 + s1;

            state[0] = s0;
            state[1] = s1;
        }
    }

    IIR2Pairwise(double alpha, double beta)
        : c{float(alpha + beta),
            float(alpha),
            float(alpha * (beta + alpha - 1)),
            float(alpha * (alpha - 1) + beta),
            float(alpha - 1)} {
    }
};
	template<int _stride>
	struct alignas(64) IIR2Pairwise {
	constexpr static int stride = _stride;

	constexpr static int rate = 2;
	constexpr static int fmas_per_output = 2; // Assume it can be vectorized perfectly

	float_vec_t<stride> state[2] = {};
	const float c[5] = {};

	void run(const float_vec __restrict__ input, float_vec __restrict__ output) {
	const auto in = (const float_vec_t<stride> )input;
	auto out = (float_vec_t<stride> )output;

	/* A second-order IIR with input x and output y, can be represented like
	this (I believe this is known as controller-canonical form):

	s[i] = [0 1] s[i-1] + [0] x[i]
	[b a] [1]

	y[i] = [0 1] s[i]

	s[i] is a 2-vector representing the previous two outputs (oldest
	first). In the definition of s[i], the second row is just the
	definition of the IIR (using a and b for alpha and beta), and the
	first row just shuffles the old most-recent output to be the new
	second-most-recent output.

	We can also take two steps at a time. You can just subtitute out s[i -
	1] above and do the matrix math, but I find it easier to think about
	starting from the recurrence;

	y[i] = a y[i-1] + b y[i-2] + x[i]

	The next output is given by:

	y[i+1] = a y[i] + b y[i-1] + x[i+1]

	Eliminating y[i] in the second line gives

	y[i+1] = a (a y[i-1] + b y[i-2] + x[i]) + b y[i-1] + x[i+1]

	Expanding and collecting like terms:

	y[i+1] = (a^2 + b) y[i-1] + ab y[i-2] + a x[i] + x[i+1]

	Represented as before using a state which is the previous two outputs
	(oldest first) this is equivalent to:

	s[i] = [b a ] s[i-2] + [1 0] x[i]
	[ab a^2+b] + [a 1] x[i+1]

	y[i] = [1 0] s[i]
	y[i+1] [0 1]

	This is more stable to accumulated rounding errors than the original,
	because y[i+1] looks back two outputs instead of just one, so the
	critical path length is shorter, and there have been fewer
	opportunities for rounding. But we can make it even more stable with a
	change of basis.

	Define our new state s' as follows:

	s' = [ 1 0] s
	[-1 1]

	Inverting, we get

	s = [1 0] s'
	[1 1]

	So now instead of being the last two outputs, s' represents two
	outputs ago, and the difference of the previous two outputs. For
	low-pass filters, these differences tend to be very very small, so it
	really helps precision to track that tiny number separately in its own
	floating point variable.

	If we replace s with s' in the equations above and multiply out the
	matrices, we get:

	s'[i] = [b a ] s'[i-2] + [1 0] x[i]
	[ab+a^2-a a^2+b-a] [a-1 1] x[i+1]

	y[i] = [1 0] s'[i]
	y[i+1] [1 1]

	This is what is implemented below. c[0] through c[4] are the
	non-trivial coefficients in the matrix above in this order:

	s'[i] = [c[0] c[1]] s'[i-2] + [1 0] x[i]
	[c[2] c[3]] [c[4] 1] x[i+1]


	Note that we're now doing 5 fmas and an add to produce two outputs, as
	opposed to the serial method above, which would require 4 fmas to
	produce two outputs. However, in practice, the performance is not
	measurably worse, because eliminating the dependence of y[i+1] on y[i]
	eases up instruction latency issues, which are the limiting factor in
	the serial version.
	*/

	for (int i = 0; i < rate * vec_lanes / stride; i += 2) {
	auto v0 = in[i], v1 = in[i + 1];
	auto s0 = state[0], s1 = state[1];

	auto new_s0 = v0;
	new_s0 += c[0] * s0;
	new_s0 += c[1] * s1;
	auto new_s1 = v1;
	new_s1 += c[3] * s1;
	new_s1 += c[2] * s0;
	new_s1 += c[4] * v0;
	s0 = new_s0;
	s1 = new_s1;

	out[i] = s0;
	out[i + 1] = s0 + s1;

	state[0] = s0;
	state[1] = s1;
	}
	}

	IIR2Pairwise(double alpha, double beta)
	: c{float(alpha + beta),
	float(alpha),
	float(alpha * (beta + alpha - 1)),
	float(alpha * (alpha - 1) + beta),
	float(alpha - 1)} {
	}
	};