chikuzen/rgba_to_grey.cpp

## rgba_to_grey.cpp
#include <cstdint>
#include <algorithm>
#include <iostream>
#include <tmmintrin.h>

// pattern0: (R + G + B) / 3
// pattern1: max(R, G, B)
// pattern2: (R + 2*G + B) / 4
// pattern3: R*0.299 + G*0.587 + B*0.114


// One 4-byte pixel. The fields are declared in the order b, g, r, a, so a
// byte buffer viewed as RGBA[] holds b in byte 0 and a in byte 3 of each pixel.
struct RGBA {
    uint8_t b, g, r, a;
};


// Instruction-set level passed as a template argument to the SIMD helpers.
// The enumerators are ordered so helpers can test `ARCH < HAS_SSSE3`.
enum CPU: int {
    HAS_SSE2 = 0,
    HAS_SSSE3,
};


// Loads 8 consecutive 4-byte pixels (32 bytes, unaligned) from src and
// de-interleaves them into planar form:
//   rg = [byte 0 of pixels 0..7 | byte 1 of pixels 0..7]
//   ba = [byte 2 of pixels 0..7 | byte 3 of pixels 0..7]
// The letters r/g/b/a in the output names simply denote pixel bytes 0/1/2/3.
template <CPU ARCH>
static inline void load_and_shuffle(const uint8_t* src, __m128i& rg, __m128i& ba)
{
    const __m128i* in = reinterpret_cast<const __m128i*>(src);
    const __m128i px0123 = _mm_loadu_si128(in);
    const __m128i px4567 = _mm_loadu_si128(in + 1);

    if (ARCH >= HAS_SSSE3) {
        // One byte shuffle per register groups each byte index into a dword.
        const __m128i planar = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
        const __m128i lo = _mm_shuffle_epi8(px0123, planar);    // 0000 1111 2222 3333
        const __m128i hi = _mm_shuffle_epi8(px4567, planar);    // 0000 1111 2222 3333
        rg = _mm_unpacklo_epi32(lo, hi);
        ba = _mm_unpackhi_epi32(lo, hi);
        return;
    }

    // SSE2 fallback: three rounds of byte interleaving.
    const __m128i mix_lo = _mm_unpacklo_epi8(px0123, px4567);   // pixels 0,4 then 1,5
    const __m128i mix_hi = _mm_unpackhi_epi8(px0123, px4567);   // pixels 2,6 then 3,7
    const __m128i even = _mm_unpacklo_epi8(mix_lo, mix_hi);     // pixels 0,2,4,6 per byte index
    const __m128i odd = _mm_unpackhi_epi8(mix_lo, mix_hi);      // pixels 1,3,5,7 per byte index
    rg = _mm_unpacklo_epi8(even, odd);
    ba = _mm_unpackhi_epi8(even, odd);
}


// Horizontal pairwise add of 32-bit lanes:
//   result = [a0+a1, a2+a3, b0+b1, b2+b3]
// Uses the native instruction on SSSE3, two float-domain shuffles otherwise.
template <CPU ARCH>
static inline __m128i hadd_epi32(const __m128i& a, const __m128i& b)
{
    if (ARCH >= HAS_SSSE3) {
        return _mm_hadd_epi32(a, b);
    }

    const __m128 fa = _mm_castsi128_ps(a);
    const __m128 fb = _mm_castsi128_ps(b);
    const __m128i even = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(2, 0, 2, 0)));  // a0 a2 b0 b2
    const __m128i odd = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(3, 1, 3, 1)));   // a1 a3 b1 b3
    return _mm_add_epi32(even, odd);
}


// srcとdstのstrideは等しいものとする
void pattern0_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    constexpr auto coef = static_cast<int>(32768.0 / 3 + 0.5); // 10923

    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            int v = ((s[x].r + s[x].g + s[x].b) * coef + 16384) >> 15;
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD pattern0: grey = (R + G + B) / 3, rounded, 8 pixels per iteration.
// For simplicity the stride is assumed to always be a multiple of 32, so the
// last 32-byte step of a row may touch padding but never leaves the row.
template <CPU ARCH>
void pattern0_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    constexpr int16_t one_third = static_cast<int16_t>(32768.0 / 3 + 0.5);
    // madd on (sum, 16384) pairs yields sum * one_third + 16384 * 1.
    const __m128i mul = _mm_setr_epi16(one_third, 1, one_third, 1, one_third, 1, one_third, 1);
    const __m128i half = _mm_set1_epi16(16384);
    const __m128i zero = _mm_setzero_si128();
    const size_t row_bytes = width * 4;

    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
        for (size_t off = 0; off < row_bytes; off += 32) {
            __m128i rg, ba;
            load_and_shuffle<ARCH>(src + off, rg, ba);

            // Widen the three colour planes to 16 bit and add them.
            const __m128i c0 = _mm_unpacklo_epi8(rg, zero);
            const __m128i c1 = _mm_unpackhi_epi8(rg, zero);
            const __m128i c2 = _mm_unpacklo_epi8(ba, zero);
            const __m128i sum = _mm_add_epi16(_mm_add_epi16(c0, c1), c2);

            // Q15 multiply with rounding, then drop the fraction.
            __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(sum, half), mul);
            __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(sum, half), mul);
            lo = _mm_srli_epi32(lo, 15);
            hi = _mm_srli_epi32(hi, 15);

            // Narrow back to bytes; the low 8 bytes hold the 8 grey values.
            __m128i grey = _mm_packs_epi32(lo, hi);
            grey = _mm_packus_epi16(grey, grey);

            // Re-interleave as (grey, grey, grey, alpha) per pixel.
            const __m128i gg = _mm_unpacklo_epi8(grey, grey);
            const __m128i ga = _mm_unpacklo_epi8(grey, _mm_srli_si128(ba, 8));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), _mm_unpacklo_epi16(gg, ga));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off + 16), _mm_unpackhi_epi16(gg, ga));
        }
    }
}


void pattern1_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            int v = std::max(std::max(s[x].r, s[x].g), s[x].b);
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD pattern1: grey = max of the three colour bytes, 8 pixels per
// iteration. Rows are processed in 32-byte steps, so the row storage must be
// padded to a multiple of 32 bytes.
template <CPU ARCH>
void pattern1_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    const size_t row_bytes = width * 4;

    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
        for (size_t off = 0; off < row_bytes; off += 32) {
            __m128i rg, ba;
            load_and_shuffle<ARCH>(src + off, rg, ba);

            const __m128i c1 = _mm_srli_si128(rg, 8);       // byte-1 plane in the low half
            const __m128i alpha = _mm_srli_si128(ba, 8);    // alpha plane in the low half
            // Only the low 8 bytes are meaningful from here on.
            const __m128i peak = _mm_max_epu8(_mm_max_epu8(rg, c1), ba);

            // Re-interleave as (grey, grey, grey, alpha) per pixel.
            const __m128i gg = _mm_unpacklo_epi8(peak, peak);
            const __m128i ga = _mm_unpacklo_epi8(peak, alpha);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), _mm_unpacklo_epi16(gg, ga));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off + 16), _mm_unpackhi_epi16(gg, ga));
        }
    }
}


void pattern2_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            int v = (s[x].r + s[x].g + s[x].g + s[x].b + 2) / 4;
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD pattern2: grey = (c0 + 2*G + c2 + 2) >> 2, 8 pixels per iteration,
// where c0/c2 are the two outer colour bytes of each pixel.
//
// The result is bit-exact with pattern2_c:
//   floor((c0 + c2) / 2) = avg(c0, c2) - ((c0 ^ c2) & 1)
//   grey                 = avg(floor((c0 + c2) / 2), G)
// A plain cascade of rounding averages would round twice and can be off by
// one (e.g. bytes (1, 0, 0) would give 1 instead of 0).
//
// Rows are processed in 32-byte steps, so the row storage must be padded to
// a multiple of 32 bytes.
template <CPU ARCH>
void pattern2_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    const __m128i one = _mm_set1_epi8(1);
    const size_t row_bytes = width * 4;

    for (size_t y = 0; y < height; ++y) {
        for (size_t x = 0; x < row_bytes; x += 32) {
            __m128i rg, ba;
            load_and_shuffle<ARCH>(src + x, rg, ba);

            // Only the low 8 bytes of each register are meaningful below.
            const __m128i g = _mm_srli_si128(rg, 8);
            const __m128i alpha = _mm_srli_si128(ba, 8);

            // _mm_avg_epu8 rounds up; subtract the lost bit to get the floor.
            const __m128i odd = _mm_and_si128(_mm_xor_si128(rg, ba), one);
            const __m128i outer = _mm_sub_epi8(_mm_avg_epu8(rg, ba), odd);
            const __m128i grey = _mm_avg_epu8(outer, g);

            // Re-interleave as (grey, grey, grey, alpha) per pixel.
            rg = _mm_unpacklo_epi8(grey, grey);
            ba = _mm_unpacklo_epi8(grey, alpha);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), _mm_unpacklo_epi16(rg, ba));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x + 16), _mm_unpackhi_epi16(rg, ba));
        }
        src += stride;
        dst += stride;
    }
}


void pattern3_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    constexpr auto coef_r = static_cast<int>(0.299 * 32768 + 0.5);
    constexpr auto coef_g = static_cast<int>(0.587 * 32768 + 0.5);
    constexpr auto coef_b = static_cast<int>(0.114 * 32768 + 0.5);

    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            int v = (s[x].r * coef_r + s[x].g * coef_g + s[x].b * coef_b + 16384) >> 15;
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD pattern3: grey = R*0.299 + G*0.587 + B*0.114 in Q15 fixed point,
// 4 pixels (16 bytes) per iteration. Alpha is copied through.
// Uses the same weights and the same rounding term (16384 = 0.5 in Q15) as
// pattern3_c.
template <CPU ARCH>
void pattern3_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    constexpr auto cr = static_cast<int16_t>(0.299 * 32768 + 0.5);
    constexpr auto cg = static_cast<int16_t>(0.587 * 32768 + 0.5);
    constexpr auto cb = static_cast<int16_t>(0.114 * 32768 + 0.5);
    // Pixel bytes are b, g, r, a; the alpha weight is 0.
    const __m128i coef = _mm_setr_epi16(cb, cg, cr, 0, cb, cg, cr, 0);
    const __m128i zero = _mm_setzero_si128();
    const __m128i round = _mm_set1_epi32(16384);
    const __m128i alpha_mask = _mm_set1_epi32(0xFF000000);

    for (size_t y = 0; y < height; ++y) {
        for (size_t x = 0; x < width * 4; x += 16) {
            auto s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + x));

            // Widen to 16 bit; madd gives (b*cb + g*cg, r*cr + 0) per pixel.
            auto rgba0 = _mm_madd_epi16(_mm_unpacklo_epi8(s, zero), coef);
            auto rgba1 = _mm_madd_epi16(_mm_unpackhi_epi8(s, zero), coef);

            // Add the two halves of each pixel, round, drop the fraction.
            rgba0 = hadd_epi32<ARCH>(rgba0, rgba1);
            rgba0 = _mm_add_epi32(rgba0, round);
            rgba0 = _mm_srli_epi32(rgba0, 15);

            // Replicate the grey byte into bytes 0..2, then merge source alpha.
            rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
            rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
            rgba0 = _mm_or_si128(rgba0, _mm_and_si128(s, alpha_mask));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgba0);
        }
        src += stride;
        dst += stride;
    }
}


void pattern0f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            float v = (s[x].r + s[x].g + s[x].b) * 0.33333f + 0.5f;
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD float variant of pattern0: grey = (c0 + c1 + c2) * 0.33333f converted
// to integer with _mm_cvtps_epi32, 4 pixels (16 bytes) per iteration.
// Alpha (byte 3 of each pixel) is copied through.
void pattern0f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    const __m128i low_byte = _mm_set1_epi32(0x000000FF);
    const size_t row_bytes = width * 4;

    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
        for (size_t off = 0; off < row_bytes; off += 16) {
            const __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));

            // Isolate the three colour bytes of every pixel as 32-bit ints.
            const __m128i c0 = _mm_and_si128(low_byte, px);
            const __m128i c1 = _mm_and_si128(low_byte, _mm_srli_epi32(px, 8));
            const __m128i c2 = _mm_and_si128(low_byte, _mm_srli_epi32(px, 16));
            const __m128i sum = _mm_add_epi32(_mm_add_epi32(c0, c1), c2);

            __m128i grey = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(sum), _mm_set1_ps(0.33333f)));

            // Replicate the grey byte into bytes 0..2, then merge source alpha.
            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
            grey = _mm_or_si128(grey, _mm_and_si128(px, _mm_set1_epi32(0xFF000000)));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), grey);
        }
    }
}


void pattern3f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    for (size_t y = 0; y < height; ++y) {
        auto s = reinterpret_cast<const RGBA*>(src);
        auto d = reinterpret_cast<RGBA*>(dst);
        for (size_t x = 0; x < width; ++x) {
            auto v = (s[x].r * 0.299f + s[x].g * 0.587f + s[x].b * 0.114f + 0.5f);
            d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
            d[x].a = s[x].a;
        }
        src += stride;
        dst += stride;
    }
}


// SIMD float variant of pattern3, 4 pixels (16 bytes) per iteration:
// grey = byte0*0.114f + byte1*0.587f + byte2*0.299f, converted to integer
// with _mm_cvtps_epi32. Alpha (byte 3 of each pixel) is copied through.
void pattern3f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
{
    const __m128i low_byte = _mm_set1_epi32(0x000000FF);
    const size_t row_bytes = width * 4;

    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
        for (size_t off = 0; off < row_bytes; off += 16) {
            const __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));

            // Isolate each colour byte as a 32-bit int, convert, and weight it.
            const __m128 c0 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, px));
            const __m128 c1 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, _mm_srli_epi32(px, 8)));
            const __m128 c2 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, _mm_srli_epi32(px, 16)));
            const __m128 w0 = _mm_mul_ps(c0, _mm_set1_ps(0.114f));
            const __m128 w1 = _mm_mul_ps(c1, _mm_set1_ps(0.587f));
            const __m128 w2 = _mm_mul_ps(c2, _mm_set1_ps(0.299f));

            // Summation order is (w0 + w1) + w2.
            __m128i grey = _mm_cvtps_epi32(_mm_add_ps(_mm_add_ps(w0, w1), w2));

            // Replicate the grey byte into bytes 0..2, then merge source alpha.
            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
            grey = _mm_or_si128(grey, _mm_and_si128(px, _mm_set1_epi32(0xFF000000)));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), grey);
        }
    }
}


#if defined(BUILD_AVS_PLUGIN)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <avisynth.h>


// AviSynth filter that converts every frame of the child clip to grey with
// one of the pattern* converters; `mode` picks the converter (see GetFrame).
class MyGrey: public GenericVideoFilter {
    const int mode;
public:
    // Stores the selected mode and logs it to stderr.
    MyGrey(PClip c, int m) : GenericVideoFilter(c), mode(m)
    {
        std::cerr << "mode: " << mode << std::endl;
    }
    ~MyGrey() {}

    // Fetches frame n from the child, allocates an output frame and runs the
    // converter selected by `mode`. Any mode outside 0..10 uses pattern3f_simd.
    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
    {
        constexpr CPU ARCH = HAS_SSSE3;

        auto src = child->GetFrame(n, env);
        auto dst = env->NewVideoFrame(vi);

        auto srcp = src->GetReadPtr();
        auto dstp = dst->GetWritePtr();
        // NOTE(review): only the destination pitch is queried; the converters
        // step src and dst by the same stride, so the source pitch is assumed
        // to be equal - confirm for the clips this is used with.
        size_t stride = dst->GetPitch();

        switch (mode) {
        case 0:
            pattern0_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 1:
            pattern0_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 2:
            pattern1_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 3:
            pattern1_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 4:
            pattern2_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 5:
            pattern2_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 6:
            pattern3_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 7:
            pattern3_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 8:
            pattern0f_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 9:
            // The float SIMD converters are plain functions, not templates.
            pattern0f_simd(srcp, dstp, vi.width, vi.height, stride);
            break;
        case 10:
            pattern3f_c(srcp, dstp, vi.width, vi.height, stride);
            break;
        default:
            pattern3f_simd(srcp, dstp, vi.width, vi.height, stride);
        }
        return dst;
    }
};


// Factory callback registered with AddFunction: validates the input clip and
// builds a MyGrey filter. args[0] is the clip, args[1] the optional mode
// (default 0), clamped to the range 0..11.
static AVSValue __cdecl create(AVSValue args, void*, IScriptEnvironment* env)
{
    PClip clip = args[0].AsClip();
    const VideoInfo& info = clip->GetVideoInfo();

    if (!info.IsRGB32()) {
        env->ThrowError("not RGB32");
    }

    // Row size rounded up to 16 bytes; the SIMD paths need it to be mod 32.
    const int pitch = (info.width * 4 + 15) & ~15;
    if ((pitch % 32) != 0) {
        env->ThrowError("pitch must be mod32");
    }

    const int requested = args[1].AsInt(0);
    const int mode = std::min(std::max(requested, 0), 11);
    return new MyGrey(clip, mode);
}


// Global linkage pointer; filled in from `vectors` when the plugin is
// initialised.
const AVS_Linkage* AVS_linkage = nullptr;

// Exported plugin entry point: stores the linkage table and registers the
// script function "MyGrey" with create() as its factory. The parameter
// string "c[mode]i" presumably means a clip plus an optional int named
// `mode` - confirm against the AviSynth parameter-string rules.
// Always returns nullptr.
extern "C" __declspec(dllexport) const char* __stdcall
AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("MyGrey", "c[mode]i", create, nullptr);
    return nullptr;
}
#else

#include <random>
#include <chrono>
#include <vector>
#include <utility>
#include <stdexcept>

// Owning buffer for one frame of 4-byte pixels, used by the benchmark.
// Rows are padded so the stride is a multiple of 32 bytes, and the buffer is
// allocated with 16-byte alignment via _mm_malloc.
// The class is move-only: copying would make two objects free one buffer.
class RGBAFrame {
    void* buff;
    const size_t width;
    const size_t height;
    const size_t stride;
public:
    // Allocates stride * h bytes; throws std::runtime_error on failure.
    RGBAFrame(size_t w, size_t h)
        : buff(nullptr), width(w), height(h), stride((w * 4 + 31) & ~static_cast<size_t>(31))
    {
        buff = _mm_malloc(stride * height, 16);
        if (!buff) {
            throw std::runtime_error("failed to allocate memory");
        }
    }
    RGBAFrame(const RGBAFrame&) = delete;
    RGBAFrame& operator=(const RGBAFrame&) = delete;
    // Takes over the buffer; the moved-from frame is left with a null pointer.
    RGBAFrame(RGBAFrame&& other) noexcept
        : buff(other.buff), width(other.width), height(other.height), stride(other.stride)
    {
        other.buff = nullptr;
    }
    ~RGBAFrame()
    {
        if (buff) {
            _mm_free(buff);
            buff = nullptr;
        }
    }
    // Start of the pixel data (null only for a moved-from frame).
    uint8_t* getPtr() const { return static_cast<uint8_t*>(buff); }
    // Distance in bytes between the starts of consecutive rows.
    size_t getStride() const { return stride; }
    // Fills the whole buffer, padding included, with random 32-bit values.
    void fill_random()
    {
        std::random_device s;
        std::mt19937 gen(s());
        std::uniform_int_distribution<> dist(INT32_MIN, INT32_MAX);
        size_t size = stride * height / sizeof(int32_t);
        auto orig = static_cast<int32_t*>(buff);

        for (size_t i = 0; i < size; ++i) {
            orig[i] = dist(gen);
        }
    }
};

int main(void)
{
    using namespace std::chrono;
    using std::make_pair;

    constexpr CPU ARCH = HAS_SSSE3;
    constexpr int num = 100;
    const size_t w = 1920, h = 1080;

    auto src = RGBAFrame(w, h);
    auto dst = RGBAFrame(w, h);
    src.fill_random();

    uint8_t *s = src.getPtr(), *d = dst.getPtr();
    auto st = src.getStride();

    std::vector<std::pair<const char*, duration<int64_t, std::nano>>> duration;
    duration.reserve(12);

    auto now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern0_c(s, d, w, h, st);
    duration.emplace_back(make_pair("0-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern0_simd<ARCH>(s, d, w, h, st);
    duration.emplace_back(make_pair("0-simd", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern1_c(s, d, w, h, st);
    duration.emplace_back(make_pair("1-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern1_simd<ARCH>(s, d, w, h, st);
    duration.emplace_back(make_pair("1-simd", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern2_c(s, d, w, h, st);
    duration.emplace_back(make_pair("2-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern2_simd<ARCH>(s, d, w, h, st);
    duration.emplace_back(make_pair("2-simd", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern3_c(s, d, w, h, st);
    duration.emplace_back(make_pair("3-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern3_simd<ARCH>(s, d, w, h, st);
    duration.emplace_back(make_pair("3-simd", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern0f_c(s, d, w, h, st);
    duration.emplace_back(make_pair("0f-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern0f_simd(s, d, w, h, st);
    duration.emplace_back(make_pair("0f-simd", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern3f_c(s, d, w, h, st);
    duration.emplace_back(make_pair("3f-c", high_resolution_clock::now() - now));

    now = high_resolution_clock::now();
    for (int i = 0; i < num; ++i) pattern3f_simd(s, d, w, h, st);
    duration.emplace_back(make_pair("3f-simd", high_resolution_clock::now() - now));

    for (const auto& d : duration) {
        std::cout << d.first << ": " << duration_cast<nanoseconds>(d.second).count() << " [nsec]\n";
    }

    return 0;
}


#endif  // BUILD_AVS_PLUGIN
	#include <cstdint>
	#include <algorithm>
	#include <iostream>
	#include <tmmintrin.h>

	// pattern0: (R + G + B) / 3
	// pattern1: max(R, G, B)
	// pattern2: (R + 2*G + B) / 4
	// pattern3: R*0.299 + G*0.587 + B*0.114


	// One 4-byte pixel. The fields are declared in the order b, g, r, a, so a
	// byte buffer viewed as RGBA[] holds b in byte 0 and a in byte 3 of each pixel.
	struct RGBA {
	uint8_t b, g, r, a;
	};


	// Instruction-set level passed as a template argument to the SIMD helpers.
	// The enumerators are ordered so helpers can test `ARCH < HAS_SSSE3`.
	enum CPU: int {
	HAS_SSE2 = 0,
	HAS_SSSE3,
	};


	// Loads 8 consecutive 4-byte pixels (32 bytes, unaligned) from src and
	// de-interleaves them into planar form:
	//   rg = [byte 0 of pixels 0..7 | byte 1 of pixels 0..7]
	//   ba = [byte 2 of pixels 0..7 | byte 3 of pixels 0..7]
	// The letters r/g/b/a in the output names simply denote pixel bytes 0/1/2/3.
	template <CPU ARCH>
	static inline void load_and_shuffle(const uint8_t* src, __m128i& rg, __m128i& ba)
	{
	    const __m128i* in = reinterpret_cast<const __m128i*>(src);
	    const __m128i px0123 = _mm_loadu_si128(in);
	    const __m128i px4567 = _mm_loadu_si128(in + 1);

	    if (ARCH >= HAS_SSSE3) {
	        // One byte shuffle per register groups each byte index into a dword.
	        const __m128i planar = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
	        const __m128i lo = _mm_shuffle_epi8(px0123, planar);    // 0000 1111 2222 3333
	        const __m128i hi = _mm_shuffle_epi8(px4567, planar);    // 0000 1111 2222 3333
	        rg = _mm_unpacklo_epi32(lo, hi);
	        ba = _mm_unpackhi_epi32(lo, hi);
	        return;
	    }

	    // SSE2 fallback: three rounds of byte interleaving.
	    const __m128i mix_lo = _mm_unpacklo_epi8(px0123, px4567);   // pixels 0,4 then 1,5
	    const __m128i mix_hi = _mm_unpackhi_epi8(px0123, px4567);   // pixels 2,6 then 3,7
	    const __m128i even = _mm_unpacklo_epi8(mix_lo, mix_hi);     // pixels 0,2,4,6 per byte index
	    const __m128i odd = _mm_unpackhi_epi8(mix_lo, mix_hi);      // pixels 1,3,5,7 per byte index
	    rg = _mm_unpacklo_epi8(even, odd);
	    ba = _mm_unpackhi_epi8(even, odd);
	}


	// Horizontal pairwise add of 32-bit lanes:
	//   result = [a0+a1, a2+a3, b0+b1, b2+b3]
	// Uses the native instruction on SSSE3, two float-domain shuffles otherwise.
	template <CPU ARCH>
	static inline __m128i hadd_epi32(const __m128i& a, const __m128i& b)
	{
	    if (ARCH >= HAS_SSSE3) {
	        return _mm_hadd_epi32(a, b);
	    }

	    const __m128 fa = _mm_castsi128_ps(a);
	    const __m128 fb = _mm_castsi128_ps(b);
	    const __m128i even = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(2, 0, 2, 0)));  // a0 a2 b0 b2
	    const __m128i odd = _mm_castps_si128(_mm_shuffle_ps(fa, fb, _MM_SHUFFLE(3, 1, 3, 1)));   // a1 a3 b1 b3
	    return _mm_add_epi32(even, odd);
	}


	// srcとdstのstrideは等しいものとする
	void pattern0_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	constexpr auto coef = static_cast<int>(32768.0 / 3 + 0.5); // 10923

	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	int v = ((s[x].r + s[x].g + s[x].b) * coef + 16384) >> 15;
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD pattern0: grey = (R + G + B) / 3, rounded, 8 pixels per iteration.
	// For simplicity the stride is assumed to always be a multiple of 32, so the
	// last 32-byte step of a row may touch padding but never leaves the row.
	template <CPU ARCH>
	void pattern0_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    constexpr int16_t one_third = static_cast<int16_t>(32768.0 / 3 + 0.5);
	    // madd on (sum, 16384) pairs yields sum * one_third + 16384 * 1.
	    const __m128i mul = _mm_setr_epi16(one_third, 1, one_third, 1, one_third, 1, one_third, 1);
	    const __m128i half = _mm_set1_epi16(16384);
	    const __m128i zero = _mm_setzero_si128();
	    const size_t row_bytes = width * 4;

	    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
	        for (size_t off = 0; off < row_bytes; off += 32) {
	            __m128i rg, ba;
	            load_and_shuffle<ARCH>(src + off, rg, ba);

	            // Widen the three colour planes to 16 bit and add them.
	            const __m128i c0 = _mm_unpacklo_epi8(rg, zero);
	            const __m128i c1 = _mm_unpackhi_epi8(rg, zero);
	            const __m128i c2 = _mm_unpacklo_epi8(ba, zero);
	            const __m128i sum = _mm_add_epi16(_mm_add_epi16(c0, c1), c2);

	            // Q15 multiply with rounding, then drop the fraction.
	            __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(sum, half), mul);
	            __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(sum, half), mul);
	            lo = _mm_srli_epi32(lo, 15);
	            hi = _mm_srli_epi32(hi, 15);

	            // Narrow back to bytes; the low 8 bytes hold the 8 grey values.
	            __m128i grey = _mm_packs_epi32(lo, hi);
	            grey = _mm_packus_epi16(grey, grey);

	            // Re-interleave as (grey, grey, grey, alpha) per pixel.
	            const __m128i gg = _mm_unpacklo_epi8(grey, grey);
	            const __m128i ga = _mm_unpacklo_epi8(grey, _mm_srli_si128(ba, 8));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), _mm_unpacklo_epi16(gg, ga));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off + 16), _mm_unpackhi_epi16(gg, ga));
	        }
	    }
	}


	void pattern1_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	int v = std::max(std::max(s[x].r, s[x].g), s[x].b);
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD pattern1: grey = max of the three colour bytes, 8 pixels per
	// iteration. Rows are processed in 32-byte steps, so the row storage must be
	// padded to a multiple of 32 bytes.
	template <CPU ARCH>
	void pattern1_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    const size_t row_bytes = width * 4;

	    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
	        for (size_t off = 0; off < row_bytes; off += 32) {
	            __m128i rg, ba;
	            load_and_shuffle<ARCH>(src + off, rg, ba);

	            const __m128i c1 = _mm_srli_si128(rg, 8);       // byte-1 plane in the low half
	            const __m128i alpha = _mm_srli_si128(ba, 8);    // alpha plane in the low half
	            // Only the low 8 bytes are meaningful from here on.
	            const __m128i peak = _mm_max_epu8(_mm_max_epu8(rg, c1), ba);

	            // Re-interleave as (grey, grey, grey, alpha) per pixel.
	            const __m128i gg = _mm_unpacklo_epi8(peak, peak);
	            const __m128i ga = _mm_unpacklo_epi8(peak, alpha);
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), _mm_unpacklo_epi16(gg, ga));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off + 16), _mm_unpackhi_epi16(gg, ga));
	        }
	    }
	}


	void pattern2_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	int v = (s[x].r + s[x].g + s[x].g + s[x].b + 2) / 4;
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD pattern2: grey = (c0 + 2*G + c2 + 2) >> 2, 8 pixels per iteration,
	// where c0/c2 are the two outer colour bytes of each pixel.
	//
	// The result is bit-exact with pattern2_c:
	//   floor((c0 + c2) / 2) = avg(c0, c2) - ((c0 ^ c2) & 1)
	//   grey                 = avg(floor((c0 + c2) / 2), G)
	// A plain cascade of rounding averages would round twice and can be off by
	// one (e.g. bytes (1, 0, 0) would give 1 instead of 0).
	//
	// Rows are processed in 32-byte steps, so the row storage must be padded to
	// a multiple of 32 bytes.
	template <CPU ARCH>
	void pattern2_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    const __m128i one = _mm_set1_epi8(1);
	    const size_t row_bytes = width * 4;

	    for (size_t y = 0; y < height; ++y) {
	        for (size_t x = 0; x < row_bytes; x += 32) {
	            __m128i rg, ba;
	            load_and_shuffle<ARCH>(src + x, rg, ba);

	            // Only the low 8 bytes of each register are meaningful below.
	            const __m128i g = _mm_srli_si128(rg, 8);
	            const __m128i alpha = _mm_srli_si128(ba, 8);

	            // _mm_avg_epu8 rounds up; subtract the lost bit to get the floor.
	            const __m128i odd = _mm_and_si128(_mm_xor_si128(rg, ba), one);
	            const __m128i outer = _mm_sub_epi8(_mm_avg_epu8(rg, ba), odd);
	            const __m128i grey = _mm_avg_epu8(outer, g);

	            // Re-interleave as (grey, grey, grey, alpha) per pixel.
	            rg = _mm_unpacklo_epi8(grey, grey);
	            ba = _mm_unpacklo_epi8(grey, alpha);
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), _mm_unpacklo_epi16(rg, ba));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x + 16), _mm_unpackhi_epi16(rg, ba));
	        }
	        src += stride;
	        dst += stride;
	    }
	}


	void pattern3_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	constexpr auto coef_r = static_cast<int>(0.299 * 32768 + 0.5);
	constexpr auto coef_g = static_cast<int>(0.587 * 32768 + 0.5);
	constexpr auto coef_b = static_cast<int>(0.114 * 32768 + 0.5);

	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	int v = (s[x].r * coef_r + s[x].g * coef_g + s[x].b * coef_b + 16384) >> 15;
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD pattern3: grey = R*0.299 + G*0.587 + B*0.114 in Q15 fixed point,
	// 4 pixels (16 bytes) per iteration. Alpha is copied through.
	// Uses the same weights and the same rounding term (16384 = 0.5 in Q15) as
	// pattern3_c.
	template <CPU ARCH>
	void pattern3_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    constexpr auto cr = static_cast<int16_t>(0.299 * 32768 + 0.5);
	    constexpr auto cg = static_cast<int16_t>(0.587 * 32768 + 0.5);
	    constexpr auto cb = static_cast<int16_t>(0.114 * 32768 + 0.5);
	    // Pixel bytes are b, g, r, a; the alpha weight is 0.
	    const __m128i coef = _mm_setr_epi16(cb, cg, cr, 0, cb, cg, cr, 0);
	    const __m128i zero = _mm_setzero_si128();
	    const __m128i round = _mm_set1_epi32(16384);
	    const __m128i alpha_mask = _mm_set1_epi32(0xFF000000);

	    for (size_t y = 0; y < height; ++y) {
	        for (size_t x = 0; x < width * 4; x += 16) {
	            auto s = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + x));

	            // Widen to 16 bit; madd gives (b*cb + g*cg, r*cr + 0) per pixel.
	            auto rgba0 = _mm_madd_epi16(_mm_unpacklo_epi8(s, zero), coef);
	            auto rgba1 = _mm_madd_epi16(_mm_unpackhi_epi8(s, zero), coef);

	            // Add the two halves of each pixel, round, drop the fraction.
	            rgba0 = hadd_epi32<ARCH>(rgba0, rgba1);
	            rgba0 = _mm_add_epi32(rgba0, round);
	            rgba0 = _mm_srli_epi32(rgba0, 15);

	            // Replicate the grey byte into bytes 0..2, then merge source alpha.
	            rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
	            rgba0 = _mm_or_si128(rgba0, _mm_slli_epi32(rgba0, 8));
	            rgba0 = _mm_or_si128(rgba0, _mm_and_si128(s, alpha_mask));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + x), rgba0);
	        }
	        src += stride;
	        dst += stride;
	    }
	}


	void pattern0f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	float v = (s[x].r + s[x].g + s[x].b) * 0.33333f + 0.5f;
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD float variant of pattern0: grey = (c0 + c1 + c2) * 0.33333f converted
	// to integer with _mm_cvtps_epi32, 4 pixels (16 bytes) per iteration.
	// Alpha (byte 3 of each pixel) is copied through.
	void pattern0f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    const __m128i low_byte = _mm_set1_epi32(0x000000FF);
	    const size_t row_bytes = width * 4;

	    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
	        for (size_t off = 0; off < row_bytes; off += 16) {
	            const __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));

	            // Isolate the three colour bytes of every pixel as 32-bit ints.
	            const __m128i c0 = _mm_and_si128(low_byte, px);
	            const __m128i c1 = _mm_and_si128(low_byte, _mm_srli_epi32(px, 8));
	            const __m128i c2 = _mm_and_si128(low_byte, _mm_srli_epi32(px, 16));
	            const __m128i sum = _mm_add_epi32(_mm_add_epi32(c0, c1), c2);

	            __m128i grey = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(sum), _mm_set1_ps(0.33333f)));

	            // Replicate the grey byte into bytes 0..2, then merge source alpha.
	            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
	            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
	            grey = _mm_or_si128(grey, _mm_and_si128(px, _mm_set1_epi32(0xFF000000)));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), grey);
	        }
	    }
	}


	void pattern3f_c(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	for (size_t y = 0; y < height; ++y) {
	auto s = reinterpret_cast<const RGBA*>(src);
	auto d = reinterpret_cast<RGBA*>(dst);
	for (size_t x = 0; x < width; ++x) {
	auto v = (s[x].r * 0.299f + s[x].g * 0.587f + s[x].b * 0.114f + 0.5f);
	d[x].r = d[x].g = d[x].b = static_cast<uint8_t>(v);
	d[x].a = s[x].a;
	}
	src += stride;
	dst += stride;
	}
	}


	// SIMD float variant of pattern3, 4 pixels (16 bytes) per iteration:
	// grey = byte0*0.114f + byte1*0.587f + byte2*0.299f, converted to integer
	// with _mm_cvtps_epi32. Alpha (byte 3 of each pixel) is copied through.
	void pattern3f_simd(const uint8_t* src, uint8_t* dst, const size_t width, const size_t height, const size_t stride)
	{
	    const __m128i low_byte = _mm_set1_epi32(0x000000FF);
	    const size_t row_bytes = width * 4;

	    for (size_t row = 0; row < height; ++row, src += stride, dst += stride) {
	        for (size_t off = 0; off < row_bytes; off += 16) {
	            const __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + off));

	            // Isolate each colour byte as a 32-bit int, convert, and weight it.
	            const __m128 c0 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, px));
	            const __m128 c1 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, _mm_srli_epi32(px, 8)));
	            const __m128 c2 = _mm_cvtepi32_ps(_mm_and_si128(low_byte, _mm_srli_epi32(px, 16)));
	            const __m128 w0 = _mm_mul_ps(c0, _mm_set1_ps(0.114f));
	            const __m128 w1 = _mm_mul_ps(c1, _mm_set1_ps(0.587f));
	            const __m128 w2 = _mm_mul_ps(c2, _mm_set1_ps(0.299f));

	            // Summation order is (w0 + w1) + w2.
	            __m128i grey = _mm_cvtps_epi32(_mm_add_ps(_mm_add_ps(w0, w1), w2));

	            // Replicate the grey byte into bytes 0..2, then merge source alpha.
	            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
	            grey = _mm_or_si128(grey, _mm_slli_epi32(grey, 8));
	            grey = _mm_or_si128(grey, _mm_and_si128(px, _mm_set1_epi32(0xFF000000)));
	            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst + off), grey);
	        }
	    }
	}


	#if defined(BUILD_AVS_PLUGIN)
	#define WIN32_LEAN_AND_MEAN
	#define NOMINMAX
	#include <windows.h>
	#include <avisynth.h>


	// AviSynth filter that converts every frame of the child clip to grey with
	// one of the pattern* converters; `mode` picks the converter (see GetFrame).
	class MyGrey: public GenericVideoFilter {
	    const int mode;
	public:
	    // Stores the selected mode and logs it to stderr.
	    MyGrey(PClip c, int m) : GenericVideoFilter(c), mode(m)
	    {
	        std::cerr << "mode: " << mode << std::endl;
	    }
	    ~MyGrey() {}

	    // Fetches frame n from the child, allocates an output frame and runs the
	    // converter selected by `mode`. Any mode outside 0..10 uses pattern3f_simd.
	    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
	    {
	        constexpr CPU ARCH = HAS_SSSE3;

	        auto src = child->GetFrame(n, env);
	        auto dst = env->NewVideoFrame(vi);

	        auto srcp = src->GetReadPtr();
	        auto dstp = dst->GetWritePtr();
	        // NOTE(review): only the destination pitch is queried; the converters
	        // step src and dst by the same stride, so the source pitch is assumed
	        // to be equal - confirm for the clips this is used with.
	        size_t stride = dst->GetPitch();

	        switch (mode) {
	        case 0:
	            pattern0_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 1:
	            pattern0_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 2:
	            pattern1_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 3:
	            pattern1_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 4:
	            pattern2_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 5:
	            pattern2_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 6:
	            pattern3_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 7:
	            pattern3_simd<ARCH>(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 8:
	            pattern0f_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 9:
	            // The float SIMD converters are plain functions, not templates.
	            pattern0f_simd(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        case 10:
	            pattern3f_c(srcp, dstp, vi.width, vi.height, stride);
	            break;
	        default:
	            pattern3f_simd(srcp, dstp, vi.width, vi.height, stride);
	        }
	        return dst;
	    }
	};


	// Factory callback registered with AddFunction: validates the input clip and
	// builds a MyGrey filter. args[0] is the clip, args[1] the optional mode
	// (default 0), clamped to the range 0..11.
	// The second parameter is an unused void* and env is a pointer, matching
	// the `env->ThrowError(...)` calls below.
	static AVSValue __cdecl create(AVSValue args, void*, IScriptEnvironment* env)
	{
	    PClip c = args[0].AsClip();
	    const VideoInfo& vi = c->GetVideoInfo();
	    if (!vi.IsRGB32()) {
	        env->ThrowError("not RGB32");
	    }
	    // Row size rounded up to 16 bytes; the SIMD paths need it to be mod 32.
	    int pitch = (vi.width * 4 + 15) & ~15;
	    if (pitch % 32 != 0) {
	        env->ThrowError("pitch must be mod32");
	    }
	    int mode = std::min(std::max(args[1].AsInt(0), 0), 11);
	    return new MyGrey(c, mode);
	}


	// Global linkage pointer; filled in from `vectors` when the plugin is
	// initialised.
	const AVS_Linkage* AVS_linkage = nullptr;

	// Exported plugin entry point: stores the linkage table and registers the
	// script function "MyGrey" with create() as its factory. The parameter
	// string "c[mode]i" presumably means a clip plus an optional int named
	// `mode` - confirm against the AviSynth parameter-string rules.
	// Always returns nullptr.
	extern "C" __declspec(dllexport) const char* __stdcall
	AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
	{
	AVS_linkage = vectors;
	env->AddFunction("MyGrey", "c[mode]i", create, nullptr);
	return nullptr;
	}
	#else

	#include <random>
	#include <chrono>
	#include <vector>
	#include <utility>
	#include <stdexcept>

	// Owning buffer for one frame of 4-byte pixels, used by the benchmark.
	// Rows are padded so the stride is a multiple of 32 bytes, and the buffer is
	// allocated with 16-byte alignment via _mm_malloc.
	// The class is move-only: copying would make two objects free one buffer.
	class RGBAFrame {
	    void* buff;
	    const size_t width;
	    const size_t height;
	    const size_t stride;
	public:
	    // Allocates stride * h bytes; throws std::runtime_error on failure.
	    RGBAFrame(size_t w, size_t h)
	        : buff(nullptr), width(w), height(h), stride((w * 4 + 31) & ~static_cast<size_t>(31))
	    {
	        buff = _mm_malloc(stride * height, 16);
	        if (!buff) {
	            throw std::runtime_error("failed to allocate memory");
	        }
	    }
	    RGBAFrame(const RGBAFrame&) = delete;
	    RGBAFrame& operator=(const RGBAFrame&) = delete;
	    // Takes over the buffer; the moved-from frame is left with a null pointer.
	    RGBAFrame(RGBAFrame&& other) noexcept
	        : buff(other.buff), width(other.width), height(other.height), stride(other.stride)
	    {
	        other.buff = nullptr;
	    }
	    ~RGBAFrame()
	    {
	        if (buff) {
	            _mm_free(buff);
	            buff = nullptr;
	        }
	    }
	    // Start of the pixel data (null only for a moved-from frame).
	    uint8_t* getPtr() const { return static_cast<uint8_t*>(buff); }
	    // Distance in bytes between the starts of consecutive rows.
	    size_t getStride() const { return stride; }
	    // Fills the whole buffer, padding included, with random 32-bit values.
	    void fill_random()
	    {
	        std::random_device s;
	        std::mt19937 gen(s());
	        std::uniform_int_distribution<> dist(INT32_MIN, INT32_MAX);
	        size_t size = stride * height / sizeof(int32_t);
	        auto orig = static_cast<int32_t*>(buff);

	        for (size_t i = 0; i < size; ++i) {
	            orig[i] = dist(gen);
	        }
	    }
	};

	int main(void)
	{
	using namespace std::chrono;
	using std::make_pair;

	constexpr CPU ARCH = HAS_SSSE3;
	constexpr int num = 100;
	const size_t w = 1920, h = 1080;

	auto src = RGBAFrame(w, h);
	auto dst = RGBAFrame(w, h);
	src.fill_random();

	uint8_t s = src.getPtr(), d = dst.getPtr();
	auto st = src.getStride();

	std::vector<std::pair<const char*, duration<int64_t, std::nano>>> duration;
	duration.reserve(12);

	auto now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern0_c(s, d, w, h, st);
	duration.emplace_back(make_pair("0-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern0_simd<ARCH>(s, d, w, h, st);
	duration.emplace_back(make_pair("0-simd", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern1_c(s, d, w, h, st);
	duration.emplace_back(make_pair("1-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern1_simd<ARCH>(s, d, w, h, st);
	duration.emplace_back(make_pair("1-simd", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern2_c(s, d, w, h, st);
	duration.emplace_back(make_pair("2-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern2_simd<ARCH>(s, d, w, h, st);
	duration.emplace_back(make_pair("2-simd", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern3_c(s, d, w, h, st);
	duration.emplace_back(make_pair("3-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern3_simd<ARCH>(s, d, w, h, st);
	duration.emplace_back(make_pair("3-simd", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern0f_c(s, d, w, h, st);
	duration.emplace_back(make_pair("0f-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern0f_simd(s, d, w, h, st);
	duration.emplace_back(make_pair("0f-simd", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern3f_c(s, d, w, h, st);
	duration.emplace_back(make_pair("3f-c", high_resolution_clock::now() - now));

	now = high_resolution_clock::now();
	for (int i = 0; i < num; ++i) pattern3f_simd(s, d, w, h, st);
	duration.emplace_back(make_pair("3f-simd", high_resolution_clock::now() - now));

	for (const auto& d : duration) {
	std::cout << d.first << ": " << duration_cast<nanoseconds>(d.second).count() << " [nsec]\n";
	}

	return 0;
	}


	#endif // BUILD_AVS_PLUGIN