@YashasSamaga
Created June 16, 2020 13:36
#include <cuda_runtime.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>
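/*
 * All functors below implement the gradient of Mish:
 *     mish(x) = x * tanh(softplus(x)),   softplus(x) = ln(1 + e^x)
 * Differentiating gives
 *     mish'(x) = tanh(sp) + x * (1 - tanh^2(sp)) * sigmoid(x)
 * where sp = softplus(x) and sigmoid(x) = d softplus/dx, which can also be
 * written as 1 - e^(-sp). The variants differ only in how sp, sigmoid(x) and
 * tanh(sp) are evaluated (thresholds, expm1 vs. 1 - exp, fast intrinsics,
 * double precision).
 */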
struct relu_grad
{
    __device__ float operator()(float x) { return x > 0; }
};
struct mish_grad_dn
{
    __device__ float softplus_kernel(float x, float threshold = 20)
    {
        if (x > threshold) return x;
        else if (x < -threshold) return expf(x);
        return log1pf(expf(x));
    }

    __device__ float operator()(float x)
    {
        const float MISH_THRESHOLD = 20.0f;
        const float sp = softplus_kernel(x, MISH_THRESHOLD);
        const float grad_sp = -expm1f(-sp);
        const float tsp = tanh(sp);
        const float grad_tsp = (1 - tsp * tsp) * grad_sp;
        const float grad = x * grad_tsp + tsp;
        return grad;
    }
};
struct mish_grad_tb
{
    __device__ float operator()(float x)
    {
        const float THRESHOLD = 20.0f;
        const float sp = x < THRESHOLD ? log1p(expf(x)) : x;
        const float grad_sp = 1 - exp(-sp);
        const float tsp = tanh(sp);
        const float grad_tsp = (1 - tsp * tsp) * grad_sp;
        const float grad = x * grad_tsp + tsp;
        return grad;
    }
};
struct mish_grad_tb_expm1
{
    __device__ float operator()(float x)
    {
        const float THRESHOLD = 20.0f;
        const float sp = x < THRESHOLD ? log1p(expf(x)) : x;
        const float grad_sp = -expm1(-sp);
        const float tsp = tanh(sp);
        const float grad_tsp = (1 - tsp * tsp) * grad_sp;
        const float grad = x * grad_tsp + tsp;
        return grad;
    }
};
struct mish_grad_fast
{
    __device__ float operator()(float x)
    {
        auto e = __expf(x);
        auto n = e * e + 2 * e;

        float tsp;
        if (x <= -0.6f)
            tsp = __fdividef(n, n + 2);
        else
            tsp = 1 - __fdividef(2, n + 2);

        const float grad_sp = __fdividef(e, e + 1);
        const float grad_tsp = (1 - tsp * tsp) * grad_sp;
        const float grad = x * grad_tsp + tsp;
        return x > 10.5f ? 1 : grad;
    }
};
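/*
 * A note on mish_grad_fast (my reading of the algebra, not part of the original
 * code): with e = exp(x), tanh(softplus(x)) = ((1+e)^2 - 1) / ((1+e)^2 + 1)
 * = n / (n + 2) where n = e^2 + 2e. The two branches compute the same quantity;
 * the n/(n+2) form is presumably used for very negative x and 1 - 2/(n+2)
 * elsewhere to limit rounding error. grad_sp = e / (e + 1) is sigmoid(x), and
 * the x > 10.5f cutoff returns 1 directly since the gradient has saturated to 1
 * in single precision well before that point.
 */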
struct mish_grad_double
{
    __device__ float operator()(float x)
    {
        const double sp = log1p(exp(x));
        const double grad_sp = -expm1(-sp);
        const double tsp = tanh(sp);
        const double grad_tsp = (1 - tsp * tsp) * grad_sp;
        const double grad = x * grad_tsp + tsp;
        return grad;
    }
};
template <class GradientFunc>
__global__ void grad_vec1(float* __restrict__ dz, const float* __restrict__ input, int n)
{
    GradientFunc grad;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        dz[i] *= grad(input[i]);
    }
}
template <class GradientFunc>
__global__ void grad_vec4(float4* __restrict__ dz, const float4* __restrict__ input, int n)
{
    GradientFunc grad;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        float4 temp = input[i];
        float4 dy = dz[i];
        dy.w *= grad(temp.w);
        dy.x *= grad(temp.x);
        dy.y *= grad(temp.y);
        dy.z *= grad(temp.z);
        dz[i] = dy;
    }
}
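/*
 * grad_vec4 processes four elements per iteration via 128-bit float4 loads and
 * stores. This assumes, as the launches in main() do, that the buffers are
 * 16-byte aligned (cudaMalloc guarantees this) and that the element count is a
 * multiple of 4.
 */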
__global__ void limit_2L1S_v1(float* __restrict__ dz, const float* __restrict__ input, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        dz[i] += input[i];
}
__global__ void limit_2L1S_v4(float4* __restrict__ dz, const float4* __restrict__ input, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        auto dy = dz[i];
        auto inp = input[i];
        dy.w += inp.w;
        dy.x += inp.x;
        dy.y += inp.y;
        dy.z += inp.z;
        dz[i] = dy;
    }
}
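/*
 * The limit_2L1S kernels perform the same memory traffic as the gradient
 * kernels (2 loads and 1 store per element, hence "2L1S") with trivial
 * arithmetic, so their runtime gives the memory-bandwidth ceiling the gradient
 * kernels can be compared against. There is no timing code in this file, so
 * the comparison is presumably done with an external profiler.
 */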
template <class GradientFunc>
__global__ void dump()
{
    GradientFunc grad;
    for (float x = -100; x < 20; x += 0.0001)
        printf("%.7f %.7e\n", x, grad(x));
}
int main()
{
    // When enabled, dump (x, grad(x)) pairs for the plotting script below and
    // exit; disable this block to run the accuracy checks instead, or change
    // the template argument to dump a different variant.
    if (1)
    {
        dump<mish_grad_tb><<<1, 1>>>();
        cudaDeviceSynchronize();
        return 0;
    }
    constexpr int N = 1024 * 1024 * 16;

    float* input_activation;
    float* grad;
    cudaMalloc(&input_activation, N * sizeof(float));
    cudaMalloc(&grad, N * sizeof(float));

    float* input_activation_h = new float[N];
    float* grad_h = new float[N];
    float* output_h = new float[N];
    float* output_ref = new float[N];

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> in_dis(-50, 50);

    // compute the reference gradient in long double precision
    for (int i = 0; i < N; i++)
    {
        long double a = in_dis(gen);
        input_activation_h[i] = a;

        long double dy = 1.0;
        grad_h[i] = dy;

        const long double sp = std::log1p(std::exp(a));
        const long double grad_sp = -std::expm1(-sp);
        const long double tsp = std::tanh(sp);
        const long double grad_tsp = (1 - tsp * tsp) * grad_sp;
        const long double grad = a * grad_tsp + tsp;
        output_ref[i] = dy * grad;
    }

    // L-infinity norm of the elementwise difference
    auto lInorm = [&](float* x, float* y, int n) {
        float max = 0;
        for (int i = 0; i < n; i++)
            max = std::max(max, std::abs(y[i] - x[i]));
        return max;
    };

    // L2 norm of the elementwise difference
    auto l2norm = [](float* x, float* y, int n) {
        std::vector<double> diff(n);
        for (int i = 0; i < n; i++)
            diff[i] = y[i] - x[i];
        auto sqr_sum = std::accumulate(std::begin(diff), std::end(diff), 0.0, [](auto lhs, auto rhs) { return lhs + rhs * rhs; });
        return std::sqrt(sqr_sum);
    };

    auto grad4 = reinterpret_cast<float4*>(grad);
    auto input_activation4 = reinterpret_cast<float4*>(input_activation);
    // warm-up runs: relu gradient and the 2-load/1-store bandwidth-limit kernels
    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<relu_grad><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<relu_grad><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    limit_2L1S_v1<<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    limit_2L1S_v4<<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();

    // accuracy of each mish gradient variant against the long double reference
    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<mish_grad_dn><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec1] mish_grad_dn: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<mish_grad_dn><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec4] mish_grad_dn: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<mish_grad_tb><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec1] mish_grad_tb: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<mish_grad_tb><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec4] mish_grad_tb: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<mish_grad_tb_expm1><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec1] mish_grad_tb_expm1: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<mish_grad_tb_expm1><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec4] mish_grad_tb_expm1: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<mish_grad_fast><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec1] mish_grad_fast: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<mish_grad_fast><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec4] mish_grad_fast: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec1<mish_grad_double><<<10, 1024>>>(grad, input_activation, N);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec1] mish_grad_double: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    cudaMemcpy(input_activation, input_activation_h, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(grad, grad_h, N * sizeof(float), cudaMemcpyHostToDevice);
    grad_vec4<mish_grad_double><<<10, 1024>>>(grad4, input_activation4, N / 4);
    cudaDeviceSynchronize();
    cudaMemcpy(output_h, grad, N * sizeof(float), cudaMemcpyDeviceToHost);
    std::cout << "[vec4] mish_grad_double: " << l2norm(output_ref, output_h, N) << ' ' << lInorm(output_ref, output_h, N) << '\n';

    return 0;
}
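/*
 * Build/run sketch (not part of the original gist; the source file name and
 * the mapping from functor to dump file are assumptions):
 *     nvcc -O3 -std=c++14 mish_grad.cu -o mish_grad
 * With the dump block in main() enabled, redirect stdout to the file name the
 * plotting script below expects for the selected functor, e.g.
 *     ./mish_grad > dump_tb_grad      # dump<mish_grad_tb>
 *     ./mish_grad > dump_dn_grad      # after switching to dump<mish_grad_dn>
 *     ./mish_grad > dump_fast_grad    # after switching to dump<mish_grad_fast>
 */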
# Accuracy plotting script: reads the (x, grad(x)) dumps produced by the CUDA
# dump kernel above and plots the error against a NumPy reference.
import numpy as np

SMOOTHING_STEP_SIZE = 1000
LEFT_X_CUTOFF = -100
RIGHT_X_CUTOFF = 100

def ref_mish(x):
    return x * np.tanh(np.log1p(np.exp(x)))

def ref_grad(x):
    sp = np.log1p(np.exp(x))
    grad_sp = -np.expm1(-sp)
    tsp = np.tanh(sp)
    grad_tsp = (1 - tsp * tsp) * grad_sp
    return x * grad_tsp + tsp
def generate_stats(src):
    x_list = []
    y_list = []
    with open(src, "r") as f:
        for line in f.readlines():
            x, y = [float(field.strip()) for field in line.split(' ')]
            if LEFT_X_CUTOFF < x and x < RIGHT_X_CUTOFF:
                x_list.append(x)
                y_list.append(y)

    rel_error_log10 = []
    abs_diff_err = []
    for x, y in zip(x_list, y_list):
        x128 = np.float128(x)
        y128 = np.float128(y)
        ref = ref_grad(x128)
        diff = np.abs(y128 - ref)
        rerr = -np.Inf if diff == 0 else np.log10(np.abs(diff / ref))
        log_diff = 0 if diff == 0 else np.log10(diff)
        rel_error_log10.append(float(rerr))
        abs_diff_err.append(float(diff))

    # smoothing
    x_final = []
    rel_error_log10_final = []
    abs_diff_err_final = []
    for step in range(len(x_list) // SMOOTHING_STEP_SIZE):
        ibegin = step * SMOOTHING_STEP_SIZE
        iend = ibegin + SMOOTHING_STEP_SIZE
        avg_x = np.mean(x_list[ibegin : iend])
        max_rel_err_log10 = np.max(rel_error_log10[ibegin : iend])
        max_diff_err = np.max(abs_diff_err[ibegin : iend])
        x_final.append(avg_x)
        rel_error_log10_final.append(max_rel_err_log10)
        abs_diff_err_final.append(max_diff_err)
    return x_final, rel_error_log10_final, abs_diff_err_final
x_double, re_double, ad_double = generate_stats("dump_fast_grad")
x_ocv, re_ocv, ad_ocv = generate_stats("dump_dn_grad")
x_tb, re_tb, ad_tb = generate_stats("dump_tb_grad")
import matplotlib.pyplot as plt
linewidth = 0.5
fig, ax = plt.subplots(1, 3)
ax[0].plot(x_double, re_double, linewidth = linewidth, c = 'g', label = "fast grad")
ax[0].plot(x_ocv, re_ocv, linewidth = linewidth, c = 'r', label = "darknet")
ax[0].plot(x_tb, re_tb, linewidth = linewidth, c = 'b', label = "tb")
ax[0].set_title("relative error (log10)")
ax[0].legend()
ax[1].plot(x_double, ad_double, linewidth = linewidth, c = 'g', label = "fast grad")
ax[1].plot(x_ocv, ad_ocv, linewidth = linewidth, c = 'r', label = "darknet")
ax[1].plot(x_tb, ad_tb, linewidth = linewidth, c = 'b', label = "tb")
ax[1].set_title("abs(diff)")
ax[1].legend()
ax[2].plot(x_double, [np.log10(a) for a in ad_double], linewidth = linewidth, c = 'g', label = "fast grad")
ax[2].plot(x_ocv, [np.log10(a) for a in ad_ocv], linewidth = linewidth, c = 'r', label = "darknet")
ax[2].plot(x_tb, [np.log10(a) for a in ad_tb], linewidth = linewidth, c = 'b', label = "tb")
ax[2].set_title("log10(abs(diff))")
ax[2].legend()
plt.show()
print(np.max(re_ocv), np.max(ad_ocv))
print(x_ocv[np.argmax(re_ocv)], x_ocv[np.argmax(ad_ocv)])
print(np.max(re_tb), np.max(ad_tb))
print(x_tb[np.argmax(re_tb)], x_tb[np.argmax(ad_tb)])