sukinull/kernel_3a_stats.cl

## kernel_3a_stats.cl
/*
 * function: kernel_3a_stats
 * input:    image2d_t as read only
 * output:   XCamGridStat, stats results
 */

typedef struct
{
    unsigned int avg_y;

    unsigned int avg_r;
    unsigned int avg_gr;
    unsigned int avg_gb;
    unsigned int avg_b;
    unsigned int valid_wb_count;

    unsigned int f_value1;
    unsigned int f_value2;
} XCamGridStat;

__kernel void kernel_3a_stats (__read_only image2d_t input, __global XCamGridStat *output)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    int w = get_global_size (0);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

    int x0 = 16 * x;
    int y0 = 16 * y;
    float sum_gr = 0.0f, sum_r = 0.0f, sum_b = 0.0f, sum_gb = 0.0f;
    float avg_gr = 0.0f, avg_r = 0.0f, avg_b = 0.0f, avg_gb = 0.0f;
    int i = 0, j = 0;
    float count = (16.0 / 2) * (16.0 / 2);
    float4 p[4];

#pragma unroll
    for (j = 0; j < 16; j += 2) {
#pragma unroll
        for (i = 0; i < 16; i += 2) {
            p[0] = read_imagef (input, sampler, (int2)(x0 + i, y0 + j));
            p[1] = read_imagef (input, sampler, (int2)(x0 + i, y0 + j + 1));
            p[2] = read_imagef (input, sampler, (int2)(x0 + i + 1, y0 + j));
            p[3] = read_imagef (input, sampler, (int2)(x0 + i + 1, y0 + j + 1));
            sum_gr += p[0].x;
            sum_b += p[1].x;
            sum_r += p[2].x;
            sum_gb += p[3].x;
        }
    }

    avg_gr = sum_gr / count;
    avg_r = sum_r / count;
    avg_b = sum_b / count;
    avg_gb = sum_gb / count;

    output[y * w + x].avg_gr = convert_uint(avg_gr * 256.0);
    output[y * w + x].avg_r = convert_uint(avg_r * 256.0);
    output[y * w + x].avg_b = convert_uint(avg_b * 256.0);
    output[y * w + x].avg_gb = convert_uint(avg_gb * 256.0);
    output[y * w + x].valid_wb_count = convert_uint(count);
    output[y * w + x].avg_y = convert_uint(((avg_gr + avg_gb) / 2.0f) * 256.0);
    output[y * w + x].f_value1 = 0;
    output[y * w + x].f_value2 = 0;

}


## kernel_bayer_basic.cl
/*
 * function: kernel_bayer_copy
 *     sample code of default kernel arguments
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */

//#define ENABLE_IMAGE_2D_INPUT 0

/*
 * GROUP_PIXEL_X_SIZE = 2 * GROUP_CELL_X_SIZE
 * GROUP_PIXEL_Y_SIZE = 2 * GROUP_CELL_Y_SIZE
*/

#define GROUP_CELL_X_SIZE 64
#define GROUP_CELL_Y_SIZE 4

//float4; 16
#define SLM_X_SIZE (GROUP_CELL_X_SIZE / 4)
#define SLM_Y_SIZE GROUP_CELL_Y_SIZE

#define STATS_3A_CELL_X_SIZE 8
#define STATS_3A_CELL_Y_SIZE GROUP_CELL_Y_SIZE

typedef struct  {
    float  level_gr;  /* Black level for GR pixels */
    float  level_r;   /* Black level for R pixels */
    float  level_b;   /* Black level for B pixels */
    float  level_gb;  /* Black level for GB pixels */
    uint   color_bits;
} CLBLCConfig;


typedef struct
{
    float r_gain;
    float gr_gain;
    float gb_gain;
    float b_gain;
} CLWBConfig;

inline int slm_pos (const int x, const int y)
{
    return mad24 (y, SLM_X_SIZE, x);
}

inline void gamma_correct(float8 *in_out, __global float *table)
{
    in_out->s0 = table[clamp(convert_int(in_out->s0 * 255.0f), 0, 255)];
    in_out->s1 = table[clamp(convert_int(in_out->s1 * 255.0f), 0, 255)];
    in_out->s2 = table[clamp(convert_int(in_out->s2 * 255.0f), 0, 255)];
    in_out->s3 = table[clamp(convert_int(in_out->s3 * 255.0f), 0, 255)];
    in_out->s4 = table[clamp(convert_int(in_out->s4 * 255.0f), 0, 255)];
    in_out->s5 = table[clamp(convert_int(in_out->s5 * 255.0f), 0, 255)];
    in_out->s6 = table[clamp(convert_int(in_out->s6 * 255.0f), 0, 255)];
    in_out->s7 = table[clamp(convert_int(in_out->s7 * 255.0f), 0, 255)];
}

inline float avg_float8 (float8 data)
{
    return (data.s0 + data.s1 + data.s2 + data.s3 + data.s4 + data.s5 + data.s6 + data.s7) * 0.125f;
}

inline void stats_3a_calculate (
    __local float4 * slm_gr,
    __local float4 * slm_r,
    __local float4 * slm_b,
    __local float4 * slm_gb,
    __global ushort8 * stats_output,
    CLWBConfig *wb_config)
{
    const int group_x_size = get_num_groups (0);
    const int group_id_x = get_group_id (0);
    const int group_id_y = get_group_id (1);

    const int l_id_x = get_local_id (0);
    const int l_id_y = get_local_id (1);
    const int l_size_x = get_local_size (0);
    const int stats_float4_x_count = STATS_3A_CELL_X_SIZE / 4;
    int count =  stats_float4_x_count * STATS_3A_CELL_Y_SIZE / 4;

    int index = mad24 (l_id_y, l_size_x, l_id_x);
    int index_x = index % SLM_X_SIZE;
    int index_y = index / SLM_X_SIZE;

    if (mad24 (index_y,  stats_float4_x_count, index_x % stats_float4_x_count) < count) {
        int pitch_count = count / stats_float4_x_count * SLM_X_SIZE;
        int index1 = index + pitch_count;
        int index2 = index1 + pitch_count;
        int index3 = index2 + pitch_count;
        slm_gr[index] = (slm_gr[index] + slm_gr[index1] + slm_gr[index2] + slm_gr[index3]) * 0.25f;
        slm_r[index] = (slm_r[index] + slm_r[index1] + slm_r[index2] + slm_r[index3]) * 0.25f;
        slm_b[index] = (slm_b[index] + slm_b[index1] + slm_b[index2] + slm_b[index3]) * 0.25f;
        slm_gb[index] = (slm_gb[index] + slm_gb[index1] + slm_gb[index2] + slm_gb[index3]) * 0.25f;
    }
    barrier (CLK_LOCAL_MEM_FENCE);

    if (index < SLM_X_SIZE / 2) {
        float result_gr, result_r, result_b, result_gb, avg_y;
        float8 tmp;
        tmp = ((__local float8*)slm_gr)[index];
        result_gr = avg_float8 (tmp);

        tmp = ((__local float8*)slm_r)[index];
        result_r = avg_float8 (tmp);

        tmp = ((__local float8*)slm_b)[index];
        result_b = avg_float8 (tmp);

        tmp = ((__local float8*)slm_gb)[index];
        result_gb = avg_float8 (tmp);

        avg_y = convert_uchar_sat(
                    mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 74.843f,
                         mad (result_r * wb_config->r_gain, 76.245f, result_b * 29.070f)));

        int out_index = mad24 (mad24 (group_id_y, group_x_size, group_id_x),
                               (GROUP_CELL_X_SIZE / STATS_3A_CELL_X_SIZE) * (GROUP_CELL_Y_SIZE / STATS_3A_CELL_Y_SIZE),
                               index);

        //ushort avg_y; avg_r; avg_gr; avg_gb; avg_b; valid_wb_count; f_value1; f_value2;
        stats_output[out_index] = (ushort8) (
                                      convert_ushort (convert_uchar_sat (avg_y)),
                                      convert_ushort (convert_uchar_sat (result_r * 255.0f)),
                                      convert_ushort (convert_uchar_sat (result_gr * 255.0f)),
                                      convert_ushort (convert_uchar_sat (result_gb * 255.0f)),
                                      convert_ushort (convert_uchar_sat (result_b * 255.0f)),
                                      STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
                                      0,
                                      0);
    }
}


__kernel void kernel_bayer_basic (
#if ENABLE_IMAGE_2D_INPUT
    __read_only image2d_t input,
#else
    __global const ushort8 *input,
#endif
    uint input_aligned_width,
    __write_only image2d_t output,
    uint out_height,
    CLBLCConfig blc_config,
    CLWBConfig wb_config,
    __global float *gamma_table,
    __global ushort8 *stats_output
)
{
    int g_x = get_global_id (0);
    int g_y = get_global_id (1);

    const int l_x = get_local_id (0);
    const int l_y = get_local_id (1);
    const int l_x_size = get_local_size (0);
    const int l_y_size = get_local_size (1);
    const int group_id_x = get_group_id (0);
    const int group_id_y = get_group_id (1);

    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

    int index = mad24 (l_y, l_x_size, l_x);
    int x_cell_start = (GROUP_CELL_X_SIZE / 4) * group_id_x;
    int y_cell_start = GROUP_CELL_Y_SIZE * group_id_y;
    int x, y;

    float blc_multiplier = (float)(1 << (16 - blc_config.color_bits));

    __local float4 slm_gr[SLM_X_SIZE * SLM_Y_SIZE], slm_r[SLM_X_SIZE * SLM_Y_SIZE], slm_b[SLM_X_SIZE * SLM_Y_SIZE], slm_gb[SLM_X_SIZE * SLM_Y_SIZE];

    for (; index < SLM_X_SIZE * SLM_Y_SIZE; index += l_x_size * l_y_size) {
        float8 line1;
        float8 line2;

        x = index % SLM_X_SIZE + x_cell_start;
        y = index / SLM_X_SIZE + y_cell_start;

#if ENABLE_IMAGE_2D_INPUT
        line1 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2)))) / 65536.0f;
        line2 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2 + 1)))) / 65536.0f;
#else
        line1 = convert_float8 (input [y * 2 * input_aligned_width + x]) / 65536.0f;
        line2 = convert_float8 (input [(y * 2 + 1) * input_aligned_width + x]) / 65536.0f;
#endif

        float4 gr = mad (line1.even, blc_multiplier, - blc_config.level_gr);
        float4 r = mad (line1.odd, blc_multiplier, - blc_config.level_r);
        float4 b = mad (line2.even, blc_multiplier, - blc_config.level_b);
        float4 gb = mad (line2.odd, blc_multiplier, - blc_config.level_gb);

        slm_gr[index] = gr;
        slm_r[index] =  r;
        slm_b[index] =  b;
        slm_gb[index] = gb;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    float8 data_gr, data_r, data_b, data_gb;
    index = mad24 (l_y, l_x_size, l_x);
    x = mad24 (GROUP_CELL_X_SIZE / 8, group_id_x,  index % (SLM_X_SIZE / 2));
    y = mad24 (GROUP_CELL_Y_SIZE, group_id_y,  index / (SLM_X_SIZE / 2));

    data_gr = ((__local float8*)slm_gr)[index];
    data_gr = data_gr * wb_config.gr_gain;

    data_r = ((__local float8*)slm_r)[index];
    data_r  = data_r * wb_config.r_gain;

    data_b = ((__local float8*)slm_b)[index];
    data_b = data_b * wb_config.b_gain;

    data_gb = ((__local float8*)slm_gb)[index];
    data_gb = data_gb * wb_config.gb_gain;

#if ENABLE_GAMMA
    gamma_correct (&data_gr, gamma_table);
    gamma_correct (&data_r, gamma_table);
    gamma_correct (&data_b, gamma_table);
    gamma_correct (&data_gb, gamma_table);
#endif

#if 0
    if (x % 16 == 0 && y % 16 == 0) {
        uint8 value = convert_uint8(convert_uchar8_sat(data_gr * 255.0f));
        printf ("(x:%d, y:%d) (blc.bit:%d, level:%d) (wb.gr:%f)=> (%d, %d, %d, %d, %d, %d, %d, %d)\n",
                x * 8, y,
                blc_config.color_bits, convert_uint(blc_config.level_gr * 255.0f),
                wb_config.gr_gain,
                value.s0, value.s1, value.s2, value.s3, value.s4, value.s5, value.s6, value.s7);
    }
#endif

    write_imageui (output, (int2)(x, y), as_uint4 (convert_ushort8 (data_gr * 65536.0f)));
    write_imageui (output, (int2)(x, y + out_height), as_uint4 (convert_ushort8 (data_r * 65536.0f)));
    write_imageui (output, (int2)(x, y + out_height * 2), as_uint4 (convert_ushort8 (data_b * 65536.0f)));
    write_imageui (output, (int2)(x, y + out_height * 3), as_uint4 (convert_ushort8 (data_gb * 65536.0f)));

    stats_3a_calculate (slm_gr, slm_r, slm_b, slm_gb, stats_output, &wb_config);
}


## kernel_bayer_pipe.cl
/*
 * function: kernel_bayer_pipe
 * params:
 *   input:    image2d_t as read only
 *   output:   image2d_t as write only
 *   blc_config: black level correction configuration
 *   wb_config: whitebalance configuration
 *   gamma_table: RGGB table
 *   stats_output: 3a stats output
 */


#define WORKGROUP_CELL_WIDTH 8
#define WORKGROUP_CELL_HEIGHT 8

#define DEMOSAIC_X_CELL_PER_WORKITEM 2

#define PIXEL_PER_CELL 2

#define SLM_CELL_X_OFFSET 4
#define SLM_CELL_Y_OFFSET 1

// 8x8
#define SLM_CELL_X_VALID_SIZE WORKGROUP_CELL_WIDTH
#define SLM_CELL_Y_VALID_SIZE WORKGROUP_CELL_HEIGHT

// 10x10
#define SLM_CELL_X_SIZE (SLM_CELL_X_VALID_SIZE + SLM_CELL_X_OFFSET * 2)
#define SLM_CELL_Y_SIZE (SLM_CELL_Y_VALID_SIZE + SLM_CELL_Y_OFFSET * 2)

#define GUASS_DELTA_S_1      1.031739f
#define GUASS_DELTA_S_1_5    1.072799f
#define GUASS_DELTA_S_2      1.133173f
#define GUASS_DELTA_S_2_5    1.215717f

typedef struct
{
    float           ee_gain;
    float           ee_threshold;
    float           nr_gain;
} CLEeConfig;

inline int get_shared_pos_x (int i)
{
    return i % SLM_CELL_X_SIZE;
}

inline int get_shared_pos_y (int i)
{
    return i / SLM_CELL_X_SIZE;
}

inline int shared_pos (int x, int y)
{
    return mad24(y, SLM_CELL_X_SIZE, x);
}

/* BA10=> GRBG  */
inline void grbg_slm_load (
    __local float *px, __local float *py, __local float *pz, __local float *pw,
    int index, __read_only image2d_t input, int x_start, int y_start
)
{
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    float4 data1, data2, line1, line2;
    int x0 = (get_shared_pos_x (index) + x_start) / 4;
    int y0 = get_shared_pos_y (index) + y_start;
    int2 pos = (int2)(x0, y0);
    float4 gr, r, b, gb;

    gr = read_imagef (input, sampler, (int2)(x0, y0));
    r = read_imagef (input, sampler, (int2)(x0, y0 + 544));
    b = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 2));
    gb = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 3));

    (*(__local float4 *)(px + index)) = gr;
    (*(__local float4 *)(py + index)) = r;
    (*(__local float4 *)(pz + index)) = b;
    (*(__local float4 *)(pw + index)) = gb;
}

#define MAX_DELTA_COFF 5.0f
#define MIN_DELTA_COFF 1.0f
#define DEFAULT_DELTA_COFF 4.0f

inline float2 delta_coff (float2 in, __local float *table)
{
    float2 out;
    out.x = table[(int)(fabs(in.x * 64.0f))];
    out.y = table[(int)(fabs(in.y * 64.0f))];

    return out;
}

inline float2 dot_denoise (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float *table, float gain)
{
    float2 coff0, coff1, coff2, coff3, coff4, coff5;
    coff0 = delta_coff (0.0, table) * gain;
    coff1 = delta_coff (in1 - value, table);
    coff2 = delta_coff (in2 - value, table);
    coff3 = delta_coff (in3 - value, table);
    coff4 = delta_coff (in4 - value, table);
    //(in1 * coff1 + in2 * coff2 + in3 * coff3 + in4 * coff4 + value * coff0)
    float2 sum1 = (mad (in1, coff1,
                        mad (in2, coff2,
                             mad (in3, coff3,
                                  mad (in4, coff4, value * coff0)))));
    return  sum1 / (coff0 + coff1 + coff2 + coff3 + coff4);
}

inline float2 dot_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, float2 out, CLEeConfig ee_config, float2 *egain)
{
    float2 eH = value  - in1 * 0.5f - in3 * 0.5f;
    float2 eV = value  - in2 * 0.5f - in4 * 0.5f;
    float2 ee;

    eH.x = eH.x > ee_config.ee_threshold ? eH.x : 0.0f;
    eH.y = eH.y > ee_config.ee_threshold ? eH.y : 0.0f;

    eV.x = eV.x > ee_config.ee_threshold ? eV.x : 0.0f;
    eV.y = eV.y > ee_config.ee_threshold ? eV.y : 0.0f;

    ee.x = fmax(eH.x, eV.x);
    ee.y = fmax(eH.y, eV.y);

    egain[0] = mad(ee, ee_config.ee_gain, out) / out;

    return out * egain[0];
}

inline float2 dot_denoise_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float *table, float gain, float2 *egain, CLEeConfig ee_config)
{
    float2 out = dot_denoise(value, in1, in2, in3, in4, table, gain);
    return dot_ee(value, in1, in2, in3, in4, out, ee_config, egain);
}

void demosaic_2_cell (
    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
    int in_x, int in_y,
    __write_only image2d_t out, uint out_height, int out_x, int out_y)
{
    float4 out_data;
    float2 value;
    int index;
    {
        float3 R_y[2];
        index = shared_pos (in_x - 1, in_y);
        R_y[0] = *(__local float3*)(y_data_in + index);
        index = shared_pos (in_x - 1, in_y + 1);
        R_y[1] = *(__local float3*)(y_data_in + index);

        out_data.s02 = (R_y[0].s01 + R_y[0].s12) * 0.5f;
        out_data.s13 = R_y[0].s12;
        write_imagef (out, (int2)(out_x, out_y), out_data);

        out_data.s02 = (R_y[0].s01 + R_y[0].s12 + R_y[1].s01 + R_y[1].s12) * 0.25f;
        out_data.s13 = (R_y[0].s12 + R_y[1].s12) * 0.5f;
        write_imagef (out, (int2)(out_x, out_y + 1), out_data);
    }

    {
        float3 B_z[2];
        index = shared_pos (in_x, in_y - 1);
        B_z[0] = *(__local float3*)(z_data_in + index);
        index = shared_pos (in_x, in_y);
        B_z[1] = *(__local float3*)(z_data_in + index);

        out_data.s02 = (B_z[0].s01 + B_z[1].s01) * 0.5f;
        out_data.s13 = (B_z[0].s01 + B_z[0].s12 + B_z[1].s01 + B_z[1].s12) * 0.25f;
        write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data);

        out_data.s02 = B_z[1].s01;
        out_data.s13 = (B_z[1].s01 + B_z[1].s12) * 0.5f;
        write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data);
    }

    {
        float3 Gr_x[2], Gb_w[2];
        index = shared_pos (in_x, in_y);
        Gr_x[0] = *(__local float3*)(x_data_in + index);
        index = shared_pos (in_x, in_y + 1);
        Gr_x[1] = *(__local float3*)(x_data_in + index);

        index = shared_pos (in_x - 1, in_y - 1);
        Gb_w[0] = *(__local float3*)(w_data_in + index);
        index = shared_pos (in_x - 1, in_y);
        Gb_w[1] = *(__local float3*)(w_data_in + index);

        out_data.s02 = (Gr_x[0].s01 * 4.0f + Gb_w[0].s01 +
                        Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12) * 0.125f;
        out_data.s13 = (Gr_x[0].s01 + Gr_x[0].s12 + Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
        write_imagef (out, (int2)(out_x, out_y + out_height), out_data);

        out_data.s02 = (Gr_x[0].s01 + Gr_x[1].s01 + Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;

        out_data.s13 = (Gb_w[1].s12 * 4.0f + Gr_x[0].s01 +
                        Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12) * 0.125f;
        write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data);
    }
}

void demosaic_denoise_2_cell (
    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
    int in_x, int in_y,
    __write_only image2d_t out, uint out_height, int out_x, int out_y, __local float *table, CLEeConfig ee_config)
{
    float4 out_data[2];
    float2 value;
    int index;
    float2 egain[4];

    ///////////////////////////////////////G///////////////////////////////////
    {
        float3 Gr_x[2], Gb_w[2];
        index = shared_pos (in_x - 1, in_y - 1);
        Gb_w[0] = *(__local float3*)(w_data_in + index);
        index = shared_pos (in_x - 1, in_y);
        Gb_w[1] = *(__local float3*)(w_data_in + index);

        index = shared_pos (in_x, in_y);
        Gr_x[0] = *(__local float3*)(x_data_in + index);
        index = shared_pos (in_x, in_y + 1);
        Gr_x[1] = *(__local float3*)(x_data_in + index);

        value = mad (Gr_x[0].s01, 4.0f,  (Gb_w[0].s01 +
                                          Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12)) * 0.125f;
        out_data[0].s02 = dot_denoise_ee (value, Gb_w[0].s01, Gb_w[0].s12, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1_5, &egain[0], ee_config);
        value = (Gr_x[0].s01 + Gr_x[0].s12 +
                 Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
        out_data[0].s13 = dot_denoise_ee(value, Gr_x[0].s01, Gr_x[0].s12, Gb_w[0].s12, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[1], ee_config);

        value = (Gr_x[0].s01 + Gr_x[1].s01 +
                 Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;
        out_data[1].s02 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[1].s01, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[2], ee_config);

        value = mad (Gb_w[1].s12, 4.0f, (Gr_x[0].s01 +
                                         Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12)) * 0.125f;
        out_data[1].s13 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[0].s12, Gr_x[1].s01, Gr_x[1].s12, table, GUASS_DELTA_S_1_5, &egain[3], ee_config);

        write_imagef (out, (int2)(out_x, out_y + out_height), out_data[0]);
        write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data[1]);
    }

    ////////////////////////////////R//////////////////////////////////////////
    {
        float4 R_y[3];
        index = shared_pos (in_x - 1, in_y - 1);
        R_y[0] = *(__local float4*)(y_data_in + index);
        index = shared_pos (in_x - 1, in_y);
        R_y[1] = *(__local float4*)(y_data_in + index);
        index = shared_pos (in_x - 1, in_y + 1);
        R_y[2] = *(__local float4*)(y_data_in + index);

        value = (R_y[1].s01 + R_y[1].s12) * 0.5f;
        out_data[0].s02 = dot_denoise (value, R_y[0].s01, R_y[0].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_2_5) * egain[0];

        value = R_y[1].s12;
        out_data[0].s13 = dot_denoise (value, R_y[0].s12, R_y[1].s01, R_y[1].s23, R_y[2].s12, table, GUASS_DELTA_S_2) * egain[1];

        value = (R_y[1].s01 + R_y[1].s12 +
                 R_y[2].s01 + R_y[2].s12) * 0.25f;
        out_data[1].s02 = dot_denoise (value, R_y[1].s01, R_y[1].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_1_5) * egain[2];

        value = (R_y[1].s12 + R_y[2].s12) * 0.5f;
        out_data[1].s13 = dot_denoise (value, R_y[1].s01, R_y[1].s23, R_y[2].s01, R_y[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];

        write_imagef (out, (int2)(out_x, out_y), out_data[0]);
        write_imagef (out, (int2)(out_x, out_y + 1), out_data[1]);

    }
    ////////////////////////////////B//////////////////////////////////////////
    {
        float4 B_z[3];
        index = shared_pos (in_x - 1, in_y - 1);
        B_z[0] = *(__local float4*)(z_data_in + index);
        index = shared_pos (in_x - 1, in_y);
        B_z[1] = *(__local float4*)(z_data_in + index);
        index = shared_pos (in_x - 1, in_y + 1);
        B_z[2] = *(__local float4*)(z_data_in + index);

        value = (B_z[0].s12 + B_z[1].s12) * 0.5f;
        out_data[0].s02 = dot_denoise (value, B_z[0].s01, B_z[0].s23, B_z[1].s01, B_z[1].s23, table, GUASS_DELTA_S_2_5) * egain[0];

        value = (B_z[0].s12 + B_z[0].s23 +
                 B_z[1].s12 + B_z[1].s23) * 0.25f;
        out_data[0].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[1].s12, B_z[1].s23, table, GUASS_DELTA_S_1_5) * egain[1];

        value = B_z[1].s12;
        out_data[1].s02 = dot_denoise (value, B_z[0].s12, B_z[1].s01, B_z[1].s23, B_z[2].s12, table, GUASS_DELTA_S_2) * egain[2];

        value = (B_z[1].s12 + B_z[1].s23) * 0.5f;
        out_data[1].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[2].s12, B_z[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];

        write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data[0]);
        write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data[1]);
    }
}

void shared_demosaic (
    __local float *x_data_in, __local float *y_data_in, __local float *z_data_in, __local float *w_data_in,
    int in_x, int in_y,
    __write_only image2d_t out, uint output_height, int out_x, int out_y,
    uint has_denoise, __local float *table, CLEeConfig ee_config)
{
    if (has_denoise) {
        demosaic_denoise_2_cell (
            x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
            out, output_height, out_x, out_y, table, ee_config);
    } else {
        demosaic_2_cell (
            x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
            out, output_height, out_x, out_y);
    }
}

__kernel void kernel_bayer_pipe (__read_only image2d_t input,
                                 uint input_height,
                                 __write_only image2d_t output,
                                 uint output_height,
                                 __global float * bnr_table,
                                 uint has_denoise,
                                 CLEeConfig ee_config
                                )
{
    int g_id_x = get_global_id (0);
    int g_id_y = get_global_id (1);
    int g_size_x = get_global_size (0);
    int g_size_y = get_global_size (1);

    int l_id_x = get_local_id(0);
    int l_id_y = get_local_id(1);
    int l_size_x = get_local_size (0);
    int l_size_y = get_local_size (1);

    __local float p1_x[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_y[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_z[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_w[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
    __local float4 p2[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
    __local float4 *stats_cache = p2;
    __local float SLM_delta_coef_table[64];

    int out_x_start, out_y_start;
    int x_start = get_group_id (0) * WORKGROUP_CELL_WIDTH;
    int y_start = get_group_id (1) * WORKGROUP_CELL_HEIGHT;
    int i = mad24 (l_id_y, l_size_x, l_id_x);
    int j = i;

    i *= 4;
    for (; i < SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE; i += (l_size_x * l_size_y) * 4) {
        grbg_slm_load (p1_x, p1_y, p1_z, p1_w, i,
                       input,
                       x_start - SLM_CELL_X_OFFSET, y_start - SLM_CELL_Y_OFFSET);
    }
    for(; j < 64; j += l_size_x * l_size_y)
        SLM_delta_coef_table[j] = bnr_table[j];

    barrier(CLK_LOCAL_MEM_FENCE);

    i = mad24 (l_id_y, l_size_x, l_id_x);
    int workitem_x_size = (SLM_CELL_X_VALID_SIZE / DEMOSAIC_X_CELL_PER_WORKITEM);
    int input_x = (i % workitem_x_size) * DEMOSAIC_X_CELL_PER_WORKITEM;
    int input_y = i / workitem_x_size;

    shared_demosaic (
        p1_x, p1_y, p1_z, p1_w,
        input_x + SLM_CELL_X_OFFSET, input_y + SLM_CELL_Y_OFFSET,
        output, output_height,
        (input_x + x_start) * PIXEL_PER_CELL / 4, (input_y + y_start) * PIXEL_PER_CELL, has_denoise, SLM_delta_coef_table, ee_config);
}


## kernel_blc.cl
/*
 * function: kernel_blc
 *     black level correction for sensor data input
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * blc_config: black level correction configuration
 * color_bits: identify 10bit or 12bit data
 * param:
 */

typedef struct
{
    float  level_gr;  /* Black level for GR pixels */
    float  level_r;   /* Black level for R pixels */
    float  level_b;   /* Black level for B pixels */
    float  level_gb;   /* Black level for GB pixels */
    uint   color_bits;
} BLCConfig;

uint decompression(uint data_in, uint color_bits)
{
    uint data_out = 0;

    data_out = data_in << (16 - color_bits);

    return data_out;
}

__kernel void kernel_blc (__read_only image2d_t input,
                          __write_only image2d_t output,
                          BLCConfig blc_config)
{
    int x0 = 2 * get_global_id (0);
    int y0 = 2 * get_global_id (1);
    int x1 = x0 + 1;
    int y1 = y0 + 1;
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    int2 pos_r = (int2)(x0, y0);
    int2 pos_gr = (int2)(x1, y0);
    int2 pos_gb = (int2)(x0, y1);
    int2 pos_b = (int2)(x1, y1);

    uint4 pixel;

    pixel = read_imageui(input, sampler, pos_r);
    pixel.x = floor(pixel.x - blc_config.level_r * (pown(2.0, blc_config.color_bits)) + 0.5);
    pixel.x = decompression(pixel.x, blc_config.color_bits);
    write_imageui(output, pos_r, pixel);

    pixel = read_imageui(input, sampler, pos_gr);
    pixel.x = floor(pixel.x - blc_config.level_gr * (pown(2.0, blc_config.color_bits)) + 0.5);
    pixel.x = decompression(pixel.x, blc_config.color_bits);
    write_imageui(output, pos_gr, pixel);

    pixel = read_imageui(input, sampler, pos_gb);
    pixel.x = floor(pixel.x - blc_config.level_gb * (pown(2.0, blc_config.color_bits)) + 0.5);
    pixel.x = decompression(pixel.x, blc_config.color_bits);
    write_imageui(output, pos_gb, pixel);

    pixel = read_imageui(input, sampler, pos_b);
    pixel.x = floor(pixel.x - blc_config.level_b * (pown(2.0, blc_config.color_bits)) + 0.5);
    pixel.x = decompression(pixel.x, blc_config.color_bits);
    write_imageui(output, pos_b, pixel);
}

## kernel_bnr.cl
/*
 * function: kernel_bnr
 *     implementation of bayer noise reduction
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * bnr_gain: strength of noise reduction
 * direction: sensitivity of edge
 * todo: add the upstream algorithm for BNR
 */

__kernel void kernel_bnr (__read_only image2d_t input, __write_only image2d_t output,
                          float bnr_gain, float direction)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    float4 p;
    p = read_imagef(input, sampler, (int2)(x, y));

    write_imagef(output, (int2)(x, y), p);
}


## kernel_demosaic.cl
/*
 * function: kernel_demosaic
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */

__kernel void kernel_demosaic (__read_only image2d_t input, __write_only image2d_t output)
{
    int x = 2 * get_global_id (0);
    int y = 2 * get_global_id (1);
//    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    int x0 = x - 1;
    int y0 = y - 1;
    float4 p[16];
#if 0
    p[0] = read_imagef (input, sampler, (int2)(x0, y0));
    p[1] = read_imagef (input, sampler, (int2)(x0 + 1, y0));
    p[2] = read_imagef (input, sampler, (int2)(x0 + 2, y0));
    p[3] = read_imagef (input, sampler, (int2)(x0 + 3, y0));
    p[4] = read_imagef (input, sampler, (int2)(x0, y0 + 1));
    p[5] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 1));
    p[6] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 1));
    p[7] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 1));
    p[8] = read_imagef (input, sampler, (int2)(x0, y0 + 2));
    p[9] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 2));
    p[10] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 2));
    p[11] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 2));
    p[12] = read_imagef (input, sampler, (int2)(x0, y0 + 3));
    p[13] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 3));
    p[14] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 3));
    p[15] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 3));
#endif

#pragma unroll

    for (int i = 0; i < 16; ++i) {
        p[i] = read_imagef (input, sampler, (int2)(x0 + i % 4, y0 + i / 4));
    }

    float4 p00, p01, p10, p11;
    p00.x = (p[4].x + p[6].x) / 2.0;
    p00.y = (p[5].x * 4 + p[0].x + p[2].x + p[8].x + p[10].x) / 8.0;
    p00.z = (p[1].x + p[9].x) / 2.0;
    p01.x = p[6].x;
    p01.y = (p[2].x + p[5].x + p[7].x + p[10].x) / 4.0;
    p01.z = (p[1].x + p[3].x + p[9].x + p[11].x) / 4.0;
    p10.x = (p[4].x + p[6].x + p[12].x + p[14].x) / 4.0;
    p10.y = (p[5].x + p[8].x + p[10].x + p[13].x) / 4.0;
    p10.z = p[9].x;
    p11.x = (p[6].x + p[14].x) / 2.0;
    p11.y = (p[10].x * 4 + p[5].x + p[7].x + p[13].x + p[15].x) / 8.0;
    p11.z = (p[9].x + p[11].x) / 2.0;

    write_imagef (output, (int2)(x, y), p00);
    write_imagef (output, (int2)(x + 1, y), p01);
    write_imagef (output, (int2)(x, y + 1), p10);
    write_imagef (output, (int2)(x + 1, y + 1), p11);
}


## kernel_denoise.cl
/*
 * function: kernel_denoise
 *     bi-laterial filter for denoise usage
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * sigma_r:  the parameter to set sigma_r in the Gaussian filtering
 * imw:      image width, used for edge detect
 * imh:      image height, used for edge detect
 */

__constant float gausssingle[25]={0.6411,0.7574,0.8007,0.7574,0.6411,0.7574,0.8948,0.9459,0.8948,0.7574,0.8007,0.94595945,1,0.9459,0.8007,0.7574,0.8948,0.9459,0.8948,0.7574,0.6411,0.7574,0.8007,0.7574,0.6411};

#define LOCAL_SIZE_X 16
#define LOCAL_SIZE_Y 15

__kernel void kernel_denoise(__read_only image2d_t srcRGB, __write_only image2d_t dstRGB, float sigma_r, unsigned int imw, unsigned int imh)
{
    int x = get_global_id(1);    //[0,imw-1]
    int y = get_global_id(0);    //[0,imh-1]
    int localX = get_local_id(1);    //[0,imw/120-1]
    int localY = get_local_id(0);    //[0,imh/72-1]
    //printf("localX=%d,localY=%d\n",localX,localY);

    float normF=0;
    float H=0;
    float delta=0;
    int i=0,j=0;
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE |CLK_FILTER_NEAREST;
    sigma_r = 2*pown(sigma_r,2);

    //coord in srcY
    float4 line;
    line.x=0;
    line.y=0;
    line.z=0;
    line.w=1.0;

    __local float4 pixel[LOCAL_SIZE_X+4][LOCAL_SIZE_Y+4];
    bool interior = x > 1 && x <(imw-3)
    && y>1 && y< (imh-3);
    if(interior)
    {
        pixel[localX+2][localY+2]=read_imagef(srcRGB, sampler,(int2)(x,y));

        if(localX==0)
        {
            if(localY==0)
            {
                pixel[0][0]=read_imagef(srcRGB, sampler,(int2)(x-2,y-2));
                pixel[0][1]=read_imagef(srcRGB, sampler,(int2)(x-2,y-1));
                pixel[0][2]=read_imagef(srcRGB, sampler,(int2)(x-2,y  ));
                pixel[1][0]=read_imagef(srcRGB, sampler,(int2)(x-1,y-2));
                pixel[1][1]=read_imagef(srcRGB, sampler,(int2)(x-1,y-1));
                pixel[1][2]=read_imagef(srcRGB, sampler,(int2)(x-1,y  ));
                pixel[2][0]=read_imagef(srcRGB, sampler,(int2)(x  ,y-2));
                pixel[2][1]=read_imagef(srcRGB, sampler,(int2)(x  ,y-1));
            }
            else if(localY==LOCAL_SIZE_Y-1)
            {
                pixel[0][LOCAL_SIZE_Y-1+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y  ));
                pixel[0][LOCAL_SIZE_Y  +2]=read_imagef(srcRGB, sampler,(int2)(x-2,y+1));
                pixel[0][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y+2));
                pixel[1][LOCAL_SIZE_Y-1+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y  ));
                pixel[1][LOCAL_SIZE_Y  +2]=read_imagef(srcRGB, sampler,(int2)(x-1,y+1));
                pixel[1][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y+2));
                pixel[2][LOCAL_SIZE_Y  +2]=read_imagef(srcRGB, sampler,(int2)(x  ,y+1));
                pixel[2][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x  ,y+2));
            }
            else
            {
                pixel[0][localY+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y));
                pixel[1][localY+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y));
            }
        }
        else if(localX==LOCAL_SIZE_X-1)
        {
            if(localY==0)
            {
                pixel[LOCAL_SIZE_X-1+2+0][0]=read_imagef(srcRGB, sampler,(int2)(x  ,y-2));
                pixel[LOCAL_SIZE_X-1+2+0][1]=read_imagef(srcRGB, sampler,(int2)(x  ,y-1));
                //pixel[LOCAL_SIZE_X-1+2+0][2]=read_imagef(srcRGB, sampler,(int2)(x  ,y));
                pixel[LOCAL_SIZE_X-1+2+1][0]=read_imagef(srcRGB, sampler,(int2)(x+1,y-2));
                pixel[LOCAL_SIZE_X-1+2+1][1]=read_imagef(srcRGB, sampler,(int2)(x+1,y-1));
                pixel[LOCAL_SIZE_X-1+2+1][2]=read_imagef(srcRGB, sampler,(int2)(x+1,y  ));
                pixel[LOCAL_SIZE_X-1+2+2][0]=read_imagef(srcRGB, sampler,(int2)(x+2,y-2));
                pixel[LOCAL_SIZE_X-1+2+2][1]=read_imagef(srcRGB, sampler,(int2)(x+2,y-1));
                pixel[LOCAL_SIZE_X-1+2+2][2]=read_imagef(srcRGB, sampler,(int2)(x+2,y  ));
            }
            else if(localY==LOCAL_SIZE_Y-1)
            {
                //	pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x  ,y  ));
                pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x  ,y+1));
                pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x  ,y+2));
                pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x+1,y  ));
                pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x+1,y+1));
                pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x+1,y+2));
                pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x+2,y  ));
                pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x+2,y+1));
                pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x+2,y+2));
            }
            else
            {
                pixel[LOCAL_SIZE_X-1+2+1][localY+2]=read_imagef(srcRGB, sampler,(int2)(x+1,y  ));
                pixel[LOCAL_SIZE_X-1+2+2][localY+2]=read_imagef(srcRGB, sampler,(int2)(x+2,y  ));
            }
        }
        else if(localY==0)
        {
            pixel[localX+2][0]=read_imagef(srcRGB,sampler,(int2)(x,y-2));
            pixel[localX+2][1]=read_imagef(srcRGB,sampler,(int2)(x,y-1));
        }
        else if(localY==LOCAL_SIZE_Y-1)
        {
        pixel[localX+2][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB,sampler,(int2)(x,y+1));
        pixel[localX+2][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB,sampler,(int2)(x,y+2));
    }
    }else{
        line=read_imagef(srcRGB, sampler,(int2)(x,y));
    }

    barrier(CLK_LOCAL_MEM_FENCE);
    if (interior) {
#pragma unroll
        for(i=0;i<5;i++)
        {
#pragma unroll
             for(j=0;j<5;j++)
             {
                  delta=pown(pixel[localX+i][localY+j].x-pixel[localX+2][localY+2].x,2) +
                  pown(pixel[localX+i][localY+j].y-pixel[localX+2][localY+2].y,2) +
                  pown(pixel[localX+i][localY+j].z-pixel[localX+2][localY+2].z,2);
                  H = (exp(-(delta/sigma_r)))*gausssingle[i*5+j];
                  normF+=H;
                  line.x+=pixel[localX+i][localY+j].x*H;
                  line.y+=pixel[localX+i][localY+j].y*H;
                  line.z+=pixel[localX+i][localY+j].z*H;
             }
        }

        line.x=line.x/normF;
        line.y=line.y/normF;
        line.z=line.z/normF;
    }

    write_imagef(dstRGB,(int2)(x,y),line);
}

## kernel_dpc.cl
/*
 * function: kernel_dpc
 *     defect pixel correction on bayer data input
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * gr_threshold:   GR threshold of defect pixel correction
 * r_threshold:    R threshold of defect pixel correction
 * b_threshold:    B threshold of defect pixel correction
 * gb_threshold:   GB threshold of defect pixel correction
 * param:
 */

__kernel void kernel_dpc (__read_only image2d_t input,
                          __write_only image2d_t output,
                          float gr_threshold, float r_threshold,
                          float b_threshold, float gb_threshold)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    float4 p[9];
    p[0] = read_imagef(input, sampler, (int2)(x - 2, y - 2));
    p[1] = read_imagef(input, sampler, (int2)(x, y - 2));
    p[2] = read_imagef(input, sampler, (int2)(x + 2, y - 2));
    p[3] = read_imagef(input, sampler, (int2)(x - 2, y));
    p[4] = read_imagef(input, sampler, (int2)(x, y));
    p[5] = read_imagef(input, sampler, (int2)(x + 2, y));
    p[6] = read_imagef(input, sampler, (int2)(x - 2, y + 2));
    p[7] = read_imagef(input, sampler, (int2)(x, y + 2));
    p[8] = read_imagef(input, sampler, (int2)(x + 2, y + 2));

    float aveVer = (p[1].x + p[7].x) / 2;
    float aveHor = (p[3].x + p[5].x) / 2;
    float avePosDia = (p[0].x + p[8].x) / 2;
    float aveNegDia = (p[2].x + p[6].x) / 2;

    float aveMin, aveMax;
    if (aveVer > aveHor)  {
        aveMin = aveHor;
        aveMax = aveVer;
    }
    else {
        aveMin = aveVer;
        aveMax = aveHor;
    }

    if (avePosDia < aveMin)
        aveMin = avePosDia;
    else if (avePosDia > aveMax)
        aveMax = avePosDia;

    if (aveNegDia < aveMin)
        aveMin = aveNegDia;
    else if (aveNegDia > aveMax)
        aveMax = aveNegDia;

    float edgeVer = p[4].x - aveVer;
    float edgeHor = p[4].x - aveHor;
    float edgeNeighbourVer = (p[3].x + p[5].x - (p[0].x + p[2].x + p[6].x + p[8].x) / 2) / 2;
    float edgeNeighbourHor = (p[1].x + p[7].x - (p[0].x + p[2].x + p[6].x + p[8].x) / 2) / 2;

    float threshold;
    if (x % 2 == 0)
        threshold = (y % 2 == 0) ? gr_threshold : b_threshold;
    else
        threshold = (y % 2 == 0) ? r_threshold : gb_threshold;

    float4 pixelOut;
    pixelOut.x = p[4].x;
    pixelOut.y = p[4].y;
    pixelOut.z = p[4].z;
    pixelOut.w = p[4].w;
    if ((edgeVer > edgeNeighbourVer) && (edgeHor > edgeNeighbourHor))  {
        if ((p[4].x - aveMax) > threshold)  {
            pixelOut.x = aveMax;
        }
    }
    if ((edgeVer < edgeNeighbourVer) && (edgeHor < edgeNeighbourHor))  {
        if ((aveMin - p[4].x) > threshold)  {
            pixelOut.x = aveMin;
        }
    }

    write_imagef (output, (int2)(x, y), pixelOut);
}

## kernel_ee.cl
/*
 * function: kernel_ee
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * ee_config: Edge enhancement configuration
 */

typedef struct
{
    float           ee_gain;
    float           ee_threshold;
    float           nr_gain;
} CLEeConfig;

__constant float lv[25] = {0.0, 0.0, 0.0, 0.0, 0.0,
                           0.0, 0.0, -1.0, 0.0, 0.0,
                           -1.0, -14.0, 32.0, -14.0, -1.0,
                           0.0, 0.0, -1.0, 0.0, 0.0,
                           0.0, 0.0, 0.0, 0.0, 0.0
                          };

__constant float lh[25] = {0.0, 0.0, -1.0, 0.0, 0.0,
                           0.0, 0.0, -14.0, 0.0, 0.0,
                           0.0, -1.0, 32.0, -1.0, 0.0,
                           0.0, 0.0, -14.0, 0.0, 0.0,
                           0.0, 0.0, -1.0, 0.0, 0.0
                          };

__constant float la[25] = {0.0, 0.0, -2.0, 0.0, 0.0,
                           0.0, -2.0, -2.0, -2.0, 0.0,
                           -2.0, -2.0, 24.0, -2.0, -2.0,
                           0.0, -2.0, -2.0, -2.0, 0.0,
                           0.0, 0.0, -2.0, 0.0, 0.0
                          };

__constant float na[25] = { -1.0, -1.0, -1.0, -1.0, -1.0,
                            -1.0, -1.0, -1.0, -1.0, -1.0,
                            -1.0, -1.0, 16.0, -1.0, -1.0,
                            -1.0, -1.0, -1.0, -1.0, -1.0,
                            -1.0, -1.0, -1.0, -1.0, -1.0
                          };
__kernel void kernel_ee (__read_only image2d_t input, __write_only image2d_t output, uint vertical_offset_in, uint vertical_offset_out, CLEeConfig ee_config)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    int X = get_global_size(0);
    int Y = get_global_size(1);

    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
    float4 y_in, y_out, uv_in;
    float4 a[5], b[5], c[5], d[5], e[5];

    // cpy UV
    if(y % 2 == 0) {
        uv_in = read_imagef(input, sampler, (int2)(x, y / 2 + vertical_offset_in));
        write_imagef(output, (int2)(x, y / 2 + vertical_offset_out), uv_in);
    }

        a[0] = read_imagef(input, sampler, (int2)(x - 2, y - 2));
        a[1] = read_imagef(input, sampler, (int2)(x - 1, y - 2));
        a[2] = read_imagef(input, sampler, (int2)(x, y - 2));
        a[3] = read_imagef(input, sampler, (int2)(x + 1, y - 2));
        a[4] = read_imagef(input, sampler, (int2)(x + 2, y - 2));

        b[0] = read_imagef(input, sampler, (int2)(x - 2, y - 1));
        b[1] = read_imagef(input, sampler, (int2)(x - 1, y - 1));
        b[2] = read_imagef(input, sampler, (int2)(x, y - 1));
        b[3] = read_imagef(input, sampler, (int2)(x + 1, y - 1));
        b[4] = read_imagef(input, sampler, (int2)(x + 2, y - 1));

        c[0] = read_imagef(input, sampler, (int2)(x - 2, y));
        c[1] = read_imagef(input, sampler, (int2)(x - 1, y));
        c[2] = read_imagef(input, sampler, (int2)(x, y));
        c[3] = read_imagef(input, sampler, (int2)(x + 1, y));
        c[4] = read_imagef(input, sampler, (int2)(x + 2, y));

        d[0] = read_imagef(input, sampler, (int2)(x - 2, y + 1));
        d[1] = read_imagef(input, sampler, (int2)(x - 1, y + 1));
        d[2] = read_imagef(input, sampler, (int2)(x, y + 1));
        d[3] = read_imagef(input, sampler, (int2)(x + 1, y + 1));
        d[4] = read_imagef(input, sampler, (int2)(x + 2, y + 1));

        e[0] = read_imagef(input, sampler, (int2)(x - 2, y + 2));
        e[1] = read_imagef(input, sampler, (int2)(x - 1, y + 2));
        e[2] = read_imagef(input, sampler, (int2)(x, y + 2));
        e[3] = read_imagef(input, sampler, (int2)(x + 1, y + 2));
        e[4] = read_imagef(input, sampler, (int2)(x + 2, y + 2));

        float eV = (a[0].x * lv[0] + a[1].x * lv[1] + a[2].x * lv[2] + a[3].x * lv[3] + a[4].x * lv[4]
                    + b[0].x * lv[5] + b[1].x * lv[6] + b[2].x * lv[7] + b[3].x * lv[8] + b[4].x * lv[9]
                    + c[0].x * lv[10] + c[1].x * lv[11] + c[2].x * lv[12] + c[3].x * lv[13] + c[4].x * lv[14]
                    + d[0].x * lv[15] + d[1].x * lv[16] + d[2].x * lv[17] + d[3].x * lv[18] + d[4].x * lv[19]
                    + e[0].x * lv[20] + e[1].x * lv[21] + e[2].x * lv[22] + e[3].x * lv[23] + e[4].x * lv[24]) * 255.0;

        float eH = (a[0].x * lh[0] + a[1].x * lh[1] + a[2].x * lh[2] + a[3].x * lh[3] + a[4].x * lh[4]
                    + b[0].x * lh[5] + b[1].x * lh[6] + b[2].x * lh[7] + b[3].x * lh[8] + b[4].x * lh[9]
                    + c[0].x * lh[10] + c[1].x * lh[11] + c[2].x * lh[12] + c[3].x * lh[13] + c[4].x * lh[14]
                    + d[0].x * lh[15] + d[1].x * lh[16] + d[2].x * lh[17] + d[3].x * lh[18] + d[4].x * lh[19]
                    + e[0].x * lh[20] + e[1].x * lh[21] + e[2].x * lh[22] + e[3].x * lh[23] + e[4].x * lh[24]) * 255.0;


        float eA = (a[0].x * la[0] + a[1].x * la[1] + a[2].x * la[2] + a[3].x * la[3] + a[4].x * la[4]
                    + b[0].x * la[5] + b[1].x * la[6] + b[2].x * la[7] + b[3].x * la[8] + b[4].x * la[9]
                    + c[0].x * la[10] + c[1].x * la[11] + c[2].x * la[12] + c[3].x * la[13] + c[4].x * la[14]
                    + d[0].x * la[15] + d[1].x * la[16] + d[2].x * la[17] + d[3].x * la[18] + d[4].x * la[19]
                    + e[0].x * la[20] + e[1].x * la[21] + e[2].x * la[22] + e[3].x * la[23] + e[4].x * la[24]) * 255.0;

        float nA = (a[0].x * na[0] + a[1].x * na[1] + a[2].x * na[2] + a[3].x * na[3] + a[4].x * na[4]
                    + b[0].x * na[5] + b[1].x * na[6] + b[2].x * na[7] + b[3].x * na[8] + b[4].x * na[9]
                    + c[0].x * na[10] + c[1].x * na[11] + c[2].x * na[12] + c[3].x * na[13] + c[4].x * na[14]
                    + d[0].x * na[15] + d[1].x * na[16] + d[2].x * na[17] + d[3].x * na[18] + d[4].x * na[19]
                    + e[0].x * na[20] + e[1].x * na[21] + e[2].x * na[22] + e[3].x * na[23] + e[4].x * na[24]) * 255.0;


        float nV = eH;
        float nH = eV;

        float dV = (fabs(2.0 * b[1].x - a[1].x - c[1].x) + fabs(2.0 * b[2].x - a[2].x - c[2].x) + fabs(2.0 * b[3].x - a[3].x - c[3].x) + fabs(2.0 * c[1].x - b[1].x - d[1].x) + fabs(2.0 * c[2].x - b[2].x - d[2].x) + fabs(2.0 * c[3].x - b[3].x - d[3].x) + fabs(2.0 * d[1].x - c[1].x - e[1].x) + fabs(2.0 * d[2].x - c[2].x - e[2].x) + fabs(2.0 * d[3].x - c[3].x - e[3].x)) * 255.0;
        float dH = (fabs(2.0 * b[1].x - b[0].x - b[2].x) + fabs(2.0 * b[2].x - b[1].x - b[3].x) + fabs(2.0 * b[3].x - b[2].x - b[4].x) + fabs(2.0 * c[1].x - c[0].x - c[2].x) + fabs(2.0 * c[2].x - c[1].x - c[3].x) + fabs(2.0 * c[3].x - c[2].x - c[4].x) + fabs(2.0 * d[1].x - d[0].x - d[2].x) + fabs(2.0 * d[2].x - d[1].x - d[3].x) + fabs(2.0 * d[3].x - d[2].x - d[4].x)) * 255.0;
        float dA = (fabs(2.0 * c[2].x - b[2].x - d[2].x) + fabs(2.0 * c[2].x - c[1].x - c[3].x) + fabs(2.0 * c[2].x - b[1].x - d[3].x) + fabs(2.0 * c[2].x - b[3].x - d[1].x) + fabs(2.0 * c[2].x - a[0].x - e[4].x) + fabs(2.0 * c[2].x - a[4].x - e[0].x) + fabs(2.0 * c[2].x - c[0].x - c[4].x) + fabs(2.0 * c[2].x - a[2].x - e[2].x) + fabs(2.0 * c[2].x - (b[0].x + d[0].x + b[4].x + d[4].x) / 2.0)) * 255.0;

        float edge = dH < (dV < dA ? dV : dA) ? eH  : (dV < dA ? eV : eA);
        float noise = dH < (dV < dA ? dV : dA) ? nH  : (dV < dA ? nV : nA);
        float dir = dH < (dV < dA ? dA : dV) ? (dV < dA ? dA : dV) : dH;
        noise = noise * ee_config.nr_gain / 16.0;
        edge = edge * ee_config.ee_gain / 16.0;

        y_out.x = dir > ee_config.ee_threshold ? (c[2].x * 255.0 + edge - noise) / 255.0 : c[2].x;
        y_out.y = 0.0;
        y_out.z = 0.0;
        y_out.w = 1.0;
        write_imagef(output, (int2)(x, y), y_out);
}

## kernel_gamma.cl
/*
 * function: kernel_gamma
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * table: gamma table.
 */
//TODO: linear scale should be a problem
__kernel void kernel_gamma (__read_only image2d_t input, __write_only image2d_t output, __global float *table)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    float4 pixel_in[8], pixel_out[8];
    int i=0,j=0;

#pragma unroll
    for(j=0;j<2;j++) {
#pragma unroll
	for(i=0;i<4;i++) {
	     pixel_in[j*4 + i] = read_imagef(input, sampler,(int2)(4*x + i, 2*y + j));
	     pixel_out[j*4 + i].x = table[convert_int(pixel_in[j*4 + i].x * 255.0)] / 255.0;
	     pixel_out[j*4 + i].y = table[convert_int(pixel_in[j*4 + i].y * 255.0)] / 255.0;
	     pixel_out[j*4 + i].z = table[convert_int(pixel_in[j*4 + i].z * 255.0)] / 255.0;
	     pixel_out[j*4 + i].w = 0.0;
	     write_imagef(output, (int2)(4*x + i, 2*y + j), pixel_out[j*4 + i]);
	 }
    }
}

## kernel_hdr_lab.cl
/*
 * function: kernel_hdr_lab
 *     sample code of default kernel arguments
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */
__constant float table[] = {4.3287616, 5.9319224, 7.1324205, 8.1288157, 8.9965763, 9.7739191, 10.483327, 11.139331, 11.751958, 12.328467, 12.874310, 13.393700, 13.889977, 14.365837, 14.823494, 15.264792, 15.691289, 16.104307, 16.504990, 16.894327, 17.273184, 17.642323, 18.002419, 18.354071, 18.697817, 19.034143, 19.363485, 19.686239, 20.002764, 20.313389, 20.618416, 20.918123, 21.212763, 21.502573, 21.787767, 22.068552, 22.345116, 22.617630, 22.886259, 23.151157, 23.412468, 23.670324, 23.924854, 24.176174, 24.424398, 24.669630, 24.911972, 25.151518, 25.388355, 25.622572, 25.854246, 26.083458, 26.310276, 26.534771, 26.757010, 26.977057, 27.194969, 27.410807, 27.624624, 27.836473, 28.046406, 28.254467, 28.254467, 28.254467, 28.254467, 28.254467, 28.254467, 28.254467, 28.254467, 28.254467, 28.377895, 28.567524, 28.755781, 28.942692, 29.128284, 29.312582, 29.495617, 29.677410, 29.857986, 30.037369, 30.215582, 30.392645, 30.568581, 30.743410, 30.917152, 31.089828, 31.261454, 31.432049, 31.601633, 31.770222, 31.937832, 32.104481, 32.270180, 32.434952, 32.598808, 32.761761, 32.923832, 33.085026, 33.245361, 33.404850, 33.563507, 33.721340, 33.878368, 34.034599, 34.190044, 34.344715, 34.498627, 34.651783, 34.804199, 34.955887, 35.106853, 35.257107, 35.406662, 35.555523, 35.703705, 35.851208, 35.998051, 36.144238, 36.289776, 36.434673, 36.578941, 36.722588, 36.865616, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.008038, 37.013512, 37.148350, 37.282703, 37.416573, 37.549969, 37.682888, 37.815342, 37.947330, 38.078865, 38.209946, 38.340580, 38.470768, 38.600517, 38.729836, 38.858719, 38.987175, 39.115215, 39.242832, 39.370041, 39.496838, 39.623226, 39.749214, 39.874802, 40, 40.124805, 40.249222, 40.373260, 40.496914, 40.620193, 40.743095, 40.865631, 40.987804, 41.109608, 41.231056, 41.352146, 41.472885, 41.593269, 41.713306, 41.833000, 41.952354, 42.071369, 42.190048, 42.308392, 42.426407, 42.544094, 42.661457, 42.778500, 42.895222, 43.011627, 43.127716, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.243496, 43.315910, 43.427536, 43.538902, 43.650013, 43.760872, 43.871479, 43.981831, 44.091938, 44.201797, 44.311413, 44.420780, 44.529911, 44.638798, 44.747448, 44.855862, 44.964039, 45.071983, 45.179695, 45.287178, 45.394428, 45.501453, 45.608253, 45.714825, 45.821178, 45.927307, 46.033215, 46.138905, 46.244377, 46.349632, 46.454674, 46.559505, 46.664120, 46.768528, 46.872723, 46.976711, 47.080494, 47.184067, 47.287441, 47.390610, 47.493576, 47.596344, 47.698910, 47.801281, 47.903454, 48.005428, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.107212, 48.116840, 48.216503, 48.316002, 48.415333, 48.514507, 48.613514, 48.712364, 48.811050, 48.909580, 49.007950, 49.106163, 49.204220, 49.302116, 49.399860, 49.497452, 49.594887, 49.692173, 49.789303, 49.886284, 49.983109, 50.079788, 50.176319, 50.272701, 50.368935, 50.465023, 50.560966, 50.656761, 50.752411, 50.847919, 50.943279, 51.038502, 51.133583, 51.228519, 51.323318, 51.417973, 51.512493, 51.606873, 51.701115, 51.795219, 51.889191, 51.983021, 52.076717, 52.170280, 52.263706, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.357002, 52.370274, 52.462727, 52.555069, 52.647293, 52.739407, 52.831406, 52.923294, 53.015072, 53.106739, 53.198296, 53.289738, 53.381077, 53.472301, 53.563419, 53.654430, 53.745327, 53.836124, 53.926811, 54.017391, 54.107864, 54.198231, 54.288494, 54.378651, 54.468708, 54.558655, 54.648502, 54.738243, 54.827885, 54.917419, 55.006851, 55.096188, 55.185417, 55.274551, 55.363579, 55.452511, 55.541340, 55.630070, 55.718704, 55.807240, 55.895676, 55.984013, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.072254, 56.136337, 56.224670, 56.312920, 56.401089, 56.489174, 56.577179, 56.665100, 56.752941, 56.840698, 56.928371, 57.015968, 57.103481, 57.190918, 57.278271, 57.365547, 57.452744, 57.539856, 57.626896, 57.713852, 57.800732, 57.887531, 57.974255, 58.060902, 58.147465, 58.233959, 58.320370, 58.406708, 58.492970, 58.579155, 58.665260, 58.751289, 58.837250, 58.923130, 59.008938, 59.094666, 59.180325, 59.265907, 59.351414, 59.436852, 59.522209, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.607498, 59.676125, 59.762394, 59.848598, 59.934742, 60.020828, 60.106846, 60.192806, 60.278702, 60.364540, 60.450314, 60.536030, 60.621685, 60.707275, 60.792812, 60.878284, 60.963699, 61.049049, 61.134342, 61.219578, 61.304752, 61.389870, 61.474926, 61.559925, 61.644867, 61.729748, 61.814568, 61.899334, 61.984039, 62.068687, 62.153282, 62.237812, 62.322292, 62.406708, 62.491070, 62.575375, 62.659622, 62.743816, 62.827950, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.912029, 62.940441, 63.026379, 63.112267, 63.198112, 63.283909, 63.369659, 63.455360, 63.541019, 63.626637, 63.712200, 63.797718, 63.883190, 63.968624, 64.054001, 64.139343, 64.224632, 64.309875, 64.395081, 64.480240, 64.565346, 64.650414, 64.735435, 64.820412, 64.905342, 64.990234, 65.075073, 65.159874, 65.244629, 65.329346, 65.414009, 65.498634, 65.583214, 65.667747, 65.752243, 65.836693, 65.921097, 66.005463, 66.089783, 66.174057, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.258286, 66.291786, 66.378784, 66.465752, 66.552689, 66.639587, 66.726456, 66.813293, 66.900085, 66.986855, 67.073586, 67.160286, 67.246948, 67.333572, 67.420174, 67.506737, 67.593269, 67.679764, 67.766228, 67.852654, 67.939056, 68.025421, 68.111755, 68.198051, 68.284325, 68.370560, 68.456757, 68.542923, 68.629059, 68.715164, 68.801239, 68.887276, 68.973282, 69.059258, 69.145203, 69.231110, 69.316986, 69.402832, 69.488655, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.574432, 69.659309, 69.748878, 69.838425, 69.927956, 70.017456, 70.106934, 70.196388, 70.285820, 70.375237, 70.464622, 70.553986, 70.643333, 70.732658, 70.821953, 70.911224, 71.000481, 71.089706, 71.178917, 71.268105, 71.357262, 71.446404, 71.535530, 71.624619, 71.713692, 71.802750, 71.891777, 71.980789, 72.069771, 72.158737, 72.247673, 72.336594, 72.425499, 72.514374, 72.603226, 72.692062, 72.780876, 72.869659, 72.958427, 73.047173, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.135902, 73.151604, 73.245392, 73.339172, 73.432938, 73.526695, 73.620430, 73.714165, 73.807884, 73.901588, 73.995285, 74.088966, 74.182640, 74.276299, 74.369942, 74.463577, 74.557198, 74.650818, 74.744415, 74.838005, 74.931580, 75.025139, 75.118698, 75.212242, 75.305771, 75.399284, 75.492798, 75.586296, 75.679771, 75.773247, 75.866707, 75.960159, 76.053596, 76.147018, 76.240433, 76.333839, 76.427231, 76.520615, 76.613983, 76.707336, 76.800690, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.894020, 76.900002, 77, 77.100006, 77.200005, 77.300003, 77.400002, 77.500000, 77.600006, 77.700005, 77.800003, 77.900002, 78, 78.099998, 78.199997, 78.300003, 78.400002, 78.500000, 78.599998, 78.699997, 78.799995, 78.899994, 79, 79.099998, 79.199997, 79.299995, 79.400002, 79.500000, 79.599998, 79.699997, 79.799995, 79.900002, 80, 80.099998, 80.199997, 80.299995, 80.400002, 80.500000, 80.599998, 80.699997, 80.800003, 80.900002, 81, 81.099998, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.199997, 81.299294, 81.408134, 81.516991, 81.625862, 81.734749, 81.843643, 81.952560, 82.061493, 82.170433, 82.279388, 82.388359, 82.497353, 82.606354, 82.715363, 82.824394, 82.933441, 83.042496, 83.151573, 83.260658, 83.369759, 83.478874, 83.588005, 83.697151, 83.806305, 83.915474, 84.024666, 84.133865, 84.243080, 84.352310, 84.461555, 84.570816, 84.680084, 84.789368, 84.898666, 85.007988, 85.117310, 85.226654, 85.336006, 85.445374, 85.554764, 85.664162, 85.773575, 85.882996, 85.992439, 86.101898, 86.211365, 86.320839, 86.430336, 86.539848, 86.649376, 86.758911, 86.868454, 86.978027, 87.087608, 87.197197, 87.306801, 87.416420, 87.526054, 87.635704, 87.745361, 87.855034, 87.964722, 88.074432, 88.184143, 88.293877, 88.403618, 88.513374, 88.623146, 88.732933, 88.842728, 88.952538, 89.062363, 89.172203, 89.282051, 89.391914, 89.501793, 89.611694, 89.721596, 89.831512, 89.941444, 90.051399, 90.161354, 90.271324, 90.381310, 90.491310, 90.601326, 90.711349, 90.821388, 90.931442, 91.041512, 91.151596, 91.261681, 91.371788, 91.481911, 91.592049, 91.702187, 91.812347, 91.922516, 92.032707, 92.142906, 92.253120, 92.363342, 92.473579, 92.583839, 92.694099, 92.804375, 92.914665, 93.024971, 93.135292, 93.245621, 93.355965, 93.466316, 93.576683, 93.687073, 93.797470, 93.907875, 94.018295, 94.128731, 94.239182, 94.349640, 94.460106, 94.570595, 94.681099, 94.791611, 94.902130, 95.012665, 95.123215, 95.233788, 95.344360, 95.454948, 95.565552, 95.676170, 95.786797, 95.897430, 96.008087, 96.118752, 96.229431, 96.340126, 96.450829, 96.561539, 96.672272, 96.783012, 96.893768, 97.004532, 97.115311, 97.226112, 97.336914, 97.447731, 97.558563, 97.669403, 97.780266, 97.891129, 98.002007, 98.112900, 98.223816, 98.334732, 98.445656, 98.556602, 98.667557, 98.778526, 98.889511, 99.000504, 99.111504, 99.222519, 99.333557, 99.444603, 99.555656, 99.666718, 99.777809, 99.888893, 100};

static float fun(float in)
{
    return in > 0.008856 ? (native_powr(in, 1.0 / 3)) : (7.787 * in + 16.0 / 116);
}
__kernel void kernel_hdr_lab (__read_only image2d_t input, __write_only image2d_t output)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    float4 pixel_in, pixel_out;
    unsigned int table_id;
    pixel_in = read_imagef(input, sampler, (int2)(x, y));

    float X, Y, Z, L, a, b;
    X = 0.412453 * pixel_in.x + 0.357580 * pixel_in.y + 0.180423 * pixel_in.z;
    Y = 0.212671 * pixel_in.x + 0.715160 * pixel_in.y + 0.072169 * pixel_in.z;
    Z = 0.019334 * pixel_in.x + 0.119193 * pixel_in.y + 0.950227 * pixel_in.z;
    L = Y > 0.008856 ? (116.0 * native_powr(Y, 1.0 / 3) - 16.0) : 903.3 * Y;
    a = 500 * (fun(X) - fun(Y));
    b = 200 * (fun(Y) - fun(Z));
    table_id = (unsigned int)(L * 10) < 1000 ? (unsigned int)(L * 10) : 999;
    L = table[table_id];
    float fX, fY, fZ;
    Y = (L / 116.0) * (L / 116.0) * (L / 116.0);
    Y = Y < 0.008856 ? L / 903.3 : Y;
    fY = Y > 0.008856 ? (native_powr(Y, 1.0 / 3.0)) : (7.787 * Y + 16.0 / 116.0);
    fX = a / 500.0 + fY;
    X = fX * fX * fX;
    X = X < 0.008865 ? ((fX - 16.0 / 116.0) / 7.787) : X;
    fZ = fY - b / 200.0;
    Z = fZ * fZ * fZ;
    Z = Z < 0.008865 ? ((fZ - 16.0 / 116.0) / 7.787) : Z;

    pixel_out.x = 3.240479 * X - 1.537150 * Y - 0.498535 * Z;
    pixel_out.y = -0.969256 * X + 1.875992 * Y + 0.041556 * Z;
    pixel_out.z = 0.055648 * X - 0.204043 * Y + 1.204043 * Z;
    pixel_out.w = pixel_in.w;
    write_imagef(output, (int2)(x, y), pixel_out);
}


## kernel_hdr_rgb.cl
/*
 * function: kernel_hdr_rgb
 *     sample code of default kernel arguments
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */
__constant int HDRTable[1024] = {104, 106, 108, 109, 111, 113, 114, 116, 118, 119, 121, 123, 125, 126, 128, 130, 131, 133, 135, 136, 138, 139, 141, 143, 144, 146, 148, 149, 151, 152, 154, 156, 157, 159, 160, 162, 164, 165, 167, 168, 170, 171, 173, 174, 176, 177, 179, 181, 182, 184, 185, 187, 188, 190, 191, 193, 194, 195, 197, 198, 200, 201, 203, 204, 206, 207, 209, 210, 211, 213, 214, 216, 217, 219, 220, 221, 223, 224, 226, 227, 228, 230, 231, 232, 234, 235, 236, 238, 239, 241, 242, 243, 245, 246, 247, 248, 250, 251, 252, 254, 255, 256, 258, 259, 260, 261, 263, 264, 265, 267, 268, 269, 270, 272, 273, 274, 275, 277, 278, 279, 280, 281, 283, 284, 285, 286, 287, 289, 290, 291, 292, 293, 295, 296, 297, 298, 299, 300, 302, 303, 304, 305, 306, 307, 308, 310, 311, 312, 313, 314, 315, 316, 317, 318, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 389, 390, 391, 392, 393, 394, 395, 395, 396, 397, 398, 399, 400, 401, 401, 402, 403, 404, 405, 406, 406, 407, 408, 409, 410, 410, 411, 412, 413, 414, 415, 415, 416, 417, 418, 418, 419, 420, 421, 422, 422, 423, 424, 425, 425, 426, 427, 428, 429, 429, 430, 431, 432, 432, 433, 434, 435, 435, 436, 437, 437, 438, 439, 440, 440, 441, 442, 442, 443, 444, 445, 445, 446, 447, 447, 448, 449, 450, 450, 451, 452, 452, 453, 454, 454, 455, 456, 456, 457, 458, 458, 459, 460, 460, 461, 462, 462, 463, 464, 464, 465, 466, 466, 467, 468, 468, 469, 469, 470, 471, 471, 472, 473, 473, 474, 475, 475, 476, 476, 477, 478, 478, 479, 479, 480, 481, 481, 482, 483, 483, 484, 484, 485, 486, 486, 487, 487, 488, 488, 489, 490, 490, 491, 491, 492, 493, 493, 494, 494, 495, 495, 496, 497, 497, 498, 498, 499, 499, 500, 501, 501, 502, 502, 503, 503, 504, 504, 505, 506, 506, 507, 507, 508, 508, 509, 509, 510, 510, 511, 512, 512, 513, 513, 514, 514, 515, 515, 516, 516, 517, 517, 518, 518, 519, 519, 520, 521, 521, 522, 522, 523, 523, 524, 524, 525, 525, 526, 526, 527, 527, 528, 528, 529, 529, 530, 530, 531, 531, 532, 532, 533, 533, 534, 534, 535, 535, 536, 536, 537, 537, 538, 538, 539, 539, 540, 540, 541, 541, 542, 542, 543, 543, 544, 544, 545, 545, 546, 546, 547, 547, 548, 548, 549, 549, 550, 550, 551, 551, 552, 552, 553, 553, 554, 554, 554, 555, 555, 556, 556, 557, 557, 558, 558, 559, 559, 560, 560, 561, 561, 562, 562, 563, 563, 564, 564, 565, 565, 566, 566, 567, 567, 568, 568, 568, 569, 569, 570, 570, 571, 571, 572, 572, 573, 573, 574, 574, 575, 575, 576, 576, 577, 577, 578, 578, 579, 579, 580, 580, 581, 581, 582, 582, 582, 583, 583, 584, 584, 585, 585, 586, 586, 587, 587, 588, 588, 589, 589, 590, 590, 591, 591, 592, 592, 593, 593, 594, 594, 595, 595, 596, 596, 597, 597, 598, 598, 599, 599, 600, 600, 601, 601, 602, 602, 603, 603, 604, 604, 605, 605, 606, 606, 607, 607, 608, 608, 609, 609, 610, 611, 611, 612, 612, 613, 613, 614, 614, 615, 615, 616, 616, 617, 617, 618, 618, 619, 619, 620, 621, 621, 622, 622, 623, 623, 624, 624, 625, 625, 626, 627, 627, 628, 628, 629, 629, 630, 630, 631, 632, 632, 633, 633, 634, 634, 635, 636, 636, 637, 637, 638, 638, 639, 640, 640, 641, 641, 642, 642, 643, 644, 644, 645, 645, 646, 647, 647, 648, 648, 649, 650, 650, 651, 651, 652, 653, 653, 654, 654, 655, 656, 656, 657, 658, 658, 659, 659, 660, 661, 661, 662, 663, 663, 664, 665, 665, 666, 666, 667, 668, 668, 669, 670, 670, 671, 672, 672, 673, 674, 674, 675, 676, 676, 677, 678, 678, 679, 680, 680, 681, 682, 683, 683, 684, 685, 685, 686, 687, 687, 688, 689, 690, 690, 691, 692, 692, 693, 694, 695, 695, 696, 697, 698, 698, 699, 700, 701, 701, 702, 703, 704, 704, 705, 706, 707, 707, 708, 709, 710, 710, 711, 712, 713, 714, 714, 715, 716, 717, 718, 718, 719, 720, 721, 722, 722, 723, 724, 725, 726, 726, 727, 728, 729, 730, 731, 731, 732, 733, 734, 735, 736, 737, 737, 738, 739, 740, 741, 742, 743, 744, 744, 745, 746, 747, 748, 749, 750, 751, 752, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 814, 815, 816, 817, 818, 819, 820, 821, 822, 824, 825, 826, 827, 828, 829, 830, 832, 833, 834, 835, 836, 837, 839, 840, 841, 842, 843, 845, 846, 847, 848, 849, 851, 852, 853, 854, 855, 857, 858, 859, 860, 862, 863, 864, 865, 867, 868, 869, 870, 872, 873, 874, 876, 877, 878, 880, 881, 882, 883, 885, 886, 887, 889, 890, 891, 893, 894, 895, 897, 898, 900, 901, 902, 904, 905, 906, 908, 909, 911, 912, 913, 915, 916, 918, 919, 920, 922, 923, 925, 926, 928, 929, 931, 932, 933, 935, 936, 938, 939, 941, 942, 944, 945, 947, 948, 950, 951, 953, 954, 956, 957, 959, 961, 962, 964, 965, 967, 968, 970, 972, 973, 975, 976, 978, 979, 981, 983, 984, 986, 988, 989, 991, 992, 994, 996, 997, 999, 1001, 1002, 1004, 1006, 1007, 1009, 1011, 1012, 1014, 1016, 1017, 1019, 1021, 1023};
__kernel void kernel_hdr_rgb (__read_only image2d_t input, __write_only image2d_t output)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

    int2 pos = (int2)(x, y);
    float4 pixel_in, pixel_out;
    pixel_in = read_imagef(input, sampler, pos);
    pixel_out.x = convert_float(HDRTable[convert_int(pixel_in.x * 1023)] / 1023.0);
    pixel_out.y = convert_float(HDRTable[convert_int(pixel_in.y * 1023)] / 1023.0);
    pixel_out.z = convert_float(HDRTable[convert_int(pixel_in.z * 1023)] / 1023.0);
    pixel_out.w = 0.0;
    write_imagef(output, pos, pixel_out);
}


## kernel_image_scaler.cl
/**
* \brief Image scaling kernel function.
* \param[in] input Input image object.
* \param[out] output scaled output image object.
* \param[in] output_widht: output width
* \param[in] output_height: output height
* \param[in] vertical_offset:  vertical offset from y to uv
*/
//TODO: This is the most instesrting kernel, using hardware sampler to scale
__kernel void kernel_image_scaler (__read_only image2d_t input,
                                   __write_only image2d_t output,
                                   const uint output_widht,
                                   const uint output_height)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;

    float2 normCoor = convert_float2((int2)(x, y)) / (float2)(output_widht, output_height);
    float4 scaled_pixel = read_imagef(input, sampler, normCoor);
    write_imagef(output, (int2)(x, y), scaled_pixel);
}


## kernel_macc.cl
/*
 * function: kernel_macc
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * table: macc table.
 */
unsigned int get_sector_id (float u, float v)
{
    u = fabs(u) > 0.00001f ? u : 0.00001f;
    float tg = v / u;
    unsigned int se = tg > 1 ? (tg > 2 ? 3 : 2) : (tg > 0.5 ? 1 : 0);
    unsigned int so = tg > -1 ? (tg > -0.5 ? 3 : 2) : (tg > -2 ? 1 : 0);
    return tg > 0 ? (u > 0 ? se : (se + 8)) : (u > 0 ? (so + 12) : (so + 4));
}
__kernel void kernel_macc (__read_only image2d_t input, __write_only image2d_t output, __global float *table)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    float4 pixel_in[8], pixel_out[8];
    float Y[8], ui[8], vi[8], uo[8], vo[8];
    unsigned int table_id[8];
    int i = 0, j = 0;

#pragma unroll
    for(j = 0; j < 2; j++) {
#pragma unroll
        for(i = 0; i < 4; i++) {
            pixel_in[j * 4 + i] = read_imagef(input, sampler, (int2)(4 * x + i, 2 * y + j));
            Y[j * 4 + i] = 0.3 * pixel_in[j * 4 + i].x + 0.59 * pixel_in[j * 4 + i].y + 0.11 * pixel_in[j * 4 + i].z;
            ui[j * 4 + i] = 0.493 * (pixel_in[j * 4 + i].z - Y[j * 4 + i]);
            vi[j * 4 + i] = 0.877 * (pixel_in[j * 4 + i].x - Y[j * 4 + i]);
            table_id[j * 4 + i] = get_sector_id(ui[j * 4 + i], vi[j * 4 + i]);
            uo[j * 4 + i] = ui[j * 4 + i] * table[4 * table_id[j * 4 + i]] + vi[j * 4 + i] * table[4 * table_id[j * 4 + i] + 1];
            vo[j * 4 + i] = ui[j * 4 + i] * table[4 * table_id[j * 4 + i] + 2] + vi[j * 4 + i] * table[4 * table_id[j * 4 + i] + 3];
            pixel_out[j * 4 + i].x = Y[j * 4 + i] + 1.14 * vo[j * 4 + i];
            pixel_out[j * 4 + i].y = Y[j * 4 + i] - 0.39 * uo[j * 4 + i] - 0.58 * vo[j * 4 + i];
            pixel_out[j * 4 + i].z = Y[j * 4 + i] + 2.03 * uo[j * 4 + i];
            pixel_out[j * 4 + i].w = 0.0;
            write_imagef(output, (int2)(4 * x + i, 2 * y + j), pixel_out[j * 4 + i]);
        }
    }
}


## kernel_snr.cl
/*
 * function: kernel_snr
 *     implementation of simple noise reduction
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */

__kernel void kernel_snr (__read_only image2d_t input, __write_only image2d_t output)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    float4 p[9];
    p[0] = read_imagef(input, sampler, (int2)(x - 1, y - 1));
    p[1] = read_imagef(input, sampler, (int2)(x, y - 1));
    p[2] = read_imagef(input, sampler, (int2)(x + 1, y - 1));
    p[3] = read_imagef(input, sampler, (int2)(x - 1, y));
    p[4] = read_imagef(input, sampler, (int2)(x, y));
    p[5] = read_imagef(input, sampler, (int2)(x + 1, y));
    p[6] = read_imagef(input, sampler, (int2)(x - 1, y + 1));
    p[7] = read_imagef(input, sampler, (int2)(x, y + 1));
    p[8] = read_imagef(input, sampler, (int2)(x + 1, y + 1));

    float4 pixel_out;
    pixel_out.x = (p[0].x + p[1].x + p[2].x + p[3].x + p[4].x + p[5].x + p[6].x + p[7].x + p[8].x) / 9.0f;
    pixel_out.y = (p[0].y + p[1].y + p[2].y + p[3].y + p[4].y + p[5].y + p[6].y + p[7].y + p[8].y) / 9.0f;
    pixel_out.z = (p[0].z + p[1].z + p[2].z + p[3].z + p[4].z + p[5].z + p[6].z + p[7].z + p[8].z) / 9.0f;
    pixel_out.w = p[4].w;
    write_imagef(output, (int2)(x, y), pixel_out);
}

## kernel_tonemapping.cl
/*
 * function: kernel_tonemapping
 *     implementation of tone mapping
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 */

#define WORK_ITEM_X_SIZE 8
#define WORK_ITEM_Y_SIZE 8

#define SHARED_PIXEL_X_SIZE 10
#define SHARED_PIXEL_Y_SIZE 10

__kernel void kernel_tonemapping (__read_only image2d_t input, __write_only image2d_t output, float y_max, float y_target, int image_height)
{
    int g_id_x = get_global_id (0);
    int g_id_y = get_global_id (1);

    int group_id_x = get_group_id(0);
    int group_id_y = get_group_id(1);

    int local_id_x = get_local_id(0);
    int local_id_y = get_local_id(1);

    int g_size_x = get_global_size (0);
    int g_size_y = get_global_size (1);

    int local_index = local_id_y * WORK_ITEM_X_SIZE + local_id_x;

    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    __local float4 local_src_data[SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE];

    float4 src_data_Gr = read_imagef (input, sampler, (int2)(g_id_x, g_id_y));
    float4 src_data_R = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height));
    float4 src_data_B = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 2));
    float4 src_data_Gb = read_imagef (input, sampler, (int2)(g_id_x, g_id_y + image_height * 3));

    float4 src_data_G = (src_data_Gr + src_data_Gb) / 2;

    float4 src_y_data = 0.0f;
    src_y_data = mad(src_data_R, 255.f * 0.299f, src_y_data);
    src_y_data = mad(src_data_G, 255.f * 0.587f, src_y_data);
    src_y_data = mad(src_data_B, 255.f * 0.114f, src_y_data);

    local_src_data[(local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x + 1] = src_y_data;

    if(local_index < SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE)
    {
        int target_index = local_index <= SHARED_PIXEL_X_SIZE ? local_index : (local_index <= (SHARED_PIXEL_X_SIZE * SHARED_PIXEL_Y_SIZE - WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE - SHARED_PIXEL_X_SIZE) ? (local_index + WORK_ITEM_X_SIZE + (local_index - (SHARED_PIXEL_X_SIZE + 1)) / 2 * WORK_ITEM_X_SIZE) : (local_index + WORK_ITEM_X_SIZE * WORK_ITEM_Y_SIZE));
        int start_x = mad24(group_id_x, WORK_ITEM_X_SIZE, -1);
        int start_y = mad24(group_id_y, WORK_ITEM_Y_SIZE, -1);
        int offset_x = target_index % SHARED_PIXEL_X_SIZE;
        int offset_y = target_index / SHARED_PIXEL_X_SIZE;

        float4 data_Gr = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y));
        float4 data_R = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height));
        float4 data_B = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 2));
        float4 data_Gb = read_imagef (input, sampler, (int2)(start_x + offset_x, start_y + offset_y + image_height * 3));

        float4 data_G = (data_Gr + data_Gb) / 2;

        float4 y_data = 0.0f;
        y_data = mad(data_R, 255.f * 0.299f, y_data);
        y_data = mad(data_G, 255.f * 0.587f, y_data);
        y_data = mad(data_B, 255.f * 0.114f, y_data);
        local_src_data[target_index] = y_data;
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    float gaussian_table[9] = {0.075f, 0.124f, 0.075f,
                               0.124f, 0.204f, 0.124f,
                               0.075f, 0.124f, 0.075f
                              };
    float4 src_ym_data = 0.0f;

    float16 integrate_data = *((__local float16 *)(local_src_data + local_id_y * SHARED_PIXEL_X_SIZE + local_id_x));

    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[0], src_ym_data);
    src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[1], src_ym_data);
    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[2], src_ym_data);

    integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 1) * SHARED_PIXEL_X_SIZE + local_id_x));

    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[3], src_ym_data);
    src_ym_data = mad(src_y_data, (float4)gaussian_table[4], src_ym_data);
    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[5], src_ym_data);

    integrate_data = *((__local float16 *)(local_src_data + (local_id_y + 2) * SHARED_PIXEL_X_SIZE + local_id_x));

    src_ym_data = mad(integrate_data.s3456, (float4)gaussian_table[6], src_ym_data);
    src_ym_data = mad(integrate_data.s4567, (float4)gaussian_table[7], src_ym_data);
    src_ym_data = mad(integrate_data.s5678, (float4)gaussian_table[8], src_ym_data);

    float4 gain = ((float4)(y_max + y_target) + src_ym_data) / (src_y_data + src_ym_data + (float4)y_target);
    src_data_Gr = src_data_Gr * gain;
    src_data_R = src_data_R * gain;
    src_data_B = src_data_B * gain;
    src_data_Gb = src_data_Gb * gain;

    write_imagef(output, (int2)(g_id_x, g_id_y), src_data_Gr);
    write_imagef(output, (int2)(g_id_x, g_id_y + image_height), src_data_R);
    write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 2), src_data_B);
    write_imagef(output, (int2)(g_id_x, g_id_y + image_height * 3), src_data_Gb);
}

## kernel_wb.cl
/*
 * function: kernel_wb
 *     black level correction for sensor data input
 * input:    image2d_t as read only
 * output:   image2d_t as write only
 * wb_config: white balance configuration
 */

typedef struct
{
    float r_gain;
    float gr_gain;
    float gb_gain;
    float b_gain;
} CLWBConfig;

__kernel void kernel_wb (__read_only image2d_t input,
                         __write_only image2d_t output,
                         CLWBConfig wb_config)
{
    int x = get_global_id (0);
    int y = get_global_id (1);
    sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
    float4 Gr_in, R_in, B_in, Gb_in;
    float4 Gr_out, R_out, B_out, Gb_out;
    Gr_in = read_imagef(input, sampler, (int2)(2 * x, 2 * y));
    R_in = read_imagef(input, sampler, (int2)(2 * x + 1, 2 * y));
    B_in = read_imagef(input, sampler, (int2)(2 * x, 2 * y + 1));
    Gb_in = read_imagef(input, sampler, (int2)(2 * x + 1, 2 * y + 1));
    Gr_out.x = Gr_in.x * wb_config.gr_gain;
    Gr_out.y = 0.0;
    Gr_out.z = 0.0;
    Gr_out.w = 1.0;
    R_out.x = R_in.x * wb_config.r_gain;
    R_out.y = 0.0;
    R_out.z = 0.0;
    R_out.w = 1.0;
    B_out.x = B_in.x * wb_config.b_gain;
    B_out.y = 0.0;
    B_out.z = 0.0;
    B_out.w = 1.0;
    Gb_out.x = Gb_in.x * wb_config.gb_gain;
    Gb_out.y = 0.0;
    Gb_out.z = 0.0;
    Gb_out.w = 1.0;
    write_imagef(output, (int2)(2 * x, 2 * y), Gr_out);
    write_imagef(output, (int2)(2 * x + 1, 2 * y), R_out);
    write_imagef(output, (int2)(2 * x, 2 * y + 1), B_out);
    write_imagef(output, (int2)(2 * x + 1, 2 * y + 1), Gb_out);
}
	/*
	* function: kernel_3a_stats
	* input: image2d_t as read only
	* output: XCamGridStat, stats results
	*/

	typedef struct
	{
	unsigned int avg_y;

	unsigned int avg_r;
	unsigned int avg_gr;
	unsigned int avg_gb;
	unsigned int avg_b;
	unsigned int valid_wb_count;

	unsigned int f_value1;
	unsigned int f_value2;
	} XCamGridStat;

	__kernel void kernel_3a_stats (__read_only image2d_t input, __global XCamGridStat *output)
	{
	int x = get_global_id (0);
	int y = get_global_id (1);
	int w = get_global_size (0);
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_NONE \| CLK_FILTER_NEAREST;

	int x0 = 16 * x;
	int y0 = 16 * y;
	float sum_gr = 0.0f, sum_r = 0.0f, sum_b = 0.0f, sum_gb = 0.0f;
	float avg_gr = 0.0f, avg_r = 0.0f, avg_b = 0.0f, avg_gb = 0.0f;
	int i = 0, j = 0;
	float count = (16.0 / 2) * (16.0 / 2);
	float4 p[4];

	#pragma unroll
	for (j = 0; j < 16; j += 2) {
	#pragma unroll
	for (i = 0; i < 16; i += 2) {
	p[0] = read_imagef (input, sampler, (int2)(x0 + i, y0 + j));
	p[1] = read_imagef (input, sampler, (int2)(x0 + i, y0 + j + 1));
	p[2] = read_imagef (input, sampler, (int2)(x0 + i + 1, y0 + j));
	p[3] = read_imagef (input, sampler, (int2)(x0 + i + 1, y0 + j + 1));
	sum_gr += p[0].x;
	sum_b += p[1].x;
	sum_r += p[2].x;
	sum_gb += p[3].x;
	}
	}

	avg_gr = sum_gr / count;
	avg_r = sum_r / count;
	avg_b = sum_b / count;
	avg_gb = sum_gb / count;

	output[y * w + x].avg_gr = convert_uint(avg_gr * 256.0);
	output[y * w + x].avg_r = convert_uint(avg_r * 256.0);
	output[y * w + x].avg_b = convert_uint(avg_b * 256.0);
	output[y * w + x].avg_gb = convert_uint(avg_gb * 256.0);
	output[y * w + x].valid_wb_count = convert_uint(count);
	output[y * w + x].avg_y = convert_uint(((avg_gr + avg_gb) / 2.0f) * 256.0);
	output[y * w + x].f_value1 = 0;
	output[y * w + x].f_value2 = 0;

	}
	/*
	* function: kernel_bayer_copy
	* sample code of default kernel arguments
	* input: image2d_t as read only
	* output: image2d_t as write only
	*/

	//#define ENABLE_IMAGE_2D_INPUT 0

	/*
	* GROUP_PIXEL_X_SIZE = 2 * GROUP_CELL_X_SIZE
	* GROUP_PIXEL_Y_SIZE = 2 * GROUP_CELL_Y_SIZE
	*/

	#define GROUP_CELL_X_SIZE 64
	#define GROUP_CELL_Y_SIZE 4

	//float4; 16
	#define SLM_X_SIZE (GROUP_CELL_X_SIZE / 4)
	#define SLM_Y_SIZE GROUP_CELL_Y_SIZE

	#define STATS_3A_CELL_X_SIZE 8
	#define STATS_3A_CELL_Y_SIZE GROUP_CELL_Y_SIZE

	typedef struct {
	float level_gr; /* Black level for GR pixels */
	float level_r; /* Black level for R pixels */
	float level_b; /* Black level for B pixels */
	float level_gb; /* Black level for GB pixels */
	uint color_bits;
	} CLBLCConfig;


	typedef struct
	{
	float r_gain;
	float gr_gain;
	float gb_gain;
	float b_gain;
	} CLWBConfig;

	inline int slm_pos (const int x, const int y)
	{
	return mad24 (y, SLM_X_SIZE, x);
	}

	inline void gamma_correct(float8 in_out, __global float table)
	{
	in_out->s0 = table[clamp(convert_int(in_out->s0 * 255.0f), 0, 255)];
	in_out->s1 = table[clamp(convert_int(in_out->s1 * 255.0f), 0, 255)];
	in_out->s2 = table[clamp(convert_int(in_out->s2 * 255.0f), 0, 255)];
	in_out->s3 = table[clamp(convert_int(in_out->s3 * 255.0f), 0, 255)];
	in_out->s4 = table[clamp(convert_int(in_out->s4 * 255.0f), 0, 255)];
	in_out->s5 = table[clamp(convert_int(in_out->s5 * 255.0f), 0, 255)];
	in_out->s6 = table[clamp(convert_int(in_out->s6 * 255.0f), 0, 255)];
	in_out->s7 = table[clamp(convert_int(in_out->s7 * 255.0f), 0, 255)];
	}

	inline float avg_float8 (float8 data)
	{
	return (data.s0 + data.s1 + data.s2 + data.s3 + data.s4 + data.s5 + data.s6 + data.s7) * 0.125f;
	}

	inline void stats_3a_calculate (
	__local float4 * slm_gr,
	__local float4 * slm_r,
	__local float4 * slm_b,
	__local float4 * slm_gb,
	__global ushort8 * stats_output,
	CLWBConfig *wb_config)
	{
	const int group_x_size = get_num_groups (0);
	const int group_id_x = get_group_id (0);
	const int group_id_y = get_group_id (1);

	const int l_id_x = get_local_id (0);
	const int l_id_y = get_local_id (1);
	const int l_size_x = get_local_size (0);
	const int stats_float4_x_count = STATS_3A_CELL_X_SIZE / 4;
	int count = stats_float4_x_count * STATS_3A_CELL_Y_SIZE / 4;

	int index = mad24 (l_id_y, l_size_x, l_id_x);
	int index_x = index % SLM_X_SIZE;
	int index_y = index / SLM_X_SIZE;

	if (mad24 (index_y, stats_float4_x_count, index_x % stats_float4_x_count) < count) {
	int pitch_count = count / stats_float4_x_count * SLM_X_SIZE;
	int index1 = index + pitch_count;
	int index2 = index1 + pitch_count;
	int index3 = index2 + pitch_count;
	slm_gr[index] = (slm_gr[index] + slm_gr[index1] + slm_gr[index2] + slm_gr[index3]) * 0.25f;
	slm_r[index] = (slm_r[index] + slm_r[index1] + slm_r[index2] + slm_r[index3]) * 0.25f;
	slm_b[index] = (slm_b[index] + slm_b[index1] + slm_b[index2] + slm_b[index3]) * 0.25f;
	slm_gb[index] = (slm_gb[index] + slm_gb[index1] + slm_gb[index2] + slm_gb[index3]) * 0.25f;
	}
	barrier (CLK_LOCAL_MEM_FENCE);

	if (index < SLM_X_SIZE / 2) {
	float result_gr, result_r, result_b, result_gb, avg_y;
	float8 tmp;
	tmp = ((__local float8*)slm_gr)[index];
	result_gr = avg_float8 (tmp);

	tmp = ((__local float8*)slm_r)[index];
	result_r = avg_float8 (tmp);

	tmp = ((__local float8*)slm_b)[index];
	result_b = avg_float8 (tmp);

	tmp = ((__local float8*)slm_gb)[index];
	result_gb = avg_float8 (tmp);

	avg_y = convert_uchar_sat(
	mad ((result_gr * wb_config->gr_gain + result_gb * wb_config->gb_gain), 74.843f,
	mad (result_r * wb_config->r_gain, 76.245f, result_b * 29.070f)));

	int out_index = mad24 (mad24 (group_id_y, group_x_size, group_id_x),
	(GROUP_CELL_X_SIZE / STATS_3A_CELL_X_SIZE) * (GROUP_CELL_Y_SIZE / STATS_3A_CELL_Y_SIZE),
	index);

	//ushort avg_y; avg_r; avg_gr; avg_gb; avg_b; valid_wb_count; f_value1; f_value2;
	stats_output[out_index] = (ushort8) (
	convert_ushort (convert_uchar_sat (avg_y)),
	convert_ushort (convert_uchar_sat (result_r * 255.0f)),
	convert_ushort (convert_uchar_sat (result_gr * 255.0f)),
	convert_ushort (convert_uchar_sat (result_gb * 255.0f)),
	convert_ushort (convert_uchar_sat (result_b * 255.0f)),
	STATS_3A_CELL_X_SIZE * STATS_3A_CELL_Y_SIZE,
	0,
	0);
	}
	}


	__kernel void kernel_bayer_basic (
	#if ENABLE_IMAGE_2D_INPUT
	__read_only image2d_t input,
	#else
	__global const ushort8 *input,
	#endif
	uint input_aligned_width,
	__write_only image2d_t output,
	uint out_height,
	CLBLCConfig blc_config,
	CLWBConfig wb_config,
	__global float *gamma_table,
	__global ushort8 *stats_output
	)
	{
	int g_x = get_global_id (0);
	int g_y = get_global_id (1);

	const int l_x = get_local_id (0);
	const int l_y = get_local_id (1);
	const int l_x_size = get_local_size (0);
	const int l_y_size = get_local_size (1);
	const int group_id_x = get_group_id (0);
	const int group_id_y = get_group_id (1);

	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_NONE \| CLK_FILTER_NEAREST;

	int index = mad24 (l_y, l_x_size, l_x);
	int x_cell_start = (GROUP_CELL_X_SIZE / 4) * group_id_x;
	int y_cell_start = GROUP_CELL_Y_SIZE * group_id_y;
	int x, y;

	float blc_multiplier = (float)(1 << (16 - blc_config.color_bits));

	__local float4 slm_gr[SLM_X_SIZE * SLM_Y_SIZE], slm_r[SLM_X_SIZE * SLM_Y_SIZE], slm_b[SLM_X_SIZE * SLM_Y_SIZE], slm_gb[SLM_X_SIZE * SLM_Y_SIZE];

	for (; index < SLM_X_SIZE * SLM_Y_SIZE; index += l_x_size * l_y_size) {
	float8 line1;
	float8 line2;

	x = index % SLM_X_SIZE + x_cell_start;
	y = index / SLM_X_SIZE + y_cell_start;

	#if ENABLE_IMAGE_2D_INPUT
	line1 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2)))) / 65536.0f;
	line2 = convert_float8 (as_ushort8 (read_imageui(input, sampler, (int2)(x, y * 2 + 1)))) / 65536.0f;
	#else
	line1 = convert_float8 (input [y * 2 * input_aligned_width + x]) / 65536.0f;
	line2 = convert_float8 (input [(y * 2 + 1) * input_aligned_width + x]) / 65536.0f;
	#endif

	float4 gr = mad (line1.even, blc_multiplier, - blc_config.level_gr);
	float4 r = mad (line1.odd, blc_multiplier, - blc_config.level_r);
	float4 b = mad (line2.even, blc_multiplier, - blc_config.level_b);
	float4 gb = mad (line2.odd, blc_multiplier, - blc_config.level_gb);

	slm_gr[index] = gr;
	slm_r[index] = r;
	slm_b[index] = b;
	slm_gb[index] = gb;
	}
	barrier(CLK_LOCAL_MEM_FENCE);

	float8 data_gr, data_r, data_b, data_gb;
	index = mad24 (l_y, l_x_size, l_x);
	x = mad24 (GROUP_CELL_X_SIZE / 8, group_id_x, index % (SLM_X_SIZE / 2));
	y = mad24 (GROUP_CELL_Y_SIZE, group_id_y, index / (SLM_X_SIZE / 2));

	data_gr = ((__local float8*)slm_gr)[index];
	data_gr = data_gr * wb_config.gr_gain;

	data_r = ((__local float8*)slm_r)[index];
	data_r = data_r * wb_config.r_gain;

	data_b = ((__local float8*)slm_b)[index];
	data_b = data_b * wb_config.b_gain;

	data_gb = ((__local float8*)slm_gb)[index];
	data_gb = data_gb * wb_config.gb_gain;

	#if ENABLE_GAMMA
	gamma_correct (&data_gr, gamma_table);
	gamma_correct (&data_r, gamma_table);
	gamma_correct (&data_b, gamma_table);
	gamma_correct (&data_gb, gamma_table);
	#endif

	#if 0
	if (x % 16 == 0 && y % 16 == 0) {
	uint8 value = convert_uint8(convert_uchar8_sat(data_gr * 255.0f));
	printf ("(x:%d, y:%d) (blc.bit:%d, level:%d) (wb.gr:%f)=> (%d, %d, %d, %d, %d, %d, %d, %d)\n",
	x * 8, y,
	blc_config.color_bits, convert_uint(blc_config.level_gr * 255.0f),
	wb_config.gr_gain,
	value.s0, value.s1, value.s2, value.s3, value.s4, value.s5, value.s6, value.s7);
	}
	#endif

	write_imageui (output, (int2)(x, y), as_uint4 (convert_ushort8 (data_gr * 65536.0f)));
	write_imageui (output, (int2)(x, y + out_height), as_uint4 (convert_ushort8 (data_r * 65536.0f)));
	write_imageui (output, (int2)(x, y + out_height * 2), as_uint4 (convert_ushort8 (data_b * 65536.0f)));
	write_imageui (output, (int2)(x, y + out_height * 3), as_uint4 (convert_ushort8 (data_gb * 65536.0f)));

	stats_3a_calculate (slm_gr, slm_r, slm_b, slm_gb, stats_output, &wb_config);
	}
	/*
	* function: kernel_bayer_pipe
	* params:
	* input: image2d_t as read only
	* output: image2d_t as write only
	* blc_config: black level correction configuration
	* wb_config: whitebalance configuration
	* gamma_table: RGGB table
	* stats_output: 3a stats output
	*/


	#define WORKGROUP_CELL_WIDTH 8
	#define WORKGROUP_CELL_HEIGHT 8

	#define DEMOSAIC_X_CELL_PER_WORKITEM 2

	#define PIXEL_PER_CELL 2

	#define SLM_CELL_X_OFFSET 4
	#define SLM_CELL_Y_OFFSET 1

	// 8x8
	#define SLM_CELL_X_VALID_SIZE WORKGROUP_CELL_WIDTH
	#define SLM_CELL_Y_VALID_SIZE WORKGROUP_CELL_HEIGHT

	// 10x10
	#define SLM_CELL_X_SIZE (SLM_CELL_X_VALID_SIZE + SLM_CELL_X_OFFSET * 2)
	#define SLM_CELL_Y_SIZE (SLM_CELL_Y_VALID_SIZE + SLM_CELL_Y_OFFSET * 2)

	#define GUASS_DELTA_S_1 1.031739f
	#define GUASS_DELTA_S_1_5 1.072799f
	#define GUASS_DELTA_S_2 1.133173f
	#define GUASS_DELTA_S_2_5 1.215717f

	typedef struct
	{
	float ee_gain;
	float ee_threshold;
	float nr_gain;
	} CLEeConfig;

	inline int get_shared_pos_x (int i)
	{
	return i % SLM_CELL_X_SIZE;
	}

	inline int get_shared_pos_y (int i)
	{
	return i / SLM_CELL_X_SIZE;
	}

	inline int shared_pos (int x, int y)
	{
	return mad24(y, SLM_CELL_X_SIZE, x);
	}

	/* BA10=> GRBG */
	inline void grbg_slm_load (
	__local float px, __local float py, __local float pz, __local float pw,
	int index, __read_only image2d_t input, int x_start, int y_start
	)
	{
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;
	float4 data1, data2, line1, line2;
	int x0 = (get_shared_pos_x (index) + x_start) / 4;
	int y0 = get_shared_pos_y (index) + y_start;
	int2 pos = (int2)(x0, y0);
	float4 gr, r, b, gb;

	gr = read_imagef (input, sampler, (int2)(x0, y0));
	r = read_imagef (input, sampler, (int2)(x0, y0 + 544));
	b = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 2));
	gb = read_imagef (input, sampler, (int2)(x0, y0 + 544 * 3));

	((__local float4 )(px + index)) = gr;
	((__local float4 )(py + index)) = r;
	((__local float4 )(pz + index)) = b;
	((__local float4 )(pw + index)) = gb;
	}

	#define MAX_DELTA_COFF 5.0f
	#define MIN_DELTA_COFF 1.0f
	#define DEFAULT_DELTA_COFF 4.0f

	inline float2 delta_coff (float2 in, __local float *table)
	{
	float2 out;
	out.x = table[(int)(fabs(in.x * 64.0f))];
	out.y = table[(int)(fabs(in.y * 64.0f))];

	return out;
	}

	inline float2 dot_denoise (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float *table, float gain)
	{
	float2 coff0, coff1, coff2, coff3, coff4, coff5;
	coff0 = delta_coff (0.0, table) * gain;
	coff1 = delta_coff (in1 - value, table);
	coff2 = delta_coff (in2 - value, table);
	coff3 = delta_coff (in3 - value, table);
	coff4 = delta_coff (in4 - value, table);
	//(in1 * coff1 + in2 * coff2 + in3 * coff3 + in4 * coff4 + value * coff0)
	float2 sum1 = (mad (in1, coff1,
	mad (in2, coff2,
	mad (in3, coff3,
	mad (in4, coff4, value * coff0)))));
	return sum1 / (coff0 + coff1 + coff2 + coff3 + coff4);
	}

	inline float2 dot_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, float2 out, CLEeConfig ee_config, float2 *egain)
	{
	float2 eH = value - in1 * 0.5f - in3 * 0.5f;
	float2 eV = value - in2 * 0.5f - in4 * 0.5f;
	float2 ee;

	eH.x = eH.x > ee_config.ee_threshold ? eH.x : 0.0f;
	eH.y = eH.y > ee_config.ee_threshold ? eH.y : 0.0f;

	eV.x = eV.x > ee_config.ee_threshold ? eV.x : 0.0f;
	eV.y = eV.y > ee_config.ee_threshold ? eV.y : 0.0f;

	ee.x = fmax(eH.x, eV.x);
	ee.y = fmax(eH.y, eV.y);

	egain[0] = mad(ee, ee_config.ee_gain, out) / out;

	return out * egain[0];
	}

	inline float2 dot_denoise_ee (float2 value, float2 in1, float2 in2, float2 in3, float2 in4, __local float table, float gain, float2 egain, CLEeConfig ee_config)
	{
	float2 out = dot_denoise(value, in1, in2, in3, in4, table, gain);
	return dot_ee(value, in1, in2, in3, in4, out, ee_config, egain);
	}

	void demosaic_2_cell (
	__local float x_data_in, __local float y_data_in, __local float z_data_in, __local float w_data_in,
	int in_x, int in_y,
	__write_only image2d_t out, uint out_height, int out_x, int out_y)
	{
	float4 out_data;
	float2 value;
	int index;
	{
	float3 R_y[2];
	index = shared_pos (in_x - 1, in_y);
	R_y[0] = (__local float3)(y_data_in + index);
	index = shared_pos (in_x - 1, in_y + 1);
	R_y[1] = (__local float3)(y_data_in + index);

	out_data.s02 = (R_y[0].s01 + R_y[0].s12) * 0.5f;
	out_data.s13 = R_y[0].s12;
	write_imagef (out, (int2)(out_x, out_y), out_data);

	out_data.s02 = (R_y[0].s01 + R_y[0].s12 + R_y[1].s01 + R_y[1].s12) * 0.25f;
	out_data.s13 = (R_y[0].s12 + R_y[1].s12) * 0.5f;
	write_imagef (out, (int2)(out_x, out_y + 1), out_data);
	}

	{
	float3 B_z[2];
	index = shared_pos (in_x, in_y - 1);
	B_z[0] = (__local float3)(z_data_in + index);
	index = shared_pos (in_x, in_y);
	B_z[1] = (__local float3)(z_data_in + index);

	out_data.s02 = (B_z[0].s01 + B_z[1].s01) * 0.5f;
	out_data.s13 = (B_z[0].s01 + B_z[0].s12 + B_z[1].s01 + B_z[1].s12) * 0.25f;
	write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data);

	out_data.s02 = B_z[1].s01;
	out_data.s13 = (B_z[1].s01 + B_z[1].s12) * 0.5f;
	write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data);
	}

	{
	float3 Gr_x[2], Gb_w[2];
	index = shared_pos (in_x, in_y);
	Gr_x[0] = (__local float3)(x_data_in + index);
	index = shared_pos (in_x, in_y + 1);
	Gr_x[1] = (__local float3)(x_data_in + index);

	index = shared_pos (in_x - 1, in_y - 1);
	Gb_w[0] = (__local float3)(w_data_in + index);
	index = shared_pos (in_x - 1, in_y);
	Gb_w[1] = (__local float3)(w_data_in + index);

	out_data.s02 = (Gr_x[0].s01 * 4.0f + Gb_w[0].s01 +
	Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12) * 0.125f;
	out_data.s13 = (Gr_x[0].s01 + Gr_x[0].s12 + Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
	write_imagef (out, (int2)(out_x, out_y + out_height), out_data);

	out_data.s02 = (Gr_x[0].s01 + Gr_x[1].s01 + Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;

	out_data.s13 = (Gb_w[1].s12 * 4.0f + Gr_x[0].s01 +
	Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12) * 0.125f;
	write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data);
	}
	}

	void demosaic_denoise_2_cell (
	__local float x_data_in, __local float y_data_in, __local float z_data_in, __local float w_data_in,
	int in_x, int in_y,
	__write_only image2d_t out, uint out_height, int out_x, int out_y, __local float *table, CLEeConfig ee_config)
	{
	float4 out_data[2];
	float2 value;
	int index;
	float2 egain[4];

	///////////////////////////////////////G///////////////////////////////////
	{
	float3 Gr_x[2], Gb_w[2];
	index = shared_pos (in_x - 1, in_y - 1);
	Gb_w[0] = (__local float3)(w_data_in + index);
	index = shared_pos (in_x - 1, in_y);
	Gb_w[1] = (__local float3)(w_data_in + index);

	index = shared_pos (in_x, in_y);
	Gr_x[0] = (__local float3)(x_data_in + index);
	index = shared_pos (in_x, in_y + 1);
	Gr_x[1] = (__local float3)(x_data_in + index);

	value = mad (Gr_x[0].s01, 4.0f, (Gb_w[0].s01 +
	Gb_w[0].s12 + Gb_w[1].s01 + Gb_w[1].s12)) * 0.125f;
	out_data[0].s02 = dot_denoise_ee (value, Gb_w[0].s01, Gb_w[0].s12, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1_5, &egain[0], ee_config);
	value = (Gr_x[0].s01 + Gr_x[0].s12 +
	Gb_w[0].s12 + Gb_w[1].s12) * 0.25f;
	out_data[0].s13 = dot_denoise_ee(value, Gr_x[0].s01, Gr_x[0].s12, Gb_w[0].s12, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[1], ee_config);

	value = (Gr_x[0].s01 + Gr_x[1].s01 +
	Gb_w[1].s01 + Gb_w[1].s12) * 0.25f;
	out_data[1].s02 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[1].s01, Gb_w[1].s01, Gb_w[1].s12, table, GUASS_DELTA_S_1, &egain[2], ee_config);

	value = mad (Gb_w[1].s12, 4.0f, (Gr_x[0].s01 +
	Gr_x[0].s12 + Gr_x[1].s01 + Gr_x[1].s12)) * 0.125f;
	out_data[1].s13 = dot_denoise_ee (value, Gr_x[0].s01, Gr_x[0].s12, Gr_x[1].s01, Gr_x[1].s12, table, GUASS_DELTA_S_1_5, &egain[3], ee_config);

	write_imagef (out, (int2)(out_x, out_y + out_height), out_data[0]);
	write_imagef (out, (int2)(out_x, out_y + 1 + out_height), out_data[1]);
	}

	////////////////////////////////R//////////////////////////////////////////
	{
	float4 R_y[3];
	index = shared_pos (in_x - 1, in_y - 1);
	R_y[0] = (__local float4)(y_data_in + index);
	index = shared_pos (in_x - 1, in_y);
	R_y[1] = (__local float4)(y_data_in + index);
	index = shared_pos (in_x - 1, in_y + 1);
	R_y[2] = (__local float4)(y_data_in + index);

	value = (R_y[1].s01 + R_y[1].s12) * 0.5f;
	out_data[0].s02 = dot_denoise (value, R_y[0].s01, R_y[0].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_2_5) * egain[0];

	value = R_y[1].s12;
	out_data[0].s13 = dot_denoise (value, R_y[0].s12, R_y[1].s01, R_y[1].s23, R_y[2].s12, table, GUASS_DELTA_S_2) * egain[1];

	value = (R_y[1].s01 + R_y[1].s12 +
	R_y[2].s01 + R_y[2].s12) * 0.25f;
	out_data[1].s02 = dot_denoise (value, R_y[1].s01, R_y[1].s12, R_y[2].s01, R_y[2].s12, table, GUASS_DELTA_S_1_5) * egain[2];

	value = (R_y[1].s12 + R_y[2].s12) * 0.5f;
	out_data[1].s13 = dot_denoise (value, R_y[1].s01, R_y[1].s23, R_y[2].s01, R_y[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];

	write_imagef (out, (int2)(out_x, out_y), out_data[0]);
	write_imagef (out, (int2)(out_x, out_y + 1), out_data[1]);

	}
	////////////////////////////////B//////////////////////////////////////////
	{
	float4 B_z[3];
	index = shared_pos (in_x - 1, in_y - 1);
	B_z[0] = (__local float4)(z_data_in + index);
	index = shared_pos (in_x - 1, in_y);
	B_z[1] = (__local float4)(z_data_in + index);
	index = shared_pos (in_x - 1, in_y + 1);
	B_z[2] = (__local float4)(z_data_in + index);

	value = (B_z[0].s12 + B_z[1].s12) * 0.5f;
	out_data[0].s02 = dot_denoise (value, B_z[0].s01, B_z[0].s23, B_z[1].s01, B_z[1].s23, table, GUASS_DELTA_S_2_5) * egain[0];

	value = (B_z[0].s12 + B_z[0].s23 +
	B_z[1].s12 + B_z[1].s23) * 0.25f;
	out_data[0].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[1].s12, B_z[1].s23, table, GUASS_DELTA_S_1_5) * egain[1];

	value = B_z[1].s12;
	out_data[1].s02 = dot_denoise (value, B_z[0].s12, B_z[1].s01, B_z[1].s23, B_z[2].s12, table, GUASS_DELTA_S_2) * egain[2];

	value = (B_z[1].s12 + B_z[1].s23) * 0.5f;
	out_data[1].s13 = dot_denoise (value, B_z[0].s12, B_z[0].s23, B_z[2].s12, B_z[2].s23, table, GUASS_DELTA_S_2_5) * egain[3];

	write_imagef (out, (int2)(out_x, out_y + out_height * 2), out_data[0]);
	write_imagef (out, (int2)(out_x, out_y + 1 + out_height * 2), out_data[1]);
	}
	}

	void shared_demosaic (
	__local float x_data_in, __local float y_data_in, __local float z_data_in, __local float w_data_in,
	int in_x, int in_y,
	__write_only image2d_t out, uint output_height, int out_x, int out_y,
	uint has_denoise, __local float *table, CLEeConfig ee_config)
	{
	if (has_denoise) {
	demosaic_denoise_2_cell (
	x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
	out, output_height, out_x, out_y, table, ee_config);
	} else {
	demosaic_2_cell (
	x_data_in, y_data_in, z_data_in, w_data_in, in_x, in_y,
	out, output_height, out_x, out_y);
	}
	}

	__kernel void kernel_bayer_pipe (__read_only image2d_t input,
	uint input_height,
	__write_only image2d_t output,
	uint output_height,
	__global float * bnr_table,
	uint has_denoise,
	CLEeConfig ee_config
	)
	{
	int g_id_x = get_global_id (0);
	int g_id_y = get_global_id (1);
	int g_size_x = get_global_size (0);
	int g_size_y = get_global_size (1);

	int l_id_x = get_local_id(0);
	int l_id_y = get_local_id(1);
	int l_size_x = get_local_size (0);
	int l_size_y = get_local_size (1);

	__local float p1_x[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_y[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_z[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE], p1_w[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
	__local float4 p2[SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE];
	__local float4 *stats_cache = p2;
	__local float SLM_delta_coef_table[64];

	int out_x_start, out_y_start;
	int x_start = get_group_id (0) * WORKGROUP_CELL_WIDTH;
	int y_start = get_group_id (1) * WORKGROUP_CELL_HEIGHT;
	int i = mad24 (l_id_y, l_size_x, l_id_x);
	int j = i;

	i *= 4;
	for (; i < SLM_CELL_X_SIZE * SLM_CELL_Y_SIZE; i += (l_size_x * l_size_y) * 4) {
	grbg_slm_load (p1_x, p1_y, p1_z, p1_w, i,
	input,
	x_start - SLM_CELL_X_OFFSET, y_start - SLM_CELL_Y_OFFSET);
	}
	for(; j < 64; j += l_size_x * l_size_y)
	SLM_delta_coef_table[j] = bnr_table[j];

	barrier(CLK_LOCAL_MEM_FENCE);

	i = mad24 (l_id_y, l_size_x, l_id_x);
	int workitem_x_size = (SLM_CELL_X_VALID_SIZE / DEMOSAIC_X_CELL_PER_WORKITEM);
	int input_x = (i % workitem_x_size) * DEMOSAIC_X_CELL_PER_WORKITEM;
	int input_y = i / workitem_x_size;

	shared_demosaic (
	p1_x, p1_y, p1_z, p1_w,
	input_x + SLM_CELL_X_OFFSET, input_y + SLM_CELL_Y_OFFSET,
	output, output_height,
	(input_x + x_start) * PIXEL_PER_CELL / 4, (input_y + y_start) * PIXEL_PER_CELL, has_denoise, SLM_delta_coef_table, ee_config);
	}
	/*
	* function: kernel_blc
	* black level correction for sensor data input
	* input: image2d_t as read only
	* output: image2d_t as write only
	* blc_config: black level correction configuration
	* color_bits: identify 10bit or 12bit data
	* param:
	*/

	typedef struct
	{
	float level_gr; /* Black level for GR pixels */
	float level_r; /* Black level for R pixels */
	float level_b; /* Black level for B pixels */
	float level_gb; /* Black level for GB pixels */
	uint color_bits;
	} BLCConfig;

	uint decompression(uint data_in, uint color_bits)
	{
	uint data_out = 0;

	data_out = data_in << (16 - color_bits);

	return data_out;
	}

	__kernel void kernel_blc (__read_only image2d_t input,
	__write_only image2d_t output,
	BLCConfig blc_config)
	{
	int x0 = 2 * get_global_id (0);
	int y0 = 2 * get_global_id (1);
	int x1 = x0 + 1;
	int y1 = y0 + 1;
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;

	int2 pos_r = (int2)(x0, y0);
	int2 pos_gr = (int2)(x1, y0);
	int2 pos_gb = (int2)(x0, y1);
	int2 pos_b = (int2)(x1, y1);

	uint4 pixel;

	pixel = read_imageui(input, sampler, pos_r);
	pixel.x = floor(pixel.x - blc_config.level_r * (pown(2.0, blc_config.color_bits)) + 0.5);
	pixel.x = decompression(pixel.x, blc_config.color_bits);
	write_imageui(output, pos_r, pixel);

	pixel = read_imageui(input, sampler, pos_gr);
	pixel.x = floor(pixel.x - blc_config.level_gr * (pown(2.0, blc_config.color_bits)) + 0.5);
	pixel.x = decompression(pixel.x, blc_config.color_bits);
	write_imageui(output, pos_gr, pixel);

	pixel = read_imageui(input, sampler, pos_gb);
	pixel.x = floor(pixel.x - blc_config.level_gb * (pown(2.0, blc_config.color_bits)) + 0.5);
	pixel.x = decompression(pixel.x, blc_config.color_bits);
	write_imageui(output, pos_gb, pixel);

	pixel = read_imageui(input, sampler, pos_b);
	pixel.x = floor(pixel.x - blc_config.level_b * (pown(2.0, blc_config.color_bits)) + 0.5);
	pixel.x = decompression(pixel.x, blc_config.color_bits);
	write_imageui(output, pos_b, pixel);
	}
	/*
	* function: kernel_bnr
	* implementation of bayer noise reduction
	* input: image2d_t as read only
	* output: image2d_t as write only
	* bnr_gain: strength of noise reduction
	* direction: sensitivity of edge
	* todo: add the upstream algorithm for BNR
	*/

	__kernel void kernel_bnr (__read_only image2d_t input, __write_only image2d_t output,
	float bnr_gain, float direction)
	{
	int x = get_global_id (0);
	int y = get_global_id (1);
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;

	float4 p;
	p = read_imagef(input, sampler, (int2)(x, y));

	write_imagef(output, (int2)(x, y), p);
	}
	/*
	* function: kernel_demosaic
	* input: image2d_t as read only
	* output: image2d_t as write only
	*/

	__kernel void kernel_demosaic (__read_only image2d_t input, __write_only image2d_t output)
	{
	int x = 2 * get_global_id (0);
	int y = 2 * get_global_id (1);
	// sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_NONE \| CLK_FILTER_NEAREST;
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;

	int x0 = x - 1;
	int y0 = y - 1;
	float4 p[16];
	#if 0
	p[0] = read_imagef (input, sampler, (int2)(x0, y0));
	p[1] = read_imagef (input, sampler, (int2)(x0 + 1, y0));
	p[2] = read_imagef (input, sampler, (int2)(x0 + 2, y0));
	p[3] = read_imagef (input, sampler, (int2)(x0 + 3, y0));
	p[4] = read_imagef (input, sampler, (int2)(x0, y0 + 1));
	p[5] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 1));
	p[6] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 1));
	p[7] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 1));
	p[8] = read_imagef (input, sampler, (int2)(x0, y0 + 2));
	p[9] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 2));
	p[10] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 2));
	p[11] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 2));
	p[12] = read_imagef (input, sampler, (int2)(x0, y0 + 3));
	p[13] = read_imagef (input, sampler, (int2)(x0 + 1, y0 + 3));
	p[14] = read_imagef (input, sampler, (int2)(x0 + 2, y0 + 3));
	p[15] = read_imagef (input, sampler, (int2)(x0 + 3, y0 + 3));
	#endif

	#pragma unroll

	for (int i = 0; i < 16; ++i) {
	p[i] = read_imagef (input, sampler, (int2)(x0 + i % 4, y0 + i / 4));
	}

	float4 p00, p01, p10, p11;
	p00.x = (p[4].x + p[6].x) / 2.0;
	p00.y = (p[5].x * 4 + p[0].x + p[2].x + p[8].x + p[10].x) / 8.0;
	p00.z = (p[1].x + p[9].x) / 2.0;
	p01.x = p[6].x;
	p01.y = (p[2].x + p[5].x + p[7].x + p[10].x) / 4.0;
	p01.z = (p[1].x + p[3].x + p[9].x + p[11].x) / 4.0;
	p10.x = (p[4].x + p[6].x + p[12].x + p[14].x) / 4.0;
	p10.y = (p[5].x + p[8].x + p[10].x + p[13].x) / 4.0;
	p10.z = p[9].x;
	p11.x = (p[6].x + p[14].x) / 2.0;
	p11.y = (p[10].x * 4 + p[5].x + p[7].x + p[13].x + p[15].x) / 8.0;
	p11.z = (p[9].x + p[11].x) / 2.0;

	write_imagef (output, (int2)(x, y), p00);
	write_imagef (output, (int2)(x + 1, y), p01);
	write_imagef (output, (int2)(x, y + 1), p10);
	write_imagef (output, (int2)(x + 1, y + 1), p11);
	}
	/*
	* function: kernel_denoise
	* bi-laterial filter for denoise usage
	* input: image2d_t as read only
	* output: image2d_t as write only
	* sigma_r: the parameter to set sigma_r in the Gaussian filtering
	* imw: image width, used for edge detect
	* imh: image height, used for edge detect
	*/

	__constant float gausssingle[25]={0.6411,0.7574,0.8007,0.7574,0.6411,0.7574,0.8948,0.9459,0.8948,0.7574,0.8007,0.94595945,1,0.9459,0.8007,0.7574,0.8948,0.9459,0.8948,0.7574,0.6411,0.7574,0.8007,0.7574,0.6411};

	#define LOCAL_SIZE_X 16
	#define LOCAL_SIZE_Y 15

	__kernel void kernel_denoise(__read_only image2d_t srcRGB, __write_only image2d_t dstRGB, float sigma_r, unsigned int imw, unsigned int imh)
	{
	int x = get_global_id(1); //[0,imw-1]
	int y = get_global_id(0); //[0,imh-1]
	int localX = get_local_id(1); //[0,imw/120-1]
	int localY = get_local_id(0); //[0,imh/72-1]
	//printf("localX=%d,localY=%d\n",localX,localY);

	float normF=0;
	float H=0;
	float delta=0;
	int i=0,j=0;
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_NONE \|CLK_FILTER_NEAREST;
	sigma_r = 2*pown(sigma_r,2);

	//coord in srcY
	float4 line;
	line.x=0;
	line.y=0;
	line.z=0;
	line.w=1.0;

	__local float4 pixel[LOCAL_SIZE_X+4][LOCAL_SIZE_Y+4];
	bool interior = x > 1 && x <(imw-3)
	&& y>1 && y< (imh-3);
	if(interior)
	{
	pixel[localX+2][localY+2]=read_imagef(srcRGB, sampler,(int2)(x,y));

	if(localX==0)
	{
	if(localY==0)
	{
	pixel[0][0]=read_imagef(srcRGB, sampler,(int2)(x-2,y-2));
	pixel[0][1]=read_imagef(srcRGB, sampler,(int2)(x-2,y-1));
	pixel[0][2]=read_imagef(srcRGB, sampler,(int2)(x-2,y ));
	pixel[1][0]=read_imagef(srcRGB, sampler,(int2)(x-1,y-2));
	pixel[1][1]=read_imagef(srcRGB, sampler,(int2)(x-1,y-1));
	pixel[1][2]=read_imagef(srcRGB, sampler,(int2)(x-1,y ));
	pixel[2][0]=read_imagef(srcRGB, sampler,(int2)(x ,y-2));
	pixel[2][1]=read_imagef(srcRGB, sampler,(int2)(x ,y-1));
	}
	else if(localY==LOCAL_SIZE_Y-1)
	{
	pixel[0][LOCAL_SIZE_Y-1+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y ));
	pixel[0][LOCAL_SIZE_Y +2]=read_imagef(srcRGB, sampler,(int2)(x-2,y+1));
	pixel[0][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y+2));
	pixel[1][LOCAL_SIZE_Y-1+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y ));
	pixel[1][LOCAL_SIZE_Y +2]=read_imagef(srcRGB, sampler,(int2)(x-1,y+1));
	pixel[1][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y+2));
	pixel[2][LOCAL_SIZE_Y +2]=read_imagef(srcRGB, sampler,(int2)(x ,y+1));
	pixel[2][LOCAL_SIZE_Y+1+2]=read_imagef(srcRGB, sampler,(int2)(x ,y+2));
	}
	else
	{
	pixel[0][localY+2]=read_imagef(srcRGB, sampler,(int2)(x-2,y));
	pixel[1][localY+2]=read_imagef(srcRGB, sampler,(int2)(x-1,y));
	}
	}
	else if(localX==LOCAL_SIZE_X-1)
	{
	if(localY==0)
	{
	pixel[LOCAL_SIZE_X-1+2+0][0]=read_imagef(srcRGB, sampler,(int2)(x ,y-2));
	pixel[LOCAL_SIZE_X-1+2+0][1]=read_imagef(srcRGB, sampler,(int2)(x ,y-1));
	//pixel[LOCAL_SIZE_X-1+2+0][2]=read_imagef(srcRGB, sampler,(int2)(x ,y));
	pixel[LOCAL_SIZE_X-1+2+1][0]=read_imagef(srcRGB, sampler,(int2)(x+1,y-2));
	pixel[LOCAL_SIZE_X-1+2+1][1]=read_imagef(srcRGB, sampler,(int2)(x+1,y-1));
	pixel[LOCAL_SIZE_X-1+2+1][2]=read_imagef(srcRGB, sampler,(int2)(x+1,y ));
	pixel[LOCAL_SIZE_X-1+2+2][0]=read_imagef(srcRGB, sampler,(int2)(x+2,y-2));
	pixel[LOCAL_SIZE_X-1+2+2][1]=read_imagef(srcRGB, sampler,(int2)(x+2,y-1));
	pixel[LOCAL_SIZE_X-1+2+2][2]=read_imagef(srcRGB, sampler,(int2)(x+2,y ));
	}
	else if(localY==LOCAL_SIZE_Y-1)
	{
	// pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x ,y ));
	pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x ,y+1));
	pixel[LOCAL_SIZE_X-1+2+0][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x ,y+2));
	pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x+1,y ));
	pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x+1,y+1));
	pixel[LOCAL_SIZE_X-1+2+1][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x+1,y+2));
	pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+0]=read_imagef(srcRGB, sampler,(int2)(x+2,y ));
	pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB, sampler,(int2)(x+2,y+1));
	pixel[LOCAL_SIZE_X-1+2+2][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB, sampler,(int2)(x+2,y+2));
	}
	else
	{
	pixel[LOCAL_SIZE_X-1+2+1][localY+2]=read_imagef(srcRGB, sampler,(int2)(x+1,y ));
	pixel[LOCAL_SIZE_X-1+2+2][localY+2]=read_imagef(srcRGB, sampler,(int2)(x+2,y ));
	}
	}
	else if(localY==0)
	{
	pixel[localX+2][0]=read_imagef(srcRGB,sampler,(int2)(x,y-2));
	pixel[localX+2][1]=read_imagef(srcRGB,sampler,(int2)(x,y-1));
	}
	else if(localY==LOCAL_SIZE_Y-1)
	{
	pixel[localX+2][LOCAL_SIZE_Y-1+2+1]=read_imagef(srcRGB,sampler,(int2)(x,y+1));
	pixel[localX+2][LOCAL_SIZE_Y-1+2+2]=read_imagef(srcRGB,sampler,(int2)(x,y+2));
	}
	}else{
	line=read_imagef(srcRGB, sampler,(int2)(x,y));
	}

	barrier(CLK_LOCAL_MEM_FENCE);
	if (interior) {
	#pragma unroll
	for(i=0;i<5;i++)
	{
	#pragma unroll
	for(j=0;j<5;j++)
	{
	delta=pown(pixel[localX+i][localY+j].x-pixel[localX+2][localY+2].x,2) +
	pown(pixel[localX+i][localY+j].y-pixel[localX+2][localY+2].y,2) +
	pown(pixel[localX+i][localY+j].z-pixel[localX+2][localY+2].z,2);
	H = (exp(-(delta/sigma_r)))gausssingle[i5+j];
	normF+=H;
	line.x+=pixel[localX+i][localY+j].x*H;
	line.y+=pixel[localX+i][localY+j].y*H;
	line.z+=pixel[localX+i][localY+j].z*H;
	}
	}

	line.x=line.x/normF;
	line.y=line.y/normF;
	line.z=line.z/normF;
	}

	write_imagef(dstRGB,(int2)(x,y),line);
	}
	/*
	* function: kernel_dpc
	* defect pixel correction on bayer data input
	* input: image2d_t as read only
	* output: image2d_t as write only
	* gr_threshold: GR threshold of defect pixel correction
	* r_threshold: R threshold of defect pixel correction
	* b_threshold: B threshold of defect pixel correction
	* gb_threshold: GB threshold of defect pixel correction
	* param:
	*/

	__kernel void kernel_dpc (__read_only image2d_t input,
	__write_only image2d_t output,
	float gr_threshold, float r_threshold,
	float b_threshold, float gb_threshold)
	{
	int x = get_global_id (0);
	int y = get_global_id (1);
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;
	float4 p[9];
	p[0] = read_imagef(input, sampler, (int2)(x - 2, y - 2));
	p[1] = read_imagef(input, sampler, (int2)(x, y - 2));
	p[2] = read_imagef(input, sampler, (int2)(x + 2, y - 2));
	p[3] = read_imagef(input, sampler, (int2)(x - 2, y));
	p[4] = read_imagef(input, sampler, (int2)(x, y));
	p[5] = read_imagef(input, sampler, (int2)(x + 2, y));
	p[6] = read_imagef(input, sampler, (int2)(x - 2, y + 2));
	p[7] = read_imagef(input, sampler, (int2)(x, y + 2));
	p[8] = read_imagef(input, sampler, (int2)(x + 2, y + 2));

	float aveVer = (p[1].x + p[7].x) / 2;
	float aveHor = (p[3].x + p[5].x) / 2;
	float avePosDia = (p[0].x + p[8].x) / 2;
	float aveNegDia = (p[2].x + p[6].x) / 2;

	float aveMin, aveMax;
	if (aveVer > aveHor) {
	aveMin = aveHor;
	aveMax = aveVer;
	}
	else {
	aveMin = aveVer;
	aveMax = aveHor;
	}

	if (avePosDia < aveMin)
	aveMin = avePosDia;
	else if (avePosDia > aveMax)
	aveMax = avePosDia;

	if (aveNegDia < aveMin)
	aveMin = aveNegDia;
	else if (aveNegDia > aveMax)
	aveMax = aveNegDia;

	float edgeVer = p[4].x - aveVer;
	float edgeHor = p[4].x - aveHor;
	float edgeNeighbourVer = (p[3].x + p[5].x - (p[0].x + p[2].x + p[6].x + p[8].x) / 2) / 2;
	float edgeNeighbourHor = (p[1].x + p[7].x - (p[0].x + p[2].x + p[6].x + p[8].x) / 2) / 2;

	float threshold;
	if (x % 2 == 0)
	threshold = (y % 2 == 0) ? gr_threshold : b_threshold;
	else
	threshold = (y % 2 == 0) ? r_threshold : gb_threshold;

	float4 pixelOut;
	pixelOut.x = p[4].x;
	pixelOut.y = p[4].y;
	pixelOut.z = p[4].z;
	pixelOut.w = p[4].w;
	if ((edgeVer > edgeNeighbourVer) && (edgeHor > edgeNeighbourHor)) {
	if ((p[4].x - aveMax) > threshold) {
	pixelOut.x = aveMax;
	}
	}
	if ((edgeVer < edgeNeighbourVer) && (edgeHor < edgeNeighbourHor)) {
	if ((aveMin - p[4].x) > threshold) {
	pixelOut.x = aveMin;
	}
	}

	write_imagef (output, (int2)(x, y), pixelOut);
	}
	/*
	* function: kernel_ee
	* input: image2d_t as read only
	* output: image2d_t as write only
	* ee_config: Edge enhancement configuration
	*/

	typedef struct
	{
	float ee_gain;
	float ee_threshold;
	float nr_gain;
	} CLEeConfig;

	__constant float lv[25] = {0.0, 0.0, 0.0, 0.0, 0.0,
	0.0, 0.0, -1.0, 0.0, 0.0,
	-1.0, -14.0, 32.0, -14.0, -1.0,
	0.0, 0.0, -1.0, 0.0, 0.0,
	0.0, 0.0, 0.0, 0.0, 0.0
	};

	__constant float lh[25] = {0.0, 0.0, -1.0, 0.0, 0.0,
	0.0, 0.0, -14.0, 0.0, 0.0,
	0.0, -1.0, 32.0, -1.0, 0.0,
	0.0, 0.0, -14.0, 0.0, 0.0,
	0.0, 0.0, -1.0, 0.0, 0.0
	};

	__constant float la[25] = {0.0, 0.0, -2.0, 0.0, 0.0,
	0.0, -2.0, -2.0, -2.0, 0.0,
	-2.0, -2.0, 24.0, -2.0, -2.0,
	0.0, -2.0, -2.0, -2.0, 0.0,
	0.0, 0.0, -2.0, 0.0, 0.0
	};

	__constant float na[25] = { -1.0, -1.0, -1.0, -1.0, -1.0,
	-1.0, -1.0, -1.0, -1.0, -1.0,
	-1.0, -1.0, 16.0, -1.0, -1.0,
	-1.0, -1.0, -1.0, -1.0, -1.0,
	-1.0, -1.0, -1.0, -1.0, -1.0
	};
	__kernel void kernel_ee (__read_only image2d_t input, __write_only image2d_t output, uint vertical_offset_in, uint vertical_offset_out, CLEeConfig ee_config)
	{
	int x = get_global_id (0);
	int y = get_global_id (1);
	int X = get_global_size(0);
	int Y = get_global_size(1);

	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_CLAMP_TO_EDGE \| CLK_FILTER_NEAREST;
	float4 y_in, y_out, uv_in;
	float4 a[5], b[5], c[5], d[5], e[5];

	// cpy UV
	if(y % 2 == 0) {
	uv_in = read_imagef(input, sampler, (int2)(x, y / 2 + vertical_offset_in));
	write_imagef(output, (int2)(x, y / 2 + vertical_offset_out), uv_in);
	}

	a[0] = read_imagef(input, sampler, (int2)(x - 2, y - 2));
	a[1] = read_imagef(input, sampler, (int2)(x - 1, y - 2));
	a[2] = read_imagef(input, sampler, (int2)(x, y - 2));
	a[3] = read_imagef(input, sampler, (int2)(x + 1, y - 2));
	a[4] = read_imagef(input, sampler, (int2)(x + 2, y - 2));

	b[0] = read_imagef(input, sampler, (int2)(x - 2, y - 1));
	b[1] = read_imagef(input, sampler, (int2)(x - 1, y - 1));
	b[2] = read_imagef(input, sampler, (int2)(x, y - 1));
	b[3] = read_imagef(input, sampler, (int2)(x + 1, y - 1));
	b[4] = read_imagef(input, sampler, (int2)(x + 2, y - 1));

	c[0] = read_imagef(input, sampler, (int2)(x - 2, y));
	c[1] = read_imagef(input, sampler, (int2)(x - 1, y));
	c[2] = read_imagef(input, sampler, (int2)(x, y));
	c[3] = read_imagef(input, sampler, (int2)(x + 1, y));
	c[4] = read_imagef(input, sampler, (int2)(x + 2, y));

	d[0] = read_imagef(input, sampler, (int2)(x - 2, y + 1));
	d[1] = read_imagef(input, sampler, (int2)(x - 1, y + 1));
	d[2] = read_imagef(input, sampler, (int2)(x, y + 1));
	d[3] = read_imagef(input, sampler, (int2)(x + 1, y + 1));
	d[4] = read_imagef(input, sampler, (int2)(x + 2, y + 1));

	e[0] = read_imagef(input, sampler, (int2)(x - 2, y + 2));
	e[1] = read_imagef(input, sampler, (int2)(x - 1, y + 2));
	e[2] = read_imagef(input, sampler, (int2)(x, y + 2));
	e[3] = read_imagef(input, sampler, (int2)(x + 1, y + 2));
	e[4] = read_imagef(input, sampler, (int2)(x + 2, y + 2));

	float eV = (a[0].x * lv[0] + a[1].x * lv[1] + a[2].x * lv[2] + a[3].x * lv[3] + a[4].x * lv[4]
	+ b[0].x * lv[5] + b[1].x * lv[6] + b[2].x * lv[7] + b[3].x * lv[8] + b[4].x * lv[9]
	+ c[0].x * lv[10] + c[1].x * lv[11] + c[2].x * lv[12] + c[3].x * lv[13] + c[4].x * lv[14]
	+ d[0].x * lv[15] + d[1].x * lv[16] + d[2].x * lv[17] + d[3].x * lv[18] + d[4].x * lv[19]
	+ e[0].x * lv[20] + e[1].x * lv[21] + e[2].x * lv[22] + e[3].x * lv[23] + e[4].x * lv[24]) * 255.0;

	float eH = (a[0].x * lh[0] + a[1].x * lh[1] + a[2].x * lh[2] + a[3].x * lh[3] + a[4].x * lh[4]
	+ b[0].x * lh[5] + b[1].x * lh[6] + b[2].x * lh[7] + b[3].x * lh[8] + b[4].x * lh[9]
	+ c[0].x * lh[10] + c[1].x * lh[11] + c[2].x * lh[12] + c[3].x * lh[13] + c[4].x * lh[14]
	+ d[0].x * lh[15] + d[1].x * lh[16] + d[2].x * lh[17] + d[3].x * lh[18] + d[4].x * lh[19]
	+ e[0].x * lh[20] + e[1].x * lh[21] + e[2].x * lh[22] + e[3].x * lh[23] + e[4].x * lh[24]) * 255.0;


	float eA = (a[0].x * la[0] + a[1].x * la[1] + a[2].x * la[2] + a[3].x * la[3] + a[4].x * la[4]
	+ b[0].x * la[5] + b[1].x * la[6] + b[2].x * la[7] + b[3].x * la[8] + b[4].x * la[9]
	+ c[0].x * la[10] + c[1].x * la[11] + c[2].x * la[12] + c[3].x * la[13] + c[4].x * la[14]
	+ d[0].x * la[15] + d[1].x * la[16] + d[2].x * la[17] + d[3].x * la[18] + d[4].x * la[19]
	+ e[0].x * la[20] + e[1].x * la[21] + e[2].x * la[22] + e[3].x * la[23] + e[4].x * la[24]) * 255.0;

	float nA = (a[0].x * na[0] + a[1].x * na[1] + a[2].x * na[2] + a[3].x * na[3] + a[4].x * na[4]
	+ b[0].x * na[5] + b[1].x * na[6] + b[2].x * na[7] + b[3].x * na[8] + b[4].x * na[9]
	+ c[0].x * na[10] + c[1].x * na[11] + c[2].x * na[12] + c[3].x * na[13] + c[4].x * na[14]
	+ d[0].x * na[15] + d[1].x * na[16] + d[2].x * na[17] + d[3].x * na[18] + d[4].x * na[19]
	+ e[0].x * na[20] + e[1].x * na[21] + e[2].x * na[22] + e[3].x * na[23] + e[4].x * na[24]) * 255.0;


	float nV = eH;
	float nH = eV;

	float dV = (fabs(2.0 * b[1].x - a[1].x - c[1].x) + fabs(2.0 * b[2].x - a[2].x - c[2].x) + fabs(2.0 * b[3].x - a[3].x - c[3].x) + fabs(2.0 * c[1].x - b[1].x - d[1].x) + fabs(2.0 * c[2].x - b[2].x - d[2].x) + fabs(2.0 * c[3].x - b[3].x - d[3].x) + fabs(2.0 * d[1].x - c[1].x - e[1].x) + fabs(2.0 * d[2].x - c[2].x - e[2].x) + fabs(2.0 * d[3].x - c[3].x - e[3].x)) * 255.0;
	float dH = (fabs(2.0 * b[1].x - b[0].x - b[2].x) + fabs(2.0 * b[2].x - b[1].x - b[3].x) + fabs(2.0 * b[3].x - b[2].x - b[4].x) + fabs(2.0 * c[1].x - c[0].x - c[2].x) + fabs(2.0 * c[2].x - c[1].x - c[3].x) + fabs(2.0 * c[3].x - c[2].x - c[4].x) + fabs(2.0 * d[1].x - d[0].x - d[2].x) + fabs(2.0 * d[2].x - d[1].x - d[3].x) + fabs(2.0 * d[3].x - d[2].x - d[4].x)) * 255.0;
	float dA = (fabs(2.0 * c[2].x - b[2].x - d[2].x) + fabs(2.0 * c[2].x - c[1].x - c[3].x) + fabs(2.0 * c[2].x - b[1].x - d[3].x) + fabs(2.0 * c[2].x - b[3].x - d[1].x) + fabs(2.0 * c[2].x - a[0].x - e[4].x) + fabs(2.0 * c[2].x - a[4].x - e[0].x) + fabs(2.0 * c[2].x - c[0].x - c[4].x) + fabs(2.0 * c[2].x - a[2].x - e[2].x) + fabs(2.0 * c[2].x - (b[0].x + d[0].x + b[4].x + d[4].x) / 2.0)) * 255.0;

	float edge = dH < (dV < dA ? dV : dA) ? eH : (dV < dA ? eV : eA);
	float noise = dH < (dV < dA ? dV : dA) ? nH : (dV < dA ? nV : nA);
	float dir = dH < (dV < dA ? dA : dV) ? (dV < dA ? dA : dV) : dH;
	noise = noise * ee_config.nr_gain / 16.0;
	edge = edge * ee_config.ee_gain / 16.0;

	y_out.x = dir > ee_config.ee_threshold ? (c[2].x * 255.0 + edge - noise) / 255.0 : c[2].x;
	y_out.y = 0.0;
	y_out.z = 0.0;
	y_out.w = 1.0;
	write_imagef(output, (int2)(x, y), y_out);
	}
	/*
	* function: kernel_gamma
	* input: image2d_t as read only
	* output: image2d_t as write only
	* table: gamma table.
	*/
	//TODO: linear scale should be a problem
	__kernel void kernel_gamma (__read_only image2d_t input, __write_only image2d_t output, __global float *table)
	{
	int x = get_global_id (0);
	int y = get_global_id (1);
	sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE \| CLK_ADDRESS_NONE \| CLK_FILTER_NEAREST;
	float4 pixel_in[8], pixel_out[8];
	int i=0,j=0;

	#pragma unroll
	for(j=0;j<2;j++) {
	#pragma unroll
	for(i=0;i<4;i++) {
	pixel_in[j4 + i] = read_imagef(input, sampler,(int2)(4x + i, 2*y + j));
	pixel_out[j4 + i].x = table[convert_int(pixel_in[j4 + i].x * 255.0)] / 255.0;
	pixel_out[j4 + i].y = table[convert_int(pixel_in[j4 + i].y * 255.0)] / 255.0;
	pixel_out[j4 + i].z = table[convert_int(pixel_in[j4 + i].z * 255.0)] / 255.0;
	pixel_out[j*4 + i].w = 0.0;
	write_imagef(output, (int2)(4x + i, 2y + j), pixel_out[j*4 + i]);
	}
	}
	}