Watson1978/bit.c

## bit.c

#line 1 "narray/gen/tmpl/lib.c"
/*

  Ruby/Cumo::GSL - GSL wrapper for Ruby/Cumo::NArray

  created on: 2017-03-11
  Copyright (C) 2017 Masahiro Tanaka
  Copyright (C) 2018 Naotoshi Seo
*/

#include <ruby.h>
#include <assert.h>
#include "cumo.h"
#include "cumo/narray.h"
#include "cumo/template.h"
#include "SFMT.h"
#include "cumo/cuda/memory_pool.h"
#include "cumo/cuda/runtime.h"
#include "cumo/indexer.h"

#define m_map(x) m_num_to_data(rb_yield(m_data_to_num(x)))

narray/types/bit.c
#include <static ID cumo_id_cast;
static ID cumo_id_divmod;
static ID cumo_id_eq;
static ID cumo_id_mulsum;
static ID cumo_id_ne;
static ID cumo_id_pow;
>

VALUE cT;
extern VALUE cRT;

cumo/types/bit.h
void
Init_
#line 1 "narray/gen/tmpl/class.c"
/*
  class definition:
*/

VALUE Cumo::Bit;

static VALUE cT(VALUE,VALUE);

bit_store


#line 1 "narray/gen/tmpl/alloc_func.c"
static size_t
_memsize(const void* ptr)
{
    size_t size = sizeof(cumo_narray_data_t);
    const cumo_narray_data_t *na = (const cumo_narray_data_t*)ptr;

    assert(na->base.type == CUMO_NARRAY_DATA_T);

    if (na->ptr != NULL) {

        size += ((na->base.size-1)/8/sizeof(CUMO_BIT_DIGIT)+1)*sizeof(CUMO_BIT_DIGIT);

    }
    if (na->base.size > 0) {
        if (na->base.shape != NULL && na->base.shape != &(na->base.size)) {
            size += sizeof(size_t) * na->base.ndim;
        }
    }
    return size;
}

static void
bit_free(void* ptr)
{
    cumo_narray_data_t *na = (cumo_narray_data_t*)ptr;

    assert(na->base.type == CUMO_NARRAY_DATA_T);

    if (na->ptr != NULL) {
        cumo_cuda_runtime_free(na->ptr);
        na->ptr = NULL;
    }
    if (na->base.size > 0) {
        if (na->base.shape != NULL && na->base.shape != &(na->base.size)) {
            xfree(na->base.shape);
            na->base.shape = NULL;
        }
    }
    xfree(na);
}

static cumo_narray_type_info_t bit_info = {

    1,             // element_bits
    0,             // element_bytes
    1,             // element_stride (in bits)

};


static const rb_data_type_t bit_data_type = {
    "bit",
    {0, Cumo::Bit_free, _memsize,},
    &cumo_na_data_type,
    &bitbit_info,
    0, // flags
};


static VALUE
bit(VALUE klass)
{
    cumo_narray_data_t *na = ALLOC(cumo_narray_data_t);

    na->base.ndim = 0;
    na->base.type = CUMO_NARRAY_DATA_T;
    na->base.flag[0] = CUMO_NA_FL0_INIT;
    na->base.flag[1] = CUMO_NA_FL1_INIT;
    na->base.size = 0;
    na->base.shape = NULL;
    na->base.reduce = INT2FIX(0);
    na->ptr = NULL;
    return TypedData_Wrap_Struct(klass, &bit_s_alloc_func_data_type, (void*)na);
}
bit

#line 1 "narray/gen/tmpl_bit/allocate.c"
static VALUE
(VALUE self)
{
    cumo_narray_t *na;
    char *ptr;

    CumoGetNArray(self,na);

    switch(CUMO_NA_TYPE(na)) {
    case CUMO_NARRAY_DATA_T:
        ptr = CUMO_NA_DATA_PTR(na);
        if (na->size > 0 && ptr == NULL) {
            ptr = cumo_cuda_runtime_malloc(((na->size-1)/8/sizeof(CUMO_BIT_DIGIT)+1)*sizeof(CUMO_BIT_DIGIT));
            CUMO_NA_DATA_PTR(na) = ptr;
        }
        break;
    case CUMO_NARRAY_VIEW_T:
        rb_funcall(CUMO_NA_VIEW_DATA(na), rb_intern("allocate"), 0);
        break;
    default:
        rb_raise(rb_eRuntimeError,"invalid narray type");
    }
    return self;
}
bit_allocate

#line 1 "narray/gen/tmpl_bit/extract.c"
/*
  Extract an element only if self is a dimensionless NArray.
  @overload extract
  @return [Numeric,Cumo::NArray]
  --- Extract element value as Ruby Object if self is a dimensionless NArray,
  otherwise returns self.
*/

// TODO(sonots): Return Cumo::Bit instead of ruby built-in object to avoid synchronization
static VALUE
(VALUE self)
{
    CUMO_BIT_DIGIT *ptr, val;
    size_t pos;
    cumo_narray_t *na;
    CumoGetNArray(self,na);

    if (na->ndim==0) {
        pos = cumo_na_get_offset(self);
        ptr = (CUMO_BIT_DIGIT*)cumo_na_get_pointer_for_read(self);

        CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("bit_extract", "");
        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

        val = ((*((ptr)+(pos)/CUMO_NB)) >> ((pos)%CUMO_NB)) & 1u;
        cumo_na_release_lock(self);
        return INT2FIX(val);
    }
    return self;
}
extractbit

#line 1 "narray/gen/tmpl_bit/extract_cpu.c"
/*
  Extract an element only if self is a dimensionless NArray.
  @overload extract_cpu
  @return [Numeric,Cumo::NArray]
  --- Extract element value as Ruby Object if self is a dimensionless NArray,
  otherwise returns self.
*/

static VALUE
(VALUE self)
{
    CUMO_BIT_DIGIT *ptr, val;
    size_t pos;
    cumo_narray_t *na;
    CumoGetNArray(self,na);

    if (na->ndim==0) {
        pos = cumo_na_get_offset(self);
        ptr = (CUMO_BIT_DIGIT*)cumo_na_get_pointer_for_read(self);

        CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("bit_extract_cpu", "");
        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

        val = ((*((ptr)+(pos)/CUMO_NB)) >> ((pos)%CUMO_NB)) & 1u;
        cumo_na_release_lock(self);
        return INT2FIX(val);
    }
    return self;
}
extract_cpubit

#line 1 "narray/gen/tmpl/new_dim0.c"
void (dtype *ptr, dtype x);

static VALUE
cumo_bit_new_dim0_kernel_launch(dtype x)
{
    VALUE v;
    dtype *ptr;

    v = cumo_na_new(cT, 0, NULL);
    ptr = (dtype*)cumo_na_get_pointer_for_write(v);
    bit_new_dim0(ptr, x);

    cumo_na_release_lock(v);
    return v;
}
cumo_bit_new_dim0_kernel_launch

#line 1 "narray/gen/tmpl/store.c"


/*
  Store elements to Cumo::
#line 1 "narray/gen/tmpl/store_numeric.c"
static VALUE
(VALUE self, VALUE obj)
{
    dtype x;
    x = m_num_to_data(obj);
    obj = bit_store_numeric_new_dim0(x);
    bit(self,obj);
    return self;
}
bit_store
#line 1 "narray/gen/tmpl_bit/store_bit.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p3;
    ssize_t s1, s3;
    size_t *idx1, *idx3;
    int     o1, l1, r1, len;
    CUMO_BIT_DIGIT *a1, *a3;
    CUMO_BIT_DIGIT  x;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_store_bit", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a3, p3, s3, idx3);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a1, p1, s1, idx1);
    if (s1!=1 || s3!=1 || idx1 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, x);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            *a3 = (x & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                *(a3++) = x;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                *(a3++) = x;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            *a3 = (x & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

static VALUE
bitBit(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_bit, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_bit
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_dfloat x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_double", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,dfloatDFloat,x);
                y = double(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_real,x);
                y = double(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_real,x);
                y = double(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_real,x);
                y = double(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_real(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_dfloat, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_dfloat
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_sfloat x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_float", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,sfloatSFloat,x);
                y = float(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_real,x);
                y = float(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_real,x);
                y = float(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_real,x);
                y = float(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_real(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_sfloat, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_sfloat
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_int64 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_int64_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,int64Int64,x);
                y = int64_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_int64,x);
                y = int64_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_int64,x);
                y = int64_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_int64,x);
                y = int64_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_int64(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_int64, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_int64
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_int32 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_int32_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,int32Int32,x);
                y = int32_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_int32,x);
                y = int32_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_int32,x);
                y = int32_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_int32,x);
                y = int32_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_int32(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_int32, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_int32
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_int16 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_int16_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,int16Int16,x);
                y = int16_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_sint,x);
                y = int16_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = int16_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = int16_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_sint(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_int16, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_int16
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_int8 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_int8_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,int8Int8,x);
                y = int8_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_sint,x);
                y = int8_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = int8_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = int8_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_sint(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_int8, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_int8
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_uint64 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_u_int64_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,uint64UInt64,x);
                y = u_int64_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_uint64,x);
                y = u_int64_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_uint64,x);
                y = u_int64_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_uint64,x);
                y = u_int64_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_uint64(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_uint64, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_uint64
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_uint32 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_u_int32_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,uint32UInt32,x);
                y = u_int32_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_uint32,x);
                y = u_int32_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_uint32,x);
                y = u_int32_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_uint32,x);
                y = u_int32_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_uint32(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_uint32, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_uint32
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_uint16 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_u_int16_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,uint16UInt16,x);
                y = u_int16_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_sint,x);
                y = u_int16_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = u_int16_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = u_int16_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_sint(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_uint16, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_uint16
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_uint8 x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_u_int8_t", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,uint8UInt8,x);
                y = u_int8_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_from_sint,x);
                y = u_int8_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = u_int8_t(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_from_sint,x);
                y = u_int8_t(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_from_sint(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_uint8, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_uint8
#line 1 "narray/gen/tmpl_bit/store_from.c"
static void
(cumo_na_loop_t *const lp)
{
    ssize_t  i, s1, s2;
    size_t   p1;
    char    *p2;
    size_t  *idx1, *idx2;
    iter_bit_store_robject x;
    CUMO_BIT_DIGIT *a1;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("store_VALUE", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);

    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,robjectRObject,x);
                y = VALUE(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_INDEX(p2,idx2,m_num_to_data,x);
                y = VALUE(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_num_to_data,x);
                y = VALUE(x);
                CUMO_STORE_BIT(a1, p1+*idx1, y); idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_GET_DATA_STRIDE(p2,s2,m_num_to_data,x);
                y = VALUE(x);
                CUMO_STORE_BIT(a1, p1, y); p1+=s1;
            }
        }
    }
}


static VALUE
m_num_to_data(VALUE self, VALUE obj)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_store_robject, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, obj);
    return self;
}
iter_bit_store_robject
#line 1 "narray/gen/tmpl_bit/store_array.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t i, n;
    size_t i1, n1;
    VALUE  v1, *ptr;
    CUMO_BIT_DIGIT *a1;
    size_t p1;
    size_t s1, *idx1;
    VALUE  x;
    double y;
    CUMO_BIT_DIGIT z;
    size_t len, c;
    double beg, step;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_store_array", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    v1 = lp->args[1].value;
    i = 0;

    if (lp->args[1].ptr) {
        if (v1 == Qtrue) {
            iter_arraybit_store_(lp);
            i = lp->args[1].shape[0];
            if (idx1) {
                idx1 += i;
            } else {
                p1 += s1 * i;
            }
        }
        goto loop_end;
    }

    ptr = &v1;

    switch(TYPE(v1)) {
    case T_ARRAY:
        n1 = RARRAY_LEN(v1);
        ptr = RARRAY_PTR(v1);
        break;
    case T_NIL:
        n1 = 0;
        break;
    default:
        n1 = 1;
    }

    if (idx1) {
        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
            x = ptr[i1];
            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, cumo_na_cStep)) {
                cumo_na_step_sequence(x,&len,&beg,&step);
                for (c=0; c<len && i<n; c++,i++) {
                    y = beg + step * c;
                    z = m_from_double(y);
                    CUMO_STORE_BIT(a1, p1+*idx1, z); idx1++;
                }
            }
            if (TYPE(x) != T_ARRAY) {
                if (x == Qnil) x = INT2FIX(0);
                z = m_num_to_data(x);
                CUMO_STORE_BIT(a1, p1+*idx1, z); idx1++;
            }
        }
    } else {
        for (i=i1=0; i1<n1 && i<n; i++,i1++) {
            x = ptr[i1];
            if (rb_obj_is_kind_of(x, rb_cRange) || rb_obj_is_kind_of(x, cumo_na_cStep)) {
                cumo_na_step_sequence(x,&len,&beg,&step);
                for (c=0; c<len && i<n; c++,i++) {
                    y = beg + step * c;
                    z = m_from_double(y);
                    CUMO_STORE_BIT(a1, p1, z); p1+=s1;
                }
            }
            if (TYPE(x) != T_ARRAY) {
                z = m_num_to_data(x);
                CUMO_STORE_BIT(a1, p1, z); p1+=s1;
            }
        }
    }

 loop_end:
    z = m_zero;
    if (idx1) {
        for (; i<n; i++) {
            CUMO_STORE_BIT(a1, p1+*idx1, z); idx1++;
        }
    } else {
        for (; i<n; i++) {
            CUMO_STORE_BIT(a1, p1, z); p1+=s1;
        }
    }
}

static VALUE
bitbit(VALUE self, VALUE rary)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0}, {rb_cArray,0}};
    cumo_ndfunc_t ndf = {bit_store_array, CUMO_FULL_LOOP, 2, 0, ain, 0};

    cumo_na_ndloop_store_rarray(&ndf, self, rary);
    return self;
}
iter_bit_store_array from other.
  @overload store(other)
  @param [Object] other
  @return [Cumo::Bit] self
*/
static VALUE
Bit(VALUE self, VALUE obj)
{
    VALUE r, klass;

    klass = CLASS_OF(obj);


    if (bit_store) {
        klass==cumo_cBit(self,obj);
        return self;
    }

    if (bit_store_bit) {
        IS_INTEGER_CLASS(klass) || klass==rb_cFloat || klass==rb_cComplex(self,obj);
        return self;
    }

    if (bit_store_numeric) {
        klass==cumo_cDFloat(self,obj);
        return self;
    }

    if (bit_store_dfloat) {
        klass==cumo_cSFloat(self,obj);
        return self;
    }

    if (bit_store_sfloat) {
        klass==cumo_cInt64(self,obj);
        return self;
    }

    if (bit_store_int64) {
        klass==cumo_cInt32(self,obj);
        return self;
    }

    if (bit_store_int32) {
        klass==cumo_cInt16(self,obj);
        return self;
    }

    if (bit_store_int16) {
        klass==cumo_cInt8(self,obj);
        return self;
    }

    if (bit_store_int8) {
        klass==cumo_cUInt64(self,obj);
        return self;
    }

    if (bit_store_uint64) {
        klass==cumo_cUInt32(self,obj);
        return self;
    }

    if (bit_store_uint32) {
        klass==cumo_cUInt16(self,obj);
        return self;
    }

    if (bit_store_uint16) {
        klass==cumo_cUInt8(self,obj);
        return self;
    }

    if (bit_store_uint8) {
        klass==cumo_cRObject(self,obj);
        return self;
    }

    if (bit_store_robject) {
        klass==rb_cArray(self,obj);
        return self;
    }


    if (CumoIsNArray(obj)) {
        r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
        if (CLASS_OF(r)==cT) {
            bit_store_array(self,r);
            return self;
        }
    }


    rb_raise(cumo_na_eCastError, "unknown conversion from %s to %s",
             rb_class2name(CLASS_OF(obj)),
             rb_class2name(CLASS_OF(self)));

    return self;
}
bit_store

#line 1 "narray/gen/tmpl/extract_data.c"
/*
  Convert a data value of obj (with a single element) to dtype.
*/
/*
static dtype
(VALUE obj)
{
    cumo_narray_t *na;
    dtype  x;
    char  *ptr;
    size_t pos;
    VALUE  r, klass;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("bit_extract_data", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    if (CumoIsNArray(obj)) {
        CumoGetNArray(obj,na);
        if (na->size != 1) {
            rb_raise(cumo_na_eShapeError,"narray size should be 1");
       }
        klass = CLASS_OF(obj);
        ptr = cumo_na_get_pointer_for_read(obj);
        pos = cumo_na_get_offset(obj);

        if (extract_databit) {
            klass==cumo_cBit;
            return x;
        }

        if ({BIT_DIGIT b; CUMO_LOAD_BIT(ptr,pos,b); x = m_from_sint(b);}) {
            klass==cumo_cDFloat;
            return x;
        }

        if (x = m_from_real(*(double*)(ptr+pos))) {
            klass==cumo_cSFloat;
            return x;
        }

        if (x = m_from_real(*(float*)(ptr+pos))) {
            klass==cumo_cInt64;
            return x;
        }

        if (x = m_from_int64(*(int64_t*)(ptr+pos))) {
            klass==cumo_cInt32;
            return x;
        }

        if (x = m_from_int32(*(int32_t*)(ptr+pos))) {
            klass==cumo_cInt16;
            return x;
        }

        if (x = m_from_sint(*(int16_t*)(ptr+pos))) {
            klass==cumo_cInt8;
            return x;
        }

        if (x = m_from_sint(*(int8_t*)(ptr+pos))) {
            klass==cumo_cUInt64;
            return x;
        }

        if (x = m_from_uint64(*(u_int64_t*)(ptr+pos))) {
            klass==cumo_cUInt32;
            return x;
        }

        if (x = m_from_uint32(*(u_int32_t*)(ptr+pos))) {
            klass==cumo_cUInt16;
            return x;
        }

        if (x = m_from_sint(*(u_int16_t*)(ptr+pos))) {
            klass==cumo_cUInt8;
            return x;
        }

        if (x = m_from_sint(*(u_int8_t*)(ptr+pos))) {
            klass==cumo_cRObject;
            return x;
        }


        // coerce
        r = rb_funcall(obj, rb_intern("coerce_cast"), 1, cT);
        if (CLASS_OF(r)==cT) {
            return x = m_num_to_data(*(VALUE*)(ptr+pos))(r);
        }

        rb_raise(cumo_na_eCastError, "unknown conversion from %s to %s",
                 rb_class2name(CLASS_OF(obj)),
                 rb_class2name(cT));

    }
    if (TYPE(obj)==T_ARRAY) {
        if (RARRAY_LEN(obj) != 1) {
            rb_raise(cumo_na_eShapeError,"array size should be 1");
        }
        return m_num_to_data(RARRAY_AREF(obj,0));
    }
    return m_num_to_data(obj);
}
*/
bit_extract_data

#line 1 "narray/gen/tmpl/cast_array.c"
static VALUE
(VALUE rary)
{
    VALUE nary;
    cumo_narray_t *na;

    nary = cumo_na_s_new_like(cT, rary);
    CumoGetNArray(nary,na);
    if (na->size > 0) {
        bit_cast_array(nary,rary);
    }
    return nary;
}
bit_store_array

#line 1 "narray/gen/tmpl/cast.c"
/*
  Cast object to Cumo::.
  @overload [](elements)
  @overload Bit(array)
  @param [Numeric,Array] elements
  @param [Array] array
  @return [Cumo::cast]
*/
static VALUE
Bit(VALUE type, VALUE obj)
{
    VALUE v;
    cumo_narray_t *na;
    dtype x;

    if (CLASS_OF(obj)==cT) {
        return obj;
    }
    if (RTEST(rb_obj_is_kind_of(obj,rb_cNumeric))) {
        x = m_num_to_data(obj);
        return bit_s_cast_new_dim0(x);
    }
    if (RTEST(rb_obj_is_kind_of(obj,rb_cArray))) {
        return bit(obj);
    }
    if (CumoIsNArray(obj)) {
        CumoGetNArray(obj,na);
        v = cumo_na_new(cT, CUMO_NA_NDIM(na), CUMO_NA_SHAPE(na));
        if (CUMO_NA_SIZE(na) > 0) {
            bit_cast_array(v,obj);
        }
        return v;
    }

    rb_raise(cumo_na_eCastError,"cannot cast to %s",rb_class2name(type));
    return Qnil;

}
bit_store

#line 1 "narray/gen/tmpl_bit/aref.c"
static VALUE
_cpu(int argc, VALUE *argv, VALUE self);

/*
  Array element referenece or slice view.
  @overload [](dim0,...,dimL)
  @param [Numeric,Range,etc] dim0,...,dimL  Multi-dimensional Index.
  @return [Numeric,NArray::bit_aref] Element object or NArray view.

  --- Returns the element at +dim0+, +dim1+, ... are Numeric indices
  for each dimension, or returns a NArray View as a sliced subarray if
  +dim0+, +dim1+, ... includes other than Numeric index, e.g., Range
  or Array or true.

  @example
      a = Cumo::DFloat.new(4,5).seq
      => Cumo::DFloat#shape=[4,5]
      [[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]]

      a[1,1]
      => 6.0

      a[1..3,1]
      => Cumo::DFloat#shape=[3]
      [6, 11, 16]

      a[1,[1,3,4]]
      => Cumo::DFloat#shape=[3]
      [6, 8, 9]

      a[true,2].fill(99)
      a
      => Cumo::DFloat#shape=[4,5]
      [[0, 1, 99, 3, 4],
       [5, 6, 99, 8, 9],
       [10, 11, 99, 13, 14],
       [15, 16, 99, 18, 19]]
 */
static VALUE
Bit(int argc, VALUE *argv, VALUE self)
{
    if (cumo_compatible_mode_enabled_p()) {
        return bit_aref_cpu(argc, argv, self);
    } else {
        int result_nd;
        size_t pos;

        result_nd = cumo_na_get_result_dimension(self, argc, argv, 1, &pos);
        return cumo_na_aref_main(argc, argv, self, 0, result_nd, pos);
    }
}
bit_aref

#line 1 "narray/gen/tmpl_bit/aref_cpu.c"
/*
  Array element referenece or slice view.
  @overload [](dim0,...,dimL)
  @param [Numeric,Range,etc] dim0,...,dimL  Multi-dimensional Index.
  @return [Numeric,NArray::] Element object or NArray view.

  --- Returns the element at +dim0+, +dim1+, ... are Numeric indices
  for each dimension, or returns a NArray View as a sliced subarray if
  +dim0+, +dim1+, ... includes other than Numeric index, e.g., Range
  or Array or true.

  @example
      a = Cumo::DFloat.new(4,5).seq
      => Cumo::DFloat#shape=[4,5]
      [[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19]]

      a[1,1]
      => 6.0

      a[1..3,1]
      => Cumo::DFloat#shape=[3]
      [6, 11, 16]

      a[1,[1,3,4]]
      => Cumo::DFloat#shape=[3]
      [6, 8, 9]

      a[true,2].fill(99)
      a
      => Cumo::DFloat#shape=[4,5]
      [[0, 1, 99, 3, 4],
       [5, 6, 99, 8, 9],
       [10, 11, 99, 13, 14],
       [15, 16, 99, 18, 19]]
 */
static VALUE
Bit(int argc, VALUE *argv, VALUE self)
{
    int nd;
    size_t pos;
    char *ptr;
    dtype x;

    nd = cumo_na_get_result_dimension(self, argc, argv, 1, &pos);
    if (nd) {
        return cumo_na_aref_main(argc, argv, self, 0, nd, pos);
    } else {
        ptr = cumo_na_get_pointer_for_read(self);
        CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("bit_aref_cpu", "");
        cumo_cuda_runtime_check_status(cudaDeviceSynchronize());
        CUMO_LOAD_BIT(ptr,pos,x);
        return m_data_to_num(x);
    }
}
aref_cpubit

#line 1 "narray/gen/tmpl_bit/aset.c"
/*
  Array element(s) set.
  @overload []=(dim0,..,dimL,val)
  @param [Numeric,Range,etc] dim0,..,dimL  Multi-dimensional Index.
  @param [Numeric,Cumo::NArray,etc] val  Value(s) to be set to self.
  @return [Numeric] returns val (last argument).

  --- Replace element(s) at +dim0+, +dim1+, ... (index/range/array/true
  for each dimention). Broadcasting mechanism is applied.

  @example
      a = Cumo::DFloat.new(3,4).seq
      => Cumo::DFloat#shape=[3,4]
      [[0, 1, 2, 3],
       [4, 5, 6, 7],
       [8, 9, 10, 11]]

      a[1,2]=99
      a
      => Cumo::DFloat#shape=[3,4]
      [[0, 1, 2, 3],
       [4, 5, 99, 7],
       [8, 9, 10, 11]]

      a[1,[0,2]] = [101,102]
      a
      => Cumo::DFloat#shape=[3,4]
      [[0, 1, 2, 3],
       [101, 5, 102, 7],
       [8, 9, 10, 11]]

      a[1,true]=99
      a
      => Cumo::DFloat#shape=[3,4]
      [[0, 1, 2, 3],
       [99, 99, 99, 99],
       [8, 9, 10, 11]]

*/
static VALUE
(int argc, VALUE *argv, VALUE self)
{
    int nd;
    size_t pos;
    VALUE a;

    argc--;
    if (argc==0) {
        bit_aset(self, argv[argc]);
    } else {
        nd = cumo_na_get_result_dimension(self, argc, argv, 1, &pos);
        a = cumo_na_aref_main(argc, argv, self, 0, nd, pos);
        bit_store(a, argv[argc]);
    }
    return argv[argc];
}
bit_store

#line 1 "narray/gen/tmpl/coerce_cast.c"
/*
  return NArray with cast to the type of self.
  @overload coerce_cast(type)
  @return [nil]
*/
static VALUE
(VALUE self, VALUE type)
{
    return Qnil;
}
bit_coerce_cast

#line 1 "narray/gen/tmpl_bit/to_a.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t     i;
    CUMO_BIT_DIGIT *a1;
    size_t     p1;
    ssize_t    s1;
    size_t    *idx1;
    CUMO_BIT_DIGIT  x=0;
    VALUE      a, y;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_to_a", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    a = rb_ary_new2(i);
    rb_ary_push(lp->args[1].value, a);
    if (idx1) {
        for (; i--;) {
            CUMO_LOAD_BIT(a1,p1+*idx1,x); idx1++;
            y = m_data_to_num(x);
            rb_ary_push(a,y);
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a1,p1,x); p1+=s1;
            y = m_data_to_num(x);
            rb_ary_push(a,y);
        }
    }
}

/*
  Convert self to Array.
  @overload to_abit
  @return [Array]
*/
static VALUE
to_a(VALUE self)
{
    cumo_ndfunc_arg_in_t ain[3] = {{Qnil,0},{cumo_sym_loop_opt},{cumo_sym_option}};
    cumo_ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
    cumo_ndfunc_t ndf = {bit_to_a, CUMO_FULL_LOOP_NIP, 3,1, ain,aout};

    return cumo_na_ndloop_cast_narray_to_rarray(&ndf, self, Qnil);
}
iter_bit_to_a

#line 1 "narray/gen/tmpl_bit/fill.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p3;
    ssize_t s3;
    size_t *idx3;
    int     len;
    CUMO_BIT_DIGIT *a3;
    CUMO_BIT_DIGIT  y;
    VALUE x = lp->option;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_fill", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    if (x==INT2FIX(0) || x==Qfalse) {
        y = 0;
    } else
    if (x==INT2FIX(1) || x==Qtrue) {
        y = ~(CUMO_BIT_DIGIT)0;
    } else {
        rb_raise(rb_eArgError, "invalid value for Bit");
    }

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a3, p3, s3, idx3);
    if (idx3) {
        y = y&1;
        for (; n--;) {
            CUMO_STORE_BIT(a3, p3+*idx3, y); idx3++;
        }
    } else if (s3!=1) {
        y = y&1;
        for (; n--;) {
            CUMO_STORE_BIT(a3, p3, y); p3+=s3;
        }
    } else {
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            *a3 = (y & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        for (; n>=CUMO_NB; n-=CUMO_NB) {
            *(a3++) = y;
        }
        if (n>0) {
            *a3 = (y & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Fill elements with other.
  @overload fillbit other
  @param [Numeric] other
  @return [Cumo::fill] self.
*/
static VALUE
Bit(VALUE self, VALUE val)
{
    cumo_ndfunc_arg_in_t ain[2] = {{CUMO_OVERWRITE,0},{cumo_sym_option}};
    cumo_ndfunc_t ndf = {bit_fill, CUMO_FULL_LOOP, 2,0, ain,0};

    cumo_na_ndloop(&ndf, 2, self, val);
    return self;
}
iter_bit_fill

#line 1 "narray/gen/tmpl_bit/format.c"
static VALUE
format_(VALUE fmt, dtype x)
{
    if (NIL_P(fmt)) {
        char s[4];
        int n;
        n = m_sprintf(s,x);
        return rb_str_new(s,n);
    }
    return rb_funcall(fmt, '%', 1, m_data_to_num(x));
}

static void
bit(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1, x=0;
    size_t     p1;
    char      *p2;
    ssize_t    s1, s2;
    size_t    *idx1;
    VALUE  y;
    VALUE  fmt = lp->option;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_format", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR(lp, 1, p2, s2);

    if (idx1) {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
            y = format_formatbit(fmt, x);
            CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
            y = format_bit(fmt, x);
            CUMO_SET_DATA_STRIDE(p2, s2, VALUE, y);
        }
    }
}

/*
  Format elements into strings.
  @overload bit format
  @param [String] format
  @return [Cumo::RObject] array of formated strings.
*/
static VALUE
format(int argc, VALUE *argv, VALUE self)
{
    VALUE fmt=Qnil;

    cumo_ndfunc_arg_in_t ain[2] = {{Qnil,0},{cumo_sym_option}};
    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cRObject,0}};
    cumo_ndfunc_t ndf = {bit_format, CUMO_FULL_LOOP_NIP, 2,1, ain,aout};

    rb_scan_args(argc, argv, "01", &fmt);
    return cumo_na_ndloop(&ndf, 2, self, fmt);
}
iter_bit_format

#line 1 "narray/gen/tmpl_bit/format_to_a.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1, x=0;
    size_t     p1;
    ssize_t    s1;
    size_t   *idx1;
    VALUE y;
    VALUE fmt = lp->option;
    volatile VALUE a;

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    a = rb_ary_new2(i);
    rb_ary_push(lp->args[1].value, a);
    if (idx1) {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
            y = format_bit(fmt, x);
            rb_ary_push(a,y);
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
            y = format_bit(fmt, x);
            rb_ary_push(a,y);
        }
    }
}

/*
  Format elements into strings.
  @overload iter_bit_format_to_a format
  @param [String] format
  @return [Array] array of formated strings.
*/
static VALUE
format_to_a(int argc, VALUE *argv, VALUE self)
{
    VALUE fmt=Qnil;
    cumo_ndfunc_arg_in_t ain[3] = {{Qnil,0},{cumo_sym_loop_opt},{cumo_sym_option}};
    cumo_ndfunc_arg_out_t aout[1] = {{rb_cArray,0}}; // dummy?
    cumo_ndfunc_t ndf = {bit_format_to_a, CUMO_FULL_LOOP_NIP, 3,1, ain,aout};

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_format_to_a", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    rb_scan_args(argc, argv, "01", &fmt);
    return cumo_na_ndloop_cast_narray_to_rarray(&ndf, self, fmt);
}
format_to_abit

#line 1 "narray/gen/tmpl_bit/inspect.c"
static VALUE
(char *ptr, size_t pos, VALUE fmt)
{
    dtype x;
    CUMO_LOAD_BIT(ptr,pos,x);
    return format_iter_bit_inspect(fmt, x);
}

/*
  Returns a string containing a human-readable representation of NArray.
  @overload inspect
  @return [String]
*/
static VALUE
bit(VALUE ary)
{
    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("bit_inspect", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    return cumo_na_ndloop_inspect(ary, inspectbit, Qnil);
}
iter_bit_inspect

#line 1 "narray/gen/tmpl_bit/each.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t   i;
    CUMO_BIT_DIGIT *a1, x=0;
    size_t   p1;
    ssize_t  s1;
    size_t  *idx1;
    VALUE  y;

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_each", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    if (idx1) {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
            y = m_data_to_num(x);
            rb_yield(y);
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
            y = m_data_to_num(x);
            rb_yield(y);
        }
    }
}

/*
  Calls the given block once for each element in self,
  passing that element as a parameter.
  @overload eachbit
  @return [Cumo::NArray] self
  For a block {|x| ... }
  @yield [x]  x is element of NArray.
*/
static VALUE
each(VALUE self)
{
    cumo_ndfunc_arg_in_t ain[1] = {{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_each, CUMO_FULL_LOOP_NIP, 1,0, ain,0};

    cumo_na_ndloop(&ndf, 1, self);
    return self;
}
iter_bit_each

#line 1 "narray/gen/tmpl_bit/each_with_index.c"
static inline void
yield_each_with_index(dtype x, size_t *c, VALUE *a, int nd, int md)
{
    int j;

    a[0] = m_data_to_num(x);
    for (j=0; j<=nd; j++) {
        a[j+1] = SIZET2NUM(c[j]);
    }
    rb_yield(rb_ary_new4(md,a));
}


static void
(cumo_na_loop_t *const lp)
{
    size_t   i;
    CUMO_BIT_DIGIT *a1, x=0;
    size_t   p1;
    ssize_t  s1;
    size_t  *idx1;

    VALUE *a;
    size_t *c;
    int nd, md;

    c = (size_t*)(lp->opt_ptr);
    nd = lp->ndim - 1;
    md = lp->ndim + 1;
    a = ALLOCA_N(VALUE,md);

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    c[nd] = 0;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_each_with_index", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    if (idx1) {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1+*idx1, x); idx1++;
            yield_each_with_index(x,c,a,nd,md);
            c[nd]++;
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a1, p1, x); p1+=s1;
            yield_each_with_index(x,c,a,nd,md);
            c[nd]++;
        }
    }
}

/*
  Invokes the given block once for each element of self,
  passing that element and indices along each axis as parameters.
  @overload each_with_indexbit
  @return [Cumo::NArray] self
  For a block {|x,i,j,...| ... }
  @yield [x,i,j,...]  x is an element, i,j,... are multidimensional indices.
*/
static VALUE
each_with_index(VALUE self)
{
    cumo_ndfunc_arg_in_t ain[1] = {{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_each_with_index, CUMO_FULL_LOOP_NIP, 1,0, ain,0};

    cumo_na_ndloop_with_index(&ndf, 1, self);
    return self;
}
iter_bit_each_with_index

#line 1 "narray/gen/tmpl_bit/unary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p3;
    ssize_t s1, s3;
    size_t *idx1, *idx3;
    int     o1, l1, r1, len;
    CUMO_BIT_DIGIT *a1, *a3;
    CUMO_BIT_DIGIT  x;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_copy", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a3, p3, s3, idx3);
    if (s1!=1 || s3!=1 || idx1 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            y = m_copybit(x);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, y);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            y = m_copy(x);
            *a3 = (y & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = m_copy(x);
                *(a3++) = y;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = m_copy(x);
                *(a3++) = y;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = m_copy(x);
            *a3 = (y & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Unary copy.
  @overload copy
  @return [Cumo::copy]  of self.
*/
static VALUE
Bitcopy(VALUE self)
{
    cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = {bit_copy, CUMO_FULL_LOOP, 1,1, ain,aout};

    return cumo_na_ndloop(&ndf, 1, self);
}
iter_bit_copy

#line 1 "narray/gen/tmpl_bit/unary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p3;
    ssize_t s1, s3;
    size_t *idx1, *idx3;
    int     o1, l1, r1, len;
    CUMO_BIT_DIGIT *a1, *a3;
    CUMO_BIT_DIGIT  x;
    CUMO_BIT_DIGIT  y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_not", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a3, p3, s3, idx3);
    if (s1!=1 || s3!=1 || idx1 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            y = m_notbit(x);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, y);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            y = m_not(x);
            *a3 = (y & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = m_not(x);
                *(a3++) = y;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = m_not(x);
                *(a3++) = y;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = m_not(x);
            *a3 = (y & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Unary not.
  @overload not
  @return [Cumo::not]  of self.
*/
static VALUE
Bitnot(VALUE self)
{
    cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = {bit_not, CUMO_FULL_LOOP, 1,1, ain,aout};

    return cumo_na_ndloop(&ndf, 1, self);
}
iter_bit_not

#line 1 "narray/gen/tmpl_bit/binary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p2, p3;
    ssize_t s1, s2, s3;
    size_t *idx1, *idx2, *idx3;
    int     o1, o2, l1, l2, r1, r2, len;
    CUMO_BIT_DIGIT *a1, *a2, *a3;
    CUMO_BIT_DIGIT  x, y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_and", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    CUMO_INIT_PTR_BIT_IDX(lp, 2, a3, p3, s3, idx3);
    if (s1!=1 || s2!=1 || s3!=1 || idx1 || idx2 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            CUMO_LOAD_BIT_STEP(a2, p2, s2, idx2, y);
            x = m_andbit(x,y);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, x);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        o2 =  p2 % CUMO_NB;
        o2 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        l2 =  CUMO_NB+o2;
        r2 =  CUMO_NB-o2;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            if (o2>=0) y = *a2>>o2;
            else       y = *a2<<-o2;
            if (p2+len>CUMO_NB)  y |= *(a2+1)<<r2;
            a2++;
            x = m_and(x,y);
            *a3 = (x & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0 && o2==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = *(a2++);
                x = m_and(x,y);
                *(a3++) = x;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = *a2>>o2;
                if (o2<0)  y |= *(a2-1)>>l2;
                if (o2>0)  y |= *(a2+1)<<r2;
                a2++;
                x = m_and(x,y);
                *(a3++) = x;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = *a2>>o2;
            if (o2<0)  y |= *(a2-1)>>l2;
            x = m_and(x,y);
            *a3 = (x & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Binary and.
  @overload and other
  @param [Cumo::NArray,Numeric] other
  @return [Cumo::NArray] & of self and other.
*/
static VALUE
and(VALUE self, VALUE other)
{
    cumo_ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_and, CUMO_FULL_LOOP, 2, 1, ain, aout };

    return cumo_na_ndloop(&ndf, 2, self, other);
}
iter_bit_and

#line 1 "narray/gen/tmpl_bit/binary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p2, p3;
    ssize_t s1, s2, s3;
    size_t *idx1, *idx2, *idx3;
    int     o1, o2, l1, l2, r1, r2, len;
    CUMO_BIT_DIGIT *a1, *a2, *a3;
    CUMO_BIT_DIGIT  x, y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_or", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    CUMO_INIT_PTR_BIT_IDX(lp, 2, a3, p3, s3, idx3);
    if (s1!=1 || s2!=1 || s3!=1 || idx1 || idx2 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            CUMO_LOAD_BIT_STEP(a2, p2, s2, idx2, y);
            x = m_orbit(x,y);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, x);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        o2 =  p2 % CUMO_NB;
        o2 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        l2 =  CUMO_NB+o2;
        r2 =  CUMO_NB-o2;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            if (o2>=0) y = *a2>>o2;
            else       y = *a2<<-o2;
            if (p2+len>CUMO_NB)  y |= *(a2+1)<<r2;
            a2++;
            x = m_or(x,y);
            *a3 = (x & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0 && o2==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = *(a2++);
                x = m_or(x,y);
                *(a3++) = x;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = *a2>>o2;
                if (o2<0)  y |= *(a2-1)>>l2;
                if (o2>0)  y |= *(a2+1)<<r2;
                a2++;
                x = m_or(x,y);
                *(a3++) = x;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = *a2>>o2;
            if (o2<0)  y |= *(a2-1)>>l2;
            x = m_or(x,y);
            *a3 = (x & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Binary or.
  @overload or other
  @param [Cumo::NArray,Numeric] other
  @return [Cumo::NArray] | of self and other.
*/
static VALUE
or(VALUE self, VALUE other)
{
    cumo_ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_or, CUMO_FULL_LOOP, 2, 1, ain, aout };

    return cumo_na_ndloop(&ndf, 2, self, other);
}
iter_bit_or

#line 1 "narray/gen/tmpl_bit/binary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p2, p3;
    ssize_t s1, s2, s3;
    size_t *idx1, *idx2, *idx3;
    int     o1, o2, l1, l2, r1, r2, len;
    CUMO_BIT_DIGIT *a1, *a2, *a3;
    CUMO_BIT_DIGIT  x, y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_xor", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    CUMO_INIT_PTR_BIT_IDX(lp, 2, a3, p3, s3, idx3);
    if (s1!=1 || s2!=1 || s3!=1 || idx1 || idx2 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            CUMO_LOAD_BIT_STEP(a2, p2, s2, idx2, y);
            x = m_xorbit(x,y);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, x);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        o2 =  p2 % CUMO_NB;
        o2 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        l2 =  CUMO_NB+o2;
        r2 =  CUMO_NB-o2;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            if (o2>=0) y = *a2>>o2;
            else       y = *a2<<-o2;
            if (p2+len>CUMO_NB)  y |= *(a2+1)<<r2;
            a2++;
            x = m_xor(x,y);
            *a3 = (x & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0 && o2==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = *(a2++);
                x = m_xor(x,y);
                *(a3++) = x;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = *a2>>o2;
                if (o2<0)  y |= *(a2-1)>>l2;
                if (o2>0)  y |= *(a2+1)<<r2;
                a2++;
                x = m_xor(x,y);
                *(a3++) = x;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = *a2>>o2;
            if (o2<0)  y |= *(a2-1)>>l2;
            x = m_xor(x,y);
            *a3 = (x & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Binary xor.
  @overload xor other
  @param [Cumo::NArray,Numeric] other
  @return [Cumo::NArray] ^ of self and other.
*/
static VALUE
xor(VALUE self, VALUE other)
{
    cumo_ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_xor, CUMO_FULL_LOOP, 2, 1, ain, aout };

    return cumo_na_ndloop(&ndf, 2, self, other);
}
iter_bit_xor

#line 1 "narray/gen/tmpl_bit/binary.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  n;
    size_t  p1, p2, p3;
    ssize_t s1, s2, s3;
    size_t *idx1, *idx2, *idx3;
    int     o1, o2, l1, l2, r1, r2, len;
    CUMO_BIT_DIGIT *a1, *a2, *a3;
    CUMO_BIT_DIGIT  x, y;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_eq", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, n);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    CUMO_INIT_PTR_BIT_IDX(lp, 2, a3, p3, s3, idx3);
    if (s1!=1 || s2!=1 || s3!=1 || idx1 || idx2 || idx3) {
        for (; n--;) {
            CUMO_LOAD_BIT_STEP(a1, p1, s1, idx1, x);
            CUMO_LOAD_BIT_STEP(a2, p2, s2, idx2, y);
            x = m_eqbit(x,y);
            CUMO_STORE_BIT_STEP(a3, p3, s3, idx3, x);
        }
    } else {
        o1 =  p1 % CUMO_NB;
        o1 -= p3;
        o2 =  p2 % CUMO_NB;
        o2 -= p3;
        l1 =  CUMO_NB+o1;
        r1 =  CUMO_NB-o1;
        l2 =  CUMO_NB+o2;
        r2 =  CUMO_NB-o2;
        if (p3>0 || n<CUMO_NB) {
            len = CUMO_NB - p3;
            if ((int)n<len) len=n;
            if (o1>=0) x = *a1>>o1;
            else       x = *a1<<-o1;
            if (p1+len>CUMO_NB)  x |= *(a1+1)<<r1;
            a1++;
            if (o2>=0) y = *a2>>o2;
            else       y = *a2<<-o2;
            if (p2+len>CUMO_NB)  y |= *(a2+1)<<r2;
            a2++;
            x = m_eq(x,y);
            *a3 = (x & (CUMO_SLB(len)<<p3)) | (*a3 & ~(CUMO_SLB(len)<<p3));
            a3++;
            n -= len;
        }
        if (o1==0 && o2==0) {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *(a1++);
                y = *(a2++);
                x = m_eq(x,y);
                *(a3++) = x;
            }
        } else {
            for (; n>=CUMO_NB; n-=CUMO_NB) {
                x = *a1>>o1;
                if (o1<0)  x |= *(a1-1)>>l1;
                if (o1>0)  x |= *(a1+1)<<r1;
                a1++;
                y = *a2>>o2;
                if (o2<0)  y |= *(a2-1)>>l2;
                if (o2>0)  y |= *(a2+1)<<r2;
                a2++;
                x = m_eq(x,y);
                *(a3++) = x;
            }
        }
        if (n>0) {
            x = *a1>>o1;
            if (o1<0)  x |= *(a1-1)>>l1;
            y = *a2>>o2;
            if (o2<0)  y |= *(a2-1)>>l2;
            x = m_eq(x,y);
            *a3 = (x & CUMO_SLB(n)) | (*a3 & CUMO_BALL<<n);
        }
    }
}

/*
  Binary eq.
  @overload eq other
  @param [Cumo::NArray,Numeric] other
  @return [Cumo::NArray] eq of self and other.
*/
static VALUE
eq(VALUE self, VALUE other)
{
    cumo_ndfunc_arg_in_t ain[2] = {{cT,0},{cT,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_eq, CUMO_FULL_LOOP, 2, 1, ain, aout };

    return cumo_na_ndloop(&ndf, 2, self, other);
}
iter_bit_eq

#line 1 "narray/gen/tmpl_bit/bit_count.c"
#undef int_t
#define int_t uint64_t

void (size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, uint64_t n);
void cumo_iter_bit_count_true_index_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, uint64_t n);
void cumo_iter_bit_count_true_stride_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n);
void cumo_iter_bit_count_true_index_stride_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n);

static void
cumo_iter_bit_count_true_stride_stride_kernel_launch(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1;
    size_t  p1;
    char   *p2;
    ssize_t s1, s2;
    size_t *idx1;

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR(lp, 1, p2, s2);

    if (s2==0) {
        if (idx1) {
            iter_bit_count_true(p1,p2,a1,idx1,i);
        } else {
            cumo_iter_bit_count_true_index_kernel_launch(p1,p2,a1,s1,i);
        }
    } else {
        if (idx1) {
            cumo_iter_bit_count_true_stride_kernel_launch(p1,p2,a1,idx1,s2,i);
        } else {
            cumo_iter_bit_count_true_index_stride_kernel_launch(p1,p2,a1,s1,s2,i);
        }
    }
}

static VALUE
cumo_iter_bit_count_true_stride_stride_kernel_launch_cpu(int argc, VALUE *argv, VALUE self);

/*
  Returns the number of bits.
  If argument is supplied, return Int-array counted along the axes.
  @overload bit_count_true(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be counted.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::UInt64]
*/
static VALUE
count_true(int argc, VALUE *argv, VALUE self)
{
    if (cumo_compatible_mode_enabled_p()) {
        return bit_count_true_cpu(argc, argv, self);
    } else {
        VALUE v, reduce;
        cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
        cumo_ndfunc_arg_out_t aout[1] = {{cumo_cUInt64,0}};
        cumo_ndfunc_t ndf = { bit_count_true, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };

        reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
        v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
        return v;
    }
}
iter_bit_count_true


#line 1 "narray/gen/tmpl_bit/bit_count.c"
#undef int_t
#define int_t uint64_t

void (size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, uint64_t n);
void cumo_iter_bit_count_false_index_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, uint64_t n);
void cumo_iter_bit_count_false_stride_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, size_t *idx1, ssize_t s2, uint64_t n);
void cumo_iter_bit_count_false_index_stride_kernel_launch(size_t p1, char *p2, CUMO_BIT_DIGIT *a1, ssize_t s1, ssize_t s2, uint64_t n);

static void
cumo_iter_bit_count_false_stride_stride_kernel_launch(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1;
    size_t  p1;
    char   *p2;
    ssize_t s1, s2;
    size_t *idx1;

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR(lp, 1, p2, s2);

    if (s2==0) {
        if (idx1) {
            iter_bit_count_false(p1,p2,a1,idx1,i);
        } else {
            cumo_iter_bit_count_false_index_kernel_launch(p1,p2,a1,s1,i);
        }
    } else {
        if (idx1) {
            cumo_iter_bit_count_false_stride_kernel_launch(p1,p2,a1,idx1,s2,i);
        } else {
            cumo_iter_bit_count_false_index_stride_kernel_launch(p1,p2,a1,s1,s2,i);
        }
    }
}

static VALUE
cumo_iter_bit_count_false_stride_stride_kernel_launch_cpu(int argc, VALUE *argv, VALUE self);

/*
  Returns the number of bits.
  If argument is supplied, return Int-array counted along the axes.
  @overload bit_count_false(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be counted.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::UInt64]
*/
static VALUE
count_false(int argc, VALUE *argv, VALUE self)
{
    if (cumo_compatible_mode_enabled_p()) {
        return bit_count_false_cpu(argc, argv, self);
    } else {
        VALUE v, reduce;
        cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
        cumo_ndfunc_arg_out_t aout[1] = {{cumo_cUInt64,0}};
        cumo_ndfunc_t ndf = { bit_count_false, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };

        reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
        v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
        return v;
    }
}
iter_bit_count_false


#line 1 "narray/gen/tmpl_bit/bit_count_cpu.c"
#undef int_t
#define int_t int64_t

static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1;
    size_t  p1;
    char   *p2;
    ssize_t s1, s2;
    size_t *idx1;
    CUMO_BIT_DIGIT x=0;
    int_t   y;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_count_true_cpu", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR(lp, 1, p2, s2);
    if (s2==0) {
        CUMO_GET_DATA(p2, int_t, y);
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                idx1++;
                if (m_count_true_cpubit(x)) {
                    y++;
                }
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, x);
                p1 += s1;
                if (m_count_true_cpu(x)) {
                    y++;
                }
            }
        }
        *(int_t*)p2 = y;
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                idx1++;
                if (m_count_true_cpu(x)) {
                    CUMO_GET_DATA(p2, int_t, y);
                    y++;
                    CUMO_SET_DATA(p2, int_t, y);
                }
                p2+=s2;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, x);
                p1+=s1;
                if (m_count_true_cpu(x)) {
                    CUMO_GET_DATA(p2, int_t, y);
                    y++;
                    CUMO_SET_DATA(p2, int_t, y);
                }
                p2+=s2;
            }
        }
    }
}

/*
  Returns the number of bits.
  If argument is supplied, return Int-array counted along the axes.
  @overload count_true_cpu(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be counted.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::Int64]
*/
static VALUE
count_true_cpu(int argc, VALUE *argv, VALUE self)
{
    VALUE v, reduce;
    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cInt64,0}};
    cumo_ndfunc_t ndf = { bit_count_true_cpu, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };

    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
    return rb_funcall(v,rb_intern("extract_cpu"),0);
}
iter_bit_count_true_cpu


#line 1 "narray/gen/tmpl_bit/bit_count_cpu.c"
#undef int_t
#define int_t int64_t

static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a1;
    size_t  p1;
    char   *p2;
    ssize_t s1, s2;
    size_t *idx1;
    CUMO_BIT_DIGIT x=0;
    int_t   y;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_count_false_cpu", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR(lp, 1, p2, s2);
    if (s2==0) {
        CUMO_GET_DATA(p2, int_t, y);
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                idx1++;
                if (m_count_false_cpubit(x)) {
                    y++;
                }
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, x);
                p1 += s1;
                if (m_count_false_cpu(x)) {
                    y++;
                }
            }
        }
        *(int_t*)p2 = y;
    } else {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, x);
                idx1++;
                if (m_count_false_cpu(x)) {
                    CUMO_GET_DATA(p2, int_t, y);
                    y++;
                    CUMO_SET_DATA(p2, int_t, y);
                }
                p2+=s2;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, x);
                p1+=s1;
                if (m_count_false_cpu(x)) {
                    CUMO_GET_DATA(p2, int_t, y);
                    y++;
                    CUMO_SET_DATA(p2, int_t, y);
                }
                p2+=s2;
            }
        }
    }
}

/*
  Returns the number of bits.
  If argument is supplied, return Int-array counted along the axes.
  @overload count_false_cpu(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be counted.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::Int64]
*/
static VALUE
count_false_cpu(int argc, VALUE *argv, VALUE self)
{
    VALUE v, reduce;
    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cInt64,0}};
    cumo_ndfunc_t ndf = { bit_count_false_cpu, CUMO_FULL_LOOP_NIP, 3, 1, ain, aout };

    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(0));
    return rb_funcall(v,rb_intern("extract_cpu"),0);
}
iter_bit_count_false_cpu


#line 1 "narray/gen/tmpl_bit/bit_reduce.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t     i;
    CUMO_BIT_DIGIT *a1, *a2;
    size_t     p1,  p2;
    ssize_t    s1,  s2;
    size_t    *idx1, *idx2;
    CUMO_BIT_DIGIT  x=0, y=0;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_all_p", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                if (y == all?bit) {
                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                    if (x != 1) {
                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                    }
                }
                idx1++;
                idx2++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                if (y == 1) {
                    CUMO_LOAD_BIT(a1, p1, x);
                    if (x != 1) {
                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                    }
                }
                p1 += s1;
                idx2++;
            }
        }
    } else if (s2) {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2, y);
                if (y == 1) {
                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                    if (x != 1) {
                        CUMO_STORE_BIT(a2, p2, x);
                    }
                }
                idx1++;
                p2 += s2;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2, y);
                if (y == 1) {
                    CUMO_LOAD_BIT(a1, p1, x);
                    if (x != 1) {
                        CUMO_STORE_BIT(a2, p2, x);
                    }
                }
                p1 += s1;
                p2 += s2;
            }
        }
    } else {
        CUMO_LOAD_BIT(a2, p2, x);
        if (x != 1) {
            return;
        }
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, y);
                if (y != 1) {
                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, y);
                if (y != 1) {
                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                p1 += s1;
            }
        }
    }
}

/*
  Return true if all of bits are one (true).
  If argument is supplied, return Bit-array reduced along the axes.
  @overload 1(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be reduced.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::Bit] .
*/
static VALUE
all?(int argc, VALUE *argv, VALUE self)
{
    VALUE v, reduce;
    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cBit,0}};
    cumo_ndfunc_t ndf = {bit_all_p, CUMO_FULL_LOOP_NIP, 3,1, ain,aout};

    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(iter_bit_all_p));
    if (argc > 0) {
        return v;
    }
    v = 1(v);
    switch (v) {
    case INT2FIX(0):
        return Qfalse;
    case INT2FIX(1):
        return Qtrue;
    default:
        rb_bug("unexpected result");
        return v;
    }
}
bit_extract

#line 1 "narray/gen/tmpl_bit/bit_reduce.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t     i;
    CUMO_BIT_DIGIT *a1, *a2;
    size_t     p1,  p2;
    ssize_t    s1,  s2;
    size_t    *idx1, *idx2;
    CUMO_BIT_DIGIT  x=0, y=0;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_any_p", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a1, p1, s1, idx1);
    CUMO_INIT_PTR_BIT_IDX(lp, 1, a2, p2, s2, idx2);
    if (idx2) {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                if (y == any?bit) {
                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                    if (x != 0) {
                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                    }
                }
                idx1++;
                idx2++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2+*idx2, y);
                if (y == 0) {
                    CUMO_LOAD_BIT(a1, p1, x);
                    if (x != 0) {
                        CUMO_STORE_BIT(a2, p2+*idx2, x);
                    }
                }
                p1 += s1;
                idx2++;
            }
        }
    } else if (s2) {
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2, y);
                if (y == 0) {
                    CUMO_LOAD_BIT(a1, p1+*idx1, x);
                    if (x != 0) {
                        CUMO_STORE_BIT(a2, p2, x);
                    }
                }
                idx1++;
                p2 += s2;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a2, p2, y);
                if (y == 0) {
                    CUMO_LOAD_BIT(a1, p1, x);
                    if (x != 0) {
                        CUMO_STORE_BIT(a2, p2, x);
                    }
                }
                p1 += s1;
                p2 += s2;
            }
        }
    } else {
        CUMO_LOAD_BIT(a2, p2, x);
        if (x != 0) {
            return;
        }
        if (idx1) {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1+*idx1, y);
                if (y != 0) {
                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                idx1++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a1, p1, y);
                if (y != 0) {
                    CUMO_STORE_BIT(a2, p2, y);
                    return;
                }
                p1 += s1;
            }
        }
    }
}

/*

  Return true if any of bits is one (true).
  If argument is supplied, return Bit-array reduced along the axes.
  @overload 0(axis:nil, keepdims:false)
  @param [Integer,Array,Range] axis (keyword) axes to be reduced.
  @param [TrueClass] keepdims (keyword) If true, the reduced axes are left in the result array as dimensions with size one.
  @return [Cumo::Bit] .
*/
static VALUE
any?(int argc, VALUE *argv, VALUE self)
{
    VALUE v, reduce;
    cumo_ndfunc_arg_in_t ain[3] = {{cT,0},{cumo_sym_reduce,0},{cumo_sym_init,0}};
    cumo_ndfunc_arg_out_t aout[1] = {{cumo_cBit,0}};
    cumo_ndfunc_t ndf = {bit_any_p, CUMO_FULL_LOOP_NIP, 3,1, ain,aout};

    reduce = cumo_na_reduce_dimension(argc, argv, 1, &self, &ndf, 0);
    v = cumo_na_ndloop(&ndf, 3, self, reduce, INT2FIX(iter_bit_any_p));
    if (argc > 0) {
        return v;
    }
    v = 0(v);
    switch (v) {
    case INT2FIX(0):
        return Qfalse;
    case INT2FIX(1):
        return Qtrue;
    default:
        rb_bug("unexpected result");
        return v;
    }
}
bit_extract

#line 1 "narray/gen/tmpl_bit/none_p.c"
static VALUE
(int argc, VALUE *argv, VALUE self)
{
    VALUE v;

    v = bit_none_p(argc,argv,self);

    if (v==Qtrue) {
        return Qfalse;
    } else if (v==Qfalse) {
        return Qtrue;
    }
    return bit_any_p(v);
}
bit_not

#line 1 "narray/gen/tmpl_bit/where.c"
typedef struct {
    size_t count;
    char  *idx0;
    char  *idx1;
    size_t elmsz;
} where_opt_t;

#define STORE_INT(ptr, esz, x) memcpy(ptr,&(x),esz)

static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a;
    size_t  p;
    ssize_t s;
    size_t *idx;
    CUMO_BIT_DIGIT x=0;
    char   *idx1;
    size_t  count;
    size_t  e;
    where_opt_t *g;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_where", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    g = (where_opt_t*)(lp->opt_ptr);
    count = g->count;
    idx1  = g->idx1;
    e     = g->elmsz;
    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a, p, s, idx);
    if (idx) {
        for (; i--;) {
            CUMO_LOAD_BIT(a, p+*idx, x);
            idx++;
            if (x!=0) {
                STORE_INT(idx1,e,count);
                idx1 += e;
            }
            count++;
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a, p, x);
            p+=s;
            if (x!=0) {
                STORE_INT(idx1,e,count);
                idx1 += e;
            }
            count++;
        }
    }
    g->count = count;
    g->idx1  = idx1;
}

/*
  Returns the array of index where the bit is one (true).
  @overload wherebit
  @return [Cumo::Int32,Cumo::Int64]
*/
static VALUE
where(VALUE self)
{
    volatile VALUE idx_1;
    size_t size, n_1;
    where_opt_t *g;

    cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_where, CUMO_FULL_LOOP, 1, 0, ain, 0 };

    size = CUMO_RNARRAY_SIZE(self);
    n_1 = NUM2SIZET(iter_bit_where(0, NULL, self));
    g = ALLOCA_N(where_opt_t,1);
    g->count = 0;
    if (size>4294967295ul) {
        idx_1 = cumo_na_new(cumo_cInt64, 1, &n_1);
        g->elmsz = 8;
    } else {
        idx_1 = cumo_na_new(cumo_cInt32, 1, &n_1);
        g->elmsz = 4;
    }
    g->idx1 = cumo_na_get_pointer_for_write(idx_1);
    g->idx0 = NULL;
    cumo_na_ndloop3(&ndf, g, 1, self);
    cumo_na_release_lock(idx_1);
    return idx_1;
}
bit_count_true_cpu

#line 1 "narray/gen/tmpl_bit/where2.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a;
    size_t  p;
    ssize_t s;
    size_t *idx;
    CUMO_BIT_DIGIT x=0;
    char   *idx0, *idx1;
    size_t  count;
    size_t  e;
    where_opt_t *g;

    // TODO(sonots): CUDA kernelize
    CUMO_SHOW_SYNCHRONIZE_FIXME_WARNING_ONCE("iter_bit_where2", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    g = (where_opt_t*)(lp->opt_ptr);
    count = g->count;
    idx0  = g->idx0;
    idx1  = g->idx1;
    e     = g->elmsz;
    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a, p, s, idx);
    if (idx) {
        for (; i--;) {
            CUMO_LOAD_BIT(a, p+*idx, x);
            idx++;
            if (x==0) {
                STORE_INT(idx0,e,count);
                idx0 += e;
            } else {
                STORE_INT(idx1,e,count);
                idx1 += e;
            }
            count++;
        }
    } else {
        for (; i--;) {
            CUMO_LOAD_BIT(a, p, x);
            p+=s;
            if (x==0) {
                STORE_INT(idx0,e,count);
                idx0 += e;
            } else {
                STORE_INT(idx1,e,count);
                idx1 += e;
            }
            count++;
        }
    }
    g->count = count;
    g->idx0  = idx0;
    g->idx1  = idx1;
}

/*
  Returns two index arrays.
  The first array contains index where the bit is one (true).
  The second array contains index where the bit is zero (false).
  @overload where2bit
  @return [Cumo::Int32,Cumo::Int64]*2
*/
static VALUE
where2(VALUE self)
{
    VALUE idx_1, idx_0;
    size_t size, n_1, n_0;
    where_opt_t *g;

    cumo_ndfunc_arg_in_t ain[1] = {{cT,0}};
    cumo_ndfunc_t ndf = { bit_where2, CUMO_FULL_LOOP, 1, 0, ain, 0 };

    size = CUMO_RNARRAY_SIZE(self);
    n_1 = NUM2SIZET(iter_bit_where2(0, NULL, self));
    n_0 = size - n_1;
    g = ALLOCA_N(where_opt_t,1);
    g->count = 0;
    if (size>4294967295ul) {
        idx_1 = cumo_na_new(cumo_cInt64, 1, &n_1);
        idx_0 = cumo_na_new(cumo_cInt64, 1, &n_0);
        g->elmsz = 8;
    } else {
        idx_1 = cumo_na_new(cumo_cInt32, 1, &n_1);
        idx_0 = cumo_na_new(cumo_cInt32, 1, &n_0);
        g->elmsz = 4;
    }
    g->idx1 = cumo_na_get_pointer_for_write(idx_1);
    g->idx0 = cumo_na_get_pointer_for_write(idx_0);
    cumo_na_ndloop3(&ndf, g, 1, self);
    cumo_na_release_lock(idx_0);
    cumo_na_release_lock(idx_1);
    return rb_assoc_new(idx_1,idx_0);
}
bit_count_true_cpu

#line 1 "narray/gen/tmpl_bit/mask.c"
static void
(cumo_na_loop_t *const lp)
{
    size_t  i;
    CUMO_BIT_DIGIT *a;
    size_t  p1, p2;
    ssize_t s1, s2;
    size_t *idx1, *idx2, *pidx;
    CUMO_BIT_DIGIT x=0;
    size_t  count;
    where_opt_t *g;

    CUMO_SHOW_SYNCHRONIZE_WARNING_ONCE("iter_bit_mask", "");
    cumo_cuda_runtime_check_status(cudaDeviceSynchronize());

    g = (where_opt_t*)(lp->opt_ptr);
    count = g->count;
    pidx  = (size_t*)(g->idx1);
    CUMO_INIT_COUNTER(lp, i);
    CUMO_INIT_PTR_BIT_IDX(lp, 0, a, p1, s1, idx1);
    //CUMO_INIT_PTR_IDX(lp, 1, p2, s2, idx2);
    p2 = lp->args[1].iter[0].pos;
    s2 = lp->args[1].iter[0].step;
    idx2 = lp->args[1].iter[0].idx;

    if (idx1) {
        if (idx2) {
            for (; i--;) {
                CUMO_LOAD_BIT(a, p1+*idx1, x);
                idx1++;
                if (x) {
                    *(pidx++) = p2+*idx2;
                    count++;
                }
                idx2++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a, p1+*idx1, x);
                idx1++;
                if (x) {
                    *(pidx++) = p2;
                    count++;
                }
                p2 += s2;
            }
        }
    } else {
        if (idx2) {
            for (; i--;) {
                CUMO_LOAD_BIT(a, p1, x);
                p1 += s1;
                if (x) {
                    *(pidx++) = p2+*idx2;
                    count++;
                }
                idx2++;
            }
        } else {
            for (; i--;) {
                CUMO_LOAD_BIT(a, p1, x);
                p1 += s1;
                if (x) {
                    *(pidx++) = p2;
                    count++;
                }
                p2 += s2;
            }
        }
    }
    g->count = count;
    g->idx1  = (char*)pidx;
}

#if   SIZEOF_VOIDP == 8
#define cIndex cumo_cInt64
#elif SIZEOF_VOIDP == 4
#define cIndex cumo_cInt32
#endif

/*
  Return subarray of argument masked with self bit array.
  @overload maskbit(array)
  @param [Cumo::NArray] array  narray to be masked.
  @return [Cumo::NArray]  view of masked array.
*/
static VALUE
mask(VALUE mask, VALUE val)
{
    volatile VALUE idx_1, view;
    cumo_narray_data_t *nidx;
    cumo_narray_view_t *nv;
    cumo_narray_t      *na;
    cumo_narray_view_t *na1;
    cumo_stridx_t stridx0;
    size_t n_1;
    where_opt_t g;
    cumo_ndfunc_arg_in_t ain[2] = {{cT,0},{Qnil,0}};
    cumo_ndfunc_t ndf = {bit_mask, CUMO_FULL_LOOP, 2, 0, ain, 0};

    // TODO(sonots): bit_count_true synchronizes with CPU. Avoid.
    n_1 = NUM2SIZET(iter_bit_mask(0, NULL, mask));
    idx_1 = cumo_na_new(cIndex, 1, &n_1);
    g.count = 0;
    g.elmsz = SIZEOF_VOIDP;
    g.idx1 = cumo_na_get_pointer_for_write(idx_1);
    g.idx0 = NULL;
    cumo_na_ndloop3(&ndf, &g, 2, mask, val);

    view = cumo_na_s_allocate_view(CLASS_OF(val));
    CumoGetNArrayView(view, nv);
    cumo_na_setup_shape((cumo_narray_t*)nv, 1, &n_1);

    CumoGetNArrayData(idx_1,nidx);
    CUMO_SDX_SET_INDEX(stridx0,(size_t*)nidx->ptr);
    nidx->ptr = NULL;

    nv->stridx = ALLOC_N(cumo_stridx_t,1);
    nv->stridx[0] = stridx0;
    nv->offset = 0;

    CumoGetNArray(val, na);
    switch(CUMO_NA_TYPE(na)) {
    case CUMO_NARRAY_DATA_T:
        nv->data = val;
        break;
    case CUMO_NARRAY_VIEW_T:
        CumoGetNArrayView(val, na1);
        nv->data = na1->data;
        break;
    default:
        rb_raise(rb_eRuntimeError,"invalid CUMO_NA_TYPE: %d",CUMO_NA_TYPE(na));
    }

    return view;
}
bit_count_true_cpu

(void)
{
    VALUE hCast, cumo_bit;

    mCumo = rb_define_module("Cumo");


    mCumo


cumo_id_cast = rb_intern("cast");cumo_id_divmod = rb_intern("divmod");cumo_id_eq = rb_intern("eq");cumo_id_mulsum = rb_intern("mulsum");cumo_id_ne = rb_intern("ne");cumo_id_pow = rb_intern("pow");}

#line 1 "narray/gen/tmpl/init_class.c"
    /*
      Document-class:
      Cumo::Bit
    */
    cT = rb_define_class_under(, "", cNArray);


    hCast = rb_hash_new();
    rb_define_const(cT, "UPCAST", hCast);
    rb_hash_aset(hCast, rb_cArray,   cT);

    mCumoBit


    #ifdef RUBY_INTEGER_UNIFICATIONrb_hash_aset(hCast, rb_cInteger, cT);#elserb_hash_aset(hCast, rb_cFixnum, cT);rb_hash_aset(hCast, rb_cBignum, cT);#endifrb_hash_aset(hCast, rb_cFloat, cumo_cDFloat);rb_hash_aset(hCast, rb_cComplex, cumo_cDComplex);rb_hash_aset(hCast, cumo_cRObject, cumo_cRObject);rb_hash_aset(hCast, cumo_cDComplex, cumo_cDComplex);rb_hash_aset(hCast, cumo_cSComplex, cumo_cSComplex);rb_hash_aset(hCast, cumo_cDFloat, cumo_cDFloat);rb_hash_aset(hCast, cumo_cSFloat, cumo_cSFloat);rb_hash_aset(hCast, cumo_cInt64, cumo_cInt64);rb_hash_aset(hCast, cumo_cInt32, cumo_cInt32);rb_hash_aset(hCast, cumo_cInt16, cumo_cInt16);rb_hash_aset(hCast, cumo_cInt8, cumo_cInt8);rb_hash_aset(hCast, cumo_cUInt64, cumo_cUInt64);rb_hash_aset(hCast, cumo_cUInt32, cumo_cUInt32);rb_hash_aset(hCast, cumo_cUInt16, cumo_cUInt16);rb_hash_aset(hCast, cumo_cUInt8, cumo_cUInt8);


    rb_define_singleton_method(cT, "[]", /**/
    rb_define_const(cT,"ELEMENT_BIT_SIZE",INT2FIX(1));/**/
    rb_define_const(cT,"ELEMENT_BYTE_SIZE",rb_float_new(1.0/8));/**/
    rb_define_const(cT,"CONTIGUOUS_STRIDE",INT2FIX(1));rb_define_alloc_func(cT, bit_s_alloc_func);rb_define_method(cT, "allocate", bit_allocate, 0);rb_define_method(cT, "extract", bit_extract, 0);rb_define_method(cT, "extract_cpu", bit_extract_cpu, 0);rb_define_method(cT, "store", bit_store, 1);rb_define_singleton_method(cT, "cast", bit_s_cast, 1);rb_define_method(cT, "[]", bit_aref, -1);rb_define_method(cT, "aref_cpu", bit_aref_cpu, -1);rb_define_method(cT, "[]=", bit_aset, -1);rb_define_method(cT, "coerce_cast", bit_coerce_cast, 1);rb_define_method(cT, "to_a", bit_to_a, 0);rb_define_method(cT, "fill", bit_fill, 1);rb_define_method(cT, "format", bit_format, -1);rb_define_method(cT, "format_to_a", bit_format_to_a, -1);rb_define_method(cT, "inspect", bit_inspect, 0);rb_define_method(cT, "each", bit_each, 0);rb_define_method(cT, "each_with_index", bit_each_with_index, 0);rb_define_method(cT, "copy", bit_copy, 0);rb_define_method(cT, "~", bit_not, 0);rb_define_method(cT, "&", bit_and, 1);rb_define_method(cT, "|", bit_or, 1);rb_define_method(cT, "^", bit_xor, 1);rb_define_method(cT, "eq", bit_eq, 1);rb_define_method(cT, "count_true", bit_count_true, -1);rb_define_alias(cT, "count_1", "count_true");rb_define_alias(cT, "count", "count_true");rb_define_method(cT, "count_false", bit_count_false, -1);rb_define_alias(cT, "count_0", "count_false");rb_define_method(cT, "count_true_cpu", bit_count_true_cpu, -1);rb_define_alias(cT, "count_1_cpu", "count_true_cpu");rb_define_alias(cT, "count_cpu", "count_true_cpu");rb_define_method(cT, "count_false_cpu", bit_count_false_cpu, -1);rb_define_alias(cT, "count_0_cpu", "count_false_cpu");rb_define_method(cT, "all?", bit_all_p, -1);rb_define_method(cT, "any?", bit_any_p, -1);rb_define_method(cT, "none?", bit_none_p, -1);rb_define_method(cT, "where", bit_where, 0);rb_define_method(cT, "where2", bit_where2, 0);rb_define_method(cT, "mask", bit_mask, 1);, -2);
bit_s_cast