Skip to content

Instantly share code, notes, and snippets.

@allanmac
Created July 14, 2013 21:21
Show Gist options
  • Save allanmac/5996141 to your computer and use it in GitHub Desktop.
Save allanmac/5996141 to your computer and use it in GitHub Desktop.
Examine the SASS that's generated for barrier reduction operations: __syncthreads_count(), __syncthreads_or(), __syncthreads_and() as well as the regular __syncthreads() barrier op. Somewhat surprisingly these are not mapped to a number of SASS ops. The barrier reductions are executed and the result is moved from a "barrier register" to a regula…
//
//
//
#define KERNEL_QUALIFIERS extern "C" __global__
//
//
//
KERNEL_QUALIFIERS
void
sync(const int* const vin, int* const vout)
{
const int v = vin[threadIdx.x];
__syncthreads();
vout[threadIdx.x] = v;
}
//
//
//
KERNEL_QUALIFIERS
void
syncCount(const int* const vin, int* const vout, unsigned int* const cout)
{
const int v = vin[threadIdx.x];
const bool c = v != 0;
const unsigned int d = __syncthreads_count(c);
cout[threadIdx.x] = d;
vout[threadIdx.x] = v;
}
//
//
//
KERNEL_QUALIFIERS
void
syncOr(const int* const vin, int* const vout)
{
const int v = vin[threadIdx.x];
const bool c = v != 0;
const bool p = __syncthreads_or(c);
vout[threadIdx.x] = p ? v : 0;
}
//
//
//
KERNEL_QUALIFIERS
void
syncAnd(const int* const vin, int* const vout)
{
const int v = vin[threadIdx.x];
const bool c = v != 0;
const bool p = __syncthreads_and(c);
vout[threadIdx.x] = p ? v : 0;
}
//
//
//
@allanmac
Copy link
Author

Compiled with nvcc -m 32 -arch sm_35 -Xptxas=-v,-abi=no -cubin sync.cu

Dumped with cuobjdump.exe -sass sync.cubin

        Function : sync
    .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                            /* 0x0a8880c480a0a0a0 */
        /*0008*/                S2R R0, SR_TID.X;           /* 0x86400000109c0002 */
        /*0010*/                SHF.L R1, RZ, 0x2, R0;      /* 0xb7c00000011ffc05 */
        /*0018*/                IADD R0, R1, c[0x0][0x140]; /* 0x60800000281c0402 */
        /*0020*/                LDG R0, [R0];               /* 0x600210047f9c0001 */
        /*0028*/                BAR.SYNC 0x0;               /* 0x8540dc00001c0002 */
        /*0030*/                IADD R1, R1, c[0x0][0x144]; /* 0x60800000289c0406 */
        /*0038*/                TEXDEPBAR 0x0;              /* 0x77000000001c0002 */
                                                            /* 0x080000000000b810 */
        /*0048*/                ST [R1], R0;                /* 0xe4000000001c0400 */
        /*0050*/                EXIT ;                      /* 0x18000000001c003c */
        /*0058*/                BRA 0x58;                   /* 0x12007ffffc1c003c */
        /*0060*/                NOP;                        /* 0x85800000001c3c02 */
        /*0068*/                NOP;                        /* 0x85800000001c3c02 */
        /*0070*/                NOP;                        /* 0x85800000001c3c02 */
        /*0078*/                NOP;                        /* 0x85800000001c3c02 */
        .....................

        Function : syncCount
    .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                     /* 0x08b0a30880a0a0a0 */
        /*0008*/                S2R R0, SR_TID.X;                    /* 0x86400000109c0002 */
        /*0010*/                SHF.L R2, RZ, 0x2, R0;               /* 0xb7c00000011ffc09 */
        /*0018*/                IADD R0, R2, c[0x0][0x140];          /* 0x60800000281c0802 */
        /*0020*/                LDG R1, [R0];                        /* 0x600210047f9c0005 */
        /*0028*/                TEXDEPBAR 0x0;                       /* 0x77000000001c0002 */
        /*0030*/                ICMP.EQ R0, RZ, 0x1, R1;             /* 0xb2280400009ffc01 */
        /*0038*/                ISETP.NE.U32.AND P0, PT, R0, RZ, PT; /* 0xdb501c007f9c001e */
                                                                     /* 0x08b810b8a01000c4 */
        /*0048*/                BAR.RED.POPC 0x0, P0;                /* 0x8540c010001c0002 */
        /*0050*/                B2R.RESULT R0;                       /* 0x85c70008001ffc02 */
        /*0058*/                IADD R3, R2, c[0x0][0x148];          /* 0x60800000291c080e */
        /*0060*/                IADD R2, R2, c[0x0][0x144];          /* 0x60800000289c080a */
        /*0068*/                ST [R3], R0;                         /* 0xe4000000001c0c00 */
        /*0070*/                ST [R2], R1;                         /* 0xe4000000001c0804 */
        /*0078*/                EXIT ;                               /* 0x18000000001c003c */
        /*0080*/                BRA 0x80;                            /* 0x12007ffffc1c003c */
        /*0088*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0090*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0098*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a8*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b8*/                NOP;                                 /* 0x85800000001c3c02 */
        ..........................

        Function : syncOr
    .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                     /* 0x08b0a30880a0a0a0 */
        /*0008*/                S2R R0, SR_TID.X;                    /* 0x86400000109c0002 */
        /*0010*/                SHF.L R1, RZ, 0x2, R0;               /* 0xb7c00000011ffc05 */
        /*0018*/                IADD R0, R1, c[0x0][0x140];          /* 0x60800000281c0402 */
        /*0020*/                LDG R0, [R0];                        /* 0x600210047f9c0001 */
        /*0028*/                TEXDEPBAR 0x0;                       /* 0x77000000001c0002 */
        /*0030*/                ICMP.EQ R2, RZ, 0x1, R0;             /* 0xb2280000009ffc09 */
        /*0038*/                ISETP.NE.U32.AND P1, PT, R2, RZ, PT; /* 0xdb501c007f9c083e */
                                                                     /* 0x08b810a0a01000c4 */
        /*0048*/                BAR.RED.OR 0x0, P1;                  /* 0x8540c490001c0002 */
        /*0050*/                B2R.RESULT RZ, P0;                   /* 0x85c00008001ffffe */
        /*0058*/                SEL R2, RZ, 0x1, !P0;                /* 0xc5002000009ffc09 */
        /*0060*/                IADD R1, R1, c[0x0][0x144];          /* 0x60800000289c0406 */
        /*0068*/                ICMP.EQ R0, RZ, R0, R2;              /* 0xda280800001ffc02 */
        /*0070*/                ST [R1], R0;                         /* 0xe4000000001c0400 */
        /*0078*/                EXIT ;                               /* 0x18000000001c003c */
        /*0080*/                BRA 0x80;                            /* 0x12007ffffc1c003c */
        /*0088*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0090*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0098*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a8*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b8*/                NOP;                                 /* 0x85800000001c3c02 */
        .......................

    code for sm_35
        Function : syncAnd
    .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                     /* 0x08b0a30880a0a0a0 */
        /*0008*/                S2R R0, SR_TID.X;                    /* 0x86400000109c0002 */
        /*0010*/                SHF.L R1, RZ, 0x2, R0;               /* 0xb7c00000011ffc05 */
        /*0018*/                IADD R0, R1, c[0x0][0x140];          /* 0x60800000281c0402 */
        /*0020*/                LDG R0, [R0];                        /* 0x600210047f9c0001 */
        /*0028*/                TEXDEPBAR 0x0;                       /* 0x77000000001c0002 */
        /*0030*/                ICMP.EQ R2, RZ, 0x1, R0;             /* 0xb2280000009ffc09 */
        /*0038*/                ISETP.NE.U32.AND P1, PT, R2, RZ, PT; /* 0xdb501c007f9c083e */
                                                                     /* 0x08b810a0a01000c4 */
        /*0048*/                BAR.RED.AND 0x0, P1;                 /* 0x8540c450001c0002 */
        /*0050*/                B2R.RESULT RZ, P0;                   /* 0x85c00008001ffffe */
        /*0058*/                SEL R2, RZ, 0x1, !P0;                /* 0xc5002000009ffc09 */
        /*0060*/                IADD R1, R1, c[0x0][0x144];          /* 0x60800000289c0406 */
        /*0068*/                ICMP.EQ R0, RZ, R0, R2;              /* 0xda280800001ffc02 */
        /*0070*/                ST [R1], R0;                         /* 0xe4000000001c0400 */
        /*0078*/                EXIT ;                               /* 0x18000000001c003c */
        /*0080*/                BRA 0x80;                            /* 0x12007ffffc1c003c */
        /*0088*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0090*/                NOP;                                 /* 0x85800000001c3c02 */
        /*0098*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00a8*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b0*/                NOP;                                 /* 0x85800000001c3c02 */
        /*00b8*/                NOP;                                 /* 0x85800000001c3c02 */
        ........................

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment