Skip to content

Instantly share code, notes, and snippets.

@bitbank2
Last active April 28, 2021 06:07
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bitbank2/0c4cfe12c58ae7b605bbeccbf93b2d36 to your computer and use it in GitHub Desktop.
Save bitbank2/0c4cfe12c58ae7b605bbeccbf93b2d36 to your computer and use it in GitHub Desktop.
C Optimization demo project
//
// main.c
// perf_demo
//
// Created by Laurence Bank on 3/24/21.
//
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>
#include <arm_neon.h>
uint8_t * ReadBMP(const char *fname, int *width, int *height)
{
int y, w, h, offset;
uint8_t *s, *d, *pBitmap;
int pitch;
int iSize, iDelta;
FILE *infile;
infile = fopen(fname, "r+b");
if (infile == NULL) {
printf("Error opening input file %s\n", fname);
return NULL;
}
// Read the bitmap into RAM
fseek(infile, 0, SEEK_END);
iSize = (int)ftell(infile);
fseek(infile, 0, SEEK_SET);
pBitmap = (uint8_t *)malloc(iSize);
fread(pBitmap, 1, iSize, infile);
fclose(infile);
if (pBitmap[0] != 'B' || pBitmap[1] != 'M' || pBitmap[14] < 0x28) {
free(pBitmap);
printf("Not a Windows BMP file!\n");
return NULL;
}
w = *(int32_t *)&pBitmap[18];
h = *(int32_t *)&pBitmap[22];
offset = *(int32_t *)&pBitmap[10]; // offset to bits
pitch = (w + 3) & 0xfffc; // DWORD aligned
// move up the pixels
d = pBitmap;
s = &pBitmap[offset];
iDelta = pitch;
if (h < 0) {
iDelta = -pitch;
s = &pBitmap[offset + (h-1) * pitch];
}
for (y=0; y<h; y++) {
memcpy(d, s, pitch);
d += pitch;
s += iDelta;
}
*width = w;
*height = h;
return pBitmap;
} /* ReadBMP() */
void WriteBMP(const char *name, uint8_t *pData, int width, int height, int bpp)
{
int bsize, lsize, i, iHeaderSize, iBodySize;
uint8_t pBuf[1024]; // holds BMP header
FILE *outfile;
bsize = lsize = ((width * bpp) + 7)/8;
lsize = (lsize + 3) & 0xfffc; // DWORD aligned
iHeaderSize = 54;
iHeaderSize += (1<<(bpp+2));
iBodySize = lsize * height;
i = iBodySize + iHeaderSize; // datasize
memset(pBuf, 0, 54);
pBuf[0] = 'B';
pBuf[1] = 'M';
pBuf[2] = i & 0xff; // 4 bytes of file size
pBuf[3] = (i >> 8) & 0xff;
pBuf[4] = (i >> 16) & 0xff;
pBuf[5] = (i >> 24) & 0xff;
/* Offset to data bits */
pBuf[10] = iHeaderSize & 0xff;
pBuf[11] = (unsigned char)(iHeaderSize >> 8);
pBuf[14] = 0x28;
pBuf[18] = width & 0xff; // xsize low
pBuf[19] = (unsigned char)(width >> 8); // xsize high
i = height; // top down bitmap
pBuf[22] = i & 0xff; // ysize low
pBuf[23] = (unsigned char)(i >> 8); // ysize high
pBuf[24] = 0; //0xff;
pBuf[25] = 0; //0xff;
pBuf[26] = 1; // number of planes
pBuf[28] = (uint8_t)bpp;
pBuf[30] = 0; // uncompressed
i = iBodySize;
pBuf[34] = i & 0xff; // data size
pBuf[35] = (i >> 8) & 0xff;
pBuf[36] = (i >> 16) & 0xff;
pBuf[37] = (i >> 24) & 0xff;
pBuf[54] = pBuf[55] = pBuf[56] = pBuf[57] = pBuf[61] = 0; // palette
pBuf[58] = pBuf[59] = pBuf[60] = 0xff;
outfile = fopen((char *)name, "w+b");
if (outfile)
{
uint8_t *s = pData;
fwrite(pBuf, 1, iHeaderSize, outfile);
for (i=0; i<height; i++)
{
fwrite(s, 1, lsize, outfile);
s += lsize;
}
fclose(outfile);
}
} /* WriteBMP() */
void Threshold_SIMD(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold)
{
int x, y;
uint8_t *s, *d;
int src_pitch, dst_pitch;
uint8x16_t in0, in1, thresh, mask, out, zero;
uint8_t ucMask[] = {128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1};
src_pitch = (width + 3) & 0xfffc; // Windows BMP files need every line to be DWORD aligned
dst_pitch = (width+7)/8;
dst_pitch = (dst_pitch + 3) & 0xfffc; // DWORD align the 1-bpp output
mask = vld1q_u8(ucMask); // load the bit position mask into register variable
thresh = vdupq_n_u8(threshold); // duplicate the pixel threshold value across all 16 lanes of a "Q" register
zero = vdupq_n_u8(0);
for (y=0; y<height; y++) { // loop over the input+output image
s = &pGray[y * src_pitch];
d = &pBitonal[y * dst_pitch];
for (x=0; x<width; x+=32) {
in0 = vld1q_u8(s); // Read 32 grayscale pixels at a time into 2 128-bit SIMD registers
in1 = vld1q_u8(s+16); // we should interleave more loads, but 2 is a good starting point
s += 32; // advance source pointer 32 bytes (32 grayscale pixels)
in0 = vcgeq_u8(in0, thresh); // compare 16 pixels with 16 threshold values
in1 = vcgeq_u8(in1, thresh); // the result will form a new vector of 0's (false) and 1's (true) in each lane
in0 = vandq_u8(in0, mask); // logical AND the comparison vector with our bit position mask
in1 = vandq_u8(in1, mask);
out = vpaddq_u8(in0, in1); // 32->16 - pairwise add 32 pixels/lanes into 16
out = vpaddq_u8(out, zero); // 16->8
out = vpaddq_u8(out, zero); // 8->4
*(uint32_t *)d = vgetq_lane_u32(vreinterpretq_u32_u8 (out), 0); // store 32 bits (1-bpp pixels) at once
d += 4; // advance destination pointer 4 bytes (32-bits)
}
} // for y
} /* Threshold_SIMD() */
void Threshold_Optimized(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold)
{
int x, y, bit, src_index, dst_index;
int src_pitch, dst_pitch;
src_pitch = (width + 3) & 0xfffc;
dst_pitch = (width+7)/8;
dst_pitch = (dst_pitch + 3) & 0xfffc;
for (y=0; y<height; y++) {
for (x=0; x<width; x++) {
src_index = (y * src_pitch + x);
dst_index = (y * dst_pitch + x/8);
bit = 0x80 >> (x & 7); // bits are arranged as MSB first
if (pGray[src_index] > threshold)
pBitonal[dst_index] |= bit;
else
pBitonal[dst_index] &= ~bit;
}
} // for y
} /* Threshold_Optimized() */
void Threshold_Original(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold)
{
int x, y, bit, src_index, dst_index;
int src_pitch, dst_pitch;
src_pitch = (width + 3) & 0xfffc;
dst_pitch = (width+7)/8;
dst_pitch = (dst_pitch + 3) & 0xfffc;
for (y=0; y<height; y++) {
for (x=0; x<width; x++) {
src_index = (y * src_pitch + x);
dst_index = (y * dst_pitch + x/8);
bit = 0x80 >> (x & 7); // bits are arranged as MSB first
if (pGray[src_index] > threshold)
pBitonal[dst_index] |= bit;
else
pBitonal[dst_index] &= ~bit;
}
} // for y
} /* Threshold_Original() */
int main(int argc, const char * argv[])
{
int width, height, out_pitch;
uint8_t *pGray, *pBitonal;
if (argc != 3) {
printf("Usage: threshold <infile> <outfile\n");
return 0;
}
pGray = ReadBMP(argv[1], &width, &height);
if (pGray == NULL)
return -1;
out_pitch = (width + 7)/8;
out_pitch = (out_pitch + 3) & 0xfffc; // DWORD aligned
pBitonal = malloc(out_pitch * height);
for (int i=0; i<1000; i++) {
Threshold_SIMD(pGray, pBitonal, width, height, 127);
// Threshold_Original(pGray, pBitonal, width, height, 127);
}
WriteBMP(argv[2], pBitonal, width, height, 1);
return 0;
} /* main() */
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment