Last active
April 28, 2021 06:07
-
-
Save bitbank2/0c4cfe12c58ae7b605bbeccbf93b2d36 to your computer and use it in GitHub Desktop.
C Optimization demo project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// | |
// main.c | |
// perf_demo | |
// | |
// Created by Laurence Bank on 3/24/21. | |
// | |
#include <stdio.h> | |
#include <string.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <arm_neon.h> | |
uint8_t * ReadBMP(const char *fname, int *width, int *height) | |
{ | |
int y, w, h, offset; | |
uint8_t *s, *d, *pBitmap; | |
int pitch; | |
int iSize, iDelta; | |
FILE *infile; | |
infile = fopen(fname, "r+b"); | |
if (infile == NULL) { | |
printf("Error opening input file %s\n", fname); | |
return NULL; | |
} | |
// Read the bitmap into RAM | |
fseek(infile, 0, SEEK_END); | |
iSize = (int)ftell(infile); | |
fseek(infile, 0, SEEK_SET); | |
pBitmap = (uint8_t *)malloc(iSize); | |
fread(pBitmap, 1, iSize, infile); | |
fclose(infile); | |
if (pBitmap[0] != 'B' || pBitmap[1] != 'M' || pBitmap[14] < 0x28) { | |
free(pBitmap); | |
printf("Not a Windows BMP file!\n"); | |
return NULL; | |
} | |
w = *(int32_t *)&pBitmap[18]; | |
h = *(int32_t *)&pBitmap[22]; | |
offset = *(int32_t *)&pBitmap[10]; // offset to bits | |
pitch = (w + 3) & 0xfffc; // DWORD aligned | |
// move up the pixels | |
d = pBitmap; | |
s = &pBitmap[offset]; | |
iDelta = pitch; | |
if (h < 0) { | |
iDelta = -pitch; | |
s = &pBitmap[offset + (h-1) * pitch]; | |
} | |
for (y=0; y<h; y++) { | |
memcpy(d, s, pitch); | |
d += pitch; | |
s += iDelta; | |
} | |
*width = w; | |
*height = h; | |
return pBitmap; | |
} /* ReadBMP() */ | |
void WriteBMP(const char *name, uint8_t *pData, int width, int height, int bpp) | |
{ | |
int bsize, lsize, i, iHeaderSize, iBodySize; | |
uint8_t pBuf[1024]; // holds BMP header | |
FILE *outfile; | |
bsize = lsize = ((width * bpp) + 7)/8; | |
lsize = (lsize + 3) & 0xfffc; // DWORD aligned | |
iHeaderSize = 54; | |
iHeaderSize += (1<<(bpp+2)); | |
iBodySize = lsize * height; | |
i = iBodySize + iHeaderSize; // datasize | |
memset(pBuf, 0, 54); | |
pBuf[0] = 'B'; | |
pBuf[1] = 'M'; | |
pBuf[2] = i & 0xff; // 4 bytes of file size | |
pBuf[3] = (i >> 8) & 0xff; | |
pBuf[4] = (i >> 16) & 0xff; | |
pBuf[5] = (i >> 24) & 0xff; | |
/* Offset to data bits */ | |
pBuf[10] = iHeaderSize & 0xff; | |
pBuf[11] = (unsigned char)(iHeaderSize >> 8); | |
pBuf[14] = 0x28; | |
pBuf[18] = width & 0xff; // xsize low | |
pBuf[19] = (unsigned char)(width >> 8); // xsize high | |
i = height; // top down bitmap | |
pBuf[22] = i & 0xff; // ysize low | |
pBuf[23] = (unsigned char)(i >> 8); // ysize high | |
pBuf[24] = 0; //0xff; | |
pBuf[25] = 0; //0xff; | |
pBuf[26] = 1; // number of planes | |
pBuf[28] = (uint8_t)bpp; | |
pBuf[30] = 0; // uncompressed | |
i = iBodySize; | |
pBuf[34] = i & 0xff; // data size | |
pBuf[35] = (i >> 8) & 0xff; | |
pBuf[36] = (i >> 16) & 0xff; | |
pBuf[37] = (i >> 24) & 0xff; | |
pBuf[54] = pBuf[55] = pBuf[56] = pBuf[57] = pBuf[61] = 0; // palette | |
pBuf[58] = pBuf[59] = pBuf[60] = 0xff; | |
outfile = fopen((char *)name, "w+b"); | |
if (outfile) | |
{ | |
uint8_t *s = pData; | |
fwrite(pBuf, 1, iHeaderSize, outfile); | |
for (i=0; i<height; i++) | |
{ | |
fwrite(s, 1, lsize, outfile); | |
s += lsize; | |
} | |
fclose(outfile); | |
} | |
} /* WriteBMP() */ | |
void Threshold_SIMD(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold) | |
{ | |
int x, y; | |
uint8_t *s, *d; | |
int src_pitch, dst_pitch; | |
uint8x16_t in0, in1, thresh, mask, out, zero; | |
uint8_t ucMask[] = {128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1}; | |
src_pitch = (width + 3) & 0xfffc; // Windows BMP files need every line to be DWORD aligned | |
dst_pitch = (width+7)/8; | |
dst_pitch = (dst_pitch + 3) & 0xfffc; // DWORD align the 1-bpp output | |
mask = vld1q_u8(ucMask); // load the bit position mask into register variable | |
thresh = vdupq_n_u8(threshold); // duplicate the pixel threshold value across all 16 lanes of a "Q" register | |
zero = vdupq_n_u8(0); | |
for (y=0; y<height; y++) { // loop over the input+output image | |
s = &pGray[y * src_pitch]; | |
d = &pBitonal[y * dst_pitch]; | |
for (x=0; x<width; x+=32) { | |
in0 = vld1q_u8(s); // Read 32 grayscale pixels at a time into 2 128-bit SIMD registers | |
in1 = vld1q_u8(s+16); // we should interleave more loads, but 2 is a good starting point | |
s += 32; // advance source pointer 32 bytes (32 grayscale pixels) | |
in0 = vcgeq_u8(in0, thresh); // compare 16 pixels with 16 threshold values | |
in1 = vcgeq_u8(in1, thresh); // the result will form a new vector of 0's (false) and 1's (true) in each lane | |
in0 = vandq_u8(in0, mask); // logical AND the comparison vector with our bit position mask | |
in1 = vandq_u8(in1, mask); | |
out = vpaddq_u8(in0, in1); // 32->16 - pairwise add 32 pixels/lanes into 16 | |
out = vpaddq_u8(out, zero); // 16->8 | |
out = vpaddq_u8(out, zero); // 8->4 | |
*(uint32_t *)d = vgetq_lane_u32(vreinterpretq_u32_u8 (out), 0); // store 32 bits (1-bpp pixels) at once | |
d += 4; // advance destination pointer 4 bytes (32-bits) | |
} | |
} // for y | |
} /* Threshold_SIMD() */ | |
void Threshold_Optimized(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold) | |
{ | |
int x, y, bit, src_index, dst_index; | |
int src_pitch, dst_pitch; | |
src_pitch = (width + 3) & 0xfffc; | |
dst_pitch = (width+7)/8; | |
dst_pitch = (dst_pitch + 3) & 0xfffc; | |
for (y=0; y<height; y++) { | |
for (x=0; x<width; x++) { | |
src_index = (y * src_pitch + x); | |
dst_index = (y * dst_pitch + x/8); | |
bit = 0x80 >> (x & 7); // bits are arranged as MSB first | |
if (pGray[src_index] > threshold) | |
pBitonal[dst_index] |= bit; | |
else | |
pBitonal[dst_index] &= ~bit; | |
} | |
} // for y | |
} /* Threshold_Optimized() */ | |
void Threshold_Original(uint8_t *pGray, uint8_t *pBitonal, int width, int height, uint8_t threshold) | |
{ | |
int x, y, bit, src_index, dst_index; | |
int src_pitch, dst_pitch; | |
src_pitch = (width + 3) & 0xfffc; | |
dst_pitch = (width+7)/8; | |
dst_pitch = (dst_pitch + 3) & 0xfffc; | |
for (y=0; y<height; y++) { | |
for (x=0; x<width; x++) { | |
src_index = (y * src_pitch + x); | |
dst_index = (y * dst_pitch + x/8); | |
bit = 0x80 >> (x & 7); // bits are arranged as MSB first | |
if (pGray[src_index] > threshold) | |
pBitonal[dst_index] |= bit; | |
else | |
pBitonal[dst_index] &= ~bit; | |
} | |
} // for y | |
} /* Threshold_Original() */ | |
int main(int argc, const char * argv[]) | |
{ | |
int width, height, out_pitch; | |
uint8_t *pGray, *pBitonal; | |
if (argc != 3) { | |
printf("Usage: threshold <infile> <outfile\n"); | |
return 0; | |
} | |
pGray = ReadBMP(argv[1], &width, &height); | |
if (pGray == NULL) | |
return -1; | |
out_pitch = (width + 7)/8; | |
out_pitch = (out_pitch + 3) & 0xfffc; // DWORD aligned | |
pBitonal = malloc(out_pitch * height); | |
for (int i=0; i<1000; i++) { | |
Threshold_SIMD(pGray, pBitonal, width, height, 127); | |
// Threshold_Original(pGray, pBitonal, width, height, 127); | |
} | |
WriteBMP(argv[2], pBitonal, width, height, 1); | |
return 0; | |
} /* main() */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment