Skip to content

Instantly share code, notes, and snippets.

@ivan-krukov
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ivan-krukov/64a3cbe98f997779e856 to your computer and use it in GitHub Desktop.
Save ivan-krukov/64a3cbe98f997779e856 to your computer and use it in GitHub Desktop.
Memory-efficient median calculation
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
/**
* Median calculation
* This program takes an input file with one number per line and calculates mean and median.
* There is also an option to skip K first lines
* The input file is memory-mapped and we use an on-line algorithm to calculate the median.
* Thus, there is no need to sort the complete array. This is convenient for large inputs.
* The algorithm is by Torben Mogensen, with the discussion available here:
* http://ndevilla.free.fr/median/median/index.html
*
* USAGE:
* gcc median.c -o median
* ./median LARGE_INPUT_FILE <LINES_TO_SKIP>
*
* AUTHORS:
* Ivan Kryukov
* N. Devillard
* Torben Mogensen
*
*/
unsigned long file_size(int fd)
{
struct stat s;
int status = fstat(fd, &s);
return s.st_size;
}
float get_number (char *data, unsigned long i, unsigned long c)
{
size_t chunk_size = i - c;
char chunk [chunk_size];
chunk[chunk_size] = '\0';
memcpy(chunk, &data[c], chunk_size);
return atof(chunk);
}
int main(int argc, char const *argv[])
{
//Open file for reading - no error checking
int fd = open (argv[1], O_RDONLY);
//Initialize bytes to the file size
unsigned long bytes = file_size(fd);
printf("File size: %lu\n", bytes);
//Memory-map the file
char *data = (char *) mmap (0, bytes, PROT_READ, MAP_PRIVATE, fd, 0);
printf("Memory-mapped the file\n");
unsigned long i, less, greater, equal, c = 0;
//Number of lines
unsigned long N = 1;
//Median calculation variables
float min, max, guess, maxltguess, mingtguess, median;
float sum = 0;
//Skip argv[2] first lines
if (argv[2]) {
int k;
for (k = 0; k < atoi(argv[2]); ++k) {
i = strchr(data, '\n') - data;
data += i+1;
}
}
//Get the first element of the file
i = strchr(data,'\n') - data;
float zeroth = get_number(data,i,0);
min = zeroth;
max = zeroth;
c = i + 1;
printf("Looking for min and max\n");
for (i = c; i <= bytes; i++) {
if (data[i] == '\n' || i == bytes) { //here we want end of line or one past the end of file (last element)
float n = get_number(data,i,c);
sum += n;
//Find min
if (n < min) {
min = n;
}
//Find max
if (n > max) {
max = n;
}
//advance pointer
c = i+1;
//Line count
N++;
}
}
//Calculate mean
float mean = sum / (float)(N);
//Print summary statistics
printf("min: %f, max: %f, data points: %lu\n", min, max, N);
printf("Mean: %f\n", mean);
/*
* The following code is public domain.
* Algorithm by Torben Mogensen, implementation by N. Devillard, modified for mmap by iK.
* This code in public domain.
* http://ndevilla.free.fr/median/median/index.html
*/
printf("Looking for median\n");
while (1) {
guess = (min+max)/2;
less = 0; greater = 0; equal = 0;
maxltguess = min;
mingtguess = max;
c = 0;
for (i = 0; i <= bytes; i++) {
if (data[i] == '\n' || i == bytes) {
float n = get_number(data,i,c);
if (n < guess) {
less++;
if (n > maxltguess) {
maxltguess = n;
}
} else if (n > guess) {
greater++;
if (n < mingtguess) {
mingtguess = n;
}
} else {
equal++;
}
c = i + 1;
}
}
if (less <= (N+1)/2 && greater <= (N+1)/2) {
break;
}
else if (less>greater) {
max = maxltguess;
}
else {
min = mingtguess;
}
}
if (less >= (N+1)/2) {
median = maxltguess;
}
else if (less+equal >= (N+1)/2) {
median = guess;
}
else {
median = mingtguess;
}
printf("Median: %f\n",median);
munmap(data,N);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment