Skip to content

Instantly share code, notes, and snippets.

@arjo129
Last active June 14, 2020 07:21
Show Gist options
  • Save arjo129/ca5569db6dc5434a27aea092299b9b49 to your computer and use it in GitHub Desktop.
Save arjo129/ca5569db6dc5434a27aea092299b9b49 to your computer and use it in GitHub Desktop.
A replacement for the coreutils `shuf` tool for huge files on disk with long (multi-GB) lines.
#define MAX_CAPACITY 10000
#define MIN_BUFFER 1000
#define MAX_LINE 10
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
long getRandomNumber(long max) {
return random()%max;
}
typedef struct ResizeableArray {
long* items;
size_t capacity;
size_t count;
} ResizeableArray;
ResizeableArray init_array() {
ResizeableArray r;
r.capacity = MIN_BUFFER;
r.count = 0;
r.items = (long*)malloc(r.capacity*sizeof(long));
return r;
}
void push(ResizeableArray* array, long value){
array->count++;
if(array->count == array->capacity){
array->capacity *= 2;
array->items = (long*)realloc(array->items, array->capacity*sizeof(long));
}
array->items[array->count-1] = value;
}
void pop(ResizeableArray* array){
if(array->count == 0) return;
array->count--;
}
void shuffle(ResizeableArray* array) {
long i;
for(i = array->count; i > 1; i--){
long p = getRandomNumber(i);
long tmp = array->items[i-1];
array->items[i-1] = array->items[p-1];
array->items[p-1] = tmp;
}
}
void destroy(ResizeableArray* array) {
free(array->items);
}
void init_random() {
time_t t;
srand((unsigned) time(&t));
}
void outputLineAt(FILE* fp, long line_start){
char buffer[MAX_LINE];
size_t length;
memset(buffer, 0, sizeof(buffer));
int err = fseek(fp, line_start+1, SEEK_SET);
fflush(fp);
if (err) {
perror("failed to seek");
exit(-1);
}
do {
char* res = fgets(buffer, sizeof(buffer), fp);
printf("%s", buffer);
length = strlen(buffer);
if(res == NULL) {
return;
}
} while (length > 0 && buffer[length-1] != '\n');
}
int main(int argc, char** argv) {
if (argc < 1) {
printf("Usage: shuff <file name>\n");
return -1;
}
char* infile = argv[1];
FILE* fp = fopen(infile, "rb");
if (fp == NULL) {
perror("Error opening file");
return -1;
}
init_random();
char buffer[MAX_CAPACITY];
long offset;
size_t numbytes;
ResizeableArray array = init_array();
bool lastCharWasLineEnd = false;
push(&array, 0);
//Index the file. Find all points where a new line occurs
do {
offset = ftell(fp);
numbytes = fread(buffer, sizeof(*buffer), sizeof(buffer), fp);
for(size_t i = 0; i < numbytes; i++) {
if(buffer[i] == '\n') {
push(&array, offset+i);
}
}
} while(numbytes==sizeof(buffer));
//Shuffle the lines
shuffle(&array);
//Perform IO
for (size_t i = 0; i < array.count; i++) {
long line_start = array.items[i];
outputLineAt(fp, line_start);
}
fclose(fp);
//destroy(&array);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment