Last active
June 14, 2020 07:21
-
-
Save arjo129/ca5569db6dc5434a27aea092299b9b49 to your computer and use it in GitHub Desktop.
A replacement for the coreutils `shuf` tool for huge files on disk with long (multi-GB) lines.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define MAX_CAPACITY 10000 | |
#define MIN_BUFFER 1000 | |
#define MAX_LINE 10 | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdbool.h> | |
#include <string.h> | |
long getRandomNumber(long max) { | |
return random()%max; | |
} | |
typedef struct ResizeableArray { | |
long* items; | |
size_t capacity; | |
size_t count; | |
} ResizeableArray; | |
ResizeableArray init_array() { | |
ResizeableArray r; | |
r.capacity = MIN_BUFFER; | |
r.count = 0; | |
r.items = (long*)malloc(r.capacity*sizeof(long)); | |
return r; | |
} | |
void push(ResizeableArray* array, long value){ | |
array->count++; | |
if(array->count == array->capacity){ | |
array->capacity *= 2; | |
array->items = (long*)realloc(array->items, array->capacity*sizeof(long)); | |
} | |
array->items[array->count-1] = value; | |
} | |
void pop(ResizeableArray* array){ | |
if(array->count == 0) return; | |
array->count--; | |
} | |
void shuffle(ResizeableArray* array) { | |
long i; | |
for(i = array->count; i > 1; i--){ | |
long p = getRandomNumber(i); | |
long tmp = array->items[i-1]; | |
array->items[i-1] = array->items[p-1]; | |
array->items[p-1] = tmp; | |
} | |
} | |
void destroy(ResizeableArray* array) { | |
free(array->items); | |
} | |
void init_random() { | |
time_t t; | |
srand((unsigned) time(&t)); | |
} | |
void outputLineAt(FILE* fp, long line_start){ | |
char buffer[MAX_LINE]; | |
size_t length; | |
memset(buffer, 0, sizeof(buffer)); | |
int err = fseek(fp, line_start+1, SEEK_SET); | |
fflush(fp); | |
if (err) { | |
perror("failed to seek"); | |
exit(-1); | |
} | |
do { | |
char* res = fgets(buffer, sizeof(buffer), fp); | |
printf("%s", buffer); | |
length = strlen(buffer); | |
if(res == NULL) { | |
return; | |
} | |
} while (length > 0 && buffer[length-1] != '\n'); | |
} | |
int main(int argc, char** argv) { | |
if (argc < 1) { | |
printf("Usage: shuff <file name>\n"); | |
return -1; | |
} | |
char* infile = argv[1]; | |
FILE* fp = fopen(infile, "rb"); | |
if (fp == NULL) { | |
perror("Error opening file"); | |
return -1; | |
} | |
init_random(); | |
char buffer[MAX_CAPACITY]; | |
long offset; | |
size_t numbytes; | |
ResizeableArray array = init_array(); | |
bool lastCharWasLineEnd = false; | |
push(&array, 0); | |
//Index the file. Find all points where a new line occurs | |
do { | |
offset = ftell(fp); | |
numbytes = fread(buffer, sizeof(*buffer), sizeof(buffer), fp); | |
for(size_t i = 0; i < numbytes; i++) { | |
if(buffer[i] == '\n') { | |
push(&array, offset+i); | |
} | |
} | |
} while(numbytes==sizeof(buffer)); | |
//Shuffle the lines | |
shuffle(&array); | |
//Perform IO | |
for (size_t i = 0; i < array.count; i++) { | |
long line_start = array.items[i]; | |
outputLineAt(fp, line_start); | |
} | |
fclose(fp); | |
//destroy(&array); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment