Skip to content

Instantly share code, notes, and snippets.

@ali01
Last active August 29, 2015 14:00
Show Gist options
  • Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.
Save ali01/5b82bc26d0da3d969500 to your computer and use it in GitHub Desktop.
Program to extract the N longest lines of a file
#!/usr/bin/env python
import math
import random
# presently set so that the resulting file size is ~1GB on average
NUM_LINES = int(math.pow(2, 21))
MAX_LINE_LENGTH = int(math.pow(2, 10))
def generate_test_file():
random.seed()
with open('./data.txt', 'a') as f:
for i in range(0, NUM_LINES):
line_length = random.randint(0, MAX_LINE_LENGTH)
line = ''
for j in range(0, line_length):
line += str(unichr(random.randint(97, 122)))
f.write("%s\n" % line)
if __name__ == '__main__':
generate_test_file()
exit(0)
#include <cstring>
#include <deque>
#include <fcntl.h>
#include <iostream>
#include <string>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
using namespace std;
// Datastructure used to keep track of the N longest strings inserted.
class LongestStrQueue {
public:
// Constructor:
// uint32_t cap -- number of strings to keep track of
LongestStrQueue(uint32_t cap) : cap_(cap), min_length_(0) {}
// Inserts a string.
// Runs in linear time with respect to `cap`, However, only strings that are
// longer than min_length_, which is the length of the smallest string,
// currently stored, need be inserted. If the being inserted come from a
// pool of strings whose lengths are uniformly distributed [0, L), then the
// fraction of strings that need to be inserted decreases with time as
// min_length_ approaches L.
void
insert(string str) {
if (str.length() > min_length_) {
deque<string>::iterator it;
for (it = strings_.begin(); it != strings_.end(); ++it) {
if ((*it).length() < str.length())
break;
}
strings_.insert(it, str);
if (strings_.size() > cap_) {
strings_.pop_back();
}
} else if (strings_.size() < cap_) {
strings_.push_back(str);
min_length_ = str.length();
}
}
// Returns a const reference to the list of N.
// longest strings in sorted order (decreasing length).
const deque<string>&
strings() {
return strings_;
}
private:
deque<string> strings_; // list of strings in sorted order (decreasing length)
uint32_t min_length_; //
const uint32_t cap_; // maximum number of strings to keep track of
};
int
longest(const string& filepath, LongestStrQueue& longest_lines) {
int fd = open(filepath.c_str(), O_RDONLY);
// determine the size of the input file
struct stat st;
if (fstat(fd, &st) < 0) {
perror("fstat");
return -1;
}
// mmap input file
void *data = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
if (data == MAP_FAILED) {
perror("mmap");
return -1;
}
// breakdown file into lines and insert them into LongestStrQueue
const char *data_str = (const char *)data;
while (true) {
int index = strcspn(data_str, "\n");
if (data_str[index] == '\0')
break;
string line(data_str, index + 1);
longest_lines.insert(line);
data_str += index + 1;
}
return 0;
}
void
print_usage(const char *program_name) {
printf("usage: %s %s %s\n\n", program_name, "<input_filename>", "<num_lines");
printf("Options:\n");
printf(" -h show this help message and exit\n\n");
}
int
main(int argc, char **argv) {
// parse command line options
int c_i;
while ((c_i = getopt(argc, argv, "h")) != -1) {
switch(c_i) {
case 'h':
case '?':
default:
print_usage(argv[0]);
return 0;
}
}
// parse non-option arguments
// expect two non-option arguments
if (optind + 2 != argc) {
print_usage(argv[0]);
return -1;
}
const string filepath = argv[optind];
const uint32_t num_lines = stoi(argv[++optind]);
// heavy lifting done by longest()
LongestStrQueue longest_lines(num_lines);
longest(filepath, longest_lines);
// output result to stdout
deque<string> strings = longest_lines.strings();
for (deque<string>::iterator it = strings.begin(); it != strings.end(); ++it){
cout << *it;
}
return 0;
}
all:
g++ -O2 -std=c++11 longest.cpp -o longest
run: longest
@./longest
clean:
rm -rf longest
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment