Skip to content

Instantly share code, notes, and snippets.

@binary132
Created April 29, 2013 22:11
Show Gist options
  • Save binary132/5485217 to your computer and use it in GitHub Desktop.
Save binary132/5485217 to your computer and use it in GitHub Desktop.
Display the 100 most common lines from STDIN. Handles unlimited input (external sort).
#!/usr/local/bin/perl
use strict;
use warnings;
# JBS 2013
# External sort:
# read lines (possibly billions of lines) from standard input
# and print out the 100 most common lines.
#
# $max_file_size caps the number of DISTINCT lines buffered in memory
# before a frequency-sorted chunk is flushed to a numbered .dat file.
# Reference values for tuning:
#   8388608   = 8Mi
#   134217728 = 128Mi
# Current setting: 131072 = 128Ki distinct lines per chunk.
my $max_file_size = 131072;
# Write one chunk of line counts to "<id>.dat", sorted by frequency,
# descending, so the merge phase can read each chunk most-common-first.
# Records are pairs of lines: the string, then its count.
sub _write_chunk {
    my ($file_id, $lines_ref) = @_;
    my $path = $file_id . '.dat';
    open my $outfile, '>', $path
        or die "Failed to open > $path: $!";

    for my $line (sort { $lines_ref->{$b} <=> $lines_ref->{$a} }
                  keys %$lines_ref) {
        print $outfile "$line\n$lines_ref->{$line}\n";
    }

    # Buffered write errors (e.g. disk full) only surface at close,
    # so the close on a write handle must be checked.
    close $outfile or die "Failed to close $path: $!";
    return;
}

# Read lines from <> (STDIN or the files named in @ARGV), counting
# occurrences in a hash.  Whenever the hash reaches $max_file_size
# distinct keys, flush it to the next numbered chunk file and start
# over, so memory stays bounded regardless of input size.
sub make_sorted_chunk_files {
    my %lines;
    my $file_id = 0;

    while (my $line = <>) {
        chomp $line;
        $lines{$line}++;

        if (keys %lines >= $max_file_size) {
            _write_chunk($file_id++, \%lines);
            %lines = ();
        }
    }

    # Flush whatever is left after EOF.
    _write_chunk($file_id++, \%lines) if keys %lines;
    return;
}
# Merge the frequency-sorted *.dat chunk files in the current directory
# and return (up to) the $top_n most frequent lines, most frequent
# first.  $top_n is optional and defaults to 100 for backward
# compatibility.  Each chunk file holds records of two lines: the
# string, then its count.
sub sort_chunks_top100 {
    my ($top_n) = @_;
    $top_n = 100 unless defined $top_n;

    my @files = <*.dat>;

    # Parallel arrays: one open handle plus its current head record per
    # still-live chunk stream.  Empty chunks are skipped immediately.
    my (@filehandles, @instrings, @frequencies);
    for my $file (@files) {
        open my $fh, '<', $file
            or die "Failed to open < $file: $!";
        my $line = readline $fh;
        my $freq = readline $fh;
        unless (defined $line && defined $freq) {
            close $fh;
            next;
        }
        chomp $line;
        chomp $freq;
        push @filehandles, $fh;
        push @instrings,   $line;
        push @frequencies, $freq;
    }

    my %most_frequent_lines;

    # Repeatedly take the highest-frequency head record among the live
    # streams.  Stop once $top_n distinct lines are collected or every
    # stream is exhausted (the original looped forever on short input).
    while (@filehandles && keys %most_frequent_lines < $top_n) {
        my $largest_index = 0;
        for my $i (1 .. $#frequencies) {
            $largest_index = $i
                if $frequencies[$i] > $frequencies[$largest_index];
        }

        # NOTE(review): a line split across several chunk files only has
        # the partial counts seen so far summed here, so near the
        # $top_n boundary the reported counts/order can be inexact --
        # this looks like the mis-ordering the author observed.  An
        # exact merge would need to drain every occurrence of a line
        # before ranking it; confirm before relying on exact ranks.
        $most_frequent_lines{ $instrings[$largest_index] }
            += $frequencies[$largest_index];

        # Advance the stream we just consumed from; drop it entirely
        # once it runs dry instead of chomping undef forever.
        my $next_line = readline $filehandles[$largest_index];
        my $next_freq = readline $filehandles[$largest_index];
        if (defined $next_line && defined $next_freq) {
            chomp $next_line;
            chomp $next_freq;
            $instrings[$largest_index]   = $next_line;
            $frequencies[$largest_index] = $next_freq;
        }
        else {
            close $filehandles[$largest_index];
            splice @filehandles, $largest_index, 1;
            splice @instrings,   $largest_index, 1;
            splice @frequencies, $largest_index, 1;
        }
    }

    close $_ for @filehandles;

    return sort { $most_frequent_lines{$b} <=> $most_frequent_lines{$a} }
        keys %most_frequent_lines;
}
# Phase 1: chunk the input into frequency-sorted .dat files.
make_sorted_chunk_files();

# Phase 2: merge the chunks and print the top 100 lines, one per line
# (no trailing newline, matching the original output exactly).
print join("\n", sort_chunks_top100());
@binary132
Copy link
Author

Note: the script may not work perfectly. The order of the top 100 lines sometimes comes out incorrect, even though they are sorted correctly at the point where they are returned from the function; I am not sure why. Otherwise, I believe it works correctly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment