Skip to content

Instantly share code, notes, and snippets.

@fxn
Created May 20, 2011 23:03
Show Gist options
  • Save fxn/983977 to your computer and use it in GitHub Desktop.
Save fxn/983977 to your computer and use it in GitHub Desktop.
Computes the ancestry path of GeoPlanet places
use strict;
use warnings;
# Input: the original Yahoo! GeoPlanet places TSV.
use constant GPP => 'geoplanet_places_7.6.0.tsv';
# Output: the same TSV with one extra ancestry column appended.
use constant ANC => 'geoplanet_places_with_ancestry_7.6.0.tsv';
# Maps each woeid to an array reference holding its ancestor list.
#
# The input has about 5.7 million records, so this script is
# optimized for speed and memory consumption. In particular, this
# hash is shared by all the subroutines below and is modified in place.
my %ancestors;
# Initializes %ancestors with woeid -> [parent_id] for every data
# record of the GPP file.
#
# Takes no arguments and returns nothing. Dies if the file cannot be
# opened or closed.
sub initialize_ancestors {
    my $path = GPP;

    # Three-argument open: the file name is never parsed for a mode.
    open my $gpp_fh, '<', $path or die "Cannot open $path: $!";

    while (my $line = <$gpp_fh>) {
        next unless $line =~ /^\d/;    # data records start with a woeid
        chomp $line;

        # The first column is the woeid and the last one is taken as the
        # parent id. NOTE: split drops trailing empty fields, so this
        # relies on the last column never being empty.
        my @fields = split /\t/, $line;
        my ($woeid, $parent_id) = @fields[0, -1];

        # A one-element list marks a branch that is not computed yet.
        $ancestors{$woeid} = [$parent_id];
    }

    close $gpp_fh or die "Cannot close $path: $!";
    return;
}
# Once %ancestors is initialized, this subroutine returns a list that
# starts with $start and continues with all its ancestors up to the
# root Earth node (woeid 1).
#
# Argument: $start, a woeid.
# Returns:  a list of woeids, nearest first. The list is empty when
#           $start is 0 or undefined, and it is just ($start) when
#           $start is the root or has no entry in %ancestors.
#
# Note that the code leverages branches already computed, since they
# are also stored in %ancestors: an entry with more than one element
# is a complete branch, whereas a one-element entry is just the parent
# id stored by initialize_ancestors.
sub branch_up_to_earth {
    my $start = shift;

    # Nothing to climb from woeid 0. An undefined id is handled the
    # same way, but without the "uninitialized" warning == would emit.
    return () if !defined $start || $start == 0;

    # The root is its own, complete branch.
    return ($start) if $start == 1;

    # A parent id may point to a woeid that has no record of its own.
    # Dereferencing the missing entry would die under strict refs, so
    # the branch simply ends at that woeid.
    my $known = $ancestors{$start};
    return ($start) unless defined $known;

    # More than one element: the branch was already computed, reuse it.
    return ($start, @$known) if @$known > 1;

    # Otherwise keep climbing through the parent.
    return ($start, branch_up_to_earth($known->[0]));
}
# Output the original TSV plus the ancestry for each record.
#
# Reads GPP and writes ANC. The header line gets an extra "Ancestry"
# column, and every line that starts with a woeid gets its ancestry
# appended, written root first and joined with '/'. Lines after the
# header that do not start with a digit are not copied.
#
# %ancestors must be initialized beforehand. Each computed branch is
# stored back into %ancestors so that later records can reuse it.
#
# Takes no arguments and returns nothing. Dies if a file cannot be
# opened or closed.
sub append_ancestry_to_tsv {
    my ($in_path, $out_path) = (GPP, ANC);

    open my $gpp_fh, '<', $in_path  or die "Cannot open $in_path: $!";
    open my $anc_fh, '>', $out_path or die "Cannot open $out_path: $!";

    my $header = <$gpp_fh>;
    chomp $header;
    print $anc_fh "$header\tAncestry\n";

    while (my $line = <$gpp_fh>) {
        # Capture the woeid explicitly rather than reading $&, which
        # slows down every regex match of the program on perls older
        # than 5.20.
        next unless $line =~ /^(\d+)/;
        my $woeid = $1;

        chomp $line;
        print $anc_fh "$line\t";

        # Replace the [parent_id] entry with the full branch.
        $ancestors{$woeid} = [branch_up_to_earth($ancestors{$woeid}[0])];
        print $anc_fh '""', join('/', reverse(@{$ancestors{$woeid}})), '""' if @{$ancestors{$woeid}};
        print $anc_fh "\n";
    }

    close $gpp_fh or die "Cannot close $in_path: $!";

    # Buffered write errors only surface when the handle is closed.
    close $anc_fh or die "Cannot close $out_path: $!";
    return;
}
# Entry point: load the parent of every woeid, then write the TSV with
# the extra ancestry column.
initialize_ancestors();
append_ancestry_to_tsv();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment