Skip to content

Instantly share code, notes, and snippets.

@grassa
Created July 15, 2010 14:46
Show Gist options
  • Save grassa/477033 to your computer and use it in GitHub Desktop.
Save grassa/477033 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
# Given properly formatted herbarium records, this script will produce two
# datasets of equal size for two time periods. From these, for each species,
# the mean value of each time period is calculated. The script outputs a
# tab-delimited table of columns for species, mean value of the first time
# period, and the mean value of the second time period. This table can be used
# to perform classical statistical tests in R. Justification for the technique
# is provided below.
# "When working with herbarium specimens, which were usually not sampled in a
# sytematic way, special consideration has to be given to potential biases
# arising out of different sampling efforts. To account for different sampling
# efforts in the two time periods, we applied a random subsampling procedure
# similiar to Warren et al. (2001) so that equal numbers of records were
# obtained in each 100 m band..."
# -Bergamini et al. (2009)
# "To equalize recorder effort, we subsampled the 1995-99 data by randomly
# selecting the 1970-82 number of records from the 1995-99 data, subsampling
# separately for each 100-km Ordnance Survey grid square to retain the broad
# geographical distribution of 1970-82 records."
# -Warren et al. (2001)
# open up our gbif/climate dataset. the in and out filenames are passed as arguments when the script is executed.
my $in = shift;
my $out = shift;
open IN, "<", $in;
open OUT, ">", $out;
# bin boundry dates are inclusive.
# these dates make up the "before climate change" date range.
my $bin1begin = 1900;
my $bin1end = 1970;
# these dates make up the "after climate change" date range.
my $bin2begin = 1985;
my $bin2end = 2010;
# these hashes will contain our data from the two time periods.
my %bin1;
my %bin2;
# for each record in our dataset.
while ($in = <IN>)
{
my $fileline = "$in";
chomp $fileline;
# extract pertinent data from pur records. we could choose to analyze some variable other than elevation (latitude?) if we wanted to.
my @line = split /\t/,$fileline;
my $species = $line[0];
my $year = $line[1];
my $ele = $line[4];
# reformat elevation to a five digit integer, with leading zeros if need be.
$ele = sprintf("%05d", "$ele");
# here we remove the last two digits of the elevation. this number classifies the record into a 100 meter "band."
my $band = "$ele";
chop $band;
chop $band;
# the dataset will be separated into two time period bins (before and after climate change).
# each bin will contain its records assigned to elevation bands. we will use these bands when subsampling later.
# two hashes of arrays are used to map this data structure.
if (($year >= $bin1begin) and ($year <= $bin1end))
# does the record belong in the "before(%bin1)" time period?
{
# if so, take the record($fileline) and push it onto the @array keyed by its $band.
push(@{ $bin1{"$band"}}, "$fileline");
}
elsif (($year >= $bin2begin) and ($year <= $bin2end))
# does the record belong in the "after(%bin2)" time period?
{
# if so, take the record($fileline) and push it onto the @array keyed by its $band.
push(@{ $bin2{"$band"}}, "$fileline");
}
}
# now we have our dataset seperated into two time periods, each of which is divided by elevation.
# this will allow us to subsample from the larger bin while simultaneously avoiding bias from sampling at higher elevations in later years.
#these arrays will hold our before and after binned datasets of equal size.
my @before;
my @after;
#this array will be used to temporarily hold the dataset subsampled from the larger bin for each band.
my @subsample;
# there aren't many records in the upper bands, but i go through all bands that are earthly possible (mount everest is a little less than 8900 meters above sea level), to be conservative.
my $n = 0;
while ($n<90)
{
# tempororary arrays hold the data extracted from the hashes for each band while we're doing comparisons and subsampling.
my @band1;
my @band2;
# here i generate the three digit key corresponding to the elevation bands.
my $nnn = sprintf("%03d", "$n");
# are there records in the current band for each bin?
if (exists $bin1{"$nnn"})
{
# if so, take them out, and put them in the temporary arrays.
@band1 = @{ $bin1{"$nnn"}};
}
if (exists $bin2{"$nnn"})
{
@band2 = @{ $bin2{"$nnn"}};
}
# formally evaluate the size of each array.
my $count1 = scalar @band1;
my $count2 = scalar @band2;
# then compare, as we subsample from the larger of the two.
if ($count1<$count2)
{
# we can keep the smaller of the two elevation bands, so we just append it to our final array.
@before = (@before, @band1);
# this subroutine mathematically permutes the larger array in place.
FISHER_YATES_SHUFFLE( \@band2 );
# now subsample by taking a slice from the shuffled larger array that is equal in size to the smaller array.
@subsample = @band2[1..$count1];
# append the subsample to the final array.
@after = (@after,@subsample);
}
elsif ($count1>$count2)
{
@after = (@after, @band2);
FISHER_YATES_SHUFFLE( \@band1 );
@subsample = @band1[1..$count2];
@before = (@before,@subsample);
}
# no subsampling is needed if the elevation bands are of equal size in both bins.
else
{
@before = (@before, @band1);
@after = (@after, @band2);
}
# increment to the next band.
++$n;
}
# now we have two datasets of equal size with approximately equal distributions of elevation.
# what is their size?
my $count = scalar @before;
# we will be comparing the mean elevevation for each species. i again chose to use hashes of arrays.
my %species_before;
my %species_after;
# i keep a running list of each species encountered
my @specieslist;
# for all the record lines
for(1..$count)
{
# take a record from each dataset.
my $beforeline= pop @before;
my $afterline= pop @after;
# split the fields.
my @before_data = split /\t/,$beforeline;
my @after_data = split /\t/,$afterline;
# extract the species name,
my $before_species_key = $before_data[0];
my $after_species_key = $after_data[0];
# and the elevation datum.
my $before_species_value = $before_data[4];
my $after_species_value = $after_data[4];
# push the elevation datum($before_species_value) onto an @array keyed by the species it belongs to($before_species_key) in its proper time period hash(%species_before).
push(@{ $species_before{"$before_species_key"}}, "$before_species_value");
push(@{ $species_after{"$after_species_key"}}, "$after_species_value");
# push both species names onto the running list of those encountered.
push @specieslist, ("$before_species_key","$after_species_key");
}
# get an array of unique species names. i didn't write the next four lines, but i did test them.
my @unique_specieslist;
# declare a hash.
my %saw;
# make sure it's undefined.
undef %saw;
# our unique species list will consist of the first encounter of each species as we iterate through the list of all species encountered in the previous block.
@unique_specieslist = grep(!$saw{$_}++, @specieslist);
# for each species in our dataset, we'll calculate its mean elevation before climate change, and after.
foreach (@unique_specieslist)
{
# i like to name the current string.
my $species = "$_";
# these arrays will temporarily hold the elevations extracted from the hashes for the current species.
my @beforevalues;
my @aftervalues;
# these strings will hold the mean elevation.
my $beforemean;
my $aftermean;
# if the species is represented for the time period,
if (exists $species_before{"$species"})
{
# extract its elevation data from the hash.
@beforevalues = @{ $species_before{"$species"}};
}
if (exists $species_after{"$species"})
{
@aftervalues = @{ $species_after{"$species"}};
}
# how many records do we have in each time period?
my $count_before = scalar @beforevalues;
my $count_after = scalar @aftervalues;
# we will analyze species with at least 12 records in each time period.
if (($count_before>11) and ($count_after>11))
{
# get the mean elevation before and after climate change. we pass the array containing them to a subroutine.
$beforemean = MEAN(\@beforevalues);
$aftermean = MEAN(\@aftervalues);
# print the species name and both means.
print OUT "$species\t$beforemean\t$aftermean\n";
}
}
# FISHER_YATES_SHUFFLE( \@array ) : generate a random permutation of @array in place.
# this code is from the Perl Cookbook ch 4.17; the algorithm's literal description was read from Wikipedia.
# the fisher yates shuffle is unbiased, so that every permutation is equally likely.
sub FISHER_YATES_SHUFFLE
{
# take the passed array.
my $array = shift;
my $i;
# for i, from the array size down to one,
for ($i = @$array; --$i; )
{
# obtain a random integer, j that is 0<=j<=i.
my $j = int rand ($i+1);
# equivalent swaps do nothing. wikipedia implies that this line may be waste.
next if $i == $j;
# exchange array element i for j.
@$array[$i,$j] = @$array[$j,$i];
}
}
# calculate the mean of an array of numbers.
sub MEAN
{
# make sure the subroutine is passed a defined array.
@_ == 1 or die ('Sub usage: $mean = mean(\@array);');
# take the passed array.
my ($array_ref) = @_;
my $sum;
# get its size.
my $count = scalar @$array_ref;
# sum the values contained in the array
foreach (@$array_ref)
{
$sum += $_;
}
# divide the array's sum by its size to return its mean.
return $sum / $count;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment