Created
July 15, 2010 14:46
-
-
Save grassa/477033 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
# Given properly formatted herbarium records, this script will produce two | |
# datasets of equal size for two time periods. From these, for each species, | |
# the mean value of each time period is calculated. The script outputs a | |
# tab-delimited table of columns for species, mean value of the first time | |
# period, and the mean value of the second time period. This table can be used | |
# to perform classical statistical tests in R. Justification for the technique | |
# is provided below. | |
# "When working with herbarium specimens, which were usually not sampled in a | |
# sytematic way, special consideration has to be given to potential biases | |
# arising out of different sampling efforts. To account for different sampling | |
# efforts in the two time periods, we applied a random subsampling procedure | |
# similiar to Warren et al. (2001) so that equal numbers of records were | |
# obtained in each 100 m band..." | |
# -Bergamini et al. (2009) | |
# "To equalize recorder effort, we subsampled the 1995-99 data by randomly | |
# selecting the 1970-82 number of records from the 1995-99 data, subsampling | |
# separately for each 100-km Ordnance Survey grid square to retain the broad | |
# geographical distribution of 1970-82 records." | |
# -Warren et al. (2001) | |
# open up our gbif/climate dataset. the in and out filenames are passed as arguments when the script is executed. | |
my $in = shift; | |
my $out = shift; | |
open IN, "<", $in; | |
open OUT, ">", $out; | |
# bin boundry dates are inclusive. | |
# these dates make up the "before climate change" date range. | |
my $bin1begin = 1900; | |
my $bin1end = 1970; | |
# these dates make up the "after climate change" date range. | |
my $bin2begin = 1985; | |
my $bin2end = 2010; | |
# these hashes will contain our data from the two time periods. | |
my %bin1; | |
my %bin2; | |
# for each record in our dataset. | |
while ($in = <IN>) | |
{ | |
my $fileline = "$in"; | |
chomp $fileline; | |
# extract pertinent data from pur records. we could choose to analyze some variable other than elevation (latitude?) if we wanted to. | |
my @line = split /\t/,$fileline; | |
my $species = $line[0]; | |
my $year = $line[1]; | |
my $ele = $line[4]; | |
# reformat elevation to a five digit integer, with leading zeros if need be. | |
$ele = sprintf("%05d", "$ele"); | |
# here we remove the last two digits of the elevation. this number classifies the record into a 100 meter "band." | |
my $band = "$ele"; | |
chop $band; | |
chop $band; | |
# the dataset will be separated into two time period bins (before and after climate change). | |
# each bin will contain its records assigned to elevation bands. we will use these bands when subsampling later. | |
# two hashes of arrays are used to map this data structure. | |
if (($year >= $bin1begin) and ($year <= $bin1end)) | |
# does the record belong in the "before(%bin1)" time period? | |
{ | |
# if so, take the record($fileline) and push it onto the @array keyed by its $band. | |
push(@{ $bin1{"$band"}}, "$fileline"); | |
} | |
elsif (($year >= $bin2begin) and ($year <= $bin2end)) | |
# does the record belong in the "after(%bin2)" time period? | |
{ | |
# if so, take the record($fileline) and push it onto the @array keyed by its $band. | |
push(@{ $bin2{"$band"}}, "$fileline"); | |
} | |
} | |
# now we have our dataset seperated into two time periods, each of which is divided by elevation. | |
# this will allow us to subsample from the larger bin while simultaneously avoiding bias from sampling at higher elevations in later years. | |
#these arrays will hold our before and after binned datasets of equal size. | |
my @before; | |
my @after; | |
#this array will be used to temporarily hold the dataset subsampled from the larger bin for each band. | |
my @subsample; | |
# there aren't many records in the upper bands, but i go through all bands that are earthly possible (mount everest is a little less than 8900 meters above sea level), to be conservative. | |
my $n = 0; | |
while ($n<90) | |
{ | |
# tempororary arrays hold the data extracted from the hashes for each band while we're doing comparisons and subsampling. | |
my @band1; | |
my @band2; | |
# here i generate the three digit key corresponding to the elevation bands. | |
my $nnn = sprintf("%03d", "$n"); | |
# are there records in the current band for each bin? | |
if (exists $bin1{"$nnn"}) | |
{ | |
# if so, take them out, and put them in the temporary arrays. | |
@band1 = @{ $bin1{"$nnn"}}; | |
} | |
if (exists $bin2{"$nnn"}) | |
{ | |
@band2 = @{ $bin2{"$nnn"}}; | |
} | |
# formally evaluate the size of each array. | |
my $count1 = scalar @band1; | |
my $count2 = scalar @band2; | |
# then compare, as we subsample from the larger of the two. | |
if ($count1<$count2) | |
{ | |
# we can keep the smaller of the two elevation bands, so we just append it to our final array. | |
@before = (@before, @band1); | |
# this subroutine mathematically permutes the larger array in place. | |
FISHER_YATES_SHUFFLE( \@band2 ); | |
# now subsample by taking a slice from the shuffled larger array that is equal in size to the smaller array. | |
@subsample = @band2[1..$count1]; | |
# append the subsample to the final array. | |
@after = (@after,@subsample); | |
} | |
elsif ($count1>$count2) | |
{ | |
@after = (@after, @band2); | |
FISHER_YATES_SHUFFLE( \@band1 ); | |
@subsample = @band1[1..$count2]; | |
@before = (@before,@subsample); | |
} | |
# no subsampling is needed if the elevation bands are of equal size in both bins. | |
else | |
{ | |
@before = (@before, @band1); | |
@after = (@after, @band2); | |
} | |
# increment to the next band. | |
++$n; | |
} | |
# now we have two datasets of equal size with approximately equal distributions of elevation. | |
# what is their size? | |
my $count = scalar @before; | |
# we will be comparing the mean elevevation for each species. i again chose to use hashes of arrays. | |
my %species_before; | |
my %species_after; | |
# i keep a running list of each species encountered | |
my @specieslist; | |
# for all the record lines | |
for(1..$count) | |
{ | |
# take a record from each dataset. | |
my $beforeline= pop @before; | |
my $afterline= pop @after; | |
# split the fields. | |
my @before_data = split /\t/,$beforeline; | |
my @after_data = split /\t/,$afterline; | |
# extract the species name, | |
my $before_species_key = $before_data[0]; | |
my $after_species_key = $after_data[0]; | |
# and the elevation datum. | |
my $before_species_value = $before_data[4]; | |
my $after_species_value = $after_data[4]; | |
# push the elevation datum($before_species_value) onto an @array keyed by the species it belongs to($before_species_key) in its proper time period hash(%species_before). | |
push(@{ $species_before{"$before_species_key"}}, "$before_species_value"); | |
push(@{ $species_after{"$after_species_key"}}, "$after_species_value"); | |
# push both species names onto the running list of those encountered. | |
push @specieslist, ("$before_species_key","$after_species_key"); | |
} | |
# get an array of unique species names. i didn't write the next four lines, but i did test them. | |
my @unique_specieslist; | |
# declare a hash. | |
my %saw; | |
# make sure it's undefined. | |
undef %saw; | |
# our unique species list will consist of the first encounter of each species as we iterate through the list of all species encountered in the previous block. | |
@unique_specieslist = grep(!$saw{$_}++, @specieslist); | |
# for each species in our dataset, we'll calculate its mean elevation before climate change, and after. | |
foreach (@unique_specieslist) | |
{ | |
# i like to name the current string. | |
my $species = "$_"; | |
# these arrays will temporarily hold the elevations extracted from the hashes for the current species. | |
my @beforevalues; | |
my @aftervalues; | |
# these strings will hold the mean elevation. | |
my $beforemean; | |
my $aftermean; | |
# if the species is represented for the time period, | |
if (exists $species_before{"$species"}) | |
{ | |
# extract its elevation data from the hash. | |
@beforevalues = @{ $species_before{"$species"}}; | |
} | |
if (exists $species_after{"$species"}) | |
{ | |
@aftervalues = @{ $species_after{"$species"}}; | |
} | |
# how many records do we have in each time period? | |
my $count_before = scalar @beforevalues; | |
my $count_after = scalar @aftervalues; | |
# we will analyze species with at least 12 records in each time period. | |
if (($count_before>11) and ($count_after>11)) | |
{ | |
# get the mean elevation before and after climate change. we pass the array containing them to a subroutine. | |
$beforemean = MEAN(\@beforevalues); | |
$aftermean = MEAN(\@aftervalues); | |
# print the species name and both means. | |
print OUT "$species\t$beforemean\t$aftermean\n"; | |
} | |
} | |
# FISHER_YATES_SHUFFLE( \@array ) : generate a random permutation of @array in place. | |
# this code is from the Perl Cookbook ch 4.17; the algorithm's literal description was read from Wikipedia. | |
# the fisher yates shuffle is unbiased, so that every permutation is equally likely. | |
sub FISHER_YATES_SHUFFLE | |
{ | |
# take the passed array. | |
my $array = shift; | |
my $i; | |
# for i, from the array size down to one, | |
for ($i = @$array; --$i; ) | |
{ | |
# obtain a random integer, j that is 0<=j<=i. | |
my $j = int rand ($i+1); | |
# equivalent swaps do nothing. wikipedia implies that this line may be waste. | |
next if $i == $j; | |
# exchange array element i for j. | |
@$array[$i,$j] = @$array[$j,$i]; | |
} | |
} | |
# calculate the mean of an array of numbers. | |
sub MEAN | |
{ | |
# make sure the subroutine is passed a defined array. | |
@_ == 1 or die ('Sub usage: $mean = mean(\@array);'); | |
# take the passed array. | |
my ($array_ref) = @_; | |
my $sum; | |
# get its size. | |
my $count = scalar @$array_ref; | |
# sum the values contained in the array | |
foreach (@$array_ref) | |
{ | |
$sum += $_; | |
} | |
# divide the array's sum by its size to return its mean. | |
return $sum / $count; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment