grassa/bin_subsample_mean_gbif.pl

## bin_subsample_mean_gbif.pl
#!/usr/bin/perl -w
use strict;

# Given properly formatted herbarium records, this script will produce two
# datasets of equal size for two time periods.  From these, for each species,
# the mean value of each time period is calculated.  The script outputs a
# tab-delimited table of columns for species, mean value of the first time
# period, and the mean value of the second time period.  This table can be used
# to perform classical statistical tests in R.  Justification for the technique
# is provided below.

#   "When working with herbarium specimens, which were usually not sampled in a
# sytematic way, special consideration has to be given to potential biases
# arising out of different sampling efforts.  To account for different sampling
# efforts in the two time periods, we applied a random subsampling procedure
# similiar to Warren et al. (2001) so that equal numbers of records were
# obtained in each 100 m band..."
#   -Bergamini et al. (2009)

#   "To equalize recorder effort, we subsampled the 1995-99 data by randomly
# selecting the 1970-82 number of records from the 1995-99 data, subsampling
# separately for each 100-km Ordnance Survey grid square to retain the broad
# geographical distribution of 1970-82 records."
#   -Warren et al. (2001)

# open up our gbif/climate dataset.  the in and out filenames are passed as arguments when the script is executed.
my $in = shift;
my $out = shift;
open IN, "<", $in;
open OUT, ">", $out;
# bin boundry dates are inclusive.
# these dates make up the "before climate change" date range.
my $bin1begin = 1900;
my $bin1end = 1970;
# these dates make up the "after climate change" date range.
my $bin2begin = 1985;
my $bin2end = 2010;

# these hashes will contain our data from the two time periods.
my %bin1;
my %bin2;

# for each record in our dataset.
while ($in = <IN>)
{
    my $fileline = "$in";
    chomp $fileline;
    # extract pertinent data from pur records.  we could choose to analyze some variable other than elevation (latitude?) if we wanted to.
    my @line = split /\t/,$fileline;
    my $species = $line[0];
    my $year = $line[1];
    my $ele = $line[4];

    # reformat elevation to a five digit integer, with leading zeros if need be.
    $ele = sprintf("%05d", "$ele");
    # here we remove the last two digits of the elevation.  this number classifies the record into a 100 meter "band."
    my $band = "$ele";
    chop $band;
    chop $band;

    # the dataset will be separated into two time period bins (before and after climate change).
    # each bin will contain its records assigned to elevation bands.  we will use these bands when subsampling later.
    # two hashes of arrays are used to map this data structure.

    if (($year >= $bin1begin) and ($year <= $bin1end))
    # does the record belong in the "before(%bin1)" time period?
    {
        # if so, take the record($fileline) and push it onto the @array keyed by its $band.
        push(@{ $bin1{"$band"}}, "$fileline");
    }
    elsif (($year >= $bin2begin) and ($year <= $bin2end))
    # does the record belong in the "after(%bin2)" time period?
    {
        # if so, take the record($fileline) and push it onto the @array keyed by its $band.
        push(@{ $bin2{"$band"}}, "$fileline");
    }
}
# now we have our dataset seperated into two time periods, each of which is divided by elevation.
# this will allow us to subsample from the larger bin while simultaneously avoiding bias from sampling at higher elevations in later years.

#these arrays will hold our before and after binned datasets of equal size.
my @before;
my @after;

#this array will be used to temporarily hold the dataset subsampled from the larger bin for each band.
my @subsample;

# there aren't many records in the upper bands, but i go through all bands that are earthly possible (mount everest is a little less than 8900 meters above sea level), to be conservative.
my $n = 0;
while ($n<90)
{
    # tempororary arrays hold the data extracted from the hashes for each band while we're doing comparisons and subsampling.
    my @band1;
    my @band2;
    # here i generate the three digit key corresponding to the elevation bands.
    my $nnn = sprintf("%03d", "$n");

    # are there records in the current band for each bin?
    if (exists $bin1{"$nnn"})
    {
        # if so, take them out, and put them in the temporary arrays.
        @band1 = @{ $bin1{"$nnn"}};
    }
    if (exists $bin2{"$nnn"})
    {
        @band2 = @{ $bin2{"$nnn"}};
    }

    # formally evaluate the size of each array.
    my $count1 = scalar @band1;
    my $count2 = scalar @band2;
    # then compare, as we subsample from the larger of the two.
    if ($count1<$count2)
    {
        # we can keep the smaller of the two elevation bands, so we just append it to our final array.
        @before = (@before, @band1);
        # this subroutine mathematically permutes the larger array in place.
        FISHER_YATES_SHUFFLE( \@band2 );
        # now subsample by taking a slice from the shuffled larger array that is equal in size to the smaller array.
        @subsample = @band2[1..$count1];
        # append the subsample to the final array.
        @after = (@after,@subsample);
    }

    elsif ($count1>$count2)
    {
        @after = (@after, @band2);
        FISHER_YATES_SHUFFLE( \@band1 );
        @subsample = @band1[1..$count2];
        @before = (@before,@subsample);
    }
    # no subsampling is needed if the elevation bands are of equal size in both bins.
    else
    {
        @before = (@before, @band1);
        @after = (@after, @band2);
    }
    # increment to the next band.
    ++$n;
}
# now we have two datasets of equal size with approximately equal distributions of elevation.

# what is their size?
my $count = scalar @before;

# we will be comparing the mean elevevation for each species.  i again chose to use hashes of arrays.
my %species_before;
my %species_after;
# i keep a running list of each species encountered
my @specieslist;
# for all the record lines
for(1..$count)
{
    # take a record from each dataset.
    my $beforeline= pop @before;
    my $afterline= pop @after;
    # split the fields.
    my @before_data = split /\t/,$beforeline;
    my @after_data = split /\t/,$afterline;
    # extract the species name,
    my $before_species_key = $before_data[0];
    my $after_species_key = $after_data[0];
    # and the elevation datum.
    my $before_species_value = $before_data[4];
    my $after_species_value = $after_data[4];
    # push the elevation datum($before_species_value) onto an @array keyed by the species it belongs to($before_species_key) in its proper time period hash(%species_before).
    push(@{ $species_before{"$before_species_key"}}, "$before_species_value");
    push(@{ $species_after{"$after_species_key"}}, "$after_species_value");
    # push both species names onto the running list of those encountered.
    push @specieslist, ("$before_species_key","$after_species_key");
}


# get an array of unique species names.  i didn't write the next four lines, but i did test them.
my @unique_specieslist;
# declare a hash.
my %saw;
# make sure it's undefined.
undef %saw;
# our unique species list will consist of the first encounter of each species as we iterate through the list of all species encountered in the previous block.
@unique_specieslist = grep(!$saw{$_}++, @specieslist);

# for each species in our dataset, we'll calculate its mean elevation before climate change, and after.
foreach (@unique_specieslist)
{
    # i like to name the current string.
    my $species = "$_";
    # these arrays will temporarily hold the elevations extracted from the hashes for the current species.
    my @beforevalues;
    my @aftervalues;
    # these strings will hold the mean elevation.
    my $beforemean;
    my $aftermean;
    # if the species is represented for the time period,
    if (exists $species_before{"$species"})
    {
        # extract its elevation data from the hash.
        @beforevalues = @{ $species_before{"$species"}};
    }
    if (exists $species_after{"$species"})
    {
        @aftervalues = @{ $species_after{"$species"}};
    }
    # how many records do we have in each time period?
    my $count_before = scalar @beforevalues;
    my $count_after = scalar @aftervalues;
    # we will analyze species with at least 12 records in each time period.
    if (($count_before>11) and ($count_after>11))
    {
        # get the mean elevation before and after climate change.  we pass the array containing them to a subroutine.
        $beforemean = MEAN(\@beforevalues);
        $aftermean = MEAN(\@aftervalues);
        # print the species name and both means.
        print OUT "$species\t$beforemean\t$aftermean\n";
    }

}

# FISHER_YATES_SHUFFLE( \@array ) : generate a random permutation of @array in place.
    # this code is from the Perl Cookbook ch 4.17; the algorithm's literal description was read from Wikipedia.
# the fisher yates shuffle is unbiased, so that every permutation is equally likely.
sub FISHER_YATES_SHUFFLE
{
    # take the passed array.
    my $array = shift;
    my $i;
    # for i, from the array size down to one,
    for ($i = @$array; --$i; )
    {
        # obtain a random integer, j that is 0<=j<=i.
        my $j = int rand ($i+1);
        # equivalent swaps do nothing.  wikipedia implies that this line may be waste.
        next if $i == $j;
        # exchange array element i for j.
        @$array[$i,$j] = @$array[$j,$i];
    }
}

# calculate the mean of an array of numbers.
sub MEAN
{
    # make sure the subroutine is passed a defined array.
    @_ == 1 or die ('Sub usage: $mean = mean(\@array);');
    # take the passed array.
    my ($array_ref) = @_;
    my $sum;
    # get its size.
    my $count = scalar @$array_ref;
    # sum the values contained in the array
    foreach (@$array_ref)
    {
        $sum += $_;
    }
    # divide the array's sum by its size to return its mean.
    return $sum / $count;
}
	#!/usr/bin/perl -w
	use strict;

	# Given properly formatted herbarium records, this script will produce two
	# datasets of equal size for two time periods. From these, for each species,
	# the mean value of each time period is calculated. The script outputs a
	# tab-delimited table of columns for species, mean value of the first time
	# period, and the mean value of the second time period. This table can be used
	# to perform classical statistical tests in R. Justification for the technique
	# is provided below.

	# "When working with herbarium specimens, which were usually not sampled in a
	# sytematic way, special consideration has to be given to potential biases
	# arising out of different sampling efforts. To account for different sampling
	# efforts in the two time periods, we applied a random subsampling procedure
	# similiar to Warren et al. (2001) so that equal numbers of records were
	# obtained in each 100 m band..."
	# -Bergamini et al. (2009)

	# "To equalize recorder effort, we subsampled the 1995-99 data by randomly
	# selecting the 1970-82 number of records from the 1995-99 data, subsampling
	# separately for each 100-km Ordnance Survey grid square to retain the broad
	# geographical distribution of 1970-82 records."
	# -Warren et al. (2001)

	# open up our gbif/climate dataset. the in and out filenames are passed as arguments when the script is executed.
	my $in = shift;
	my $out = shift;
	open IN, "<", $in;
	open OUT, ">", $out;
	# bin boundry dates are inclusive.
	# these dates make up the "before climate change" date range.
	my $bin1begin = 1900;
	my $bin1end = 1970;
	# these dates make up the "after climate change" date range.
	my $bin2begin = 1985;
	my $bin2end = 2010;

	# these hashes will contain our data from the two time periods.
	my %bin1;
	my %bin2;

	# for each record in our dataset.
	while ($in = <IN>)
	{
	my $fileline = "$in";
	chomp $fileline;
	# extract pertinent data from pur records. we could choose to analyze some variable other than elevation (latitude?) if we wanted to.
	my @line = split /\t/,$fileline;
	my $species = $line[0];
	my $year = $line[1];
	my $ele = $line[4];

	# reformat elevation to a five digit integer, with leading zeros if need be.
	$ele = sprintf("%05d", "$ele");
	# here we remove the last two digits of the elevation. this number classifies the record into a 100 meter "band."
	my $band = "$ele";
	chop $band;
	chop $band;

	# the dataset will be separated into two time period bins (before and after climate change).
	# each bin will contain its records assigned to elevation bands. we will use these bands when subsampling later.
	# two hashes of arrays are used to map this data structure.

	if (($year >= $bin1begin) and ($year <= $bin1end))
	# does the record belong in the "before(%bin1)" time period?
	{
	# if so, take the record($fileline) and push it onto the @array keyed by its $band.
	push(@{ $bin1{"$band"}}, "$fileline");
	}
	elsif (($year >= $bin2begin) and ($year <= $bin2end))
	# does the record belong in the "after(%bin2)" time period?
	{
	# if so, take the record($fileline) and push it onto the @array keyed by its $band.
	push(@{ $bin2{"$band"}}, "$fileline");
	}
	}
	# now we have our dataset seperated into two time periods, each of which is divided by elevation.
	# this will allow us to subsample from the larger bin while simultaneously avoiding bias from sampling at higher elevations in later years.

	#these arrays will hold our before and after binned datasets of equal size.
	my @before;
	my @after;

	#this array will be used to temporarily hold the dataset subsampled from the larger bin for each band.
	my @subsample;

	# there aren't many records in the upper bands, but i go through all bands that are earthly possible (mount everest is a little less than 8900 meters above sea level), to be conservative.
	my $n = 0;
	while ($n<90)
	{
	# tempororary arrays hold the data extracted from the hashes for each band while we're doing comparisons and subsampling.
	my @band1;
	my @band2;
	# here i generate the three digit key corresponding to the elevation bands.
	my $nnn = sprintf("%03d", "$n");

	# are there records in the current band for each bin?
	if (exists $bin1{"$nnn"})
	{
	# if so, take them out, and put them in the temporary arrays.
	@band1 = @{ $bin1{"$nnn"}};
	}
	if (exists $bin2{"$nnn"})
	{
	@band2 = @{ $bin2{"$nnn"}};
	}

	# formally evaluate the size of each array.
	my $count1 = scalar @band1;
	my $count2 = scalar @band2;
	# then compare, as we subsample from the larger of the two.
	if ($count1<$count2)
	{
	# we can keep the smaller of the two elevation bands, so we just append it to our final array.
	@before = (@before, @band1);
	# this subroutine mathematically permutes the larger array in place.
	FISHER_YATES_SHUFFLE( \@band2 );
	# now subsample by taking a slice from the shuffled larger array that is equal in size to the smaller array.
	@subsample = @band2[1..$count1];
	# append the subsample to the final array.
	@after = (@after,@subsample);
	}

	elsif ($count1>$count2)
	{
	@after = (@after, @band2);
	FISHER_YATES_SHUFFLE( \@band1 );
	@subsample = @band1[1..$count2];
	@before = (@before,@subsample);
	}
	# no subsampling is needed if the elevation bands are of equal size in both bins.
	else
	{
	@before = (@before, @band1);
	@after = (@after, @band2);
	}
	# increment to the next band.
	++$n;
	}
	# now we have two datasets of equal size with approximately equal distributions of elevation.

	# what is their size?
	my $count = scalar @before;

	# we will be comparing the mean elevevation for each species. i again chose to use hashes of arrays.
	my %species_before;
	my %species_after;
	# i keep a running list of each species encountered
	my @specieslist;
	# for all the record lines
	for(1..$count)
	{
	# take a record from each dataset.
	my $beforeline= pop @before;
	my $afterline= pop @after;
	# split the fields.
	my @before_data = split /\t/,$beforeline;
	my @after_data = split /\t/,$afterline;
	# extract the species name,
	my $before_species_key = $before_data[0];
	my $after_species_key = $after_data[0];
	# and the elevation datum.
	my $before_species_value = $before_data[4];
	my $after_species_value = $after_data[4];
	# push the elevation datum($before_species_value) onto an @array keyed by the species it belongs to($before_species_key) in its proper time period hash(%species_before).
	push(@{ $species_before{"$before_species_key"}}, "$before_species_value");
	push(@{ $species_after{"$after_species_key"}}, "$after_species_value");
	# push both species names onto the running list of those encountered.
	push @specieslist, ("$before_species_key","$after_species_key");
	}


	# get an array of unique species names. i didn't write the next four lines, but i did test them.
	my @unique_specieslist;
	# declare a hash.
	my %saw;
	# make sure it's undefined.
	undef %saw;
	# our unique species list will consist of the first encounter of each species as we iterate through the list of all species encountered in the previous block.
	@unique_specieslist = grep(!$saw{$_}++, @specieslist);

	# for each species in our dataset, we'll calculate its mean elevation before climate change, and after.
	foreach (@unique_specieslist)
	{
	# i like to name the current string.
	my $species = "$_";
	# these arrays will temporarily hold the elevations extracted from the hashes for the current species.
	my @beforevalues;
	my @aftervalues;
	# these strings will hold the mean elevation.
	my $beforemean;
	my $aftermean;
	# if the species is represented for the time period,
	if (exists $species_before{"$species"})
	{
	# extract its elevation data from the hash.
	@beforevalues = @{ $species_before{"$species"}};
	}
	if (exists $species_after{"$species"})
	{
	@aftervalues = @{ $species_after{"$species"}};
	}
	# how many records do we have in each time period?
	my $count_before = scalar @beforevalues;
	my $count_after = scalar @aftervalues;
	# we will analyze species with at least 12 records in each time period.
	if (($count_before>11) and ($count_after>11))
	{
	# get the mean elevation before and after climate change. we pass the array containing them to a subroutine.
	$beforemean = MEAN(\@beforevalues);
	$aftermean = MEAN(\@aftervalues);
	# print the species name and both means.
	print OUT "$species\t$beforemean\t$aftermean\n";
	}

	}

	# FISHER_YATES_SHUFFLE( \@array ) : generate a random permutation of @array in place.
	# this code is from the Perl Cookbook ch 4.17; the algorithm's literal description was read from Wikipedia.
	# the fisher yates shuffle is unbiased, so that every permutation is equally likely.
	sub FISHER_YATES_SHUFFLE
	{
	# take the passed array.
	my $array = shift;
	my $i;
	# for i, from the array size down to one,
	for ($i = @$array; --$i; )
	{
	# obtain a random integer, j that is 0<=j<=i.
	my $j = int rand ($i+1);
	# equivalent swaps do nothing. wikipedia implies that this line may be waste.
	next if $i == $j;
	# exchange array element i for j.
	@$array[$i,$j] = @$array[$j,$i];
	}
	}

	# calculate the mean of an array of numbers.
	sub MEAN
	{
	# make sure the subroutine is passed a defined array.
	@_ == 1 or die ('Sub usage: $mean = mean(\@array);');
	# take the passed array.
	my ($array_ref) = @_;
	my $sum;
	# get its size.
	my $count = scalar @$array_ref;
	# sum the values contained in the array
	foreach (@$array_ref)
	{
	$sum += $_;
	}
	# divide the array's sum by its size to return its mean.
	return $sum / $count;
	}