Skip to content

Instantly share code, notes, and snippets.

@dansmith01
Created August 8, 2013 21:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save dansmith01/6188797 to your computer and use it in GitHub Desktop.
Save dansmith01/6188797 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
#--------------------------------------------------------------------------#
# Description: Combines multiple peptide readings into protein- #
# centric data #
# #
# Author: Daniel Patrick Smith #
# Version: 2.0 / November 29th, 2011 #
# Affiliation: Giovannoni Laboratory, Oregon State University #
# License: GNU General Public License v3 #
# #
# Reference: Smith, D.P. et al. Proteomic and Transcriptomic Analysis of #
# Candidatus Pelagibacter ubique Describes the First PII- #
# Independent Response to Nitrogen Limitation in a Free- #
# Living Alphaproteobacterium. In preparation. #
#--------------------------------------------------------------------------#
use strict;
use warnings;
use Statistics::Distributions;
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Define Data File and Treatment Groups ( --> EDIT THIS SECTION <-- )
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my $file = "Peptides.txt";
# Data File - tab-delimited peptide readings.
# 1st column is the peptide sequence (ignored)
# 2nd column is the protein ID
# Remaining columns are abundance measurements for each sample
# 1st row contains sample IDs for each sample
# Technical replicates should be given the same sample ID
# Example (partial) data file:
# Peptide Protein SL-1 SL-1 SL-1 SL-2 SL-2 SL-2
# -.MKMFYEKD.A C134_0001 NA NA NA 22.38 22.02 22.26
# K.ARDLALSYASA.I C134_0001 20.85 21.75 21.87 21.07 22.00 21.74
# K.DADVDLIK.S C134_0001 24.52 23.25 24.60 24.09 NA 24.55
# K.DLADHPIEK.V C134_0001 24.66 26.44 25.98 26.19 26.54 26.01
# K.DLDVFMVAPK.G C134_0001 24.20 23.81 24.27 24.27 22.54 24.57
# K.DSGAKEVVVALR.D C134_0001 23.62 23.25 23.68 23.31 23.80 23.36
# K.EVLADIQSGK.F C134_0001 NA 21.32 23.36 NA NA 22.55
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
my %sets = ( "Le" => ["SL-2", "SL-3", "SL-5"],
"Lt" => ["SL-6", "SL-8", "SL-9", "SL-10", "SL-11", "SL-12"],
"Ls" => ["SL-20", "SL-21", "SL-22", "SL-23", "SL-24"],
"Ce" => ["SL-1", "SL-4", "SL-7", "SL-14"],
"Ct" => ["SL-15", "SL-16", "SL-17", "SL-25"],
"Cs" => ["SL-26", "SL-27", "SL-28", "SL-29"]
);
# Sets are used to group biological replicates together into treatments.
# Above, "Le" and "Cs" represent "Limited exponential" and "control
# stationary", respectively. However, any treatment identifier may be used.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
my @compare = (["Ce", "Le"], ["Ce","Ls"], ["Ce", "Cs"], ["Cs", "Ls"]);
# These are the treatments that should be compared. E.g., the first pair,
# ["Ce", "Le"] will compute the fold change in expression between those
# two treatments by dividing the abundance in one by the other: Le / Ce
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# --> DO NOT EDIT BELOW THIS LINE <--
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# Read in peptide abundances from user-specified file
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my @raw_data = ();
open(FILE, $file) or die ("Can't open file '$file': $!\n");
my $line = <FILE>;
chomp $line;
my @headers = split("\t", $line);
while ($line = <FILE>) {
chomp $line;
my @f = split("\t", $line);
my $pep_readings = { 'protein' => $f[1] };
push @{$pep_readings->{$headers[$_]}}, $f[$_] foreach (2..$#f);
push @raw_data, $pep_readings;
}
close FILE;
# Average peptides together by treatments, set undef if >=50% are NA
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my @avg_data = ();
my $prot2idx = {};
foreach my $raw (@raw_data) {
my $new = {};
$new->{'protein'} = $raw->{'protein'};
while (my ($cond, $SL_ID_arr) = each %sets) {
my @vals = ();
push @vals, @{$raw->{$_}} foreach (@{$SL_ID_arr});
@vals = grep(/^[\d\.]+$/, @vals);
if (scalar(@vals) < 3) {
$new->{$cond} = undef;
$raw->{$cond} = undef;
} else {
$new->{$cond} = 2 ** avg(@vals);
$raw->{$cond} = [@vals];
}
}
push @{$prot2idx->{$raw->{'protein'}}}, scalar(@avg_data);
push @avg_data, $new;
}
# Output headers
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
print "Protein ";
foreach my $cond_pair (@compare) {
my ($ref_condition, $exp_condition) = @{$cond_pair};
print "\t". $ref_condition ."->". $exp_condition ."\tP-Value\t";
}
print "\n";
# Calculate fold changes in protein abundances
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my @protlist = sort {$a cmp $b} (keys %{$prot2idx});
foreach my $protein (@protlist) {
print $protein;
# Iterate over each pair of treatments specified by the user
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
foreach my $cond_pair (@compare) {
my ($ref_condition, $exp_condition) = @{$cond_pair};
my @ratios = ();
my @ttests = ();
my $expONLY = 0;
my $refONLY = 0;
my $fold_change = undef;
my $p_value = 0;
# Loop over each peptide for the current protein
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
foreach my $idx (@{$prot2idx->{$protein}}) {
my $ref_val = $avg_data[$idx]->{$ref_condition};
my $ref_arr = $raw_data[$idx]->{$ref_condition};
my $exp_val = $avg_data[$idx]->{$exp_condition};
my $exp_arr = $raw_data[$idx]->{$exp_condition};
# If the peptide was detected in both treatments,
# calculate the ratio (fold change).
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
if ($ref_val && $exp_val) {
if ($exp_val < $ref_val) {
($exp_arr, $ref_arr) = ($ref_arr, $exp_arr);
}
push @ratios, log10($exp_val / $ref_val);
push @ttests, ttest($exp_arr, $ref_arr);
}
# Keep track of peptides only occuring in one treatment
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
elsif ($ref_val) { $refONLY++; }
elsif ($exp_val) { $expONLY++; }
}
# Loop over each peptide for the current protein
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
if (@ratios) {
# Set p-value to 1 where the ratio is opposite the average
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my $avg_ratio = avg(@ratios);
foreach (0..$#ratios) {
$ttests[$_] = 1 if ($ratios[$_] > 0 xor $avg_ratio > 0);
}
$p_value = $ttests[0];
$fold_change = sprintf("%.2f", (10 ** $avg_ratio));
if (scalar(@ttests) > 1) {
# Bonferroni correction
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
foreach (0..$#ttests) {
$ttests[$_] = $ttests[$_] * scalar(@ttests);
$ttests[$_] = 1 if ($ttests[$_] > 1);
}
# Combine multiple p-values into one
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
my $df = 2 * scalar(@ttests);
my $chi_sq = 0;
$chi_sq += log($_) foreach (@ttests);
$chi_sq *= -2;
$p_value = Statistics::Distributions::chisqrprob($df,$chi_sq);
}
}
# Report instances where the peptides for a given protein were
# only detected in one treatment
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
elsif ($expONLY > $refONLY && $expONLY >= 3) {
$fold_change = "INF";
$p_value = "0*";
}
elsif ($expONLY < $refONLY && $refONLY >= 3) {
$fold_change = "0";
$p_value = "0*";
}
# Also report instances where the peptides for a given protein were
# never detected in either treatment
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
else {
$fold_change = "ND";
$p_value = "1*";
}
print "\t$fold_change\t$p_value";
}
print "\n";
}
# Two-tailed Student's t-test calculation
# Statistical Sleuth, 2nd ed. pg 44
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
sub ttest {
my @i_arr = @{$_[0]};
my @j_arr = @{$_[1]};
my $i_num = scalar(@i_arr) || return 1;
my $j_num = scalar(@j_arr) || return 0;
my $i_df = ($i_num - 1) || return 1;
my $j_df = ($j_num - 1) || return 0;
my $i_ave = 0; $i_ave += $_ foreach (@i_arr); $i_ave /= $i_num;
my $j_ave = 0; $j_ave += $_ foreach (@j_arr); $j_ave /= $j_num;
return 1 unless ($i_ave > $j_ave);
my $i_sd = 0; $i_sd += ($_ - $i_ave) ** 2 foreach (@i_arr);
$i_sd = sqrt($i_sd / $i_df);
my $j_sd = 0; $j_sd += ($_ - $j_ave) ** 2 foreach (@j_arr);
$j_sd = sqrt($j_sd / $j_df);
my $pooled_df = $i_df + $j_df;
my $pooled_sd = ($i_df * ($i_sd ** 2)) + ($j_df * ($j_sd ** 2));
$pooled_sd = sqrt($pooled_sd / $pooled_df);
my $pooled_se = $pooled_sd * sqrt((1 / $i_num) + (1 / $j_num));
return 1 unless ($pooled_se != 0);
my $t_statistic = ($i_ave - $j_ave) / $pooled_se;
my $p_value = Statistics::Distributions::tprob($pooled_df, $t_statistic);
return $p_value;
}
# Helper Functions
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
sub avg {
my $sum = 0;
$sum += $_ foreach (@_);
my $avg = scalar(@_) ? $sum / scalar(@_) : 0;
return $avg;
}
sub log10 {
my $n = shift;
return log($n)/log(10);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment