Created
August 8, 2013 21:12
-
-
Save dansmith01/6188797 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#--------------------------------------------------------------------------# | |
# Description: Combines multiple peptide readings into protein- # | |
# centric data # | |
# # | |
# Author: Daniel Patrick Smith # | |
# Version: 2.0 / November 29th, 2011 # | |
# Affiliation: Giovannoni Laboratory, Oregon State University # | |
# License: GNU General Public License v3 # | |
# # | |
# Reference: Smith, D.P. et al. Proteomic and Transcriptomic Analysis of # | |
# Candidatus Pelagibacter ubique Describes the First PII- # | |
# Independent Response to Nitrogen Limitation in a Free- # | |
# Living Alphaproteobacterium. In preparation. # | |
#--------------------------------------------------------------------------# | |
use strict; | |
use warnings; | |
use Statistics::Distributions; | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
# Define Data File and Treatment Groups ( --> EDIT THIS SECTION <-- ) | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my $file = "Peptides.txt"; | |
# Data File - tab-delimited peptide readings. | |
# 1st column is the peptide sequence (ignored) | |
# 2nd column is the protein ID | |
# Remaining columns are abundance measurements for each sample | |
# 1st row contains sample IDs for each sample | |
# Technical replicates should be given the same sample ID | |
# Example (partial) data file: | |
# Peptide Protein SL-1 SL-1 SL-1 SL-2 SL-2 SL-2 | |
# -.MKMFYEKD.A C134_0001 NA NA NA 22.38 22.02 22.26 | |
# K.ARDLALSYASA.I C134_0001 20.85 21.75 21.87 21.07 22.00 21.74 | |
# K.DADVDLIK.S C134_0001 24.52 23.25 24.60 24.09 NA 24.55 | |
# K.DLADHPIEK.V C134_0001 24.66 26.44 25.98 26.19 26.54 26.01 | |
# K.DLDVFMVAPK.G C134_0001 24.20 23.81 24.27 24.27 22.54 24.57 | |
# K.DSGAKEVVVALR.D C134_0001 23.62 23.25 23.68 23.31 23.80 23.36 | |
# K.EVLADIQSGK.F C134_0001 NA 21.32 23.36 NA NA 22.55 | |
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
my %sets = ( "Le" => ["SL-2", "SL-3", "SL-5"], | |
"Lt" => ["SL-6", "SL-8", "SL-9", "SL-10", "SL-11", "SL-12"], | |
"Ls" => ["SL-20", "SL-21", "SL-22", "SL-23", "SL-24"], | |
"Ce" => ["SL-1", "SL-4", "SL-7", "SL-14"], | |
"Ct" => ["SL-15", "SL-16", "SL-17", "SL-25"], | |
"Cs" => ["SL-26", "SL-27", "SL-28", "SL-29"] | |
); | |
# Sets are used to group biological replicates together into treatments. | |
# Above, "Le" and "Cs" represent "Limited exponential" and "control | |
# stationary", respectively. However, any treatment identifier may be used. | |
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | |
my @compare = (["Ce", "Le"], ["Ce","Ls"], ["Ce", "Cs"], ["Cs", "Ls"]); | |
# These are the treatments that should be compared. E.g., the first pair, | |
# ["Ce", "Le"] will compute the fold change in expression between those | |
# two treatments by dividing the abundance in one by the other: Le / Ce | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
# --> DO NOT EDIT BELOW THIS LINE <-- | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
# Read in peptide abundances from user-specified file | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my @raw_data = (); | |
open(FILE, $file) or die ("Can't open file '$file': $!\n"); | |
my $line = <FILE>; | |
chomp $line; | |
my @headers = split("\t", $line); | |
while ($line = <FILE>) { | |
chomp $line; | |
my @f = split("\t", $line); | |
my $pep_readings = { 'protein' => $f[1] }; | |
push @{$pep_readings->{$headers[$_]}}, $f[$_] foreach (2..$#f); | |
push @raw_data, $pep_readings; | |
} | |
close FILE; | |
# Average peptides together by treatments, set undef if >=50% are NA | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my @avg_data = (); | |
my $prot2idx = {}; | |
foreach my $raw (@raw_data) { | |
my $new = {}; | |
$new->{'protein'} = $raw->{'protein'}; | |
while (my ($cond, $SL_ID_arr) = each %sets) { | |
my @vals = (); | |
push @vals, @{$raw->{$_}} foreach (@{$SL_ID_arr}); | |
@vals = grep(/^[\d\.]+$/, @vals); | |
if (scalar(@vals) < 3) { | |
$new->{$cond} = undef; | |
$raw->{$cond} = undef; | |
} else { | |
$new->{$cond} = 2 ** avg(@vals); | |
$raw->{$cond} = [@vals]; | |
} | |
} | |
push @{$prot2idx->{$raw->{'protein'}}}, scalar(@avg_data); | |
push @avg_data, $new; | |
} | |
# Output headers | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
print "Protein "; | |
foreach my $cond_pair (@compare) { | |
my ($ref_condition, $exp_condition) = @{$cond_pair}; | |
print "\t". $ref_condition ."->". $exp_condition ."\tP-Value\t"; | |
} | |
print "\n"; | |
# Calculate fold changes in protein abundances | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my @protlist = sort {$a cmp $b} (keys %{$prot2idx}); | |
foreach my $protein (@protlist) { | |
print $protein; | |
# Iterate over each pair of treatments specified by the user | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
foreach my $cond_pair (@compare) { | |
my ($ref_condition, $exp_condition) = @{$cond_pair}; | |
my @ratios = (); | |
my @ttests = (); | |
my $expONLY = 0; | |
my $refONLY = 0; | |
my $fold_change = undef; | |
my $p_value = 0; | |
# Loop over each peptide for the current protein | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
foreach my $idx (@{$prot2idx->{$protein}}) { | |
my $ref_val = $avg_data[$idx]->{$ref_condition}; | |
my $ref_arr = $raw_data[$idx]->{$ref_condition}; | |
my $exp_val = $avg_data[$idx]->{$exp_condition}; | |
my $exp_arr = $raw_data[$idx]->{$exp_condition}; | |
# If the peptide was detected in both treatments, | |
# calculate the ratio (fold change). | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
if ($ref_val && $exp_val) { | |
if ($exp_val < $ref_val) { | |
($exp_arr, $ref_arr) = ($ref_arr, $exp_arr); | |
} | |
push @ratios, log10($exp_val / $ref_val); | |
push @ttests, ttest($exp_arr, $ref_arr); | |
} | |
# Keep track of peptides only occuring in one treatment | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
elsif ($ref_val) { $refONLY++; } | |
elsif ($exp_val) { $expONLY++; } | |
} | |
# Loop over each peptide for the current protein | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
if (@ratios) { | |
# Set p-value to 1 where the ratio is opposite the average | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my $avg_ratio = avg(@ratios); | |
foreach (0..$#ratios) { | |
$ttests[$_] = 1 if ($ratios[$_] > 0 xor $avg_ratio > 0); | |
} | |
$p_value = $ttests[0]; | |
$fold_change = sprintf("%.2f", (10 ** $avg_ratio)); | |
if (scalar(@ttests) > 1) { | |
# Bonferroni correction | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
foreach (0..$#ttests) { | |
$ttests[$_] = $ttests[$_] * scalar(@ttests); | |
$ttests[$_] = 1 if ($ttests[$_] > 1); | |
} | |
# Combine multiple p-values into one | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
my $df = 2 * scalar(@ttests); | |
my $chi_sq = 0; | |
$chi_sq += log($_) foreach (@ttests); | |
$chi_sq *= -2; | |
$p_value = Statistics::Distributions::chisqrprob($df,$chi_sq); | |
} | |
} | |
# Report instances where the peptides for a given protein were | |
# only detected in one treatment | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
elsif ($expONLY > $refONLY && $expONLY >= 3) { | |
$fold_change = "INF"; | |
$p_value = "0*"; | |
} | |
elsif ($expONLY < $refONLY && $refONLY >= 3) { | |
$fold_change = "0"; | |
$p_value = "0*"; | |
} | |
# Also report instances where the peptides for a given protein were | |
# never detected in either treatment | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
else { | |
$fold_change = "ND"; | |
$p_value = "1*"; | |
} | |
print "\t$fold_change\t$p_value"; | |
} | |
print "\n"; | |
} | |
# Two-tailed Student's t-test calculation | |
# Statistical Sleuth, 2nd ed. pg 44 | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
sub ttest { | |
my @i_arr = @{$_[0]}; | |
my @j_arr = @{$_[1]}; | |
my $i_num = scalar(@i_arr) || return 1; | |
my $j_num = scalar(@j_arr) || return 0; | |
my $i_df = ($i_num - 1) || return 1; | |
my $j_df = ($j_num - 1) || return 0; | |
my $i_ave = 0; $i_ave += $_ foreach (@i_arr); $i_ave /= $i_num; | |
my $j_ave = 0; $j_ave += $_ foreach (@j_arr); $j_ave /= $j_num; | |
return 1 unless ($i_ave > $j_ave); | |
my $i_sd = 0; $i_sd += ($_ - $i_ave) ** 2 foreach (@i_arr); | |
$i_sd = sqrt($i_sd / $i_df); | |
my $j_sd = 0; $j_sd += ($_ - $j_ave) ** 2 foreach (@j_arr); | |
$j_sd = sqrt($j_sd / $j_df); | |
my $pooled_df = $i_df + $j_df; | |
my $pooled_sd = ($i_df * ($i_sd ** 2)) + ($j_df * ($j_sd ** 2)); | |
$pooled_sd = sqrt($pooled_sd / $pooled_df); | |
my $pooled_se = $pooled_sd * sqrt((1 / $i_num) + (1 / $j_num)); | |
return 1 unless ($pooled_se != 0); | |
my $t_statistic = ($i_ave - $j_ave) / $pooled_se; | |
my $p_value = Statistics::Distributions::tprob($pooled_df, $t_statistic); | |
return $p_value; | |
} | |
# Helper Functions | |
#-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- | |
sub avg { | |
my $sum = 0; | |
$sum += $_ foreach (@_); | |
my $avg = scalar(@_) ? $sum / scalar(@_) : 0; | |
return $avg; | |
} | |
sub log10 { | |
my $n = shift; | |
return log($n)/log(10); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment