Last active
May 18, 2017 05:11
-
-
Save adiamb/1cc48369e070a70ecf310ab0074b50c0 to your computer and use it in GitHub Desktop.
Mass_spec_AA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###PYTHON############# | |
#Purpose : Replication of MS spectra analysis | |
#Author : Aditya Ambati | |
#date : May 14 2017 | |
#update : version 1 | |
#####import libraries | |
from itertools import chain | |
import pandas as pd | |
import numpy as np | |
%matplotlib | |
import re | |
import numpy | |
def peponly(pepseq): | |
return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq)) | |
def process_raw_spectra(spectra, outfile): | |
spectra2=spectra[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name']] | |
## rename the columns | |
spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name'] | |
## define a function to return only alphabets | |
spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide] ### assign to a new column the raw peptide | |
spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep] | |
untagged=[] | |
for i in range(len(spectra2.pep)): | |
line = spectra2.pep[i] | |
untagged.append(line[1:spectra2.len1[i]]) ## remove first and last character | |
spectra2.loc[:, ('tag_rem_pep')] = untagged ## add this into the df | |
spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep] | |
spectra2.loc[:, ('tag_rm_len')]=spectra2['tag_rm_len'].astype(int) | |
spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos ### add the startingpos + untagged peptide length | |
spectra2.loc[:, ('org_length')] = [len(i) for i in spectra2.peptide] ## get the orginal length of the ms raw peptide | |
maxDT=spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen':'max'}) ## group by the protein name and get max length of each uniq protein | |
maxDT.reset_index(inplace=True) ## reset the index | |
emp =[] | |
for q, w, in zip(maxDT.protein_rank, maxDT.templen): | |
z=pd.DataFrame(np.repeat(q, w), np.arange(1, w+1)) | |
z = z.reset_index() | |
emp.append(z) | |
res = pd.concat(emp) | |
res.columns = ['postion', 'protein_rank'] | |
res.reset_index(drop = True, inplace=True) | |
print res | |
#res.drop('index', axis =1, inplace=True) | |
### make a db of every peptide in the ms file | |
res['total'] = 0 | |
res['pep'] = 0 | |
for i in range(0, len(spectra2.protein_rank)): | |
index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i]) | |
index_pos2=res[res.postion.isin(index_pos)] | |
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')]+1 | |
res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i]))) | |
#re.search("\\[[^A-Z]*\\]", spectra2.peptide[1]) | |
#spec_uniq=spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates() | |
#res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]] | |
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)] | |
##############get the peptide with mods | |
for i in range(0, len(spectra2.peptide)): | |
spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2] | |
spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag] | |
#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])] | |
modslen =[] | |
for i in range(0, len(spectra2.pep_with_tag)): | |
modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])]) | |
mods_no =[] | |
for i in range(0, len(spectra2.pep_with_tag)): | |
mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])) | |
offset_mod =[] | |
for mods in mods_no: | |
mods_temp = [len(i) for i in mods] | |
offset_mod.append(mods_temp) | |
res['mod'] = 0 | |
n =0 | |
d =0 | |
test =[] | |
for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank): | |
if len(i) !=0 and i[0] >= 1: | |
n += 1 | |
#print i, j, k, l, rank | |
ptms =[] | |
for s in range(len(i)): | |
if len(i) == 1: | |
ptms.append([i[s]]) | |
break | |
elif len(i) == 2: | |
ptms.append([i[s], i[s+1]-j[s]]) | |
break | |
elif len(i) == 3: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])]) | |
break | |
elif len(i) == 4: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])]) | |
break | |
elif len(i) == 5: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])]) | |
break | |
elif len(i) == 6: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])]) | |
break | |
elif len(i) == 7: | |
ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])]) | |
break | |
ptms1 = list(chain.from_iterable(ptms)) | |
ptms2 = np.array(ptms1)+l-1 | |
print ptms2 | |
test.append(ptms2.tolist()) | |
index_mod=res[res.postion.isin(ptms2)] | |
res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')]+1 | |
#print np.array(ptms)[1:]+l-1 | |
else: | |
d+=1 | |
test.append(0) | |
spectra2['mod_pos'] = test ## add in the PTM positions into the main df | |
## merge the res with protein names | |
res2=res.merge(maxDT, on='protein_rank') | |
########## get the mutated pos if present | |
res2['mut_pos'] = 0 | |
mut_pos =[] | |
for i in range(0, len(res2.protein_name)): | |
q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i]) | |
mut_pos.append(''.join(q)) | |
new_list = [element or '0' for element in mut_pos] | |
res2.loc[:, ('mut_pos')]= new_list | |
res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int) | |
# | |
########## remove all zero occurence positions to reduce data | |
db1=res2.loc[res2['total'] !=0] | |
#### select only identified flu proteins | |
flu_mut=db1[db1.protein_name.str.contains('>HA-|>NP-|>NA-|>MA1-|>PB2-')] | |
flu_mut.reset_index(inplace =True, drop =True) | |
flu_mut.loc[:, ('flu_mut_prot')]=[flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))] | |
flu_mut.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))] | |
#### select the others proteins starting with >gi | |
non_mut = db1[db1.protein_name.str.contains('>gi')] | |
### TAKE ONLY PEPTIDE THAT HAVE SAME POSITION AS THE MUTATED POSITION | |
flu_mut1=flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos] | |
flu_mut1.sort_values(['total'], ascending=False, inplace=True) ## sort the values | |
flu_mut1.reset_index(inplace=True, drop =True) | |
flu_mut1.loc[:, ('pep_rank')] = range(1, len(flu_mut1)+1) ## make a new col called rank | |
flu_mut2 = flu_mut1.drop(['templen'], axis=1) | |
flu_mut2.reset_index(inplace=True, drop=True) | |
flu_mut2.loc[:, ('flu_mut_prot')]=[flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))] | |
flu_mut2.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))] | |
## get unmutated occurence for each mutation to calculate the ratios | |
for i in range(0, len(flu_mut2)): | |
flu_mut2.loc[i, ('unmutated_occurence')]=np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])]) | |
###take the non_mut wild type files and assign protein IDs | |
non_mut.loc[:, ('id1')] = 'NaN' | |
non_mut.reset_index(drop=True, inplace=True) | |
non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA" | |
non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP" | |
non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA" | |
non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2" | |
non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1" | |
## remove others that are not necessary | |
non_mut1=non_mut.loc[non_mut['id1'] != 'NaN'] | |
non_mut1.reset_index(drop=True, inplace=True) | |
############################################################################## to match other wild type flu peptides | |
for i in range(0, len(flu_mut2.protein_name)): | |
for j in range(0, len(non_mut1.id1)): | |
if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i] and non_mut1.postion[j] == flu_mut2.postion[i]: | |
flu_mut2.loc[i, ('unmutated_occurence')]=non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')] | |
flu_mut2.loc[:, ('Ratio_mut_vs_unmut')]=flu_mut2.total/flu_mut2.unmutated_occurence | |
flu_mut2.loc[:, ('Description')] = [re.sub(' .*$','', i) for i in flu_mut2.protein_name] | |
flu_mut2.to_csv(outfile, index=False) | |
vacc_list = [i.replace('\n', '') for i in open('vacc_list', 'r')] | |
for files in vacc_list: | |
spectra = pd.read_csv(files, header = 0, sep = ";") | |
outfile = '_'.join(files.split('_')[0:2]) + '_TRYP_May17_db.csv' | |
process_raw_spectra(spectra, outfile) | |
print outfile | |
### make a db file containing all the analysis files | |
dblist = [i.replace('\n', '') for i in open('dblist', 'r')] | |
vacc_ids=[dblist[i].split('_')[1]+str(i+1) for i in range(0, len(dblist))] | |
vacc_mut=[] | |
for i in range(0, len(vacc_ids)): | |
tempfile = pd.read_csv(dblist[i], header=0) | |
tempfile['Description'] = [re.sub('>', '', tempfile.Description[i]) for i in range(0, len(tempfile.Description))] | |
vacc_mut.append(tempfile) | |
ARP1=vacc_mut[0] | |
ARP2=vacc_mut[1] | |
ARP3=vacc_mut[2] | |
ARP4=vacc_mut[3] | |
PAN5=vacc_mut[4] | |
PAN6=vacc_mut[5] | |
PAN7=vacc_mut[6] | |
PAN8=vacc_mut[7] | |
ARP9=vacc_mut[8] | |
dbscols =[] | |
for i in range(0, len(vacc_ids)): | |
db_ids = ((vacc_ids[i]+' ') *4).split(' ')[:4] | |
print db_ids | |
db_ids[0] = db_ids[0]+'_M' | |
db_ids[1] = db_ids[1]+'_I' | |
db_ids[2] = db_ids[2]+'_P_Rank' | |
db_ids[3] = db_ids[3]+'_Ratio_M_vs_unmut' | |
dbscols.extend(db_ids) | |
## read in the database file and modify and intialize | |
db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0) | |
database=db[['Description', 'Position','MutatedAA','InitialAA']] | |
or_cols=list(database.columns.values) ## extract existing DB columsn to re_index the DB frame with new colums | |
or_cols.extend(dbscols) | |
database=database.reindex(columns=or_cols) ## reassign index | |
######################################################################## | |
for i in range(0, len(ARP1)): | |
database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_M')] = ARP1.total[i] | |
database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_I')] = ARP1.unmutated_occurence[i] | |
database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_P_Rank')] = ARP1.pep_rank[i] | |
database.loc[(database['Description'] == ARP1.Description[i]), ('ARP1_Ratio_M_vs_unmut')] = ARP1.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(ARP2)): | |
database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_M')] = ARP2.total[i] | |
database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_I')] = ARP2.unmutated_occurence[i] | |
database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_P_Rank')] = ARP2.pep_rank[i] | |
database.loc[(database['Description'] == ARP2.Description[i]), ('ARP2_Ratio_M_vs_unmut')] = ARP2.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(ARP3)): | |
database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_M')] = ARP3.total[i] | |
database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_I')] = ARP3.unmutated_occurence[i] | |
database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_P_Rank')] = ARP3.pep_rank[i] | |
database.loc[(database['Description'] == ARP3.Description[i]), ('ARP3_Ratio_M_vs_unmut')] = ARP3.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(ARP4)): | |
database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_M')] = ARP4.total[i] | |
database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_I')] = ARP4.unmutated_occurence[i] | |
database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_P_Rank')] = ARP4.pep_rank[i] | |
database.loc[(database['Description'] == ARP4.Description[i]), ('ARP4_Ratio_M_vs_unmut')] = ARP4.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(ARP9)): | |
database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_M')] = ARP9.total[i] | |
database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_I')] = ARP9.unmutated_occurence[i] | |
database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_P_Rank')] = ARP9.pep_rank[i] | |
database.loc[(database['Description'] == ARP9.Description[i]), ('ARP9_Ratio_M_vs_unmut')] = ARP9.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(PAN5)): | |
database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_M')] = PAN5.total[i] | |
database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_I')] = PAN5.unmutated_occurence[i] | |
database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_P_Rank')] = PAN5.pep_rank[i] | |
database.loc[(database['Description'] == PAN5.Description[i]), ('PAN5_Ratio_M_vs_unmut')] = PAN5.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(PAN6)): | |
database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_M')] = PAN6.total[i] | |
database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_I')] = PAN6.unmutated_occurence[i] | |
database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_P_Rank')] = PAN6.pep_rank[i] | |
database.loc[(database['Description'] == PAN6.Description[i]), ('PAN6_Ratio_M_vs_unmut')] = PAN6.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(PAN7)): | |
database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_M')] = PAN7.total[i] | |
database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_I')] = PAN7.unmutated_occurence[i] | |
database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_P_Rank')] = PAN7.pep_rank[i] | |
database.loc[(database['Description'] == PAN7.Description[i]), ('PAN7_Ratio_M_vs_unmut')] = PAN7.Ratio_mut_vs_unmut[i] | |
for i in range(0, len(PAN8)): | |
database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_M')] = PAN8.total[i] | |
database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_I')] = PAN8.unmutated_occurence[i] | |
database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_P_Rank')] = PAN8.pep_rank[i] | |
database.loc[(database['Description'] == PAN8.Description[i]), ('PAN8_Ratio_M_vs_unmut')] = PAN8.Ratio_mut_vs_unmut[i] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment