adiamb/MS_analysis_pythonimplementation_code_compile_may16.py

## MS_analysis_pythonimplementation_code_compile_may16.py
###PYTHON#############
#Purpose : Replication of MS spectra analysis
#Author : Aditya Ambati
#date : May 14 2017
#update : version 1
#####import libraries


from itertools import chain
import pandas as pd
import numpy as np
%matplotlib
import re
spectra = pd.read_csv('AFLSA208A_PAN_TRYPSIN.csv', header = 0, sep = ";") ## read in the file
### take only selected columns
##
#array(['Query #:z', 'Protein\nRank','Peptide\n< ProteinMetrics Confidential >', 'Glycans\nNHFAGNa','Modification Type(s)', 'Observed\nm/z', 'z', 'Observed\n(M+H)',
       #'Calc.\nmass (M+H)', 'Off-by-x\nerror', 'Mass error\n(ppm)',
       #'Starting\nposition', 'Cleavage', 'Score', 'Delta', 'Delta\nMod',
       #'|Log Prob|', '# of unique\npeptides', 'Protein Name',
      # 'Protein\nDB number', 'Comment', 'Scan #', 'Scan Time'], dtype=object)
## subset the df
spectra2=spectra[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name']]
## rename the columns
spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name']

## define a function to return only alphabets
def peponly(pepseq):
	return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide] ### assign to a new column the raw peptide
spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep]

untagged=[]
for i in range(len(spectra2.pep)):
	line = spectra2.pep[i]
	untagged.append(line[1:spectra2.len1[i]]) ## remove first and last character

spectra2.loc[:, ('tag_rem_pep')] = untagged  ## add this into the df
spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep]
spectra2.loc[:, ('tag_rm_len')]=spectra2['tag_rm_len'].astype(int)
spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos ### add the startingpos + untagged peptide length
spectra2.loc[:, 'org_length'] = [len(i) for i in spectra2.peptide] ## get the orginal length of the ms raw peptide


import numpy
maxDT=spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen':'max'}) ## group by the protein  name and get max length of each uniq protein
maxDT = maxDT.reset_index() ## reset the index

emp =[]
for q, w, in zip(maxDT.protein_rank, maxDT.templen):
	z=pd.DataFrame(np.repeat(q, w), np.arange(1, w+1))
	z = z.reset_index()
	emp.append(z)
res = pd.concat(emp)
res.columns = ['postion', 'protein_rank']
res=res.reset_index()
res.drop('index', axis =1, inplace=True)

### make a db of every peptide in the ms file
res['total'] = 0
res['pep'] = 0
for i in range(0, len(spectra2.protein_rank)):
	index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i])
	index_pos2=res[res.postion.isin(index_pos)]
	res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')]+1
	res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i])))

#re.search("\\[[^A-Z]*\\]", spectra2.peptide[1])

#spec_uniq=spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates()


#res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]

#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)]

##############get the peptide with mods
for i in range(0, len(spectra2.peptide)):
	spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2]
spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag]

#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])]
modslen =[]
for i in range(0, len(spectra2.pep_with_tag)):
	modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])])

mods_no =[]
for i in range(0, len(spectra2.pep_with_tag)):
	mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i]))

offset_mod =[]
for mods in mods_no:
	mods_temp = [len(i) for i in mods]
	offset_mod.append(mods_temp)


res['mod'] = 0
n =0
d =0
test =[]
for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank):
	if len(i) !=0 and i[0] >= 1:
		n += 1
		#print i, j, k, l, rank
		ptms =[]
		for s in range(len(i)):
			if len(i) == 1:
				ptms.append([i[s]])
				break
			elif len(i) == 2:
				ptms.append([i[s], i[s+1]-j[s]])
				break
			elif len(i) == 3:
				ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])])
				break
			elif len(i) == 4:
				ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])])
				break
			elif len(i) == 5:
				ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])])
				break
			elif len(i) == 6:
				ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])])
				break
			elif len(i) == 7:
				ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])])
				break

		ptms1 = list(chain.from_iterable(ptms))
		ptms2 = np.array(ptms1)+l-1
		print ptms2
		test.append(ptms2.tolist())
		index_mod=res[res.postion.isin(ptms2)]
		res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')]+1
		#print np.array(ptms)[1:]+l-1
	else:
		d+=1
		test.append(0)

spectra2['mod_pos'] = test ## add in the PTM positions into the main df

## merge the res with protein names
res2=res.merge(maxDT, on='protein_rank')
########## get the mutated pos if present
res2['mut_pos'] = 0
mut_pos =[]
for i in range(0, len(res2.protein_name)):
	q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
	mut_pos.append(''.join(q))

new_list = [element or '0' for element in mut_pos]
res2.loc[:, ('mut_pos')]= new_list
res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int)
#
########## remove all zero occurence positions to reduce data
db1=res2.loc[res2['total'] !=0]
#### select only identified flu proteins
flu_mut=db1[db1.protein_name.str.contains('>HA-|>NP-|>NA-|>MA1-|>PB2-')]
flu_mut.reset_index(inplace =True, drop =True)
flu_mut.loc[:, ('flu_mut_prot')]=[flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))]
flu_mut.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))]

#### select the others proteins starting with >gi
non_mut = db1[db1.protein_name.str.contains('>gi')]


### TAKE ONLY PEPTIDE THAT HAVE SAME POSITION AS THE MUTATED POSITION
flu_mut1=flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos]
flu_mut1.sort_values(['total'], ascending=False, inplace=True) ## sort the values
flu_mut1.reset_index(inplace=True, drop =True)
flu_mut1.loc[:, ('pep_rank')] = range(1, len(flu_mut1)+1) ## make a new col called rank
flu_mut2 = flu_mut1.drop(['templen'], axis=1)
flu_mut2.reset_index(inplace=True, drop=True)
flu_mut2.loc[:, ('flu_mut_prot')]=[flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))]
flu_mut2.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))]

## get unmutated occurence for each mutation to calculate the ratios

for i in range(0, len(flu_mut2)):
	flu_mut2.loc[i, ('unmutated_occurence')]=np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])])

###take the non_mut wild type files and assign protein IDs
non_mut.loc[:, ('id1')] = 'NaN'
non_mut.reset_index(drop=True, inplace=True)
non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA"
non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP"
non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA"
non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2"
non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1"
## remove others that are not necessary
non_mut1=non_mut.loc[non_mut['id1'] != 'NaN']
non_mut1.reset_index(drop=True, inplace=True)
############################################################################## to match other wild type flu peptides
for i in range(0, len(flu_mut2.protein_name)):
	for j in range(0, len(non_mut1.id1)):
		if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i] and non_mut1.postion[j] == flu_mut2.postion[i]:
			 flu_mut2.loc[i, ('unmutated_occurence')]=non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')]

flu_mut2.loc[:, ('Ratio_mut_vs_unmut')]=flu_mut2.total/flu_mut2.unmutated_occurence
flu_mut2.loc[:, ('Description')] = [re.sub(' .*$','', i) for i in flu_mut2.protein_name]
flu_mut2.to_csv('AFLSA208A_PAN_TRY_DB.csv', index=False)


### match to the database


flu_mut = pd.read_csv('AFLSA096AA_PAN_TRYP_May17_db.csv', header=0)
flu_mut['Description'] = [re.sub('>', '', flu_mut.Description[i]) for i in range(0, len(flu_mut.Description))]
## read in the database file
db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0)
db.drop(['Unnamed: 0'], axis=1, inplace=True)
db.columns=[re.sub('[.]', '_', db.columns.values[i]) for i in range(0, len(db.columns.values))]
########################################################################
for i in range(0, len(flu_mut.Description)):
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_M')] = flu_mut.total[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_I')] = flu_mut.unmutated_occurence[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_Rank')] = flu_mut.pep_rank[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('Ratio_mut_vs_unmut')] = flu_mut.Ratio_mut_vs_unmut[i]


################### boolean indexing
res2.loc[(res2['protein_rank'] == 4)]
n=0
for lines in spectra2.tag_rem_pep:
	a=re.findall('KTSSWP*', lines)
	n += 1
	if len(a) > 0:
		break

res2['mut_pos'] = 0
mut_pos =[]
for i in range(0, len(res2.protein_name)):
	q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
	mut_pos.append(''.join(q))

new_list = [element or '0' for element in mut_pos]
res2.loc['mut_pos']= new_list


res2['mut_pos'] = res2['mut_pos'].astype(int)
res3=res2[res2['postion'] == res2.mut_pos]

res3.sort_values(['total'], ascending=False, inplace=True)

for i in maxDT.protein_name:
	print re.findall('hemagglutinin', i)

res4=res3.loc[(res3['total']) !=0]

[1, 14, 24]
1		 9    14      24		9
C[+71.037]DNTC[+71.037]M[+15.995]ESVK
12345678910
CDNTCMESVK


for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
	if len(i) > 1 and i[0] >= 1:
		print i, j, k
		for nos in range(0, len(i)-1):

			print i[nos],
			print i[nos+1] - j[nos]
			elif len(i) >2:
				#print i[nos]
				#print i[nos+1] - j[nos]
				print i[nos+2] - np.sum(j[nos:nos+1])

			print i[nos+2] - j[nos+nos+1]

for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
	if len(i) > 1 and i[0] >= 1:

i = [6, 18, 27, 42]
j = [8, 8, 9, 9]
	###PYTHON#############
	#Purpose : Replication of MS spectra analysis
	#Author : Aditya Ambati
	#date : May 14 2017
	#update : version 1
	#####import libraries


	from itertools import chain
	import pandas as pd
	import numpy as np
	%matplotlib
	import re
	spectra = pd.read_csv('AFLSA208A_PAN_TRYPSIN.csv', header = 0, sep = ";") ## read in the file
	### take only selected columns
	##
	#array(['Query #:z', 'Protein\nRank','Peptide\n< ProteinMetrics Confidential >', 'Glycans\nNHFAGNa','Modification Type(s)', 'Observed\nm/z', 'z', 'Observed\n(M+H)',
	#'Calc.\nmass (M+H)', 'Off-by-x\nerror', 'Mass error\n(ppm)',
	#'Starting\nposition', 'Cleavage', 'Score', 'Delta', 'Delta\nMod',
	#'\|Log Prob\|', '# of unique\npeptides', 'Protein Name',
	# 'Protein\nDB number', 'Comment', 'Scan #', 'Scan Time'], dtype=object)
	## subset the df
	spectra2=spectra[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name']]
	## rename the columns
	spectra2.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name']

	## define a function to return only alphabets
	def peponly(pepseq):
	return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
	spectra2.loc[:, ('pep')] = [peponly(i) for i in spectra2.peptide] ### assign to a new column the raw peptide
	spectra2.loc[:, ('len1')] = [len(i)-1 for i in spectra2.pep]

	untagged=[]
	for i in range(len(spectra2.pep)):
	line = spectra2.pep[i]
	untagged.append(line[1:spectra2.len1[i]]) ## remove first and last character

	spectra2.loc[:, ('tag_rem_pep')] = untagged ## add this into the df
	spectra2.loc[:, ('tag_rm_len')] = [len(i) for i in spectra2.tag_rem_pep]
	spectra2.loc[:, ('tag_rm_len')]=spectra2['tag_rm_len'].astype(int)
	spectra2.loc[:, ('templen')] = spectra2.tag_rm_len + spectra2.start_pos ### add the startingpos + untagged peptide length
	spectra2.loc[:, 'org_length'] = [len(i) for i in spectra2.peptide] ## get the orginal length of the ms raw peptide


	import numpy
	maxDT=spectra2.groupby(['protein_rank', 'protein_name']).agg({'templen':'max'}) ## group by the protein name and get max length of each uniq protein
	maxDT = maxDT.reset_index() ## reset the index

	emp =[]
	for q, w, in zip(maxDT.protein_rank, maxDT.templen):
	z=pd.DataFrame(np.repeat(q, w), np.arange(1, w+1))
	z = z.reset_index()
	emp.append(z)
	res = pd.concat(emp)
	res.columns = ['postion', 'protein_rank']
	res=res.reset_index()
	res.drop('index', axis =1, inplace=True)

	### make a db of every peptide in the ms file
	res['total'] = 0
	res['pep'] = 0
	for i in range(0, len(spectra2.protein_rank)):
	index_pos = np.arange(spectra2.start_pos[i], spectra2.templen[i])
	index_pos2=res[res.postion.isin(index_pos)]
	res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')] = res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('total')]+1
	res.loc[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]], ('pep')] = list(("".join(spectra2.tag_rem_pep[i])))

	#re.search("\\[[^A-Z]*\\]", spectra2.peptide[1])

	#spec_uniq=spectra2[['protein_rank', 'tag_rem_pep', 'protein_name', 'start_pos', 'tag_rm_len']].drop_duplicates()


	#res.loc[:, total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]] = res.total[index_pos2.index[index_pos2.protein_rank == spectra2.protein_rank[i]]]

	#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", q)]

	##############get the peptide with mods
	for i in range(0, len(spectra2.peptide)):
	spectra2.loc[i, ('pep_with_tag')] = spectra2.peptide[i][2:spectra2.org_length[i]-2]
	spectra2.loc[:, 'after_tag_len'] = [len(i) for i in spectra2.pep_with_tag]

	#[m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[1145])]
	modslen =[]
	for i in range(0, len(spectra2.pep_with_tag)):
	modslen.append([m.start() for m in re.finditer("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i])])

	mods_no =[]
	for i in range(0, len(spectra2.pep_with_tag)):
	mods_no.append(re.findall("\\[[^A-Z]*\\]", spectra2.pep_with_tag[i]))

	offset_mod =[]
	for mods in mods_no:
	mods_temp = [len(i) for i in mods]
	offset_mod.append(mods_temp)




	res['mod'] = 0
	n =0
	d =0
	test =[]
	for i, j, k, l, rank in zip(modslen, offset_mod, spectra2.pep_with_tag, spectra2.start_pos, spectra2.protein_rank):
	if len(i) !=0 and i[0] >= 1:
	n += 1
	#print i, j, k, l, rank
	ptms =[]
	for s in range(len(i)):
	if len(i) == 1:
	ptms.append([i[s]])
	break
	elif len(i) == 2:
	ptms.append([i[s], i[s+1]-j[s]])
	break
	elif len(i) == 3:
	ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2])])
	break
	elif len(i) == 4:
	ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3])])
	break
	elif len(i) == 5:
	ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4])])
	break
	elif len(i) == 6:
	ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5])])
	break
	elif len(i) == 7:
	ptms.append([i[s], i[s+1]-j[s], i[s+2]-np.sum(j[s:s+2]), i[s+3]-np.sum(j[s:s+3]), i[s+4]-np.sum(j[s:s+4]), i[s+5]-np.sum(j[s:s+5]), i[s+6]-np.sum(j[s:s+6])])
	break

	ptms1 = list(chain.from_iterable(ptms))
	ptms2 = np.array(ptms1)+l-1
	print ptms2
	test.append(ptms2.tolist())
	index_mod=res[res.postion.isin(ptms2)]
	res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')] = res.loc[index_mod.index[index_mod.protein_rank == rank], ('mod')]+1
	#print np.array(ptms)[1:]+l-1
	else:
	d+=1
	test.append(0)

	spectra2['mod_pos'] = test ## add in the PTM positions into the main df

	## merge the res with protein names
	res2=res.merge(maxDT, on='protein_rank')
	########## get the mutated pos if present
	res2['mut_pos'] = 0
	mut_pos =[]
	for i in range(0, len(res2.protein_name)):
	q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
	mut_pos.append(''.join(q))

	new_list = [element or '0' for element in mut_pos]
	res2.loc[:, ('mut_pos')]= new_list
	res2.loc[:, ('mut_pos')] = res2['mut_pos'].astype(int)
	#
	########## remove all zero occurence positions to reduce data
	db1=res2.loc[res2['total'] !=0]
	#### select only identified flu proteins
	flu_mut=db1[db1.protein_name.str.contains('>HA-\|>NP-\|>NA-\|>MA1-\|>PB2-')]
	flu_mut.reset_index(inplace =True, drop =True)
	flu_mut.loc[:, ('flu_mut_prot')]=[flu_mut.protein_name[i][0:4] for i in range(0, len(flu_mut.protein_name))]
	flu_mut.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut.protein_name[i]) for i in range(0, len(flu_mut.protein_name))]

	#### select the others proteins starting with >gi
	non_mut = db1[db1.protein_name.str.contains('>gi')]


	### TAKE ONLY PEPTIDE THAT HAVE SAME POSITION AS THE MUTATED POSITION
	flu_mut1=flu_mut.loc[flu_mut['postion'] == flu_mut.mut_pos]
	flu_mut1.sort_values(['total'], ascending=False, inplace=True) ## sort the values
	flu_mut1.reset_index(inplace=True, drop =True)
	flu_mut1.loc[:, ('pep_rank')] = range(1, len(flu_mut1)+1) ## make a new col called rank
	flu_mut2 = flu_mut1.drop(['templen'], axis=1)
	flu_mut2.reset_index(inplace=True, drop=True)
	flu_mut2.loc[:, ('flu_mut_prot')]=[flu_mut2.protein_name[i][0:4] for i in range(0, len(flu_mut2.protein_name))]
	flu_mut2.loc[:, ('protein_name_pos')]= [re.sub(".+\\(([0-9]+)\\).+","\\1", flu_mut2.protein_name[i]) for i in range(0, len(flu_mut2.protein_name))]

	## get unmutated occurence for each mutation to calculate the ratios

	for i in range(0, len(flu_mut2)):
	flu_mut2.loc[i, ('unmutated_occurence')]=np.sum(flu_mut.total[(flu_mut.flu_mut_prot == flu_mut2.flu_mut_prot[i]) & (flu_mut.protein_name_pos != flu_mut2.protein_name_pos[i]) & (flu_mut.postion == flu_mut2.postion[i])])

	###take the non_mut wild type files and assign protein IDs
	non_mut.loc[:, ('id1')] = 'NaN'
	non_mut.reset_index(drop=True, inplace=True)
	non_mut.loc[non_mut.protein_name.str.contains('hemagglutinin'), ('id1')] = ">HA"
	non_mut.loc[non_mut.protein_name.str.contains('nucleocapsid protein'), ('id1')] = ">NP"
	non_mut.loc[non_mut.protein_name.str.contains('neuraminidase'), ('id1')] = ">NA"
	non_mut.loc[non_mut.protein_name.str.contains('polymerase PB2'), ('id1')] = ">PB2"
	non_mut.loc[non_mut.protein_name.str.contains('matrix protein 1'), ('id1')] = ">MA1"
	## remove others that are not necessary
	non_mut1=non_mut.loc[non_mut['id1'] != 'NaN']
	non_mut1.reset_index(drop=True, inplace=True)
	############################################################################## to match other wild type flu peptides
	for i in range(0, len(flu_mut2.protein_name)):
	for j in range(0, len(non_mut1.id1)):
	if non_mut1.id1[j] == flu_mut2.flu_mut_prot[i] and non_mut1.postion[j] == flu_mut2.postion[i]:
	flu_mut2.loc[i, ('unmutated_occurence')]=non_mut1.loc[j, ('total')] + flu_mut2.loc[i, ('unmutated_occurence')]

	flu_mut2.loc[:, ('Ratio_mut_vs_unmut')]=flu_mut2.total/flu_mut2.unmutated_occurence
	flu_mut2.loc[:, ('Description')] = [re.sub(' .*$','', i) for i in flu_mut2.protein_name]
	flu_mut2.to_csv('AFLSA208A_PAN_TRY_DB.csv', index=False)




	### match to the database


	flu_mut = pd.read_csv('AFLSA096AA_PAN_TRYP_May17_db.csv', header=0)
	flu_mut['Description'] = [re.sub('>', '', flu_mut.Description[i]) for i in range(0, len(flu_mut.Description))]
	## read in the database file
	db = pd.read_csv('/home/labcomp/Desktop/databasefinale.csv', header=0)
	db.drop(['Unnamed: 0'], axis=1, inplace=True)
	db.columns=[re.sub('[.]', '_', db.columns.values[i]) for i in range(0, len(db.columns.values))]
	########################################################################
	for i in range(0, len(flu_mut.Description)):
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_M')] = flu_mut.total[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_I')] = flu_mut.unmutated_occurence[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('T_1_Rank')] = flu_mut.pep_rank[i]
	db.loc[(db['Description'] == flu_mut.Description[i]), ('Ratio_mut_vs_unmut')] = flu_mut.Ratio_mut_vs_unmut[i]








	################### boolean indexing
	res2.loc[(res2['protein_rank'] == 4)]
	n=0
	for lines in spectra2.tag_rem_pep:
	a=re.findall('KTSSWP*', lines)
	n += 1
	if len(a) > 0:
	break

	res2['mut_pos'] = 0
	mut_pos =[]
	for i in range(0, len(res2.protein_name)):
	q=re.findall(".+\\(([0-9]+)\\).+", res2.protein_name[i])
	mut_pos.append(''.join(q))

	new_list = [element or '0' for element in mut_pos]
	res2.loc['mut_pos']= new_list


	res2['mut_pos'] = res2['mut_pos'].astype(int)
	res3=res2[res2['postion'] == res2.mut_pos]

	res3.sort_values(['total'], ascending=False, inplace=True)

	for i in maxDT.protein_name:
	print re.findall('hemagglutinin', i)

	res4=res3.loc[(res3['total']) !=0]

	[1, 14, 24]
	1 9 14 24 9
	C[+71.037]DNTC[+71.037]M[+15.995]ESVK
	12345678910
	CDNTCMESVK



	for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
	if len(i) > 1 and i[0] >= 1:
	print i, j, k
	for nos in range(0, len(i)-1):

	print i[nos],
	print i[nos+1] - j[nos]
	elif len(i) >2:
	#print i[nos]
	#print i[nos+1] - j[nos]
	print i[nos+2] - np.sum(j[nos:nos+1])

	print i[nos+2] - j[nos+nos+1]

	for i, j, k in zip(modslen, offset_mod, spectra2.pep_with_tag):
	if len(i) > 1 and i[0] >= 1:

	i = [6, 18, 27, 42]
	j = [8, 8, 9, 9]