adiamb/Frameshift_1_Nucl.py

## Frameshift_1_Nucl.py
###PYTHON#############
#Purpose : Analyze frameshifts in the MS-tryspin/chemotrypsin digests
#Author : Aditya Ambati
#date : June7 2017
#update : version 1
#####import libraries
from itertools import chain
import pandas as pd
import numpy as np
%matplotlib
import re
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
################################################ define helper function ############################################
################ this function will take a list with filenames and return a MS output spectra file with trimmed columns and calculated peptide length

def peponly(pepseq):
	return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
def spectra_processing(spectra):
	dfs =[]
	for i in range(0, len(spectra)):

		temp = spectra[i]#pd.read_csv(spectra[i], header = 0, sep = ";")
		temp1 = temp[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name', '|Log Prob|', 'file_id']]

	## rename the columns
		temp1.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name', 'log_prob', 'file_id']
		temp2=temp1[~temp1['protein_name'].str.contains('Reverse')]
		temp2.reset_index(drop=True, inplace=True)

		#temp2.loc[:, ('filename')] = spectra[i]
		temp2.loc[:, ('pep_wt_tags')]=[re.findall('\..*\.', i) for i in temp2.peptide] ## take only characters between the first and the last dot character
		temp2.loc[:, ('tag_rem_pep')] = [peponly(i) for i in temp2.pep_wt_tags] ### assign to a new column the raw peptide without tags
		temp2.loc[:, ('tag_rm_len')] = [len(i) for i in temp2.tag_rem_pep] ## get the raw peptide length
		temp2.loc[:, ('templen')] = temp2.tag_rm_len + temp2.start_pos ### add the startingpos + untagged peptide length
		#temp2.loc[:, 'org_length'] = [len(i) for i in temp2.peptide]
		dfs.append(temp2)
	return(dfs)

##### read in the 1 shift file

file_list =[] ## make a file list
for line in open('vacc_names_batches_matched', 'r'):
	t1=line.replace('\n', '')
	filename=t1.split('\t')[0]
	vaccine = t1.split('\t')[1]
	batch = t1.split('\t')[2]
	out = [filename, vaccine, batch]
	file_list.append(out)

filelist=pd.DataFrame(file_list, columns=['file', 'vacc', 'batch'])
filelist.loc[:, ('file_id')]=filelist.vacc +'_TRY_'+ filelist.batch ## add in an extra colum sepcifying fileID

shiftdfs=[pd.read_excel(i, sheetname='Spectra') for i in filelist.file] ## read in the spectra files in a loop.

shiftdf1 = shiftdfs[:]
for i in range(0, len(shiftdfs)): #### add in the file and lot ID into a column
	shiftdfs[i].loc[:, ('file_id')] = filelist.file_id[i]
############# call for spectra processing fucntion
shiftdf_proc = spectra_processing(shiftdf1)


all_1shift=pd.concat(shiftdf_proc, ignore_index=True)
onlyflu=all_1shift[all_1shift.protein_name.str.contains('Influenza')]
onlyflu.reset_index(drop=True, inplace=True)
onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]


################## read in the reference files needed to match the frameshifts
reference_ids = pd.read_csv('/media/labcomp/HDD/otherprojects/MS_Analysis/Frameshift/Frameshift_list_april18.csv', header=0)
reference_ids.columns=[re.sub(' ', '', i) for i in reference_ids.columns.values]
reference_gb = reference_ids.genbankID.tolist()
reference_pbid = reference_ids.proteinID.tolist()
reference_fasta = []
for i in reference_pbid:
	Entrez.email = "ambati@stanford.edu"
	handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=i) ## make a request to the ncbi
	seq_record = SeqIO.read(handle, "fasta")
	handle.close()
	reference_fasta.append(seq_record)

#onlyflu=shiftdf1[shiftdf1.protein_name.str.contains('Influenza')]
#onlyflu.reset_index(drop=True, inplace=True)
#onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]
#onlyflumerge=pd.merge(onlyflu, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')
##################### make a df of proteinID and the sequences asscoiated with them
db_dict ={}
for i in range(0, len(reference_fasta)):
	seqid = reference_fasta[i].id
	fastaseq = reference_fasta[i].seq.tostring()
	db_dict[seqid] = fastaseq


ref_df_prots=pd.DataFrame.from_dict(db_dict.items())
ref_df_prots.columns = ['PID', 'SEQ']
########################## take only unique peptide to be tested against reference sequences
unique_peps=onlyflu.groupby(['tag_rem_pep', 'genbankID']).size()
uniques_df=pd.DataFrame(unique_peps)
uniques_df.reset_index(inplace=True)
### get a list of index where the peptide dont match
checkindexes=[]
for i in range(0, len(uniques_df.tag_rem_pep)):
	testx=ref_df_prots[ref_df_prots.SEQ.str.contains(uniques_df.tag_rem_pep[i])]
	if testx.shape[0] == 0:
		checkindexes.append(i)


unmatchedpep=uniques_df.ix[checkindexes]
unmatchedpep.reset_index(drop=True, inplace=True)
#### merge the proteinids and genbank ids
unmatchedpeps_df=pd.merge(unmatchedpep, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')

unmatchedpeps_df.loc[:, ('blast_out')] = 'NaN'
checkfastas=[]
for i in range(0, len(unmatchedpeps_df.tag_rem_pep)):
	tempseq=SeqRecord(Seq(unmatchedpeps_df.tag_rem_pep[i], IUPAC.protein), id = unmatchedpeps_df.proteinID[i], name='index'+str(unmatchedpeps_df.strainname[i]))
	checkfastas.append(tempseq)
	#SeqIO.write(tempseq, '/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/' + tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta', 'fasta')
	fasta_x=tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta'
#SeqIO.write(checkfastas, 'check.fasta', 'fasta')
#blast_hit =[]
#with open('fasta_list', 'r') as f:
	#for line in f:
		#fasta_x = line.strip()

#BLASTN_command_C =  NcbiblastpCommandline(query='/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/ADE29085.1_467.fasta',db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
#stdout_C, stderr_C = BLASTN_command_C()
	BLASTN_command_C =  NcbiblastpCommandline(query=fasta_x,db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
	stdout_C, stderr_C = BLASTN_command_C()
	print stdout_C
	#blast_hit.append(stdout_C)
	unmatchedpeps_df.loc[i, ('blast_out')] = stdout_C


pot_frames=unmatchedpeps_df[unmatchedpeps_df.blast_out == '']

pot_frames=pot_frames.sort_values([0], ascending=False)
pot_frames.reset_index(drop=True, inplace=True)
frameshifted_1=pot_frames.ix[:11, ('tag_rem_pep')] # 3 or more

### reach into main df and pull these frames out
frames_1=onlyflu[onlyflu.tag_rem_pep.isin(frameshifted_1)]
frames_1[['tag_rem_pep', 'file_id']].groupby(['tag_rem_pep', 'file_id']).size()


######################################################################  Debugging #####################################################################################################
########################### prepare for local blast make a fasta list

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

blastlist = [line.replace('\n', '') for line in open('blast_list', 'r')]
blastlist.remove('my_blast.xml') ### remove a test file from the list

n = 0
for files in blastlist:
	for record in NCBIXML.parse(open(files)):
		if record.alignments:
			#print "QUERY: %s" % record.query[:60]
			for align in record.alignments:
	 			for hsp in align.hsps:
	 				if hsp.expect > 0.5:
	 					n += 1
	 					print "MATCH: %s " % align.title[:60]
	 					print hsp.expect
	 					print hsp.query, hsp.identities

checkpepsindex =[]
n = 0
for files in blastlist:
	for record in NCBIXML.parse(open(files)):
		if len(record.alignments) == 0:
			n += 1
			print record.query, files
			checkpepsindex.append(files.split('.')[1])
			#print "QUERY: %s" % record.query[:60]
			for align in record.alignments:
	 			for hsp in align.hsps:
	 				print hsp.expect
	 				n += 1
	 				if hsp.expect < 1e-10:
	 					n += 1
	 					print "MATCH: %s " % align.title[:60]
	 					print hsp.expect


pep_frame_index=[int(i.split('_')[1]) for i in checkpepsindex]

frameshifts=unmatchedpeps_df.ix[pep_frame_index]
frameshifts.sort_values([0], ascending=False)


SeqIO.write(reference_fasta, '/home/labcomp/Desktop/frames_reference.fasta', 'fasta')

filename='fasta_list'
filelines=`cat $filename`
for line in $filelines ; do
    #blastp -out $line.xml -outfmt 5 -query $line -db "nr" -entrez_query "Homo Sapiens[ORGN]" -remote
    blastp -out $line.xml -outfmt 5 -query $line -db ~/frames_DB -num_threads 8

    #echo $line
done

selectedhu.loc[selectedhu.tag_rem_pep.str.contains('ALTLLEDWCKGMDMDPR'), ('filename')].value_counts()
	###PYTHON#############
	#Purpose : Analyze frameshifts in the MS-tryspin/chemotrypsin digests
	#Author : Aditya Ambati
	#date : June7 2017
	#update : version 1
	#####import libraries
	from itertools import chain
	import pandas as pd
	import numpy as np
	%matplotlib
	import re
	from Bio import Entrez
	from Bio import SeqIO
	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.Alphabet import IUPAC
	################################################ define helper function ############################################
	################ this function will take a list with filenames and return a MS output spectra file with trimmed columns and calculated peptide length

	def peponly(pepseq):
	return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
	def spectra_processing(spectra):
	dfs =[]
	for i in range(0, len(spectra)):

	temp = spectra[i]#pd.read_csv(spectra[i], header = 0, sep = ";")
	temp1 = temp[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name', '\|Log Prob\|', 'file_id']]

	## rename the columns
	temp1.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name', 'log_prob', 'file_id']
	temp2=temp1[~temp1['protein_name'].str.contains('Reverse')]
	temp2.reset_index(drop=True, inplace=True)

	#temp2.loc[:, ('filename')] = spectra[i]
	temp2.loc[:, ('pep_wt_tags')]=[re.findall('\..*\.', i) for i in temp2.peptide] ## take only characters between the first and the last dot character
	temp2.loc[:, ('tag_rem_pep')] = [peponly(i) for i in temp2.pep_wt_tags] ### assign to a new column the raw peptide without tags
	temp2.loc[:, ('tag_rm_len')] = [len(i) for i in temp2.tag_rem_pep] ## get the raw peptide length
	temp2.loc[:, ('templen')] = temp2.tag_rm_len + temp2.start_pos ### add the startingpos + untagged peptide length
	#temp2.loc[:, 'org_length'] = [len(i) for i in temp2.peptide]
	dfs.append(temp2)
	return(dfs)

	##### read in the 1 shift file

	file_list =[] ## make a file list
	for line in open('vacc_names_batches_matched', 'r'):
	t1=line.replace('\n', '')
	filename=t1.split('\t')[0]
	vaccine = t1.split('\t')[1]
	batch = t1.split('\t')[2]
	out = [filename, vaccine, batch]
	file_list.append(out)

	filelist=pd.DataFrame(file_list, columns=['file', 'vacc', 'batch'])
	filelist.loc[:, ('file_id')]=filelist.vacc +'_TRY_'+ filelist.batch ## add in an extra colum sepcifying fileID

	shiftdfs=[pd.read_excel(i, sheetname='Spectra') for i in filelist.file] ## read in the spectra files in a loop.

	shiftdf1 = shiftdfs[:]
	for i in range(0, len(shiftdfs)): #### add in the file and lot ID into a column
	shiftdfs[i].loc[:, ('file_id')] = filelist.file_id[i]
	############# call for spectra processing fucntion
	shiftdf_proc = spectra_processing(shiftdf1)






	all_1shift=pd.concat(shiftdf_proc, ignore_index=True)
	onlyflu=all_1shift[all_1shift.protein_name.str.contains('Influenza')]
	onlyflu.reset_index(drop=True, inplace=True)
	onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]


	################## read in the reference files needed to match the frameshifts
	reference_ids = pd.read_csv('/media/labcomp/HDD/otherprojects/MS_Analysis/Frameshift/Frameshift_list_april18.csv', header=0)
	reference_ids.columns=[re.sub(' ', '', i) for i in reference_ids.columns.values]
	reference_gb = reference_ids.genbankID.tolist()
	reference_pbid = reference_ids.proteinID.tolist()
	reference_fasta = []
	for i in reference_pbid:
	Entrez.email = "ambati@stanford.edu"
	handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=i) ## make a request to the ncbi
	seq_record = SeqIO.read(handle, "fasta")
	handle.close()
	reference_fasta.append(seq_record)

	#onlyflu=shiftdf1[shiftdf1.protein_name.str.contains('Influenza')]
	#onlyflu.reset_index(drop=True, inplace=True)
	#onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]
	#onlyflumerge=pd.merge(onlyflu, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')
	##################### make a df of proteinID and the sequences asscoiated with them
	db_dict ={}
	for i in range(0, len(reference_fasta)):
	seqid = reference_fasta[i].id
	fastaseq = reference_fasta[i].seq.tostring()
	db_dict[seqid] = fastaseq


	ref_df_prots=pd.DataFrame.from_dict(db_dict.items())
	ref_df_prots.columns = ['PID', 'SEQ']
	########################## take only unique peptide to be tested against reference sequences
	unique_peps=onlyflu.groupby(['tag_rem_pep', 'genbankID']).size()
	uniques_df=pd.DataFrame(unique_peps)
	uniques_df.reset_index(inplace=True)
	### get a list of index where the peptide dont match
	checkindexes=[]
	for i in range(0, len(uniques_df.tag_rem_pep)):
	testx=ref_df_prots[ref_df_prots.SEQ.str.contains(uniques_df.tag_rem_pep[i])]
	if testx.shape[0] == 0:
	checkindexes.append(i)





	unmatchedpep=uniques_df.ix[checkindexes]
	unmatchedpep.reset_index(drop=True, inplace=True)
	#### merge the proteinids and genbank ids
	unmatchedpeps_df=pd.merge(unmatchedpep, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')

	unmatchedpeps_df.loc[:, ('blast_out')] = 'NaN'
	checkfastas=[]
	for i in range(0, len(unmatchedpeps_df.tag_rem_pep)):
	tempseq=SeqRecord(Seq(unmatchedpeps_df.tag_rem_pep[i], IUPAC.protein), id = unmatchedpeps_df.proteinID[i], name='index'+str(unmatchedpeps_df.strainname[i]))
	checkfastas.append(tempseq)
	#SeqIO.write(tempseq, '/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/' + tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta', 'fasta')
	fasta_x=tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta'
	#SeqIO.write(checkfastas, 'check.fasta', 'fasta')
	#blast_hit =[]
	#with open('fasta_list', 'r') as f:
	#for line in f:
	#fasta_x = line.strip()

	#BLASTN_command_C = NcbiblastpCommandline(query='/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/ADE29085.1_467.fasta',db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
	#stdout_C, stderr_C = BLASTN_command_C()
	BLASTN_command_C = NcbiblastpCommandline(query=fasta_x,db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
	stdout_C, stderr_C = BLASTN_command_C()
	print stdout_C
	#blast_hit.append(stdout_C)
	unmatchedpeps_df.loc[i, ('blast_out')] = stdout_C


	pot_frames=unmatchedpeps_df[unmatchedpeps_df.blast_out == '']

	pot_frames=pot_frames.sort_values([0], ascending=False)
	pot_frames.reset_index(drop=True, inplace=True)
	frameshifted_1=pot_frames.ix[:11, ('tag_rem_pep')] # 3 or more

	### reach into main df and pull these frames out
	frames_1=onlyflu[onlyflu.tag_rem_pep.isin(frameshifted_1)]
	frames_1[['tag_rem_pep', 'file_id']].groupby(['tag_rem_pep', 'file_id']).size()











	###################################################################### Debugging #####################################################################################################
	########################### prepare for local blast make a fasta list

	from Bio.Seq import Seq
	from Bio.SeqRecord import SeqRecord
	from Bio.Alphabet import IUPAC

	blastlist = [line.replace('\n', '') for line in open('blast_list', 'r')]
	blastlist.remove('my_blast.xml') ### remove a test file from the list

	n = 0
	for files in blastlist:
	for record in NCBIXML.parse(open(files)):
	if record.alignments:
	#print "QUERY: %s" % record.query[:60]
	for align in record.alignments:
	for hsp in align.hsps:
	if hsp.expect > 0.5:
	n += 1
	print "MATCH: %s " % align.title[:60]
	print hsp.expect
	print hsp.query, hsp.identities

	checkpepsindex =[]
	n = 0
	for files in blastlist:
	for record in NCBIXML.parse(open(files)):
	if len(record.alignments) == 0:
	n += 1
	print record.query, files
	checkpepsindex.append(files.split('.')[1])
	#print "QUERY: %s" % record.query[:60]
	for align in record.alignments:
	for hsp in align.hsps:
	print hsp.expect
	n += 1
	if hsp.expect < 1e-10:
	n += 1
	print "MATCH: %s " % align.title[:60]
	print hsp.expect


	pep_frame_index=[int(i.split('_')[1]) for i in checkpepsindex]

	frameshifts=unmatchedpeps_df.ix[pep_frame_index]
	frameshifts.sort_values([0], ascending=False)







	SeqIO.write(reference_fasta, '/home/labcomp/Desktop/frames_reference.fasta', 'fasta')

	filename='fasta_list'
	filelines=`cat $filename`
	for line in $filelines ; do
	#blastp -out $line.xml -outfmt 5 -query $line -db "nr" -entrez_query "Homo Sapiens[ORGN]" -remote
	blastp -out $line.xml -outfmt 5 -query $line -db ~/frames_DB -num_threads 8

	#echo $line
	done

	selectedhu.loc[selectedhu.tag_rem_pep.str.contains('ALTLLEDWCKGMDMDPR'), ('filename')].value_counts()