Skip to content

Instantly share code, notes, and snippets.

@adiamb
Created June 30, 2017 17:38
Show Gist options
  • Save adiamb/9c6b821b3694226ce9afc7ec2ae5392a to your computer and use it in GitHub Desktop.
Save adiamb/9c6b821b3694226ce9afc7ec2ae5392a to your computer and use it in GitHub Desktop.
Frameshift python script june 30 2017
###PYTHON#############
#Purpose : Analyze frameshifts in the MS-tryspin/chemotrypsin digests
#Author : Aditya Ambati
#date : June7 2017
#update : version 1
#####import libraries
from itertools import chain
import pandas as pd
import numpy as np
%matplotlib
import re
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
################################################ define helper function ############################################
################ this function will take a list with filenames and return a MS output spectra file with trimmed columns and calculated peptide length
def peponly(pepseq):
return(re.sub(pattern='[^A-Z]*',repl='', string=pepseq))
def spectra_processing(spectra):
dfs =[]
for i in range(0, len(spectra)):
temp = spectra[i]#pd.read_csv(spectra[i], header = 0, sep = ";")
temp1 = temp[['Protein\nRank','Peptide\n< ProteinMetrics Confidential >','Modification Type(s)', 'Starting\nposition', 'Protein Name', '|Log Prob|', 'file_id']]
## rename the columns
temp1.columns = ['protein_rank', 'peptide', 'modtype', 'start_pos', 'protein_name', 'log_prob', 'file_id']
temp2=temp1[~temp1['protein_name'].str.contains('Reverse')]
temp2.reset_index(drop=True, inplace=True)
#temp2.loc[:, ('filename')] = spectra[i]
temp2.loc[:, ('pep_wt_tags')]=[re.findall('\..*\.', i) for i in temp2.peptide] ## take only characters between the first and the last dot character
temp2.loc[:, ('tag_rem_pep')] = [peponly(i) for i in temp2.pep_wt_tags] ### assign to a new column the raw peptide without tags
temp2.loc[:, ('tag_rm_len')] = [len(i) for i in temp2.tag_rem_pep] ## get the raw peptide length
temp2.loc[:, ('templen')] = temp2.tag_rm_len + temp2.start_pos ### add the startingpos + untagged peptide length
#temp2.loc[:, 'org_length'] = [len(i) for i in temp2.peptide]
dfs.append(temp2)
return(dfs)
##### read in the 1 shift file
file_list =[] ## make a file list
for line in open('vacc_names_batches_matched', 'r'):
t1=line.replace('\n', '')
filename=t1.split('\t')[0]
vaccine = t1.split('\t')[1]
batch = t1.split('\t')[2]
out = [filename, vaccine, batch]
file_list.append(out)
filelist=pd.DataFrame(file_list, columns=['file', 'vacc', 'batch'])
filelist.loc[:, ('file_id')]=filelist.vacc +'_TRY_'+ filelist.batch ## add in an extra colum sepcifying fileID
shiftdfs=[pd.read_excel(i, sheetname='Spectra') for i in filelist.file] ## read in the spectra files in a loop.
shiftdf1 = shiftdfs[:]
for i in range(0, len(shiftdfs)): #### add in the file and lot ID into a column
shiftdfs[i].loc[:, ('file_id')] = filelist.file_id[i]
############# call for spectra processing fucntion
shiftdf_proc = spectra_processing(shiftdf1)
all_1shift=pd.concat(shiftdf_proc, ignore_index=True)
onlyflu=all_1shift[all_1shift.protein_name.str.contains('Influenza')]
onlyflu.reset_index(drop=True, inplace=True)
onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]
################## read in the reference files needed to match the frameshifts
reference_ids = pd.read_csv('/media/labcomp/HDD/otherprojects/MS_Analysis/Frameshift/Frameshift_list_april18.csv', header=0)
reference_ids.columns=[re.sub(' ', '', i) for i in reference_ids.columns.values]
reference_gb = reference_ids.genbankID.tolist()
reference_pbid = reference_ids.proteinID.tolist()
reference_fasta = []
for i in reference_pbid:
Entrez.email = "ambati@stanford.edu"
handle = Entrez.efetch(db="nucleotide", rettype="fasta", retmode="text", id=i) ## make a request to the ncbi
seq_record = SeqIO.read(handle, "fasta")
handle.close()
reference_fasta.append(seq_record)
#onlyflu=shiftdf1[shiftdf1.protein_name.str.contains('Influenza')]
#onlyflu.reset_index(drop=True, inplace=True)
#onlyflu.loc[:, ('genbankID')]=[re.sub('>', '', i.split(' ')[0]) for i in onlyflu.protein_name]
#onlyflumerge=pd.merge(onlyflu, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')
##################### make a df of proteinID and the sequences asscoiated with them
db_dict ={}
for i in range(0, len(reference_fasta)):
seqid = reference_fasta[i].id
fastaseq = reference_fasta[i].seq.tostring()
db_dict[seqid] = fastaseq
ref_df_prots=pd.DataFrame.from_dict(db_dict.items())
ref_df_prots.columns = ['PID', 'SEQ']
########################## take only unique peptide to be tested against reference sequences
unique_peps=onlyflu.groupby(['tag_rem_pep', 'genbankID']).size()
uniques_df=pd.DataFrame(unique_peps)
uniques_df.reset_index(inplace=True)
### get a list of index where the peptide dont match
checkindexes=[]
for i in range(0, len(uniques_df.tag_rem_pep)):
testx=ref_df_prots[ref_df_prots.SEQ.str.contains(uniques_df.tag_rem_pep[i])]
if testx.shape[0] == 0:
checkindexes.append(i)
unmatchedpep=uniques_df.ix[checkindexes]
unmatchedpep.reset_index(drop=True, inplace=True)
#### merge the proteinids and genbank ids
unmatchedpeps_df=pd.merge(unmatchedpep, reference_ids, left_on='genbankID', right_on='genbankID', how='inner')
unmatchedpeps_df.loc[:, ('blast_out')] = 'NaN'
checkfastas=[]
for i in range(0, len(unmatchedpeps_df.tag_rem_pep)):
tempseq=SeqRecord(Seq(unmatchedpeps_df.tag_rem_pep[i], IUPAC.protein), id = unmatchedpeps_df.proteinID[i], name='index'+str(unmatchedpeps_df.strainname[i]))
checkfastas.append(tempseq)
#SeqIO.write(tempseq, '/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/' + tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta', 'fasta')
fasta_x=tempseq.id.replace(' ', '')+'_'+str(i)+'.fasta'
#SeqIO.write(checkfastas, 'check.fasta', 'fasta')
#blast_hit =[]
#with open('fasta_list', 'r') as f:
#for line in f:
#fasta_x = line.strip()
#BLASTN_command_C = NcbiblastpCommandline(query='/home/labcomp/Desktop/Vaccine_MS_May16/1_shiftMS/Blast_files/ADE29085.1_467.fasta',db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
#stdout_C, stderr_C = BLASTN_command_C()
BLASTN_command_C = NcbiblastpCommandline(query=fasta_x,db='/home/labcomp/frames_DB',num_threads=8,outfmt='"6 qseqid sseqid evalue bitscore length pident nident qframe sframe gaps sstart send sseq sstrand qstart qend qseq"',max_target_seqs=1)
stdout_C, stderr_C = BLASTN_command_C()
print stdout_C
#blast_hit.append(stdout_C)
unmatchedpeps_df.loc[i, ('blast_out')] = stdout_C
pot_frames=unmatchedpeps_df[unmatchedpeps_df.blast_out == '']
pot_frames=pot_frames.sort_values([0], ascending=False)
pot_frames.reset_index(drop=True, inplace=True)
frameshifted_1=pot_frames.ix[:11, ('tag_rem_pep')] # 3 or more
### reach into main df and pull these frames out
frames_1=onlyflu[onlyflu.tag_rem_pep.isin(frameshifted_1)]
frames_1[['tag_rem_pep', 'file_id']].groupby(['tag_rem_pep', 'file_id']).size()
###################################################################### Debugging #####################################################################################################
########################### prepare for local blast make a fasta list
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
blastlist = [line.replace('\n', '') for line in open('blast_list', 'r')]
blastlist.remove('my_blast.xml') ### remove a test file from the list
n = 0
for files in blastlist:
for record in NCBIXML.parse(open(files)):
if record.alignments:
#print "QUERY: %s" % record.query[:60]
for align in record.alignments:
for hsp in align.hsps:
if hsp.expect > 0.5:
n += 1
print "MATCH: %s " % align.title[:60]
print hsp.expect
print hsp.query, hsp.identities
checkpepsindex =[]
n = 0
for files in blastlist:
for record in NCBIXML.parse(open(files)):
if len(record.alignments) == 0:
n += 1
print record.query, files
checkpepsindex.append(files.split('.')[1])
#print "QUERY: %s" % record.query[:60]
for align in record.alignments:
for hsp in align.hsps:
print hsp.expect
n += 1
if hsp.expect < 1e-10:
n += 1
print "MATCH: %s " % align.title[:60]
print hsp.expect
pep_frame_index=[int(i.split('_')[1]) for i in checkpepsindex]
frameshifts=unmatchedpeps_df.ix[pep_frame_index]
frameshifts.sort_values([0], ascending=False)
SeqIO.write(reference_fasta, '/home/labcomp/Desktop/frames_reference.fasta', 'fasta')
filename='fasta_list'
filelines=`cat $filename`
for line in $filelines ; do
#blastp -out $line.xml -outfmt 5 -query $line -db "nr" -entrez_query "Homo Sapiens[ORGN]" -remote
blastp -out $line.xml -outfmt 5 -query $line -db ~/frames_DB -num_threads 8
#echo $line
done
selectedhu.loc[selectedhu.tag_rem_pep.str.contains('ALTLLEDWCKGMDMDPR'), ('filename')].value_counts()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment