Skip to content

Instantly share code, notes, and snippets.

@chapmanb
Created January 25, 2012 11:50
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save chapmanb/1675927 to your computer and use it in GitHub Desktop.
Save chapmanb/1675927 to your computer and use it in GitHub Desktop.
#!/usr/local/bin/python
# Copyright (C) 2004 Rune Linding & Lars Juhl Jensen - EMBL
# The DisEMBL is licensed under the GPL license
# (http://www.opensource.org/licenses/gpl-license.php)
# DisEMBL pipeline
from string import *
from sys import argv
from Bio import SeqIO
import fpformat
import sys
import tempfile
import os
from os import system,popen3
# change these to the correct paths
NN_bin = '/PATH/DisEMBL-1.4/disembl'
SG_bin = '/PATH/DisEMBL-1.4/sav_gol'
def JensenNet(sequence):
outFile = tempfile.mktemp()
inFile= tempfile.mktemp()
open(inFile,'w').write(sequence+'\n')
system(NN_bin + '< ' + inFile +' > ' + outFile)
REM465 = []
COILS = []
HOTLOOPS = []
resultsFile = open(outFile,'r')
results = resultsFile.readlines()
resultsFile.close()
for result in results:
coil = float(fpformat.fix(split(result)[0],6))
COILS.append(coil)
hotloop = float(fpformat.fix(split(result)[1],6))
HOTLOOPS.append(hotloop)
rem465 = float(fpformat.fix(split(result)[2],6))
REM465.append(rem465)
os.remove(inFile)
os.remove(outFile)
return COILS, HOTLOOPS, REM465
def SavitzkyGolay(window,derivative,datalist):
if len(datalist) < 2*window:
window = len(datalist)/2
elif window == 0:
window = 1
stdin, stdout, stderr = popen3(SG_bin + ' -V0 -D' + str(derivative) + ' -n' + str(window)+','+str(window))
for data in datalist:
stdin.write(`data`+'\n')
try:
stdin.close()
except:
print stderr.readlines()
results = stdout.readlines()
stdout.close()
SG_results = []
for result in results:
f = float(fpformat.fix(result,6))
if f < 0:
SG_results.append(0)
else:
SG_results.append(f)
return SG_results
def getSlices(NNdata, fold, join_frame, peak_frame, expect_val):
slices = []
inSlice = 0
for i in range(len(NNdata)):
if inSlice:
if NNdata[i] < expect_val:
if maxSlice >= fold*expect_val:
slices.append([beginSlice, endSlice])
inSlice = 0
else:
endSlice += 1
if NNdata[i] > maxSlice:
maxSlice = NNdata[i]
elif NNdata[i] >= expect_val:
beginSlice = i
endSlice = i
inSlice = 1
maxSlice = NNdata[i]
if inSlice and maxSlice >= fold*expect_val:
slices.append([beginSlice, endSlice])
i = 0
while i < len(slices):
if i+1 < len(slices) and slices[i+1][0]-slices[i][1] <= join_frame:
slices[i] = [ slices[i][0], slices[i+1][1] ]
del slices[i+1]
elif slices[i][1]-slices[i][0]+1 < peak_frame:
del slices[i]
else:
i += 1
return slices
def reportSlicesTXT(slices, sequence):
if slices == []:
s = lower(sequence)
else:
if slices[0][0] > 0:
s = lower(sequence[0:slices[0][0]])
else:
s = ''
for i in range(len(slices)):
if i > 0:
sys.stdout.write(', ')
sys.stdout.write( str(slices[i][0]+1) + '-' + str(slices[i][1]+1) )
s = s + upper(sequence[slices[i][0]:(slices[i][1]+1)])
if i < len(slices)-1:
s = s + lower(sequence[(slices[i][1]+1):(slices[i+1][0])])
elif slices[i][1] < len(sequence)-1:
s = s + lower(sequence[(slices[i][1]+1):(len(sequence))])
print ''
print s
def runDisEMBLpipeline():
try:
smooth_frame = int(sys.argv[1])
peak_frame = int(sys.argv[2])
join_frame = int(sys.argv[3])
fold_coils = float(sys.argv[4])
fold_hotloops = float(sys.argv[5])
fold_rem465 = float(sys.argv[6])
file = str(sys.argv[7])
except:
print '\nDisEMBL.py smooth_frame peak_frame join_frame fold_coils fold_hotloops fold_rem465 sequence_file\n'
print 'A default run would be: ./DisEMBL.py 8 8 4 1.2 1.4 1.2 fasta_file'
raise SystemExit
print ' ____ _ _____ __ __ ____ _ _ _ _'
print '| _ \(_)___| ____| \/ | __ )| | / || || |'
print '| | | | / __| _| | |\/| | _ \| | | || || |_'
print '| |_| | \__ \ |___| | | | |_) | |___ | ||__ _|'
print '|____/|_|___/_____|_| |_|____/|_____| |_(_) |_|'
print '# Copyright (C) 2004 - Rune Linding & Lars Juhl Jensen '
print '# EMBL Biocomputing Unit - Heidelberg - Germany '
print ''
db = open(file,'r')
for cur_record in SeqIO.parse(db, "fasta"):
sequence = upper(str(cur_record.seq))
# Run NN
COILS_raw, HOTLOOPS_raw, REM465_raw = JensenNet(sequence)
# Run Savitzky-Golay
REM465_smooth = SavitzkyGolay(smooth_frame,0,REM465_raw)
COILS_smooth = SavitzkyGolay(smooth_frame,0,COILS_raw)
HOTLOOPS_smooth = SavitzkyGolay(smooth_frame,0,HOTLOOPS_raw)
sys.stdout.write('> '+cur_record.id'_COILS ')
reportSlicesTXT( getSlices(COILS_smooth, fold_coils, join_frame, peak_frame, 0.43), sequence )
sys.stdout.write('> '+cur_record.id+'_REM465 ')
reportSlicesTXT( getSlices(REM465_smooth, fold_rem465, join_frame, peak_frame, 0.50), sequence )
sys.stdout.write('> '+cur_record.id+'_HOTLOOPS ')
reportSlicesTXT( getSlices(HOTLOOPS_smooth, fold_hotloops, join_frame, peak_frame, 0.086), sequence )
sys.stdout.write('\n')
return
runDisEMBLpipeline()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment