Skip to content

Instantly share code, notes, and snippets.

@sjcockell
Created October 25, 2010 15:51
Show Gist options
  • Save sjcockell/645180 to your computer and use it in GitHub Desktop.
Save sjcockell/645180 to your computer and use it in GitHub Desktop.
Build multiple sequence alignments from subgroups of proteins within a list file. Uses http://gist.github.com/329730, http://gist.github.com/644765, and MUSCLE (http://www.drive5.com/muscle/downloads.htm).
import get_sequences
import uniprot_mapping
import urllib2
import shlex, subprocess
def main(file):
    """Build one MUSCLE alignment per protein group listed in *file*.

    The file holds one protein name per line, with a '"' character
    delineating groups. For every group, each name is passed through
    ``uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)``, the FASTA
    record for the resulting entry is downloaded from uniprot.org and
    appended to ``align/group<N>.seq``; ``muscle`` is then started on that
    file and writes ``align/group<N>.aln``. ``N`` is the 1-based position
    of the group in the file. The ``align/`` directory must already exist.

    Groups that contain no non-blank names are skipped (their number is
    still consumed, so the numbering of the remaining groups is unchanged).

    Raises:
        subprocess.CalledProcessError: if any muscle run exits non-zero.
            Raised only after every started muscle process has finished.
    """
    with open(file) as f:
        data = f.read()
    # file has protein name per line with " delineating groups
    groups = organise_groups(data.split('"'))

    running = []  # (command, process) for every muscle run started
    for i, group in enumerate(groups, 1):
        # Splitting on '"' yields empty chunks before/after/between the
        # separators; those turn into groups holding only a blank name,
        # which cannot be looked up.
        names = [g.strip() for g in group if g.strip()]
        if not names:
            continue

        # build one alignment per group
        seq_path = 'align/group' + str(i) + '.seq'
        aln_path = 'align/group' + str(i) + '.aln'
        with open(seq_path, 'w') as out:
            for name in names:
                result = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
                mapped = get_sequences.parse_return_string(result)
                # mapped[1] is presumably the mapped UniProt accession --
                # confirm against get_sequences.parse_return_string.
                seq_url = 'http://www.uniprot.org/uniprot/' + mapped[1] + '.fasta'
                response = urllib2.urlopen(seq_url)
                try:
                    out.write(response.read())
                finally:
                    response.close()

        # The .seq file is closed (and flushed) before muscle reads it.
        cmd = ['muscle', '-in', seq_path, '-out', aln_path]
        running.append((cmd, subprocess.Popen(cmd)))  # runs muscle over protein group

    # The muscle runs proceed concurrently; reap them all so none is left
    # unchecked, then report the first failure.
    failure = None
    for cmd, proc in running:
        if proc.wait() != 0 and failure is None:
            failure = subprocess.CalledProcessError(proc.returncode, cmd)
    if failure is not None:
        raise failure
def organise_groups(g):
    """Clip surrounding whitespace from each raw group and split it into lines.

    Every chunk in *g* has its leading and trailing whitespace removed and
    is then split on newline characters. Returns one list of lines per
    chunk, in the same order as the input.
    """
    return [chunk.strip().split('\n') for chunk in g]
if __name__ == '__main__':
    # An optional first command-line argument names the protein list file;
    # with no argument the script reads 'proteins' as before.
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else 'proteins')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment