Created
October 25, 2010 15:51
-
-
Save sjcockell/645180 to your computer and use it in GitHub Desktop.
Build multiple sequence alignments from subgroups of proteins within a list file. Uses http://gist.github.com/329730, http://gist.github.com/644765, and MUSCLE (http://www.drive5.com/muscle/downloads.htm).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import get_sequences | |
import uniprot_mapping | |
import urllib2 | |
import shlex, subprocess | |
def main(file):
    """Build one MUSCLE alignment per protein group listed in *file*.

    The file is read whole and split on '"'; each piece is turned into a
    list of protein names by organise_groups(). For group number N
    (counting from 1 over every piece, including empty ones) the FASTA
    records of its proteins are downloaded from uniprot.org into
    'align/groupN.seq', and 'muscle' is launched to write
    'align/groupN.aln'.

    Args:
        file: path of the protein list file. (The name shadows a Python 2
            builtin but is kept so keyword callers keep working.)

    Returns:
        None. Results are the files written under 'align/'.
    """
    # Local imports: only this function needs them.
    import os
    import sys

    with open(file) as f:
        data = f.read()
    # File has one protein name per line, with '"' delineating groups.
    groups = organise_groups(data.split('"'))

    # Output paths are hard-coded under 'align/'; make sure it exists so
    # the first open() does not fail in a fresh working directory.
    if not os.path.isdir('align'):
        os.makedirs('align')

    launched = []  # (alignment path, Popen handle) for every muscle run
    try:
        for number, group in enumerate(groups, 1):
            # Text before/after/between the '"' delimiters yields blank
            # entries; there is nothing to look up for those.
            names = [name for name in group if name.strip()]
            if not names:
                continue

            seq_path = 'align/group' + str(number) + '.seq'
            aln_path = 'align/group' + str(number) + '.aln'

            with open(seq_path, 'w') as out:
                for name in names:
                    # NOTE(review): both helpers come from modules not shown
                    # here; presumably element 1 of the parsed reply is the
                    # UniProt accession - confirm against those modules.
                    reply = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
                    mapped = get_sequences.parse_return_string(reply)
                    seq_url = 'http://www.uniprot.org/uniprot/' + mapped[1] + '.fasta'
                    response = urllib2.urlopen(seq_url)
                    try:
                        out.write(response.read())
                    finally:
                        response.close()

            # The sequence file is closed (flushed) before muscle reads it.
            # An argument list avoids re-parsing a command string.
            args = ['muscle', '-in', seq_path, '-out', aln_path]
            launched.append((aln_path, subprocess.Popen(args)))
    finally:
        # Reap every child that was started, even if a later group failed,
        # and surface alignment failures instead of dropping them silently.
        for aln_path, proc in launched:
            status = proc.wait()
            if status != 0:
                sys.stderr.write(
                    'muscle exited with status %d for %s\n' % (status, aln_path))
def organise_groups(g):
    """Clip surrounding whitespace from each raw group and split it into lines.

    Args:
        g: iterable of strings, one per raw group of proteins.

    Returns:
        A list with one entry per input string: that string with leading
        and trailing whitespace removed, split on '\\n'. An empty or
        whitespace-only string therefore becomes [''].
    """
    return [chunk.strip().split('\n') for chunk in g]
if __name__ == '__main__':
    import sys

    # Optional first command-line argument selects the protein list file;
    # with no argument the original default, 'proteins', is used.
    main(sys.argv[1] if len(sys.argv) > 1 else 'proteins')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment