# sjcockell/build_alignments.py

## build_alignments.py
import get_sequences
import uniprot_mapping
import urllib2
import shlex, subprocess

def main(file):
    """Build one MUSCLE alignment per protein group listed in *file*.

    The file is read whole and split on double-quote characters; the pieces
    are passed to ``organise_groups`` to obtain lists of protein names.  For
    group number N (1-based, counted over every piece of the split) the
    sequences are downloaded from UniProt into ``align/groupN.seq`` and
    ``muscle`` is launched to write ``align/groupN.aln``.

    Args:
        file: path of the text file holding the protein names.

    Note:
        The ``align/`` directory must already exist; it is not created here.
    """
    with open(file) as f:
        data = f.read()
    # file has protein name per line with " delineating groups
    groups = organise_groups(data.split('"'))
    # enumerate() keeps the original 1-based numbering, including for groups
    # that are skipped below.
    for number, group in enumerate(groups, 1):
        # A delimiter at the start/end of the file or a blank line yields
        # empty names; there is nothing to look up for those.
        names = [name for name in group if name.strip()]
        if not names:
            continue
        seq_path = 'align/group' + str(number) + '.seq'
        aln_path = 'align/group' + str(number) + '.aln'
        # 'with' guarantees the sequence file is flushed and closed before
        # muscle starts, even if a lookup or download raises.
        with open(seq_path, 'w') as out:
            for name in names:
                mapping = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
                mapped = get_sequences.parse_return_string(mapping)
                # Second element of the parsed reply is used as the entry
                # identifier in the URL (presumably the accession -- confirm
                # against get_sequences.parse_return_string).
                seq_url = 'http://www.uniprot.org/uniprot/' + mapped[1] + '.fasta'
                response = urllib2.urlopen(seq_url)
                try:
                    out.write(response.read())
                finally:
                    response.close()
        # Argument list is passed directly, so no shell and no re-tokenising.
        # NOTE(review): as before, the process is not waited on, so the
        # alignments of different groups may run concurrently and the exit
        # status of muscle is not checked.
        subprocess.Popen(['muscle', '-in', seq_path, '-out', aln_path])


def organise_groups(g):
    """Turn raw text chunks into lists of lines.

    Every chunk in *g* has its leading and trailing whitespace removed and
    is then split on newline characters.  The resulting line lists are
    returned in the same order as the chunks were given.
    """
    return [chunk.strip().split('\n') for chunk in g]

if __name__ == '__main__':
    # Run as a script: process the group list stored in the file named
    # 'proteins' (path is relative to the current working directory).
    main('proteins')