sjcockell/get_sequence_from_name.py

## get_sequence_from_name.py
import uniprot_mapping
import urllib2

def main(file):
    """Download a FASTA sequence for every protein name listed in *file*.

    Each line of *file* that does not start with a double quote is treated
    as one protein name. The name is passed to
    ``uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)`` and the second
    field of the first mapping found is used to build a
    ``http://www.uniprot.org/uniprot/<id>.fasta`` URL. The downloaded text is
    written to ``seqs/<id>.fa``. Names for which no mapping is found are
    printed to stdout instead.

    Args:
        file: Path of the text file holding one protein name per line.

    Note:
        The ``seqs/`` directory is not created here and must already exist.
    """
    # 'with' closes the name list even when a lookup or download raises
    # part-way through; the original never closed this handle.
    with open(file, 'r') as fh:
        for line in fh:
            if line.startswith('"'):  # ignore comment lines
                continue
            name = line.rstrip()
            if not name:
                # A blank line carries no name; skip it rather than issuing
                # a lookup for an empty string.
                continue
            mapping_result = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
            accession = parse_return_string(mapping_result)[1]
            if not accession:
                print(name)  # something went wrong
                continue
            seq_url = 'http://www.uniprot.org/uniprot/' + accession + '.fasta'
            response = urllib2.urlopen(seq_url)
            try:
                # Read before opening the output file so a failed download
                # does not leave an empty .fa file behind.
                fasta = response.read()
            finally:
                response.close()
            with open('seqs/' + accession + '.fa', 'w') as out:
                out.write(fasta)

def parse_return_string(string):
    """Extract the first (from_id, to_id) pair from a mapping response.

    *string* is split on newlines; the first line is treated as a header and
    discarded. Each remaining line is split on tabs, and the first line that
    has a non-empty second field supplies the result.

    Args:
        string: Newline-separated text with tab-separated fields.

    Returns:
        A ``(from_id, to_id)`` tuple taken from the first usable line, or
        ``(None, None)`` when no line holds a mapping. In the latter case
        'ERROR:' and the raw *string* are printed once to stdout.
    """
    # The first element is always the header, so only the rest is scanned.
    for line in string.split('\n')[1:]:
        fields = line.split('\t')
        # Blank or single-column lines (such as the empty string left by a
        # trailing newline) hold no mapping and are skipped.
        if len(fields) >= 2 and fields[1]:
            # only the first id is needed - a single accession is sufficient
            return (fields[0], fields[1])
    # Nothing usable in the response. Single-argument print(...) behaves the
    # same under Python 2 and Python 3.
    print('ERROR:')
    print(string)
    return (None, None)

if __name__ == '__main__':
    # Script entry point: the list of protein names is assumed to live in a
    # file called "proteins" in the current working directory.
    names_path = './proteins'
    main(names_path)

## gistfile2.txt

      
    Raw
  

              gistfile2.txt
	import uniprot_mapping
	import urllib2

	def main(file):
	    """Download a FASTA sequence for every protein name listed in *file*.

	    Each line of *file* that does not start with a double quote is treated
	    as one protein name. The name is passed to
	    ``uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)`` and the second
	    field of the first mapping found is used to build a
	    ``http://www.uniprot.org/uniprot/<id>.fasta`` URL. The downloaded text is
	    written to ``seqs/<id>.fa``. Names for which no mapping is found are
	    printed to stdout instead.

	    Args:
	        file: Path of the text file holding one protein name per line.

	    Note:
	        The ``seqs/`` directory is not created here and must already exist.
	    """
	    # 'with' closes the name list even when a lookup or download raises
	    # part-way through.
	    with open(file, 'r') as fh:
	        for line in fh:
	            if line.startswith('"'):  # ignore comment lines
	                continue
	            name = line.rstrip()
	            if not name:
	                # A blank line carries no name; skip it rather than issuing
	                # a lookup for an empty string.
	                continue
	            mapping_result = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
	            accession = parse_return_string(mapping_result)[1]
	            if not accession:
	                print(name)  # something went wrong
	                continue
	            seq_url = 'http://www.uniprot.org/uniprot/' + accession + '.fasta'
	            response = urllib2.urlopen(seq_url)
	            try:
	                # Read before opening the output file so a failed download
	                # does not leave an empty .fa file behind.
	                fasta = response.read()
	            finally:
	                response.close()
	            with open('seqs/' + accession + '.fa', 'w') as out:
	                out.write(fasta)

	def parse_return_string(string):
	    """Extract the first (from_id, to_id) pair from a mapping response.

	    *string* is split on newlines; the first line is treated as a header and
	    discarded. Each remaining line is split on tabs, and the first line that
	    has a non-empty second field supplies the result.

	    Args:
	        string: Newline-separated text with tab-separated fields.

	    Returns:
	        A ``(from_id, to_id)`` tuple taken from the first usable line, or
	        ``(None, None)`` when no line holds a mapping. In the latter case
	        'ERROR:' and the raw *string* are printed once to stdout.
	    """
	    # The first element is always the header, so only the rest is scanned.
	    for line in string.split('\n')[1:]:
	        fields = line.split('\t')
	        # Blank or single-column lines (such as the empty string left by a
	        # trailing newline) hold no mapping and are skipped.
	        if len(fields) >= 2 and fields[1]:
	            # only the first id is needed - a single accession is sufficient
	            return (fields[0], fields[1])
	    # Nothing usable in the response. Single-argument print(...) behaves the
	    # same under Python 2 and Python 3.
	    print('ERROR:')
	    print(string)
	    return (None, None)

	if __name__ == '__main__':
	    # Script entry point: the list of protein names is assumed to live in a
	    # file called "proteins" in the current working directory.
	    main('./proteins')