Skip to content

Instantly share code, notes, and snippets.

@sjcockell
Created October 25, 2010 10:51
Show Gist options
  • Save sjcockell/644765 to your computer and use it in GitHub Desktop.
Save sjcockell/644765 to your computer and use it in GitHub Desktop.
Get a protein sequence from UniProt when you only have that protein's name, not its accession. Uses http://gist.github.com/329730.
import uniprot_mapping
import urllib2
def main(file):
    """Download the FASTA sequence for every protein name listed in *file*.

    Each non-comment, non-blank line of *file* is treated as a protein name.
    The name is passed to ``uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC',
    name)`` and the reply is parsed with ``parse_return_string``.  When an
    accession comes back, its FASTA record is fetched from uniprot.org and
    written to ``seqs/<accession>.fa``; otherwise the name is printed so the
    failure can be followed up.

    The ``seqs`` directory is not created here, so it has to exist already.

    Args:
        file: Path to a text file holding one protein name per line.  Lines
            starting with a double quote are skipped as comments.
    """
    # ``with`` guarantees the name list is closed even if a lookup raises.
    with open(file, 'r') as names:
        for line in names:
            if line.startswith('"'):  # ignore comment lines
                continue
            name = line.rstrip()
            if not name:
                # A blank line would only trigger a pointless remote lookup.
                continue
            reply = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
            accession = parse_return_string(reply)[1]
            if not accession:
                print(name)  # something went wrong
                continue
            seq_url = 'http://www.uniprot.org/uniprot/' + accession + '.fasta'
            response = urllib2.urlopen(seq_url)
            # urllib2 responses are not context managers, so close explicitly.
            try:
                fasta = response.read()
            finally:
                response.close()
            # Open the output only after the download succeeded, so a failed
            # request does not leave an empty file behind.
            with open('seqs/' + accession + '.fa', 'w') as out:
                out.write(fasta)
def parse_return_string(string):
    """Extract the first (from_id, to_id) pair from a mapping reply.

    The reply is read as newline-separated rows of tab-separated fields.  The
    first row is discarded as a header.  Rows are scanned in order and the
    first one carrying a non-empty second field wins; a single accession is
    sufficient, so later rows are not examined.

    A row with fewer than two fields (for example the empty string that
    follows a trailing newline) is reported by printing ``ERROR:`` followed
    by the whole reply, and scanning continues.

    Args:
        string: Raw text returned by the mapping service.

    Returns:
        A ``(from_id, to_id)`` tuple, or ``(None, None)`` when no row
        supplies a mapping.  Both values are set together, so a malformed
        row never leaves a half-filled result.
    """
    # Drop the first row, which is always the header.
    rows = string.split('\n')[1:]
    for row in rows:
        fields = row.split('\t')
        if len(fields) < 2:  # nothing usable in this row :(
            # Parenthesised single-argument form prints the same text on
            # Python 2 and Python 3.
            print('ERROR:')
            print(string)
            continue
        if fields[1]:
            return (fields[0], fields[1])
    return (None, None)
if __name__ == '__main__':
    # Script entry point: the list of protein names is assumed to be in
    # the file ./proteins, relative to the current working directory.
    main('./proteins')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment