Created
October 25, 2010 10:51
-
-
Save sjcockell/644765 to your computer and use it in GitHub Desktop.
Get a protein sequence from UniProt when you only have that protein's name, not its accession. Uses http://gist.github.com/329730.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import uniprot_mapping | |
import urllib2 | |
def main(file):
    """Download a FASTA file from UniProt for every protein name in ``file``.

    Each usable line of ``file`` is taken as one protein name.  The name is
    handed to ``uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)`` and
    the reply is parsed with ``parse_return_string``.  When the parsed pair
    has a second element, ``http://www.uniprot.org/uniprot/<id>.fasta`` is
    fetched and saved as ``seqs/<id>.fa``; otherwise the name is printed so
    the caller can see which lookups failed.

    Args:
        file: Path of a text file with one protein name per line.  Lines
            starting with a double quote are treated as comments, and
            blank lines are skipped.

    Note:
        Output is written with ``open('seqs/...', 'w')``, which does not
        create directories, so ``seqs/`` must already exist.
    """
    with open(file, 'r') as names:
        for line in names:
            if line.startswith('"'):  # ignore comment lines
                continue
            name = line.rstrip()
            if not name:
                # A blank line carries no name; do not query the service.
                continue
            reply = uniprot_mapping.uniprot_mapping('ACC+ID', 'ACC', name)
            accession = parse_return_string(reply)[1]
            if not accession:
                print(name)  # something went wrong
                continue
            seq_url = 'http://www.uniprot.org/uniprot/' + accession + '.fasta'
            response = urllib2.urlopen(seq_url)
            # Read the whole payload before creating the output file so a
            # failed download does not leave an empty .fa file behind.
            try:
                fasta = response.read()
            finally:
                response.close()
            with open('seqs/' + accession + '.fa', 'w') as out:
                out.write(fasta)
def parse_return_string(string):
    """Extract the first (from_id, to_id) pair from a tab-separated reply.

    ``string`` is split into lines; the first line is discarded as the
    header.  Each remaining line is split on tabs and its first two fields
    become ``from_id`` and ``to_id``.  Scanning stops at the first line that
    yields a non-empty ``to_id``, since a single accession is sufficient.

    A line with fewer than two tab-separated fields is reported by printing
    ``ERROR:`` followed by the full reply, and is otherwise ignored: both
    ids are left as they were, so the result is never a half-filled pair.

    Args:
        string: The raw reply text, header line first.

    Returns:
        A ``(from_id, to_id)`` tuple; both are ``None`` when no data line
        could be parsed.
    """
    from_id = None
    to_id = None
    # The first element is always the header, so drop it.
    rows = string.split('\n')[1:]
    for row in rows:
        fields = row.split('\t')
        if len(fields) < 2:  # nothing usable on this line
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('ERROR:')
            print(string)
            continue
        from_id, to_id = fields[0], fields[1]
        if to_id:
            # Only the first id is needed.
            break
    return (from_id, to_id)
if __name__ == '__main__':
    # Script entry point: the list of protein names is assumed to live in
    # a file called "proteins" in the current working directory.
    protein_list = './proteins'
    main(protein_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment