Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extracts organism metadata pages for the taxa in a TreeBASE study from the GenomesOnline (GOLD) database, following links via NCBI taxonomy pages.
#!/usr/bin/python2
# Quick and dirty web scraper for extracting html files of
# Organism information, Genome Project Information,
# Sequencing Information, Environmental Metadata
# and organism metadata
# from GOLD databases based on treebase taxa lists, via ncbi
# Saves pages in the current directory
from urllib2 import urlopen
import os
import codecs
import time
from bs4 import BeautifulSoup # easy_install beautifulsoup4
import lxml # easy_install lxml
#set this to the url of your treebase taxa list:
url = "http://www.treebase.org/treebase-web/search/study/taxa.html?id=10965"
#set delay between hits - to be nice to the server!
delay = 1
# front of url for ncbi taxa pages - should not need to change.
ncbi = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?"
# front of url for GOLD taxa metadata pages - should not need to change.
gold = "http://genomesonline.org/cgi-bin/GOLD/bin/GOLDCards.cgi?"
#Read and parse treebase taxa list
ur = urlopen(url)
data = ur.read()
soup = BeautifulSoup(data)
# lazy list of ncbi urls in treebase taxa list
taxalist = (i for i in soup("a") if ncbi in i.attrs["href"])
# lazy list of parsed ncbi taxa pages
taxa = (BeautifulSoup(urlopen(taxa.attrs["href"])) for taxa in taxalist)
for taxon in taxa:
# list of GOLD metadata urls on ncbi taxa page
link = [i for i in taxon("a") if gold in i.attrs["href"]]
if link:
link = link[0]
meta = urlopen(link.attrs["href"]).read() # fetch metadata html
f = codecs.open(link.text.split(": ")[1] + ".html",
"w", "utf-8")
f.write(meta)
f.close()
print link.text + " found and saved..."
if delay: time.sleep(delay)
print "Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment