Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Extracts organism metadata for taxa in a TreeBASE repository from the GenomesOnline (GOLD) database, via NCBI taxon pages.
# Quick and dirty web scraper for extracting html files of
# Organism information, Genome Project Information,
# Sequencing Information, Environmental Metadata
# and organism metadata
# from GOLD databases based on treebase taxa lists, via ncbi
# Saves pages in the current directory
from urllib2 import urlopen
import os
import codecs
import time
from bs4 import BeautifulSoup # easy_install beautifulsoup4
import lxml # easy_install lxml
#set this to the url of your treebase taxa list:
url = ""
#set delay between hits - to be nice to the server!
delay = 1
# front of url for ncbi taxa pages - should not need to change.
ncbi = ""
# front of url for GOLD taxa metadata pages - should not need to change.
gold = ""
#Read and parse treebase taxa list
ur = urlopen(url)
data =
soup = BeautifulSoup(data)
# lazy list of ncbi urls in treebase taxa list
taxalist = (i for i in soup("a") if ncbi in i.attrs["href"])
# lazy list of parsed ncbi taxa pages
taxa = (BeautifulSoup(urlopen(taxa.attrs["href"])) for taxa in taxalist)
for taxon in taxa:
# list of GOLD metadata urls on ncbi taxa page
link = [i for i in taxon("a") if gold in i.attrs["href"]]
if link:
link = link[0]
meta = urlopen(link.attrs["href"]).read() # fetch metadata html
f =": ")[1] + ".html",
"w", "utf-8")
print link.text + " found and saved..."
if delay: time.sleep(delay)
print "Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment