Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Extracts organism metadata pages for the taxa in a TreeBASE study from the GenomesOnline (GOLD) database, following links via NCBI taxonomy pages.
#!/usr/bin/python2
# Quick and dirty web scraper for extracting html files of
# Organism information, Genome Project Information,
# Sequencing Information, Environmental Metadata
# and organism metadata
# from GOLD databases based on treebase taxa lists, via ncbi
# Saves pages in the current directory
from urllib2 import urlopen
import os
import codecs
import time
from bs4 import BeautifulSoup # easy_install beautifulsoup4
import lxml # easy_install lxml
#set this to the url of your treebase taxa list:
url = "http://www.treebase.org/treebase-web/search/study/taxa.html?id=10965"
#set delay between hits - to be nice to the server!
delay = 1
# front of url for ncbi taxa pages - should not need to change.
ncbi = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?"
# front of url for GOLD taxa metadata pages - should not need to change.
gold = "http://genomesonline.org/cgi-bin/GOLD/bin/GOLDCards.cgi?"
#Read and parse treebase taxa list
ur = urlopen(url)
data = ur.read()
soup = BeautifulSoup(data)
# lazy list of ncbi urls in treebase taxa list
taxalist = (i for i in soup("a") if ncbi in i.attrs["href"])
# lazy list of parsed ncbi taxa pages
taxa = (BeautifulSoup(urlopen(taxa.attrs["href"])) for taxa in taxalist)
for taxon in taxa:
# list of GOLD metadata urls on ncbi taxa page
link = [i for i in taxon("a") if gold in i.attrs["href"]]
if link:
link = link[0]
meta = urlopen(link.attrs["href"]).read() # fetch metadata html
f = codecs.open(link.text.split(": ")[1] + ".html",
"w", "utf-8")
f.write(meta)
f.close()
print link.text + " found and saved..."
if delay: time.sleep(delay)
print "Done."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment