Skip to content

Instantly share code, notes, and snippets.

Created August 23, 2017 07:55
Show Gist options
  • Save iimog/a6a36a7b03906f18ac490b0a4708224c to your computer and use it in GitHub Desktop.
Save iimog/a6a36a7b03906f18ac490b0a4708224c to your computer and use it in GitHub Desktop.
A Python script (using Beautiful Soup) that extracts bee traits from the HTML pages of the cavity-nesting wasps and bees database and saves them as FENNEC-compatible TSV files
# coding: utf-8
from bs4 import BeautifulSoup
import glob
# Accumulators filled while parsing the pages and flushed to TSV files at the end.
trait_types = {}  # trait name -> tooltip definition of the trait
trait_values_numeric = {}  # trait name -> rows [name, value, '', reference, url]
trait_values_categorical = {}  # trait name -> rows [name, value, definition, reference, url]
general_citation = "Budrys, E., Budriene., A. and Orlovskyte. S. 2014. Cavity-nesting wasps and bees database."


def _first_child_text(cell):
    """Return the first child of *cell* as a plain string, or "" for an empty cell."""
    children = cell.contents
    # str() detaches the text from the parse tree, so keeping it in the
    # accumulators does not keep every parsed page alive.
    return str(children[0]) if children else ""


def _tooltip_text(cell):
    """Return the text between the first pair of single quotes in the
    ``onmouseover`` attribute of the second child of *cell*, or "" if the
    cell has no such child, attribute, or quoted text."""
    children = cell.contents
    if len(children) < 2:
        return ""
    marker = children[1]
    # Text nodes have no has_attr(); only tags can carry the attribute.
    has_attr = getattr(marker, "has_attr", None)
    if has_attr is None or not has_attr("onmouseover"):
        return ""
    pieces = marker["onmouseover"].split("'")
    return pieces[1] if len(pieces) > 1 else ""


def _write_tsv(path, rows):
    """Write each row of *rows* to *path* as one tab-separated line."""
    with open(path, "w", encoding="utf-8") as out:
        out.writelines("\t".join(row) + "\n" for row in rows)


for html_path in glob.glob('*.html'):
    # Everything before the first dot of the file name is used as the page id.
    sid = html_path[0:html_path.find('.')]
    # NOTE(review): the URL prefix is an empty string here, so origin_url is
    # just the id -- confirm the intended base URL.
    origin_url = '' + sid
    # Binary mode lets Beautiful Soup detect the page's own encoding instead
    # of decoding with the platform's default locale encoding.
    with open(html_path, 'rb') as content:
        soup = BeautifulSoup(content, 'html.parser')
    content_div = soup.find(id="content_div")

    # The <h2> holds the species epithet first, followed by a " / "-separated
    # classification string whose third entry is prepended to form the name.
    classification = content_div.h2.contents
    scientific_name = classification[1].split(" / ")[2] + " " + classification[0].string

    # The first <b> in the content, if any, is taken as the page-specific reference.
    reference = "None"
    reference_tag = content_div.b
    if reference_tag is not None:
        reference = reference_tag.string
        if reference is None:
            # .string is None when <b> has several children; use all its text.
            reference = reference_tag.get_text()
    reference = "Source:" + general_citation + ";Reference:" + reference

    for row in content_div.table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 3:
            # Not a trait row (e.g. a header row without <td> cells).
            continue
        trait_type = _first_child_text(cells[0])
        if not trait_type:
            continue
        trait_types[trait_type] = _tooltip_text(cells[0])

        categorical_value = _first_child_text(cells[2])
        if categorical_value:
            trait_values_categorical.setdefault(trait_type, []).append(
                [scientific_name, categorical_value, _tooltip_text(cells[2]), reference, origin_url]
            )

        # The numeric column is optional; only record it when present.
        numeric_value = _first_child_text(cells[1])
        if numeric_value:
            trait_values_numeric.setdefault(trait_type, []).append(
                [scientific_name, numeric_value, '', reference, origin_url]
            )

# One file listing every trait with its definition, then one file per trait
# and value kind holding the collected rows.
_write_tsv("trait_types.tsv", trait_types.items())
for tt, rows in trait_values_numeric.items():
    _write_tsv(tt + "_numeric.tsv", rows)
for tt, rows in trait_values_categorical.items():
    _write_tsv(tt + "_categorical.tsv", rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment