Skip to content

Instantly share code, notes, and snippets.

@harej
Last active September 25, 2015 15:21
Show Gist options
  • Save harej/b85cc815843cc2178b56 to your computer and use it in GitHub Desktop.
Save harej/b85cc815843cc2178b56 to your computer and use it in GitHub Desktop.
A script to scrape the Pocket Guide to Chemical Hazards on NIOSH's website
# public domain
from bs4 import BeautifulSoup
import requests
# Index into the page's list of <td> cells for every field that sits at the
# same position in both page layouts.
_COMMON_FIELD_CELLS = (
    ('cas_number', 2),
    ('rtecs_number', 3),
    ('conversion', 6),
    ('idlh', 7),
    ('rel_and_pel', 8),
    ('molecular_weight', 11),
    ('boiling_point', 12),
    ('melting_point', 13),
    ('solubility', 14),
    ('vapor_pressure', 15),
    ('ionization_potential', 16),
    ('specific_gravity', 17),
    ('flash_point', 18),
    ('upper_explosive_limit', 19),
    ('lower_explosive_limit', 20),
    ('relative_gas_density', 21),
    ('probably_nothing', 22),
    ('combustibility', 23),
    ('exposure_routes', 25),
    ('symptoms', 26),
    ('target_organs', 27),
)

# Fields whose position depends on the layout, keyed by the total number of
# <td> cells on the page.
_SCHEMA_FIELD_CELLS = {
    # schema for non-carcinogens
    32: (('personal_protection_and_sanitation', 28), ('first_aid', 29)),
    # schema for carcinogens
    33: (('personal_protection_and_sanitation', 29), ('first_aid', 30)),
}

# Highest common index is 27, so a usable page needs at least 28 cells.
_MIN_CELLS = max(index for _, index in _COMMON_FIELD_CELLS) + 1


def _extract_fields(cells):
    """Map the <td> cells of one PGCH page onto named fields.

    Returns a dict of field name -> cell (the element itself, exactly as
    it appears in ``cells``). The two layout-specific fields are only
    present when ``len(cells)`` is 32 or 33.
    """
    entry = {field: cells[index] for field, index in _COMMON_FIELD_CELLS}
    for field, index in _SCHEMA_FIELD_CELLS.get(len(cells), ()):
        entry[field] = cells[index]
    return entry


def main():
    """Scrape every NIOSH Pocket Guide to Chemical Hazards page.

    Fetches pages #1 through #686 (skipping #553), and for each page that
    answers with HTTP 200 collects selected ``<td>`` cells under
    descriptive keys.

    Returns:
        dict: PGCH number -> dict of field name -> ``<td>`` element as
        returned by ``soup.find_all('td')``. Pages that do not return
        HTTP 200, or that have too few cells to match the expected
        layout, are left out.

    Raises:
        requests.RequestException: if a request fails or times out.
    """
    manifest = {}
    # starting with PGCH #1 and going to #686, the last one
    for pgch_id in range(1, 687):
        if pgch_id == 553:  # this one is irregular and should be skipped
            continue
        url = "http://www.cdc.gov/niosh/npg/npgd" + str(pgch_id).zfill(4) + ".html"
        # Without a timeout a single stalled connection would hang the run.
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        cells = soup.find_all('td')
        if len(cells) < _MIN_CELLS:
            # Page does not follow the expected layout; skip it rather than
            # abort the whole scrape with an IndexError.
            continue
        manifest[pgch_id] = _extract_fields(cells)
    return manifest
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment