Skip to content

Instantly share code, notes, and snippets.

@datadavev
Last active August 22, 2018 00:05
Show Gist options
  • Save datadavev/7a3991a2104f3843af62bdbf83129b7c to your computer and use it in GitHub Desktop.
Save datadavev/7a3991a2104f3843af62bdbf83129b7c to your computer and use it in GitHub Desktop.
Extract structured data from DataONE search result page
import time
import urllib
from selenium import webdriver
import extruct
import pprint
import clipboard
# Identifier of the DataONE object whose view page is loaded.
pid = "https://pasta.lternet.edu/package/metadata/eml/knb-lter-fce/1224/1"
# The view service is broken by not accepting an escaped path element, so the
# identifier is appended verbatim instead of being percent-encoded with
# urllib.parse.quote_plus(pid).
url = "https://search.dataone.org/view/" + pid

# Load the page in a real browser so the HTML captured below is the DOM as
# it stands after the page has had time to load.
driver = webdriver.Chrome()
try:
    driver.get(url)
    # Wait for the document to load (fixed delay, in seconds).
    time.sleep(10)
    html = driver.execute_script(
        "return document.getElementsByTagName('html')[0].innerHTML"
    )
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # and running it in `finally` avoids leaking them when a step above fails.
    driver.quit()

# Place the rendered HTML onto the clipboard. The text is passed as str:
# encoding it to bytes first is not what a text clipboard helper expects.
clipboard.copy(html)

# Extract the structured data (JSON-LD among it) embedded in the page;
# base_url is the address the page was loaded from.
data = extruct.extract(html, base_url=url)
pprint.pprint(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment