Skip to content

Instantly share code, notes, and snippets.

@bcbwilla
Last active August 29, 2015 14:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bcbwilla/777fc251e7e001c5668e to your computer and use it in GitHub Desktop.
Save bcbwilla/777fc251e7e001c5668e to your computer and use it in GitHub Desktop.
Scrapes data from the mrl.ucsb.edu thermoelectric materials datamine.
"""
Scrapes all dimensions of data from http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp.
RE: electrical resistivity
SE: seebeck coefficient
TH: thermal conductivity
AT: average atomic mass
SC: scarcity
HP: HHI (production)
HR: HHI (reserves)
PF: power factor
S2: seebeck squared
UCV: Unit cell volume
AAV: Average atomic volume
APC: Atoms per cell
CON: electrical conductivity
KEKT: Ke Ktotal
"""
import requests
import re
import ast
import csv
import sys
def extract_series(response_text):
    """Extract the plotted data series from the datamine page's HTML.

    The plot data is embedded in the page as a JavaScript ``series:``
    array literal.  This function slices that literal out of the raw
    page text, massages it into valid Python literal syntax, and parses
    it safely with ``ast.literal_eval`` (no code execution).

    Parameters
    ----------
    response_text : str
        Full HTML text of the datamine plot page.

    Returns
    -------
    list
        List of series dicts, each holding (at least) a 'name' and a
        'data' list of point dicts.

    Note: if the HTML structure of the page changes for some reason,
    this function may break, since it depends on the exact markup
    surrounding the series literal.
    """
    # Slice out the JavaScript array literal: everything after the
    # second "series:" marker, up to the closing "}//" comment.
    raw = response_text.split("series:")[2].split('}//')[0]
    # Collapse whitespace, convert JS null to Python None, and drop the
    # scheme prefix from embedded URLs.
    raw = ' '.join(raw.split()).replace('null', 'None').replace('http://', '')
    # JavaScript object keys are unquoted; wrap each bare key in quotes
    # so the text becomes a valid Python literal.  Already-quoted keys
    # are untouched (the closing quote breaks the letters-then-colon match).
    raw = re.sub(r'[a-zA-Z]+:', lambda m: "'" + m.group(0)[:-1] + "':", raw)
    # Strip html tags and the per-series "//Fetch the ..." JS comments.
    # Raw strings here: \d/\D in a plain string literal is a deprecated
    # escape sequence in Python 3.
    raw = re.sub(r'(<[^<]+?>|//Fetch the \d+\D+ list of data string)', '', raw)
    return ast.literal_eval(raw)
def generate_key(d):
    """Build a unique identifier string for a single data point.

    The key is the concatenation of the point's formula, structure,
    author, ZT value, and y value, which together distinguish one
    measurement from another across the different axis queries.
    """
    parts = [d['formula'], d['Structure'], d['author'], d['ZT'], str(d['y'])]
    return ''.join(parts)
def get_all_data():
    """Scrape every dimension of the datamine and merge into one dict.

    For each x-axis quantity in ``axis_options`` the datamine is queried
    with SE (Seebeck coefficient) fixed on the y axis.  Points from
    different queries that describe the same measurement (same formula,
    structure, author, ZT and y value -- see ``generate_key``) are
    merged into a single record.

    Returns
    -------
    dict
        Maps the key from ``generate_key`` to a dict holding the point's
        value for each axis option plus the common properties (author,
        formula, temperature, ...), its material 'class', and 'SE'.
    """
    url = "http://www.mrl.ucsb.edu:8080/datamine/UploadServlet"
    # Quantities to request on the x axis, one POST per quantity.
    axis_options = ['RE', 'TH', 'AT', 'SC', 'HP', 'HR', 'UCV', 'AAV',
                    'APC', 'CON', 'PF', 'S2', 'KEKT', 'SE']
    post_data = {'PLOT_TYPE': 'STANDARD_PLOT_TYPE',
                 'Y_AXIS': 'SE',
                 'MARKER': 'ZT',
                 'SORTER': 'F_T'}
    # Values common to all data points, copied once per merged record.
    common_properties = ['author', 'comment', 'temperature', 'form',
                         'url', 'formula', 'Structure', 'synthesis', 'ZT']
    cleaned_data = {}
    for axis_option in axis_options:
        keys = []
        # Skip the degenerate X == Y plot.
        if axis_option == post_data['Y_AXIS']:
            continue
        print("%s continuing" % axis_option)
        post_data['X_AXIS'] = axis_option
        # Fetch the plot page; the servlet expects a (possibly empty)
        # file-upload field.
        response = requests.post(url, data=post_data,
                                 files={'fileChooser': ('', '')})
        if response.status_code != 200:
            print("Trouble getting data")
            continue
        web_data = extract_series(response.text)
        # Each series is a different "class" of materials.
        for series in web_data:
            for d in series['data']:
                key = generate_key(d)
                if key not in cleaned_data:
                    cleaned_data[key] = {}
                sub_data = cleaned_data[key]
                sub_data[axis_option] = d['x']
                # Fill the shared fields only the first time this
                # measurement is seen.
                if 'class' not in sub_data:
                    sub_data['SE'] = d['y']
                    sub_data['class'] = series['name']
                    for prop in common_properties:
                        sub_data[prop] = d[prop]
                keys.append(key)
        # Within one x-axis query every point must be unique, otherwise
        # merging by key would silently overwrite data.
        assert len(keys) == len(set(keys)), \
            "Error, data points not all unique for x-axis: %s" % axis_option
    return cleaned_data
def write_file(data, filename):
    """Write the scraped data to a tab-delimited text file.

    Parameters
    ----------
    data : dict
        Mapping of point key -> record dict, as produced by
        ``get_all_data``.  The keys are discarded; only the records
        are written.
    filename : str
        Path of the output file.  Any existing file is overwritten.

    The header row is taken from the first record's keys.
    NOTE(review): csv.DictWriter raises ValueError for a record holding
    a field absent from the first record -- assumes all records share
    the same fields; verify against the scraper's output.
    """
    # The unique keys were only needed for merging; keep just the rows.
    rows = list(data.values())
    if not rows:
        # Nothing scraped: do not create/truncate the output file.
        return
    with open(filename, 'w') as f:
        writer = csv.DictWriter(f, fieldnames=rows[0].keys(), delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)
def test_data(data):
    """Sanity-check the scraped data against points collected by hand.

    ``data`` is the dict returned by ``get_all_data``.  Each reference
    dict below is a data point transcribed by hand from the datamine
    website.  Raises AssertionError if any field of any reference point
    disagrees with the corresponding scraped record.  Returns True when
    all checks pass.
    """
    def test(d):
        # Look up the scraped record for this hand-collected point and
        # compare every field except the raw plot coordinates x/y
        # (those are duplicated in the named axis fields, e.g. RE/SE).
        d_key = generate_key(d)
        d_compare = data[d_key]
        for key in d.keys():
            if key != 'x' and key != 'y':
                dv1 = d[key]
                dv2 = d_compare[key]
                assert dv1 == dv2, "%s does not match, %s != %s" % (key, dv1, dv2)
    # Point with RE (electrical resistivity) on the x axis.
    d1 = {"x":'0.021',
          "y":-68.,
          "RE":'0.021',
          "SE":-68.,
          "url":"dx.doi.org/10.1016/j.jallcom.2005.04.060",
          "ZT":'6.71E-3',
          "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
          "comment": "",
          "synthesis": "arc-melted, Ar",
          "form": "polycrystalline",
          "temperature": "1000",
          "author": "Ito 2006",
          "Structure": "ICSD #9119, 300K"}
    # Point with APC (atoms per cell) on the x axis.
    d2 = {"x":'26',
          "y":201.,
          "APC":'26',
          "SE":201.,
          "url":"dx.doi.org/10.1002/adfm.201000970",
          "ZT":'0.47',
          "formula": 'Ca4.75Na0.25Al2Sb6',
          "comment": "",
          "synthesis": "solid state reaction, Ar",
          "form": "polycrystalline",
          "temperature": "700",
          "author": "Toberer 2010",
          "Structure": "ICSD #183853, 300K"}
    # Point with UCV (unit cell volume) on the x axis.
    d3 = {"x":'601.91',
          "y":-226.,
          "UCV":'601.91',
          "SE": -226.,
          "url":"dx.doi.org/10.1016/j.jallcom.2005.04.060",
          "ZT":'1.98E-3',
          "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
          "comment": "*extrapolated from 315 K",
          "synthesis": "arc-melted, Ar",
          "form": "polycrystalline",
          "temperature": "300",
          "author": "Ito 2006",
          "Structure": "ICSD #9119, 300K"}
    # Fully populated point with every axis quantity transcribed.
    d4 = {"x":'1.26E-3',
          "y":-120.,
          "RE":'1.26E-3',
          "SE":-120.,
          "TH": "6",
          "AT": '36.72',
          "SC": '2.782E3',
          "HP": '2.693E3',
          "HR": '1.952E3',
          "UCV": '59.36',
          "AAV": '11.872',
          "APC": '5',
          "CON": '7.94E2',
          'PF': '1.13E-3',
          'S2': '1.43E4',
          'KEKT': '0.129',
          "url":"jjap.jsap.jp/link?JJAP/43/L540/",
          "ZT":'0.08',
          "formula": 'Sr0.9Y0.1Ti1O3',
          "comment": "*kappa estimated from 300K",
          "synthesis": "solid state reaction, air",
          "form": "polycrystalline",
          "temperature": "400",
          "author": "Obara 2004",
          "Structure": "ICSD #181231, 300K"}
    # Same material as d4 at a different temperature, again with every
    # axis quantity transcribed.
    d5 = {"x":'3.44E-3',
          "y":-185.,
          "RE": '3.44E-3',
          "SE":-185.,
          "TH": '5',
          "AT": '36.72',
          "SC": '2.782E3',
          "HP": '2.693E3',
          "HR": '1.952E3',
          "UCV": '59.36',
          "AAV": '11.872',
          "APC": '5',
          "CON": '2.91E2',
          'PF': '9.98E-4',
          'S2': '3.43E4',
          'KEKT': '0.099',
          "url":"jjap.jsap.jp/link?JJAP/43/L540/",
          "ZT":'0.14',
          "formula": 'Sr0.9Y0.1Ti1O3',
          "comment": "*kappa estimated from 300K",
          "synthesis": "solid state reaction, air",
          "form": "polycrystalline",
          "temperature": "700",
          "author": "Obara 2004",
          "Structure": "ICSD #181231, 300K"}
    test(d1)
    test(d2)
    test(d3)
    test(d4)
    test(d5)
    return True
if __name__ == '__main__':
    # Optional single command-line argument: the output file name.
    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        filename = "mrl_data.txt"
    # Parenthesized print works under both Python 2 and Python 3
    # (single argument), unlike the bare print statement.
    print("Getting data")
    cleaned_data = get_all_data()
    print("Testing data")
    test_data(cleaned_data)
    print("Tests ok\nWriting to file")
    write_file(cleaned_data, filename)
    print("Done.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment