Last active
August 29, 2015 14:15
-
-
Save bcbwilla/777fc251e7e001c5668e to your computer and use it in GitHub Desktop.
Scrapes data from the mrl.ucsb.edu thermoelectric materials datamine.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Scrapes all dimensions of data from http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp.

Axis codes used when requesting the data:

RE: electrical resistivity
SE: Seebeck coefficient
TH: thermal conductivity
AT: average atomic mass
SC: scarcity
HP: HHI (production)
HR: HHI (reserves)
PF: power factor
S2: Seebeck coefficient squared
UCV: unit cell volume
AAV: average atomic volume
APC: atoms per cell
CON: electrical conductivity
KEKT: Ke Ktotal
""" | |
import requests | |
import re | |
import ast | |
import csv | |
import sys | |
def extract_series(response_text):
    """Extract the plot data series embedded in the page's JavaScript.

    The page source contains a JavaScript object literal introduced by
    ``series:``.  This function cuts that literal out of the text, massages
    it into valid Python literal syntax and evaluates it.

    Args:
        response_text: Full text of the HTML page returned by the server.

    Returns:
        The evaluated literal: a list of data series.

    Raises:
        ValueError: If the page does not contain the expected ``series:``
            markers, or if the cleaned text is not a valid literal.
        SyntaxError: If the cleaned text cannot be parsed as a literal.

    Note: the extraction is tied to the exact layout of the page; if the
    HTML/JavaScript structure changes, this function will stop working.
    """
    # The data of interest follows the *second* "series:" marker and ends at
    # the first "}//" after it.
    chunks = response_text.split("series:")
    if len(chunks) < 3:
        raise ValueError(
            "Could not locate plot data: expected at least two 'series:' "
            "markers in the page (has the page layout changed?)")
    s = chunks[2].split('}//')[0]

    # Collapse newlines, tabs and repeated blanks into single spaces.
    s = ' '.join(s.split())
    # JavaScript null -> Python None.  The 'http://' prefix is dropped as
    # well; presumably so that 'http:' is not mistaken for an unquoted key
    # by the substitution below (URLs end up stored without a scheme).
    s = s.replace('null', 'None').replace('http://', '')

    # JavaScript allows bare key names; wrap every `name:` in quotes so the
    # text becomes a valid Python dict literal.
    s = re.sub(r'[a-zA-Z]+:', lambda x: "'" + x.group(0)[:-1] + "':", s)

    # Strip HTML tags and the "//Fetch the Nth list of data string" comments.
    # Raw string: \d and \D must reach the regex engine untouched.
    s = re.sub(r'(<[^<]+?>|//Fetch the \d+\D+ list of data string)', '', s)

    # literal_eval only accepts literals, so no code from the page is run.
    return ast.literal_eval(s)
def generate_key(d):
    """Build the identifier used to recognise one data point across plots.

    The key is the concatenation of the point's 'formula', 'Structure',
    'author' and 'ZT' values followed by the string form of its 'y' value.
    """
    text_fields = ('formula', 'Structure', 'author', 'ZT')
    pieces = [d[field] for field in text_fields]
    pieces.append(str(d['y']))
    return ''.join(pieces)
def get_all_data():
    """Scrape every plot dimension and merge them into one record per point.

    The site is queried once per x-axis quantity, always with the Seebeck
    coefficient ('SE') on the y-axis.  Points returned by the different
    queries are matched through ``generate_key`` and merged.

    Returns:
        dict mapping each point's key to a dict holding:
          * one entry per successfully fetched axis code (the point's 'x'
            value for that query),
          * 'SE' (the point's 'y' value),
          * 'class' (name of the series the point belongs to),
          * the properties shared by all queries (author, comment, ...).

    Raises:
        AssertionError: If two points of a single query produce the same key.
    """
    url = "http://www.mrl.ucsb.edu:8080/datamine/UploadServlet"

    # Quantities to plot on the x-axis.  'SE' is listed but always skipped
    # below because it is the fixed y-axis.
    axis_options = ['RE', 'TH', 'AT', 'SC', 'HP', 'HR', 'UCV', 'AAV', 'APC',
                    'CON', 'PF', 'S2', 'KEKT', 'SE']

    post_data = {'PLOT_TYPE': 'STANDARD_PLOT_TYPE',
                 'Y_AXIS': 'SE',
                 'MARKER': 'ZT',
                 'SORTER': 'F_T'}
    y_axis = post_data['Y_AXIS']

    # Values common to all data points; copied once, on first sight of a point.
    common_properties = ['author', 'comment', 'temperature', 'form', 'url',
                         'formula', 'Structure', 'synthesis', 'ZT']

    cleaned_data = {}

    for axis_option in axis_options:
        # Skip the case of X == Y.
        if axis_option == y_axis:
            continue

        # Single-argument print(...) gives identical output on Python 2 and 3.
        print("%s continuing" % axis_option)

        post_data['X_AXIS'] = axis_option

        # NOTE(review): the empty 'fileChooser' part presumably mimics the
        # site's upload form field -- confirm against the page's form.
        response = requests.post(url, data=post_data,
                                 files={'fileChooser': ('', '')})

        if response.status_code != 200:
            # Best effort: this axis will simply be missing from the records.
            print("Trouble getting data")
            continue

        web_data = extract_series(response.text)

        # Keys seen for this axis only; used for the uniqueness check below.
        keys = []

        # Each series is a different "class" of materials.
        for series in web_data:
            for d in series['data']:
                key = generate_key(d)

                # Direct dict lookup instead of scanning a list of keys.
                sub_data = cleaned_data.setdefault(key, {})
                sub_data[axis_option] = d['x']

                # First time this point is seen: record the shared values.
                if 'class' not in sub_data:
                    sub_data[y_axis] = d['y']
                    sub_data['class'] = series['name']
                    for prop in common_properties:
                        sub_data[prop] = d[prop]

                keys.append(key)

        # A duplicate key would mean two points were silently merged.  Raised
        # explicitly so the check also runs under `python -O`.
        if len(keys) != len(set(keys)):
            raise AssertionError(
                "Error, data points not all unique for x-axis: %s" % axis_option)

    return cleaned_data
def write_file(data, filename):
    """Write the records to a tab-separated file with a header row.

    Args:
        data: dict whose values are the records (dicts) to write; the keys
            of ``data`` itself are not written.
        filename: Path of the file to create or overwrite.

    The header is the union of the field names of all records, in
    first-seen order, so records are allowed to have differing fields;
    fields a record lacks are written as empty cells.  If ``data`` is empty
    an empty file is produced.
    """
    # Only the records are needed from here on, not their keys.
    rows = list(data.values())

    # Collect every field name once, keeping first-seen order.  Using only
    # the first record's fields would make DictWriter reject any later
    # record that carries an extra field.
    fieldnames = []
    seen = set()
    for row in rows:
        for name in row:
            if name not in seen:
                seen.add(name)
                fieldnames.append(name)

    with open(filename, 'w') as f:
        if not rows:
            # Nothing to write: leave the file empty.
            return
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)
def test_data(data):
    """Spot-check scraped records against reference points collected by hand.

    Args:
        data: dict of scraped records indexed by ``generate_key`` keys.

    Returns:
        True when every reference point matches.

    Raises:
        AssertionError: If a field of a reference point differs from the
            scraped record.
        KeyError: If a reference point or one of its fields is absent
            from ``data``.
    """
    reference_points = (
        {"x": "0.021",
         "y": -68.0,
         "RE": "0.021",
         "SE": -68.0,
         "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
         "ZT": "6.71E-3",
         "formula": "Fe0.978Co0.00196Si1.96Y0.12O0.18",
         "comment": "",
         "synthesis": "arc-melted, Ar",
         "form": "polycrystalline",
         "temperature": "1000",
         "author": "Ito 2006",
         "Structure": "ICSD #9119, 300K"},
        {"x": "26",
         "y": 201.0,
         "APC": "26",
         "SE": 201.0,
         "url": "dx.doi.org/10.1002/adfm.201000970",
         "ZT": "0.47",
         "formula": "Ca4.75Na0.25Al2Sb6",
         "comment": "",
         "synthesis": "solid state reaction, Ar",
         "form": "polycrystalline",
         "temperature": "700",
         "author": "Toberer 2010",
         "Structure": "ICSD #183853, 300K"},
        {"x": "601.91",
         "y": -226.0,
         "UCV": "601.91",
         "SE": -226.0,
         "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
         "ZT": "1.98E-3",
         "formula": "Fe0.978Co0.00196Si1.96Y0.12O0.18",
         "comment": "*extrapolated from 315 K",
         "synthesis": "arc-melted, Ar",
         "form": "polycrystalline",
         "temperature": "300",
         "author": "Ito 2006",
         "Structure": "ICSD #9119, 300K"},
        {"x": "1.26E-3",
         "y": -120.0,
         "RE": "1.26E-3",
         "SE": -120.0,
         "TH": "6",
         "AT": "36.72",
         "SC": "2.782E3",
         "HP": "2.693E3",
         "HR": "1.952E3",
         "UCV": "59.36",
         "AAV": "11.872",
         "APC": "5",
         "CON": "7.94E2",
         "PF": "1.13E-3",
         "S2": "1.43E4",
         "KEKT": "0.129",
         "url": "jjap.jsap.jp/link?JJAP/43/L540/",
         "ZT": "0.08",
         "formula": "Sr0.9Y0.1Ti1O3",
         "comment": "*kappa estimated from 300K",
         "synthesis": "solid state reaction, air",
         "form": "polycrystalline",
         "temperature": "400",
         "author": "Obara 2004",
         "Structure": "ICSD #181231, 300K"},
        {"x": "3.44E-3",
         "y": -185.0,
         "RE": "3.44E-3",
         "SE": -185.0,
         "TH": "5",
         "AT": "36.72",
         "SC": "2.782E3",
         "HP": "2.693E3",
         "HR": "1.952E3",
         "UCV": "59.36",
         "AAV": "11.872",
         "APC": "5",
         "CON": "2.91E2",
         "PF": "9.98E-4",
         "S2": "3.43E4",
         "KEKT": "0.099",
         "url": "jjap.jsap.jp/link?JJAP/43/L540/",
         "ZT": "0.14",
         "formula": "Sr0.9Y0.1Ti1O3",
         "comment": "*kappa estimated from 300K",
         "synthesis": "solid state reaction, air",
         "form": "polycrystalline",
         "temperature": "700",
         "author": "Obara 2004",
         "Structure": "ICSD #181231, 300K"},
    )

    def verify(expected):
        # Find the scraped record for the same data point.
        scraped = data[generate_key(expected)]
        for field in expected.keys():
            # 'x' and 'y' only feed the key / mirror other fields; the
            # scraped records do not store them under those names.
            if field == 'x' or field == 'y':
                continue
            want = expected[field]
            got = scraped[field]
            assert want == got, "%s does not match, %s != %s" % (field, want, got)

    for point in reference_points:
        verify(point)

    return True
if __name__ == '__main__':
    # Usage: script.py [output_file]
    # The output path is taken from the command line only when exactly one
    # argument is given; otherwise the default name is used.
    filename = sys.argv[1] if len(sys.argv) == 2 else "mrl_data.txt"

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print("Getting data")
    cleaned_data = get_all_data()

    print("Testing data")
    # Raises on mismatch, so nothing is written unless the spot checks pass.
    test_data(cleaned_data)

    print("Tests ok\nWriting to file")
    write_file(cleaned_data, filename)

    print("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment