# bcbwilla/mrl_scraper.py

## mrl_scraper.py
"""
Scrapes all dimensions of data from http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp.

    RE:   electrical resistivity
    SE:   seebeck coefficient
    TH:   thermal conductivity
    AT:   average atomic mass
    SC:   scarcity
    HP:   HHI (production)
    HR:   HHI (reserves)
    PF:   power factor
    S2:   seebeck squared
    UCV:  Unit cell volume
    AAV:  Average atomic volume
    APC:  Atoms per cell
    CON:  electrical conductivity
    KEKT: Ke Ktotal
"""
import requests
import re
import ast
import csv
import sys

def extract_series(response_text):
    """ Takes the text of the html page and extracts plot data.

        The text parsed is what follows the second occurrence of "series:"
        in the page (up to the next "series:", if any), truncated at the
        first "}//".  That javascript fragment is rewritten into a Python
        literal and evaluated with ast.literal_eval.

        response_text: full text of the page.

        Returns whatever the rewritten fragment evaluates to; the caller in
        this file expects a list of data series.

        Raises IndexError if the page has fewer than two "series:" markers,
        and ValueError / SyntaxError (from ast.literal_eval) if the
        rewritten fragment is not a valid Python literal.

        Note: If the html structure of the page changed for some reason, this
        function may break.  All substitutions below are applied to the
        whole fragment, including the contents of quoted strings.
    """
    # extract the relevant part of the javascript code
    chunks = response_text.split("series:")
    if len(chunks) < 3:
        # same exception type as plain indexing, but with an explanation
        raise IndexError('expected at least two "series:" markers in the '
                         'page, found %d' % (len(chunks) - 1))
    s = chunks[2].split('}//')[0]

    # remove newline, tab, etc.; make null a Python literal; drop 'http://'
    # (otherwise 'http:' would be quoted by the key regex below)
    s = ' '.join(s.split()).replace('null', 'None').replace('http://', '')

    # put quotes around bare key names (any run of letters followed by ':')
    s = re.sub(r'[a-zA-Z]+:', lambda m: "'" + m.group(0)[:-1] + "':", s)

    # remove html tags and the "//Fetch the Nth list of data string" comments
    # (raw string so that \d and \D reach the regex engine untouched)
    s = re.sub(r'(<[^<]+?>|//Fetch the \d+\D+ list of data string)', '', s)

    return ast.literal_eval(s)


def generate_key(d):
    """ Builds the identifying 'key' string for one data point.

        The key is the point's formula, Structure, author and ZT entries
        followed by str() of its y entry, joined with no separator.
    """
    pieces = [d['formula'], d['Structure'], d['author'], d['ZT'], str(d['y'])]
    return ''.join(pieces)

def get_all_data():
    """ Gets all dimensions of the data.

        Posts one request per x-axis quantity (always plotted against the
        Seebeck coefficient, 'SE', on the y-axis), parses each response with
        extract_series and merges the points of all responses by the key
        from generate_key.

        Returns a dict mapping each key to a dict holding, for that data
        point: every x-axis code that was fetched (-> the point's 'x'
        value), 'SE' (-> its 'y' value), 'class' (-> the name of the series
        it was first seen in) and the common properties listed below.

        A request that does not come back with status 200 is reported on
        stdout and skipped, so its quantity is then missing from the result.

        Raises AssertionError if two points of the same response produce the
        same key.
    """

    url = "http://www.mrl.ucsb.edu:8080/datamine/UploadServlet"

    # Quantities to plot
    axis_options = ['RE', 'TH', 'AT', 'SC', 'HP', 'HR', 'UCV', 'AAV', 'APC', 'CON', 'PF', 'S2', 'KEKT', 'SE']

    post_data = {'PLOT_TYPE': 'STANDARD_PLOT_TYPE',
                 'Y_AXIS': 'SE',
                 'MARKER': 'ZT',
                 'SORTER': 'F_T'}
    y_axis = post_data['Y_AXIS']

    # Values common to all data points
    common_properties = ['author', 'comment', 'temperature', 'form', 'url', 'formula', 'Structure', 'synthesis', 'ZT']

    cleaned_data = {}

    for axis_option in axis_options:
        # skip the case of X == Y
        # (the original had a print after this continue that could never run)
        if axis_option == y_axis:
            continue

        post_data['X_AXIS'] = axis_option

        # get data from website
        response = requests.post(url, data=post_data, files={'fileChooser': ('', '')})

        if response.status_code != 200:
            # single-argument print(...) behaves the same on Python 2 and 3
            print("Trouble getting data")
            continue

        # get the series
        web_data = extract_series(response.text)

        # keys seen in this response only, for the uniqueness check below
        keys = []

        # each series is a different "class" of materials
        for series in web_data:
            for d in series['data']:
                key = generate_key(d)

                # membership is tested on the dict itself, not on .keys()
                sub_data = cleaned_data.setdefault(key, {})
                sub_data[axis_option] = d['x']

                # first time this point is seen: record the shared values
                if 'class' not in sub_data:
                    sub_data[y_axis] = d['y']
                    sub_data['class'] = series['name']

                    for prop in common_properties:
                        sub_data[prop] = d[prop]

                keys.append(key)

        # explicit raise instead of assert so the check survives python -O
        if len(keys) != len(set(keys)):
            raise AssertionError(
                "Error, data points not all unique for x-axis: %s" % axis_option)

    return cleaned_data


def write_file(data, filename):
    """ Write the data to a tab-delimited file.

        data:     dict whose values are the row dicts; its keys are not
                  written.
        filename: path of the file to (over)write.

        The header is the union of the keys of all rows, in order of first
        appearance, so a row holding a key that the first row lacks no
        longer makes csv.DictWriter raise ValueError.  Keys missing from a
        row are written as empty fields (DictWriter's default restval).
        With no rows at all, an empty file is written.
    """

    # turn into a list of dictionaries, no longer need keys
    rows = list(data.values())

    # collect every column name once, keeping first-seen order
    fieldnames = []
    seen = set()
    for row in rows:
        for name in row:
            if name not in seen:
                seen.add(name)
                fieldnames.append(name)

    with open(filename, 'w') as f:
        if not rows:
            return
        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)


def test_data(data):
    """ Checks the scraped data against a handful of hand-collected points.

        data: dict of scraped points indexed by the generate_key() string.

        Each reference point below is looked up in data through its
        generated key and every field except 'x' and 'y' is compared.
        Returns True when all points agree; raises AssertionError on the
        first mismatch and KeyError when a point or field is absent.
    """

    ito_1000k = {
        "x": '0.021',
        "y": -68.0,
        "RE": '0.021',
        "SE": -68.0,
        "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
        "ZT": '6.71E-3',
        "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
        "comment": "",
        "synthesis": "arc-melted, Ar",
        "form": "polycrystalline",
        "temperature": "1000",
        "author": "Ito 2006",
        "Structure": "ICSD #9119, 300K",
    }

    toberer_700k = {
        "x": '26',
        "y": 201.0,
        "APC": '26',
        "SE": 201.0,
        "url": "dx.doi.org/10.1002/adfm.201000970",
        "ZT": '0.47',
        "formula": 'Ca4.75Na0.25Al2Sb6',
        "comment": "",
        "synthesis": "solid state reaction, Ar",
        "form": "polycrystalline",
        "temperature": "700",
        "author": "Toberer 2010",
        "Structure": "ICSD #183853, 300K",
    }

    ito_300k = {
        "x": '601.91',
        "y": -226.0,
        "UCV": '601.91',
        "SE": -226.0,
        "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
        "ZT": '1.98E-3',
        "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
        "comment": "*extrapolated from 315 K",
        "synthesis": "arc-melted, Ar",
        "form": "polycrystalline",
        "temperature": "300",
        "author": "Ito 2006",
        "Structure": "ICSD #9119, 300K",
    }

    obara_400k = {
        "x": '1.26E-3',
        "y": -120.0,
        "RE": '1.26E-3',
        "SE": -120.0,
        "TH": "6",
        "AT": '36.72',
        "SC": '2.782E3',
        "HP": '2.693E3',
        "HR": '1.952E3',
        "UCV": '59.36',
        "AAV": '11.872',
        "APC": '5',
        "CON": '7.94E2',
        'PF': '1.13E-3',
        'S2': '1.43E4',
        'KEKT': '0.129',
        "url": "jjap.jsap.jp/link?JJAP/43/L540/",
        "ZT": '0.08',
        "formula": 'Sr0.9Y0.1Ti1O3',
        "comment": "*kappa estimated from 300K",
        "synthesis": "solid state reaction, air",
        "form": "polycrystalline",
        "temperature": "400",
        "author": "Obara 2004",
        "Structure": "ICSD #181231, 300K",
    }

    obara_700k = {
        "x": '3.44E-3',
        "y": -185.0,
        "RE": '3.44E-3',
        "SE": -185.0,
        "TH": '5',
        "AT": '36.72',
        "SC": '2.782E3',
        "HP": '2.693E3',
        "HR": '1.952E3',
        "UCV": '59.36',
        "AAV": '11.872',
        "APC": '5',
        "CON": '2.91E2',
        'PF': '9.98E-4',
        'S2': '3.43E4',
        'KEKT': '0.099',
        "url": "jjap.jsap.jp/link?JJAP/43/L540/",
        "ZT": '0.14',
        "formula": 'Sr0.9Y0.1Ti1O3',
        "comment": "*kappa estimated from 300K",
        "synthesis": "solid state reaction, air",
        "form": "polycrystalline",
        "temperature": "700",
        "author": "Obara 2004",
        "Structure": "ICSD #181231, 300K",
    }

    def check_point(expected):
        # find the scraped record carrying the same generated key
        scraped = data[generate_key(expected)]

        for field in expected.keys():
            # 'x' and 'y' only describe the plot the point was read from
            if field in ('x', 'y'):
                continue
            want = expected[field]
            got = scraped[field]
            assert want == got, "%s does not match, %s != %s" % (field, want, got)

    for point in (ito_1000k, toberer_700k, ito_300k, obara_400k, obara_700k):
        check_point(point)

    return True


if __name__ == '__main__':

    # Output path: the sole command-line argument when exactly one is
    # given, otherwise a default file name.
    if len(sys.argv) == 2:
        filename = sys.argv[1]
    else:
        filename = "mrl_data.txt"

    # print(...) with a single string argument runs on Python 2 and 3 alike,
    # whereas the print statement is a syntax error on Python 3.
    print("Getting data")
    cleaned_data = get_all_data()

    # test_data raises if the scraped data disagrees with the reference
    # points, in which case nothing is written.
    print("Testing data")
    test_data(cleaned_data)

    print("Tests ok\nWriting to file")
    write_file(cleaned_data, filename)

    print("Done.")
	"""
	Scrapes all dimensions of data from http://www.mrl.ucsb.edu:8080/datamine/thermoelectric.jsp.

	RE: electrical resistivity
	SE: seebeck coefficient
	TH: thermal conductivity
	AT: average atomic mass
	SC: scarcity
	HP: HHI (production)
	HR: HHI (reserves)
	PF: power factor
	S2: seebeck squared
	UCV: Unit cell volume
	AAV: Average atomic volume
	APC: Atoms per cell
	CON: electrical conductivity
	KEKT: Ke Ktotal
	"""
	import requests
	import re
	import ast
	import csv
	import sys

	def extract_series(response_text):
	    """ Takes the text of the html page and extracts plot data.

	        The text parsed is what follows the second occurrence of "series:"
	        in the page (up to the next "series:", if any), truncated at the
	        first "}//".  That javascript fragment is rewritten into a Python
	        literal and evaluated with ast.literal_eval.

	        response_text: full text of the page.

	        Returns whatever the rewritten fragment evaluates to; the caller in
	        this file expects a list of data series.

	        Raises IndexError if the page has fewer than two "series:" markers,
	        and ValueError / SyntaxError (from ast.literal_eval) if the
	        rewritten fragment is not a valid Python literal.

	        Note: If the html structure of the page changed for some reason, this
	        function may break.  All substitutions below are applied to the
	        whole fragment, including the contents of quoted strings.
	    """
	    # extract the relevant part of the javascript code
	    chunks = response_text.split("series:")
	    if len(chunks) < 3:
	        # same exception type as plain indexing, but with an explanation
	        raise IndexError('expected at least two "series:" markers in the '
	                         'page, found %d' % (len(chunks) - 1))
	    s = chunks[2].split('}//')[0]

	    # remove newline, tab, etc.; make null a Python literal; drop 'http://'
	    # (otherwise 'http:' would be quoted by the key regex below)
	    s = ' '.join(s.split()).replace('null', 'None').replace('http://', '')

	    # put quotes around bare key names (any run of letters followed by ':')
	    s = re.sub(r'[a-zA-Z]+:', lambda m: "'" + m.group(0)[:-1] + "':", s)

	    # remove html tags and the "//Fetch the Nth list of data string" comments.
	    # The '|' must be a plain alternation: written as '\|' it matches a
	    # literal pipe and neither tags nor comments are stripped.
	    s = re.sub(r'(<[^<]+?>|//Fetch the \d+\D+ list of data string)', '', s)

	    return ast.literal_eval(s)


	def generate_key(d):
	    """ Generates a unique 'key' for each data point.

	        The key is the point's formula, Structure, author and ZT entries
	        followed by str() of its y entry, concatenated with no separator.
	    """
	    return d['formula'] + d['Structure'] + d['author'] + d['ZT'] + str(d['y'])

	def get_all_data():
	    """ Gets all dimensions of the data.

	        Posts one request per x-axis quantity (always plotted against the
	        Seebeck coefficient, 'SE', on the y-axis), parses each response with
	        extract_series and merges the points of all responses by the key
	        from generate_key.

	        Returns a dict mapping each key to a dict holding, for that data
	        point: every x-axis code that was fetched (-> the point's 'x'
	        value), 'SE' (-> its 'y' value), 'class' (-> the name of the series
	        it was first seen in) and the common properties listed below.

	        A request that does not come back with status 200 is reported on
	        stdout and skipped, so its quantity is then missing from the result.

	        Raises AssertionError if two points of the same response produce the
	        same key.
	    """

	    url = "http://www.mrl.ucsb.edu:8080/datamine/UploadServlet"

	    # Quantities to plot
	    axis_options = ['RE', 'TH', 'AT', 'SC', 'HP', 'HR', 'UCV', 'AAV', 'APC', 'CON', 'PF', 'S2', 'KEKT', 'SE']

	    post_data = {'PLOT_TYPE': 'STANDARD_PLOT_TYPE',
	                 'Y_AXIS': 'SE',
	                 'MARKER': 'ZT',
	                 'SORTER': 'F_T'}
	    y_axis = post_data['Y_AXIS']

	    # Values common to all data points
	    common_properties = ['author', 'comment', 'temperature', 'form', 'url', 'formula', 'Structure', 'synthesis', 'ZT']

	    cleaned_data = {}

	    for axis_option in axis_options:
	        # skip the case of X == Y
	        # (the original had a print after this continue that could never run)
	        if axis_option == y_axis:
	            continue

	        post_data['X_AXIS'] = axis_option

	        # get data from website
	        response = requests.post(url, data=post_data, files={'fileChooser': ('', '')})

	        if response.status_code != 200:
	            # single-argument print(...) behaves the same on Python 2 and 3
	            print("Trouble getting data")
	            continue

	        # get the series
	        web_data = extract_series(response.text)

	        # keys seen in this response only, for the uniqueness check below
	        keys = []

	        # each series is a different "class" of materials
	        for series in web_data:
	            for d in series['data']:
	                key = generate_key(d)

	                # membership is tested on the dict itself, not on .keys()
	                sub_data = cleaned_data.setdefault(key, {})
	                sub_data[axis_option] = d['x']

	                # first time this point is seen: record the shared values
	                if 'class' not in sub_data:
	                    sub_data[y_axis] = d['y']
	                    sub_data['class'] = series['name']

	                    for prop in common_properties:
	                        sub_data[prop] = d[prop]

	                keys.append(key)

	        # explicit raise instead of assert so the check survives python -O
	        if len(keys) != len(set(keys)):
	            raise AssertionError(
	                "Error, data points not all unique for x-axis: %s" % axis_option)

	    return cleaned_data


	def write_file(data, filename):
	    """ Write the data to a tab-delimited file.

	        data:     dict whose values are the row dicts; its keys are not
	                  written.
	        filename: path of the file to (over)write.

	        The header is the union of the keys of all rows, in order of first
	        appearance, so a row holding a key that the first row lacks does
	        not make csv.DictWriter raise ValueError.  Keys missing from a
	        row are written as empty fields (DictWriter's default restval).
	        With no rows at all, an empty file is written.
	    """

	    # turn into a list of dictionaries, no longer need keys
	    rows = list(data.values())

	    # collect every column name once, keeping first-seen order
	    fieldnames = []
	    seen = set()
	    for row in rows:
	        for name in row:
	            if name not in seen:
	                seen.add(name)
	                fieldnames.append(name)

	    with open(filename, 'w') as f:
	        if not rows:
	            return
	        writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter='\t')
	        writer.writeheader()
	        writer.writerows(rows)


	def test_data(data):
	    """ Compares scraped data to a few points collected by hand.

	        data: dict of scraped points indexed by the generate_key() string.

	        Each reference point below is looked up in data through its
	        generated key and every field except 'x' and 'y' is compared.
	        Returns True when all points agree; raises AssertionError on the
	        first mismatch and KeyError when a point or field is absent.
	    """

	    def test(d):
	        # find the scraped record carrying the same generated key
	        d_key = generate_key(d)
	        d_compare = data[d_key]

	        for key in d.keys():
	            # 'x' and 'y' only describe the plot the point was read from
	            if key != 'x' and key != 'y':
	                dv1 = d[key]
	                dv2 = d_compare[key]
	                assert dv1 == dv2, "%s does not match, %s != %s" % (key, dv1, dv2)

	    d1 = {"x": '0.021',
	          "y": -68.,
	          "RE": '0.021',
	          "SE": -68.,
	          "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
	          "ZT": '6.71E-3',
	          "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
	          "comment": "",
	          "synthesis": "arc-melted, Ar",
	          "form": "polycrystalline",
	          "temperature": "1000",
	          "author": "Ito 2006",
	          "Structure": "ICSD #9119, 300K"}

	    d2 = {"x": '26',
	          "y": 201.,
	          "APC": '26',
	          "SE": 201.,
	          "url": "dx.doi.org/10.1002/adfm.201000970",
	          "ZT": '0.47',
	          "formula": 'Ca4.75Na0.25Al2Sb6',
	          "comment": "",
	          "synthesis": "solid state reaction, Ar",
	          "form": "polycrystalline",
	          "temperature": "700",
	          "author": "Toberer 2010",
	          "Structure": "ICSD #183853, 300K"}

	    d3 = {"x": '601.91',
	          "y": -226.,
	          "UCV": '601.91',
	          "SE": -226.,
	          "url": "dx.doi.org/10.1016/j.jallcom.2005.04.060",
	          "ZT": '1.98E-3',
	          "formula": 'Fe0.978Co0.00196Si1.96Y0.12O0.18',
	          "comment": "*extrapolated from 315 K",
	          "synthesis": "arc-melted, Ar",
	          "form": "polycrystalline",
	          "temperature": "300",
	          "author": "Ito 2006",
	          "Structure": "ICSD #9119, 300K"}

	    d4 = {"x": '1.26E-3',
	          "y": -120.,
	          "RE": '1.26E-3',
	          "SE": -120.,
	          "TH": "6",
	          "AT": '36.72',
	          "SC": '2.782E3',
	          "HP": '2.693E3',
	          "HR": '1.952E3',
	          "UCV": '59.36',
	          "AAV": '11.872',
	          "APC": '5',
	          "CON": '7.94E2',
	          'PF': '1.13E-3',
	          'S2': '1.43E4',
	          'KEKT': '0.129',
	          "url": "jjap.jsap.jp/link?JJAP/43/L540/",
	          "ZT": '0.08',
	          "formula": 'Sr0.9Y0.1Ti1O3',
	          "comment": "*kappa estimated from 300K",
	          "synthesis": "solid state reaction, air",
	          "form": "polycrystalline",
	          "temperature": "400",
	          "author": "Obara 2004",
	          "Structure": "ICSD #181231, 300K"}

	    d5 = {"x": '3.44E-3',
	          "y": -185.,
	          "RE": '3.44E-3',
	          "SE": -185.,
	          "TH": '5',
	          "AT": '36.72',
	          "SC": '2.782E3',
	          "HP": '2.693E3',
	          "HR": '1.952E3',
	          "UCV": '59.36',
	          "AAV": '11.872',
	          "APC": '5',
	          "CON": '2.91E2',
	          'PF': '9.98E-4',
	          'S2': '3.43E4',
	          'KEKT': '0.099',
	          "url": "jjap.jsap.jp/link?JJAP/43/L540/",
	          "ZT": '0.14',
	          "formula": 'Sr0.9Y0.1Ti1O3',
	          "comment": "*kappa estimated from 300K",
	          "synthesis": "solid state reaction, air",
	          "form": "polycrystalline",
	          "temperature": "700",
	          "author": "Obara 2004",
	          "Structure": "ICSD #181231, 300K"}

	    test(d1)
	    test(d2)
	    test(d3)
	    test(d4)
	    test(d5)

	    return True


	if __name__ == '__main__':

	    # Output path: the sole command-line argument when exactly one is
	    # given, otherwise a default file name.
	    if len(sys.argv) == 2:
	        filename = sys.argv[1]
	    else:
	        filename = "mrl_data.txt"

	    # print(...) with a single string argument runs on Python 2 and 3 alike,
	    # whereas the print statement is a syntax error on Python 3.
	    print("Getting data")
	    cleaned_data = get_all_data()

	    # test_data raises if the scraped data disagrees with the reference
	    # points, in which case nothing is written.
	    print("Testing data")
	    test_data(cleaned_data)

	    print("Tests ok\nWriting to file")
	    write_file(cleaned_data, filename)

	    print("Done.")