flother/iceland_earthquakes.py

## iceland_earthquakes.py
# coding: UTF-8
"""
Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa
Íslands) from 1991 to 2015.

The data is collected from zipped KML files downloaded from vedur.is.
"""
from collections import OrderedDict
from cStringIO import StringIO
import datetime
import re
import sys
import zipfile

from csvkit.unicsv import UnicodeCSVWriter
import lxml.etree
import requests


YEAR_KML = "http://hraun.vedur.is/ja/google/kmls/{year}/{year}list.kmz"
YEARS = range(1991, 2016)
# Regular expression to parse date, time, depth in km and Richter magnitude
# from the text description in the Placemark elements.
DESC_RE = re.compile(r"""
    Time:\s(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})\s
    (?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})\.(?P<fraction>\d+)
    <br>
    Depth:\s(?P<depth>[\d\.]+)km\s<br>\sSize:\s(?P<size>\-?[\d\.]+|\*+)Ml
    <br>
""", re.X)
# Elements of interest (i.e. those that contain data) in the KML files.
PLACEMARK_ELEMENT = "{http://earth.google.com/kml/2.0}Placemark"
DESC_ELEMENT = "{http://earth.google.com/kml/2.0}description"
COORDS_ELEMENT = "{http://earth.google.com/kml/2.0}coordinates"
ELEMENTS = (PLACEMARK_ELEMENT, DESC_ELEMENT, COORDS_ELEMENT)
ATTRS = ["date_time", "latitude", "longitude", "altitude", "depth", "size"]


# Output data as a CSV to stdout.
csv = UnicodeCSVWriter(sys.stdout)
# Write header row.
csv.writerow(ATTRS)
# Download a zipped KML for each year.
for year in YEARS:
    response = requests.get(YEAR_KML.format(year=year))
    kml_zip = zipfile.ZipFile(StringIO(response.content))
    # Unzip and parse each KML file found in the ZIP. Uncompressed KML files
    # are available for download too, but the data for 2008 is truncated there,
    # so the ZIPs are used instead.
    for kml in kml_zip.infolist():
        earthquake = None
        # Iteratively parse each element of interest.
        for action, elem in lxml.etree.iterparse(kml_zip.open(kml),
                                                 tag=ELEMENTS,
                                                 events=("start", "end",)):
            if elem.tag == PLACEMARK_ELEMENT:
                if action == "start":
                    # If it's a new Placemark element, it's a new earthquake,
                    # so reset the data dictionary.
                    earthquake = dict(zip(ATTRS, [None] * len(ATTRS)))
                elif action == "end":
                    # If it's the end of a Placemark element, it's the end of
                    # the data on a single earthquake.
                    if None not in earthquake.values():
                        # At this point all required data is available for a
                        # single earthquake, so output the earthquake to the
                        # CSV.
                        csv.writerow([earthquake[k] for k in ATTRS])
                    else:
                        # We're missing at least one value for a single
                        # earthquake. Print an error and don't add the
                        # earthquake to the CSV.
                        missing_values = ", ".join((k for k, v in
                                                   earthquake.iteritems()
                                                   if v is None))
                        print >> sys.stderr, ("Discarding placemark missing "
                                              "{}".format(missing_values))
            elif earthquake is not None and action == "end":
                if elem.tag == DESC_ELEMENT:
                    # Parse the date and time of the earthquake, along with its
                    # depth below the surface and its magnitude. Date and time
                    # are available in an XML element but depth and magnitude
                    # aren't, so it's all collected from the description using
                    # a regular expression.
                    match = DESC_RE.search(elem.text)
                    if match:
                        timetuple = map(int, match.groups()[:7])
                        # Multiply second fraction by 10,000 to make it
                        # microseconds.
                        timetuple[6] = timetuple[6] * 10000
                        earthquake["date_time"] = datetime.datetime(*timetuple)
                        earthquake["depth"] = match.groupdict()["depth"]
                        size = match.groupdict()["size"]
                        # Size can, in a couple of cases, be "****". In that
                        # case store an empty string instead.
                        try:
                            float(size)
                        except ValueError:
                            print >> sys.stderr, ("Size '{}' not a number; "
                                                  "discarding.".format(size))
                            size = ""
                        earthquake["size"] = size
                    else:
                        print >> sys.stderr, "Can't parse '{}'.".format(
                            elem.text)
                elif elem.tag == COORDS_ELEMENT:
                    # Longitude, latitude, and altitude are stored in a
                    # comma-separated string in the coordinates element.
                    try:
                        coords = elem.text.split(",")
                        # A couple of longitudes have multiple decimal places,
                        # e.g. 12.3456.789. Fortunately in these cases it's
                        # always the first decimal place that's the correct
                        # one, so we can strip out the others to make it a
                        # valid floating-point number.
                        if len(coords) != 3:
                            print >> sys.stderr, ("Can't parse coords from "
                                                  "'{}'.".format(elem.text))
                        else:
                            lon, lat, alt = coords
                            if lon.count(".") > 1:
                                lon_parts = lon.split(".", 1)
                                lon = lon_parts[0] + lon_parts[1].replace(".",
                                                                          "")
                            earthquake["latitude"] = float(lat)
                            earthquake["longitude"] = float(lon)
                            earthquake["altitude"] = float(alt)
                    except ValueError:
                        print >> sys.stderr, ("Can't parse coordinates from "
                                              "'{}'.".format(elem.text))
	# coding: UTF-8
	"""
	Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa
	Íslands) from 1991 to 2015.

	The data is collected from zipped KML files downloaded from vedur.is.
	"""
	from collections import OrderedDict
	from cStringIO import StringIO
	import datetime
	import re
	import sys
	import zipfile

	from csvkit.unicsv import UnicodeCSVWriter
	import lxml.etree
	import requests


	YEAR_KML = "http://hraun.vedur.is/ja/google/kmls/{year}/{year}list.kmz"
	YEARS = range(1991, 2016)
	# Regular expression to parse date, time, depth in km and Richter magnitude
	# from the text description in the Placemark elements.
	DESC_RE = re.compile(r"""
	Time:\s(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})\s
	(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})\.(?P<fraction>\d+)
	<br>
	Depth:\s(?P<depth>[\d\.]+)km\s<br>\sSize:\s(?P<size>\-?[\d\.]+\|\*+)Ml
	<br>
	""", re.X)
	# Elements of interest (i.e. those that contain data) in the KML files.
	PLACEMARK_ELEMENT = "{http://earth.google.com/kml/2.0}Placemark"
	DESC_ELEMENT = "{http://earth.google.com/kml/2.0}description"
	COORDS_ELEMENT = "{http://earth.google.com/kml/2.0}coordinates"
	ELEMENTS = (PLACEMARK_ELEMENT, DESC_ELEMENT, COORDS_ELEMENT)
	ATTRS = ["date_time", "latitude", "longitude", "altitude", "depth", "size"]


	# Output data as a CSV to stdout.
	csv = UnicodeCSVWriter(sys.stdout)
	# Write header row.
	csv.writerow(ATTRS)
	# Download a zipped KML for each year.
	for year in YEARS:
	response = requests.get(YEAR_KML.format(year=year))
	kml_zip = zipfile.ZipFile(StringIO(response.content))
	# Unzip and parse each KML file found in the ZIP. Uncompressed KML files
	# are available for download too, but the data for 2008 is truncated there,
	# so the ZIPs are used instead.
	for kml in kml_zip.infolist():
	earthquake = None
	# Iteratively parse each element of interest.
	for action, elem in lxml.etree.iterparse(kml_zip.open(kml),
	tag=ELEMENTS,
	events=("start", "end",)):
	if elem.tag == PLACEMARK_ELEMENT:
	if action == "start":
	# If it's a new Placemark element, it's a new earthquake,
	# so reset the data dictionary.
	earthquake = dict(zip(ATTRS, [None] * len(ATTRS)))
	elif action == "end":
	# If it's the end of a Placemark element, it's the end of
	# the data on a single earthquake.
	if None not in earthquake.values():
	# At this point all required data is available for a
	# single earthquake, so output the earthquake to the
	# CSV.
	csv.writerow([earthquake[k] for k in ATTRS])
	else:
	# We're missing at least one value for a single
	# earthquake. Print an error and don't add the
	# earthquake to the CSV.
	missing_values = ", ".join((k for k, v in
	earthquake.iteritems()
	if v is None))
	print >> sys.stderr, ("Discarding placemark missing "
	"{}".format(missing_values))
	elif earthquake is not None and action == "end":
	if elem.tag == DESC_ELEMENT:
	# Parse the date and time of the earthquake, along with its
	# depth below the surface and its magnitude. Date and time
	# are available in an XML element but depth and magnitude
	# aren't, so it's all collected from the description using
	# a regular expression.
	match = DESC_RE.search(elem.text)
	if match:
	timetuple = map(int, match.groups()[:7])
	# Multiply second fraction by 10,000 to make it
	# microseconds.
	timetuple[6] = timetuple[6] * 10000
	earthquake["date_time"] = datetime.datetime(*timetuple)
	earthquake["depth"] = match.groupdict()["depth"]
	size = match.groupdict()["size"]
	# Size can, in a couple of cases, be "****". In that
	# case store an empty string instead.
	try:
	float(size)
	except ValueError:
	print >> sys.stderr, ("Size '{}' not a number; "
	"discarding.".format(size))
	size = ""
	earthquake["size"] = size
	else:
	print >> sys.stderr, "Can't parse '{}'.".format(
	elem.text)
	elif elem.tag == COORDS_ELEMENT:
	# Longitude, latitude, and altitude are stored in a
	# comma-separated string in the coordinates element.
	try:
	coords = elem.text.split(",")
	# A couple of longitudes have multiple decimal places,
	# e.g. 12.3456.789. Fortunately in these cases it's
	# always the first decimal place that's the correct
	# one, so we can strip out the others to make it a
	# valid floating-point number.
	if len(coords) != 3:
	print >> sys.stderr, ("Can't parse coords from "
	"'{}'.".format(elem.text))
	else:
	lon, lat, alt = coords
	if lon.count(".") > 1:
	lon_parts = lon.split(".", 1)
	lon = lon_parts[0] + lon_parts[1].replace(".",
	"")
	earthquake["latitude"] = float(lat)
	earthquake["longitude"] = float(lon)
	earthquake["altitude"] = float(alt)
	except ValueError:
	print >> sys.stderr, ("Can't parse coordinates from "
	"'{}'.".format(elem.text))