Skip to content

Instantly share code, notes, and snippets.

@flother
Last active September 3, 2015 22:44
Show Gist options
  • Save flother/91bc218212a4ce7783cb to your computer and use it in GitHub Desktop.
Save flother/91bc218212a4ce7783cb to your computer and use it in GitHub Desktop.
Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa Íslands) from 1991 to 2014.
# coding: UTF-8
"""
Scrape earthquakes measured by the Icelandic Meteorological Office (Veðurstofa
Íslands) from 1991 to 2015.
The data is collected from zipped KML files downloaded from vedur.is.
"""
from collections import OrderedDict
from cStringIO import StringIO
import datetime
import re
import sys
import zipfile
from csvkit.unicsv import UnicodeCSVWriter
import lxml.etree
import requests
YEAR_KML = "http://hraun.vedur.is/ja/google/kmls/{year}/{year}list.kmz"
YEARS = range(1991, 2016)
# Regular expression to parse date, time, depth in km and Richter magnitude
# from the text description in the Placemark elements.
DESC_RE = re.compile(r"""
Time:\s(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})\s
(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2})\.(?P<fraction>\d+)
<br>
Depth:\s(?P<depth>[\d\.]+)km\s<br>\sSize:\s(?P<size>\-?[\d\.]+|\*+)Ml
<br>
""", re.X)
# Elements of interest (i.e. those that contain data) in the KML files.
PLACEMARK_ELEMENT = "{http://earth.google.com/kml/2.0}Placemark"
DESC_ELEMENT = "{http://earth.google.com/kml/2.0}description"
COORDS_ELEMENT = "{http://earth.google.com/kml/2.0}coordinates"
ELEMENTS = (PLACEMARK_ELEMENT, DESC_ELEMENT, COORDS_ELEMENT)
ATTRS = ["date_time", "latitude", "longitude", "altitude", "depth", "size"]
# Output data as a CSV to stdout.
csv = UnicodeCSVWriter(sys.stdout)
# Write header row.
csv.writerow(ATTRS)
# Download a zipped KML for each year.
for year in YEARS:
response = requests.get(YEAR_KML.format(year=year))
kml_zip = zipfile.ZipFile(StringIO(response.content))
# Unzip and parse each KML file found in the ZIP. Uncompressed KML files
# are available for download too, but the data for 2008 is truncated there,
# so the ZIPs are used instead.
for kml in kml_zip.infolist():
earthquake = None
# Iteratively parse each element of interest.
for action, elem in lxml.etree.iterparse(kml_zip.open(kml),
tag=ELEMENTS,
events=("start", "end",)):
if elem.tag == PLACEMARK_ELEMENT:
if action == "start":
# If it's a new Placemark element, it's a new earthquake,
# so reset the data dictionary.
earthquake = dict(zip(ATTRS, [None] * len(ATTRS)))
elif action == "end":
# If it's the end of a Placemark element, it's the end of
# the data on a single earthquake.
if None not in earthquake.values():
# At this point all required data is available for a
# single earthquake, so output the earthquake to the
# CSV.
csv.writerow([earthquake[k] for k in ATTRS])
else:
# We're missing at least one value for a single
# earthquake. Print an error and don't add the
# earthquake to the CSV.
missing_values = ", ".join((k for k, v in
earthquake.iteritems()
if v is None))
print >> sys.stderr, ("Discarding placemark missing "
"{}".format(missing_values))
elif earthquake is not None and action == "end":
if elem.tag == DESC_ELEMENT:
# Parse the date and time of the earthquake, along with its
# depth below the surface and its magnitude. Date and time
# are available in an XML element but depth and magnitude
# aren't, so it's all collected from the description using
# a regular expression.
match = DESC_RE.search(elem.text)
if match:
timetuple = map(int, match.groups()[:7])
# Multiply second fraction by 10,000 to make it
# microseconds.
timetuple[6] = timetuple[6] * 10000
earthquake["date_time"] = datetime.datetime(*timetuple)
earthquake["depth"] = match.groupdict()["depth"]
size = match.groupdict()["size"]
# Size can, in a couple of cases, be "****". In that
# case store an empty string instead.
try:
float(size)
except ValueError:
print >> sys.stderr, ("Size '{}' not a number; "
"discarding.".format(size))
size = ""
earthquake["size"] = size
else:
print >> sys.stderr, "Can't parse '{}'.".format(
elem.text)
elif elem.tag == COORDS_ELEMENT:
# Longitude, latitude, and altitude are stored in a
# comma-separated string in the coordinates element.
try:
coords = elem.text.split(",")
# A couple of longitudes have multiple decimal places,
# e.g. 12.3456.789. Fortunately in these cases it's
# always the first decimal place that's the correct
# one, so we can strip out the others to make it a
# valid floating-point number.
if len(coords) != 3:
print >> sys.stderr, ("Can't parse coords from "
"'{}'.".format(elem.text))
else:
lon, lat, alt = coords
if lon.count(".") > 1:
lon_parts = lon.split(".", 1)
lon = lon_parts[0] + lon_parts[1].replace(".",
"")
earthquake["latitude"] = float(lat)
earthquake["longitude"] = float(lon)
earthquake["altitude"] = float(alt)
except ValueError:
print >> sys.stderr, ("Can't parse coordinates from "
"'{}'.".format(elem.text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment