Download data from Open Data Philly
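The first script below reads CKAN package metadata as newline-delimited JSON via fileinput (one package object per line, from stdin or a file argument) and mirrors each package's resources and metadata into a downloads/ directory, which must already exist since mkdir only creates the per-package folder. A likely invocation, sketched under those assumptions (the script and input file names are hypothetical):

    python download_odp_data.py packages.jsonl
    # or: cat packages.jsonl | python download_odp_data.py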
from requests import exceptions, get
import unicodedata
import fileinput
import json
import re
import datetime, time
from os import mkdir, getcwd, path
DAY_IN_SECONDS = 86400


def filename_safe_string(value):
    # Slugify a package or resource title so it is safe to use as a file name.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[-\s]+', '-', value)
    return value


def write_to_log(string, filename="download.log"):
    with open(filename, "a+") as log:
        log.write(str(datetime.datetime.now()) + ": " + string + "\n")


def older_than_one_day(filepath):
    now = time.time()
    file_mod_time = path.getmtime(filepath)
    return (now - file_mod_time) > DAY_IN_SECONDS


def should_create_file(filepath, resource={}):
    # Only (re)create a file if it doesn't exist yet or is older than the
    # dataset's stated update frequency.
    if path.exists(filepath):
        return older_than_update_frequency(filepath, resource)
    else:
        return True


def older_than_update_frequency(filepath, resource):
    frequency_lookup = {
        "Update Frequency: Daily": DAY_IN_SECONDS,
        "Update Frequency: Weekly": (DAY_IN_SECONDS * 7),
        "Update Frequency: Monthly": (DAY_IN_SECONDS * 31),
        "Update frequency: Monthly": (DAY_IN_SECONDS * 31),
    }
    freq = frequency_lookup.get(resource.get('description', '').strip(), False)
    if freq:
        now = time.time()
        file_mod_time = path.getmtime(filepath)
        return (now - file_mod_time) > freq
    else:
        return False


def is_expected_filetype(resource, request):
    # Compare the format the metadata claims against the Content-Type the
    # server actually returned.
    format_lookup = {
        'csv': ('text/csv',),
        'shp': ('application/zip', 'application/octet-stream'),
        'kml': ('application/vnd.google-earth.kml+xml',),
        'geojson': ('application/vnd.geo+json', 'application/json'),
        'html': ('text/html',)
    }
    described_format = resource['format'].lower()
    format_from_header = request.headers.get('Content-Type', '')
    return any(fmt in format_from_header for fmt in format_lookup.get(described_format, ()))


for line in fileinput.input():
    package = json.loads(line)
    package_name = filename_safe_string(package['title'])
    package_dir = path.join(getcwd(), 'downloads', package_name)
    if not path.exists(package_dir):
        mkdir(package_dir)
        print("Creating directory for: " + package['title'])
    package_meta_file_path = path.join(package_dir, package_name + "-metadata.json")
    if should_create_file(package_meta_file_path):
        with open(package_meta_file_path, "w") as package_meta:
            package_meta.write(line)
        write_to_log("INFO : Creating metadata dump for package " + package_name)
    for resource in package['resources']:
        resource_name = filename_safe_string(resource['name']) + "." + resource['format'].strip()
        resource_file_path = path.join(package_dir, resource_name)
        if should_create_file(resource_file_path, resource):
            write_to_log("INFO : Downloading the dataset for " + resource_name)
            with open(resource_file_path, "wb") as resource_file:
                try:
                    request = get(resource['url'])
                    resource_file.write(request.content)
                    if not is_expected_filetype(resource, request):
                        write_to_log("DEBUG : Expected filetype was " + resource['format'] +
                                     " but downloaded file had header " +
                                     request.headers.get('Content-Type', "Undefined Content Type"),
                                     "headers.log")
                except exceptions.RequestException:
                    write_to_log("ERROR : Failed download for " + resource_name +
                                 " at url " + resource['url'])
        resource_metadata_path = path.join(package_dir, resource_name + "-metadata.json")
        if should_create_file(resource_metadata_path, resource):
            with open(resource_metadata_path, "w") as resource_metadata_file:
                resource_metadata_file.write(json.dumps(resource))
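A second, standalone script from the same gist follows. Rather than downloading the data, it requests each resource URL and appends a JSON line of the response headers to a timestamped file under a headers/ directory (assumed to exist before running), so the format declared in the metadata can be audited against the Content-Type the server actually returns.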
from requests import exceptions, get
import fileinput
import json
import re
import datetime, time
from os import mkdir, getcwd, path
DAY_IN_SECONDS = 86400
def write_to_log(string, filename="download.log"):
with open(filename, "a+") as log:
log.write(str(datetime.datetime.now()) + ": " + string + "\n")
def is_expected_filetype(resource, request):
format_lookup = {
'csv': ('text/csv',),
'shp': ('application/zip', 'application/octet-stream'),
'kml': ('application/vnd.google-earth.kml+xml',),
'geojson': ('application/vnd.geo+json', 'application/json'),
'html': ('text/html',)
}
described_format = resource['format'].lower()
format_from_header = request.headers.get('Content-Type', [])
return any(format in format_from_header for format in format_lookup.get(described_format,()))
def redirects(request):
return [[r.url, r.status] for r in request.history]


for line in fileinput.input():
    package = json.loads(line)
    headers_file = 'headers/odp_headers' + str(time.time())
    for resource in package['resources']:
        try:
            r = get(resource['url'])
            write_to_log(json.dumps({
                'headers': dict(r.headers),
                'content_type_match_format': is_expected_filetype(resource, r),
                #'redirects': redirects(r),
                'date': str(datetime.datetime.now()),
                'package_id': package['id'],
                'id': resource['id'],
                'name': resource['name'],
                'status': r.status_code,
                'initial_url': resource['url']}), headers_file)
        except exceptions.RequestException:
            with open("headers/fail.log", "a+") as log:
                log.write("ERROR : Failed download for " + resource['name'] +
                          " at url " + resource['url'] + "\n")