Download data from Open Data Philly
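The first script below reads CKAN package metadata as newline-delimited JSON via fileinput (one package object per line, from stdin or a file argument) and mirrors each package's resources and metadata into a downloads/ directory, which must already exist since mkdir only creates the per-package folder. A likely invocation, sketched under those assumptions (the script and input file names are hypothetical):

    python download_odp_data.py packages.jsonl
    # or: cat packages.jsonl | python download_odp_data.py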
from requests import exceptions, get
import unicodedata
import fileinput
import json
import re
import datetime, time
from os import mkdir, getcwd, path
DAY_IN_SECONDS = 86400


def filename_safe_string(value):
    # Slugify a package or resource title so it is safe to use as a file name.
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    value = re.sub(r'[-\s]+', '-', value)
    return value


def write_to_log(string, filename="download.log"):
    with open(filename, "a+") as log:
        log.write(str(datetime.datetime.now()) + ": " + string + "\n")


def older_than_one_day(filepath):
    now = time.time()
    file_mod_time = path.getmtime(filepath)
    return (now - file_mod_time) > DAY_IN_SECONDS


def should_create_file(filepath, resource={}):
    # Only (re)create a file if it doesn't exist yet or is older than the
    # dataset's stated update frequency.
    if path.exists(filepath):
        return older_than_update_frequency(filepath, resource)
    else:
        return True


def older_than_update_frequency(filepath, resource):
    frequency_lookup = {
        "Update Frequency: Daily": DAY_IN_SECONDS,
        "Update Frequency: Weekly": (DAY_IN_SECONDS * 7),
        "Update Frequency: Monthly": (DAY_IN_SECONDS * 31),
        "Update frequency: Monthly": (DAY_IN_SECONDS * 31),
    }
    freq = frequency_lookup.get(resource.get('description', '').strip(), False)
    if freq:
        now = time.time()
        file_mod_time = path.getmtime(filepath)
        return (now - file_mod_time) > freq
    else:
        return False


def is_expected_filetype(resource, request):
    # Compare the format the metadata claims against the Content-Type the
    # server actually returned.
    format_lookup = {
        'csv': ('text/csv',),
        'shp': ('application/zip', 'application/octet-stream'),
        'kml': ('application/vnd.google-earth.kml+xml',),
        'geojson': ('application/vnd.geo+json', 'application/json'),
        'html': ('text/html',)
    }
    described_format = resource['format'].lower()
    format_from_header = request.headers.get('Content-Type', '')
    return any(fmt in format_from_header for fmt in format_lookup.get(described_format, ()))


for line in fileinput.input():
    package = json.loads(line)
    package_name = filename_safe_string(package['title'])
    package_dir = path.join(getcwd(), 'downloads', package_name)
    if not path.exists(package_dir):
        mkdir(package_dir)
        print("Creating directory for: " + package['title'])
    package_meta_file_path = path.join(package_dir, package_name + "-metadata.json")
    if should_create_file(package_meta_file_path):
        with open(package_meta_file_path, "w") as package_meta:
            package_meta.write(line)
        write_to_log("INFO : Creating metadata dump for package " + package_name)
    for resource in package['resources']:
        resource_name = filename_safe_string(resource['name']) + "." + resource['format'].strip()
        resource_file_path = path.join(package_dir, resource_name)
        if should_create_file(resource_file_path, resource):
            write_to_log("INFO : Downloading the dataset for " + resource_name)
            with open(resource_file_path, "wb") as resource_file:
                try:
                    request = get(resource['url'])
                    resource_file.write(request.content)
                    if not is_expected_filetype(resource, request):
                        write_to_log("DEBUG : Expected filetype was " + resource['format'] +
                                     " but downloaded file had header " +
                                     request.headers.get('Content-Type', "Undefined Content Type"),
                                     "headers.log")
                except exceptions.RequestException:
                    write_to_log("ERROR : Failed download for " + resource_name +
                                 " at url " + resource['url'])
        resource_metadata_path = path.join(package_dir, resource_name + "-metadata.json")
        if should_create_file(resource_metadata_path, resource):
            with open(resource_metadata_path, "w") as resource_metadata_file:
                resource_metadata_file.write(json.dumps(resource))
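A second, standalone script from the same gist follows. Rather than downloading the data, it requests each resource URL and appends a JSON line of the response headers to a timestamped file under a headers/ directory (assumed to exist before running), so the format declared in the metadata can be audited against the Content-Type the server actually returns.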
from requests import exceptions, get
import fileinput
import json
import re
import datetime, time
from os import mkdir, getcwd, path
DAY_IN_SECONDS = 86400
def write_to_log(string, filename="download.log"):
with open(filename, "a+") as log:
log.write(str(datetime.datetime.now()) + ": " + string + "\n")
def is_expected_filetype(resource, request):
format_lookup = {
'csv': ('text/csv',),
'shp': ('application/zip', 'application/octet-stream'),
'kml': ('application/vnd.google-earth.kml+xml',),
'geojson': ('application/vnd.geo+json', 'application/json'),
'html': ('text/html',)
}
described_format = resource['format'].lower()
format_from_header = request.headers.get('Content-Type', [])
return any(format in format_from_header for format in format_lookup.get(described_format,()))
def redirects(request):
return [[r.url, r.status] for r in request.history]


for line in fileinput.input():
    package = json.loads(line)
    headers_file = 'headers/odp_headers' + str(time.time())
    for resource in package['resources']:
        try:
            r = get(resource['url'])
            write_to_log(json.dumps({
                'headers': dict(r.headers),
                'content_type_match_format': is_expected_filetype(resource, r),
                #'redirects': redirects(r),
                'date': str(datetime.datetime.now()),
                'package_id': package['id'],
                'id': resource['id'],
                'name': resource['name'],
                'status': r.status_code,
                'initial_url': resource['url']}), headers_file)
        except exceptions.RequestException:
            with open("headers/fail.log", "a+") as log:
                log.write("ERROR : Failed download for " + resource['name'] +
                          " at url " + resource['url'] + "\n")