Last active
August 29, 2015 14:18
-
-
Save willycs40/10e5f3627e28fb3a21d8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import datetime | |
import time | |
import os | |
# function to build form data given date and time
def get_form_data(date, time):
    """Return the POST payload dict for one observation query.

    date -- date string formatted dd/mm/yyyy
    time -- hour string formatted HHMM (e.g. '0700')
    """
    payload = {}
    payload['Type'] = 'Observation'
    payload['PredictionSiteID'] = 'ALL'
    payload['ObservationSiteID'] = 'ALL'
    payload['Date'] = date
    payload['PredictionTime'] = time
    return payload
# function to generate dates
def daterange(start_date, end_date):
    """Yield every date from start_date up to, but excluding, end_date.

    Yields nothing when end_date <= start_date.
    """
    day_count = (end_date - start_date).days
    current = start_date
    for _ in range(day_count):
        yield current
        current = current + datetime.timedelta(days=1)
# Settings
# Date range to scrape: daterange() is exclusive of end_date, so this
# covers 1 Jan and 2 Jan 2014 only.
start_date = datetime.date(2014,1,1)
end_date = datetime.date(2014,1,3)
# Hours of the day to request (inclusive at both ends, 24h clock).
start_hour = 7
end_hour = 18
# Output path template; {0} is the yyyymmdd date, {1} is the HHMM hour.
save_location = r'C:\Projects\WeatherData\output_{0}_{1}.csv'
# Pause (seconds) between downloads to avoid hammering the server.
timer_delay = 0.1
# A list to hold errored dates
# Each entry is [reason, request_date, request_time] for later review.
missing_dates = []
# iterate through each date, posting one query per hour and saving any
# CSV the response links to; failures are recorded in missing_dates
for single_date in daterange(start_date, end_date):
    request_date = single_date.strftime("%d/%m/%Y")  # date string for the post request
    file_date = single_date.strftime("%Y%m%d")  # date string for the file name
    # iterate through each hour (end_hour + 1 as range is exclusive on the upper side)
    for hour in range(start_hour, end_hour + 1):
        request_time = str(hour).zfill(2) + '00'  # hour string, e.g. 7 -> '0700'
        # get the form data
        form_data = get_form_data(request_date, request_time)
        # send the request, and then search the response for https addresses;
        # catch only requests' own errors — a bare except would also swallow
        # KeyboardInterrupt and hide real bugs
        try:
            r = requests.post("http://datagovuk.cloudapp.net/query", data=form_data)
        except requests.exceptions.RequestException:
            missing_dates.append(['Post request failed.', request_date, request_time])
            continue
        if not r.ok:
            # was mislabeled 'Bad get response.' — this branch is the POST
            missing_dates.append(['Bad post response.', request_date, request_time])
            continue
        # search the response for the csv url
        urls = re.findall('https://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', r.text)
        if len(urls) == 0:
            print("No Urls Found for {0}:{1}".format(request_date, request_time))
            if "No matching records were found." in r.text:
                missing_dates.append(['No Url. Missing Data.', request_date, request_time])
            else:
                missing_dates.append(['No Url. Other.', request_date, request_time])
            continue
        # request the csv BEFORE opening the output file, so a failed
        # download does not leave an empty file behind on disk
        try:
            r = requests.get(urls[0], stream=True)
        except requests.exceptions.RequestException:
            missing_dates.append(['Get request failed.', request_date, request_time])
            continue
        if not r.ok:
            missing_dates.append(['Bad get response.', request_date, request_time])
            continue
        # stream the csv body to disk in 1 KiB chunks
        with open(save_location.format(file_date, request_time), 'wb') as handle:
            for block in r.iter_content(1024):
                if not block:
                    break
                handle.write(block)
        time.sleep(timer_delay)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment