# Monthly climate scraper for tutiempo.net
# (saved from Gist Keats/0ba9be4e514b2a90e59f)
import csv
import itertools
import os
import time

import bs4
import requests
# Time span to scrape: 2011 through 2014, every month.
years = ["2011", "2012", "2013", "2014"]
months = ["%02d" % m for m in range(1, 13)]

# "MM-YYYY" page keys, in chronological order (year-major, then month).
perms = [
    "%s-%s" % (month, year)
    for year, month in itertools.product(years, months)
]

# CSV header, written once per city file.
columns = [
    "month",
    "avg_temp",
    "min_temp",
    "max_temp",
    "humidity",
    "rainfall",
    "raindays",  # number of days with rainfall > 5mm
    "snowdays",
]

base_url = "http://en.tutiempo.net/climate"

# Per-city page templates, filled in with (base_url, "MM-YYYY").
urls = {
    "nice": "%s/%s/ws-76900.html",
    "okinawa": "%s/%s/ws-479360.html",
    "london": "%s/%s/ws-37720.html",
    "montreal": "%s/%s/ws-716270.html",
}
def fetch_city(city, from_cache=True):
    """
    Fetch one city's monthly climate pages and append parsed rows to its CSV.

    city: a key of `urls`.
    from_cache: when True (default), parse the previously saved HTML under
        data/<city>/ instead of hitting the network.
    """
    city_url = urls[city]
    # Make sure the cache directory exists before the first write.
    os.makedirs("data/%s" % city, exist_ok=True)
    for perm in perms:
        print("Getting %s for %s" % (perm, city))
        path = "data/%s/%s.html" % (city, perm)
        if from_cache:
            # Pages are cached as UTF-8 bytes below, so read them back as
            # UTF-8 explicitly — the platform default encoding may differ.
            with open(path, "r", encoding="utf-8") as saved:
                row = parse_html(perm, saved.read())
            save_csv(city, row)
        else:
            url = city_url % (base_url, perm)
            response = requests.get(url)
            # Cache the raw page so later runs can use from_cache=True.
            with open(path, "wb") as save:
                save.write(response.text.encode("utf-8"))
            row = parse_html(perm, response.text)
            save_csv(city, row)
            time.sleep(2)  # be polite to the server between requests
def save_csv(city, row):
    """
    Append one summary row to <city>.csv, writing the header first when the
    file does not exist yet.

    Keeping a CSV per city means an interrupted run never has to refetch
    everything.
    """
    path = "%s.csv" % city
    write_header = not os.path.isfile(path)
    # newline="" is required by the csv module so it controls line endings
    # itself (otherwise Windows gets a blank line between rows). Append
    # mode creates the file when it is missing, so no "w"/"a" branch.
    with open(path, "a", newline="") as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        if write_header:
            writer.writerow(columns)
        writer.writerow(row)
def parse_html(month, html):
    """
    Aggregate one month of daily climate rows into a single summary row.

    The site only publishes daily values, so averages and min/max are
    computed here. A "-" cell means the measurement is missing for that
    day and is excluded from its average's numerator AND denominator.

    Returns a list matching `columns`:
    [month, avg_temp, min_temp, max_temp, humidity, rainfall,
     raindays, snowdays]
    """
    # Explicit parser keeps behaviour identical across environments
    # (the no-argument form picks whichever parser happens to be installed).
    soup = bs4.BeautifulSoup(html, "html.parser")
    rows = soup.select(".medias tr")
    # First row is the table header; the last two are monthly summary rows.
    rows = rows[1:-2]

    temp_days = 0
    humidity_days = 0
    rainfall_days = 0
    snow_days = 0
    raindays = 0
    temp_sum = 0.0
    min_temp = None
    max_temp = None
    humidity_sum = 0.0
    rainfall_sum = 0.0

    for row in rows:
        tds = row.select("td")

        temp = tds[1].text
        if temp == "-":
            # Skip days without a temperature measurement entirely.
            continue
        temp = float(temp)
        temp_days += 1
        temp_sum += temp
        min_temp = temp if min_temp is None or temp < min_temp else min_temp
        max_temp = temp if max_temp is None or temp > max_temp else max_temp

        humidity = tds[5].text
        if humidity != "-":
            humidity_days += 1
            humidity_sum += float(humidity)

        # Snow is checked before the rainfall guard: the original code
        # `continue`d on a missing rainfall value and silently dropped
        # that day's snow flag.
        if tds[12].text == "o":
            snow_days += 1

        rainfall = tds[6].text
        if rainfall != "-":
            rainfall = float(rainfall)
            rainfall_days += 1
            rainfall_sum += rainfall
            if rainfall > 5.0:
                raindays += 1

    return [
        month,
        # Divide each sum by the number of days actually measured — the
        # original divided by the total row count, biasing every average
        # low whenever any measurement was missing.
        round(temp_sum / temp_days, 1),
        min_temp,
        max_temp,
        round(humidity_sum / humidity_days, 1),
        round(rainfall_sum / rainfall_days, 1),
        raindays,
        snow_days,
    ]
# Process every configured city (reads cached pages by default).
for city in urls:
    fetch_city(city)