# Keats/scraper.py (Gist, created May 7, 2015)
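"""
Scrape monthly climate stats (2011-2014) for a few cities from
en.tutiempo.net and write one CSV per city.
"""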
import csv
import itertools
import os
import time
import requests
import bs4
years = ["2011", "2012", "2013", "2014"]
months = [
    "01", "02", "03", "04", "05", "06",
    "07", "08", "09", "10", "11", "12",
]
# Gives data in chronological order
perms = [
    "-".join(reversed(combo))
    for combo in itertools.product(years, months)
]
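# perms looks like ["01-2011", "02-2011", ..., "12-2014"]: the "MM-YYYY"
# path segment the site uses, iterating months within each year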
columns = [
    "month",
    "avg_temp",
    "min_temp",
    "max_temp",
    "humidity",
    "rainfall",
    "raindays",  # number of days with rainfall > 5 mm
    "snowdays",
]
base_url = "http://en.tutiempo.net/climate"
urls = {
    "nice": "%s/%s/ws-76900.html",
    "okinawa": "%s/%s/ws-479360.html",
    "london": "%s/%s/ws-37720.html",
    "montreal": "%s/%s/ws-716270.html",
}
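# Each template is filled as template % (base_url, perm), giving e.g.
# http://en.tutiempo.net/climate/01-2011/ws-76900.html for Nice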

def fetch_city(city, from_cache=True):
    city_url = urls[city]
    # Make sure the cache directory exists before writing into it
    try:
        os.makedirs("data/%s" % city)
    except OSError:
        pass  # directory already exists
    for perm in perms:
        print("Getting %s for %s" % (perm, city))
        if not from_cache:
            url = city_url % (base_url, perm)
            response = requests.get(url)
            # Cache the raw HTML so we never have to fetch it twice
            with open("data/%s/%s.html" % (city, perm), "wb") as save:
                save.write(response.text.encode("utf-8"))
            row = parse_html(perm, response.text)
            save_csv(city, row)
            # Be polite to the server between requests
            time.sleep(2)
        else:
            with open("data/%s/%s.html" % (city, perm), "r") as saved:
                row = parse_html(perm, saved.read())
            save_csv(city, row)
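
# Usage note: on a first run nothing is cached yet, so call e.g.
# fetch_city("nice", from_cache=False); later runs can re-parse the saved
# HTML with the default from_cache=True.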

def save_csv(city, row):
    """
    Append each month's row to the city's CSV so partial runs
    don't force us to refetch everything.
    """
    path = "%s.csv" % city
    append_only = os.path.isfile(path)
    mode = "a" if append_only else "w"
    with open(path, mode) as csvfile:
        writer = csv.writer(csvfile, delimiter=",")
        if not append_only:
            # Write the header only when creating the file
            writer.writerow(columns)
        writer.writerow(row)

def parse_html(month, html):
    """
    The page lists daily data but we only want monthly figures,
    so we compute the averages/min/max over the month ourselves.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")
    # .medias is the class of the daily-data table on the page
    rows = soup.select(".medias tr")
    # Drop the header row and the two summary rows at the bottom
    rows = rows[1:-2]
    number_days = len(rows)
    number_days_humidity_measured = number_days
    number_days_rainfall_measured = number_days
    snow_days = 0
    raindays = 0
    avg_temp = None
    min_temp = None
    max_temp = None
    avg_humidity = None
    avg_rainfall = None
    for row in rows:
        tds = row.select("td")
        temp = tds[1].text
        if temp == "-":
            # Skip days without a temperature measurement, and keep the
            # day counters consistent: this day contributes to nothing
            number_days -= 1
            number_days_humidity_measured -= 1
            number_days_rainfall_measured -= 1
            continue
        temp = float(temp)
        avg_temp = temp if avg_temp is None else avg_temp + temp
        min_temp = temp if min_temp is None else min(min_temp, temp)
        max_temp = temp if max_temp is None else max(max_temp, temp)
        humidity = tds[5].text
        if humidity == "-":
            number_days_humidity_measured -= 1
        else:
            humidity = float(humidity)
            avg_humidity = humidity if avg_humidity is None else avg_humidity + humidity
        rainfall = tds[6].text
        if rainfall == "-":
            number_days_rainfall_measured -= 1
        else:
            rainfall = float(rainfall)
            avg_rainfall = rainfall if avg_rainfall is None else avg_rainfall + rainfall
            if rainfall > 5.0:
                raindays += 1
        # The snow column contains "o" on days it snowed; check it even
        # on days with no rainfall measurement
        if tds[12].text == "o":
            snow_days += 1
    return [
        month,
        round(avg_temp / number_days, 1),
        min_temp,
        max_temp,
        round(avg_humidity / number_days_humidity_measured, 1),
        round(avg_rainfall / number_days_rainfall_measured, 1),
        raindays,
        snow_days,
    ]
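
# The returned list lines up with `columns`:
# [month, avg_temp, min_temp, max_temp, humidity, rainfall, raindays, snowdays]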

if __name__ == "__main__":
    for city in urls:
        fetch_city(city)