Create a gist now

Instantly share code, notes, and snippets.

@psthomas /main.py
Last active Jun 26, 2016

Embed
What would you like to do?
Code for scraping temperature images from weatherspark.com and counting pixels with Pillow. Blog post describing the code: http://pstblog.com/2016/06/25/integration-points
import requests
import os
import csv
from PIL import Image
from bs4 import BeautifulSoup
import time
import json
import shutil
# Largest US cities by population, taken from
# https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population
# with Honolulu and Anchorage included for geographic diversity.
#
# Each entry is a mutable [city name, weatherspark averages page url] pair;
# get_image_urls() later appends the chart image url to every entry and
# process_images() appends the per-city result dict.
cities = [
    ["New York", "https://weatherspark.com/averages/31081/New-York-United-States"],
    ["Los Angeles", "https://weatherspark.com/averages/29963/Los-Angeles-California-United-States"],
    ["San Diego", "https://weatherspark.com/averages/31552/San-Diego-California-United-States"],
    ["San Jose", "https://weatherspark.com/averages/31616/San-Jose-California-United-States"],
    ["San Francisco", "https://weatherspark.com/averages/31587/San-Francisco-California-United-States"],
    ["Sacramento", "https://weatherspark.com/averages/31640/Sacramento-California-United-States"],
    ["Long Beach", "https://weatherspark.com/averages/30723/Long-Beach-California-United-States"],
    ["Oakland", "https://weatherspark.com/averages/31096/Oakland-California-United-States"],
    ["Chicago", "https://weatherspark.com/averages/31158/Chicago-Illinois-United-States"],
    ["Houston", "https://weatherspark.com/averages/29697/Houston-Texas-United-States"],
    ["San Antonio", "https://weatherspark.com/averages/31554/San-Antonio-Texas-United-States"],
    ["Dallas", "https://weatherspark.com/averages/30025/Dallas-Texas-United-States"],
    ["Austin", "https://weatherspark.com/averages/29684/Austin-Texas-United-States"],
    ["Fort Worth", "https://weatherspark.com/averages/30313/Fort-Worth-Texas-United-States"],
    ["El Paso", "https://weatherspark.com/averages/30173/El-Paso-Texas-United-States"],
    ["Arlington", "https://weatherspark.com/averages/30376/Arlington-Texas-United-States"],
    ["Philadelphia", "https://weatherspark.com/averages/31254/Philadelphia-Pennsylvania-United-States"],
    ["Phoenix", "https://weatherspark.com/averages/31259/Phoenix-Arizona-United-States"],
    ["Tucson", "https://weatherspark.com/averages/31809/Tucson-Arizona-United-States"],
    ["Jacksonville", "https://weatherspark.com/averages/30631/Jacksonville-Florida-United-States"],
    ["Indianapolis", "https://weatherspark.com/averages/30595/Indianapolis-United-States"],
    ["Columbus", "https://weatherspark.com/averages/29928/Columbus-Ohio-United-States"],
    ["Charlotte", "https://weatherspark.com/averages/29925/Charlotte-North-Carolina-United-States"],
    ["Raleigh", "https://weatherspark.com/averages/31452/Raleigh-North-Carolina-United-States"],
    ["Seattle", "https://weatherspark.com/averages/31576/Seattle-Washington-United-States"],
    ["Denver", "https://weatherspark.com/averages/30040/Denver-Colorado-United-States"],
    ["Colorado Springs", "https://weatherspark.com/averages/29950/Colorado-Springs-United-States"],
    ["Detroit", "https://weatherspark.com/averages/30042/Detroit-Michigan-United-States"],
    ["Washington", "https://weatherspark.com/averages/30032/Arlington-County-District-of-Columbia-United-States"],
    ["Boston", "https://weatherspark.com/averages/29794/Boston-Massachusetts-United-States"],
    ["Memphis", "https://weatherspark.com/averages/30857/Memphis-Tennessee-United-States"],
    ["Nashville", "https://weatherspark.com/averages/29787/Nashville-Tennessee-United-States"],
    ["Portland", "https://weatherspark.com/averages/31237/Portland-Oregon-United-States"],
    ["Oklahoma City", "https://weatherspark.com/averages/31441/Oklahoma-City-United-States"],
    ["Tulsa", "https://weatherspark.com/averages/31807/Tulsa-Oklahoma-United-States"],
    ["Las Vegas", "https://weatherspark.com/averages/30697/Las-Vegas-Nevada-United-States"],
    ["Baltimore", "https://weatherspark.com/averages/30064/Baltimore-Maryland-United-States"],
    ["Louisville", "https://weatherspark.com/averages/31571/Louisville-Kentucky-United-States"],
    ["Milwaukee", "https://weatherspark.com/averages/30894/Milwaukee-Wisconsin-United-States"],
    ["Albuquerque", "https://weatherspark.com/averages/29561/Albuquerque-New-Mexico-United-States"],
    ["Kansas City", "https://weatherspark.com/averages/30837/Kansas-City-Missouri-United-States"],
    ["Atlanta", "https://weatherspark.com/averages/29669/Atlanta-Georgia-United-States"],
    ["Virginia Beach", "https://weatherspark.com/averages/31072/Virginia-Beach-United-States"],
    ["Omaha", "https://weatherspark.com/averages/29812/Omaha-Nebraska-United-States"],
    ["Miami", "https://weatherspark.com/averages/30883/Miami-Florida-United-States"],
    ["Minneapolis", "https://weatherspark.com/averages/30956/Minneapolis-Minnesota-United-States"],
    ["Wichita", "https://weatherspark.com/averages/30564/Wichita-Kansas-United-States"],
    ["New Orleans", "https://weatherspark.com/averages/30961/New-Orleans-Louisiana-United-States"],
    ["Honolulu", "https://weatherspark.com/averages/33125/Honolulu-Hawaii-United-States"],
    ["Anchorage", "https://weatherspark.com/averages/33020/Anchorage-Alaska-United-States"],
]
# Reference colour (R, G, B) of each temperature band in the downloaded
# chart images, mapped to the band's label. process_colors() compares image
# colours against these keys with a small per-channel tolerance.
temp_colors = {
    (186, 145, 234): "Frigid",
    (106, 178, 198): "Freezing",
    (75, 143, 78): "Cold",
    (138, 217, 89): "Cool",
    (250, 201, 61): "Comfortable",
    (207, 96, 66): "Warm",
    (190, 79, 76): "Hot",
    (158, 72, 64): "Sweltering",
}
def get_image_urls(city_list):
    '''Look up the temperature-band chart image url for every city.

    city_list is a list of [name, page_url] lists. It is modified in place:
    exactly one extra element is appended to every entry, either the image
    url or "" when the page could not be fetched or the chart image could
    not be located. Nothing is returned.
    '''
    # The attribute value contains "/" so it has to be quoted to be a valid
    # CSS attribute selector.
    selector = 'img[basesrc$="/fraction_of_time_spent_in_various_temperature_bands"]'
    for city in city_list:
        name, page_url = city[0], city[1]
        try:
            r = requests.get(page_url, timeout=30)
        except requests.RequestException as err:
            print("Failed to access city url: " + name + " (" + str(err) + ")")
            city.append("")
            continue
        time.sleep(0.1)  # Good web citizen
        if r.status_code != 200:
            print("Failed to access city url: " + name)
            city.append("")
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        matches = soup.select(selector)
        # A missing <img> or a missing src attribute is a parse failure, not
        # a crash and not the literal string "None".
        src = matches[0].attrs.get('src') if matches else None
        if not src:
            print("Failed to parse image url: " + name)
            city.append("")
            continue
        src = str(src)
        if src.startswith("//"):
            # Protocol-relative link: pin it to https.
            src = "https:" + src
        city.append(src)
def get_images(city_list):
    '''Download the chart image of every city into ./images.

    city_list is a list of [name, page_url, image_url] lists as produced by
    get_image_urls(). Each image is saved as ./images/<name>.png with spaces
    in the name replaced by underscores. Cities whose image url is missing
    or empty are skipped with a message, as are failed downloads.

    After the downloads, city_list is written as JSON to store.txt in the
    current directory so later runs can skip the scraping step.
    '''
    image_dir = os.path.join(os.getcwd(), 'images')
    if not os.path.isdir(image_dir):
        os.makedirs(image_dir)
    for city in city_list:
        name = city[0]
        image_url = city[2] if len(city) > 2 else ""
        if not image_url:
            # get_image_urls() stores "" when it could not find the chart.
            print("No image url for " + name)
            continue
        filepath = os.path.join(image_dir, name.replace(" ", "_") + ".png")
        try:
            r = requests.get(image_url, stream=True, timeout=30)
        except requests.RequestException as err:
            print("Failed to get " + name + ": " + str(err))
            continue
        time.sleep(0.1)  # Give that server a break, man
        try:
            if r.status_code == 200:
                with open(filepath, 'wb') as f:
                    # Let urllib3 undo gzip/deflate while streaming to disk.
                    r.raw.decode_content = True
                    shutil.copyfileobj(r.raw, f)
                print("Downloaded image for " + name)
            else:
                print("Failed to get " + name + ": " + str(r.status_code))
        finally:
            r.close()
    # Store intermediate cities list in store.txt (text mode: json writes str)
    with open('store.txt', 'w') as f:
        json.dump(city_list, f)
def process_images(city_list):
    '''Count and score the pixels of every .png file in ./images.

    For each image the colour histogram is passed to process_colors() and
    the resulting dict is appended, in place, to every city_list entry whose
    name matches the file name (underscores in the file name stand for
    spaces in the city name). Files that match no city are ignored.
    '''
    path = os.path.join(os.getcwd(), 'images')
    for image in os.listdir(path):
        if not image.endswith(".png"):
            continue
        # splitext only strips the extension, so names containing a "."
        # are not truncated at the first dot.
        city_name = os.path.splitext(image)[0].replace("_", " ")
        with Image.open(os.path.join(path, image)) as im:
            im_rgb = im.convert("RGB")  # Convert from RGBA
        width, height = im_rgb.size
        # getcolors() gives up (returns None) above maxcolors, whose default
        # is 256; the pixel count is an upper bound on distinct colours.
        colors = im_rgb.getcolors(width * height)
        result = process_colors(colors)
        for city in city_list:
            if city[0] == city_name:
                city.append(result)
def process_colors(color_list, palette=None, discounts=None, tolerance=10, min_count=100):
    '''Sum pixel counts per temperature band and compute a weighted score.

    color_list: iterable of (count, pixel) pairs as returned by Pillow's
        Image.getcolors(); only the first three channels of pixel are used,
        so RGB and RGBA tuples both work. None is treated as empty.
    palette: dict mapping a reference (R, G, B) tuple to a band label;
        defaults to the module-level temp_colors.
    discounts: dict mapping a band label to its weight in the score;
        defaults to the weights below.
    tolerance: a colour matches a reference colour when the largest
        per-channel difference (infinity norm) is strictly below this.
    min_count: colours with count <= min_count are ignored.

    Returns a dict with one [pixel_count, fraction] entry per band that was
    found plus a 'score' entry holding sum(fraction * discount). The
    fractions are relative to the matched pixels only. Raises KeyError if a
    matched band has no entry in discounts.
    '''
    if palette is None:
        palette = temp_colors
    if discounts is None:
        # Discounts for different temperatures
        discounts = {'Cool': 0.70, 'Comfortable': 1.0, 'Warm': 0.80,
                     'Sweltering': -1.0, 'Frigid': -1.0, 'Freezing': -0.5,
                     'Cold': 0.2, 'Hot': -0.2}
    counts = {}
    for count, pixel in color_list or []:
        if count <= min_count:
            continue
        # Reference colours can be closer together than twice the tolerance,
        # so a colour may be near two bands; credit only the nearest one
        # (ties broken by label) instead of counting the pixels twice.
        best = None
        for ref, band in palette.items():
            dist = max(abs(ref[0] - pixel[0]),
                       abs(ref[1] - pixel[1]),
                       abs(ref[2] - pixel[2]))
            if dist < tolerance and (best is None or (dist, band) < best):
                best = (dist, band)
        if best is not None:
            counts[best[1]] = counts.get(best[1], 0) + count
    total_pixels = sum(counts.values())
    out = {}
    score = 0.0
    # counts is empty whenever total_pixels is 0, so no division by zero.
    for band, count in counts.items():
        pct = float(count) / total_pixels
        out[band] = [count, pct]
        score += pct * discounts[band]
    out['score'] = score
    return out
def csv_out(city_list):
    '''Write the per-city band percentages and score to output.csv.

    city_list is a list of [name, page_url, image_url, result] lists where
    result is the dict returned by process_colors(). One row is written per
    city: the name, the percentage (one decimal) of each temperature band
    in header order (0.0 when the band is absent) and the score times 100.
    Cities without a result dict are skipped with a message.
    '''
    import sys

    # Single source of truth for the column order, so the header and the
    # values cannot drift apart (dict iteration order is not reliable here).
    bands = ['Sweltering', 'Hot', 'Cool', 'Freezing', 'Frigid',
             'Comfortable', 'Warm', 'Cold']
    header = ['city_name'] + [band.lower() + '_pct' for band in bands] + ['score']
    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open('output.csv', 'wb')
    else:
        csvfile = open('output.csv', 'w', newline='')
    with csvfile:
        row_writer = csv.writer(csvfile, delimiter=',')
        row_writer.writerow(header)
        for city in city_list:
            if len(city) < 4 or not isinstance(city[3], dict):
                print("No processed image for " + city[0])
                continue
            result = city[3]
            values = [city[0]]
            for band in bands:
                if band in result:
                    values.append(round(result[band][1] * 100, 1))
                else:
                    values.append(0.0)
            values.append(round(result['score'] * 100, 1))
            row_writer.writerow(values)
if __name__ == "__main__":
    # store.txt is the cache written by get_images(); scrape the site and
    # download the images only when no earlier run has left one behind.
    cache_missing = not os.path.isfile("./store.txt")
    if cache_missing:
        get_image_urls(cities)
        get_images(cities)
    # Always continue from the cached list, whether it is fresh or old.
    with open('store.txt', 'rb') as cache:
        cities = json.load(cache)
    process_images(cities)
    csv_out(cities)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment