Instantly share code, notes, and snippets.

@psthomas /
Last active Jun 26, 2016

What would you like to do?
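Code for scraping temperature-band chart images from the web and counting their pixels with Pillow. A blog post describes the code (the links to the image source and to the post are missing from this copy).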
import requests
import os
import csv
from PIL import Image
from bs4 import BeautifulSoup
import time
import json
import shutil
# Top 50 US by population
# Add Honolulu, Anchorage for geographic diversity
#
# Each entry is [city name, city page url].  get_image_urls() appends the
# chart image url as a third element, which get_images() then downloads.
# NOTE(review): every url slot is empty and fewer than 50 cities are listed
# in this copy of the file -- restore the missing values before running.
cities = [
    ["New York", ""],
    ["Los Angeles", ""],
    ["San Diego", ""],
    ["San Jose", ""],
    ["San Francisco", ""],
    ["Long Beach", ""],
    ["San Antonio", ""],
    ["Fort Worth", ""],
    ["El Paso", ""],
    ["Colorado Springs", ""],
    ["Oklahoma City", ""],
    ["Las Vegas", ""],
    ["Kansas City", ""],
    ["Virginia Beach", ""],
    ["New Orleans", ""],
    ["Honolulu", ""],
    ["Anchorage", ""],
]
# Maps a reference (R, G, B) colour tuple to the name of its temperature band.
# process_colors() compares pixel colours against the keys channel by channel
# and sums pixel counts under the corresponding value; every value must also
# be a key of the discounts table inside process_colors().
# NOTE(review): the entries of this table are missing from this copy of the
# file.  It is left empty only so the statement is syntactically complete --
# restore the colour/band pairs before trusting any score.
temp_colors = {}
def get_image_urls(city_list):
    '''Find the temperature-band chart image url on every city page.

    city_list is a list of [name, page_url, ...] lists and is modified in
    place: when the chart image is found, "https:" + its src attribute is
    appended to that city's own list.  A page that cannot be fetched, or that
    has no matching image, is reported on stdout and its entry is left as is.
    '''
    # The attribute value is quoted because it contains "/", which is not a
    # valid unquoted CSS identifier.
    selector = 'img[basesrc$="/fraction_of_time_spent_in_various_temperature_bands"]'
    for city in city_list:
        r = requests.get(city[1])
        time.sleep(0.1)  # Good web citizen
        if r.status_code != 200:
            print("Failed to access city url: " + city[0])
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        # Uses CSS selectors.
        # NOTE(review): the callee in front of the selector was lost from this
        # copy of the file; soup.select() is assumed -- confirm.
        matches = soup.select(selector)
        # No str() around the attribute: str(None) is the truthy string
        # "None", which would hide a missing src.
        url = matches[0].attrs.get('src') if matches else None
        if url:
            city.append("https:" + url)
        else:
            print("Failed to parse image url: " + city[0])
def get_images(city_list):
    '''Download every city's chart image and checkpoint the city list.

    For each [name, page_url, image_url, ...] entry the image is streamed to
    <cwd>/images/<name with spaces replaced by underscores>.png.  Entries
    without an image url are reported and skipped.  Afterwards city_list is
    written to store.txt as JSON so a later run can skip the scraping step.
    '''
    path = os.path.join(os.getcwd(), 'images')
    if not os.path.isdir(path):
        os.makedirs(path)
    for city in city_list:
        if len(city) < 3:
            # The url lookup failed for this city, so there is nothing to fetch.
            print("No image url for " + city[0])
            continue
        r = requests.get(city[2], stream=True)
        filepath = os.path.join(path, city[0].replace(" ", "_") + ".png")
        time.sleep(0.1)  # Give that server a break, man
        if r.status_code == 200:
            with open(filepath, 'wb') as f:
                # Have urllib3 undo any gzip/deflate transfer encoding so the
                # bytes copied to disk are the actual image.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            print("Downloaded image for " + city[0])
        else:
            # status_code is an int and must be converted before concatenation.
            print("Failed to get " + city[0] + ": " + str(r.status_code))
    # Store intermediate cities list in store.txt
    # NOTE(review): the body of this block was lost from this copy of the
    # file; a JSON dump is assumed because the __main__ section reads
    # store.txt back with json.load -- confirm.
    with open('store.txt', 'w') as f:
        json.dump(city_list, f)
def process_images(city_list):
    '''Score every .png in <cwd>/images and attach the result to its city.

    Each image is converted to RGB, its colours are counted with Pillow and
    passed to process_colors().  The file name (extension removed,
    underscores turned back into spaces) is matched against the city names in
    city_list, and the result dict is appended to every matching entry.
    '''
    path = os.path.join(os.getcwd(), 'images')
    for image in os.listdir(path):
        if not image.endswith(".png"):
            continue
        fullpath = os.path.join(path, image)
        # NOTE(review): the left part of this call was lost from this copy of
        # the file; Image.open(...).convert("RGB") is assumed -- confirm.
        im_rgb = Image.open(fullpath).convert("RGB")  # Convert from RGBA
        width, height = im_rgb.size
        # getcolors() returns None once an image has more distinct colours
        # than its limit (default 256); the pixel count is a safe upper bound.
        colors = im_rgb.getcolors(width * height)
        result = process_colors(colors)
        city_name = os.path.splitext(image)[0].replace("_", " ")
        for city in city_list:
            if city[0] == city_name:
                # NOTE(review): the body of this branch was lost; appending is
                # assumed because csv_out() reads the statistics from city[3],
                # the slot after [name, page url, image url] -- confirm.
                city.append(result)
def process_colors(color_list, color_bands=None, min_count=100, tolerance=10):
    '''Sum pixel counts per temperature band and compute a comfort score.

    color_list: iterable of (pixel_count, color) pairs as produced by
        Pillow's Image.getcolors(); only the first three components of each
        color are compared.
    color_bands: dict mapping a reference (R, G, B) tuple to a band name.
        Defaults to the module-level temp_colors table.
    min_count: colors whose pixel count is not greater than this are ignored.
    tolerance: a color matches a reference when every one of the three
        channels differs from it by less than this amount.

    Returns a dict holding [pixel_count, fraction_of_matched_pixels] for every
    band that was seen, plus a 'score' key: the sum over those bands of
    fraction * discount.  When nothing matches the result is {'score': 0.0}.
    A color within tolerance of several reference colors is counted once per
    matching reference.

    Raises KeyError if a band name has no entry in the discounts table.
    '''
    if color_bands is None:
        color_bands = temp_colors

    #Discounts for different temperatures
    discounts = {'Cool':0.70,'Comfortable':1.0,'Warm':0.80,"Sweltering":-1.0,"Frigid":-1.0,"Freezing":-0.5,"Cold":0.2,"Hot":-0.2}

    counts = {}
    for count, color in color_list:
        #Reduce size of colors list to those with count > min_count
        if count <= min_count:
            continue
        for ref, band in color_bands.items():
            #Infinity-norm distance technique between colors
            distance = max(abs(ref[0] - color[0]),
                           abs(ref[1] - color[1]),
                           abs(ref[2] - color[2]))
            if distance < tolerance:
                counts[band] = counts.get(band, 0) + count

    total_pixels = sum(counts.values())
    out = {}
    score = 0.0
    for band, count in counts.items():
        # counts is only non-empty when total_pixels > 0, so this cannot
        # divide by zero.
        pct = float(count) / total_pixels
        out[band] = [count, pct]
        score += pct * discounts[band]
    out['score'] = score
    return out
def csv_out(city_list):
    '''Write the per-city band statistics to output.csv.

    One column is emitted per value of the module-level temp_colors table, in
    that table's iteration order.  For each city the statistics dict is read
    from city[3]; a city that has none is written with 0 in every band column
    and an empty score.

    NOTE(review): everything after the "band in city[3]" test was lost from
    this copy of the file.  The header row and the choice of the band's
    fraction (second element of its [count, fraction] pair) followed by the
    score are a reconstruction -- confirm against the intended output.
    '''
    # list() so the band names can be concatenated into the header row.
    temp_bands = list(temp_colors.values())
    # Binary mode is what the Python 2 csv module expects; under Python 3
    # this would have to be open('output.csv', 'w', newline='').
    with open('output.csv', 'wb') as csvfile:
        row_writer = csv.writer(csvfile, delimiter=',')
        row_writer.writerow(['City'] + temp_bands + ['score'])
        for city in city_list:
            # Cities whose image was never processed have no statistics slot.
            stats = city[3] if len(city) > 3 else {}
            values = [city[0]]
            for band in temp_bands:
                if band in stats:
                    values.append(stats[band][1])
                else:
                    values.append(0)
            values.append(stats.get('score', ''))
            row_writer.writerow(values)
if __name__ == "__main__":
    # Only get images if intermediate file doesn't exist; otherwise reuse the
    # city list checkpointed in store.txt.
    # NOTE(review): the calls in this section were lost from this copy of the
    # file (as extracted, store.txt was opened exactly when it did not
    # exist).  The scrape / load / process / export order below is a
    # reconstruction from the functions defined above -- confirm.
    if not os.path.isfile("./store.txt"):
        get_image_urls(cities)
        get_images(cities)
    else:
        with open('store.txt', 'rb') as f:
            cities = json.load(f)
    process_images(cities)
    csv_out(cities)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment