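# Scrape average weather figures (precipitation, days with precipitation,
# rainy days, days below 0C, days above 32C) from weatherbase.com for every
# city listed in bike.csv, checkpointing the results to df_weather.csv.
# Note: this targets Python 2 (cookielib and mechanize are Py2-era libraries).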
from bs4 import BeautifulSoup
import requests
import mechanize
import cookielib
import pandas as pd
import numpy as np
from sys import stdout
import time
start_time = time.time()
# Dictionary of the US states and their abbreviations
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
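# e.g. states['CA'] yields 'California'; used below to expand scraped state
# abbreviations into the full names that weatherbase's result links display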
# Browser
br = mechanize.Browser()
# Allow cookies
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
# step counter for the progress display
step = 1
# initialization of the variables we try to retrieve
precipitation = -1
avg_days_precip = -1
avg_rainy_days = -1
avg_days_below_0 = -1
avg_days_above_32 = -1
cty = -1
precip = -1
cty2 = -1
abbr = -1
# variable to check whether the query succeeded directly or required a further search
status = ""
# names of the weather features we wish to retrieve
varNames = ["Average Precipitation", "Average Number of Days With Precipitation", \
"Average Number of Days Below 32F/0C", "Average Number of Days Above 90F/32C", \
"Average Number of Rainy Days"]
# initialization of the weather dataframe
df_weather = pd.DataFrame({'cty': [], 'cty2': [], 'abb': [], 'avg_precipitation': [],
                           'avg_precipitation_mm': [], 'avg_days_precip': [],
                           'avg_rainy_days': [], 'avg_days_below_0': [],
                           'avg_days_above_32': []})
# read the dataframe with motorcycles' information
bike = pd.read_csv("bike.csv")
# city information from the bike dataframe: cty = "City, State", cty2 = "City",
# abb = two-letter state abbreviation
city_info = pd.DataFrame(index = bike.city, columns = ['cty', 'cty2', 'abb'])
city_info['cty2'] = city_info.index
for i in range(0, len(bike.city)):
    # column 12 of bike holds the raw state abbreviation; strip any whitespace
    if type(bike.iloc[i, 12]) == str:
        abb = ''.join(bike.iloc[i, 12].split())
    else:
        abb = np.nan
    # keep only abbreviations that appear in the states dictionary
    if abb in states.keys():
        city_info['abb'][i] = abb
    else:
        city_info['abb'][i] = "NaN"
# for every city that appears in bike, retrieve the weather information
for city in np.unique(bike.city)[1:]:
    name = [city, city_info[city_info.index == city]['abb'][0]]
    if name[0] != "NaN" and name[1] != "NaN" and name[1] in states.keys():
        name2 = " ".join(map(lambda x: x.capitalize(), name[0].split(" "))) \
                + ", " + states[name[1]]
        # Browser options (identical on every pass; they could be hoisted
        # above the loop since they never change)
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        # Follows refresh 0 but doesn't hang on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Emulate a regular browser; some sites serve different markup to
        # unidentified user agents
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; '
                          'rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        # open the search page and submit the current city as the query
        url = 'http://www.weatherbase.com/search/search.php3'
        br.open(url)
        br.select_form('searchform')
        form = str(city.upper())
        br.form['query'] = form
        req = br.submit()
        cityR = BeautifulSoup(req.read(), "lxml")
        # the query yields a path linked to the weather figures we seek
        web_path = "http://www.weatherbase.com"
        # Google-Analytics-style onclick string for the result links
        # (assembled here but not used below)
        path_city1 = "recordOutboundLink(this, 'Search', '"
        path_city2 = name2
        path_city3 = ", United States of America'); return false;"
        path_city = path_city1 + path_city2 + path_city3
        # the results come back as an HTML table; sift through its links to
        # recover the annual figures for the weather features defined above
        # guard: loc would otherwise be undefined if no href matches below
        loc = ""
        for tag in cityR.find_all("a", class_ = "redglow"):
            # direct hit: the link points straight at a city weather page
            if "weather/weather" in tag.get("href"):
                loc = tag.get("href").split("cityname=", 1)[1].split("-")
                loc = loc[0] + ", " + loc[1]
                status = "query"
            # or the query was ambiguous and only returned further search links
            elif "search/search" in tag.get("href"):
                loc = tag.get("href").split("geo=", 1)[1]
                status = "search"
            # find the correct location among the possible results of the query
            if loc == name2:
                if status == "query":
                    city_path = tag.get("href")
                elif status == "search":
                    # follow the intermediate search page to the city page
                    search_path = tag.get("href")
                    city_path = BeautifulSoup(requests.get(web_path + search_path).text,
                                              "lxml").find("a", class_ = "redglow") \
                                                     .get("href").split()[0]
                soup = BeautifulSoup(requests.get(web_path + city_path).text, "lxml") \
                    .find("div", class_ = "p402_premium") \
                    .find_all("table", class_ = "weather-table")
                for sp in soup:
                    cat = sp.find("div", {"id": "h4font"}).string
                    if cat in varNames:
                        # annual figure: first "data" cell of the table that
                        # follows the category header
                        val = sp.find_next_sibling("table").find("td", class_ = "data").string
                        if cat == "Average Precipitation":
                            precipitation = val
                        if cat == "Average Number of Days With Precipitation":
                            avg_days_precip = val
                        if cat == "Average Number of Days Below 32F/0C":
                            avg_days_below_0 = val
                        if cat == "Average Number of Days Above 90F/32C":
                            avg_days_above_32 = val
                        if cat == "Average Number of Rainy Days":
                            avg_rainy_days = val
        # assemble the figures scraped for this city and append them;
        # precipitation is reported in inches, and 1 in = 25.4 mm
        temp = pd.DataFrame({'cty': name2, 'cty2': name[0], 'abb': name[1],
                             'avg_precipitation': precipitation,
                             'avg_precipitation_mm': float(precipitation)*25.4,
                             'avg_days_precip': avg_days_precip,
                             'avg_rainy_days': avg_rainy_days,
                             'avg_days_below_0': avg_days_below_0,
                             'avg_days_above_32': avg_days_above_32}, index = [0])
        df_weather = pd.concat([df_weather, temp], axis=0)
        df_weather.to_csv("df_weather.csv")
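        # rewriting the CSV on every pass acts as a checkpoint: if the scrape
        # dies midway, everything gathered so far is already on disk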
        # re-initialize for the next step
        precipitation = -1
        avg_days_precip = -1
        avg_rainy_days = -1
        avg_days_below_0 = -1
        avg_days_above_32 = -1
        cty = -1
        precip = -1
        cty2 = -1
        abbr = -1
    # progress counter, rewritten in place on one console line ("\r" must
    # lead the string so each step overwrites the previous one)
    stdout.write("\rStep = %d" % step)
    stdout.flush()
    step += 1
# time elapsed since scraping started
print("--- %s seconds ---" % (time.time() - start_time))