from bs4 import BeautifulSoup
import requests
import mechanize
import cookielib
import pandas as pd
import numpy as np
from sys import stdout
import time

start_time = time.time()
# Dictionary of the US states and their abbreviations
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
# Browser
br = mechanize.Browser()
# Allow cookies
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
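# (the jar is attached once here, so any cookies weatherbase sets are reused
# across all the per-city requests made below)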
# step counter for the progress display
step = 1

# initialization of the variables we try to retrieve
precipitation = -1
avg_days_precip = -1
avg_rainy_days = -1
avg_days_below_0 = -1
avg_days_above_32 = -1
cty = -1
precip = -1
cty2 = -1
abbr = -1

# variable to check whether the query was successful or required further search
status = ""
# names of the weather features we wish to retrieve
varNames = ["Average Precipitation", "Average Number of Days With Precipitation",
            "Average Number of Days Below 32F/0C", "Average Number of Days Above 90F/32C",
            "Average Number of Rainy Days"]

# initialization of the weather dataframe
df_weather = pd.DataFrame({'cty': [], 'cty2': [], 'abb': [], 'avg_precipitation': [],
                           'avg_precipitation_mm': [], 'avg_days_precip': [],
                           'avg_rainy_days': [], 'avg_days_below_0': [],
                           'avg_days_above_32': []})
# read the dataframe with motorcycles' information
bike = pd.read_csv("bike.csv")
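# (the loop below assumes column 12 of bike.csv, i.e. bike.iloc[:, 12],
# holds the state abbreviation)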
# cities information from the dataframe bike. cty = "City, State", and cty2 = "City"
city_info = pd.DataFrame(index=bike.city, columns=['cty', 'cty2', 'abb'])
city_info['cty2'] = city_info.index
for i in range(len(bike.city)):
    if type(bike.iloc[i, 12]) == str:
        abb = ''.join(bike.iloc[i, 12].split())
    else:
        abb = np.nan
    if abb in states.keys():
        city_info.iat[i, 2] = abb  # column 2 is 'abb'
    else:
        city_info.iat[i, 2] = "NaN"
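# (the literal string "NaN", rather than np.nan, is used as the sentinel so the
# equality checks at the top of the loop below behave predictably)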
# for every city that appears in bike we retrieve the weather information
for city in np.unique(bike.city)[1:]:
    name = [city, city_info.loc[city_info.index == city, 'abb'].iloc[0]]
    if name[0] != "NaN" and name[1] != "NaN" and name[1] in states.keys():
        name2 = " ".join(map(lambda x: x.capitalize(), name[0].split(" "))) \
                + ", " + states[name[1]]
        # Browser options
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        # Follow refresh 0 but do not hang on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Specify the browser to emulate
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                          'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        # open the search page and submit the query for this city
        url = 'http://www.weatherbase.com/search/search.php3'
        br.open(url)
        br.select_form('searchform')
        br.form['query'] = str(city.upper())
        req = br.submit()
        cityR = BeautifulSoup(req.read(), "lxml")
        # the query yields a path linked to the weather figures we seek
        web_path = "http://www.weatherbase.com"
        path_city1 = "recordOutboundLink(this, 'Search', '"
        path_city2 = name2
        path_city3 = ", United States of America'); return false;"
        path_city = path_city1 + path_city2 + path_city3
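        # (path_city appears to reproduce the onclick attribute of the result
        # links; it is assembled here for reference, but the matching below
        # works on the parsed href instead)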
        # the results are recorded in an HTML table; we sift through the table
        # to recover the annual figures corresponding to the weather feature
        # names we defined
        for tag in cityR.find_all("a", class_="redglow"):
            # check that we are on the right URL
            if "weather/weather" in tag.get("href"):
                loc = tag.get("href").split("cityname=", 1)[1].split("-")
                loc = loc[0] + ", " + loc[1]
                status = "query"
            # or if the query failed and found only multiple likely locations
            elif "search/search" in tag.get("href"):
                loc = tag.get("href").split("geo=", 1)[1]
                status = "search"
            else:
                # neither pattern matched: skip, so loc is never read unassigned
                continue
            # find the correct location among the possible results of the query
            if loc == name2:
                if status == "query":
                    city_path = tag.get("href")
                elif status == "search":
                    search_path = tag.get("href")
                    city_path = BeautifulSoup(requests.get(web_path + search_path).text,
                                              "lxml").find("a", class_="redglow").get("href").split()[0]
                soup = BeautifulSoup(requests.get(web_path + city_path).text, "lxml") \
                    .find("div", class_="p402_premium") \
                    .find_all("table", class_="weather-table")
                for sp in soup:
                    cat = sp.find("div", {"id": "h4font"}).string
                    if cat in varNames:
                        val = sp.find_next_sibling("table").find("td", class_="data").string
                        if cat == "Average Precipitation":
                            precipitation = val
                        elif cat == "Average Number of Days With Precipitation":
                            avg_days_precip = val
                        elif cat == "Average Number of Days Below 32F/0C":
                            avg_days_below_0 = val
                        elif cat == "Average Number of Days Above 90F/32C":
                            avg_days_above_32 = val
                        elif cat == "Average Number of Rainy Days":
                            avg_rainy_days = val
        temp = pd.DataFrame({'cty': name2, 'cty2': name[0], 'abb': name[1],
                             'avg_precipitation': precipitation,
                             'avg_precipitation_mm': float(precipitation) * 25.4,
                             'avg_days_precip': avg_days_precip,
                             'avg_rainy_days': avg_rainy_days,
                             'avg_days_below_0': avg_days_below_0,
                             'avg_days_above_32': avg_days_above_32}, index=[0])
        df_weather = pd.concat([df_weather, temp], axis=0)
        df_weather.to_csv("df_weather.csv")
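        # (the CSV is rewritten after every city, so an interrupted scrape
        # loses at most the city currently being processed)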
        # re-initialize for the next step
        precipitation = -1
        avg_days_precip = -1
        avg_rainy_days = -1
        avg_days_below_0 = -1
        avg_days_above_32 = -1
        cty = -1
        precip = -1
        cty2 = -1
        abbr = -1
    # progress display: carriage return first so the count overwrites in place
    stdout.write("\rStep = %d" % step)
    stdout.flush()
    step += 1
# time elapsed since scraping started
print("--- %s seconds ---" % (time.time() - start_time))