from bs4 import BeautifulSoup
import requests
import mechanize
import cookielib
import pandas as pd
import numpy as np
from sys import stdout
import time

start_time = time.time()
# Dictionary of the US states and their abbreviations
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming'
}
# Browser
br = mechanize.Browser()
# Allow cookies
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
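# (the jar is attached once here, so any cookies weatherbase sets are reused
# across all the per-city requests made below)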
# step counter for the progress display
step = 1

# initialization of the variables we try to retrieve
precipitation = -1
avg_days_precip = -1
avg_rainy_days = -1
avg_days_below_0 = -1
avg_days_above_32 = -1
cty = -1
precip = -1
cty2 = -1
abbr = -1

# variable to check whether the query was successful or required further search
status = ""
# names of the weather features we wish to retrieve
varNames = ["Average Precipitation", "Average Number of Days With Precipitation",
            "Average Number of Days Below 32F/0C", "Average Number of Days Above 90F/32C",
            "Average Number of Rainy Days"]

# initialization of the weather dataframe
df_weather = pd.DataFrame({'cty': [], 'cty2': [], 'abb': [], 'avg_precipitation': [],
                           'avg_precipitation_mm': [], 'avg_days_precip': [],
                           'avg_rainy_days': [], 'avg_days_below_0': [],
                           'avg_days_above_32': []})
# read the dataframe with motorcycles' information
bike = pd.read_csv("bike.csv")
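# (the loop below assumes column 12 of bike.csv, i.e. bike.iloc[:, 12],
# holds the state abbreviation)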
# cities information from the dataframe bike. cty = "City, State", and cty2 = "City"
city_info = pd.DataFrame(index=bike.city, columns=['cty', 'cty2', 'abb'])
city_info['cty2'] = city_info.index
for i in range(len(bike.city)):
    if type(bike.iloc[i, 12]) == str:
        abb = ''.join(bike.iloc[i, 12].split())
    else:
        abb = np.nan
    if abb in states.keys():
        city_info.iat[i, 2] = abb  # column 2 is 'abb'
    else:
        city_info.iat[i, 2] = "NaN"
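# (the literal string "NaN", rather than np.nan, is used as the sentinel so the
# equality checks at the top of the loop below behave predictably)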
# for every city that appears in bike we retrieve the weather information
for city in np.unique(bike.city)[1:]:
    name = [city, city_info.loc[city_info.index == city, 'abb'].iloc[0]]
    if name[0] != "NaN" and name[1] != "NaN" and name[1] in states.keys():
        name2 = " ".join(map(lambda x: x.capitalize(), name[0].split(" "))) \
                + ", " + states[name[1]]
        # Browser options
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        # Follow refresh 0 but do not hang on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Specify the browser to emulate
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                          'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        # open the search page and submit the query for this city
        url = 'http://www.weatherbase.com/search/search.php3'
        br.open(url)
        br.select_form('searchform')
        br.form['query'] = str(city.upper())
        req = br.submit()
        cityR = BeautifulSoup(req.read(), "lxml")
        # the query yields a path linked to the weather figures we seek
        web_path = "http://www.weatherbase.com"
        path_city1 = "recordOutboundLink(this, 'Search', '"
        path_city2 = name2
        path_city3 = ", United States of America'); return false;"
        path_city = path_city1 + path_city2 + path_city3
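        # (path_city appears to reproduce the onclick attribute of the result
        # links; it is assembled here for reference, but the matching below
        # works on the parsed href instead)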
        # the results are recorded in an HTML table; we sift through the table
        # to recover the annual figures corresponding to the weather feature
        # names we defined
        for tag in cityR.find_all("a", class_="redglow"):
            # check that we are on the right URL
            if "weather/weather" in tag.get("href"):
                loc = tag.get("href").split("cityname=", 1)[1].split("-")
                loc = loc[0] + ", " + loc[1]
                status = "query"
            # or if the query failed and found only multiple likely locations
            elif "search/search" in tag.get("href"):
                loc = tag.get("href").split("geo=", 1)[1]
                status = "search"
            else:
                # neither pattern matched: skip, so loc is never read unassigned
                continue
            # find the correct location among the possible results of the query
            if loc == name2:
                if status == "query":
                    city_path = tag.get("href")
                elif status == "search":
                    search_path = tag.get("href")
                    city_path = BeautifulSoup(requests.get(web_path + search_path).text,
                                              "lxml").find("a", class_="redglow").get("href").split()[0]
                soup = BeautifulSoup(requests.get(web_path + city_path).text, "lxml") \
                    .find("div", class_="p402_premium") \
                    .find_all("table", class_="weather-table")
                for sp in soup:
                    cat = sp.find("div", {"id": "h4font"}).string
                    if cat in varNames:
                        val = sp.find_next_sibling("table").find("td", class_="data").string
                        if cat == "Average Precipitation":
                            precipitation = val
                        elif cat == "Average Number of Days With Precipitation":
                            avg_days_precip = val
                        elif cat == "Average Number of Days Below 32F/0C":
                            avg_days_below_0 = val
                        elif cat == "Average Number of Days Above 90F/32C":
                            avg_days_above_32 = val
                        elif cat == "Average Number of Rainy Days":
                            avg_rainy_days = val
        temp = pd.DataFrame({'cty': name2, 'cty2': name[0], 'abb': name[1],
                             'avg_precipitation': precipitation,
                             'avg_precipitation_mm': float(precipitation) * 25.4,
                             'avg_days_precip': avg_days_precip,
                             'avg_rainy_days': avg_rainy_days,
                             'avg_days_below_0': avg_days_below_0,
                             'avg_days_above_32': avg_days_above_32}, index=[0])
        df_weather = pd.concat([df_weather, temp], axis=0)
        df_weather.to_csv("df_weather.csv")
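        # (the CSV is rewritten after every city, so an interrupted scrape
        # loses at most the city currently being processed)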
        # re-initialize for the next step
        precipitation = -1
        avg_days_precip = -1
        avg_rainy_days = -1
        avg_days_below_0 = -1
        avg_days_above_32 = -1
        cty = -1
        precip = -1
        cty2 = -1
        abbr = -1
    # progress display: carriage return first so the count overwrites in place
    stdout.write("\rStep = %d" % step)
    stdout.flush()
    step += 1
# time elapsed since scraping started
print("--- %s seconds ---" % (time.time() - start_time))