@RyanCPeters
Last active September 23, 2019 21:24
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import time
import datetime
import pandas as pd
# setting up url address paths
weather_data_url = 'https://www.ndbc.noaa.gov/station_page.php?station=44013'
tide_table_url = 'https://www.tide-forecast.com/locations/Castle-Island-Boston-Harbor-Massachusetts/tides/latest'
# key alias for tide_finder results, a.k.a. the tfr string
tide_finder_key = "tide_finder_results"
# The dictionary that we'll be saving all of our result strings in;
# The dict's final structure should look something like this (using json formatting):
# data: {
# url_string_1: {...},
# url_string_2: {...},
# ...,
# url_string_n: {...},
# tfr_string: {
# "Mon 23 September":{
# "12:25am": "Low Tide",
# "6:37am": "High Tide",
# "12:43pm": "Low Tide",
# "6:53pm": "High Tide"
# },
# "Tues 24 September": {
# "Maybe not am": "Low Tide",
# "Maybe am": "High Tide",
# "Maybe not pm": "Low Tide",
# "Maybe pm": "High Tide"
# },
# "another date": {...}
# }
# }
data = {}
# Creating the reference sets that will be used to ensure data results are acceptable.
# Because we aren't iterating over the collections, just checking whether specific
# elements exist inside them, we want to use the set API rather than the list API.
#
# When a collection is only used to check for the existence of a specific element, a set will almost
# always perform faster than a list. Where a list will require iteration over potentially all elements
# in the collection, a set will only need to build a hash for the search object and see if
# anything already exists at that hash's associated index in the set. This hashing is an
# O(1) constant time operation, as opposed to iteration, which is an O(n) operation whose
# cost grows with the size of the list being checked.
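# A minimal sketch of the lookup difference (the list below is hypothetical and only shown
# for comparison; it is not used anywhere else in this script):
#   wind_direction_list = ["SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"]
#   "WNW" in wind_direction_list   # O(n): walks the list element by element
#   "WNW" in wind_direction_set    # O(1): hashes "WNW" and checks a single bucket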
wave_height_set = {'0.1 ft', '0.2 ft', '0.3 ft', '0.4 ft', '0.5 ft', '0.6 ft', '0.7 ft', '0.8 ft', '0.9 ft',
'1.1 ft', '1.2 ft', '1.3 ft', '1.4 ft', '1.5 ft', '1.6 ft', '1.7 ft', '1.8 ft','1.9 ft',
'2.0 ft','2.3 ft','2.4 ft','2.5 ft','2.6 ft','2.7 ft','2.8 ft','2.9 ft'}
wave_interval_set = {'1 sec', '2 sec', '3 sec', '4 sec', '5 sec', '6 sec', '7 sec'}
wind_direction_set = {"SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"}
# As a further simplification, it is easier to convert the scraped data into floats and
# ints, which we can then compare against the min/max bounds below.
min_height,max_height = 0.1, 2.9
min_interval, max_interval = 1, 7
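# For example (illustrative value, not live data), a scraped reading such as '1.4 ft'
# reduces to float('1.4 ft'.split()[0]) == 1.4, which can then be checked with
# min_height <= 1.4 <= max_height.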
# This function scrapes the current buoy readings (wave height, wave period, wind, and temperatures)
def wave_height_finder(data_d:dict, url_arg:str):
"""Populates the data_d dict with the scraped data.
    This is the only function which interacts with the BeautifulSoup scraping API.
:param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param url_arg: The URL of the NDBC station page to scrape.
    :type url_arg: str
:return: Returns nothing, instead it modifies the passed in dictionary reference, data_d.
:rtype: None
"""
#list of URLs to scrape from
my_url = [url_arg]
for url in my_url:
        # open a connection to the URL and request the page
uClient = uReq(url)
        # this will offload our content into a variable
page_html = uClient.read()
# closes our client
uClient.close()
page_soup = BeautifulSoup(page_html, "html.parser")
data_d[url] = {}
        # Fetch each reading and store it under a descriptive key
data_d[url]["time_now"] = datetime.datetime.now() # timestamp for this iteration of web scrape
data_d[url]["wave_height"] = page_soup.find('td', string='Wave Height (WVHT):').find_next_sibling().get_text().strip()
data_d[url]["wave_interval"] = page_soup.find('td', string='Dominant Wave Period (DPD):').find_next_sibling().get_text().strip()
data_d[url]["wind_direction"] = page_soup.find('td', string='Wind Direction (WDIR):').find_next_sibling().get_text().strip()
data_d[url]["wind_speed"] = page_soup.find('td', string='Wind Speed (WSPD):').find_next_sibling().get_text().strip()
data_d[url]["air_temp"] = page_soup.find('td', string='Air Temperature (ATMP):').find_next_sibling().get_text().strip()
data_d[url]["water_temp"] = page_soup.find('td', string='Water Temperature (WTMP):').find_next_sibling().get_text().strip()
def tide_finder(tfr:str, data_d:dict, table_url:str):
"""A one shot function that reports details regarding high and low tides for the next 24 hours.
    Will instantiate a new sub-dict inside data_d using tfr as the key. That sub-dict uses
    date strings as keys to further sub-dicts, which in turn map time stamps onto the
    corresponding high/low tide label.
E.G. using json formatting to show example of dict structure:
data_d: {
url_string_1: {...},
url_string_2: {...},
...,
url_string_n: {...},
tfr_string: {
"Mon 23 September":{
"12:25am": "Low Tide",
"6:37am": "High Tide",
"12:43pm": "Low Tide",
"6:53pm": "High Tide"
},
"Tues 24 September": {
"Maybe not am": "Low Tide",
"Maybe am": "High Tide",
"Maybe not pm": "Low Tide",
"Maybe pm": "High Tide"
},
"another date": {...}
}
}
    :param tfr: The key under which the tide results will be stored in data_d.
    :type tfr: str
    :param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param table_url: The URL of the tide table page to parse.
    :type table_url: str
:return: A string representing the date, time, and tide state. It will also modify the
    data_d dict reference to include the tfr string as a new key which contains the tide data.
:rtype: str
"""
# Using Pandas to parse through the html table found in the URL containing Tide Data
tide_table = pd.read_html(table_url)[0]
tide_ = tide_table['Tide'].values
time_date = tide_table['Time (EDT) & Date'].values
data_d[tfr] = {}
# this loop will map tide state to date and time:
# format is:
# data[tfr][date][time in that date] = tide state
for (t, i) in zip(tide_, time_date):
time_date_sliced = i.split('(')
_time, _date = time_date_sliced[0],time_date_sliced[1][:-1]
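        # e.g. (illustrative sample, not live data) a cell such as '12:25 AM(Mon 23 September)'
        # splits into ['12:25 AM', 'Mon 23 September)'], so _time -> '12:25 AM' and
        # _date -> 'Mon 23 September' once the trailing ')' is trimmed off.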
# if _date is not already in data[tfr], then instantiate it as a new dict
data_d[tfr][_date] = data_d[tfr].get(_date,{})
        # map this time stamp onto its tide state under the date's sub-dict
data_d[tfr][_date][_time] = t
tides_string = []
    for date_key, ref in data_d[tfr].items():
tides_string.append(date_key)
for time_key,v in ref.items():
tides_string.append(f"\t{v}: {time_key}")
return "\n".join(tides_string)
def display_results(url_d:dict):
"""A repeatable output pattern that accepts the dictionary references created per url in the
wave_height_finder(...) function. This allows the scraping process to gather data from multiple
url resources and then pass the resulting sub-dictionaries into this function to display that
data.
    :param url_d: A dictionary that contains all the scraped data specific to a single url.
    :type url_d: dict
    :return: Returns nothing; the results are printed to stdout.
    :rtype: None
"""
# Loading Screens
print("\n")
print("Hacking Weather Sensors...")
print("\n")
    # time.sleep(1)
print("Retrieving Government Data...")
print("\n")
    # time.sleep(1)
print("Decrypting Classified Documents...")
print("\n")
    # time.sleep(1)
print("Initializing Data...")
print("\n")
    # time.sleep(1)
print("\n")
print("-----Current Buoy Data for Boston Harbor:-----")
    # time.sleep(1)
print("\n")
    # Wind Direction Slicing
wind_direction_abbreviated = url_d["wind_direction"].strip().replace("(","").replace(")","").split()[0]
    # Wind Speed Slicing
# wind_speed_abbreviated = url_d["wind_speed"].split('.')
# wind_speed_sliced = wind_speed_abbreviated[0]
# wind_speed_abbreviated_int = int(wind_speed_sliced)
wind_speed_abbreviated_int = int(url_d["wind_speed"].split('.')[0]) # equivalent one-liner
# using 12-hour format
current_time_sliced = url_d["time_now"].strftime('%Y/%m/%d %I:%M%p')
    # separating Date from Time (format: 'YYYY/MM/DD HH:MMAM/PM')
date_today = current_time_sliced.split(' ')
date_ = date_today[0]
time_ = date_today[1]
# Removing first letter if it's '0', so it doesn't read as '08:07pm', etc
if time_[0]=="0":
time_ = time_[1:]
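    # e.g. (illustrative values) a timestamp of 2019-09-23 20:07 becomes
    # date_ = '2019/09/23' and time_ = '8:07PM' after the leading zero is stripped.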
print("Date:", date_)
print("Current Time:", time_)
print("Location: Boston Harbor")
    # time.sleep(1)
print("\n")
# Printing of current conditions
print("Current Wave Height:", url_d["wave_height"])
print("Current Wave Interval:", url_d["wave_interval"])
print("Current Wave Direction:", url_d["wind_direction"])
print("Current Wind Speed:", url_d["wind_speed"])
print("Current Air Temp:", url_d["air_temp"])
print("Current Water Temp:", url_d["water_temp"])
print("\n")
# Here is the logic used to determine if current conditions are generating good waves
    # UPDATE NOTE: We don't need to check against a list of values when we can more easily
    # convert the input data to floats and ints and then do range comparisons in the if-block.
height = float(url_d["wave_height"].strip().split()[0])
interval = int(url_d["wave_interval"].strip().split()[0])
if min_height <= height <= max_height \
and min_interval <= interval <= max_interval \
and wind_direction_abbreviated in wind_direction_set \
and wind_speed_abbreviated_int<17:
        # time.sleep(1)
        print("Good Waves Right Now in Boston! Go out & Surf!")
    else:
        # time.sleep(1)
        print("Summary: Unfortunately, surf conditions in Boston are not good right now.")
    # time.sleep(1)
print("\n")
if __name__ == '__main__':
    # data and tide_finder_key are defined at the top of this file.
    # data is an empty dictionary.
    # tide_finder_key is a string that I arbitrarily initialized to be "tide_finder_results".
    # All tide_finder_key needs to satisfy is that it is not the same as any url that you
    # intend to scrape, as those urls will be used as the other unique keys in the data dict.
    wave_height_finder(data, weather_data_url)
    tides = tide_finder(tide_finder_key, data, tide_table_url)
print(tides)
# for implementation of how data[tfr] is set up, see tide_finder() definition
for url_key in data:
if url_key != tide_finder_key:
display_results(data[url_key])
## Useful Info:
# The following are the wind_direction values (abbreviated wind directions)
# need to do the same for wind speed ... ie when does it become too windy?
# West = GOOD
# East = BAD
# NNE = North-Northeast
# NE = Northeast
# ENE = East-Northeast
# E = East
# ESE = East-Southeast
# SE = Southeast
# SSE = South-Southeast
# S = South
# SSW = South-Southwest
# SW = Southwest
# WSW = West-Southwest
# W = West
# WNW = West-Northwest
# NW = Northwest
# NNW = North-Northwest
@RyanCPeters (Author):

Made some last-second updates to the tide_finder docstring for clarity on the return type. I also forgot to update the function signatures for both wave_height_finder and tide_finder to accept their respective url argument params.
