# Last active September 23, 2019 21:24
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import time
import datetime
import pandas as pd
# Endpoints to scrape. Both are blank in this file and must be filled in
# before the script can fetch anything.
weather_data_url = ''
tide_table_url = ''

# Key under which tide_finder() stores its results inside `data`
# (referred to elsewhere in this file as the "tfr" string).
tide_finder_key = "tide_finder_results"

# Shared result store for the whole script. Once populated it is expected
# to look roughly like this (shown as JSON):
#
#   data: {
#       url_string_1: {...},
#       url_string_2: {...},
#       ...,
#       url_string_n: {...},
#       tfr_string: {
#           "Mon 23 September": {
#               "12:25am": "Low Tide",
#               "6:37am": "High Tide",
#               "12:43pm": "Low Tide",
#               "6:53pm": "High Tide"
#           },
#           "Tues 24 September": {
#               "Maybe not am": "Low Tide",
#               "Maybe am": "High Tide",
#               "Maybe not pm": "Low Tide",
#               "Maybe pm": "High Tide"
#           },
#           "another date": {...}
#       }
#   }
data = dict()

# Reference collections describing which scraped readings count as
# acceptable. They are only ever probed for membership ("is this value in
# here?"), never iterated, so they are sets rather than lists: a set lookup
# hashes the candidate and checks a single slot (O(1) on average), whereas a
# list lookup may have to walk every element (O(n)).
#
# NOTE(review): wave_height_set has no entries for '1.0 ft', '2.1 ft' or
# '2.2 ft', even though the numeric range below (0.1 - 2.9) covers them.
# Confirm whether those omissions are intentional.
wave_height_set = {
    '0.1 ft', '0.2 ft', '0.3 ft', '0.4 ft', '0.5 ft',
    '0.6 ft', '0.7 ft', '0.8 ft', '0.9 ft',
    '1.1 ft', '1.2 ft', '1.3 ft', '1.4 ft', '1.5 ft',
    '1.6 ft', '1.7 ft', '1.8 ft', '1.9 ft',
    '2.0 ft',
    '2.3 ft', '2.4 ft', '2.5 ft', '2.6 ft', '2.7 ft', '2.8 ft', '2.9 ft',
}
wave_interval_set = {
    '1 sec', '2 sec', '3 sec', '4 sec', '5 sec', '6 sec', '7 sec',
}
wind_direction_set = {
    "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW",
}

# Simpler alternative to the string sets above: convert the scraped text to
# numbers and compare against inclusive lower/upper bounds.
min_height = 0.1
max_height = 2.9
min_interval = 1
max_interval = 7
# This function retrieves the wave height (and the other buoy readings).
def wave_height_finder(data_d: dict, url_arg: str):
    """Scrape the current buoy readings from ``url_arg`` into ``data_d``.

    The page is fetched with ``urlopen`` and parsed with BeautifulSoup. A new
    sub-dict is stored at ``data_d[url_arg]`` with these keys:

    * ``"time_now"`` - ``datetime.datetime`` taken when the page was scraped
    * ``"wave_height"``, ``"wave_interval"``, ``"wind_direction"``,
      ``"wind_speed"``, ``"air_temp"``, ``"water_temp"`` - the stripped text
      of the table cell that follows the matching label cell.

    :param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param url_arg: Address of the page to scrape; also used as the key of
        the new sub-dict in ``data_d``.
    :type url_arg: str
    :return: Returns nothing, instead it modifies the passed in dictionary
        reference, data_d.
    :rtype: None
    :raises AttributeError: if one of the expected label cells is not on the
        page (``find`` then returns ``None``).
    """
    # Result key -> exact text of the <td> label that precedes the value.
    field_labels = {
        "wave_height": 'Wave Height (WVHT):',
        "wave_interval": 'Dominant Wave Period (DPD):',
        "wind_direction": 'Wind Direction (WDIR):',
        "wind_speed": 'Wind Speed (WSPD):',
        "air_temp": 'Air Temperature (ATMP):',
        "water_temp": 'Water Temperature (WTMP):',
    }

    # Read the whole response; the with-block closes the connection even if
    # reading fails.
    with uReq(url_arg) as uClient:
        page_html = uClient.read()

    page_soup = BeautifulSoup(page_html, "html.parser")

    readings = data_d[url_arg] = {}
    # Timestamp for this scrape; display_results() calls .strftime() on it.
    readings["time_now"] = datetime.datetime.now()
    for key, label in field_labels.items():
        label_cell = page_soup.find('td', string=label)
        readings[key] = label_cell.find_next_sibling().get_text().strip()
def tide_finder(tfr: str, data_d: dict, table_url: str):
    """Record the tide table found at ``table_url`` and summarise it.

    The first HTML table returned by ``pd.read_html(table_url)`` is read; it
    must contain the columns ``'Tide'`` and ``'Time (EDT) & Date'``. Each
    time/date cell is expected to look like ``"<time> (<date>)"``.

    A new sub-dict is created at ``data_d[tfr]`` (replacing any previous
    value). It maps each date string to a dict of ``time -> tide state``.
    E.G. using json formatting to show example of dict structure:

        data_d: {
            url_string_1: {...},
            url_string_n: {...},
            tfr_string: {
                "Mon 23 September": {
                    "12:25am": "Low Tide",
                    "6:37am": "High Tide",
                    "12:43pm": "Low Tide",
                    "6:53pm": "High Tide"
                },
                "another date": {...}
            }
        }

    :param tfr: Key under which the tide data is stored in ``data_d``.
    :type tfr: str
    :param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param table_url: Location of the page holding the tide table; passed
        straight to ``pd.read_html``.
    :type table_url: str
    :return: One line per recorded tide, formatted as a tab followed by
        ``"<tide state>: <time>"``, joined with newlines. The date is not
        part of the returned string; it is only present in ``data_d[tfr]``.
    :rtype: str
    """
    # Using Pandas to parse through the html table found in the URL containing Tide Data
    tide_table = pd.read_html(table_url)[0]
    tide_ = tide_table['Tide'].values
    time_date = tide_table['Time (EDT) & Date'].values

    by_date = data_d[tfr] = {}
    # Map tide state to date and time:
    #   data_d[tfr][date][time in that date] = tide state
    for tide_state, stamp in zip(tide_, time_date):
        if not isinstance(stamp, str):
            # Blank cells come back from pandas as NaN rather than text.
            continue
        # "<time> (<date>)" -> time before the parenthesis, date inside it.
        # A cell without "(" is filed under the empty-string date.
        time_part, _, date_part = stamp.partition('(')
        _time = time_part.strip()
        _date = date_part.strip().rstrip(')').strip()
        by_date.setdefault(_date, {})[_time] = tide_state

    # Build the summary from the entry just written under `tfr`, not from a
    # module-level key, so any caller-supplied key works.
    tides_string = [
        f"\t{state}: {time_key}"
        for times in by_date.values()
        for time_key, state in times.items()
    ]
    return "\n".join(tides_string)
def display_results(url_d: dict):
    """Print the scraped readings for one url and a surf verdict.

    A repeatable output pattern that accepts one of the sub-dictionaries
    created per url by wave_height_finder(...). This lets the scraping step
    gather data from several urls and pass each result here for display.

    Conditions are reported as good when all of the following hold:
    wave height is within ``min_height``..``max_height``, wave interval is
    within ``min_interval``..``max_interval``, the abbreviated wind direction
    is in ``wind_direction_set`` and the whole-number wind speed is below 17
    (same unit as the scraped text).

    :param url_d: A dictionary that contains all the scraped data specific to
        a single url. Keys read: ``"time_now"`` (must support ``strftime``),
        ``"wave_height"``, ``"wave_interval"``, ``"wind_direction"``,
        ``"wind_speed"``, ``"air_temp"``, ``"water_temp"``.
    :type url_d: dict
    :return: Nothing; everything is written to stdout.
    :rtype: None
    """
    # Loading Screens (the pauses between them are currently disabled)
    print("Hacking Weather Sensors...")
    print("Retrieving Government Data...")
    print("Decrypting Classified Documents...")
    print("Initializing Data...")
    print("-----Current Buoy Data for Boston Harbor:-----")

    # Wind direction: drop parentheses and keep the leading token only.
    wind_direction_abbreviated = (
        url_d["wind_direction"].strip().replace("(", "").replace(")", "").split()[0]
    )

    # Wind speed: whole-number part of the leading token. Taking the first
    # whitespace-separated token before cutting at '.' also handles readings
    # that have no decimal point.
    wind_speed_abbreviated_int = int(url_d["wind_speed"].strip().split()[0].split('.')[0])

    # 12-hour clock; the format yields "YYYY/MM/DD HH:MMam|pm".
    current_time_sliced = url_d["time_now"].strftime('%Y/%m/%d %I:%M%p')
    date_, time_ = current_time_sliced.split(' ')
    # Removing first character if it's '0', so it doesn't read as '08:07pm', etc
    if time_[0] == "0":
        time_ = time_[1:]

    print("Date:", date_)
    print("Current Time:", time_)
    print("Location: Boston Harbor")

    # Printing of current conditions
    print("Current Wave Height:", url_d["wave_height"])
    print("Current Wave Interval:", url_d["wave_interval"])
    # The value shown here is the wind direction, so the label says so.
    print("Current Wind Direction:", url_d["wind_direction"])
    print("Current Wind Speed:", url_d["wind_speed"])
    print("Current Air Temp:", url_d["air_temp"])
    print("Current Water Temp:", url_d["water_temp"])

    # Decide whether current conditions are generating good waves by
    # converting the readings to numbers and range-checking them, instead of
    # looking the raw strings up in a list of accepted values.
    height = float(url_d["wave_height"].strip().split()[0])
    # float (not int) so a period reported with decimals does not crash.
    interval = float(url_d["wave_interval"].strip().split()[0])

    good_waves = (
        min_height <= height <= max_height
        and min_interval <= interval <= max_interval
        and wind_direction_abbreviated in wind_direction_set
        and wind_speed_abbreviated_int < 17
    )
    if good_waves:
        print("Good Waves Right Now in Boston! Go out & Surf!")
    else:
        print("Summary: Unfortunately, surf conditions in Boston are not good right now.")
if __name__ == '__main__':
    # data and tide_finder_key are defined at the top of this file.
    # data is an empty dictionary.
    # tide_finder_key is an arbitrary string ("tide_finder_results"); all it
    # needs to satisfy is that it is not the same as any url that you intend
    # to scrape, as those urls are used as the other keys in the data dict.

    # Scrape the buoy page first so that data holds a per-url entry to show.
    wave_height_finder(data, weather_data_url)

    # for implementation of how data[tfr] is set up, see tide_finder() definition
    tides = tide_finder(tide_finder_key, data, tide_table_url)

    for url_key in data:
        # Every key except the tide key is a scraped url.
        if url_key != tide_finder_key:
            display_results(data[url_key])

    # NOTE(review): the original loop body was lost; printing the tide
    # summary here is the presumed intent of the otherwise unused `tides`.
    print(tides)

    ## Useful Info:
    # The following are the wind_direction values (abbreviated wind directions)
    # need to do the same for wind speed ... ie when does it become too windy?
    # West = GOOD
    # East = BAD
    # NNE = North-Northeast
    # NE = Northeast
    # ENE = East-Northeast
    # E = East
    # ESE = East-Southeast
    # SE = Southeast
    # SSE = South-Southeast
    # S = South
    # SSW = South-Southwest
    # SW = Southwest
    # WSW = West-Southwest
    # W = West
    # WNW = West-Northwest
    # NW = Northwest
    # NNW = North-Northwest
# Author's note: Made some last-second updates to the tide_finder docstring
# for clarity on the return type. I also forgot to update the function
# signatures of both the wave_height_finder and tide_finder functions to
# accept their respective url argument params.

