@RyanCPeters
Last active September 23, 2019 21:24
from bs4 import BeautifulSoup
from urllib.request import urlopen as uReq
import time
import datetime
import pandas as pd
# setting up url address paths
weather_data_url = 'https://www.ndbc.noaa.gov/station_page.php?station=44013'
tide_table_url = 'https://www.tide-forecast.com/locations/Castle-Island-Boston-Harbor-Massachusetts/tides/latest'
# key alias for tide_finder results, a.k.a. the tfr string
tide_finder_key = "tide_finder_results"
# The dictionary that we'll be saving all of our result strings in;
# The dict's final structure should look something like this (using json formatting):
# data: {
# url_string_1: {...},
# url_string_2: {...},
# ...,
# url_string_n: {...},
# tfr_string: {
# "Mon 23 September":{
# "12:25am": "Low Tide",
# "6:37am": "High Tide",
# "12:43pm": "Low Tide",
# "6:53pm": "High Tide"
# },
# "Tues 24 September": {
# "Maybe not am": "Low Tide",
# "Maybe am": "High Tide",
# "Maybe not pm": "Low Tide",
# "Maybe pm": "High Tide"
# },
# "another date": {...}
# }
# }
data = {}
# Creating the reference sets that will be used to ensure data results are acceptable.
# Because we aren't iterating over the collections, just checking whether specific
# elements exist inside them, we want to use the set API rather than the list API.
#
# When a collection is only used to check for the existence of a specific element, a set will almost
# always perform faster than a list. Where a list will require iteration over potentially all elements
# in the collection, a set will only need to build a hash for the search object and see if
# anything already exists at that hash's associated index in the set. This hashing is an
# O(1) constant time operation, as opposed to iteration, which is an O(n) operation whose
# cost grows with the size of the list being checked.
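# A minimal sketch of the lookup difference (the list below is hypothetical and only shown
# for comparison; it is not used anywhere else in this script):
#   wind_direction_list = ["SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"]
#   "WNW" in wind_direction_list   # O(n): walks the list element by element
#   "WNW" in wind_direction_set    # O(1): hashes "WNW" and checks a single bucket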
wave_height_set = {'0.1 ft', '0.2 ft', '0.3 ft', '0.4 ft', '0.5 ft', '0.6 ft', '0.7 ft', '0.8 ft', '0.9 ft',
'1.1 ft', '1.2 ft', '1.3 ft', '1.4 ft', '1.5 ft', '1.6 ft', '1.7 ft', '1.8 ft','1.9 ft',
'2.0 ft','2.3 ft','2.4 ft','2.5 ft','2.6 ft','2.7 ft','2.8 ft','2.9 ft'}
wave_interval_set = {'1 sec', '2 sec', '3 sec', '4 sec', '5 sec', '6 sec', '7 sec'}
wind_direction_set = {"SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"}
# As a further simplification, it is easier to convert the scraped data into floats and
# ints, which we can then compare against the min/max bounds below.
min_height,max_height = 0.1, 2.9
min_interval, max_interval = 1, 7
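# For example (illustrative value, not live data), a scraped reading such as '1.4 ft'
# reduces to float('1.4 ft'.split()[0]) == 1.4, which can then be checked with
# min_height <= 1.4 <= max_height.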
# This function scrapes the current buoy readings (wave height, wave period, wind, and temperatures)
def wave_height_finder(data_d:dict, url_arg:str):
"""Populates the data_d dict with the scraped data.
    This is the only function which interacts with the BeautifulSoup scraping API.
:param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param url_arg: The URL of the NDBC station page to scrape.
    :type url_arg: str
:return: Returns nothing, instead it modifies the passed in dictionary reference, data_d.
:rtype: None
"""
#list of URLs to scrape from
my_url = [url_arg]
for url in my_url:
        # open a connection to the URL and request the page
uClient = uReq(url)
        # this will offload our content into a variable
page_html = uClient.read()
# closes our client
uClient.close()
page_soup = BeautifulSoup(page_html, "html.parser")
data_d[url] = {}
        # Fetch each reading and store it under a descriptive key
data_d[url]["time_now"] = datetime.datetime.now() # timestamp for this iteration of web scrape
data_d[url]["wave_height"] = page_soup.find('td', string='Wave Height (WVHT):').find_next_sibling().get_text().strip()
data_d[url]["wave_interval"] = page_soup.find('td', string='Dominant Wave Period (DPD):').find_next_sibling().get_text().strip()
data_d[url]["wind_direction"] = page_soup.find('td', string='Wind Direction (WDIR):').find_next_sibling().get_text().strip()
data_d[url]["wind_speed"] = page_soup.find('td', string='Wind Speed (WSPD):').find_next_sibling().get_text().strip()
data_d[url]["air_temp"] = page_soup.find('td', string='Air Temperature (ATMP):').find_next_sibling().get_text().strip()
data_d[url]["water_temp"] = page_soup.find('td', string='Water Temperature (WTMP):').find_next_sibling().get_text().strip()
def tide_finder(tfr:str, data_d:dict, table_url:str):
"""A one shot function that reports details regarding high and low tides for the next 24 hours.
    Will instantiate a new sub-dict inside data_d using tfr as the key. That sub-dict uses
    date strings as keys to further sub-dicts, which in turn map time stamps onto the
    corresponding high/low tide label.
E.G. using json formatting to show example of dict structure:
data_d: {
url_string_1: {...},
url_string_2: {...},
...,
url_string_n: {...},
tfr_string: {
"Mon 23 September":{
"12:25am": "Low Tide",
"6:37am": "High Tide",
"12:43pm": "Low Tide",
"6:53pm": "High Tide"
},
"Tues 24 September": {
"Maybe not am": "Low Tide",
"Maybe am": "High Tide",
"Maybe not pm": "Low Tide",
"Maybe pm": "High Tide"
},
"another date": {...}
}
}
    :param tfr: The key under which the tide results will be stored in data_d.
    :type tfr: str
    :param data_d: A dictionary reference where result strings will be stored.
    :type data_d: dict
    :param table_url: The URL of the tide table page to parse.
    :type table_url: str
:return: A string representing the date, time, and tide state. It will also modify the
    data_d dict reference to include the tfr string as a new key which contains the tide data.
:rtype: str
"""
# Using Pandas to parse through the html table found in the URL containing Tide Data
tide_table = pd.read_html(table_url)[0]
tide_ = tide_table['Tide'].values
time_date = tide_table['Time (EDT) & Date'].values
data_d[tfr] = {}
# this loop will map tide state to date and time:
# format is:
# data[tfr][date][time in that date] = tide state
for (t, i) in zip(tide_, time_date):
time_date_sliced = i.split('(')
_time, _date = time_date_sliced[0],time_date_sliced[1][:-1]
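        # e.g. (illustrative sample, not live data) a cell such as '12:25 AM(Mon 23 September)'
        # splits into ['12:25 AM', 'Mon 23 September)'], so _time -> '12:25 AM' and
        # _date -> 'Mon 23 September' once the trailing ')' is trimmed off.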
# if _date is not already in data[tfr], then instantiate it as a new dict
data_d[tfr][_date] = data_d[tfr].get(_date,{})
        # map this time stamp onto its tide state under the date's sub-dict
data_d[tfr][_date][_time] = t
tides_string = []
    for date_key, ref in data_d[tfr].items():
tides_string.append(date_key)
for time_key,v in ref.items():
tides_string.append(f"\t{v}: {time_key}")
return "\n".join(tides_string)
def display_results(url_d:dict):
"""A repeatable output pattern that accepts the dictionary references created per url in the
wave_height_finder(...) function. This allows the scraping process to gather data from multiple
url resources and then pass the resulting sub-dictionaries into this function to display that
data.
    :param url_d: A dictionary that contains all the scraped data specific to a single url.
    :type url_d: dict
    :return: Returns nothing; the results are printed to stdout.
    :rtype: None
"""
# Loading Screens
print("\n")
print("Hacking Weather Sensors...")
print("\n")
    # time.sleep(1)
print("Retrieving Government Data...")
print("\n")
    # time.sleep(1)
print("Decrypting Classified Documents...")
print("\n")
    # time.sleep(1)
print("Initializing Data...")
print("\n")
    # time.sleep(1)
print("\n")
print("-----Current Buoy Data for Boston Harbor:-----")
    # time.sleep(1)
print("\n")
    # Wind Direction Slicing
wind_direction_abbreviated = url_d["wind_direction"].strip().replace("(","").replace(")","").split()[0]
    # Wind Speed Slicing
# wind_speed_abbreviated = url_d["wind_speed"].split('.')
# wind_speed_sliced = wind_speed_abbreviated[0]
# wind_speed_abbreviated_int = int(wind_speed_sliced)
wind_speed_abbreviated_int = int(url_d["wind_speed"].split('.')[0]) # equivalent one-liner
# using 12-hour format
current_time_sliced = url_d["time_now"].strftime('%Y/%m/%d %I:%M%p')
    # separating Date from Time (format: 'YYYY/MM/DD HH:MMAM/PM')
date_today = current_time_sliced.split(' ')
date_ = date_today[0]
time_ = date_today[1]
# Removing first letter if it's '0', so it doesn't read as '08:07pm', etc
if time_[0]=="0":
time_ = time_[1:]
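    # e.g. (illustrative values) a timestamp of 2019-09-23 20:07 becomes
    # date_ = '2019/09/23' and time_ = '8:07PM' after the leading zero is stripped.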
print("Date:", date_)
print("Current Time:", time_)
print("Location: Boston Harbor")
    # time.sleep(1)
print("\n")
# Printing of current conditions
print("Current Wave Height:", url_d["wave_height"])
print("Current Wave Interval:", url_d["wave_interval"])
print("Current Wave Direction:", url_d["wind_direction"])
print("Current Wind Speed:", url_d["wind_speed"])
print("Current Air Temp:", url_d["air_temp"])
print("Current Water Temp:", url_d["water_temp"])
print("\n")
# Here is the logic used to determine if current conditions are generating good waves
    # UPDATE NOTE: We don't need to check against a list of values when we can more easily
    # convert the input data to floats and ints and then do range comparisons in the if-block.
height = float(url_d["wave_height"].strip().split()[0])
interval = int(url_d["wave_interval"].strip().split()[0])
if min_height <= height <= max_height \
and min_interval <= interval <= max_interval \
and wind_direction_abbreviated in wind_direction_set \
and wind_speed_abbreviated_int<17:
        # time.sleep(1)
        print("Good Waves Right Now in Boston! Go out & Surf!")
    else:
        # time.sleep(1)
        print("Summary: Unfortunately, surf conditions in Boston are not good right now.")
    # time.sleep(1)
print("\n")
if __name__ == '__main__':
    # data and tide_finder_key are defined at the top of this file.
    # data is an empty dictionary.
    # tide_finder_key is a string that I arbitrarily initialized to be "tide_finder_results".
    # All tide_finder_key needs to satisfy is that it is not the same as any url that you
    # intend to scrape, as those urls will be used as the other unique keys in the data dict.
    wave_height_finder(data, weather_data_url)
    tides = tide_finder(tide_finder_key, data, tide_table_url)
print(tides)
# for implementation of how data[tfr] is set up, see tide_finder() definition
for url_key in data:
if url_key != tide_finder_key:
display_results(data[url_key])
## Useful Info:
# The following are the wind_direction values (abbreviated wind directions)
# need to do the same for wind speed ... ie when does it become too windy?
# West = GOOD
# East = BAD
# NNE = North-Northeast
# NE = Northeast
# ENE = East-Northeast
# E = East
# ESE = East-Southeast
# SE = Southeast
# SSE = South-Southeast
# S = South
# SSW = South-Southwest
# SW = Southwest
# WSW = West-Southwest
# W = West
# WNW = West-Northwest
# NW = Northwest
# NNW = North-Northwest
@RyanCPeters (Author):

Made some last-second updates to the tide_finder docstring for clarity on the return type. I also forgot to update the function signatures for both wave_height_finder and tide_finder to accept their respective url argument params.
