WX5280 Live Blog Notifier

config.txt:

[Pushover]
app_token: <your pushover app token>
user_key: <your pushover user key>
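
(The app token comes from registering an application on pushover.net; the user key is shown on your Pushover dashboard once you are signed in.)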

requirements.txt:

BeautifulSoup==3.2.1
argparse==1.3.0
dateutils==0.6.6
python-dateutil==2.4.0
pytz==2014.10
requests==2.5.1
six==1.9.0
wsgiref==0.1.2
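
With these saved as requirements.txt, the dependencies install with:

$ pip install -r requirements.txt

Note the pins: BeautifulSoup 3 (and the print statement in the script below) make this a Python 2 project.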

scrape.py:

import BeautifulSoup
import ConfigParser
import datetime
import dateutil.parser
import json
import os
import pytz
import re
import requests
import sys

def normalize_weekdays(value):
    # Expand abbreviated weekday names (with an optional trailing
    # period) into full names so dateutil can parse them
    search_replace_patterns = [
        (r'\bMon\.?(?!\w)', 'Monday'),
        (r'\bTues?\.?(?!\w)', 'Tuesday'),
        (r'\bWed\.?(?!\w)', 'Wednesday'),
        (r'\bThur?s?\.?(?!\w)', 'Thursday'),
        (r'\bFri\.?(?!\w)', 'Friday'),
        (r'\bSat\.?(?!\w)', 'Saturday'),
        (r'\bSun\.?(?!\w)', 'Sunday'),
    ]
    # Look for matches in the value; replace the first one found
    for pattern, substitution in search_replace_patterns:
        if re.search(pattern, value):
            return re.sub(pattern, substitution, value)
    # If we got this far, no match
    return value
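
# Illustrative example (the input format is assumed, not taken from the blog):
# normalize_weekdays("Tues. Feb 17 (4:30pm)") -> "Tuesday Feb 17 (4:30pm)"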

def normalize_time(value):
    # Strip the parentheses from a time like "(4:30pm)" so the whole
    # string reads as one parseable timestamp
    pattern = r'\((?P<time>[0-9:]+\s*(?:am|pm)?)\)'
    regex = re.search(pattern, value)
    if regex:
        return re.sub(pattern, regex.group('time'), value)
    # If we got here, no match
    return value
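
# Continuing the illustrative example above:
# normalize_time("Tuesday Feb 17 (4:30pm)") -> "Tuesday Feb 17 4:30pm",
# a string dateutil.parser can handle without the parentheses tripping it up.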

def load_entries_from_disk(filepath):
    # Return the previously saved entries, or an empty list if the
    # file is missing or holds invalid JSON
    if os.path.exists(filepath):
        with open(filepath, 'r') as f:
            try:
                return json.loads(f.read())
            except ValueError:
                return []
    else:
        return []

def save_entries_to_disk(entries, filepath):
    with open(filepath, 'w') as f:
        f.write(json.dumps(entries, indent=4))
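
# For reference, entries.json ends up as a list of dicts shaped like
# (illustrative values):
#   {"title": "...", "posted-string": "Tuesday Feb 17 4:30pm",
#    "posted-time": "2015-02-17T16:30:00-07:00"}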

def send_notification(app_token, user_key, message):
    response = requests.post("https://api.pushover.net:443/1/messages.json", data={
        "token": app_token,
        "user": user_key,
        "message": message,
        "sound": "bugle",
        "title": "WX5280 Live Blog Update",
        "url": "http://www.weather5280.com/live-blog/",
        "url_title": "View Online",
    })
    if response.status_code != 200:
        log("Error! Pushover returned status %s" % response.status_code)
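
# Per the Pushover API, "token", "user", and "message" are the required
# fields; "sound", "title", "url", and "url_title" are optional extras.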

def log(message):
    # Append a timestamped line to log.txt and echo it to stdout
    with open('log.txt', 'a') as f:
        message = "%s\t%s" % (datetime.datetime.now().isoformat(), message)
        f.write(message + "\n")
    print message
if __name__ == "__main__":
log("Checking Live Blog Entries")
# Load the config info from the config.txt file
config = ConfigParser.ConfigParser()
config.read("config.txt")
pushover_app_token = config.get('Pushover', 'app_token')
pushover_user_key = config.get('Pushover', 'user_key')
# Make sure config loaded properly
if not pushover_app_token or not pushover_user_key:
message = "Error! config.txt missing app token and/or user key."
log(message)
sys.exit()
# Make the request for the live blog and turn it into a soup object
request = requests.get("http://www.weather5280.com/live-blog/")
soup = BeautifulSoup.BeautifulSoup(request.text)
# Load the entries from disk
existing_entries = load_entries_from_disk('entries.json')
# Keep track of all the entries
scraped_entries = []
# Find all the <h3>'s on the page
for h3_element in soup.findAll('h3'):
# The title is inside the <h3>
title = h3_element.text
# Find the next <p> tag, it contains the time info
p_time_element = h3_element.findNext('p')
raw_time_text = p_time_element.text
# Clean up the time string
time_text = normalize_weekdays(raw_time_text)
time_text = normalize_time(time_text)
# Parse the date string into a datetime object and
# make it timezone-aware
try:
posted_dt = dateutil.parser.parse(time_text)
posted_dt = posted_dt.replace(tzinfo=pytz.timezone('US/Mountain'))
posted_isoformat = posted_dt.isoformat()
except ValueError, e:
posted_isoformat = None
log("Error! Could not parse date: %s" % time_text)
# Add the value to the scraped entries
scraped_entries.append({
'title': title,
'posted-string': time_text,
'posted-time': posted_isoformat,
})
# Sort the scraped entries by date
sorted_scraped_entries = sorted(scraped_entries, key=lambda k: k['posted-time'], reverse=True)
# Save the entries to disk
save_entries_to_disk(sorted_scraped_entries, 'entries.json')
# If there are more scraped entries than existing ones, find
# the newest entry and notify
if len(sorted_scraped_entries) > len(existing_entries):
log("Found new entry. Notifying.")
message = sorted_scraped_entries[0]['title']
send_notification(pushover_app_token, pushover_user_key, message)
else:
log("No new entries found.")

@greencoder (Author):
Usage: $ python scrape.py
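
The script checks once per run, so for timely alerts you would run it on a schedule. For example (illustrative; the gist does not specify one), a crontab entry like

*/10 * * * * cd /path/to/gist && python scrape.py

polls every ten minutes. Note that log.txt and entries.json are written to the current working directory, hence the cd.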
