Skip to content

Instantly share code, notes, and snippets.

@barretts
Last active November 19, 2019 17:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barretts/059d53371e4758a959a1d1aeba2a51a7 to your computer and use it in GitHub Desktop.
Save barretts/059d53371e4758a959a1d1aeba2a51a7 to your computer and use it in GitHub Desktop.
Change Detection script from https://torvald.no/web-change-detection.html
#!/usr/bin/python3
# Usage (crontab entry; runs the script at minute 0 of every hour):
# 0 * * * * /home/badguy90/bin/change_detection.py
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import difflib
import hashlib
import os
# Directory prefix for the per-page snapshot files written by the main loop.
state_directory = "/tmp/"

# Pages to watch, as (url, CSS selector) pairs. Only the first element that
# matches the selector on each page is tracked.
pages = [
    (
        "https://stortinget.no/no/Stottemeny/Stilling-ledig/",
        ".jobbnorge-joblist-table",
    ),
    (
        "https://stortinget.no/no/Saker-og-publikasjoner/Sporsmal/Skriftlige-sporsmal-og-svar/Skriftlig-sporsmal/?qid=74380",
        "#main-content",
    ),
    (
        "https://www.smalhans.no/matogvin",
        ".menu-block",
    ),
    (
        "https://www.digitalocean.com/legal/privacy-policy/",
        ".www-Section",
    ),
]

# Path handed to Selenium as the chromedriver executable.
# NOTE(review): this looks like a placeholder - set it to the real location.
chrome_driver = "/path/to/chromedriver"

# Chrome is started headless so the script can run without a display.
chrome_options = Options()
chrome_options.add_argument("--headless")
def get_element_from_url(url, selector):
    """Render *url* in headless Chrome and return one element as pretty HTML.

    The page is loaded through Selenium so that JavaScript-generated content
    is present, then the rendered ``<body>`` is parsed with BeautifulSoup and
    queried with the CSS *selector*.

    Args:
        url: Address of the page to load.
        selector: CSS selector identifying the element to extract.

    Returns:
        The prettified HTML of the first matching element, or ``None`` when
        nothing matches (a message is printed in that case).
    """
    # NOTE(review): recent Selenium 4 releases replace ``executable_path``
    # with a Service object - confirm against the installed version.
    driver = webdriver.Chrome(
        options=chrome_options, executable_path=chrome_driver
    )
    try:
        driver.get(url)
        html = driver.execute_script("return document.body.innerHTML")
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # headless Chrome and chromedriver process running.
        driver.quit()

    # Name the parser explicitly so the output does not depend on which
    # optional parsers are installed, and so bs4 emits no warning on stderr.
    soup = BeautifulSoup(html, "html.parser")
    selected = soup.select(selector)
    if not selected:
        print("Selector {} not found at {}".format(selector, url))
        return None
    if len(selected) > 1:
        print("Found multiple items with selector {} at {}. Selecting first.".format(selector, url))
    return selected[0].prettify()
# For every watched page: fetch the selected element, print a diff against
# the snapshot stored by the previous run, then store the new snapshot.
for url, selector in pages:
    # The MD5 hex digest of url+selector is used purely as a stable,
    # filesystem-safe name for this page's snapshot file.
    identifier = hashlib.md5((url + selector).encode("utf-8")).hexdigest()
    file_name = os.path.join(state_directory, "{}.txt".format(identifier))
    html_new = get_element_from_url(url, selector)

    # Could not get content.
    if not html_new:
        continue

    # No snapshot yet: save the new input and continue, the diff will be
    # checked on the next run.
    if not os.path.exists(file_name):
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(html_new)
        continue

    # Explicit UTF-8 keeps reads and writes independent of the locale the
    # script happens to run under.
    with open(file_name, "r", encoding="utf-8") as f:
        html_old = f.read()

    # splitlines() strips line endings, so lineterm="" keeps the generated
    # header lines newline-free as well.
    diff = difflib.unified_diff(
        html_old.splitlines(),
        html_new.splitlines(),
        lineterm="",
    )
    # The first three lines are only the ---/+++ file headers and the first
    # @@ hunk marker.
    diff_output = "\n".join(list(diff)[3:])
    if diff_output:
        print(' ***** {} ***** '.format(url))
        print("\n" + diff_output + "\n")

    # Save the new version to file.
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(html_new)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment