barretts/change_detection.py

## change_detection.py
#!/usr/bin/python3

# Usage (crontab entry; runs at minute 0 of every hour):
# 0 * * * * /home/badguy90/bin/change_detection.py

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import difflib
import hashlib
import os

# Directory prefix under which one snapshot file per watched page is kept.
state_directory = "/tmp/"

# Path of the chromedriver executable passed to Selenium.
# NOTE(review): this looks like a placeholder - confirm the real path.
chrome_driver = "/path/to/chromedriver"

# Chrome is launched with the --headless flag.
chrome_options = Options()
chrome_options.add_argument("--headless")

# Watched pages as (url, CSS selector) pairs.
pages = [
    (
        "https://stortinget.no/no/Stottemeny/Stilling-ledig/",
        ".jobbnorge-joblist-table",
    ),
    (
        "https://stortinget.no/no/Saker-og-publikasjoner/Sporsmal/Skriftlige-sporsmal-og-svar/Skriftlig-sporsmal/?qid=74380",
        "#main-content",
    ),
    (
        "https://www.smalhans.no/matogvin",
        '.menu-block',
    ),
    (
        "https://www.digitalocean.com/legal/privacy-policy/",
        ".www-Section",
    ),
]

def get_element_from_url(url, selector):
    """Load *url* in Chrome and return the first element matching *selector*.

    The page is opened with Selenium so that the DOM is read after the
    browser has processed it; the ``innerHTML`` of ``<body>`` is then parsed
    with BeautifulSoup and queried with the CSS selector.

    Args:
        url: Address of the page to load.
        selector: CSS selector identifying the element to watch.

    Returns:
        The prettified HTML of the first matching element, or ``None`` when
        nothing matches (a message is printed in that case). When several
        elements match, a notice is printed and the first one is used.
    """
    driver = webdriver.Chrome(
        chrome_options=chrome_options, executable_path=chrome_driver
    )
    try:
        driver.get(url)
        html = driver.execute_script("return document.body.innerHTML")
    finally:
        # Always shut the browser down, even if loading fails; otherwise
        # every call leaves a Chrome/chromedriver process running.
        driver.quit()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed and bs4 emits no "no parser specified"
    # warning on stderr.
    soup = BeautifulSoup(html, "html.parser")
    selected = soup.select(selector)
    if not selected:
        print("Selector {} not found at {}".format(selector, url))
        return None
    if len(selected) > 1:
        print("Found multiple items with selector {} at {}. Selecting first.".format(selector, url))

    return selected[0].prettify()

# For every watched page: fetch the selected element, compare it with the
# snapshot stored by the previous run, print a diff if it changed, and store
# the new snapshot.
for url, selector in pages:
    # The snapshot file is named after the MD5 of url + selector, so each
    # (url, selector) pair gets its own file.
    identifier = hashlib.md5((url+selector).encode('utf-8')).hexdigest()
    file_name = "{}{}.txt".format(state_directory, identifier)

    html_new = get_element_from_url(url, selector)

    # could not get content
    if not html_new:
        continue

    # save new input and continue, we will check diff next time
    if not os.path.exists(file_name):
        # Explicit UTF-8 so non-ASCII page content does not depend on the
        # locale; the context manager closes (and flushes) the file.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(html_new)
        continue

    with open(file_name, "r", encoding="utf-8") as f:
        html_old = f.read()

    # The inputs come from splitlines() and carry no trailing newlines, so
    # lineterm="" keeps the generated control lines newline-free as well
    # (otherwise later "@@" hunk headers add blank lines to the output).
    diff = difflib.unified_diff(
        html_old.splitlines(),
        html_new.splitlines(),
        lineterm=""
    )

    # Drop the first three lines: the "---" and "+++" file headers and the
    # first "@@" hunk header. An unchanged page yields no lines at all.
    diff_output = '\n'.join(list(diff)[3:])

    if diff_output:
        print(' ***** {} ***** '.format(url))
        print("\n" + diff_output + "\n")
        # save new version to file
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(html_new)
	#!/usr/bin/python3

	# Usage
	# 0 * * * * /home/badguy90/bin/change_detection.py

	from bs4 import BeautifulSoup
	from selenium import webdriver
	from selenium.webdriver.chrome.options import Options
	import difflib
	import hashlib
	import os

	# Directory prefix under which one snapshot file per watched page is kept.
	state_directory = "/tmp/"

	# Path of the chromedriver executable passed to Selenium.
	# NOTE(review): this looks like a placeholder - confirm the real path.
	chrome_driver = "/path/to/chromedriver"

	# Chrome is launched with the --headless flag.
	chrome_options = Options()
	chrome_options.add_argument("--headless")

	# Watched pages as (url, CSS selector) pairs.
	pages = [
	    (
	        "https://stortinget.no/no/Stottemeny/Stilling-ledig/",
	        ".jobbnorge-joblist-table",
	    ),
	    (
	        "https://stortinget.no/no/Saker-og-publikasjoner/Sporsmal/Skriftlige-sporsmal-og-svar/Skriftlig-sporsmal/?qid=74380",
	        "#main-content",
	    ),
	    (
	        "https://www.smalhans.no/matogvin",
	        '.menu-block',
	    ),
	    (
	        "https://www.digitalocean.com/legal/privacy-policy/",
	        ".www-Section",
	    ),
	]

	def get_element_from_url(url, selector):
	    """Load *url* in Chrome and return the first element matching *selector*.

	    The page is opened with Selenium so that the DOM is read after the
	    browser has processed it; the ``innerHTML`` of ``<body>`` is then parsed
	    with BeautifulSoup and queried with the CSS selector.

	    Args:
	        url: Address of the page to load.
	        selector: CSS selector identifying the element to watch.

	    Returns:
	        The prettified HTML of the first matching element, or ``None`` when
	        nothing matches (a message is printed in that case). When several
	        elements match, a notice is printed and the first one is used.
	    """
	    driver = webdriver.Chrome(
	        chrome_options=chrome_options, executable_path=chrome_driver
	    )
	    try:
	        driver.get(url)
	        html = driver.execute_script("return document.body.innerHTML")
	    finally:
	        # Always shut the browser down, even if loading fails; otherwise
	        # every call leaves a Chrome/chromedriver process running.
	        driver.quit()

	    # Name the parser explicitly so the result does not depend on which
	    # optional parsers are installed and bs4 emits no "no parser specified"
	    # warning on stderr.
	    soup = BeautifulSoup(html, "html.parser")
	    selected = soup.select(selector)
	    if not selected:
	        print("Selector {} not found at {}".format(selector, url))
	        return None
	    if len(selected) > 1:
	        print("Found multiple items with selector {} at {}. Selecting first.".format(selector, url))

	    return selected[0].prettify()

	# For every watched page: fetch the selected element, compare it with the
	# snapshot stored by the previous run, print a diff if it changed, and store
	# the new snapshot.
	for url, selector in pages:
	    # The snapshot file is named after the MD5 of url + selector, so each
	    # (url, selector) pair gets its own file.
	    identifier = hashlib.md5((url+selector).encode('utf-8')).hexdigest()
	    file_name = "{}{}.txt".format(state_directory, identifier)

	    html_new = get_element_from_url(url, selector)

	    # could not get content
	    if not html_new:
	        continue

	    # save new input and continue, we will check diff next time
	    if not os.path.exists(file_name):
	        # Explicit UTF-8 so non-ASCII page content does not depend on the
	        # locale; the context manager closes (and flushes) the file.
	        with open(file_name, "w", encoding="utf-8") as f:
	            f.write(html_new)
	        continue

	    with open(file_name, "r", encoding="utf-8") as f:
	        html_old = f.read()

	    # The inputs come from splitlines() and carry no trailing newlines, so
	    # lineterm="" keeps the generated control lines newline-free as well
	    # (otherwise later "@@" hunk headers add blank lines to the output).
	    diff = difflib.unified_diff(
	        html_old.splitlines(),
	        html_new.splitlines(),
	        lineterm=""
	    )

	    # Drop the first three lines: the "---" and "+++" file headers and the
	    # first "@@" hunk header. An unchanged page yields no lines at all.
	    diff_output = '\n'.join(list(diff)[3:])

	    if diff_output:
	        print(' *** {} *** '.format(url))
	        print("\n" + diff_output + "\n")
	        # save new version to file
	        with open(file_name, "w", encoding="utf-8") as f:
	            f.write(html_new)