Skip to content

Instantly share code, notes, and snippets.

@linuskohl
Created August 20, 2019 16:01
Show Gist options
  • Save linuskohl/e5bfba277499e6938f870fe02e772adf to your computer and use it in GitHub Desktop.
Save linuskohl/e5bfba277499e6938f870fe02e772adf to your computer and use it in GitHub Desktop.
Tiny script to send notification emails on new openings on H-Soz-Kult
#!/usr/bin/env python
# coding: utf-8
import logging
import os
import sqlite3
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from smtplib import SMTP
from sqlite3 import IntegrityError
from string import Template

import feedparser
import lxml
import requests
from lxml import html
__author__ = "Linus Kohl"
__email__ = "linus@munichresearch.com"
__license__ = "GPLv3"
# --- Mail settings ---------------------------------------------------------
# Unconnected SMTP client; the session is opened at the bottom of the script.
smtp = SMTP()
smtp_server = "XXX"
smtp_port = 587
smtp_username = "XXX"
smtp_password = "XXX"

# Address that receives the notification emails.
recipient = "XXX"
# Address the notification emails are sent from.
sender = "XXX"
subject = 'New opening'

# --- Files and feed --------------------------------------------------------
# Mail body template (string.Template syntax) and the SQLite file that
# remembers which feed links were already handled.
template = "./message.txt"
db_file_name = './stellen.db'
hsozkult_feed = 'https://www.hsozkult.de/job/rss?page=2'

# --- XPaths ----------------------------------------------------------------
# The location/institution/deadline/link expressions differ only in the row
# index below the "hfn-item-metafull" container, so they share one prefix.
_META_VALUE = "//div[contains(@class, 'hfn-item-metafull')]/div[{row}]/div[2]"

XPATH_TITLE = "//h2/text()"
XPATH_LOCATION = _META_VALUE.format(row=1) + "/text()"
XPATH_INSTITUTION = _META_VALUE.format(row=2) + "/text()"
XPATH_DEADLINE = _META_VALUE.format(row=3) + "/text()"
XPATH_LINK = _META_VALUE.format(row=4) + "/a/text()"
XPATH_TYPE = '//*[@id="hfn-item-sidebar-metainfo"]/div[6]/div[2]/a/text()'
XPATH_CONTENT = "//div[contains(@class, 'hfn-item-fulltext')]/descendant::*/text()"
def init_db(connection):
    """Create the ``processed`` table if it does not exist yet.

    The table has a single ``link`` column that is also the primary key, so
    each link can be stored at most once.

    Args:
        connection: An open ``sqlite3`` connection.
    """
    # IF NOT EXISTS makes the call idempotent: running it against a database
    # that already has the table is a no-op instead of an OperationalError.
    connection.execute(
        "CREATE TABLE IF NOT EXISTS processed (link VARCHAR(500) PRIMARY KEY);"
    )
    connection.commit()
def connect_db():
    """Open the SQLite database at ``db_file_name``.

    When the file does not exist before connecting, the schema is created
    through ``init_db`` on the fresh connection.

    Returns:
        The open ``sqlite3`` connection.
    """
    # Must be checked before connecting: sqlite3.connect creates the file.
    is_new_database = not os.path.isfile(db_file_name)
    db = sqlite3.connect(db_file_name)
    if is_new_database:
        init_db(db)
    return db
def already_processed(connection, link):
    """Report whether ``link`` is already stored in the ``processed`` table.

    Args:
        connection: An open ``sqlite3`` connection.
        link: The link to look up.

    Returns:
        ``True`` if a row with exactly this link exists, ``False`` otherwise.
    """
    # The link comes from an external feed, so it is bound as a parameter
    # rather than spliced into the SQL text. This also avoids SQLite treating
    # a double-quoted value as a column name (e.g. link="link").
    cursor = connection.execute(
        "SELECT 1 FROM processed WHERE link = ? LIMIT 1;", (link,)
    )
    return cursor.fetchone() is not None
def set_processed(connection, link):
    """Record ``link`` in the ``processed`` table and commit.

    Storing a link that is already present is silently ignored.

    Args:
        connection: An open ``sqlite3`` connection.
        link: The link to store.
    """
    try:
        # Bound parameter instead of string formatting: links containing
        # quotes no longer break (or inject into) the statement.
        connection.execute("INSERT INTO processed (link) VALUES (?);", (link,))
        connection.commit()
    except IntegrityError:
        # Primary-key violation: the link was recorded earlier; nothing to do.
        pass
def read_template(filename):
    """Load a UTF-8 text file and wrap its content in a ``string.Template``.

    Args:
        filename: Path of the template file.

    Returns:
        A ``Template`` built from the complete file content.
    """
    with open(filename, encoding='utf-8') as source:
        return Template(source.read())
def extract_information(dom, path):
    """Return the first XPath match as cleaned text, or ``None``.

    The first result of ``dom.xpath(path)`` is stripped of surrounding
    whitespace and all tab characters are removed from it.

    Args:
        dom: Object exposing an ``xpath(path)`` method.
        path: XPath expression to evaluate.

    Returns:
        The cleaned string, or ``None`` when there is no match or the lookup
        fails.
    """
    try:
        matches = dom.xpath(path)
        # "No match" is the ordinary case; handle it without an exception.
        if not matches:
            return None
        return matches[0].strip().replace('\t', '')
    except Exception:
        # Extraction stays best-effort (a missing field must not abort the
        # whole posting), but KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were by the former bare ``except``.
        return None
def parse_feed(url):
    """Parse the feed at ``url`` with feedparser and return its entries.

    Args:
        url: Location of the feed.

    Returns:
        The ``entries`` attribute of the parsed feed.
    """
    return feedparser.parse(url).entries
def process_position(url, timeout=30):
    """Download a posting page and extract its metadata fields.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Returns:
        A dict with the keys ``title``, ``location``, ``institution``,
        ``deadline``, ``link`` and ``type``. A field that cannot be found on
        the page is ``None``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Do not scrape an error page as if it were a posting.
    response.raise_for_status()
    dom = html.fromstring(response.text)
    return {
        'title': extract_information(dom, XPATH_TITLE),
        'location': extract_information(dom, XPATH_LOCATION),
        'institution': extract_information(dom, XPATH_INSTITUTION),
        'deadline': extract_information(dom, XPATH_DEADLINE),
        'link': extract_information(dom, XPATH_LINK),
        'type': extract_information(dom, XPATH_TYPE),
    }
def send_email(data, recipient):
    """Render the mail template for one posting and send it.

    The module-level ``mail_template`` is filled with the placeholders
    ``TITLE``, ``POSITION``, ``LOCATION``, ``INSTITUTION``, ``DEADLINE`` and
    ``LINK``; the message is sent through the module-level ``smtp`` session
    using the configured ``sender`` and ``subject``.

    Args:
        data: Dict with the keys ``title``, ``type``, ``location``,
            ``institution``, ``deadline`` and ``link``.
        recipient: Address placed in the ``To`` header.

    Raises:
        KeyError: If ``data`` lacks a key or the template uses a placeholder
            that is not supplied.
        smtplib.SMTPException: If the message cannot be delivered.
    """
    # Failures are deliberately NOT swallowed here: the caller must learn
    # that delivery failed, otherwise it records the posting as handled and
    # the notification is lost for good.
    body = mail_template.substitute(TITLE=data['title'],
                                    POSITION=data['type'],
                                    LOCATION=data['location'],
                                    INSTITUTION=data['institution'],
                                    DEADLINE=data['deadline'],
                                    LINK=data['link'])
    msg = MIMEMultipart()
    msg['From'] = sender
    msg['To'] = recipient
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'plain'))
    smtp.send_message(msg)
def process_feed(connection, entries, recipient):
    """Send a notification for every feed entry not handled before.

    For each entry whose ``link`` is not yet in the database, the page is
    scraped, an email is sent, and the link is recorded. If any step raises,
    the entry is not recorded and the loop continues with the next entry.

    Args:
        connection: An open ``sqlite3`` connection holding the ``processed``
            table.
        entries: Iterable of feed entries exposing a ``link`` attribute.
        recipient: Address the notifications are sent to.
    """
    log = logging.getLogger(__name__)
    for entry in entries:
        try:
            if already_processed(connection, entry.link):
                continue
            data = process_position(entry.link)
            send_email(data, recipient)
            set_processed(connection, entry.link)
        except Exception:
            # One broken entry must not stop the remaining ones, but the
            # failure is reported instead of vanishing. ``Exception`` (not a
            # bare except) lets Ctrl-C / SystemExit end the run.
            log.exception("Failed to process feed entry %r",
                          getattr(entry, 'link', entry))
# Script body: open the SMTP session, load the template and the database,
# then notify about every feed entry that has not been handled yet.
smtp.connect(smtp_server, port=smtp_port)
try:
    smtp.ehlo()
    # Upgrade to TLS before the credentials go over the wire; EHLO has to be
    # repeated after STARTTLS.
    smtp.starttls()
    smtp.ehlo()
    smtp.login(smtp_username, smtp_password)
    mail_template = read_template(template)
    connection = connect_db()
    try:
        entries = parse_feed(hsozkult_feed)
        process_feed(connection, entries, recipient)
    finally:
        # Close the database even when feed processing fails.
        connection.close()
finally:
    # Always end the SMTP session. A server that already dropped the
    # connection makes quit() raise (SMTPException derives from OSError);
    # that is harmless at this point and must not mask an earlier error.
    try:
        smtp.quit()
    except OSError:
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment