Created
February 5, 2021 11:10
-
-
Save imolein/79b3fadb8f54ad6988551f49d6b44d22 to your computer and use it in GitHub Desktop.
Generiert einen RSS feed aus den Pressemitteilungen des Landkreis Meissen
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env lua | |
-- Generates an RSS feed from the press releases of the district of Meissen (Landkreis Meißen).
-- It is a pity that the website does not offer one itself...
--
-- Dependencies:
-- * lua - https://lua.org | |
-- * luasocket - https://github.com/diegonehab/luasocket | |
-- * lua-htmlparser - https://github.com/msva/lua-htmlparser | |
-- * etlua - https://github.com/leafo/etlua | |
-- * date - https://github.com/Tieske/date | |
local http = require('socket.http') | |
local htmlparser = require('htmlparser') | |
local etlua = require('etlua') | |
local date = require('date') | |
-- Base address of the district website; page and article paths are appended to it.
local URL = 'http://www.kreis-meissen.org'

-- Channel metadata handed to the feed template.
local DESC = 'RSS Feed der Pressemitteilungen Kreis Meißen.'
local GEN  = 'km_press_to_rss.lua'

-- Files used by this script, all relative to the working directory.
local TEMPLATE_FILE = './template.xml'
local OUT           = './km_pressemitteilungen.xml'
local LOG_FILE      = './km_press_to_rss.log'

-- User-Agent string used by socket.http for its requests.
http.USERAGENT = 'km_press_to_rss/0.0.0 (Bitte stellt einen RSS Feed bereit)'

-- Log handle opened in append mode; assert aborts the script when opening fails.
local logfile_fh = assert(io.open(LOG_FILE, 'a'))
--- Append one line of the form "<timestamp> <LEVEL> - <message>" to the log file.
-- @param level severity label; it is written in upper case
-- @param msg   message text, or a string.format pattern when further
--              arguments are supplied
-- @param ...   optional values substituted into msg
local function logger(level, msg, ...)
  -- Only run string.format when there is something to substitute, so a
  -- message that merely contains a literal '%' cannot raise a format error.
  if select('#', ...) > 0 then
    msg = msg:format(...)
  end
  logfile_fh:write(('%s %s - %s\n'):format(os.date(), level:upper(), msg))
end
--- Build an RFC 822 date-time string from the date found in an article title.
-- @param raw array of captures; raw[3], raw[2], raw[1] are handed to the date
--            library in that order (i.e. raw is { day, month, year })
-- @return the formatted string, or nothing when the date library raises an error
local function rfc_822_date_time(raw)
  local day, month, year = raw[1], raw[2], raw[3]
  local succeeded, parsed = pcall(date, year, month, day)
  if succeeded then
    return parsed:fmt('%a, %d %b %Y %T %z')
  end
end
-- Return the first node below `node` that matches `selector`, or nil if none does.
local function first_match(node, selector)
  return node:select(selector)[1]
end

--- Create one table per press release and append it to data.articles.
-- Boxes without a heading or without a link are logged and skipped instead of
-- aborting the whole run; a missing paragraph only leaves `content` unset.
-- @param parsed root node returned by htmlparser.parse
-- @param data   feed table; its `articles` array is extended in place
-- @return the same `data` table
local function get_articles(parsed, data)
  for _, element in ipairs(parsed:select('div.inhaltsbereich-box')) do
    local heading = first_match(element, 'div.border_o_r > h2')
    local anchor = first_match(element, 'a')
    local href = anchor and anchor.attributes.href

    if heading and href then
      local title = heading:getcontent()
      -- Titles are expected to start with "DD.MM.YYYY "; without such a prefix
      -- the capture list is empty and the article simply gets no date.
      local rfc_date = rfc_822_date_time({ title:match('^(%d+)%.(%d+)%.(%d+)%s+.*$') })
      logger('info', 'Found article %q', title)

      local content
      local paragraph = first_match(element, 'p')
      if paragraph then
        -- The content is emitted unescaped into the XML feed, where the HTML
        -- entity &nbsp; is undefined; replace it (and the raw UTF-8
        -- non-breaking space, bytes 194 160) with a plain space.
        content = paragraph:getcontent():gsub('&nbsp;', ' '):gsub('\194\160', ' ')
      end

      data.articles[#data.articles + 1] = {
        title = title,
        content = content,
        link = URL .. href,
        date = rfc_date
      }
    else
      logger('warn', 'Skipping article box without heading or link')
    end
  end

  return data
end
--- Parse the downloaded page and assemble the table handed to the feed template.
-- @param raw_html HTML source of the press release page
-- @return the feed table (title, url, description, generator, articles), or
--         nothing when parsing fails or the page has no <title> element
local function parse_html(raw_html)
  local ok, parsed = pcall(htmlparser.parse, raw_html)
  if not ok then
    -- tostring: the value raised by the parser is not guaranteed to be a string
    logger('error', 'Parsing received raw HTML failed: %s', tostring(parsed))
    return
  end

  -- Guard the lookup: indexing a missing <title> would raise instead of
  -- taking the regular failure path.
  local title_node = parsed:select('title')[1]
  if not title_node then
    logger('error', 'Parsed HTML contains no <title> element')
    return
  end

  logger('info', 'Successful parsed raw HTML, now picking needed data out of it')

  local data = {
    title = title_node:getcontent(),
    url = URL .. '/61.html',
    description = DESC,
    generator = GEN,
    articles = {}
  }

  return get_articles(parsed, data)
end
--- Download the press release page and turn it into the feed table.
-- Terminates the process with exit status 1 when the request does not
-- answer with HTTP status 200.
-- @return the result of parse_html for the downloaded page
local function get_data_from_url()
  local body, code = http.request(URL .. '/61.html')

  if code ~= 200 then
    -- On a transport failure socket.http returns nil plus an error message in
    -- the second position, so `code` explains the failure in either case.
    logger('error', 'Failed to receive website: %s', tostring(code))
    os.exit(1)
  end

  logger('info', 'Successful received press website')
  return parse_html(body)
end
--- Read TEMPLATE_FILE and compile it with etlua.
-- Raises an error when the file cannot be opened or the template does not compile.
-- @return the compiled template function
local function load_template()
  local fh = assert(io.open(TEMPLATE_FILE, 'r'))
  local template_xml = fh:read('*a')
  fh:close()

  -- etlua.compile reports template syntax errors as nil plus a message;
  -- fail here with that message instead of returning nil to the caller.
  local render, err = etlua.compile(template_xml)
  if not render then
    error(('Compiling %s failed: %s'):format(TEMPLATE_FILE, tostring(err)))
  end

  return render
end
-- Entry point: load the template, fetch and parse the page, write the feed.
local template = load_template()
local data = get_data_from_url()
logfile_fh:close()

if not data then os.exit(1) end

-- Render before opening OUT: mode 'w' truncates the file immediately, so a
-- failing render must not be able to wipe the previously generated feed.
local feed = assert(template(data))

-- Write the generated RSS feed to disk, failing loudly on I/O errors.
local fh = assert(io.open(OUT, 'w'))
assert(fh:write(feed))
fh:close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="utf-8"?>
<!-- etlua template for the RSS 2.0 feed. Article bodies are inserted unescaped
     and may contain HTML markup, so they are wrapped in CDATA to keep the
     feed well-formed XML. -->
<rss version="2.0">
  <channel>
    <title><%= title %></title>
    <description><%= description %></description>
    <link><%- url %></link>
    <generator><%= generator %></generator>
    <% if articles then %>
    <% for _, article in ipairs(articles) do %>
    <item>
      <title><%= article.title %></title>
      <description>
        <% if article.content then %>
        <![CDATA[<%- article.content %>]]>
        <% end %>
      </description>
      <link><%- article.link %></link>
      <% if article.date then %>
      <pubDate><%= article.date %></pubDate>
      <% end %>
    </item>
    <% end %>
    <% end %>
  </channel>
</rss>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment