Skip to content

Instantly share code, notes, and snippets.

@ZoomTen
Created July 25, 2022 16:06
Show Gist options
  • Save ZoomTen/daaa30af2fb0b48938bc6f4942ff1000 to your computer and use it in GitHub Desktop.
Save ZoomTen/daaa30af2fb0b48938bc6f4942ff1000 to your computer and use it in GitHub Desktop.
Gitea to RSS scraper
#!/usr/bin/env python3
import html
import json
import re
import sys
from datetime import datetime, timedelta, timezone
from email.utils import format_datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Script version string; make_rss() embeds it in the feed's <generator> element.
VERSION = '2022.07.25.gitea'
# date funcs
def rfc822date(date: datetime) -> str:
    """Format a datetime as an RFC 822 / RFC 2822 date string for RSS.

    Args:
        date: The moment to format. Timezone-aware values are rendered with
            their numeric UTC offset (e.g. ``+0000``). Naive values are
            rendered with the conventional ``-0000`` zone, so the result
            always carries the zone field the RFC requires.

    Returns:
        A string such as ``"Mon, 25 Jul 2022 16:06:00 +0000"``.
    """
    # format_datetime always emits English day and month abbreviations,
    # whereas strftime('%a' / '%b') follows the process locale and would
    # produce an invalid feed date under a non-English LC_TIME.
    return format_datetime(date)
# utility funcs
def sanitize(tag):
    """Placeholder hook: performs no work and returns ``None``.

    Args:
        tag: Ignored.
    """
    return None
def escape(string):
    """Return ``string`` with HTML/XML special characters replaced by entities.

    Delegates to ``html.escape`` with quote escaping enabled, so ``&``, ``<``,
    ``>``, ``"`` and ``'`` are all converted.
    """
    escaped = html.escape(string, quote=True)
    return escaped
# minimal XML element serializer
def build_tree(tag, content, attrs=None):
    """Serialize ``content`` into an XML element string named ``tag``.

    Args:
        tag: Element name. It is written verbatim (not escaped).
        content: What goes inside the element:

            * ``str``: emitted as escaped text.
            * ``dict``: every ``key: value`` pair becomes a child element
              ``<key>`` built recursively from ``value``.
            * ``list``: every ``dict`` item is treated as a node description
              with ``"tag"``, ``"content"`` and optional ``"attrs"`` keys and
              becomes a child element; non-dict items are ignored.

            Any other type yields an empty element.
        attrs: Optional mapping of attribute name to value. All entries are
            emitted, in mapping order, with the values escaped.

    Returns:
        The element as a string, prefixed with a newline.
    """
    if isinstance(content, str):
        body = html.escape(content)
    elif isinstance(content, dict):
        body = "".join(
            build_tree(name, value) for name, value in content.items()
        )
    elif isinstance(content, list):
        body = "".join(
            build_tree(
                child.get("tag"),
                child.get("content"),
                attrs=child.get("attrs"),
            )
            for child in content
            if isinstance(child, dict)
        )
    else:
        body = ""

    # Accumulate every attribute (a plain assignment here would keep only
    # the last one) and escape the values so quotes or ampersands in them
    # cannot break the markup.
    attr_text = "".join(
        " %s=\"%s\"" % (name, html.escape(str(value)))
        for name, value in (attrs or {}).items()
    )
    return "\n<%s%s>%s</%s>" % (tag, attr_text, body, tag)
def parse_page(url):
    """Scrape a Gitea commit-list page into RSS ``<item>`` node descriptions.

    Args:
        url: Address of the commits page to fetch.

    Returns:
        A list with one ``{"tag": "item", "content": {...}}`` dict per row
        matched by the ``.commit-list > tr`` selector, in page order. Each
        ``content`` dict holds ``title``, ``link``, ``author``,
        ``description`` (the short SHA text) and ``pubDate``.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an
            HTTP error status.
        IndexError: If a row lacks one of the expected elements.
        KeyError: If an expected ``href`` / ``title`` attribute is missing.
    """
    # Without a timeout an unresponsive server would hang the script forever.
    with requests.get(url, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of parsing an error page into an
        # empty (but seemingly valid) feed.
        response.raise_for_status()
        page = BeautifulSoup(response.text, 'html.parser')

    items = []
    for row in page.select('.commit-list > tr'):
        href = row.select('.default-link')[0].attrs['href']
        items.append({
            "tag": "item",
            "content": {
                "title": row.select('.commit-summary')[0].text,
                # urljoin resolves root-relative hrefs against the page's
                # scheme and host, and leaves absolute hrefs intact.
                "link": urljoin(url, href),
                "author": row.select('.author a')[0].text,
                "description": row.select('.shortsha')[0].text,
                # NOTE(review): the title attribute is passed through
                # unchanged; confirm it is already an RFC 822 date.
                "pubDate": row.select('.time-since')[0].attrs['title'],
            },
        })
    return items
def make_rss(url):
    """Print an RSS 2.0 feed of the commits listed at ``url`` to stdout.

    The channel metadata (title, link, description, language, generator and
    build date) is followed by one ``<item>`` per commit returned by
    ``parse_page``.

    Args:
        url: Address of the Gitea commits page; also used as the channel link.

    Returns:
        None. The document is written to standard output.
    """
    # Fetch and parse before printing anything, so a failed request does not
    # leave a lone XML declaration on stdout.
    items = parse_page(url)

    channel = [
        {"tag": "title", "content": "Gitea"},
        {"tag": "link", "content": url},
        {"tag": "description", "content": "Latest commits from a gitea repo"},
        {"tag": "language", "content": "en-US"},
        {"tag": "generator", "content": "mkfeed.py %s" % VERSION},
        {"tag": "lastBuildDate", "content": rfc822date(datetime.now(timezone.utc))},
    ]
    channel.extend(items)

    document = build_tree("rss", {"channel": channel}, attrs={"version": "2.0"})

    # build_tree output starts with a newline, which separates the XML
    # declaration from the root element.
    print("<?xml version=\"1.0\" encoding=\"utf-8\"?>" + document)
if __name__ == '__main__':
    # Script entry point: expects the commits-page URL as the only argument.
    if len(sys.argv) < 2:
        # Usage goes to stderr because stdout carries the feed itself.
        print('./gitea "https://<Gitea instance>/<Author>/<Repo>/commits/branch/<Branch>"', file=sys.stderr)
        sys.exit(2)
    try:
        make_rss(sys.argv[1])
    except Exception as exc:
        # Report the real failure (network, HTTP status, unexpected markup)
        # instead of masking it behind the usage text, and signal it through
        # the exit status. KeyboardInterrupt/SystemExit propagate untouched.
        print('error: %s' % exc, file=sys.stderr)
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment