Skip to content

Instantly share code, notes, and snippets.

@patrickbucher
Created June 20, 2022 19:13
Show Gist options
  • Save patrickbucher/9dc4f0e70db749747d6d3f43b6aaa87b to your computer and use it in GitHub Desktop.
Save patrickbucher/9dc4f0e70db749747d6d3f43b6aaa87b to your computer and use it in GitHub Desktop.
Exercise for DFDE Regexp Tutorial: Extract Thread Information from Sub-Forum
#!/usr/bin/env python3
from datetime import datetime
import re
import requests
import sys
def get_forum_text(forum_id):
    """Download the HTML of a debianforum.de sub-forum overview page.

    Args:
        forum_id: Value substituted for the ``f`` query parameter of
            ``viewforum.php``.

    Returns:
        The response body decoded as text.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with a 4xx/5xx status.
    """
    url = f'https://debianforum.de/forum/viewforum.php?f={forum_id}'
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of handing an error page to the HTML scraper.
    response.raise_for_status()
    return response.text
def group_lines_to_thread_sections(text):
    """Split page text into groups of lines, one group per thread row.

    A new group starts at every line containing ``<li class="row bg1"`` or
    ``<li class="row bg2"``. Every line is stripped of surrounding whitespace
    before being stored.

    Args:
        text: The full page text; it is split on ``'\\n'``.

    Returns:
        A list of lists of stripped lines. Any lines that precede the first
        start marker form a leading group of their own, and the last group
        runs to the end of the text. The result always holds at least one
        group.
    """
    start = re.compile(r'<li class="row bg[12]"')
    buf = []
    threads = []
    for line in text.split('\n'):
        if start.search(line):
            # A start marker closes the group collected so far (if any).
            if buf:
                threads.append(buf)
                buf = []
        buf.append(line.strip())
    # Flush the group that was still open when the input ended.
    threads.append(buf)
    return threads
if __name__ == '__main__':
    # Script entry point: fetch one sub-forum page, extract title, answer
    # count, starter and date of each thread row, and print the threads
    # newest first.
    if len(sys.argv) < 2:
        print(f'usage: {sys.argv[0]} [forum_id]')
        sys.exit(1)

    dev_forum_id = int(sys.argv[1])
    full_text = get_forum_text(dev_forum_id)
    thread_sections = group_lines_to_thread_sections(full_text)

    title_re = re.compile(r'class="topictitle">([^<]+)</a>')
    answers_re = re.compile(r'<strong>([0-9]+)</strong>')
    starter_re = re.compile(r'>([^<]+)</a>')
    # The date must be the last thing on its line (note the trailing `$`).
    date_re = re.compile(r'(\d{2}\.\d{2}\.\d{4} \d{1,2}:\d{2}:\d{2})$')
    date_fmt = '%d.%m.%Y %H:%M:%S'

    # field name -> (line index within a section, regex, converter).
    # NOTE(review): the indices 4/10/13 presumably mirror the forum's current
    # HTML layout -- confirm if the markup changes.
    extract_from_lineno_re_conv = {
        'title': (4, title_re, lambda t: t),
        'n_answers': (10, answers_re, int),
        'starter': (13, starter_re, lambda s: s),
        'date': (13, date_re, lambda d: datetime.strptime(d, date_fmt)),
    }

    entries = []
    for sec in thread_sections:
        entry = {}
        for field, (n, r, f) in extract_from_lineno_re_conv.items():
            # Sections shorter than expected would raise IndexError on
            # sec[n]; treat them like a failed match instead.
            m = r.search(sec[n]) if n < len(sec) else None
            entry[field] = f(m[1]) if m else '???'
        # Only keep sections with a parsed date: the sort below needs
        # datetime values, and non-thread sections have no date.
        if entry['date'] != '???':
            entries.append(entry)

    for e in sorted(entries, key=lambda e: e['date'], reverse=True):
        au = e['starter']
        at = e['date'].strftime(date_fmt)
        ti = e['title']
        an = e['n_answers']
        print(f'{au} ({at}): {ti} ({an} Antworten)')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment