Skip to content

Instantly share code, notes, and snippets.

@DanielOaks
Created November 26, 2017 14:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DanielOaks/ab92a5c69c4b5ba5934d78cb5d20de74 to your computer and use it in GitHub Desktop.
Save DanielOaks/ab92a5c69c4b5ba5934d78cb5d20de74 to your computer and use it in GitHub Desktop.
Making archives of Usenet backups
#!/usr/bin/env python3
"""gensite.py

Usage:
    gensite.py <group> <source-dir>
    gensite.py -h | --help
    gensite.py --version

Options:
    <source-dir>  Where the directory we'll be grabbing files from exists.
    -h --help     Show this screen.
    --version     Show version.
"""
import email
import html
import os

import dateparser
import dateutil.parser
import pytz
from docopt import docopt
# Textual fix-ups applied, in this order, to a raw Date header before it is
# handed to the date parsers.  Order matters because the replacements are
# plain substring substitutions.
_DATE_FIXUPS = (
    ('UNDEFINED', 'UTC'),
    ('pacific', 'PST'),
    ('Central', 'CST'),
    ('PACIFIC', 'PST'),
    ('est', 'EST'),
    ('-600', '-0600'),
)

# Far-future, timezone-aware sentinel used as the starting "earliest post"
# time of every thread; any real post date compares lower than this.
_NO_DATE = dateutil.parser.parse('3000-12-25').replace(tzinfo=pytz.UTC)


def _read_posts(thread_dir):
    """Parse every file below *thread_dir* as an email message.

    Returns a dict mapping file name -> ``email.message.Message``.  Files in
    nested directories are included; a later file with the same name
    replaces an earlier one.
    """
    posts = {}
    for dpath, _dnames, fnames in os.walk(thread_dir):
        for fname in fnames:
            # ``with`` closes the handle deterministically; errors='replace'
            # keeps one badly-encoded post from aborting the whole run.
            with open(os.path.join(dpath, fname), errors='replace') as handle:
                posts[fname] = email.message_from_string(handle.read())
    return posts


def _parse_post_time(raw_date, fallback):
    """Turn a Date header into a timezone-aware datetime.

    *raw_date* is the header value, or None when the post has no Date
    header.  *fallback* is returned when the header is missing or neither
    parser can make sense of it.  Naive results are assumed to be UTC.
    """
    if raw_date is None:
        return fallback

    date_string = raw_date.strip()
    for old, new in _DATE_FIXUPS:
        date_string = date_string.replace(old, new)

    try:
        post_time = dateutil.parser.parse(date_string)
    except Exception:
        # Best effort: whatever dateutil chokes on gets a second chance with
        # dateparser.  Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        post_time = dateparser.parse(date_string)

    if post_time is None:
        print('fucking date format is broke:', date_string)
        return fallback
    if post_time.tzinfo is None:
        post_time = post_time.replace(tzinfo=pytz.UTC)
    return post_time


def _thread_name(subject):
    """Return the first line of *subject* with leading 'Re: ' prefixes removed.

    A missing Subject header (None) yields an empty name.
    """
    name = (subject or '').split('\n')[0]
    while name.startswith('Re: '):
        name = name[4:]
    return name


arguments = docopt(__doc__, version='gensite.py 1.0')

# sorted_threads: [earliest_post_time, thread_id] pairs, one per thread.
# threads: thread_id -> {'name', 'datetime', 'posts', 'sorted_posts'}.
sorted_threads = []
threads = {}

for dirpath, dirnames, _filenames in os.walk(arguments['<source-dir>']):
    for dirname in dirnames:
        posts = _read_posts(os.path.join(dirpath, dirname))
        if not posts:
            # A directory without any files has no first post to name the
            # thread after, so it is left out of the archive.
            continue

        # Track the earliest post time seen so far.  Posts whose date is
        # missing or unparseable inherit the current value of thread_time.
        thread_time = _NO_DATE
        dated_posts = []
        for post_id, info in posts.items():
            post_time = _parse_post_time(info['Date'], thread_time)
            dated_posts.append([post_time, post_id])
            if post_time < thread_time:
                thread_time = post_time
        dated_posts.sort()

        first_post = posts[dated_posts[0][1]]
        threads[dirname] = {
            'name': _thread_name(first_post['Subject']),
            'datetime': thread_time,
            'posts': posts,
            'sorted_posts': dated_posts,
        }
        sorted_threads.append([thread_time, dirname])
# Output layout: gen-<group>/index.html plus gen-<group>/threads/<id>.html.
output_dir = 'gen-{}'.format(arguments['<group>'])
threads_dir = os.path.join(output_dir, 'threads')
# Creating the nested directory also creates output_dir; exist_ok allows
# re-running the script over a previous build.
os.makedirs(threads_dir, exist_ok=True)

# Everything below that originates from the command line, directory names or
# message content is escaped before it is placed into HTML: Usenet posts are
# untrusted input and routinely contain '<', '>' and '&'.
board_html = html.escape(arguments['<group>'])

# make index
thread_index_html = ''.join(
    '<div>{} - <a href="./threads/{}.html">{}</a> - {} replies</div>\n'.format(
        thread_time,
        html.escape(thread_id),
        html.escape(threads[thread_id]['name']),
        len(threads[thread_id]['posts']),
    )
    for thread_time, thread_id in sorted(sorted_threads)
)
with open(os.path.join(output_dir, 'index.html'), 'w', encoding='utf-8') as index:
    index.write("""
<html>
<head>
<meta charset="utf-8">
</head>
<body>
<h1>{board}</h1>
{threads}
</body>
</html>
""".format(board=board_html, threads=thread_index_html))
# TODO(dan): make paginated thread index

# make threads themselves
for thread_id, thread in threads.items():
    # One monospace <div> per post, oldest first, each followed by a rule.
    # The raw message (headers included) is escaped first and only then are
    # newlines turned into <br/>, so the inserted tags survive.
    posts_html = ''.join(
        '<div style="font-family: monospace">{}</div><hr/>\n'.format(
            html.escape(str(thread['posts'][post_id])).replace('\n', '<br/>')
        )
        for _post_time, post_id in thread['sorted_posts']
    )
    page_path = os.path.join(threads_dir, '{}.html'.format(thread_id))
    with open(page_path, 'w', encoding='utf-8') as page:
        page.write("""
<html>
<head>
<meta charset="utf-8">
</head>
<body>
<h1>{board}</h1>
<h2>{thread_name}</h2>
<span class="date">{datetime}</span>
{posts}
</body>
</html>
""".format(
            board=board_html,
            thread_name=html.escape(thread['name']),
            datetime=thread['datetime'],
            posts=posts_html,
        ))

print('I have', len(threads), 'threads')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment