Created
November 26, 2017 14:44
-
-
Save DanielOaks/ab92a5c69c4b5ba5934d78cb5d20de74 to your computer and use it in GitHub Desktop.
Making archives of Usenet backups
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""gensite.py
Usage:
    gensite.py <group> <source-dir>
    gensite.py -h | --help
    gensite.py --version
Options:
    <source-dir>  Where the directory we'll be grabbing files from exists.
    -h --help     Show this screen.
    --version     Show version.
"""
import email
import html
import os

import dateparser
import dateutil.parser
import pytz
from docopt import docopt
# Placeholder timestamp for posts whose Date header is missing or cannot be
# parsed. Being far in the future, it makes such posts sort after every
# dated post instead of depending on the order files were read from disk.
_UNDATED = dateutil.parser.parse('3000-12-25').replace(tzinfo=pytz.UTC)

# Literal substitutions applied, in this exact order, to a Date header before
# parsing. They rewrite zone spellings and a three-digit "-600" offset.
_DATE_FIXUPS = (
    ('UNDEFINED', 'UTC'),
    ('pacific', 'PST'),
    ('Central', 'CST'),
    ('PACIFIC', 'PST'),
    ('est', 'EST'),
    ('-600', '-0600'),
)


def _parse_post_date(raw_date):
    """Return a timezone-aware datetime for a post's Date header.

    Args:
        raw_date: The header value, or None when the post has no Date header.

    Returns:
        The parsed datetime (naive results are tagged as UTC), or None when
        the header is absent or neither parser can make sense of it.
    """
    if raw_date is None:
        return None

    date_string = raw_date.strip()
    for old, new in _DATE_FIXUPS:
        date_string = date_string.replace(old, new)

    try:
        post_time = dateutil.parser.parse(date_string)
    except Exception:
        # Best-effort fallback to the second parser on any parsing failure.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        post_time = dateparser.parse(date_string)

    if post_time is None:
        print('fucking date format is broke:', date_string)
        return None
    if post_time.tzinfo is None:
        # Aware and naive datetimes cannot be compared, so pin naive ones.
        post_time = post_time.replace(tzinfo=pytz.UTC)
    return post_time


def _load_posts(thread_dir):
    """Parse every file found under *thread_dir* into an email message.

    Returns:
        A dict mapping each file name to its parsed message.
    """
    posts = {}
    for dpath, _dnames, fnames in os.walk(thread_dir):
        for post in fnames:
            # errors='replace' keeps one badly-encoded post from aborting the
            # whole run; the ``with`` block closes the file promptly.
            with open(os.path.join(dpath, post), errors='replace') as post_file:
                posts[post] = email.message_from_string(post_file.read())
    return posts


arguments = docopt(__doc__, version='gensite.py 1.0')

# sorted_threads holds [thread_time, dirname] pairs; threads maps a directory
# name to that thread's name, earliest timestamp, posts and post ordering.
sorted_threads = []
threads = {}
for dirpath, dirnames, _filenames in os.walk(arguments['<source-dir>']):
    for dirname in dirnames:
        thread_dir = os.path.join(dirpath, dirname)
        posts = _load_posts(thread_dir)
        if not posts:
            # With no posts there is nothing to take a name or date from.
            print('skipping empty thread directory:', thread_dir)
            continue

        dated_posts = []
        for post_id, info in posts.items():
            post_time = _parse_post_date(info['Date'])
            if post_time is None:
                post_time = _UNDATED
            dated_posts.append([post_time, post_id])
        dated_posts.sort()

        # The earliest post dates the thread and supplies its subject.
        thread_time, first_post_id = dated_posts[0]

        # A post may lack a Subject header; treat that as an empty name.
        subject = (posts[first_post_id]['Subject'] or '').split('\n')[0]
        while subject.startswith('Re: '):
            subject = subject[4:]

        threads[dirname] = {
            'name': subject,
            'datetime': thread_time,
            'posts': posts,
            'sorted_posts': dated_posts,
        }
        sorted_threads.append([thread_time, dirname])
# make index
# open() does not create missing directories, so make the output dir first.
output_dir = 'gen-{}'.format(arguments['<group>'])
os.makedirs(output_dir, exist_ok=True)

# One row per thread, ordered by sorted_threads' [time, thread_id] pairs.
# Thread names come from post headers and ids from directory names, so both
# are HTML-escaped before being placed into markup.
index_rows = []
for time, thread_id in sorted(sorted_threads):
    thread = threads[thread_id]
    index_rows.append(
        '<div>{} - <a href="./threads/{}.html">{}</a> - {} replies</div>\n'.format(
            time,
            html.escape(thread_id),
            html.escape(thread['name']),
            len(thread['posts']),
        )
    )
thread_index_html = ''.join(index_rows)

with open(os.path.join(output_dir, 'index.html'), 'w') as index:
    index.write("""
<html>
<body>
<h1>{board}</h1>
{threads}
</body>
</html>
""".format(board=html.escape(arguments['<group>']), threads=thread_index_html))
#TODO(dan): make paginated thread index
# make threads themselves
# open() does not create missing directories, so make the threads dir first.
threads_dir = os.path.join('gen-{}'.format(arguments['<group>']), 'threads')
os.makedirs(threads_dir, exist_ok=True)

for thread_id, thread in threads.items():
    # Render each post in its stored order. The raw message text is escaped
    # before newlines become <br/>, so markup inside a post shows up as text
    # instead of being interpreted by the browser.
    posts_html = ''.join(
        '<div style="font-family: monospace">{}</div><hr/>\n'.format(
            html.escape(str(thread['posts'][post_id])).replace('\n', '<br/>')
        )
        for _post_time, post_id in thread['sorted_posts']
    )

    with open(os.path.join(threads_dir, '{}.html'.format(thread_id)), 'w') as page:
        page.write("""
<html>
<body>
<h1>{board}</h1>
<h2>{thread_name}</h2>
<span class="date">{datetime}</span>
{posts}
</body>
</html>
""".format(
            board=html.escape(arguments['<group>']),
            thread_name=html.escape(thread['name']),
            datetime=thread['datetime'],
            posts=posts_html,
        ))

print('I have', len(threads), 'threads')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment