Created
September 23, 2013 18:29
-
-
Save omz/6674820 to your computer and use it in GitHub Desktop.
Simple converter script to get machine-readable JSON from a Glassboard HTML export
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python

"""
This script converts a Glassboard HTML archive to JSON.

Usage:
  python glassboard2json.py glassboard_export/index.html -o output_file.json

Requirement: BeautifulSoup4 (bs4)

Notes: Nested replies are not fully supported; all posts that belong to a thread are exported as a flat list.
       Attachments/images are not supported at all.

The output has the following structure:

{
  "threads": [
    [
      {
        "username": "Board Owner",
        "date": "2012-10-18 01:25:00",
        "text": "Board created for Board Owner"
      }
    ],
    [
      {
        "username": "Board Owner",
        "date": "2012-10-18 01:30:00",
        "text": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt."
      },
      {
        "username": "User 1",
        "date": "2012-10-18 01:32:00",
        "text": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt."
      }
    ]
  ],
  "users": [
    "Board Owner",
    "User 1"
  ],
  "title": "Board Title"
}
"""

def main():
    """Convert a Glassboard HTML export into JSON.

    Command line:
        input_path      the index.html file from the Glassboard export
        -o / --output   file to write the JSON to (default: stdout)

    Every element whose class matches ``(status|comment)Div`` is treated as a
    post. A ``statusDiv`` starts a new thread; any other matching post is
    appended to the thread currently being built. The first line of a post's
    text is expected to hold the username followed by a date such as
    ``October 18, 2012 - 01:25 AM GMT``; the remaining lines are the post
    text.

    Raises:
        ValueError: if a post's first line contains no month name, or its
            date does not match the expected format.
    """
    import argparse
    import json
    import re
    from datetime import datetime

    from bs4 import BeautifulSoup

    parser = argparse.ArgumentParser()
    parser.add_argument('input_path', help='The index.html file from the Glassboard export')
    parser.add_argument('-o', '--output', help='The output file (default: stdout)')
    args = parser.parse_args()
    input_path = args.input_path
    output_path = args.output

    with open(input_path, 'r') as f:
        html_doc = f.read()

    # Workaround:
    # Glassboard doesn't escape "<title>" tags in posts correctly, which causes bs4 (and Safari) to choke.
    # I'd guess that there are some other cases that the Glassboard exporter doesn't handle correctly.
    html_doc = html_doc.replace('<title>', '&lt;title&gt;')

    # Name the parser explicitly so the result does not depend on which
    # optional parsers (lxml, html5lib) happen to be installed.
    soup = BeautifulSoup(html_doc, 'html.parser')
    posts = soup.find_all('div', attrs={'class': re.compile(r'(status|comment)Div')})

    board_title_element = soup.find('span', attrs={'class': 'boardTitle'})
    # An export without a boardTitle span yields a null title instead of crashing.
    board_title = board_title_element.get_text() if board_title_element is not None else None

    # Group 1 is everything before the first month name (the username),
    # group 2 is the date string starting at that month name.
    metadata_pattern = re.compile(
        r'(.*?)((January|February|March|April|May|June|July|August'
        r'|September|October|November|December).*)'
    )

    current_thread = []
    all_threads = []
    all_users = set()
    for post in posts:
        post_class = post.attrs['class'][0]
        lines = post.get_text().strip().splitlines()
        metadata = lines[0] if lines else ''
        post_text = '\n'.join(lines[1:])

        match = metadata_pattern.search(metadata)
        if match is None:
            raise ValueError('Could not find "username date" metadata in post header: %r' % (metadata,))
        username = match.group(1).strip()
        all_users.add(username)
        dt = datetime.strptime(match.group(2), '%B %d, %Y - %I:%M %p GMT')

        entry = {'username': username, 'date': dt.isoformat(' '), 'text': post_text}
        if post_class == 'statusDiv':
            # A status post opens a new thread; flush the one in progress.
            if current_thread:
                all_threads.append(current_thread)
            current_thread = [entry]
        else:
            current_thread.append(entry)
    if current_thread:
        all_threads.append(current_thread)

    # Sort the users so repeated runs produce identical output.
    output = {'title': board_title, 'threads': all_threads, 'users': sorted(all_users)}
    if output_path:
        with open(output_path, 'w') as f:
            json.dump(output, f, indent=2)
    else:
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print(json.dumps(output, indent=2))
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment