Created
July 9, 2014 22:00
-
-
Save anonymous/92e90210e7d79d7590c8 to your computer and use it in GitHub Desktop.
Takes a Mefi Favorites Export file and outputs an HTML file containing the full content of all comments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import bs4 | |
import sys | |
import urllib2 | |
import codecs | |
def grab_comment(url, comment_id):
    """Fetch a thread page and return the HTML of a single comment.

    The comment is located by finding the ``<a name="...">`` anchor whose
    name equals ``str(comment_id)`` and taking the node that directly
    follows it, which must be a ``<div>`` whose ``class`` attribute
    contains ``comments``.

    Args:
        url: Address of the thread page to download.
        comment_id: Name of the anchor that precedes the comment.

    Returns:
        The prettified markup of the comment ``<div>``, or an empty string
        when the download raises ``IOError`` or the page does not have the
        expected structure. A message is printed in each of those cases.
    """
    print("[+] Retrieving comment #{} from {!r}...".format(comment_id, url))

    # A failed download should only cost this one comment, in line with the
    # "skip and carry on" handling of malformed pages below. urllib2's
    # URLError/HTTPError derive from IOError.
    try:
        thread = urllib2.urlopen(url)
    except IOError as err:
        print(" [!] Could not retrieve page ({}); skipping.".format(err))
        return ""

    # Release the connection as soon as the document has been parsed.
    try:
        soup = bs4.BeautifulSoup(thread)
    except IOError as err:
        print(" [!] Could not retrieve page ({}); skipping.".format(err))
        return ""
    finally:
        thread.close()

    anchor = soup.find("a", attrs={"name": str(comment_id)})

    # Sanity check: the anchor must exist.
    if anchor is None:
        print(" [!] Unexpected document structure; skipping.")
        return ""

    comment = anchor.next_sibling

    # Sanity check: the anchor may be the last node (sibling is None) or be
    # followed by something that is not a <div>; getattr covers both without
    # raising. A <div> with no class attribute is treated as a mismatch
    # rather than raising KeyError.
    if getattr(comment, "name", None) != "div":
        print(" [!] Unexpected document structure; skipping.")
        return ""
    if "comments" not in comment.get("class", []):
        print(" [!] Unexpected document structure; skipping.")
        return ""

    return comment.prettify()
def parse_favorites_file(filename):
    """Yield ``[url, comment_id]`` pairs from a favorites export file.

    The file is parsed as HTML. The links are taken from the first ``<dl>``
    that follows the ``<h3>`` whose text is ``MetaFilter Comments``; each
    link's ``href`` is split at its first ``#`` into the page address and
    the fragment.

    This is a generator, so nothing (including the progress message) happens
    until the first item is requested.

    Args:
        filename: Path of the favorites export file to read.

    Yields:
        Two-element lists ``[url, comment_id]``. Links that have no ``href``
        are ignored; links whose ``href`` has no ``#`` are reported and
        skipped, since a caller unpacking two values could not use them.

    Raises:
        ValueError: If the expected heading, or a ``<dl>`` after it, is not
            found in the document.
    """
    print("[+] Loading favorites file...")
    with open(filename) as f:
        soup = bs4.BeautifulSoup(f)

    comment_list_header = soup.find("h3", text="MetaFilter Comments")

    # Sanity check: the heading that introduces the comment list must exist.
    if comment_list_header is None:
        raise ValueError("Unexpected document structure")

    # Sanity check: report a missing list the same way as a missing heading
    # instead of failing with an AttributeError on None.
    link_list = comment_list_header.find_next("dl")
    if link_list is None:
        raise ValueError("Unexpected document structure")

    # href=True restricts the search to anchors that carry an href.
    for link in link_list.find_all("a", href=True):
        href = link["href"]
        # Split only at the first "#" so the result never has more than two
        # parts.
        parts = href.split("#", 1)
        if len(parts) != 2:
            print(" [!] Link {!r} has no comment anchor; skipping.".format(href))
            continue
        yield parts
if __name__ == "__main__":
    # Script entry point: exactly one argument is needed, the path of the
    # favorites export file.
    if not sys.argv[1:]:
        print("Usage: {} [path_to_favorites_export_file]".format(sys.argv[0]))
        sys.exit(1)

    # Opening of the generated document, including the styling applied to
    # each comment box.
    page_header = (
        "<html><head><style>"
        ".comments {border: 1px solid black; padding: 1em; margin: 1em;}"
        "</style></head><body>\n"
    )

    out = codecs.open("metafilter_favorites_fulltext.html", "w", encoding="utf-16")
    with out:
        out.write(page_header)
        # Each favorite is fetched and appended in turn, separated from the
        # previous one by a blank line.
        for page_url, anchor_name in parse_favorites_file(sys.argv[1]):
            out.write("\n\n")
            out.write(grab_comment(page_url, anchor_name))
        out.write("</body></html>")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment