Skip to content

Instantly share code, notes, and snippets.

Created July 9, 2014 22:00
Show Gist options
  • Save anonymous/92e90210e7d79d7590c8 to your computer and use it in GitHub Desktop.
Save anonymous/92e90210e7d79d7590c8 to your computer and use it in GitHub Desktop.
Takes a MetaFilter (MeFi) Favorites Export file and outputs an HTML file containing the full content of every favorited MetaFilter comment listed in it.
#!/usr/bin/env python
import bs4
import sys
import urllib2
import codecs
def grab_comment(url, comment_id):
    """Fetch a thread page and return the markup of a single comment.

    Downloads ``url``, looks up the ``<a>`` element whose ``name`` attribute
    equals ``comment_id`` and returns the ``<div class="comments">`` element
    that directly follows it.

    Args:
        url: Address of the thread page that contains the comment.
        comment_id: Anchor name of the comment; converted with ``str``
            before the lookup.

    Returns:
        The comment element rendered with ``prettify()``, or ``""`` when
        the anchor is missing or is not directly followed by a
        ``<div>`` carrying the ``comments`` class.
    """
    print("[+] Retrieving comment #{} from {!r}...".format(comment_id, url))
    thread = urllib2.urlopen(url)
    try:
        soup = bs4.BeautifulSoup(thread)
    finally:
        # Release the connection even when parsing raises.
        thread.close()

    anchor = soup.find("a", attrs={"name": str(comment_id)})
    comment = anchor.next_sibling if anchor is not None else None

    # Sanity check: the sibling may be absent or a bare text node, and a
    # <div> may have no class attribute at all, so none of the lookups
    # below are allowed to raise.
    is_comment_div = (
        isinstance(comment, bs4.Tag)
        and comment.name == "div"
        and "comments" in comment.get("class", [])
    )
    if not is_comment_div:
        print(" [!] Unexpected document structure; skipping.")
        return ""
    return comment.prettify()
def parse_favorites_file(filename):
    """Yield ``[url, comment_id]`` for each comment link in an export file.

    Parses ``filename`` as HTML, locates the ``<h3>`` heading whose text is
    "MetaFilter Comments", and walks the ``<a>`` elements of the first
    ``<dl>`` that follows it. Each ``href`` is split at its first ``#``
    into the page address and the comment anchor.

    This is a generator: the file is only read, and ``ValueError`` only
    raised, once iteration starts.

    Args:
        filename: Path of the favorites export file.

    Yields:
        Two-element lists ``[url, comment_id]``.

    Raises:
        ValueError: If the heading, or the ``<dl>`` after it, is missing.
    """
    print("[+] Loading favorites file...")
    with open(filename) as f:
        soup = bs4.BeautifulSoup(f)

    comment_list_header = soup.find("h3", text="MetaFilter Comments")
    # Sanity check
    if comment_list_header is None:
        raise ValueError("Unexpected document structure")
    comment_list = comment_list_header.find_next("dl")
    if comment_list is None:
        raise ValueError("Unexpected document structure")

    # href=True drops <a> elements that carry no href attribute.
    for link in comment_list.find_all("a", href=True):
        # Split only at the first "#" so the result never has more than
        # two parts; callers unpack it into exactly two names.
        parts = link["href"].split("#", 1)
        if len(parts) != 2:
            print(" [!] Link {!r} has no comment anchor; skipping.".format(link["href"]))
            continue
        yield parts
if __name__ == "__main__":
    # The export file path is the only (required) command-line argument.
    if not sys.argv[1:]:
        print("Usage: {} [path_to_favorites_export_file]".format(sys.argv[0]))
        sys.exit(1)

    export_path = sys.argv[1]
    page_start = "<html><head><style>.comments {border: 1px solid black; padding: 1em; margin: 1em;}</style></head><body>\n"
    page_end = "</body></html>"

    # Comments are written to the output page one by one as they are fetched.
    with codecs.open("metafilter_favorites_fulltext.html", "w", encoding="utf-16") as out:
        out.write(page_start)
        for favorite in parse_favorites_file(export_path):
            url, comment_id = favorite
            out.write("\n\n")
            out.write(grab_comment(url, comment_id))
        out.write(page_end)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment