# /grab_fulltext_mefi_favorites.py

## grab_fulltext_mefi_favorites.py
#!/usr/bin/env python
import bs4
import sys
import urllib2
import codecs

def grab_comment(url, comment_id):
    """Fetch a thread page and return the markup of a single comment.

    The comment is located through the ``<a name="...">`` anchor whose name
    equals ``comment_id``; the node immediately after that anchor must be a
    ``<div>`` whose classes include ``comments``.

    Args:
        url: Address of the thread page to download.
        comment_id: Anchor name of the wanted comment (converted with
            ``str`` before the lookup).

    Returns:
        The comment element rendered by ``prettify()``, or ``""`` (after
        printing a warning) when the page lacks the expected structure.
    """
    # A parenthesised single-argument print is valid both as a Python 2
    # statement and as a Python 3 call.
    print("[+] Retrieving comment #{} from {!r}...".format(comment_id, url))

    response = urllib2.urlopen(url)
    try:
        soup = bs4.BeautifulSoup(response)
    finally:
        # Release the connection once parsing is done (or has failed)
        # instead of leaving one open per favorite.
        response.close()

    anchor = soup.find("a", attrs={"name": str(comment_id)})

    # Sanity check: the anchor must exist.
    if anchor is None:
        print("  [!] Unexpected document structure; skipping.")
        return ""

    comment = anchor.next_sibling

    # Sanity check: the sibling can be missing (None), a non-tag node, or a
    # tag without a class attribute. All of those are treated as "unexpected
    # structure" rather than raising AttributeError/KeyError.
    is_comment_div = (
        isinstance(comment, bs4.element.Tag)
        and comment.name == "div"
        and "comments" in comment.get("class", [])
    )
    if not is_comment_div:
        print("  [!] Unexpected document structure; skipping.")
        return ""

    return comment.prettify()

def parse_favorites_file(filename):
    """Yield ``[url, comment_id]`` pairs from a favorites export file.

    The export is expected to contain an ``<h3>`` whose text is
    ``MetaFilter Comments`` followed by a ``<dl>`` holding one ``<a>`` per
    favorite; each link's ``href`` has the form ``url#comment_id``.

    Args:
        filename: Path of the exported favorites HTML file.

    Yields:
        Two-element lists ``[url, comment_id]``. Links without an ``href``
        or without exactly one ``#`` are reported and skipped, because the
        caller unpacks every item into exactly two names.

    Raises:
        ValueError: If the heading or the list that should follow it is
            missing.
    """
    print("[+] Loading favorites file...")

    # Parse up front so the file is closed before the consumer starts its
    # (slow) per-link work between successive yields.
    with open(filename) as f:
        soup = bs4.BeautifulSoup(f)

    comment_list_header = soup.find("h3", text="MetaFilter Comments")

    # Sanity check
    if comment_list_header is None:
        raise ValueError("Unexpected document structure")

    comment_list = comment_list_header.find_next("dl")

    # Sanity check: find_next returns None when no <dl> follows the heading.
    if comment_list is None:
        raise ValueError("Unexpected document structure")

    for link in comment_list.find_all("a"):
        parts = link.get("href", "").split("#")

        if len(parts) != 2:
            print("  [!] Unexpected link format; skipping.")
            continue

        yield parts

if __name__ == "__main__":
    # Script entry point: expand every favorite listed in the export file
    # named on the command line into one combined HTML page.
    arguments = sys.argv

    if len(arguments) < 2:
        print("Usage: {} [path_to_favorites_export_file]".format(arguments[0]))
        sys.exit(1)

    page_header = "<html><head><style>.comments {border: 1px solid black; padding: 1em; margin: 1em;}</style></head><body>\n"
    page_footer = "</body></html>"

    with codecs.open("metafilter_favorites_fulltext.html", "w", encoding="utf-16") as out:
        out.write(page_header)

        for thread_url, anchor_id in parse_favorites_file(arguments[1]):
            # The separator is written before the fetch so the output order
            # is separator, then comment markup.
            out.write("\n\n")
            out.write(grab_comment(thread_url, anchor_id))

        out.write(page_footer)
	#!/usr/bin/env python
	import bs4
	import sys
	import urllib2
	import codecs

	def grab_comment(url, comment_id):
		"""Fetch a thread page and return the markup of a single comment.

		The comment is located through the ``<a name="...">`` anchor whose
		name equals ``comment_id``; the node immediately after that anchor
		must be a ``<div>`` whose classes include ``comments``.

		Args:
			url: Address of the thread page to download.
			comment_id: Anchor name of the wanted comment (converted with
				``str`` before the lookup).

		Returns:
			The comment element rendered by ``prettify()``, or ``""`` (after
			printing a warning) when the page lacks the expected structure.
		"""
		# A parenthesised single-argument print is valid both as a Python 2
		# statement and as a Python 3 call.
		print("[+] Retrieving comment #{} from {!r}...".format(comment_id, url))

		response = urllib2.urlopen(url)
		try:
			soup = bs4.BeautifulSoup(response)
		finally:
			# Release the connection once parsing is done (or has failed)
			# instead of leaving one open per favorite.
			response.close()

		anchor = soup.find("a", attrs={"name": str(comment_id)})

		# Sanity check: the anchor must exist.
		if anchor is None:
			print(" [!] Unexpected document structure; skipping.")
			return ""

		comment = anchor.next_sibling

		# Sanity check: the sibling can be missing (None), a non-tag node, or
		# a tag without a class attribute. All of those are treated as
		# "unexpected structure" rather than raising AttributeError/KeyError.
		is_comment_div = (
			isinstance(comment, bs4.element.Tag)
			and comment.name == "div"
			and "comments" in comment.get("class", [])
		)
		if not is_comment_div:
			print(" [!] Unexpected document structure; skipping.")
			return ""

		return comment.prettify()

	def parse_favorites_file(filename):
		"""Yield ``[url, comment_id]`` pairs from a favorites export file.

		The export is expected to contain an ``<h3>`` whose text is
		``MetaFilter Comments`` followed by a ``<dl>`` holding one ``<a>``
		per favorite; each link's ``href`` has the form ``url#comment_id``.

		Args:
			filename: Path of the exported favorites HTML file.

		Yields:
			Two-element lists ``[url, comment_id]``. Links without an
			``href`` or without exactly one ``#`` are reported and skipped,
			because the caller unpacks every item into exactly two names.

		Raises:
			ValueError: If the heading or the list that should follow it is
				missing.
		"""
		print("[+] Loading favorites file...")

		# Parse up front so the file is closed before the consumer starts
		# its (slow) per-link work between successive yields.
		with open(filename) as f:
			soup = bs4.BeautifulSoup(f)

		comment_list_header = soup.find("h3", text="MetaFilter Comments")

		# Sanity check
		if comment_list_header is None:
			raise ValueError("Unexpected document structure")

		comment_list = comment_list_header.find_next("dl")

		# Sanity check: find_next returns None when no <dl> follows the
		# heading.
		if comment_list is None:
			raise ValueError("Unexpected document structure")

		for link in comment_list.find_all("a"):
			parts = link.get("href", "").split("#")

			if len(parts) != 2:
				print(" [!] Unexpected link format; skipping.")
				continue

			yield parts

	if __name__ == "__main__":
		# Script entry point: expand every favorite listed in the export file
		# named on the command line into one combined HTML page.
		if len(sys.argv) < 2:
			print("Usage: {} [path_to_favorites_export_file]".format(sys.argv[0]))
			sys.exit(1)

		with codecs.open("metafilter_favorites_fulltext.html", "w", encoding="utf-16") as f:
			f.write("<html><head><style>.comments {border: 1px solid black; padding: 1em; margin: 1em;}</style></head><body>\n")

			for url, comment_id in parse_favorites_file(sys.argv[1]):
				# Separator first, then the fetched comment markup.
				f.write("\n\n")
				f.write(grab_comment(url, comment_id))

			f.write("</body></html>")