Skip to content

Instantly share code, notes, and snippets.

@tsudoko
Last active August 29, 2015 14:21
Show Gist options
  • Save tsudoko/69aaec15bd40b5b29e6f to your computer and use it in GitHub Desktop.
Save tsudoko/69aaec15bd40b5b29e6f to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib.request
import os.path
import sys
def get_contents_plain(html):
    """Return the text of the page's ``contents_body`` div as plain text.

    The first ``<div class="fc2_footer">`` inside the container (if any) is
    removed, and every ``<br>`` is turned into a newline, before the text is
    extracted.

    Args:
        html: HTML markup to parse (anything ``BeautifulSoup`` accepts, e.g.
            the bytes returned by ``urlopen(...).read()`` or a ``str``).

    Returns:
        The text content of the ``contents_body`` div.

    Raises:
        ValueError: If the document has no ``<div class="contents_body">``.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed, so results vary between machines and a warning is emitted.
    soup = BeautifulSoup(html, "html.parser")

    contents_body = soup.find("div", class_="contents_body")
    if contents_body is None:
        # find() returns None on a miss; fail with a clear message instead
        # of an AttributeError on the next attribute access.
        raise ValueError('no <div class="contents_body"> found in document')

    # The footer is optional: only strip it when it is actually present.
    footer = contents_body.find("div", class_="fc2_footer")
    if footer is not None:
        footer.decompose()

    # get_text() drops tags entirely, so swap <br> for a real newline first
    # to keep the line breaks in the output.
    for br in contents_body.find_all("br"):
        br.replace_with('\n')

    return contents_body.get_text()
def main():
    """Fetch the URL given as the first CLI argument and print its text.

    Prints a usage message to stderr when no URL is supplied.

    Returns:
        The process exit status: 0 on success, 2 on a usage error.
    """
    if len(sys.argv) < 2:
        print("usage: %s [site]" % os.path.basename(sys.argv[0]), file=sys.stderr)
        # A usage error should not report success to the calling shell.
        return 2

    # Use the response as a context manager so the connection is closed
    # even if read() raises.
    with urllib.request.urlopen(sys.argv[1]) as response:
        site = response.read()

    print(get_contents_plain(site))
    return 0


# Only run when executed as a script, so importing this module does not
# trigger a network request.
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment