Skip to content

Instantly share code, notes, and snippets.

@hyounggyu
Created January 21, 2015 15:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save hyounggyu/b2682046b3cd5ccb4927 to your computer and use it in GitHub Desktop.
Save hyounggyu/b2682046b3cd5ccb4927 to your computer and use it in GitHub Desktop.
import json
import os
import sys
import urllib.request

from bs4 import BeautifulSoup
from jinja2 import Template
def clean_text(text):
    """Return *text* with non-breaking spaces normalised to plain spaces.

    Scraped HTML carries ``&nbsp;`` entities, which arrive here as the
    character ``'\\xa0'``. Each one is replaced by an ordinary space rather
    than deleted, so words that were separated only by a non-breaking space
    do not run together.
    """
    return text.replace('\xa0', ' ')
def _tag_text(tag):
    """Return the cleaned text of *tag*, or ``''`` when the tag is missing."""
    return clean_text(tag.get_text()) if tag is not None else ''


def _parse_article(content):
    """Extract title, subtitle, byline and body text from one article page.

    Args:
        content: Decoded HTML of a single article page.

    Returns:
        A dict that may hold the keys ``title``, ``subtitle``, ``byline``
        (taken from a ``div`` with class ``arvtitle``) and ``content``
        (taken from a ``div`` with class ``cnt_view``). Keys whose ``div``
        is absent are simply not set.
    """
    # Only the span from '<body' up to '</html>' is parsed. str.find returns
    # -1 when a marker is absent, which would silently produce a wrong slice,
    # so fall back to the corresponding end of the document instead.
    start = content.find('<body')
    if start == -1:
        start = 0
    end = content.find('</html>')
    if end == -1:
        end = len(content)
    s_article = BeautifulSoup(content[start:end], 'html.parser')

    article = {}
    for div in s_article.find_all('div'):
        classes = div.get('class') or []
        if 'arvtitle' in classes:
            # A heading block lacking one of these tags yields '' for that
            # field instead of aborting the whole update.
            article['title'] = _tag_text(div.h2)
            article['subtitle'] = _tag_text(div.h3)
            article['byline'] = _tag_text(div.p)
        if 'cnt_view' in classes:
            article['content'] = clean_text(div.get_text())
    return article


def update():
    """Download the current articles and save them to ``articles.json``.

    Fetches ``article_left.html`` from the quick-view URL, follows every
    ``<a>`` found inside the page's first ``<fieldset>``, parses each linked
    article page, and writes the resulting list of dicts as JSON to
    ``articles.json`` in the current working directory.
    """
    quickview_url = 'http://pressian.com/ezview/'
    articlelist_url = quickview_url + 'article_left.html'

    # Context managers close the HTTP responses even if reading fails.
    with urllib.request.urlopen(articlelist_url) as handler:
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        s_articles = BeautifulSoup(handler.read(), 'html.parser')

    articles = []
    for link in s_articles.fieldset.find_all('a'):
        href = link.get('href')
        if not href:
            # An anchor without a target cannot be fetched.
            continue
        with urllib.request.urlopen(quickview_url + href) as handler:
            content = handler.read().decode('utf-8', 'ignore')
        articles.append(_parse_article(content))

    # json.dump escapes non-ASCII by default, so the file content is the same
    # as before regardless of the platform's default encoding.
    with open('articles.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f)
def generate(dest_dir):
    """Render ``articles.json`` through ``layout.html`` into ``index.html``.

    Reads the article list from ``articles.json`` and the Jinja2 template
    from ``layout.html`` (both in the current working directory), renders the
    template with the variable ``articles``, and writes the result to
    ``index.html`` inside *dest_dir*.

    Args:
        dest_dir: Directory that receives ``index.html``. ``None`` selects
            the current directory (``'./'``).
    """
    if dest_dir is None:
        dest_dir = './'

    with open('articles.json', 'r', encoding='utf-8') as f:
        articles = json.load(f)

    # NOTE(review): layout.html is assumed to be UTF-8, matching the UTF-8
    # decoding used when the articles are scraped - confirm for your template.
    with open('layout.html', 'r', encoding='utf-8') as f:
        template = Template(f.read())
    html = template.render(articles=articles)

    # os.path.join inserts the path separator when dest_dir has no trailing
    # one; plain concatenation turned 'out' into 'outindex.html'.
    with open(os.path.join(dest_dir, 'index.html'), 'w', encoding='utf-8') as f:
        f.write(html)
def usage():
    """Print the command-line help text and terminate the program.

    The program name shown in the help text is taken from ``sys.argv[0]``.
    The process exits with status 0 via ``sys.exit``, so this function never
    returns.
    """
    u = '''Usage: {} <command> [<args>]
Commands are:
update update news
generate <dest_dir> generate html file
'''.format(sys.argv[0])
    print(u)
    sys.exit(0)
if __name__ == '__main__':
    # Dispatch on the first command-line argument; anything unrecognised
    # (or no argument at all) shows the usage text.
    if len(sys.argv) < 2:
        usage()
    elif sys.argv[1] == 'update':
        update()
    elif sys.argv[1] == 'generate':
        # Use the destination directory whenever one is supplied. The earlier
        # `== 3` test silently discarded it if any extra argument followed.
        generate(sys.argv[2] if len(sys.argv) >= 3 else None)
    else:
        usage()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment