Skip to content

Instantly share code, notes, and snippets.

@wfwei
Last active December 24, 2015 04:09
Show Gist options
  • Save wfwei/6742336 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# coding:utf-8
import urllib2
import BeautifulSoup
import re
import os
# Site root that relative article/image links are resolved against.
# Note the trailing slash: callers append a path with no leading '/'.
HOST = u'http://www.washingtonpost.com/'
# Locates an article's lead-image wrapper by CSS class; '.*' allows any
# prefix before 'photo-wrapper' (BeautifulSoup matches with this regex).
image_patt = re.compile('.*photo-wrapper')
def fetch_news(url_host=HOST, url_path='politics', local_dir='/home/plex/washingtonpost'):
page = urllib2.urlopen(url=url_host+url_path, timeout=10000)
soup = BeautifulSoup.BeautifulSoup(page)
image_dir = '%s/%s/image/'%(local_dir, url_path)
news_dir = '%s/%s/news/'%(local_dir, url_path)
if not os.path.exists(image_dir):
os.makedirs(image_dir)
if not os.path.exists(news_dir):
os.makedirs(news_dir)
for item in soup.findAll(attrs={u'class':u'no-left'}):
news_link = str(item.a[u'href'])
if news_link.startswith('/'):
news_link = url_host + news_link[1:]
if '/'+url_path not in news_link:
print 'skip news link: %s' % news_link
continue
print 'processing news link: %s' % news_link
try:
news_page = urllib2.urlopen(news_link)
except Exception, e:
print 'Fail to fetch news:%s' % news_link
print e
continue
news_soup = BeautifulSoup.BeautifulSoup(news_page)
news_title = news_soup.title.text
print 'news title: %s' % news_title
_article = news_soup.find(id=u'article-leaf-page')
if not _article:
print 'Invalid page'
continue
image = _article.find(attrs={'class':image_patt})
if image:
image_link = image.find('img')['src']
if image_link.startswith('/'):
image_link = url_host + image_link[1:]
print 'images:', image_link
mat = re.search(r'[^.]+$', image_link)
if mat:
file_type = mat.group()
image_file = image_dir + '%s.%s'%(news_title, file_type)
try:
with open(image_file, 'wb') as f:
image_data = urllib2.urlopen(image_link)
f.write(image_data.read())
print 'save image:%s' % image_file
except Exception, e:
print 'fail to fetch image:%s' % image_link
print e
news_content = _article.find(id='article').findAll('p')
news_file = news_dir + '%s.txt' % news_title
print 'save news:%s' % news_file
with open(news_file, 'w') as f:
for content in news_content:
f.write(content.text)
f.write("\n")
pass
if __name__ == '__main__':
    # Script entry point: crawl the politics section into the default
    # download directory (the keyword arguments mirror fetch_news's defaults).
    fetch_news(
        url_host=HOST,
        url_path='politics',
        local_dir='/home/plex/washingtonpost',
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment