Skip to content

Instantly share code, notes, and snippets.

@wfwei
Last active December 24, 2015 04:09
Show Gist options
  • Save wfwei/6742336 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# coding:utf-8
import urllib2
import BeautifulSoup
import re
import os
# Site root that relative article/image links are resolved against.
# Note the trailing slash: callers append a path with no leading '/'.
HOST = u'http://www.washingtonpost.com/'
# Locates an article's lead-image wrapper by CSS class; '.*' allows any
# prefix before 'photo-wrapper' (BeautifulSoup matches with this regex).
image_patt = re.compile('.*photo-wrapper')
def fetch_news(url_host=HOST, url_path='politics', local_dir='/home/plex/washingtonpost'):
page = urllib2.urlopen(url=url_host+url_path, timeout=10000)
soup = BeautifulSoup.BeautifulSoup(page)
image_dir = '%s/%s/image/'%(local_dir, url_path)
news_dir = '%s/%s/news/'%(local_dir, url_path)
if not os.path.exists(image_dir):
os.makedirs(image_dir)
if not os.path.exists(news_dir):
os.makedirs(news_dir)
for item in soup.findAll(attrs={u'class':u'no-left'}):
news_link = str(item.a[u'href'])
if news_link.startswith('/'):
news_link = url_host + news_link[1:]
if '/'+url_path not in news_link:
print 'skip news link: %s' % news_link
continue
print 'processing news link: %s' % news_link
try:
news_page = urllib2.urlopen(news_link)
except Exception, e:
print 'Fail to fetch news:%s' % news_link
print e
continue
news_soup = BeautifulSoup.BeautifulSoup(news_page)
news_title = news_soup.title.text
print 'news title: %s' % news_title
_article = news_soup.find(id=u'article-leaf-page')
if not _article:
print 'Invalid page'
continue
image = _article.find(attrs={'class':image_patt})
if image:
image_link = image.find('img')['src']
if image_link.startswith('/'):
image_link = url_host + image_link[1:]
print 'images:', image_link
mat = re.search(r'[^.]+$', image_link)
if mat:
file_type = mat.group()
image_file = image_dir + '%s.%s'%(news_title, file_type)
try:
with open(image_file, 'wb') as f:
image_data = urllib2.urlopen(image_link)
f.write(image_data.read())
print 'save image:%s' % image_file
except Exception, e:
print 'fail to fetch image:%s' % image_link
print e
news_content = _article.find(id='article').findAll('p')
news_file = news_dir + '%s.txt' % news_title
print 'save news:%s' % news_file
with open(news_file, 'w') as f:
for content in news_content:
f.write(content.text)
f.write("\n")
pass
if __name__ == '__main__':
    # Script entry point: crawl the politics section into the default
    # download directory (the keyword arguments mirror fetch_news's defaults).
    fetch_news(
        url_host=HOST,
        url_path='politics',
        local_dir='/home/plex/washingtonpost',
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment