A sample script to automatically parse the NBA news archive on ESPN.com (text only).
# A simple script to parse the ESPN NBA archive for a specified year and months. Code updated: 08/20/2015
import urllib2
from bs4 import BeautifulSoup  # a very good python library for parsing html
import time
import re

# parameters
year = 2015
months = ['january', 'august']  # the archive is only available for these two months; the URL expects lowercase month names
success_count = 0  # count the number of successful downloads
# open a new file
txt_name = 'nba_archive_%i.txt' % year
txt = open(txt_name, 'w')

# loop through the months of interest
for month in months:
    print month
    # Get the list of archives in a particular month
    espn_archive_link = 'http://espn.go.com/nba/news/archive/_/month/%s/year/%i' % (month, year)
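    # For illustration: with month='january' and year=2015 this builds
    # http://espn.go.com/nba/news/archive/_/month/january/year/2015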
    # Check whether that month's archive is available
    try:
        espn_site_content = urllib2.urlopen(espn_archive_link).read()  # read the archive list for the current month
        # Initialize the BeautifulSoup object
        soup = BeautifulSoup(espn_site_content, 'html.parser')
        atags = soup.find('ul', {'class': 'inline-list indent'}).find_all('a')  # find the tag that holds the archive list, then get all of its links
        # counter
        count = 1
        for atag in atags:
            link = str(atag["href"])  # get the href of an archive
            link_pieces = link.split("/id/")
            # Ensure the link contains /id/; otherwise ignore it
            if len(link_pieces) > 1:
                archive_id = (link_pieces[1].split('/'))[0]  # get the id of an archive
                # construct the text-only download link of the archive (no images)
                archive_link = "http://espn.go.com/espn/print?id=%s&type=HeadlineNews&imagesPrint=off" % (archive_id)
                # read the article; sometimes the request fails for unknown reasons (rate limiting?), so just skip the current archive on error
                try:
                    this_archive = urllib2.urlopen(archive_link).read()
                    # make a soup
                    a_soup = BeautifulSoup(this_archive, 'html.parser')
                    # remove the head, script and style tags
                    for script in a_soup.findAll(['head', 'script', 'style']):
                        script.extract()
                    text = a_soup.get_text().strip()  # strip leading/trailing whitespace
                    # write to the file
                    txt.write('\n')
                    txt.write('--------------Next archive--------------')
                    txt.write('\n')
                    text = text.encode('ascii', 'ignore')  # drop characters outside the ASCII range
                    txt.write(text)
                    success_count += 1
                    print archive_id
                    print "Current archive: %i" % count
                    time.sleep(1.5)  # pause between requests; the service blocks rapid bulk downloads
                except Exception:
                    pass
            # update counter
            count += 1
    except Exception:
        pass
txt.close()
print "Total number of successful download:%isuccess_count" %success_count | |