@johnny5550822
Last active August 29, 2015 14:27
A sample script to automatically parse the NBA news archive on ESPN.com (text only).
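The script relies on two ESPN URL patterns, both visible in the code below: the monthly archive index, which lists that month's stories, and the print view, which serves a single story as lightly formatted HTML without images:

http://espn.go.com/nba/news/archive/_/month/<month>/year/<year>
http://espn.go.com/espn/print?id=<id>&type=HeadlineNews&imagesPrint=off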
# A simple script to parse the ESPN NBA archive for a specified year and months. Code updated: 08/20/2015
import urllib2
from bs4 import BeautifulSoup  # a very good Python library for parsing HTML
import time

# parameters
year = 2015
months = ['january', 'august']  # the service is only available for these two months; the URL expects lowercase names
success_count = 0  # counts the number of successful downloads

# open a new file
txt_name = 'nba_archive_%i.txt' % year
txt = open(txt_name, 'w')

# loop through the months of interest
for month in months:
    print month
    # get the list of archives for a particular month
    espn_archive_link = 'http://espn.go.com/nba/news/archive/_/month/%s/year/%i' % (month, year)
    # check whether that month's archive is available
    try:
        espn_site_content = urllib2.urlopen(espn_archive_link).read()  # read the current month's archive list
        # initialize the BeautifulSoup object
        soup = BeautifulSoup(espn_site_content, 'html.parser')
        # find the tag that contains the archive list, then collect all links in it
        links = soup.find('ul', {'class': 'inline-list indent'}).find_all('a')
        # counter
        count = 1
        for atag in links:
            link = str(atag["href"])  # get the href of an archive entry
            link_pieces = link.split("/id/")
            # only keep links that contain /id/; otherwise, ignore the link
            if len(link_pieces) > 1:
                archive_id = link_pieces[1].split('/')[0]  # get the id of an archive
                # construct the text-only download link of the archive (without images)
                archive_link = "http://espn.go.com/espn/print?id=%s&type=HeadlineNews&imagesPrint=off" % archive_id
                # read the article; the service sometimes disconnects for unknown reasons
                # (security, perhaps), so just skip the current archive if the request fails
                try:
                    this_archive = urllib2.urlopen(archive_link).read()
                    # make a soup
                    a_soup = BeautifulSoup(this_archive, 'html.parser')
                    # remove the head, javascript, and style elements
                    for script in a_soup.findAll(['head', 'script', 'style']):
                        script.extract()
                    # strip the whitespace at the start and end of the remaining text
                    article_text = a_soup.get_text().strip()
                    # write to the file
                    txt.write('\n')
                    txt.write('--------------Next archive--------------')
                    txt.write('\n')
                    article_text = article_text.encode('ascii', 'ignore')  # drop characters that do not belong to the ASCII range
                    txt.write(article_text)
                    success_count += 1
                    print archive_id
                    print "Current archive: %i" % count
                    time.sleep(1.5)  # wait between requests; the service does not allow massive bulk downloads
                except Exception:
                    pass
                # update counter
                count += 1
    except Exception:
        pass
txt.close()
print "Total number of successful downloads: %i" % success_count
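Note that urllib2 is Python 2 only. A minimal sketch of the same fetch-and-clean step under Python 3 might look like the following; fetch_archive_text is a hypothetical helper name, and the sketch assumes the ESPN print endpoint still responds as it did in 2015.

import urllib.request
from bs4 import BeautifulSoup

def fetch_archive_text(archive_id):
    # hypothetical helper; mirrors the inner loop of the script above
    url = ("http://espn.go.com/espn/print?id=%s"
           "&type=HeadlineNews&imagesPrint=off" % archive_id)
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    # drop head, script, and style elements, as the original script does
    for tag in soup.find_all(['head', 'script', 'style']):
        tag.extract()
    return soup.get_text().strip()

Everything else (the month loop, the file writing, the 1.5-second delay) carries over largely unchanged, except that Python 3's print is a function and encode('ascii', 'ignore') returns bytes rather than str.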