A sample script to automatically parse the NBA news archive on ESPN.com (text only).
# A simple script to parse the ESPN NBA archive for a specified year and months. Code updated: 08/20/2015
import urllib2
from bs4 import BeautifulSoup  # a very good python library for parsing html
import time
import re

# parameters
year = 2015
months = ['january', 'august']  # the archive is only available for these two months; the URL expects lowercase month names
success_count = 0  # count the number of successful downloads
# open a new file
txt_name = 'nba_archive_%i.txt' % year
txt = open(txt_name, 'w')

# loop through the months of interest
for month in months:
    print month
    # Get the list of archives in a particular month
    espn_archive_link = 'http://espn.go.com/nba/news/archive/_/month/%s/year/%i' % (month, year)
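    # For illustration: with month='january' and year=2015 this builds
    # http://espn.go.com/nba/news/archive/_/month/january/year/2015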
    # Check whether that month's archive is available
    try:
        espn_site_content = urllib2.urlopen(espn_archive_link).read()  # read the archive list for the current month
        # Initialize the BeautifulSoup object
        soup = BeautifulSoup(espn_site_content, 'html.parser')
        atags = soup.find('ul', {'class': 'inline-list indent'}).find_all('a')  # find the tag that holds the archive list, then get all of its links
        # counter
        count = 1
        for atag in atags:
            link = str(atag["href"])  # get the href of an archive
            link_pieces = link.split("/id/")
            # Ensure the link contains /id/; otherwise ignore it
            if len(link_pieces) > 1:
                archive_id = (link_pieces[1].split('/'))[0]  # get the id of an archive
                # construct the text-only download link of the archive (no images)
                archive_link = "http://espn.go.com/espn/print?id=%s&type=HeadlineNews&imagesPrint=off" % (archive_id)
                # read the article; sometimes the request fails for unknown reasons (rate limiting?), so just skip the current archive on error
                try:
                    this_archive = urllib2.urlopen(archive_link).read()
                    # make a soup
                    a_soup = BeautifulSoup(this_archive, 'html.parser')
                    # remove the head, script and style tags
                    for script in a_soup.findAll(['head', 'script', 'style']):
                        script.extract()
                    text = a_soup.get_text().strip()  # strip leading/trailing whitespace
                    # write to the file
                    txt.write('\n')
                    txt.write('--------------Next archive--------------')
                    txt.write('\n')
                    text = text.encode('ascii', 'ignore')  # drop characters outside the ASCII range
                    txt.write(text)
                    success_count += 1
                    print archive_id
                    print "Current archive: %i" % count
                    time.sleep(1.5)  # pause between requests; the service blocks rapid bulk downloads
                except Exception:
                    pass
            # update counter
            count += 1
    except Exception:
        pass
txt.close()
print "Total number of successful download:%isuccess_count" %success_count | |