Skip to content

Instantly share code, notes, and snippets.

@rahulkmr
Created February 7, 2010 06:18
Show Gist options
  • Save rahulkmr/297249 to your computer and use it in GitHub Desktop.
Save rahulkmr/297249 to your computer and use it in GitHub Desktop.
'''Read an HTML document from a file-like object, parse it (optionally only selected parts), and pass the data to a callback for processing.
'''
import urllib2
import re
import sys
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import SoupStrainer
# Inline configuration.
# URL for fetching the page over the network. It is only referenced by the
# commented-out urlopen call in the script section, so it is currently unused.
url = ""
# Local HTML file that the script section opens and reads.
srcFile = "Tasks.html"
# Strainer selecting <table> tags whose id is 'objecttable'; parse() hands it
# to BeautifulSoup so that only that part of the document is processed.
strainer = SoupStrainer('table', {'id' : 'objecttable'})
# Cell indexes within a table row, used by processCol():
#   ACTIONS - cells up to and including this index are not printed.
#   ACCT    - cell whose link text is tested against the ignore pattern
#             (presumably the account / user name column - confirm).
ACTIONS = 0
ACCT = 5
def makeMultiMatch(rawString, flags=0):
    '''Compile a sequence of pattern strings into one alternation regex.

    rawString -- sequence of regex source strings.
    flags     -- flags forwarded to re.compile (default 0).

    A single-element sequence is compiled from that element directly;
    otherwise the elements are joined with '|' so the resulting regex
    matches any one of them.
    '''
    if len(rawString) != 1:
        source = '|'.join(rawString)
    else:
        source = rawString[0]
    return re.compile(source, flags)
def parse(src, strainer, dataProcessor, ignoreIds=None):
    """Parse 'src' restricted by 'strainer' and feed each table row to a callback.

    src           -- markup handed to BeautifulSoup.
    strainer      -- second argument to BeautifulSoup, limiting what is parsed.
    dataProcessor -- callable invoked as dataProcessor(cells, ignorePat) for
                     every 'tr' found under the document's tbody, where
                     cells is that row's list of 'td' tags.
    ignoreIds     -- optional sequence of id patterns; when non-empty it is
                     compiled into one case-insensitive regex and passed to
                     the callback, otherwise it is passed through unchanged.
    """
    soup = BeautifulSoup(src, strainer)
    # Build the ignore-list pattern for the given ids, if there are any.
    if ignoreIds:
        ignorePat = makeMultiMatch(ignoreIds, re.I)
    else:
        ignorePat = ignoreIds
    # NOTE(review): presumably fails with AttributeError when the parsed
    # document has no <tbody> - confirm against the real input.
    rows = soup.tbody.findAll('tr')
    for tr in rows:
        cells = tr.findAll('td')
        dataProcessor(cells, ignorePat)
        # NOTE(review): separator assumed to follow every row, including
        # rows the callback chose to skip - confirm.
        print("---------------")
def processCol(colData, ignorePat=None):
    """Print the cells of one table row, one value per line.

    colData   -- sequence of cell tags for the row.
    ignorePat -- optional compiled regex. When the link text of the cell
                 at index ACCT matches it (anchored at the start, as with
                 re.match), the row is skipped and nothing is printed.

    Cells up to and including index ACTIONS are never printed. For each
    remaining cell the cell's own string is printed, falling back to the
    string of its link, and finally to "NA".
    """
    # Only rows that actually have a cell at index ACCT can be filtered;
    # shorter rows are printed as-is rather than raising IndexError.
    if ignorePat and len(colData) > ACCT:
        link = colData[ACCT].a
        if link:
            name = link.string
            # .string can be None (the printing code below allows for that
            # too); matching None would raise TypeError.
            if name is not None and ignorePat.match(name):
                return
    # Print the data, skipping the ACTIONS field.
    for data in colData[ACTIONS + 1:]:
        print(data.string or (data.a and data.a.string) or "NA")
if __name__ == '__main__':
    # Network alternative to the local file: src = urllib2.urlopen(url)
    handle = open(srcFile)
    try:
        src = handle.read()
    finally:
        # Close the file whether or not the read succeeded.
        handle.close()
    # Command-line arguments, if any, are the ids to ignore. An empty list
    # becomes None so that parse() uses its default.
    parse(src, strainer, processCol, sys.argv[1:] or None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment