rahulkmr/parse.py

## parse.py
'''Read an HTML file from a file-like object, parse it (possibly selectively), and pass the data to a callback for processing.
'''

import urllib2
import re
import sys
from BeautifulSoup import BeautifulSoup
from BeautifulSoup import SoupStrainer

# Inline configuration.
# Remote source; only used by the commented-out urlopen call in the script
# section.
url = ""
# Local HTML file read when the module is run as a script.
srcFile = "Tasks.html"
# Column positions within a table row: the ACTIONS column is skipped on
# output, and the ACCT column holds the account checked against ignored ids.
ACTIONS = 0
ACCT = 5
# Restrict parsing to the table whose id is 'objecttable'.
strainer = SoupStrainer('table', {'id': 'objecttable'})


def makeMultiMatch(rawString, flags=0):
    ''' Return one compiled regex that matches any pattern in a list.

        rawString -- sequence of regex pattern strings (a list, despite
                     the name); each one becomes an alternative.
        flags     -- 're' module flags handed to re.compile.

        The patterns are joined with '|' and compiled unescaped, so regex
        metacharacters inside them keep their special meaning.
        An empty sequence gives a pattern that never matches; the empty
        pattern it used to produce matches every string.
    '''
    if not rawString:
        # '(?!)' is an empty negative lookahead, which fails at every position.
        return re.compile('(?!)', flags)
    # A single-element list joins to that element, so it needs no special case.
    return re.compile('|'.join(rawString), flags)


def parse(src, strainer, dataProcessor, ignoreIds=None):
    ''' Parse 'src' restricted by 'strainer' and feed each row to a callback.

        src           -- HTML input handed to BeautifulSoup.
        strainer      -- SoupStrainer limiting what gets parsed.
        dataProcessor -- callable invoked as dataProcessor(cells, ignorePat)
                         for every 'tr' found under the document's tbody.
        ignoreIds     -- optional list of id patterns; when non-empty they
                         are compiled into one case-insensitive regex.

        A separator line is printed after each row.
    '''
    soup = BeautifulSoup(src, strainer)
    # Build the ignore-list pattern only when ids were given; otherwise the
    # falsy 'ignoreIds' value itself is passed on to the callback.
    if ignoreIds:
        ignorePat = makeMultiMatch(ignoreIds, re.I)
    else:
        ignorePat = ignoreIds
    # Walk every row inside tbody.
    tableRows = soup.tbody.findAll('tr')
    for tableRow in tableRows:
        cells = tableRow.findAll('td')
        dataProcessor(cells, ignorePat)
        print("---------------")


def processCol(colData, ignorePat=None):
    ''' Print the cells of one table row, one value per line.

        colData   -- list of cell tags for the row.
        ignorePat -- optional compiled regex; when the link text in the
                     ACCT cell matches it, nothing is printed for the row.

        The match uses re.match semantics, so the pattern is anchored at the
        start of the account text only.
        Rows too short to have an ACCT cell, or whose account link has no
        single string, are never ignored; they used to raise IndexError or
        TypeError when an ignore pattern was supplied.
        For each cell after the ACTIONS column the cell's own string is
        printed, else the string of its link, else "NA".
    '''
    # Check for an ignored id, but only when the row has an ACCT cell.
    if ignorePat and len(colData) > ACCT:
        acctLink = colData[ACCT].a
        acctName = None
        if acctLink:
            acctName = acctLink.string
        if acctName is not None and ignorePat.match(acctName):
            return
    # Process data. Skip the ACTIONS field.
    for data in colData[ACTIONS+1:]:
        print(data.string or (data.a and data.a.string) or "NA")


if __name__ == '__main__':
    # src = urllib2.urlopen(url)
    htmlFile = open(srcFile)
    try:
        src = htmlFile.read()
    finally:
        # Close the file even if reading fails.
        htmlFile.close()
    # Command-line arguments, if any, are the ids to ignore; with none given
    # pass None so that no filtering is applied.
    parse(src, strainer, processCol, sys.argv[1:] or None)
	'''Read an HTML file from a file-like object, parse it (possibly selectively), and pass the data to a callback for processing.
	'''

	import urllib2
	import re
	import sys
	from BeautifulSoup import BeautifulSoup
	from BeautifulSoup import SoupStrainer

	# Inline configuration.
	# Remote source; only used by the commented-out urlopen call in the script
	# section.
	url = ""
	# Local HTML file read when the module is run as a script.
	srcFile = "Tasks.html"
	# Column positions within a table row: the ACTIONS column is skipped on
	# output, and the ACCT column holds the account checked against ignored ids.
	ACTIONS = 0
	ACCT = 5
	# Restrict parsing to the table whose id is 'objecttable'.
	strainer = SoupStrainer('table', {'id': 'objecttable'})


	def makeMultiMatch(rawString, flags=0):
		''' Return one compiled regex that matches any pattern in a list.

			rawString -- sequence of regex pattern strings (a list, despite
			             the name); each one becomes an alternative.
			flags     -- 're' module flags handed to re.compile.

			The patterns are joined with a plain '|' so they act as regex
			alternatives; an escaped pipe would only match a literal '|'.
			They are compiled unescaped, so regex metacharacters inside them
			keep their special meaning.
			An empty sequence gives a pattern that never matches; the empty
			pattern it used to produce matches every string.
		'''
		if not rawString:
			# '(?!)' is an empty negative lookahead, which fails at every position.
			return re.compile('(?!)', flags)
		# A single-element list joins to that element, so it needs no special case.
		return re.compile('|'.join(rawString), flags)


	def parse(src, strainer, dataProcessor, ignoreIds=None):
		''' Parse 'src' restricted by 'strainer' and feed each row to a callback.

			src           -- HTML input handed to BeautifulSoup.
			strainer      -- SoupStrainer limiting what gets parsed.
			dataProcessor -- callable invoked as dataProcessor(cells, ignorePat)
			                 for every 'tr' found under the document's tbody.
			ignoreIds     -- optional list of id patterns; when non-empty they
			                 are compiled into one case-insensitive regex.

			A separator line is printed after each row.
		'''
		doc = BeautifulSoup(src, strainer)
		# Make an ignore-list pattern for the given ids; a falsy 'ignoreIds'
		# is passed through to the callback unchanged.
		ignorePat = ignoreIds and makeMultiMatch(ignoreIds, re.I)
		# Process all the rows in tbody.
		for row in doc.tbody.findAll('tr'):
			# Process row data.
			dataProcessor(row.findAll('td'), ignorePat)
			print("---------------")


	def processCol(colData, ignorePat=None):
		''' Print the cells of one table row, one value per line.

			colData   -- list of cell tags for the row.
			ignorePat -- optional compiled regex; when the link text in the
			             ACCT cell matches it, nothing is printed for the row.

			The match uses re.match semantics, so the pattern is anchored at
			the start of the account text only.
			Rows too short to have an ACCT cell, or whose account link has no
			single string, are never ignored; they used to raise IndexError or
			TypeError when an ignore pattern was supplied.
			For each cell after the ACTIONS column the cell's own string is
			printed, else the string of its link, else "NA".
		'''
		# Check for an ignored id, but only when the row has an ACCT cell.
		if ignorePat and len(colData) > ACCT:
			acctLink = colData[ACCT].a
			acctName = None
			if acctLink:
				acctName = acctLink.string
			if acctName is not None and ignorePat.match(acctName):
				return
		# Process data. Skip the ACTIONS field.
		for data in colData[ACTIONS+1:]:
			print(data.string or (data.a and data.a.string) or "NA")


	if __name__ == '__main__':
		# src = urllib2.urlopen(url)
		f = open(srcFile)
		try:
			src = f.read()
		finally:
			# Close the file even if reading fails.
			f.close()
		# Command-line arguments, if any, are the ids to be ignored.
		if len(sys.argv) > 1:
			parse(src, strainer, processCol, sys.argv[1:])
		else:
			parse(src, strainer, processCol)