Last active
May 18, 2016 16:35
-
-
Save dmahugh/212af086397ab9a07c19f72f4a09bbdf to your computer and use it in GitHub Desktop.
download the Norwescon attendance data, remove blank lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Functions for manipulating Norwescon attendance data. | |
Query functions: | |
attended_all() ---------> Get attendees of a specified list of conventions. | |
attended_one() ---------> Get attendees of a specified convention. | |
Data-scrubbing functions: | |
download_data() --------> Download data file and save a local copy. | |
fixups() ---------------> Apply various fixups to the live data file. | |
progressbar() ----------> Display progress bar showing completion status. | |
remove_blanks() --------> Remove non-blank lines from a text file. | |
remove_fields() --------> Remove un-used fields with escaping issues. | |
Dependencies: | |
- Python 3.x (written with Python 3.5) | |
- requests module (to install: pip install requests) | |
""" | |
import json | |
import os | |
import requests | |
#------------------------------------------------------------------------------ | |
def attended_all(*, filename=None, conventions=None):
    """Get attendees of a specified list of conventions.
    filename = data file (valid JSON)
    conventions = list of convention numbers (e.g., ['1', '2', '3'])
    Prints first name, last name, city, state of each person who attended all
    of the listed conventions, comma-delimited.
    """
    with open(filename, 'r') as datafile:
        people = json.load(datafile)
    for person in people:
        # start from the full requested list and cross off every convention
        # this person is recorded as having attended
        remaining = list(conventions)
        for conv in person.get('Conventions', []):
            number = conv['ConventionNumber']
            if number in remaining:
                remaining.remove(number)
        # nothing left over means every requested convention was attended
        if not remaining:
            print(','.join([person.get('FirstName', ''),
                            person.get('LastName', ''),
                            person.get('City', ''),
                            person.get('State', '')]))
#------------------------------------------------------------------------------ | |
def attended_one(*, filename=None, convention=None):
    """Get attendees of a specified convention.
    filename = data file (valid JSON)
    convention = convention number (e.g., '1')
    Prints first name, last name, city, state of each attendee, comma-delimited.
    """
    with open(filename, 'r') as datafile:
        people = json.load(datafile)
    for person in people:
        convlist = person.get('Conventions', [])
        # any() stops at the first matching convention record
        if any(item['ConventionNumber'] == convention for item in convlist):
            print(','.join([person.get('FirstName', ''),
                            person.get('LastName', ''),
                            person.get('City', ''),
                            person.get('State', '')]))
#------------------------------------------------------------------------------ | |
def download_data(url=None, filename=None):
    """Download data file and save a local copy.
    url = source URL to fetch with a plain GET
    filename = local file the response body is written to (overwritten)
    Prints the destination filename and its size when done.
    """
    print('Downloading data from {0} ...'.format(url))
    # a timeout prevents the script from hanging forever if the host
    # accepts the connection but never sends a response
    response = requests.get(url, timeout=60)
    with open(filename, 'w') as fhandle:
        fhandle.write(response.text)
    filesize = os.stat(filename).st_size
    print('Data saved to {0} ({1} bytes)'.format(filename, filesize))
#------------------------------------------------------------------------------ | |
def fixups(infile=None, outfile=None):
    """Apply various fixups to the live data file.
    infile = input filename
    outfile = output filename (if omitted, input file is modified in place)
    See below for the specific changes made. This function makes the edits
    needed for the data file to be loaded/parsed correctly by the json
    module of the Python standard library.
    """
    if not infile:
        print('ERROR: fixups() called with no input file.')
        return
    if not outfile:
        outfile = infile
    filesize = os.stat(infile).st_size
    print('Applying fixups to ' + infile + ', BEFORE size = ' + str(filesize))
    with open(infile, 'r') as inputfile:
        file_content = inputfile.read() # read the input file into a variable
    # write the outputfile with all the necessary changes
    with open(outfile, 'w') as outputfile:
        outputfile.write('[\n') # put [ at beginning of file to create a list
        # The raw data has extraneous trailing commas at the end of some
        # elements, which the strict JSON parser in Python won't accept.
        # To remove these, we take two passes through every line of the file:
        #
        # 1st pass = create an element_ends set of the line numbers of all
        #            end-of-element lines (i.e., lines starting with ] or })
        #
        # 2nd pass = write the output file, and if a line precedes an
        #            end-of-element line and has a trailing comma, remove
        #            the trailing comma.
        #
        # element_ends is a set (not a list) so the per-line membership test
        # in the 2nd pass is O(1) instead of O(n) — a list makes the whole
        # fixup quadratic in the number of lines.
        element_ends = set() # end-of-element line numbers
        for lineno, line in enumerate(file_content.split('\n')):
            stripped_line = line.strip()
            if stripped_line and stripped_line[0] in ('}', ']'):
                element_ends.add(lineno)
        # max(..., 1) guards against ZeroDivisionError on a file that
        # contains no newlines at all
        total_lines = max(file_content.count('\n'), 1)
        progressbar.lastdisplay = ''
        for lineno, line in enumerate(file_content.split('\n')):
            if lineno % 100 == 0:
                progressbar(lineno/total_lines)
            if line and lineno + 1 in element_ends and line[-1] == ',':
                outputfile.write(line[:-1] + '\n')
            else:
                outputfile.write(line + '\n')
        progressbar(1)
        outputfile.write('\n]') # put ] at end of file to close the list
    filesize = os.stat(outfile).st_size
    print('Fixed-up file = ' + outfile + ', size = ' + str(filesize))
#------------------------------------------------------------------------------ | |
def progressbar(progress):
    """Display progress bar showing completion status.
    progress = current progress, as a value between 0 and 1.
    Redraws in place with a carriage return, and only re-prints when the
    rendered bar has changed since the last call. Callers may reset the
    display state with progressbar.lastdisplay = '' before a new task.
    """
    done = int(50*progress)
    todo = 50 - done
    displaystr = '[' + done*'>' + todo*'-' + ']'
    # we only allow for increasing % done, so when it gets to 100% add a
    # newline ...
    if displaystr == '[' + 50*'>' + ']':
        displaystr += '\n'
    # getattr with a default avoids an AttributeError if no caller has
    # initialized progressbar.lastdisplay before the first call
    if getattr(progressbar, 'lastdisplay', None) != displaystr:
        print('\r' + displaystr, end='')
        progressbar.lastdisplay = displaystr
#------------------------------------------------------------------------------ | |
def remove_blanks(infile=None, outfile=None):
    """Remove blank lines from a text file.
    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)
    Writes every non-blank line of infile to outfile and prints a summary
    of how many lines were kept and how many blank lines were dropped.
    """
    print('Removing blank lines from {0}'.format(infile))
    # max(..., 1) avoids ZeroDivisionError on an empty input file
    filesize = max(os.stat(infile).st_size, 1)
    blanklines = 0
    nonblanklines = 0
    bytesdone = 0
    progressbar.lastdisplay = ''
    # 'with' guarantees both files are closed even if an exception occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            if line.strip():
                nonblanklines += 1
                outputfile.write(line)
            else:
                blanklines += 1
        progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} blank lines removed.'. \
        format(nonblanklines, outfile, blanklines))
#------------------------------------------------------------------------------ | |
def remove_fields(infile=None, outfile=None):
    """Remove un-used fields with escaping issues.
    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)
    NOTE: by removing these lines brute-force, we create some invalid
    commas at the end of elements, but those will be removed in fixups().
    """
    # a set gives O(1) membership tests in the per-line loop below
    badfields = {'"BadgeName"', '"Address1"', '"Address2"', '"Note"',
                 '"ExtraNote"', '"PaymentNote"'}
    print('Removing fields from {0}'.format(infile))
    # max(..., 1) avoids ZeroDivisionError on an empty input file
    filesize = max(os.stat(infile).st_size, 1)
    removed = 0
    keeplines = 0
    bytesdone = 0
    progressbar.lastdisplay = ''
    # 'with' guarantees both files are closed even if an exception occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            # a field line like '"Note": "...",' has the quoted field name
            # before the first colon
            if line.strip().split(':')[0] in badfields:
                removed += 1
            else:
                keeplines += 1
                outputfile.write(line)
        progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} lines removed.'. \
        format(keeplines, outfile, removed))
# code to execute if running standalone ----------------------------------------
if __name__ == '__main__':
    URL = 'http://www.kolvir.com/MISC/NorwesconMembership1-20-2.txt'
    FILENAME01 = 'data01-raw.json'
    FILENAME02 = 'data02-noblanks.json'
    FILENAME03 = 'data03-removedfields.json'
    FILENAME04 = 'data04-fixedup.json'
    # one-time data-scrubbing pipeline, left commented out for reference
    #download_data(url=URL, filename=FILENAME01)
    #remove_blanks(infile=FILENAME01, outfile=FILENAME02)
    #remove_fields(infile=FILENAME02, outfile=FILENAME03)
    #fixups(infile=FILENAME03, outfile=FILENAME04)
    # manual step: added a couple of missing closing braces, as reported by the
    # JSON parser in Python (they may have been removed by the brute-force
    # approach of some of the steps above)
    #attended_one(filename=FILENAME04, convention='0')
    # convention numbers are stored as strings in the data file
    ALL_CONVENTIONS = [str(num) for num in range(20)]
    attended_all(filename=FILENAME04, conventions=ALL_CONVENTIONS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment