Skip to content

Instantly share code, notes, and snippets.

@dmahugh
Last active May 18, 2016 16:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dmahugh/212af086397ab9a07c19f72f4a09bbdf to your computer and use it in GitHub Desktop.
Save dmahugh/212af086397ab9a07c19f72f4a09bbdf to your computer and use it in GitHub Desktop.
download the Norwescon attendance data, remove blank lines
"""Functions for manipulating Norwescon attendance data.
Query functions:
attended_all() ---------> Get attendees of a specified list of conventions.
attended_one() ---------> Get attendees of a specified convention.
Data-scrubbing functions:
download_data() --------> Download data file and save a local copy.
fixups() ---------------> Apply various fixups to the live data file.
progressbar() ----------> Display progress bar showing completion status.
remove_blanks() --------> Remove blank lines from a text file.
remove_fields() --------> Remove un-used fields with escaping issues.
Dependencies:
- Python 3.x (written with Python 3.5)
- requests module (to install: pip install requests)
"""
import json
import os
import requests
#------------------------------------------------------------------------------
def attended_all(*, filename=None, conventions=None):
    """Get attendees of a specified list of conventions.

    filename = data file (valid JSON containing a list of person records)
    conventions = list of convention numbers (e.g., ['1', '2', '3'])

    Prints first name, last name, city, state of each person who attended all
    of the listed conventions, comma-delimited. Duplicate entries in
    conventions are ignored. If conventions is None or empty, no convention
    is required and every person in the file is printed.
    """
    # A set makes the "attended all of them" test a simple subset check and
    # collapses duplicates; "or ()" keeps the default of None from raising.
    required = set(conventions or ())
    with open(filename, 'r') as datafile:
        jsondata = json.load(datafile)
    for person in jsondata:
        # Records lacking a ConventionNumber are skipped rather than
        # aborting the whole report with a KeyError.
        attended = {convitem['ConventionNumber']
                    for convitem in person.get('Conventions', [])
                    if 'ConventionNumber' in convitem}
        # the person qualifies when nothing required is missing
        if required <= attended:
            print(','.join(person.get(field, '') for field in
                           ('FirstName', 'LastName', 'City', 'State')))
#------------------------------------------------------------------------------
def attended_one(*, filename=None, convention=None):
    """Get attendees of a specified convention.

    filename = data file (valid JSON containing a list of person records)
    convention = convention number (e.g., '1')

    Prints first name, last name, city, state of each attendee,
    comma-delimited. Convention records that have no ConventionNumber are
    ignored.
    """
    with open(filename, 'r') as datafile:
        jsondata = json.load(datafile)
    for person in jsondata:
        # any() stops at the first matching record; the key test keeps an
        # incomplete record from aborting the report with a KeyError
        attended = any('ConventionNumber' in convitem
                       and convitem['ConventionNumber'] == convention
                       for convitem in person.get('Conventions', []))
        if attended:
            print(','.join(person.get(field, '') for field in
                           ('FirstName', 'LastName', 'City', 'State')))
#------------------------------------------------------------------------------
def download_data(url=None, filename=None, timeout=60):
    """Download data file and save a local copy.

    url = address of the data file to download
    filename = local file to write (overwritten if it already exists)
    timeout = seconds to wait for the server before giving up

    Raises requests.HTTPError if the server answers with an error status,
    so that an error page is never saved as if it were the data.
    """
    print('Downloading data from {0} ...'.format(url))
    # without a timeout, requests waits indefinitely on a stalled server
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(filename, 'w') as fhandle:
        fhandle.write(response.text)
    filesize = os.stat(filename).st_size
    print('Data saved to {0} ({1} bytes)'.format(filename, filesize))
#------------------------------------------------------------------------------
def fixups(infile=None, outfile=None):
    """Apply various fixups to the live data file.

    infile = input filename
    outfile = output filename (if omitted, input file is modified in place)

    This function makes the edits needed for the data file to be
    loaded/parsed correctly by the json module of the Python standard
    library:
    - wraps the whole content in [ ... ] to create a list
    - removes a trailing comma from any line that immediately precedes an
      end-of-element line (a line starting with ] or })

    Prints an error and returns without doing anything if infile is empty.
    """
    if not infile:
        print('ERROR: fixups() called with no input file.')
        return
    if not outfile:
        outfile = infile
    filesize = os.stat(infile).st_size
    print('Applying fixups to ' + infile + ', BEFORE size = ' + str(filesize))

    # The input is read completely before the output is opened, which is
    # what makes in-place modification (outfile == infile) safe.
    with open(infile, 'r') as inputfile:
        lines = inputfile.read().split('\n')

    # The raw data has extraneous trailing commas at the end of some
    # elements, which the strict JSON parser in Python won't accept.
    # To remove these, we take two passes through every line of the file.
    #
    # 1st pass = collect the line numbers of all end-of-element lines
    #            (i.e., lines starting with ] or }); a set keeps the
    #            per-line lookup in the 2nd pass constant-time.
    element_ends = {lineno for lineno, line in enumerate(lines)
                    if line.lstrip().startswith(('}', ']'))}

    # number of newlines in the input; at least 1 so that a file without
    # any newline cannot cause a division by zero below
    total_lines = max(len(lines) - 1, 1)

    # 2nd pass = write the output file, and if a line precedes an
    #            end-of-element line and has a trailing comma, remove
    #            the trailing comma.
    progressbar.lastdisplay = ''
    with open(outfile, 'w') as outputfile:
        outputfile.write('[\n') # put [ at beginning of file to create a list
        for lineno, line in enumerate(lines):
            if lineno % 100 == 0:
                progressbar(lineno/total_lines)
            # ignore trailing whitespace so that ", " is caught as well
            trimmed = line.rstrip()
            if trimmed.endswith(',') and lineno + 1 in element_ends:
                outputfile.write(trimmed[:-1] + '\n')
            else:
                outputfile.write(line + '\n')
        progressbar(1)
        outputfile.write('\n]') # put ] at end of file to close the list

    filesize = os.stat(outfile).st_size
    print('Fixed-up file = ' + outfile + ', size = ' + str(filesize))
#------------------------------------------------------------------------------
def progressbar(progress):
    """Display progress bar showing completion status.

    1st parameter = current progress, as a value between 0 and 1; values
                    outside that range are clamped to it.

    The bar is 50 characters wide and is redrawn in place (carriage return,
    no newline) only when its appearance changes. The last string displayed
    is remembered in the function attribute progressbar.lastdisplay; callers
    reset it to '' before starting a new bar.
    """
    # clamp, so an over- or undershooting caller still gets a well-formed
    # bar and the closing newline at completion
    progress = min(max(progress, 0), 1)
    done = int(50*progress)
    displaystr = '[' + done*'>' + (50 - done)*'-' + ']'
    # we only allow for increasing % done, so when it gets to 100% add a
    # newline ...
    if done == 50:
        displaystr += '\n'
    # getattr: tolerate a first call made before lastdisplay was initialised
    if getattr(progressbar, 'lastdisplay', None) != displaystr:
        # flush because the partial line has no newline to trigger it
        print('\r' + displaystr, end='', flush=True)
        progressbar.lastdisplay = displaystr
#------------------------------------------------------------------------------
def remove_blanks(infile=None, outfile=None):
    """Remove blank lines from a text file.

    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)

    A line counts as blank when it is empty or contains only whitespace.
    outfile is truncated as soon as it is opened, so it must not be the
    same file as infile. Prints a progress bar and a summary of the number
    of lines written and removed.
    """
    print('Removing blank lines from {0}'.format(infile))
    filesize = os.stat(infile).st_size
    blanklines = 0
    nonblanklines = 0
    bytesdone = 0 # characters read so far; used as an estimate of bytes
    progressbar.lastdisplay = ''
    # the with statement closes both files even if an error occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            if line.strip():
                nonblanklines += 1
                outputfile.write(line)
            else:
                blanklines += 1
    progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} blank lines removed.'. \
        format(nonblanklines, outfile, blanklines))
#------------------------------------------------------------------------------
def remove_fields(infile=None, outfile=None):
    """Remove un-used fields with escaping issues.

    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)

    Every line whose text before the first colon is one of the quoted field
    names listed below is dropped; all other lines are copied unchanged.
    outfile is truncated as soon as it is opened, so it must not be the
    same file as infile.

    NOTE: by removing these lines brute-force, we create some invalid
    commas at the end of elements, but those will be removed in fixups().
    """
    badfields = {'"BadgeName"', '"Address1"', '"Address2"', '"Note"',
                 '"ExtraNote"', '"PaymentNote"'}
    print('Removing fields from {0}'.format(infile))
    filesize = os.stat(infile).st_size
    removed = 0
    keeplines = 0
    bytesdone = 0 # characters read so far; used as an estimate of bytes
    progressbar.lastdisplay = ''
    # the with statement closes both files even if an error occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            # strip after splitting so '"Note" : ...' is recognized too
            fieldname = line.partition(':')[0].strip()
            if fieldname in badfields:
                removed += 1
            else:
                keeplines += 1
                outputfile.write(line)
    progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} lines removed.'. \
        format(keeplines, outfile, removed))
# code to execute if running standalone ----------------------------------------
if __name__ == '__main__':
    URL = 'http://www.kolvir.com/MISC/NorwesconMembership1-20-2.txt'
    FILENAME01 = 'data01-raw.json'
    FILENAME02 = 'data02-noblanks.json'
    FILENAME03 = 'data03-removedfields.json'
    FILENAME04 = 'data04-fixedup.json'

    # Data-scrubbing pipeline; each step reads the previous step's output.
    # The steps are left commented out here, so running the script only
    # queries FILENAME04, which must already exist.
    #download_data(url=URL, filename=FILENAME01)
    #remove_blanks(infile=FILENAME01, outfile=FILENAME02)
    #remove_fields(infile=FILENAME02, outfile=FILENAME03)
    #fixups(infile=FILENAME03, outfile=FILENAME04)

    # manual step: added a couple of missing closing braces, as reported by the
    # JSON parser in Python (they may have been removed by the brute-force
    # approach of some of the steps above)

    #attended_one(filename=FILENAME04, convention='0')

    # convention numbers '0' through '19', as strings
    ALL_CONVENTIONS = [str(number) for number in range(20)]
    attended_all(filename=FILENAME04, conventions=ALL_CONVENTIONS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment