Last active
May 18, 2016 16:35
-
-
Save dmahugh/212af086397ab9a07c19f72f4a09bbdf to your computer and use it in GitHub Desktop.
download the Norwescon attendance data, remove blank lines
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Functions for manipulating Norwescon attendance data. | |
Query functions: | |
attended_all() ---------> Get attendees of a specified list of conventions. | |
attended_one() ---------> Get attendees of a specified convention. | |
Data-scrubbing functions: | |
download_data() --------> Download data file and save a local copy. | |
fixups() ---------------> Apply various fixups to the live data file. | |
progressbar() ----------> Display progress bar showing completion status. | |
remove_blanks() --------> Remove non-blank lines from a text file. | |
remove_fields() --------> Remove un-used fields with escaping issues. | |
Dependencies: | |
- Python 3.x (written with Python 3.5) | |
- requests module (to install: pip install requests) | |
""" | |
import json | |
import os | |
import requests | |
#------------------------------------------------------------------------------ | |
def attended_all(*, filename=None, conventions=None):
    """Get attendees of a specified list of conventions.
    filename = data file (valid JSON)
    conventions = list of convention numbers (e.g., ['1', '2', '3'])
    Prints first name, last name, city, state of each person who attended all
    of the listed conventions, comma-delimited.
    """
    with open(filename, 'r') as datafile:
        people = json.load(datafile)
    for person in people:
        # start from the full requested list and cross off every convention
        # this person is recorded as having attended
        remaining = list(conventions)
        for conv in person.get('Conventions', []):
            number = conv['ConventionNumber']
            if number in remaining:
                remaining.remove(number)
        # nothing left over means every requested convention was attended
        if not remaining:
            print(','.join([person.get('FirstName', ''),
                            person.get('LastName', ''),
                            person.get('City', ''),
                            person.get('State', '')]))
#------------------------------------------------------------------------------ | |
def attended_one(*, filename=None, convention=None):
    """Get attendees of a specified convention.
    filename = data file (valid JSON)
    convention = convention number (e.g., '1')
    Prints first name, last name, city, state of each attendee, comma-delimited.
    """
    with open(filename, 'r') as datafile:
        people = json.load(datafile)
    for person in people:
        convlist = person.get('Conventions', [])
        # any() stops at the first matching convention record
        if any(item['ConventionNumber'] == convention for item in convlist):
            print(','.join([person.get('FirstName', ''),
                            person.get('LastName', ''),
                            person.get('City', ''),
                            person.get('State', '')]))
#------------------------------------------------------------------------------ | |
def download_data(url=None, filename=None):
    """Download data file and save a local copy.
    url = source URL to fetch with a plain GET
    filename = local file the response body is written to (overwritten)
    Prints the destination filename and its size when done.
    """
    print('Downloading data from {0} ...'.format(url))
    # a timeout prevents the script from hanging forever if the host
    # accepts the connection but never sends a response
    response = requests.get(url, timeout=60)
    with open(filename, 'w') as fhandle:
        fhandle.write(response.text)
    filesize = os.stat(filename).st_size
    print('Data saved to {0} ({1} bytes)'.format(filename, filesize))
#------------------------------------------------------------------------------ | |
def fixups(infile=None, outfile=None):
    """Apply various fixups to the live data file.
    infile = input filename
    outfile = output filename (if omitted, input file is modified in place)
    See below for the specific changes made. This function makes the edits
    needed for the data file to be loaded/parsed correctly by the json
    module of the Python standard library.
    """
    if not infile:
        print('ERROR: fixups() called with no input file.')
        return
    if not outfile:
        outfile = infile
    filesize = os.stat(infile).st_size
    print('Applying fixups to ' + infile + ', BEFORE size = ' + str(filesize))
    with open(infile, 'r') as inputfile:
        file_content = inputfile.read() # read the input file into a variable
    # write the outputfile with all the necessary changes
    with open(outfile, 'w') as outputfile:
        outputfile.write('[\n') # put [ at beginning of file to create a list
        # The raw data has extraneous trailing commas at the end of some
        # elements, which the strict JSON parser in Python won't accept.
        # To remove these, we take two passes through every line of the file:
        #
        # 1st pass = create an element_ends set of the line numbers of all
        #            end-of-element lines (i.e., lines starting with ] or })
        #
        # 2nd pass = write the output file, and if a line precedes an
        #            end-of-element line and has a trailing comma, remove
        #            the trailing comma.
        #
        # element_ends is a set (not a list) so the per-line membership test
        # in the 2nd pass is O(1) instead of O(n) — a list makes the whole
        # fixup quadratic in the number of lines.
        element_ends = set() # end-of-element line numbers
        for lineno, line in enumerate(file_content.split('\n')):
            stripped_line = line.strip()
            if stripped_line and stripped_line[0] in ('}', ']'):
                element_ends.add(lineno)
        # max(..., 1) guards against ZeroDivisionError on a file that
        # contains no newlines at all
        total_lines = max(file_content.count('\n'), 1)
        progressbar.lastdisplay = ''
        for lineno, line in enumerate(file_content.split('\n')):
            if lineno % 100 == 0:
                progressbar(lineno/total_lines)
            if line and lineno + 1 in element_ends and line[-1] == ',':
                outputfile.write(line[:-1] + '\n')
            else:
                outputfile.write(line + '\n')
        progressbar(1)
        outputfile.write('\n]') # put ] at end of file to close the list
    filesize = os.stat(outfile).st_size
    print('Fixed-up file = ' + outfile + ', size = ' + str(filesize))
#------------------------------------------------------------------------------ | |
def progressbar(progress):
    """Display progress bar showing completion status.
    progress = current progress, as a value between 0 and 1.
    Redraws in place with a carriage return, and only re-prints when the
    rendered bar has changed since the last call. Callers may reset the
    display state with progressbar.lastdisplay = '' before a new task.
    """
    done = int(50*progress)
    todo = 50 - done
    displaystr = '[' + done*'>' + todo*'-' + ']'
    # we only allow for increasing % done, so when it gets to 100% add a
    # newline ...
    if displaystr == '[' + 50*'>' + ']':
        displaystr += '\n'
    # getattr with a default avoids an AttributeError if no caller has
    # initialized progressbar.lastdisplay before the first call
    if getattr(progressbar, 'lastdisplay', None) != displaystr:
        print('\r' + displaystr, end='')
        progressbar.lastdisplay = displaystr
#------------------------------------------------------------------------------ | |
def remove_blanks(infile=None, outfile=None):
    """Remove blank lines from a text file.
    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)
    Writes every non-blank line of infile to outfile and prints a summary
    of how many lines were kept and how many blank lines were dropped.
    """
    print('Removing blank lines from {0}'.format(infile))
    # max(..., 1) avoids ZeroDivisionError on an empty input file
    filesize = max(os.stat(infile).st_size, 1)
    blanklines = 0
    nonblanklines = 0
    bytesdone = 0
    progressbar.lastdisplay = ''
    # 'with' guarantees both files are closed even if an exception occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            if line.strip():
                nonblanklines += 1
                outputfile.write(line)
            else:
                blanklines += 1
        progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} blank lines removed.'. \
        format(nonblanklines, outfile, blanklines))
#------------------------------------------------------------------------------ | |
def remove_fields(infile=None, outfile=None):
    """Remove un-used fields with escaping issues.
    1st parameter = infile (text file)
    2nd parameter = outfile (overwritten if it already exists)
    NOTE: by removing these lines brute-force, we create some invalid
    commas at the end of elements, but those will be removed in fixups().
    """
    # a set gives O(1) membership tests in the per-line loop below
    badfields = {'"BadgeName"', '"Address1"', '"Address2"', '"Note"',
                 '"ExtraNote"', '"PaymentNote"'}
    print('Removing fields from {0}'.format(infile))
    # max(..., 1) avoids ZeroDivisionError on an empty input file
    filesize = max(os.stat(infile).st_size, 1)
    removed = 0
    keeplines = 0
    bytesdone = 0
    progressbar.lastdisplay = ''
    # 'with' guarantees both files are closed even if an exception occurs
    with open(infile, 'r') as inputfile, open(outfile, 'w') as outputfile:
        for line in inputfile:
            bytesdone += len(line)
            progressbar(bytesdone/filesize)
            # a field line like '"Note": "...",' has the quoted field name
            # before the first colon
            if line.strip().split(':')[0] in badfields:
                removed += 1
            else:
                keeplines += 1
                outputfile.write(line)
        progressbar(1)
    print('')
    print('{0} lines written to {1}. {2} lines removed.'. \
        format(keeplines, outfile, removed))
# code to execute if running standalone ----------------------------------------
if __name__ == '__main__':
    URL = 'http://www.kolvir.com/MISC/NorwesconMembership1-20-2.txt'
    FILENAME01 = 'data01-raw.json'
    FILENAME02 = 'data02-noblanks.json'
    FILENAME03 = 'data03-removedfields.json'
    FILENAME04 = 'data04-fixedup.json'
    # one-time data-scrubbing pipeline, left commented out for reference
    #download_data(url=URL, filename=FILENAME01)
    #remove_blanks(infile=FILENAME01, outfile=FILENAME02)
    #remove_fields(infile=FILENAME02, outfile=FILENAME03)
    #fixups(infile=FILENAME03, outfile=FILENAME04)
    # manual step: added a couple of missing closing braces, as reported by the
    # JSON parser in Python (they may have been removed by the brute-force
    # approach of some of the steps above)
    #attended_one(filename=FILENAME04, convention='0')
    # convention numbers are stored as strings in the data file
    ALL_CONVENTIONS = [str(num) for num in range(20)]
    attended_all(filename=FILENAME04, conventions=ALL_CONVENTIONS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment