Created
April 5, 2016 15:16
-
-
Save gbinal/ac15ced3194902d0d6e56850a56c13bf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
import json | |
| |
def readData(inputFile): | |
outList = [] | |
with open(inputFile, 'rU') as infile: | |
reader = csv.reader(infile) | |
firstRow = True | |
for row in reader: | |
if firstRow == True: | |
firstRow = False | |
continue | |
else: | |
outList.append(row) | |
return outList | |
| |
def writeJson(inputData, fileName): | |
with open(fileName, 'w+') as outfile: | |
json.dump(inputData, outfile, indent = 4) | |
| |
def makeAgencyOutput(inputList, errorDict, errorTypeDict): | |
output = [] | |
for row in inputList: | |
subSet = row[1] | |
subDict = collections.OrderedDict({}) | |
subDict['Agency'] = row[0] | |
subDict['Errors'] = errorDict[row[0]] | |
for key, value in errorTypeDict.items(): | |
k = key | |
try: | |
subDict[k] = subSet[value] | |
except KeyError: | |
subDict[k] = 0 | |
except TypeError: | |
subDict[k] = 0 | |
output.append(subDict) | |
return output | |
| |
def getKey(item): | |
return item[0] | |
| |
def trimErrorField(errorField): | |
pieces = re.split('.*(Guideline)', errorField) | |
shortened = pieces[-1] | |
pieces = shortened.split('.') | |
num = pieces[0] | |
return num | |
| |
def categorize(dataset, referenceDict, colNum, altName): | |
for row in dataset: | |
if row[colNum] in referenceDict.keys(): | |
row.append(referenceDict[row[colNum]]) | |
else: | |
row.append(altName) | |
return dataset | |
| |
def countDict(dataset, colIndex): | |
output = {} | |
for row in dataset: | |
if row[colIndex] in output: | |
output[row[colIndex]] += 1 | |
else: | |
output[row[colIndex]] = 1 | |
return output | |
| |
#Read in a11y.csv for errors and domains.csv for agencies | |
ally1 = readData('a11y.csv') | |
domains = readData('domains.csv') | |
#need to remove ussm.gov, whistleblower.gov, and safeocs.gov from ally due to discrepancies between the datasets. Solve at some point | |
ally = [] | |
for row in ally1: | |
if row[0] != 'safeocs.gov' and row[0] != 'whistleblower.gov' and row[0] != 'ussm.gov': | |
ally.append(row) | |
| |
#Truncate the a11y file so that it's a bit more manageable. Need the domain name [0] and the principle [4] | |
main = [] | |
for row in ally: | |
main.append([row[0], trimErrorField(row[4])]) | |
| |
#Add the information on the agency [1] and branch [2] | |
for error in main: | |
for domain in domains: | |
if error[0] == domain[0].lower(): | |
error.append(domain[1]) | |
error.append(domain[2]) | |
| |
#Dictionaries; branches = branch lookup, errorCats = error category lookup | |
branches = {"Library of Congress":"Legislative","The Legislative Branch (Congress)":"Legislative", | |
"Government Printing Office":"Legislative","Congressional Office of Compliance":"Legislative", | |
"The Judicial Branch (Courts)":"Judicial"} | |
errorCats = {'1_4':'Color Contrast Error', '1_1':'Alt Tag Error', '4_1':'HTML/Attribute Error', '1_3':'Form Error'} | |
| |
#define branches for the 'main' and 'domains' sets, define error categories for 'main' | |
main = categorize(main, branches, -1, 'Executive') | |
domains = categorize(domains, branches, 2, 'Executive') | |
main = categorize(main, errorCats, 1, 'Other Error') | |
| |
totalErrorsByDomain = countDict(main, 0) | |
totalErrorsByAgency = countDict(main, 3) | |
| |
#createe dict of base vs. canonical domains | |
canonicals = {} | |
for row in ally: | |
try: | |
if row[0] in canonicals.keys(): | |
continue | |
else: | |
canonicals[row[0]] = row[1] | |
except KeyError: | |
continue | |
| |
| |
noErrors = [] | |
errors = [] | |
for domain in domains: | |
if not domain[0].lower() in totalErrorsByDomain.keys(): | |
noErrors.append(domain) | |
else: | |
errors.append(domain) | |
| |
for row in noErrors: | |
row.append(0) | |
row.append({}) | |
try: | |
if row[0] in canonicals.keys(): | |
row.append('http://' + canonicals[row[0].lower()]) | |
else: | |
row.append('http://' + row[0].lower()) | |
except TypeError: | |
continue | |
| |
for row in errors: | |
row.append(totalErrorsByDomain[row[0].lower()]) | |
subset = [] | |
for line in main: | |
if line[0] == row[0].lower(): | |
subset.append(line) | |
errorDict = countDict(subset, -1) | |
row.append(errorDict) | |
try: | |
if row[0] in canonicals.keys(): | |
row.append('http://' + canonicals[row[0].lower()]) | |
else: | |
row.append('http://' + row[0].lower()) | |
except TypeError: | |
continue | |
| |
domains = errors + noErrors | |
domains = sorted(domains, key = getKey) | |
| |
dictList = [] | |
for row in domains: | |
subDict = collections.OrderedDict({}) | |
subDict['agency'] = row[2] | |
subDict['branch'] = row[5] | |
subDict['canonical'] = row[8] | |
subDict['domain'] = row[0].lower() | |
subDict['errors'] = row[6] | |
subDict['errorlist'] = row[7] | |
dictList.append(subDict) | |
| |
finalDict = {} | |
finalDict['data'] = dictList | |
| |
writeJson(finalDict, 'domains.json') | |
| |
agencyList = [] | |
for row in main: | |
if row[3] in agencyList: | |
continue | |
else: | |
agencyList.append(row[3]) | |
| |
agencyErrorSets = [] | |
for agency in agencyList: | |
subList = [] | |
sub = {} | |
for row in main: | |
if row[3] == agency: | |
if row[-1] in sub: | |
sub[row[-1]] += 1 | |
else: | |
sub[row[-1]] = 1 | |
subList.append(agency) | |
subList.append(sub) | |
agencyErrorSets.append(subList) | |
| |
errorTypes = {'Color Contrast Errors':'Color Constrast Error', 'HTML/Attribute Errors':'HTML/Attribute Error', | |
'Form Errors':'Form Error', 'Alt Tag Errors':'Alt Tag Error', 'Other Errors':'Other Error'} | |
| |
output = makeAgencyOutput(agencyErrorSets, agencyErrorDict, errorTypes) | |
finalOutput = {} | |
finalOutput['data'] = output | |
| |
writeJson(finalOutput, 'agencies.json') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment