Skip to content

Instantly share code, notes, and snippets.

@geramirez
Created October 23, 2014 20:11
Show Gist options
  • Save geramirez/1355bd82ef47cbe5d1b3 to your computer and use it in GitHub Desktop.
Save geramirez/1355bd82ef47cbe5d1b3 to your computer and use it in GitHub Desktop.
layer with us contacts
#Update Yaml files with usa_id and description
import yaml
import xlrd
import os
import json
from glob import glob
import re
from fuzzywuzzy import fuzz
# Matches a parenthesised chunk of text and captures what is between the
# parentheses (non-greedy), e.g. "ABC" in "Some Agency (ABC)".
# Written as a raw string so the backslashes reach the regex engine verbatim
# instead of being treated as (invalid) string escape sequences.
# A stricter alternative that only accepts word characters: r'\((\w+)\)'
ACRONYM_FINDER = re.compile(r'\((.*?)\)')
def float_to_int_str(number):
    """Return a float as the string of its integer part; pass anything else through.

    Used by merge_data() to normalise the spreadsheet's 'usa_id' value before
    comparing it with the ids loaded from JSON.

    Args:
        number: Any value. Floats (including float subclasses) are converted.

    Returns:
        ``str(int(number))`` when ``number`` is a float (the fractional part is
        truncated, e.g. ``49015.0`` -> ``"49015"``); otherwise ``number`` itself,
        unchanged.

    Raises:
        ValueError / OverflowError: If ``number`` is NaN or infinite, because
            ``int()`` cannot convert those.
    """
    # isinstance (rather than an exact type comparison) also covers float
    # subclasses, which would otherwise slip through unconverted.
    if isinstance(number, float):
        return str(int(number))
    return number
def extract_acronym(usa_name):
    """Pull the parenthesised acronym out of an office name.

    Args:
        usa_name: Office name that may contain text in parentheses.

    Returns:
        The text inside the parentheses when exactly one parenthesised group
        is present; the string ``"None"`` (not the ``None`` object) when there
        is no such group; the string ``"Massive Error"`` when there is more
        than one.
    """
    matches = ACRONYM_FINDER.findall(usa_name)
    if not matches:
        return "None"
    if len(matches) > 1:
        return "Massive Error"
    return matches[0]
def return_closest(name_usacontacts, all_usa_data_keys, min_score=80):
    """Find the candidate name most similar to ``name_usacontacts``.

    Any parenthesised text is removed from ``name_usacontacts`` first, then
    every candidate is scored with ``fuzz.ratio`` and the highest-scoring one
    at or above ``min_score`` wins (the first such candidate on ties).

    Args:
        name_usacontacts: Name to look up.
        all_usa_data_keys: Iterable of candidate names.
        min_score: Minimum ``fuzz.ratio`` score a candidate must reach to be
            accepted. Defaults to 80, the previously hard-coded cutoff.

    Returns:
        A dict with keys ``'name_usacontacts'``, ``'name_all_usa_data'`` and
        ``'score'``. On success ``'name_usacontacts'`` holds the
        parenthesis-stripped form of the input (not the original string) and
        ``'name_all_usa_data'`` the winning candidate. When no candidate
        qualifies, both names are the string ``"none"`` and the score is 0.
    """
    # Compare on the name without its "(...)" part.
    name_usacontacts = ACRONYM_FINDER.sub("", name_usacontacts)
    best_match = {
        'name_usacontacts': "none",
        'name_all_usa_data': 'none',
        'score': 0,
    }
    for name_all_usa_data in all_usa_data_keys:
        score = fuzz.ratio(name_all_usa_data, name_usacontacts)
        # Strict ">" keeps the earliest candidate when scores tie.
        if score >= min_score and score > best_match['score']:
            best_match = {
                'name_usacontacts': name_usacontacts,
                'name_all_usa_data': name_all_usa_data,
                'score': score,
            }
    return best_match
def load_all_usa_data():
    """Load the English office records from ``usagov-data/all_usa_data.json``.

    Only entries whose ``'Language'`` value is ``"en"`` are kept.

    Returns:
        A dict keyed by each kept entry's ``'Name'``. Every value is a dict
        with ``'description'`` (``'No Description'`` when the entry has no
        ``'Description'``), ``'id'`` (``'No Id'`` when the entry has no
        ``'Id'``) and ``'acronym'`` (result of ``extract_acronym`` on the
        name). A later entry with the same name replaces an earlier one.

    Raises:
        KeyError: If a kept entry has no ``'Name'`` key.
    """
    with open('usagov-data/all_usa_data.json', 'r') as source:
        offices = json.load(source)

    english_offices = {}
    for entry in offices:
        if entry.get('Language') != "en":
            continue
        office_name = entry['Name']
        english_offices[office_name] = {
            'description': entry.get('Description', 'No Description'),
            'id': entry.get('Id', 'No Id'),
            'acronym': extract_acronym(office_name),
        }
    return english_offices
def load_usacontacts():
    """Read every sheet of ``usagov-data/usacontacts.xls`` into one dict.

    Row 0 of each sheet supplies the column names (they are also printed);
    each following row becomes a ``{column name: cell value}`` dict.

    Returns:
        A dict mapping each row's ``'fh_name'`` cell to that row's dict. Rows
        sharing an ``'fh_name'`` overwrite one another, last one wins.

    Raises:
        KeyError: If a sheet with data rows has no ``'fh_name'`` column.
    """
    xls_path = "usagov-data/usacontacts.xls"
    book = xlrd.open_workbook(xls_path)

    contacts = {}
    for sheet_name in book.sheet_names():
        current = book.sheet_by_name(sheet_name)
        columns = range(current.ncols)
        header_names = [current.cell_value(0, col) for col in columns]
        print(header_names)
        for row_num in range(1, current.nrows):
            record = {}
            for col in columns:
                record[header_names[col]] = current.cell_value(row_num, col)
            contacts[record['fh_name']] = record
    return contacts
def _match_office(name_usacontacts, contact, all_usa_data):
    """Return the all_usa_data record matching one contact, or None.

    Tries, in order, and stops at the first success:
      1. exact name match,
      2. equality between the contact's 'usa_id' (normalised with
         float_to_int_str) and a record's 'id' (first record found wins),
      3. fuzzy name match via return_closest.
    """
    # 1. Exact name match.
    if name_usacontacts in all_usa_data:
        return all_usa_data[name_usacontacts]

    # 2. Match on id.
    # NOTE(review): this compares with plain ==; if the JSON ids are ints
    # rather than strings they will never equal the stringified usa_id --
    # confirm against the data.
    current_id = float_to_int_str(contact['usa_id'])
    for office in all_usa_data.values():
        if office['id'] == current_id:
            return office

    # 3. If all else fails, try a fuzzy name search.
    closest_match = return_closest(name_usacontacts, all_usa_data.keys())
    if closest_match['name_usacontacts'] != "none":
        return all_usa_data[closest_match['name_all_usa_data']]

    return None


def merge_data():
    """Attach a 'description' to every contact that can be matched to an office.

    Loads the contacts (load_usacontacts) and the office data
    (load_all_usa_data), then for each contact looks for a matching office by
    exact name, then by id, then by fuzzy name. Each contact is matched and
    counted at most once: an earlier stage that succeeds prevents later stages
    from running, so a fuzzy hit can no longer overwrite an id match and the
    printed total is not inflated by repeated matches.

    Returns:
        The contacts dict (name -> row dict); matched rows gain a
        'description' key, unmatched rows are left untouched.

    Raises:
        KeyError: If a contact without an exact name match has no 'usa_id'.
    """
    usacontacts = load_usacontacts()
    all_usa_data = load_all_usa_data()
    counter = 0
    for name_usacontacts, contact in usacontacts.items():
        office = _match_office(name_usacontacts, contact, all_usa_data)
        if office is not None:
            contact['description'] = office['description']
            counter += 1
    print("total merged: ", counter)
    return usacontacts
def _apply_contact(entry, data):
    """Copy 'description' and 'usa_id' from the contact named like ``entry``.

    Looks up ``entry['name']`` in ``data``; when found, the contact is removed
    from ``data`` and its values are written onto ``entry``. Does nothing when
    there is no contact with that name.
    """
    name = entry['name']
    if name not in data:
        return
    contact = data.pop(name)
    entry['description'] = contact.get('description', "No Description")
    entry['usa_id'] = contact['usa_id']


def patch_yaml():
    """Write description and usa_id from the merged contacts into data/*.yaml.

    For every YAML file in the ``data`` directory, the top-level mapping and
    each item under its ``'departments'`` key are patched from the contact
    with the same ``'name'``, and the file is rewritten in block style. Every
    contact that gets used is removed from the working dict, so the two
    printed counts show how many contacts existed and how many were left
    unused.

    Raises:
        KeyError: If a YAML mapping lacks ``'name'`` or a used contact lacks
            ``'usa_id'``.
    """
    data = merge_data()
    print("Number of key initially: ", len(data))
    for filename in glob("data" + os.sep + "*.yaml"):
        with open(filename) as f:
            # safe_load instead of yaml.load: calling yaml.load without an
            # explicit Loader is rejected by current PyYAML releases, and
            # these files are plain data that needs no Python-specific tags.
            yaml_data = yaml.safe_load(f)
        _apply_contact(yaml_data, data)
        # Tolerate files with no (or an empty) 'departments' entry.
        for internal_data in yaml_data.get('departments') or []:
            _apply_contact(internal_data, data)
        with open(filename, 'w') as f:
            f.write(yaml.dump(yaml_data, default_flow_style=False, allow_unicode=True))
    print("Number of keys after yaml update: ", len(data))
# Unguarded module-level call: the whole merge-and-patch run starts as soon as
# this file is executed (or imported).
patch_yaml()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment