Skip to content

Instantly share code, notes, and snippets.

@a8dx
Created July 9, 2018 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.
Save a8dx/20caedaf942f1810e16994bfdb57e8dc to your computer and use it in GitHub Desktop.
Fuzzy Wuzzy String Matching Example
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
# Notes: The default number of matches is 3; it can be changed through the num_match argument.
import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd
def districtMatch(master, using, master_dist, master_state, using_dist, using_state,
                  outFile, num_match = 3):
    """
    Fuzzy-match each district-state name in the using file against the master file.

    Both inputs are CSV files. For every row, the district and state names are
    joined as "<district>, <state>". Each using name is then compared against all
    master names with fuzzywuzzy's process.extract, keeping num_match candidates,
    plus a flag for whether the first candidate equals the using name exactly.
    Manual work is then required for districts where a perfect match has not been made.

    master: file containing the master list of districts
    using: file containing the using list of districts; each of these districts is
        compared against the universe of districts from the master file
    master_dist: variable name pertaining to districts in master file
    master_state: variable name pertaining to states in master file
    using_dist: variable name pertaining to districts in using file
    using_state: variable name pertaining to states in using file
    outFile: path and filename of the output WITHOUT extension; ".csv" is appended
    num_match: number of matches generated per using district, default is 3 (must be >= 1)

    Returns the DataFrame that was written to disk, with columns
    Match1..Match<num_match>, using_original and perfect_match.

    Raises ValueError if num_match is smaller than 1.
    """
    # Fail fast, before any file is read: without at least one match column the
    # perfect_match flag below cannot be computed.
    if num_match < 1:
        raise ValueError("num_match must be at least 1, got {0!r}".format(num_match))

    # sep=None asks pandas to detect the delimiter, which only the python engine
    # supports; naming the engine avoids the implicit-fallback warning.
    master_dists = pd.read_csv(master, quotechar='"', skipinitialspace=True,
                               sep=None, engine="python")
    print(" *** Now printing column values for master file *** ")
    print(list(master_dists.columns.values))

    using_dists = pd.read_csv(using, quotechar='"', skipinitialspace=True,
                              sep=None, engine="python")
    print(" *** Now printing column values for using file *** ")
    print(list(using_dists.columns.values))

    # -- concatenate district and state names (both cast to str so numeric or
    #    missing state values cannot break the concatenation)
    master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state].map(str)
    using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state].map(str)

    # -- keep only the matched name (first element of each result tuple); pad with
    #    None when the master list yields fewer than num_match candidates
    best_names = []
    for name in using_names:
        found = [hit[0] for hit in process.extract(name, master_names, limit=num_match)]
        found.extend([None] * (num_match - len(found)))
        best_names.append(found)

    # -- one output column per match rank: Match1, Match2, ...
    d = {}
    for rank in range(num_match):
        d["Match{0}".format(rank + 1)] = [found[rank] for found in best_names]
    d['using_original'] = using_names
    # element-wise comparison of the top candidate with the original using name
    d['perfect_match'] = using_names == d['Match1']

    out = pd.DataFrame(d)
    out.to_csv(str(outFile + ".csv"))
    print("******************************************")
    print("*** Your analysis has been completed! *** ")
    print("******************************************")
    return out
# ------------------------------------------------------------------
# BASIC FILES/PATHS WHOSE USE IS REPEATED
# ------------------------------------------------------------------
# Replace the placeholder with the directory that holds the input CSV files.
baseDir = os.path.join("<insert path>")
outDir = os.path.join(baseDir, "Matched_Results")
if not os.path.exists(outDir):
    os.makedirs(outDir)

# ------------------------------------------------------------------
# ICRISAT and 1971 Polygon borders
# ------------------------------------------------------------------
master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir, "District_Records_1971_Clean.csv")
outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")

# NOTE(review): the 1971 records are passed as `master` (columns NAME/STATE_UT)
# and the file named `master_file` is passed as `using` (columns distname/stname).
# Confirm this argument order is intended.
#
# districtMatch writes "<outFile>.csv" and also returns the result, so a single
# call is enough; the return value can simply be ignored when only the file on
# disk is needed. Calling it a second time would redo the whole matching and
# overwrite the same CSV.
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment