# a8dx/DistrictNameMatching.py

## DistrictNameMatching.py
# -- DistrictNameMatching.py
# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
# Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
# Notes: Default number of matches currently set to 3, though can be modified as input argument.

import os
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas as pd


def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
	"""
	Fuzzy-match each district of the using file against the districts of the master file.

	A "district, state" name is built for every row of both files. Each using name is
		compared against all master names with fuzzywuzzy's process.extract, and the
		num_match best candidates are kept, together with a flag for whether the best
		candidate equals the using name exactly. The result is written to
		outFile + ".csv" and also returned.

	Manual work is then required for districts where a perfect match has not been made.

	master: CSV file containing the master list of districts
	using: CSV file containing the using list of districts, i.e., each of these districts is
			compared against the universe of districts from the master file
	master_dist: variable name pertaining to districts in master file
	master_state: variable name pertaining to states in master file
	using_dist: variable name pertaining to districts in using file
	using_state: variable name pertaining to states in using file
	outFile: path and filename of the output without extension - ".csv" is appended
	num_match: number of matches generated per using district, default is 3 (must be >= 1)

	Returns a pandas DataFrame with the columns Match1 .. Match<num_match>, using_original
		and perfect_match. A Match column is left empty where the master file holds
		fewer than num_match districts.

	Raises ValueError if num_match is smaller than 1.
	"""

	# Match1 is needed for the perfect_match flag, so fail early and clearly
	if num_match < 1:
		raise ValueError("num_match must be at least 1")

	# sep=None lets pandas detect the delimiter, which only the python engine can do;
	# naming the engine explicitly avoids the fallback warning of the C engine
	master_dists = pd.read_csv(master, quotechar = '"', skipinitialspace = True, sep = None, engine = 'python')
	print(" *** Now printing column values for master file *** ")
	print(list(master_dists.columns.values))

	using_dists = pd.read_csv(using, quotechar = '"', skipinitialspace = True, sep = None, engine = 'python')
	print(" *** Now printing column values for using file *** ")
	print(list(using_dists.columns.values))

	# -- concatenate district and state names
	# both parts are converted to str so numeric or missing values cannot break the concatenation
	master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state].map(str)
	using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state].map(str)

	# -- best num_match candidates for every using name
	# each candidate is a tuple whose first element is the matched master name
	master_choices = list(master_names)
	candidates = [process.extract(name, master_choices, limit=num_match) for name in using_names]

	# -- one output column per match rank
	columns = {}
	for rank in range(num_match):
		columns["Match{0}".format(rank + 1)] = [
			found[rank][0] if rank < len(found) else None
			for found in candidates
		]

	columns['using_original'] = using_names

	# -- flag using names whose best candidate is an exact string match
	columns['perfect_match'] = [
		best == original
		for best, original in zip(columns['Match1'], using_names)
	]

	out = pd.DataFrame(columns)
	out.to_csv(outFile + ".csv")
	print("******************************************")
	print("*** Your analysis has been completed! *** ")
	print("******************************************")

	return out


"""
BASIC FILES/PATHS WHOSE USE IS REPEATED
"""


# Root folder that holds the input CSVs; replace the "<insert path>" placeholder before running.
baseDir = os.path.join("<insert path>")


# Folder that receives the match results, a subfolder of baseDir.
outDir = os.path.join(baseDir, "Matched_Results")

# Create the results folder if it is not there yet.
if not os.path.exists(outDir):
	os.makedirs(outDir)


"""
ICRISAT and 1971 Polygon borders
"""

# Both input CSVs are looked up inside baseDir.
master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
input_1971 = os.path.join(baseDir,"District_Records_1971_Clean.csv")

# Path stem of the result file inside outDir; districtMatch appends ".csv" to it.
outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
# NOTE(review): input_1971 is passed as districtMatch's `master` argument (columns NAME / STATE_UT)
# and master_file as its `using` argument (columns distname / stname), despite the
# variable name - confirm this is the intended direction of the match.
icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

# -- alternatively, don't save as a workspace object
# NOTE(review): as written, this call runs in addition to the one above, so the same
# match is computed twice and the same output file is written twice.
districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)
	# -- DistrictNameMatching.py
	# Author: Anthony Louis D'Agostino (ald2187 [at] columbia.edu)
	# Purpose: Given CSV lists of district-state name pairs, identifies the best matches using the fuzzywuzzy library
	# Notes: Default number of matches currently set to 3, though can be modified as input argument.

	import os
	import numpy as np
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process
	import pandas as pd



	def districtMatch(master, using, master_dist, master_state, using_dist, using_state, outFile, num_match = 3):
		"""
		Fuzzy-match each district of the using file against the districts of the master file.

		A "district, state" name is built for every row of both files. Each using name is
		compared against all master names with fuzzywuzzy's process.extract, and the
		num_match best candidates are kept, together with a flag for whether the best
		candidate equals the using name exactly. The result is written to
		outFile + ".csv" and also returned.

		Manual work is then required for districts where a perfect match has not been made.

		master: CSV file containing the master list of districts
		using: CSV file containing the using list of districts, i.e., each of these districts is
		compared against the universe of districts from the master file
		master_dist: variable name pertaining to districts in master file
		master_state: variable name pertaining to states in master file
		using_dist: variable name pertaining to districts in using file
		using_state: variable name pertaining to states in using file
		outFile: path and filename of the output without extension - ".csv" is appended
		num_match: number of matches generated per using district, default is 3 (must be >= 1)

		Returns a pandas DataFrame with the columns Match1 .. Match<num_match>, using_original
		and perfect_match. A Match column is left empty where the master file holds
		fewer than num_match districts.

		Raises ValueError if num_match is smaller than 1.
		"""

		# Match1 is needed for the perfect_match flag, so fail early and clearly
		if num_match < 1:
			raise ValueError("num_match must be at least 1")

		# sep=None lets pandas detect the delimiter, which only the python engine can do;
		# naming the engine explicitly avoids the fallback warning of the C engine
		master_dists = pd.read_csv(master, quotechar = '"', skipinitialspace = True, sep = None, engine = 'python')
		print(" * Now printing column values for master file * ")
		print(list(master_dists.columns.values))

		using_dists = pd.read_csv(using, quotechar = '"', skipinitialspace = True, sep = None, engine = 'python')
		print(" * Now printing column values for using file * ")
		print(list(using_dists.columns.values))

		# -- concatenate district and state names
		# both parts are converted to str so numeric or missing values cannot break the concatenation
		master_names = master_dists[master_dist].map(str) + ", " + master_dists[master_state].map(str)
		using_names = using_dists[using_dist].map(str) + ", " + using_dists[using_state].map(str)

		# -- best num_match candidates for every using name
		# each candidate is a tuple whose first element is the matched master name
		master_choices = list(master_names)
		candidates = [process.extract(name, master_choices, limit=num_match) for name in using_names]

		# -- one output column per match rank
		columns = {}
		for rank in range(num_match):
			columns["Match{0}".format(rank + 1)] = [
				found[rank][0] if rank < len(found) else None
				for found in candidates
			]

		columns['using_original'] = using_names

		# -- flag using names whose best candidate is an exact string match
		columns['perfect_match'] = [
			best == original
			for best, original in zip(columns['Match1'], using_names)
		]

		out = pd.DataFrame(columns)
		out.to_csv(outFile + ".csv")
		print("******************************************")
		print("* Your analysis has been completed! * ")
		print("******************************************")

		return out


	"""
	BASIC FILES/PATHS WHOSE USE IS REPEATED
	"""


	# Root folder that holds the input CSVs; replace the "<insert path>" placeholder before running.
	baseDir = os.path.join("<insert path>")


	# Folder that receives the match results, a subfolder of baseDir.
	outDir = os.path.join(baseDir, "Matched_Results")

	# Create the results folder if it is not there yet.
	if not os.path.exists(outDir):
		os.makedirs(outDir)





	"""
	ICRISAT and 1971 Polygon borders
	"""

	# Both input CSVs are looked up inside baseDir.
	master_file = os.path.join(baseDir, "ICRISAT_Meso_Names.csv")
	input_1971 = os.path.join(baseDir,"District_Records_1971_Clean.csv")

	# Path stem of the result file inside outDir; districtMatch appends ".csv" to it.
	outFile = os.path.join(outDir, "ICRI_1971_DistrictMatches")
	# NOTE(review): input_1971 is passed as districtMatch's `master` argument (columns NAME / STATE_UT)
	# and master_file as its `using` argument (columns distname / stname), despite the
	# variable name - confirm this is the intended direction of the match.
	icri_1971_match = districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)

	# -- alternatively, don't save as a workspace object
	# NOTE(review): as written, this call runs in addition to the one above, so the same
	# match is computed twice and the same output file is written twice.
	districtMatch(input_1971, master_file, 'NAME', 'STATE_UT', 'distname', 'stname', outFile)