# glamp/buildingtheclassifier.py

## buildingtheclassifier.py
# building the classifier
from sklearn.ensemble import RandomForestClassifier

# Keeping every model input in a single list makes it easy to select the same columns
# for fitting and for predicting. All of our inputs are fuzzy scores that were generated
# with fuzzywuzzy; they are grouped per field here and joined in the original column
# order (name scores first, then street scores).
_name_scores = ['name_ratio', 'name_token_sort_ratio', 'name_partial_ratio']
_street_scores = ['street_ratio', 'street_token_sort_ratio', 'street_partial_ratio']
features = _name_scores + _street_scores

# Train a RandomForest on the fuzzy-score columns. The target is the `match` column, a
# 0/1 flag saying whether a given set of names/addresses is actually the same record.
# fit() returns the fitted estimator, so construction and training are chained here.
clf = RandomForestClassifier().fit(df[features], df['match'])

# Quick look at the results only: the model is scored on the very rows it was trained
# on, so this table is NOT cross-validated.
in_sample_pred = clf.predict(df[features])
pd.crosstab(in_sample_pred, df['match'])
	# building the classifier
	from sklearn.ensemble import RandomForestClassifier

	# Keeping every model input in a single list makes it easy to select the same columns
	# for fitting and for predicting. All of our inputs are fuzzy scores that were generated
	# with fuzzywuzzy; they are grouped per field here and joined in the original column
	# order (name scores first, then street scores).
	_name_scores = ['name_ratio', 'name_token_sort_ratio', 'name_partial_ratio']
	_street_scores = ['street_ratio', 'street_token_sort_ratio', 'street_partial_ratio']
	features = _name_scores + _street_scores

	# Train a RandomForest on the fuzzy-score columns. The target is the `match` column, a
	# 0/1 flag saying whether a given set of names/addresses is actually the same record.
	# fit() returns the fitted estimator, so construction and training are chained here.
	clf = RandomForestClassifier().fit(df[features], df['match'])

	# Quick look at the results only: the model is scored on the very rows it was trained
	# on, so this table is NOT cross-validated.
	in_sample_pred = clf.predict(df[features])
	pd.crosstab(in_sample_pred, df['match'])