Created
December 7, 2017 06:05
-
-
Save gavinwhyte/27e788f8f8f755f8291d7e9c4f858f3e to your computer and use it in GitHub Desktop.
Tax Machine Learning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import numpy as np | |
import nltk | |
from nltk.stem.porter import * | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.feature_extraction import DictVectorizer | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.metrics import accuracy_score, confusion_matrix | |
from sklearn import cross_validation | |
from SMOTE import * | |
import pandas as pd | |
import sys | |
# Python 2 hack: site.py removes sys.setdefaultencoding at startup; reloading
# the sys module re-exposes it so UTF-8 can be forced as the implicit
# str<->unicode codec for the whole process. (Not needed / not possible on
# Python 3, where str is already Unicode.)
reload(sys)
sys.setdefaultencoding('utf8')
# Module-level accumulators filled by process_data_pd():
#   *_data holds (content_text, one_hot_vector, amount) tuples,
#   *_keys holds the matching composite row keys, in the same order.
positive_data = []
negative_data = []
positive_keys = []
negative_keys = []
# Composite row key -> positive-class probability; filled by test_forest().
key_prob = {}
def one_hot(df, features):
    """One-hot encode the given categorical columns of a DataFrame.

    Parameters:
        df: pandas DataFrame holding the raw rows.
        features: list of column names to encode; each row's values for
            these columns are treated as one record dict.

    Returns:
        (array, vectorizer): a dense 2-D numpy array of the encoded
        features, and the fitted DictVectorizer (returned so a caller
        could encode new data with the same column mapping).
    """
    vectorizer = DictVectorizer()
    # DictVectorizer expects one {column: value} dict per row.
    encoded = vectorizer.fit_transform(df[features].to_dict('records'))
    return encoded.toarray(), vectorizer
def process_data_pd(): | |
data = pd.read_csv("data/new_training.csv") | |
data['Transaction Code'].replace(np.nan, 'NA',inplace=True) | |
data['Document Type'].replace(np.nan, 'NA',inplace=True) | |
data[' Amount (CC) '] = data[' Amount (CC) '].replace( '[\$,)]','', regex=True ).astype(float) | |
data['key'] = (data['Company Code'].astype(str) + data['Fiscal Year'].astype(str) + data['Document Nr'].astype(str) + data['Line Item'].astype(str)) | |
data['content'] = (data['Document Text'].astype(str) + ' ' + data['Item Text'].astype(str) + ' ' + data['GL Account Text'].astype(str)) | |
cat_data, _ = one_hot(data, ['Document Type', 'Transaction Code']) | |
for index, line in data.iterrows(): | |
if line["Label"] == 'Non-deductible': | |
positive_data.append((line['content'], cat_data[index], line[' Amount (CC) '])) | |
positive_keys.append(line['key']) | |
else: | |
negative_data.append((line['content'], cat_data[index], line[' Amount (CC) '])) | |
negative_keys.append(line['key']) | |
print len(positive_data), len(negative_data) | |
return data | |
def text2vec(): | |
pos_size = len(positive_data) | |
data_set = (positive_data + negative_data) | |
stemmed_data = [] | |
stemmer = PorterStemmer() | |
cat_data = [] | |
price_data = [] | |
for line in data_set: | |
cat_data.append(line[1]) | |
price_data.append(line[2]) | |
lowers = line[0].lower() | |
tokens = nltk.word_tokenize(lowers) | |
stemmed_tokens = [] | |
for item in tokens: | |
try: | |
stemmed = stemmer.stem(item) | |
stemmed_tokens.append(stemmed) | |
except: | |
pass | |
stemmed_line = ' '.join(stemmed_tokens) | |
stemmed_data.append(stemmed_line) | |
vectorizer = CountVectorizer(min_df=3, max_df=0.8) | |
X = vectorizer.fit_transform(stemmed_data) | |
#print vectorizer.get_feature_names() | |
transformer = TfidfTransformer(smooth_idf=False) | |
tfidf = transformer.fit_transform(X) | |
cat_data = np.array(cat_data) | |
price_data = np.array(price_data) | |
# print cat_data.shape, (price_data.T).shape | |
tfidf_array = tfidf.toarray() | |
data_array = np.concatenate((tfidf_array, cat_data), axis=1) | |
data_array = np.concatenate((data_array, price_data[:,None]), axis=1) | |
print tfidf_array.shape, data_array.shape | |
#return tfidf_array[:pos_size,:], tfidf_array[pos_size:,:] | |
return data_array[:pos_size,:], data_array[pos_size:,:] | |
def gen_data_set_with_smoke(pos_data):
    """Oversample the minority (positive) class with SMOTE.

    Synthesizes new positive rows from *pos_data* (arguments 20000 and 10
    are passed straight to the local SMOTE helper — presumably the
    oversampling percentage and the neighbour count k; confirm against
    SMOTE.py) and stacks them beneath the originals.

    Returns:
        2-D numpy array: original positives followed by synthetic ones.
    """
    synthetic = SMOTE(pos_data, 20000, 10)
    return np.concatenate((pos_data, synthetic))
def training_forest(data, labels): | |
best_acc = 0.0 | |
best_size = 0 | |
best_depth = 0 | |
#for depth in range(1, 10): | |
for depth in [9]: | |
#for size in [20, 40, 60, 80, 100, 120, 140, 160, 180, 200]: | |
for size in [60]: | |
clf = RandomForestClassifier(n_estimators=size, max_depth=depth, n_jobs=6) | |
scores = cross_validation.cross_val_score(clf, data, np.asarray(labels), cv=3) | |
acc = scores.mean() | |
print "size", size, ", depth:", depth, "acc:", acc | |
if acc > best_acc: | |
best_acc = acc | |
best_size = size | |
best_depth = depth | |
print "best size", best_size, ", best depth:", best_depth, "best acc", best_acc | |
clf = RandomForestClassifier(n_estimators=best_size, max_depth=best_depth, n_jobs=6) | |
clf.fit(data, np.asarray(labels)) | |
return clf | |
def test_forest(model, test_data): | |
res = model.predict(test_data) | |
print res | |
res = model.predict_proba(test_data) | |
print res | |
prob = res[:,1] | |
print prob | |
# res = np.argsort(prob) | |
# print res[::-1] | |
keys = positive_keys + negative_keys | |
for i in range(len(keys)): | |
key_prob[keys[i]] = prob[i] | |
# print key_prob | |
def output_res_new(data):
    """Attach each row's predicted probability (looked up by its composite
    key in the module-level key_prob dict) and dump the DataFrame to
    data/res.csv."""
    probabilities = data['key'].map(key_prob)
    data['prob'] = probabilities
    data.to_csv('data/res.csv')
def main(): | |
dt = process_data_pd() | |
pos_data, neg_data = text2vec() | |
new_pos_data = gen_data_set_with_smoke(pos_data) | |
print new_pos_data.shape | |
data = np.concatenate((new_pos_data, neg_data)) | |
label = ['1']*new_pos_data.shape[0] + ['0']*neg_data.shape[0] | |
model = training_forest(data, label) | |
test_data = np.concatenate((pos_data, neg_data)) | |
test_forest(model, test_data) | |
output_res_new(dt) | |
# Script entry point: run the full pipeline when executed directly.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment