@gavinwhyte
Created December 7, 2017 06:05
Tax Machine Learning

A scikit-learn pipeline that scores tax line items as deductible or non-deductible: TF-IDF features over the item text, one-hot encoded document/transaction codes plus the amount, SMOTE oversampling of the positive ('Non-deductible') class, and a random-forest classifier.
import numpy as np
import nltk
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from SMOTE import SMOTE  # local module, not a PyPI package; see the sketch below
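
# The local SMOTE module is not included in the gist. As an assumption, it is
# taken to follow the SMOTE(T, N, k) convention of Chawla et al. (2002): T is
# the minority-class sample matrix, N the oversampling amount in percent, and
# k the number of nearest neighbours. The sketch below is one minimal
# implementation of that convention; the name _smote_sketch is hypothetical,
# chosen so it does not shadow the real import above.
def _smote_sketch(T, N, k, seed=0):
    from sklearn.neighbors import NearestNeighbors
    T = np.asarray(T, dtype=float)
    n_synthetic = int(N / 100.0 * len(T))
    # k nearest neighbours of every sample; column 0 is the sample itself.
    _, nn_idx = NearestNeighbors(n_neighbors=k + 1).fit(T).kneighbors(T)
    rng = np.random.RandomState(seed)
    synthetic = []
    for _ in range(n_synthetic):
        i = rng.randint(len(T))               # a random minority sample
        j = nn_idx[i, rng.randint(1, k + 1)]  # one of its k neighbours
        # New point at a random position on the segment between the two.
        synthetic.append(T[i] + rng.rand() * (T[j] - T[i]))
    return np.array(synthetic)
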
positive_data = []
negative_data = []
positive_keys = []
negative_keys = []
key_prob = {}

def one_hot(df, features):
    """One-hot encode the given categorical columns via DictVectorizer."""
    v = DictVectorizer()
    df_qual = v.fit_transform(df[features].to_dict('records'))
    f_array = df_qual.toarray()
    return f_array, v
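
# For illustration (an assumed example, not data from the gist): a frame whose
# 'Document Type' column holds the strings 'KR' and 'RE' comes back as binary
# columns named 'Document Type=KR' and 'Document Type=RE', because
# DictVectorizer one-hot encodes string-valued fields and passes numeric
# fields through unchanged.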

def process_data_pd():
    """Load the training CSV, clean it, and split records by label."""
    data = pd.read_csv("data/new_training.csv")
    data['Transaction Code'] = data['Transaction Code'].fillna('NA')
    data['Document Type'] = data['Document Type'].fillna('NA')
    # Strip currency formatting such as "$1,234.56" and cast to float.
    data[' Amount (CC) '] = data[' Amount (CC) '].replace(r'[\$,)]', '', regex=True).astype(float)
    # Composite key that uniquely identifies a line item.
    data['key'] = (data['Company Code'].astype(str) + data['Fiscal Year'].astype(str)
                   + data['Document Nr'].astype(str) + data['Line Item'].astype(str))
    data['content'] = (data['Document Text'].astype(str) + ' ' + data['Item Text'].astype(str)
                       + ' ' + data['GL Account Text'].astype(str))
    cat_data, _ = one_hot(data, ['Document Type', 'Transaction Code'])
    for index, line in data.iterrows():
        if line["Label"] == 'Non-deductible':
            positive_data.append((line['content'], cat_data[index], line[' Amount (CC) ']))
            positive_keys.append(line['key'])
        else:
            negative_data.append((line['content'], cat_data[index], line[' Amount (CC) ']))
            negative_keys.append(line['key'])
    print(len(positive_data), len(negative_data))
    return data

def text2vec():
    """Build one feature vector per record: TF-IDF of the stemmed text,
    concatenated with the one-hot categorical features and the amount."""
    pos_size = len(positive_data)
    data_set = positive_data + negative_data
    stemmed_data = []
    stemmer = PorterStemmer()
    cat_data = []
    price_data = []
    for line in data_set:
        cat_data.append(line[1])
        price_data.append(line[2])
        tokens = nltk.word_tokenize(line[0].lower())
        stemmed_tokens = []
        for item in tokens:
            try:
                stemmed_tokens.append(stemmer.stem(item))
            except Exception:
                pass  # skip tokens the stemmer cannot handle
        stemmed_data.append(' '.join(stemmed_tokens))
    vectorizer = CountVectorizer(min_df=3, max_df=0.8)
    X = vectorizer.fit_transform(stemmed_data)
    transformer = TfidfTransformer(smooth_idf=False)
    tfidf = transformer.fit_transform(X)
    cat_data = np.array(cat_data)
    price_data = np.array(price_data)
    tfidf_array = tfidf.toarray()
    data_array = np.concatenate((tfidf_array, cat_data), axis=1)
    data_array = np.concatenate((data_array, price_data[:, None]), axis=1)
    print(tfidf_array.shape, data_array.shape)
    # Rows [0, pos_size) are the positive class, the rest the negative class.
    return data_array[:pos_size, :], data_array[pos_size:, :]

def gen_data_set_with_smote(pos_data):
    """Augment the positive class with synthetic SMOTE samples."""
    synthetic_data = SMOTE(pos_data, 20000, 10)
    return np.concatenate((pos_data, synthetic_data))
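
# Assuming the SMOTE(T, N, k) convention above, N=20000 asks for 20000%
# oversampling, i.e. roughly 200 synthetic examples per real positive,
# interpolated among each sample's k=10 nearest neighbours.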

def training_forest(data, labels):
    """Pick forest size and depth by 3-fold cross-validation, then refit."""
    best_acc = 0.0
    best_size = 0
    best_depth = 0
    # The full grids were range(1, 10) for depth and 20..200 for size; only
    # the winning values are left in to keep the run short.
    for depth in [9]:
        for size in [60]:
            clf = RandomForestClassifier(n_estimators=size, max_depth=depth, n_jobs=6)
            scores = cross_val_score(clf, data, np.asarray(labels), cv=3)
            acc = scores.mean()
            print("size:", size, ", depth:", depth, ", acc:", acc)
            if acc > best_acc:
                best_acc = acc
                best_size = size
                best_depth = depth
    print("best size:", best_size, ", best depth:", best_depth, ", best acc:", best_acc)
    clf = RandomForestClassifier(n_estimators=best_size, max_depth=best_depth, n_jobs=6)
    clf.fit(data, np.asarray(labels))
    return clf
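
# Caveat: cross-validation here runs after SMOTE augmentation, so synthetic
# points interpolated from a real sample can fall into a test fold while that
# sample sits in the training folds; the reported CV accuracy is optimistic.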

def test_forest(model, test_data):
    """Score every record and cache P(non-deductible) under its key."""
    res = model.predict_proba(test_data)
    # predict_proba column order follows model.classes_, here ['0', '1'],
    # so column 1 is the probability of the positive ('1') class.
    prob = res[:, 1]
    keys = positive_keys + negative_keys
    for i in range(len(keys)):
        key_prob[keys[i]] = prob[i]

def output_res_new(data):
    """Attach each record's predicted probability and write the result."""
    data['prob'] = data['key'].map(key_prob)
    data.to_csv('data/res.csv')

def main():
    dt = process_data_pd()
    pos_data, neg_data = text2vec()
    new_pos_data = gen_data_set_with_smote(pos_data)
    print(new_pos_data.shape)
    # Train on the SMOTE-augmented positives plus all negatives.
    data = np.concatenate((new_pos_data, neg_data))
    label = ['1'] * new_pos_data.shape[0] + ['0'] * neg_data.shape[0]
    model = training_forest(data, label)
    # Score only the original (non-synthetic) records, i.e. the training
    # data itself, so the probabilities in res.csv are in-sample.
    test_data = np.concatenate((pos_data, neg_data))
    test_forest(model, test_data)
    output_res_new(dt)


if __name__ == '__main__':
    main()