Created
September 29, 2015 16:18
-
-
Save mwidjaja1/70e8a1cd3dcb05ea4a5c to your computer and use it in GitHub Desktop.
In this Kaggle.com project, I wanted to predict the probability of Reddit's Random Acts of Pizza group giving a free pizza in regards to its history of requests. This project was one of my first introductions in analyzing text for Machine Learning.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
On Training, this scored 0.736 | |
Magnitude: 3 default, NO synonyms, 3 lemmas, 4 Correct | |
When Leaf = 30 & Depth = 20 | |
False: 78% as 1401 and True: 40% as 86 | |
On Training, this scored 0.733 | |
Magnitude: 3 default, 0.5 synonyms, 3 lemmas, 4 Correct | |
When Leaf = 30 & Depth = 20 | |
False: 78% as 1377 and True: 40% as 103 | |
""" | |
import pandas as pd | |
from sklearn import tree, metrics | |
from sklearn.externals.six import StringIO | |
from sklearn.cross_validation import cross_val_score | |
import pydot | |
import string | |
from nltk.corpus import wordnet as wn | |
# Loads Data
path = '/Users/Matthew/Dropbox/Academics/Pizza'
trainDataOrg = pd.read_json(path + '/train.json')
usingTest = True
# If usingTest is True, we split trainDataOrg into two halves for test/train
# If usingTest is False, we import test.json which doesn't contain results.
if usingTest is True:
    # Floor division: plain `/` truncates implicitly under Python 2 and
    # yields a float under Python 3, and slice bounds must be integers.
    length = len(trainDataOrg) // 2
    testData = trainDataOrg[length:]
    trainData = trainDataOrg[:length]
else:
    testDataOrg = pd.read_json(path + '/test.json')
    length = 0  # test set indexes from zero; no offset needed downstream
    testData = testDataOrg
    trainData = trainDataOrg
""" Function A.1: scoreAssign --------------------------------------------------
Purpose:    Folds one sighting of a word into a running score table. Every
            sighting increments the word's use count; a sighting tied to a
            successfully-won pizza also adds 4*magnitude points. The
            resulting [points, count] pairs measure how effective each
            word is in obtaining a pizza.
Input:
    word        The word to analyze.
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary).
    result      A boolean with the result if pizza was won.
    magnitude   A scalar which scales the points awarded on a win. Default
                to 1 if unsure.
Output:
    wordScore   This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def scoreAssign(word, wordScore, result, magnitude):
    """Fold one sighting of `word` into the running score table.

    wordScore maps word -> [points, count]. Every sighting bumps count;
    a sighting attached to a successful pizza request also adds
    4 * magnitude points. The dict is updated in place and returned.

    word       the word to analyze
    wordScore  running score dictionary (pass {} to start fresh)
    result     True if the request that used this word won a pizza
    magnitude  scales the points awarded on success
    """
    # Collapses the original four-way branch: seed a fresh entry once,
    # then apply the same bump logic regardless of prior membership.
    if word not in wordScore:
        wordScore[word] = [0, 0]
    entry = wordScore[word]
    if result:
        entry[0] = entry[0] + (4 * magnitude)
    entry[1] = entry[1] + 1
    return wordScore
""" Function A.2: createScore --------------------------------------------------
Purpose:    Walks every word of every request in textData and scores it
            (via scoreAssign) against whether that request won a pizza.
            Optionally scores each word's lemmas and synonyms as well.
Input:
    textData    A data frame with text in one column for further processing.
    boolData    A data frame with a boolean indicating if the pizza was won.
    start       The first index value of the dataset being passed through. This
                is normally 0, but will be the variable 'length' if we split
                the dataset into two (i.e. when we pass in testData).
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary.)
    lemmas      A boolean. If True, we'll get wordScores for the first lemma
                (root word) too.
    synonym     A boolean. If True, we'll get wordScores for synonyms too.
Output:
    wordScore   This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def createScore(textData, boolData, start, wordScore, lemmas, synonym):
    """Score every word of every request against its pizza outcome.

    Feeds each cleaned word (and optionally its synonyms and lemmas)
    through scoreAssign, accumulating into wordScore, which is returned.

    NOTE(review): written for Python 2 -- str.translate(None, ...) and
    the u'-prefix stripping below are Python 2 specific.
    """
    textList = textData.values
    # Processes Effectiveness of Words in Train Text
    for i1 in range(len(textData)):
        text = str(textList[i1])
        result = bool(boolData.requester_received_pizza[i1 + start])
        # Skip token 0: str() of the underlying array row prepends a
        # bracket/unicode prefix, so the first token is never a clean word.
        for word in text.split()[1:]:
            if "u'" in word:
                word = word[2:]
            # Removes punctuation & lowercases (Python 2 translate form)
            word = (word.translate(None, string.punctuation)).lower()
            # was `"" is word` -- identity comparison against a literal;
            # emptiness is an equality/truthiness question, not identity
            if not word:
                continue
            if len(word) >= 3:
                wordScore = scoreAssign(word, wordScore, result, 3)
                # Scores each synonym at a reduced weight when requested
                if synonym:
                    for syn in wn.synsets(word):
                        synWord = str(syn)[8:-7]
                        wordScore = scoreAssign(synWord, wordScore, result, 0.5)
                # Scores each lemma (root word) when requested
                if lemmas:
                    for lem in wn.lemmas(word):
                        lemWord = str(lem).split('.')[0][7:]
                        wordScore = scoreAssign(lemWord, wordScore, result, 3)
    # Returns Outputs back to the Main Function
    return wordScore
""" Function B: ratioScore -----------------------------------------------------
Purpose:    Collapses each word's [points, count] pair into an average score
            per use, so a word that appears often without winning pizzas is
            penalized relative to a word that consistently accompanies wins.
Input:
    wordScore   A dictionary mapping each word to its [points, count] pair
                as built by createScore/scoreAssign.
Output:
    wordScore   The same dictionary, with each value replaced by the
                points-per-use ratio.
-----------------------------------------------------------------------------"""
def ratioScore(wordScore):
    """Collapse each [points, count] pair into a per-use average score.

    Replaces every value in wordScore with points/count, using true
    division so the ratio is not silently truncated under Python 2
    integer division. Mutates the dict in place and returns it.
    """
    for key in wordScore:
        points, count = wordScore[key]
        wordScore[key] = float(points) / count
    return wordScore
""" Function C: effectScore ------------------------------------------------------
Purpose:    Scores each request (one row of textData) by summing the
            wordScore values of the words it contains, optionally adding
            lemma and synonym scores, producing one effectiveness number
            per request.
Input:
    textData    A data frame with text in one column for further processing.
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary.)
    start       The first index value of the dataset being passed through. This
                is normally 0, but will be the variable 'length' if we split
                the dataset into two (i.e. when we pass in testData).
    lemmas      A boolean. If True, we'll get wordScores for the lemma
                (root word) too.
    synonym     A boolean. If True, we'll get wordScores for synonyms too.
    outCol      A string which will be used to name this column in reqScore.
Output:
    outScore    This gives a data frame with an index value in one column and
                the score of each text based on the sum of its words' points.
-----------------------------------------------------------------------------"""
def effectScore(textData, wordScore, start, lemmas, synonym, outCol):
    """Sum per-word scores over each request into one column of totals.

    Returns a one-column DataFrame named outCol whose index starts at
    `start` so it lines up with the slice of the original frame.

    NOTE(review): written for Python 2 -- str.translate(None, ...) and
    the u'-prefix stripping below are Python 2 specific.
    """
    # Processes Effectiveness of Train Text
    outScore = []
    textList = textData.values
    for i1 in range(len(textData)):
        text = str(textList[i1])
        tempScore = 0  # running score for this request
        # Skip token 0: str() of the array row prepends a bracket/unicode
        # prefix, so the first token is never a clean word.
        for word in text.split()[1:]:
            if "u'" in word:
                word = word[2:]
            # Removes punctuation & lowercases (Python 2 translate form)
            word = (word.translate(None, string.punctuation)).lower()
            # Adds the word's score to the string's score
            if word in wordScore:
                tempScore = wordScore[word] + tempScore
            # Adds the word's synonym scores when requested
            if synonym:
                for syn in wn.synsets(word):
                    synWord = str(syn)[8:-7]
                    if synWord in wordScore:
                        tempScore = wordScore[synWord] + tempScore
            # Adds the word's lemma (root word) scores when requested
            if lemmas:
                for lem in wn.lemmas(word):
                    lemWord = str(lem).split('.')[0][7:]
                    if lemWord in wordScore:
                        tempScore = wordScore[lemWord] + tempScore
        outScore.append(tempScore)
    # Index the output to line up with the slice of the original frame
    outIdx = range(0 + start, len(outScore) + start, 1)
    outScore = pd.DataFrame(outScore, index=outIdx, columns=[outCol])
    # Returns Outputs back to the Main Function
    return outScore
""" Function D: createFrame ----------------------------------------------------
Purpose:    With the name of the train/test dataset, we select the columns we
            will analyze over, append the array from the 'effectScore' function,
            & add column names. We use this to create the arrays needed to
            run DecisionTreeClassifier and/or predict for data.
Input:
    data        A data frame with the initial training or test dataset.
    new         A data frame containing new data created from other functions.
    usingTest   A boolean indicating whether pizza results are present in
                data. If True, y is filled from the data (both the training
                run and the split-validation run pass True); if False (the
                real test.json, which has no labels), y is left empty.
Output:
    y   A one column array with the 'result' -- a boolean if pizza was won
    x   A multi-column array containing all data of relevance to solve.
-----------------------------------------------------------------------------"""
def createFrame(data, new, usingTest):
    # Builds (y, x) for the classifier: y is the pizza outcome column
    # (when labels exist) and x is the numeric request metadata joined
    # with the text-score columns in `new`.
    # NOTE(review): despite the name, usingTest=True means "labels are
    # available" -- the training run also passes True here; only the
    # real test.json run passes False.
    # Declares Input
    if usingTest is True:
        y = data.requester_received_pizza
    else:
        y = []  # test.json carries no results column
    # Creates Summarized Train Array & Converts Booleans to Int
    savedData = data[[
        'requester_account_age_in_days_at_request',
        'requester_days_since_first_post_on_raop_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_on_raop_at_request',
        'requester_number_of_subreddits_at_request',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request']]
    # Shorter column names. NOTE(review): assigning a nested list builds
    # a MultiIndex header under modern pandas -- this was written for an
    # old pandas where it behaved like a flat rename; confirm on upgrade.
    savedData.columns = [['AccountAge', 'DaysUntilPostR', 'NumComments',
                          'NumCommentsRaop', 'NumPosts', 'NumPostsRaop',
                          'Subreddit#', 'NetVotes', 'RawVotes']]
    savedData = savedData.join(new)
    # Declares Output: every column from AccountAge through the joined
    # RedditScore column (.ix is the old pandas indexer, removed in 1.0)
    x = savedData.ix[:,'AccountAge':'RedditScore']
    # Returns Values back to the Main Function
    return y,x
""" Main Function -- Part 1: Training Data -------------------------------------
Purpose:    Using half of the 'training' data set, we'll use this to train our
            dataset with the help of the effectScore & createFrame functions
            defined above. Afterwards, we will fit this to a Decision Tree
            and output the results to the current path of this file. Also, we
            will print to the terminal, a data frame with the variables which
            made the greatest influence on our model.
Input:
    trainData   A data frame with (half of the) initial training dataset.
Output:
    tree.png    A PNG file with the decision tree
-----------------------------------------------------------------------------"""
# Initializes wordScore and redScore Dictionaries
# (the 'TotalWordsUsed' seed entry survives from the commented-out
# bookkeeping in scoreAssign; downstream it is treated like any other word)
wordScore = {'TotalWordsUsed' : [0,1]}
redScore = {'TotalWordsUsed' : [0,1]}
# Derives a 'success' score for each word in the request, title, & subreddit
# (lemmas=True for the free-text fields, lemmas=False for subreddit names)
wordScore = createScore(trainData[['request_text_edit_aware']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
wordScore = createScore(trainData[['request_title']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
redScore = createScore(
    trainData[['requester_subreddits_at_request']],
    trainData[['requester_received_pizza']], 0, redScore, False, False)
# Processes ratios of wordScore so best word = max of 1 & worst word = min of 0
# NOTE(review): with magnitude 3 a consistently winning word averages 12
# per use, so the 0..1 range claimed above does not hold -- verify intent.
wordScore = ratioScore(wordScore)
redScore = ratioScore(redScore)
# Calculates Effectiveness of each request using the scores
reqScore = effectScore(trainData[['request_text_edit_aware']],
                       wordScore, 0, True, False, 'RequestScore')
titleScore = effectScore(trainData[['request_title']],
                         wordScore, 0, True, False, 'TitleScore')
subScore = effectScore(trainData[['requester_subreddits_at_request']],
                       redScore, 0, False, False, 'RedditScore')
# Creates DataFrame for CLF by joining the three score columns on index
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
(y,x) = createFrame(trainData, reqScore, True)
# Creates Decision Tree Classifier
# If Yes, we branch Left. If No, we branch Right.
clf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf = 30,
                                  max_depth=20)
clf = clf.fit(x, y)
# Export Tree as a PNG via graphviz/pydot
# To export, I first ran 'conda install pydot'
# Then installed the latest GraphViz with Google's help
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png(path + "/tree.png")
# Print feature importances, largest influence first
# NOTE(review): Python 2 print statement; DataFrame.sort is the old
# pandas API (replaced by sort_values in modern pandas)
print pd.DataFrame(clf.feature_importances_, columns = ["Importance"],
    index = x.columns).sort(ascending=False, columns="Importance")
""" Main Function -- Part 2: Test Data -----------------------------------------
Purpose:    Using the other half of the 'training' data set, we'll use this to
            test our model with the help of the effectScore & createFrame
            functions defined above. Afterwards, using the decision tree
            obtained above, we will see how effective it is in predicting the
            results of this half of the data set.
Input:
    testData    A data frame with (the other half of the) initial training
                dataset.
-----------------------------------------------------------------------------"""
# Determines effectScore for Request, Title, & Subreddits
# (start=length keeps the output index aligned with the test slice)
reqScore = effectScore(testData[['request_text_edit_aware']],
                       wordScore, length, True, False, 'RequestScore')
titleScore = effectScore(testData[['request_title']],
                         wordScore, length, True, False, 'TitleScore')
subScore = effectScore(testData[['requester_subreddits_at_request']],
                       redScore, length, False, False, 'RedditScore')
# Creates DataFrame for CLF by joining the three score columns on index
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
# Creates Dataframe (yTrain is the held-out truth when usingTest is True)
(yTrain,xTrain) = createFrame(testData, reqScore, usingTest)
# Predict Data
yPred = clf.predict(xTrain)
# Output Results: score against held-out truth, or write a submission CSV
if usingTest is True:
    print "\nAccuracy: {0:.3f}".format(metrics.accuracy_score(yTrain,yPred))
    # NOTE(review): cross_val_score is fed yPred (the model's own
    # predictions) rather than yTrain, so this measures agreement with
    # itself, not predictive accuracy -- confirm this is intended.
    scores = cross_val_score(clf, xTrain, yPred)
    print("Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()*2))
    print "\nClassification Report:"
    print metrics.classification_report(yTrain,yPred)
    print "Confusion Matrix:"
    print metrics.confusion_matrix(yTrain,yPred)
else:
    # Kaggle submission: request_id index plus the predicted result as int
    yPred = yPred.astype(int)
    outArray = pd.DataFrame(yPred, index=testDataOrg.request_id,
                            columns=["requester_received_pizza"])
    outArray.to_csv('Data.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment