
@mwidjaja1
Created September 29, 2015 16:18
In this Kaggle.com project, I wanted to predict the probability of Reddit's Random Acts of Pizza group giving away a free pizza, based on the group's history of requests. This project was one of my first introductions to analyzing text for Machine Learning.
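At its core, the script below scores each word by how often the requests containing it won a pizza. Here is a minimal sketch of that idea, separate from the script itself; the function name and the simplifications (plain whitespace tokenization, one point per win) are mine and assume each request arrives as a (text, won_pizza) pair:

from collections import defaultdict

def word_success_scores(requests):
    """requests: an iterable of (text, won_pizza) pairs."""
    points = defaultdict(float)   # points earned by each word
    counts = defaultdict(int)     # times each word appeared
    for text, won in requests:
        for word in text.lower().split():
            counts[word] += 1
            if won:
                points[word] += 1.0
    # A word's score is the fraction of its appearances that won a pizza
    return {w: points[w] / counts[w] for w in counts}

The full script refines this with punctuation stripping, WordNet lemmas and synonyms, and a magnitude weight, then feeds the per-request scores into a decision tree.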
"""
On Training, this scored 0.736
Magnitude: 3 default, NO synonyms, 3 lemmas, 4 Correct
When Leaf = 30 & Depth = 20
False: 78% as 1401 and True: 40% as 86
On Training, this scored 0.733
Magnitude: 3 default, 0.5 synonyms, 3 lemmas, 4 Correct
When Leaf = 30 & Depth = 20
False: 78% as 1377 and True: 40% as 103
"""
import pandas as pd
from sklearn import tree, metrics
from sklearn.externals.six import StringIO
from sklearn.cross_validation import cross_val_score
import pydot
import string
from nltk.corpus import wordnet as wn
# Loads Data
path = '/Users/Matthew/Dropbox/Academics/Pizza'
trainDataOrg = pd.read_json(path + '/train.json')
usingTest = True
# If usingTest is True, we split trainDataOrg into two halves for test/train
# If usingTest is False, we import test.json which doesn't contain results.
if usingTest is True:
    length = len(trainDataOrg)/2
    testData = trainDataOrg[length:]
    trainData = trainDataOrg[:length]
else:
    testDataOrg = pd.read_json(path + '/test.json')
    length = 0
    testData = testDataOrg
    trainData = trainDataOrg
""" Function A.1: scoreAssign --------------------------------------------------
Purpose: This looks at each word from a series of text. If that word was
used successfully for a pizza, it's given +1 points. If not, it's
given -1 points. We use these points to determine how effective
each word is in obtaining a pizza.
Input:
word The word to analyze.
wordScore A dictionary containing word scores (Do this if one was made
in a previous function. Otherwise, pass in a blank dictionary).
result A boolean with the result if pizza was won.
magnitude A scalar which scales the +/- points. Default to 1 if unsure.
Output:
wordScore This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def scoreAssign(word, wordScore, result, magnitude):
    if word in wordScore and result is True:
        wordList = wordScore[word]
        wordList[0] = wordList[0] + (4*magnitude)
        wordList[1] = wordList[1] + 1
        #wordScore['TotalWordsUsed'] = wordScore['TotalWordsUsed'] + 1
    elif word in wordScore and result is False:
        wordList = wordScore[word]
        wordList[1] = wordList[1] + 1
        #wordScore['TotalWordsUsed'] = wordScore['TotalWordsUsed'] + 1
    elif word not in wordScore and result is True:
        wordScore[word] = [4*magnitude, 1]
        #wordScore['TotalWordsUsed'] = wordScore['TotalWordsUsed'] + 1
    elif word not in wordScore and result is False:
        wordScore[word] = [0, 1]
        #wordScore['TotalWordsUsed'] = wordScore['TotalWordsUsed'] + 1
    return wordScore
""" Function A.2: createScore --------------------------------------------------
Purpose: This looks at each word from a series of text. If that word was
used successfully for a pizza, it's given +1 points. If not, it's
given -1 points. We use these points to determine how effective
each word is in obtaining a pizza.
Input:
textData A data frame with text in one column for further processing.
boolData A data frame with a boolean indicating if the pizza was won.
start The first index value of the dataset being passed through. This
is normally 0, but will be the variable 'length' if we split
the dataset into two (i.e. when we pass in testData).
wordScore A dictionary containing word scores (Do this if one was made
in a previous function. Otherwise, pass in a blank dictionary.)
lemmas A boolean. If True, we'll get wordScores for the first lemma
(root word) too.
synonym A boolean. If True, we'll get wordScores for synonyms too.
Output:
wordScore This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def createScore(textData, boolData, start, wordScore, lemmas, synonym):
    textList = textData.values
    # Processes Effectiveness of Words in Train Text
    for i1 in range(len(textData)):
        text = textList[i1]
        text = str(text)
        result = bool(boolData.requester_received_pizza[i1+start])
        # Calculates wordScore for each word in the given string of text
        for word in text.split()[1:]:
            # Removes the u' prefix left over from the unicode repr
            if "u'" in word:
                word = word[2:]
            # Removes all punctuation & makes the word lower-case
            word = (word.translate(None, string.punctuation)).lower()
            if word == "":
                continue
            if len(word) >= 3:
                wordScore = scoreAssign(word, wordScore, result, 3)
                # Obtains Synonyms & calculates their wordScore if synonym == True
                if synonym == True:
                    synList = [x for x in wn.synsets(word)]
                    for i2 in range(len(synList)):
                        synWord = str(synList[i2])
                        synWord = synWord[8:-7]
                        wordScore = scoreAssign(synWord, wordScore, result, 0.5)
                # Obtains Lemmas & calculates their wordScore if lemmas == True
                if lemmas == True:
                    lemList = [x for x in wn.lemmas(word)]
                    for i2 in range(len(lemList)):
                        lemWord = str(lemList[i2])
                        lemWord = lemWord.split('.')[0]
                        lemWord = lemWord[7:]
                        wordScore = scoreAssign(lemWord, wordScore, result, 3)
    # Returns Outputs back to the Main Function
    return wordScore
""" Function B: ratioScore -----------------------------------------------------
Purpose: This looks at each word from a series of text. If that word was
used successfully for a pizza, it's given +1 points. If not, it's
given -1 points. We use these points to determine how effective
each word is in obtaining a pizza.
Input:
wordScore A dictionary containing word scores (Do this if one was made
in a previous function. Otherwise, pass in a blank dictionary.)
Output:
wordScore This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def ratioScore(wordScore):
    for key in wordScore:
        wordList = wordScore[key]
        wordScore[key] = wordList[0]/(wordList[1])
        #wordScore[key] = wordList[0]
    return wordScore
""" Function C: effectScore ------------------------------------------------------
Purpose: This looks at each word from a series of text. If that word was
used successfully for a pizza, it's given +1 points. If not, it's
given -1 points. We use these points to determine how effective
each word is in obtaining a pizza.
Input:
textData A data frame with text in one column for further processing.
wordScore A dictionary containing word scores (Do this if one was made
in a previous function. Otherwise, pass in a blank dictionary.)
start The first index value of the dataset being passed through. This
is normally 0, but will be the variable 'length' if we split
the dataset into two (i.e. when we pass in testData).
lemmas A boolean. If True, we'll get wordScores for the lemma
(root word) too.
synonym A boolean. If True, we'll get wordScores for synonyms too.
outCol A string which will be used to name this column in reqScore.
Output:
outScore This gives a data frame with an index value in one column and
the score of each text based on the sum of its words' points.
-----------------------------------------------------------------------------"""
def effectScore(textData, wordScore, start, lemmas, synonym, outCol):
    # Processes Effectiveness of Train Text
    outScore = []
    textList = textData.values
    for i1 in range(len(textData)):
        text = textList[i1]
        text = str(text)
        tempScore = 0   # Allocates initial score for this request
        for word in text.split()[1:]:
            # Removes the u' prefix if it's present in a word
            if "u'" in word:
                word = word[2:]
            # Removes all punctuation & makes all words lower-case
            word = (word.translate(None, string.punctuation)).lower()
            # Adds the word's score to the string's score
            if word in wordScore:
                tempScore = wordScore[word] + tempScore
            # Adds the word's synonym scores to the string's score if valid
            if synonym == True:
                synList = [x for x in wn.synsets(word)]
                for i2 in range(len(synList)):
                    synWord = str(synList[i2])
                    synWord = synWord[8:-7]
                    if synWord in wordScore:
                        tempScore = wordScore[synWord] + tempScore
            # Adds the word's lemma scores to the string's score if valid
            if lemmas == True:
                lemList = [x for x in wn.lemmas(word)]
                for i2 in range(len(lemList)):
                    lemWord = str(lemList[i2])
                    lemWord = lemWord.split('.')[0]
                    lemWord = lemWord[7:]
                    if lemWord in wordScore:
                        tempScore = wordScore[lemWord] + tempScore
        outScore.append(tempScore)
    # Sets Index of Train Text
    outIdx = range(0+start, len(outScore)+start, 1)
    outScore = pd.DataFrame(outScore, index=outIdx, columns=[outCol])
    # Returns Outputs back to the Main Function
    return outScore
""" Function D: createFrame ----------------------------------------------------
Purpose: With the name of the train/test dataset, we select the columns we
will analyze over, append the array from the 'effective' function,
& add column names. We use this to create the arrays needed to
run DecisionTreeClassifier and/or predict for data.
Input:
data A data frame with the initial training or test dataset.
new A data frame containing new data created from other functions.
usingTest A boolean indicating if we're using the test.json. If True, we
don't create y for pizza results. Note, usingTest must always
be set to False when we're working with training data.
Output:
y A one column array with the 'result' -- a boolean if pizza was won
x A multi-column array containing all data of relevance to solve.
-----------------------------------------------------------------------------"""
def createFrame(data, new, usingTest):
    # Declares Input
    if usingTest is True:
        y = data.requester_received_pizza
    else:
        y = []
    # Creates Summarized Array of the numeric request features
    savedData = data[[
        'requester_account_age_in_days_at_request',
        'requester_days_since_first_post_on_raop_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_on_raop_at_request',
        'requester_number_of_subreddits_at_request',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request']]
    savedData.columns = ['AccountAge', 'DaysUntilPostR', 'NumComments',
                         'NumCommentsRaop', 'NumPosts', 'NumPostsRaop',
                         'Subreddit#', 'NetVotes', 'RawVotes']
    savedData = savedData.join(new)
    # Declares Output
    x = savedData.ix[:, 'AccountAge':'RedditScore']
    # Returns Values back to the Main Function
    return y, x
""" Main Function -- Part 1: Training Data -------------------------------------
Purpose: Using half of the 'training' data set, we'll use this to train our
dataset with the help of the efective & createFrame functions
defined above. Afterwards, we will fit this to a Decision Tree
and output the results to the current path of this file. Also, we
will print to the terminal, a data frame with the variables which
made the greatest influence on our model.
Input:
trainData A data frame with (half of the) initial training dataset.
Output:
tree.pdf A PDF file with the decision tree
-----------------------------------------------------------------------------"""
# Initializes wordScore and redScore Dictionaries
wordScore = {'TotalWordsUsed' : [0,1]}
redScore = {'TotalWordsUsed' : [0,1]}
# Derives a 'success' score for each word in the request, title, & subreddit
wordScore = createScore(trainData[['request_text_edit_aware']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
wordScore = createScore(trainData[['request_title']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
redScore = createScore(trainData[['requester_subreddits_at_request']],
                       trainData[['requester_received_pizza']], 0, redScore, False, False)
# Converts each word's points into a ratio so its score reflects its success rate
wordScore = ratioScore(wordScore)
redScore = ratioScore(redScore)
# Calculates the effectiveness of each request using the scores
reqScore = effectScore(trainData[['request_text_edit_aware']],
                       wordScore, 0, True, False, 'RequestScore')
titleScore = effectScore(trainData[['request_title']],
                         wordScore, 0, True, False, 'TitleScore')
subScore = effectScore(trainData[['requester_subreddits_at_request']],
                       redScore, 0, False, False, 'RedditScore')
# Creates DataFrame for CLF
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
(y,x) = createFrame(trainData, reqScore, True)
# Creates Decision Tree Classifier
# If Yes, we branch Left. If No, we branch Right.
clf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf=30,
                                  max_depth=20)
clf = clf.fit(x, y)
# Export Tree
# To export, I first ran 'conda install pydot'
# Then installed the latest GraphViz with Google's help
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png(path + "/tree.png")
# Prints a data frame of each feature's importance in the fitted tree
print pd.DataFrame(clf.feature_importances_, columns=["Importance"],
                   index=x.columns).sort(ascending=False, columns="Importance")
""" Main Function -- Part 2: Test Data -----------------------------------------
Purpose: Using half of the 'training' data set, we'll use this to test our
dataset with the help of the efective & createFrame functions
defined above. Afterwards, using the decision tree obtained above,
we will see how effective it is in predicting the results of this
half of the data set.
Input:
testData A data frame with (the other half of the) initial training
dataset.
-----------------------------------------------------------------------------"""
# Determines effectScore for Request, Title, & Subreddits
reqScore = effectScore(testData[['request_text_edit_aware']],
                       wordScore, length, True, False, 'RequestScore')
titleScore = effectScore(testData[['request_title']],
                         wordScore, length, True, False, 'TitleScore')
subScore = effectScore(testData[['requester_subreddits_at_request']],
                       redScore, length, False, False, 'RedditScore')
# Creates DataFrame for CLF
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
# Creates the x & y arrays used for prediction
(yTrain,xTrain) = createFrame(testData, reqScore, usingTest)
# Predict Data
yPred = clf.predict(xTrain)
# Output Results
if usingTest is True:
    print "\nAccuracy: {0:.3f}".format(metrics.accuracy_score(yTrain, yPred))
    scores = cross_val_score(clf, xTrain, yPred)
    print("Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()*2))
    print "\nClassification Report:"
    print metrics.classification_report(yTrain, yPred)
    print "Confusion Matrix:"
    print metrics.confusion_matrix(yTrain, yPred)
else:
    yPred = yPred.astype(int)
    outArray = pd.DataFrame(yPred, index=testDataOrg.request_id,
                            columns=["requester_received_pizza"])
    outArray.to_csv('Data.csv')
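Usage note: with usingTest = True (the default above), the script holds out the second half of train.json and prints the accuracy, cross-validation score, classification report, and confusion matrix for that split, matching the figures recorded in the header docstring. To produce a submission file instead, set usingTest = False; the script then scores test.json and writes Data.csv, indexed by request_id, with the predicted requester_received_pizza values.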