Created
September 29, 2015 16:18
-
-
Save mwidjaja1/70e8a1cd3dcb05ea4a5c to your computer and use it in GitHub Desktop.
In this Kaggle.com project, I wanted to predict the probability of Reddit's Random Acts of Pizza group giving a free pizza in regards to its history of requests. This project was one of my first introductions in analyzing text for Machine Learning.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
On Training, this scored 0.736 | |
Magnitude: 3 default, NO synonyms, 3 lemmas, 4 Correct | |
When Leaf = 30 & Depth = 20 | |
False: 78% as 1401 and True: 40% as 86 | |
On Training, this scored 0.733 | |
Magnitude: 3 default, 0.5 synonyms, 3 lemmas, 4 Correct | |
When Leaf = 30 & Depth = 20 | |
False: 78% as 1377 and True: 40% as 103 | |
""" | |
import pandas as pd | |
from sklearn import tree, metrics | |
from sklearn.externals.six import StringIO | |
from sklearn.cross_validation import cross_val_score | |
import pydot | |
import string | |
from nltk.corpus import wordnet as wn | |
# Loads Data
path = '/Users/Matthew/Dropbox/Academics/Pizza'
trainDataOrg = pd.read_json(path + '/train.json')
usingTest = True
# If usingTest is True, we split trainDataOrg into two halves for test/train
# If usingTest is False, we import test.json which doesn't contain results.
if usingTest is True:
    # Floor division: plain `/` truncates implicitly under Python 2 and
    # yields a float under Python 3, and slice bounds must be integers.
    length = len(trainDataOrg) // 2
    testData = trainDataOrg[length:]
    trainData = trainDataOrg[:length]
else:
    testDataOrg = pd.read_json(path + '/test.json')
    length = 0  # test set indexes from zero; no offset needed downstream
    testData = testDataOrg
    trainData = trainDataOrg
""" Function A.1: scoreAssign --------------------------------------------------
Purpose:    Folds one sighting of a word into a running score table. Every
            sighting increments the word's use count; a sighting tied to a
            successfully-won pizza also adds 4*magnitude points. The
            resulting [points, count] pairs measure how effective each
            word is in obtaining a pizza.
Input:
    word        The word to analyze.
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary).
    result      A boolean with the result if pizza was won.
    magnitude   A scalar which scales the points awarded on a win. Default
                to 1 if unsure.
Output:
    wordScore   This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def scoreAssign(word, wordScore, result, magnitude):
    """Fold one sighting of `word` into the running score table.

    wordScore maps word -> [points, count]. Every sighting bumps count;
    a sighting attached to a successful pizza request also adds
    4 * magnitude points. The dict is updated in place and returned.

    word       the word to analyze
    wordScore  running score dictionary (pass {} to start fresh)
    result     True if the request that used this word won a pizza
    magnitude  scales the points awarded on success
    """
    # Collapses the original four-way branch: seed a fresh entry once,
    # then apply the same bump logic regardless of prior membership.
    if word not in wordScore:
        wordScore[word] = [0, 0]
    entry = wordScore[word]
    if result:
        entry[0] = entry[0] + (4 * magnitude)
    entry[1] = entry[1] + 1
    return wordScore
""" Function A.2: createScore --------------------------------------------------
Purpose:    Walks every word of every request in textData and scores it
            (via scoreAssign) against whether that request won a pizza.
            Optionally scores each word's lemmas and synonyms as well.
Input:
    textData    A data frame with text in one column for further processing.
    boolData    A data frame with a boolean indicating if the pizza was won.
    start       The first index value of the dataset being passed through. This
                is normally 0, but will be the variable 'length' if we split
                the dataset into two (i.e. when we pass in testData).
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary.)
    lemmas      A boolean. If True, we'll get wordScores for the first lemma
                (root word) too.
    synonym     A boolean. If True, we'll get wordScores for synonyms too.
Output:
    wordScore   This gives the score of each word across all text.
-----------------------------------------------------------------------------"""
def createScore(textData, boolData, start, wordScore, lemmas, synonym):
    """Score every word of every request against its pizza outcome.

    Feeds each cleaned word (and optionally its synonyms and lemmas)
    through scoreAssign, accumulating into wordScore, which is returned.

    NOTE(review): written for Python 2 -- str.translate(None, ...) and
    the u'-prefix stripping below are Python 2 specific.
    """
    textList = textData.values
    # Processes Effectiveness of Words in Train Text
    for i1 in range(len(textData)):
        text = str(textList[i1])
        result = bool(boolData.requester_received_pizza[i1 + start])
        # Skip token 0: str() of the underlying array row prepends a
        # bracket/unicode prefix, so the first token is never a clean word.
        for word in text.split()[1:]:
            if "u'" in word:
                word = word[2:]
            # Removes punctuation & lowercases (Python 2 translate form)
            word = (word.translate(None, string.punctuation)).lower()
            # was `"" is word` -- identity comparison against a literal;
            # emptiness is an equality/truthiness question, not identity
            if not word:
                continue
            if len(word) >= 3:
                wordScore = scoreAssign(word, wordScore, result, 3)
                # Scores each synonym at a reduced weight when requested
                if synonym:
                    for syn in wn.synsets(word):
                        synWord = str(syn)[8:-7]
                        wordScore = scoreAssign(synWord, wordScore, result, 0.5)
                # Scores each lemma (root word) when requested
                if lemmas:
                    for lem in wn.lemmas(word):
                        lemWord = str(lem).split('.')[0][7:]
                        wordScore = scoreAssign(lemWord, wordScore, result, 3)
    # Returns Outputs back to the Main Function
    return wordScore
""" Function B: ratioScore -----------------------------------------------------
Purpose:    Collapses each word's [points, count] pair into an average score
            per use, so a word that appears often without winning pizzas is
            penalized relative to a word that consistently accompanies wins.
Input:
    wordScore   A dictionary mapping each word to its [points, count] pair
                as built by createScore/scoreAssign.
Output:
    wordScore   The same dictionary, with each value replaced by the
                points-per-use ratio.
-----------------------------------------------------------------------------"""
def ratioScore(wordScore):
    """Collapse each [points, count] pair into a per-use average score.

    Replaces every value in wordScore with points/count, using true
    division so the ratio is not silently truncated under Python 2
    integer division. Mutates the dict in place and returns it.
    """
    for key in wordScore:
        points, count = wordScore[key]
        wordScore[key] = float(points) / count
    return wordScore
""" Function C: effectScore ------------------------------------------------------
Purpose:    Scores each request (one row of textData) by summing the
            wordScore values of the words it contains, optionally adding
            lemma and synonym scores, producing one effectiveness number
            per request.
Input:
    textData    A data frame with text in one column for further processing.
    wordScore   A dictionary containing word scores (Do this if one was made
                in a previous function. Otherwise, pass in a blank dictionary.)
    start       The first index value of the dataset being passed through. This
                is normally 0, but will be the variable 'length' if we split
                the dataset into two (i.e. when we pass in testData).
    lemmas      A boolean. If True, we'll get wordScores for the lemma
                (root word) too.
    synonym     A boolean. If True, we'll get wordScores for synonyms too.
    outCol      A string which will be used to name this column in reqScore.
Output:
    outScore    This gives a data frame with an index value in one column and
                the score of each text based on the sum of its words' points.
-----------------------------------------------------------------------------"""
def effectScore(textData, wordScore, start, lemmas, synonym, outCol):
    """Sum per-word scores over each request into one column of totals.

    Returns a one-column DataFrame named outCol whose index starts at
    `start` so it lines up with the slice of the original frame.

    NOTE(review): written for Python 2 -- str.translate(None, ...) and
    the u'-prefix stripping below are Python 2 specific.
    """
    # Processes Effectiveness of Train Text
    outScore = []
    textList = textData.values
    for i1 in range(len(textData)):
        text = str(textList[i1])
        tempScore = 0  # running score for this request
        # Skip token 0: str() of the array row prepends a bracket/unicode
        # prefix, so the first token is never a clean word.
        for word in text.split()[1:]:
            if "u'" in word:
                word = word[2:]
            # Removes punctuation & lowercases (Python 2 translate form)
            word = (word.translate(None, string.punctuation)).lower()
            # Adds the word's score to the string's score
            if word in wordScore:
                tempScore = wordScore[word] + tempScore
            # Adds the word's synonym scores when requested
            if synonym:
                for syn in wn.synsets(word):
                    synWord = str(syn)[8:-7]
                    if synWord in wordScore:
                        tempScore = wordScore[synWord] + tempScore
            # Adds the word's lemma (root word) scores when requested
            if lemmas:
                for lem in wn.lemmas(word):
                    lemWord = str(lem).split('.')[0][7:]
                    if lemWord in wordScore:
                        tempScore = wordScore[lemWord] + tempScore
        outScore.append(tempScore)
    # Index the output to line up with the slice of the original frame
    outIdx = range(0 + start, len(outScore) + start, 1)
    outScore = pd.DataFrame(outScore, index=outIdx, columns=[outCol])
    # Returns Outputs back to the Main Function
    return outScore
""" Function D: createFrame ----------------------------------------------------
Purpose:    With the name of the train/test dataset, we select the columns we
            will analyze over, append the array from the 'effectScore' function,
            & add column names. We use this to create the arrays needed to
            run DecisionTreeClassifier and/or predict for data.
Input:
    data        A data frame with the initial training or test dataset.
    new         A data frame containing new data created from other functions.
    usingTest   A boolean indicating whether pizza results are present in
                data. If True, y is filled from the data (both the training
                run and the split-validation run pass True); if False (the
                real test.json, which has no labels), y is left empty.
Output:
    y   A one column array with the 'result' -- a boolean if pizza was won
    x   A multi-column array containing all data of relevance to solve.
-----------------------------------------------------------------------------"""
def createFrame(data, new, usingTest):
    # Builds (y, x) for the classifier: y is the pizza outcome column
    # (when labels exist) and x is the numeric request metadata joined
    # with the text-score columns in `new`.
    # NOTE(review): despite the name, usingTest=True means "labels are
    # available" -- the training run also passes True here; only the
    # real test.json run passes False.
    # Declares Input
    if usingTest is True:
        y = data.requester_received_pizza
    else:
        y = []  # test.json carries no results column
    # Creates Summarized Train Array & Converts Booleans to Int
    savedData = data[[
        'requester_account_age_in_days_at_request',
        'requester_days_since_first_post_on_raop_at_request',
        'requester_number_of_comments_at_request',
        'requester_number_of_comments_in_raop_at_request',
        'requester_number_of_posts_at_request',
        'requester_number_of_posts_on_raop_at_request',
        'requester_number_of_subreddits_at_request',
        'requester_upvotes_minus_downvotes_at_request',
        'requester_upvotes_plus_downvotes_at_request']]
    # Shorter column names. NOTE(review): assigning a nested list builds
    # a MultiIndex header under modern pandas -- this was written for an
    # old pandas where it behaved like a flat rename; confirm on upgrade.
    savedData.columns = [['AccountAge', 'DaysUntilPostR', 'NumComments',
                          'NumCommentsRaop', 'NumPosts', 'NumPostsRaop',
                          'Subreddit#', 'NetVotes', 'RawVotes']]
    savedData = savedData.join(new)
    # Declares Output: every column from AccountAge through the joined
    # RedditScore column (.ix is the old pandas indexer, removed in 1.0)
    x = savedData.ix[:,'AccountAge':'RedditScore']
    # Returns Values back to the Main Function
    return y,x
""" Main Function -- Part 1: Training Data -------------------------------------
Purpose:    Using half of the 'training' data set, we'll use this to train our
            dataset with the help of the effectScore & createFrame functions
            defined above. Afterwards, we will fit this to a Decision Tree
            and output the results to the current path of this file. Also, we
            will print to the terminal, a data frame with the variables which
            made the greatest influence on our model.
Input:
    trainData   A data frame with (half of the) initial training dataset.
Output:
    tree.png    A PNG file with the decision tree
-----------------------------------------------------------------------------"""
# Initializes wordScore and redScore Dictionaries
# (the 'TotalWordsUsed' seed entry survives from the commented-out
# bookkeeping in scoreAssign; downstream it is treated like any other word)
wordScore = {'TotalWordsUsed' : [0,1]}
redScore = {'TotalWordsUsed' : [0,1]}
# Derives a 'success' score for each word in the request, title, & subreddit
# (lemmas=True for the free-text fields, lemmas=False for subreddit names)
wordScore = createScore(trainData[['request_text_edit_aware']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
wordScore = createScore(trainData[['request_title']],
                        trainData[['requester_received_pizza']], 0, wordScore, True, False)
redScore = createScore(
    trainData[['requester_subreddits_at_request']],
    trainData[['requester_received_pizza']], 0, redScore, False, False)
# Processes ratios of wordScore so best word = max of 1 & worst word = min of 0
# NOTE(review): with magnitude 3 a consistently winning word averages 12
# per use, so the 0..1 range claimed above does not hold -- verify intent.
wordScore = ratioScore(wordScore)
redScore = ratioScore(redScore)
# Calculates Effectiveness of each request using the scores
reqScore = effectScore(trainData[['request_text_edit_aware']],
                       wordScore, 0, True, False, 'RequestScore')
titleScore = effectScore(trainData[['request_title']],
                         wordScore, 0, True, False, 'TitleScore')
subScore = effectScore(trainData[['requester_subreddits_at_request']],
                       redScore, 0, False, False, 'RedditScore')
# Creates DataFrame for CLF by joining the three score columns on index
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
(y,x) = createFrame(trainData, reqScore, True)
# Creates Decision Tree Classifier
# If Yes, we branch Left. If No, we branch Right.
clf = tree.DecisionTreeClassifier(criterion='entropy', min_samples_leaf = 30,
                                  max_depth=20)
clf = clf.fit(x, y)
# Export Tree as a PNG via graphviz/pydot
# To export, I first ran 'conda install pydot'
# Then installed the latest GraphViz with Google's help
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png(path + "/tree.png")
# Print feature importances, largest influence first
# NOTE(review): Python 2 print statement; DataFrame.sort is the old
# pandas API (replaced by sort_values in modern pandas)
print pd.DataFrame(clf.feature_importances_, columns = ["Importance"],
    index = x.columns).sort(ascending=False, columns="Importance")
""" Main Function -- Part 2: Test Data -----------------------------------------
Purpose:    Using the other half of the 'training' data set, we'll use this to
            test our model with the help of the effectScore & createFrame
            functions defined above. Afterwards, using the decision tree
            obtained above, we will see how effective it is in predicting the
            results of this half of the data set.
Input:
    testData    A data frame with (the other half of the) initial training
                dataset.
-----------------------------------------------------------------------------"""
# Determines effectScore for Request, Title, & Subreddits
# (start=length keeps the output index aligned with the test slice)
reqScore = effectScore(testData[['request_text_edit_aware']],
                       wordScore, length, True, False, 'RequestScore')
titleScore = effectScore(testData[['request_title']],
                         wordScore, length, True, False, 'TitleScore')
subScore = effectScore(testData[['requester_subreddits_at_request']],
                       redScore, length, False, False, 'RedditScore')
# Creates DataFrame for CLF by joining the three score columns on index
reqScore = reqScore.join(titleScore)
reqScore = reqScore.join(subScore)
# Creates Dataframe (yTrain is the held-out truth when usingTest is True)
(yTrain,xTrain) = createFrame(testData, reqScore, usingTest)
# Predict Data
yPred = clf.predict(xTrain)
# Output Results: score against held-out truth, or write a submission CSV
if usingTest is True:
    print "\nAccuracy: {0:.3f}".format(metrics.accuracy_score(yTrain,yPred))
    # NOTE(review): cross_val_score is fed yPred (the model's own
    # predictions) rather than yTrain, so this measures agreement with
    # itself, not predictive accuracy -- confirm this is intended.
    scores = cross_val_score(clf, xTrain, yPred)
    print("Score: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()*2))
    print "\nClassification Report:"
    print metrics.classification_report(yTrain,yPred)
    print "Confusion Matrix:"
    print metrics.confusion_matrix(yTrain,yPred)
else:
    # Kaggle submission: request_id index plus the predicted result as int
    yPred = yPred.astype(int)
    outArray = pd.DataFrame(yPred, index=testDataOrg.request_id,
                            columns=["requester_received_pizza"])
    outArray.to_csv('Data.csv')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment