# coding: utf-8
# Predicting the Behavior of the Supreme Court of the United States: A General Approach
# ==================
# * __Title__: Predicting the Behavior of the Supreme Court of the United States: A General Approach
# * __Authors__: [Daniel Martin Katz](http://www.law.msu.edu/faculty_staff/profile.php?prof=780), [Michael J Bommarito II](http://bommaritollc.com/), [Josh Blackman](http://joshblackman.com)
# * __Paper URL__: [http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2463244](http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2463244)
# * __Blog URL__: [http://lexpredict.com/portfolio/predicting-the-supreme-court/](http://lexpredict.com/portfolio/predicting-the-supreme-court/)
#
# ## Paper Abstract
# Building upon developments in theoretical and applied machine learning, as well as the efforts of various scholars including Guimera and Sales-Pardo (2011), Ruger et al. (2004), and Martin et al. (2004), we construct a model designed to predict the voting behavior of the Supreme Court of the United States. Using the extremely randomized tree method first proposed in Geurts et al. (2006), a method similar to the random forest approach developed in Breiman (2001), as well as novel feature engineering, we predict more than sixty years of decisions by the Supreme Court of the United States (1953-2013). Using only data available prior to the date of decision, our model correctly identifies 69.7% of the Court’s overall affirm/reverse decisions and correctly forecasts 70.9% of the votes of individual justices across 7,700 cases and more than 68,000 justice votes. Our performance is consistent with the general level of prediction offered by prior scholars. However, our model is distinctive as it is the first robust, generalized, and fully predictive model of Supreme Court voting behavior offered to date. Our model predicts six decades of behavior of thirty Justices appointed by thirteen Presidents. With a more sound methodological foundation, our results represent a major advance for the science of quantitative legal prediction and portend a range of other potential applications, such as those described in Katz (2013).
#
# ## Source Description
# The source and data in this repository allow for the reproduction of the results in this paper.
#
# ## Data Description
# The data used in this paper is available from the [Supreme Court Database (SCDB)](http://scdb.wustl.edu/).
#
# ## Version
# The latest version of this model was released in October 2015.
# In[1]:
# get_ipython().magic('matplotlib inline')
# Imports
import matplotlib.pyplot as plt
import statsmodels.stats.proportion
# seaborn
import seaborn
seaborn.set()
seaborn.set_style("darkgrid")
# Project imports; model.py also imports pandas, numpy, scipy, and sklearn,
# which this star import exposes here (hence no explicit imports for them above)
from model import *
# In[2]:
# Get raw data
raw_data = get_raw_scdb_data("data/input/SCDB_2015_01_justiceCentered_Citation.csv")
# Get feature data
feature_df = preprocess_raw_data(raw_data, include_direction=True)
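# preprocess_raw_data is defined in model.py, which is not reproduced in this
# gist, so the exact feature set is opaque at this point. As a minimal,
# hypothetical sketch of the general technique (one-hot encoding categorical
# columns into a numeric feature matrix); the column names below are
# illustrative assumptions, not actual SCDB fields:
toy_raw = pandas.DataFrame({"justice_name": ["JHarlan2", "WODouglas"],
                            "issue_area": ["criminal", "civil rights"]})
toy_features = pandas.get_dummies(toy_raw)  # one indicator column per category value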
# In[3]:
# Output some diagnostics on features
print(raw_data.shape)
print(feature_df.shape)
assert(raw_data.shape[0] == feature_df.shape[0])
# In[4]:
# Drop records whose justice outcome is coded -1, then output the class
# distribution for the remaining sample
ind = raw_data["justice_outcome_disposition"] == -1
raw_data = raw_data[~ind]
print(pandas.DataFrame(raw_data["justice_outcome_disposition"].value_counts()))
print(pandas.DataFrame(raw_data["justice_outcome_disposition"].value_counts(normalize=True)))
# In[5]:
# Setup training time period
min_training_years = 5
term_range = range(raw_data["term"].min() + min_training_years,
                   raw_data["term"].max() + 1)
# Set growing random forest parameters
# Number of trees to grow per term
trees_per_term = 20
# Number of trees to begin with
initial_trees = min_training_years * trees_per_term
# Number of terms between scheduled "forest fires" (9999 effectively disables them)
reset_interval = 9999
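# Aside: the "growing" forest below relies on scikit-learn's warm_start
# semantics: refitting after raising n_estimators keeps the existing trees
# and only grows new ones. A tiny self-contained demonstration on made-up
# toy data (toy_X and toy_y are illustrative, not SCDB data):
toy_X = [[0.0], [1.0], [0.0], [1.0]]
toy_y = [0, 1, 0, 1]
demo_forest = sklearn.ensemble.RandomForestClassifier(n_estimators=5, warm_start=True)
demo_forest.fit(toy_X, toy_y)
demo_forest.set_params(n_estimators=10)
demo_forest.fit(toy_X, toy_y)  # the first 5 trees are kept; 5 new ones are grown
assert len(demo_forest.estimators_) == 10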
# Setup model
m = None
term_count = 0
for term in term_range:
    # Diagnostic output
    print("Term: {0}".format(term))
    term_count += 1
    # Setup train and test periods
    train_index = (raw_data.loc[:, "term"] < term).values
    test_index = (raw_data.loc[:, "term"] == term).values
    # Setup train data
    feature_data_train = feature_df.loc[train_index, :]
    target_data_train = raw_data.loc[train_index, "justice_outcome_disposition"].astype(int).values
    # Setup test data
    feature_data_test = feature_df.loc[test_index, :]
    target_data_test = raw_data.loc[test_index, "justice_outcome_disposition"].astype(int).values
    # Check if we should rebuild the model based on the scheduled reset interval
    if term_count % reset_interval == 0:
        # "Forest fire": grow a new forest from scratch
        print("Reset interval hit; rebuilding with {0} trees".format(initial_trees + (term_count * trees_per_term)))
        m = None
    else:
        # Check if the justice set has changed since the prior term
        if set(raw_data.loc[raw_data.loc[:, "term"] == (term - 1), "justice"].unique()) != \
                set(raw_data.loc[raw_data.loc[:, "term"] == term, "justice"].unique()):
            # Natural court change; trigger a forest fire
            print("Natural court change; rebuilding with {0} trees".format(initial_trees + (term_count * trees_per_term)))
            m = None
    # Build or grow a model depending on the initial/reset condition
    if m is None:
        # Grow an initial forest
        m = sklearn.ensemble.RandomForestClassifier(n_estimators=initial_trees + (term_count * trees_per_term),
                                                    class_weight="balanced_subsample",
                                                    warm_start=True,
                                                    n_jobs=-1)
    else:
        # Grow the forest by increasing the number of trees (requires warm_start=True)
        m.set_params(n_estimators=initial_trees + (term_count * trees_per_term))
    # Fit the forest model
    m.fit(feature_data_train, target_data_train)
    # Fit the "dummy" most-frequent-class baseline
    d = sklearn.dummy.DummyClassifier(strategy="most_frequent")
    d.fit(feature_data_train, target_data_train)
    # Perform forest predictions
    raw_data.loc[test_index, "rf_predicted"] = m.predict(feature_data_test)
    # Store scores per class
    scores = m.predict_proba(feature_data_test)
    raw_data.loc[test_index, "rf_predicted_score_other"] = scores[:, 0]
    raw_data.loc[test_index, "rf_predicted_score_affirm"] = scores[:, 1]
    raw_data.loc[test_index, "rf_predicted_score_reverse"] = scores[:, 2]
    # Store dummy predictions
    raw_data.loc[test_index, "dummy_predicted"] = d.predict(feature_data_test)
# In[6]:
# Evaluation range
evaluation_index = raw_data.loc[:, "term"].isin(term_range)
target_actual = raw_data.loc[evaluation_index, "justice_outcome_disposition"]
target_predicted = raw_data.loc[evaluation_index, "rf_predicted"]
target_dummy = raw_data.loc[evaluation_index, "dummy_predicted"]
raw_data.loc[:, "rf_correct"] = numpy.nan
raw_data.loc[:, "dummy_correct"] = numpy.nan
raw_data.loc[evaluation_index, "rf_correct"] = (target_actual == target_predicted).astype(float)
raw_data.loc[evaluation_index, "dummy_correct"] = (target_actual == target_dummy).astype(float)
# Compare model
print("RF model")
print("="*32)
print(sklearn.metrics.classification_report(target_actual, target_predicted))
print(sklearn.metrics.confusion_matrix(target_actual, target_predicted))
print(sklearn.metrics.accuracy_score(target_actual, target_predicted))
print("="*32)
print("")
# Dummy model
print("Dummy model")
print("="*32)
print(sklearn.metrics.classification_report(target_actual, target_dummy))
print(sklearn.metrics.confusion_matrix(target_actual, target_dummy))
print(sklearn.metrics.accuracy_score(target_actual, target_dummy))
print("="*32)
print("")
# In[7]:
# Setup time series
rf_correct_ts = raw_data.loc[evaluation_index, :].groupby("term")["rf_correct"].mean()
dummy_correct_ts = raw_data.loc[evaluation_index, :].groupby("term")["dummy_correct"].mean()
# Plot all accuracies
f = plt.figure(figsize=(16, 12))
plt.plot(rf_correct_ts.index, rf_correct_ts,
         marker='o', alpha=0.75)
plt.plot(dummy_correct_ts.index, dummy_correct_ts,
         marker='>', alpha=0.75)
plt.legend(('Random forest', 'Dummy'))
# In[8]:
# Setup time series
rf_spread_ts = rf_correct_ts - dummy_correct_ts
# Plot the accuracy spread over the dummy model
f = plt.figure(figsize=(16, 12))
plt.bar(rf_spread_ts.index, rf_spread_ts,
        alpha=0.75)
plt.xlabel("Term")
plt.ylabel("Accuracy spread")
plt.title("Spread over dummy model for justice accuracy")
# In[9]:
# Feature importance
feature_importance_df = pandas.DataFrame(list(zip(feature_df.columns, m.feature_importances_)),
                                         columns=["feature", "importance"])
feature_importance_df.sort_values(["importance"], ascending=False).head(10)
# In[10]:
# Output stats
print("t-test:")
print("Uncalibrated:")
print(scipy.stats.ttest_rel(rf_correct_ts.values,
                            dummy_correct_ts.values))
print("=" * 16)
print("ranksum-test:")
print("Uncalibrated:")
print(scipy.stats.ranksums(rf_correct_ts.values,
                           dummy_correct_ts.values))
print("=" * 16)
print("Binomial:")
print(statsmodels.stats.proportion.binom_test(raw_data.loc[evaluation_index, "rf_correct"].sum(),
                                              raw_data.loc[evaluation_index, "rf_correct"].shape[0],
                                              raw_data.loc[evaluation_index, "dummy_correct"].mean(),
                                              alternative="larger"))
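# The binomial test above uses the dummy model's mean accuracy as the null
# success probability and asks whether the forest's count of correct justice
# votes is significantly larger under that null. A worked toy example with
# made-up numbers (60 successes in 100 trials against a 0.5 null):
print(statsmodels.stats.proportion.binom_test(60, 100, 0.5, alternative="larger"))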
# ## Case Outcomes
# In[11]:
# Get case-level prediction: majority vote over the justice-level predictions
rf_predicted_case = pandas.DataFrame(raw_data.loc[evaluation_index, :]
                                     .groupby(["docketId"])["rf_predicted"]
                                     .agg(lambda x: x.value_counts().index[0]))
rf_predicted_case.columns = ["rf_predicted_case"]
dummy_predicted_case = pandas.DataFrame(raw_data.loc[evaluation_index, :]
                                        .groupby(["docketId"])["dummy_predicted"]
                                        .agg(lambda x: x.value_counts().index[0]))
dummy_predicted_case.columns = ["dummy_predicted_case"]
# Map the per-docket predictions back onto the justice-level rows
rf_predicted_case_df = raw_data[["docketId", "case_outcome_disposition", "rf_predicted"]].join(rf_predicted_case, on="docketId")
dummy_predicted_case_df = raw_data[["docketId", "dummy_predicted"]].join(dummy_predicted_case, on="docketId")
raw_data.loc[:, "rf_predicted_case"] = rf_predicted_case_df["rf_predicted_case"]
raw_data.loc[:, "dummy_predicted_case"] = dummy_predicted_case_df["dummy_predicted_case"]
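# The value_counts().index[0] aggregation above is a majority vote: it selects
# the most frequent justice-level prediction within each docket. A minimal
# illustration with made-up votes:
toy_votes = pandas.Series([2, 2, 1, 2, 1])
assert toy_votes.value_counts().index[0] == 2  # three votes for 2 beat two votes for 1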
# In[12]:
# Output case distribution; on ties, agg with mode() yields a multi-element
# result, which the type check below maps to None
case_outcomes = raw_data.groupby(["docketId"])["case_outcome_disposition"].agg(lambda x: x.mode())
case_outcomes = case_outcomes.apply(lambda x: int(x) if type(x) in [numpy.float64] else None)
print(case_outcomes.value_counts())
print(case_outcomes.value_counts(normalize=True))
# In[13]:
# Output comparison
# Evaluation range: terms in the evaluation window with a known case outcome
evaluation_index = raw_data.loc[:, "term"].isin(term_range) & ~raw_data.loc[:, "case_outcome_disposition"].isnull()
target_actual = raw_data.loc[evaluation_index, "case_outcome_disposition"]
target_predicted = raw_data.loc[evaluation_index, "rf_predicted_case"]
target_dummy = raw_data.loc[evaluation_index, "dummy_predicted_case"]
raw_data.loc[:, "rf_correct_case"] = numpy.nan
raw_data.loc[:, "dummy_correct_case"] = numpy.nan
raw_data.loc[evaluation_index, "rf_correct_case"] = (target_actual == target_predicted).astype(float)
raw_data.loc[evaluation_index, "dummy_correct_case"] = (target_actual == target_dummy).astype(float)
# Compare model
print("RF model")
print("="*32)
print(sklearn.metrics.classification_report(target_actual, target_predicted))
print(sklearn.metrics.confusion_matrix(target_actual, target_predicted))
print(sklearn.metrics.accuracy_score(target_actual, target_predicted))
print("="*32)
print("")
# Dummy model
print("Dummy model")
print("="*32)
print(sklearn.metrics.classification_report(target_actual, target_dummy))
print(sklearn.metrics.confusion_matrix(target_actual, target_dummy))
print(sklearn.metrics.accuracy_score(target_actual, target_dummy))
print("="*32)
print("")
# In[14]:
# Setup time series
rf_correct_case_ts = raw_data.loc[evaluation_index, :].groupby("term")["rf_correct_case"].mean()
dummy_correct_case_ts = raw_data.loc[evaluation_index, :].groupby("term")["dummy_correct_case"].mean()
# Plot all accuracies
f = plt.figure(figsize=(16, 12))
plt.plot(rf_correct_case_ts.index, rf_correct_case_ts,
         marker='o', alpha=0.75)
plt.plot(dummy_correct_case_ts.index, dummy_correct_case_ts,
         marker='>', alpha=0.75)
plt.legend(('Random forest', 'Dummy'))
# In[15]:
# Setup time series
rf_spread_case_ts = rf_correct_case_ts - dummy_correct_case_ts
# Plot the case-level accuracy spread over the dummy model
f = plt.figure(figsize=(16, 12))
plt.bar(rf_spread_case_ts.index, rf_spread_case_ts,
        alpha=0.75)
plt.xlabel("Term")
plt.ylabel("Accuracy spread")
plt.title("Spread over dummy model for case accuracy")
# In[16]:
# Setup time series: running sum of the spread's sign, i.e. a cumulative
# count of terms won minus terms lost against the dummy model
rf_spread_case_dir_ts = numpy.sign(rf_spread_case_ts).expanding().sum()
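# A quick illustration of the running sign-sum on made-up spreads: each term
# the forest beats the dummy adds 1, each losing term subtracts 1
toy_spread = pandas.Series([0.10, -0.20, 0.30])
print(numpy.sign(toy_spread).expanding().sum())  # 1.0, 0.0, 1.0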
# Plot all accuracies
f = plt.figure(figsize=(16, 12))
plt.plot(rf_spread_case_dir_ts.index, rf_spread_case_dir_ts,
         alpha=0.75)
# In[24]:
# Output stats
print("t-test:")
print("Uncalibrated:")
print(scipy.stats.ttest_rel(rf_correct_case_ts.values,
                            dummy_correct_case_ts.values))
print("=" * 16)
print("ranksum-test:")
print("Uncalibrated:")
print(scipy.stats.ranksums(rf_correct_case_ts.values,
                           dummy_correct_case_ts.values))
print("=" * 16)
print("Binomial:")
case_accuracy_data = raw_data.loc[evaluation_index, ["docketId", "rf_correct_case", "dummy_correct_case"]].drop_duplicates()
print(statsmodels.stats.proportion.binom_test(case_accuracy_data["rf_correct_case"].sum(),
                                              case_accuracy_data["rf_correct_case"].shape[0],
                                              case_accuracy_data["dummy_correct_case"].mean(),
                                              alternative="larger"))
# In[18]:
# Spot-check per-justice predictions for several notable cases
raw_data.loc[raw_data.loc[:, "caseName"] == "MIRANDA v. ARIZONA",
             ["caseName", "justiceName", "case_outcome_disposition", "justice_outcome_disposition",
              "rf_predicted", "rf_predicted_score_affirm", "rf_predicted_score_reverse", "rf_correct_case", "dummy_correct_case"]].T
# In[19]:
raw_data.loc[raw_data.loc[:, "caseName"] == "OBERGEFELL v. HODGES",
             ["caseName", "justiceName", "case_outcome_disposition", "justice_outcome_disposition",
              "rf_predicted", "rf_predicted_score_affirm", "rf_predicted_score_reverse", "rf_correct_case", "dummy_correct_case"]].T
# In[20]:
raw_data.loc[raw_data.loc[:, "caseName"] == "KING v. BURWELL",
             ["caseName", "justiceName", "case_outcome_disposition", "justice_outcome_disposition",
              "rf_predicted", "rf_predicted_score_affirm", "rf_predicted_score_reverse", "rf_correct_case", "dummy_correct_case"]].T
# In[21]:
raw_data.loc[raw_data.loc[:, "caseName"] == "CITIZENS UNITED v. FEDERAL ELECTION COMMISSION",
             ["caseName", "justiceName", "case_outcome_disposition", "justice_outcome_disposition",
              "rf_predicted", "rf_predicted_score_affirm", "rf_predicted_score_reverse", "rf_correct_case", "dummy_correct_case"]].T
# In[22]:
raw_data.loc[raw_data.loc[:, "caseName"] == "DISTRICT OF COLUMBIA v. HELLER",
             ["caseName", "justiceName", "case_outcome_disposition", "justice_outcome_disposition",
              "rf_predicted", "rf_predicted_score_affirm", "rf_predicted_score_reverse", "rf_correct_case", "dummy_correct_case"]].T
# In[23]:
# Write output
import cPickle as pickle
import time
timestamp_suffix = time.strftime("%Y%m%d%H%M%S")
raw_data.to_csv("data/output/model_output_{0}_{1}.csv".format(trees_per_term, timestamp_suffix),
                index=False)
# You probably don't want to do this, as the object can be tens of GB
#pickle.dump(m, open("scotus_scdb_model_{0}.pickle".format(timestamp_suffix), "w"))