import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython/Jupyter magic -- invalid syntax in a plain .py file
import json

# Load the dataset; the context manager guarantees the file handle is closed
# (the original left it open), and json.load reads straight from the file.
with open('data.json') as handle:
    data = json.load(handle)

# In Python 3 map() is lazy: the original bare `map(...)` expression built an
# iterator, discarded it, and computed nothing. Materialize the result.
squares = [x * x for x in [1, 2, 3, 4]]
from sklearn.model_selection import train_test_split

# Hold out one third of the samples for evaluation; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.33)
# Estimators compared below via 10-fold cross-validation.
from matplotlib import pyplot
# %matplotlib inline  # IPython/Jupyter magic -- invalid syntax in a plain .py file
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# (label, estimator) pairs to benchmark against each other.
# Bug fix: the 'AdaBoost' entry instantiated LinearDiscriminantAnalysis(),
# so LDA was evaluated twice and AdaBoostClassifier (imported above) never ran.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('AdaBoost', AdaBoostClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]
# 10-fold cross-validated accuracy for every candidate model.
results = []  # one array of per-fold scores per model
names = []    # model labels, in the same order as `results`
for label, estimator in models:
    scores = cross_val_score(estimator, X, y, cv=10, n_jobs=-1, scoring='accuracy')
    results.append(scores)
    names.append(label)
    # f-string equivalent of the original "%s: %f (%f)" formatting
    print(f"{label}: {scores.mean():f} ({scores.std():f})")
# Box-and-whisker comparison of the per-fold accuracy distributions.
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
# Draw directly on the axes object (same target the pyplot state machine used).
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()
from sklearn.metrics import confusion_matrix
# NOTE(review): `clf` is not defined before this point in the file -- it is
# assigned only further down (the Pipeline / RandomForest sections). This only
# works in a notebook where a cell fitting `clf` was executed first; verify
# the intended execution order.
y_pred = clf.predict(X_test)
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
# Bug fix: PolynomialFeatures was used below without ever being imported.
from sklearn.preprocessing import PolynomialFeatures

# Feature selection -> reduction -> expansion pipeline feeding a logistic
# regression. NOTE(review): chi2 scoring requires non-negative feature values
# -- confirm the input data satisfies this.
clf = Pipeline([
    # keep the 100 features with the highest chi-squared score
    ('select_best', SelectKBest(score_func=chi2, k=100)),
    # recursively prune down to 60 features; modern scikit-learn requires the
    # keyword form (the bare positional `60` is no longer accepted)
    ('rfe', RFE(LogisticRegression(), n_features_to_select=60, verbose=1)),
    ('pca', PCA(n_components=20)),
    ('polynomial', PolynomialFeatures(2)),
    ('classify', LogisticRegression())])
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython/Jupyter magic -- invalid syntax in a plain .py file

# Overlaid histograms of `feature` for non-survivors vs survivors.
# NOTE(review): `train` and `feature` are not defined anywhere in this file --
# they must come from an earlier notebook cell; verify execution order.
plt.hist([train[train['Survived'] == 0][feature],
          train[train['Survived'] == 1][feature]])
plt.legend(["Died", "Survived"])
# One point per passenger, grouped by class and colored by survival.
sns.swarmplot(x="Pclass", y="Male", hue="Survived", data=train)
# Bug fix: `pandas.tools.plotting` was removed from pandas (0.25+); the
# function now lives in `pandas.plotting`.
from pandas.plotting import scatter_matrix

# Exploratory visualisation of `data`: pairwise scatter plots, histograms,
# density plots, a correlation heat map, and per-column box plots.
scatter_matrix(data)
data.hist()
data.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
correlations = data.corr()
fig = pyplot.figure()
ax = fig.add_subplot(111)
# Render the correlation matrix as an image, pinned to the full [-1, 1] range.
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
pyplot.show()
data.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
# NOTE(review): `train` is not defined in this file -- notebook cell order.
sns.pairplot(train[['Survived', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Male']], hue='Survived')
# Dan Rufener [9:08 AM]  -- chat-client paste artifact, not code; commented out so the file parses
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import matplotlib.pyplot as plt

# Rank features by random-forest importance and plot them, most important first.
clf = RandomForestRegressor(n_estimators=1000, random_state=0, n_jobs=-1)
clf.fit(X, y)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, highest importance first
for f in range(X.shape[1]):
    # Bug fix: the original printed the UNSORTED column name (X.columns[f])
    # next to the SORTED importance value, mislabelling every row.
    print("%2d) %-*s %f" % (f + 1, 30, X.columns[indices[f]], importances[indices[f]]))
plt.figure(figsize=(15,8))
plt.title('Feature Importances')
plt.bar(range(X.shape[1]),
        importances[indices],
        color='lightblue',
        align='center')
# Bug fix: tick labels must follow the same importance ordering as the bars.
# The original used X.columns[1::], which both skips column 0 and is unsorted,
# so every bar was labelled with the wrong feature name.
plt.xticks(range(X.shape[1]), X.columns[indices], rotation=90)
plt.show()
# "Elbow" plot: cumulative explained variance as a function of the number of
# principal components, used to pick a sensible dimensionality.
pca = PCA(n_components=50)
pca.fit(X_train)
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.plot(cumulative_ratio)
plt.title("PCA elbow plot")
plt.xlabel("n_components")
plt.ylabel("cumulative explained variance ratio")
plt.show()