Skip to content

Instantly share code, notes, and snippets.

# Draw 200 random samples from the distribution and show them as a histogram.
sampleddata = dist.rvs(200)
# density=True normalises the bars to unit area so the y-axis really is a
# probability density, matching the 'PDF' label (the default plots raw counts).
plt.hist(sampleddata, bins=20, density=True)
plt.xlabel('Sampled men height in cm')
plt.ylabel('PDF')
plt.show()
# Evaluate the density of `dist` on a grid of points and draw it as a curve.
x, y = eval_pdf(dist)

# Label the axes first, then draw the curve and display the figure.
plt.xlabel('Men height in cm')
plt.ylabel('PDF')
plt.plot(x, y)
plt.show()
# Build a normal random variable object with scipy.stats.
mu = 178     # location (mean) of the distribution
sigma = 7.7  # scale (standard deviation) of the distribution

# Passing loc/scale fixes the parameters, giving a reusable random variable
# object that exposes rvs(), pdf(), mean(), std(), ...
dist = scipy.stats.norm(loc=mu, scale=sigma)
def eval_pdf(rv):
    """Evaluate the PDF of a random variable on a grid centred on its mean.

    Args:
        rv: Object exposing ``mean()``, ``std()`` and ``pdf(x)``, e.g. a
            frozen ``scipy.stats`` distribution.

    Returns:
        A tuple ``(xs, ys)``: 100 evenly spaced points covering four standard
        deviations on either side of the mean, and the PDF values at those
        points.
    """
    mu = rv.mean()
    sigma = rv.std()
    # defining x values as a range around the mean
    xs = np.linspace(mu - 4 * sigma, mu + 4 * sigma, 100)
    ys = rv.pdf(xs)
    # The caller unpacks the result as ``x, y = eval_pdf(dist)``, so both
    # arrays must be returned.
    return xs, ys
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# Repeat the 75/25 train/test split with 100 different seeds and record the
# random-forest test accuracy of each split.
rf_acc = []
# NOTE(review): rf_f1 is never appended to in the visible snippet -- the
# original may be truncated; confirm whether an F1 score should be recorded.
rf_f1 = []
for rs in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=rs
    )
    # Flatten the targets to 1-D arrays before fitting and scoring.
    y_train, y_test = np.ravel(y_train), np.ravel(y_test)

    # The forest's own seed is fixed, so only the split varies between runs.
    rfclf = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=0)
    rfclf.fit(X_train, y_train)

    y_pred_rf = rfclf.predict(X_test)
    rf_acc.append(accuracy_score(y_test, y_pred_rf))
# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier

# Fit a 1000-tree forest on the current training split.
rfclf = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=0)
rfclf.fit(X_train, y_train)

# Hard class predictions, plus the predicted probability in column 1
# (presumably the positive class -- confirm against rfclf.classes_).
y_pred_rf = rfclf.predict(X_test)
rf_probs = rfclf.predict_proba(X_test)[:, 1]

rf_accuracy = accuracy_score(y_test, y_pred_rf)
print('Random Forest: Accuracy=%.3f' % rf_accuracy)
# Training the data and predicting
# Logistic regression
from sklearn.linear_model import LogisticRegression

# Fit a model; the iteration cap is set high, presumably so that the lbfgs
# solver has room to converge.
lrclf = LogisticRegression(max_iter=10000, solver='lbfgs')
lrclf.fit(X_train, y_train)

# Hard class predictions and the predicted probability in column 1.
y_pred_lr = lrclf.predict(X_test)
lr_probs = lrclf.predict_proba(X_test)[:, 1]
# Loading a bunch of sklearn modules:
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# Baseline model using the "most_frequent" strategy, fitted on the training
# split for comparison with the real classifiers.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

# Show how many samples fall in each 'binnedgrade' class;
# we clearly have imbalanced datasets.
print(y['binnedgrade'].value_counts())