Skip to content

Instantly share code, notes, and snippets.

View alexanderholt's full-sized avatar

alexanderholt

View GitHub Profile
# Keep only the columns of `sd` whose name begins with 'subjective'.
subjective_cols = list(
    filter(lambda name: name.startswith('subjective'), sd.columns)
)
print(subjective_cols)
# Build a boolean mask over the samples: start with an all-False array shaped
# like `labels`, then set the positions listed in
# dbscn.core_sample_indices_ to True.
# NOTE(review): `dbscn` is presumably a fitted DBSCAN estimator — confirm.
core_samples = np.zeros_like(labels, dtype = bool) # all-False boolean array with the same shape as labels
core_samples[dbscn.core_sample_indices_] = True
print(core_samples)
#I'm going to make this simple by writing a function to do this for different numbers of K
def cluster_batch(k, data=x_df):
    """Fit MiniBatchKMeans with ``k`` clusters on ``data`` and print a summary.

    Prints, in order: the raw label array, the labels again with a
    ``"Labels: "`` prefix, the cluster centroids, and two silhouette scores
    computed on subsamples of 10% and 20% of the rows of ``data``.

    Args:
        k: Number of clusters, passed as ``MiniBatchKMeans(n_clusters=k)``.
        data: Samples to cluster; must expose ``.shape``. Defaults to the
            module-level ``x_df`` as bound when this function is defined.

    Returns:
        None. Everything is reported via ``print``.
    """
    k_mean = MiniBatchKMeans(n_clusters=k)
    fitted = k_mean.fit(data)
    labels = fitted.labels_
    print(labels)
    print("Labels: " + str(labels))
    print("Centroids: " + str(fitted.cluster_centers_))

    # The silhouette score is computed on a subsample to keep it cheap; two
    # fractions are reported so the stability of the estimate can be eyeballed.
    # No random_state is passed, so repeated calls may print different scores.
    n_rows = data.shape[0]
    for fraction in (.1, .2):
        score = silhouette_score(data, labels, sample_size=int(n_rows * fraction))
        print("Silhouette Score: " + str(score))
# Binary indicator column: 1 where native-country is exactly 'United-States',
# 0 for every other value.
adults_new['native_born'] = [
    1 if country == 'United-States' else 0
    for country in adults['native-country']
]
# Tabulate model.feature_importances_ as a one-column DataFrame ('importance'),
# indexed by the column names of X_train_simpler and sorted from the largest
# importance to the smallest.
feature_importances = pd.DataFrame(model.feature_importances_,
index = X_train_simpler.columns,
columns=['importance']).sort_values('importance',
ascending=False)
# Bare expression: displays the table when it is the last statement of a
# notebook cell; it has no effect when run as a plain script.
feature_importances
# Multiply the importances by 100.
# NOTE(review): presumably converts fractions to percentages — confirm.
feature_importances['importance'] = feature_importances['importance'] * 100
import seaborn as sns
# Open a new matplotlib figure; figsize is (width, height) in inches.
plt.figure(figsize=(30,15)) # this creates a figure 30 inches wide, 15 inches high
# Vectorise the train and test text with a HashingVectorizer (English stop
# words removed) and wrap the dense results in DataFrames.
from sklearn.feature_extraction.text import HashingVectorizer
hvec = HashingVectorizer(stop_words='english')
# NOTE(review): HashingVectorizer is documented as stateless, so this fit()
# call is likely unnecessary — confirm before removing.
hvec.fit(data_train['data'])
hvecdata = hvec.transform(data_train['data'])
# NOTE(review): .todense() materialises the full matrix; with the vectorizer's
# default feature count this may be very large — verify memory use.
X_train = pd.DataFrame(hvecdata.todense())
print(X_train.shape)
# The test text goes through the same vectorizer instance as the training text.
X_test = pd.DataFrame(hvec.transform(data_test['data']).todense())
print(X_test.shape)
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training text and vectorise it
# in a single pass. The earlier version fitted the vectorizer, transformed the
# training text, and then ran fit_transform on the same text again; that
# repeated the work without changing the result.
tvec = TfidfVectorizer(stop_words='english')
tvecdata = tvec.fit_transform(data_train['data'])

# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); it is kept here for
# compatibility with the version this code was written against — confirm the
# installed version before switching.
X_train = pd.DataFrame(tvecdata.todense(), columns=tvec.get_feature_names())
print(X_train.shape)

# The test text is only transformed (never fitted) so that it shares the
# training vocabulary and therefore the same columns as X_train.
X_test = pd.DataFrame(tvec.transform(data_test['data']).todense(), columns=tvec.get_feature_names())
# Fit a 3-cluster KMeans model on X_scaled and print the centroids.
# NOTE(review): `cluster` is presumably sklearn.cluster — confirm the import.
k = 3
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(X_scaled)
# Pull the fitted attributes into plain variables for later use.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
inertia = kmeans.inertia_
print('Centroids:', centroids)
# Blank line to separate this output from whatever is printed next.
print('')
# Fit a logistic regression with default settings on the training split and
# print the value of model.score() on the test split.
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
# `model` is whatever fit() returns (for scikit-learn estimators, the fitted
# estimator itself).
model = logit.fit(X_train, y_train)
#predictions = model.predict(X_test)
print("Score:", model.score(X_test, y_test))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
plt.style.use('fivethirtyeight')
# plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'