# top 10 states in America
['California',
 'Florida',
 'Georgia',
 'Illinois',
 'Massachusetts',
 'Michigan',
 'New Jersey',
 'New York',
 'Pennsylvania',
# set up the parameters
n_init = 12
max_iter = 225
tol = 0.0001
random_state = 42
n_jobs = -1

t0 = dt.now()  # assumes: from datetime import datetime as dt
print("========= Start training ... ")
def overallAccuracy(clusterDF, labelsDF):
    countByCluster = pd.DataFrame(data=clusterDF['cluster'].value_counts())
    countByCluster.reset_index(inplace=True, drop=False)
    countByCluster.columns = ['cluster', 'clusterCount']
    # print('countByCluster \n', countByCluster)
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
    # print('preds \n', preds)
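
# Hedged sketch, not the author's original continuation: a common way to
# finish this kind of cluster-accuracy calculation (assuming pandas as pd,
# as above) is to treat each cluster's most frequent true label as that
# cluster's prediction and divide the matching counts by the total points.
def overallAccuracy_sketch(clusterDF, labelsDF):
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
    # count of the most common true label within each cluster
    countMostFreq = preds.groupby('cluster').agg(lambda x: x.value_counts().iloc[0])
    # overall accuracy = points carrying their cluster's majority label / all points
    return countMostFreq['trueLabel'].sum() / preds.shape[0]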
# set up the parameters
n_init = 12
max_iter = 225
tol = 0.0001
random_state = 42
n_jobs = -1
n_clusters = 3

t0 = dt.now()
print("========= Start training ... ")
# create train and test datasets
# assumes: from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print("Training ", X_train.shape, Y_train.shape)
print("Testing ", X_test.shape, Y_test.shape)
batch_size = 32
model.fit(X_train, Y_train, epochs=20, batch_size=batch_size, verbose=2)
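
# Hedged follow-up (assumed, not part of the original snippet): score the
# fitted model on the held-out split with Keras' model.evaluate.
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)
print("Test loss: %.4f" % score)
print("Test accuracy: %.4f" % acc)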
# assumes the usual Keras imports (Sequential, Embedding, SpatialDropout1D, LSTM, Dense)
embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
# assumes: from keras.preprocessing.text import Tokenizer
#          from keras.preprocessing.sequence import pad_sequences
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

def evaluation_scores(y_val, predicted):
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
def train_classifier(X_train, y_train):
    """
    X_train, y_train - training text and sentiment
    return: trained classifier
    """
    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
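    # Hedged completion (the snippet is cut off above; this is the usual
    # ending, not verified against the original): fit the wrapped model on
    # the training data and return it.
    model.fit(X_train, y_train)
    return model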