Skip to content

Instantly share code, notes, and snippets.

View alinazhanguwo's full-sized avatar

Alina Zhang alinazhanguwo

View GitHub Profile
def overallAccuracy(clusterDF, labelsDF):
    """Build the per-cluster counts and the label/cluster pairing table.

    Args:
        clusterDF: DataFrame with a ``cluster`` column holding the cluster
            assigned to each sample.
        labelsDF: true labels. It is concatenated column-wise with
            ``clusterDF`` and the result must have exactly two columns
            (assumes a single label column and a row index shared with
            ``clusterDF`` -- TODO confirm against callers).

    NOTE(review): only the opening of this function is visible. The visible
    part builds two intermediate frames and returns nothing; the accuracy
    computation suggested by the name is not shown here.
    """
    # One row per cluster id together with the number of samples assigned to it.
    count_by_cluster = clusterDF['cluster'].value_counts().reset_index()
    count_by_cluster.columns = ['cluster', 'clusterCount']

    # axis=1 concat aligns rows on the index, not on position.
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
def overallAccuracy(clusterDF, labelsDF):
    """Build the per-cluster counts and the label/cluster pairing table.

    Args:
        clusterDF: DataFrame with a ``cluster`` column holding the cluster
            assigned to each sample.
        labelsDF: true labels. It is concatenated column-wise with
            ``clusterDF`` and the result must have exactly two columns
            (assumes a single label column and a row index shared with
            ``clusterDF`` -- TODO confirm against callers).

    NOTE(review): only the opening of this function is visible. The visible
    part builds two intermediate frames and returns nothing; the accuracy
    computation suggested by the name is not shown here.
    """
    # One row per cluster id together with the number of samples assigned to it.
    count_by_cluster = clusterDF['cluster'].value_counts().reset_index()
    count_by_cluster.columns = ['cluster', 'clusterCount']

    # axis=1 concat aligns rows on the index, not on position.
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
# Clustering configuration (presumably consumed by a KMeans-style estimator
# defined elsewhere -- confirm against the code that reads these names).
n_clusters = 3
n_init = 12
max_iter = 225
tol = 1e-4

# Execution settings: fixed seed and "use every core" worker count.
random_state = 42
n_jobs = -1
# Capture the start timestamp before announcing the run; presumably used later
# to report elapsed training time (the consumer of t0 is not visible here).
# NOTE(review): `dt` is assumed to be an alias for datetime.datetime -- confirm the import.
t0 = dt.now()
print("========= Start training ... ")
# Hold out 33% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
# Print the shapes of both partitions as a quick sanity check.
print("Training ", X_train.shape, Y_train.shape)
print("Testing ", X_test.shape, Y_test.shape)
# Mini-batch size handed to the fit call below.
batch_size = 32

# Train for 20 epochs on the training partition (verbose=2 is assumed to be
# Keras' one-line-per-epoch logging -- `model` is defined elsewhere).
model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=batch_size,
    verbose=2,
)
# Layer sizes for the text classifier.
embed_dim = 128
lstm_out = 196

# Architecture: embedding -> spatial dropout -> LSTM -> 3-way softmax.
# NOTE(review): `max_fatures` and `X` must already be defined when this runs.
model = Sequential()
# The embedding input size is max_fatures; the sequence length is the second
# dimension of X.
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()
# Vocabulary cap passed to the tokenizer as num_words (and read by the
# embedding layer elsewhere in this file).
max_fatures = 2000

# Learn the vocabulary from the raw texts, splitting on single spaces.
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)

# Replace the raw texts with their index sequences, padded by pad_sequences.
X = pad_sequences(tokenizer.texts_to_sequences(X))
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    """Print accuracy and macro-averaged F1 for a set of predictions.

    Args:
        y_val: ground-truth labels.
        predicted: predicted labels, compared against ``y_val``.
    """
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
def train_classifier(X_train, y_train):
    """Fit a one-vs-rest logistic regression classifier.

    Args:
        X_train: training text (features passed straight to ``fit``).
        y_train: training sentiment labels.

    Returns:
        The fitted ``OneVsRestClassifier``.
    """
    # L2-regularised logistic regression wrapped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    # The documented contract is "create and fit" and "return: trained
    # classifier", so the model is fitted and handed back to the caller.
    model.fit(X_train, y_train)
    return model
%%time
# (Jupyter cell magic above must remain the first line of the cell.)
# Fit a linear SVM on the training features (TF-IDF, per the variable names).
svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
# 3-fold cross-validated accuracy.
# NOTE(review): cross_val_score clones and refits the estimator on every fold,
# so the fit above does not feed into these scores, and the folds here are cut
# from the *test* split whereas the logistic-regression cell cross-validates on
# the training split -- confirm which was intended.
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
# Report mean and standard deviation of the fold accuracies, scaled by 100.
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
# Logistic regression constructed with its default arguments.
logreg = LogisticRegression()
%%time
# NOTE(review): `%%time` is a Jupyter cell magic and has to be the first line
# of its cell, so the assignment above presumably lives in a separate cell.
logreg.fit(X_train_tfidf, y_train)
# 3-fold cross-validated accuracy on the training split (nothing is returned;
# the scores are only printed).
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
# Report mean and standard deviation of the fold accuracies, scaled by 100.
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))