alinazhanguwo / DeltaOfTwoDates.py
Compute the delta between two consecutive dates in the number of Deaths for the top 10 states in America.
# top 10 states in America
'''['California',
    'Florida',
    'Georgia',
    'Illinois',
    'Massachusetts',
    'Michigan',
    'New Jersey',
    'New York',
    'Pennsylvania',
    ...]'''
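The body of DeltaOfTwoDates.py is not shown on this page. A minimal sketch of the stated goal, assuming a long-format DataFrame df with 'state', 'date', and cumulative 'Deaths' columns (all assumed names, as is the death_deltas helper):

import pandas as pd

def death_deltas(df, states):
    # hypothetical helper: keep only the listed states, ordered by state and date
    subset = df[df['state'].isin(states)].sort_values(['state', 'date']).copy()
    # delta between two consecutive dates, computed independently per state
    subset['delta'] = subset.groupby('state')['Deaths'].diff()
    return subset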
import pandas as pd

def overallAccuracy(clusterDF, labelsDF):
    # size of each cluster
    countByCluster = pd.DataFrame(data=clusterDF['cluster'].value_counts())
    countByCluster.reset_index(inplace=True, drop=False)
    countByCluster.columns = ['cluster', 'clusterCount']
    # pair each observation's true label with its assigned cluster
    preds = pd.concat([labelsDF, clusterDF], axis=1)
    preds.columns = ['trueLabel', 'cluster']
    # count of the most frequent true label within each cluster
    countMostFreq = preds.groupby('cluster')['trueLabel'].agg(
        lambda x: x.value_counts().iloc[0]).reset_index(name='countMostFrequent')
    # points matching their cluster's majority label, over all points
    return countMostFreq['countMostFrequent'].sum() / countByCluster['clusterCount'].sum()
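Hypothetical call, assuming clusterDF holds the cluster assignments in a 'cluster' column and labelsDF holds the true labels aligned on the same index:

# clusterDF and labelsDF are assumed inputs, aligned row by row
print('Overall accuracy:', overallAccuracy(clusterDF, labelsDF))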
# set up the parameters
from datetime import datetime as dt

n_init = 12        # restarts with different centroid seeds
max_iter = 225     # iteration cap per run
tol = 0.0001       # convergence tolerance
random_state = 42
n_jobs = -1        # use all cores (this argument was removed from KMeans in newer scikit-learn)
n_clusters = 3
t0 = dt.now()
print("========= Start training ... ")
# create train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print("Training ", X_train.shape, Y_train.shape)
print("Testing  ", X_test.shape, Y_test.shape)
batch_size = 32
model.fit(X_train, Y_train, epochs=20, batch_size=batch_size, verbose=2)
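A natural next step, not shown in the scrape, is scoring the held-out split with the standard Keras evaluate call:

# assumed follow-up: evaluate on the test split from the snippet above
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)
print("loss: %.4f  accuracy: %.4f" % (score, acc))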
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential()
# map each token id to a 128-dim dense vector
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# 3-way sentiment output
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

max_fatures = 2000

# keep the 2000 most frequent words, split on whitespace
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)  # zero-pad every sequence to the longest one
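A toy illustration of the two transforms; the token ids below are made up, since the real ids depend on the fitted vocabulary:

# made-up example: ids are illustrative only
seq = tokenizer.texts_to_sequences(['good movie'])  # e.g. [[12, 7]]
print(pad_sequences(seq, maxlen=X.shape[1]))        # zero-padded to X's width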
from sklearn.metrics import (accuracy_score, f1_score, roc_auc_score,
                             average_precision_score, recall_score)

def evaluation_scores(y_val, predicted):
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

def train_classifier(X_train, y_train):
    """
    X_train, y_train — training text and sentiment
    return: trained classifier
    """
    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    model.fit(X_train, y_train)
    return model
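Hypothetical end-to-end usage with the evaluation helper above; X_train, y_train, X_val, and y_val are assumed to be already-vectorized text features and labels:

# assumed usage: features already vectorized (e.g. TF-IDF)
classifier = train_classifier(X_train, y_train)
predicted = classifier.predict(X_val)
evaluation_scores(y_val, predicted)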