Skip to content

Instantly share code, notes, and snippets.

Alina Zhang alinazhanguwo

Block or report user

Report or block alinazhanguwo

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
View kerasLSTMTraining.py
# Split the data into train/test sets and fit the LSTM model.
# NOTE(review): relies on X, Y, model and train_test_split defined in earlier
# cells of this notebook/gist.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print("Training ", X_train.shape, Y_train.shape)  # typo fixed: "Trianing" -> "Training"
print("Testing ", X_test.shape, Y_test.shape)
batch_size = 32
model.fit(X_train, Y_train, epochs=20, batch_size=batch_size, verbose=2)
View kerasLSTM.py
# Sentiment LSTM: embedding -> spatial dropout -> LSTM -> 3-way softmax.
embed_dim = 128   # size of each word embedding vector
lstm_out = 196    # number of LSTM units

# List form of Sequential builds the exact same layer stack as repeated add().
model = Sequential([
    Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
View kerasTokenizer.py
# Tokenize the corpus: keep only the 2000 most frequent words, map each text
# to a sequence of word indices, then left-pad all sequences to equal length.
max_fatures = 2000  # (sic) name kept unchanged -- the model-building cell references it
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(X)
X = pad_sequences(tokenizer.texts_to_sequences(X))
View evaluation.py
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    """Print accuracy and macro-averaged F1 for predictions vs. ground truth.

    y_val     : true labels
    predicted : predicted labels
    """
    # typo fixed in the printed label: "Accracy" -> "Accuracy"
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
View OneVsRest.py
def train_classifier(X_train, y_train):
    """
    X_train, y_train — training text features and sentiment labels

    return: trained classifier
    """
    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    # Bug fix: the original created the model but never fitted it and never
    # returned it, even though the docstring promises a trained classifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    model.fit(X_train, y_train)
    return model
View linearSVC.py
%%time
# Linear SVM on TF-IDF features; dual=False is the recommended setting when
# n_samples > n_features.
svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
# NOTE(review): cross_val_score clones and refits the estimator internally, so
# the fit above does not influence the reported scores. Also this
# cross-validates on the TEST split, while the LogisticRegression cell uses the
# train split -- confirm which was intended.
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
View LogisticRegression.py
logreg = LogisticRegression()
# NOTE(review): %%time is a Jupyter CELL magic and must be the first line of a
# cell; placed after a statement like this it is a syntax error -- confirm how
# this code is actually split across cells in the original notebook.
%%time
logreg.fit(X_train_tfidf, y_train)
# Return accuracy
# NOTE(review): cross_val_score clones and refits logreg internally, so the
# fit above does not affect the reported cross-validation scores.
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
View tokenization.py
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text

    return TF-IDF vectorizer for each dataset
    """
    # filter out too rare words (occur less than in 5 titles) and too frequent words (occur more than in 90% of the tweets)
    # ngram!!! --> ngram_range=(1,2)
    # NOTE(review): the implementation body appears truncated in this capture --
    # only the docstring and planning comments are visible. Presumably a
    # TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2)) is built, fit on
    # X_train, and used to transform all three splits -- confirm against the
    # full gist before relying on this function.
View most common words.py
# Count every word in the training corpus and report the 10 most frequent.
# NOTE(review): relies on X_train (an iterable of strings) from an earlier cell.
from collections import Counter

# Dead code removed: the original initialised words_counts = {} and then
# immediately rebound it to a Counter.
words_counts = Counter(word for line in X_train for word in line.split(' '))

# Top 10 -- Counter.most_common(n) is documented as equivalent to sorting the
# (word, count) items by count in descending order and taking the first n.
most_common_words = words_counts.most_common(10)
View Text Pre-processing.py
# Pre-compiled text-cleanup patterns (compiled once at import time).
# Fix: the originals were non-raw strings, so '\[', '\]' and '\|' are invalid
# escape sequences (DeprecationWarning today, a SyntaxError in future Python).
# Raw strings produce the identical regex without the bad escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')  # punctuation -> single space
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')            # anything else is stripped
# English stopword set, extended with Twitter-specific tokens before lookup use.
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.update(['rt', 'http'])  # 'rt' marks a re-tweet
def text_prepare(text):
"""
text: a string
You can’t perform that action at this time.