Skip to content

Instantly share code, notes, and snippets.

Alina Zhang alinazhanguwo

Block or report user

Report or block alinazhanguwo

Hide content and notifications from this user.

Learn more about blocking users

Contact Support about this user’s behavior.

Learn more about reporting abuse

Report abuse
View GitHub Profile
# Create train and test datasets (33% held out; fixed seed for reproducibility).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print("Training ", X_train.shape, Y_train.shape)  # fixed typo: was "Trianing"
print("Testing ", X_test.shape, Y_test.shape)
batch_size = 32
# NOTE(review): the fit call was garbled in the paste ("batch_size = 32, Y_train, ...");
# reconstructed from the surviving keyword arguments — confirm against the original notebook.
model.fit(X_train, Y_train, epochs=20, batch_size=batch_size, verbose=2)
embed_dim = 128   # embedding vector size per token
lstm_out = 196    # LSTM hidden units
model = Sequential()
# input_length is the padded sequence length produced by pad_sequences elsewhere.
model.add(Embedding(max_fatures, embed_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# BUG FIX: the original compiled without an output layer, so the raw 196-unit
# LSTM output could not match one-hot labels under categorical_crossentropy.
# NOTE(review): 2 classes assumed (binary sentiment) — TODO confirm Y_train width.
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
max_fatures = 2000  # vocabulary cap (name kept as-is: it is referenced elsewhere in the file)
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# BUG FIX: the tokenizer must be fitted before texts_to_sequences; an unfitted
# Tokenizer has no vocabulary and returns empty sequences for every text.
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X)  # pad every sequence to the longest length in X
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    """Print accuracy and macro-averaged F1 of *predicted* against *y_val*."""
    # Indentation restored (lost in the paste); fixed typo: was "Accracy".
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))
def train_classifier(X_train, y_train):
    """Fit a one-vs-rest L2 logistic regression classifier.

    X_train, y_train -- training text features and sentiment labels
    return: trained classifier
    """
    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    # NOTE(review): the fit/return lines were lost in the paste; reconstructed —
    # confirm against the original notebook.
    model.fit(X_train, y_train)
    return model
# NOTE(review): the first line was garbled ("LinearSVC(dual=False), y_train)");
# reconstructed as create-then-fit — confirm against the original notebook.
svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
# 3-fold cross-validated accuracy on the TF-IDF test split.
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
logreg = LogisticRegression()
# NOTE(review): "%%time, y_train)" was a garbled Jupyter cell magic fused with the
# fit call; the fit is reconstructed here and the notebook-only %%time is dropped.
logreg.fit(X_train_tfidf, y_train)
# Report 3-fold cross-validated accuracy on the training TF-IDF features.
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """Vectorize train/val/test text with one shared TF-IDF vocabulary.

    X_train, X_val, X_test -- iterables of input text
    return: TF-IDF matrices for each dataset plus the fitted vectorizer
    """
    # Filter out too-rare words (occur in fewer than 5 titles) and too-frequent
    # words (occur in more than 90% of the tweets); include bigrams (ngram_range=(1,2)).
    # NOTE(review): the body was lost in the paste; reconstructed from the
    # surviving comments above — confirm against the original notebook.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer
View most common
# Count every word in the train corpus (tokens are split on single spaces).
from collections import Counter

# Removed the dead `words_counts = {}` that was immediately overwritten.
words_counts = Counter(word for line in X_train for word in line.split(' '))
# Top 10 most common words; Counter.most_common replaces the manual
# sorted(..., reverse=True)[:10] and returns the same (word, count) pairs.
most_common_words = words_counts.most_common(10)
View Text
# Characters to normalize to a space (punctuation, separators, mentions, hashtags).
# BUG FIX: raw strings — the originals contained invalid escape sequences
# (\[ \] \|) which raise SyntaxWarning/DeprecationWarning on modern Python.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')
# Anything that is not a digit, lowercase letter, space, '+' or '_' is removed.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
# English stop-word list, extended with Twitter-specific tokens
# ('rt' marks a re-tweet; 'http' catches bare link fragments).
STOPWORDS = stopwords.words('english')
for twitter_token in ('rt', 'http'):
    STOPWORDS.append(twitter_token)
def text_prepare(text):
text: a string
You can’t perform that action at this time.