# Computes softmax cross entropy between logits and labels
# Measures the probability error in discrete classification tasks
# For example, each font image is labeled with one and only one label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))
# Applies exponential decay to the learning rate
learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
# GradientDescentOptimizer is used to minimize the loss
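# (Sketch, not from the original) One way the decayed rate could be wired into the optimizer,
# assuming global_step was created earlier, e.g. global_step = tf.Variable(0, trainable=False).
# Passing global_step to minimize() increments it each step so the decay schedule advances.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
    cross_entropy, global_step=global_step)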
# Create an interactive TensorFlow session
sess = tf.InteractiveSession()
# These will be inputs for the model
# 36x36 pixel images with one channel (grayscale)
x = tf.placeholder("float", [None, 36, 36])
# -1 lets reshape infer that dimension so the overall number of elements is preserved
# 36, 36 are the image dimensions
# the final 1 is the number of channels; grayscale images have a single channel
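# (Sketch, not from the original) The reshape the comments above describe; the name x_im is an assumption.
x_im = tf.reshape(x, [-1, 36, 36, 1])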
# Computes softmax cross entropy between logits and labels
# Measures the probability error in discrete classification tasks
# For example, each font image is labeled with one and only one label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))
# GradientDescentOptimizer is used to minimize the loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
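# (Sketch, not from the original) A minimal training loop; the arrays `train` and
# `train_labels` and the step count are assumptions.
sess.run(tf.global_variables_initializer())
for i in range(1000):
    sess.run(train_step, feed_dict={x: train, y_: train_labels})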
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http'])  # extend stopwords; rt means re-tweet
STOPWORDS = set(STOPWORDS)
def text_prepare(text):
    """
    text: a string
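    """
    # (Sketch, not from the original) The embedded snippet is cut off here; a plausible body
    # that lowercases the text, applies both regexes, and drops stopwords:
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace the listed symbols with a space
    text = BAD_SYMBOLS_RE.sub('', text)        # delete remaining bad symbols
    text = ' '.join(w for w in text.split() if w not in STOPWORDS)  # drop stopwords
    return text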
# Dictionary of all words from the train corpus with their counts.
words_counts = {}
from collections import Counter
words_counts = Counter([word for line in X_train for word in line.split(' ')])
# Sort by count in descending order and keep the top 10
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
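# Equivalently, since words_counts is a Counter, its built-in helper gives the same top-10 list:
most_common_words = words_counts.most_common(10)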
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text
    return: TF-IDF-vectorized representation of each dataset
    """
    # Filter out too-rare words (occurring in fewer than 5 tweets) and too-frequent words (occurring in more than 90% of the tweets)
    # Use unigrams and bigrams: ngram_range=(1, 2)
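    # (Sketch, not from the original) A plausible body matching the comments above;
    # the exact thresholds and the returned values are assumptions.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer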
logreg = LogisticRegression()
%%time
logreg.fit(X_train_tfidf, y_train)
# Report cross-validated accuracy
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
%%time
svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
def train_classifier(X_train, y_train):
    """
    X_train, y_train - training text and sentiment
    return: trained classifier
    """
    # Create and fit LogisticRegression wrapped into OneVsRestClassifier.
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
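    # (Sketch, not from the original) The snippet is cut off here; the docstring implies the
    # model is fitted on the training data and returned.
    model.fit(X_train, y_train)
    return model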
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))