# Computes softmax cross entropy between logits and labels
# Measures the probability error in discrete classification tasks
# For example, each font image is labeled with one and only one label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))
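# global_step is not defined in this excerpt; a minimal sketch of how it is
# usually created before the decay schedule below (an assumption, not shown in the gist):
global_step = tf.Variable(0, trainable=False, name='global_step')
# Passing it to optimizer.minimize(..., global_step=global_step) increments it once
# per training step, which is what drives the decay.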
# Apply exponential decay to the learning rate:
# start at 0.05 and multiply by 0.85 every 1000 global steps (staircase=True)
learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
# Create an interactive TensorFlow session
sess = tf.InteractiveSession()
# These will be inputs for the model
# 36x36-pixel images with one channel (grayscale)
x = tf.placeholder("float", [None, 36, 36])
# -1 in the reshape means "fill out this dimension as needed to keep the overall size"
# 36, 36 are the image dimensions
# the final 1 is the number of channels
# grayscale images have a single channel
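# The reshape the comments above describe is missing from this excerpt; a minimal
# sketch (the variable name x_im is an assumption):
x_im = tf.reshape(x, [-1, 36, 36, 1])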
# GradientDescentOptimizer is used to minimize loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http']) # extend stopwords; rt means re-tweet
STOPWORDS = set(STOPWORDS)
def text_prepare(text):
    """
    text: a string
    """
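    # The function body is not shown in this excerpt; a minimal sketch using the
    # regexes and STOPWORDS defined above (an assumption, not the original gist body):
    text = text.lower()                           # lowercase the text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)     # replace listed symbols with spaces
    text = BAD_SYMBOLS_RE.sub('', text)           # delete remaining bad symbols
    text = ' '.join(w for w in text.split() if w not in STOPWORDS)  # drop stopwords
    return text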
# Dictionary of all words from the train corpus with their counts
from collections import Counter
words_counts = Counter(word for line in X_train for word in line.split(' '))
# Sort by count and keep the top 10 most common words
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text
    return a TF-IDF representation for each dataset
    """
    # filter out too rare words (appear in fewer than 5 titles) and too frequent words
    # (appear in more than 90% of the tweets); use unigrams and bigrams --> ngram_range=(1, 2)
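    # The body is not shown in this excerpt; a sketch following the comments above
    # (min_df=5, max_df=0.9 come from the comments; the token_pattern is an assumption):
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                                       token_pattern=r'(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit vocabulary on train only
    X_val = tfidf_vectorizer.transform(X_val)          # reuse the fitted vocabulary
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_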
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

logreg = LogisticRegression()
%%time
logreg.fit(X_train_tfidf, y_train)
# Cross-validated accuracy on the training set
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
%%time
from sklearn.svm import LinearSVC

svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
from sklearn.multiclass import OneVsRestClassifier

def train_classifier(X_train, y_train):
    """
    X_train, y_train - training text and sentiment
    return: trained classifier
    """
    # Create and fit a LogisticRegression wrapped in OneVsRestClassifier
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    model.fit(X_train, y_train)
    return model
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))