from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text
    return TF-IDF features for each dataset, plus the fitted vocabulary
    """
    # Filter out too-rare words (occurring in fewer than 5 titles) and
    # too-frequent words (occurring in more than 90% of the tweets);
    # include unigrams and bigrams via ngram_range=(1, 2).
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

# Dictionary of all words from the train corpus with their counts.
from collections import Counter
words_counts = Counter(word for line in X_train for word in line.split(' '))

# Top 10 most common words, sorted by count in descending order.
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
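
# A minimal smoke test for tfidf_features; the tiny corpora below are
# hypothetical stand-ins for the real train/validation/test splits.
train = ["how to draw a stacked dotplot in r"] * 5 + ["mysql select records"] * 5
val = ["how to select records in mysql"]
test = ["draw a dotplot in r"]
X_train_tfidf, X_val_tfidf, X_test_tfidf, vocab = tfidf_features(train, val, test)
print(X_train_tfidf.shape)  # (10, number of n-grams that survive min_df/max_df)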
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http'])  # extend the stopwords; 'rt' means re-tweet
STOPWORDS = set(STOPWORDS)

def text_prepare(text):
    """text: a string; return: lowercased text with bad symbols and stopwords removed"""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace listed symbols with spaces
    text = BAD_SYMBOLS_RE.sub('', text)        # delete remaining bad symbols
    return ' '.join(w for w in text.split() if w not in STOPWORDS)
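
# A quick check of text_prepare on a hypothetical tweet-like string; the
# expected output assumes NLTK's English stopword list.
print(text_prepare("RT @user: Check this out! http://example.com #nlp"))
# -> 'user check examplecom nlp'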
import tensorflow as tf

# Create an interactive TensorFlow session.
sess = tf.InteractiveSession()

# These will be inputs for the model:
# 36x36-pixel images with one grayscale channel.
x = tf.placeholder("float", [None, 36, 36])
# Known labels, one-hot over the two font classes.
y_ = tf.placeholder("float", [None, 2])

# Reshape for the network: -1 fills out the batch dimension as needed to
# maintain the overall size, 36, 36 is the image dimension, and the final
# 1 is the number of channels (a grayscale image has one channel).
x_im = tf.reshape(x, [-1, 36, 36, 1])

# Compute softmax cross entropy between the logits y (the model's output,
# defined elsewhere) and the labels. It measures the probability error in
# discrete classification tasks where each font image has one and only one
# label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))

# GradientDescentOptimizer is used to minimize the loss.
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
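
# A hand-computed sketch of what softmax cross entropy measures, using
# NumPy on hypothetical 2-class logits (independent of the graph above).
import numpy as np

logits = np.array([2.0, 0.5])   # model scores for SansSerif, Serif
labels = np.array([1.0, 0.0])   # true class: SansSerif

softmax = np.exp(logits) / np.exp(logits).sum()
loss = -np.sum(labels * np.log(softmax))
print(softmax, loss)  # ~[0.82 0.18] and ~0.20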
# Apply exponential decay to the learning rate: start at 0.05 and multiply
# it by 0.85 after every 1000 steps (staircase decay).
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
# GradientDescentOptimizer minimizes the loss; passing global_step lets the decay schedule advance.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy, global_step=global_step)
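
# A sketch of the staircase schedule above: the rate starts at 0.05 and is
# multiplied by 0.85 once per 1000 steps.
for step in [0, 1000, 2000, 5000]:
    print(step, 0.05 * 0.85 ** (step // 1000))
# 0 0.05, 1000 0.0425, 2000 0.036125, 5000 ~0.02219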
# These will be inputs for an alternative, fully connected model:
# each image's pixels flattened into one vector of length 1296 = 36 * 36.
x = tf.placeholder("float", [None, 1296])
# Known labels, one-hot over the two font classes.
y_ = tf.placeholder("float", [None, 2])
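
# A minimal sketch (not from the original gist) of a linear model that
# produces the logits y consumed by the cross-entropy loss; W and b are
# hypothetical trainable variables for the flattened input and 2 classes.
W = tf.Variable(tf.zeros([1296, 2]))
b = tf.Variable(tf.zeros([2]))
y = tf.matmul(x, W) + b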
# Define accuracy: the fraction of examples whose predicted class
# (argmax of the logits) matches the true class.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
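
# A hedged sketch of the training loop these pieces feed into; train_data
# and train_labels are hypothetical NumPy arrays of flattened images and
# one-hot labels.
sess.run(tf.global_variables_initializer())
for i in range(1000):
    sess.run(train_step, feed_dict={x: train_data, y_: train_labels})
    if i % 100 == 0:
        print(i, sess.run(accuracy, feed_dict={x: train_data, y_: train_labels}))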