from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text
    return TF-IDF features for each dataset, plus the fitted vocabulary
    """
    # Filter out too-rare words (occurring in fewer than 5 titles) and
    # too-frequent words (occurring in more than 90% of the tweets);
    # include unigrams and bigrams via ngram_range=(1, 2).
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2))
    X_train = tfidf_vectorizer.fit_transform(X_train)
    X_val = tfidf_vectorizer.transform(X_val)
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

# Dictionary of all words from the train corpus with their counts.
from collections import Counter
words_counts = Counter(word for line in X_train for word in line.split(' '))

# Top 10 most common words, sorted by count in descending order.
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
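
# A minimal smoke test for tfidf_features; the tiny corpora below are
# hypothetical stand-ins for the real train/validation/test splits.
train = ["how to draw a stacked dotplot in r"] * 5 + ["mysql select records"] * 5
val = ["how to select records in mysql"]
test = ["draw a dotplot in r"]
X_train_tfidf, X_val_tfidf, X_test_tfidf, vocab = tfidf_features(train, val, test)
print(X_train_tfidf.shape)  # (10, number of n-grams that survive min_df/max_df)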
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http'])  # extend the stopwords; 'rt' means re-tweet
STOPWORDS = set(STOPWORDS)

def text_prepare(text):
    """text: a string; return: lowercased text with bad symbols and stopwords removed"""
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # replace listed symbols with spaces
    text = BAD_SYMBOLS_RE.sub('', text)        # delete remaining bad symbols
    return ' '.join(w for w in text.split() if w not in STOPWORDS)
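
# A quick check of text_prepare on a hypothetical tweet-like string; the
# expected output assumes NLTK's English stopword list.
print(text_prepare("RT @user: Check this out! http://example.com #nlp"))
# -> 'user check examplecom nlp'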
import tensorflow as tf

# Create an interactive TensorFlow session.
sess = tf.InteractiveSession()

# These will be inputs for the model:
# 36x36-pixel images with one grayscale channel.
x = tf.placeholder("float", [None, 36, 36])
# Known labels, one-hot over the two font classes.
y_ = tf.placeholder("float", [None, 2])

# Reshape for the network: -1 fills out the batch dimension as needed to
# maintain the overall size, 36, 36 is the image dimension, and the final
# 1 is the number of channels (a grayscale image has one channel).
x_im = tf.reshape(x, [-1, 36, 36, 1])

# Compute softmax cross entropy between the logits y (the model's output,
# defined elsewhere) and the labels. It measures the probability error in
# discrete classification tasks where each font image has one and only one
# label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))

# GradientDescentOptimizer is used to minimize the loss.
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
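
# A hand-computed sketch of what softmax cross entropy measures, using
# NumPy on hypothetical 2-class logits (independent of the graph above).
import numpy as np

logits = np.array([2.0, 0.5])   # model scores for SansSerif, Serif
labels = np.array([1.0, 0.0])   # true class: SansSerif

softmax = np.exp(logits) / np.exp(logits).sum()
loss = -np.sum(labels * np.log(softmax))
print(softmax, loss)  # ~[0.82 0.18] and ~0.20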
# Apply exponential decay to the learning rate: start at 0.05 and multiply
# it by 0.85 after every 1000 steps (staircase decay).
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
# GradientDescentOptimizer minimizes the loss; passing global_step lets the decay schedule advance.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy, global_step=global_step)
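
# A sketch of the staircase schedule above: the rate starts at 0.05 and is
# multiplied by 0.85 once per 1000 steps.
for step in [0, 1000, 2000, 5000]:
    print(step, 0.05 * 0.85 ** (step // 1000))
# 0 0.05, 1000 0.0425, 2000 0.036125, 5000 ~0.02219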
# These will be inputs for an alternative, fully connected model:
# each image's pixels flattened into one vector of length 1296 = 36 * 36.
x = tf.placeholder("float", [None, 1296])
# Known labels, one-hot over the two font classes.
y_ = tf.placeholder("float", [None, 2])
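
# A minimal sketch (not from the original gist) of a linear model that
# produces the logits y consumed by the cross-entropy loss; W and b are
# hypothetical trainable variables for the flattened input and 2 classes.
W = tf.Variable(tf.zeros([1296, 2]))
b = tf.Variable(tf.zeros([2]))
y = tf.matmul(x, W) + b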
# Define accuracy: the fraction of examples whose predicted class
# (argmax of the logits) matches the true class.
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
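
# A hedged sketch of the training loop these pieces feed into; train_data
# and train_labels are hypothetical NumPy arrays of flattened images and
# one-hot labels.
sess.run(tf.global_variables_initializer())
for i in range(1000):
    sess.run(train_step, feed_dict={x: train_data, y_: train_labels})
    if i % 100 == 0:
        print(i, sess.run(accuracy, feed_dict={x: train_data, y_: train_labels}))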