# Computes softmax cross entropy between logits and labels
# Measures the probability error in discrete classification tasks
# For example, each font image is labeled with one and only one label: an image can be font SansSerif or Serif, but not both.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(logits=y + 1e-50, labels=y_))
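# global_step is not defined in this excerpt; a minimal sketch of how it is
# usually created before the decay schedule below (an assumption, not shown in the gist):
global_step = tf.Variable(0, trainable=False, name='global_step')
# Passing it to optimizer.minimize(..., global_step=global_step) increments it once
# per training step, which is what drives the decay.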
# Apply exponential decay to the learning rate:
# start at 0.05 and multiply by 0.85 every 1000 global steps (staircase=True)
learning_rate = tf.train.exponential_decay(0.05, global_step, 1000, 0.85, staircase=True)
# Create an interactive TensorFlow session
sess = tf.InteractiveSession()
# These will be inputs for the model
# 36x36-pixel images with one channel (grayscale)
x = tf.placeholder("float", [None, 36, 36])
# -1 in the reshape means "fill out this dimension as needed to keep the overall size"
# 36, 36 are the image dimensions
# the final 1 is the number of channels
# grayscale images have a single channel
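# The reshape the comments above describe is missing from this excerpt; a minimal
# sketch (the variable name x_im is an assumption):
x_im = tf.reshape(x, [-1, 36, 36, 1])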
# GradientDescentOptimizer is used to minimize loss
train_step = tf.train.GradientDescentOptimizer(0.01).minimize(cross_entropy)
import re
from nltk.corpus import stopwords

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;#]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z +_]')
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['rt', 'http']) # extend stopwords; rt means re-tweet
STOPWORDS = set(STOPWORDS)
def text_prepare(text):
    """
    text: a string
    """
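    # The function body is not shown in this excerpt; a minimal sketch using the
    # regexes and STOPWORDS defined above (an assumption, not the original gist body):
    text = text.lower()                           # lowercase the text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)     # replace listed symbols with spaces
    text = BAD_SYMBOLS_RE.sub('', text)           # delete remaining bad symbols
    text = ' '.join(w for w in text.split() if w not in STOPWORDS)  # drop stopwords
    return text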
# Dictionary of all words from the train corpus with their counts
from collections import Counter
words_counts = Counter(word for line in X_train for word in line.split(' '))
# Sort by count and keep the top 10 most common words
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:10]
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_features(X_train, X_val, X_test):
    """
    X_train, X_val, X_test - input text
    return a TF-IDF representation for each dataset
    """
    # filter out too rare words (appear in fewer than 5 titles) and too frequent words
    # (appear in more than 90% of the tweets); use unigrams and bigrams --> ngram_range=(1, 2)
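    # The body is not shown in this excerpt; a sketch following the comments above
    # (min_df=5, max_df=0.9 come from the comments; the token_pattern is an assumption):
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2),
                                       token_pattern=r'(\S+)')
    X_train = tfidf_vectorizer.fit_transform(X_train)  # fit vocabulary on train only
    X_val = tfidf_vectorizer.transform(X_val)          # reuse the fitted vocabulary
    X_test = tfidf_vectorizer.transform(X_test)
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_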
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

logreg = LogisticRegression()
%%time
logreg.fit(X_train_tfidf, y_train)
# Cross-validated accuracy on the training set
scores = cross_val_score(logreg, X_train_tfidf, y_train, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
%%time
from sklearn.svm import LinearSVC

svc = LinearSVC(dual=False)
svc.fit(X_train_tfidf, y_train)
scores = cross_val_score(svc, X_test_tfidf, y_test, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
from sklearn.multiclass import OneVsRestClassifier

def train_classifier(X_train, y_train):
    """
    X_train, y_train - training text and sentiment
    return: trained classifier
    """
    # Create and fit a LogisticRegression wrapped in OneVsRestClassifier
    model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
    model.fit(X_train, y_train)
    return model
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
def evaluation_scores(y_val, predicted):
    print("Accuracy={}".format(accuracy_score(y_val, predicted)))
    print("F1_macro={}".format(f1_score(y_val, predicted, average='macro')))