import spacy
import pandas as pd

# load a spaCy English model (model name assumed: en_core_web_sm)
nlp = spacy.load('en_core_web_sm')

def pos_tagging_spacy(sentence):
    sentence_nlp = nlp(sentence)
    # POS tagging with spaCy: fine-grained tag (tag_) and coarse tag (pos_) per token
    spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
    return pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])
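# Usage sketch (assumes the en_core_web_sm model has been downloaded via
# `python -m spacy download en_core_web_sm`; the sentence is only illustrative):
pos_tagging_spacy("US unveils world's most powerful supercomputer, beats China.")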
import nltk

def pos_tagging_nltk(sentence):
    # POS tagging with NLTK: tokenize the sentence, then tag each token
    nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    return pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

from spacy import displacy

def ner(sentence):
    sentence_nlp = nlp(sentence)
    # print named entities in article
    print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])
    # visualize named entities
    displacy.render(sentence_nlp, style='ent', jupyter=True)
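# Usage sketch (displacy.render with jupyter=True assumes a notebook session;
# the sentence is only illustrative):
ner('Unemployment in the US fell as Apple and Amazon kept hiring.')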
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            # token is part of an entity: keep extending the current entity name
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            # entity span ended: store it and reset the accumulators
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
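# Follow-up sketch (not in the original snippet): count the most frequent
# (entity, type) pairs gathered above with collections.Counter.
from collections import Counter

entity_counts = Counter(named_entities)
pd.DataFrame(entity_counts.most_common(10),
             columns=['Entity (name, type)', 'Frequency'])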
# initialize afinn sentiment analyzer
from afinn import Afinn
af = Afinn()
# compute sentiment scores (polarity) and labels
sentiment_scores = [af.score(article) for article in corpus]
sentiment_category = ['positive' if score > 0
                          else 'negative' if score < 0
                              else 'neutral'
                                  for score in sentiment_scores]
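# Bridging sketch: the plots below expect a frame `df` with the columns
# news_category, sentiment_score and sentiment_category; assuming news_df
# carries a 'news_category' column, it can be assembled like this.
df = pd.DataFrame([list(news_df['news_category']),
                   sentiment_scores, sentiment_category]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df['sentiment_score'].astype('float')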
import matplotlib.pyplot as plt
import seaborn as sns

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y='sentiment_score',
                   hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y='sentiment_score',
                 hue='news_category', data=df, palette='Set2', ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)

# factorplot was renamed to catplot in seaborn 0.9+
fc = sns.factorplot(x='news_category', hue='sentiment_category',
                    data=df, kind='count',
                    palette={'negative': '#FE2020',
                             'positive': '#BADD07',
                             'neutral': '#68BFF5'})
# indices of the most positive and most negative technology articles
# (the sentiment scores 6 and -15 are specific to this dataset)
pos_idx = df[(df.news_category == 'technology') & (df.sentiment_score == 6)].index[0]
neg_idx = df[(df.news_category == 'technology') & (df.sentiment_score == -15)].index[0]

print('Most Negative Tech News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx]['news_article'])
from textblob import TextBlob
# compute sentiment scores (polarity) and labels
sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3) for article in news_df['clean_text']]
sentiment_category_tb = ['positive' if score > 0
                             else 'negative' if score < 0
                                 else 'neutral'
                                     for score in sentiment_scores_tb]
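# Optional comparison sketch: cross-tabulate the AFINN labels against the
# TextBlob labels (assumes both label lists cover the same articles in the
# same order).
pd.crosstab(pd.Series(sentiment_category, name='afinn'),
            pd.Series(sentiment_category_tb, name='textblob'))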
from bs4 import BeautifulSoup

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = BeautifulSoup(doc, 'html.parser').get_text()  # one common approach
        # the remaining flags (contraction expansion, accent removal, lowercasing,
        # lemmatization, special-character / stopword / digit removal) would be
        # applied here in the same per-document fashion
        normalized_corpus.append(doc)
    return normalized_corpus
# NAT gateway with an Elastic IP
resource "aws_eip" "nat" {
  vpc = true
}

resource "aws_nat_gateway" "nat-gw" {
  allocation_id = "${aws_eip.nat.id}"
  subnet_id     = "${aws_subnet.main-public-1.id}"
  depends_on    = ["aws_internet_gateway.main-gw"]
}