import spacy
import pandas as pd

# load a spaCy English model (model name assumed: en_core_web_sm)
nlp = spacy.load('en_core_web_sm')

def pos_tagging_spacy(sentence):
    sentence_nlp = nlp(sentence)
    # POS tagging with spaCy: fine-grained tag (tag_) and coarse tag (pos_) per token
    spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
    return pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])
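# Usage sketch (assumes the en_core_web_sm model has been downloaded via
# `python -m spacy download en_core_web_sm`; the sentence is only illustrative):
pos_tagging_spacy("US unveils world's most powerful supercomputer, beats China.")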
import nltk

def pos_tagging_nltk(sentence):
    # POS tagging with NLTK: tokenize the sentence, then tag each token
    nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    return pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

from spacy import displacy

def ner(sentence):
    sentence_nlp = nlp(sentence)
    # print named entities in article
    print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])
    # visualize named entities
    displacy.render(sentence_nlp, style='ent', jupyter=True)
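# Usage sketch (displacy.render with jupyter=True assumes a notebook session;
# the sentence is only illustrative):
ner('Unemployment in the US fell as Apple and Amazon kept hiring.')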
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            # token is part of an entity: keep extending the current entity name
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            # entity span ended: store it and reset the accumulators
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
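# Follow-up sketch (not in the original snippet): count the most frequent
# (entity, type) pairs gathered above with collections.Counter.
from collections import Counter

entity_counts = Counter(named_entities)
pd.DataFrame(entity_counts.most_common(10),
             columns=['Entity (name, type)', 'Frequency'])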
# initialize afinn sentiment analyzer
from afinn import Afinn
af = Afinn()
# compute sentiment scores (polarity) and labels
sentiment_scores = [af.score(article) for article in corpus]
sentiment_category = ['positive' if score > 0
                          else 'negative' if score < 0
                              else 'neutral'
                                  for score in sentiment_scores]
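# Bridging sketch: the plots below expect a frame `df` with the columns
# news_category, sentiment_score and sentiment_category; assuming news_df
# carries a 'news_category' column, it can be assembled like this.
df = pd.DataFrame([list(news_df['news_category']),
                   sentiment_scores, sentiment_category]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df['sentiment_score'].astype('float')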
import matplotlib.pyplot as plt
import seaborn as sns

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y='sentiment_score',
                   hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y='sentiment_score',
                 hue='news_category', data=df, palette='Set2', ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)

# factorplot was renamed to catplot in seaborn 0.9+
fc = sns.factorplot(x='news_category', hue='sentiment_category',
                    data=df, kind='count',
                    palette={'negative': '#FE2020',
                             'positive': '#BADD07',
                             'neutral': '#68BFF5'})
# indices of the most positive and most negative technology articles
# (the sentiment scores 6 and -15 are specific to this dataset)
pos_idx = df[(df.news_category == 'technology') & (df.sentiment_score == 6)].index[0]
neg_idx = df[(df.news_category == 'technology') & (df.sentiment_score == -15)].index[0]

print('Most Negative Tech News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx]['news_article'])
from textblob import TextBlob
# compute sentiment scores (polarity) and labels
sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3) for article in news_df['clean_text']]
sentiment_category_tb = ['positive' if score > 0
                             else 'negative' if score < 0
                                 else 'neutral'
                                     for score in sentiment_scores_tb]
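# Optional comparison sketch: cross-tabulate the AFINN labels against the
# TextBlob labels (assumes both label lists cover the same articles in the
# same order).
pd.crosstab(pd.Series(sentiment_category, name='afinn'),
            pd.Series(sentiment_category_tb, name='textblob'))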
from bs4 import BeautifulSoup

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = BeautifulSoup(doc, 'html.parser').get_text()  # one common approach
        # the remaining flags (contraction expansion, accent removal, lowercasing,
        # lemmatization, special-character / stopword / digit removal) would be
        # applied here in the same per-document fashion
        normalized_corpus.append(doc)
    return normalized_corpus
# NAT gateway with an Elastic IP
resource "aws_eip" "nat" {
  vpc = true
}

resource "aws_nat_gateway" "nat-gw" {
  allocation_id = "${aws_eip.nat.id}"
  subnet_id     = "${aws_subnet.main-public-1.id}"
  depends_on    = ["aws_internet_gateway.main-gw"]
}