# look up the most positive and most negative technology articles
# (6 and -15 are the extreme AFINN scores observed for this category)
pos_idx = df[(df.news_category == 'technology') & (df.sentiment_score == 6)].index[0]
neg_idx = df[(df.news_category == 'technology') & (df.sentiment_score == -15)].index[0]

print('Most Negative Tech News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx]['news_article'])
# count of articles per sentiment label in each news category
# (factorplot was renamed catplot in newer seaborn releases)
fc = sns.catplot(x="news_category", hue="sentiment_category",
                 data=df, kind="count",
                 palette={"negative": "#FE2020",
                          "positive": "#BADD07",
                          "neutral": "#68BFF5"})
# strip plot and box plot of sentiment score distributions per category
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y="sentiment_score",
                   hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y="sentiment_score",
                 hue='news_category', data=df, palette="Set2", ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)
# initialize afinn sentiment analyzer
from afinn import Afinn

af = Afinn()

# compute sentiment scores (polarity) and labels
sentiment_scores = [af.score(article) for article in corpus]
sentiment_category = ['positive' if score > 0
                      else 'negative' if score < 0
                      else 'neutral'
                      for score in sentiment_scores]
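
# A minimal sketch, assuming news_df carries a 'news_category' column aligned
# row-for-row with corpus: assemble the frame `df` used by the lookup and plotting code above.
import pandas as pd

df = pd.DataFrame({'news_category': news_df['news_category'].values,
                   'sentiment_score': sentiment_scores,
                   'sentiment_category': sentiment_category})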
# collect (entity, tag) pairs across the corpus by merging consecutive entity tokens
# nlp is assumed to be a loaded spaCy pipeline, e.g. nlp = spacy.load('en_core_web_sm')
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            # token outside an entity: flush any pending (entity, tag) pair
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
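
# A short sketch of one way to inspect the collected entities with pandas
# (the column names here are illustrative, not from the original snippet)
entity_frame = pd.DataFrame(named_entities, columns=['Entity Name', 'Entity Type'])
top_entities = (entity_frame.groupby(['Entity Name', 'Entity Type'])
                            .size()
                            .sort_values(ascending=False)
                            .head(15))
print(top_entities)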
from spacy import displacy

def ner(sentence):
    sentence_nlp = nlp(sentence)
    # print named entities in article
    print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])
    # visualize named entities
    displacy.render(sentence_nlp, style='ent', jupyter=True)
def pos_tagging_spacy(sentence):
    sentence_nlp = nlp(sentence)
    # POS tagging with spaCy
    spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
    return pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])
def pos_tagging_nltk(sentence):
    # POS tagging with nltk (needs the punkt and averaged_perceptron_tagger resources)
    nltk_pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    return pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])
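
# Quick usage sketch for the two taggers above; the sample sentence is purely illustrative
sample_sentence = "US unveils world's most powerful supercomputer, beats China."
print(pos_tagging_nltk(sample_sentence))
print(pos_tagging_spacy(sample_sentence))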
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML; the remaining flags (contractions, accents, special chars, digits)
        # follow the same pattern with their own helpers, assumed defined elsewhere
        if html_stripping:
            doc = strip_html_tags(doc)
        if text_lower_case:
            doc = doc.lower()
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus
def remove_stopwords(text, is_lower_case=False):
    # tokenizer and stopword_list are assumed from setup, e.g.
    # tokenizer = ToktokTokenizer() and stopword_list = nltk.corpus.stopwords.words('english')
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
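
# A small usage sketch of the normalization pipeline above; the setup names, model and
# sample corpus are assumptions, not from the original snippet
import nltk
import spacy
from nltk.tokenize.toktok import ToktokTokenizer

nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

sample_corpus = ["The <b>markets</b> were quite volatile today, weren't they?",
                 "US unveils world's most powerful supercomputer."]
print(normalize_corpus(sample_corpus, html_stripping=False))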