Skip to content

Instantly share code, notes, and snippets.

@manmohan24nov
Last active September 5, 2020 20:22
Show Gist options
  • Save manmohan24nov/4d57b1a3c0710d17c28edbcaeda4de40 to your computer and use it in GitHub Desktop.
Save manmohan24nov/4d57b1a3c0710d17c28edbcaeda4de40 to your computer and use it in GitHub Desktop.
import random
from collections import defaultdict

import gensim
import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import remove_stopwords
from wordcloud import WordCloud

# NOTE(review): the plotting code below also uses `plt` (presumably
# matplotlib.pyplot) and `sns` (presumably seaborn), which are not imported
# here -- confirm they are provided by an earlier cell/module.
# Build the stop-word list: gensim's built-in stop words plus manually
# chosen extras. Append to `stopwords_update` to filter out more words.
gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
stopwords_list = [*set(gensim_stopwords)]
stopwords_update = [
    "mln", "vs", "cts", "said", "billion", "pct", "dlrs", "dlr",
]
stopwords = [*stopwords_list, *stopwords_update]

# Split every cleaned article into a list of whitespace-separated tokens.
articles_word_limit['temp_list'] = articles_word_limit['text_clean'].apply(
    lambda text: str(text).split()
)
# Remove stopwords from the articles
def remove_stopword(x):
    """Return the tokens of ``x`` that are not in the module-level ``stopwords``.

    Args:
        x: Iterable of word tokens.

    Returns:
        A new list holding the remaining tokens in their original order.
    """
    return [word for word in x if word not in stopwords]
# Filter the stop words out of every tokenized article.
articles_word_limit['temp_list_stopw'] = (
    articles_word_limit['temp_list'].apply(remove_stopword)
)
# generate n-gram words
def generate_ngrams(text, n_gram=1):
    """Return every run of ``n_gram`` consecutive tokens in ``text`` as a string.

    Args:
        text: Sequence of string tokens (must support slicing).
        n_gram: Number of consecutive tokens per n-gram.

    Returns:
        List of n-grams in order of appearance, each joined with single
        spaces. Empty when ``text`` holds fewer than ``n_gram`` tokens.
    """
    # Zipping the sequence against copies shifted by 0..n_gram-1 positions
    # yields one tuple per window; zip stops at the shortest copy.
    shifted = [text[i:] for i in range(n_gram)]
    return [' '.join(ngram) for ngram in zip(*shifted)]
# Count how often each unigram occurs across all stop-word-filtered articles.
article_unigrams = defaultdict(int)
for article_tokens in articles_word_limit['temp_list_stopw']:
    for word in generate_ngrams(article_tokens):
        article_unigrams[word] += 1

# Two-column frame (column 0: unigram, column 1: count), most frequent first.
# Sorting ascending and then reversing (rather than reverse=True) keeps the
# existing ordering among unigrams with equal counts.
article_unigrams_df = pd.DataFrame(
    sorted(article_unigrams.items(), key=lambda item: item[1])[::-1]
)
N = 50
# Horizontal bar chart of the N most frequent unigrams in the Reuters articles.
fig, axes = plt.subplots(figsize=(18, 50))
sns.barplot(
    y=article_unigrams_df[0].values[:N],
    x=article_unigrams_df[1].values[:N],
    color='red',
    ax=axes,  # draw on the axes created above instead of the implicit current one
)
axes.spines['right'].set_visible(False)
axes.set_xlabel('')
axes.set_ylabel('')
axes.tick_params(axis='x', labelsize=13)
axes.tick_params(axis='y', labelsize=13)
axes.set_title(f'Top {N} most common unigrams in Reuters Articles', fontsize=15)
# Fit the layout only once bars, tick labels and title exist; tight_layout()
# is a one-shot adjustment and cannot account for artists added afterwards.
plt.tight_layout()
plt.show()
# Plot word cloud
def col_func(word, font_size, position, orientation, font_path, random_state):
    """Pick a random colour from a fixed palette for one word-cloud word.

    Intended for use as a ``color_func`` callback; only ``random_state``
    influences the result.

    Args:
        word: Unused.
        font_size: Unused.
        position: Unused.
        orientation: Unused.
        font_path: Unused.
        random_state: Object with a ``choice`` method (e.g. ``random.Random``)
            used to draw the colour, or ``None`` to use the global ``random``
            module.

    Returns:
        A hex colour string from the palette.
    """
    colors = ['#b58900', '#cb4b16', '#dc322f', '#d33682', '#6c71c4',
              '#268bd2', '#2aa198', '#859900']
    # Draw from the caller's generator so that a seeded run reproduces the
    # same colours; the global module is only a fallback.
    rng = random if random_state is None else random_state
    return rng.choice(colors)
# Title styling handed to ``set_title`` through ``fontdict``.
fd = {
    'fontsize': '32',
    'fontweight': 'normal',
    'verticalalignment': 'baseline',
    'horizontalalignment': 'center',
}

# Render the unigram counts as a word cloud of at most 200 words, coloured
# by ``col_func``; the seed is drawn anew on every run.
wc = WordCloud(
    width=2000,
    height=1000,
    collocations=False,
    background_color="white",
    color_func=col_func,
    max_words=200,
    random_state=np.random.randint(1, 8),
).generate_from_frequencies(article_unigrams)

fig, ax = plt.subplots(figsize=(20, 10))
ax.imshow(wc, interpolation='bilinear')
ax.axis("off")
ax.set_title('Unigram Words of Reuters Articles', pad=24, fontdict=fd)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment