Skip to content

Instantly share code, notes, and snippets.

@fclesio
Created July 3, 2019 10:39
Show Gist options
  • Save fclesio/8c5c98a120f5ebc50656cb0af6d40287 to your computer and use it in GitHub Desktop.
Save fclesio/8c5c98a120f5ebc50656cb0af6d40287 to your computer and use it in GitHub Desktop.
# Data exploration for a specific class (artist) to see its most frequent words
def get_word_frequency(artist, *, top_n=30, skip_top=1):
    """Plot, show and save the most frequent cleaned words in one artist's lyrics.

    Rows of ``df_raw_lyrics`` whose ``artist`` column equals *artist* are
    selected, their ``lyric`` text is cleaned (see ``cleanup_text`` below),
    and the resulting words are counted. A bar chart of the top words is
    displayed and then written to ``word_frequency_{artist}.png`` in the
    current working directory.

    Relies on names defined elsewhere in the module: ``df_raw_lyrics``,
    ``nlp``, ``stoplist``, ``punctuations``, ``pd``, ``Counter``, ``plt``
    and ``sns``.

    Args:
        artist: Value compared against ``df_raw_lyrics['artist']``; also
            interpolated into the plot title and the output file name.
        top_n: How many words to plot. The default of 30 reproduces the
            previously hard-coded amount.
        skip_top: How many of the highest-ranked words to discard before
            plotting. The default of 1 reproduces the previous behaviour of
            dropping the single most frequent token.

    Returns:
        None. If no words remain to plot, a message is printed and nothing
        is drawn or saved (previously this case failed with an
        ``IndexError`` from ``list.pop``).
    """

    def cleanup_text(docs, verbose=False):
        """Return a ``pd.Series`` with one cleaned, space-joined string per document.

        Each document is run through ``nlp`` (parser and NER disabled), its
        token lemmas are lower-cased and stripped, and lemmas found in
        ``stoplist`` or ``punctuations`` are removed.
        """
        texts = []
        for position, doc in enumerate(docs, start=1):
            if verbose and position % 1000 == 0:
                print(f"Processed {position} out of {len(docs)} documents.")
            parsed = nlp(doc, disable=['parser', 'ner'])
            # '-PRON-' is presumably the spaCy 2.x pronoun placeholder lemma
            # -- TODO confirm against the spaCy version in use.
            lemmas = [
                tok.lemma_.lower().strip()
                for tok in parsed
                if tok.lemma_ != '-PRON-'
            ]
            # Membership tests are kept exactly as written: if `punctuations`
            # is a plain string, `in` is a substring check, and switching to
            # a set would change which tokens are dropped.
            kept = [
                lemma
                for lemma in lemmas
                if lemma not in stoplist and lemma not in punctuations
            ]
            texts.append(' '.join(kept))
        return pd.Series(texts)

    lyrics = list(df_raw_lyrics[df_raw_lyrics['artist'] == artist]['lyric'])
    words = ' '.join(cleanup_text(lyrics)).split()

    # Rank once, then drop the leading `skip_top` entries by slicing so that
    # an empty ranking cannot raise.
    ranked = Counter(words).most_common(top_n + skip_top)[skip_top:]
    if not ranked:
        print(f'No words left to plot for {artist}.')
        return

    common_words = [word for word, _ in ranked]
    common_counts = [count for _, count in ranked]

    fig = plt.figure(figsize=(18, 6))
    sns.barplot(x=common_words, y=common_counts)
    plt.title(f'Most Common Words used by {artist}')
    plt.xticks(rotation=45)
    plt.show()
    # Saved through the retained Figure handle, not pyplot's "current figure".
    fig.savefig(f'word_frequency_{artist}.png', format='png', dpi=500)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment