A "Best of the Best Practices" (BOBP) guide to developing in Python.
- "Build tools for others that you want to be built for you." - Kenneth Reitz
- "Simplicity is always better than functionality." - Pieter Hintjens
import matplotlib.pyplot as plt | |
import nltk | |
import numpy as np | |
import os | |
import pandas as pd | |
import pyLDAvis | |
import pyLDAvis.sklearn | |
import random | |
import re | |
import seaborn as sns |
def get_language(text):
    """Return the language code that TextBlob detects for ``text``.

    The argument is coerced with ``str()`` before analysis, so non-string
    values are examined through their string form.
    """
    # NOTE(review): TextBlob deprecated detect_language() in 0.16 and dropped
    # it in later releases -- confirm the installed version still has it.
    blob = TextBlob(str(text))
    return blob.detect_language()
# Tag each lyric with the language reported by get_language
df_raw_lyrics['lang'] = df_raw_lyrics['lyric'].apply(get_language)

# Song counts for every (artist, language) pair
(
    df_raw_lyrics
    .groupby(['artist', 'lang'])
    .size()
    .reset_index()
)
# Keep only the songs whose detected language is English. The explicit
# .copy() makes the filtered result an independent frame, so the column
# assignments made on it afterwards do not trigger pandas'
# SettingWithCopyWarning (assignment on a slice of another frame).
df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

# Number of lyrics per (artist, album) pair
df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
# Average songs per album
# One row per (artist, album) pair together with its number of tracks.
df_albuns = df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
df_albuns.columns = ['artist', 'album', 'qty_tracks']

# Per artist: how many albums there are ('size') and the mean number of
# tracks per album ('mean'). The string aliases replace np.size / np.mean
# because recent pandas releases emit a FutureWarning when NumPy callables
# are handed to agg(); the resulting column labels are the same.
df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()
# Cast the lyrics to str so the string operations below cannot fail on
# non-string cells
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)

# Remove all stopwords. The stop list is converted to a frozenset once, up
# front, so every per-token membership test is a hash lookup regardless of
# which container type `stoplist` is.
# NOTE(review): assumes `stoplist` is a collection of word strings -- confirm.
_stopwords = frozenset(stoplist)
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(
    lambda lyric: ' '.join(
        token for token in lyric.lower().split() if token not in _stopwords
    )
)

# Quick check
df_raw_lyrics.head(5)
# Data exploration in some specific class to see the most frequent words
# NOTE(review): this definition looks incomplete in this view -- indentation
# has been lost, the lines carry trailing `|` extraction residue, and the
# visible body stops inside the `for` loop. Restore it from the original
# source before relying on it.
def get_word_frequency(artist): | |
# Word Frequency per Category
# cleanup_text is presumably a helper nested inside get_word_frequency (TODO
# confirm nesting). It receives a sized collection of documents (`len(docs)`
# is taken below) and a `logging` flag that enables progress output.
def cleanup_text(docs, logging=False): | |
texts = [] | |
counter = 1 | |
for doc in docs: | |
# Progress message whenever `counter` is a multiple of 1000 and `logging` is
# truthy. No increment of `counter` or append to `texts` is visible here.
if counter % 1000 == 0 and logging: | |
print("Processed %d out of %d documents." % (counter, len(docs))) |
# Most common words in the Angra lyrics
get_word_frequency(artist='angra')

# Most common words in the Sepultura lyrics
get_word_frequency(artist='sepultura')
# Word cloud with most common words
def show_wordcloud(text, artist):
    """Draw a word cloud of ``text`` on a new matplotlib figure.

    ``text`` is the string handed to ``WordCloud.generate``; ``artist`` is
    only interpolated into the figure title. The module-level ``stoplist``
    is passed to ``WordCloud`` as its stop words.
    """
    # Build the generator first, then produce the word cloud image from text
    cloud_builder = WordCloud(stopwords=stoplist, background_color="white")
    wordcloud = cloud_builder.generate(text)

    # Render the image on a figure with figsize (25, 10), titled, axes hidden
    fig = plt.figure(figsize=(25, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {artist}', fontsize=20)
    plt.axis("off")