Skip to content

Instantly share code, notes, and snippets.

@fclesio
fclesio / bobp-python.md
Created May 8, 2019 15:25 — forked from sloria/bobp-python.md
A "Best of the Best Practices" (BOBP) guide to developing in Python.

The Best of the Best Practices (BOBP) Guide for Python

A "Best of the Best Practices" (BOBP) guide to developing in Python.

In General

Values

  • "Build tools for others that you want to be built for you." - Kenneth Reitz
  • "Simplicity is always better than functionality." - Pieter Hintjens
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import random
import re
import seaborn as sns
def get_language(text):
    """Return the language TextBlob detects for ``text``.

    ``text`` is first coerced with ``str()``, so non-string values are
    accepted. The result is whatever ``TextBlob.detect_language()`` returns;
    callers in this file compare it against ``'en'``.

    NOTE(review): ``detect_language`` was deprecated in TextBlob 0.16 and is
    absent from later releases -- confirm the TextBlob version in use.
    """
    return TextBlob(str(text)).detect_language()
# Detect the language of every lyric and store it in a new 'lang' column
df_raw_lyrics['lang'] = df_raw_lyrics['lyric'].apply(get_language)

# Show stats about the language per artist
df_raw_lyrics.groupby(['artist', 'lang']).size().reset_index()

# Filtering out non-EN songs. The explicit .copy() makes the filtered frame
# independent of the original one, so the column assignments performed on it
# later do not trigger pandas' SettingWithCopyWarning.
df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

# Lyrics per album
df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
# Average songs per album: first count the tracks of every (artist, album)
# pair, then summarise those counts per artist.
df_albuns = df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
df_albuns.columns = ['artist', 'album', 'qty_tracks']

# Per artist: number of albums ('size') and mean number of tracks ('mean').
# The aggregations are given by name instead of as NumPy callables, because
# passing np.size / np.mean to GroupBy.agg is deprecated in recent pandas;
# the resulting values and column labels are the same.
df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()
# Cast every lyric to str so the string operations below work on all rows
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)


def _drop_stopwords(lyric):
    """Lower-case ``lyric`` and drop every whitespace-separated token in ``stoplist``."""
    kept_tokens = (token for token in lyric.lower().split() if token not in stoplist)
    return ' '.join(kept_tokens)


# Remove all stopwords
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(_drop_stopwords)

# Quick check
df_raw_lyrics.head(5)
# Data exploration in some specific class (here: one artist) to see the most frequent words
# NOTE(review): the indentation of this definition was lost and its body looks
# truncated in this view -- only the beginning of the helper `cleanup_text`
# (presumably nested inside `get_word_frequency`) is visible. Restore the full
# body before relying on it.
def get_word_frequency(artist):
# Word Frequency per Category
# Helper that takes `docs` and an optional `logging` flag (default False).
def cleanup_text(docs, logging=False):
# Accumulator list; nothing is appended to it in the lines visible here.
texts = []
# Progress counter starting at 1; no increment is visible in this fragment.
counter = 1
for doc in docs:
# When `logging` is truthy, report progress whenever `counter` is a multiple of 1000.
if counter % 1000 == 0 and logging:
print("Processed %d out of %d documents." % (counter, len(docs)))
# Most common words for the artist 'angra'.
# The call is a bare expression statement: any return value is not stored.
get_word_frequency('angra')
# Most common words for the artist 'sepultura' (same call, different artist).
get_word_frequency('sepultura')
# Word cloud with most common words
# Builds a word cloud from `text` and draws it with matplotlib, using `artist`
# in the figure title.
# NOTE(review): indentation was lost in this view and the definition may
# continue past the last visible line (no plt.show()/savefig is visible here)
# -- confirm against the full file.
def show_wordcloud(text, artist):
# Create and generate a word cloud image:
# `stoplist` (defined elsewhere) is passed as the stopwords to exclude.
wordcloud = WordCloud(stopwords=stoplist, background_color="white").generate(text)
# Display the generated image:
# NOTE(review): `fig` is not used again in the visible lines.
fig = plt.figure(figsize=(25,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title(f'Word Cloud for {artist}', fontsize=20)
# Hide the axes around the image.
plt.axis("off")