Flavio Clesio fclesio

## bobp-python.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                fclesio
                / bobp-python.md
            
            
              Created
              May 8, 2019 15:25
                — forked from sloria/bobp-python.md
            
              
                A "Best of the Best Practices" (BOBP) guide to developing in Python.
              
          
    The Best of the Best Practices (BOBP) Guide for Python

A "Best of the Best Practices" (BOBP) guide to developing in Python.
In General

Values


"Build tools for others that you want to be built for you." - Kenneth Reitz
"Simplicity is always better than functionality." - Pieter Hintjens


## imports-data-preprocessing.py
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import random
import re
import seaborn as sns

## text-blob.py
def get_language(text):
    """Detect the language of ``text`` with TextBlob.

    The argument is coerced with ``str()`` first, so non-string values are
    accepted. Returns whatever ``TextBlob.detect_language()`` returns.

    NOTE(review): ``detect_language`` appears to be deprecated in newer
    TextBlob releases -- confirm the installed version still provides it.
    """
    return TextBlob(str(text)).detect_language()

# Include language in the DF: add a 'lang' column by running get_language on
# every value of the 'lyric' column (one call per row).
df_raw_lyrics['lang'] = df_raw_lyrics['lyric'].apply(get_language)

# Show stats about the language per artist: row count per (artist, lang) pair,
# with the group keys turned back into columns by reset_index(). The result is
# not assigned, so it is only useful as displayed output (e.g. the last
# expression of a notebook cell).
df_raw_lyrics.groupby(['artist', 'lang']).size().reset_index()

## filtering-en-lyrics-per-album.py
# Filtering out non-EN songs: keep only the rows whose 'lang' value is 'en'.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so the later column assignments on df_raw_lyrics do not raise
# pandas' SettingWithCopyWarning.
df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

# Lyrics per album: row count per (artist, album) pair. Not assigned, so it
# is only useful as displayed output.
df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()

## average-songs.py
# Average songs per album
# Step 1: count the tracks of every (artist, album) pair. reset_index(name=...)
# names the count column directly instead of renaming all columns afterwards.
df_albuns = (
    df_raw_lyrics
    .groupby(['artist', 'album'])
    .size()
    .reset_index(name='qty_tracks')
)

# Step 2: per artist, the number of albums ('size') and the mean number of
# tracks per album ('mean'). The string aliases select pandas' own group-by
# reducers and yield the same column labels as np.size / np.mean did, without
# depending on how pandas treats NumPy callables passed to agg().
df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()

## convert-stopwords-removal.py
# Cast every lyric to str first so the string methods below cannot fail on
# non-string values
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)

# Remove all stopwords: lower-case each lyric, split it on whitespace, drop
# the tokens found in stoplist and re-join the rest with single spaces
df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(
    lambda lyric: ' '.join(
        token for token in lyric.lower().split() if token not in stoplist
    )
)

# Quick check of the first five rows
df_raw_lyrics.head(5)

## word-frequency.py
# Data exploration in some specific class to see the most frequent words
def get_word_frequency(artist):
    # NOTE(review): only the beginning of this function is visible here. The
    # body stops inside cleanup_text's loop, `artist` is never read and
    # nothing is returned -- confirm against the complete source.

    # Word Frequency per Category
    def cleanup_text(docs, logging=False):
        # Iterates over `docs`; with logging=True a progress line is printed
        # whenever `counter` is a multiple of 1000.
        # `texts` is never filled and `counter` is never incremented in the
        # visible part, so the progress message cannot fire as written.
        texts = []
        counter = 1
        for doc in docs:
            if counter % 1000 == 0 and logging:
                print("Processed %d out of %d documents." % (counter, len(docs)))

## most-common-angra.py
# Most Common words: Angra
# Calls get_word_frequency with 'angra' as the `artist` argument.
get_word_frequency('angra')

## most-common-sepultura.py
# Most Common words: Sepultura
# Calls get_word_frequency with 'sepultura' as the `artist` argument.
get_word_frequency('sepultura')

## word-cloud-most-common.py
# Word cloud with most common words
def show_wordcloud(text, artist):
    """Draw a word cloud of ``text`` titled with the artist's name.

    Args:
        text: String handed to ``WordCloud.generate``.
        artist: Name interpolated into the figure title.

    The image is drawn on a new pyplot figure. This function neither calls
    ``plt.show()`` nor returns anything, so rendering is left to the caller
    (or to the notebook's inline backend).
    """
    # Create and generate a word cloud image:
    wordcloud = WordCloud(stopwords=stoplist, background_color="white").generate(text)

    # Display the generated image. The Figure handle is not kept because
    # nothing below uses it.
    plt.figure(figsize=(25, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {artist}', fontsize=20)
    plt.axis("off")
	import matplotlib.pyplot as plt
	import nltk
	import numpy as np
	import os
	import pandas as pd
	import pyLDAvis
	import pyLDAvis.sklearn
	import random
	import re
	import seaborn as sns
	def get_language(text):
		"""Detect the language of ``text`` with TextBlob.

		The argument is coerced with ``str()`` first, so non-string values
		are accepted. Returns whatever ``TextBlob.detect_language()`` returns.

		NOTE(review): ``detect_language`` appears to be deprecated in newer
		TextBlob releases -- confirm the installed version still provides it.
		"""
		text = str(text)
		b = TextBlob(text)
		return b.detect_language()

	# Include language in the DF: run get_language on every 'lyric' value
	df_raw_lyrics['lang'] = df_raw_lyrics['lyric'].apply(get_language)

	# Show stats about the language per artist (display-only expression)
	df_raw_lyrics.groupby(['artist', 'lang']).size().reset_index()

	# Filtering out non-EN songs. .copy() makes the filtered frame independent
	# of the frame it was sliced from, so the column assignments further down
	# do not raise pandas' SettingWithCopyWarning.
	df_raw_lyrics = df_raw_lyrics[df_raw_lyrics['lang'] == 'en'].copy()

	# Lyrics per album (display-only expression)
	df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()

	# Average songs per album: first the track count of every (artist, album)
	# pair, then per artist the number of albums and the mean track count.
	df_albuns = df_raw_lyrics.groupby(['artist', 'album']).size().reset_index()
	df_albuns.columns = ['artist', 'album', 'qty_tracks']
	# String aliases select pandas' own reducers and keep the 'size' / 'mean'
	# column labels that np.size / np.mean produced.
	df_albuns.groupby(['artist']).agg({'qty_tracks': ['size', 'mean']}).reset_index()

	# Convert the lyrics to string so the string methods below cannot fail
	df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].astype(str)

	# Remove all stopwords: lower-case, split on whitespace, drop the tokens
	# found in stoplist and re-join the rest with single spaces
	df_raw_lyrics['lyric'] = df_raw_lyrics['lyric'].apply(
		lambda x: ' '.join(item for item in x.lower().split() if item not in stoplist)
	)

	# Quick check
	df_raw_lyrics.head(5)
	# Data exploration in some specific class to see the most frequent words
	def get_word_frequency(artist):
		# NOTE(review): only the beginning of this function is visible here.
		# The body stops inside cleanup_text's loop, `artist` is never read
		# and nothing is returned -- confirm against the complete source.

		# Word Frequency per Category
		def cleanup_text(docs, logging=False):
			# Iterates over `docs`; with logging=True a progress line is
			# printed whenever `counter` is a multiple of 1000.
			# `texts` is never filled and `counter` is never incremented in
			# the visible part, so the message cannot fire as written.
			texts = []
			counter = 1
			for doc in docs:
				if counter % 1000 == 0 and logging:
					print("Processed %d out of %d documents." % (counter, len(docs)))
	# Most Common words: Sepultura
	# Calls get_word_frequency with 'sepultura' as the `artist` argument.
	get_word_frequency('sepultura')
	# Word cloud with most common words
	def show_wordcloud(text, artist):
		"""Draw a word cloud of ``text`` titled with the artist's name.

		Args:
			text: String handed to ``WordCloud.generate``.
			artist: Name interpolated into the figure title.

		The image is drawn on a new pyplot figure. This function neither
		calls ``plt.show()`` nor returns anything, so rendering is left to
		the caller (or to the notebook's inline backend).
		"""
		# Create and generate a word cloud image:
		wordcloud = WordCloud(stopwords=stoplist, background_color="white").generate(text)

		# Display the generated image. The Figure handle is not kept because
		# nothing below uses it.
		plt.figure(figsize=(25, 10))
		plt.imshow(wordcloud, interpolation='bilinear')
		plt.title(f'Word Cloud for {artist}', fontsize=20)
		plt.axis("off")