Skip to content

Instantly share code, notes, and snippets.

@fclesio
Created July 3, 2019 10:30
Show Gist options
  • Save fclesio/eb5fedf5dd00b43fcab4ea955da7744c to your computer and use it in GitHub Desktop.
Save fclesio/eb5fedf5dd00b43fcab4ea955da7744c to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
import random
import re
import seaborn as sns
import spacy
import string
from collections import Counter
from PIL import Image
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import English
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Render matplotlib figures inline when running under IPython/Jupyter.
# The `%matplotlib inline` magic is not valid Python syntax, so the
# equivalent API call is used instead; outside IPython `get_ipython`
# is not defined and the step is skipped.
try:
    get_ipython().run_line_magic("matplotlib", "inline")  # noqa: F821
except NameError:
    pass
# Lock random seeds used by libraries
random.seed(42)
np.random.seed(42)
# Define default stopwords list
# NOTE(review): ENGLISH_STOP_WORDS is imported from
# sklearn.feature_extraction.stop_words, a module path that newer
# scikit-learn releases no longer provide; the same constant is exposed by
# sklearn.feature_extraction.text -- confirm against the installed version.
stoplist = ENGLISH_STOP_WORDS
# Resources for cleaning up text (personal pronouns, stopwords and
# punctuation): the small English spaCy pipeline and the ASCII
# punctuation characters
nlp = spacy.load("en_core_web_sm")
punctuations = string.punctuation
# Datasets
# Note that '__file__' is a string literal here, not the module variable:
# realpath() resolves it against the current working directory, so
# `filedir` ends up being that directory.
filedir = os.path.split(os.path.realpath('__file__'))[0]
filename = os.path.join(filedir, 'data/rebirth-remains.csv')
# Load the file and give the columns fixed names
df_raw_lyrics = pd.read_csv(filename, index_col=False)
df_raw_lyrics.columns = ['index', 'artist', 'album', 'lyric']
# One limitation of the wrapper used to get the data is that it returns
# a lot of bad records: drop rows with a missing lyric, then rows whose
# lyric still contains leftover HTML/CSS fragments
df_raw_lyrics = (
    df_raw_lyrics
    .loc[lambda frame: frame['lyric'].notna()]
    .loc[lambda frame: ~frame['lyric'].str.contains("<span style=")]
    .loc[lambda frame: ~frame['lyric'].str.contains("padding")]
    .loc[lambda frame: ~frame['lyric'].str.contains("<img")]
)
# Basic counters
print(f'Qty rows: {len(df_raw_lyrics.index)}, Qty columns: {len(df_raw_lyrics.columns)}')
# First look at the data (shown as the cell output in a notebook)
df_raw_lyrics.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment