import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
# Show all columns and do not truncate in a DF
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)  # -1 was deprecated and now raises in current pandas
# set random seed here
seed = 0
np.random.seed(seed)
# Function to find n grams given a list of text
def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))
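# Illustrative example (not part of the pipeline): bigrams from a token list
# find_ngrams(['we', 'will', 'rock'], 2) -> [('we', 'will'), ('will', 'rock')]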
# Function to round an integer formatted year to the decade
def round_down(num):
    try:
        return num - (num % 10)
    except TypeError:  # pass non-numeric values through unchanged
        return num
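# Illustrative example: round_down(1987) -> 1980; round_down('unknown') -> 'unknown'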
# Function that takes a stopwords list, min frequency, and a list of sentences and returns a counter object
def word_freq(stopwords, min_df, data):
    vectorizer = CountVectorizer(min_df=min_df, stop_words=stopwords)
    fitted = vectorizer.fit_transform(data)
    vocab = list(vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
    counts = fitted.sum(axis=0).A1
    return Counter(dict(zip(vocab, counts)))
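# Illustrative example: word_freq(stopwords=['the'], min_df=1, data=['the cat sat on the cat'])
# -> Counter({'cat': 2, 'on': 1, 'sat': 1})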
# initialize regex pattern
alpha_regex = re.compile(r"[^a-zA-Z\s']")  # raw strings avoid invalid-escape warnings
apost_regex = re.compile(r"'")
space_regex = re.compile(r' {2,}')
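# Illustrative example of the cleaning chain applied below:
# space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', "don't stop! 99")))
# -> 'dont stop ' (punctuation and digits become spaces, apostrophes vanish, space runs collapse)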
# set stopwords list
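# NOTE: requires the NLTK stopwords corpus; run nltk.download('stopwords') once if it's missing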
base_stop = [apost_regex.sub('', word) for word in nltk.corpus.stopwords.words('english')]
extra_stop = ['']
all_stop = set(base_stop + extra_stop)
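# Stripping apostrophes keeps the stopword list aligned with the cleaned lyrics,
# e.g. "don't" becomes "dont" in both, so it still matches and gets filtered.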
# cut words that are this many characters or less
minwordlen = 2
# Location of datafile
datafile = 'C:\\Git\\IST652\\Project\\data\\combined.csv'
# Read in CSV
df = pd.read_csv(datafile)
# drop the mongodb id column and the unused 'combined' column
df.drop(axis=1, columns=['Column1', 'combined'], inplace=True)
# get decade of song
df['decade'] = round_down(df['year'])
# remove new line characters (\r \n)
df.replace(r'(\r|\n|\r\n)', ' ', regex=True, inplace=True)
# make everything lowercase
for col in df.columns.values:
    try:
        df[col] = df[col].str.lower()
    except AttributeError:  # non-string columns have no .str accessor
        pass
# Get a list of all lyrics that are cleaned up
tmp_all_lyrics = df['text'].tolist()
all_lyrics = [space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', song))) for song in tmp_all_lyrics]
all_lyrics_freq = word_freq(stopwords=all_stop, min_df=5, data=all_lyrics)
# Get list of all artists
artists = df['artist'].unique()
artist_lyrics = pd.DataFrame(columns=['artist', 'lyrics', 'nostop'])
# iterate over artists and create a df with the artist and lyrics
for artist in artists:
    artist_index = len(artist_lyrics.index)
    lyrics = df[df['artist'] == artist]['text'].tolist()
    tmp = ' '.join(lyrics)  # space separator so the last word of one song doesn't fuse with the first of the next
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join([word for word in cleaned.split() if word not in all_stop and len(word) > minwordlen])
    artist_lyrics.loc[artist_index] = [artist, cleaned, no_stop]
# Generate tokens, bigrams, and word frequencies
artist_lyrics['tokens'] = artist_lyrics['nostop'].map(lambda x: x.split(' '))
artist_lyrics['bigrams'] = artist_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
artist_lyrics['top25'] = artist_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
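# Quick sanity check (illustrative): peek at the first artist's 25 most common words
# print(artist_lyrics.loc[0, ['artist', 'top25']])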
# Get list of all genres
genres = df['genre'].unique()
genre_lyrics = pd.DataFrame(columns=['genre', 'lyrics', 'nostop'])
# iterate over genres and create a df with the genre and lyrics
for genre in genres:
    genre_index = len(genre_lyrics.index)
    lyrics = df[df['genre'] == genre]['text'].tolist()
    tmp = ' '.join(lyrics)  # space separator, as above
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join([word for word in cleaned.split() if word not in all_stop and len(word) > minwordlen])
    genre_lyrics.loc[genre_index] = [genre, cleaned, no_stop]
# Generate tokens and bigrams
genre_lyrics['tokens'] = genre_lyrics['nostop'].map(lambda x: x.split(' '))
genre_lyrics['bigrams'] = genre_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
genre_lyrics['top25'] = genre_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Get list of all decades
decades = df['decade'].unique()
decade_lyrics = pd.DataFrame(columns=['decade', 'lyrics', 'nostop'])
# iterate over decades and create a df with the decade and lyrics
for decade in decades:
    decade_index = len(decade_lyrics.index)
    lyrics = df[df['decade'] == decade]['text'].tolist()
    tmp = ' '.join(lyrics)  # space separator, as above
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join([word for word in cleaned.split() if word not in all_stop and len(word) > minwordlen])
    decade_lyrics.loc[decade_index] = [decade, cleaned, no_stop]
# Generate tokens and bigrams
decade_lyrics['tokens'] = decade_lyrics['nostop'].map(lambda x: x.split(' '))
decade_lyrics['bigrams'] = decade_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
decade_lyrics['top25'] = decade_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Export as tab-separated files (note: .csv extension, but sep='\t')
artist_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\artists.csv', sep='\t', index=False)
genre_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\genre.csv', sep='\t', index=False)
decade_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\decade.csv', sep='\t', index=False)