Created
May 27, 2019 01:04
-
-
Save mencarellic/326da8f7f52f9d2f2ba28fd57fc1ffd1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import re | |
import nltk | |
from sklearn.feature_extraction.text import CountVectorizer | |
from collections import Counter | |
# Show all columns and do not truncate cell contents when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# None (not -1) disables truncation; -1 was deprecated in pandas 1.0 and
# rejected outright in pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Fix the global NumPy random seed so any sampling below is reproducible.
seed = 0
np.random.seed(seed)
# Function to find n grams given a list of text | |
def find_ngrams(input_list, n):
    """Return every contiguous n-gram of input_list as a list of tuples.

    A list shorter than n yields an empty list.
    """
    # zip over n progressively-offset views of the list; zip stops at the
    # shortest view, which is exactly where the last full n-gram ends.
    shifted_views = (input_list[offset:] for offset in range(n))
    return list(zip(*shifted_views))
# Function to round an integer formatted year to the decade | |
def round_down(num): | |
try: | |
return num - (num % 10) | |
except: | |
return num | |
# Function that takes a stopwords list, min frequency, and a list of sentences and returns a counter object | |
def word_freq(stopwords, min_df, data): | |
vectorizer = CountVectorizer(min_df=min_df, stop_words=stopwords) | |
fitted = vectorizer.fit_transform(data) | |
vocab = list(vectorizer.get_feature_names()) | |
counts = fitted.sum(axis=0).A1 | |
return Counter(dict(zip(vocab, counts))) | |
# Pre-compiled regexes used to normalise lyric text.
# Raw strings avoid the invalid '\s' escape in a plain string literal
# (a DeprecationWarning today, a SyntaxWarning/error in newer Pythons).
alpha_regex = re.compile(r"[^a-zA-Z\s']")  # anything but letters, whitespace, apostrophe
apost_regex = re.compile(r"[']")           # apostrophes (deleted, not spaced)
space_regex = re.compile(r' {2,}')         # runs of two or more spaces
# Stopword list: NLTK's English stopwords with apostrophes stripped, so they
# match the cleaned lyrics (apostrophe removal happens during text cleanup).
base_stop = [stopword.replace("'", '') for stopword in nltk.corpus.stopwords.words('english')]
# Cleaning can leave empty tokens behind; treat the empty string as a stopword too.
extra_stop = ['']
all_stop = set(base_stop).union(extra_stop)
# Tokens of this length or shorter are discarded during stopword removal.
minwordlen = 2
# Location of datafile (hard-coded Windows path; adjust per machine)
datafile = 'C:\\Git\\IST652\\Project\\data\\combined.csv'
# Read in CSV — assumes columns include 'Column1', 'combined', 'year',
# 'text', 'artist', 'genre' (TODO confirm against the data file).
df = pd.read_csv(datafile)
# drop the mongodb id column and the redundant combined column
df.drop(axis=1, columns=['Column1', 'combined'], inplace=True)
# get decade of song — round_down is vectorized over the Series via - and %
df['decade'] = round_down(df['year'])
# remove new line characters (\r \n), replacing each with a space
df.replace(r'(\r|\n|\r\n)', ' ', regex=True, inplace=True)
# make every string column lowercase; numeric columns have no .str accessor
for col in df.columns.values:
    try:
        df[col] = df[col].str.lower()
    except AttributeError:
        # Non-string column — deliberately skip it. Narrowed from a bare
        # except so genuine errors still surface.
        pass
# Get a list of all lyrics, cleaned: strip non-letters, drop apostrophes,
# collapse repeated spaces
tmp_all_lyrics = df['text'].tolist()
all_lyrics = [space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', song))) for song in tmp_all_lyrics]
# Corpus-wide word frequencies (words appearing in at least 5 songs)
all_lyrics_freq = word_freq(stopwords=all_stop, min_df=5, data=all_lyrics)
all_lyrics_freq = word_freq(stopwords=all_stop, min_df=5, data=all_lyrics) | |
# Get list of all artists
artists = df['artist'].unique()
artist_lyrics = pd.DataFrame(columns=['artist', 'lyrics', 'nostop'])
# iterate over artists and build one row per artist: raw cleaned lyrics plus
# a stopword-free version
for artist in artists:
    artist_index = len(artist_lyrics.index)
    lyrics = df[df['artist'] == artist]['text'].tolist()
    # Join with a space so the last word of one song cannot fuse with the
    # first word of the next; space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    artist_lyrics.loc[artist_index] = [artist, cleaned, no_stop]
# Generate tokens, bigrams, and per-artist top-25 word frequencies
artist_lyrics['tokens'] = artist_lyrics['nostop'].map(lambda x: x.split(' '))
artist_lyrics['bigrams'] = artist_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
artist_lyrics['top25'] = artist_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Get list of all genres
genres = df['genre'].unique()
genre_lyrics = pd.DataFrame(columns=['genre', 'lyrics', 'nostop'])
# iterate over genres and build one row per genre: raw cleaned lyrics plus
# a stopword-free version
for genre in genres:
    genre_index = len(genre_lyrics.index)
    lyrics = df[df['genre'] == genre]['text'].tolist()
    # Join with a space so words at song boundaries do not fuse together;
    # space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    genre_lyrics.loc[genre_index] = [genre, cleaned, no_stop]
# Generate tokens, bigrams, and per-genre top-25 word frequencies
genre_lyrics['tokens'] = genre_lyrics['nostop'].map(lambda x: x.split(' '))
genre_lyrics['bigrams'] = genre_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
genre_lyrics['top25'] = genre_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Get list of all decades
decades = df['decade'].unique()
decade_lyrics = pd.DataFrame(columns=['decade', 'lyrics', 'nostop'])
# iterate over decades and build one row per decade: raw cleaned lyrics plus
# a stopword-free version
for decade in decades:
    decade_index = len(decade_lyrics.index)
    lyrics = df[df['decade'] == decade]['text'].tolist()
    # Join with a space so words at song boundaries do not fuse together;
    # space_regex collapses any doubled spaces later.
    tmp = ' '.join(lyrics)
    cleaned = space_regex.sub(' ', apost_regex.sub('', alpha_regex.sub(' ', tmp)))
    no_stop = ' '.join(word for word in cleaned.split()
                       if word not in all_stop and len(word) > minwordlen)
    decade_lyrics.loc[decade_index] = [decade, cleaned, no_stop]
# Generate tokens, bigrams, and per-decade top-25 word frequencies
decade_lyrics['tokens'] = decade_lyrics['nostop'].map(lambda x: x.split(' '))
decade_lyrics['bigrams'] = decade_lyrics['nostop'].map(lambda x: find_ngrams(x.split(' '), 2))
decade_lyrics['top25'] = decade_lyrics['nostop'].map(lambda x: word_freq(stopwords=all_stop, min_df=1, data=[x]).most_common(25))
# Export each aggregated table to a tab-separated file (tab avoids clashing
# with commas inside lyric text).
# NOTE(review): output paths are hard-coded Windows paths, matching the input
# path above — consider making them configurable.
artist_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\artists.csv', sep='\t', index=False)
genre_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\genre.csv', sep='\t', index=False)
decade_lyrics.to_csv('C:\\Git\\IST652\\Project\\data\\decade.csv', sep='\t', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment